fix: format .py
- config.json +1 -1
- generation_config.json +1 -1
- got_vision_b.py +23 -41
- modeling_GOT.py +201 -204
- render_tools.py +6 -13
- requirements.txt +6 -0
- special_tokens_map.json +1 -1
- tokenization_qwen.py +21 -40
config.json
CHANGED

@@ -35,4 +35,4 @@
   "use_im_start_end": true,
   "use_sliding_window": false,
   "vocab_size": 151860
-}
+}
generation_config.json
CHANGED

@@ -3,4 +3,4 @@
   "eos_token_id": 151643,
   "max_new_tokens": 2048,
   "transformers_version": "4.37.2"
-}
+}
got_vision_b.py
CHANGED

The changed hunks, as they read after the reformat:

@@ -1,10 +1,9 @@
from functools import partial
from typing import Optional, Tuple, Type

import torch
import torch.nn as nn
import torch.nn.functional as F


class MLPBlock(nn.Module):

@@ -23,7 +22,6 @@ class MLPBlock(nn.Module):
        return self.lin2(self.act(self.lin1(x)))


class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()

@@ -39,7 +37,6 @@ class LayerNorm2d(nn.Module):
        return x


class ImageEncoderViT(nn.Module):
    def __init__(
        self,

@@ -91,9 +88,7 @@ class ImageEncoderViT(nn.Module):
        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))

        self.blocks = nn.ModuleList()
        for i in range(depth):

@@ -129,7 +124,6 @@ class ImageEncoderViT(nn.Module):
            LayerNorm2d(out_chans),
        )

        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
        self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)

@@ -145,7 +139,6 @@ class ImageEncoderViT(nn.Module):
        x = self.net_2(x)
        x = self.net_3(x)

        return x


@@ -247,9 +240,7 @@ class Attention(nn.Module):

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert input_size is not None, "Input size must be provided if using relative positional encoding."
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

@@ -297,9 +288,7 @@ def window_partition
    return windows, (Hp, Wp)


def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]) -> torch.Tensor:
    """
    Window unpartition into original sequences and removing padding.
    Args:

@@ -385,9 +374,7 @@ def add_decomposed_rel_pos(
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w)

    return attn

@@ -415,9 +402,7 @@ class PatchEmbed(nn.Module):
        """
        super().__init__()

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)

@@ -426,7 +411,6 @@ class PatchEmbed(nn.Module):
        return x


def build_GOT_vit_b(checkpoint=None):
    return _build_GOT_vision(
        encoder_embed_dim=768,

@@ -448,21 +432,19 @@ def _build_GOT_vision(
    image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size
    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )

    return image_encoder
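The reformatted encoder keeps the original geometry: a 1024x1024 input is cut into 64x64 patches of size 16, and the two stride-2 convolutions (net_2, net_3) reduce that to a 16x16 map with 1024 channels, i.e. the 256 x 1024 token grid consumed by modeling_GOT.py. A minimal smoke test, assuming this file is importable as got_vision_b and that a CPU float32 forward pass is acceptable (hypothetical usage, not part of the repo):

import torch

from got_vision_b import build_GOT_vit_b

# Build the ViT-B encoder exactly as _build_GOT_vision wires it up above.
encoder = build_GOT_vit_b().eval()

with torch.no_grad():
    feat = encoder(torch.zeros(1, 3, 1024, 1024))  # (B, C, H, W) dummy image

# Two stride-2 convs after the ViT neck: 64x64 patches -> 16x16 map, 1024 channels.
print(feat.shape)  # expected: torch.Size([1, 1024, 16, 16])

# Flattened the same way modeling_GOT.py does before mm_projector_vary.
print(feat.flatten(2).permute(0, 2, 1).shape)  # expected: torch.Size([1, 256, 1024])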
modeling_GOT.py
CHANGED

The changed hunks, as they read after the reformat:

@@ -1,27 +1,32 @@
import dataclasses
from enum import Enum, auto
from io import BytesIO
from typing import List, Optional, Tuple, Union

import requests
import torch
import torch.nn as nn
from PIL import Image
from torch.nn import CrossEntropyLoss
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import Qwen2Config, Qwen2ForCausalLM, Qwen2Model, StoppingCriteria, TextStreamer
from transformers.cache_utils import Cache
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast

from .got_vision_b import build_GOT_vit_b

###

DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"
DEFAULT_IM_START_TOKEN = "<img>"
DEFAULT_IM_END_TOKEN = "</img>"


class SeparatorStyle(Enum):
    """Different separator style."""

    SINGLE = auto()
    TWO = auto()
    MPT = auto()

@@ -30,6 +35,7 @@ class SeparatorStyle(Enum):
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""

    system: str
    roles: List[str]
    messages: List[List[str]]

@@ -43,7 +49,7 @@ class Conversation:

    def get_prompt(self):
        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep + "\n"
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:

@@ -65,9 +71,9 @@ class Conversation:
            return ret
        if self.sep_style == SeparatorStyle.MPT:
            if self.system:
                ret = self.system + self.sep
            else:
                ret = ""
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:

@@ -79,7 +85,6 @@ class Conversation:
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def append_message(self, role, message):
        self.messages.append([role, message])

@@ -91,8 +96,8 @@ class Conversation:
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
        )


class KeywordsStoppingCriteria(StoppingCriteria):

@@ -111,12 +116,12 @@ class KeywordsStoppingCriteria(StoppingCriteria):
        for keyword_id in self.keyword_ids:
            if output_ids[0, -1] == keyword_id:
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len :], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False


class GOTImageEvalProcessor:
    def __init__(self, image_size=384, mean=None, std=None):

@@ -129,18 +134,16 @@ class GOTImageEvalProcessor:

        self.transform = transforms.Compose(
            [
                transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
                transforms.ToTensor(),
                self.normalize,
            ]
        )

    def __call__(self, item):
        return self.transform(item)


class GOTConfig(Qwen2Config):
    model_type = "GOT"

@@ -153,28 +156,24 @@ class GOTQwenModel(Qwen2Model):

        self.vision_tower_high = build_GOT_vit_b()

        self.mm_projector_vary = nn.Linear(1024, 1024)

    def initialize_vision_modules(
        self,
        vision_tower,
        pretrained_stage1_model=None,
        freeze_vision_tower=False,
        use_im_start_end=False,
        vision_select_layer=-1,
        dtype=torch.float16,
        device="cuda",
    ):
        image_processor_high = GOTImageEvalProcessor(image_size=1024)

        self.vision_tower_high = self.vision_tower_high.to(dtype=dtype, device=device)

        self.mm_projector_vary = self.mm_projector_vary.to(dtype=dtype, device=device)

        image_token_len = 256

        self.config.vision_tower = vision_tower

@@ -184,13 +183,12 @@ class GOTQwenModel(Qwen2Model):

        self.config.vision_select_layer = vision_select_layer
        self.config.freeze_vision_tower = freeze_vision_tower

        return dict(
            image_processor_high=image_processor_high,
            image_token_len=image_token_len,
        )

    def forward(
        self,
        input_ids: torch.LongTensor = None,

@@ -204,19 +202,16 @@ class GOTQwenModel(Qwen2Model):
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # HACK: replace back original embeddings for LLaVA pretraining
        orig_embeds_params = getattr(self, "orig_embeds_params", None)
        if orig_embeds_params is not None:
            with torch.no_grad():
                self.get_input_embeddings().weight[: -self.num_new_tokens] = orig_embeds_params[: -self.num_new_tokens].data

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        vision_tower_high = getattr(self, "vision_tower_high", None)

        if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
            use_im_start_end = getattr(self.config, "use_im_start_end", -1)

@@ -232,15 +227,15 @@ class GOTQwenModel(Qwen2Model):
            im_start_token = 151857

            im_end_token = 151858

            image_features = []

            for image in images:
                P, C, H, W = image.shape
                if P == 1:
                    with torch.set_grad_enabled(False):
                        cnn_feature = vision_tower_high(image)
                        cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1)  # 256*1024
                    image_feature = self.mm_projector_vary(cnn_feature)
                    image_features.append(image_feature)

@@ -249,7 +244,7 @@ class GOTQwenModel(Qwen2Model):
                    image_patches_features = []
                    for image_patch in image_patches:
                        image_p = torch.stack([image_patch])

                        with torch.set_grad_enabled(False):
                            cnn_feature_p = vision_tower_high(image_p)
                            cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)

@@ -258,21 +253,20 @@ class GOTQwenModel(Qwen2Model):
                    image_feature = torch.cat(image_patches_features, dim=1)
                    image_features.append(image_feature)

            dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            dummy_image_features = dummy_image_features_2
            use_im_start_end = True
            new_input_embeds = []
            for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
                if (cur_input_ids == im_patch_token).sum() == 0:
                    cur_input_embeds = cur_input_embeds + (0.0 * dummy_image_features).sum()
                    new_input_embeds.append(cur_input_embeds)
                    continue

                if use_im_start_end:
                    if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
                        raise ValueError("The number of image start tokens and image end tokens should be the same.")

                    image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
                    for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
                        per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)

@@ -280,17 +274,16 @@ class GOTQwenModel(Qwen2Model):

                        if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
                            raise ValueError("The image end token should follow the image start token.")

                        cur_input_embeds = torch.cat(
                            (
                                cur_input_embeds[: image_start_token_pos + 1],
                                per_cur_image_features,
                                cur_input_embeds[image_start_token_pos + num_patches + 1 :],
                            ),
                            dim=0,
                        )

                    new_input_embeds.append(cur_input_embeds)
                else:
                    raise NotImplementedError

@@ -298,14 +291,18 @@ class GOTQwenModel(Qwen2Model):
            inputs_embeds = torch.stack(new_input_embeds, dim=0)

        return super(GOTQwenModel, self).forward(
            input_ids=None,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class GOTQwenForCausalLM(Qwen2ForCausalLM):
    config_class = GOTConfig
    # supports_gradient_checkpointing = True

@@ -336,15 +333,12 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,

@@ -354,8 +348,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            images=images,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

@@ -389,10 +382,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs):
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):

@@ -416,11 +406,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length:
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)

@@ -448,16 +434,9 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
        )
        return model_inputs

    def initialize_vision_tokenizer(self, tokenizer, freeze_lm_model=False, pretrained_stage1_model=None, device="cuda"):
        config = self.get_model().config

        self.resize_token_embeddings(len(tokenizer))

        config.im_patch_token = 151859

@@ -469,11 +448,11 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
        config.im_start_token, config.im_end_token = 151857, 151858

    def load_image(self, image_file):
        if image_file.startswith("http") or image_file.startswith("https"):
            response = requests.get(image_file)
            image = Image.open(BytesIO(response.content)).convert("RGB")
        else:
            image = Image.open(image_file).convert("RGB")
        return image

    def disable_torch_init(self):

@@ -481,15 +460,26 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
        Disable the redundant torch default initialization to accelerate model creation.
        """
        import torch

        setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
        setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)

    def chat(
        self,
        tokenizer,
        image_file,
        ocr_type,
        ocr_box="",
        ocr_color="",
        render=False,
        save_render_file=None,
        print_prompt=False,
        gradio_input=False,
        stream_flag=False,
    ):
        self.disable_torch_init()

        image_processor_high = GOTImageEvalProcessor(image_size=1024)

        use_im_start_end = True

@@ -501,38 +491,37 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
        image = self.load_image(image_file)

        w, h = image.size

        if ocr_type == "format":
            qs = "OCR with format: "
        else:
            qs = "OCR: "

        if ocr_box:
            bbox = eval(ocr_box)
            if len(bbox) == 2:
                bbox[0] = int(bbox[0] / w * 1000)
                bbox[1] = int(bbox[1] / h * 1000)
            if len(bbox) == 4:
                bbox[0] = int(bbox[0] / w * 1000)
                bbox[1] = int(bbox[1] / h * 1000)
                bbox[2] = int(bbox[2] / w * 1000)
                bbox[3] = int(bbox[3] / h * 1000)
            if ocr_type == "format":
                qs = str(bbox) + " " + "OCR with format: "
            else:
                qs = str(bbox) + " " + "OCR: "

        if ocr_color:
            if ocr_type == "format":
                qs = "[" + ocr_color + "]" + " " + "OCR with format: "
            else:
                qs = "[" + ocr_color + "]" + " " + "OCR: "

        if use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN + "\n" + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

        conv_mpt = Conversation(
            system="""<|im_start|>system

@@ -571,109 +560,113 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                    input_ids,
                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
                    do_sample=False,
                    num_beams=1,
                    no_repeat_ngram_size=20,
                    streamer=streamer,
                    max_new_tokens=4096,
                    stopping_criteria=[stopping_criteria],
                )
        else:
            with torch.autocast("cuda", dtype=torch.bfloat16):
                output_ids = self.generate(
                    input_ids,
                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
                    do_sample=False,
                    num_beams=1,
                    no_repeat_ngram_size=20,
                    # streamer=streamer,
                    max_new_tokens=4096,
                    stopping_criteria=[stopping_criteria],
                )

        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()

        if outputs.endswith(stop_str):
            outputs = outputs[: -len(stop_str)]
        outputs = outputs.strip()
        response_str = outputs

        if render:
            print("==============rendering===============")
            from .render_tools import content_mmd_to_html, svg_to_html, tik_html, translation_table

            if "**kern" in outputs:
                import verovio

                tk = verovio.toolkit()
                tk.loadData(outputs)
                tk.setOptions({"pageWidth": 2100, "footer": "none", "barLineWidth": 0.5, "beamMaxSlope": 15, "staffLineWidth": 0.2, "spacingStaff": 6})
                tk.getPageCount()
                svg = tk.renderToSVG()
                svg = svg.replace('overflow="inherit"', 'overflow="visible"')

                svg_to_html(svg, save_render_file)

            if ocr_type == "format" and "**kern" not in outputs:
                if "\\begin{tikzpicture}" not in outputs:
                    html_path_2 = save_render_file
                    right_num = outputs.count("\\right")
                    left_num = outputs.count("\left")

                    if right_num != left_num:
                        outputs = (
                            outputs.replace("\left(", "(")
                            .replace("\\right)", ")")
                            .replace("\left[", "[")
                            .replace("\\right]", "]")
                            .replace("\left{", "{")
                            .replace("\\right}", "}")
                            .replace("\left|", "|")
                            .replace("\\right|", "|")
                            .replace("\left.", ".")
                            .replace("\\right.", ".")
                        )

                    outputs = outputs.replace('"', "``").replace("$", "")

                    outputs_list = outputs.split("\n")
                    gt = ""
                    for out in outputs_list:
                        gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"

                    gt = gt[:-2]

                    lines = content_mmd_to_html
                    lines = lines.split("const text =")
                    new_web = lines[0] + "const text =" + gt + lines[1]

                else:
                    html_path_2 = save_render_file
                    outputs = outputs.translate(translation_table)
                    outputs_list = outputs.split("\n")
                    gt = ""
                    for out in outputs_list:
                        if out:
                            if "\\begin{tikzpicture}" not in out and "\\end{tikzpicture}" not in out:
                                while out[-1] == " ":
                                    out = out[:-1]
                                    if out is None:
                                        break

                                if out:
                                    if out[-1] != ";":
                                        gt += out[:-1] + ";\n"
                                    else:
                                        gt += out + "\n"
                            else:
                                gt += out + "\n"

                    lines = tik_html
                    lines = lines.split("const text =")
                    new_web = lines[0] + gt + lines[1]

                with open(html_path_2, "w") as web_f_new:
                    web_f_new.write(new_web)
        return response_str

    def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
        def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
            best_ratio_diff = float("inf")
            best_ratio = (1, 1)
            area = width * height
            for ratio in target_ratios:

@@ -687,20 +680,19 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                        best_ratio = ratio
            # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
            return best_ratio

        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # calculate the existing image aspect ratio
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num
        )
        # print(target_ratios)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        # find the closest aspect ratio to the target
        target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        # print(target_aspect_ratio)
        # calculate the target width and height

@@ -716,7 +708,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size,
            )
            # split the image
            split_img = resized_img.crop(box)

@@ -727,18 +719,15 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            processed_images.append(thumbnail_img)
        return processed_images

    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag=False):
        # Model
        self.disable_torch_init()
        multi_page = False

        image_processor_high = GOTImageEvalProcessor(image_size=1024)

        use_im_start_end = True

        image_token_len = 256

        image_list = []

@@ -747,7 +736,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
        # multi_page = True

        if multi_page:
            qs = "OCR with format across multi pages: "
            # only for png files
            # import glob
            # from natsort import natsorted

@@ -763,10 +752,10 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            # print("len ll: ", ll)

        else:
            if ocr_type == "format":
                qs = "OCR with format upon the patch reference: "
            else:
                qs = "OCR upon the patch reference: "
            if gradio_input:
                img = image_file.copy()
            else:

@@ -778,17 +767,14 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
            image_tensor_1 = image_processor_high(image)
            image_list.append(image_tensor_1)

        image_list = torch.stack(image_list)

        print("====new images batch size======: \n", image_list.shape)

        if use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len * ll + DEFAULT_IM_END_TOKEN + "\n" + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

        conv_mpt = Conversation(
            system="""<|im_start|>system

@@ -825,57 +811,68 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                    input_ids,
                    images=[image_list.half().cuda()],
                    do_sample=False,
                    num_beams=1,
                    # no_repeat_ngram_size = 20,
                    streamer=streamer,
                    max_new_tokens=4096,
                    stopping_criteria=[stopping_criteria],
                )
        else:
            with torch.autocast("cuda", dtype=torch.bfloat16):
                output_ids = self.generate(
                    input_ids,
                    images=[image_list.half().cuda()],
                    do_sample=False,
                    num_beams=1,
                    # no_repeat_ngram_size = 20,
                    # streamer=streamer,
                    max_new_tokens=4096,
                    stopping_criteria=[stopping_criteria],
                )

        outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()

        if outputs.endswith(stop_str):
            outputs = outputs[: -len(stop_str)]
        outputs = outputs.strip()
        response_str = outputs

        if render:
            print("==============rendering===============")
            from .render_tools import content_mmd_to_html

            html_path_2 = save_render_file
            right_num = outputs.count("\\right")
            left_num = outputs.count("\left")

            if right_num != left_num:
                outputs = (
                    outputs.replace("\left(", "(")
                    .replace("\\right)", ")")
                    .replace("\left[", "[")
                    .replace("\\right]", "]")
                    .replace("\left{", "{")
                    .replace("\\right}", "}")
                    .replace("\left|", "|")
                    .replace("\\right|", "|")
                    .replace("\left.", ".")
                    .replace("\\right.", ".")
                )

            outputs = outputs.replace('"', "``").replace("$", "")

            outputs_list = outputs.split("\n")
            gt = ""
            for out in outputs_list:
                gt += '"' + out.replace("\\", "\\\\") + r"\n" + '"' + "+" + "\n"

            gt = gt[:-2]

            lines = content_mmd_to_html
            lines = lines.split("const text =")
            new_web = lines[0] + "const text =" + gt + lines[1]

            with open(html_path_2, "w") as web_f_new:
                web_f_new.write(new_web)

        return response_str
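The chat() and chat_crop() entry points keep their original signatures after the reformat, so existing callers are unaffected. A sketch of typical usage, assuming the repository is loadable through AutoModel/AutoTokenizer with trust_remote_code and that a CUDA device is available (the repo id below is an assumption, not part of this diff):

from transformers import AutoModel, AutoTokenizer

# Hypothetical repo id; adjust to wherever these files are hosted.
MODEL_ID = "ucaslcl/GOT-OCR2_0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).eval().cuda()

# Plain OCR on a single image (any ocr_type other than "format" falls back to "OCR: ").
text = model.chat(tokenizer, "page.png", ocr_type="ocr")

# Formatted OCR, optionally rendered to an HTML file via render_tools.py.
formatted = model.chat(tokenizer, "page.png", ocr_type="format", render=True, save_render_file="page.html")

# Multi-patch OCR for large or dense pages (dynamic_preprocess + chat_crop).
cropped = model.chat_crop(tokenizer, "page.png", ocr_type="format")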
render_tools.py
CHANGED

The changed hunks, as they read after the reformat:

@@ -1,13 +1,9 @@
punctuation_dict = {",": ",", "。": "."}

translation_table = str.maketrans(punctuation_dict)


def svg_to_html(svg_content, output_filename):
    html_content = f"""
<!DOCTYPE html>
<html lang="en">

@@ -24,9 +20,8 @@ def svg_to_html(svg_content, output_filename):
</html>
"""

    with open(output_filename, "w") as file:
        file.write(html_content)


content_mmd_to_html = """<!DOCTYPE html>

@@ -34,7 +29,7 @@ content_mmd_to_html = """<!DOCTYPE html>
<meta charset="UTF-8">
<title>Title</title>
<script>
const text =
</script>
<style>
#content {

@@ -71,7 +66,6 @@ content_mmd_to_html = """<!DOCTYPE html>
"""


tik_html = """
<!DOCTYPE html>

@@ -92,5 +86,4 @@ const text =
</html>"""


# print(tik_html)
requirements.txt
ADDED

@@ -0,0 +1,6 @@
tiktoken
transformers
torch
torchvision
requests
verovio
special_tokens_map.json
CHANGED

@@ -6,4 +6,4 @@
    "rstrip": false,
    "single_word": false
  }
-}
+}
tokenization_qwen.py
CHANGED
@@ -12,7 +12,7 @@ import unicodedata
 from typing import Collection, Dict, List, Set, Tuple, Union
 
 import tiktoken
-from transformers import
+from transformers import AddedToken, PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -37,10 +37,8 @@ SPECIAL_TOKENS = (
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
     with open(tiktoken_bpe_file, "rb") as f:
         contents = f.read()
-    return {
-
-        for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
+    return {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
+
 
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
@@ -51,19 +49,19 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
-        image_start_tag=
-        image_end_tag=
-        image_pad_tag=
-        ref_start_tag=
-        ref_end_tag=
-        box_start_tag=
-        box_end_tag=
-        quad_start_tag=
-        quad_end_tag=
+        image_start_tag="<img>",
+        image_end_tag="</img>",
+        image_pad_tag="<imgpad>",
+        ref_start_tag="<ref>",
+        ref_end_tag="</ref>",
+        box_start_tag="<box>",
+        box_end_tag="</box>",
+        quad_start_tag="<quad>",
+        quad_end_tag="</quad>",
         **kwargs,
     ):
         super().__init__(**kwargs)
-
+
         self.image_start_tag = image_start_tag
         self.image_end_tag = image_end_tag
         self.image_pad_tag = image_pad_tag
@@ -73,24 +71,13 @@ class QWenTokenizer(PreTrainedTokenizer):
         self.box_end_tag = box_end_tag
         self.quad_start_tag = quad_start_tag
         self.quad_end_tag = quad_end_tag
-        self.IMAGE_ST = (
-            ref_start_tag, ref_end_tag,
-            box_start_tag, box_end_tag,
-            quad_start_tag, quad_end_tag,
-            image_start_tag, image_end_tag,
-            image_pad_tag
-        )
+        self.IMAGE_ST = (ref_start_tag, ref_end_tag, box_start_tag, box_end_tag, quad_start_tag, quad_end_tag, image_start_tag, image_end_tag, image_pad_tag)
 
         self.errors = errors  # how to handle errors in decoding
 
         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
-        self.special_tokens = {
-
-            for index, token in enumerate(
-                SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
-            )
-        }
-
+        self.special_tokens = {token: index for index, token in enumerate(SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks))}
+
         self.img_start_id = self.special_tokens[self.image_start_tag]
         self.img_end_id = self.special_tokens[self.image_end_tag]
         self.img_pad_id = self.special_tokens[self.image_pad_tag]
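The hunks above are formatting-only, but they make the token-id layout easy to read: every extra tag (ref/box/quad/image markers) is appended after the BPE ranks, so its id is len(mergeable_ranks) plus its position in SPECIAL_TOKENS + IMAGE_ST. A self-contained sketch of that numbering with a toy three-entry BPE table (the table and the short SPECIAL_TOKENS tuple below are made up; the real ones come from qwen.tiktoken and the file's SPECIAL_TOKENS constant):

# Toy illustration of the special-token id layout in tokenization_qwen.py.
# Only the id arithmetic mirrors the real code; the vocab is fabricated.
mergeable_ranks = {b"a": 0, b"b": 1, b"ab": 2}

SPECIAL_TOKENS = ("<|endoftext|>", "<|im_start|>", "<|im_end|>")
IMAGE_ST = ("<ref>", "</ref>", "<box>", "</box>", "<quad>", "</quad>", "<img>", "</img>", "<imgpad>")

special_tokens = {token: index for index, token in enumerate(SPECIAL_TOKENS + IMAGE_ST, start=len(mergeable_ranks))}

print(special_tokens["<|endoftext|>"])  # 3  (first id after the BPE ranks)
print(special_tokens["<img>"])          # 12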
@@ -111,9 +98,7 @@
             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
 
-        self.decoder = {
-            v: k for k, v in self.mergeable_ranks.items()
-        }  # type: dict[int, bytes|str]
+        self.decoder = {v: k for k, v in self.mergeable_ranks.items()}  # type: dict[int, bytes|str]
         self.decoder.update({v: k for k, v in self.special_tokens.items()})
 
         self.tokenizer = enc  # type: tiktoken.Encoding
@@ -128,9 +113,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def get_vocab(self) -> Dict[bytes, int]:
         return self.mergeable_ranks
 
-    def convert_tokens_to_ids(
-        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
-    ) -> List[int]:
+    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]:
         ids = []
         if isinstance(tokens, (str, bytes)):
             if tokens in self.special_tokens:
@@ -146,11 +129,11 @@
 
     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError(
+            raise ValueError("Adding regular tokens is not supported")
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
             if surface_form not in SPECIAL_TOKENS:
-                raise ValueError(
+                raise ValueError("Adding unknown special tokens is not supported")
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
@@ -197,9 +180,7 @@
         text = unicodedata.normalize("NFC", text)
 
         # this implementation takes a detour: text -> token id -> token surface forms
-        for t in self.tokenizer.encode(
-            text, allowed_special=allowed_special, disallowed_special=disallowed_special
-        ):
+        for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special):
             tokens.append(self.decoder[t])
         return tokens
 
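To make the last hunk concrete: tokenize() encodes text to ids with tiktoken and then maps each id back to its byte-level surface form through self.decoder. A rough stand-alone equivalent using tiktoken's stock cl100k_base encoding instead of the repo's qwen.tiktoken file (so the ids and splits will differ from the real model):

# Sketch of the text -> token id -> surface form detour, with a stock tiktoken encoding.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # stand-in for the repo's qwen.tiktoken vocabulary

text = "<img>figure 1</img>"  # made-up input
ids = enc.encode(text, allowed_special=set(), disallowed_special=())
tokens = [enc.decode_single_token_bytes(t) for t in ids]  # byte-level surface forms

print(ids)
print(tokens)  # exact splits depend on the vocabulary, not on this code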