PengDa02 committed on
Commit
bb46e5d
·
verified ·
1 Parent(s): d5c2f73

upload Cheers-v1.0

Browse files

init, upload Cheers-v1.0

config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Cheers"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_umm.UMMConfig",
7
+ "AutoModel": "modeling_umm.UMMModel",
8
+ "AutoModelForCausalLM": "modeling_umm.Cheers"
9
+ },
10
+ "vae_encoder_config": {
11
+ "resolution": 512
12
+ },
13
+ "vae_decoder_config": {
14
+ "resolution": 512
15
+ },
16
+ "vision_representation_config": {
17
+ "attention_dropout": 0.0,
18
+ "hidden_act": "gelu_pytorch_tanh",
19
+ "hidden_size": 1152,
20
+ "image_size": 512,
21
+ "intermediate_size": 4304,
22
+ "layer_norm_eps": 1e-06,
23
+ "model_type": "umm",
24
+ "num_attention_heads": 16,
25
+ "num_channels": 3,
26
+ "num_hidden_layers": 27,
27
+ "num_patches": 1024,
28
+ "patch_size": 16
29
+ },
30
+ "text_config":{
31
+ "hidden_size": 1536,
32
+ "intermediate_size": 8960,
33
+ "max_window_layers": 21,
34
+ "num_attention_heads": 12,
35
+ "num_key_value_heads": 2,
36
+ "sliding_window": 32768,
37
+ "tie_word_embeddings": true,
38
+ "vocab_size": 151936,
39
+ "max_position_embeddings": 32768
40
+ },
41
+ "model_type": "umm",
42
+ "torch_dtype": "bfloat16",
43
+ "transformers_version": "4.51.3"
44
+ }
configuration_umm.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.modeling_rope_utils import rope_config_validation
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
class VAEEncoderConfig(PretrainedConfig):
    """Configuration for the VAE encoder sub-module of the UMM model.

    Args:
        resolution: Input image resolution the encoder expects.
        in_channels: Number of input image channels.
        ch: Base channel width of the encoder.
        ch_mult: Per-stage channel multipliers (stored as a list).
        num_res_blocks: Residual blocks per stage.
        z_channels: Number of latent channels produced by the encoder.
    """

    model_type = "umm"
    base_config_key = "vae_encoder_config"

    def __init__(
        self,
        resolution=256,
        in_channels=3,
        ch=128,
        ch_mult=(1, 2, 4, 4),  # tuple default: fixes shared-mutable-default bug
        num_res_blocks=2,
        z_channels=32,
        **kwargs,
    ):
        self.resolution = resolution
        self.in_channels = in_channels
        self.ch = ch
        # Stored as a list for backward compatibility with configs that
        # serialize ch_mult as a JSON array.
        self.ch_mult = list(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.z_channels = z_channels
        super().__init__(**kwargs)
28
+
29
class VAEDecoderConfig(PretrainedConfig):
    """Configuration for the VAE decoder sub-module of the UMM model.

    Args:
        ch: Base channel width of the decoder.
        out_ch: Number of output image channels.
        ch_mult: Per-stage channel multipliers (stored as a list).
        num_res_blocks: Residual blocks per stage.
        in_channels: Number of input channels (mirrors the encoder).
        resolution: Output image resolution the decoder produces.
        z_channels: Number of latent channels consumed by the decoder.
    """

    model_type = "umm"
    base_config_key = "vae_decoder_config"

    def __init__(
        self,
        ch=128,
        out_ch=3,
        ch_mult=(1, 2, 4, 4),  # tuple default: fixes shared-mutable-default bug
        num_res_blocks=2,
        in_channels=3,
        resolution=256,
        z_channels=32,
        **kwargs,
    ):
        self.resolution = resolution
        self.in_channels = in_channels
        self.ch = ch
        self.out_ch = out_ch
        # Stored as a list for backward compatibility with configs that
        # serialize ch_mult as a JSON array.
        self.ch_mult = list(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.z_channels = z_channels
        super().__init__(**kwargs)
52
+
53
class Siglip2VisionConfig(PretrainedConfig):
    """SigLIP2-style vision-representation configuration for the UMM model.

    Extra keys not listed here (e.g. ``num_patches`` in shipped configs) are
    absorbed by ``PretrainedConfig`` via ``**kwargs``.
    """

    model_type = "umm"
    base_config_key = "vision_representation_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=256,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer-backbone dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout

        # Input / patchification parameters.
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
83
+
84
class Qwen2Config(PretrainedConfig):
    """Qwen2-style text-backbone configuration for the UMM model."""

    model_type = "umm"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=3584,
        intermediate_size=18944,
        num_hidden_layers=28,
        num_attention_heads=28,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=131072,
        max_window_layers=28,
        layer_types=None,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Backward compatibility: no KV-head count means plain MHA.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_dropout = attention_dropout

        # Sliding-window attention: the window is only kept when enabled.
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        # Rotary position embeddings.
        # BC: older configs used the key "type" instead of "rope_type".
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        # Per-layer attention kind: layers past max_window_layers use the
        # sliding window (when enabled), earlier layers use full attention.
        if layer_types is None:
            windowed = self.sliding_window is not None
            layer_types = [
                "sliding_attention"
                if windowed and layer_idx >= self.max_window_layers
                else "full_attention"
                for layer_idx in range(self.num_hidden_layers)
            ]
        self.layer_types = layer_types

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
151
+
152
class UMMConfig(PretrainedConfig):
    """Top-level UMM configuration composing the four sub-configurations.

    Each sub-config argument may be a dict (deserialized JSON), ``None``
    (use defaults), or an already-constructed config instance.
    """

    model_type = "umm"
    sub_configs = {
        "vision_representation_config": Siglip2VisionConfig,
        "vae_encoder_config": VAEEncoderConfig,
        "vae_decoder_config": VAEDecoderConfig,
        "text_config": Qwen2Config,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vision_representation_config=None,
        vae_encoder_config=None,
        vae_decoder_config=None,
        text_config=None,
        **kwargs,
    ):
        supplied = {
            "vision_representation_config": vision_representation_config,
            "vae_encoder_config": vae_encoder_config,
            "vae_decoder_config": vae_decoder_config,
            "text_config": text_config,
        }
        for key, value in supplied.items():
            config_cls = self.sub_configs[key]
            if isinstance(value, dict):
                value = config_cls(**value)
            elif value is None:
                value = config_cls()
            # else: already a config instance — keep it as-is. (Bug fix: the
            # original left the attribute unset in this case, causing a later
            # AttributeError.)
            setattr(self, key, value)
        super().__init__(**kwargs)

__all__ = ["UMMConfig"]
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.51.0"
14
+ }
image_processing_umm.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from networkx import to_numpy_array
2
+ import numpy as np
3
+ import torch
4
+ from PIL import Image, ImageOps
5
+ import math
6
+ from functools import partial, reduce
7
+ from transformers.image_transforms import (
8
+ convert_to_rgb,
9
+ center_crop,
10
+ normalize,
11
+ rescale,
12
+ resize,
13
+ to_channel_dimension_format,
14
+ )
15
+
16
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
17
+ from transformers.image_utils import ImageInput, ChannelDimension, PILImageResampling, to_numpy_array
18
+
19
class UMMImageProcessor(BaseImageProcessor):
    """Image processor for UMM producing pixel values plus patch-grid sizes.

    Two preprocessing paths exist: "und" (understanding) resizes so the
    longest side fits ``scale_resolution`` and pads to a square; "gen"
    (generation) resizes so the shortest side is ``scale_resolution`` and
    center-crops to a square.
    """

    model_input_names = ["pixel_values", "grid_hws"]

    def __init__(
        self,
        image_mean=(0.5, 0.5, 0.5),
        image_std=(0.5, 0.5, 0.5),
        size=(256, 256),
        crop_size=None,
        resample=PILImageResampling.BICUBIC,
        rescale_factor=1 / 255,
        data_format=ChannelDimension.FIRST,
        scale_resolution=256,
        patch_size=16,
        **kwargs,
    ):
        super().__init__(**kwargs)
        crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
        self.image_mean = image_mean
        self.image_std = image_std
        self.size = size
        self.resample = resample
        self.rescale_factor = rescale_factor
        self.data_format = data_format
        self.crop_size = crop_size
        self.scale_resolution = scale_resolution
        self.patch_size = patch_size

    def preprocess(self, image, max_resolution=None, return_tensors='pt', und=True, **kwargs) -> BatchFeature:
        """Preprocess one image into ``pixel_values`` (1, C, H, W) and
        ``grid_hws`` (1, 2) — the patch grid height/width.

        Returns None when ``image`` is None (matches previous behavior).
        """
        scale_resolution = max_resolution if max_resolution is not None else self.scale_resolution
        if image is None:
            # Explicit: the original fell through and returned None implicitly.
            return None
        if und:
            image = self._preprocess_und(image, scale_resolution)
        else:
            image = self._preprocess_gen(image, scale_resolution)
        if not torch.is_tensor(image):
            image = torch.tensor(image)
        _, H, W = image.shape
        grid_h = int(H // self.patch_size)
        grid_w = int(W // self.patch_size)
        pixel_values = torch.stack([image], dim=0)
        grid_hws = torch.tensor([(grid_h, grid_w)])
        data = {
            "pixel_values": pixel_values,
            "grid_hws": grid_hws,
        }
        return BatchFeature(data=data, tensor_type=return_tensors)

    def _preprocess_gen(self, source_image, scale_resolution):
        """Generation path: short side -> scale_resolution, then center crop."""
        w, h = source_image.size
        scale = scale_resolution / min(h, w)
        new_h = int(round(h * scale))
        new_w = int(round(w * scale))
        source_image = source_image.resize((new_w, new_h), Image.Resampling.BICUBIC)
        source_image = [source_image]
        transforms = [
            convert_to_rgb,
            to_numpy_array,
            partial(center_crop, size=(scale_resolution, scale_resolution)),
            partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
            partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
        ]
        image = reduce(lambda x, f: [*map(f, x)], transforms, source_image)
        return image[0] if len(image) == 1 else image

    def _preprocess_und(self, source_image, scale_resolution):
        """Understanding path: long side -> scale_resolution, pad to square."""
        w, h = source_image.size
        scale = min(scale_resolution / h, scale_resolution / w)
        new_h = int(round(h * scale))
        new_w = int(round(w * scale))
        resized_image = source_image.resize((new_w, new_h), Image.Resampling.BICUBIC)

        # Symmetric black padding up to a scale_resolution square.
        pad_w = scale_resolution - new_w
        pad_h = scale_resolution - new_h
        left = pad_w // 2
        right = pad_w - left
        top = pad_h // 2
        bottom = pad_h - top
        new_image = ImageOps.expand(resized_image, border=(left, top, right, bottom), fill=(0, 0, 0))

        source_image = [new_image]
        transforms = [
            convert_to_rgb,
            to_numpy_array,
            partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
            partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
        ]
        image = reduce(lambda x, f: [*map(f, x)], transforms, source_image)
        return image[0] if len(image) == 1 else image

__all__ = ["UMMImageProcessor"]
117
+
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf342f998ba25d39f994bacd955c5cf3b26ba0c5e655da80e2605dae9b5c6655
3
+ size 4994192134
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9993848bbb16fc09745602361881f362302db2779cb2a6beb39235b8336b7320
3
+ size 615406460
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_umm.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor_type": "UMMImageProcessor",
3
+ "auto_map":{
4
+ "AutoProcessor": "processing_umm.UMMProcessor",
5
+ "AutoImageProcessor": "image_processing_umm.UMMImageProcessor"
6
+ },
7
+ "processor_class": "UMMProcessor",
8
+ "scale_resolution": 512,
9
+ "patch_size": 16
10
+ }
processing_umm.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from scipy import special
3
+ import torch
4
+ import numpy as np
5
+ from math import e
6
+ from param import output
7
+ from transformers.feature_extraction_utils import BatchFeature
8
+ from transformers.processing_utils import ProcessorMixin
9
+
10
class UMMProcessor(ProcessorMixin):
    """Joint processor combining the UMM image processor and a Qwen2 tokenizer.

    Registers the special tokens used for image understanding/generation if
    the tokenizer does not already define them.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        # NOTE: all id checks below use `is not None` — the original used
        # truthiness, which would treat a legitimate token id of 0 as missing.

        # Understanding placeholder token; mapped to a negative sentinel id.
        self.image_token = "<image>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        if getattr(tokenizer, "image_token_id", None) is not None:
            self.image_token_id = tokenizer.image_token_id
        else:
            tokenizer.add_tokens(["<image>"], special_tokens=True)
            self.image_token_id = -200

        # Generation placeholder token; mapped to a negative sentinel id.
        self.image_gen_token = "<image_gen>" if not hasattr(tokenizer, "image_gen_token") else tokenizer.image_gen_token
        if getattr(tokenizer, "image_gen_token_id", None) is not None:
            self.image_gen_token_id = tokenizer.image_gen_token_id
        else:
            tokenizer.add_tokens(["<image_gen>"], special_tokens=True)
            self.image_gen_token_id = -300

        # Image-generation start marker.
        self.image_gen_start_token = "<im_start>" if not hasattr(tokenizer, "image_gen_start") else tokenizer.image_gen_start
        if getattr(tokenizer, "image_gen_start_token_id", None) is not None:
            self.image_gen_start_token_id = tokenizer.image_gen_start_token_id
        else:
            tokenizer.add_tokens(["<im_start>"], special_tokens=True)
            self.image_gen_start_token_id = tokenizer.convert_tokens_to_ids(self.image_gen_start_token)

        # Image-generation end marker.
        self.image_gen_end_token = "<im_end>" if not hasattr(tokenizer, "image_gen_end") else tokenizer.image_gen_end
        if getattr(tokenizer, "image_gen_end_token_id", None) is not None:
            self.image_gen_end_token_id = tokenizer.image_gen_end_token_id
        else:
            tokenizer.add_tokens(["<im_end>"], special_tokens=True)
            self.image_gen_end_token_id = tokenizer.convert_tokens_to_ids(self.image_gen_end_token)

        # "No mean" marker token.
        self.no_mean_token = "<no_mean>" if not hasattr(tokenizer, "no_mean") else tokenizer.no_mean
        if getattr(tokenizer, "no_mean_id", None) is not None:
            self.no_mean_token_id = tokenizer.no_mean_id
        else:
            tokenizer.add_tokens(["<no_mean>"], special_tokens=True)
            self.no_mean_token_id = tokenizer.convert_tokens_to_ids(self.no_mean_token)

        if chat_template is None and hasattr(tokenizer, "chat_template"):
            chat_template = tokenizer.chat_template
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(self, images=None, text=None, max_resolution=None, add_im_start_id=False, **kwargs):
        """Tokenize text, substitute image placeholder ids, and preprocess images.

        Returns a BatchFeature with input_ids/attention_mask plus, when images
        are given, pixel_values, grid_hws and per-image timesteps ``t``.
        """
        kwargs.setdefault("padding", True)
        kwargs.setdefault("truncation", True)
        if not isinstance(text, list):
            text = [text]
        text = text.copy()
        return_tensors = kwargs.pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **kwargs, return_tensors=return_tensors)
        img_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        img_gen_token_id = self.tokenizer.convert_tokens_to_ids(self.image_gen_token)

        if add_im_start_id:
            # Reserve one extra position per row for the <im_start> id.
            B, T = text_inputs["input_ids"].shape
            new_input_ids = torch.full((B, T + 1), self.tokenizer.pad_token_id)
            new_input_ids[:, :T] = text_inputs["input_ids"]
            is_valid = text_inputs["input_ids"] != self.tokenizer.pad_token_id
            valid_len = is_valid.sum(dim=1)
        else:
            new_input_ids = text_inputs["input_ids"]

        # Replace placeholder tokens with the sentinel ids; record a timestep
        # per image slot (1.0 for understanding, random for generation) and
        # whether the slot is understanding (1) or generation (0).
        t = []
        und_gen_mask_list = []
        for i, ids in enumerate(text_inputs["input_ids"]):
            for j, token_id in enumerate(ids):
                if token_id == img_token_id:
                    new_input_ids[i][j] = self.image_token_id
                    t.append(torch.tensor([1.0]))
                    und_gen_mask_list.append(1)
                elif token_id == img_gen_token_id:
                    new_input_ids[i][j] = self.image_gen_token_id
                    t.append(torch.rand(1))
                    und_gen_mask_list.append(0)

        image_inputs = {}
        pixel_values, grid_hws = [], []
        if images is not None:
            image_idx = 0
            for per_images in images if isinstance(images, list) else [images]:
                if per_images is None:
                    # Placeholder features for a missing image slot. (Bug fix:
                    # the original never appended these, so the dummy features
                    # were dropped.)
                    dummy_image = Image.fromarray(np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8))
                    image_info = self.image_processor(images=dummy_image)
                    pixel_values.append(image_info.pixel_values)
                    grid_hws.append(image_info.grid_hws)
                else:
                    for per_image in per_images if isinstance(per_images, list) else [per_images]:
                        # Generation slots (mask 0) use the "gen" path.
                        und = und_gen_mask_list[image_idx] != 0
                        image_info = self.image_processor(images=per_image, max_resolution=max_resolution, und=und)
                        image_idx += 1
                        pixel_values.append(image_info.pixel_values)
                        grid_hws.append(image_info.grid_hws)
            pixel_values = torch.concat(pixel_values, dim=0)
            grid_hws = torch.concat(grid_hws, dim=0)
            image_inputs.update({'pixel_values': pixel_values, 'grid_hws': grid_hws})

        if len(t) > 0:
            t = torch.cat(t)
            image_inputs.update({"t": t})

        if add_im_start_id:
            # Write <im_start> right after the last non-pad token of each row
            # and extend the attention mask for the appended column.
            for b in range(B):
                pos = valid_len[b].item()
                new_input_ids[b, pos] = self.image_gen_start_token_id
            attention_mask = torch.cat([
                text_inputs["attention_mask"],
                (new_input_ids[:, -1] != self.tokenizer.pad_token_id).long().unsqueeze(1),
            ], dim=1)
            text_inputs["attention_mask"] = attention_mask
        text_inputs["input_ids"] = new_input_ids
        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

__all__ = ["UMMProcessor"]
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff