yujingfeng committed
Commit 967a9c5 · verified · 1 Parent(s): c7d5144

Upload 2 files

configuration_qwen2_5_vl.py CHANGED
@@ -31,13 +31,12 @@ class Qwen2_5_VLConfig(PretrainedConfig):
         no_bias=True,
         tie_word_embeddings=False,
         visual=dict(
-            image_start_id=151857,
-            image_end_id=151858,
-            image_size=448,
-            patch_size=14,
-            hidden_size=4096,
-            num_hidden_layers=32,
-            num_attention_heads=32,
+            image_start_id=151652,
+            image_end_id=151653,
+            hidden_size=1280,
+            num_hidden_layers=32,
+            num_attention_heads=16,
+            patch_size=14,
         ),
         **kwargs,
     ):
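
For context, a minimal sketch (assumed usage, not part of this commit) of constructing the config with the updated visual settings. The module name `configuration_qwen2_5_vl` follows the file above; every argument other than the `visual` dict falls back to the class defaults, whose values are not shown in this diff.

from configuration_qwen2_5_vl import Qwen2_5_VLConfig

# Hypothetical usage sketch: only the visual dict values below come from this
# commit; all other keyword arguments use the class defaults.
config = Qwen2_5_VLConfig(
    visual=dict(
        image_start_id=151652,
        image_end_id=151653,
        hidden_size=1280,
        num_hidden_layers=32,
        num_attention_heads=16,
        patch_size=14,
    ),
)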
tokenization_qwen2_5_vl.py ADDED
@@ -0,0 +1,215 @@
+import base64
+import logging
+import os
+import requests
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
+
+import tiktoken
+import numpy as np
+from PIL import Image
+from transformers import PreTrainedTokenizer, AddedToken
+from transformers.utils import try_to_load_from_cache
+
+logger = logging.getLogger(__name__)
+
+# Updated to the Qwen2.5-specific file names
+VOCAB_FILES_NAMES = {"vocab_file": "qwen2_5.tiktoken", "ttf": "SimSun.ttf"}
+
+# Updated special tokens
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+IMG_START = "<image>"
+IMG_END = "</image>"
+IMG_PAD = "<imagepad>"
+REF_START = "<ref>"
+REF_END = "</ref>"
+BOX_START = "<box>"
+BOX_END = "</box>"
+QUAD_START = "<quad>"
+QUAD_END = "</quad>"
+
+class Qwen2_5_VLTokenizer(PreTrainedTokenizer):
+    """Qwen2.5-VL tokenizer, modified from QWenTokenizer."""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        image_start_tag=IMG_START,
+        image_end_tag=IMG_END,
+        image_pad_tag=IMG_PAD,
+        ref_start_tag=REF_START,
+        ref_end_tag=REF_END,
+        box_start_tag=BOX_START,
+        box_end_tag=BOX_END,
+        quad_start_tag=QUAD_START,
+        quad_end_tag=QUAD_END,
+        **kwargs,
+    ):
+        # Initialize the special tags
+        self.image_start_tag = image_start_tag
+        self.image_end_tag = image_end_tag
+        self.image_pad_tag = image_pad_tag
+        self.ref_start_tag = ref_start_tag
+        self.ref_end_tag = ref_end_tag
+        self.box_start_tag = box_start_tag
+        self.box_end_tag = box_end_tag
+        self.quad_start_tag = quad_start_tag
+        self.quad_end_tag = quad_end_tag
+
+        # Set of vision-related special tokens
+        self.IMAGE_ST = (
+            ref_start_tag, ref_end_tag,
+            box_start_tag, box_end_tag,
+            quad_start_tag, quad_end_tag,
+            image_start_tag, image_end_tag,
+            image_pad_tag
+        )
+
+        super().__init__(**kwargs)
+        self.errors = errors
+
+        # Load the vocabulary
+        self.mergeable_ranks = self._load_tiktoken_bpe(vocab_file)
+
+        # Assign ids to the special tokens, continuing after the BPE ranks
+        self.special_tokens = {
+            token: index
+            for index, token in enumerate(
+                [IMSTART, IMEND] + list(self.IMAGE_ST),
+                start=len(self.mergeable_ranks)
+            )
+        }
+
+        # Initialize the tiktoken encoder
+        self.tokenizer = tiktoken.Encoding(
+            "Qwen2.5",
+            pat_str=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+
+        # Special token ids
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+        self.img_start_id = self.special_tokens[image_start_tag]
+        self.img_end_id = self.special_tokens[image_end_tag]
+        self.img_pad_id = self.special_tokens[image_pad_tag]
+
+    def _load_tiktoken_bpe(self, tiktoken_bpe_file: str) -> Dict[bytes, int]:
+        """Load the tiktoken BPE vocabulary."""
+        with open(tiktoken_bpe_file, "rb") as f:
+            contents = f.read()
+        return {
+            base64.b64decode(token): int(rank)
+            for token, rank in (line.split() for line in contents.splitlines() if line)
+        }
+
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+
+    def get_vocab(self) -> Dict[bytes, int]:
+        return {**self.mergeable_ranks, **self.special_tokens}
+
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Convert a token to its id."""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError(f"Unknown token: {token}")
+
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Convert an id back to its token."""
+        if index in self.special_tokens.values():
+            return list(self.special_tokens.keys())[list(self.special_tokens.values()).index(index)]
+        if index in self.mergeable_ranks.values():
+            return list(self.mergeable_ranks.keys())[list(self.mergeable_ranks.values()).index(index)]
+        raise ValueError(f"Unknown index: {index}")
+
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """Join a sequence of tokens back into a string."""
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should be bytes or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+
+    def tokenize(self, text: str, **kwargs) -> List[Union[bytes, str]]:
+        """Tokenize text into byte-level tokens."""
+        text = unicodedata.normalize("NFC", text)
+        tokens = [self._convert_id_to_token(i) for i in self.tokenizer.encode(text)]
+        return tokens
+
+    def _decode(self, token_ids: List[int], **kwargs) -> str:
+        """Decode token ids back into text."""
+        skip_special_tokens = kwargs.get("skip_special_tokens", False)
+        keep_image_special = kwargs.get("keep_image_special", False)
+
+        if skip_special_tokens:
+            if keep_image_special:
+                token_ids = [i for i in token_ids if i < len(self.mergeable_ranks) or
+                             i in [self.img_start_id, self.img_end_id]]
+            else:
+                token_ids = [i for i in token_ids if i < len(self.mergeable_ranks)]
+
+        return self.tokenizer.decode(token_ids, errors=self.errors)
+
+    def to_list_format(self, text: str) -> List[Dict]:
+        """Convert text into the list format (multimodal input)."""
+        text = unicodedata.normalize("NFC", text)
+        token_ids = self.tokenizer.encode(text)
+
+        def _encode_element(tokens):
+            if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
+                return [{'image': self._decode(tokens[1:-1])}]
+            # Handling of other visual elements ...
+            return [{'text': self._decode(tokens)}]
+
+        return self._process_visual_tokens(token_ids, _encode_element)
+
+    def from_list_format(self, messages: List[Dict]) -> str:
+        """Build multimodal text from the list format."""
+        text = ""
+        for msg in messages:
+            if 'image' in msg:
+                text += f"{self.image_start_tag}{msg['image']}{self.image_end_tag}\n"
+            elif 'text' in msg:
+                text += msg['text']
+            # Handling of other visual elements ...
+        return text
+
+    def _process_visual_tokens(self, token_ids, process_func):
+        """Generic helper for walking visual token spans."""
+        result = []
+        i = 0
+        while i < len(token_ids):
+            if token_ids[i] == self.img_start_id:
+                end = token_ids.index(self.img_end_id, i) if self.img_end_id in token_ids[i:] else len(token_ids)
+                result.extend(process_func(token_ids[i:end+1]))
+                i = end + 1
+            else:
+                result.extend(process_func([token_ids[i]]))
+                i += 1
+        return result
+
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """Save the vocabulary."""
+        vocab_file = os.path.join(save_directory, "qwen2_5.tiktoken")
+        with open(vocab_file, "w", encoding="utf8") as f:
+            for token, rank in self.mergeable_ranks.items():
+                f.write(f"{base64.b64encode(token).decode('utf8')} {rank}\n")
+        return (vocab_file,)
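
A hedged usage sketch for the new tokenizer (not part of the commit): the vocab path and the image path are placeholders, the vocab file name matches VOCAB_FILES_NAMES above, and the `tiktoken` package must be installed.

from tokenization_qwen2_5_vl import Qwen2_5_VLTokenizer

# Placeholder path; point it at the qwen2_5.tiktoken file shipped with the checkpoint.
tokenizer = Qwen2_5_VLTokenizer(vocab_file="qwen2_5.tiktoken")

# Build a multimodal prompt string from the list format defined above
# ("demo.jpeg" is an illustrative placeholder).
prompt = tokenizer.from_list_format([
    {"image": "demo.jpeg"},
    {"text": "Describe the image."},
])
print(prompt)

# Round-trip plain text through tokenize / convert_tokens_to_string.
tokens = tokenizer.tokenize("Hello, Qwen2.5-VL!")
print(tokenizer.convert_tokens_to_string(tokens))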