h0witended committed on
Commit
9abcb49
·
verified ·
1 Parent(s): 4af9ba5

Delete tokenization_minicpmo_fast.py

Browse files
Files changed (1) hide show
  1. tokenization_minicpmo_fast.py +0 -110
tokenization_minicpmo_fast.py DELETED
@@ -1,110 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2025 The OpenBMB Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- from transformers import Qwen2TokenizerFast
17
-
18
-
19
- class MiniCPMOTokenizerFast(Qwen2TokenizerFast):
20
- def __init__(self, **kwargs):
21
- super().__init__(**kwargs)
22
- # image
23
- self.im_start = "<image>"
24
- self.im_end = "</image>"
25
- self.ref_start = "<ref>"
26
- self.ref_end = "</ref>"
27
- self.box_start = "<box>"
28
- self.box_end = "</box>"
29
- self.quad_start = "<quad>"
30
- self.quad_end = "</quad>"
31
- self.slice_start = "<slice>"
32
- self.slice_end = "</slice>"
33
- self.im_id_start = "<image_id>"
34
- self.im_id_end = "</image_id>"
35
-
36
- # audio
37
- self.audio_start = "<|audio_start|>"
38
- self.audio_end = "<|audio_end|>"
39
- self.spk_start = "<|spk_bos|>"
40
- self.spk_end = "<|spk_eos|>"
41
- self.tts_start = "<|tts_bos|>"
42
- self.tts_end = "<|tts_eos|>"
43
-
44
- @property
45
- def eos_id(self):
46
- return self.eos_token_id
47
-
48
- @property
49
- def bos_id(self):
50
- return self.bos_token_id
51
-
52
- @property
53
- def unk_id(self):
54
- return self.unk_token_id
55
-
56
- @property
57
- def im_start_id(self):
58
- return self.convert_tokens_to_ids(self.im_start)
59
-
60
- @property
61
- def im_end_id(self):
62
- return self.convert_tokens_to_ids(self.im_end)
63
-
64
- @property
65
- def slice_start_id(self):
66
- return self.convert_tokens_to_ids(self.slice_start)
67
-
68
- @property
69
- def slice_end_id(self):
70
- return self.convert_tokens_to_ids(self.slice_end)
71
-
72
- @property
73
- def im_id_start_id(self):
74
- return self.convert_tokens_to_ids(self.im_id_start)
75
-
76
- @property
77
- def im_id_end_id(self):
78
- return self.convert_tokens_to_ids(self.im_id_end)
79
-
80
- @property
81
- def audio_start_id(self):
82
- return self.convert_tokens_to_ids(self.audio_start)
83
-
84
- @property
85
- def audio_end_id(self):
86
- return self.convert_tokens_to_ids(self.audio_end)
87
-
88
- @property
89
- def spk_start_id(self):
90
- return self.convert_tokens_to_ids(self.spk_start)
91
-
92
- @property
93
- def spk_end_id(self):
94
- return self.convert_tokens_to_ids(self.spk_end)
95
-
96
- @property
97
- def tts_start_id(self):
98
- return self.convert_tokens_to_ids(self.tts_start)
99
-
100
- @property
101
- def tts_end_id(self):
102
- return self.convert_tokens_to_ids(self.tts_end)
103
-
104
- @staticmethod
105
- def escape(text: str) -> str:
106
- return text
107
-
108
- @staticmethod
109
- def unescape(text: str) -> str:
110
- return text