Anonymous committed on
Commit fc0bbdc · verified · 1 Parent(s): 4951893

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ license: apache-2.0
+ language:
+ - ru
+ pipeline_tag: automatic-speech-recognition
+ ---
+
+ ## Borealis
+
+ ### Description
+
+ **Borealis** is our first ASR model for the Russian language.
+
+ ### Usage
+
+ ```python
+ import librosa
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoFeatureExtractor
+
+ device = "cuda"
+
+ model = AutoModelForCausalLM.from_pretrained("Vikhrmodels/Borealis", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
+ extractor = AutoFeatureExtractor.from_pretrained("Vikhrmodels/Borealis")
+
+ generation_params = {
+     "max_new_tokens": 350,
+     "do_sample": True,
+     "top_p": 0.9,
+     "top_k": 50,
+     "temperature": 0.2,
+ }
+
+ model.eval()
+ model.to(device)
+
+ waveform, sr = librosa.load("path/to/your/audio.wav", sr=16_000)
+
+ proc = extractor(
+     waveform,
+     sampling_rate=sr,
+     padding="max_length",
+     max_length=480_000,
+     return_attention_mask=True,
+     return_tensors="pt",
+ )
+
+ mel = proc.input_features.squeeze(0).to(device)
+ att_mask = proc.attention_mask.squeeze(0).to(device)
+
+ with torch.inference_mode():
+     transcript = model.generate(mel=mel, att_mask=att_mask, **generation_params)
+
+ print(transcript)
+ ```
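A note on batching, based on `generate` in `modeling_borealis.py` below: a 2-D `mel` is treated as a single clip and returns one string, while a 3-D batch returns a list of strings (`att_mask` is accepted but not currently applied inside the encoder). A minimal sketch, assuming `mel_1`/`att_mask_1` and `mel_2`/`att_mask_2` were produced by the extractor call above:

```python
import torch

# Hypothetical batch of two preprocessed clips (each mel is 128 x 3000).
mels = torch.stack([mel_1, mel_2])
masks = torch.stack([att_mask_1, att_mask_2])

with torch.inference_mode():
    texts = model.generate(mel=mels, att_mask=masks, **generation_params)

print(texts)  # list of two transcripts
```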
added_tokens.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|end_of_audio|>": 151666,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|start_of_audio|>": 151665,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
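The only additions on top of the stock Qwen2.5 vocabulary are the two audio markers at ids 151665 and 151666; at inference time the model splices projected audio embeddings between them. A quick sanity check, assuming the tokenizer is loaded from this repo:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
assert tok.convert_tokens_to_ids("<|start_of_audio|>") == 151665
assert tok.convert_tokens_to_ids("<|end_of_audio|>") == 151666
```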
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0]['role'] == 'system' %}
+         {{- messages[0]['content'] }}
+     {%- else %}
+         {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+     {%- endif %}
+     {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0]['role'] == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+     {%- else %}
+         {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {{- '<|im_start|>' + message.role }}
+         {%- if message.content %}
+             {{- '\n' + message.content }}
+         {%- endif %}
+         {%- for tool_call in message.tool_calls %}
+             {%- if tool_call.function is defined %}
+                 {%- set tool_call = tool_call.function %}
+             {%- endif %}
+             {{- '\n<tool_call>\n{"name": "' }}
+             {{- tool_call.name }}
+             {{- '", "arguments": ' }}
+             {{- tool_call.arguments | tojson }}
+             {{- '}\n</tool_call>' }}
+         {%- endfor %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- message.content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
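This is the stock Qwen2.5 chat template; `generate` in `modeling_borealis.py` fills it with a fixed system/user pair carrying the audio markers. A sketch of the prompt the LLM actually receives (the messages are copied from `generate`):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Vikhrmodels/Borealis")
messages = [
    {
        "role": "system",
        "content": "Вы полезный помощник по автоматическому распознаванию речи. Точно транскрибируйте аудио в текст.",
    },
    {
        "role": "user",
        "content": "Транскрибируйте это аудио: <|start_of_audio|><|end_of_audio|>",
    },
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# "<|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|start_of_audio|><|end_of_audio|><|im_end|>\n<|im_start|>assistant\n"
```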
config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "architectures": [
+     "BorealisForConditionalGeneration"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_borealis.BorealisConfig",
+     "AutoModelForCausalLM": "modeling_borealis.BorealisForConditionalGeneration"
+   },
+   "downsample_factor": 4,
+   "llm_name": "unsloth/Qwen2.5-0.5B-Instruct",
+   "model_type": "borealis",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.4",
+   "whisper_encoder_name": "openai/whisper-large-v3"
+ }
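The config wires a frozen whisper-large-v3 encoder to Qwen2.5-0.5B-Instruct through a 4x frame-stacking adapter (see `modeling_borealis.py`). A back-of-the-envelope sketch of the resulting shapes; the encoder width (1280), its 1500 output frames per 30 s window, and the Qwen hidden size (896) are assumptions based on the public checkpoints, not values stored in this repo:

```python
d_model = 1280     # whisper-large-v3 encoder width (assumed)
enc_frames = 1500  # encoder outputs per 30 s window: 3000 mel frames / conv stride 2 (assumed)
k = 4              # downsample_factor from config.json

audio_tokens = enc_frames // k  # 375 audio positions spliced into the prompt
adapter_in = d_model * k        # 5120-dim stacked frames fed to the adapter
adapter_out = 896               # Qwen2.5-0.5B hidden size (assumed)
print(audio_tokens, adapter_in, adapter_out)
```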
configuration_borealis.py ADDED
@@ -0,0 +1,20 @@
+ from transformers import PretrainedConfig
+
+
+ class BorealisConfig(PretrainedConfig):
+     model_type = "borealis"
+
+     def __init__(
+         self,
+         whisper_encoder_name: str = "openai/whisper-large-v3",
+         llm_name: str = "unsloth/Qwen2.5-0.5B-Instruct",
+         downsample_factor: int = 4,
+         **kwargs,
+     ):
+         self.whisper_encoder_name = whisper_encoder_name
+         self.llm_name = llm_name
+         self.downsample_factor = downsample_factor
+         super().__init__(**kwargs)
+
+
+ BorealisConfig.register_for_auto_class()
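Because the config registers itself for auto classes, it round-trips through `AutoConfig.from_pretrained(..., trust_remote_code=True)`; it can also be constructed directly. A minimal sketch, assuming the file is importable locally (on the Hub the import resolves via `auto_map`):

```python
from configuration_borealis import BorealisConfig

cfg = BorealisConfig()
print(cfg.whisper_encoder_name)  # openai/whisper-large-v3
print(cfg.llm_name)              # unsloth/Qwen2.5-0.5B-Instruct
print(cfg.downsample_factor)     # 4
```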
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_borealis.py ADDED
@@ -0,0 +1,265 @@
+ import math
+ import os
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     PreTrainedModel,
+     WhisperFeatureExtractor,
+     WhisperModel,
+ )
+
+ from .configuration_borealis import BorealisConfig
+
+
+ class AudioLanguageAdapter(nn.Module):
+     """Two-layer MLP projecting stacked Whisper frames into the LLM embedding space."""
+
+     def __init__(self, hidden_size: int, dim: int) -> None:
+         super().__init__()
+         self.w_in = nn.Linear(hidden_size, dim, bias=False)
+         self.gelu = nn.GELU()
+         self.w_out = nn.Linear(dim, dim, bias=False)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.w_out(self.gelu(self.w_in(x)))
+
+
+ class BorealisForConditionalGeneration(PreTrainedModel, PyTorchModelHubMixin):
+     config_class = BorealisConfig
+
+     def __init__(self, config: BorealisConfig, language_model=None, tokenizer=None):
+         super().__init__(config)
+         assert tokenizer is not None, "A tokenizer must be passed to the model"
+         # The Whisper encoder stays frozen; only the adapter and the LLM train.
+         self.encoder: WhisperModel = WhisperModel.from_pretrained(
+             config.whisper_encoder_name
+         ).encoder
+         self.encoder.to(torch.bfloat16)
+         self.encoder.eval()
+         for p in self.encoder.parameters():
+             p.requires_grad = False
+         self.llm = language_model
+         self.tokenizer = tokenizer
+         self.llm.resize_token_embeddings(len(tokenizer))
+         print("Pad token:", self.llm.config.pad_token_id)
+         print("EOS token:", self.llm.config.eos_token_id)
+         print("Tokenizer EOS token ID:", tokenizer.eos_token_id)
+         print("Tokenizer PAD token ID:", tokenizer.pad_token_id)
+         self.downsample_factor = config.downsample_factor
+         self.adapter = AudioLanguageAdapter(
+             hidden_size=self.encoder.config.d_model * self.downsample_factor,
+             dim=self.llm.config.hidden_size,
+         )
+         self.adapter.to(torch.bfloat16)
+         self.bos_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
+         self.audio_start_id = tokenizer.convert_tokens_to_ids("<|start_of_audio|>")
+         self.audio_end_id = tokenizer.convert_tokens_to_ids("<|end_of_audio|>")
+
+     def _downsample(self, seq: torch.Tensor) -> torch.Tensor:
+         # Fold every k consecutive frames into one vector: (T, d) -> (ceil(T/k), d*k).
+         k, (T, d) = self.downsample_factor, seq.shape
+         target = k * math.ceil(T / k)
+         if target != T:
+             seq = F.pad(seq, (0, 0, 0, target - T))
+         return seq.contiguous().view(target // k, d * k)
+
+     def _tok_embed(self, tok_id: int, batch: int, device) -> torch.Tensor:
+         idx = torch.full((batch, 1), tok_id, dtype=torch.long, device=device)
+         return self.llm.get_input_embeddings()(idx)
+
+     def forward(
+         self,
+         mel: torch.Tensor,
+         audio_att_mask: torch.Tensor,  # currently unused; the encoder runs unmasked
+         labels: torch.Tensor,
+         text_att_mask: torch.Tensor,
+     ):
+         B, device = mel.size(0), mel.device
+         enc_out = self.encoder(
+             input_features=mel, attention_mask=None, return_dict=True
+         ).last_hidden_state
+         audio_embs, audio_mask, max_T = [], [], 0
+         for seq in enc_out:
+             ds = self._downsample(seq)
+             audio_embs.append(ds)
+             max_T = max(max_T, ds.size(0))
+         for i, ds in enumerate(audio_embs):
+             pad = max_T - ds.size(0)
+             audio_mask.append(
+                 torch.cat(
+                     [
+                         torch.ones(ds.size(0), dtype=torch.long, device=device),
+                         torch.zeros(pad, dtype=torch.long, device=device),
+                     ]
+                 )
+             )
+             if pad:
+                 # Write the padded tensor back so torch.stack sees uniform shapes
+                 # (the original loop dropped the result of F.pad).
+                 audio_embs[i] = F.pad(ds, (0, 0, 0, pad))
+         audio_embeddings = torch.stack(audio_embs, 0)
+         audio_mask = torch.stack(audio_mask, 0)
+         audio_embeddings = self.adapter(audio_embeddings)
+         text_embeddings = self.llm.get_input_embeddings()(labels)
+         # Splice the audio embeddings between <|start_of_audio|> and <|end_of_audio|>.
+         sa_positions = (labels == self.audio_start_id).nonzero(as_tuple=True)
+         ea_positions = (labels == self.audio_end_id).nonzero(as_tuple=True)
+         inputs_embeds = []
+         att_mask = []
+         for b in range(B):
+             sa_idx = sa_positions[1][sa_positions[0] == b].item()
+             ea_idx = ea_positions[1][ea_positions[0] == b].item()
+             prefix_emb = text_embeddings[b, : sa_idx + 1]
+             postfix_emb = text_embeddings[b, ea_idx:]
+             emb = torch.cat([prefix_emb, audio_embeddings[b], postfix_emb], dim=0)
+             prefix_mask = text_att_mask[b, : sa_idx + 1]
+             postfix_mask = text_att_mask[b, ea_idx:]
+             full_mask = torch.cat([prefix_mask, audio_mask[b], postfix_mask], dim=0)
+             inputs_embeds.append(emb)
+             att_mask.append(full_mask)
+         inputs_embeds = torch.nn.utils.rnn.pad_sequence(
+             inputs_embeds, batch_first=True, padding_value=0.0
+         )
+         att_mask = torch.nn.utils.rnn.pad_sequence(
+             att_mask, batch_first=True, padding_value=0
+         )
+         # Mask the loss so only tokens after "<|im_start|>assistant\n" contribute.
+         assistant_prompt = self.tokenizer(
+             "<|im_start|>assistant\n", add_special_tokens=False
+         ).input_ids
+         assistant_starts = []
+         for b in range(B):
+             seq = labels[b]
+             for i in range(len(seq) - len(assistant_prompt)):
+                 if torch.equal(
+                     seq[i : i + len(assistant_prompt)],
+                     torch.tensor(assistant_prompt, device=device),
+                 ):
+                     assistant_start = i + len(assistant_prompt)
+                     break
+             else:
+                 raise ValueError("Assistant prompt not found")
+             # NOTE: sa_idx/ea_idx carry over from the last batch element above, which
+             # assumes the audio markers sit at the same positions in every sample.
+             assistant_starts.append(assistant_start + (ea_idx - sa_idx - 1) + max_T)
+         max_len = inputs_embeds.size(1)
+         loss_labels = labels.new_full((B, max_len), -100)
+         for b in range(B):
+             orig_assist_start = assistant_starts[b] - max_T - (ea_idx - sa_idx - 1)
+             content_len = len(labels[b]) - orig_assist_start
+             loss_labels[b, assistant_starts[b] : assistant_starts[b] + content_len] = (
+                 labels[b, orig_assist_start:]
+             )
+         if self.tokenizer.pad_token_id is not None:
+             loss_labels[loss_labels == self.tokenizer.pad_token_id] = -100
+         out = self.llm(
+             inputs_embeds=inputs_embeds,
+             attention_mask=att_mask,
+             labels=loss_labels,
+             return_dict=True,
+         )
+         return out.loss, out.logits
+
+     @torch.no_grad()
+     def generate(
+         self,
+         mel: torch.Tensor,
+         att_mask: torch.Tensor,
+         max_new_tokens: int = 512,
+         **kwargs,
+     ):
+         return_tokens = kwargs.pop("return_tokens", False)
+         single = mel.dim() == 2
+         if single:
+             mel, att_mask = mel.unsqueeze(0), att_mask.unsqueeze(0)
+         mel = mel.to(torch.bfloat16)
+         B, device = mel.size(0), mel.device
+         enc_out = self.encoder(
+             input_features=mel, attention_mask=None, return_dict=True
+         ).last_hidden_state
+         audio_embs, audio_mask, max_T = [], [], 0
+         for seq in enc_out:
+             ds = self._downsample(seq)
+             audio_embs.append(ds)
+             max_T = max(max_T, ds.size(0))
+         for i, ds in enumerate(audio_embs):
+             pad = max_T - ds.size(0)
+             audio_mask.append(
+                 torch.cat(
+                     [
+                         torch.ones(ds.size(0), dtype=torch.long, device=device),
+                         torch.zeros(pad, dtype=torch.long, device=device),
+                     ]
+                 )
+             )
+             if pad:
+                 audio_embs[i] = F.pad(ds, (0, 0, 0, pad))
+         audio_embeddings = torch.stack(audio_embs, 0)
+         audio_mask = torch.stack(audio_mask, 0)
+         audio_embeddings = self.adapter(audio_embeddings)
+         # Fixed ASR prompt (in Russian), rendered through the Qwen chat template.
+         messages = [
+             {
+                 "role": "system",
+                 "content": "Вы полезный помощник по автоматическому распознаванию речи. Точно транскрибируйте аудио в текст.",
+             },
+             {
+                 "role": "user",
+                 "content": "Транскрибируйте это аудио: <|start_of_audio|><|end_of_audio|>",
+             },
+         ]
+         chat_text = self.tokenizer.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         model_inputs = self.tokenizer(chat_text, return_tensors="pt").to(device)
+         input_ids = model_inputs.input_ids.repeat(B, 1)
+         text_att_mask = model_inputs.attention_mask.repeat(B, 1)
+         text_embeddings = self.llm.get_input_embeddings()(input_ids)
+         sa_idx = (input_ids[0] == self.audio_start_id).nonzero(as_tuple=True)[0].item()
+         ea_idx = (input_ids[0] == self.audio_end_id).nonzero(as_tuple=True)[0].item()
+         inputs_embeds = []
+         full_att_mask = []
+         for b in range(B):
+             prefix_emb = text_embeddings[b, : sa_idx + 1]
+             postfix_emb = text_embeddings[b, ea_idx:]
+             emb = torch.cat([prefix_emb, audio_embeddings[b], postfix_emb], dim=0)
+             prefix_mask = text_att_mask[b, : sa_idx + 1]
+             postfix_mask = text_att_mask[b, ea_idx:]
+             mask = torch.cat([prefix_mask, audio_mask[b], postfix_mask], dim=0)
+             inputs_embeds.append(emb)
+             full_att_mask.append(mask)
+         inputs_embeds = torch.nn.utils.rnn.pad_sequence(
+             inputs_embeds, batch_first=True, padding_value=0.0
+         )
+         att_mask = torch.nn.utils.rnn.pad_sequence(
+             full_att_mask, batch_first=True, padding_value=0
+         )
+         gen_ids = self.llm.generate(
+             inputs_embeds=inputs_embeds,
+             attention_mask=att_mask,
+             max_new_tokens=max_new_tokens,
+             eos_token_id=self.tokenizer.eos_token_id,
+             **kwargs,
+         )
+         if return_tokens:
+             return gen_ids[0] if single else gen_ids
+         txt = self.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
+         return txt[0] if single else txt
+
+     def save_pretrained(self, save_directory, **kwargs):
+         os.makedirs(save_directory, exist_ok=True)
+         self.config.save_pretrained(save_directory)
+         state_dict = self.state_dict()
+         torch.save(state_dict, os.path.join(save_directory, "pytorch_model.bin"))
+         self.tokenizer.save_pretrained(save_directory)
+         extractor = WhisperFeatureExtractor.from_pretrained(
+             self.config.whisper_encoder_name
+         )
+         extractor.save_pretrained(save_directory)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+         config = BorealisConfig.from_pretrained(pretrained_model_name_or_path)
+         tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
+         language_model = AutoModelForCausalLM.from_pretrained(config.llm_name)
+         model = cls(config, language_model=language_model, tokenizer=tokenizer)
+         state_dict_path = hf_hub_download(
+             repo_id=pretrained_model_name_or_path, filename="pytorch_model.bin"
+         )
+         state_dict = torch.load(state_dict_path, map_location="cpu")
+         model.load_state_dict(state_dict)
+         return model
+
+
+ BorealisForConditionalGeneration.register_for_auto_class("AutoModelForCausalLM")
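The `_downsample` reshape is the heart of the adapter input: pad the time axis up to a multiple of `k`, then fold each group of `k` consecutive encoder frames into one wider vector. A standalone sketch of the same operation (the 1500 x 1280 input shape is an assumed whisper-large-v3 encoder output):

```python
import math

import torch
import torch.nn.functional as F


def downsample(seq: torch.Tensor, k: int = 4) -> torch.Tensor:
    # (T, d) -> (ceil(T / k), d * k), zero-padding T up to a multiple of k.
    T, d = seq.shape
    target = k * math.ceil(T / k)
    if target != T:
        seq = F.pad(seq, (0, 0, 0, target - T))
    return seq.contiguous().view(target // k, d * k)


x = torch.randn(1500, 1280)
print(downsample(x).shape)  # torch.Size([375, 5120])
```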
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "chunk_length": 30,
+   "dither": 0.0,
+   "feature_extractor_type": "WhisperFeatureExtractor",
+   "feature_size": 128,
+   "hop_length": 160,
+   "n_fft": 400,
+   "n_samples": 480000,
+   "nb_max_frames": 3000,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "WhisperProcessor",
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
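These values are mutually consistent for a 30 s window at 16 kHz, and they match the `max_length=480_000` padding used in the README example. A one-line check:

```python
sampling_rate, chunk_length, hop_length = 16_000, 30, 160
n_samples = sampling_rate * chunk_length  # 480000
nb_max_frames = n_samples // hop_length   # 3000
assert (n_samples, nb_max_frames) == (480_000, 3_000)
```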
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:babfecb8b4346c60c9b3fe01e38186bfec189db26626c37d169c008b57fba8ff
+ size 2272601487
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<|start_of_audio|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|end_of_audio|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|vision_pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55c7fad3b807310f01cead0edd8fa225070d199053eb0649e31f58a1caf09aa2
+ size 11422284
tokenizer_config.json ADDED
@@ -0,0 +1,213 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<|start_of_audio|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151666": {
+       "content": "<|end_of_audio|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|start_of_audio|>",
+     "<|end_of_audio|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 32768,
+   "pad_token": "<|vision_pad|>",
+   "padding_side": "left",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff