mujtaba025 committed on
Commit
0b059aa
·
verified ·
1 Parent(s): 333b3e7

Upload processing_minicpmo.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. processing_minicpmo.py +508 -0
processing_minicpmo.py ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The OpenBMB Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for MiniCPMO.
17
+ """
18
+
19
+ import math
20
+ import re
21
+ from typing import List
22
+ from typing import Literal
23
+ from typing import Optional
24
+ from typing import Union
25
+
26
+ import numpy as np
27
+ import torch
28
+ import torchaudio
29
+ from transformers.image_utils import ImageInput
30
+ from transformers.processing_utils import ProcessorMixin
31
+ from transformers.tokenization_utils_base import PreTokenizedInput
32
+ from transformers.tokenization_utils_base import TextInput
33
+ from transformers.utils import TensorType
34
+
35
+ try:
36
+ from image_processing_minicpmv import MiniCPMOBatchFeature
37
+ except ImportError:
38
+ from .image_processing_minicpmv import MiniCPMOBatchFeature
39
+
40
+
41
class MiniCPMOProcessor(ProcessorMixin):
    r"""
    Constructs a MiniCPMV processor which wraps a MiniCPMV image processor and a MiniCPMV tokenizer into a single processor.

    [`MiniCPMVProcessor`] offers all the functionalities of [`MiniCPMVImageProcessor`] and [`LlamaTokenizerWrapper`]. See the
    [`~MiniCPMVProcessor.__call__`] and [`~MiniCPMVProcessor.decode`] for more information.

    Args:
        image_processor ([`MiniCPMVImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerWrapper`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, feature_extractor=None, tokenizer=None):
        super().__init__(image_processor, feature_extractor, tokenizer)
        # Expose the image processor's version on the processor itself so callers
        # can query it without reaching into sub-processors.
        self.version = image_processor.version

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        images: ImageInput = None,
        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]] = None,
        audio_parts: Optional[list] = None,
        max_length: Optional[int] = None,
        do_pad: Optional[bool] = True,
        max_slice_nums: int = None,
        use_image_id: bool = True,
        chunk_input: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        sampling_rate: Optional[int] = 16000,
        **kwargs,
    ) -> MiniCPMOBatchFeature:
        """Preprocess text plus optional images and audios into model-ready inputs.

        Images go through the image processor, audios through the Whisper feature
        extractor; the text is then expanded so that each `(<image>./</image>)` /
        `(<audio>./</audio>)` tag is replaced by the matching placeholder tokens
        before tokenization.

        Returns:
            `MiniCPMOBatchFeature` with padded `input_ids`/`attention_mask`, pixel
            values and sizes, span bounds for image/audio/speaker tokens, and the
            extracted `audio_features` / `audio_feature_lens`.
        """
        if images is not None:
            image_inputs = self.image_processor(
                images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
            )
        else:
            image_inputs = None

        if audios is not None:
            audio_features, audio_feature_lens, audio_phs = self.audio_feature_extract(
                audios, audio_parts, chunk_input, sampling_rate
            )
        else:
            # No audio: empty features/lens and no audio placeholders per sample.
            audio_features, audio_feature_lens, audio_phs = [], [], []

        model_inputs = self._convert_omni_to_inputs(
            image_inputs,
            audio_phs,
            text,
            max_slice_nums=max_slice_nums,
            use_image_id=use_image_id,
            max_length=max_length,
            **kwargs,
        )

        model_inputs["audio_features"] = audio_features
        model_inputs["audio_feature_lens"] = audio_feature_lens

        return MiniCPMOBatchFeature(data={**model_inputs})

    def get_audio_placeholder(self, audio_lens, chunk_input, chunk_length):
        """Build the `<unk>`-based placeholder string for one audio of `audio_lens` samples.

        The placeholder length mirrors the audio encoder pipeline: STFT frames
        (hop_length), a stride-2 conv, then a stride-`pool_step` pooling. With
        `chunk_input=True` the placeholder is split into multiple
        audio_start/audio_end segments of `chunk_length` seconds each.
        """
        pool_step = 2
        # Number of fbank frames produced by the feature extractor.
        feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)

        # Stride-2 convolution, then average pooling by `pool_step`.
        feature_lens = (feature_lens - 1) // 2 + 1
        output_lens = (feature_lens - pool_step) // pool_step + 1

        if chunk_input:
            # 100 fbank frames per second (10 ms hop) — TODO confirm against extractor config.
            fbank_feat_in_chunk = int(chunk_length * 100)
            cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
            audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
            # Ceil-divide total embeds by per-chunk embeds.
            num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk

            place_holders = ""
            total_unk_len = 0
            for _ in range(num_audio_chunks):
                # Last chunk may be shorter than a full chunk.
                unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
                place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
                total_unk_len += unk_len
            audio_placeholder = place_holders
        else:
            audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end

        return audio_placeholder

    def audio_feature_extract(
        self,
        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
        audio_parts: Optional[list] = None,
        chunk_input: Optional[bool] = False,
        sampling_rate: Optional[int] = None,
        chunk_length: Optional[int] = 1,
        **kwargs,
    ):
        """Extract Whisper input features for a batch of audios.

        Accepts a single waveform, a flat list (one sample), or a nested list
        (batch of samples). When `audio_parts` is given, consecutive audios with
        the same part id are concatenated before feature extraction. Audios
        longer than 30 s are split into 30 s chunks.

        Returns:
            (audio_features, audio_feature_lens_list, audio_ph_list) — a padded
            feature tensor (or `[]`), per-sample length tensors, and per-sample
            placeholder strings.
        """
        # Normalize input to List[List[np.ndarray]] (batch of lists of waveforms).
        if isinstance(audios, np.ndarray):
            audios_list = [[audios]]
        elif isinstance(audios[0], np.ndarray):
            audios_list = [audios]
        else:
            audios_list = audios

        if audio_parts is not None:
            # NOTE: the loop variable `audios` intentionally shadows the parameter here.
            assert len(audio_parts) == len(audios_list)
            for parts, audios in zip(audio_parts, audios_list):
                assert len(parts) == len(audios)

        audio_feature_lens_list = []
        audio_ph_list = []

        audio_features_all = []

        # audio placeholder not dependent on audio_parts
        for audios in audios_list:
            if audios:
                audio_ph_list.append([self.get_audio_placeholder(len(a), chunk_input, chunk_length) for a in audios])
            else:
                audio_ph_list.append([])

        for idx, audios in enumerate(audios_list):
            if audio_parts is not None:
                # same audio part merge: concatenate consecutive audios sharing a part id.
                audio_part = audio_parts[idx]
                merge_audio = []
                cur_audio = []
                for aid, (part, audio) in enumerate(zip(audio_part, audios)):
                    if aid == 0 or audio_part[aid] == audio_part[aid - 1]:
                        cur_audio.append(audio)
                    else:
                        merge_audio.append(np.hstack(cur_audio))
                        cur_audio = [audio]
                if cur_audio:
                    merge_audio.append(np.hstack(cur_audio))

            else:
                merge_audio = audios

            audio_feature_lens = []

            # If the audio exceeds 30 seconds, split it into chunks every 30 seconds.
            final_merge_audio = []
            max_audio_inp_len = 30 * sampling_rate
            for audio in merge_audio:
                if len(audio) <= max_audio_inp_len:
                    final_merge_audio.append(audio)
                else:
                    for i in range(math.ceil(len(audio) / max_audio_inp_len)):
                        final_merge_audio.append(audio[i * max_audio_inp_len : (i + 1) * max_audio_inp_len])

            if audios:
                audio_inputs = self.feature_extractor(
                    final_merge_audio,
                    sampling_rate=sampling_rate,
                    return_attention_mask=True,
                    padding="max_length",
                    return_tensors="pt",
                    **kwargs,
                )
                audio_feature = audio_inputs["input_features"]
                # Un-padded frame count per chunk, from the attention mask.
                actual_lens = audio_inputs["attention_mask"].sum(dim=1)

                for feat, lens in zip(audio_feature, actual_lens):
                    # Trim the feature back to its real length before batching.
                    audio_features_all.append(feat[:, :lens])
                    audio_feature_lens.append(lens)

                audio_feature_lens = torch.hstack(audio_feature_lens)
                audio_feature_lens_list.append(audio_feature_lens)
            else:
                audio_feature_lens_list.append([])

        if audio_features_all:
            # Pad along the time axis: permute to (time, mel), pad, permute back.
            audio_features = [i.permute(1, 0) for i in audio_features_all]
            audio_features = torch.nn.utils.rnn.pad_sequence(
                audio_features, batch_first=True, padding_value=0.0
            ).permute(0, 2, 1)
        else:
            audio_features = []

        return audio_features, audio_feature_lens_list, audio_ph_list

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        output_ids = args[0]
        result_text = []
        for result in output_ids:
            # Drop padding; assumes token id 0 is padding — TODO confirm against tokenizer.
            result = result[result != 0]
            if result[0] == self.tokenizer.bos_id:
                result = result[1:]
            if result[-1] == self.tokenizer.eos_id:
                result = result[:-1]
            result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
        return result_text
        # return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        result = args[0]
        # Drop padding; assumes token id 0 is padding — TODO confirm against tokenizer.
        result = result[result != 0]
        if result[0] == self.tokenizer.bos_id:
            result = result[1:]
        if result[-1] == self.tokenizer.eos_id or (
            hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id
        ):
            result = result[:-1]
        return self.tokenizer.decode(result, *args[1:], **kwargs).strip()

    def _convert(self, input_str, max_inp_length: Optional[int] = None, **kwargs):
        """Tokenize an expanded prompt and locate image/audio/speaker token spans.

        Returns `(input_ids, image_bounds, audio_bounds, spk_bounds)` where each
        bounds tensor holds `[start, end)`-style index pairs (start is shifted
        one past the opening marker token).
        """
        input_ids = self.tokenizer.encode(input_str, **kwargs)
        if max_inp_length is not None:
            # Hard truncation; may cut through a placeholder span — TODO confirm intended.
            input_ids = input_ids[:max_inp_length]
        input_ids = torch.tensor(input_ids, dtype=torch.int32)

        ## image bound
        start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
        end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)

        image_start_idx = torch.where(start_cond)[0]
        # Content starts just after the opening marker.
        image_start_idx += 1
        image_end_idx = torch.where(end_cond)[0]

        valid_image_nums = max(len(image_start_idx), len(image_end_idx))

        image_bounds = torch.hstack(
            [
                image_start_idx[:valid_image_nums].unsqueeze(-1),
                image_end_idx[:valid_image_nums].unsqueeze(-1),
            ]
        )

        ## audio bound
        audio_start_idx = torch.where(input_ids == self.tokenizer.audio_start_id)[0]
        audio_end_idx = torch.where(input_ids == self.tokenizer.audio_end_id)[0]
        assert len(audio_start_idx) == len(audio_end_idx)
        audio_bounds = torch.hstack([(audio_start_idx + 1).unsqueeze(-1), audio_end_idx.unsqueeze(-1)])

        # Speaker-embedding spans, delimited by spk_start/spk_end tokens.
        spk_start_idx = torch.where(input_ids == self.tokenizer.spk_start_id)[0]
        spk_end_idx = torch.where(input_ids == self.tokenizer.spk_end_id)[0]
        assert len(spk_start_idx) == len(spk_end_idx)
        spk_bounds = torch.hstack([(spk_start_idx + 1).unsqueeze(-1), spk_end_idx.unsqueeze(-1)])

        return input_ids, image_bounds, audio_bounds, spk_bounds

    def _convert_omni_to_inputs(
        self,
        images,
        audio_phs,
        texts: Union[str, List[str]],
        truncation=None,
        max_length=None,
        max_slice_nums=None,
        use_image_id=None,
        return_tensors=None,
        **kwargs,
    ):
        """Expand image/audio tags in `texts` into placeholders, tokenize, and left-pad.

        Each `(<image>./</image>)` / `(<audio>./</audio>)` tag is replaced, in
        order, by the corresponding placeholder string; bound indices are then
        shifted by the per-sample left-padding amount so they stay valid after
        batching.
        """
        if images is None and audio_phs is None:
            # Text-only fast path: plain tokenization, no bounds bookkeeping.
            model_inputs = self.tokenizer(
                texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
            )
            return MiniCPMOBatchFeature(data={**model_inputs})

        image_tag = "(<image>./</image>)"
        image_pattern = "\(<image>./</image>\)"
        audio_tag = "(<audio>./</audio>)"
        audio_pattern = "\(<audio>./</audio>\)"
        split_pattern = f"({image_pattern}|{audio_pattern})"

        if isinstance(texts, str):
            texts = [texts]

        bs = len(texts)
        if images is not None:
            images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
        else:
            # NOTE: [[]] * bs repeats the SAME empty list object; safe only because
            # these lists are never mutated below.
            images, image_sizes, tgt_sizes = [[]] * bs, [[]] * bs, [[]] * bs

        input_ids_list = []
        image_bounds_list = []
        audio_bounds_list = []
        spk_bounds_list = []

        for index, text in enumerate(texts):
            # Capturing group in split_pattern keeps the tags in text_chunks.
            text_chunks = re.split(split_pattern, text)

            image_tags = re.findall(image_pattern, text)
            audio_tags = re.findall(audio_pattern, text)

            if image_tags:
                assert images is not None
                assert len(image_tags) == len(image_sizes[index])
            if audio_tags:
                assert audio_phs is not None
                assert len(audio_tags) == len(audio_phs[index])

            image_id = 0
            audio_id = 0
            for i, chunk in enumerate(text_chunks):
                if chunk == image_tag:
                    image_placeholder = self.image_processor.get_slice_image_placeholder(
                        image_sizes[index][image_id], image_id, max_slice_nums, use_image_id
                    )
                    image_id += 1
                    text_chunks[i] = image_placeholder
                elif chunk == audio_tag:
                    audio_placeholder = audio_phs[index][audio_id]
                    audio_id += 1
                    text_chunks[i] = audio_placeholder

            final_text = "".join(text_chunks)
            input_ids, image_bounds, audio_bounds, spk_bounds = self._convert(final_text, max_length, **kwargs)

            input_ids_list.append(input_ids)
            image_bounds_list.append(image_bounds)
            audio_bounds_list.append(audio_bounds)
            spk_bounds_list.append(spk_bounds)

        padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
        attention_mask = torch.ones_like(padded_input_ids, dtype=torch.bool)
        for i, length in enumerate(padding_lengths):
            # Left padding shifts every token index right by `length`.
            image_bounds_list[i] = image_bounds_list[i] + length
            audio_bounds_list[i] = audio_bounds_list[i] + length
            spk_bounds_list[i] = spk_bounds_list[i] + length
            attention_mask[i, :length] = False

        data = {
            "input_ids": padded_input_ids,
            "attention_mask": attention_mask,
            "pixel_values": images,
            "image_sizes": image_sizes,
            "image_bound": image_bounds_list,
            "tgt_sizes": tgt_sizes,
            "audio_bounds": audio_bounds_list,
            "spk_bounds": spk_bounds_list,
        }

        return data

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        # dict.fromkeys deduplicates while preserving order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + feature_extractor_input_names))

    def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
        """Pad a list of 1-D (or 2-D) tensors to a common length along the last dim.

        Returns `(tensor, padding_length)` where `padding_length[i]` is how many
        padding positions were added for item i (0 when no padding was needed).
        """
        items = []
        # Accept either a flat list of tensors or a list of lists (flattened here).
        if isinstance(inputs[0], list):
            assert isinstance(inputs[0][0], torch.Tensor)
            for it in inputs:
                for tr in it:
                    items.append(tr)
        else:
            assert isinstance(inputs[0], torch.Tensor)
            items = inputs

        batch_size = len(items)
        shape = items[0].shape
        dim = len(shape)
        assert dim <= 2
        if max_length is None:
            max_length = 0
        max_length = max(max_length, max(item.shape[-1] for item in items))
        min_length = min(item.shape[-1] for item in items)
        dtype = items[0].dtype

        if dim == 0:
            # Scalars: nothing to pad. NOTE(review): returns [0] not [0]*batch_size — confirm callers.
            return torch.stack([item for item in items], dim=0), [0]
        elif dim == 1:
            if max_length == min_length:
                # All same length: stack directly, no padding anywhere.
                return torch.stack([item for item in items], dim=0), [0] * batch_size
            tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
        else:
            tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value

        padding_length = []
        for i, item in enumerate(items):
            if dim == 1:
                if padding_side == "left":
                    tensor[i, -len(item) :] = item.clone()
                else:
                    tensor[i, : len(item)] = item.clone()
            elif dim == 2:
                if padding_side == "left":
                    tensor[i, -len(item) :, :] = item.clone()
                else:
                    tensor[i, : len(item), :] = item.clone()
            padding_length.append(tensor.shape[-1] - len(item))

        return tensor, padding_length
444
+
445
+
446
class MelSpectrogramFeatures(torch.nn.Module):
    """Turn a raw waveform into log-mel spectrogram features.

    Thin wrapper over ``torchaudio.transforms.MelSpectrogram`` that floors the
    magnitude spectrogram at 1e-5 before taking the natural log, so silent
    frames never produce ``-inf``.
    """

    def __init__(
        self,
        sample_rate=24000,
        n_fft=1024,
        hop_length=256,
        n_mels=100,
        padding: Literal["center", "same"] = "center",
    ):
        super().__init__()
        if padding not in ("center", "same"):
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        # "center" framing is delegated to torchaudio; "same" disables centering.
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=(padding == "center"),
            power=1,  # magnitude (not power) spectrogram
        )

    def __call__(self, audio: torch.Tensor) -> torch.Tensor:
        """
        audio: Tensor([num_channels, num_samples])
        """
        return super().__call__(audio)

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        """
        audio: Tensor([num_channels, num_samples])
        """
        spectrogram = self.mel_spec(audio)
        # clamp is an alias of clip; the 1e-5 floor keeps log() finite on silence.
        return torch.clamp(spectrogram, min=1e-5).log()
481
+
482
+
483
class ChatTTSProcessor:
    """Prepare paired (text, audio) inputs for the TTS head.

    Each text is tokenized with `text_tokenizer` (no special tokens) and each
    1-D waveform is converted to a log-mel spectrogram via
    `MelSpectrogramFeatures`. Sequences are returned variable-length (no
    padding) — hence the `_varlen` suffix.
    """

    def __init__(self, text_tokenizer):
        self.audio_processor = MelSpectrogramFeatures()
        self.text_tokenizer = text_tokenizer

    def __call__(self, text_list, audio_list):
        """Process parallel lists of texts and waveforms.

        Args:
            text_list: list of strings, one per sample.
            audio_list: list of 1-D waveform tensors, aligned with `text_list`.

        Returns:
            dict with `tts_input_ids_varlen` (List[Tensor], each [seq_len]) and
            `tts_input_features_varlen` (List[Tensor], each [n_mels, seq_len_mel]).
        """
        assert len(text_list) == len(audio_list)

        input_ids_varlen = []
        for text in text_list:
            input_ids_ = self.text_tokenizer.encode(text, return_tensors="pt", add_special_tokens=False)  # [1, seq_len]
            input_ids_varlen.append(input_ids_.squeeze(0))  # [seq_len]

        audio_features_varlen = []
        for audio in audio_list:
            assert audio.ndim == 1  # expect a mono waveform [num_samples]
            # Removed a no-op `try: ... except Exception as e: raise e` wrapper.
            mel = self.audio_processor(audio)  # [100(num_mel_bins), seq_len_mel]
            audio_features_varlen.append(mel)

        return {
            "tts_input_ids_varlen": input_ids_varlen,  # return List[Tensor]
            "tts_input_features_varlen": audio_features_varlen,  # return List[Tensor]
        }