fix bug

Browse files

Files changed (6) hide show

config.json +5 -6
configuration.json +3 -1
modeling_qwen.py +8 -9
qwen_generation_utils.py +0 -1
tokenization_qwen.py +123 -138
tokenizer_config.json +1 -1

config.json CHANGED Viewed

@@ -1,12 +1,11 @@
 {
-  "_name_or_path": "10302244_iter8000_final/",
   "architectures": [
     "QWenLMHeadModel"
   ],
   "attn_dropout_prob": 0.0,
   "audio": {
     "add_audio_bos_eos_token": true,
-    "audio_start_id": 155164,
     "avg_pool": true,
     "n_ctx": 1500,
     "n_head": 20,
@@ -19,7 +18,7 @@
     "AutoConfig": "configuration_qwen.QWenConfig",
     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
   },
-  "bf16": true,
   "emb_dropout_prob": 0.0,
   "fp16": false,
   "fp32": false,
@@ -27,8 +26,8 @@
   "initializer_range": 0.02,
   "intermediate_size": 22016,
   "kv_channels": 128,
-  "layer_norm_epsilon": 1e-05,
-  "max_position_embeddings": 8192,
   "model_type": "qwen",
   "no_bias": true,
   "num_attention_heads": 32,
@@ -47,7 +46,7 @@
   "use_cache_kernel": false,
   "use_cache_quantization": false,
   "use_dynamic_ntk": true,
-  "use_flash_attn": true,
   "use_logn_attn": true,
   "vocab_size": 155947
 }

 {
   "architectures": [
     "QWenLMHeadModel"
   ],
   "attn_dropout_prob": 0.0,
   "audio": {
     "add_audio_bos_eos_token": true,
+    "audio_start_id": 155163,
     "avg_pool": true,
     "n_ctx": 1500,
     "n_head": 20,
     "AutoConfig": "configuration_qwen.QWenConfig",
     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
   },
+  "bf16": false,
   "emb_dropout_prob": 0.0,
   "fp16": false,
   "fp32": false,
   "initializer_range": 0.02,
   "intermediate_size": 22016,
   "kv_channels": 128,
+  "layer_norm_epsilon": 1e-06,
+  "max_position_embeddings": 2048,
   "model_type": "qwen",
   "no_bias": true,
   "num_attention_heads": 32,
   "use_cache_kernel": false,
   "use_cache_quantization": false,
   "use_dynamic_ntk": true,
+  "use_flash_attn": "auto",
   "use_logn_attn": true,
   "vocab_size": 155947
 }

configuration.json CHANGED Viewed

	@@ -1 +1,3 @@
1	- {"framework":"Pytorch",~~"task":"multimodal-dialogue"}~~

+{"framework":"Pytorch",
+ "task":"multimodal-dialogue",
+ "allow_remote": true}

modeling_qwen.py CHANGED Viewed

@@ -1015,20 +1015,18 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             self.lm_head.half()
         self.post_init()
     @classmethod
     def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
-        *model_args,
-        config = None,
-        cache_dir = None,
-        **kwargs,
     ):
         if os.path.isdir(pretrained_model_name_or_path):
             # Local Directory of Models
             mel_filters_path = os.path.join(pretrained_model_name_or_path, 'mel_filters.npz')
-            print(mel_filters_path)
             tgt_cache_path = os.path.join(os.path.dirname(__file__), 'mel_filters.npz')
             shutil.copy(mel_filters_path, tgt_cache_path)
         else:
@@ -1036,7 +1034,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             from huggingface_hub import hf_hub_download
             hf_hub_download(repo_id=pretrained_model_name_or_path, filename="mel_filters.npz",
                             token=kwargs.get('token', None), local_dir=os.path.dirname(__file__))
-        return super().from_pretrained(pretrained_model_name_or_path, *model_args, config=config, cache_dir=cache_dir, **kwargs)
     def get_output_embeddings(self):
         return self.lm_head

             self.lm_head.half()
         self.post_init()
     @classmethod
     def from_pretrained(
+            cls,
+            pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+            *model_args,
+            config=None,
+            cache_dir=None,
+            **kwargs,
     ):
         if os.path.isdir(pretrained_model_name_or_path):
             # Local Directory of Models
             mel_filters_path = os.path.join(pretrained_model_name_or_path, 'mel_filters.npz')
             tgt_cache_path = os.path.join(os.path.dirname(__file__), 'mel_filters.npz')
             shutil.copy(mel_filters_path, tgt_cache_path)
         else:
             from huggingface_hub import hf_hub_download
             hf_hub_download(repo_id=pretrained_model_name_or_path, filename="mel_filters.npz",
                             token=kwargs.get('token', None), local_dir=os.path.dirname(__file__))
+        return super().from_pretrained(pretrained_model_name_or_path, *model_args, config=config, cache_dir=cache_dir,
+                                       **kwargs)
     def get_output_embeddings(self):
         return self.lm_head

qwen_generation_utils.py CHANGED Viewed

@@ -186,7 +186,6 @@ def make_context(
             + nl_tokens
         )
         raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
-        print(raw_text)
         audio_info = tokenizer.process_audio(raw_text)
     elif chat_format == "raw":

             + nl_tokens
         )
         raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
         audio_info = tokenizer.process_audio(raw_text)
     elif chat_format == "raw":

tokenization_qwen.py CHANGED Viewed

@@ -17,13 +17,11 @@ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Opt
 import tiktoken
 import numpy as np
-from PIL import Image
-from PIL import ImageFont
-from PIL import ImageDraw
 from transformers import PreTrainedTokenizer, AddedToken
 from transformers.utils import try_to_load_from_cache
-from transformers.tokenization_utils_base import BatchEncoding,PaddingStrategy,TruncationStrategy,\
-    TextInput,TextInputPair,PreTokenizedInput,PreTokenizedInputPair,TensorType, EncodedInput, EncodedInputPair
 import matplotlib.colors as mcolors
 from matplotlib.font_manager import FontProperties
@@ -31,7 +29,6 @@ from .audio import *
 logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
 PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
@@ -43,11 +40,11 @@ IMEND = "<|im_end|>"
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
 SPECIAL_TOKENS = (
-    ENDOFTEXT,
-    IMSTART,
-    IMEND,
-) + EXTRAS
-IMG_TOKEN_SPAN = 256
 LANGUAGES = {
     "en": "english",
     "zh": "chinese",
@@ -68,23 +65,25 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 def _list_find(
-    input_list: List[Any],
-    candidates: Tuple[Any],
-    start: int = 0,
 ):
     for i in range(start, len(input_list)):
         if input_list[i] in candidates:
             return i
     return -1
 def _replace_closed_tag(
-    input_tokens: List[Any],
-    start_tags: Union[Any, Tuple[Any]],
-    end_tags: Union[Any, Tuple[Any]],
-    inclusive_replace_func: Callable,
-    exclusive_replace_func: Callable = lambda x: x,
-    audio_info: Dict = None
 ):
     if isinstance(start_tags, (str, int)):
         start_tags = (start_tags,)
@@ -99,107 +98,93 @@ def _replace_closed_tag(
         start = _list_find(input_tokens, start_tags, end)
         if start == -1:
             break
-        output_tokens.extend(exclusive_replace_func(input_tokens[end : start]))
         tag_idx = start_tags.index(input_tokens[start])
         end = _list_find(input_tokens, (end_tags[tag_idx],), start)
         if end == -1:
-            raise ValueError("Unclosed image token")
-        output_tokens.extend(inclusive_replace_func(input_tokens[start : end + 1], audio_info, audio_idx))
         end += 1
         audio_idx += 1
-    output_tokens.extend(exclusive_replace_func(input_tokens[end : ]))
     return output_tokens
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
     vocab_files_names = VOCAB_FILES_NAMES
     def __init__(
-        self,
-        vocab_file,
-        errors="replace",
-        audio_start_tag='<audio>',
-        audio_end_tag='</audio>',
-        **kwargs,
     ):
         super().__init__(**kwargs)
         self.audio_start_tag = audio_start_tag
         self.audio_end_tag = audio_end_tag
         self.audio_pad_tag = "[[[AUDIO:modality]]]"
-        self.IMAGE_ST = ("<ref>", "</ref>", "<box>", "</box>", "<quad>", "</quad>")
         self.AUDIO_ST = (
             '[[[AUDIO:modality]]]',
-            "<|startoftranscript|>",  # 按时间线
-            "<|startofcaption|>",  # 不按时间线
-            # 五大任务 [ASR,ST,AAC,keyword,AQA]
             "<|translate|>",
             "<|transcribe|>",
             "<|caption|>",
             "<|keyword|>",
-            # 语言
-            "<|unknown|>",  # 未知语言
             *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-            "<|zh_tw|>",  # 繁体中文
-            # 时间戳相关
             "<|notimestamps|>",
             "<|sil|>",
             "<|timestamps|>",
-            *[f"<|{i * 0.01:.2f}|>" for i in range(3001)],
-            # text风格
-            "<|caption_audiocaps|>",  # for audiocaps刷分
-            "<|caption_clotho|>",  # for clotho刷分
-            "<|audioset_ontology|>",  # audioset体系风格
-            "<|caption_plain|>",  # 其他caption数据集
-            "<|itn|>",  # 加标点
-            "<|wo_itn|>",  # 不加标点
-            # 特殊任务——实体识别
             "<|startofentityvalue|>",
             "<|endofentityvalue|>",
             "<|startofentitytype|>",
             "<|endofentitytype|>",
-            "<|named_entity_recognition|>",
-            # 特殊任务——audiogrounding
-            "<|grounding|>",
             "<|startofword|>",
             "<|endofword|>",
-            "<|delim|>",  # 分隔时间戳pair对
-            # 子任务--SER
-            "<|emotion_recognition|>",
-            # 子任务--音乐描述
-            "<|music_description|>",
-            # 子任务--note analysis
-            "<|note_analysis|>",
-            "<|pitch|>",
-            *[f"<|midi_pitch_{i}|>" for i in range(128)],  # midi音符
-            "<|velocity|>",
-            *[f"<|midi_velocity_{i}|>" for i in range(128)],  # midi力度
-            "<|sonic|>",
-            "<|instrument|>",
-            # 子类别--说话人
-            "<|speaker_meta|>",
-            # 子类别--song
-            "<|song_meta|>",
-            # 特殊任务--AQA
-            "<|question|>",
-            "<|answer|>",
-            "<|choice|>",
-            # 子任务--场景识别
-            "<|scene|>",
-            # 子任务--event
-            "<|event|>",
-            # 子任务--vocal_classification
-            "<|vocal_classification|>",
-            # 特殊任务--SLU
-            "<|speech_understanding|>",
-            "<|scenario|>",
-            "<|action|>",
-            "<|entities|>",
-            # 子任务--语音编辑
-            "<|speech_edit|>",
-            # 子任务--命令
-            "<|speech_command|>",
             audio_start_tag,
             audio_end_tag
         )
@@ -210,9 +195,8 @@ class QWenTokenizer(PreTrainedTokenizer):
         self.special_tokens = {
             token: index
             for index, token in enumerate(
-                # SPECIAL_TOKENS + self.IMAGE_ST + self.AUDIO_ST, start=len(self.mergeable_ranks)
                 SPECIAL_TOKENS + self.AUDIO_ST, start=len(self.mergeable_ranks)
             )
         }
         self.audio_start_id = self.special_tokens[self.audio_start_tag]
@@ -229,7 +213,7 @@ class QWenTokenizer(PreTrainedTokenizer):
             special_tokens=self.special_tokens,
         )
         assert (
-            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
         self.decoder = {
@@ -260,7 +244,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         )
         self.tokenizer = enc
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
@@ -268,7 +251,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         return self.mergeable_ranks
     def convert_tokens_to_ids(
-        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
     ) -> List[int]:
         ids = []
         if isinstance(tokens, (str, bytes)):
@@ -288,7 +271,7 @@ class QWenTokenizer(PreTrainedTokenizer):
             raise ValueError('Adding regular tokens is not supported')
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST+ self.AUDIO_ST:
                 raise ValueError('Adding unknown special tokens is not supported')
         return 0
@@ -307,12 +290,12 @@ class QWenTokenizer(PreTrainedTokenizer):
         return (file_path,)
     def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        audio_info: Dict = None,
-        **kwargs,
     ) -> List[Union[bytes, str]]:
         """
         Converts a string in a sequence of tokens.
@@ -338,44 +321,46 @@ class QWenTokenizer(PreTrainedTokenizer):
         # this implementation takes a detour: text -> token id -> token surface forms
         for t in self.tokenizer.encode(
-            text, allowed_special=allowed_special, disallowed_special=disallowed_special
         ):
             tokens.append(self.decoder[t])
         def _encode_audiourl(audio_tokens, audio_info, audio_idx):
             assert audio_tokens[0] == self.audio_start_tag and audio_tokens[-1] == self.audio_end_tag
             audio_token_span = audio_info['audio_span_tokens'][audio_idx]
-            out_audio_tokens = [self.audio_start_tag] + [self.audio_pad_tag]*(audio_token_span-2) + [self.audio_end_tag]
             return out_audio_tokens
-        return _replace_closed_tag(tokens, self.audio_start_tag, self.audio_end_tag, _encode_audiourl, audio_info=audio_info)
     def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            List[TextInput],
-            List[TextInputPair],
-            List[PreTokenizedInput],
-            List[PreTokenizedInputPair],
-            List[EncodedInput],
-            List[EncodedInputPair],
-        ],
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
     ) -> BatchEncoding:
         def get_input_ids(text):
@@ -409,7 +394,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         for pair_id in range(len(batch_text_or_text_pairs)):
             kwargs['audio_info'] = audio_info[pair_id]
             ids_or_pair_ids = batch_text_or_text_pairs[pair_id]
-        # for ids_or_pair_ids in batch_text_or_text_pairs:
             if not isinstance(ids_or_pair_ids, (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
             elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
@@ -488,23 +473,23 @@ class QWenTokenizer(PreTrainedTokenizer):
         raise NotImplementedError
     def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
     ) -> str:
         if isinstance(token_ids, int):
             token_ids = [token_ids]
         audio_info = kwargs.pop("audio_info", None)
         def _decode_audiourl(audio_token_ids, audio_info, audio_idx):
             assert audio_token_ids[0] == self.audio_start_id and audio_token_ids[-1] == self.audio_end_id
             audio_url = audio_info["audio_urls"][audio_idx]
             return [self.audio_start_id] + self.tokenizer.encode(audio_url) + [self.audio_end_id]
-        token_ids = _replace_closed_tag(token_ids, self.audio_start_id, self.audio_end_id, _decode_audiourl, audio_info=audio_info)
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.eod_id]
@@ -513,7 +498,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def to_list_format(self, text: str):
         text = unicodedata.normalize("NFC", text)
         token_ids = self.tokenizer.encode(
-            text, allowed_special=set(self.IMAGE_ST + self.AUDIO_ST + (ENDOFTEXT,)))
         def _encode_audio_info(tokens):
             if len(tokens) == 0:
@@ -561,10 +546,10 @@ class QWenTokenizer(PreTrainedTokenizer):
     def process_audio(self, text):
         audio_urls = self.extract_audio_urls(text)
-        if len(audio_urls)> 0:
             audios, audio_lens, audio_span_tokens = [], [], []
             for audio_path in audio_urls:
-                if audio_path.startswith("http://") or audio_path.startswith("https://"): # http
                     data = bytes(requests.get(audio_path, stream=True).content)
                     audio = load_bytesio_audio(data)
                 else:
@@ -578,7 +563,7 @@ class QWenTokenizer(PreTrainedTokenizer):
                 audio_len = [audio_len_after_cnn, audio_token_num]
                 audios.append(mel)
                 audio_lens.append(audio_len)
-                audio_span_tokens.append(audio_token_num+2) # add audio bos eos
             input_audio_lengths = torch.IntTensor(audio_lens)
             input_audios = torch.stack(audios, dim=0)
             return {"input_audios": input_audios,

 import tiktoken
 import numpy as np
 from transformers import PreTrainedTokenizer, AddedToken
 from transformers.utils import try_to_load_from_cache
+from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy, \
+    TextInput, TextInputPair, PreTokenizedInput, PreTokenizedInputPair, TensorType, EncodedInput, EncodedInputPair
 import matplotlib.colors as mcolors
 from matplotlib.font_manager import FontProperties
 logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
 PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
 SPECIAL_TOKENS = (
+                     ENDOFTEXT,
+                     IMSTART,
+                     IMEND,
+                 ) + EXTRAS
 LANGUAGES = {
     "en": "english",
     "zh": "chinese",
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 def _list_find(
+        input_list: List[Any],
+        candidates: Tuple[Any],
+        start: int = 0,
 ):
     for i in range(start, len(input_list)):
         if input_list[i] in candidates:
             return i
     return -1
 def _replace_closed_tag(
+        input_tokens: List[Any],
+        start_tags: Union[Any, Tuple[Any]],
+        end_tags: Union[Any, Tuple[Any]],
+        inclusive_replace_func: Callable,
+        exclusive_replace_func: Callable = lambda x: x,
+        audio_info: Dict = None
 ):
     if isinstance(start_tags, (str, int)):
         start_tags = (start_tags,)
         start = _list_find(input_tokens, start_tags, end)
         if start == -1:
             break
+        output_tokens.extend(exclusive_replace_func(input_tokens[end: start]))
         tag_idx = start_tags.index(input_tokens[start])
         end = _list_find(input_tokens, (end_tags[tag_idx],), start)
         if end == -1:
+            raise ValueError("Unclosed audio token")
+        output_tokens.extend(inclusive_replace_func(input_tokens[start: end + 1], audio_info, audio_idx))
         end += 1
         audio_idx += 1
+    output_tokens.extend(exclusive_replace_func(input_tokens[end:]))
     return output_tokens
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
     vocab_files_names = VOCAB_FILES_NAMES
     def __init__(
+            self,
+            vocab_file,
+            errors="replace",
+            audio_start_tag='<audio>',
+            audio_end_tag='</audio>',
+            **kwargs,
     ):
         super().__init__(**kwargs)
         self.audio_start_tag = audio_start_tag
         self.audio_end_tag = audio_end_tag
         self.audio_pad_tag = "[[[AUDIO:modality]]]"
         self.AUDIO_ST = (
             '[[[AUDIO:modality]]]',
+            # Transcription Tag
+            "<|startoftranscript|>",  # Transcription
+            "<|startofanalysis|>",  # Analysis
+            # Task Tag
             "<|translate|>",
             "<|transcribe|>",
             "<|caption|>",
             "<|keyword|>",
+            # Language Tag
+            "<|unknown|>",  # unknown language
             *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+            "<|zh_tr|>",  # traditional Chinese
+            # Timestamps Tag
             "<|notimestamps|>",
             "<|sil|>",
             "<|timestamps|>",
+            *[f"<|{i * 0.01:.2f}|>" for i in range(3001)],  # timestamps 0.00-30.00
+            # Output Instruction
+            "<|caption_audiocaps|>",  # Audiocaps caption style
+            "<|caption_clotho|>",  # Clotho caption style
+            "<|audioset_ontology|>",  # Audioset ontology style
+            "<|caption_plain|>",  # plain caption
+            "<|itn|>",  # with inverse text normalization
+            "<|wo_itn|>",  # without inverse text normalization
             "<|startofentityvalue|>",
             "<|endofentityvalue|>",
             "<|startofentitytype|>",
             "<|endofentitytype|>",
+            "<|named_entity_recognition|>",  # named entity recognition task
+            "<|audio_grounding|>",
             "<|startofword|>",
             "<|endofword|>",
+            "<|delim|>",  # delimiter of timestamps pair in audio grounding
+            "<|emotion_recognition|>",  # emotion recognition
+            "<|music_description|>",  # music description
+            "<|note_analysis|>",  # note analysis
+            "<|pitch|>",  # note analysis: pitch
+            *[f"<|midi_pitch_{i}|>" for i in range(128)],  # midi pitch 0-127
+            "<|velocity|>",  # note analysis: velocity
+            *[f"<|midi_velocity_{i}|>" for i in range(128)],  # midi velocity 0-127
+            "<|sonic|>",  # note analysis:  sonic
+            "<|instrument|>",  # note analysis:  instrument
+            "<|speaker_meta|>",  # meta information of speaker
+            "<|song_meta|>",  # meta information of song
+            "<|question|>",  # AQA: question
+            "<|answer|>",  # AQA: answer
+            "<|choice|>",  # AQA: answer choice
+            "<|scene|>",  # scene recognition
+            "<|event|>",  # sound event
+            "<|vocal_classification|>",  # vocal classification
+            "<|speech_understanding|>",  # speech language understanding
+            "<|scenario|>",  # speech language understanding: scenario
+            "<|action|>",  # speech language understanding: action
+            "<|entities|>",  # speech language understanding: entities
+            "<|speech_edit|>",  # speech edit
             audio_start_tag,
             audio_end_tag
         )
         self.special_tokens = {
             token: index
             for index, token in enumerate(
                 SPECIAL_TOKENS + self.AUDIO_ST, start=len(self.mergeable_ranks)
             )
         }
         self.audio_start_id = self.special_tokens[self.audio_start_tag]
             special_tokens=self.special_tokens,
         )
         assert (
+                len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
         self.decoder = {
         )
         self.tokenizer = enc
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
         return self.mergeable_ranks
     def convert_tokens_to_ids(
+            self, tokens: Union[bytes, str, List[Union[bytes, str]]]
     ) -> List[int]:
         ids = []
         if isinstance(tokens, (str, bytes)):
             raise ValueError('Adding regular tokens is not supported')
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in SPECIAL_TOKENS  + self.AUDIO_ST:
                 raise ValueError('Adding unknown special tokens is not supported')
         return 0
         return (file_path,)
     def tokenize(
+            self,
+            text: str,
+            allowed_special: Union[Set, str] = "all",
+            disallowed_special: Union[Collection, str] = (),
+            audio_info: Dict = None,
+            **kwargs,
     ) -> List[Union[bytes, str]]:
         """
         Converts a string in a sequence of tokens.
         # this implementation takes a detour: text -> token id -> token surface forms
         for t in self.tokenizer.encode(
+                text, allowed_special=allowed_special, disallowed_special=disallowed_special
         ):
             tokens.append(self.decoder[t])
         def _encode_audiourl(audio_tokens, audio_info, audio_idx):
             assert audio_tokens[0] == self.audio_start_tag and audio_tokens[-1] == self.audio_end_tag
             audio_token_span = audio_info['audio_span_tokens'][audio_idx]
+            out_audio_tokens = [self.audio_start_tag] + [self.audio_pad_tag] * (audio_token_span - 2) + [
+                self.audio_end_tag]
             return out_audio_tokens
+        return _replace_closed_tag(tokens, self.audio_start_tag, self.audio_end_tag, _encode_audiourl,
+                                   audio_info=audio_info)
     def _batch_encode_plus(
+            self,
+            batch_text_or_text_pairs: Union[
+                List[TextInput],
+                List[TextInputPair],
+                List[PreTokenizedInput],
+                List[PreTokenizedInputPair],
+                List[EncodedInput],
+                List[EncodedInputPair],
+            ],
+            add_special_tokens: bool = True,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+            max_length: Optional[int] = None,
+            stride: int = 0,
+            is_split_into_words: bool = False,
+            pad_to_multiple_of: Optional[int] = None,
+            return_tensors: Optional[Union[str, TensorType]] = None,
+            return_token_type_ids: Optional[bool] = None,
+            return_attention_mask: Optional[bool] = None,
+            return_overflowing_tokens: bool = False,
+            return_special_tokens_mask: bool = False,
+            return_offsets_mapping: bool = False,
+            return_length: bool = False,
+            verbose: bool = True,
+            **kwargs,
     ) -> BatchEncoding:
         def get_input_ids(text):
         for pair_id in range(len(batch_text_or_text_pairs)):
             kwargs['audio_info'] = audio_info[pair_id]
             ids_or_pair_ids = batch_text_or_text_pairs[pair_id]
+            # for ids_or_pair_ids in batch_text_or_text_pairs:
             if not isinstance(ids_or_pair_ids, (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
             elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
         raise NotImplementedError
     def _decode(
+            self,
+            token_ids: Union[int, List[int]],
+            skip_special_tokens: bool = False,
+            errors: str = None,
+            **kwargs,
     ) -> str:
         if isinstance(token_ids, int):
             token_ids = [token_ids]
         audio_info = kwargs.pop("audio_info", None)
         def _decode_audiourl(audio_token_ids, audio_info, audio_idx):
             assert audio_token_ids[0] == self.audio_start_id and audio_token_ids[-1] == self.audio_end_id
             audio_url = audio_info["audio_urls"][audio_idx]
             return [self.audio_start_id] + self.tokenizer.encode(audio_url) + [self.audio_end_id]
+        token_ids = _replace_closed_tag(token_ids, self.audio_start_id, self.audio_end_id, _decode_audiourl,
+                                        audio_info=audio_info)
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.eod_id]
     def to_list_format(self, text: str):
         text = unicodedata.normalize("NFC", text)
         token_ids = self.tokenizer.encode(
+            text, allowed_special=set(self.AUDIO_ST + (ENDOFTEXT,)))
         def _encode_audio_info(tokens):
             if len(tokens) == 0:
     def process_audio(self, text):
         audio_urls = self.extract_audio_urls(text)
+        if len(audio_urls) > 0:
             audios, audio_lens, audio_span_tokens = [], [], []
             for audio_path in audio_urls:
+                if audio_path.startswith("http://") or audio_path.startswith("https://"):  # http
                     data = bytes(requests.get(audio_path, stream=True).content)
                     audio = load_bytesio_audio(data)
                 else:
                 audio_len = [audio_len_after_cnn, audio_token_num]
                 audios.append(mel)
                 audio_lens.append(audio_len)
+                audio_span_tokens.append(audio_token_num + 2)  # add audio bos eos
             input_audio_lengths = torch.IntTensor(audio_lens)
             input_audios = torch.stack(audios, dim=0)
             return {"input_audios": input_audios,

tokenizer_config.json CHANGED Viewed

@@ -6,6 +6,6 @@
     ]
   },
   "clean_up_tokenization_spaces": true,
-  "model_max_length": 8192,
   "tokenizer_class": "QWenTokenizer"
 }

     ]
   },
   "clean_up_tokenization_spaces": true,
+  "model_max_length": 2048,
   "tokenizer_class": "QWenTokenizer"
 }