niobures committed
Commit 111ec90 · verified · 1 Parent(s): 4c1278d

Qwen-Audio (code, models, paper)
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/code/1272-128104-0000.flac filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/Qwen-Audio-Chat_encode.onnx.prototxt filter=lfs diff=lfs merge=lfs -text
+ Qwen-Audio.[[:space:]]Advancing[[:space:]]Universal[[:space:]]Audio[[:space:]]Understanding[[:space:]]via[[:space:]]Unified[[:space:]]Large-Scale[[:space:]]Audio-Language[[:space:]]Models.pdf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-Audio[[:space:]]Technical[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
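Note: `.gitattributes` patterns cannot contain literal spaces, so `git lfs track` encodes each space in the two PDF filenames as the POSIX character class `[[:space:]]`.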
Qwen-Audio. Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec2582e1767927a67bc68db1fa9324c3a7839b8d2efa348ac6c76c57f5b44fae
+ size 1798895
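Each `ADDED` entry like the one above stores only a Git LFS pointer (spec version, sha256 oid, byte size) rather than the file itself. As a convenience, here is a small Python sketch for checking a separately downloaded artifact against the oid and size recorded in its pointer; the filename, oid, and size below are copied from the pointer above.

```python
import hashlib

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against the oid/size recorded in its LFS pointer."""
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

# values copied from the pointer above
print(verify_lfs_object(
    "Qwen-Audio. Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models.pdf",
    "ec2582e1767927a67bc68db1fa9324c3a7839b8d2efa348ac6c76c57f5b44fae",
    1798895,
))
```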
Qwen2-Audio Technical Report.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ddac6d6b779f567efffb06144a4a9030e8524168ca54e504e26c81767758826
+ size 1644312
code/Qwen-Audio.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f35ae8a24225250ebe9281988a39a7e97d7475aed0ec8291e5a07208c4b9fb3
+ size 35137544
models/ailia-models/Qwen-Audio-Chat.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:332e74dd4a032192b9dcc1819c379fe5572dd1b240371d2ab5f0bcca05e0cb72
+ size 2059065
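The decoder graph above is only ~2 MB, while qwen_audio.py additionally downloads Qwen-Audio-Chat_weights.pb; presumably the ONNX file references its weights as external data, which onnxruntime resolves relative to the model's directory. A minimal loading sketch under that assumption:

```python
import onnxruntime

# assumption: Qwen-Audio-Chat_weights.pb (external data) sits next to the .onnx file
sess = onnxruntime.InferenceSession("Qwen-Audio-Chat.onnx", providers=["CPUExecutionProvider"])
print([i.name for i in sess.get_inputs()][:5])  # input_ids, attention_mask, audios, key_cache0, ...
```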
models/ailia-models/Qwen-Audio-Chat.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/Qwen-Audio-Chat_encode.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc99f46232b628850bfd5354f48eb89c9e1a99428b5218061424815bf45346ca
+ size 1297414171
models/ailia-models/Qwen-Audio-Chat_encode.onnx.prototxt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d95d0bc4b3ed085db4fe8e9f9a2d2662b11ffebaa48e393d5f886499b8642802
+ size 11250765
models/ailia-models/code/1272-128104-0000.flac ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e25e22555cd16e90edb0a3b49fdcf1fe652b2a1250ab643634db33895c75b41
+ size 120041
models/ailia-models/code/LICENSE ADDED
@@ -0,0 +1,53 @@
+ Tongyi Qianwen LICENSE AGREEMENT
+
+ Tongyi Qianwen Release Date: August 23, 2023
+
+ By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
+
+ 1. Definitions
+ a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
+ b. "We"(or "Us") shall mean Alibaba Cloud.
+ c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
+ d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
+ e. "Tongyi Qianwen" shall mean the large language models (including Qwen-Audio model and Qwen-Audio-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
+ f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
+ g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
+ h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ 2. Grant of Rights
+ You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
+
+ 3. Redistribution
+ You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+ a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
+ b. You shall cause any modified files to carry prominent notices stating that You changed the files;
+ c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
+ d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
+
+ 4. Restrictions
+ If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization.
+
+ 5. Rules of use
+ a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
+ b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
+
+ 6. Intellectual Property
+ a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
+ b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
+ c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
+
+ 7. Disclaimer of Warranty and Limitation of Liability
+
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
+ b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
+ c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
+ d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
+
+ 8. Survival and Termination.
+ a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
+ b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
+
+ 9. Governing Law and Jurisdiction.
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
+ b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
models/ailia-models/code/README.md ADDED
@@ -0,0 +1,61 @@
+ # Qwen-Audio
+
+ ## Input
+
+ - Audio file
+
+ https://github.com/QwenLM/Qwen-Audio/blob/main/assets/audio/1272-128104-0000.flac
+
+ - Prompt
+
+ what does the person say?
+
+ ## Output
+
+ The person says: "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel".
+
+ ## Requirements
+
+ This model requires additional modules.
+ ```
+ pip3 install transformers
+ pip3 install tiktoken
+ pip3 install librosa
+ ```
+
+
+ ## Usage
+ The ONNX and prototxt files are downloaded automatically on the first run.
+ An Internet connection is required while downloading.
+
+ For the sample audio file,
+ ```bash
+ $ python3 qwen_audio.py
+ ```
+
+ To specify an audio file, pass the file path after the `--input` option.
+ ```bash
+ $ python3 qwen_audio.py --input AUDIO_FILE
+ ```
+
+ To specify the prompt, pass it after the `--prompt` option.
+ ```bash
+ $ python3 qwen_audio.py --prompt PROMPT
+ ```
+
+ ## Reference
+
+ - [Qwen-Audio](https://github.com/QwenLM/Qwen-Audio)
+
+ ## Framework
+
+ PyTorch
+
+ ## Model Format
+
+ ONNX opset=17
+
+ ## Netron
+
+ [Qwen-Audio-Chat_encode.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat_encode.onnx.prototxt)
+ [Qwen-Audio-Chat.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat.onnx.prototxt)
models/ailia-models/code/audio_utils.py ADDED
@@ -0,0 +1,171 @@
+ import re
+ from functools import lru_cache
+ from subprocess import CalledProcessError, run
+
+ import numpy as np
+ import librosa
+
+ flg_ffmpeg = False
+
+
+ # hard-coded audio hyperparameters
+ SAMPLE_RATE = 16000
+ N_FFT = 400
+ N_MELS = 80
+ HOP_LENGTH = 160
+ CHUNK_LENGTH = 30
+ N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
+
+
+ def get_T_after_cnn(L_in, dilation=1):
+     # the two front-end conv layers of the audio encoder: (padding, kernel_size, stride)
+     for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]:
+         L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+         L_out = 1 + L_out // stride
+         L_in = L_out
+     return L_out
+
+
+ def load_audio(file: str, sr: int = SAMPLE_RATE):
+     """
+     Open an audio file and read as mono waveform, resampling as necessary
+     """
+
+     if flg_ffmpeg:
+         # This launches a subprocess to decode audio while down-mixing
+         # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+         # fmt: off
+         cmd = [
+             "ffmpeg",
+             "-nostdin",
+             "-threads", "0",
+             "-i", file,
+             "-f", "s16le",
+             "-ac", "1",
+             "-acodec", "pcm_s16le",
+             "-ar", str(sr),
+             "-"
+         ]
+         # fmt: on
+         try:
+             out = run(cmd, capture_output=True, check=True).stdout
+         except CalledProcessError as e:
+             raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+         return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+     else:
+         # prepare input data
+         audio, _ = librosa.load(file, sr=sr, mono=True, dtype=np.float32)
+         return audio
+
+
+ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+     """
+     Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+     """
+     if array.shape[axis] > length:
+         array = array.take(indices=range(length), axis=axis)
+
+     if array.shape[axis] < length:
+         pad_widths = [(0, 0)] * array.ndim
+         pad_widths[axis] = (0, length - array.shape[axis])
+         array = np.pad(array, pad_widths)
+
+     return array
+
+
+ @lru_cache(maxsize=None)
+ def mel_filters(n_mels: int = N_MELS):
+     """
+     The mel filterbank matrix for projecting STFT into a Mel spectrogram.
+     """
+     filters = librosa.filters.mel(sr=SAMPLE_RATE, n_fft=N_FFT, n_mels=n_mels)
+
+     return filters
+
+
+ def log_mel_spectrogram(
+     audio: np.ndarray,
+     n_mels: int = N_MELS,
+     padding: int = 0,
+ ):
+     """
+     Compute the log-Mel spectrogram of an audio waveform.
+
+     Parameters
+     ----------
+     audio: np.ndarray, shape = (*)
+         A NumPy array containing the audio waveform at 16 kHz
+
+     n_mels: int
+         The number of Mel-frequency filters, only 80 is supported
+
+     padding: int
+         Number of zero samples to pad to the right
+
+     Returns
+     -------
+     np.ndarray, shape = (80, n_frames)
+         An array that contains the log-Mel spectrogram
+     """
+     if padding > 0:
+         audio = np.pad(audio, (0, padding))
+     stft = librosa.stft(
+         y=audio,
+         n_fft=N_FFT,
+         hop_length=HOP_LENGTH,
+         window="hann",
+         pad_mode="reflect",
+     )
+     magnitudes = np.abs(stft[:, :-1]) ** 2
+
+     filters = mel_filters(n_mels)
+     mel_spec = filters @ magnitudes
+
+     log_spec = np.log10(np.clip(mel_spec, 1e-10, None))
+     log_spec = np.maximum(log_spec, np.max(log_spec) - 8.0)
+     log_spec = (log_spec + 4.0) / 4.0
+
+     return log_spec
+
+
+ def process_audio(content):
+     pattern = r"<audio>(.*?)</audio>"
+     audio_urls = re.findall(pattern, content)
+     if len(audio_urls) == 0:
+         return None
+
+     audios, audio_lens, audio_span_tokens = [], [], []
+     for audio_path in audio_urls:
+         cache = getattr(process_audio, "cache", {})
+         if audio_path in cache:
+             mel, audio_len, audio_token_num = cache[audio_path]
+             audios.append(mel)
+             audio_lens.append(audio_len)
+             audio_span_tokens.append(audio_token_num + 2)
+             continue
+
+         audio = load_audio(audio_path)
+         L = audio.shape[0] if audio.shape[0] <= 480000 else 480000  # max_length < 30s
+         mel_len = L // 160
+         audio = pad_or_trim(audio.flatten())
+         mel = log_mel_spectrogram(audio)
+         audio_len_after_cnn = get_T_after_cnn(mel_len)
+         audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
+         audio_len = [audio_len_after_cnn, audio_token_num]
+         audios.append(mel)
+         audio_lens.append(audio_len)
+         audio_span_tokens.append(audio_token_num + 2)  # add audio bos eos
+
+         cache[audio_path] = (mel, audio_len, audio_token_num)
+         process_audio.cache = cache
+
+     input_audio_lengths = np.array(audio_lens)
+     input_audios = np.stack(audios, axis=0)
+
+     return {
+         "input_audios": input_audios,
+         "input_audio_lengths": input_audio_lengths,
+         "audio_span_tokens": audio_span_tokens,
+         "audio_urls": audio_urls,
+     }
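For orientation, a minimal sketch of how these helpers chain together on the bundled sample (run from models/ailia-models/code/): the waveform is padded to 30 s and mapped to an (80, 3000) log-Mel array, which is what the ONNX encoder consumes.

```python
from audio_utils import load_audio, pad_or_trim, log_mel_spectrogram

audio = load_audio("1272-128104-0000.flac")  # 16 kHz mono float32
audio = pad_or_trim(audio)                   # pad/trim to 480000 samples (30 s)
mel = log_mel_spectrogram(audio)
print(mel.shape)                             # (80, 3000)
```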
models/ailia-models/code/bos.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30c022af7033308381f3949286c0ab373fb8e90eb67d4b29dd03482101874ef5
+ size 8320
models/ailia-models/code/eos.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e27cff1ed8b60edcd7ce2ad9cc77d3468fe17335c2597277ca7841e4b229ea9b
+ size 8320
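A hedged aside on the two .npy files above: given their 8,320-byte size (a ~128-byte NumPy header plus 4096 float16 values) and the way audio_encode() in qwen_audio.py concatenates them around each encoded clip, each file most likely holds a single 4096-dimensional embedding marking the start or end of an audio span. A quick inspection sketch:

```python
import numpy as np

bos = np.load("bos.npy")
eos = np.load("eos.npy")
print(bos.shape, bos.dtype, eos.shape, eos.dtype)  # expected: (1, 4096) float16 for each
```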
models/ailia-models/code/logit_process.py ADDED
@@ -0,0 +1,65 @@
+ from typing import List
+ import numpy as np
+
+ from math_utils import softmax
+
+
+ def StopWordsLogitsProcessor(scores, input_ids):
+     eos_token_id = 151643
+     stop_words_ids = [[151645], [151644]]
+
+     def tokens_match(prev_tokens: np.ndarray, tokens: List[int]) -> bool:
+         if len(tokens) == 0:
+             # an empty stop-word sequence always matches
+             return True
+         elif len(tokens) > len(prev_tokens):
+             # if the stop-word sequence is longer than prev input_ids they can't be equal
+             return False
+         elif prev_tokens[-len(tokens) :].tolist() == tokens:
+             # if tokens match
+             return True
+         else:
+             return False
+
+     stopped_samples = []
+     for prev_input_ids_slice in input_ids:
+         match = False
+         for stop_token_seq in stop_words_ids:
+             if tokens_match(prev_input_ids_slice, stop_token_seq):
+                 # a stop word matched; force EOS for this sample
+                 match = True
+                 break
+         stopped_samples.append(match)
+
+     for i, should_stop in enumerate(stopped_samples):
+         if should_stop:
+             scores[i, eos_token_id] = float(2**15)
+     return scores
+
+
+ def TopPLogitsWarper(scores, top_p):
+     sorted_indices = np.argsort(scores)
+     sorted_logits = np.take_along_axis(scores, sorted_indices, axis=-1)
+     cumulative_probs = np.cumsum(softmax(sorted_logits, axis=-1), axis=-1)
+
+     # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
+     sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
+     # Keep at least min_tokens_to_keep
+     min_tokens_to_keep = 1
+     sorted_indices_to_remove[..., -min_tokens_to_keep:] = 0
+
+     # scatter sorted tensors to original indexing
+     indices_to_remove = np.copy(sorted_indices_to_remove)
+     np.put_along_axis(
+         indices_to_remove, sorted_indices, sorted_indices_to_remove, axis=1
+     )
+
+     scores_processed = np.where(indices_to_remove, -np.inf, scores)
+     return scores_processed
+
+
+ def logits_processor(input_ids, scores, top_p=0.5):
+     scores = StopWordsLogitsProcessor(scores, input_ids)
+     scores = TopPLogitsWarper(scores, top_p)
+
+     return scores
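A toy demonstration of the warper above (a sketch; it assumes the ailia `math_utils.softmax` helper used by logit_process.py is importable): with top_p=0.5, only the highest-probability token of this distribution survives and the rest are masked to -inf.

```python
import numpy as np
from logit_process import TopPLogitsWarper

scores = np.array([[2.0, 1.0, 0.5, -1.0]])         # one batch row of raw logits
print(TopPLogitsWarper(scores.copy(), top_p=0.5))  # [[ 2. -inf -inf -inf]]
```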
models/ailia-models/code/qwen_audio.py ADDED
@@ -0,0 +1,550 @@
+ import os
+ import sys
+ import time
+ from typing import Dict, List, Optional, Tuple
+
+ # logger
+ from logging import getLogger
+
+ import numpy as np
+
+ import ailia
+
+ # import original modules
+ sys.path.append("../../util")
+ from arg_utils import get_base_parser, update_parser  # noqa
+ from model_utils import check_and_download_models, check_and_download_file  # noqa
+ from math_utils import softmax
+
+ from logit_process import logits_processor
+ from audio_utils import process_audio
+
+ logger = getLogger(__name__)
+
+ # ======================
+ # Parameters
+ # ======================
+
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/qwen_audio/"
+
+ AUDIO_PATH = "1272-128104-0000.flac"
+
+ COPY_BLOB_DATA = True
+
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+
+ parser = get_base_parser("Qwen-Audio", AUDIO_PATH, None, large_model=True)
+ parser.add_argument(
+     "-p",
+     "--prompt",
+     type=str,
+     default="what does the person say?",
+     help="prompt",
+ )
+ parser.add_argument(
+     "--disable_ailia_tokenizer", action="store_true", help="disable ailia tokenizer."
+ )
+ parser.add_argument("--onnx", action="store_true", help="execute onnxruntime version.")
+ args = update_parser(parser)
+
+
+ # ======================
+ # Parameters
+ # ======================
+
+ WEIGHT_PATH = "Qwen-Audio-Chat.onnx"
+ WEIGHT_ENC_PATH = "Qwen-Audio-Chat_encode.onnx"
+ MODEL_PATH = "Qwen-Audio-Chat.onnx.prototxt"
+ MODEL_ENC_PATH = "Qwen-Audio-Chat_encode.onnx.prototxt"
+ PB_PATH = "Qwen-Audio-Chat_weights.pb"
+
+ SYSTEM_PROMPT = "You are a helpful assistant."
+
+
+ # ======================
+ # Secondary Functions
+ # ======================
+
+
+ def make_context(
+     tokenizer,
+     query: str,
+     history: List[Tuple[str, str]] = None,
+     system: str = "",
+     max_window_size: int = 6144,
+ ):
+     if history is None:
+         history = []
+
+     im_start, im_end = "<|im_start|>", "<|im_end|>"
+     im_start_tokens = [tokenizer.im_start_id]
+     im_end_tokens = [tokenizer.im_end_id]
+     nl_tokens = tokenizer.encode("\n")
+
+     def _tokenize_str(role, content):
+         audio_info = process_audio(content)
+         return (
+             f"{role}\n{content}",
+             tokenizer.encode(
+                 role, allowed_special=set(tokenizer.AUDIO_ST), audio_info=audio_info
+             )
+             + nl_tokens
+             + tokenizer.encode(
+                 content,
+                 allowed_special=set(tokenizer.AUDIO_ST),
+                 audio_info=audio_info,
+             ),
+         )
+
+     system_text, system_tokens_part = _tokenize_str("system", system)
+     system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
+
+     raw_text = ""
+     context_tokens = []
+
+     for turn_query, turn_response in reversed(history):
+         query_text, query_tokens_part = _tokenize_str("user", turn_query)
+         query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
+         if turn_response is not None:
+             response_text, response_tokens_part = _tokenize_str(
+                 "assistant", turn_response
+             )
+             response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
+
+             next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
+             prev_chat = (
+                 f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
+             )
+         else:
+             next_context_tokens = nl_tokens + query_tokens + nl_tokens
+             prev_chat = f"\n{im_start}{query_text}{im_end}\n"
+
+         current_context_size = (
+             len(system_tokens) + len(next_context_tokens) + len(context_tokens)
+         )
+         if current_context_size < max_window_size:
+             context_tokens = next_context_tokens + context_tokens
+             raw_text = prev_chat + raw_text
+         else:
+             break
+
+     context_tokens = system_tokens + context_tokens
+     raw_text = f"{im_start}{system_text}{im_end}" + raw_text
+     context_tokens += (
+         nl_tokens
+         + im_start_tokens
+         + _tokenize_str("user", query)[1]
+         + im_end_tokens
+         + nl_tokens
+         + im_start_tokens
+         + tokenizer.encode("assistant")
+         + nl_tokens
+     )
+     raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
+
+     return raw_text, context_tokens
+
+
+ def decode_tokens(
+     tokens,
+     tokenizer,
+     raw_text_len: int,
+     context_length: int,
+     verbose: bool = False,
+     errors: str = "replace",
+     audio_info: Dict = None,
+ ) -> str:
+     eod_token_ids = [tokenizer.im_start_id, tokenizer.im_end_id]
+     kwargs = {"audio_info": audio_info}
+
+     end_reason = f"Gen length {len(tokens)}"
+     eod_token_idx = context_length
+     for eod_token_idx in range(context_length, len(tokens)):
+         if tokens[eod_token_idx] in eod_token_ids:
+             end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]], **kwargs)!r}"
+             break
+
+     trim_decode_tokens = tokenizer.decode(
+         tokens[:eod_token_idx], errors=errors, **kwargs
+     )[raw_text_len:]
+
+     if verbose:
+         print(
+             "\nRaw Generate w/o EOD:",
+             tokenizer.decode(tokens, errors=errors, **kwargs)[raw_text_len:],
+         )
+         print("\nRaw Generate:", trim_decode_tokens)
+         print("\nEnd Reason:", end_reason)
+
+     trim_decode_tokens = trim_decode_tokens.strip()
+
+     if verbose:
+         print("\nGenerate:", trim_decode_tokens)
+
+     return trim_decode_tokens
+
+
+ # ======================
+ # Main functions
+ # ======================
+
+
+ def audio_encode(models, input_audios, input_audio_lengths, audio_span_tokens):
+     real_input_audio_lens = input_audio_lengths[:, 0].tolist()
+     max_len_in_batch = max(real_input_audio_lens)
+     padding_mask = np.ones([input_audios.shape[0], max_len_in_batch], dtype=np.float16)
+     for index in range(len(input_audios)):
+         padding_mask[index, : input_audio_lengths[index][0]] = 0
+
+     # feedforward
+     net = models["enc"]
+     if not args.onnx:
+         output = net.predict([input_audios, padding_mask, input_audio_lengths])
+     else:
+         output = net.run(
+             None,
+             {
+                 "input_audios": input_audios,
+                 "padding_mask": padding_mask,
+                 "input_audio_lengths": input_audio_lengths,
+             },
+         )
+     x = output[0]
+
+     # learned embeddings that bracket each encoded audio span
+     bos = np.load(os.path.join(os.path.dirname(__file__), "bos.npy"))
+     eos = np.load(os.path.join(os.path.dirname(__file__), "eos.npy"))
+
+     output_audios = []
+     for i in range(len(audio_span_tokens)):
+         audio_span = audio_span_tokens[i]
+         audio = x[i][: audio_span - 2]
+         audio = np.concatenate([bos, audio, eos])
+         assert len(audio) == audio_span
+         output_audios.append(audio)
+
+     return output_audios
+
+
+ def forward(
+     models,
+     input_ids: np.ndarray,
+     attention_mask: np.ndarray,
+     audio_info: dict,
+     past_key_values: List[np.ndarray],
+     blob_copy: bool,
+ ):
+     audios = audio_info["input_audios"]
+     audio_span_tokens = audio_info["audio_span_tokens"]
+     input_audio_lengths = audio_info["input_audio_lengths"]
+     if 0 < past_key_values[0].shape[1]:
+         # decoding step: audio features were already injected at prefill
+         audios = (
+             np.ones(
+                 (len(audio_span_tokens), input_ids.shape[1], 4096), dtype=np.float16
+             )
+             * -np.inf
+         )
+     else:
+         audio_start_id = 155163
+         bos_pos = np.where(input_ids == audio_start_id)
+         eos_pos = np.where(input_ids == audio_start_id + 1)
+
+         audio_pos = np.stack((bos_pos[0], bos_pos[1], eos_pos[1]), axis=1)
+
+         audios = audio_encode(models, audios, input_audio_lengths, audio_span_tokens)
+         lst = []
+         for idx, (i, a, b) in enumerate(audio_pos):
+             lst.append(
+                 np.concatenate(
+                     [
+                         np.ones((a, 4096), dtype=np.float16) * -np.inf,
+                         audios[idx],
+                         np.ones((input_ids.shape[1] - b - 1, 4096), dtype=np.float16)
+                         * -np.inf,
+                     ]
+                 )
+             )
+         audios = np.stack(lst, axis=0)
+
+     net = models["net"]
+     if not args.onnx:
+         if not blob_copy:
+             output = net.predict(
+                 [
+                     input_ids,
+                     attention_mask,
+                     audios,
+                     *past_key_values,
+                 ]
+             )
+             logits, new_past_key_values = output[0], output[1:]
+         else:
+             NUM_KV = 32
+             key_shapes = [
+                 net.get_blob_shape(
+                     net.find_blob_index_by_name("key_cache_out" + str(i))
+                 )
+                 for i in range(NUM_KV)
+             ]
+             value_shapes = [
+                 net.get_blob_shape(
+                     net.find_blob_index_by_name("value_cache_out" + str(i))
+                 )
+                 for i in range(NUM_KV)
+             ]
+             net.set_input_blob_data(input_ids, net.find_blob_index_by_name("input_ids"))
+             net.set_input_blob_data(
+                 attention_mask, net.find_blob_index_by_name("attention_mask")
+             )
+             net.set_input_blob_data(audios, net.find_blob_index_by_name("audios"))
+             for i in range(NUM_KV):
+                 net.set_input_blob_shape(
+                     key_shapes[i], net.find_blob_index_by_name("key_cache" + str(i))
+                 )
+                 net.set_input_blob_shape(
+                     value_shapes[i], net.find_blob_index_by_name("value_cache" + str(i))
+                 )
+                 net.copy_blob_data("key_cache" + str(i), "key_cache_out" + str(i))
+                 net.copy_blob_data("value_cache" + str(i), "value_cache_out" + str(i))
+             net.update()
+             logits = net.get_blob_data(net.find_blob_index_by_name("logits"))
+             # only the layer-0 key cache is fetched here; once blob copy is
+             # active, the sampling loop consults just past_key_values[0].shape[1]
+             new_past_key_values = [
+                 net.get_blob_data(net.find_blob_index_by_name("key_cache_out0"))
+             ]
+     else:
+         # key/value caches for the 32 decoder layers, interleaved as
+         # [key_cache0, value_cache0, key_cache1, value_cache1, ...]
+         feed = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "audios": audios,
+         }
+         for i in range(32):
+             feed[f"key_cache{i}"] = past_key_values[2 * i]
+             feed[f"value_cache{i}"] = past_key_values[2 * i + 1]
+         output = net.run(None, feed)
+         logits, new_past_key_values = output[0], output[1:]
+
+     return logits, new_past_key_values
+
+
+ def stopping_criteria(input_ids: np.ndarray) -> np.ndarray:
+     max_length = 690
+     cur_len = input_ids.shape[-1]
+     is_done = cur_len >= max_length
+     is_done = np.full(input_ids.shape[0], is_done)
+
+     eos_token_id = np.array([151643])
+     is_done = is_done | np.isin(input_ids[:, -1], eos_token_id)
+
+     return is_done
+
+
+ def sample(models, input_ids, attention_mask, audio_info):
+     pad_token_id = 151643
+
+     past_key_values = [np.zeros((1, 0, 32, 128), dtype=np.float16)] * 64
+
+     # keep track of which sequences are already finished
+     batch_size, cur_len = input_ids.shape
+     this_peer_finished = False
+     unfinished_sequences = np.ones(batch_size, dtype=int)
+     cache_position = (
+         np.cumsum(np.ones_like(input_ids[0, :], dtype=np.int64), axis=0) - 1
+     )
+
+     blob_copy = False
+     while True:
+         # prepare model inputs
+         if 0 < past_key_values[0].shape[1]:
+             model_input_ids = input_ids[:, cache_position]
+         else:
+             model_input_ids = input_ids
+         position_ids = attention_mask.astype(np.int32).cumsum(axis=-1) - 1
+         position_ids = np.where(attention_mask == 0, 1, position_ids)
+         if 0 < past_key_values[0].shape[1]:
+             position_ids = position_ids[:, -1:]
+
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+
+         logits, past_key_values = forward(
+             models,
+             model_input_ids,
+             attention_mask,
+             audio_info,
+             past_key_values,
+             blob_copy,
+         )
+         blob_copy = True if COPY_BLOB_DATA else False
+
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             estimation_time = end - start
+             logger.info(f"\tdecode time {estimation_time} ms")
+
+         attention_mask = np.concatenate(
+             [attention_mask, np.ones((attention_mask.shape[0], 1), dtype=int)],
+             axis=-1,
+         )
+         cache_position = cache_position[-1:] + 1
+
+         next_token_logits = logits[:, -1, :]
+
+         # pre-process distribution
+         next_token_scores = logits_processor(input_ids, next_token_logits)
+
+         # token selection
+         probs = softmax(next_token_scores, axis=-1)
+         next_tokens = np.random.choice(len(probs[0]), size=1, p=probs[0])
+
+         # finished sentences should have their next token be a padding token
+         next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
+             1 - unfinished_sequences
+         )
+
+         # update generated ids, model inputs, and length for next step
+         input_ids = np.concatenate([input_ids, next_tokens[:, None]], axis=-1)
+
+         unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids)
+         this_peer_finished = np.max(unfinished_sequences) == 0
+         cur_len += 1
+
+         if this_peer_finished:
+             break
+
+     return input_ids
+
+
+ def predict(models, query, history: Optional[List[Tuple[str, str]]] = None):
+     if history is None:
+         history = []
+     else:
+         # copy history to avoid modification
+         history = [x for x in history]
+
+     tokenizer = models["tokenizer"]
+     raw_text, context_tokens = make_context(
+         tokenizer,
+         query,
+         history=history,
+         system=SYSTEM_PROMPT,
+     )
+     audio_info = process_audio(raw_text)
+
+     input_ids = np.array([context_tokens])
+     attention_mask = np.ones(input_ids.shape[:2], dtype=np.int64)
+     outputs = sample(models, input_ids, attention_mask, audio_info)
+
+     response = decode_tokens(
+         outputs[0],
+         tokenizer,
+         raw_text_len=len(raw_text),
+         context_length=len(context_tokens),
+         audio_info=audio_info,
+     )
+
+     history.append((query, response))
+
+     return response, history
+
+
+ def recognize(models):
+     prompt = args.prompt
+     audio_urls = args.input
+
+     logger.info("Prompt: %s" % prompt)
+
+     tokenizer = models["tokenizer"]
+     query = tokenizer.from_list_format(
+         [{"audio": input} for input in audio_urls] + [{"text": prompt}],
+     )
+
+     # inference
+     logger.info("Start inference...")
+     if args.benchmark:
+         logger.info("BENCHMARK mode")
+         total_time_estimation = 0
+         for i in range(args.benchmark_count):
+             start = int(round(time.time() * 1000))
+             response, history = predict(models, query)
+             end = int(round(time.time() * 1000))
+             estimation_time = end - start
+
+             # Logging
+             logger.info(f"\tailia processing estimation time {estimation_time} ms")
+             if i != 0:
+                 total_time_estimation = total_time_estimation + estimation_time
+
+         logger.info(
+             f"\taverage time estimation {total_time_estimation / (args.benchmark_count - 1)} ms"
+         )
+     else:
+         response, history = predict(models, query)
+
+     # # 2nd dialogue turn
+     # print(response)
+     # query = 'Find the start time and end time of the word "middle classes"'
+     # response, history = predict(models, query, history=history)
+
+     print(response)
+
+     logger.info("Script finished successfully.")
+
+
+ def main():
+     check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
+     check_and_download_models(WEIGHT_ENC_PATH, MODEL_ENC_PATH, REMOTE_PATH)
+     check_and_download_file(PB_PATH, REMOTE_PATH)
+
+     env_id = args.env_id
+
+     # initialize
+     if not args.onnx:
+         memory_mode = ailia.get_memory_mode(
+             reduce_constant=True,
+             ignore_input_with_initializer=True,
+             reduce_interstage=False,
+             reuse_interstage=True,
+         )
+         enc = ailia.Net(
+             MODEL_ENC_PATH, WEIGHT_ENC_PATH, env_id=env_id, memory_mode=memory_mode
+         )
+         net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id, memory_mode=memory_mode)
+     else:
+         import onnxruntime
+
+         providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+
+         enc = onnxruntime.InferenceSession(WEIGHT_ENC_PATH, providers=providers)
+         net = onnxruntime.InferenceSession(WEIGHT_PATH, providers=providers)
+
+     # the ailia tokenizer path is not implemented yet; force the transformers tokenizer
+     args.disable_ailia_tokenizer = True
+     if args.disable_ailia_tokenizer:
+         import transformers
+
+         tokenizer = transformers.AutoTokenizer.from_pretrained(
+             "./tokenizer", trust_remote_code=True
+         )
+     else:
+         raise NotImplementedError
+
+     models = {
+         "tokenizer": tokenizer,
+         "enc": enc,
+         "net": net,
+     }
+
+     # generate
+     recognize(models)
+
+
+ if __name__ == "__main__":
+     main()
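For reference, make_context() above assembles a ChatML-style prompt; for the default run, the raw text looks roughly like the sketch below (at the token level, the <audio>...</audio> span is replaced by audio pad tokens before the model sees it):

```python
# sketch of the prompt string built by make_context() for the default query
raw_text = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
    "\n<|im_start|>user\nAudio 1:<audio>1272-128104-0000.flac</audio>\n"
    "what does the person say?<|im_end|>\n<|im_start|>assistant\n"
)
print(raw_text)
```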
models/ailia-models/code/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ tiktoken
+ librosa
models/ailia-models/code/tokenizer/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/code/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
models/ailia-models/code/tokenizer/tokenization_qwen.py ADDED
@@ -0,0 +1,575 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Tokenization classes for QWen."""
+
+ import base64
+ import logging
+ import os
+ import re
+ import itertools
+
+ import requests
+ import unicodedata
+ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
+
+ import tiktoken
+ import numpy as np
+
+ from transformers import PreTrainedTokenizer, AddedToken
+ from transformers.utils import try_to_load_from_cache
+ from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy, \
+     TextInput, TextInputPair, PreTokenizedInput, PreTokenizedInputPair, TensorType, EncodedInput, EncodedInputPair
+
+ logger = logging.getLogger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
+
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ ENDOFTEXT = "<|endoftext|>"
+ IMSTART = "<|im_start|>"
+ IMEND = "<|im_end|>"
+ # as the default behavior is changed to allow special tokens in
+ # regular texts, the surface forms of special tokens need to be
+ # as different as possible to minimize the impact
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+ SPECIAL_TOKENS = (
+     ENDOFTEXT,
+     IMSTART,
+     IMEND,
+ ) + EXTRAS
+
+ LANGUAGES = {
+     "en": "english",
+     "zh": "chinese",
+     "de": "german",
+     "es": "spanish",
+     "ko": "korean",
+     "fr": "french",
+     "ja": "japanese",
+     "it": "italian",
+ }
+
+
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+     with open(tiktoken_bpe_file, "rb") as f:
+         contents = f.read()
+     return {
+         base64.b64decode(token): int(rank)
+         for token, rank in (line.split() for line in contents.splitlines() if line)
+     }
+
+
+ def _list_find(
+     input_list: List[Any],
+     candidates: Tuple[Any],
+     start: int = 0,
+ ):
+     for i in range(start, len(input_list)):
+         if input_list[i] in candidates:
+             return i
+     return -1
+
+
+ def _replace_closed_tag(
+     input_tokens: List[Any],
+     start_tags: Union[Any, Tuple[Any]],
+     end_tags: Union[Any, Tuple[Any]],
+     inclusive_replace_func: Callable,
+     exclusive_replace_func: Callable = lambda x: x,
+     audio_info: Dict = None
+ ):
+     if isinstance(start_tags, (str, int)):
+         start_tags = (start_tags,)
+     if isinstance(end_tags, (str, int)):
+         end_tags = (end_tags,)
+     assert len(start_tags) == len(end_tags)
+
+     output_tokens = []
+     end = 0
+     audio_idx = 0
+     while True:
+         start = _list_find(input_tokens, start_tags, end)
+         if start == -1:
+             break
+         output_tokens.extend(exclusive_replace_func(input_tokens[end: start]))
+         tag_idx = start_tags.index(input_tokens[start])
+         end = _list_find(input_tokens, (end_tags[tag_idx],), start)
+         if end == -1:
+             raise ValueError("Unclosed audio token")
+         output_tokens.extend(inclusive_replace_func(input_tokens[start: end + 1], audio_info, audio_idx))
+         end += 1
+         audio_idx += 1
+     output_tokens.extend(exclusive_replace_func(input_tokens[end:]))
+     return output_tokens
+
+
+ class QWenTokenizer(PreTrainedTokenizer):
+     """QWen tokenizer."""
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     def __init__(
+         self,
+         vocab_file,
+         errors="replace",
+         audio_start_tag='<audio>',
+         audio_end_tag='</audio>',
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.audio_start_tag = audio_start_tag
+         self.audio_end_tag = audio_end_tag
+         self.audio_pad_tag = "[[[AUDIO:modality]]]"
+
+         self.AUDIO_ST = (
+             '[[[AUDIO:modality]]]',
+             # Transcription Tag
+             "<|startoftranscript|>",  # Transcription
+             "<|startofanalysis|>",  # Analysis
+             # Task Tag
+             "<|translate|>",
+             "<|transcribe|>",
+             "<|caption|>",
+             "<|keyword|>",
+             # Language Tag
+             "<|unknown|>",  # unknown language
+             *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+             "<|zh_tr|>",  # traditional Chinese
+             # Timestamps Tag
+             "<|notimestamps|>",
+             "<|sil|>",
+             "<|timestamps|>",
+             *[f"<|{i * 0.01:.2f}|>" for i in range(3001)],  # timestamps 0.00-30.00
+             # Output Instruction
+             "<|caption_audiocaps|>",  # Audiocaps caption style
+             "<|caption_clotho|>",  # Clotho caption style
+             "<|audioset_ontology|>",  # Audioset ontology style
+             "<|caption_plain|>",  # plain caption
+             "<|itn|>",  # inverse text normalization
+             "<|wo_itn|>",  # without inverse text normalization
+             "<|startofentityvalue|>",
+             "<|endofentityvalue|>",
+             "<|startofentitytype|>",
+             "<|endofentitytype|>",
+             "<|named_entity_recognition|>",  # named entity recognition task
+             "<|audio_grounding|>",
+             "<|startofword|>",
+             "<|endofword|>",
+             "<|delim|>",  # delimiter of timestamps pair in audio grounding
+             "<|emotion_recognition|>",  # emotion recognition
+             "<|music_description|>",  # music description
+             "<|note_analysis|>",  # note analysis
+             "<|pitch|>",  # note analysis: pitch
+             *[f"<|midi_pitch_{i}|>" for i in range(128)],  # midi pitch 0-127
+             "<|velocity|>",  # note analysis: velocity
+             *[f"<|midi_velocity_{i}|>" for i in range(128)],  # midi velocity 0-127
+             "<|sonic|>",  # note analysis: sonic
+             "<|instrument|>",  # note analysis: instrument
+             "<|speaker_meta|>",  # meta information of speaker
+             "<|song_meta|>",  # meta information of song
+             "<|question|>",  # AQA: question
+             "<|answer|>",  # AQA: answer
+             "<|choice|>",  # AQA: answer choice
+             "<|scene|>",  # scene recognition
+             "<|event|>",  # sound event
+             "<|vocal_classification|>",  # vocal classification
+             "<|speech_understanding|>",  # speech language understanding
+             "<|scenario|>",  # speech language understanding: scenario
+             "<|action|>",  # speech language understanding: action
+             "<|entities|>",  # speech language understanding: entities
+             "<|speech_edit|>",  # speech edit
+             audio_start_tag,
+             audio_end_tag
+         )
+
+         self.errors = errors  # how to handle errors in decoding
+
+         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+         self.special_tokens = {
+             token: index
+             for index, token in enumerate(
+                 SPECIAL_TOKENS + self.AUDIO_ST, start=len(self.mergeable_ranks)
+             )
+         }
+         self.audio_start_id = self.special_tokens[self.audio_start_tag]
+         self.audio_end_id = self.special_tokens[self.audio_end_tag]
+         self.audio_pad_id = self.special_tokens[self.audio_pad_tag]
+         print(f"audio_start_id: {self.audio_start_id}, "
+               f"audio_end_id: {self.audio_end_id}, "
+               f"audio_pad_id: {self.audio_pad_id}.")
+
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         assert (
+             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+         self.decoder = {
+             v: k for k, v in self.mergeable_ranks.items()
+         }  # type: dict[int, bytes|str]
+         self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+         self.tokenizer = enc  # type: tiktoken.Encoding
+
+         self.eod_id = self.tokenizer.eot_token
+         self.im_start_id = self.special_tokens[IMSTART]
+         self.im_end_id = self.special_tokens[IMEND]
+
+     def __getstate__(self):
+         # for pickle lovers
+         state = self.__dict__.copy()
+         del state['tokenizer']
+         return state
+
+     def __setstate__(self, state):
+         # tokenizer is not python native; don't pass it; rebuild it
+         self.__dict__.update(state)
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         self.tokenizer = enc
+
+     def __len__(self) -> int:
+         return self.tokenizer.n_vocab
+
+     def get_vocab(self) -> Dict[bytes, int]:
+         return self.mergeable_ranks
+
+     def convert_tokens_to_ids(
+         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+     ) -> List[int]:
+         ids = []
+         if isinstance(tokens, (str, bytes)):
+             if tokens in self.special_tokens:
+                 return self.special_tokens[tokens]
+             else:
+                 return self.mergeable_ranks.get(tokens)
+         for token in tokens:
+             if token in self.special_tokens:
+                 ids.append(self.special_tokens[token])
+             else:
+                 ids.append(self.mergeable_ranks.get(token))
+         return ids
+
+     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+         if not special_tokens and new_tokens:
+             raise ValueError('Adding regular tokens is not supported')
+         for token in new_tokens:
+             surface_form = token.content if isinstance(token, AddedToken) else token
+             if surface_form not in SPECIAL_TOKENS + self.AUDIO_ST:
+                 raise ValueError('Adding unknown special tokens is not supported')
+         return 0
+
+     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (the BPE ranks).
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         file_path = os.path.join(save_directory, "qwen.tiktoken")
+         with open(file_path, "w", encoding="utf8") as w:
+             for k, v in self.mergeable_ranks.items():
+                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                 w.write(line)
+         return (file_path,)
+
+     def tokenize(
+         self,
+         text: str,
+         allowed_special: Union[Set, str] = "all",
+         disallowed_special: Union[Collection, str] = (),
+         audio_info: Dict = None,
+         **kwargs,
+     ) -> List[Union[bytes, str]]:
+         """
+         Converts a string into a sequence of tokens.
+
+         Args:
+             text (`str`):
+                 The sequence to be encoded.
+             allowed_special (`Literal["all"]` or `set`):
+                 The surface forms of the tokens to be encoded as special tokens in regular texts.
+                 Default to "all".
+             disallowed_special (`Literal["all"]` or `Collection`):
+                 The surface forms of the tokens that should not be in regular texts and trigger errors.
+                 Default to an empty tuple.
+
+             kwargs (additional keyword arguments, *optional*):
+                 Will be passed to the underlying model specific encode method.
+
+         Returns:
+             `List[bytes|str]`: The list of tokens.
+         """
+         tokens = []
+         text = unicodedata.normalize("NFC", text)
+
+         # this implementation takes a detour: text -> token id -> token surface forms
+         for t in self.tokenizer.encode(
+             text, allowed_special=allowed_special, disallowed_special=disallowed_special
+         ):
+             tokens.append(self.decoder[t])
+
+         def _encode_audiourl(audio_tokens, audio_info, audio_idx):
+             assert audio_tokens[0] == self.audio_start_tag and audio_tokens[-1] == self.audio_end_tag
+             audio_token_span = audio_info['audio_span_tokens'][audio_idx]
+             out_audio_tokens = [self.audio_start_tag] + [self.audio_pad_tag] * (audio_token_span - 2) + [
+                 self.audio_end_tag]
+             return out_audio_tokens
+
+         return _replace_closed_tag(tokens, self.audio_start_tag, self.audio_end_tag, _encode_audiourl,
+                                    audio_info=audio_info)
+
+     def _batch_encode_plus(
+         self,
+         batch_text_or_text_pairs: Union[
+             List[TextInput],
+             List[TextInputPair],
+             List[PreTokenizedInput],
+             List[PreTokenizedInputPair],
+             List[EncodedInput],
+             List[EncodedInputPair],
+         ],
+         add_special_tokens: bool = True,
+         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+         max_length: Optional[int] = None,
+         stride: int = 0,
+         is_split_into_words: bool = False,
+         pad_to_multiple_of: Optional[int] = None,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         return_token_type_ids: Optional[bool] = None,
+         return_attention_mask: Optional[bool] = None,
+         return_overflowing_tokens: bool = False,
+         return_special_tokens_mask: bool = False,
+         return_offsets_mapping: bool = False,
+         return_length: bool = False,
+         verbose: bool = True,
+         **kwargs,
+     ) -> BatchEncoding:
+
+         def get_input_ids(text):
+             if isinstance(text, str):
+                 tokens = self.tokenize(text, **kwargs)
+                 return self.convert_tokens_to_ids(tokens)
+             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
+                 if is_split_into_words:
+                     tokens = list(
+                         itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                     )
+                     return self.convert_tokens_to_ids(tokens)
+                 else:
+                     return self.convert_tokens_to_ids(text)
+             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                 return text
+             else:
+                 raise ValueError(
+                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
+                 )
+
+         if return_offsets_mapping:
+             raise NotImplementedError(
+                 "return_offset_mapping is not available when using Python tokenizers. "
+                 "To use this feature, change your tokenizer to one deriving from "
+                 "transformers.PreTrainedTokenizerFast."
+             )
+
+         input_ids = []
+         audio_info = kwargs.pop("audio_info", None)
+         for pair_id in range(len(batch_text_or_text_pairs)):
+             kwargs['audio_info'] = audio_info[pair_id]
+             ids_or_pair_ids = batch_text_or_text_pairs[pair_id]
+             # for ids_or_pair_ids in batch_text_or_text_pairs:
+             if not isinstance(ids_or_pair_ids, (list, tuple)):
+                 ids, pair_ids = ids_or_pair_ids, None
+             elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
+                 ids, pair_ids = ids_or_pair_ids, None
+             else:
+                 ids, pair_ids = ids_or_pair_ids
+
+             first_ids = get_input_ids(ids)
+             second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
+             input_ids.append((first_ids, second_ids))
+
+         batch_outputs = self._batch_prepare_for_model(
+             input_ids,
+             add_special_tokens=add_special_tokens,
+             padding_strategy=padding_strategy,
+             truncation_strategy=truncation_strategy,
+             max_length=max_length,
+             stride=stride,
+             pad_to_multiple_of=pad_to_multiple_of,
+             return_attention_mask=return_attention_mask,
+             return_token_type_ids=return_token_type_ids,
+             return_overflowing_tokens=return_overflowing_tokens,
+             return_special_tokens_mask=return_special_tokens_mask,
+             return_length=return_length,
+             return_tensors=return_tensors,
+             verbose=verbose,
+         )
+
+         return BatchEncoding(batch_outputs)
+
+     def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+         """
+         Converts a sequence of tokens into a single string.
+         """
+         text = ""
+         temp = b""
+         for t in tokens:
+             if isinstance(t, str):
+                 if temp:
+                     text += temp.decode("utf-8", errors=self.errors)
+                     temp = b""
+                 text += t
+             elif isinstance(t, bytes):
+                 temp += t
+             else:
+                 raise TypeError("token should only be of type bytes or str")
+         if temp:
+             text += temp.decode("utf-8", errors=self.errors)
+         return text
+
+     @property
+     def vocab_size(self):
+         return self.tokenizer.n_vocab
+
+     def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+         """Converts an id to a token, special tokens included"""
+         if index in self.decoder:
+             return self.decoder[index]
+         raise ValueError("unknown ids")
+
+     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+         """Converts a token to an id using the vocab, special tokens included"""
+         if token in self.special_tokens:
+             return self.special_tokens[token]
+         if token in self.mergeable_ranks:
+             return self.mergeable_ranks[token]
+         raise ValueError("unknown token")
+
+     def _tokenize(self, text: str, **kwargs):
+         """
+         Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
+         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+         Do NOT take care of added tokens.
+         """
+         raise NotImplementedError
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         errors: str = None,
+         **kwargs,
+     ) -> str:
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+         audio_info = kwargs.pop("audio_info", None)
+
+         def _decode_audiourl(audio_token_ids, audio_info, audio_idx):
+             assert audio_token_ids[0] == self.audio_start_id and audio_token_ids[-1] == self.audio_end_id
+             audio_url = audio_info["audio_urls"][audio_idx]
+             return [self.audio_start_id] + self.tokenizer.encode(audio_url) + [self.audio_end_id]
+
+         token_ids = _replace_closed_tag(token_ids, self.audio_start_id, self.audio_end_id, _decode_audiourl,
+                                         audio_info=audio_info)
+
+         if skip_special_tokens:
+             token_ids = [i for i in token_ids if i < self.eod_id]
+         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+
+     def to_list_format(self, text: str):
+         text = unicodedata.normalize("NFC", text)
+         token_ids = self.tokenizer.encode(
+             text, allowed_special=set(self.AUDIO_ST + (ENDOFTEXT,)))
+
+         def _encode_audio_info(tokens):
+             if len(tokens) == 0:
+                 return []
+             if tokens[0] == self.audio_start_id and tokens[-1] == self.audio_end_id:
+                 key = 'audio'
+             else:
+                 _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
+                 return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
+             _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
+             val = b''.join(map(_tobytes, map(self.decoder.get, tokens[1:-1]))).decode('utf-8')
+             return [{key: val}]
+
+         return _replace_closed_tag(
+             token_ids,
+             (self.audio_start_id),
+             (self.audio_end_id),
+             _encode_audio_info,
+             _encode_audio_info,
+         )
+
+     def from_list_format(self, list_format: List[Dict]):
+         text = ''
+         num_audios = 0
+         for ele in list_format:
+             if 'audio' in ele:
+                 num_audios += 1
+                 text += f'Audio {num_audios}:'
+                 text += self.audio_start_tag + ele['audio'] + self.audio_end_tag
+                 text += '\n'
+             elif 'text' in ele:
+                 text += ele['text']
+             elif 'box' in ele:
+                 # NOTE: ref/box tags are remnants of Qwen-VL; this class does not define them
+                 if 'ref' in ele:
+                     text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
+                 for box in ele['box']:
+                     text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
+             else:
+                 raise ValueError("Unsupported element: " + str(ele))
+         return text
+
+     def extract_audio_urls(self, text):
+         pattern = rf"{self.audio_start_tag}(.*?){self.audio_end_tag}"
+         return re.findall(pattern, text)
+
+     # NOTE: carried over from the original Qwen-Audio implementation; it relies on
+     # torch and on audio helpers (load_audio, load_bytesio_audio, pad_or_trim,
+     # log_mel_spectrogram, get_T_after_cnn) that are not imported in this file.
+     # The ailia port uses audio_utils.process_audio instead.
+     def process_audio(self, text):
+         audio_urls = self.extract_audio_urls(text)
+         if len(audio_urls) > 0:
+             audios, audio_lens, audio_span_tokens = [], [], []
+             for audio_path in audio_urls:
+                 if audio_path.startswith("http://") or audio_path.startswith("https://"):  # http
+                     data = bytes(requests.get(audio_path, stream=True).content)
+                     audio = load_bytesio_audio(data)
+                 else:
+                     audio = load_audio(audio_path)
+                 L = (audio.shape[0] if audio.shape[0] <= 480000 else 480000)  # max_length < 30s
+                 mel_len = L // 160
+                 audio = pad_or_trim(audio.flatten())
+                 mel = log_mel_spectrogram(audio)
+                 audio_len_after_cnn = get_T_after_cnn(mel_len)
+                 audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
+                 audio_len = [audio_len_after_cnn, audio_token_num]
+                 audios.append(mel)
+                 audio_lens.append(audio_len)
+                 audio_span_tokens.append(audio_token_num + 2)  # add audio bos eos
+             input_audio_lengths = torch.IntTensor(audio_lens)
+             input_audios = torch.stack(audios, dim=0)
+             return {"input_audios": input_audios,
+                     "input_audio_lengths": input_audio_lengths,
+                     "audio_span_tokens": audio_span_tokens,
+                     "audio_urls": audio_urls}
+         else:
+             return None
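A short loading sketch mirroring main() in qwen_audio.py; the encode call below uses the raw tiktoken encoding, with no audio tags involved:

```python
import transformers

# trust_remote_code resolves the QWenTokenizer class defined above
tokenizer = transformers.AutoTokenizer.from_pretrained("./tokenizer", trust_remote_code=True)
ids = tokenizer.tokenizer.encode("mister quilter is the apostle of the middle classes")
print(ids, tokenizer.decode(ids))
```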
models/ailia-models/code/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "added_tokens_decoder": {},
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_qwen.QWenTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "extra_special_tokens": {},
+   "model_max_length": 2048,
+   "tokenizer_class": "QWenTokenizer"
+ }
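The `auto_map` entry is what lets `transformers.AutoTokenizer.from_pretrained("./tokenizer", trust_remote_code=True)` in qwen_audio.py resolve to the custom `QWenTokenizer` class from tokenization_qwen.py in this directory.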
models/ailia-models/source.txt ADDED
@@ -0,0 +1,7 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_language_model/qwen_audio
+
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat_encode.onnx
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat_encode.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat.onnx
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat.onnx.prototxt
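The model files listed above can also be fetched directly, e.g. with a small sketch like this (the ailia `model_utils.check_and_download_models` helper used by qwen_audio.py does this automatically on the first run):

```python
import urllib.request

base = "https://storage.googleapis.com/ailia-models/qwen_audio/"
for name in (
    "Qwen-Audio-Chat_encode.onnx",
    "Qwen-Audio-Chat_encode.onnx.prototxt",
    "Qwen-Audio-Chat.onnx",
    "Qwen-Audio-Chat.onnx.prototxt",
):
    urllib.request.urlretrieve(base + name, name)  # saves next to the script
```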