niobures committed
Commit 111ec90 · verified · 1 Parent(s): 4c1278d

Qwen-Audio (code, models, paper)
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/code/1272-128104-0000.flac filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/Qwen-Audio-Chat_encode.onnx.prototxt filter=lfs diff=lfs merge=lfs -text
+ Qwen-Audio.[[:space:]]Advancing[[:space:]]Universal[[:space:]]Audio[[:space:]]Understanding[[:space:]]via[[:space:]]Unified[[:space:]]Large-Scale[[:space:]]Audio-Language[[:space:]]Models.pdf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-Audio[[:space:]]Technical[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
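Note: `.gitattributes` patterns cannot contain literal spaces, so `git lfs track` encodes each space in the two PDF filenames as the POSIX character class `[[:space:]]`.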
Qwen-Audio. Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec2582e1767927a67bc68db1fa9324c3a7839b8d2efa348ac6c76c57f5b44fae
+ size 1798895
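Each `ADDED` entry like the one above stores only a Git LFS pointer (spec version, sha256 oid, byte size) rather than the file itself. As a convenience, here is a small Python sketch for checking a separately downloaded artifact against the oid and size recorded in its pointer; the filename, oid, and size below are copied from the pointer above.

```python
import hashlib

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against the oid/size recorded in its LFS pointer."""
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

# values copied from the pointer above
print(verify_lfs_object(
    "Qwen-Audio. Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models.pdf",
    "ec2582e1767927a67bc68db1fa9324c3a7839b8d2efa348ac6c76c57f5b44fae",
    1798895,
))
```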
Qwen2-Audio Technical Report.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ddac6d6b779f567efffb06144a4a9030e8524168ca54e504e26c81767758826
+ size 1644312
code/Qwen-Audio.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f35ae8a24225250ebe9281988a39a7e97d7475aed0ec8291e5a07208c4b9fb3
+ size 35137544
models/ailia-models/Qwen-Audio-Chat.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:332e74dd4a032192b9dcc1819c379fe5572dd1b240371d2ab5f0bcca05e0cb72
+ size 2059065
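The decoder graph above is only ~2 MB, while qwen_audio.py additionally downloads Qwen-Audio-Chat_weights.pb; presumably the ONNX file references its weights as external data, which onnxruntime resolves relative to the model's directory. A minimal loading sketch under that assumption:

```python
import onnxruntime

# assumption: Qwen-Audio-Chat_weights.pb (external data) sits next to the .onnx file
sess = onnxruntime.InferenceSession("Qwen-Audio-Chat.onnx", providers=["CPUExecutionProvider"])
print([i.name for i in sess.get_inputs()][:5])  # input_ids, attention_mask, audios, key_cache0, ...
```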
models/ailia-models/Qwen-Audio-Chat.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/Qwen-Audio-Chat_encode.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc99f46232b628850bfd5354f48eb89c9e1a99428b5218061424815bf45346ca
+ size 1297414171
models/ailia-models/Qwen-Audio-Chat_encode.onnx.prototxt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d95d0bc4b3ed085db4fe8e9f9a2d2662b11ffebaa48e393d5f886499b8642802
+ size 11250765
models/ailia-models/code/1272-128104-0000.flac ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e25e22555cd16e90edb0a3b49fdcf1fe652b2a1250ab643634db33895c75b41
+ size 120041
models/ailia-models/code/LICENSE ADDED
@@ -0,0 +1,53 @@
+ Tongyi Qianwen LICENSE AGREEMENT
+
+ Tongyi Qianwen Release Date: August 23, 2023
+
+ By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
+
+ 1. Definitions
+ a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
+ b. "We"(or "Us") shall mean Alibaba Cloud.
+ c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
+ d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
+ e. "Tongyi Qianwen" shall mean the large language models (including Qwen-Audio model and Qwen-Audio-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
+ f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
+ g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
+ h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ 2. Grant of Rights
+ You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
+
+ 3. Redistribution
+ You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+ a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
+ b. You shall cause any modified files to carry prominent notices stating that You changed the files;
+ c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
+ d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
+
+ 4. Restrictions
+ If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization.
+
+ 5. Rules of use
+ a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
+ b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
+
+ 6. Intellectual Property
+ a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
+ b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
+ c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
+
+ 7. Disclaimer of Warranty and Limitation of Liability
+
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
+ b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
+ c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
+ d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
+
+ 8. Survival and Termination.
+ a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
+ b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
+
+ 9. Governing Law and Jurisdiction.
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
+ b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
models/ailia-models/code/README.md ADDED
@@ -0,0 +1,61 @@
+ # Qwen-Audio
+
+ ## Input
+
+ - Audio file
+
+ https://github.com/QwenLM/Qwen-Audio/blob/main/assets/audio/1272-128104-0000.flac
+
+ - Prompt
+
+ what does the person say?
+
+ ## Output
+
+ The person says: "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel".
+
+ ## Requirements
+
+ This model requires additional modules.
+ ```
+ pip3 install transformers
+ pip3 install tiktoken
+ pip3 install librosa
+ ```
+
+
+ ## Usage
+ The ONNX and prototxt files are downloaded automatically on the first run.
+ An Internet connection is required while downloading.
+
+ For the sample audio file,
+ ```bash
+ $ python3 qwen_audio.py
+ ```
+
+ To specify an audio file, pass the file path after the `--input` option.
+ ```bash
+ $ python3 qwen_audio.py --input AUDIO_FILE
+ ```
+
+ To specify the prompt, pass it after the `--prompt` option.
+ ```bash
+ $ python3 qwen_audio.py --prompt PROMPT
+ ```
+
+ ## Reference
+
+ - [Qwen-Audio](https://github.com/QwenLM/Qwen-Audio)
+
+ ## Framework
+
+ PyTorch
+
+ ## Model Format
+
+ ONNX opset=17
+
+ ## Netron
+
+ [Qwen-Audio-Chat_encode.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat_encode.onnx.prototxt)
+ [Qwen-Audio-Chat.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat.onnx.prototxt)
models/ailia-models/code/audio_utils.py ADDED
@@ -0,0 +1,171 @@
+ import re
+ from functools import lru_cache
+ from subprocess import CalledProcessError, run
+
+ import numpy as np
+ import librosa
+
+ flg_ffmpeg = False
+
+
+ # hard-coded audio hyperparameters
+ SAMPLE_RATE = 16000
+ N_FFT = 400
+ N_MELS = 80
+ HOP_LENGTH = 160
+ CHUNK_LENGTH = 30
+ N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
+
+
+ def get_T_after_cnn(L_in, dilation=1):
+     # the two front-end conv layers of the audio encoder: (padding, kernel_size, stride)
+     for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]:
+         L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+         L_out = 1 + L_out // stride
+         L_in = L_out
+     return L_out
+
+
+ def load_audio(file: str, sr: int = SAMPLE_RATE):
+     """
+     Open an audio file and read as mono waveform, resampling as necessary
+     """
+
+     if flg_ffmpeg:
+         # This launches a subprocess to decode audio while down-mixing
+         # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+         # fmt: off
+         cmd = [
+             "ffmpeg",
+             "-nostdin",
+             "-threads", "0",
+             "-i", file,
+             "-f", "s16le",
+             "-ac", "1",
+             "-acodec", "pcm_s16le",
+             "-ar", str(sr),
+             "-"
+         ]
+         # fmt: on
+         try:
+             out = run(cmd, capture_output=True, check=True).stdout
+         except CalledProcessError as e:
+             raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+         return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+     else:
+         # prepare input data
+         audio, _ = librosa.load(file, sr=sr, mono=True, dtype=np.float32)
+         return audio
+
+
+ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+     """
+     Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+     """
+     if array.shape[axis] > length:
+         array = array.take(indices=range(length), axis=axis)
+
+     if array.shape[axis] < length:
+         pad_widths = [(0, 0)] * array.ndim
+         pad_widths[axis] = (0, length - array.shape[axis])
+         array = np.pad(array, pad_widths)
+
+     return array
+
+
+ @lru_cache(maxsize=None)
+ def mel_filters(n_mels: int = N_MELS):
+     """
+     The mel filterbank matrix for projecting STFT into a Mel spectrogram.
+     """
+     filters = librosa.filters.mel(sr=SAMPLE_RATE, n_fft=N_FFT, n_mels=n_mels)
+
+     return filters
+
+
+ def log_mel_spectrogram(
+     audio: np.ndarray,
+     n_mels: int = N_MELS,
+     padding: int = 0,
+ ):
+     """
+     Compute the log-Mel spectrogram of an audio waveform.
+
+     Parameters
+     ----------
+     audio: np.ndarray, shape = (*)
+         A NumPy array containing the audio waveform at 16 kHz
+
+     n_mels: int
+         The number of Mel-frequency filters, only 80 is supported
+
+     padding: int
+         Number of zero samples to pad to the right
+
+     Returns
+     -------
+     np.ndarray, shape = (80, n_frames)
+         An array that contains the log-Mel spectrogram
+     """
+     if padding > 0:
+         audio = np.pad(audio, (0, padding))
+     stft = librosa.stft(
+         y=audio,
+         n_fft=N_FFT,
+         hop_length=HOP_LENGTH,
+         window="hann",
+         pad_mode="reflect",
+     )
+     magnitudes = np.abs(stft[:, :-1]) ** 2
+
+     filters = mel_filters(n_mels)
+     mel_spec = filters @ magnitudes
+
+     log_spec = np.log10(np.clip(mel_spec, 1e-10, None))
+     log_spec = np.maximum(log_spec, np.max(log_spec) - 8.0)
+     log_spec = (log_spec + 4.0) / 4.0
+
+     return log_spec
+
+
+ def process_audio(content):
+     pattern = r"<audio>(.*?)</audio>"
+     audio_urls = re.findall(pattern, content)
+     if len(audio_urls) == 0:
+         return None
+
+     audios, audio_lens, audio_span_tokens = [], [], []
+     for audio_path in audio_urls:
+         cache = getattr(process_audio, "cache", {})
+         if audio_path in cache:
+             mel, audio_len, audio_token_num = cache[audio_path]
+             audios.append(mel)
+             audio_lens.append(audio_len)
+             audio_span_tokens.append(audio_token_num + 2)
+             continue
+
+         audio = load_audio(audio_path)
+         L = audio.shape[0] if audio.shape[0] <= 480000 else 480000  # max_length < 30s
+         mel_len = L // 160
+         audio = pad_or_trim(audio.flatten())
+         mel = log_mel_spectrogram(audio)
+         audio_len_after_cnn = get_T_after_cnn(mel_len)
+         audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
+         audio_len = [audio_len_after_cnn, audio_token_num]
+         audios.append(mel)
+         audio_lens.append(audio_len)
+         audio_span_tokens.append(audio_token_num + 2)  # add audio bos eos
+
+         cache[audio_path] = (mel, audio_len, audio_token_num)
+         process_audio.cache = cache
+
+     input_audio_lengths = np.array(audio_lens)
+     input_audios = np.stack(audios, axis=0)
+
+     return {
+         "input_audios": input_audios,
+         "input_audio_lengths": input_audio_lengths,
+         "audio_span_tokens": audio_span_tokens,
+         "audio_urls": audio_urls,
+     }
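For orientation, a minimal sketch of how these helpers chain together on the bundled sample (run from models/ailia-models/code/): the waveform is padded to 30 s and mapped to an (80, 3000) log-Mel array, which is what the ONNX encoder consumes.

```python
from audio_utils import load_audio, pad_or_trim, log_mel_spectrogram

audio = load_audio("1272-128104-0000.flac")  # 16 kHz mono float32
audio = pad_or_trim(audio)                   # pad/trim to 480000 samples (30 s)
mel = log_mel_spectrogram(audio)
print(mel.shape)                             # (80, 3000)
```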
models/ailia-models/code/bos.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30c022af7033308381f3949286c0ab373fb8e90eb67d4b29dd03482101874ef5
+ size 8320
models/ailia-models/code/eos.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e27cff1ed8b60edcd7ce2ad9cc77d3468fe17335c2597277ca7841e4b229ea9b
+ size 8320
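A hedged aside on the two .npy files above: given their 8,320-byte size (a ~128-byte NumPy header plus 4096 float16 values) and the way audio_encode() in qwen_audio.py concatenates them around each encoded clip, each file most likely holds a single 4096-dimensional embedding marking the start or end of an audio span. A quick inspection sketch:

```python
import numpy as np

bos = np.load("bos.npy")
eos = np.load("eos.npy")
print(bos.shape, bos.dtype, eos.shape, eos.dtype)  # expected: (1, 4096) float16 for each
```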
models/ailia-models/code/logit_process.py ADDED
@@ -0,0 +1,65 @@
+ from typing import List
+ import numpy as np
+
+ from math_utils import softmax
+
+
+ def StopWordsLogitsProcessor(scores, input_ids):
+     eos_token_id = 151643
+     stop_words_ids = [[151645], [151644]]
+
+     def tokens_match(prev_tokens: np.ndarray, tokens: List[int]) -> bool:
+         if len(tokens) == 0:
+             # an empty stop-word sequence always matches
+             return True
+         elif len(tokens) > len(prev_tokens):
+             # if the stop-word sequence is longer than prev input_ids they can't be equal
+             return False
+         elif prev_tokens[-len(tokens) :].tolist() == tokens:
+             # if tokens match
+             return True
+         else:
+             return False
+
+     stopped_samples = []
+     for prev_input_ids_slice in input_ids:
+         match = False
+         for stop_token_seq in stop_words_ids:
+             if tokens_match(prev_input_ids_slice, stop_token_seq):
+                 # a stop word matched; force EOS for this sample
+                 match = True
+                 break
+         stopped_samples.append(match)
+
+     for i, should_stop in enumerate(stopped_samples):
+         if should_stop:
+             scores[i, eos_token_id] = float(2**15)
+     return scores
+
+
+ def TopPLogitsWarper(scores, top_p):
+     sorted_indices = np.argsort(scores)
+     sorted_logits = np.take_along_axis(scores, sorted_indices, axis=-1)
+     cumulative_probs = np.cumsum(softmax(sorted_logits, axis=-1), axis=-1)
+
+     # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
+     sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
+     # Keep at least min_tokens_to_keep
+     min_tokens_to_keep = 1
+     sorted_indices_to_remove[..., -min_tokens_to_keep:] = 0
+
+     # scatter sorted tensors to original indexing
+     indices_to_remove = np.copy(sorted_indices_to_remove)
+     np.put_along_axis(
+         indices_to_remove, sorted_indices, sorted_indices_to_remove, axis=1
+     )
+
+     scores_processed = np.where(indices_to_remove, -np.inf, scores)
+     return scores_processed
+
+
+ def logits_processor(input_ids, scores, top_p=0.5):
+     scores = StopWordsLogitsProcessor(scores, input_ids)
+     scores = TopPLogitsWarper(scores, top_p)
+
+     return scores
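A toy demonstration of the warper above (a sketch; it assumes the ailia `math_utils.softmax` helper used by logit_process.py is importable): with top_p=0.5, only the highest-probability token of this distribution survives and the rest are masked to -inf.

```python
import numpy as np
from logit_process import TopPLogitsWarper

scores = np.array([[2.0, 1.0, 0.5, -1.0]])         # one batch row of raw logits
print(TopPLogitsWarper(scores.copy(), top_p=0.5))  # [[ 2. -inf -inf -inf]]
```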
models/ailia-models/code/qwen_audio.py ADDED
@@ -0,0 +1,550 @@
+ import os
+ import sys
+ import time
+ from typing import Dict, List, Optional, Tuple
+
+ # logger
+ from logging import getLogger
+
+ import numpy as np
+
+ import ailia
+
+ # import original modules
+ sys.path.append("../../util")
+ from arg_utils import get_base_parser, update_parser  # noqa
+ from model_utils import check_and_download_models, check_and_download_file  # noqa
+ from math_utils import softmax
+
+ from logit_process import logits_processor
+ from audio_utils import process_audio
+
+ logger = getLogger(__name__)
+
+ # ======================
+ # Parameters
+ # ======================
+
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/qwen_audio/"
+
+ AUDIO_PATH = "1272-128104-0000.flac"
+
+ COPY_BLOB_DATA = True
+
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+
+ parser = get_base_parser("Qwen-Audio", AUDIO_PATH, None, large_model=True)
+ parser.add_argument(
+     "-p",
+     "--prompt",
+     type=str,
+     default="what does the person say?",
+     help="prompt",
+ )
+ parser.add_argument(
+     "--disable_ailia_tokenizer", action="store_true", help="disable ailia tokenizer."
+ )
+ parser.add_argument("--onnx", action="store_true", help="execute onnxruntime version.")
+ args = update_parser(parser)
+
+
+ # ======================
+ # Parameters
+ # ======================
+
+ WEIGHT_PATH = "Qwen-Audio-Chat.onnx"
+ WEIGHT_ENC_PATH = "Qwen-Audio-Chat_encode.onnx"
+ MODEL_PATH = "Qwen-Audio-Chat.onnx.prototxt"
+ MODEL_ENC_PATH = "Qwen-Audio-Chat_encode.onnx.prototxt"
+ PB_PATH = "Qwen-Audio-Chat_weights.pb"
+
+ SYSTEM_PROMPT = "You are a helpful assistant."
+
+
+ # ======================
+ # Secondary Functions
+ # ======================
+
+
+ def make_context(
+     tokenizer,
+     query: str,
+     history: List[Tuple[str, str]] = None,
+     system: str = "",
+     max_window_size: int = 6144,
+ ):
+     if history is None:
+         history = []
+
+     im_start, im_end = "<|im_start|>", "<|im_end|>"
+     im_start_tokens = [tokenizer.im_start_id]
+     im_end_tokens = [tokenizer.im_end_id]
+     nl_tokens = tokenizer.encode("\n")
+
+     def _tokenize_str(role, content):
+         audio_info = process_audio(content)
+         return (
+             f"{role}\n{content}",
+             tokenizer.encode(
+                 role, allowed_special=set(tokenizer.AUDIO_ST), audio_info=audio_info
+             )
+             + nl_tokens
+             + tokenizer.encode(
+                 content,
+                 allowed_special=set(tokenizer.AUDIO_ST),
+                 audio_info=audio_info,
+             ),
+         )
+
+     system_text, system_tokens_part = _tokenize_str("system", system)
+     system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
+
+     raw_text = ""
+     context_tokens = []
+
+     for turn_query, turn_response in reversed(history):
+         query_text, query_tokens_part = _tokenize_str("user", turn_query)
+         query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
+         if turn_response is not None:
+             response_text, response_tokens_part = _tokenize_str(
+                 "assistant", turn_response
+             )
+             response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
+
+             next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
+             prev_chat = (
+                 f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
+             )
+         else:
+             next_context_tokens = nl_tokens + query_tokens + nl_tokens
+             prev_chat = f"\n{im_start}{query_text}{im_end}\n"
+
+         current_context_size = (
+             len(system_tokens) + len(next_context_tokens) + len(context_tokens)
+         )
+         if current_context_size < max_window_size:
+             context_tokens = next_context_tokens + context_tokens
+             raw_text = prev_chat + raw_text
+         else:
+             break
+
+     context_tokens = system_tokens + context_tokens
+     raw_text = f"{im_start}{system_text}{im_end}" + raw_text
+     context_tokens += (
+         nl_tokens
+         + im_start_tokens
+         + _tokenize_str("user", query)[1]
+         + im_end_tokens
+         + nl_tokens
+         + im_start_tokens
+         + tokenizer.encode("assistant")
+         + nl_tokens
+     )
+     raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
+
+     return raw_text, context_tokens
+
+
+ def decode_tokens(
+     tokens,
+     tokenizer,
+     raw_text_len: int,
+     context_length: int,
+     verbose: bool = False,
+     errors: str = "replace",
+     audio_info: Dict = None,
+ ) -> str:
+     eod_token_ids = [tokenizer.im_start_id, tokenizer.im_end_id]
+     kwargs = {"audio_info": audio_info}
+
+     end_reason = f"Gen length {len(tokens)}"
+     eod_token_idx = context_length
+     for eod_token_idx in range(context_length, len(tokens)):
+         if tokens[eod_token_idx] in eod_token_ids:
+             end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]], **kwargs)!r}"
+             break
+
+     trim_decode_tokens = tokenizer.decode(
+         tokens[:eod_token_idx], errors=errors, **kwargs
+     )[raw_text_len:]
+
+     if verbose:
+         print(
+             "\nRaw Generate w/o EOD:",
+             tokenizer.decode(tokens, errors=errors, **kwargs)[raw_text_len:],
+         )
+         print("\nRaw Generate:", trim_decode_tokens)
+         print("\nEnd Reason:", end_reason)
+
+     trim_decode_tokens = trim_decode_tokens.strip()
+
+     if verbose:
+         print("\nGenerate:", trim_decode_tokens)
+
+     return trim_decode_tokens
+
+
+ # ======================
+ # Main functions
+ # ======================
+
+
+ def audio_encode(models, input_audios, input_audio_lengths, audio_span_tokens):
+     real_input_audio_lens = input_audio_lengths[:, 0].tolist()
+     max_len_in_batch = max(real_input_audio_lens)
+     padding_mask = np.ones([input_audios.shape[0], max_len_in_batch], dtype=np.float16)
+     for index in range(len(input_audios)):
+         padding_mask[index, : input_audio_lengths[index][0]] = 0
+
+     # feedforward
+     net = models["enc"]
+     if not args.onnx:
+         output = net.predict([input_audios, padding_mask, input_audio_lengths])
+     else:
+         output = net.run(
+             None,
+             {
+                 "input_audios": input_audios,
+                 "padding_mask": padding_mask,
+                 "input_audio_lengths": input_audio_lengths,
+             },
+         )
+     x = output[0]
+
+     # learned embeddings that bracket each encoded audio span
+     bos = np.load(os.path.join(os.path.dirname(__file__), "bos.npy"))
+     eos = np.load(os.path.join(os.path.dirname(__file__), "eos.npy"))
+
+     output_audios = []
+     for i in range(len(audio_span_tokens)):
+         audio_span = audio_span_tokens[i]
+         audio = x[i][: audio_span - 2]
+         audio = np.concatenate([bos, audio, eos])
+         assert len(audio) == audio_span
+         output_audios.append(audio)
+
+     return output_audios
+
+
+ def forward(
+     models,
+     input_ids: np.ndarray,
+     attention_mask: np.ndarray,
+     audio_info: dict,
+     past_key_values: List[np.ndarray],
+     blob_copy: bool,
+ ):
+     audios = audio_info["input_audios"]
+     audio_span_tokens = audio_info["audio_span_tokens"]
+     input_audio_lengths = audio_info["input_audio_lengths"]
+     if 0 < past_key_values[0].shape[1]:
+         # decoding step: audio features were already injected at prefill
+         audios = (
+             np.ones(
+                 (len(audio_span_tokens), input_ids.shape[1], 4096), dtype=np.float16
+             )
+             * -np.inf
+         )
+     else:
+         audio_start_id = 155163
+         bos_pos = np.where(input_ids == audio_start_id)
+         eos_pos = np.where(input_ids == audio_start_id + 1)
+
+         audio_pos = np.stack((bos_pos[0], bos_pos[1], eos_pos[1]), axis=1)
+
+         audios = audio_encode(models, audios, input_audio_lengths, audio_span_tokens)
+         lst = []
+         for idx, (i, a, b) in enumerate(audio_pos):
+             lst.append(
+                 np.concatenate(
+                     [
+                         np.ones((a, 4096), dtype=np.float16) * -np.inf,
+                         audios[idx],
+                         np.ones((input_ids.shape[1] - b - 1, 4096), dtype=np.float16)
+                         * -np.inf,
+                     ]
+                 )
+             )
+         audios = np.stack(lst, axis=0)
+
+     net = models["net"]
+     if not args.onnx:
+         if not blob_copy:
+             output = net.predict(
+                 [
+                     input_ids,
+                     attention_mask,
+                     audios,
+                     *past_key_values,
+                 ]
+             )
+             logits, new_past_key_values = output[0], output[1:]
+         else:
+             NUM_KV = 32
+             key_shapes = [
+                 net.get_blob_shape(
+                     net.find_blob_index_by_name("key_cache_out" + str(i))
+                 )
+                 for i in range(NUM_KV)
+             ]
+             value_shapes = [
+                 net.get_blob_shape(
+                     net.find_blob_index_by_name("value_cache_out" + str(i))
+                 )
+                 for i in range(NUM_KV)
+             ]
+             net.set_input_blob_data(input_ids, net.find_blob_index_by_name("input_ids"))
+             net.set_input_blob_data(
+                 attention_mask, net.find_blob_index_by_name("attention_mask")
+             )
+             net.set_input_blob_data(audios, net.find_blob_index_by_name("audios"))
+             for i in range(NUM_KV):
+                 net.set_input_blob_shape(
+                     key_shapes[i], net.find_blob_index_by_name("key_cache" + str(i))
+                 )
+                 net.set_input_blob_shape(
+                     value_shapes[i], net.find_blob_index_by_name("value_cache" + str(i))
+                 )
+                 net.copy_blob_data("key_cache" + str(i), "key_cache_out" + str(i))
+                 net.copy_blob_data("value_cache" + str(i), "value_cache_out" + str(i))
+             net.update()
+             logits = net.get_blob_data(net.find_blob_index_by_name("logits"))
+             # only the layer-0 key cache is fetched here; once blob copy is
+             # active, the sampling loop consults just past_key_values[0].shape[1]
+             new_past_key_values = [
+                 net.get_blob_data(net.find_blob_index_by_name("key_cache_out0"))
+             ]
+     else:
+         # key/value caches for the 32 decoder layers, interleaved as
+         # [key_cache0, value_cache0, key_cache1, value_cache1, ...]
+         feed = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "audios": audios,
+         }
+         for i in range(32):
+             feed[f"key_cache{i}"] = past_key_values[2 * i]
+             feed[f"value_cache{i}"] = past_key_values[2 * i + 1]
+         output = net.run(None, feed)
+         logits, new_past_key_values = output[0], output[1:]
+
+     return logits, new_past_key_values
+
+
+ def stopping_criteria(input_ids: np.ndarray) -> np.ndarray:
+     max_length = 690
+     cur_len = input_ids.shape[-1]
+     is_done = cur_len >= max_length
+     is_done = np.full(input_ids.shape[0], is_done)
+
+     eos_token_id = np.array([151643])
+     is_done = is_done | np.isin(input_ids[:, -1], eos_token_id)
+
+     return is_done
+
+
+ def sample(models, input_ids, attention_mask, audio_info):
+     pad_token_id = 151643
+
+     past_key_values = [np.zeros((1, 0, 32, 128), dtype=np.float16)] * 64
+
+     # keep track of which sequences are already finished
+     batch_size, cur_len = input_ids.shape
+     this_peer_finished = False
+     unfinished_sequences = np.ones(batch_size, dtype=int)
+     cache_position = (
+         np.cumsum(np.ones_like(input_ids[0, :], dtype=np.int64), axis=0) - 1
+     )
+
+     blob_copy = False
+     while True:
+         # prepare model inputs
+         if 0 < past_key_values[0].shape[1]:
+             model_input_ids = input_ids[:, cache_position]
+         else:
+             model_input_ids = input_ids
+         position_ids = attention_mask.astype(np.int32).cumsum(axis=-1) - 1
+         position_ids = np.where(attention_mask == 0, 1, position_ids)
+         if 0 < past_key_values[0].shape[1]:
+             position_ids = position_ids[:, -1:]
+
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+
+         logits, past_key_values = forward(
+             models,
+             model_input_ids,
+             attention_mask,
+             audio_info,
+             past_key_values,
+             blob_copy,
+         )
+         blob_copy = True if COPY_BLOB_DATA else False
+
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             estimation_time = end - start
+             logger.info(f"\tdecode time {estimation_time} ms")
+
+         attention_mask = np.concatenate(
+             [attention_mask, np.ones((attention_mask.shape[0], 1), dtype=int)],
+             axis=-1,
+         )
+         cache_position = cache_position[-1:] + 1
+
+         next_token_logits = logits[:, -1, :]
+
+         # pre-process distribution
+         next_token_scores = logits_processor(input_ids, next_token_logits)
+
+         # token selection
+         probs = softmax(next_token_scores, axis=-1)
+         next_tokens = np.random.choice(len(probs[0]), size=1, p=probs[0])
+
+         # finished sentences should have their next token be a padding token
+         next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
+             1 - unfinished_sequences
+         )
+
+         # update generated ids, model inputs, and length for next step
+         input_ids = np.concatenate([input_ids, next_tokens[:, None]], axis=-1)
+
+         unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids)
+         this_peer_finished = np.max(unfinished_sequences) == 0
+         cur_len += 1
+
+         if this_peer_finished:
+             break
+
+     return input_ids
+
+
+ def predict(models, query, history: Optional[List[Tuple[str, str]]] = None):
+     if history is None:
+         history = []
+     else:
+         # copy history to avoid modification
+         history = [x for x in history]
+
+     tokenizer = models["tokenizer"]
+     raw_text, context_tokens = make_context(
+         tokenizer,
+         query,
+         history=history,
+         system=SYSTEM_PROMPT,
+     )
+     audio_info = process_audio(raw_text)
+
+     input_ids = np.array([context_tokens])
+     attention_mask = np.ones(input_ids.shape[:2], dtype=np.int64)
+     outputs = sample(models, input_ids, attention_mask, audio_info)
+
+     response = decode_tokens(
+         outputs[0],
+         tokenizer,
+         raw_text_len=len(raw_text),
+         context_length=len(context_tokens),
+         audio_info=audio_info,
+     )
+
+     history.append((query, response))
+
+     return response, history
+
+
+ def recognize(models):
+     prompt = args.prompt
+     audio_urls = args.input
+
+     logger.info("Prompt: %s" % prompt)
+
+     tokenizer = models["tokenizer"]
+     query = tokenizer.from_list_format(
+         [{"audio": input} for input in audio_urls] + [{"text": prompt}],
+     )
+
+     # inference
+     logger.info("Start inference...")
+     if args.benchmark:
+         logger.info("BENCHMARK mode")
+         total_time_estimation = 0
+         for i in range(args.benchmark_count):
+             start = int(round(time.time() * 1000))
+             response, history = predict(models, query)
+             end = int(round(time.time() * 1000))
+             estimation_time = end - start
+
+             # Logging
+             logger.info(f"\tailia processing estimation time {estimation_time} ms")
+             if i != 0:
+                 total_time_estimation = total_time_estimation + estimation_time
+
+         logger.info(
+             f"\taverage time estimation {total_time_estimation / (args.benchmark_count - 1)} ms"
+         )
+     else:
+         response, history = predict(models, query)
+
+     # # 2nd dialogue turn
+     # print(response)
+     # query = 'Find the start time and end time of the word "middle classes"'
+     # response, history = predict(models, query, history=history)
+
+     print(response)
+
+     logger.info("Script finished successfully.")
+
+
+ def main():
+     check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
+     check_and_download_models(WEIGHT_ENC_PATH, MODEL_ENC_PATH, REMOTE_PATH)
+     check_and_download_file(PB_PATH, REMOTE_PATH)
+
+     env_id = args.env_id
+
+     # initialize
+     if not args.onnx:
+         memory_mode = ailia.get_memory_mode(
+             reduce_constant=True,
+             ignore_input_with_initializer=True,
+             reduce_interstage=False,
+             reuse_interstage=True,
+         )
+         enc = ailia.Net(
+             MODEL_ENC_PATH, WEIGHT_ENC_PATH, env_id=env_id, memory_mode=memory_mode
+         )
+         net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id, memory_mode=memory_mode)
+     else:
+         import onnxruntime
+
+         providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+
+         enc = onnxruntime.InferenceSession(WEIGHT_ENC_PATH, providers=providers)
+         net = onnxruntime.InferenceSession(WEIGHT_PATH, providers=providers)
+
+     # the ailia tokenizer path is not implemented yet; force the transformers tokenizer
+     args.disable_ailia_tokenizer = True
+     if args.disable_ailia_tokenizer:
+         import transformers
+
+         tokenizer = transformers.AutoTokenizer.from_pretrained(
+             "./tokenizer", trust_remote_code=True
+         )
+     else:
+         raise NotImplementedError
+
+     models = {
+         "tokenizer": tokenizer,
+         "enc": enc,
+         "net": net,
+     }
+
+     # generate
+     recognize(models)
+
+
+ if __name__ == "__main__":
+     main()
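For reference, make_context() above assembles a ChatML-style prompt; for the default run, the raw text looks roughly like the sketch below (at the token level, the <audio>...</audio> span is replaced by audio pad tokens before the model sees it):

```python
# sketch of the prompt string built by make_context() for the default query
raw_text = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
    "\n<|im_start|>user\nAudio 1:<audio>1272-128104-0000.flac</audio>\n"
    "what does the person say?<|im_end|>\n<|im_start|>assistant\n"
)
print(raw_text)
```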
models/ailia-models/code/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ tiktoken
+ librosa
models/ailia-models/code/tokenizer/qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/code/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
models/ailia-models/code/tokenizer/tokenization_qwen.py ADDED
@@ -0,0 +1,575 @@
+ # Copyright (c) Alibaba Cloud.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Tokenization classes for QWen."""
+
+ import base64
+ import logging
+ import os
+ import re
+ import itertools
+
+ import requests
+ import unicodedata
+ from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
+
+ import tiktoken
+ import numpy as np
+
+ from transformers import PreTrainedTokenizer, AddedToken
+ from transformers.utils import try_to_load_from_cache
+ from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy, \
+     TextInput, TextInputPair, PreTokenizedInput, PreTokenizedInputPair, TensorType, EncodedInput, EncodedInputPair
+
+ logger = logging.getLogger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
+
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ ENDOFTEXT = "<|endoftext|>"
+ IMSTART = "<|im_start|>"
+ IMEND = "<|im_end|>"
+ # as the default behavior is changed to allow special tokens in
+ # regular texts, the surface forms of special tokens need to be
+ # as different as possible to minimize the impact
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+ SPECIAL_TOKENS = (
+     ENDOFTEXT,
+     IMSTART,
+     IMEND,
+ ) + EXTRAS
+
+ LANGUAGES = {
+     "en": "english",
+     "zh": "chinese",
+     "de": "german",
+     "es": "spanish",
+     "ko": "korean",
+     "fr": "french",
+     "ja": "japanese",
+     "it": "italian",
+ }
+
+
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+     with open(tiktoken_bpe_file, "rb") as f:
+         contents = f.read()
+     return {
+         base64.b64decode(token): int(rank)
+         for token, rank in (line.split() for line in contents.splitlines() if line)
+     }
+
+
+ def _list_find(
+     input_list: List[Any],
+     candidates: Tuple[Any],
+     start: int = 0,
+ ):
+     for i in range(start, len(input_list)):
+         if input_list[i] in candidates:
+             return i
+     return -1
+
+
+ def _replace_closed_tag(
+     input_tokens: List[Any],
+     start_tags: Union[Any, Tuple[Any]],
+     end_tags: Union[Any, Tuple[Any]],
+     inclusive_replace_func: Callable,
+     exclusive_replace_func: Callable = lambda x: x,
+     audio_info: Dict = None
+ ):
+     if isinstance(start_tags, (str, int)):
+         start_tags = (start_tags,)
+     if isinstance(end_tags, (str, int)):
+         end_tags = (end_tags,)
+     assert len(start_tags) == len(end_tags)
+
+     output_tokens = []
+     end = 0
+     audio_idx = 0
+     while True:
+         start = _list_find(input_tokens, start_tags, end)
+         if start == -1:
+             break
+         output_tokens.extend(exclusive_replace_func(input_tokens[end: start]))
+         tag_idx = start_tags.index(input_tokens[start])
+         end = _list_find(input_tokens, (end_tags[tag_idx],), start)
+         if end == -1:
+             raise ValueError("Unclosed audio token")
+         output_tokens.extend(inclusive_replace_func(input_tokens[start: end + 1], audio_info, audio_idx))
+         end += 1
+         audio_idx += 1
+     output_tokens.extend(exclusive_replace_func(input_tokens[end:]))
+     return output_tokens
+
+
+ class QWenTokenizer(PreTrainedTokenizer):
+     """QWen tokenizer."""
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     def __init__(
+         self,
+         vocab_file,
+         errors="replace",
+         audio_start_tag='<audio>',
+         audio_end_tag='</audio>',
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.audio_start_tag = audio_start_tag
+         self.audio_end_tag = audio_end_tag
+         self.audio_pad_tag = "[[[AUDIO:modality]]]"
+
+         self.AUDIO_ST = (
+             '[[[AUDIO:modality]]]',
+             # Transcription Tag
+             "<|startoftranscript|>",  # Transcription
+             "<|startofanalysis|>",  # Analysis
+             # Task Tag
+             "<|translate|>",
+             "<|transcribe|>",
+             "<|caption|>",
+             "<|keyword|>",
+             # Language Tag
+             "<|unknown|>",  # unknown language
+             *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+             "<|zh_tr|>",  # traditional Chinese
+             # Timestamps Tag
+             "<|notimestamps|>",
+             "<|sil|>",
+             "<|timestamps|>",
+             *[f"<|{i * 0.01:.2f}|>" for i in range(3001)],  # timestamps 0.00-30.00
+             # Output Instruction
+             "<|caption_audiocaps|>",  # Audiocaps caption style
+             "<|caption_clotho|>",  # Clotho caption style
+             "<|audioset_ontology|>",  # Audioset ontology style
+             "<|caption_plain|>",  # plain caption
+             "<|itn|>",  # inverse text normalization
+             "<|wo_itn|>",  # without inverse text normalization
+             "<|startofentityvalue|>",
+             "<|endofentityvalue|>",
+             "<|startofentitytype|>",
+             "<|endofentitytype|>",
+             "<|named_entity_recognition|>",  # named entity recognition task
+             "<|audio_grounding|>",
+             "<|startofword|>",
+             "<|endofword|>",
+             "<|delim|>",  # delimiter of timestamps pair in audio grounding
+             "<|emotion_recognition|>",  # emotion recognition
+             "<|music_description|>",  # music description
+             "<|note_analysis|>",  # note analysis
+             "<|pitch|>",  # note analysis: pitch
+             *[f"<|midi_pitch_{i}|>" for i in range(128)],  # midi pitch 0-127
+             "<|velocity|>",  # note analysis: velocity
+             *[f"<|midi_velocity_{i}|>" for i in range(128)],  # midi velocity 0-127
+             "<|sonic|>",  # note analysis: sonic
+             "<|instrument|>",  # note analysis: instrument
+             "<|speaker_meta|>",  # meta information of speaker
+             "<|song_meta|>",  # meta information of song
+             "<|question|>",  # AQA: question
+             "<|answer|>",  # AQA: answer
+             "<|choice|>",  # AQA: answer choice
+             "<|scene|>",  # scene recognition
+             "<|event|>",  # sound event
+             "<|vocal_classification|>",  # vocal classification
+             "<|speech_understanding|>",  # speech language understanding
+             "<|scenario|>",  # speech language understanding: scenario
+             "<|action|>",  # speech language understanding: action
+             "<|entities|>",  # speech language understanding: entities
+             "<|speech_edit|>",  # speech edit
+             audio_start_tag,
+             audio_end_tag
+         )
+
+         self.errors = errors  # how to handle errors in decoding
+
+         self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+         self.special_tokens = {
+             token: index
+             for index, token in enumerate(
+                 SPECIAL_TOKENS + self.AUDIO_ST, start=len(self.mergeable_ranks)
+             )
+         }
+         self.audio_start_id = self.special_tokens[self.audio_start_tag]
+         self.audio_end_id = self.special_tokens[self.audio_end_tag]
+         self.audio_pad_id = self.special_tokens[self.audio_pad_tag]
+         print(f"audio_start_id: {self.audio_start_id}, "
+               f"audio_end_id: {self.audio_end_id}, "
+               f"audio_pad_id: {self.audio_pad_id}.")
+
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         assert (
+             len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+         ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+         self.decoder = {
+             v: k for k, v in self.mergeable_ranks.items()
+         }  # type: dict[int, bytes|str]
+         self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+         self.tokenizer = enc  # type: tiktoken.Encoding
+
+         self.eod_id = self.tokenizer.eot_token
+         self.im_start_id = self.special_tokens[IMSTART]
+         self.im_end_id = self.special_tokens[IMEND]
+
+     def __getstate__(self):
+         # for pickle lovers
+         state = self.__dict__.copy()
+         del state['tokenizer']
+         return state
+
+     def __setstate__(self, state):
+         # tokenizer is not python native; don't pass it; rebuild it
+         self.__dict__.update(state)
+         enc = tiktoken.Encoding(
+             "Qwen",
+             pat_str=PAT_STR,
+             mergeable_ranks=self.mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         self.tokenizer = enc
+
+     def __len__(self) -> int:
+         return self.tokenizer.n_vocab
+
+     def get_vocab(self) -> Dict[bytes, int]:
+         return self.mergeable_ranks
+
+     def convert_tokens_to_ids(
+         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+     ) -> List[int]:
+         ids = []
+         if isinstance(tokens, (str, bytes)):
+             if tokens in self.special_tokens:
+                 return self.special_tokens[tokens]
+             else:
+                 return self.mergeable_ranks.get(tokens)
+         for token in tokens:
+             if token in self.special_tokens:
+                 ids.append(self.special_tokens[token])
+             else:
+                 ids.append(self.mergeable_ranks.get(token))
+         return ids
+
+     def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+         if not special_tokens and new_tokens:
+             raise ValueError('Adding regular tokens is not supported')
+         for token in new_tokens:
+             surface_form = token.content if isinstance(token, AddedToken) else token
+             if surface_form not in SPECIAL_TOKENS + self.AUDIO_ST:
+                 raise ValueError('Adding unknown special tokens is not supported')
+         return 0
+
+     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (the BPE ranks).
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         file_path = os.path.join(save_directory, "qwen.tiktoken")
+         with open(file_path, "w", encoding="utf8") as w:
+             for k, v in self.mergeable_ranks.items():
+                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                 w.write(line)
+         return (file_path,)
+
+     def tokenize(
+         self,
+         text: str,
+         allowed_special: Union[Set, str] = "all",
+         disallowed_special: Union[Collection, str] = (),
+         audio_info: Dict = None,
+         **kwargs,
+     ) -> List[Union[bytes, str]]:
+         """
+         Converts a string into a sequence of tokens.
+
+         Args:
+             text (`str`):
+                 The sequence to be encoded.
+             allowed_special (`Literal["all"]` or `set`):
+                 The surface forms of the tokens to be encoded as special tokens in regular texts.
+                 Default to "all".
+             disallowed_special (`Literal["all"]` or `Collection`):
+                 The surface forms of the tokens that should not be in regular texts and trigger errors.
+                 Default to an empty tuple.
+
+             kwargs (additional keyword arguments, *optional*):
+                 Will be passed to the underlying model specific encode method.
+
+         Returns:
+             `List[bytes|str]`: The list of tokens.
+         """
+         tokens = []
+         text = unicodedata.normalize("NFC", text)
+
+         # this implementation takes a detour: text -> token id -> token surface forms
+         for t in self.tokenizer.encode(
+             text, allowed_special=allowed_special, disallowed_special=disallowed_special
+         ):
+             tokens.append(self.decoder[t])
+
+         def _encode_audiourl(audio_tokens, audio_info, audio_idx):
+             assert audio_tokens[0] == self.audio_start_tag and audio_tokens[-1] == self.audio_end_tag
+             audio_token_span = audio_info['audio_span_tokens'][audio_idx]
+             out_audio_tokens = [self.audio_start_tag] + [self.audio_pad_tag] * (audio_token_span - 2) + [
+                 self.audio_end_tag]
+             return out_audio_tokens
+
+         return _replace_closed_tag(tokens, self.audio_start_tag, self.audio_end_tag, _encode_audiourl,
+                                    audio_info=audio_info)
+
+     def _batch_encode_plus(
+         self,
+         batch_text_or_text_pairs: Union[
+             List[TextInput],
+             List[TextInputPair],
+             List[PreTokenizedInput],
+             List[PreTokenizedInputPair],
+             List[EncodedInput],
+             List[EncodedInputPair],
+         ],
+         add_special_tokens: bool = True,
+         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+         max_length: Optional[int] = None,
+         stride: int = 0,
+         is_split_into_words: bool = False,
+         pad_to_multiple_of: Optional[int] = None,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         return_token_type_ids: Optional[bool] = None,
+         return_attention_mask: Optional[bool] = None,
+         return_overflowing_tokens: bool = False,
+         return_special_tokens_mask: bool = False,
+         return_offsets_mapping: bool = False,
+         return_length: bool = False,
+         verbose: bool = True,
+         **kwargs,
+     ) -> BatchEncoding:
+
+         def get_input_ids(text):
+             if isinstance(text, str):
+                 tokens = self.tokenize(text, **kwargs)
+                 return self.convert_tokens_to_ids(tokens)
+             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
+                 if is_split_into_words:
+                     tokens = list(
+                         itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                     )
+                     return self.convert_tokens_to_ids(tokens)
+                 else:
+                     return self.convert_tokens_to_ids(text)
+             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                 return text
+             else:
+                 raise ValueError(
+                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
+                 )
+
+         if return_offsets_mapping:
+             raise NotImplementedError(
+                 "return_offset_mapping is not available when using Python tokenizers. "
+                 "To use this feature, change your tokenizer to one deriving from "
+                 "transformers.PreTrainedTokenizerFast."
+             )
+
+         input_ids = []
+         audio_info = kwargs.pop("audio_info", None)
+         for pair_id in range(len(batch_text_or_text_pairs)):
+             kwargs['audio_info'] = audio_info[pair_id]
+             ids_or_pair_ids = batch_text_or_text_pairs[pair_id]
+             # for ids_or_pair_ids in batch_text_or_text_pairs:
+             if not isinstance(ids_or_pair_ids, (list, tuple)):
+                 ids, pair_ids = ids_or_pair_ids, None
+             elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
+                 ids, pair_ids = ids_or_pair_ids, None
+             else:
+                 ids, pair_ids = ids_or_pair_ids
+
+             first_ids = get_input_ids(ids)
+             second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
+             input_ids.append((first_ids, second_ids))
+
+         batch_outputs = self._batch_prepare_for_model(
+             input_ids,
+             add_special_tokens=add_special_tokens,
+             padding_strategy=padding_strategy,
+             truncation_strategy=truncation_strategy,
+             max_length=max_length,
+             stride=stride,
+             pad_to_multiple_of=pad_to_multiple_of,
+             return_attention_mask=return_attention_mask,
+             return_token_type_ids=return_token_type_ids,
+             return_overflowing_tokens=return_overflowing_tokens,
+             return_special_tokens_mask=return_special_tokens_mask,
+             return_length=return_length,
+             return_tensors=return_tensors,
+             verbose=verbose,
+         )
+
+         return BatchEncoding(batch_outputs)
+
+     def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+         """
+         Converts a sequence of tokens into a single string.
+         """
+         text = ""
+         temp = b""
+         for t in tokens:
+             if isinstance(t, str):
+                 if temp:
+                     text += temp.decode("utf-8", errors=self.errors)
+                     temp = b""
+                 text += t
+             elif isinstance(t, bytes):
+                 temp += t
+             else:
+                 raise TypeError("token should only be of type bytes or str")
+         if temp:
+             text += temp.decode("utf-8", errors=self.errors)
+         return text
+
+     @property
+     def vocab_size(self):
+         return self.tokenizer.n_vocab
+
+     def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+         """Converts an id to a token, special tokens included"""
+         if index in self.decoder:
+             return self.decoder[index]
+         raise ValueError("unknown ids")
+
+     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+         """Converts a token to an id using the vocab, special tokens included"""
+         if token in self.special_tokens:
+             return self.special_tokens[token]
+         if token in self.mergeable_ranks:
+             return self.mergeable_ranks[token]
+         raise ValueError("unknown token")
+
+     def _tokenize(self, text: str, **kwargs):
+         """
+         Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
+         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+         Do NOT take care of added tokens.
+         """
+         raise NotImplementedError
+
+     def _decode(
+         self,
+         token_ids: Union[int, List[int]],
+         skip_special_tokens: bool = False,
+         errors: str = None,
+         **kwargs,
+     ) -> str:
+         if isinstance(token_ids, int):
+             token_ids = [token_ids]
+         audio_info = kwargs.pop("audio_info", None)
+
+         def _decode_audiourl(audio_token_ids, audio_info, audio_idx):
+             assert audio_token_ids[0] == self.audio_start_id and audio_token_ids[-1] == self.audio_end_id
+             audio_url = audio_info["audio_urls"][audio_idx]
+             return [self.audio_start_id] + self.tokenizer.encode(audio_url) + [self.audio_end_id]
+
+         token_ids = _replace_closed_tag(token_ids, self.audio_start_id, self.audio_end_id, _decode_audiourl,
+                                         audio_info=audio_info)
+
+         if skip_special_tokens:
+             token_ids = [i for i in token_ids if i < self.eod_id]
+         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+
+     def to_list_format(self, text: str):
+         text = unicodedata.normalize("NFC", text)
+         token_ids = self.tokenizer.encode(
+             text, allowed_special=set(self.AUDIO_ST + (ENDOFTEXT,)))
+
+         def _encode_audio_info(tokens):
+             if len(tokens) == 0:
+                 return []
+             if tokens[0] == self.audio_start_id and tokens[-1] == self.audio_end_id:
+                 key = 'audio'
+             else:
+                 _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
+                 return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
+             _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
+             val = b''.join(map(_tobytes, map(self.decoder.get, tokens[1:-1]))).decode('utf-8')
+             return [{key: val}]
+
+         return _replace_closed_tag(
+             token_ids,
+             (self.audio_start_id),
+             (self.audio_end_id),
+             _encode_audio_info,
+             _encode_audio_info,
+         )
+
+     def from_list_format(self, list_format: List[Dict]):
+         text = ''
+         num_audios = 0
+         for ele in list_format:
+             if 'audio' in ele:
+                 num_audios += 1
+                 text += f'Audio {num_audios}:'
+                 text += self.audio_start_tag + ele['audio'] + self.audio_end_tag
+                 text += '\n'
+             elif 'text' in ele:
+                 text += ele['text']
+             elif 'box' in ele:
+                 # NOTE: ref/box tags are remnants of Qwen-VL; this class does not define them
+                 if 'ref' in ele:
+                     text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
+                 for box in ele['box']:
+                     text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
+             else:
+                 raise ValueError("Unsupported element: " + str(ele))
+         return text
+
+     def extract_audio_urls(self, text):
+         pattern = rf"{self.audio_start_tag}(.*?){self.audio_end_tag}"
+         return re.findall(pattern, text)
+
+     # NOTE: carried over from the original Qwen-Audio implementation; it relies on
+     # torch and on audio helpers (load_audio, load_bytesio_audio, pad_or_trim,
+     # log_mel_spectrogram, get_T_after_cnn) that are not imported in this file.
+     # The ailia port uses audio_utils.process_audio instead.
+     def process_audio(self, text):
+         audio_urls = self.extract_audio_urls(text)
+         if len(audio_urls) > 0:
+             audios, audio_lens, audio_span_tokens = [], [], []
+             for audio_path in audio_urls:
+                 if audio_path.startswith("http://") or audio_path.startswith("https://"):  # http
+                     data = bytes(requests.get(audio_path, stream=True).content)
+                     audio = load_bytesio_audio(data)
+                 else:
+                     audio = load_audio(audio_path)
+                 L = (audio.shape[0] if audio.shape[0] <= 480000 else 480000)  # max_length < 30s
+                 mel_len = L // 160
+                 audio = pad_or_trim(audio.flatten())
+                 mel = log_mel_spectrogram(audio)
+                 audio_len_after_cnn = get_T_after_cnn(mel_len)
+                 audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
+                 audio_len = [audio_len_after_cnn, audio_token_num]
+                 audios.append(mel)
+                 audio_lens.append(audio_len)
+                 audio_span_tokens.append(audio_token_num + 2)  # add audio bos eos
+             input_audio_lengths = torch.IntTensor(audio_lens)
+             input_audios = torch.stack(audios, dim=0)
+             return {"input_audios": input_audios,
+                     "input_audio_lengths": input_audio_lengths,
+                     "audio_span_tokens": audio_span_tokens,
+                     "audio_urls": audio_urls}
+         else:
+             return None
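A short loading sketch mirroring main() in qwen_audio.py; the encode call below uses the raw tiktoken encoding, with no audio tags involved:

```python
import transformers

# trust_remote_code resolves the QWenTokenizer class defined above
tokenizer = transformers.AutoTokenizer.from_pretrained("./tokenizer", trust_remote_code=True)
ids = tokenizer.tokenizer.encode("mister quilter is the apostle of the middle classes")
print(ids, tokenizer.decode(ids))
```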
models/ailia-models/code/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "added_tokens_decoder": {},
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_qwen.QWenTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "extra_special_tokens": {},
+   "model_max_length": 2048,
+   "tokenizer_class": "QWenTokenizer"
+ }
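The `auto_map` entry is what lets `transformers.AutoTokenizer.from_pretrained("./tokenizer", trust_remote_code=True)` in qwen_audio.py resolve to the custom `QWenTokenizer` class from tokenization_qwen.py in this directory.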
models/ailia-models/source.txt ADDED
@@ -0,0 +1,7 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_language_model/qwen_audio
+
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat_encode.onnx
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat_encode.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat.onnx
+ https://storage.googleapis.com/ailia-models/qwen_audio/Qwen-Audio-Chat.onnx.prototxt
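The model files listed above can also be fetched directly, e.g. with a small sketch like this (the ailia `model_utils.check_and_download_models` helper used by qwen_audio.py does this automatically on the first run):

```python
import urllib.request

base = "https://storage.googleapis.com/ailia-models/qwen_audio/"
for name in (
    "Qwen-Audio-Chat_encode.onnx",
    "Qwen-Audio-Chat_encode.onnx.prototxt",
    "Qwen-Audio-Chat.onnx",
    "Qwen-Audio-Chat.onnx.prototxt",
):
    urllib.request.urlretrieve(base + name, name)  # saves next to the script
```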