niobures committed
Commit 9479f69 · verified · 1 Parent(s): a02fb17

GPT-SoVITS (models_onnx)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. models/ailia-models/GPT-SoVITS/cnhubert.onnx +3 -0
  3. models/ailia-models/GPT-SoVITS/cnhubert.onnx.prototxt +0 -0
  4. models/ailia-models/GPT-SoVITS/code/LICENSE +21 -0
  5. models/ailia-models/GPT-SoVITS/code/README.md +58 -0
  6. models/ailia-models/GPT-SoVITS/code/colab.ipynb +0 -0
  7. models/ailia-models/GPT-SoVITS/code/gpt-sovits.py +383 -0
  8. models/ailia-models/GPT-SoVITS/code/reference_audio_captured_by_ax.wav +3 -0
  9. models/ailia-models/GPT-SoVITS/code/requirements.txt +6 -0
  10. models/ailia-models/GPT-SoVITS/code/text/__init__.py +15 -0
  11. models/ailia-models/GPT-SoVITS/code/text/cmudict.rep +0 -0
  12. models/ailia-models/GPT-SoVITS/code/text/cmudict_cache.pickle +3 -0
  13. models/ailia-models/GPT-SoVITS/code/text/english.py +107 -0
  14. models/ailia-models/GPT-SoVITS/code/text/japanese.py +191 -0
  15. models/ailia-models/GPT-SoVITS/code/text/symbols.py +401 -0
  16. models/ailia-models/GPT-SoVITS/source.txt +26 -0
  17. models/ailia-models/GPT-SoVITS/t2s_encoder.onnx +3 -0
  18. models/ailia-models/GPT-SoVITS/t2s_encoder.onnx.prototxt +2816 -0
  19. models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx +3 -0
  20. models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx.prototxt +0 -0
  21. models/ailia-models/GPT-SoVITS/t2s_sdec.onnx +3 -0
  22. models/ailia-models/GPT-SoVITS/t2s_sdec.onnx.prototxt +0 -0
  23. models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx +3 -0
  24. models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx.prototxt +0 -0
  25. models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx +3 -0
  26. models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx.prototxt +0 -0
  27. models/ailia-models/GPT-SoVITS/vits.onnx +3 -0
  28. models/ailia-models/GPT-SoVITS/vits.onnx.prototxt +0 -0
  29. models/ailia-models/GPT-SoVITS2/cnhubert.onnx +3 -0
  30. models/ailia-models/GPT-SoVITS2/cnhubert.onnx.prototxt +0 -0
  31. models/ailia-models/GPT-SoVITS2/code/LICENSE +21 -0
  32. models/ailia-models/GPT-SoVITS2/code/README.md +53 -0
  33. models/ailia-models/GPT-SoVITS2/code/gpt-sovits-v2.py +632 -0
  34. models/ailia-models/GPT-SoVITS2/code/reference_audio_captured_by_ax.wav +3 -0
  35. models/ailia-models/GPT-SoVITS2/code/requirements.txt +7 -0
  36. models/ailia-models/GPT-SoVITS2/code/text/__init__.py +15 -0
  37. models/ailia-models/GPT-SoVITS2/code/text/cleaner.py +32 -0
  38. models/ailia-models/GPT-SoVITS2/code/text/cmudict-fast.rep +0 -0
  39. models/ailia-models/GPT-SoVITS2/code/text/cmudict.rep +0 -0
  40. models/ailia-models/GPT-SoVITS2/code/text/engdict-hot.rep +3 -0
  41. models/ailia-models/GPT-SoVITS2/code/text/english.py +393 -0
  42. models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/user.dict +0 -0
  43. models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.csv +1 -0
  44. models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.md5 +1 -0
  45. models/ailia-models/GPT-SoVITS2/code/text/japanese.py +207 -0
  46. models/ailia-models/GPT-SoVITS2/code/text/namedict_cache.pickle +3 -0
  47. models/ailia-models/GPT-SoVITS2/code/text/symbols2.py +785 -0
  48. models/ailia-models/GPT-SoVITS2/source.txt +18 -0
  49. models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx +3 -0
  50. models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx.prototxt +2293 -0
.gitattributes CHANGED
@@ -34,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  GPT-SoVITS.[[:space:]]A[[:space:]]Zero-Shot[[:space:]]Speech[[:space:]]Synthesis[[:space:]]Model[[:space:]]with[[:space:]]Customizable[[:space:]]Fine-Tuning.pdf filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS/code/reference_audio_captured_by_ax.wav filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS2/code/reference_audio_captured_by_ax.wav filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS3/code/reference_audio_captured_by_ax.wav filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS3/code/text/ja_userdic/userdict.csv filter=lfs diff=lfs merge=lfs -text
models/ailia-models/GPT-SoVITS/cnhubert.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:558e4aabf7a7d1ef8ad89c0983a4a6413f9f4489232a35b4c1d455575f6cc242
+ size 377745020
models/ailia-models/GPT-SoVITS/cnhubert.onnx.prototxt ADDED
The diff for this file is too large to render.
models/ailia-models/GPT-SoVITS/code/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 RVC-Boss
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
models/ailia-models/GPT-SoVITS/code/README.md ADDED
@@ -0,0 +1,58 @@
+ # GPT-SoVITS
+
+ ### Input
+ - Text to synthesize, plus a reference audio clip and its reference text for voice cloning
+
+ ### Output
+ The synthesized voice is written as a .wav file whose path is defined by `SAVE_WAV_PATH` in `gpt-sovits.py`.
+
+ ### Requirements
+ This model requires pyopenjtalk for g2p.
+
+ ```
+ pip3 install -r requirements.txt
+ ```
+
+ ### Usage
+ The onnx and prototxt files are downloaded automatically on the first run; an Internet connection is required during the download.
+
+ To run with the sample sentence and sample audio:
+ ```
+ python3 gpt-sovits.py
+ ```
+
+ To run with an audio prompt:
+
+ ```
+ python3 gpt-sovits.py -i "音声合成のテストを行なっています。" --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。"
+ ```
+
+ To run for English:
+
+ ```
+ python3 gpt-sovits.py -i "Hello world. We are testing speech synthesis." --text_language en --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。" --ref_language ja
+ ```
+
+ ### Reference
+ [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+
+ ### Framework
+ PyTorch 2.1.2
+
+ ### Model Format
+ ONNX opset = 17
+
+ ### Netron
+
+ #### Normal model
+
+ - [cnhubert.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/cnhubert.onnx.prototxt)
+ - [t2s_encoder.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_encoder.onnx.prototxt)
+ - [t2s_fsdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_fsdec.onnx.prototxt)
+ - [t2s_sdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.onnx.prototxt)
+ - [vits.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/vits.onnx.prototxt)
+
+ #### Optimized model
+
+ - [t2s_sdec.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt.onnx.prototxt)
+ - [t2s_sdec.opt2.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt2.onnx.prototxt)
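
Taken together, the files in this commit form a five-model pipeline: cnhubert extracts SSL features from the reference audio, t2s_encoder fuses the phoneme sequences with those features, t2s_fsdec runs the first decoding step, t2s_sdec is run autoregressively to emit semantic tokens, and vits renders the waveform. The sketch below condenses the call chain of `gpt-sovits.py` (shown further down) into plain onnxruntime calls; the tensor names, shapes, and constants are taken from that script, the rest is illustrative and omits the early-stop check and the ailia blob-copy path.

```
# Minimal sketch of the GPT-SoVITS ONNX pipeline, assuming the five model
# files from this commit sit in the working directory.
import numpy as np
import onnxruntime

ssl = onnxruntime.InferenceSession("cnhubert.onnx")
enc = onnxruntime.InferenceSession("t2s_encoder.onnx")
fsdec = onnxruntime.InferenceSession("t2s_fsdec.onnx")
sdec = onnxruntime.InferenceSession("t2s_sdec.onnx")
vits = onnxruntime.InferenceSession("vits.onnx")

def synthesize(ref_seq, text_seq, ref_audio_16k, ref_audio_32k):
    # Empty BERT features are used for ja/en, as in the script.
    ref_bert = np.zeros((ref_seq.shape[1], 1024), dtype=np.float32)
    text_bert = np.zeros((text_seq.shape[1], 1024), dtype=np.float32)
    sampling = {"top_k": np.array([5], dtype=np.int64),
                "top_p": np.array([1.0], dtype=np.float32),
                "temperature": np.array([1.0], dtype=np.float32),
                "repetition_penalty": np.array([1.35], dtype=np.float32)}

    ssl_content = ssl.run(None, {"ref_audio_16k": ref_audio_16k})[0]
    x, prompts = enc.run(None, {"ref_seq": ref_seq, "text_seq": text_seq,
                                "ref_bert": ref_bert, "text_bert": text_bert,
                                "ssl_content": ssl_content})
    y, k, v, y_emb, x_example = fsdec.run(None, {"x": x, "prompts": prompts, **sampling})
    for idx in range(1, 1500):  # autoregressive semantic-token decoding
        y, k, v, y_emb, logits, samples = sdec.run(None, {
            "iy": y, "ik": k, "iv": v, "iy_emb": y_emb,
            "ix_example": x_example, **sampling})
        if np.argmax(logits, axis=-1)[0] == 1024 or samples[0, 0] == 1024:  # EOS
            break
    y[0, -1] = 0  # zero out the trailing EOS token, as the script does
    pred_semantic = y[np.newaxis, :, -idx:-1]
    audio = vits.run(None, {"text_seq": text_seq, "pred_semantic": pred_semantic,
                            "ref_audio": ref_audio_32k})[0]
    return audio  # float waveform at 32 kHz
```

The ailia path in the actual script additionally reuses the decoder's KV-cache blobs between iterations (`copy_blob_data`) instead of feeding k/v back through host memory.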
models/ailia-models/GPT-SoVITS/code/colab.ipynb ADDED
The diff for this file is too large to render.
models/ailia-models/GPT-SoVITS/code/gpt-sovits.py ADDED
@@ -0,0 +1,383 @@
+ import time
+ import sys
+ import platform
+
+ import numpy as np
+ import soundfile
+ import librosa
+
+ # import original modules
+ sys.path.append('../../util')
+ from arg_utils import get_base_parser, update_parser, get_savepath  # noqa: E402
+ from model_utils import check_and_download_models  # noqa: E402
+
+ # logger
+ from logging import getLogger  # noqa: E402
+ logger = getLogger(__name__)
+
+ from text import cleaned_text_to_sequence
+
+ # ======================
+ # PARAMETERS
+ # ======================
+
+ SAVE_WAV_PATH = 'output.wav'
+ REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/gpt-sovits/'
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+
+ parser = get_base_parser('GPT-SoVITS', None, SAVE_WAV_PATH)
+ # overwrite
+ parser.add_argument(
+     '--input', '-i', metavar='TEXT', default="ax株式会社ではAIの実用化のための技術を開発しています。",
+     help='input text'
+ )
+ parser.add_argument(
+     '--text_language', '-tl',
+     default='ja',
+     help='[ja, en]'
+ )
+ parser.add_argument(
+     '--ref_audio', '-ra', metavar='TEXT', default="reference_audio_captured_by_ax.wav",
+     help='ref audio'
+ )
+ parser.add_argument(
+     '--ref_text', '-rt', metavar='TEXT', default="水をマレーシアから買わなくてはならない。",
+     help='ref text'
+ )
+ parser.add_argument(
+     '--ref_language', '-rl',
+     default='ja',
+     help='[ja, en]'
+ )
+ parser.add_argument(
+     '--onnx', action='store_true',
+     help='use onnx runtime'
+ )
+ parser.add_argument(
+     '--normal', action='store_true',
+     help='use normal model'
+ )
+ parser.add_argument(
+     '--profile', action='store_true',
+     help='use profile mode'
+ )
+ parser.add_argument(
+     '--ailia_voice', action='store_true',
+     help='use ailia voice for G2P'
+ )
+ args = update_parser(parser, check_input_type=False)
+
+ WEIGHT_PATH_SSL = 'cnhubert.onnx'
+ WEIGHT_PATH_T2S_ENCODER = 't2s_encoder.onnx'
+ WEIGHT_PATH_T2S_FIRST_DECODER = 't2s_fsdec.onnx'
+ if args.normal:
+     WEIGHT_PATH_T2S_STAGE_DECODER = 't2s_sdec.onnx'
+ else:
+     WEIGHT_PATH_T2S_STAGE_DECODER = 't2s_sdec.opt3.onnx'
+ WEIGHT_PATH_VITS = 'vits.onnx'
+
+ MODEL_PATH_SSL = WEIGHT_PATH_SSL + '.prototxt'
+ MODEL_PATH_T2S_ENCODER = WEIGHT_PATH_T2S_ENCODER + '.prototxt'
+ MODEL_PATH_T2S_FIRST_DECODER = WEIGHT_PATH_T2S_FIRST_DECODER + '.prototxt'
+ MODEL_PATH_T2S_STAGE_DECODER = WEIGHT_PATH_T2S_STAGE_DECODER + '.prototxt'
+ MODEL_PATH_VITS = WEIGHT_PATH_VITS + '.prototxt'
+
+ # ======================
+ # Mode
+ # ======================
+
+ if not args.onnx:
+     import ailia
+     version = ailia.get_version().split(".")
+     AILIA_VERSION_MAJOR = int(version[0])
+     AILIA_VERSION_MINOR = int(version[1])
+     AILIA_VERSION_REVISION = int(version[2])
+     # copying blob data between inferences requires ailia 1.2.15 or later
+     COPY_BLOB_DATA = not (
+         AILIA_VERSION_MAJOR <= 1
+         and AILIA_VERSION_MINOR <= 2
+         and AILIA_VERSION_REVISION < 15
+     )
+
+ # ======================
+ # Logic
+ # ======================
+
+ class T2SModel():
+     def __init__(self, sess_encoder, sess_fsdec, sess_sdec):
+         self.hz = 50
+         self.max_sec = 54
+         self.top_k = 5
+         self.early_stop_num = np.array([self.hz * self.max_sec])
+         self.sess_encoder = sess_encoder
+         self.sess_fsdec = sess_fsdec
+         self.sess_sdec = sess_sdec
+
+     def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
+         early_stop_num = self.early_stop_num
+
+         top_k = np.array([5], dtype=np.int64)
+         top_p = np.array([1.0], dtype=np.float32)
+         temperature = np.array([1.0], dtype=np.float32)
+         repetition_penalty = np.array([1.35], dtype=np.float32)
+
+         EOS = 1024
+
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             x, prompts = self.sess_encoder.run(None, {"ref_seq": ref_seq, "text_seq": text_seq, "ref_bert": ref_bert, "text_bert": text_bert, "ssl_content": ssl_content})
+         else:
+             x, prompts = self.sess_encoder.run({"ref_seq": ref_seq, "text_seq": text_seq, "ref_bert": ref_bert, "text_bert": text_bert, "ssl_content": ssl_content})
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tencoder processing time {} ms".format(end - start))
+
+         prefix_len = prompts.shape[1]
+
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             y, k, v, y_emb, x_example = self.sess_fsdec.run(None, {"x": x, "prompts": prompts, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+         else:
+             y, k, v, y_emb, x_example = self.sess_fsdec.run({"x": x, "prompts": prompts, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tfsdec processing time {} ms".format(end - start))
+
+         stop = False
+         for idx in range(1, 1500):
+             if args.benchmark:
+                 start = int(round(time.time() * 1000))
+             if args.onnx:
+                 y, k, v, y_emb, logits, samples = self.sess_sdec.run(None, {"iy": y, "ik": k, "iv": v, "iy_emb": y_emb, "ix_example": x_example, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+             else:
+                 if idx == 1:
+                     y, k, v, y_emb, logits, samples = self.sess_sdec.run({"iy": y, "ik": k, "iv": v, "iy_emb": y_emb, "ix_example": x_example, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+                     kv_base_shape = k.shape
+                 else:
+                     input_blob_idx = self.sess_sdec.get_input_blob_list()
+                     output_blob_idx = self.sess_sdec.get_output_blob_list()
+                     self.sess_sdec.set_input_blob_data(y, 0)
+                     if COPY_BLOB_DATA:
+                         # grow the kv cache by one step and reuse the previous
+                         # outputs directly as the next inputs
+                         kv_shape = (kv_base_shape[0], kv_base_shape[1] + idx - 2, kv_base_shape[2], kv_base_shape[3])
+                         self.sess_sdec.set_input_blob_shape(kv_shape, 1)
+                         self.sess_sdec.set_input_blob_shape(kv_shape, 2)
+                         self.sess_sdec.copy_blob_data(input_blob_idx[1], output_blob_idx[1], self.sess_sdec)
+                         self.sess_sdec.copy_blob_data(input_blob_idx[2], output_blob_idx[2], self.sess_sdec)
+                     else:
+                         self.sess_sdec.set_input_blob_data(k, 1)
+                         self.sess_sdec.set_input_blob_data(v, 2)
+                     self.sess_sdec.set_input_blob_data(y_emb, 3)
+                     self.sess_sdec.set_input_blob_data(x_example, 4)
+                     self.sess_sdec.set_input_blob_data(top_k, 5)
+                     self.sess_sdec.set_input_blob_data(top_p, 6)
+                     self.sess_sdec.set_input_blob_data(temperature, 7)
+                     self.sess_sdec.set_input_blob_data(repetition_penalty, 8)
+                     self.sess_sdec.update()
+                     y = self.sess_sdec.get_blob_data(output_blob_idx[0])
+                     if not COPY_BLOB_DATA:
+                         k = self.sess_sdec.get_blob_data(output_blob_idx[1])
+                         v = self.sess_sdec.get_blob_data(output_blob_idx[2])
+                     y_emb = self.sess_sdec.get_blob_data(output_blob_idx[3])
+                     logits = self.sess_sdec.get_blob_data(output_blob_idx[4])
+                     samples = self.sess_sdec.get_blob_data(output_blob_idx[5])
+
+             if args.benchmark:
+                 end = int(round(time.time() * 1000))
+                 logger.info("\tsdec processing time {} ms".format(end - start))
+             if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+                 stop = True
+             if np.argmax(logits, axis=-1)[0] == EOS or samples[0, 0] == EOS:
+                 stop = True
+             if stop:
+                 break
+         y[0, -1] = 0
+
+         return y[np.newaxis, :, -idx:-1]
+
+
+ class GptSoVits():
+     def __init__(self, t2s, sess):
+         self.t2s = t2s
+         self.sess = sess
+
+     def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content):
+         pred_semantic = self.t2s.forward(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             audio1 = self.sess.run(None, {
+                 "text_seq": text_seq,
+                 "pred_semantic": pred_semantic,
+                 "ref_audio": ref_audio
+             })
+         else:
+             audio1 = self.sess.run({
+                 "text_seq": text_seq,
+                 "pred_semantic": pred_semantic,
+                 "ref_audio": ref_audio
+             })
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tvits processing time {} ms".format(end - start))
+         return audio1[0]
+
+
+ class SSLModel():
+     def __init__(self, sess):
+         self.sess = sess
+
+     def forward(self, ref_audio_16k):
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             last_hidden_state = self.sess.run(None, {
+                 "ref_audio_16k": ref_audio_16k
+             })
+         else:
+             last_hidden_state = self.sess.run({
+                 "ref_audio_16k": ref_audio_16k
+             })
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tssl processing time {} ms".format(end - start))
+         return last_hidden_state[0]
+
+
+ def generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits):
+     gpt = T2SModel(t2s_encoder, t2s_first_decoder, t2s_stage_decoder)
+     gpt_sovits = GptSoVits(gpt, vits)
+     ssl = SSLModel(ssl)
+
+     input_audio = args.ref_audio
+
+     if args.ailia_voice:
+         import ailia_voice
+         voice = ailia_voice.G2P()
+         voice.initialize_model(model_path="./models/")
+     else:
+         import text.japanese as japanese
+         import text.english as english
+
+     if args.ref_language == "ja":
+         if args.ailia_voice:
+             ref_phones = voice.g2p(args.ref_text, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_JA).split(" ")[:-1]
+         else:
+             ref_phones = japanese.g2p(args.ref_text)
+     else:
+         if args.ailia_voice:
+             ref_phones = voice.g2p(args.ref_text, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_EN).split(" ")[:-1]
+         else:
+             ref_phones = english.g2p(args.ref_text)
+     ref_seq = np.array([cleaned_text_to_sequence(ref_phones)], dtype=np.int64)
+
+     if args.text_language == "ja":
+         if args.ailia_voice:
+             text_phones = voice.g2p(args.input, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_JA).split(" ")[:-1]
+         else:
+             text_phones = japanese.g2p(args.input)
+     else:
+         if args.ailia_voice:
+             text_phones = voice.g2p(args.input, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_EN).split(" ")[:-1]
+         else:
+             text_phones = english.g2p(args.input)
+     text_seq = np.array([cleaned_text_to_sequence(text_phones)], dtype=np.int64)
+
+     # empty for ja or en
+     ref_bert = np.zeros((ref_seq.shape[1], 1024), dtype=np.float32)
+     text_bert = np.zeros((text_seq.shape[1], 1024), dtype=np.float32)
+
+     vits_hps_data_sampling_rate = 32000
+
+     zero_wav = np.zeros(
+         int(vits_hps_data_sampling_rate * 0.3),
+         dtype=np.float32,
+     )
+     wav16k, sr = librosa.load(input_audio, sr=16000)
+     wav16k = np.concatenate([wav16k, zero_wav], axis=0)
+     wav16k = wav16k[np.newaxis, :]
+     ref_audio_16k = wav16k  # only the hubert input is padded
+
+     wav32k, sr = librosa.load(input_audio, sr=vits_hps_data_sampling_rate)
+     wav32k = wav32k[np.newaxis, :]
+
+     ssl_content = ssl.forward(ref_audio_16k)
+
+     a = gpt_sovits.forward(ref_seq, text_seq, ref_bert, text_bert, wav32k, ssl_content)
+
+     savepath = args.savepath
+     logger.info(f'saved at : {savepath}')
+
+     soundfile.write(savepath, a, vits_hps_data_sampling_rate)
+
+     logger.info('Script finished successfully.')
+
+
+ def main():
+     # model files check and download
+     check_and_download_models(WEIGHT_PATH_SSL, MODEL_PATH_SSL, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_T2S_ENCODER, MODEL_PATH_T2S_ENCODER, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_T2S_FIRST_DECODER, MODEL_PATH_T2S_FIRST_DECODER, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_T2S_STAGE_DECODER, MODEL_PATH_T2S_STAGE_DECODER, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_VITS, MODEL_PATH_VITS, REMOTE_PATH)
+
+     if args.onnx:
+         import onnxruntime
+         providers = ["CPUExecutionProvider"]
+         # providers = ["CUDAExecutionProvider"]
+         ssl = onnxruntime.InferenceSession(WEIGHT_PATH_SSL, providers=providers)
+         t2s_encoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_ENCODER, providers=providers)
+         t2s_first_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_FIRST_DECODER, providers=providers)
+         t2s_stage_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_STAGE_DECODER, providers=providers)
+         vits = onnxruntime.InferenceSession(WEIGHT_PATH_VITS, providers=providers)
+     else:
+         import ailia
+         memory_mode = ailia.get_memory_mode(reduce_constant=True, ignore_input_with_initializer=True, reduce_interstage=False, reuse_interstage=True)
+         ssl = ailia.Net(weight=WEIGHT_PATH_SSL, stream=MODEL_PATH_SSL, memory_mode=memory_mode, env_id=args.env_id)
+         t2s_encoder = ailia.Net(weight=WEIGHT_PATH_T2S_ENCODER, stream=MODEL_PATH_T2S_ENCODER, memory_mode=memory_mode, env_id=args.env_id)
+         t2s_first_decoder = ailia.Net(weight=WEIGHT_PATH_T2S_FIRST_DECODER, stream=MODEL_PATH_T2S_FIRST_DECODER, memory_mode=memory_mode, env_id=args.env_id)
+         t2s_stage_decoder = ailia.Net(weight=WEIGHT_PATH_T2S_STAGE_DECODER, stream=MODEL_PATH_T2S_STAGE_DECODER, memory_mode=memory_mode, env_id=args.env_id)
+         vits = ailia.Net(weight=WEIGHT_PATH_VITS, stream=MODEL_PATH_VITS, memory_mode=memory_mode, env_id=args.env_id)
+         if args.profile:
+             ssl.set_profile_mode(True)
+             t2s_encoder.set_profile_mode(True)
+             t2s_first_decoder.set_profile_mode(True)
+             t2s_stage_decoder.set_profile_mode(True)
+             vits.set_profile_mode(True)
+         pf = platform.system()
+         if pf == "Darwin":
+             if args.env_id == 2:
+                 logger.info(
+                     "This model is currently not optimized for macOS GPU. Please try the -e 1 option to improve inference speed."
+                 )
+
+     if args.benchmark:
+         start = int(round(time.time() * 1000))
+
+     generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits)
+
+     if args.benchmark:
+         end = int(round(time.time() * 1000))
+         logger.info("\ttotal processing time {} ms".format(end - start))
+
+     if args.profile:
+         print("ssl : ")
+         print(ssl.get_summary())
+         print("t2s_encoder : ")
+         print(t2s_encoder.get_summary())
+         print("t2s_first_decoder : ")
+         print(t2s_first_decoder.get_summary())
+         print("t2s_stage_decoder : ")
+         print(t2s_stage_decoder.get_summary())
+         print("vits : ")
+         print(vits.get_summary())
+
+
+ if __name__ == '__main__':
+     main()
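
Note that the sampling hyperparameters above (top_k=5, top_p=1.0, temperature=1.0, repetition_penalty=1.35) are passed into the decoder graphs as tensors; the sampling logic itself lives inside t2s_fsdec/t2s_sdec. As a rough, hypothetical NumPy sketch of what such parameters conventionally do in GPT-style token sampling, not the graph's actual implementation:

```
import numpy as np

def sample_next_token(logits, history, top_k=5, top_p=1.0,
                      temperature=1.0, repetition_penalty=1.35):
    """Illustrative top-k/top-p sampling with a repetition penalty."""
    logits = logits.astype(np.float64).copy()
    # penalize tokens that were already generated
    for t in set(history):
        logits[t] = logits[t] / repetition_penalty if logits[t] > 0 else logits[t] * repetition_penalty
    logits /= temperature
    # keep only the top_k highest-scoring tokens
    kth = np.sort(logits)[-top_k]
    logits[logits < kth] = -np.inf
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # nucleus (top-p) filtering over the surviving tokens
    order = np.argsort(-probs)
    cum = np.cumsum(probs[order])
    cut = order[cum > top_p][1:] if top_p < 1.0 else []
    probs[cut] = 0.0
    probs /= probs.sum()
    return np.random.choice(len(probs), p=probs)

# e.g. next_token = sample_next_token(np.random.randn(1025), history=[3, 3, 7])
```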
models/ailia-models/GPT-SoVITS/code/reference_audio_captured_by_ax.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8af474a35ab4aebaadda5a20d626c44830d5987880e54e10fc645eb73d568743
+ size 226298
models/ailia-models/GPT-SoVITS/code/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ langid
+ unidecode
+ pyopenjtalk-prebuilt
+ SoundFile
+ librosa
+ g2p_en
models/ailia-models/GPT-SoVITS/code/text/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from text.symbols import *
+
+
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+ def cleaned_text_to_sequence(cleaned_text):
+     '''Converts a sequence of phoneme symbols to the corresponding symbol IDs.
+     Args:
+         cleaned_text: sequence of phoneme symbols to convert
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     return phones
+
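
A quick usage sketch for `cleaned_text_to_sequence` together with the g2p modules added below; the concrete IDs depend on the sorted symbol table in `symbols.py`, so they are printed rather than asserted:

```
import numpy as np
from text import cleaned_text_to_sequence
import text.japanese as japanese

phones = japanese.g2p("こんにちは。")  # e.g. ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '.']
ref_seq = np.array([cleaned_text_to_sequence(phones)], dtype=np.int64)
print(phones, ref_seq.shape)  # shape is (1, len(phones)), as fed to t2s_encoder
```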
models/ailia-models/GPT-SoVITS/code/text/cmudict.rep ADDED
The diff for this file is too large to render.
models/ailia-models/GPT-SoVITS/code/text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
+ size 6212655
models/ailia-models/GPT-SoVITS/code/text/english.py ADDED
@@ -0,0 +1,107 @@
+ import pickle
+ import os
+ import re
+ from g2p_en import G2p
+
+ from text import symbols
+
+ current_file_path = os.path.dirname(__file__)
+ CMU_DICT_PATH = os.path.join(current_file_path, 'cmudict.rep')
+ CACHE_PATH = os.path.join(current_file_path, 'cmudict_cache.pickle')
+ _g2p = G2p()
+
+ arpa = {'AH0', 'S', 'AH1', 'EY2', 'AE2', 'EH0', 'OW2', 'UH0', 'NG', 'B', 'G', 'AY0', 'M', 'AA0', 'F', 'AO0', 'ER2', 'UH1', 'IY1', 'AH2', 'DH', 'IY0', 'EY1', 'IH0', 'K', 'N', 'W', 'IY2', 'T', 'AA1', 'ER1', 'EH2', 'OY0', 'UH2', 'UW1', 'Z', 'AW2', 'AW1', 'V', 'UW2', 'AA2', 'ER', 'AW0', 'UW0', 'R', 'OW1', 'EH1', 'ZH', 'AE0', 'IH2', 'IH', 'Y', 'JH', 'P', 'AY1', 'EY0', 'OY2', 'TH', 'HH', 'D', 'ER0', 'CH', 'AO1', 'AE1', 'AO2', 'OY1', 'AY2', 'IH1', 'OW0', 'L', 'SH'}
+
+
+ def replace_phs(phs):
+     rep_map = {
+         ';': ',',
+         ':': ',',
+         '\'': '-',
+         '"': '-'
+     }
+     phs_new = []
+     for ph in phs:
+         if ph in symbols:
+             phs_new.append(ph)
+         elif ph in rep_map.keys():
+             phs_new.append(rep_map[ph])
+         else:
+             print('ph not in symbols: ', ph)
+     return phs_new
+
+
+ def read_dict():
+     g2p_dict = {}
+     start_line = 49
+     with open(CMU_DICT_PATH) as f:
+         line = f.readline()
+         line_index = 1
+         while line:
+             if line_index >= start_line:
+                 line = line.strip()
+                 word_split = line.split(' ')
+                 word = word_split[0]
+
+                 syllable_split = word_split[1].split(' - ')
+                 g2p_dict[word] = []
+                 for syllable in syllable_split:
+                     phone_split = syllable.split(' ')
+                     g2p_dict[word].append(phone_split)
+
+             line_index = line_index + 1
+             line = f.readline()
+
+     return g2p_dict
+
+
+ def cache_dict(g2p_dict, file_path):
+     with open(file_path, 'wb') as pickle_file:
+         pickle.dump(g2p_dict, pickle_file)
+
+
+ def get_dict():
+     if os.path.exists(CACHE_PATH):
+         with open(CACHE_PATH, 'rb') as pickle_file:
+             g2p_dict = pickle.load(pickle_file)
+     else:
+         g2p_dict = read_dict()
+         cache_dict(g2p_dict, CACHE_PATH)
+
+     return g2p_dict
+
+
+ eng_dict = get_dict()
+
+
+ def text_normalize(text):
+     # todo: eng text normalize
+     return text.replace(";", ",")
+
+
+ def g2p(text):
+     phones = []
+     words = re.split(r"([,;.\-\?\!\s+])", text)
+     for w in words:
+         if w.upper() in eng_dict:
+             phns = eng_dict[w.upper()]
+             for ph in phns:
+                 phones += ph
+         else:
+             phone_list = list(filter(lambda p: p != " ", _g2p(w)))
+             for ph in phone_list:
+                 # ARPA and non-ARPA phones are appended alike here;
+                 # replace_phs() drops anything not in symbols
+                 phones.append(ph)
+
+     return replace_phs(phones)
+
+
+ if __name__ == "__main__":
+     # print(get_dict())
+     print(g2p("hello"))
+     print(g2p("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
+     # all_phones = set()
+     # for k, syllables in eng_dict.items():
+     #     for group in syllables:
+     #         for ph in group:
+     #             all_phones.add(ph)
+     # print(all_phones)
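
The branch order in `g2p` above matters: words present in `cmudict.rep` use its curated syllable splits, and everything else falls back to the neural `g2p_en` model. A small check of the two paths (outputs depend on the dictionary contents, so treat them as illustrative):

```
import text.english as english

print(english.g2p("hello"))   # dictionary path: phones straight from cmudict.rep
print(english.g2p("DSPGAN"))  # fallback path: phones predicted by g2p_en
```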
models/ailia-models/GPT-SoVITS/code/text/japanese.py ADDED
@@ -0,0 +1,191 @@
+ # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
+ import re
+
+ import pyopenjtalk
+
+ from text import symbols
+
+ # Regular expression matching Japanese without punctuation marks:
+ _japanese_characters = re.compile(
+     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+ )
+
+ # Regular expression matching non-Japanese characters or punctuation marks:
+ _japanese_marks = re.compile(
+     r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+ )
+
+ # List of (symbol, Japanese) pairs for marks:
+ _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
+
+
+ # List of (consonant, sokuon) pairs:
+ _real_sokuon = [
+     (re.compile("%s" % x[0]), x[1])
+     for x in [
+         (r"Q([↑↓]*[kg])", r"k#\1"),
+         (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
+         (r"Q([↑↓]*[sʃ])", r"s\1"),
+         (r"Q([↑↓]*[pb])", r"p#\1"),
+     ]
+ ]
+
+ # List of (consonant, hatsuon) pairs:
+ _real_hatsuon = [
+     (re.compile("%s" % x[0]), x[1])
+     for x in [
+         (r"N([↑↓]*[pbm])", r"m\1"),
+         (r"N([↑↓]*[ʧʥj])", r"n^\1"),
+         (r"N([↑↓]*[tdn])", r"n\1"),
+         (r"N([↑↓]*[kg])", r"ŋ\1"),
+     ]
+ ]
+
+
+ def post_replace_ph(ph):
+     rep_map = {
+         ":": ",",
+         ";": ",",
+         ",": ",",
+         "。": ".",
+         "!": "!",
+         "?": "?",
+         "\n": ".",
+         "·": ",",
+         "、": ",",
+         "...": "…",
+     }
+     if ph in rep_map.keys():
+         ph = rep_map[ph]
+     if ph in symbols:
+         return ph
+     if ph not in symbols:
+         ph = "UNK"
+     return ph
+
+
+ def symbols_to_japanese(text):
+     for regex, replacement in _symbols_to_japanese:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def preprocess_jap(text, with_prosody=False):
+     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
+     text = symbols_to_japanese(text)
+     sentences = re.split(_japanese_marks, text)
+     marks = re.findall(_japanese_marks, text)
+     text = []
+     for i, sentence in enumerate(sentences):
+         if re.match(_japanese_characters, sentence):
+             if with_prosody:
+                 text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
+             else:
+                 p = pyopenjtalk.g2p(sentence)
+                 text += p.split(" ")
+
+         if i < len(marks):
+             if marks[i] == " ":  # prevent an unexpected UNK
+                 continue
+             text += [marks[i].replace(" ", "")]
+     return text
+
+
+ def text_normalize(text):
+     # todo: jap text normalize
+     return text
+
+
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
+     """Extract phoneme + prosody symbol sequence from input full-context labels.
+
+     The algorithm is based on `Prosodic features control by symbols as input of
+     sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
+
+     Args:
+         text (str): Input text.
+         drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
+
+     Returns:
+         List[str]: List of phoneme + prosody symbols.
+
+     Examples:
+         >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+         >>> pyopenjtalk_g2p_prosody("こんにちは。")
+         ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+
+     .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+         modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+
+     """
+     labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+     N = len(labels)
+
+     phones = []
+     for n in range(N):
+         lab_curr = labels[n]
+
+         # current phoneme
+         p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+         # treat unvoiced vowels as normal vowels
+         if drop_unvoiced_vowels and p3 in "AEIOU":
+             p3 = p3.lower()
+
+         # deal with sil at the beginning and the end of text
+         if p3 == "sil":
+             assert n == 0 or n == N - 1
+             if n == 0:
+                 phones.append("^")
+             elif n == N - 1:
+                 # check question form or not
+                 e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                 if e3 == 0:
+                     phones.append("$")
+                 elif e3 == 1:
+                     phones.append("?")
+             continue
+         elif p3 == "pau":
+             phones.append("_")
+             continue
+         else:
+             phones.append(p3)
+
+         # accent type and position info (forward or backward)
+         a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+         a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+         a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+
+         # number of mora in accent phrase
+         f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+
+         a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+         # accent phrase border
+         if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+             phones.append("#")
+         # pitch falling
+         elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+             phones.append("]")
+         # pitch rising
+         elif a2 == 1 and a2_next == 2:
+             phones.append("[")
+
+     return phones
+
+
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+ def _numeric_feature_by_regex(regex, s):
+     match = re.search(regex, s)
+     if match is None:
+         return -50
+     return int(match.group(1))
+
+
+ def g2p(norm_text, with_prosody=False):
+     phones = preprocess_jap(norm_text, with_prosody)
+     phones = [post_replace_ph(i) for i in phones]
+     # todo: implement tones and word2ph
+     return phones
+
+
+ if __name__ == "__main__":
+     phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
+     print(phones)
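
The prosody markers produced by `pyopenjtalk_g2p_prosody` follow the espnet convention also referenced in `symbols.py`: "^" marks the start of the utterance, "$" and "?" its declarative or interrogative end, "_" a pause, "#" an accent-phrase border, and "[" / "]" a pitch rise or fall. A short sketch (exact output can vary with the pyopenjtalk version):

```
import text.japanese as japanese

# phoneme sequence as consumed by gpt-sovits.py
print(japanese.g2p("こんにちは。"))
# raw phoneme + prosody labels, per the docstring example:
# ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
print(japanese.pyopenjtalk_g2p_prosody("こんにちは。"))
```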
models/ailia-models/GPT-SoVITS/code/text/symbols.py ADDED
@@ -0,0 +1,401 @@
+ # punctuation = ['!', '?', '…', ",", ".", "@"]  # @ is an SP pause
+ punctuation = ["!", "?", "…", ",", "."]  # @ is an SP pause
+ punctuation.append("-")
+ pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
+ # pu_symbols = punctuation + ["SP", 'SP2', 'SP3', 'SP4', "UNK"]
+ pad = "_"
+
+ c = [
+     "AA", "EE", "OO",
+     "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m", "n",
+     "p", "q", "r", "s", "sh", "t", "w", "x", "y", "z", "zh",
+ ]
+ v = [
+     "E1", "En1", "a1", "ai1", "an1", "ang1", "ao1", "e1", "ei1", "en1",
+     "eng1", "er1", "i1", "i01", "ia1", "ian1", "iang1", "iao1", "ie1", "in1",
+     "ing1", "iong1", "ir1", "iu1", "o1", "ong1", "ou1", "u1", "ua1", "uai1",
+     "uan1", "uang1", "ui1", "un1", "uo1", "v1", "van1", "ve1", "vn1",
+     "E2", "En2", "a2", "ai2", "an2", "ang2", "ao2", "e2", "ei2", "en2",
+     "eng2", "er2", "i2", "i02", "ia2", "ian2", "iang2", "iao2", "ie2", "in2",
+     "ing2", "iong2", "ir2", "iu2", "o2", "ong2", "ou2", "u2", "ua2", "uai2",
+     "uan2", "uang2", "ui2", "un2", "uo2", "v2", "van2", "ve2", "vn2",
+     "E3", "En3", "a3", "ai3", "an3", "ang3", "ao3", "e3", "ei3", "en3",
+     "eng3", "er3", "i3", "i03", "ia3", "ian3", "iang3", "iao3", "ie3", "in3",
+     "ing3", "iong3", "ir3", "iu3", "o3", "ong3", "ou3", "u3", "ua3", "uai3",
+     "uan3", "uang3", "ui3", "un3", "uo3", "v3", "van3", "ve3", "vn3",
+     "E4", "En4", "a4", "ai4", "an4", "ang4", "ao4", "e4", "ei4", "en4",
+     "eng4", "er4", "i4", "i04", "ia4", "ian4", "iang4", "iao4", "ie4", "in4",
+     "ing4", "iong4", "ir4", "iu4", "o4", "ong4", "ou4", "u4", "ua4", "uai4",
+     "uan4", "uang4", "ui4", "un4", "uo4", "v4", "van4", "ve4", "vn4",
+     "E5", "En5", "a5", "ai5", "an5", "ang5", "ao5", "e5", "ei5", "en5",
+     "eng5", "er5", "i5", "i05", "ia5", "ian5", "iang5", "iao5", "ie5", "in5",
+     "ing5", "iong5", "ir5", "iu5", "o5", "ong5", "ou5", "u5", "ua5", "uai5",
+     "uan5", "uang5", "ui5", "un5", "uo5", "v5", "van5", "ve5", "vn5",
+ ]
+
+ v_without_tone = [
+     "E", "En", "a", "ai", "an", "ang", "ao", "e", "ei", "en",
+     "eng", "er", "i", "i0", "ia", "ian", "iang", "iao", "ie", "in",
+     "ing", "iong", "ir", "iu", "o", "ong", "ou", "u", "ua", "uai",
+     "uan", "uang", "ui", "un", "uo", "v", "van", "ve", "vn",
+ ]
+
+ # japanese
+ ja_symbols = [
+     "I", "N", "U",
+     "a", "b", "by", "ch", "cl", "d", "dy", "e", "f", "g", "gy",
+     "h", "hy", "i", "j", "k", "ky", "m", "my", "n", "ny", "o",
+     "p", "py", "r", "ry", "s", "sh", "t", "ts", "u", "v", "w",
+     "y", "z",
+     # "[",  # rising pitch
+     # "]",  # falling pitch
+     # "$",  # end marker
+     # "^",  # start marker
+ ]
+
+ arpa = {
+     "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B",
+     "G", "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2",
+     "DH", "IY0", "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1",
+     "ER1", "EH2", "OY0", "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2",
+     "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH", "AE0", "IH2",
+     "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
+     "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L",
+     "SH",
+ }
+
+ symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
+ symbols = sorted(set(symbols))
+ if __name__ == "__main__":
+     print(len(symbols))
models/ailia-models/GPT-SoVITS/source.txt ADDED
@@ -0,0 +1,26 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/gpt-sovits
+
+ [normal]
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/cnhubert.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/cnhubert.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_encoder.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_encoder.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_fsdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_fsdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/vits.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/vits.onnx.prototxt
+
+ [optimized]
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt2.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt2.onnx.prototxt
models/ailia-models/GPT-SoVITS/t2s_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5aa4a075812b0b8f2ff97d1c56aad66d31dd85b2c9dccc1d39eb0f6a550195e
+ size 11055096
models/ailia-models/GPT-SoVITS/t2s_encoder.onnx.prototxt ADDED
@@ -0,0 +1,2816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ir_version: 8
2
+ producer_name: "pytorch"
3
+ producer_version: "2.1.2"
4
+ model_version: 0
5
+ graph {
6
+ name: "main_graph"
7
+ node {
8
+ output: "onnx::ReduceSum_785"
9
+ name: "Constant_0"
10
+ op_type: "Constant"
11
+ attribute {
12
+ name: "value"
13
+ t {
14
+ dims: 1
15
+ data_type: 7
16
+ data_location: 0
17
+ }
18
+ type: TENSOR
19
+ }
20
+ }
21
+ node {
22
+ output: "onnx::ReduceSum_786"
23
+ name: "Constant_1"
24
+ op_type: "Constant"
25
+ attribute {
26
+ name: "value"
27
+ t {
28
+ dims: 1
29
+ data_type: 7
30
+ data_location: 0
31
+ }
32
+ type: TENSOR
33
+ }
34
+ }
35
+ node {
36
+ input: "ssl_content"
37
+ input: "vits.ssl_proj.weight"
38
+ input: "vits.ssl_proj.bias"
39
+ output: "/ssl_proj/Conv_output_0"
40
+ name: "/ssl_proj/Conv"
41
+ op_type: "Conv"
42
+ attribute {
43
+ name: "dilations"
44
+ ints: 1
45
+ type: INTS
46
+ }
47
+ attribute {
48
+ name: "group"
49
+ i: 1
50
+ type: INT
51
+ }
52
+ attribute {
53
+ name: "kernel_shape"
54
+ ints: 2
55
+ type: INTS
56
+ }
57
+ attribute {
58
+ name: "pads"
59
+ ints: 0
60
+ ints: 0
61
+ type: INTS
62
+ }
63
+ attribute {
64
+ name: "strides"
65
+ ints: 2
66
+ type: INTS
67
+ }
68
+ }
69
+ node {
70
+ output: "/quantizer/vq/layers.0/Constant_output_0"
71
+ name: "/quantizer/vq/layers.0/Constant"
72
+ op_type: "Constant"
73
+ attribute {
74
+ name: "value"
75
+ t {
76
+ data_type: 7
77
+ data_location: 0
78
+ }
79
+ type: TENSOR
80
+ }
81
+ }
82
+ node {
83
+ input: "/ssl_proj/Conv_output_0"
84
+ output: "/quantizer/vq/layers.0/Shape_output_0"
85
+ name: "/quantizer/vq/layers.0/Shape"
86
+ op_type: "Shape"
87
+ }
88
+ node {
89
+ output: "/quantizer/vq/layers.0/Constant_1_output_0"
90
+ name: "/quantizer/vq/layers.0/Constant_1"
91
+ op_type: "Constant"
92
+ attribute {
93
+ name: "value"
94
+ t {
95
+ data_type: 7
96
+ data_location: 0
97
+ }
98
+ type: TENSOR
99
+ }
100
+ }
101
+ node {
102
+ input: "/quantizer/vq/layers.0/Shape_output_0"
103
+ input: "/quantizer/vq/layers.0/Constant_1_output_0"
104
+ output: "/quantizer/vq/layers.0/Gather_output_0"
105
+ name: "/quantizer/vq/layers.0/Gather"
106
+ op_type: "Gather"
107
+ attribute {
108
+ name: "axis"
109
+ i: 0
110
+ type: INT
111
+ }
112
+ }
113
+ node {
114
+ input: "/ssl_proj/Conv_output_0"
115
+ output: "/quantizer/vq/layers.0/Shape_1_output_0"
116
+ name: "/quantizer/vq/layers.0/Shape_1"
117
+ op_type: "Shape"
118
+ }
119
+ node {
120
+ output: "/quantizer/vq/layers.0/Constant_2_output_0"
121
+ name: "/quantizer/vq/layers.0/Constant_2"
122
+ op_type: "Constant"
123
+ attribute {
124
+ name: "value"
125
+ t {
126
+ data_type: 7
127
+ data_location: 0
128
+ }
129
+ type: TENSOR
130
+ }
131
+ }
132
+ node {
133
+ input: "/quantizer/vq/layers.0/Shape_1_output_0"
134
+ input: "/quantizer/vq/layers.0/Constant_2_output_0"
135
+ output: "/quantizer/vq/layers.0/Gather_1_output_0"
136
+ name: "/quantizer/vq/layers.0/Gather_1"
137
+ op_type: "Gather"
138
+ attribute {
139
+ name: "axis"
140
+ i: 0
141
+ type: INT
142
+ }
143
+ }
144
+ node {
145
+ input: "/ssl_proj/Conv_output_0"
146
+ output: "/quantizer/vq/layers.0/Shape_2_output_0"
147
+ name: "/quantizer/vq/layers.0/Shape_2"
148
+ op_type: "Shape"
149
+ }
150
+ node {
151
+ output: "/quantizer/vq/layers.0/Constant_3_output_0"
152
+ name: "/quantizer/vq/layers.0/Constant_3"
153
+ op_type: "Constant"
154
+ attribute {
155
+ name: "value"
156
+ t {
157
+ data_type: 7
158
+ data_location: 0
159
+ }
160
+ type: TENSOR
161
+ }
162
+ }
163
+ node {
164
+ input: "/quantizer/vq/layers.0/Shape_2_output_0"
165
+ input: "/quantizer/vq/layers.0/Constant_3_output_0"
166
+ output: "/quantizer/vq/layers.0/Gather_2_output_0"
167
+ name: "/quantizer/vq/layers.0/Gather_2"
168
+ op_type: "Gather"
169
+ attribute {
170
+ name: "axis"
171
+ i: 0
172
+ type: INT
173
+ }
174
+ }
175
+ node {
176
+ output: "/quantizer/vq/layers.0/Constant_4_output_0"
177
+ name: "/quantizer/vq/layers.0/Constant_4"
178
+ op_type: "Constant"
179
+ attribute {
180
+ name: "value"
181
+ t {
182
+ data_type: 7
183
+ data_location: 0
184
+ }
185
+ type: TENSOR
186
+ }
187
+ }
188
+ node {
189
+ input: "/quantizer/vq/layers.0/Gather_output_0"
190
+ input: "/quantizer/vq/layers.0/Constant_4_output_0"
191
+ output: "/quantizer/vq/layers.0/Div_output_0"
192
+ name: "/quantizer/vq/layers.0/Div"
193
+ op_type: "Div"
194
+ }
195
+ node {
196
+ input: "/quantizer/vq/layers.0/Div_output_0"
197
+ output: "/quantizer/vq/layers.0/Cast_output_0"
198
+ name: "/quantizer/vq/layers.0/Cast"
199
+ op_type: "Cast"
200
+ attribute {
201
+ name: "to"
202
+ i: 7
203
+ type: INT
204
+ }
205
+ }
206
+ node {
207
+ input: "/quantizer/vq/layers.0/Cast_output_0"
208
+ output: "/quantizer/vq/layers.0/Cast_1_output_0"
209
+ name: "/quantizer/vq/layers.0/Cast_1"
210
+ op_type: "Cast"
211
+ attribute {
212
+ name: "to"
213
+ i: 7
214
+ type: INT
215
+ }
216
+ }
217
+ node {
218
+ output: "/quantizer/vq/layers.0/Constant_5_output_0"
219
+ name: "/quantizer/vq/layers.0/Constant_5"
220
+ op_type: "Constant"
221
+ attribute {
222
+ name: "value"
223
+ t {
224
+ data_type: 7
225
+ data_location: 0
226
+ }
227
+ type: TENSOR
228
+ }
229
+ }
230
+ node {
231
+ input: "/quantizer/vq/layers.0/Gather_1_output_0"
232
+ input: "/quantizer/vq/layers.0/Constant_5_output_0"
233
+ output: "/quantizer/vq/layers.0/Div_1_output_0"
234
+ name: "/quantizer/vq/layers.0/Div_1"
235
+ op_type: "Div"
236
+ }
237
+ node {
238
+ input: "/quantizer/vq/layers.0/Div_1_output_0"
239
+ output: "/quantizer/vq/layers.0/Cast_2_output_0"
240
+ name: "/quantizer/vq/layers.0/Cast_2"
241
+ op_type: "Cast"
242
+ attribute {
243
+ name: "to"
244
+ i: 7
245
+ type: INT
246
+ }
247
+ }
248
+ node {
249
+ input: "/quantizer/vq/layers.0/Cast_2_output_0"
250
+ output: "/quantizer/vq/layers.0/Cast_3_output_0"
251
+ name: "/quantizer/vq/layers.0/Cast_3"
252
+ op_type: "Cast"
253
+ attribute {
254
+ name: "to"
255
+ i: 7
256
+ type: INT
257
+ }
258
+ }
259
+ node {
260
+ output: "/quantizer/vq/layers.0/Constant_6_output_0"
261
+ name: "/quantizer/vq/layers.0/Constant_6"
262
+ op_type: "Constant"
263
+ attribute {
264
+ name: "value"
265
+ t {
266
+ data_type: 7
267
+ data_location: 0
268
+ }
269
+ type: TENSOR
270
+ }
271
+ }
272
+ node {
273
+ input: "/quantizer/vq/layers.0/Gather_2_output_0"
274
+ input: "/quantizer/vq/layers.0/Constant_6_output_0"
275
+ output: "/quantizer/vq/layers.0/Div_2_output_0"
276
+ name: "/quantizer/vq/layers.0/Div_2"
277
+ op_type: "Div"
278
+ }
279
+ node {
280
+ input: "/quantizer/vq/layers.0/Div_2_output_0"
281
+ output: "/quantizer/vq/layers.0/Cast_4_output_0"
282
+ name: "/quantizer/vq/layers.0/Cast_4"
283
+ op_type: "Cast"
284
+ attribute {
285
+ name: "to"
286
+ i: 7
287
+ type: INT
288
+ }
289
+ }
290
+ node {
291
+ input: "/quantizer/vq/layers.0/Cast_4_output_0"
292
+ output: "/quantizer/vq/layers.0/Cast_5_output_0"
293
+ name: "/quantizer/vq/layers.0/Cast_5"
294
+ op_type: "Cast"
295
+ attribute {
296
+ name: "to"
297
+ i: 7
298
+ type: INT
299
+ }
300
+ }
301
+ node {
302
+ output: "onnx::Unsqueeze_812"
303
+ name: "Constant_25"
304
+ op_type: "Constant"
305
+ attribute {
306
+ name: "value"
307
+ t {
308
+ dims: 1
309
+ data_type: 7
310
+ data_location: 0
311
+ }
312
+ type: TENSOR
313
+ }
314
+ }
315
+ node {
316
+ input: "/quantizer/vq/layers.0/Cast_1_output_0"
317
+ input: "onnx::Unsqueeze_812"
318
+ output: "/quantizer/vq/layers.0/Unsqueeze_output_0"
319
+ name: "/quantizer/vq/layers.0/Unsqueeze"
320
+ op_type: "Unsqueeze"
321
+ }
322
+ node {
323
+ output: "onnx::Unsqueeze_814"
324
+ name: "Constant_27"
325
+ op_type: "Constant"
326
+ attribute {
327
+ name: "value"
328
+ t {
329
+ dims: 1
330
+ data_type: 7
331
+ data_location: 0
332
+ }
333
+ type: TENSOR
334
+ }
335
+ }
336
+ node {
337
+ input: "/quantizer/vq/layers.0/Cast_3_output_0"
338
+ input: "onnx::Unsqueeze_814"
339
+ output: "/quantizer/vq/layers.0/Unsqueeze_1_output_0"
340
+ name: "/quantizer/vq/layers.0/Unsqueeze_1"
341
+ op_type: "Unsqueeze"
342
+ }
343
+ node {
344
+ output: "onnx::Unsqueeze_816"
345
+ name: "Constant_29"
346
+ op_type: "Constant"
347
+ attribute {
348
+ name: "value"
349
+ t {
350
+ dims: 1
351
+ data_type: 7
352
+ data_location: 0
353
+ }
354
+ type: TENSOR
355
+ }
356
+ }
357
+ node {
358
+ input: "/quantizer/vq/layers.0/Cast_5_output_0"
359
+ input: "onnx::Unsqueeze_816"
360
+ output: "/quantizer/vq/layers.0/Unsqueeze_2_output_0"
361
+ name: "/quantizer/vq/layers.0/Unsqueeze_2"
362
+ op_type: "Unsqueeze"
363
+ }
364
+ node {
365
+ input: "/quantizer/vq/layers.0/Unsqueeze_output_0"
366
+ input: "/quantizer/vq/layers.0/Unsqueeze_1_output_0"
367
+ input: "/quantizer/vq/layers.0/Unsqueeze_2_output_0"
368
+ output: "/quantizer/vq/layers.0/Concat_output_0"
369
+ name: "/quantizer/vq/layers.0/Concat"
370
+ op_type: "Concat"
371
+ attribute {
372
+ name: "axis"
373
+ i: 0
374
+ type: INT
375
+ }
376
+ }
377
+ node {
378
+ input: "/ssl_proj/Conv_output_0"
379
+ input: "/quantizer/vq/layers.0/Concat_output_0"
380
+ output: "/quantizer/vq/layers.0/Reshape_output_0"
381
+ name: "/quantizer/vq/layers.0/Reshape"
382
+ op_type: "Reshape"
383
+ attribute {
384
+ name: "allowzero"
385
+ i: 0
386
+ type: INT
387
+ }
388
+ }
389
+ node {
390
+ input: "/quantizer/vq/layers.0/Reshape_output_0"
391
+ output: "/quantizer/vq/layers.0/Transpose_output_0"
392
+ name: "/quantizer/vq/layers.0/Transpose"
393
+ op_type: "Transpose"
394
+ attribute {
395
+ name: "perm"
396
+ ints: 0
397
+ ints: 2
398
+ ints: 1
399
+ type: INTS
400
+ }
401
+ }
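# [editor's note] The Reshape/Transpose pair above rearranges the /ssl_proj/Conv output
# from (B, D, T) to (B, T, D) so each time step becomes one D-dim vector for the
# codebook lookup that follows — roughly x.reshape(b, d, t).transpose(0, 2, 1) in
# NumPy terms (shape names inferred from context; the dump omits the constant values).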
402
+ node {
403
+ output: "onnx::Unsqueeze_821"
404
+ name: "Constant_34"
405
+ op_type: "Constant"
406
+ attribute {
407
+ name: "value"
408
+ t {
409
+ dims: 1
410
+ data_type: 7
411
+ data_location: 0
412
+ }
413
+ type: TENSOR
414
+ }
415
+ }
416
+ node {
417
+ input: "/quantizer/vq/layers.0/Cast_1_output_0"
418
+ input: "onnx::Unsqueeze_821"
419
+ output: "/quantizer/vq/layers.0/Unsqueeze_3_output_0"
420
+ name: "/quantizer/vq/layers.0/Unsqueeze_3"
421
+ op_type: "Unsqueeze"
422
+ }
423
+ node {
424
+ output: "onnx::Unsqueeze_823"
425
+ name: "Constant_36"
426
+ op_type: "Constant"
427
+ attribute {
428
+ name: "value"
429
+ t {
430
+ dims: 1
431
+ data_type: 7
432
+ data_location: 0
433
+ }
434
+ type: TENSOR
435
+ }
436
+ }
437
+ node {
438
+ input: "/quantizer/vq/layers.0/Cast_5_output_0"
439
+ input: "onnx::Unsqueeze_823"
440
+ output: "/quantizer/vq/layers.0/Unsqueeze_4_output_0"
441
+ name: "/quantizer/vq/layers.0/Unsqueeze_4"
442
+ op_type: "Unsqueeze"
443
+ }
444
+ node {
445
+ output: "onnx::Unsqueeze_825"
446
+ name: "Constant_38"
447
+ op_type: "Constant"
448
+ attribute {
449
+ name: "value"
450
+ t {
451
+ dims: 1
452
+ data_type: 7
453
+ data_location: 0
454
+ }
455
+ type: TENSOR
456
+ }
457
+ }
458
+ node {
459
+ input: "/quantizer/vq/layers.0/Cast_3_output_0"
460
+ input: "onnx::Unsqueeze_825"
461
+ output: "/quantizer/vq/layers.0/Unsqueeze_5_output_0"
462
+ name: "/quantizer/vq/layers.0/Unsqueeze_5"
463
+ op_type: "Unsqueeze"
464
+ }
465
+ node {
466
+ input: "/quantizer/vq/layers.0/Unsqueeze_3_output_0"
467
+ input: "/quantizer/vq/layers.0/Unsqueeze_4_output_0"
468
+ input: "/quantizer/vq/layers.0/Unsqueeze_5_output_0"
469
+ output: "/quantizer/vq/layers.0/Concat_1_output_0"
470
+ name: "/quantizer/vq/layers.0/Concat_1"
471
+ op_type: "Concat"
472
+ attribute {
473
+ name: "axis"
474
+ i: 0
475
+ type: INT
476
+ }
477
+ }
478
+ node {
479
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
480
+ input: "/quantizer/vq/layers.0/Concat_1_output_0"
481
+ output: "/quantizer/vq/layers.0/Reshape_1_output_0"
482
+ name: "/quantizer/vq/layers.0/Reshape_1"
483
+ op_type: "Reshape"
484
+ attribute {
485
+ name: "allowzero"
486
+ i: 0
487
+ type: INT
488
+ }
489
+ }
490
+ node {
491
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
492
+ output: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
493
+ name: "/quantizer/vq/layers.0/_codebook/Shape"
494
+ op_type: "Shape"
495
+ }
496
+ node {
497
+ output: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
498
+ name: "/quantizer/vq/layers.0/_codebook/Constant"
499
+ op_type: "Constant"
500
+ attribute {
501
+ name: "value"
502
+ t {
503
+ data_type: 7
504
+ data_location: 0
505
+ }
506
+ type: TENSOR
507
+ }
508
+ }
509
+ node {
510
+ input: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
511
+ input: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
512
+ output: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
513
+ name: "/quantizer/vq/layers.0/_codebook/Gather"
514
+ op_type: "Gather"
515
+ attribute {
516
+ name: "axis"
517
+ i: 0
518
+ type: INT
519
+ }
520
+ }
521
+ node {
522
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
523
+ output: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
524
+ name: "/quantizer/vq/layers.0/_codebook/Shape_1"
525
+ op_type: "Shape"
526
+ }
527
+ node {
528
+ output: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
529
+ name: "/quantizer/vq/layers.0/_codebook/Constant_1"
530
+ op_type: "Constant"
531
+ attribute {
532
+ name: "value"
533
+ t {
534
+ data_type: 7
535
+ data_location: 0
536
+ }
537
+ type: TENSOR
538
+ }
539
+ }
540
+ node {
541
+ input: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
542
+ input: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
543
+ output: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
544
+ name: "/quantizer/vq/layers.0/_codebook/Gather_1"
545
+ op_type: "Gather"
546
+ attribute {
547
+ name: "axis"
548
+ i: 0
549
+ type: INT
550
+ }
551
+ }
552
+ node {
553
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
554
+ output: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
555
+ name: "/quantizer/vq/layers.0/_codebook/Shape_2"
556
+ op_type: "Shape"
557
+ }
558
+ node {
559
+ output: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
560
+ name: "/quantizer/vq/layers.0/_codebook/Constant_2"
561
+ op_type: "Constant"
562
+ attribute {
563
+ name: "value"
564
+ t {
565
+ data_type: 7
566
+ data_location: 0
567
+ }
568
+ type: TENSOR
569
+ }
570
+ }
571
+ node {
572
+ input: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
573
+ input: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
574
+ output: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
575
+ name: "/quantizer/vq/layers.0/_codebook/Gather_2"
576
+ op_type: "Gather"
577
+ attribute {
578
+ name: "axis"
579
+ i: 0
580
+ type: INT
581
+ }
582
+ }
583
+ node {
584
+ output: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
585
+ name: "/quantizer/vq/layers.0/_codebook/Constant_3"
586
+ op_type: "Constant"
587
+ attribute {
588
+ name: "value"
589
+ t {
590
+ data_type: 7
591
+ data_location: 0
592
+ }
593
+ type: TENSOR
594
+ }
595
+ }
596
+ node {
597
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
598
+ input: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
599
+ output: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
600
+ name: "/quantizer/vq/layers.0/_codebook/Mul"
601
+ op_type: "Mul"
602
+ }
603
+ node {
604
+ input: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
605
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
606
+ output: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
607
+ name: "/quantizer/vq/layers.0/_codebook/Mul_1"
608
+ op_type: "Mul"
609
+ }
610
+ node {
611
+ output: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
612
+ name: "/quantizer/vq/layers.0/_codebook/Constant_4"
613
+ op_type: "Constant"
614
+ attribute {
615
+ name: "value"
616
+ t {
617
+ data_type: 7
618
+ data_location: 0
619
+ }
620
+ type: TENSOR
621
+ }
622
+ }
623
+ node {
624
+ input: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
625
+ input: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
626
+ output: "/quantizer/vq/layers.0/_codebook/Div_output_0"
627
+ name: "/quantizer/vq/layers.0/_codebook/Div"
628
+ op_type: "Div"
629
+ }
630
+ node {
631
+ input: "/quantizer/vq/layers.0/_codebook/Div_output_0"
632
+ output: "/quantizer/vq/layers.0/_codebook/Cast_output_0"
633
+ name: "/quantizer/vq/layers.0/_codebook/Cast"
634
+ op_type: "Cast"
635
+ attribute {
636
+ name: "to"
637
+ i: 7
638
+ type: INT
639
+ }
640
+ }
641
+ node {
642
+ input: "/quantizer/vq/layers.0/_codebook/Cast_output_0"
643
+ output: "/quantizer/vq/layers.0/_codebook/Cast_1_output_0"
644
+ name: "/quantizer/vq/layers.0/_codebook/Cast_1"
645
+ op_type: "Cast"
646
+ attribute {
647
+ name: "to"
648
+ i: 7
649
+ type: INT
650
+ }
651
+ }
652
+ node {
653
+ output: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
654
+ name: "/quantizer/vq/layers.0/_codebook/Constant_5"
655
+ op_type: "Constant"
656
+ attribute {
657
+ name: "value"
658
+ t {
659
+ data_type: 7
660
+ data_location: 0
661
+ }
662
+ type: TENSOR
663
+ }
664
+ }
665
+ node {
666
+ input: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
667
+ input: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
668
+ output: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
669
+ name: "/quantizer/vq/layers.0/_codebook/Mul_2"
670
+ op_type: "Mul"
671
+ }
672
+ node {
673
+ output: "onnx::Unsqueeze_847"
674
+ name: "Constant_60"
675
+ op_type: "Constant"
676
+ attribute {
677
+ name: "value"
678
+ t {
679
+ dims: 1
680
+ data_type: 7
681
+ data_location: 0
682
+ }
683
+ type: TENSOR
684
+ }
685
+ }
686
+ node {
687
+ input: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
688
+ input: "onnx::Unsqueeze_847"
689
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
690
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze"
691
+ op_type: "Unsqueeze"
692
+ }
693
+ node {
694
+ output: "onnx::Unsqueeze_849"
695
+ name: "Constant_62"
696
+ op_type: "Constant"
697
+ attribute {
698
+ name: "value"
699
+ t {
700
+ dims: 1
701
+ data_type: 7
702
+ data_location: 0
703
+ }
704
+ type: TENSOR
705
+ }
706
+ }
707
+ node {
708
+ input: "/quantizer/vq/layers.0/_codebook/Cast_1_output_0"
709
+ input: "onnx::Unsqueeze_849"
710
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
711
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1"
712
+ op_type: "Unsqueeze"
713
+ }
714
+ node {
715
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
716
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
717
+ output: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
718
+ name: "/quantizer/vq/layers.0/_codebook/Concat"
719
+ op_type: "Concat"
720
+ attribute {
721
+ name: "axis"
722
+ i: 0
723
+ type: INT
724
+ }
725
+ }
726
+ node {
727
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
728
+ input: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
729
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
730
+ name: "/quantizer/vq/layers.0/_codebook/Reshape"
731
+ op_type: "Reshape"
732
+ attribute {
733
+ name: "allowzero"
734
+ i: 0
735
+ type: INT
736
+ }
737
+ }
738
+ node {
739
+ output: "onnx::Unsqueeze_853"
740
+ name: "Constant_66"
741
+ op_type: "Constant"
742
+ attribute {
743
+ name: "value"
744
+ t {
745
+ dims: 1
746
+ data_type: 7
747
+ data_location: 0
748
+ }
749
+ type: TENSOR
750
+ }
751
+ }
752
+ node {
753
+ input: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
754
+ input: "onnx::Unsqueeze_853"
755
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
756
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2"
757
+ op_type: "Unsqueeze"
758
+ }
759
+ node {
760
+ output: "onnx::Unsqueeze_855"
761
+ name: "Constant_68"
762
+ op_type: "Constant"
763
+ attribute {
764
+ name: "value"
765
+ t {
766
+ dims: 1
767
+ data_type: 7
768
+ data_location: 0
769
+ }
770
+ type: TENSOR
771
+ }
772
+ }
773
+ node {
774
+ input: "/quantizer/vq/layers.0/_codebook/Cast_1_output_0"
775
+ input: "onnx::Unsqueeze_855"
776
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
777
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3"
778
+ op_type: "Unsqueeze"
779
+ }
780
+ node {
781
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
782
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
783
+ output: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
784
+ name: "/quantizer/vq/layers.0/_codebook/Concat_1"
785
+ op_type: "Concat"
786
+ attribute {
787
+ name: "axis"
788
+ i: 0
789
+ type: INT
790
+ }
791
+ }
792
+ node {
793
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
794
+ input: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
795
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
796
+ name: "/quantizer/vq/layers.0/_codebook/Reshape_1"
797
+ op_type: "Reshape"
798
+ attribute {
799
+ name: "allowzero"
800
+ i: 0
801
+ type: INT
802
+ }
803
+ }
804
+ node {
805
+ output: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
806
+ name: "/quantizer/vq/layers.0/_codebook/Constant_6"
807
+ op_type: "Constant"
808
+ attribute {
809
+ name: "value"
810
+ t {
811
+ data_type: 1
812
+ data_location: 0
813
+ }
814
+ type: TENSOR
815
+ }
816
+ }
817
+ node {
818
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
819
+ input: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
820
+ output: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
821
+ name: "/quantizer/vq/layers.0/_codebook/Pow"
822
+ op_type: "Pow"
823
+ }
824
+ node {
825
+ input: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
826
+ input: "onnx::ReduceSum_786"
827
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
828
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum"
829
+ op_type: "ReduceSum"
830
+ attribute {
831
+ name: "keepdims"
832
+ i: 1
833
+ type: INT
834
+ }
835
+ }
836
+ node {
837
+ output: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
838
+ name: "/quantizer/vq/layers.0/_codebook/Constant_7"
839
+ op_type: "Constant"
840
+ attribute {
841
+ name: "value"
842
+ t {
843
+ data_type: 1
844
+ data_location: 0
845
+ }
846
+ type: TENSOR
847
+ }
848
+ }
849
+ node {
850
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
851
+ input: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
852
+ output: "/quantizer/vq/layers.0/_codebook/Mul_3_output_0"
853
+ name: "/quantizer/vq/layers.0/_codebook/Mul_3"
854
+ op_type: "Mul"
855
+ }
856
+ node {
857
+ input: "/quantizer/vq/layers.0/_codebook/Mul_3_output_0"
858
+ input: "onnx::MatMul_1058"
859
+ output: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
860
+ name: "/quantizer/vq/layers.0/_codebook/MatMul"
861
+ op_type: "MatMul"
862
+ }
863
+ node {
864
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
865
+ input: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
866
+ output: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
867
+ name: "/quantizer/vq/layers.0/_codebook/Sub"
868
+ op_type: "Sub"
869
+ }
870
+ node {
871
+ output: "/quantizer/vq/layers.0/_codebook/Constant_8_output_0"
872
+ name: "/quantizer/vq/layers.0/_codebook/Constant_8"
873
+ op_type: "Constant"
874
+ attribute {
875
+ name: "value"
876
+ t {
877
+ data_type: 1
878
+ data_location: 0
879
+ }
880
+ type: TENSOR
881
+ }
882
+ }
883
+ node {
884
+ input: "onnx::MatMul_1058"
885
+ input: "/quantizer/vq/layers.0/_codebook/Constant_8_output_0"
886
+ output: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
887
+ name: "/quantizer/vq/layers.0/_codebook/Pow_1"
888
+ op_type: "Pow"
889
+ }
890
+ node {
891
+ input: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
892
+ input: "onnx::ReduceSum_785"
893
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
894
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum_1"
895
+ op_type: "ReduceSum"
896
+ attribute {
897
+ name: "keepdims"
898
+ i: 1
899
+ type: INT
900
+ }
901
+ }
902
+ node {
903
+ input: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
904
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
905
+ output: "/quantizer/vq/layers.0/_codebook/Add_output_0"
906
+ name: "/quantizer/vq/layers.0/_codebook/Add"
907
+ op_type: "Add"
908
+ }
909
+ node {
910
+ input: "/quantizer/vq/layers.0/_codebook/Add_output_0"
911
+ output: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
912
+ name: "/quantizer/vq/layers.0/_codebook/Neg"
913
+ op_type: "Neg"
914
+ }
915
+ node {
916
+ input: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
917
+ output: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
918
+ name: "/quantizer/vq/layers.0/_codebook/ArgMax"
919
+ op_type: "ArgMax"
920
+ attribute {
921
+ name: "axis"
922
+ i: -1
923
+ type: INT
924
+ }
925
+ attribute {
926
+ name: "keepdims"
927
+ i: 0
928
+ type: INT
929
+ }
930
+ }
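# [editor's note] The Pow/ReduceSum/MatMul/Sub/Add/Neg/ArgMax chain above is the
# standard VQ-VAE nearest-codebook-entry lookup: squared distance
# ||x||^2 - 2*x.E^T + ||E||^2, minimized via argmax of its negation. A minimal NumPy
# sketch (names are mine; x: (N, D) flattened frames, codebook E: (K, D); the exact
# scalar constants are not shown in this text dump):
#   import numpy as np
#   def nearest_code(x, E):
#       d = (x**2).sum(-1, keepdims=True) - 2.0 * x @ E.T + (E**2).sum(-1)
#       return np.argmin(d, axis=-1)   # == argmax(-d), matching the Neg + ArgMax nodes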
931
+ node {
932
+ output: "onnx::Unsqueeze_873"
933
+ name: "Constant_85"
934
+ op_type: "Constant"
935
+ attribute {
936
+ name: "value"
937
+ t {
938
+ dims: 1
939
+ data_type: 7
940
+ data_location: 0
941
+ }
942
+ type: TENSOR
943
+ }
944
+ }
945
+ node {
946
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
947
+ input: "onnx::Unsqueeze_873"
948
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_4_output_0"
949
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_4"
950
+ op_type: "Unsqueeze"
951
+ }
952
+ node {
953
+ output: "onnx::Unsqueeze_875"
954
+ name: "Constant_87"
955
+ op_type: "Constant"
956
+ attribute {
957
+ name: "value"
958
+ t {
959
+ dims: 1
960
+ data_type: 7
961
+ data_location: 0
962
+ }
963
+ type: TENSOR
964
+ }
965
+ }
966
+ node {
967
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
968
+ input: "onnx::Unsqueeze_875"
969
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_5_output_0"
970
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_5"
971
+ op_type: "Unsqueeze"
972
+ }
973
+ node {
974
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_4_output_0"
975
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_5_output_0"
976
+ output: "/quantizer/vq/layers.0/_codebook/Concat_2_output_0"
977
+ name: "/quantizer/vq/layers.0/_codebook/Concat_2"
978
+ op_type: "Concat"
979
+ attribute {
980
+ name: "axis"
981
+ i: 0
982
+ type: INT
983
+ }
984
+ }
985
+ node {
986
+ input: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
987
+ input: "/quantizer/vq/layers.0/_codebook/Concat_2_output_0"
988
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_2_output_0"
989
+ name: "/quantizer/vq/layers.0/_codebook/Reshape_2"
990
+ op_type: "Reshape"
991
+ attribute {
992
+ name: "allowzero"
993
+ i: 0
994
+ type: INT
995
+ }
996
+ }
997
+ node {
998
+ output: "/quantizer/vq/Constant_output_0"
999
+ name: "/quantizer/vq/Constant"
1000
+ op_type: "Constant"
1001
+ attribute {
1002
+ name: "value"
1003
+ t {
1004
+ dims: 1
1005
+ data_type: 7
1006
+ data_location: 0
1007
+ }
1008
+ type: TENSOR
1009
+ }
1010
+ }
1011
+ node {
1012
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_2_output_0"
1013
+ input: "/quantizer/vq/Constant_output_0"
1014
+ output: "/quantizer/vq/Unsqueeze_output_0"
1015
+ name: "/quantizer/vq/Unsqueeze"
1016
+ op_type: "Unsqueeze"
1017
+ }
1018
+ node {
1019
+ input: "/quantizer/vq/Unsqueeze_output_0"
1020
+ output: "/quantizer/vq/Concat_output_0"
1021
+ name: "/quantizer/vq/Concat"
1022
+ op_type: "Concat"
1023
+ attribute {
1024
+ name: "axis"
1025
+ i: 0
1026
+ type: INT
1027
+ }
1028
+ }
1029
+ node {
1030
+ input: "/quantizer/vq/Concat_output_0"
1031
+ output: "/Transpose_output_0"
1032
+ name: "/Transpose"
1033
+ op_type: "Transpose"
1034
+ attribute {
1035
+ name: "perm"
1036
+ ints: 1
1037
+ ints: 0
1038
+ ints: 2
1039
+ type: INTS
1040
+ }
1041
+ }
1042
+ node {
1043
+ input: "/Transpose_output_0"
1044
+ input: "/quantizer/vq/layers.0/Constant_output_0"
1045
+ output: "/Gather_output_0"
1046
+ name: "/Gather"
1047
+ op_type: "Gather"
1048
+ attribute {
1049
+ name: "axis"
1050
+ i: 0
1051
+ type: INT
1052
+ }
1053
+ }
1054
+ node {
1055
+ input: "/Gather_output_0"
1056
+ input: "/quantizer/vq/layers.0/Constant_output_0"
1057
+ output: "/Gather_1_output_0"
1058
+ name: "/Gather_1"
1059
+ op_type: "Gather"
1060
+ attribute {
1061
+ name: "axis"
1062
+ i: 0
1063
+ type: INT
1064
+ }
1065
+ }
1066
+ node {
1067
+ input: "ref_bert"
1068
+ output: "/Transpose_1_output_0"
1069
+ name: "/Transpose_1"
1070
+ op_type: "Transpose"
1071
+ attribute {
1072
+ name: "perm"
1073
+ ints: 1
1074
+ ints: 0
1075
+ type: INTS
1076
+ }
1077
+ }
1078
+ node {
1079
+ input: "text_bert"
1080
+ output: "/Transpose_2_output_0"
1081
+ name: "/Transpose_2"
1082
+ op_type: "Transpose"
1083
+ attribute {
1084
+ name: "perm"
1085
+ ints: 1
1086
+ ints: 0
1087
+ type: INTS
1088
+ }
1089
+ }
1090
+ node {
1091
+ input: "/Transpose_1_output_0"
1092
+ input: "/Transpose_2_output_0"
1093
+ output: "/Concat_output_0"
1094
+ name: "/Concat"
1095
+ op_type: "Concat"
1096
+ attribute {
1097
+ name: "axis"
1098
+ i: 1
1099
+ type: INT
1100
+ }
1101
+ }
1102
+ node {
1103
+ input: "ref_seq"
1104
+ input: "text_seq"
1105
+ output: "/Concat_1_output_0"
1106
+ name: "/Concat_1"
1107
+ op_type: "Concat"
1108
+ attribute {
1109
+ name: "axis"
1110
+ i: 1
1111
+ type: INT
1112
+ }
1113
+ }
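# [editor's note] /Concat and /Concat_1 above join the reference and target-text
# streams along the time axis: BERT features as concat(ref_bert, text_bert) and
# phoneme id sequences as concat(ref_seq, text_seq). Everything downstream of this
# point operates on the joined sequence.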
1114
+ node {
1115
+ output: "/Constant_output_0"
1116
+ name: "/Constant"
1117
+ op_type: "Constant"
1118
+ attribute {
1119
+ name: "value"
1120
+ t {
1121
+ dims: 1
1122
+ data_type: 7
1123
+ data_location: 0
1124
+ }
1125
+ type: TENSOR
1126
+ }
1127
+ }
1128
+ node {
1129
+ input: "/Concat_output_0"
1130
+ input: "/Constant_output_0"
1131
+ output: "/Unsqueeze_output_0"
1132
+ name: "/Unsqueeze"
1133
+ op_type: "Unsqueeze"
1134
+ }
1135
+ node {
1136
+ output: "/Constant_1_output_0"
1137
+ name: "/Constant_1"
1138
+ op_type: "Constant"
1139
+ attribute {
1140
+ name: "value"
1141
+ t {
1142
+ dims: 1
1143
+ data_type: 7
1144
+ data_location: 0
1145
+ }
1146
+ type: TENSOR
1147
+ }
1148
+ }
1149
+ node {
1150
+ input: "/Gather_1_output_0"
1151
+ input: "/Constant_1_output_0"
1152
+ output: "prompts"
1153
+ name: "/Unsqueeze_1"
1154
+ op_type: "Unsqueeze"
1155
+ }
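# [editor's note] "prompts" is a graph output: the reference audio's quantized code
# indices from the VQ lookup above, with an axis added by this Unsqueeze. In the
# accompanying runner script these presumably seed the autoregressive decoder as the
# acoustic prompt (usage inferred, not stated in the graph itself).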
1156
+ node {
1157
+ input: "encoder.ar_text_embedding.word_embeddings.weight"
1158
+ input: "/Concat_1_output_0"
1159
+ output: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
1160
+ name: "/encoder/ar_text_embedding/word_embeddings/Gather"
1161
+ op_type: "Gather"
1162
+ }
1163
+ node {
1164
+ input: "/Unsqueeze_output_0"
1165
+ output: "/encoder/Transpose_output_0"
1166
+ name: "/encoder/Transpose"
1167
+ op_type: "Transpose"
1168
+ attribute {
1169
+ name: "perm"
1170
+ ints: 0
1171
+ ints: 2
1172
+ ints: 1
1173
+ type: INTS
1174
+ }
1175
+ }
1176
+ node {
1177
+ input: "/encoder/Transpose_output_0"
1178
+ input: "onnx::MatMul_1059"
1179
+ output: "/encoder/bert_proj/MatMul_output_0"
1180
+ name: "/encoder/bert_proj/MatMul"
1181
+ op_type: "MatMul"
1182
+ }
1183
+ node {
1184
+ input: "encoder.bert_proj.bias"
1185
+ input: "/encoder/bert_proj/MatMul_output_0"
1186
+ output: "/encoder/bert_proj/Add_output_0"
1187
+ name: "/encoder/bert_proj/Add"
1188
+ op_type: "Add"
1189
+ }
1190
+ node {
1191
+ input: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
1192
+ input: "/encoder/bert_proj/Add_output_0"
1193
+ output: "/encoder/Add_output_0"
1194
+ name: "/encoder/Add"
1195
+ op_type: "Add"
1196
+ }
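# [editor's note] The encoder input embedding formed above is
#   x = word_embeddings[concat(ref_seq, text_seq)] + bert_proj(concat(ref_bert, text_bert))
# i.e. a phoneme-embedding table lookup plus a linear projection of the BERT features
# (bert_proj being the MatMul + bias Add immediately above).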
1197
+ node {
1198
+ input: "/encoder/Add_output_0"
1199
+ output: "/encoder/ar_text_position/Shape_output_0"
1200
+ name: "/encoder/ar_text_position/Shape"
1201
+ op_type: "Shape"
1202
+ }
1203
+ node {
1204
+ output: "/encoder/ar_text_position/Constant_output_0"
1205
+ name: "/encoder/ar_text_position/Constant"
1206
+ op_type: "Constant"
1207
+ attribute {
1208
+ name: "value"
1209
+ t {
1210
+ data_type: 7
1211
+ data_location: 0
1212
+ }
1213
+ type: TENSOR
1214
+ }
1215
+ }
1216
+ node {
1217
+ input: "/encoder/ar_text_position/Shape_output_0"
1218
+ input: "/encoder/ar_text_position/Constant_output_0"
1219
+ output: "/encoder/ar_text_position/Gather_output_0"
1220
+ name: "/encoder/ar_text_position/Gather"
1221
+ op_type: "Gather"
1222
+ attribute {
1223
+ name: "axis"
1224
+ i: 0
1225
+ type: INT
1226
+ }
1227
+ }
1228
+ node {
1229
+ output: "/encoder/ar_text_position/Constant_1_output_0"
1230
+ name: "/encoder/ar_text_position/Constant_1"
1231
+ op_type: "Constant"
1232
+ attribute {
1233
+ name: "value"
1234
+ t {
1235
+ data_type: 1
1236
+ data_location: 0
1237
+ }
1238
+ type: TENSOR
1239
+ }
1240
+ }
1241
+ node {
1242
+ input: "/encoder/ar_text_position/Gather_output_0"
1243
+ output: "/encoder/ar_text_position/Cast_output_0"
1244
+ name: "/encoder/ar_text_position/Cast"
1245
+ op_type: "Cast"
1246
+ attribute {
1247
+ name: "to"
1248
+ i: 1
1249
+ type: INT
1250
+ }
1251
+ }
1252
+ node {
1253
+ output: "/encoder/ar_text_position/Constant_2_output_0"
1254
+ name: "/encoder/ar_text_position/Constant_2"
1255
+ op_type: "Constant"
1256
+ attribute {
1257
+ name: "value"
1258
+ t {
1259
+ data_type: 1
1260
+ data_location: 0
1261
+ }
1262
+ type: TENSOR
1263
+ }
1264
+ }
1265
+ node {
1266
+ input: "/encoder/ar_text_position/Constant_1_output_0"
1267
+ input: "/encoder/ar_text_position/Cast_output_0"
1268
+ input: "/encoder/ar_text_position/Constant_2_output_0"
1269
+ output: "/encoder/ar_text_position/Range_output_0"
1270
+ name: "/encoder/ar_text_position/Range"
1271
+ op_type: "Range"
1272
+ }
1273
+ node {
1274
+ output: "/encoder/ar_text_position/Constant_3_output_0"
1275
+ name: "/encoder/ar_text_position/Constant_3"
1276
+ op_type: "Constant"
1277
+ attribute {
1278
+ name: "value"
1279
+ t {
1280
+ dims: 1
1281
+ data_type: 7
1282
+ data_location: 0
1283
+ }
1284
+ type: TENSOR
1285
+ }
1286
+ }
1287
+ node {
1288
+ input: "/encoder/ar_text_position/Range_output_0"
1289
+ input: "/encoder/ar_text_position/Constant_3_output_0"
1290
+ output: "/encoder/ar_text_position/Unsqueeze_output_0"
1291
+ name: "/encoder/ar_text_position/Unsqueeze"
1292
+ op_type: "Unsqueeze"
1293
+ }
1294
+ node {
1295
+ output: "onnx::Unsqueeze_909"
1296
+ name: "Constant_119"
1297
+ op_type: "Constant"
1298
+ attribute {
1299
+ name: "value"
1300
+ t {
1301
+ dims: 1
1302
+ data_type: 7
1303
+ data_location: 0
1304
+ }
1305
+ type: TENSOR
1306
+ }
1307
+ }
1308
+ node {
1309
+ input: "/encoder/ar_text_position/Gather_output_0"
1310
+ input: "onnx::Unsqueeze_909"
1311
+ output: "/encoder/ar_text_position/Unsqueeze_1_output_0"
1312
+ name: "/encoder/ar_text_position/Unsqueeze_1"
1313
+ op_type: "Unsqueeze"
1314
+ }
1315
+ node {
1316
+ output: "/encoder/ar_text_position/Constant_4_output_0"
1317
+ name: "/encoder/ar_text_position/Constant_4"
1318
+ op_type: "Constant"
1319
+ attribute {
1320
+ name: "value"
1321
+ t {
1322
+ dims: 1
1323
+ data_type: 7
1324
+ data_location: 0
1325
+ }
1326
+ type: TENSOR
1327
+ }
1328
+ }
1329
+ node {
1330
+ input: "/encoder/ar_text_position/Unsqueeze_1_output_0"
1331
+ input: "/encoder/ar_text_position/Constant_4_output_0"
1332
+ output: "/encoder/ar_text_position/Concat_output_0"
1333
+ name: "/encoder/ar_text_position/Concat"
1334
+ op_type: "Concat"
1335
+ attribute {
1336
+ name: "axis"
1337
+ i: 0
1338
+ type: INT
1339
+ }
1340
+ }
1341
+ node {
1342
+ input: "/encoder/ar_text_position/Concat_output_0"
1343
+ output: "/encoder/ar_text_position/ConstantOfShape_output_0"
1344
+ name: "/encoder/ar_text_position/ConstantOfShape"
1345
+ op_type: "ConstantOfShape"
1346
+ attribute {
1347
+ name: "value"
1348
+ t {
1349
+ dims: 1
1350
+ data_type: 1
1351
+ raw_data: "\000\000\000\000"
1352
+ }
1353
+ type: TENSOR
1354
+ }
1355
+ }
1356
+ node {
1357
+ output: "/encoder/ar_text_position/Constant_5_output_0"
1358
+ name: "/encoder/ar_text_position/Constant_5"
1359
+ op_type: "Constant"
1360
+ attribute {
1361
+ name: "value"
1362
+ t {
1363
+ dims: 256
1364
+ data_type: 1
1365
+ data_location: 0
1366
+ }
1367
+ type: TENSOR
1368
+ }
1369
+ }
1370
+ node {
1371
+ input: "/encoder/ar_text_position/Unsqueeze_output_0"
1372
+ input: "/encoder/ar_text_position/Constant_5_output_0"
1373
+ output: "/encoder/ar_text_position/Mul_output_0"
1374
+ name: "/encoder/ar_text_position/Mul"
1375
+ op_type: "Mul"
1376
+ }
1377
+ node {
1378
+ input: "/encoder/ar_text_position/Mul_output_0"
1379
+ output: "/encoder/ar_text_position/Sin_output_0"
1380
+ name: "/encoder/ar_text_position/Sin"
1381
+ op_type: "Sin"
1382
+ }
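# [editor's note] ar_text_position builds a sinusoidal positional table: positions
# arange(T) are scaled by the 256 inverse-frequency constants in Constant_5 (the /Mul
# node), then passed through Sin here and Cos further below. A hedged NumPy sketch of
# the intended result (dims assumed from Constant_5's size of 256):
#   import numpy as np
#   def sin_cos_table(T, inv_freq):              # inv_freq: (256,)
#       ang = np.arange(T)[:, None] * inv_freq   # the /encoder/ar_text_position/Mul node
#       return np.sin(ang), np.cos(ang)          # later scattered into one (T, 512) table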
1383
+ node {
1384
+ output: "/encoder/ar_text_position/Constant_6_output_0"
1385
+ name: "/encoder/ar_text_position/Constant_6"
1386
+ op_type: "Constant"
1387
+ attribute {
1388
+ name: "value"
1389
+ t {
1390
+ dims: 1
1391
+ data_type: 7
1392
+ data_location: 0
1393
+ }
1394
+ type: TENSOR
1395
+ }
1396
+ }
1397
+ node {
1398
+ output: "/encoder/ar_text_position/Constant_7_output_0"
1399
+ name: "/encoder/ar_text_position/Constant_7"
1400
+ op_type: "Constant"
1401
+ attribute {
1402
+ name: "value"
1403
+ t {
1404
+ dims: 1
1405
+ data_type: 7
1406
+ data_location: 0
1407
+ }
1408
+ type: TENSOR
1409
+ }
1410
+ }
1411
+ node {
1412
+ output: "/encoder/ar_text_position/Constant_8_output_0"
1413
+ name: "/encoder/ar_text_position/Constant_8"
1414
+ op_type: "Constant"
1415
+ attribute {
1416
+ name: "value"
1417
+ t {
1418
+ dims: 1
1419
+ data_type: 7
1420
+ data_location: 0
1421
+ }
1422
+ type: TENSOR
1423
+ }
1424
+ }
1425
+ node {
1426
+ output: "/encoder/ar_text_position/Constant_9_output_0"
1427
+ name: "/encoder/ar_text_position/Constant_9"
1428
+ op_type: "Constant"
1429
+ attribute {
1430
+ name: "value"
1431
+ t {
1432
+ dims: 1
1433
+ data_type: 7
1434
+ data_location: 0
1435
+ }
1436
+ type: TENSOR
1437
+ }
1438
+ }
1439
+ node {
1440
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1441
+ input: "/encoder/ar_text_position/Constant_7_output_0"
1442
+ input: "/encoder/ar_text_position/Constant_8_output_0"
1443
+ input: "/encoder/ar_text_position/Constant_6_output_0"
1444
+ input: "/encoder/ar_text_position/Constant_9_output_0"
1445
+ output: "/encoder/ar_text_position/Slice_output_0"
1446
+ name: "/encoder/ar_text_position/Slice"
1447
+ op_type: "Slice"
1448
+ }
1449
+ node {
1450
+ input: "/encoder/ar_text_position/Slice_output_0"
1451
+ output: "/encoder/ar_text_position/Shape_1_output_0"
1452
+ name: "/encoder/ar_text_position/Shape_1"
1453
+ op_type: "Shape"
1454
+ }
1455
+ node {
1456
+ input: "/encoder/ar_text_position/Sin_output_0"
1457
+ input: "/encoder/ar_text_position/Shape_1_output_0"
1458
+ output: "/encoder/ar_text_position/Expand_output_0"
1459
+ name: "/encoder/ar_text_position/Expand"
1460
+ op_type: "Expand"
1461
+ }
1462
+ node {
1463
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1464
+ output: "/encoder/ar_text_position/Shape_2_output_0"
1465
+ name: "/encoder/ar_text_position/Shape_2"
1466
+ op_type: "Shape"
1467
+ }
1468
+ node {
1469
+ output: "/encoder/ar_text_position/Constant_10_output_0"
1470
+ name: "/encoder/ar_text_position/Constant_10"
1471
+ op_type: "Constant"
1472
+ attribute {
1473
+ name: "value"
1474
+ t {
1475
+ data_type: 7
1476
+ data_location: 0
1477
+ }
1478
+ type: TENSOR
1479
+ }
1480
+ }
1481
+ node {
1482
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1483
+ input: "/encoder/ar_text_position/Constant_10_output_0"
1484
+ output: "/encoder/ar_text_position/Gather_1_output_0"
1485
+ name: "/encoder/ar_text_position/Gather_1"
1486
+ op_type: "Gather"
1487
+ attribute {
1488
+ name: "axis"
1489
+ i: 0
1490
+ type: INT
1491
+ }
1492
+ }
1493
+ node {
1494
+ input: "/encoder/ar_text_position/Gather_1_output_0"
1495
+ output: "/encoder/ar_text_position/Cast_1_output_0"
1496
+ name: "/encoder/ar_text_position/Cast_1"
1497
+ op_type: "Cast"
1498
+ attribute {
1499
+ name: "to"
1500
+ i: 7
1501
+ type: INT
1502
+ }
1503
+ }
1504
+ node {
1505
+ output: "/encoder/ar_text_position/Constant_11_output_0"
1506
+ name: "/encoder/ar_text_position/Constant_11"
1507
+ op_type: "Constant"
1508
+ attribute {
1509
+ name: "value"
1510
+ t {
1511
+ data_type: 7
1512
+ data_location: 0
1513
+ }
1514
+ type: TENSOR
1515
+ }
1516
+ }
1517
+ node {
1518
+ output: "/encoder/ar_text_position/Constant_12_output_0"
1519
+ name: "/encoder/ar_text_position/Constant_12"
1520
+ op_type: "Constant"
1521
+ attribute {
1522
+ name: "value"
1523
+ t {
1524
+ data_type: 7
1525
+ data_location: 0
1526
+ }
1527
+ type: TENSOR
1528
+ }
1529
+ }
1530
+ node {
1531
+ input: "/encoder/ar_text_position/Constant_11_output_0"
1532
+ input: "/encoder/ar_text_position/Cast_1_output_0"
1533
+ input: "/encoder/ar_text_position/Constant_12_output_0"
1534
+ output: "/encoder/ar_text_position/Range_1_output_0"
1535
+ name: "/encoder/ar_text_position/Range_1"
1536
+ op_type: "Range"
1537
+ }
1538
+ node {
1539
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1540
+ output: "/encoder/ar_text_position/Shape_3_output_0"
1541
+ name: "/encoder/ar_text_position/Shape_3"
1542
+ op_type: "Shape"
1543
+ }
1544
+ node {
1545
+ output: "/encoder/ar_text_position/Constant_13_output_0"
1546
+ name: "/encoder/ar_text_position/Constant_13"
1547
+ op_type: "Constant"
1548
+ attribute {
1549
+ name: "value"
1550
+ t {
1551
+ data_type: 7
1552
+ data_location: 0
1553
+ }
1554
+ type: TENSOR
1555
+ }
1556
+ }
1557
+ node {
1558
+ input: "/encoder/ar_text_position/Shape_3_output_0"
1559
+ input: "/encoder/ar_text_position/Constant_13_output_0"
1560
+ output: "/encoder/ar_text_position/Gather_2_output_0"
1561
+ name: "/encoder/ar_text_position/Gather_2"
1562
+ op_type: "Gather"
1563
+ attribute {
1564
+ name: "axis"
1565
+ i: 0
1566
+ type: INT
1567
+ }
1568
+ }
1569
+ node {
1570
+ input: "/encoder/ar_text_position/Gather_2_output_0"
1571
+ output: "/encoder/ar_text_position/Cast_2_output_0"
1572
+ name: "/encoder/ar_text_position/Cast_2"
1573
+ op_type: "Cast"
1574
+ attribute {
1575
+ name: "to"
1576
+ i: 7
1577
+ type: INT
1578
+ }
1579
+ }
1580
+ node {
1581
+ output: "/encoder/ar_text_position/Constant_14_output_0"
1582
+ name: "/encoder/ar_text_position/Constant_14"
1583
+ op_type: "Constant"
1584
+ attribute {
1585
+ name: "value"
1586
+ t {
1587
+ data_type: 7
1588
+ data_location: 0
1589
+ }
1590
+ type: TENSOR
1591
+ }
1592
+ }
1593
+ node {
1594
+ output: "/encoder/ar_text_position/Constant_15_output_0"
1595
+ name: "/encoder/ar_text_position/Constant_15"
1596
+ op_type: "Constant"
1597
+ attribute {
1598
+ name: "value"
1599
+ t {
1600
+ data_type: 7
1601
+ data_location: 0
1602
+ }
1603
+ type: TENSOR
1604
+ }
1605
+ }
1606
+ node {
1607
+ input: "/encoder/ar_text_position/Constant_14_output_0"
1608
+ input: "/encoder/ar_text_position/Cast_2_output_0"
1609
+ input: "/encoder/ar_text_position/Constant_15_output_0"
1610
+ output: "/encoder/ar_text_position/Range_2_output_0"
1611
+ name: "/encoder/ar_text_position/Range_2"
1612
+ op_type: "Range"
1613
+ }
1614
+ node {
1615
+ output: "/encoder/ar_text_position/Constant_16_output_0"
1616
+ name: "/encoder/ar_text_position/Constant_16"
1617
+ op_type: "Constant"
1618
+ attribute {
1619
+ name: "value"
1620
+ t {
1621
+ dims: 1
1622
+ data_type: 7
1623
+ data_location: 0
1624
+ }
1625
+ type: TENSOR
1626
+ }
1627
+ }
1628
+ node {
1629
+ output: "/encoder/ar_text_position/Constant_17_output_0"
1630
+ name: "/encoder/ar_text_position/Constant_17"
1631
+ op_type: "Constant"
1632
+ attribute {
1633
+ name: "value"
1634
+ t {
1635
+ dims: 1
1636
+ data_type: 7
1637
+ data_location: 0
1638
+ }
1639
+ type: TENSOR
1640
+ }
1641
+ }
1642
+ node {
1643
+ output: "/encoder/ar_text_position/Constant_18_output_0"
1644
+ name: "/encoder/ar_text_position/Constant_18"
1645
+ op_type: "Constant"
1646
+ attribute {
1647
+ name: "value"
1648
+ t {
1649
+ dims: 1
1650
+ data_type: 7
1651
+ data_location: 0
1652
+ }
1653
+ type: TENSOR
1654
+ }
1655
+ }
1656
+ node {
1657
+ output: "/encoder/ar_text_position/Constant_19_output_0"
1658
+ name: "/encoder/ar_text_position/Constant_19"
1659
+ op_type: "Constant"
1660
+ attribute {
1661
+ name: "value"
1662
+ t {
1663
+ dims: 1
1664
+ data_type: 7
1665
+ data_location: 0
1666
+ }
1667
+ type: TENSOR
1668
+ }
1669
+ }
1670
+ node {
1671
+ input: "/encoder/ar_text_position/Range_2_output_0"
1672
+ input: "/encoder/ar_text_position/Constant_17_output_0"
1673
+ input: "/encoder/ar_text_position/Constant_18_output_0"
1674
+ input: "/encoder/ar_text_position/Constant_16_output_0"
1675
+ input: "/encoder/ar_text_position/Constant_19_output_0"
1676
+ output: "/encoder/ar_text_position/Slice_1_output_0"
1677
+ name: "/encoder/ar_text_position/Slice_1"
1678
+ op_type: "Slice"
1679
+ }
1680
+ node {
1681
+ output: "/encoder/ar_text_position/Constant_20_output_0"
1682
+ name: "/encoder/ar_text_position/Constant_20"
1683
+ op_type: "Constant"
1684
+ attribute {
1685
+ name: "value"
1686
+ t {
1687
+ dims: 2
1688
+ data_type: 7
1689
+ data_location: 0
1690
+ }
1691
+ type: TENSOR
1692
+ }
1693
+ }
1694
+ node {
1695
+ input: "/encoder/ar_text_position/Range_1_output_0"
1696
+ input: "/encoder/ar_text_position/Constant_20_output_0"
1697
+ output: "/encoder/ar_text_position/Reshape_output_0"
1698
+ name: "/encoder/ar_text_position/Reshape"
1699
+ op_type: "Reshape"
1700
+ attribute {
1701
+ name: "allowzero"
1702
+ i: 0
1703
+ type: INT
1704
+ }
1705
+ }
1706
+ node {
1707
+ input: "/encoder/ar_text_position/Reshape_output_0"
1708
+ input: "/encoder/ar_text_position/Slice_1_output_0"
1709
+ output: "/encoder/ar_text_position/Add_output_0"
1710
+ name: "/encoder/ar_text_position/Add"
1711
+ op_type: "Add"
1712
+ }
1713
+ node {
1714
+ input: "/encoder/ar_text_position/Add_output_0"
1715
+ output: "/encoder/ar_text_position/Shape_4_output_0"
1716
+ name: "/encoder/ar_text_position/Shape_4"
1717
+ op_type: "Shape"
1718
+ }
1719
+ node {
1720
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1721
+ output: "/encoder/ar_text_position/Shape_5_output_0"
1722
+ name: "/encoder/ar_text_position/Shape_5"
1723
+ op_type: "Shape"
1724
+ }
1725
+ node {
1726
+ input: "/encoder/ar_text_position/Shape_5_output_0"
1727
+ output: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1728
+ name: "/encoder/ar_text_position/ConstantOfShape_1"
1729
+ op_type: "ConstantOfShape"
1730
+ attribute {
1731
+ name: "value"
1732
+ t {
1733
+ dims: 1
1734
+ data_type: 7
1735
+ raw_data: "\001\000\000\000\000\000\000\000"
1736
+ }
1737
+ type: TENSOR
1738
+ }
1739
+ }
1740
+ node {
1741
+ output: "/encoder/ar_text_position/Constant_21_output_0"
1742
+ name: "/encoder/ar_text_position/Constant_21"
1743
+ op_type: "Constant"
1744
+ attribute {
1745
+ name: "value"
1746
+ t {
1747
+ data_type: 7
1748
+ data_location: 0
1749
+ }
1750
+ type: TENSOR
1751
+ }
1752
+ }
1753
+ node {
1754
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1755
+ input: "/encoder/ar_text_position/Constant_21_output_0"
1756
+ output: "/encoder/ar_text_position/Mul_1_output_0"
1757
+ name: "/encoder/ar_text_position/Mul_1"
1758
+ op_type: "Mul"
1759
+ }
1760
+ node {
1761
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1762
+ input: "/encoder/ar_text_position/Mul_1_output_0"
1763
+ output: "/encoder/ar_text_position/Equal_output_0"
1764
+ name: "/encoder/ar_text_position/Equal"
1765
+ op_type: "Equal"
1766
+ }
1767
+ node {
1768
+ input: "/encoder/ar_text_position/Equal_output_0"
1769
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1770
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1771
+ output: "/encoder/ar_text_position/Where_output_0"
1772
+ name: "/encoder/ar_text_position/Where"
1773
+ op_type: "Where"
1774
+ }
1775
+ node {
1776
+ input: "/encoder/ar_text_position/Reshape_output_0"
1777
+ input: "/encoder/ar_text_position/Where_output_0"
1778
+ output: "/encoder/ar_text_position/Expand_1_output_0"
1779
+ name: "/encoder/ar_text_position/Expand_1"
1780
+ op_type: "Expand"
1781
+ }
1782
+ node {
1783
+ output: "/encoder/ar_text_position/Constant_22_output_0"
1784
+ name: "/encoder/ar_text_position/Constant_22"
1785
+ op_type: "Constant"
1786
+ attribute {
1787
+ name: "value"
1788
+ t {
1789
+ dims: 1
1790
+ data_type: 7
1791
+ data_location: 0
1792
+ }
1793
+ type: TENSOR
1794
+ }
1795
+ }
1796
+ node {
1797
+ input: "/encoder/ar_text_position/Expand_1_output_0"
1798
+ input: "/encoder/ar_text_position/Constant_22_output_0"
1799
+ output: "/encoder/ar_text_position/Unsqueeze_2_output_0"
1800
+ name: "/encoder/ar_text_position/Unsqueeze_2"
1801
+ op_type: "Unsqueeze"
1802
+ }
1803
+ node {
1804
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1805
+ output: "/encoder/ar_text_position/Shape_6_output_0"
1806
+ name: "/encoder/ar_text_position/Shape_6"
1807
+ op_type: "Shape"
1808
+ }
1809
+ node {
1810
+ input: "/encoder/ar_text_position/Shape_6_output_0"
1811
+ output: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1812
+ name: "/encoder/ar_text_position/ConstantOfShape_2"
1813
+ op_type: "ConstantOfShape"
1814
+ attribute {
1815
+ name: "value"
1816
+ t {
1817
+ dims: 1
1818
+ data_type: 7
1819
+ raw_data: "\001\000\000\000\000\000\000\000"
1820
+ }
1821
+ type: TENSOR
1822
+ }
1823
+ }
1824
+ node {
1825
+ output: "/encoder/ar_text_position/Constant_23_output_0"
1826
+ name: "/encoder/ar_text_position/Constant_23"
1827
+ op_type: "Constant"
1828
+ attribute {
1829
+ name: "value"
1830
+ t {
1831
+ data_type: 7
1832
+ data_location: 0
1833
+ }
1834
+ type: TENSOR
1835
+ }
1836
+ }
1837
+ node {
1838
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1839
+ input: "/encoder/ar_text_position/Constant_23_output_0"
1840
+ output: "/encoder/ar_text_position/Mul_2_output_0"
1841
+ name: "/encoder/ar_text_position/Mul_2"
1842
+ op_type: "Mul"
1843
+ }
1844
+ node {
1845
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1846
+ input: "/encoder/ar_text_position/Mul_2_output_0"
1847
+ output: "/encoder/ar_text_position/Equal_1_output_0"
1848
+ name: "/encoder/ar_text_position/Equal_1"
1849
+ op_type: "Equal"
1850
+ }
1851
+ node {
1852
+ input: "/encoder/ar_text_position/Equal_1_output_0"
1853
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1854
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1855
+ output: "/encoder/ar_text_position/Where_1_output_0"
1856
+ name: "/encoder/ar_text_position/Where_1"
1857
+ op_type: "Where"
1858
+ }
1859
+ node {
1860
+ input: "/encoder/ar_text_position/Slice_1_output_0"
1861
+ input: "/encoder/ar_text_position/Where_1_output_0"
1862
+ output: "/encoder/ar_text_position/Expand_2_output_0"
1863
+ name: "/encoder/ar_text_position/Expand_2"
1864
+ op_type: "Expand"
1865
+ }
1866
+ node {
1867
+ output: "/encoder/ar_text_position/Constant_24_output_0"
1868
+ name: "/encoder/ar_text_position/Constant_24"
1869
+ op_type: "Constant"
1870
+ attribute {
1871
+ name: "value"
1872
+ t {
1873
+ dims: 1
1874
+ data_type: 7
1875
+ data_location: 0
1876
+ }
1877
+ type: TENSOR
1878
+ }
1879
+ }
1880
+ node {
1881
+ input: "/encoder/ar_text_position/Expand_2_output_0"
1882
+ input: "/encoder/ar_text_position/Constant_24_output_0"
1883
+ output: "/encoder/ar_text_position/Unsqueeze_3_output_0"
1884
+ name: "/encoder/ar_text_position/Unsqueeze_3"
1885
+ op_type: "Unsqueeze"
1886
+ }
1887
+ node {
1888
+ input: "/encoder/ar_text_position/Unsqueeze_2_output_0"
1889
+ input: "/encoder/ar_text_position/Unsqueeze_3_output_0"
1890
+ output: "/encoder/ar_text_position/Concat_1_output_0"
1891
+ name: "/encoder/ar_text_position/Concat_1"
1892
+ op_type: "Concat"
1893
+ attribute {
1894
+ name: "axis"
1895
+ i: -1
1896
+ type: INT
1897
+ }
1898
+ }
1899
+ node {
1900
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1901
+ output: "/encoder/ar_text_position/Shape_7_output_0"
1902
+ name: "/encoder/ar_text_position/Shape_7"
1903
+ op_type: "Shape"
1904
+ }
1905
+ node {
1906
+ output: "/encoder/ar_text_position/Constant_25_output_0"
1907
+ name: "/encoder/ar_text_position/Constant_25"
1908
+ op_type: "Constant"
1909
+ attribute {
1910
+ name: "value"
1911
+ t {
1912
+ dims: 1
1913
+ data_type: 7
1914
+ data_location: 0
1915
+ }
1916
+ type: TENSOR
1917
+ }
1918
+ }
1919
+ node {
1920
+ output: "/encoder/ar_text_position/Constant_26_output_0"
1921
+ name: "/encoder/ar_text_position/Constant_26"
1922
+ op_type: "Constant"
1923
+ attribute {
1924
+ name: "value"
1925
+ t {
1926
+ dims: 1
1927
+ data_type: 7
1928
+ data_location: 0
1929
+ }
1930
+ type: TENSOR
1931
+ }
1932
+ }
1933
+ node {
1934
+ output: "/encoder/ar_text_position/Constant_27_output_0"
1935
+ name: "/encoder/ar_text_position/Constant_27"
1936
+ op_type: "Constant"
1937
+ attribute {
1938
+ name: "value"
1939
+ t {
1940
+ dims: 1
1941
+ data_type: 7
1942
+ data_location: 0
1943
+ }
1944
+ type: TENSOR
1945
+ }
1946
+ }
1947
+ node {
1948
+ input: "/encoder/ar_text_position/Shape_7_output_0"
1949
+ input: "/encoder/ar_text_position/Constant_26_output_0"
1950
+ input: "/encoder/ar_text_position/Constant_27_output_0"
1951
+ input: "/encoder/ar_text_position/Constant_25_output_0"
1952
+ output: "/encoder/ar_text_position/Slice_2_output_0"
1953
+ name: "/encoder/ar_text_position/Slice_2"
1954
+ op_type: "Slice"
1955
+ }
1956
+ node {
1957
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1958
+ input: "/encoder/ar_text_position/Slice_2_output_0"
1959
+ output: "/encoder/ar_text_position/Concat_2_output_0"
1960
+ name: "/encoder/ar_text_position/Concat_2"
1961
+ op_type: "Concat"
1962
+ attribute {
1963
+ name: "axis"
1964
+ i: 0
1965
+ type: INT
1966
+ }
1967
+ }
1968
+ node {
1969
+ input: "/encoder/ar_text_position/Expand_output_0"
1970
+ input: "/encoder/ar_text_position/Concat_2_output_0"
1971
+ output: "/encoder/ar_text_position/Reshape_1_output_0"
1972
+ name: "/encoder/ar_text_position/Reshape_1"
1973
+ op_type: "Reshape"
1974
+ attribute {
1975
+ name: "allowzero"
1976
+ i: 0
1977
+ type: INT
1978
+ }
1979
+ }
1980
+ node {
1981
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1982
+ input: "/encoder/ar_text_position/Concat_1_output_0"
1983
+ input: "/encoder/ar_text_position/Reshape_1_output_0"
1984
+ output: "/encoder/ar_text_position/ScatterND_output_0"
1985
+ name: "/encoder/ar_text_position/ScatterND"
1986
+ op_type: "ScatterND"
1987
+ }
1988
+ node {
1989
+ input: "/encoder/ar_text_position/Mul_output_0"
1990
+ output: "/encoder/ar_text_position/Cos_output_0"
1991
+ name: "/encoder/ar_text_position/Cos"
1992
+ op_type: "Cos"
1993
+ }
1994
+ node {
1995
+ output: "/encoder/ar_text_position/Constant_28_output_0"
1996
+ name: "/encoder/ar_text_position/Constant_28"
1997
+ op_type: "Constant"
1998
+ attribute {
1999
+ name: "value"
2000
+ t {
2001
+ dims: 1
2002
+ data_type: 7
2003
+ data_location: 0
2004
+ }
2005
+ type: TENSOR
2006
+ }
2007
+ }
2008
+ node {
2009
+ output: "/encoder/ar_text_position/Constant_29_output_0"
2010
+ name: "/encoder/ar_text_position/Constant_29"
2011
+ op_type: "Constant"
2012
+ attribute {
2013
+ name: "value"
2014
+ t {
2015
+ dims: 1
2016
+ data_type: 7
2017
+ data_location: 0
2018
+ }
2019
+ type: TENSOR
2020
+ }
2021
+ }
2022
+ node {
2023
+ output: "/encoder/ar_text_position/Constant_30_output_0"
2024
+ name: "/encoder/ar_text_position/Constant_30"
2025
+ op_type: "Constant"
2026
+ attribute {
2027
+ name: "value"
2028
+ t {
2029
+ dims: 1
2030
+ data_type: 7
2031
+ data_location: 0
2032
+ }
2033
+ type: TENSOR
2034
+ }
2035
+ }
2036
+ node {
2037
+ output: "/encoder/ar_text_position/Constant_31_output_0"
2038
+ name: "/encoder/ar_text_position/Constant_31"
2039
+ op_type: "Constant"
2040
+ attribute {
2041
+ name: "value"
2042
+ t {
2043
+ dims: 1
2044
+ data_type: 7
2045
+ data_location: 0
2046
+ }
2047
+ type: TENSOR
2048
+ }
2049
+ }
2050
+ node {
2051
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2052
+ input: "/encoder/ar_text_position/Constant_29_output_0"
2053
+ input: "/encoder/ar_text_position/Constant_30_output_0"
2054
+ input: "/encoder/ar_text_position/Constant_28_output_0"
2055
+ input: "/encoder/ar_text_position/Constant_31_output_0"
2056
+ output: "/encoder/ar_text_position/Slice_3_output_0"
2057
+ name: "/encoder/ar_text_position/Slice_3"
2058
+ op_type: "Slice"
2059
+ }
2060
+ node {
2061
+ input: "/encoder/ar_text_position/Slice_3_output_0"
2062
+ output: "/encoder/ar_text_position/Shape_8_output_0"
2063
+ name: "/encoder/ar_text_position/Shape_8"
2064
+ op_type: "Shape"
2065
+ }
2066
+ node {
2067
+ input: "/encoder/ar_text_position/Cos_output_0"
2068
+ input: "/encoder/ar_text_position/Shape_8_output_0"
2069
+ output: "/encoder/ar_text_position/Expand_3_output_0"
2070
+ name: "/encoder/ar_text_position/Expand_3"
2071
+ op_type: "Expand"
2072
+ }
2073
+ node {
2074
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2075
+ output: "/encoder/ar_text_position/Shape_9_output_0"
2076
+ name: "/encoder/ar_text_position/Shape_9"
2077
+ op_type: "Shape"
2078
+ }
2079
+ node {
2080
+ output: "/encoder/ar_text_position/Constant_32_output_0"
2081
+ name: "/encoder/ar_text_position/Constant_32"
2082
+ op_type: "Constant"
2083
+ attribute {
2084
+ name: "value"
2085
+ t {
2086
+ data_type: 7
2087
+ data_location: 0
2088
+ }
2089
+ type: TENSOR
2090
+ }
2091
+ }
2092
+ node {
2093
+ input: "/encoder/ar_text_position/Shape_9_output_0"
2094
+ input: "/encoder/ar_text_position/Constant_32_output_0"
2095
+ output: "/encoder/ar_text_position/Gather_3_output_0"
2096
+ name: "/encoder/ar_text_position/Gather_3"
2097
+ op_type: "Gather"
2098
+ attribute {
2099
+ name: "axis"
2100
+ i: 0
2101
+ type: INT
2102
+ }
2103
+ }
2104
+ node {
2105
+ input: "/encoder/ar_text_position/Gather_3_output_0"
2106
+ output: "/encoder/ar_text_position/Cast_3_output_0"
2107
+ name: "/encoder/ar_text_position/Cast_3"
2108
+ op_type: "Cast"
2109
+ attribute {
2110
+ name: "to"
2111
+ i: 7
2112
+ type: INT
2113
+ }
2114
+ }
2115
+ node {
2116
+ output: "/encoder/ar_text_position/Constant_33_output_0"
2117
+ name: "/encoder/ar_text_position/Constant_33"
2118
+ op_type: "Constant"
2119
+ attribute {
2120
+ name: "value"
2121
+ t {
2122
+ data_type: 7
2123
+ data_location: 0
2124
+ }
2125
+ type: TENSOR
2126
+ }
2127
+ }
2128
+ node {
2129
+ output: "/encoder/ar_text_position/Constant_34_output_0"
2130
+ name: "/encoder/ar_text_position/Constant_34"
2131
+ op_type: "Constant"
2132
+ attribute {
2133
+ name: "value"
2134
+ t {
2135
+ data_type: 7
2136
+ data_location: 0
2137
+ }
2138
+ type: TENSOR
2139
+ }
2140
+ }
2141
+ node {
2142
+ input: "/encoder/ar_text_position/Constant_33_output_0"
2143
+ input: "/encoder/ar_text_position/Cast_3_output_0"
2144
+ input: "/encoder/ar_text_position/Constant_34_output_0"
2145
+ output: "/encoder/ar_text_position/Range_3_output_0"
2146
+ name: "/encoder/ar_text_position/Range_3"
2147
+ op_type: "Range"
2148
+ }
2149
+ node {
2150
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2151
+ output: "/encoder/ar_text_position/Shape_10_output_0"
2152
+ name: "/encoder/ar_text_position/Shape_10"
2153
+ op_type: "Shape"
2154
+ }
2155
+ node {
2156
+ output: "/encoder/ar_text_position/Constant_35_output_0"
2157
+ name: "/encoder/ar_text_position/Constant_35"
2158
+ op_type: "Constant"
2159
+ attribute {
2160
+ name: "value"
2161
+ t {
2162
+ data_type: 7
2163
+ data_location: 0
2164
+ }
2165
+ type: TENSOR
2166
+ }
2167
+ }
2168
+ node {
2169
+ input: "/encoder/ar_text_position/Shape_10_output_0"
2170
+ input: "/encoder/ar_text_position/Constant_35_output_0"
2171
+ output: "/encoder/ar_text_position/Gather_4_output_0"
2172
+ name: "/encoder/ar_text_position/Gather_4"
2173
+ op_type: "Gather"
2174
+ attribute {
2175
+ name: "axis"
2176
+ i: 0
2177
+ type: INT
2178
+ }
2179
+ }
2180
+ node {
2181
+ input: "/encoder/ar_text_position/Gather_4_output_0"
2182
+ output: "/encoder/ar_text_position/Cast_4_output_0"
2183
+ name: "/encoder/ar_text_position/Cast_4"
2184
+ op_type: "Cast"
2185
+ attribute {
2186
+ name: "to"
2187
+ i: 7
2188
+ type: INT
2189
+ }
2190
+ }
2191
+ node {
2192
+ output: "/encoder/ar_text_position/Constant_36_output_0"
2193
+ name: "/encoder/ar_text_position/Constant_36"
2194
+ op_type: "Constant"
2195
+ attribute {
2196
+ name: "value"
2197
+ t {
2198
+ data_type: 7
2199
+ data_location: 0
2200
+ }
2201
+ type: TENSOR
2202
+ }
2203
+ }
2204
+ node {
2205
+ output: "/encoder/ar_text_position/Constant_37_output_0"
2206
+ name: "/encoder/ar_text_position/Constant_37"
2207
+ op_type: "Constant"
2208
+ attribute {
2209
+ name: "value"
2210
+ t {
2211
+ data_type: 7
2212
+ data_location: 0
2213
+ }
2214
+ type: TENSOR
2215
+ }
2216
+ }
2217
+ node {
2218
+ input: "/encoder/ar_text_position/Constant_36_output_0"
2219
+ input: "/encoder/ar_text_position/Cast_4_output_0"
2220
+ input: "/encoder/ar_text_position/Constant_37_output_0"
2221
+ output: "/encoder/ar_text_position/Range_4_output_0"
2222
+ name: "/encoder/ar_text_position/Range_4"
2223
+ op_type: "Range"
2224
+ }
2225
+ node {
2226
+ output: "/encoder/ar_text_position/Constant_38_output_0"
2227
+ name: "/encoder/ar_text_position/Constant_38"
2228
+ op_type: "Constant"
2229
+ attribute {
2230
+ name: "value"
2231
+ t {
2232
+ dims: 1
2233
+ data_type: 7
2234
+ data_location: 0
2235
+ }
2236
+ type: TENSOR
2237
+ }
2238
+ }
2239
+ node {
2240
+ output: "/encoder/ar_text_position/Constant_39_output_0"
2241
+ name: "/encoder/ar_text_position/Constant_39"
2242
+ op_type: "Constant"
2243
+ attribute {
2244
+ name: "value"
2245
+ t {
2246
+ dims: 1
2247
+ data_type: 7
2248
+ data_location: 0
2249
+ }
2250
+ type: TENSOR
2251
+ }
2252
+ }
2253
+ node {
2254
+ output: "/encoder/ar_text_position/Constant_40_output_0"
2255
+ name: "/encoder/ar_text_position/Constant_40"
2256
+ op_type: "Constant"
2257
+ attribute {
2258
+ name: "value"
2259
+ t {
2260
+ dims: 1
2261
+ data_type: 7
2262
+ data_location: 0
2263
+ }
2264
+ type: TENSOR
2265
+ }
2266
+ }
2267
+ node {
2268
+ output: "/encoder/ar_text_position/Constant_41_output_0"
2269
+ name: "/encoder/ar_text_position/Constant_41"
2270
+ op_type: "Constant"
2271
+ attribute {
2272
+ name: "value"
2273
+ t {
2274
+ dims: 1
2275
+ data_type: 7
2276
+ data_location: 0
2277
+ }
2278
+ type: TENSOR
2279
+ }
2280
+ }
2281
+ node {
2282
+ input: "/encoder/ar_text_position/Range_4_output_0"
2283
+ input: "/encoder/ar_text_position/Constant_39_output_0"
2284
+ input: "/encoder/ar_text_position/Constant_40_output_0"
2285
+ input: "/encoder/ar_text_position/Constant_38_output_0"
2286
+ input: "/encoder/ar_text_position/Constant_41_output_0"
2287
+ output: "/encoder/ar_text_position/Slice_4_output_0"
2288
+ name: "/encoder/ar_text_position/Slice_4"
2289
+ op_type: "Slice"
2290
+ }
2291
+ node {
2292
+ output: "/encoder/ar_text_position/Constant_42_output_0"
2293
+ name: "/encoder/ar_text_position/Constant_42"
2294
+ op_type: "Constant"
2295
+ attribute {
2296
+ name: "value"
2297
+ t {
2298
+ dims: 2
2299
+ data_type: 7
2300
+ data_location: 0
2301
+ }
2302
+ type: TENSOR
2303
+ }
2304
+ }
2305
+ node {
2306
+ input: "/encoder/ar_text_position/Range_3_output_0"
2307
+ input: "/encoder/ar_text_position/Constant_42_output_0"
2308
+ output: "/encoder/ar_text_position/Reshape_2_output_0"
2309
+ name: "/encoder/ar_text_position/Reshape_2"
2310
+ op_type: "Reshape"
2311
+ attribute {
2312
+ name: "allowzero"
2313
+ i: 0
2314
+ type: INT
2315
+ }
2316
+ }
2317
+ node {
2318
+ input: "/encoder/ar_text_position/Reshape_2_output_0"
2319
+ input: "/encoder/ar_text_position/Slice_4_output_0"
2320
+ output: "/encoder/ar_text_position/Add_1_output_0"
2321
+ name: "/encoder/ar_text_position/Add_1"
2322
+ op_type: "Add"
2323
+ }
2324
+ node {
2325
+ input: "/encoder/ar_text_position/Add_1_output_0"
2326
+ output: "/encoder/ar_text_position/Shape_11_output_0"
2327
+ name: "/encoder/ar_text_position/Shape_11"
2328
+ op_type: "Shape"
2329
+ }
2330
+ node {
2331
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2332
+ output: "/encoder/ar_text_position/Shape_12_output_0"
2333
+ name: "/encoder/ar_text_position/Shape_12"
2334
+ op_type: "Shape"
2335
+ }
2336
+ node {
2337
+ input: "/encoder/ar_text_position/Shape_12_output_0"
2338
+ output: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
2339
+ name: "/encoder/ar_text_position/ConstantOfShape_3"
2340
+ op_type: "ConstantOfShape"
2341
+ attribute {
2342
+ name: "value"
2343
+ t {
2344
+ dims: 1
2345
+ data_type: 7
2346
+ raw_data: "\001\000\000\000\000\000\000\000"
2347
+ }
2348
+ type: TENSOR
2349
+ }
2350
+ }
2351
+ node {
2352
+ output: "/encoder/ar_text_position/Constant_43_output_0"
2353
+ name: "/encoder/ar_text_position/Constant_43"
2354
+ op_type: "Constant"
2355
+ attribute {
2356
+ name: "value"
2357
+ t {
2358
+ data_type: 7
2359
+ data_location: 0
2360
+ }
2361
+ type: TENSOR
2362
+ }
2363
+ }
2364
+ node {
2365
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
2366
+ input: "/encoder/ar_text_position/Constant_43_output_0"
2367
+ output: "/encoder/ar_text_position/Mul_3_output_0"
2368
+ name: "/encoder/ar_text_position/Mul_3"
2369
+ op_type: "Mul"
2370
+ }
2371
+ node {
2372
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2373
+ input: "/encoder/ar_text_position/Mul_3_output_0"
2374
+ output: "/encoder/ar_text_position/Equal_2_output_0"
2375
+ name: "/encoder/ar_text_position/Equal_2"
2376
+ op_type: "Equal"
2377
+ }
2378
+ node {
2379
+ input: "/encoder/ar_text_position/Equal_2_output_0"
2380
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
2381
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2382
+ output: "/encoder/ar_text_position/Where_2_output_0"
2383
+ name: "/encoder/ar_text_position/Where_2"
2384
+ op_type: "Where"
2385
+ }
2386
+ node {
2387
+ input: "/encoder/ar_text_position/Reshape_2_output_0"
2388
+ input: "/encoder/ar_text_position/Where_2_output_0"
2389
+ output: "/encoder/ar_text_position/Expand_4_output_0"
2390
+ name: "/encoder/ar_text_position/Expand_4"
2391
+ op_type: "Expand"
2392
+ }
2393
+ node {
2394
+ output: "/encoder/ar_text_position/Constant_44_output_0"
2395
+ name: "/encoder/ar_text_position/Constant_44"
2396
+ op_type: "Constant"
2397
+ attribute {
2398
+ name: "value"
2399
+ t {
2400
+ dims: 1
2401
+ data_type: 7
2402
+ data_location: 0
2403
+ }
2404
+ type: TENSOR
2405
+ }
2406
+ }
2407
+ node {
2408
+ input: "/encoder/ar_text_position/Expand_4_output_0"
2409
+ input: "/encoder/ar_text_position/Constant_44_output_0"
2410
+ output: "/encoder/ar_text_position/Unsqueeze_4_output_0"
2411
+ name: "/encoder/ar_text_position/Unsqueeze_4"
2412
+ op_type: "Unsqueeze"
2413
+ }
2414
+ node {
2415
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2416
+ output: "/encoder/ar_text_position/Shape_13_output_0"
2417
+ name: "/encoder/ar_text_position/Shape_13"
2418
+ op_type: "Shape"
2419
+ }
2420
+ node {
2421
+ input: "/encoder/ar_text_position/Shape_13_output_0"
2422
+ output: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
2423
+ name: "/encoder/ar_text_position/ConstantOfShape_4"
2424
+ op_type: "ConstantOfShape"
2425
+ attribute {
2426
+ name: "value"
2427
+ t {
2428
+ dims: 1
2429
+ data_type: 7
2430
+ raw_data: "\001\000\000\000\000\000\000\000"
2431
+ }
2432
+ type: TENSOR
2433
+ }
2434
+ }
2435
+ node {
2436
+ output: "/encoder/ar_text_position/Constant_45_output_0"
2437
+ name: "/encoder/ar_text_position/Constant_45"
2438
+ op_type: "Constant"
2439
+ attribute {
2440
+ name: "value"
2441
+ t {
2442
+ data_type: 7
2443
+ data_location: 0
2444
+ }
2445
+ type: TENSOR
2446
+ }
2447
+ }
2448
+ node {
2449
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
2450
+ input: "/encoder/ar_text_position/Constant_45_output_0"
2451
+ output: "/encoder/ar_text_position/Mul_4_output_0"
2452
+ name: "/encoder/ar_text_position/Mul_4"
2453
+ op_type: "Mul"
2454
+ }
2455
+ node {
2456
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2457
+ input: "/encoder/ar_text_position/Mul_4_output_0"
2458
+ output: "/encoder/ar_text_position/Equal_3_output_0"
2459
+ name: "/encoder/ar_text_position/Equal_3"
2460
+ op_type: "Equal"
2461
+ }
2462
+ node {
2463
+ input: "/encoder/ar_text_position/Equal_3_output_0"
2464
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
2465
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2466
+ output: "/encoder/ar_text_position/Where_3_output_0"
2467
+ name: "/encoder/ar_text_position/Where_3"
2468
+ op_type: "Where"
2469
+ }
2470
+ node {
2471
+ input: "/encoder/ar_text_position/Slice_4_output_0"
2472
+ input: "/encoder/ar_text_position/Where_3_output_0"
2473
+ output: "/encoder/ar_text_position/Expand_5_output_0"
2474
+ name: "/encoder/ar_text_position/Expand_5"
2475
+ op_type: "Expand"
2476
+ }
2477
+ node {
2478
+ output: "/encoder/ar_text_position/Constant_46_output_0"
2479
+ name: "/encoder/ar_text_position/Constant_46"
2480
+ op_type: "Constant"
2481
+ attribute {
2482
+ name: "value"
2483
+ t {
2484
+ dims: 1
2485
+ data_type: 7
2486
+ data_location: 0
2487
+ }
2488
+ type: TENSOR
2489
+ }
2490
+ }
2491
+ node {
2492
+ input: "/encoder/ar_text_position/Expand_5_output_0"
2493
+ input: "/encoder/ar_text_position/Constant_46_output_0"
2494
+ output: "/encoder/ar_text_position/Unsqueeze_5_output_0"
2495
+ name: "/encoder/ar_text_position/Unsqueeze_5"
2496
+ op_type: "Unsqueeze"
2497
+ }
2498
+ node {
2499
+ input: "/encoder/ar_text_position/Unsqueeze_4_output_0"
2500
+ input: "/encoder/ar_text_position/Unsqueeze_5_output_0"
2501
+ output: "/encoder/ar_text_position/Concat_3_output_0"
2502
+ name: "/encoder/ar_text_position/Concat_3"
2503
+ op_type: "Concat"
2504
+ attribute {
2505
+ name: "axis"
2506
+ i: -1
2507
+ type: INT
2508
+ }
2509
+ }
2510
+ node {
2511
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2512
+ output: "/encoder/ar_text_position/Shape_14_output_0"
2513
+ name: "/encoder/ar_text_position/Shape_14"
2514
+ op_type: "Shape"
2515
+ }
2516
+ node {
2517
+ output: "/encoder/ar_text_position/Constant_47_output_0"
2518
+ name: "/encoder/ar_text_position/Constant_47"
2519
+ op_type: "Constant"
2520
+ attribute {
2521
+ name: "value"
2522
+ t {
2523
+ dims: 1
2524
+ data_type: 7
2525
+ data_location: 0
2526
+ }
2527
+ type: TENSOR
2528
+ }
2529
+ }
2530
+ node {
2531
+ output: "/encoder/ar_text_position/Constant_48_output_0"
2532
+ name: "/encoder/ar_text_position/Constant_48"
2533
+ op_type: "Constant"
2534
+ attribute {
2535
+ name: "value"
2536
+ t {
2537
+ dims: 1
2538
+ data_type: 7
2539
+ data_location: 0
2540
+ }
2541
+ type: TENSOR
2542
+ }
2543
+ }
2544
+ node {
2545
+ output: "/encoder/ar_text_position/Constant_49_output_0"
2546
+ name: "/encoder/ar_text_position/Constant_49"
2547
+ op_type: "Constant"
2548
+ attribute {
2549
+ name: "value"
2550
+ t {
2551
+ dims: 1
2552
+ data_type: 7
2553
+ data_location: 0
2554
+ }
2555
+ type: TENSOR
2556
+ }
2557
+ }
2558
+ node {
2559
+ input: "/encoder/ar_text_position/Shape_14_output_0"
2560
+ input: "/encoder/ar_text_position/Constant_48_output_0"
2561
+ input: "/encoder/ar_text_position/Constant_49_output_0"
2562
+ input: "/encoder/ar_text_position/Constant_47_output_0"
2563
+ output: "/encoder/ar_text_position/Slice_5_output_0"
2564
+ name: "/encoder/ar_text_position/Slice_5"
2565
+ op_type: "Slice"
2566
+ }
2567
+ node {
2568
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2569
+ input: "/encoder/ar_text_position/Slice_5_output_0"
2570
+ output: "/encoder/ar_text_position/Concat_4_output_0"
2571
+ name: "/encoder/ar_text_position/Concat_4"
2572
+ op_type: "Concat"
2573
+ attribute {
2574
+ name: "axis"
2575
+ i: 0
2576
+ type: INT
2577
+ }
2578
+ }
2579
+ node {
2580
+ input: "/encoder/ar_text_position/Expand_3_output_0"
2581
+ input: "/encoder/ar_text_position/Concat_4_output_0"
2582
+ output: "/encoder/ar_text_position/Reshape_3_output_0"
2583
+ name: "/encoder/ar_text_position/Reshape_3"
2584
+ op_type: "Reshape"
2585
+ attribute {
2586
+ name: "allowzero"
2587
+ i: 0
2588
+ type: INT
2589
+ }
2590
+ }
2591
+ node {
2592
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2593
+ input: "/encoder/ar_text_position/Concat_3_output_0"
2594
+ input: "/encoder/ar_text_position/Reshape_3_output_0"
2595
+ output: "/encoder/ar_text_position/ScatterND_1_output_0"
2596
+ name: "/encoder/ar_text_position/ScatterND_1"
2597
+ op_type: "ScatterND"
2598
+ }
2599
+ node {
2600
+ output: "/encoder/ar_text_position/Constant_50_output_0"
2601
+ name: "/encoder/ar_text_position/Constant_50"
2602
+ op_type: "Constant"
2603
+ attribute {
2604
+ name: "value"
2605
+ t {
2606
+ dims: 1
2607
+ data_type: 7
2608
+ data_location: 0
2609
+ }
2610
+ type: TENSOR
2611
+ }
2612
+ }
2613
+ node {
2614
+ input: "/encoder/ar_text_position/ScatterND_1_output_0"
2615
+ input: "/encoder/ar_text_position/Constant_50_output_0"
2616
+ output: "/encoder/ar_text_position/Unsqueeze_6_output_0"
2617
+ name: "/encoder/ar_text_position/Unsqueeze_6"
2618
+ op_type: "Unsqueeze"
2619
+ }
2620
+ node {
2621
+ output: "/encoder/ar_text_position/Constant_51_output_0"
2622
+ name: "/encoder/ar_text_position/Constant_51"
2623
+ op_type: "Constant"
2624
+ attribute {
2625
+ name: "value"
2626
+ t {
2627
+ data_type: 1
2628
+ data_location: 0
2629
+ }
2630
+ type: TENSOR
2631
+ }
2632
+ }
2633
+ node {
2634
+ input: "/encoder/Add_output_0"
2635
+ input: "/encoder/ar_text_position/Constant_51_output_0"
2636
+ output: "/encoder/ar_text_position/Mul_5_output_0"
2637
+ name: "/encoder/ar_text_position/Mul_5"
2638
+ op_type: "Mul"
2639
+ }
2640
+ node {
2641
+ input: "encoder.ar_text_position.alpha"
2642
+ input: "/encoder/ar_text_position/Unsqueeze_6_output_0"
2643
+ output: "/encoder/ar_text_position/Mul_6_output_0"
2644
+ name: "/encoder/ar_text_position/Mul_6"
2645
+ op_type: "Mul"
2646
+ }
2647
+ node {
2648
+ input: "/encoder/ar_text_position/Mul_5_output_0"
2649
+ input: "/encoder/ar_text_position/Mul_6_output_0"
2650
+ output: "x"
2651
+ name: "/encoder/ar_text_position/Add_2"
2652
+ op_type: "Add"
2653
+ }
2654
+ initializer {
2655
+ dims: 512
2656
+ dims: 512
2657
+ data_type: 1
2658
+ name: "encoder.ar_text_embedding.word_embeddings.weight"
2659
+ }
2660
+ initializer {
2661
+ dims: 512
2662
+ data_type: 1
2663
+ name: "encoder.bert_proj.bias"
2664
+ }
2665
+ initializer {
2666
+ dims: 1
2667
+ data_type: 1
2668
+ name: "encoder.ar_text_position.alpha"
2669
+ }
2670
+ initializer {
2671
+ dims: 768
2672
+ dims: 768
2673
+ dims: 2
2674
+ data_type: 1
2675
+ name: "vits.ssl_proj.weight"
2676
+ }
2677
+ initializer {
2678
+ dims: 768
2679
+ data_type: 1
2680
+ name: "vits.ssl_proj.bias"
2681
+ }
2682
+ initializer {
2683
+ dims: 768
2684
+ dims: 1024
2685
+ data_type: 1
2686
+ name: "onnx::MatMul_1058"
2687
+ }
2688
+ initializer {
2689
+ dims: 1024
2690
+ dims: 512
2691
+ data_type: 1
2692
+ name: "onnx::MatMul_1059"
2693
+ }
2694
+ input {
2695
+ name: "ref_seq"
2696
+ type {
2697
+ tensor_type {
2698
+ elem_type: 7
2699
+ shape {
2700
+ dim {
2701
+ dim_value: 1
2702
+ }
2703
+ dim {
2704
+ dim_param: "ref_length"
2705
+ }
2706
+ }
2707
+ }
2708
+ }
2709
+ }
2710
+ input {
2711
+ name: "text_seq"
2712
+ type {
2713
+ tensor_type {
2714
+ elem_type: 7
2715
+ shape {
2716
+ dim {
2717
+ dim_value: 1
2718
+ }
2719
+ dim {
2720
+ dim_param: "text_length"
2721
+ }
2722
+ }
2723
+ }
2724
+ }
2725
+ }
2726
+ input {
2727
+ name: "ref_bert"
2728
+ type {
2729
+ tensor_type {
2730
+ elem_type: 1
2731
+ shape {
2732
+ dim {
2733
+ dim_param: "ref_length"
2734
+ }
2735
+ dim {
2736
+ dim_value: 1024
2737
+ }
2738
+ }
2739
+ }
2740
+ }
2741
+ }
2742
+ input {
2743
+ name: "text_bert"
2744
+ type {
2745
+ tensor_type {
2746
+ elem_type: 1
2747
+ shape {
2748
+ dim {
2749
+ dim_param: "text_length"
2750
+ }
2751
+ dim {
2752
+ dim_value: 1024
2753
+ }
2754
+ }
2755
+ }
2756
+ }
2757
+ }
2758
+ input {
2759
+ name: "ssl_content"
2760
+ type {
2761
+ tensor_type {
2762
+ elem_type: 1
2763
+ shape {
2764
+ dim {
2765
+ dim_value: 1
2766
+ }
2767
+ dim {
2768
+ dim_value: 768
2769
+ }
2770
+ dim {
2771
+ dim_param: "ssl_length"
2772
+ }
2773
+ }
2774
+ }
2775
+ }
2776
+ }
2777
+ output {
2778
+ name: "x"
2779
+ type {
2780
+ tensor_type {
2781
+ elem_type: 1
2782
+ shape {
2783
+ dim {
2784
+ dim_value: 1
2785
+ }
2786
+ dim {
2787
+ dim_param: "Addx_dim_1"
2788
+ }
2789
+ dim {
2790
+ dim_param: "Addx_dim_2"
2791
+ }
2792
+ }
2793
+ }
2794
+ }
2795
+ }
2796
+ output {
2797
+ name: "prompts"
2798
+ type {
2799
+ tensor_type {
2800
+ elem_type: 7
2801
+ shape {
2802
+ dim {
2803
+ dim_value: 1
2804
+ }
2805
+ dim {
2806
+ dim_param: "Unsqueezeprompts_dim_1"
2807
+ }
2808
+ }
2809
+ }
2810
+ }
2811
+ }
2812
+ }
2813
+ opset_import {
2814
+ domain: ""
2815
+ version: 16
2816
+ }
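
The declarations above pin down the encoder's interface: five inputs (`ref_seq`, `text_seq`, `ref_bert`, `text_bert`, `ssl_content`) and two outputs (`x`, `prompts`), with int64 sequences (elem_type 7) and float32 features (elem_type 1). A minimal sketch of driving the graph directly with onnxruntime, assuming `t2s_encoder.onnx` sits in the working directory; the lengths are arbitrary placeholders, and real inputs come from the phoneme frontend and cnhubert as in `gpt-sovits.py` (zero tensors only exercise the graph, they will not produce meaningful prompts):

```python
import numpy as np
import onnxruntime

# dummy lengths; shapes and dtypes follow the input declarations above
ref_len, text_len, ssl_len = 12, 8, 50

sess = onnxruntime.InferenceSession("t2s_encoder.onnx")
x, prompts = sess.run(
    None,
    {
        "ref_seq": np.zeros((1, ref_len), dtype=np.int64),        # reference phoneme ids
        "text_seq": np.zeros((1, text_len), dtype=np.int64),      # target phoneme ids
        "ref_bert": np.zeros((ref_len, 1024), dtype=np.float32),  # per-phoneme BERT features
        "text_bert": np.zeros((text_len, 1024), dtype=np.float32),
        "ssl_content": np.zeros((1, 768, ssl_len), dtype=np.float32),  # cnhubert features
    },
)
print(x.shape, prompts.shape)  # float32 encoder states, int64 semantic prompts
```
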
models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f0b326266755e811e57fcc43294e785655bc6f444339add99e484addb7ee36
3
+ size 307531566
models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/t2s_sdec.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5212b7e83037acb53f9c2f178394ccc04190ea9c128c19e33098db36df08a764
3
+ size 307594527
models/ailia-models/GPT-SoVITS/t2s_sdec.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:548fb3d948fab18cb73a26f85cda19fc7f1f849db3be10ec2791fcc97db4a16a
3
+ size 307182563
models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e78c512b7765844ac4fe8f2ab6822a7e0bae68856e130e75e3623c8b3c4f506
3
+ size 307138999
models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/vits.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec25162dac307d37b652f1897504d1e7e80abd46e77bd1bb4a8ae66c02e28623
3
+ size 162706996
models/ailia-models/GPT-SoVITS/vits.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/cnhubert.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:558e4aabf7a7d1ef8ad89c0983a4a6413f9f4489232a35b4c1d455575f6cc242
3
+ size 377745020
models/ailia-models/GPT-SoVITS2/cnhubert.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/code/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 RVC-Boss
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
models/ailia-models/GPT-SoVITS2/code/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # GPT-SoVITS V2
2
+
3
+ ### Input
4
+ - Text to synthesize, together with a reference audio clip and its transcript, for voice cloning
5
+
6
+ ### Output
7
+ The synthesized voice is written to a .wav file whose path is defined by `SAVE_WAV_PATH` in `gpt-sovits-v2.py`.
8
+
9
+ ### Requirements
10
+ This model requires pyopenjtalk for grapheme-to-phoneme (g2p) conversion.
11
+
12
+ ```
13
+ pip3 install -r requirements.txt
14
+ ```
15
+
16
+ ### Usage
17
+ The onnx and prototxt files are downloaded automatically on the first run, so an Internet connection is required at that point.
18
+
19
+ To run with the sample sentence and sample audio:
20
+ ```
21
+ python3 gpt-sovits-v2.py
22
+ ```
23
+
24
+ To run with your own audio prompt:
25
+
26
+ ```
27
+ python3 gpt-sovits-v2.py -i "ax株式会社ではAIの実用化のための技術を開発しています。" --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。"
28
+ ```
29
+
30
+ To run with English text:
31
+
32
+ ```
33
+ python3 gpt-sovits-v2.py -i "Hello world. We are testing speech synthesis." --text_language en --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。" --ref_language ja
34
+ ```
35
+
36
+ ### Reference
37
+ [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
38
+
39
+ ### Framework
40
+ PyTorch 2.5.0
41
+
42
+ ### Model Format
43
+ ONNX opset = 17
44
+
45
+ ### Netron
46
+
47
+ #### Normal model
48
+
49
+ - [cnhubert.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/cnhubert.onnx.prototxt)
50
+ - [t2s_encoder.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_encoder.onnx.prototxt)
51
+ - [t2s_fsdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_fsdec.onnx.prototxt)
52
+ - [t2s_sdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_sdec.onnx.prototxt)
53
+ - [vits.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/vits.onnx.prototxt)
models/ailia-models/GPT-SoVITS2/code/gpt-sovits-v2.py ADDED
@@ -0,0 +1,632 @@
1
+ import time
2
+ import sys
3
+
4
+ # logger
5
+ from logging import getLogger # noqa: E402
6
+
7
+ import numpy as np
8
+ import soundfile
9
+ import librosa
10
+ from tqdm import tqdm
11
+
12
+ # import original modules
13
+ sys.path.append("../../util")
14
+ from arg_utils import get_base_parser, update_parser # noqa: E402
15
+ from model_utils import check_and_download_models # noqa: E402
16
+
17
+ import ailia
18
+ from text import cleaned_text_to_sequence
19
+ from text.cleaner import clean_text
20
+
21
+
22
+ logger = getLogger(__name__)
23
+
24
+ # ======================
25
+ # PARAMETERS
26
+ # ======================
27
+
28
+ REF_WAV_PATH = "reference_audio_captured_by_ax.wav"
29
+ REF_TEXT = "水をマレーシアから買わなくてはならない。"
30
+ SAVE_WAV_PATH = "output.wav"
31
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/gpt-sovits-v2/"
32
+ WEIGHT_PATH_SSL = "cnhubert.onnx"
33
+ WEIGHT_PATH_T2S_ENCODER = "t2s_encoder.onnx"
34
+ WEIGHT_PATH_T2S_FIRST_DECODER = "t2s_fsdec.onnx"
35
+ WEIGHT_PATH_T2S_STAGE_DECODER = "t2s_sdec.onnx"
36
+ WEIGHT_PATH_VITS = "vits.onnx"
37
+ MODEL_PATH_SSL = WEIGHT_PATH_SSL + ".prototxt"
38
+ MODEL_PATH_T2S_ENCODER = WEIGHT_PATH_T2S_ENCODER + ".prototxt"
39
+ MODEL_PATH_T2S_FIRST_DECODER = WEIGHT_PATH_T2S_FIRST_DECODER + ".prototxt"
40
+ MODEL_PATH_T2S_STAGE_DECODER = WEIGHT_PATH_T2S_STAGE_DECODER + ".prototxt"
41
+ MODEL_PATH_VITS = WEIGHT_PATH_VITS + ".prototxt"
42
+
43
+
44
+ # ======================
45
+ # Argument Parser Config
46
+ # ======================
47
+
48
+ parser = get_base_parser("GPT-SoVits", None, SAVE_WAV_PATH)
49
+ # overwrite
50
+ parser.add_argument(
51
+ "--input",
52
+ "-i",
53
+ metavar="TEXT",
54
+ default="ax株式会社ではAIの実用化のための技術を開発しています。",
55
+ help="input text",
56
+ )
57
+ parser.add_argument(
58
+ "--text_language", "-tl", default="ja", choices=("ja", "en"), help="[ja, en]"
59
+ )
60
+ parser.add_argument(
61
+ "--ref_audio",
62
+ "-ra",
63
+ metavar="TEXT",
64
+ default=REF_WAV_PATH,
65
+ help="ref audio",
66
+ )
67
+ parser.add_argument(
68
+ "--ref_text",
69
+ "-rt",
70
+ metavar="TEXT",
71
+ default=REF_TEXT,
72
+ help="ref text",
73
+ )
74
+ parser.add_argument(
75
+ "--ref_language", "-rl", default="ja", choices=("ja", "en"), help="[ja, en]"
76
+ )
77
+ parser.add_argument("--top_k", type=int, default=15, help="top_k")
78
+ parser.add_argument("--top_p", type=float, default=1.0, help="top_p")
79
+ parser.add_argument("--temperature", type=float, default=1.0, help="temperature")
80
+ parser.add_argument("--speed", type=float, default=1.0, help="Speech rate")
81
+ parser.add_argument("--onnx", action="store_true", help="use onnx runtime")
82
+ parser.add_argument("--profile", action="store_true", help="use profile model")
83
+ args = update_parser(parser, check_input_type=False)
84
+
85
+
86
+ splits = {
87
+ # fmt: off
88
+ ",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…",
89
+ # fmt: on
90
+ }
91
+
92
+
93
+ # ======================
94
+ # Secondary Functions
95
+ # ======================
96
+
97
+
98
+ def split(todo_text):
99
+ todo_text = todo_text.replace("……", "。").replace("——", ",")
100
+ if todo_text[-1] not in splits:
101
+ todo_text += "。"
102
+ i_split_head = i_split_tail = 0
103
+ len_text = len(todo_text)
104
+ todo_texts = []
105
+ while 1:
106
+ if i_split_head >= len_text:
107
+ break # the text always ends with punctuation, so just exit; the last segment was appended in the previous iteration
108
+ if todo_text[i_split_head] in splits:
109
+ i_split_head += 1
110
+ todo_texts.append(todo_text[i_split_tail:i_split_head])
111
+ i_split_tail = i_split_head
112
+ else:
113
+ i_split_head += 1
114
+ return todo_texts
115
+
116
+
117
+ def cut(inp):
118
+ punctuation = set(["!", "?", "…", ",", ".", "-", " "])
119
+
120
+ inp = inp.strip("\n")
121
+ inps = split(inp)
122
+ split_idx = list(range(0, len(inps), 4))
123
+ split_idx[-1] = None
124
+ if len(split_idx) > 1:
125
+ opts = []
126
+ for idx in range(len(split_idx) - 1):
127
+ opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]]))
128
+ else:
129
+ opts = [inp]
130
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
131
+ return "\n".join(opts)
132
+
133
+
134
+ def process_text(texts):
135
+ _text = []
136
+ if all(text in [None, " ", "\n", ""] for text in texts):
137
+ raise ValueError("Please enter valid text.")
138
+ for text in texts:
139
+ if text in [None, " ", ""]:
140
+ pass
141
+ else:
142
+ _text.append(text)
143
+ return _text
144
+
145
+
146
+ def merge_short_text_in_array(texts, threshold):
147
+ if (len(texts)) < 2:
148
+ return texts
149
+ result = []
150
+ text = ""
151
+ for ele in texts:
152
+ text += ele
153
+ if len(text) >= threshold:
154
+ result.append(text)
155
+ text = ""
156
+ if len(text) > 0:
157
+ if len(result) == 0:
158
+ result.append(text)
159
+ else:
160
+ result[len(result) - 1] += text
161
+ return result
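+
+ # Taken together, split/cut/merge_short_text_in_array turn free text into
+ # decoder-sized chunks: cut groups four sentence segments per line (the last
+ # group absorbs any remainder) and the merge pass folds fragments shorter than
+ # the threshold into a neighbor. A rough sketch of the expected behavior,
+ # traced by hand from the code above rather than from a recorded run:
+ #
+ # >>> cut("一。二。三。四。五。六。七。八。九。")
+ # '一。二。三。四。\n五。六。七。八。九。'
+ # >>> merge_short_text_in_array(["や。", "今日はいい天気。", "短い。"], 5)
+ # ['や。今日はいい天気。短い。']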
162
+
163
+
164
+ # ======================
165
+ # Main Logic
166
+ # ======================
167
+
168
+
169
+ class T2SModel:
170
+ def __init__(self, sess_encoder, sess_fsdec, sess_sdec):
171
+ self.hz = 50
172
+ self.max_sec = 54
173
+ self.top_k = 5
174
+ self.early_stop_num = np.array([self.hz * self.max_sec])
175
+ self.sess_encoder = sess_encoder
176
+ self.sess_fsdec = sess_fsdec
177
+ self.sess_sdec = sess_sdec
178
+
179
+ def forward(
180
+ self,
181
+ ref_seq,
182
+ text_seq,
183
+ ref_bert,
184
+ text_bert,
185
+ ssl_content,
186
+ top_k=20,
187
+ top_p=0.6,
188
+ temperature=0.6,
189
+ repetition_penalty=1.35,
190
+ ):
191
+ early_stop_num = self.early_stop_num
192
+
193
+ top_k = np.array([top_k], dtype=np.int64)
194
+ top_p = np.array([top_p], dtype=np.float32)
195
+ temperature = np.array([temperature], dtype=np.float32)
196
+ repetition_penalty = np.array([repetition_penalty], dtype=np.float32)
197
+
198
+ EOS = 1024
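+ # id marking the end of the predicted semantic token sequence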
199
+
200
+ if args.benchmark:
201
+ start = int(round(time.time() * 1000))
202
+ if args.onnx:
203
+ x, prompts = self.sess_encoder.run(
204
+ None,
205
+ {
206
+ "ref_seq": ref_seq,
207
+ "text_seq": text_seq,
208
+ "ref_bert": ref_bert,
209
+ "text_bert": text_bert,
210
+ "ssl_content": ssl_content,
211
+ },
212
+ )
213
+ else:
214
+ x, prompts = self.sess_encoder.run(
215
+ {
216
+ "ref_seq": ref_seq,
217
+ "text_seq": text_seq,
218
+ "ref_bert": ref_bert,
219
+ "text_bert": text_bert,
220
+ "ssl_content": ssl_content,
221
+ }
222
+ )
223
+ if args.benchmark:
224
+ end = int(round(time.time() * 1000))
225
+ logger.info("\tsencoder processing time {} ms".format(end - start))
226
+
227
+ prefix_len = prompts.shape[1]
228
+
229
+ if args.benchmark:
230
+ start = int(round(time.time() * 1000))
231
+ if args.onnx:
232
+ y, k, v, y_emb, x_example = self.sess_fsdec.run(
233
+ None,
234
+ {
235
+ "x": x,
236
+ "prompts": prompts,
237
+ "top_k": top_k,
238
+ "top_p": top_p,
239
+ "temperature": temperature,
240
+ "repetition_penalty": repetition_penalty,
241
+ },
242
+ )
243
+ else:
244
+ y, k, v, y_emb, x_example = self.sess_fsdec.run(
245
+ {
246
+ "x": x,
247
+ "prompts": prompts,
248
+ "top_k": top_k,
249
+ "top_p": top_p,
250
+ "temperature": temperature,
251
+ "repetition_penalty": repetition_penalty,
252
+ }
253
+ )
254
+ if args.benchmark:
255
+ end = int(round(time.time() * 1000))
256
+ logger.info("\tfsdec processing time {} ms".format(end - start))
257
+
258
+ stop = False
259
+ for idx in tqdm(range(1, 1500)):
260
+ if args.benchmark:
261
+ start = int(round(time.time() * 1000))
262
+ if args.onnx:
263
+ y, k, v, y_emb, logits, samples = self.sess_sdec.run(
264
+ None,
265
+ {
266
+ "iy": y,
267
+ "ik": k,
268
+ "iv": v,
269
+ "iy_emb": y_emb,
270
+ "ix_example": x_example,
271
+ "top_k": top_k,
272
+ "top_p": top_p,
273
+ "temperature": temperature,
274
+ "repetition_penalty": repetition_penalty,
275
+ },
276
+ )
277
+ else:
278
+ COPY_INPUT_BLOB_DATA = False
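+ # when True, the k/v cache would stay inside the runtime (the previous
+ # step's output blobs are copied back to the input blobs and only the
+ # shapes re-declared); when False, k and v are round-tripped through
+ # numpy on every decoding step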
279
+ if idx == 1:
280
+ y, k, v, y_emb, logits, samples = self.sess_sdec.run(
281
+ {
282
+ "iy": y,
283
+ "ik": k,
284
+ "iv": v,
285
+ "iy_emb": y_emb,
286
+ "ix_example": x_example,
287
+ "top_k": top_k,
288
+ "top_p": top_p,
289
+ "temperature": temperature,
290
+ "repetition_penalty": repetition_penalty,
291
+ }
292
+ )
293
+ kv_base_shape = k.shape
294
+ else:
295
+ input_blob_idx = self.sess_sdec.get_input_blob_list()
296
+ output_blob_idx = self.sess_sdec.get_output_blob_list()
297
+ self.sess_sdec.set_input_blob_data(y, 0)
298
+ if COPY_INPUT_BLOB_DATA:
299
+ kv_shape = (
300
+ kv_base_shape[0],
301
+ kv_base_shape[1] + idx - 2,
302
+ kv_base_shape[2],
303
+ kv_base_shape[3],
304
+ )
305
+ self.sess_sdec.set_input_blob_shape(kv_shape, 1)
306
+ self.sess_sdec.set_input_blob_shape(kv_shape, 2)
307
+ self.sess_sdec.copy_blob_data(
308
+ input_blob_idx[1], output_blob_idx[1], self.sess_sdec
309
+ )
310
+ self.sess_sdec.copy_blob_data(
311
+ input_blob_idx[2], output_blob_idx[2], self.sess_sdec
312
+ )
313
+ else:
314
+ self.sess_sdec.set_input_blob_data(k, 1)
315
+ self.sess_sdec.set_input_blob_data(v, 2)
316
+ self.sess_sdec.set_input_blob_data(y_emb, 3)
317
+ self.sess_sdec.set_input_blob_data(x_example, 4)
318
+ self.sess_sdec.set_input_blob_data(top_k, 5)
319
+ self.sess_sdec.set_input_blob_data(top_p, 6)
320
+ self.sess_sdec.set_input_blob_data(temperature, 7)
321
+ self.sess_sdec.set_input_blob_data(repetition_penalty, 8)
322
+ self.sess_sdec.update()
323
+ y = self.sess_sdec.get_blob_data(output_blob_idx[0])
324
+ if not COPY_INPUT_BLOB_DATA:
325
+ k = self.sess_sdec.get_blob_data(output_blob_idx[1])
326
+ v = self.sess_sdec.get_blob_data(output_blob_idx[2])
327
+ y_emb = self.sess_sdec.get_blob_data(output_blob_idx[3])
328
+ logits = self.sess_sdec.get_blob_data(output_blob_idx[4])
329
+ samples = self.sess_sdec.get_blob_data(output_blob_idx[5])
330
+
331
+ if args.benchmark:
332
+ end = int(round(time.time() * 1000))
333
+ logger.info("\tsdec processing time {} ms".format(end - start))
334
+ if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
335
+ stop = True
336
+ if np.argmax(logits, axis=-1)[0] == EOS or samples[0, 0] == EOS:
337
+ stop = True
338
+ if stop:
339
+ break
340
+ y[0, -1] = 0
341
+
342
+ return y[np.newaxis, :, -idx:-1]
343
+
344
+
345
+ class GptSoVits:
346
+ def __init__(self, t2s: T2SModel, sess):
347
+ self.t2s = t2s
348
+ self.sess = sess
349
+
350
+ def forward(
351
+ self,
352
+ ref_seq,
353
+ text_seq,
354
+ ref_bert,
355
+ text_bert,
356
+ ref_audio,
357
+ ssl_content,
358
+ top_k=20,
359
+ top_p=0.6,
360
+ temperature=0.6,
361
+ repetition_penalty=1.35,
362
+ speed=1.0,
363
+ ):
364
+ pred_semantic = self.t2s.forward(
365
+ ref_seq,
366
+ text_seq,
367
+ ref_bert,
368
+ text_bert,
369
+ ssl_content,
370
+ top_k=top_k,
371
+ top_p=top_p,
372
+ temperature=temperature,
373
+ repetition_penalty=repetition_penalty,
374
+ )
375
+ speed = np.array(speed, dtype=np.float32)
376
+ if args.benchmark:
377
+ start = int(round(time.time() * 1000))
378
+ if args.onnx:
379
+ audio1 = self.sess.run(
380
+ None,
381
+ {
382
+ "text_seq": text_seq,
383
+ "pred_semantic": pred_semantic,
384
+ "ref_audio": ref_audio,
385
+ "speed": speed,
386
+ },
387
+ )
388
+ else:
389
+ audio1 = self.sess.run(
390
+ {
391
+ "text_seq": text_seq,
392
+ "pred_semantic": pred_semantic,
393
+ "ref_audio": ref_audio,
394
+ "speed": speed,
395
+ }
396
+ )
397
+ if args.benchmark:
398
+ end = int(round(time.time() * 1000))
399
+ logger.info("\tvits processing time {} ms".format(end - start))
400
+ return audio1[0]
401
+
402
+
403
+ class SSLModel:
404
+ def __init__(self, sess):
405
+ self.sess = sess
406
+
407
+ def forward(self, ref_audio_16k):
408
+ if args.benchmark:
409
+ start = int(round(time.time() * 1000))
410
+ if args.onnx:
411
+ last_hidden_state = self.sess.run(None, {"ref_audio_16k": ref_audio_16k})
412
+ else:
413
+ last_hidden_state = self.sess.run({"ref_audio_16k": ref_audio_16k})
414
+ if args.benchmark:
415
+ end = int(round(time.time() * 1000))
416
+ logger.info("\tssl processing time {} ms".format(end - start))
417
+ return last_hidden_state[0]
418
+
419
+
420
+ def get_phones_and_bert(text, language, final=False):
421
+ if language == "en":
422
+ try:
423
+ import LangSegment
424
+
425
+ LangSegment.setfilters(["en"])
426
+ formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
427
+ except ImportError:
428
+ formattext = text
429
+ else:
430
+ formattext = text
431
+ while " " in formattext:
432
+ formattext = formattext.replace(" ", " ")
433
+
434
+ phones, word2ph, norm_text = clean_text(formattext, language)
435
+ phones = cleaned_text_to_sequence(phones)
436
+ bert = np.zeros((1024, len(phones)), dtype=np.float32)
437
+
438
+ if not final and len(phones) < 6:
439
+ return get_phones_and_bert("." + text, language, final=True)
440
+
441
+ return phones, bert, norm_text
442
+
443
+
444
+ def generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits):
445
+ gpt = T2SModel(
446
+ t2s_encoder,
447
+ t2s_first_decoder,
448
+ t2s_stage_decoder,
449
+ )
450
+ gpt_sovits = GptSoVits(gpt, vits)
451
+ ssl = SSLModel(ssl)
452
+
453
+ input_audio = args.ref_audio
454
+ ref_text = args.ref_text
455
+ ref_language = args.ref_language
456
+ text = args.input
457
+ text_language = args.text_language
458
+ top_k = args.top_k
459
+ top_p = args.top_p
460
+ temperature = args.temperature
461
+ speed = args.speed
462
+
463
+ ref_text = ref_text.strip("\n")
464
+ if ref_text[-1] not in splits:
465
+ ref_text += "。" if ref_language != "en" else "."
466
+ logger.info("Actual Input Reference Text: %s" % ref_text)
467
+
468
+ text = text.strip("\n")
469
+ logger.info("Actual Input Target Text: %s" % text)
470
+
471
+ vits_hps_data_sampling_rate = 32000
472
+ zero_wav = np.zeros(int(vits_hps_data_sampling_rate * 0.3), dtype=np.float16)
473
+
474
+ ref_audio, sr = librosa.load(input_audio, sr=vits_hps_data_sampling_rate)
475
+
476
+ ref_audio_16k = librosa.resample(ref_audio, orig_sr=sr, target_sr=16000)
477
+ if ref_audio_16k.shape[0] > 160000 or ref_audio_16k.shape[0] < 48000:
478
+ logger.warning(
479
+ "Reference audio is outside the 3-10 second range, please choose another one!"
480
+ )
481
+
482
+ # pad only the hubert input
483
+ ref_audio_16k = np.concatenate([ref_audio_16k, zero_wav], axis=0)
484
+ ref_audio_16k = ref_audio_16k[np.newaxis, :]
485
+ ssl_content = ssl.forward(ref_audio_16k)
486
+
487
+ text = cut(text) # Slice once every 4 sentences
488
+ while "\n\n" in text:
489
+ text = text.replace("\n\n", "\n")
490
+ logger.info("Actual Input Target Text (after sentence segmentation): %s" % text)
491
+ texts = text.split("\n")
492
+ texts = process_text(texts)
493
+ texts = merge_short_text_in_array(texts, 5)
494
+
495
+ ref_seq, ref_bert, _ = get_phones_and_bert(ref_text, ref_language)
496
+ ref_seq = np.array(ref_seq)[np.newaxis, :]
497
+
498
+ ref_audio = ref_audio[np.newaxis, :]
499
+
500
+ audio_opt = []
501
+ for i_text, text in enumerate(texts):
502
+ # skip blank lines in the target text, which would otherwise cause errors
503
+ if len(text.strip()) == 0:
504
+ continue
505
+ if text[-1] not in splits:
506
+ text += "。" if text_language != "en" else "."
507
+
508
+ logger.info("Actual Input Target Text (per sentence): %s" % text)
509
+ text_seq, text_bert, norm_text = get_phones_and_bert(text, text_language)
510
+ text_seq = np.array(text_seq)[np.newaxis, :]
511
+ logger.info("Processed text from the frontend (per sentence): %s" % norm_text)
512
+
513
+ audio = gpt_sovits.forward(
514
+ ref_seq,
515
+ text_seq,
516
+ ref_bert.T,
517
+ text_bert.T,
518
+ ref_audio,
519
+ ssl_content,
520
+ top_k=top_k,
521
+ top_p=top_p,
522
+ temperature=temperature,
523
+ speed=speed,
524
+ )
525
+
526
+ max_audio = np.abs(audio).max()
527
+ if max_audio > 1:
528
+ audio /= max_audio
529
+ audio_opt.append(audio)
530
+ audio_opt.append(zero_wav)
531
+
532
+ audio = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
533
+
534
+ savepath = args.savepath
535
+ logger.info(f"saved at : {savepath}")
536
+ soundfile.write(savepath, audio, vits_hps_data_sampling_rate)
537
+
538
+ logger.info("Script finished successfully.")
539
+
540
+
541
+ def main():
542
+ # model files check and download
543
+ check_and_download_models(WEIGHT_PATH_SSL, MODEL_PATH_SSL, REMOTE_PATH)
544
+ check_and_download_models(
545
+ WEIGHT_PATH_T2S_ENCODER, MODEL_PATH_T2S_ENCODER, REMOTE_PATH
546
+ )
547
+ check_and_download_models(
548
+ WEIGHT_PATH_T2S_FIRST_DECODER, MODEL_PATH_T2S_FIRST_DECODER, REMOTE_PATH
549
+ )
550
+ check_and_download_models(
551
+ WEIGHT_PATH_T2S_STAGE_DECODER, MODEL_PATH_T2S_STAGE_DECODER, REMOTE_PATH
552
+ )
553
+ check_and_download_models(WEIGHT_PATH_VITS, MODEL_PATH_VITS, REMOTE_PATH)
554
+
555
+ env_id = args.env_id
556
+
557
+ if args.onnx:
558
+ import onnxruntime
559
+
560
+ ssl = onnxruntime.InferenceSession(WEIGHT_PATH_SSL)
561
+ t2s_encoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_ENCODER)
562
+ t2s_first_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_FIRST_DECODER)
563
+ t2s_stage_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_STAGE_DECODER)
564
+ vits = onnxruntime.InferenceSession(WEIGHT_PATH_VITS)
565
+ else:
566
+ memory_mode = ailia.get_memory_mode(
567
+ reduce_constant=True,
568
+ ignore_input_with_initializer=True,
569
+ reduce_interstage=False,
570
+ reuse_interstage=True,
571
+ )
572
+ ssl = ailia.Net(
573
+ weight=WEIGHT_PATH_SSL,
574
+ stream=MODEL_PATH_SSL,
575
+ memory_mode=memory_mode,
576
+ env_id=env_id,
577
+ )
578
+ t2s_encoder = ailia.Net(
579
+ weight=WEIGHT_PATH_T2S_ENCODER,
580
+ stream=MODEL_PATH_T2S_ENCODER,
581
+ memory_mode=memory_mode,
582
+ env_id=env_id,
583
+ )
584
+ t2s_first_decoder = ailia.Net(
585
+ weight=WEIGHT_PATH_T2S_FIRST_DECODER,
586
+ stream=MODEL_PATH_T2S_FIRST_DECODER,
587
+ memory_mode=memory_mode,
588
+ env_id=env_id,
589
+ )
590
+ t2s_stage_decoder = ailia.Net(
591
+ weight=WEIGHT_PATH_T2S_STAGE_DECODER,
592
+ stream=MODEL_PATH_T2S_STAGE_DECODER,
593
+ memory_mode=memory_mode,
594
+ env_id=env_id,
595
+ )
596
+ vits = ailia.Net(
597
+ weight=WEIGHT_PATH_VITS,
598
+ stream=MODEL_PATH_VITS,
599
+ memory_mode=memory_mode,
600
+ env_id=env_id,
601
+ )
602
+ if args.profile:
603
+ ssl.set_profile_mode(True)
604
+ t2s_encoder.set_profile_mode(True)
605
+ t2s_first_decoder.set_profile_mode(True)
606
+ t2s_stage_decoder.set_profile_mode(True)
607
+ vits.set_profile_mode(True)
608
+
609
+ if args.benchmark:
610
+ start = int(round(time.time() * 1000))
611
+
612
+ generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits)
613
+
614
+ if args.benchmark:
615
+ end = int(round(time.time() * 1000))
616
+ logger.info("\ttotal processing time {} ms".format(end - start))
617
+
618
+ if args.profile:
619
+ print("ssl : ")
620
+ print(ssl.get_summary())
621
+ print("t2s_encoder : ")
622
+ print(t2s_encoder.get_summary())
623
+ print("t2s_first_decoder : ")
624
+ print(t2s_first_decoder.get_summary())
625
+ print("t2s_stage_decoder : ")
626
+ print(t2s_stage_decoder.get_summary())
627
+ print("vits : ")
628
+ print(vits.get_summary())
629
+
630
+
631
+ if __name__ == "__main__":
632
+ main()
models/ailia-models/GPT-SoVITS2/code/reference_audio_captured_by_ax.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af474a35ab4aebaadda5a20d626c44830d5987880e54e10fc645eb73d568743
3
+ size 226298
models/ailia-models/GPT-SoVITS2/code/requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ SoundFile
2
+ librosa
3
+ nltk
4
+ pyopenjtalk>=0.3.4
5
+ g2p_en
6
+ LangSegment>=0.2.0
7
+ wordsegment
models/ailia-models/GPT-SoVITS2/code/text/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from text import symbols2 as symbols_v2
2
+
3
+ _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
4
+
5
+
6
+ def cleaned_text_to_sequence(cleaned_text, version=None):
7
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8
+ Args:
9
+ text: string to convert to a sequence
10
+ Returns:
11
+ List of integers corresponding to the symbols in the text
12
+ """
13
+ phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
14
+
15
+ return phones
models/ailia-models/GPT-SoVITS2/code/text/cleaner.py ADDED
@@ -0,0 +1,32 @@
1
+ from . import symbols2 as symbols_v2
2
+
3
+
4
+ def clean_text(text, language):
5
+ symbols = symbols_v2.symbols
6
+ language_module_map = {
7
+ # "zh": "chinese2",
8
+ "ja": "japanese",
9
+ "en": "english",
10
+ # "ko": "korean",
11
+ # "yue": "cantonese",
12
+ }
13
+
14
+ if language not in language_module_map:
15
+ language = "en"
16
+ text = " "
17
+ language_module = __import__(
18
+ "text." + language_module_map[language],
19
+ fromlist=[language_module_map[language]],
20
+ )
21
+ norm_text = language_module.text_normalize(text)
22
+ if language == "en":
23
+ phones = language_module.g2p(norm_text)
24
+ if len(phones) < 4:
25
+ phones = [","] + phones
26
+ word2ph = None
27
+ else:
28
+ phones = language_module.g2p(norm_text)
29
+ word2ph = None
30
+ phones = ["UNK" if ph not in symbols else ph for ph in phones]
31
+
32
+ return phones, word2ph, norm_text
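
`clean_text` normalizes the text, runs the language-specific g2p, and replaces any phoneme missing from the symbol table with `UNK`; `cleaned_text_to_sequence` in `text/__init__.py` then maps the symbols to integer ids. A minimal usage sketch, assuming pyopenjtalk is installed and the script is run from the `code` directory so that the `text` package is importable (the concrete ids are not shown, since the mapping depends on the full symbol table in `symbols2.py`):

```python
from text.cleaner import clean_text
from text import cleaned_text_to_sequence

phones, word2ph, norm_text = clean_text("こんにちは。", "ja")
ids = cleaned_text_to_sequence(phones)  # one integer per phoneme symbol
print(norm_text, phones, ids)
```
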
models/ailia-models/GPT-SoVITS2/code/text/cmudict-fast.rep ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/code/text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/code/text/engdict-hot.rep ADDED
@@ -0,0 +1,3 @@
1
+ CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2
+ JSON JH EY1 S AH0 N
3
+ CONDA K AA1 N D AH0
models/ailia-models/GPT-SoVITS2/code/text/english.py ADDED
@@ -0,0 +1,393 @@
1
+ import os
2
+ import re
3
+ import pickle
4
+
5
+ import unicodedata
6
+ from builtins import str as unicode
7
+
8
+ import wordsegment
9
+ from g2p_en import G2p
10
+ from g2p_en.expand import normalize_numbers
11
+ from nltk.tokenize import TweetTokenizer
12
+ from nltk import pos_tag
13
+
14
+ from .symbols2 import symbols
15
+
16
+ word_tokenize = TweetTokenizer().tokenize
17
+
18
+
19
+ current_file_path = os.path.dirname(__file__)
20
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
21
+ CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
22
+ CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
23
+ CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
24
+ NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
25
+
26
+ punctuation = ["!", "?", "…", ",", ".", "-"]
27
+
28
+ arpa = {
29
+ "AH0",
30
+ "S",
31
+ "AH1",
32
+ "EY2",
33
+ "AE2",
34
+ "EH0",
35
+ "OW2",
36
+ "UH0",
37
+ "NG",
38
+ "B",
39
+ "G",
40
+ "AY0",
41
+ "M",
42
+ "AA0",
43
+ "F",
44
+ "AO0",
45
+ "ER2",
46
+ "UH1",
47
+ "IY1",
48
+ "AH2",
49
+ "DH",
50
+ "IY0",
51
+ "EY1",
52
+ "IH0",
53
+ "K",
54
+ "N",
55
+ "W",
56
+ "IY2",
57
+ "T",
58
+ "AA1",
59
+ "ER1",
60
+ "EH2",
61
+ "OY0",
62
+ "UH2",
63
+ "UW1",
64
+ "Z",
65
+ "AW2",
66
+ "AW1",
67
+ "V",
68
+ "UW2",
69
+ "AA2",
70
+ "ER",
71
+ "AW0",
72
+ "UW0",
73
+ "R",
74
+ "OW1",
75
+ "EH1",
76
+ "ZH",
77
+ "AE0",
78
+ "IH2",
79
+ "IH",
80
+ "Y",
81
+ "JH",
82
+ "P",
83
+ "AY1",
84
+ "EY0",
85
+ "OY2",
86
+ "TH",
87
+ "HH",
88
+ "D",
89
+ "ER0",
90
+ "CH",
91
+ "AO1",
92
+ "AE1",
93
+ "AO2",
94
+ "OY1",
95
+ "AY2",
96
+ "IH1",
97
+ "OW0",
98
+ "L",
99
+ "SH",
100
+ }
101
+
102
+
103
+ def replace_phs(phs):
104
+ rep_map = {"'": "-"}
105
+ phs_new = []
106
+ for ph in phs:
107
+ if ph in symbols:
108
+ phs_new.append(ph)
109
+ elif ph in rep_map.keys():
110
+ phs_new.append(rep_map[ph])
111
+ else:
112
+ print("ph not in symbols: ", ph)
113
+ return phs_new
114
+
115
+
116
+ def replace_consecutive_punctuation(text):
117
+ punctuations = "".join(re.escape(p) for p in punctuation)
118
+ pattern = f"([{punctuations}])([{punctuations}])+"
119
+ result = re.sub(pattern, r"\1", text)
120
+ return result
121
+
122
+
123
+ def read_dict():
124
+ g2p_dict = {}
125
+ start_line = 49
126
+ with open(CMU_DICT_PATH) as f:
127
+ line = f.readline()
128
+ line_index = 1
129
+ while line:
130
+ if line_index >= start_line:
131
+ line = line.strip()
132
+ word_split = line.split(" ")
133
+ word = word_split[0].lower()
134
+
135
+ syllable_split = word_split[1].split(" - ")
136
+ g2p_dict[word] = []
137
+ for syllable in syllable_split:
138
+ phone_split = syllable.split(" ")
139
+ g2p_dict[word].append(phone_split)
140
+
141
+ line_index = line_index + 1
142
+ line = f.readline()
143
+
144
+ return g2p_dict
145
+
146
+
147
+ def read_dict_new():
148
+ g2p_dict = {}
149
+ with open(CMU_DICT_PATH, encoding="utf-8") as f:
150
+ line = f.readline()
151
+ line_index = 1
152
+ while line:
153
+ if line_index >= 57:
154
+ line = line.strip()
155
+ word_split = line.split(" ")
156
+ word = word_split[0].lower()
157
+ g2p_dict[word] = [word_split[1].split(" ")]
158
+
159
+ line_index = line_index + 1
160
+ line = f.readline()
161
+
162
+ with open(CMU_DICT_FAST_PATH, encoding="utf-8") as f:
163
+ line = f.readline()
164
+ line_index = 1
165
+ while line:
166
+ if line_index >= 0:
167
+ line = line.strip()
168
+ word_split = line.split(" ")
169
+ word = word_split[0].lower()
170
+ if word not in g2p_dict:
171
+ g2p_dict[word] = [word_split[1:]]
172
+
173
+ line_index = line_index + 1
174
+ line = f.readline()
175
+
176
+ return g2p_dict
177
+
178
+
179
+ def hot_reload_hot(g2p_dict):
180
+ with open(CMU_DICT_HOT_PATH) as f:
181
+ line = f.readline()
182
+ line_index = 1
183
+ while line:
184
+ if line_index >= 0:
185
+ line = line.strip()
186
+ word_split = line.split(" ")
187
+ word = word_split[0].lower()
188
+ # custom pronunciation entries directly override the dictionary
189
+ g2p_dict[word] = [word_split[1:]]
190
+
191
+ line_index = line_index + 1
192
+ line = f.readline()
193
+
194
+ return g2p_dict
195
+
196
+
197
+ def cache_dict(g2p_dict, file_path):
198
+ with open(file_path, "wb") as pickle_file:
199
+ pickle.dump(g2p_dict, pickle_file)
200
+
201
+
202
+ def get_dict():
203
+ if os.path.exists(CACHE_PATH):
204
+ with open(CACHE_PATH, "rb") as pickle_file:
205
+ g2p_dict = pickle.load(pickle_file)
206
+ else:
207
+ g2p_dict = read_dict_new()
208
+ cache_dict(g2p_dict, CACHE_PATH)
209
+
210
+ g2p_dict = hot_reload_hot(g2p_dict)
211
+
212
+ return g2p_dict
213
+
214
+
215
+ def get_namedict():
216
+ if os.path.exists(NAMECACHE_PATH):
217
+ with open(NAMECACHE_PATH, "rb") as pickle_file:
218
+ name_dict = pickle.load(pickle_file)
219
+ else:
220
+ name_dict = {}
221
+
222
+ return name_dict
223
+
224
+
225
+ def text_normalize(text):
226
+ # todo: eng text normalize
227
+ # map Chinese punctuation to g2p_en-compatible punctuation
228
+ rep_map = {
229
+ "[;::,;]": ",",
230
+ '["’]': "'",
231
+ "。": ".",
232
+ "!": "!",
233
+ "?": "?",
234
+ }
235
+ for p, r in rep_map.items():
236
+ text = re.sub(p, r, text)
237
+
238
+ # text formatting adapted from g2p_en
239
+ # with added handling for uppercase input
240
+ text = unicode(text)
241
+ text = normalize_numbers(text)
242
+ text = "".join(
243
+ char
244
+ for char in unicodedata.normalize("NFD", text)
245
+ if unicodedata.category(char) != "Mn"
246
+ ) # Strip accents
247
+ text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
248
+ text = re.sub(r"(?i)i\.e\.", "that is", text)
249
+ text = re.sub(r"(?i)e\.g\.", "for example", text)
250
+
251
+ # avoid reference leakage caused by consecutive punctuation
252
+ text = replace_consecutive_punctuation(text)
253
+
254
+ return text
255
+
256
+
257
+ class en_G2p(G2p):
258
+ def __init__(self):
259
+ super().__init__()
260
+ # initialize the word segmenter
261
+ wordsegment.load()
262
+
263
+ # extend the outdated dictionary and add a name dictionary
264
+ self.cmu = get_dict()
265
+ self.namedict = get_namedict()
266
+
267
+ # remove a few abbreviations whose pronunciations are wrong
268
+ for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
269
+ del self.cmu[word.lower()]
270
+
271
+ # fix heteronyms
272
+ self.homograph2features["read"] = (["R", "IY1", "D"], ["R", "EH1", "D"], "VBP")
273
+ self.homograph2features["complex"] = (
274
+ ["K", "AH0", "M", "P", "L", "EH1", "K", "S"],
275
+ ["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
276
+ "JJ",
277
+ )
278
+
279
+ def __call__(self, text):
280
+ # tokenization
281
+ words = word_tokenize(text)
282
+ tokens = pos_tag(words) # tuples of (word, tag)
283
+
284
+ # steps
285
+ prons = []
286
+ for o_word, pos in tokens:
287
+ # reproduce g2p_en's lowercasing logic
288
+ word = o_word.lower()
289
+
290
+ if re.search("[a-z]", word) is None:
291
+ pron = [word]
292
+ # handle single letters first
293
+ elif len(word) == 1:
294
+ # fix the pronunciation of a standalone A; the original o_word is needed to check capitalization
295
+ if o_word == "A":
296
+ pron = ["EY1"]
297
+ else:
298
+ pron = self.cmu[word][0]
299
+ # original g2p_en heteronym handling
300
+ elif word in self.homograph2features: # Check homograph
301
+ pron1, pron2, pos1 = self.homograph2features[word]
302
+ if pos.startswith(pos1):
303
+ pron = pron1
304
+ # pos1 being longer than pos only occurs for "read"
305
+ elif len(pos) < len(pos1) and pos == pos1[: len(pos)]:
306
+ pron = pron1
307
+ else:
308
+ pron = pron2
309
+ else:
310
+ # recursive lookup and prediction
311
+ pron = self.qryword(o_word)
312
+
313
+ prons.extend(pron)
314
+ prons.extend([" "])
315
+
316
+ return prons[:-1]
317
+
318
+ def qryword(self, o_word):
319
+ word = o_word.lower()
320
+
321
+ # dictionary lookup, except for single letters
322
+ if len(word) > 1 and word in self.cmu: # lookup CMU dict
323
+ return self.cmu[word][0]
324
+
325
+ # consult the name dictionary when only the first letter is capitalized
326
+ if o_word.istitle() and word in self.namedict:
327
+ return self.namedict[word][0]
328
+
329
+ # spell out OOV words of three letters or fewer, letter by letter
330
+ if len(word) <= 3:
331
+ phones = []
332
+ for w in word:
333
+ # fix the pronunciation of a standalone a; no uppercase case occurs here
334
+ if w == "a":
335
+ phones.extend(["EY1"])
336
+ elif not w.isalpha():
337
+ phones.extend([w])
338
+ else:
339
+ phones.extend(self.cmu[w][0])
340
+ return phones
341
+
342
+ # try to split off the possessive 's
343
+ if re.match(r"^([a-z]+)('s)$", word):
344
+ phones = self.qryword(word[:-2])[:]
345
+ # after the voiceless consonants P T K F TH HH, 's is pronounced ['S']
346
+ if phones[-1] in ["P", "T", "K", "F", "TH", "HH"]:
347
+ phones.extend(["S"])
348
+ # after the sibilants S Z SH ZH CH JH, 's is pronounced ['IH1', 'Z'] or ['AH0', 'Z']
349
+ elif phones[-1] in ["S", "Z", "SH", "ZH", "CH", "JH"]:
350
+ phones.extend(["AH0", "Z"])
351
+ # after the voiced consonants B D G DH V M N NG L R W Y, 's is pronounced ['Z']
352
+ # AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
353
+ # ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2 vowel endings: 's is pronounced ['Z']
354
+ else:
355
+ phones.extend(["Z"])
356
+ return phones
357
+
358
+ # try word segmentation to handle compound words
359
+ comps = wordsegment.segment(word.lower())
360
+
361
+ # words that cannot be segmented go back to the model for prediction
362
+ if len(comps) == 1:
363
+ return self.predict(word)
364
+
365
+ # recursively process the segmented components
366
+ return [phone for comp in comps for phone in self.qryword(comp)]
367
+
368
+
369
+ _g2p = en_G2p()
370
+
371
+
372
+ def g2p(text):
373
+ # run g2p_en over the whole passage and drop returned symbols outside the ARPA set
374
+ phone_list = _g2p(text)
375
+ phones = [
376
+ ph if ph != "<unk>" else "UNK"
377
+ for ph in phone_list
378
+ if ph not in [" ", "<pad>", "UW", "</s>", "<s>"]
379
+ ]
380
+
381
+ return replace_phs(phones)
382
+
383
+
384
+ if __name__ == "__main__":
385
+ print(g2p("hello"))
386
+ print(g2p(text_normalize("e.g. I used openai's AI tool to draw a picture.")))
387
+ print(
388
+ g2p(
389
+ text_normalize(
390
+ "In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder."
391
+ )
392
+ )
393
+ )
models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/user.dict ADDED
Binary file (4.13 kB). View file
 
models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.csv ADDED
@@ -0,0 +1 @@
1
+ 主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,*
models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.md5 ADDED
@@ -0,0 +1 @@
1
+ d448850ab3b6f07c4db19fd6f8181cbe
models/ailia-models/GPT-SoVITS2/code/text/japanese.py ADDED
@@ -0,0 +1,207 @@
1
+ # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
2
+ import re
3
+ import os
4
+ import hashlib
5
+
6
+ import pyopenjtalk
7
+
8
+ current_file_path = os.path.dirname(__file__)
9
+
10
+
11
+ def get_hash(fp: str) -> str:
12
+ hash_md5 = hashlib.md5()
13
+ with open(fp, "rb") as f:
14
+ for chunk in iter(lambda: f.read(4096), b""):
15
+ hash_md5.update(chunk)
16
+ return hash_md5.hexdigest()
17
+
18
+
19
+ USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
20
+ USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
21
+ USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
22
+ # if there is no user dictionary, build one; if there is, check its md5 and rebuild it when the hash differs
23
+ if os.path.exists(USERDIC_CSV_PATH):
24
+ if (
25
+ not os.path.exists(USERDIC_BIN_PATH)
26
+ or get_hash(USERDIC_CSV_PATH)
27
+ != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read()
28
+ ):
29
+ pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
30
+ with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f:
31
+ f.write(get_hash(USERDIC_CSV_PATH))
32
+
33
+ if os.path.exists(USERDIC_BIN_PATH):
34
+ pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
35
+
36
+
37
+ from text.symbols2 import punctuation
38
+
39
+ # Regular expression matching Japanese without punctuation marks:
40
+ _japanese_characters = re.compile(
41
+ r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
42
+ )
43
+
44
+ # Regular expression matching non-Japanese characters or punctuation marks:
45
+ _japanese_marks = re.compile(
46
+ r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
47
+ )
48
+
49
+ # List of (symbol, Japanese) pairs for marks:
50
+ _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
51
+
52
+
53
+ def post_replace_ph(ph):
54
+ rep_map = {
55
+ ":": ",",
56
+ ";": ",",
57
+ ",": ",",
58
+ "。": ".",
59
+ "!": "!",
60
+ "?": "?",
61
+ "\n": ".",
62
+ "·": ",",
63
+ "、": ",",
64
+ "...": "…",
65
+ }
66
+
67
+ if ph in rep_map.keys():
68
+ ph = rep_map[ph]
69
+ # if ph in symbols:
70
+ # return ph
71
+ # if ph not in symbols:
72
+ # ph = "UNK"
73
+ return ph
74
+
75
+
76
+ def replace_consecutive_punctuation(text):
77
+ punctuations = "".join(re.escape(p) for p in punctuation)
78
+ pattern = f"([{punctuations}])([{punctuations}])+"
79
+ result = re.sub(pattern, r"\1", text)
80
+ return result
81
+
82
+
83
+ def symbols_to_japanese(text):
84
+ for regex, replacement in _symbols_to_japanese:
85
+ text = re.sub(regex, replacement, text)
86
+ return text
87
+
88
+
89
+ def preprocess_jap(text, with_prosody=False):
90
+ """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
91
+ text = symbols_to_japanese(text)
92
+ sentences = re.split(_japanese_marks, text)
93
+ marks = re.findall(_japanese_marks, text)
94
+ text = []
95
+ for i, sentence in enumerate(sentences):
96
+ if re.match(_japanese_characters, sentence):
97
+ if with_prosody:
98
+ text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
99
+ else:
100
+ p = pyopenjtalk.g2p(sentence)
101
+ text += p.split(" ")
102
+
103
+ if i < len(marks):
104
+ if marks[i] == " ": # prevent an unexpected UNK
105
+ continue
106
+ text += [marks[i].replace(" ", "")]
107
+ return text
108
+
109
+
110
+ def text_normalize(text):
111
+ # todo: jap text normalize
112
+
113
+ # avoid reference leakage caused by consecutive punctuation
114
+ text = replace_consecutive_punctuation(text)
115
+ return text
116
+
117
+
118
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
119
+ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
120
+ """Extract phoneme + prosoody symbol sequence from input full-context labels.
121
+
122
+ The algorithm is based on `Prosodic features control by symbols as input of
123
+ sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
124
+
125
+ Args:
126
+ text (str): Input text.
127
+ drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
128
+
129
+ Returns:
130
+ List[str]: List of phoneme + prosody symbols.
131
+
132
+ Examples:
133
+ >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
134
+ >>> pyopenjtalk_g2p_prosody("こんにちは。")
135
+ ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
136
+
137
+ .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
138
+ modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
139
+
140
+ """
141
+ labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
142
+ N = len(labels)
143
+
144
+ phones = []
145
+ for n in range(N):
146
+ lab_curr = labels[n]
147
+
148
+ # current phoneme
149
+ p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
150
+ # treat unvoiced vowels as normal vowels
151
+ if drop_unvoiced_vowels and p3 in "AEIOU":
152
+ p3 = p3.lower()
153
+
154
+ # deal with sil at the beginning and the end of text
155
+ if p3 == "sil":
156
+ assert n == 0 or n == N - 1
157
+ if n == 0:
158
+ phones.append("^")
159
+ elif n == N - 1:
160
+ # check question form or not
161
+ e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
162
+ if e3 == 0:
163
+ phones.append("$")
164
+ elif e3 == 1:
165
+ phones.append("?")
166
+ continue
167
+ elif p3 == "pau":
168
+ phones.append("_")
169
+ continue
170
+ else:
171
+ phones.append(p3)
172
+
173
+ # accent type and position info (forward or backward)
174
+ a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
175
+ a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
176
+ a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
177
+
178
+ # number of mora in accent phrase
179
+ f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
180
+
181
+ a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
182
+ # accent phrase border
183
+ if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
184
+ phones.append("#")
185
+ # pitch falling
186
+ elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
187
+ phones.append("]")
188
+ # pitch rising
189
+ elif a2 == 1 and a2_next == 2:
190
+ phones.append("[")
191
+
192
+ return phones
193
+
194
+
195
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
196
+ def _numeric_feature_by_regex(regex, s):
197
+ match = re.search(regex, s)
198
+ if match is None:
199
+ return -50
200
+ return int(match.group(1))
201
+
202
+
203
+ def g2p(norm_text, with_prosody=True):
204
+ phones = preprocess_jap(norm_text, with_prosody)
205
+ phones = [post_replace_ph(i) for i in phones]
206
+ # todo: implement tones and word2ph
207
+ return phones
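For quick reference, the prosody markers emitted above are: "^"/"$" for utterance start/end, "?" for a question ending, "_" for a pause, "#" for an accent-phrase border, and "["/"]" for pitch rise/fall. A minimal usage sketch, not part of the committed file, assuming this module is importable as text.japanese and pyopenjtalk is installed:

    # Hypothetical usage sketch: run the prosody-aware g2p on a Japanese sentence.
    from text.japanese import g2p, text_normalize

    norm = text_normalize("こんにちは。")
    phones = g2p(norm, with_prosody=True)
    # g2p strips the leading "^" and trailing "$" markers, so the result is
    # roughly ['k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '.'].
    print(phones)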
models/ailia-models/GPT-SoVITS2/code/text/namedict_cache.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:559552094c4a6e995213e3fa586330e078ef8cb3a7a95a3109e945111cd2bfc1
+ size 760663
models/ailia-models/GPT-SoVITS2/code/text/symbols2.py ADDED
@@ -0,0 +1,785 @@
+ punctuation = ["!", "?", "…", ",", "."]  # "@" is the SP pause symbol
+ punctuation.append("-")
+ pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
+ pad = "_"
+
+ c = [
+     "AA",
+     "EE",
+     "OO",
+     "b",
+     "c",
+     "ch",
+     "d",
+     "f",
+     "g",
+     "h",
+     "j",
+     "k",
+     "l",
+     "m",
+     "n",
+     "p",
+     "q",
+     "r",
+     "s",
+     "sh",
+     "t",
+     "w",
+     "x",
+     "y",
+     "z",
+     "zh",
+ ]
+ v = [
+     "E1",
+     "En1",
+     "a1",
+     "ai1",
+     "an1",
+     "ang1",
+     "ao1",
+     "e1",
+     "ei1",
+     "en1",
+     "eng1",
+     "er1",
+     "i1",
+     "i01",
+     "ia1",
+     "ian1",
+     "iang1",
+     "iao1",
+     "ie1",
+     "in1",
+     "ing1",
+     "iong1",
+     "ir1",
+     "iu1",
+     "o1",
+     "ong1",
+     "ou1",
+     "u1",
+     "ua1",
+     "uai1",
+     "uan1",
+     "uang1",
+     "ui1",
+     "un1",
+     "uo1",
+     "v1",
+     "van1",
+     "ve1",
+     "vn1",
+     "E2",
+     "En2",
+     "a2",
+     "ai2",
+     "an2",
+     "ang2",
+     "ao2",
+     "e2",
+     "ei2",
+     "en2",
+     "eng2",
+     "er2",
+     "i2",
+     "i02",
+     "ia2",
+     "ian2",
+     "iang2",
+     "iao2",
+     "ie2",
+     "in2",
+     "ing2",
+     "iong2",
+     "ir2",
+     "iu2",
+     "o2",
+     "ong2",
+     "ou2",
+     "u2",
+     "ua2",
+     "uai2",
+     "uan2",
+     "uang2",
+     "ui2",
+     "un2",
+     "uo2",
+     "v2",
+     "van2",
+     "ve2",
+     "vn2",
+     "E3",
+     "En3",
+     "a3",
+     "ai3",
+     "an3",
+     "ang3",
+     "ao3",
+     "e3",
+     "ei3",
+     "en3",
+     "eng3",
+     "er3",
+     "i3",
+     "i03",
+     "ia3",
+     "ian3",
+     "iang3",
+     "iao3",
+     "ie3",
+     "in3",
+     "ing3",
+     "iong3",
+     "ir3",
+     "iu3",
+     "o3",
+     "ong3",
+     "ou3",
+     "u3",
+     "ua3",
+     "uai3",
+     "uan3",
+     "uang3",
+     "ui3",
+     "un3",
+     "uo3",
+     "v3",
+     "van3",
+     "ve3",
+     "vn3",
+     "E4",
+     "En4",
+     "a4",
+     "ai4",
+     "an4",
+     "ang4",
+     "ao4",
+     "e4",
+     "ei4",
+     "en4",
+     "eng4",
+     "er4",
+     "i4",
+     "i04",
+     "ia4",
+     "ian4",
+     "iang4",
+     "iao4",
+     "ie4",
+     "in4",
+     "ing4",
+     "iong4",
+     "ir4",
+     "iu4",
+     "o4",
+     "ong4",
+     "ou4",
+     "u4",
+     "ua4",
+     "uai4",
+     "uan4",
+     "uang4",
+     "ui4",
+     "un4",
+     "uo4",
+     "v4",
+     "van4",
+     "ve4",
+     "vn4",
+     "E5",
+     "En5",
+     "a5",
+     "ai5",
+     "an5",
+     "ang5",
+     "ao5",
+     "e5",
+     "ei5",
+     "en5",
+     "eng5",
+     "er5",
+     "i5",
+     "i05",
+     "ia5",
+     "ian5",
+     "iang5",
+     "iao5",
+     "ie5",
+     "in5",
+     "ing5",
+     "iong5",
+     "ir5",
+     "iu5",
+     "o5",
+     "ong5",
+     "ou5",
+     "u5",
+     "ua5",
+     "uai5",
+     "uan5",
+     "uang5",
+     "ui5",
+     "un5",
+     "uo5",
+     "v5",
+     "van5",
+     "ve5",
+     "vn5",
+ ]
+
+ v_without_tone = [
+     "E",
+     "En",
+     "a",
+     "ai",
+     "an",
+     "ang",
+     "ao",
+     "e",
+     "ei",
+     "en",
+     "eng",
+     "er",
+     "i",
+     "i0",
+     "ia",
+     "ian",
+     "iang",
+     "iao",
+     "ie",
+     "in",
+     "ing",
+     "iong",
+     "ir",
+     "iu",
+     "o",
+     "ong",
+     "ou",
+     "u",
+     "ua",
+     "uai",
+     "uan",
+     "uang",
+     "ui",
+     "un",
+     "uo",
+     "v",
+     "van",
+     "ve",
+     "vn",
+ ]
+
+ # japanese
+ ja_symbols = [
+     "I",
+     "N",
+     "U",
+     "a",
+     "b",
+     "by",
+     "ch",
+     "cl",
+     "d",
+     "dy",
+     "e",
+     "f",
+     "g",
+     "gy",
+     "h",
+     "hy",
+     "i",
+     "j",
+     "k",
+     "ky",
+     "m",
+     "my",
+     "n",
+     "ny",
+     "o",
+     "p",
+     "py",
+     "r",
+     "ry",
+     "s",
+     "sh",
+     "t",
+     "ts",
+     "u",
+     "v",
+     "w",
+     "y",
+     "z",
+     ### the two contour symbols below are appended later (at the end of symbols)
+     # "[",  # rising pitch contour
+     # "]",  # falling pitch contour
+     # "$",  # end marker
+     # "^",  # start marker
+ ]
+
+ arpa = {
+     "AH0",
+     "S",
+     "AH1",
+     "EY2",
+     "AE2",
+     "EH0",
+     "OW2",
+     "UH0",
+     "NG",
+     "B",
+     "G",
+     "AY0",
+     "M",
+     "AA0",
+     "F",
+     "AO0",
+     "ER2",
+     "UH1",
+     "IY1",
+     "AH2",
+     "DH",
+     "IY0",
+     "EY1",
+     "IH0",
+     "K",
+     "N",
+     "W",
+     "IY2",
+     "T",
+     "AA1",
+     "ER1",
+     "EH2",
+     "OY0",
+     "UH2",
+     "UW1",
+     "Z",
+     "AW2",
+     "AW1",
+     "V",
+     "UW2",
+     "AA2",
+     "ER",
+     "AW0",
+     "UW0",
+     "R",
+     "OW1",
+     "EH1",
+     "ZH",
+     "AE0",
+     "IH2",
+     "IH",
+     "Y",
+     "JH",
+     "P",
+     "AY1",
+     "EY0",
+     "OY2",
+     "TH",
+     "HH",
+     "D",
+     "ER0",
+     "CH",
+     "AO1",
+     "AE1",
+     "AO2",
+     "OY1",
+     "AY2",
+     "IH1",
+     "OW0",
+     "L",
+     "SH",
+ }
+
+ ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停"
+
+ yue_symbols = {
+     "Yeot3",
+     "Yip1",
+     "Yyu3",
+     "Yeng4",
+     "Yut5",
+     "Yaan5",
+     "Ym5",
+     "Yaan6",
+     "Yang1",
+     "Yun4",
+     "Yon2",
+     "Yui5",
+     "Yun2",
+     "Yat3",
+     "Ye",
+     "Yeot1",
+     "Yoeng5",
+     "Yoek2",
+     "Yam2",
+     "Yeon6",
+     "Yu6",
+     "Yiu3",
+     "Yaang6",
+     "Yp5",
+     "Yai4",
+     "Yoek4",
+     "Yit6",
+     "Yam5",
+     "Yoeng6",
+     "Yg1",
+     "Yk3",
+     "Yoe4",
+     "Yam3",
+     "Yc",
+     "Yyu4",
+     "Yyut1",
+     "Yiu4",
+     "Ying3",
+     "Yip3",
+     "Yaap3",
+     "Yau3",
+     "Yan4",
+     "Yau1",
+     "Yap4",
+     "Yk6",
+     "Yok3",
+     "Yai1",
+     "Yeot6",
+     "Yan2",
+     "Yoek6",
+     "Yt1",
+     "Yoi1",
+     "Yit5",
+     "Yn4",
+     "Yaau3",
+     "Yau4",
+     "Yuk6",
+     "Ys",
+     "Yuk",
+     "Yin6",
+     "Yung6",
+     "Ya",
+     "You",
+     "Yaai5",
+     "Yau5",
+     "Yoi3",
+     "Yaak3",
+     "Yaat3",
+     "Ying2",
+     "Yok5",
+     "Yeng2",
+     "Yyut3",
+     "Yam1",
+     "Yip5",
+     "You1",
+     "Yam6",
+     "Yaa5",
+     "Yi6",
+     "Yek4",
+     "Yyu2",
+     "Yuk5",
+     "Yaam1",
+     "Yang2",
+     "Yai",
+     "Yiu6",
+     "Yin4",
+     "Yok4",
+     "Yot3",
+     "Yui2",
+     "Yeoi5",
+     "Yyun6",
+     "Yyu5",
+     "Yoi5",
+     "Yeot2",
+     "Yim4",
+     "Yeoi2",
+     "Yaan1",
+     "Yang6",
+     "Yong1",
+     "Yaang4",
+     "Yung5",
+     "Yeon1",
+     "Yin2",
+     "Ya3",
+     "Yaang3",
+     "Yg",
+     "Yk2",
+     "Yaau5",
+     "Yut1",
+     "Yt5",
+     "Yip4",
+     "Yung4",
+     "Yj",
+     "Yong3",
+     "Ya1",
+     "Yg6",
+     "Yaau6",
+     "Yit3",
+     "Yun3",
+     "Ying1",
+     "Yn2",
+     "Yg4",
+     "Yl",
+     "Yp3",
+     "Yn3",
+     "Yak1",
+     "Yang5",
+     "Yoe6",
+     "You2",
+     "Yap2",
+     "Yak2",
+     "Yt3",
+     "Yot5",
+     "Yim2",
+     "Yi1",
+     "Yn6",
+     "Yaat5",
+     "Yaam3",
+     "Yoek5",
+     "Ye3",
+     "Yeon4",
+     "Yaa2",
+     "Yu3",
+     "Yim6",
+     "Ym",
+     "Yoe3",
+     "Yaai2",
+     "Ym2",
+     "Ya6",
+     "Yeng6",
+     "Yik4",
+     "Yot4",
+     "Yaai4",
+     "Yyun3",
+     "Yu1",
+     "Yoeng1",
+     "Yaap2",
+     "Yuk3",
+     "Yoek3",
+     "Yeng5",
+     "Yeoi1",
+     "Yiu2",
+     "Yok1",
+     "Yo1",
+     "Yoek1",
+     "Yoeng2",
+     "Yeon5",
+     "Yiu1",
+     "Yoeng4",
+     "Yuk2",
+     "Yat4",
+     "Yg5",
+     "Yut4",
+     "Yan6",
+     "Yin3",
+     "Yaa6",
+     "Yap1",
+     "Yg2",
+     "Yoe5",
+     "Yt4",
+     "Ya5",
+     "Yo4",
+     "Yyu1",
+     "Yak3",
+     "Yeon2",
+     "Yong4",
+     "Ym1",
+     "Ye2",
+     "Yaang5",
+     "Yoi2",
+     "Yeng3",
+     "Yn",
+     "Yyut4",
+     "Yau",
+     "Yaak2",
+     "Yaan4",
+     "Yek2",
+     "Yin1",
+     "Yi5",
+     "Yoe2",
+     "Yei5",
+     "Yaat6",
+     "Yak5",
+     "Yp6",
+     "Yok6",
+     "Yei2",
+     "Yaap1",
+     "Yyut5",
+     "Yi4",
+     "Yim1",
+     "Yk5",
+     "Ye4",
+     "Yok2",
+     "Yaam6",
+     "Yat2",
+     "Yon6",
+     "Yei3",
+     "Yyu6",
+     "Yeot5",
+     "Yk4",
+     "Yai6",
+     "Yd",
+     "Yg3",
+     "Yei6",
+     "Yau2",
+     "Yok",
+     "Yau6",
+     "Yung3",
+     "Yim5",
+     "Yut6",
+     "Yit1",
+     "Yon3",
+     "Yat1",
+     "Yaam2",
+     "Yyut2",
+     "Yui6",
+     "Yt2",
+     "Yek6",
+     "Yt",
+     "Ye6",
+     "Yang3",
+     "Ying6",
+     "Yaau1",
+     "Yeon3",
+     "Yng",
+     "Yh",
+     "Yang4",
+     "Ying5",
+     "Yaap6",
+     "Yoeng3",
+     "Yyun4",
+     "You3",
+     "Yan5",
+     "Yat5",
+     "Yot1",
+     "Yun1",
+     "Yi3",
+     "Yaa1",
+     "Yaap4",
+     "You6",
+     "Yaang2",
+     "Yaap5",
+     "Yaa3",
+     "Yaak6",
+     "Yeng1",
+     "Yaak1",
+     "Yo5",
+     "Yoi4",
+     "Yam4",
+     "Yik1",
+     "Ye1",
+     "Yai5",
+     "Yung1",
+     "Yp2",
+     "Yui4",
+     "Yaak4",
+     "Yung2",
+     "Yak4",
+     "Yaat4",
+     "Yeoi4",
+     "Yut2",
+     "Yin5",
+     "Yaau4",
+     "Yap6",
+     "Yb",
+     "Yaam4",
+     "Yw",
+     "Yut3",
+     "Yong2",
+     "Yt6",
+     "Yaai6",
+     "Yap5",
+     "Yik5",
+     "Yun6",
+     "Yaam5",
+     "Yun5",
+     "Yik3",
+     "Ya2",
+     "Yyut6",
+     "Yon4",
+     "Yk1",
+     "Yit4",
+     "Yak6",
+     "Yaan2",
+     "Yuk1",
+     "Yai2",
+     "Yik2",
+     "Yaat2",
+     "Yo3",
+     "Ykw",
+     "Yn5",
+     "Yaa",
+     "Ye5",
+     "Yu4",
+     "Yei1",
+     "Yai3",
+     "Yyun5",
+     "Yip2",
+     "Yaau2",
+     "Yiu5",
+     "Ym4",
+     "Yeoi6",
+     "Yk",
+     "Ym6",
+     "Yoe1",
+     "Yeoi3",
+     "Yon",
+     "Yuk4",
+     "Yaai3",
+     "Yaa4",
+     "Yot6",
+     "Yaang1",
+     "Yei4",
+     "Yek1",
+     "Yo",
+     "Yp",
+     "Yo6",
+     "Yp4",
+     "Yan3",
+     "Yoi",
+     "Yap3",
+     "Yek3",
+     "Yim3",
+     "Yz",
+     "Yot2",
+     "Yoi6",
+     "Yit2",
+     "Yu5",
+     "Yaan3",
+     "Yan1",
+     "Yon5",
+     "Yp1",
+     "Yong5",
+     "Ygw",
+     "Yak",
+     "Yat6",
+     "Ying4",
+     "Yu2",
+     "Yf",
+     "Ya4",
+     "Yon1",
+     "You4",
+     "Yik6",
+     "Yui1",
+     "Yaat1",
+     "Yeot4",
+     "Yi2",
+     "Yaai1",
+     "Yek5",
+     "Ym3",
+     "Yong6",
+     "You5",
+     "Yyun1",
+     "Yn1",
+     "Yo2",
+     "Yip6",
+     "Yui3",
+     "Yaak5",
+     "Yyun2",
+ }
+
+ symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
+ symbols = sorted(set(symbols))
+ symbols += ["[", "]"]  ## newly added rising/falling pitch contours for Japanese
+ symbols += sorted(list(ko_symbols))
+ symbols += sorted(
+     list(yue_symbols)
+ )  ## the newly added yue symbols all go at the end; verified that the leading "Y" introduces no duplicates, and the Korean jamo obviously cannot collide
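The symbols list above is the phoneme vocabulary shared by the ONNX models; at inference time each phoneme string is looked up as an integer ID. A minimal sketch of that mapping (illustrative only; the helper name phonemes_to_ids is not part of the repository):

    # Hypothetical helper: map phoneme strings to vocabulary IDs.
    from text.symbols2 import symbols

    _symbol_to_id = {s: i for i, s in enumerate(symbols)}

    def phonemes_to_ids(phones):
        # Fall back to the "UNK" symbol for anything outside the inventory.
        unk = _symbol_to_id["UNK"]
        return [_symbol_to_id.get(p, unk) for p in phones]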
models/ailia-models/GPT-SoVITS2/source.txt ADDED
@@ -0,0 +1,18 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/gpt-sovits-v2
+
+ [normal]
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/cnhubert.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/cnhubert.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_encoder.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_encoder.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_fsdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_fsdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_sdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_sdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/vits.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/vits.onnx.prototxt
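The ONNX weights listed above can be fetched ahead of time if you do not want the sample script to download them on first run. A minimal prefetch sketch (an assumption for illustration, not the ailia tooling itself):

    # Hypothetical prefetch of the five models and their prototxt files.
    import urllib.request

    BASE = "https://storage.googleapis.com/ailia-models/gpt-sovits-v2/"
    for name in ["cnhubert.onnx", "t2s_encoder.onnx", "t2s_fsdec.onnx",
                 "t2s_sdec.onnx", "vits.onnx"]:
        for f in (name, name + ".prototxt"):
            urllib.request.urlretrieve(BASE + f, f)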
models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9986bf01f48c183efc741df4afdb38131119cef0218784c919654403604cd1da
+ size 11495994
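Once downloaded, the encoder graph can be inspected with a generic ONNX runtime to confirm its expected inputs (the ailia sample uses its own runtime; using onnxruntime here is an assumption for illustration):

    # Hypothetical inspection of the t2s_encoder inputs with onnxruntime.
    import onnxruntime as ort

    sess = ort.InferenceSession("t2s_encoder.onnx")
    for inp in sess.get_inputs():
        print(inp.name, inp.shape, inp.type)
    # Per the graph dump below, inputs include ssl_content, ref_seq,
    # text_seq, ref_bert and text_bert.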
models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx.prototxt ADDED
@@ -0,0 +1,2293 @@
+ ir_version: 8
2
+ producer_name: "pytorch"
3
+ producer_version: "2.5.0"
4
+ model_version: 0
5
+ graph {
6
+ name: "main_graph"
7
+ node {
8
+ output: "onnx::ReduceSum_785"
9
+ name: "Constant_70"
10
+ op_type: "Constant"
11
+ attribute {
12
+ name: "value"
13
+ t {
14
+ dims: 1
15
+ data_type: 7
16
+ data_location: 0
17
+ }
18
+ type: TENSOR
19
+ }
20
+ }
21
+ node {
22
+ output: "onnx::ReduceSum_786"
23
+ name: "Constant_71"
24
+ op_type: "Constant"
25
+ attribute {
26
+ name: "value"
27
+ t {
28
+ dims: 1
29
+ data_type: 7
30
+ data_location: 0
31
+ }
32
+ type: TENSOR
33
+ }
34
+ }
35
+ node {
36
+ input: "ssl_content"
37
+ input: "vits.ssl_proj.weight"
38
+ input: "vits.ssl_proj.bias"
39
+ output: "/ssl_proj/Conv_output_0"
40
+ name: "/ssl_proj/Conv"
41
+ op_type: "Conv"
42
+ attribute {
43
+ name: "dilations"
44
+ ints: 1
45
+ type: INTS
46
+ }
47
+ attribute {
48
+ name: "group"
49
+ i: 1
50
+ type: INT
51
+ }
52
+ attribute {
53
+ name: "kernel_shape"
54
+ ints: 2
55
+ type: INTS
56
+ }
57
+ attribute {
58
+ name: "pads"
59
+ ints: 0
60
+ ints: 0
61
+ type: INTS
62
+ }
63
+ attribute {
64
+ name: "strides"
65
+ ints: 2
66
+ type: INTS
67
+ }
68
+ }
69
+ node {
70
+ input: "/ssl_proj/Conv_output_0"
71
+ output: "/quantizer/vq/layers.0/Transpose_output_0"
72
+ name: "/quantizer/vq/layers.0/Transpose"
73
+ op_type: "Transpose"
74
+ attribute {
75
+ name: "perm"
76
+ ints: 0
77
+ ints: 2
78
+ ints: 1
79
+ type: INTS
80
+ }
81
+ }
82
+ node {
83
+ output: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
84
+ name: "/quantizer/vq/layers.0/_codebook/Constant"
85
+ op_type: "Constant"
86
+ attribute {
87
+ name: "value"
88
+ t {
89
+ data_type: 7
90
+ data_location: 0
91
+ }
92
+ type: TENSOR
93
+ }
94
+ }
95
+ node {
96
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
97
+ output: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
98
+ name: "/quantizer/vq/layers.0/_codebook/Shape"
99
+ op_type: "Shape"
100
+ }
101
+ node {
102
+ output: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
103
+ name: "/quantizer/vq/layers.0/_codebook/Constant_1"
104
+ op_type: "Constant"
105
+ attribute {
106
+ name: "value"
107
+ t {
108
+ data_type: 7
109
+ data_location: 0
110
+ }
111
+ type: TENSOR
112
+ }
113
+ }
114
+ node {
115
+ input: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
116
+ input: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
117
+ output: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
118
+ name: "/quantizer/vq/layers.0/_codebook/Gather"
119
+ op_type: "Gather"
120
+ attribute {
121
+ name: "axis"
122
+ i: 0
123
+ type: INT
124
+ }
125
+ }
126
+ node {
127
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
128
+ output: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
129
+ name: "/quantizer/vq/layers.0/_codebook/Shape_1"
130
+ op_type: "Shape"
131
+ }
132
+ node {
133
+ output: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
134
+ name: "/quantizer/vq/layers.0/_codebook/Constant_2"
135
+ op_type: "Constant"
136
+ attribute {
137
+ name: "value"
138
+ t {
139
+ data_type: 7
140
+ data_location: 0
141
+ }
142
+ type: TENSOR
143
+ }
144
+ }
145
+ node {
146
+ input: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
147
+ input: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
148
+ output: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
149
+ name: "/quantizer/vq/layers.0/_codebook/Gather_1"
150
+ op_type: "Gather"
151
+ attribute {
152
+ name: "axis"
153
+ i: 0
154
+ type: INT
155
+ }
156
+ }
157
+ node {
158
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
159
+ output: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
160
+ name: "/quantizer/vq/layers.0/_codebook/Shape_2"
161
+ op_type: "Shape"
162
+ }
163
+ node {
164
+ output: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
165
+ name: "/quantizer/vq/layers.0/_codebook/Constant_3"
166
+ op_type: "Constant"
167
+ attribute {
168
+ name: "value"
169
+ t {
170
+ data_type: 7
171
+ data_location: 0
172
+ }
173
+ type: TENSOR
174
+ }
175
+ }
176
+ node {
177
+ input: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
178
+ input: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
179
+ output: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
180
+ name: "/quantizer/vq/layers.0/_codebook/Gather_2"
181
+ op_type: "Gather"
182
+ attribute {
183
+ name: "axis"
184
+ i: 0
185
+ type: INT
186
+ }
187
+ }
188
+ node {
189
+ output: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
190
+ name: "/quantizer/vq/layers.0/_codebook/Constant_4"
191
+ op_type: "Constant"
192
+ attribute {
193
+ name: "value"
194
+ t {
195
+ data_type: 7
196
+ data_location: 0
197
+ }
198
+ type: TENSOR
199
+ }
200
+ }
201
+ node {
202
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
203
+ input: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
204
+ output: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
205
+ name: "/quantizer/vq/layers.0/_codebook/Mul"
206
+ op_type: "Mul"
207
+ }
208
+ node {
209
+ input: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
210
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
211
+ output: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
212
+ name: "/quantizer/vq/layers.0/_codebook/Mul_1"
213
+ op_type: "Mul"
214
+ }
215
+ node {
216
+ output: "onnx::Unsqueeze_804"
217
+ name: "Constant_87"
218
+ op_type: "Constant"
219
+ attribute {
220
+ name: "value"
221
+ t {
222
+ dims: 1
223
+ data_type: 7
224
+ data_location: 0
225
+ }
226
+ type: TENSOR
227
+ }
228
+ }
229
+ node {
230
+ input: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
231
+ input: "onnx::Unsqueeze_804"
232
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
233
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze"
234
+ op_type: "Unsqueeze"
235
+ }
236
+ node {
237
+ output: "onnx::Unsqueeze_806"
238
+ name: "Constant_89"
239
+ op_type: "Constant"
240
+ attribute {
241
+ name: "value"
242
+ t {
243
+ dims: 1
244
+ data_type: 7
245
+ data_location: 0
246
+ }
247
+ type: TENSOR
248
+ }
249
+ }
250
+ node {
251
+ input: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
252
+ input: "onnx::Unsqueeze_806"
253
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
254
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1"
255
+ op_type: "Unsqueeze"
256
+ }
257
+ node {
258
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
259
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
260
+ output: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
261
+ name: "/quantizer/vq/layers.0/_codebook/Concat"
262
+ op_type: "Concat"
263
+ attribute {
264
+ name: "axis"
265
+ i: 0
266
+ type: INT
267
+ }
268
+ }
269
+ node {
270
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
271
+ input: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
272
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
273
+ name: "/quantizer/vq/layers.0/_codebook/Reshape"
274
+ op_type: "Reshape"
275
+ attribute {
276
+ name: "allowzero"
277
+ i: 0
278
+ type: INT
279
+ }
280
+ }
281
+ node {
282
+ output: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
283
+ name: "/quantizer/vq/layers.0/_codebook/Constant_5"
284
+ op_type: "Constant"
285
+ attribute {
286
+ name: "value"
287
+ t {
288
+ data_type: 1
289
+ data_location: 0
290
+ }
291
+ type: TENSOR
292
+ }
293
+ }
294
+ node {
295
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
296
+ input: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
297
+ output: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
298
+ name: "/quantizer/vq/layers.0/_codebook/Pow"
299
+ op_type: "Pow"
300
+ }
301
+ node {
302
+ input: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
303
+ input: "onnx::ReduceSum_786"
304
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
305
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum"
306
+ op_type: "ReduceSum"
307
+ attribute {
308
+ name: "keepdims"
309
+ i: 1
310
+ type: INT
311
+ }
312
+ }
313
+ node {
314
+ output: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
315
+ name: "/quantizer/vq/layers.0/_codebook/Constant_6"
316
+ op_type: "Constant"
317
+ attribute {
318
+ name: "value"
319
+ t {
320
+ data_type: 1
321
+ data_location: 0
322
+ }
323
+ type: TENSOR
324
+ }
325
+ }
326
+ node {
327
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
328
+ input: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
329
+ output: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
330
+ name: "/quantizer/vq/layers.0/_codebook/Mul_2"
331
+ op_type: "Mul"
332
+ }
333
+ node {
334
+ input: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
335
+ input: "onnx::MatMul_1009"
336
+ output: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
337
+ name: "/quantizer/vq/layers.0/_codebook/MatMul"
338
+ op_type: "MatMul"
339
+ }
340
+ node {
341
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
342
+ input: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
343
+ output: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
344
+ name: "/quantizer/vq/layers.0/_codebook/Sub"
345
+ op_type: "Sub"
346
+ }
347
+ node {
348
+ output: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
349
+ name: "/quantizer/vq/layers.0/_codebook/Constant_7"
350
+ op_type: "Constant"
351
+ attribute {
352
+ name: "value"
353
+ t {
354
+ data_type: 1
355
+ data_location: 0
356
+ }
357
+ type: TENSOR
358
+ }
359
+ }
360
+ node {
361
+ input: "onnx::MatMul_1009"
362
+ input: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
363
+ output: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
364
+ name: "/quantizer/vq/layers.0/_codebook/Pow_1"
365
+ op_type: "Pow"
366
+ }
367
+ node {
368
+ input: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
369
+ input: "onnx::ReduceSum_785"
370
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
371
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum_1"
372
+ op_type: "ReduceSum"
373
+ attribute {
374
+ name: "keepdims"
375
+ i: 1
376
+ type: INT
377
+ }
378
+ }
379
+ node {
380
+ input: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
381
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
382
+ output: "/quantizer/vq/layers.0/_codebook/Add_output_0"
383
+ name: "/quantizer/vq/layers.0/_codebook/Add"
384
+ op_type: "Add"
385
+ }
386
+ node {
387
+ input: "/quantizer/vq/layers.0/_codebook/Add_output_0"
388
+ output: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
389
+ name: "/quantizer/vq/layers.0/_codebook/Neg"
390
+ op_type: "Neg"
391
+ }
392
+ node {
393
+ input: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
394
+ output: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
395
+ name: "/quantizer/vq/layers.0/_codebook/ArgMax"
396
+ op_type: "ArgMax"
397
+ attribute {
398
+ name: "axis"
399
+ i: -1
400
+ type: INT
401
+ }
402
+ attribute {
403
+ name: "keepdims"
404
+ i: 0
405
+ type: INT
406
+ }
407
+ }
408
+ node {
409
+ output: "onnx::Unsqueeze_824"
410
+ name: "Constant_106"
411
+ op_type: "Constant"
412
+ attribute {
413
+ name: "value"
414
+ t {
415
+ dims: 1
416
+ data_type: 7
417
+ data_location: 0
418
+ }
419
+ type: TENSOR
420
+ }
421
+ }
422
+ node {
423
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
424
+ input: "onnx::Unsqueeze_824"
425
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
426
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2"
427
+ op_type: "Unsqueeze"
428
+ }
429
+ node {
430
+ output: "onnx::Unsqueeze_826"
431
+ name: "Constant_108"
432
+ op_type: "Constant"
433
+ attribute {
434
+ name: "value"
435
+ t {
436
+ dims: 1
437
+ data_type: 7
438
+ data_location: 0
439
+ }
440
+ type: TENSOR
441
+ }
442
+ }
443
+ node {
444
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
445
+ input: "onnx::Unsqueeze_826"
446
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
447
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3"
448
+ op_type: "Unsqueeze"
449
+ }
450
+ node {
451
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
452
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
453
+ output: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
454
+ name: "/quantizer/vq/layers.0/_codebook/Concat_1"
455
+ op_type: "Concat"
456
+ attribute {
457
+ name: "axis"
458
+ i: 0
459
+ type: INT
460
+ }
461
+ }
462
+ node {
463
+ input: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
464
+ input: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
465
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
466
+ name: "/quantizer/vq/layers.0/_codebook/Reshape_1"
467
+ op_type: "Reshape"
468
+ attribute {
469
+ name: "allowzero"
470
+ i: 0
471
+ type: INT
472
+ }
473
+ }
474
+ node {
475
+ output: "/quantizer/vq/Constant_output_0"
476
+ name: "/quantizer/vq/Constant"
477
+ op_type: "Constant"
478
+ attribute {
479
+ name: "value"
480
+ t {
481
+ dims: 1
482
+ data_type: 7
483
+ data_location: 0
484
+ }
485
+ type: TENSOR
486
+ }
487
+ }
488
+ node {
489
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
490
+ input: "/quantizer/vq/Constant_output_0"
491
+ output: "/quantizer/vq/Unsqueeze_output_0"
492
+ name: "/quantizer/vq/Unsqueeze"
493
+ op_type: "Unsqueeze"
494
+ }
495
+ node {
496
+ input: "/quantizer/vq/Unsqueeze_output_0"
497
+ output: "/quantizer/vq/Concat_output_0"
498
+ name: "/quantizer/vq/Concat"
499
+ op_type: "Concat"
500
+ attribute {
501
+ name: "axis"
502
+ i: 0
503
+ type: INT
504
+ }
505
+ }
506
+ node {
507
+ input: "/quantizer/vq/Concat_output_0"
508
+ output: "/Transpose_output_0"
509
+ name: "/Transpose"
510
+ op_type: "Transpose"
511
+ attribute {
512
+ name: "perm"
513
+ ints: 1
514
+ ints: 0
515
+ ints: 2
516
+ type: INTS
517
+ }
518
+ }
519
+ node {
520
+ input: "/Transpose_output_0"
521
+ input: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
522
+ output: "/Gather_output_0"
523
+ name: "/Gather"
524
+ op_type: "Gather"
525
+ attribute {
526
+ name: "axis"
527
+ i: 0
528
+ type: INT
529
+ }
530
+ }
531
+ node {
532
+ input: "/Gather_output_0"
533
+ input: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
534
+ output: "/Gather_1_output_0"
535
+ name: "/Gather_1"
536
+ op_type: "Gather"
537
+ attribute {
538
+ name: "axis"
539
+ i: 0
540
+ type: INT
541
+ }
542
+ }
543
+ node {
544
+ input: "ref_bert"
545
+ output: "/Transpose_1_output_0"
546
+ name: "/Transpose_1"
547
+ op_type: "Transpose"
548
+ attribute {
549
+ name: "perm"
550
+ ints: 1
551
+ ints: 0
552
+ type: INTS
553
+ }
554
+ }
555
+ node {
556
+ input: "text_bert"
557
+ output: "/Transpose_2_output_0"
558
+ name: "/Transpose_2"
559
+ op_type: "Transpose"
560
+ attribute {
561
+ name: "perm"
562
+ ints: 1
563
+ ints: 0
564
+ type: INTS
565
+ }
566
+ }
567
+ node {
568
+ input: "/Transpose_1_output_0"
569
+ input: "/Transpose_2_output_0"
570
+ output: "/Concat_output_0"
571
+ name: "/Concat"
572
+ op_type: "Concat"
573
+ attribute {
574
+ name: "axis"
575
+ i: 1
576
+ type: INT
577
+ }
578
+ }
579
+ node {
580
+ input: "ref_seq"
581
+ input: "text_seq"
582
+ output: "/Concat_1_output_0"
583
+ name: "/Concat_1"
584
+ op_type: "Concat"
585
+ attribute {
586
+ name: "axis"
587
+ i: 1
588
+ type: INT
589
+ }
590
+ }
591
+ node {
592
+ output: "/Constant_output_0"
593
+ name: "/Constant"
594
+ op_type: "Constant"
595
+ attribute {
596
+ name: "value"
597
+ t {
598
+ dims: 1
599
+ data_type: 7
600
+ data_location: 0
601
+ }
602
+ type: TENSOR
603
+ }
604
+ }
605
+ node {
606
+ input: "/Concat_output_0"
607
+ input: "/Constant_output_0"
608
+ output: "/Unsqueeze_output_0"
609
+ name: "/Unsqueeze"
610
+ op_type: "Unsqueeze"
611
+ }
612
+ node {
613
+ output: "/Constant_1_output_0"
614
+ name: "/Constant_1"
615
+ op_type: "Constant"
616
+ attribute {
617
+ name: "value"
618
+ t {
619
+ dims: 1
620
+ data_type: 7
621
+ data_location: 0
622
+ }
623
+ type: TENSOR
624
+ }
625
+ }
626
+ node {
627
+ input: "/Gather_1_output_0"
628
+ input: "/Constant_1_output_0"
629
+ output: "prompts"
630
+ name: "/Unsqueeze_1"
631
+ op_type: "Unsqueeze"
632
+ }
633
+ node {
634
+ input: "encoder.ar_text_embedding.word_embeddings.weight"
635
+ input: "/Concat_1_output_0"
636
+ output: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
637
+ name: "/encoder/ar_text_embedding/word_embeddings/Gather"
638
+ op_type: "Gather"
639
+ }
640
+ node {
641
+ input: "/Unsqueeze_output_0"
642
+ output: "/encoder/Transpose_output_0"
643
+ name: "/encoder/Transpose"
644
+ op_type: "Transpose"
645
+ attribute {
646
+ name: "perm"
647
+ ints: 0
648
+ ints: 2
649
+ ints: 1
650
+ type: INTS
651
+ }
652
+ }
653
+ node {
654
+ input: "/encoder/Transpose_output_0"
655
+ input: "onnx::MatMul_1010"
656
+ output: "/encoder/bert_proj/MatMul_output_0"
657
+ name: "/encoder/bert_proj/MatMul"
658
+ op_type: "MatMul"
659
+ }
660
+ node {
661
+ input: "encoder.bert_proj.bias"
662
+ input: "/encoder/bert_proj/MatMul_output_0"
663
+ output: "/encoder/bert_proj/Add_output_0"
664
+ name: "/encoder/bert_proj/Add"
665
+ op_type: "Add"
666
+ }
667
+ node {
668
+ input: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
669
+ input: "/encoder/bert_proj/Add_output_0"
670
+ output: "/encoder/Add_output_0"
671
+ name: "/encoder/Add"
672
+ op_type: "Add"
673
+ }
674
+ node {
675
+ input: "/encoder/Add_output_0"
676
+ output: "/encoder/ar_text_position/Shape_output_0"
677
+ name: "/encoder/ar_text_position/Shape"
678
+ op_type: "Shape"
679
+ }
680
+ node {
681
+ output: "/encoder/ar_text_position/Constant_output_0"
682
+ name: "/encoder/ar_text_position/Constant"
683
+ op_type: "Constant"
684
+ attribute {
685
+ name: "value"
686
+ t {
687
+ data_type: 7
688
+ data_location: 0
689
+ }
690
+ type: TENSOR
691
+ }
692
+ }
693
+ node {
694
+ input: "/encoder/ar_text_position/Shape_output_0"
695
+ input: "/encoder/ar_text_position/Constant_output_0"
696
+ output: "/encoder/ar_text_position/Gather_output_0"
697
+ name: "/encoder/ar_text_position/Gather"
698
+ op_type: "Gather"
699
+ attribute {
700
+ name: "axis"
701
+ i: 0
702
+ type: INT
703
+ }
704
+ }
705
+ node {
706
+ output: "/encoder/ar_text_position/Constant_1_output_0"
707
+ name: "/encoder/ar_text_position/Constant_1"
708
+ op_type: "Constant"
709
+ attribute {
710
+ name: "value"
711
+ t {
712
+ data_type: 1
713
+ data_location: 0
714
+ }
715
+ type: TENSOR
716
+ }
717
+ }
718
+ node {
719
+ input: "/encoder/ar_text_position/Gather_output_0"
720
+ output: "/encoder/ar_text_position/Cast_output_0"
721
+ name: "/encoder/ar_text_position/Cast"
722
+ op_type: "Cast"
723
+ attribute {
724
+ name: "to"
725
+ i: 1
726
+ type: INT
727
+ }
728
+ }
729
+ node {
730
+ output: "/encoder/ar_text_position/Constant_2_output_0"
731
+ name: "/encoder/ar_text_position/Constant_2"
732
+ op_type: "Constant"
733
+ attribute {
734
+ name: "value"
735
+ t {
736
+ data_type: 1
737
+ data_location: 0
738
+ }
739
+ type: TENSOR
740
+ }
741
+ }
742
+ node {
743
+ input: "/encoder/ar_text_position/Constant_1_output_0"
744
+ input: "/encoder/ar_text_position/Cast_output_0"
745
+ input: "/encoder/ar_text_position/Constant_2_output_0"
746
+ output: "/encoder/ar_text_position/Range_output_0"
747
+ name: "/encoder/ar_text_position/Range"
748
+ op_type: "Range"
749
+ }
750
+ node {
751
+ output: "/encoder/ar_text_position/Constant_3_output_0"
752
+ name: "/encoder/ar_text_position/Constant_3"
753
+ op_type: "Constant"
754
+ attribute {
755
+ name: "value"
756
+ t {
757
+ dims: 1
758
+ data_type: 7
759
+ data_location: 0
760
+ }
761
+ type: TENSOR
762
+ }
763
+ }
764
+ node {
765
+ input: "/encoder/ar_text_position/Range_output_0"
766
+ input: "/encoder/ar_text_position/Constant_3_output_0"
767
+ output: "/encoder/ar_text_position/Unsqueeze_output_0"
768
+ name: "/encoder/ar_text_position/Unsqueeze"
769
+ op_type: "Unsqueeze"
770
+ }
771
+ node {
772
+ output: "onnx::Unsqueeze_860"
773
+ name: "Constant_140"
774
+ op_type: "Constant"
775
+ attribute {
776
+ name: "value"
777
+ t {
778
+ dims: 1
779
+ data_type: 7
780
+ data_location: 0
781
+ }
782
+ type: TENSOR
783
+ }
784
+ }
785
+ node {
786
+ input: "/encoder/ar_text_position/Gather_output_0"
787
+ input: "onnx::Unsqueeze_860"
788
+ output: "/encoder/ar_text_position/Unsqueeze_1_output_0"
789
+ name: "/encoder/ar_text_position/Unsqueeze_1"
790
+ op_type: "Unsqueeze"
791
+ }
792
+ node {
793
+ output: "/encoder/ar_text_position/Constant_4_output_0"
794
+ name: "/encoder/ar_text_position/Constant_4"
795
+ op_type: "Constant"
796
+ attribute {
797
+ name: "value"
798
+ t {
799
+ dims: 1
800
+ data_type: 7
801
+ data_location: 0
802
+ }
803
+ type: TENSOR
804
+ }
805
+ }
806
+ node {
807
+ input: "/encoder/ar_text_position/Unsqueeze_1_output_0"
808
+ input: "/encoder/ar_text_position/Constant_4_output_0"
809
+ output: "/encoder/ar_text_position/Concat_output_0"
810
+ name: "/encoder/ar_text_position/Concat"
811
+ op_type: "Concat"
812
+ attribute {
813
+ name: "axis"
814
+ i: 0
815
+ type: INT
816
+ }
817
+ }
818
+ node {
819
+ input: "/encoder/ar_text_position/Concat_output_0"
820
+ output: "/encoder/ar_text_position/ConstantOfShape_output_0"
821
+ name: "/encoder/ar_text_position/ConstantOfShape"
822
+ op_type: "ConstantOfShape"
823
+ attribute {
824
+ name: "value"
825
+ t {
826
+ dims: 1
827
+ data_type: 1
828
+ raw_data: "\000\000\000\000"
829
+ }
830
+ type: TENSOR
831
+ }
832
+ }
833
+ node {
834
+ output: "/encoder/ar_text_position/Constant_5_output_0"
835
+ name: "/encoder/ar_text_position/Constant_5"
836
+ op_type: "Constant"
837
+ attribute {
838
+ name: "value"
839
+ t {
840
+ dims: 256
841
+ data_type: 1
842
+ data_location: 0
843
+ }
844
+ type: TENSOR
845
+ }
846
+ }
847
+ node {
848
+ input: "/encoder/ar_text_position/Unsqueeze_output_0"
849
+ input: "/encoder/ar_text_position/Constant_5_output_0"
850
+ output: "/encoder/ar_text_position/Mul_output_0"
851
+ name: "/encoder/ar_text_position/Mul"
852
+ op_type: "Mul"
853
+ }
854
+ node {
855
+ input: "/encoder/ar_text_position/Mul_output_0"
856
+ output: "/encoder/ar_text_position/Sin_output_0"
857
+ name: "/encoder/ar_text_position/Sin"
858
+ op_type: "Sin"
859
+ }
860
+ node {
861
+ output: "/encoder/ar_text_position/Constant_6_output_0"
862
+ name: "/encoder/ar_text_position/Constant_6"
863
+ op_type: "Constant"
864
+ attribute {
865
+ name: "value"
866
+ t {
867
+ dims: 1
868
+ data_type: 7
869
+ data_location: 0
870
+ }
871
+ type: TENSOR
872
+ }
873
+ }
874
+ node {
875
+ output: "/encoder/ar_text_position/Constant_7_output_0"
876
+ name: "/encoder/ar_text_position/Constant_7"
877
+ op_type: "Constant"
878
+ attribute {
879
+ name: "value"
880
+ t {
881
+ dims: 1
882
+ data_type: 7
883
+ data_location: 0
884
+ }
885
+ type: TENSOR
886
+ }
887
+ }
888
+ node {
889
+ output: "/encoder/ar_text_position/Constant_8_output_0"
890
+ name: "/encoder/ar_text_position/Constant_8"
891
+ op_type: "Constant"
892
+ attribute {
893
+ name: "value"
894
+ t {
895
+ dims: 1
896
+ data_type: 7
897
+ data_location: 0
898
+ }
899
+ type: TENSOR
900
+ }
901
+ }
902
+ node {
903
+ output: "/encoder/ar_text_position/Constant_9_output_0"
904
+ name: "/encoder/ar_text_position/Constant_9"
905
+ op_type: "Constant"
906
+ attribute {
907
+ name: "value"
908
+ t {
909
+ dims: 1
910
+ data_type: 7
911
+ data_location: 0
912
+ }
913
+ type: TENSOR
914
+ }
915
+ }
916
+ node {
917
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
918
+ input: "/encoder/ar_text_position/Constant_7_output_0"
919
+ input: "/encoder/ar_text_position/Constant_8_output_0"
920
+ input: "/encoder/ar_text_position/Constant_6_output_0"
921
+ input: "/encoder/ar_text_position/Constant_9_output_0"
922
+ output: "/encoder/ar_text_position/Slice_output_0"
923
+ name: "/encoder/ar_text_position/Slice"
924
+ op_type: "Slice"
925
+ }
926
+ node {
927
+ input: "/encoder/ar_text_position/Slice_output_0"
928
+ output: "/encoder/ar_text_position/Shape_1_output_0"
929
+ name: "/encoder/ar_text_position/Shape_1"
930
+ op_type: "Shape"
931
+ }
932
+ node {
933
+ input: "/encoder/ar_text_position/Sin_output_0"
934
+ input: "/encoder/ar_text_position/Shape_1_output_0"
935
+ output: "/encoder/ar_text_position/Expand_output_0"
936
+ name: "/encoder/ar_text_position/Expand"
937
+ op_type: "Expand"
938
+ }
939
+ node {
940
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
941
+ output: "onnx::Gather_881"
942
+ name: "Shape_155"
943
+ op_type: "Shape"
944
+ }
945
+ node {
946
+ output: "onnx::Gather_882"
947
+ name: "Constant_156"
948
+ op_type: "Constant"
949
+ attribute {
950
+ name: "value"
951
+ t {
952
+ data_type: 7
953
+ data_location: 0
954
+ }
955
+ type: TENSOR
956
+ }
957
+ }
958
+ node {
959
+ input: "onnx::Gather_881"
960
+ input: "onnx::Gather_882"
961
+ output: "onnx::Cast_883"
962
+ name: "Gather_157"
963
+ op_type: "Gather"
964
+ attribute {
965
+ name: "axis"
966
+ i: 0
967
+ type: INT
968
+ }
969
+ }
970
+ node {
971
+ input: "onnx::Cast_883"
972
+ output: "onnx::Range_884"
973
+ name: "Cast_158"
974
+ op_type: "Cast"
975
+ attribute {
976
+ name: "to"
977
+ i: 7
978
+ type: INT
979
+ }
980
+ }
981
+ node {
982
+ output: "onnx::Range_885"
983
+ name: "Constant_159"
984
+ op_type: "Constant"
985
+ attribute {
986
+ name: "value"
987
+ t {
988
+ data_type: 7
989
+ data_location: 0
990
+ }
991
+ type: TENSOR
992
+ }
993
+ }
994
+ node {
995
+ output: "onnx::Range_886"
996
+ name: "Constant_160"
997
+ op_type: "Constant"
998
+ attribute {
999
+ name: "value"
1000
+ t {
1001
+ data_type: 7
1002
+ data_location: 0
1003
+ }
1004
+ type: TENSOR
1005
+ }
1006
+ }
1007
+ node {
1008
+ input: "onnx::Range_885"
1009
+ input: "onnx::Range_884"
1010
+ input: "onnx::Range_886"
1011
+ output: "onnx::Reshape_887"
1012
+ name: "Range_161"
1013
+ op_type: "Range"
1014
+ }
1015
+ node {
1016
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1017
+ output: "onnx::Gather_888"
1018
+ name: "Shape_162"
1019
+ op_type: "Shape"
1020
+ }
1021
+ node {
1022
+ output: "onnx::Gather_889"
1023
+ name: "Constant_163"
1024
+ op_type: "Constant"
1025
+ attribute {
1026
+ name: "value"
1027
+ t {
1028
+ data_type: 7
1029
+ data_location: 0
1030
+ }
1031
+ type: TENSOR
1032
+ }
1033
+ }
1034
+ node {
1035
+ input: "onnx::Gather_888"
1036
+ input: "onnx::Gather_889"
1037
+ output: "onnx::Cast_890"
1038
+ name: "Gather_164"
1039
+ op_type: "Gather"
1040
+ attribute {
1041
+ name: "axis"
1042
+ i: 0
1043
+ type: INT
1044
+ }
1045
+ }
1046
+ node {
1047
+ input: "onnx::Cast_890"
1048
+ output: "onnx::Range_891"
1049
+ name: "Cast_165"
1050
+ op_type: "Cast"
1051
+ attribute {
1052
+ name: "to"
1053
+ i: 7
1054
+ type: INT
1055
+ }
1056
+ }
1057
+ node {
1058
+ output: "onnx::Range_892"
1059
+ name: "Constant_166"
1060
+ op_type: "Constant"
1061
+ attribute {
1062
+ name: "value"
1063
+ t {
1064
+ data_type: 7
1065
+ data_location: 0
1066
+ }
1067
+ type: TENSOR
1068
+ }
1069
+ }
1070
+ node {
1071
+ output: "onnx::Range_893"
1072
+ name: "Constant_167"
1073
+ op_type: "Constant"
1074
+ attribute {
1075
+ name: "value"
1076
+ t {
1077
+ data_type: 7
1078
+ data_location: 0
1079
+ }
1080
+ type: TENSOR
1081
+ }
1082
+ }
1083
+ node {
1084
+ input: "onnx::Range_892"
1085
+ input: "onnx::Range_891"
1086
+ input: "onnx::Range_893"
1087
+ output: "onnx::Slice_894"
1088
+ name: "Range_168"
1089
+ op_type: "Range"
1090
+ }
1091
+ node {
1092
+ output: "/encoder/ar_text_position/Constant_10_output_0"
1093
+ name: "/encoder/ar_text_position/Constant_10"
1094
+ op_type: "Constant"
1095
+ attribute {
1096
+ name: "value"
1097
+ t {
1098
+ dims: 1
1099
+ data_type: 7
1100
+ data_location: 0
1101
+ }
1102
+ type: TENSOR
1103
+ }
1104
+ }
1105
+ node {
1106
+ output: "/encoder/ar_text_position/Constant_11_output_0"
1107
+ name: "/encoder/ar_text_position/Constant_11"
1108
+ op_type: "Constant"
1109
+ attribute {
1110
+ name: "value"
1111
+ t {
1112
+ dims: 1
1113
+ data_type: 7
1114
+ data_location: 0
1115
+ }
1116
+ type: TENSOR
1117
+ }
1118
+ }
1119
+ node {
1120
+ output: "/encoder/ar_text_position/Constant_12_output_0"
1121
+ name: "/encoder/ar_text_position/Constant_12"
1122
+ op_type: "Constant"
1123
+ attribute {
1124
+ name: "value"
1125
+ t {
1126
+ dims: 1
1127
+ data_type: 7
1128
+ data_location: 0
1129
+ }
1130
+ type: TENSOR
1131
+ }
1132
+ }
1133
+ node {
1134
+ output: "/encoder/ar_text_position/Constant_13_output_0"
1135
+ name: "/encoder/ar_text_position/Constant_13"
1136
+ op_type: "Constant"
1137
+ attribute {
1138
+ name: "value"
1139
+ t {
1140
+ dims: 1
1141
+ data_type: 7
1142
+ data_location: 0
1143
+ }
1144
+ type: TENSOR
1145
+ }
1146
+ }
1147
+ node {
1148
+ input: "onnx::Slice_894"
1149
+ input: "/encoder/ar_text_position/Constant_11_output_0"
1150
+ input: "/encoder/ar_text_position/Constant_12_output_0"
1151
+ input: "/encoder/ar_text_position/Constant_10_output_0"
1152
+ input: "/encoder/ar_text_position/Constant_13_output_0"
1153
+ output: "/encoder/ar_text_position/Slice_1_output_0"
1154
+ name: "/encoder/ar_text_position/Slice_1"
1155
+ op_type: "Slice"
1156
+ }
1157
+ node {
1158
+ output: "onnx::Reshape_905"
1159
+ name: "Constant_174"
1160
+ op_type: "Constant"
1161
+ attribute {
1162
+ name: "value"
1163
+ t {
1164
+ dims: 2
1165
+ data_type: 7
1166
+ data_location: 0
1167
+ }
1168
+ type: TENSOR
1169
+ }
1170
+ }
1171
+ node {
1172
+ input: "onnx::Reshape_887"
1173
+ input: "onnx::Reshape_905"
1174
+ output: "onnx::Add_906"
1175
+ name: "Reshape_175"
1176
+ op_type: "Reshape"
1177
+ attribute {
1178
+ name: "allowzero"
1179
+ i: 0
1180
+ type: INT
1181
+ }
1182
+ }
1183
+ node {
1184
+ input: "onnx::Add_906"
1185
+ input: "/encoder/ar_text_position/Slice_1_output_0"
1186
+ output: "/encoder/ar_text_position/Add_output_0"
1187
+ name: "/encoder/ar_text_position/Add"
1188
+ op_type: "Add"
1189
+ }
1190
+ node {
1191
+ input: "/encoder/ar_text_position/Add_output_0"
1192
+ output: "/encoder/ar_text_position/Shape_2_output_0"
1193
+ name: "/encoder/ar_text_position/Shape_2"
1194
+ op_type: "Shape"
1195
+ }
1196
+ node {
1197
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1198
+ output: "/encoder/ar_text_position/Shape_3_output_0"
1199
+ name: "/encoder/ar_text_position/Shape_3"
1200
+ op_type: "Shape"
1201
+ }
1202
+ node {
1203
+ input: "/encoder/ar_text_position/Shape_3_output_0"
1204
+ output: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1205
+ name: "/encoder/ar_text_position/ConstantOfShape_1"
1206
+ op_type: "ConstantOfShape"
1207
+ attribute {
1208
+ name: "value"
1209
+ t {
1210
+ dims: 1
1211
+ data_type: 7
1212
+ raw_data: "\001\000\000\000\000\000\000\000"
1213
+ }
1214
+ type: TENSOR
1215
+ }
1216
+ }
1217
+ node {
1218
+ output: "/encoder/ar_text_position/Constant_14_output_0"
1219
+ name: "/encoder/ar_text_position/Constant_14"
1220
+ op_type: "Constant"
1221
+ attribute {
1222
+ name: "value"
1223
+ t {
1224
+ data_type: 7
1225
+ data_location: 0
1226
+ }
1227
+ type: TENSOR
1228
+ }
1229
+ }
1230
+ node {
1231
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1232
+ input: "/encoder/ar_text_position/Constant_14_output_0"
1233
+ output: "/encoder/ar_text_position/Mul_1_output_0"
1234
+ name: "/encoder/ar_text_position/Mul_1"
1235
+ op_type: "Mul"
1236
+ }
1237
+ node {
1238
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1239
+ input: "/encoder/ar_text_position/Mul_1_output_0"
1240
+ output: "/encoder/ar_text_position/Equal_output_0"
1241
+ name: "/encoder/ar_text_position/Equal"
1242
+ op_type: "Equal"
1243
+ }
1244
+ node {
1245
+ input: "/encoder/ar_text_position/Equal_output_0"
1246
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1247
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1248
+ output: "/encoder/ar_text_position/Where_output_0"
1249
+ name: "/encoder/ar_text_position/Where"
1250
+ op_type: "Where"
1251
+ }
1252
+ node {
1253
+ input: "onnx::Add_906"
1254
+ input: "/encoder/ar_text_position/Where_output_0"
1255
+ output: "/encoder/ar_text_position/Expand_1_output_0"
1256
+ name: "/encoder/ar_text_position/Expand_1"
1257
+ op_type: "Expand"
1258
+ }
1259
+ node {
1260
+ output: "/encoder/ar_text_position/Constant_15_output_0"
1261
+ name: "/encoder/ar_text_position/Constant_15"
1262
+ op_type: "Constant"
1263
+ attribute {
1264
+ name: "value"
1265
+ t {
1266
+ dims: 1
1267
+ data_type: 7
1268
+ data_location: 0
1269
+ }
1270
+ type: TENSOR
1271
+ }
1272
+ }
1273
+ node {
1274
+ input: "/encoder/ar_text_position/Expand_1_output_0"
1275
+ input: "/encoder/ar_text_position/Constant_15_output_0"
1276
+ output: "/encoder/ar_text_position/Unsqueeze_2_output_0"
1277
+ name: "/encoder/ar_text_position/Unsqueeze_2"
1278
+ op_type: "Unsqueeze"
1279
+ }
1280
+ node {
1281
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1282
+ output: "/encoder/ar_text_position/Shape_4_output_0"
1283
+ name: "/encoder/ar_text_position/Shape_4"
1284
+ op_type: "Shape"
1285
+ }
1286
+ node {
1287
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1288
+ output: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1289
+ name: "/encoder/ar_text_position/ConstantOfShape_2"
1290
+ op_type: "ConstantOfShape"
1291
+ attribute {
1292
+ name: "value"
1293
+ t {
1294
+ dims: 1
1295
+ data_type: 7
1296
+ raw_data: "\001\000\000\000\000\000\000\000"
1297
+ }
1298
+ type: TENSOR
1299
+ }
1300
+ }
1301
+ node {
1302
+ output: "/encoder/ar_text_position/Constant_16_output_0"
1303
+ name: "/encoder/ar_text_position/Constant_16"
1304
+ op_type: "Constant"
1305
+ attribute {
1306
+ name: "value"
1307
+ t {
1308
+ data_type: 7
1309
+ data_location: 0
1310
+ }
1311
+ type: TENSOR
1312
+ }
1313
+ }
1314
+ node {
1315
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1316
+ input: "/encoder/ar_text_position/Constant_16_output_0"
1317
+ output: "/encoder/ar_text_position/Mul_2_output_0"
1318
+ name: "/encoder/ar_text_position/Mul_2"
1319
+ op_type: "Mul"
1320
+ }
1321
+ node {
1322
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1323
+ input: "/encoder/ar_text_position/Mul_2_output_0"
1324
+ output: "/encoder/ar_text_position/Equal_1_output_0"
1325
+ name: "/encoder/ar_text_position/Equal_1"
1326
+ op_type: "Equal"
1327
+ }
1328
+ node {
1329
+ input: "/encoder/ar_text_position/Equal_1_output_0"
1330
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1331
+ input: "/encoder/ar_text_position/Shape_2_output_0"
+ output: "/encoder/ar_text_position/Where_1_output_0"
+ name: "/encoder/ar_text_position/Where_1"
+ op_type: "Where"
+ }
+ node {
+ input: "/encoder/ar_text_position/Slice_1_output_0"
+ input: "/encoder/ar_text_position/Where_1_output_0"
+ output: "/encoder/ar_text_position/Expand_2_output_0"
+ name: "/encoder/ar_text_position/Expand_2"
+ op_type: "Expand"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_17_output_0"
+ name: "/encoder/ar_text_position/Constant_17"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_2_output_0"
+ input: "/encoder/ar_text_position/Constant_17_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_3_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_3"
+ op_type: "Unsqueeze"
+ }
+ node {
+ input: "/encoder/ar_text_position/Unsqueeze_2_output_0"
+ input: "/encoder/ar_text_position/Unsqueeze_3_output_0"
+ output: "/encoder/ar_text_position/Concat_1_output_0"
+ name: "/encoder/ar_text_position/Concat_1"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: -1
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
+ output: "/encoder/ar_text_position/Shape_5_output_0"
+ name: "/encoder/ar_text_position/Shape_5"
+ op_type: "Shape"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_18_output_0"
+ name: "/encoder/ar_text_position/Constant_18"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_19_output_0"
+ name: "/encoder/ar_text_position/Constant_19"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_20_output_0"
+ name: "/encoder/ar_text_position/Constant_20"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_5_output_0"
+ input: "/encoder/ar_text_position/Constant_19_output_0"
+ input: "/encoder/ar_text_position/Constant_20_output_0"
+ input: "/encoder/ar_text_position/Constant_18_output_0"
+ output: "/encoder/ar_text_position/Slice_2_output_0"
+ name: "/encoder/ar_text_position/Slice_2"
+ op_type: "Slice"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_2_output_0"
+ input: "/encoder/ar_text_position/Slice_2_output_0"
+ output: "/encoder/ar_text_position/Concat_2_output_0"
+ name: "/encoder/ar_text_position/Concat_2"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_output_0"
+ input: "/encoder/ar_text_position/Concat_2_output_0"
+ output: "/encoder/ar_text_position/Reshape_output_0"
+ name: "/encoder/ar_text_position/Reshape"
+ op_type: "Reshape"
+ attribute {
+ name: "allowzero"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
+ input: "/encoder/ar_text_position/Concat_1_output_0"
+ input: "/encoder/ar_text_position/Reshape_output_0"
+ output: "/encoder/ar_text_position/ScatterND_output_0"
+ name: "/encoder/ar_text_position/ScatterND"
+ op_type: "ScatterND"
+ }
+ node {
+ input: "/encoder/ar_text_position/Mul_output_0"
+ output: "/encoder/ar_text_position/Cos_output_0"
+ name: "/encoder/ar_text_position/Cos"
+ op_type: "Cos"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_21_output_0"
+ name: "/encoder/ar_text_position/Constant_21"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_22_output_0"
+ name: "/encoder/ar_text_position/Constant_22"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_23_output_0"
+ name: "/encoder/ar_text_position/Constant_23"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_24_output_0"
+ name: "/encoder/ar_text_position/Constant_24"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ input: "/encoder/ar_text_position/Constant_22_output_0"
+ input: "/encoder/ar_text_position/Constant_23_output_0"
+ input: "/encoder/ar_text_position/Constant_21_output_0"
+ input: "/encoder/ar_text_position/Constant_24_output_0"
+ output: "/encoder/ar_text_position/Slice_3_output_0"
+ name: "/encoder/ar_text_position/Slice_3"
+ op_type: "Slice"
+ }
+ node {
+ input: "/encoder/ar_text_position/Slice_3_output_0"
+ output: "/encoder/ar_text_position/Shape_6_output_0"
+ name: "/encoder/ar_text_position/Shape_6"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Cos_output_0"
+ input: "/encoder/ar_text_position/Shape_6_output_0"
+ output: "/encoder/ar_text_position/Expand_3_output_0"
+ name: "/encoder/ar_text_position/Expand_3"
+ op_type: "Expand"
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ output: "onnx::Gather_948"
+ name: "Shape_213"
+ op_type: "Shape"
+ }
+ node {
+ output: "onnx::Gather_949"
+ name: "Constant_214"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Gather_948"
+ input: "onnx::Gather_949"
+ output: "onnx::Cast_950"
+ name: "Gather_215"
+ op_type: "Gather"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "onnx::Cast_950"
+ output: "onnx::Range_951"
+ name: "Cast_216"
+ op_type: "Cast"
+ attribute {
+ name: "to"
+ i: 7
+ type: INT
+ }
+ }
+ node {
+ output: "onnx::Range_952"
+ name: "Constant_217"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "onnx::Range_953"
+ name: "Constant_218"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Range_952"
+ input: "onnx::Range_951"
+ input: "onnx::Range_953"
+ output: "onnx::Reshape_954"
+ name: "Range_219"
+ op_type: "Range"
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ output: "onnx::Gather_955"
+ name: "Shape_220"
+ op_type: "Shape"
+ }
+ node {
+ output: "onnx::Gather_956"
+ name: "Constant_221"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Gather_955"
+ input: "onnx::Gather_956"
+ output: "onnx::Cast_957"
+ name: "Gather_222"
+ op_type: "Gather"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "onnx::Cast_957"
+ output: "onnx::Range_958"
+ name: "Cast_223"
+ op_type: "Cast"
+ attribute {
+ name: "to"
+ i: 7
+ type: INT
+ }
+ }
+ node {
+ output: "onnx::Range_959"
+ name: "Constant_224"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "onnx::Range_960"
+ name: "Constant_225"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Range_959"
+ input: "onnx::Range_958"
+ input: "onnx::Range_960"
+ output: "onnx::Slice_961"
+ name: "Range_226"
+ op_type: "Range"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_25_output_0"
+ name: "/encoder/ar_text_position/Constant_25"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_26_output_0"
+ name: "/encoder/ar_text_position/Constant_26"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_27_output_0"
+ name: "/encoder/ar_text_position/Constant_27"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_28_output_0"
+ name: "/encoder/ar_text_position/Constant_28"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Slice_961"
+ input: "/encoder/ar_text_position/Constant_26_output_0"
+ input: "/encoder/ar_text_position/Constant_27_output_0"
+ input: "/encoder/ar_text_position/Constant_25_output_0"
+ input: "/encoder/ar_text_position/Constant_28_output_0"
+ output: "/encoder/ar_text_position/Slice_4_output_0"
+ name: "/encoder/ar_text_position/Slice_4"
+ op_type: "Slice"
+ }
+ node {
+ output: "onnx::Reshape_972"
+ name: "Constant_232"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 2
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Reshape_954"
+ input: "onnx::Reshape_972"
+ output: "onnx::Add_973"
+ name: "Reshape_233"
+ op_type: "Reshape"
+ attribute {
+ name: "allowzero"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "onnx::Add_973"
+ input: "/encoder/ar_text_position/Slice_4_output_0"
+ output: "/encoder/ar_text_position/Add_1_output_0"
+ name: "/encoder/ar_text_position/Add_1"
+ op_type: "Add"
+ }
+ node {
+ input: "/encoder/ar_text_position/Add_1_output_0"
+ output: "/encoder/ar_text_position/Shape_7_output_0"
+ name: "/encoder/ar_text_position/Shape_7"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Shape_8_output_0"
+ name: "/encoder/ar_text_position/Shape_8"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_8_output_0"
+ output: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
+ name: "/encoder/ar_text_position/ConstantOfShape_3"
+ op_type: "ConstantOfShape"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ raw_data: "\001\000\000\000\000\000\000\000"
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_29_output_0"
+ name: "/encoder/ar_text_position/Constant_29"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
+ input: "/encoder/ar_text_position/Constant_29_output_0"
+ output: "/encoder/ar_text_position/Mul_3_output_0"
+ name: "/encoder/ar_text_position/Mul_3"
+ op_type: "Mul"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ input: "/encoder/ar_text_position/Mul_3_output_0"
+ output: "/encoder/ar_text_position/Equal_2_output_0"
+ name: "/encoder/ar_text_position/Equal_2"
+ op_type: "Equal"
+ }
+ node {
+ input: "/encoder/ar_text_position/Equal_2_output_0"
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Where_2_output_0"
+ name: "/encoder/ar_text_position/Where_2"
+ op_type: "Where"
+ }
+ node {
+ input: "onnx::Add_973"
+ input: "/encoder/ar_text_position/Where_2_output_0"
+ output: "/encoder/ar_text_position/Expand_4_output_0"
+ name: "/encoder/ar_text_position/Expand_4"
+ op_type: "Expand"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_30_output_0"
+ name: "/encoder/ar_text_position/Constant_30"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_4_output_0"
+ input: "/encoder/ar_text_position/Constant_30_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_4_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_4"
+ op_type: "Unsqueeze"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Shape_9_output_0"
+ name: "/encoder/ar_text_position/Shape_9"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_9_output_0"
+ output: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
+ name: "/encoder/ar_text_position/ConstantOfShape_4"
+ op_type: "ConstantOfShape"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ raw_data: "\001\000\000\000\000\000\000\000"
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_31_output_0"
+ name: "/encoder/ar_text_position/Constant_31"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
+ input: "/encoder/ar_text_position/Constant_31_output_0"
+ output: "/encoder/ar_text_position/Mul_4_output_0"
+ name: "/encoder/ar_text_position/Mul_4"
+ op_type: "Mul"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ input: "/encoder/ar_text_position/Mul_4_output_0"
+ output: "/encoder/ar_text_position/Equal_3_output_0"
+ name: "/encoder/ar_text_position/Equal_3"
+ op_type: "Equal"
+ }
+ node {
+ input: "/encoder/ar_text_position/Equal_3_output_0"
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Where_3_output_0"
+ name: "/encoder/ar_text_position/Where_3"
+ op_type: "Where"
+ }
+ node {
+ input: "/encoder/ar_text_position/Slice_4_output_0"
+ input: "/encoder/ar_text_position/Where_3_output_0"
+ output: "/encoder/ar_text_position/Expand_5_output_0"
+ name: "/encoder/ar_text_position/Expand_5"
+ op_type: "Expand"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_32_output_0"
+ name: "/encoder/ar_text_position/Constant_32"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_5_output_0"
+ input: "/encoder/ar_text_position/Constant_32_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_5_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_5"
+ op_type: "Unsqueeze"
+ }
+ node {
+ input: "/encoder/ar_text_position/Unsqueeze_4_output_0"
+ input: "/encoder/ar_text_position/Unsqueeze_5_output_0"
+ output: "/encoder/ar_text_position/Concat_3_output_0"
+ name: "/encoder/ar_text_position/Concat_3"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: -1
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ output: "/encoder/ar_text_position/Shape_10_output_0"
+ name: "/encoder/ar_text_position/Shape_10"
+ op_type: "Shape"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_33_output_0"
+ name: "/encoder/ar_text_position/Constant_33"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_34_output_0"
+ name: "/encoder/ar_text_position/Constant_34"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_35_output_0"
+ name: "/encoder/ar_text_position/Constant_35"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_10_output_0"
+ input: "/encoder/ar_text_position/Constant_34_output_0"
+ input: "/encoder/ar_text_position/Constant_35_output_0"
+ input: "/encoder/ar_text_position/Constant_33_output_0"
+ output: "/encoder/ar_text_position/Slice_5_output_0"
+ name: "/encoder/ar_text_position/Slice_5"
+ op_type: "Slice"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ input: "/encoder/ar_text_position/Slice_5_output_0"
+ output: "/encoder/ar_text_position/Concat_4_output_0"
+ name: "/encoder/ar_text_position/Concat_4"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_3_output_0"
+ input: "/encoder/ar_text_position/Concat_4_output_0"
+ output: "/encoder/ar_text_position/Reshape_1_output_0"
+ name: "/encoder/ar_text_position/Reshape_1"
+ op_type: "Reshape"
+ attribute {
+ name: "allowzero"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ input: "/encoder/ar_text_position/Concat_3_output_0"
+ input: "/encoder/ar_text_position/Reshape_1_output_0"
+ output: "/encoder/ar_text_position/ScatterND_1_output_0"
+ name: "/encoder/ar_text_position/ScatterND_1"
+ op_type: "ScatterND"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_36_output_0"
+ name: "/encoder/ar_text_position/Constant_36"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_1_output_0"
+ input: "/encoder/ar_text_position/Constant_36_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_6_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_6"
+ op_type: "Unsqueeze"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_37_output_0"
+ name: "/encoder/ar_text_position/Constant_37"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 1
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/Add_output_0"
+ input: "/encoder/ar_text_position/Constant_37_output_0"
+ output: "/encoder/ar_text_position/Mul_5_output_0"
+ name: "/encoder/ar_text_position/Mul_5"
+ op_type: "Mul"
+ }
+ node {
+ input: "encoder.ar_text_position.alpha"
+ input: "/encoder/ar_text_position/Unsqueeze_6_output_0"
+ output: "/encoder/ar_text_position/Mul_6_output_0"
+ name: "/encoder/ar_text_position/Mul_6"
+ op_type: "Mul"
+ }
+ node {
+ input: "/encoder/ar_text_position/Mul_5_output_0"
+ input: "/encoder/ar_text_position/Mul_6_output_0"
+ output: "x"
+ name: "/encoder/ar_text_position/Add_2"
+ op_type: "Add"
+ }
+ initializer {
+ dims: 732
+ dims: 512
+ data_type: 1
+ name: "encoder.ar_text_embedding.word_embeddings.weight"
+ }
+ initializer {
+ dims: 512
+ data_type: 1
+ name: "encoder.bert_proj.bias"
+ }
+ initializer {
+ dims: 1
+ data_type: 1
+ name: "encoder.ar_text_position.alpha"
+ }
+ initializer {
+ dims: 768
+ dims: 768
+ dims: 2
+ data_type: 1
+ name: "vits.ssl_proj.weight"
+ }
+ initializer {
+ dims: 768
+ data_type: 1
+ name: "vits.ssl_proj.bias"
+ }
+ initializer {
+ dims: 768
+ dims: 1024
+ data_type: 1
+ name: "onnx::MatMul_1009"
+ }
+ initializer {
+ dims: 1024
+ dims: 512
+ data_type: 1
+ name: "onnx::MatMul_1010"
+ }
+ input {
+ name: "ref_seq"
+ type {
+ tensor_type {
+ elem_type: 7
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "ref_length"
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "text_seq"
+ type {
+ tensor_type {
+ elem_type: 7
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "text_length"
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "ref_bert"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_param: "ref_length"
+ }
+ dim {
+ dim_value: 1024
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "text_bert"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_param: "text_length"
+ }
+ dim {
+ dim_value: 1024
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "ssl_content"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_value: 768
+ }
+ dim {
+ dim_param: "ssl_length"
+ }
+ }
+ }
+ }
+ }
+ output {
+ name: "x"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "Addx_dim_1"
+ }
+ dim {
+ dim_param: "Addx_dim_2"
+ }
+ }
+ }
+ }
+ }
+ output {
+ name: "prompts"
+ type {
+ tensor_type {
+ elem_type: 7
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "Unsqueezeprompts_dim_1"
+ }
+ }
+ }
+ }
+ }
+ opset_import {
+ domain: ""
+ version: 17
+ }
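For reference, the input/output signature declared above (ref_seq, text_seq, ref_bert, text_bert, ssl_content -> x, prompts) can be exercised directly with onnxruntime. The sketch below is illustrative only and is not part of this repository: the model path, the dummy sequence lengths, and the zero/random placeholder tensors are all assumptions; in actual use these tensors would come from the text front-end, the BERT features, and the cnhubert model shipped alongside this encoder. ONNX elem_type 7 is int64 and elem_type 1 is float32, matching the declarations in this prototxt.

# Minimal sketch (assumed usage, not repository code) of feeding the
# t2s_encoder interface declared above with placeholder tensors.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("t2s_encoder.onnx")  # assumed local path

ref_len, text_len, ssl_len = 24, 16, 96  # arbitrary dummy lengths
feeds = {
    # Token IDs stay below 732, the row count of
    # encoder.ar_text_embedding.word_embeddings.weight above.
    "ref_seq": np.random.randint(0, 732, size=(1, ref_len), dtype=np.int64),
    "text_seq": np.random.randint(0, 732, size=(1, text_len), dtype=np.int64),
    "ref_bert": np.zeros((ref_len, 1024), dtype=np.float32),
    "text_bert": np.zeros((text_len, 1024), dtype=np.float32),
    "ssl_content": np.zeros((1, 768, ssl_len), dtype=np.float32),
}
x, prompts = sess.run(["x", "prompts"], feeds)
print(x.shape, x.dtype)            # float32, dims per "Addx_dim_*" above
print(prompts.shape, prompts.dtype)  # int64, dims per "Unsqueezeprompts_dim_1"

With placeholder inputs the outputs are numerically meaningless; the point is only that the dtypes and the dynamic dimensions (ref_length, text_length, ssl_length) line up with the graph's declared signature.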