niobures committed
Commit 9479f69 · verified · 1 Parent(s): a02fb17

GPT-SoVITS (models_onnx)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +4 -0
  2. models/ailia-models/GPT-SoVITS/cnhubert.onnx +3 -0
  3. models/ailia-models/GPT-SoVITS/cnhubert.onnx.prototxt +0 -0
  4. models/ailia-models/GPT-SoVITS/code/LICENSE +21 -0
  5. models/ailia-models/GPT-SoVITS/code/README.md +58 -0
  6. models/ailia-models/GPT-SoVITS/code/colab.ipynb +0 -0
  7. models/ailia-models/GPT-SoVITS/code/gpt-sovits.py +383 -0
  8. models/ailia-models/GPT-SoVITS/code/reference_audio_captured_by_ax.wav +3 -0
  9. models/ailia-models/GPT-SoVITS/code/requirements.txt +6 -0
  10. models/ailia-models/GPT-SoVITS/code/text/__init__.py +15 -0
  11. models/ailia-models/GPT-SoVITS/code/text/cmudict.rep +0 -0
  12. models/ailia-models/GPT-SoVITS/code/text/cmudict_cache.pickle +3 -0
  13. models/ailia-models/GPT-SoVITS/code/text/english.py +107 -0
  14. models/ailia-models/GPT-SoVITS/code/text/japanese.py +191 -0
  15. models/ailia-models/GPT-SoVITS/code/text/symbols.py +401 -0
  16. models/ailia-models/GPT-SoVITS/source.txt +26 -0
  17. models/ailia-models/GPT-SoVITS/t2s_encoder.onnx +3 -0
  18. models/ailia-models/GPT-SoVITS/t2s_encoder.onnx.prototxt +2816 -0
  19. models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx +3 -0
  20. models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx.prototxt +0 -0
  21. models/ailia-models/GPT-SoVITS/t2s_sdec.onnx +3 -0
  22. models/ailia-models/GPT-SoVITS/t2s_sdec.onnx.prototxt +0 -0
  23. models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx +3 -0
  24. models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx.prototxt +0 -0
  25. models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx +3 -0
  26. models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx.prototxt +0 -0
  27. models/ailia-models/GPT-SoVITS/vits.onnx +3 -0
  28. models/ailia-models/GPT-SoVITS/vits.onnx.prototxt +0 -0
  29. models/ailia-models/GPT-SoVITS2/cnhubert.onnx +3 -0
  30. models/ailia-models/GPT-SoVITS2/cnhubert.onnx.prototxt +0 -0
  31. models/ailia-models/GPT-SoVITS2/code/LICENSE +21 -0
  32. models/ailia-models/GPT-SoVITS2/code/README.md +53 -0
  33. models/ailia-models/GPT-SoVITS2/code/gpt-sovits-v2.py +632 -0
  34. models/ailia-models/GPT-SoVITS2/code/reference_audio_captured_by_ax.wav +3 -0
  35. models/ailia-models/GPT-SoVITS2/code/requirements.txt +7 -0
  36. models/ailia-models/GPT-SoVITS2/code/text/__init__.py +15 -0
  37. models/ailia-models/GPT-SoVITS2/code/text/cleaner.py +32 -0
  38. models/ailia-models/GPT-SoVITS2/code/text/cmudict-fast.rep +0 -0
  39. models/ailia-models/GPT-SoVITS2/code/text/cmudict.rep +0 -0
  40. models/ailia-models/GPT-SoVITS2/code/text/engdict-hot.rep +3 -0
  41. models/ailia-models/GPT-SoVITS2/code/text/english.py +393 -0
  42. models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/user.dict +0 -0
  43. models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.csv +1 -0
  44. models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.md5 +1 -0
  45. models/ailia-models/GPT-SoVITS2/code/text/japanese.py +207 -0
  46. models/ailia-models/GPT-SoVITS2/code/text/namedict_cache.pickle +3 -0
  47. models/ailia-models/GPT-SoVITS2/code/text/symbols2.py +785 -0
  48. models/ailia-models/GPT-SoVITS2/source.txt +18 -0
  49. models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx +3 -0
  50. models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx.prototxt +2293 -0
.gitattributes CHANGED
@@ -34,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  GPT-SoVITS.[[:space:]]A[[:space:]]Zero-Shot[[:space:]]Speech[[:space:]]Synthesis[[:space:]]Model[[:space:]]with[[:space:]]Customizable[[:space:]]Fine-Tuning.pdf filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS/code/reference_audio_captured_by_ax.wav filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS2/code/reference_audio_captured_by_ax.wav filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS3/code/reference_audio_captured_by_ax.wav filter=lfs diff=lfs merge=lfs -text
+ models/ailia-models/GPT-SoVITS3/code/text/ja_userdic/userdict.csv filter=lfs diff=lfs merge=lfs -text
models/ailia-models/GPT-SoVITS/cnhubert.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:558e4aabf7a7d1ef8ad89c0983a4a6413f9f4489232a35b4c1d455575f6cc242
+ size 377745020
models/ailia-models/GPT-SoVITS/cnhubert.onnx.prototxt ADDED
The diff for this file is too large to render.
models/ailia-models/GPT-SoVITS/code/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 RVC-Boss
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
models/ailia-models/GPT-SoVITS/code/README.md ADDED
@@ -0,0 +1,58 @@
+ # GPT-SoVITS
+
+ ### Input
+ - Text to synthesize, plus a reference audio clip and its reference text for voice cloning
+
+ ### Output
+ The synthesized voice is written as a .wav file whose path is defined by `SAVE_WAV_PATH` in `gpt-sovits.py`.
+
+ ### Requirements
+ This model requires pyopenjtalk for g2p.
+
+ ```
+ pip3 install -r requirements.txt
+ ```
+
+ ### Usage
+ The onnx and prototxt files are downloaded automatically on the first run; an Internet connection is required during the download.
+
+ To run with the sample sentence and sample audio:
+ ```
+ python3 gpt-sovits.py
+ ```
+
+ To run with an audio prompt:
+
+ ```
+ python3 gpt-sovits.py -i "音声合成のテストを行なっています。" --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。"
+ ```
+
+ To run for English:
+
+ ```
+ python3 gpt-sovits.py -i "Hello world. We are testing speech synthesis." --text_language en --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。" --ref_language ja
+ ```
+
+ ### Reference
+ [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+
+ ### Framework
+ PyTorch 2.1.2
+
+ ### Model Format
+ ONNX opset = 17
+
+ ### Netron
+
+ #### Normal model
+
+ - [cnhubert.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/cnhubert.onnx.prototxt)
+ - [t2s_encoder.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_encoder.onnx.prototxt)
+ - [t2s_fsdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_fsdec.onnx.prototxt)
+ - [t2s_sdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.onnx.prototxt)
+ - [vits.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/vits.onnx.prototxt)
+
+ #### Optimized model
+
+ - [t2s_sdec.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt.onnx.prototxt)
+ - [t2s_sdec.opt2.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt2.onnx.prototxt)
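
Taken together, the files in this commit form a five-model pipeline: cnhubert extracts SSL features from the reference audio, t2s_encoder fuses the phoneme sequences with those features, t2s_fsdec runs the first decoding step, t2s_sdec is run autoregressively to emit semantic tokens, and vits renders the waveform. The sketch below condenses the call chain of `gpt-sovits.py` (shown further down) into plain onnxruntime calls; the tensor names, shapes, and constants are taken from that script, the rest is illustrative and omits the early-stop check and the ailia blob-copy path.

```
# Minimal sketch of the GPT-SoVITS ONNX pipeline, assuming the five model
# files from this commit sit in the working directory.
import numpy as np
import onnxruntime

ssl = onnxruntime.InferenceSession("cnhubert.onnx")
enc = onnxruntime.InferenceSession("t2s_encoder.onnx")
fsdec = onnxruntime.InferenceSession("t2s_fsdec.onnx")
sdec = onnxruntime.InferenceSession("t2s_sdec.onnx")
vits = onnxruntime.InferenceSession("vits.onnx")

def synthesize(ref_seq, text_seq, ref_audio_16k, ref_audio_32k):
    # Empty BERT features are used for ja/en, as in the script.
    ref_bert = np.zeros((ref_seq.shape[1], 1024), dtype=np.float32)
    text_bert = np.zeros((text_seq.shape[1], 1024), dtype=np.float32)
    sampling = {"top_k": np.array([5], dtype=np.int64),
                "top_p": np.array([1.0], dtype=np.float32),
                "temperature": np.array([1.0], dtype=np.float32),
                "repetition_penalty": np.array([1.35], dtype=np.float32)}

    ssl_content = ssl.run(None, {"ref_audio_16k": ref_audio_16k})[0]
    x, prompts = enc.run(None, {"ref_seq": ref_seq, "text_seq": text_seq,
                                "ref_bert": ref_bert, "text_bert": text_bert,
                                "ssl_content": ssl_content})
    y, k, v, y_emb, x_example = fsdec.run(None, {"x": x, "prompts": prompts, **sampling})
    for idx in range(1, 1500):  # autoregressive semantic-token decoding
        y, k, v, y_emb, logits, samples = sdec.run(None, {
            "iy": y, "ik": k, "iv": v, "iy_emb": y_emb,
            "ix_example": x_example, **sampling})
        if np.argmax(logits, axis=-1)[0] == 1024 or samples[0, 0] == 1024:  # EOS
            break
    y[0, -1] = 0  # zero out the trailing EOS token, as the script does
    pred_semantic = y[np.newaxis, :, -idx:-1]
    audio = vits.run(None, {"text_seq": text_seq, "pred_semantic": pred_semantic,
                            "ref_audio": ref_audio_32k})[0]
    return audio  # float waveform at 32 kHz
```

The ailia path in the actual script additionally reuses the decoder's KV-cache blobs between iterations (`copy_blob_data`) instead of feeding k/v back through host memory.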
models/ailia-models/GPT-SoVITS/code/colab.ipynb ADDED
The diff for this file is too large to render.
models/ailia-models/GPT-SoVITS/code/gpt-sovits.py ADDED
@@ -0,0 +1,383 @@
+ import time
+ import sys
+ import platform
+
+ import numpy as np
+ import soundfile
+ import librosa
+
+ # import original modules
+ sys.path.append('../../util')
+ from arg_utils import get_base_parser, update_parser, get_savepath  # noqa: E402
+ from model_utils import check_and_download_models  # noqa: E402
+
+ # logger
+ from logging import getLogger  # noqa: E402
+ logger = getLogger(__name__)
+
+ from text import cleaned_text_to_sequence
+
+ # ======================
+ # PARAMETERS
+ # ======================
+
+ SAVE_WAV_PATH = 'output.wav'
+ REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/gpt-sovits/'
+
+ # ======================
+ # Argument Parser Config
+ # ======================
+
+ parser = get_base_parser('GPT-SoVITS', None, SAVE_WAV_PATH)
+ # overwrite
+ parser.add_argument(
+     '--input', '-i', metavar='TEXT', default="ax株式会社ではAIの実用化のための技術を開発しています。",
+     help='input text'
+ )
+ parser.add_argument(
+     '--text_language', '-tl',
+     default='ja',
+     help='[ja, en]'
+ )
+ parser.add_argument(
+     '--ref_audio', '-ra', metavar='TEXT', default="reference_audio_captured_by_ax.wav",
+     help='ref audio'
+ )
+ parser.add_argument(
+     '--ref_text', '-rt', metavar='TEXT', default="水をマレーシアから買わなくてはならない。",
+     help='ref text'
+ )
+ parser.add_argument(
+     '--ref_language', '-rl',
+     default='ja',
+     help='[ja, en]'
+ )
+ parser.add_argument(
+     '--onnx', action='store_true',
+     help='use onnx runtime'
+ )
+ parser.add_argument(
+     '--normal', action='store_true',
+     help='use normal model'
+ )
+ parser.add_argument(
+     '--profile', action='store_true',
+     help='use profile mode'
+ )
+ parser.add_argument(
+     '--ailia_voice', action='store_true',
+     help='use ailia voice for G2P'
+ )
+ args = update_parser(parser, check_input_type=False)
+
+ WEIGHT_PATH_SSL = 'cnhubert.onnx'
+ WEIGHT_PATH_T2S_ENCODER = 't2s_encoder.onnx'
+ WEIGHT_PATH_T2S_FIRST_DECODER = 't2s_fsdec.onnx'
+ if args.normal:
+     WEIGHT_PATH_T2S_STAGE_DECODER = 't2s_sdec.onnx'
+ else:
+     WEIGHT_PATH_T2S_STAGE_DECODER = 't2s_sdec.opt3.onnx'
+ WEIGHT_PATH_VITS = 'vits.onnx'
+
+ MODEL_PATH_SSL = WEIGHT_PATH_SSL + '.prototxt'
+ MODEL_PATH_T2S_ENCODER = WEIGHT_PATH_T2S_ENCODER + '.prototxt'
+ MODEL_PATH_T2S_FIRST_DECODER = WEIGHT_PATH_T2S_FIRST_DECODER + '.prototxt'
+ MODEL_PATH_T2S_STAGE_DECODER = WEIGHT_PATH_T2S_STAGE_DECODER + '.prototxt'
+ MODEL_PATH_VITS = WEIGHT_PATH_VITS + '.prototxt'
+
+ # ======================
+ # Mode
+ # ======================
+
+ if not args.onnx:
+     import ailia
+     version = ailia.get_version().split(".")
+     AILIA_VERSION_MAJOR = int(version[0])
+     AILIA_VERSION_MINOR = int(version[1])
+     AILIA_VERSION_REVISION = int(version[2])
+     # copying blob data between inferences requires ailia 1.2.15 or later
+     COPY_BLOB_DATA = not (
+         AILIA_VERSION_MAJOR <= 1
+         and AILIA_VERSION_MINOR <= 2
+         and AILIA_VERSION_REVISION < 15
+     )
+
+ # ======================
+ # Logic
+ # ======================
+
+ class T2SModel():
+     def __init__(self, sess_encoder, sess_fsdec, sess_sdec):
+         self.hz = 50
+         self.max_sec = 54
+         self.top_k = 5
+         self.early_stop_num = np.array([self.hz * self.max_sec])
+         self.sess_encoder = sess_encoder
+         self.sess_fsdec = sess_fsdec
+         self.sess_sdec = sess_sdec
+
+     def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
+         early_stop_num = self.early_stop_num
+
+         top_k = np.array([5], dtype=np.int64)
+         top_p = np.array([1.0], dtype=np.float32)
+         temperature = np.array([1.0], dtype=np.float32)
+         repetition_penalty = np.array([1.35], dtype=np.float32)
+
+         EOS = 1024
+
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             x, prompts = self.sess_encoder.run(None, {"ref_seq": ref_seq, "text_seq": text_seq, "ref_bert": ref_bert, "text_bert": text_bert, "ssl_content": ssl_content})
+         else:
+             x, prompts = self.sess_encoder.run({"ref_seq": ref_seq, "text_seq": text_seq, "ref_bert": ref_bert, "text_bert": text_bert, "ssl_content": ssl_content})
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tencoder processing time {} ms".format(end - start))
+
+         prefix_len = prompts.shape[1]
+
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             y, k, v, y_emb, x_example = self.sess_fsdec.run(None, {"x": x, "prompts": prompts, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+         else:
+             y, k, v, y_emb, x_example = self.sess_fsdec.run({"x": x, "prompts": prompts, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tfsdec processing time {} ms".format(end - start))
+
+         stop = False
+         for idx in range(1, 1500):
+             if args.benchmark:
+                 start = int(round(time.time() * 1000))
+             if args.onnx:
+                 y, k, v, y_emb, logits, samples = self.sess_sdec.run(None, {"iy": y, "ik": k, "iv": v, "iy_emb": y_emb, "ix_example": x_example, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+             else:
+                 if idx == 1:
+                     y, k, v, y_emb, logits, samples = self.sess_sdec.run({"iy": y, "ik": k, "iv": v, "iy_emb": y_emb, "ix_example": x_example, "top_k": top_k, "top_p": top_p, "temperature": temperature, "repetition_penalty": repetition_penalty})
+                     kv_base_shape = k.shape
+                 else:
+                     input_blob_idx = self.sess_sdec.get_input_blob_list()
+                     output_blob_idx = self.sess_sdec.get_output_blob_list()
+                     self.sess_sdec.set_input_blob_data(y, 0)
+                     if COPY_BLOB_DATA:
+                         # grow the kv cache by one step and reuse the previous
+                         # outputs directly as the next inputs
+                         kv_shape = (kv_base_shape[0], kv_base_shape[1] + idx - 2, kv_base_shape[2], kv_base_shape[3])
+                         self.sess_sdec.set_input_blob_shape(kv_shape, 1)
+                         self.sess_sdec.set_input_blob_shape(kv_shape, 2)
+                         self.sess_sdec.copy_blob_data(input_blob_idx[1], output_blob_idx[1], self.sess_sdec)
+                         self.sess_sdec.copy_blob_data(input_blob_idx[2], output_blob_idx[2], self.sess_sdec)
+                     else:
+                         self.sess_sdec.set_input_blob_data(k, 1)
+                         self.sess_sdec.set_input_blob_data(v, 2)
+                     self.sess_sdec.set_input_blob_data(y_emb, 3)
+                     self.sess_sdec.set_input_blob_data(x_example, 4)
+                     self.sess_sdec.set_input_blob_data(top_k, 5)
+                     self.sess_sdec.set_input_blob_data(top_p, 6)
+                     self.sess_sdec.set_input_blob_data(temperature, 7)
+                     self.sess_sdec.set_input_blob_data(repetition_penalty, 8)
+                     self.sess_sdec.update()
+                     y = self.sess_sdec.get_blob_data(output_blob_idx[0])
+                     if not COPY_BLOB_DATA:
+                         k = self.sess_sdec.get_blob_data(output_blob_idx[1])
+                         v = self.sess_sdec.get_blob_data(output_blob_idx[2])
+                     y_emb = self.sess_sdec.get_blob_data(output_blob_idx[3])
+                     logits = self.sess_sdec.get_blob_data(output_blob_idx[4])
+                     samples = self.sess_sdec.get_blob_data(output_blob_idx[5])
+
+             if args.benchmark:
+                 end = int(round(time.time() * 1000))
+                 logger.info("\tsdec processing time {} ms".format(end - start))
+             if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
+                 stop = True
+             if np.argmax(logits, axis=-1)[0] == EOS or samples[0, 0] == EOS:
+                 stop = True
+             if stop:
+                 break
+         y[0, -1] = 0
+
+         return y[np.newaxis, :, -idx:-1]
+
+
+ class GptSoVits():
+     def __init__(self, t2s, sess):
+         self.t2s = t2s
+         self.sess = sess
+
+     def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content):
+         pred_semantic = self.t2s.forward(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             audio1 = self.sess.run(None, {
+                 "text_seq": text_seq,
+                 "pred_semantic": pred_semantic,
+                 "ref_audio": ref_audio
+             })
+         else:
+             audio1 = self.sess.run({
+                 "text_seq": text_seq,
+                 "pred_semantic": pred_semantic,
+                 "ref_audio": ref_audio
+             })
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tvits processing time {} ms".format(end - start))
+         return audio1[0]
+
+
+ class SSLModel():
+     def __init__(self, sess):
+         self.sess = sess
+
+     def forward(self, ref_audio_16k):
+         if args.benchmark:
+             start = int(round(time.time() * 1000))
+         if args.onnx:
+             last_hidden_state = self.sess.run(None, {
+                 "ref_audio_16k": ref_audio_16k
+             })
+         else:
+             last_hidden_state = self.sess.run({
+                 "ref_audio_16k": ref_audio_16k
+             })
+         if args.benchmark:
+             end = int(round(time.time() * 1000))
+             logger.info("\tssl processing time {} ms".format(end - start))
+         return last_hidden_state[0]
+
+
+ def generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits):
+     gpt = T2SModel(t2s_encoder, t2s_first_decoder, t2s_stage_decoder)
+     gpt_sovits = GptSoVits(gpt, vits)
+     ssl = SSLModel(ssl)
+
+     input_audio = args.ref_audio
+
+     if args.ailia_voice:
+         import ailia_voice
+         voice = ailia_voice.G2P()
+         voice.initialize_model(model_path="./models/")
+     else:
+         import text.japanese as japanese
+         import text.english as english
+
+     if args.ref_language == "ja":
+         if args.ailia_voice:
+             ref_phones = voice.g2p(args.ref_text, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_JA).split(" ")[:-1]
+         else:
+             ref_phones = japanese.g2p(args.ref_text)
+     else:
+         if args.ailia_voice:
+             ref_phones = voice.g2p(args.ref_text, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_EN).split(" ")[:-1]
+         else:
+             ref_phones = english.g2p(args.ref_text)
+     ref_seq = np.array([cleaned_text_to_sequence(ref_phones)], dtype=np.int64)
+
+     if args.text_language == "ja":
+         if args.ailia_voice:
+             text_phones = voice.g2p(args.input, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_JA).split(" ")[:-1]
+         else:
+             text_phones = japanese.g2p(args.input)
+     else:
+         if args.ailia_voice:
+             text_phones = voice.g2p(args.input, ailia_voice.AILIA_VOICE_G2P_TYPE_GPT_SOVITS_EN).split(" ")[:-1]
+         else:
+             text_phones = english.g2p(args.input)
+     text_seq = np.array([cleaned_text_to_sequence(text_phones)], dtype=np.int64)
+
+     # empty for ja or en
+     ref_bert = np.zeros((ref_seq.shape[1], 1024), dtype=np.float32)
+     text_bert = np.zeros((text_seq.shape[1], 1024), dtype=np.float32)
+
+     vits_hps_data_sampling_rate = 32000
+
+     zero_wav = np.zeros(
+         int(vits_hps_data_sampling_rate * 0.3),
+         dtype=np.float32,
+     )
+     wav16k, sr = librosa.load(input_audio, sr=16000)
+     wav16k = np.concatenate([wav16k, zero_wav], axis=0)
+     wav16k = wav16k[np.newaxis, :]
+     ref_audio_16k = wav16k  # only the hubert input is padded
+
+     wav32k, sr = librosa.load(input_audio, sr=vits_hps_data_sampling_rate)
+     wav32k = wav32k[np.newaxis, :]
+
+     ssl_content = ssl.forward(ref_audio_16k)
+
+     a = gpt_sovits.forward(ref_seq, text_seq, ref_bert, text_bert, wav32k, ssl_content)
+
+     savepath = args.savepath
+     logger.info(f'saved at : {savepath}')
+
+     soundfile.write(savepath, a, vits_hps_data_sampling_rate)
+
+     logger.info('Script finished successfully.')
+
+
+ def main():
+     # model files check and download
+     check_and_download_models(WEIGHT_PATH_SSL, MODEL_PATH_SSL, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_T2S_ENCODER, MODEL_PATH_T2S_ENCODER, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_T2S_FIRST_DECODER, MODEL_PATH_T2S_FIRST_DECODER, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_T2S_STAGE_DECODER, MODEL_PATH_T2S_STAGE_DECODER, REMOTE_PATH)
+     check_and_download_models(WEIGHT_PATH_VITS, MODEL_PATH_VITS, REMOTE_PATH)
+
+     if args.onnx:
+         import onnxruntime
+         providers = ["CPUExecutionProvider"]
+         # providers = ["CUDAExecutionProvider"]
+         ssl = onnxruntime.InferenceSession(WEIGHT_PATH_SSL, providers=providers)
+         t2s_encoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_ENCODER, providers=providers)
+         t2s_first_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_FIRST_DECODER, providers=providers)
+         t2s_stage_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_STAGE_DECODER, providers=providers)
+         vits = onnxruntime.InferenceSession(WEIGHT_PATH_VITS, providers=providers)
+     else:
+         import ailia
+         memory_mode = ailia.get_memory_mode(reduce_constant=True, ignore_input_with_initializer=True, reduce_interstage=False, reuse_interstage=True)
+         ssl = ailia.Net(weight=WEIGHT_PATH_SSL, stream=MODEL_PATH_SSL, memory_mode=memory_mode, env_id=args.env_id)
+         t2s_encoder = ailia.Net(weight=WEIGHT_PATH_T2S_ENCODER, stream=MODEL_PATH_T2S_ENCODER, memory_mode=memory_mode, env_id=args.env_id)
+         t2s_first_decoder = ailia.Net(weight=WEIGHT_PATH_T2S_FIRST_DECODER, stream=MODEL_PATH_T2S_FIRST_DECODER, memory_mode=memory_mode, env_id=args.env_id)
+         t2s_stage_decoder = ailia.Net(weight=WEIGHT_PATH_T2S_STAGE_DECODER, stream=MODEL_PATH_T2S_STAGE_DECODER, memory_mode=memory_mode, env_id=args.env_id)
+         vits = ailia.Net(weight=WEIGHT_PATH_VITS, stream=MODEL_PATH_VITS, memory_mode=memory_mode, env_id=args.env_id)
+         if args.profile:
+             ssl.set_profile_mode(True)
+             t2s_encoder.set_profile_mode(True)
+             t2s_first_decoder.set_profile_mode(True)
+             t2s_stage_decoder.set_profile_mode(True)
+             vits.set_profile_mode(True)
+         pf = platform.system()
+         if pf == "Darwin":
+             if args.env_id == 2:
+                 logger.info(
+                     "This model is currently not optimized for macOS GPU. Please try the -e 1 option to improve inference speed."
+                 )
+
+     if args.benchmark:
+         start = int(round(time.time() * 1000))
+
+     generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits)
+
+     if args.benchmark:
+         end = int(round(time.time() * 1000))
+         logger.info("\ttotal processing time {} ms".format(end - start))
+
+     if args.profile:
+         print("ssl : ")
+         print(ssl.get_summary())
+         print("t2s_encoder : ")
+         print(t2s_encoder.get_summary())
+         print("t2s_first_decoder : ")
+         print(t2s_first_decoder.get_summary())
+         print("t2s_stage_decoder : ")
+         print(t2s_stage_decoder.get_summary())
+         print("vits : ")
+         print(vits.get_summary())
+
+
+ if __name__ == '__main__':
+     main()
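
Note that the sampling hyperparameters above (top_k=5, top_p=1.0, temperature=1.0, repetition_penalty=1.35) are passed into the decoder graphs as tensors; the sampling logic itself lives inside t2s_fsdec/t2s_sdec. As a rough, hypothetical NumPy sketch of what such parameters conventionally do in GPT-style token sampling, not the graph's actual implementation:

```
import numpy as np

def sample_next_token(logits, history, top_k=5, top_p=1.0,
                      temperature=1.0, repetition_penalty=1.35):
    """Illustrative top-k/top-p sampling with a repetition penalty."""
    logits = logits.astype(np.float64).copy()
    # penalize tokens that were already generated
    for t in set(history):
        logits[t] = logits[t] / repetition_penalty if logits[t] > 0 else logits[t] * repetition_penalty
    logits /= temperature
    # keep only the top_k highest-scoring tokens
    kth = np.sort(logits)[-top_k]
    logits[logits < kth] = -np.inf
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # nucleus (top-p) filtering over the surviving tokens
    order = np.argsort(-probs)
    cum = np.cumsum(probs[order])
    cut = order[cum > top_p][1:] if top_p < 1.0 else []
    probs[cut] = 0.0
    probs /= probs.sum()
    return np.random.choice(len(probs), p=probs)

# e.g. next_token = sample_next_token(np.random.randn(1025), history=[3, 3, 7])
```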
models/ailia-models/GPT-SoVITS/code/reference_audio_captured_by_ax.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8af474a35ab4aebaadda5a20d626c44830d5987880e54e10fc645eb73d568743
+ size 226298
models/ailia-models/GPT-SoVITS/code/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ langid
+ unidecode
+ pyopenjtalk-prebuilt
+ SoundFile
+ librosa
+ g2p_en
models/ailia-models/GPT-SoVITS/code/text/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from text.symbols import *
+
+
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+ def cleaned_text_to_sequence(cleaned_text):
+     '''Converts a sequence of phoneme symbols to the corresponding symbol IDs.
+     Args:
+         cleaned_text: sequence of phoneme symbols to convert
+     Returns:
+         List of integers corresponding to the symbols in the text
+     '''
+     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     return phones
+
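
A quick usage sketch for `cleaned_text_to_sequence` together with the g2p modules added below; the concrete IDs depend on the sorted symbol table in `symbols.py`, so they are printed rather than asserted:

```
import numpy as np
from text import cleaned_text_to_sequence
import text.japanese as japanese

phones = japanese.g2p("こんにちは。")  # e.g. ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '.']
ref_seq = np.array([cleaned_text_to_sequence(phones)], dtype=np.int64)
print(phones, ref_seq.shape)  # shape is (1, len(phones)), as fed to t2s_encoder
```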
models/ailia-models/GPT-SoVITS/code/text/cmudict.rep ADDED
The diff for this file is too large to render.
models/ailia-models/GPT-SoVITS/code/text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
+ size 6212655
models/ailia-models/GPT-SoVITS/code/text/english.py ADDED
@@ -0,0 +1,107 @@
+ import pickle
+ import os
+ import re
+ from g2p_en import G2p
+
+ from text import symbols
+
+ current_file_path = os.path.dirname(__file__)
+ CMU_DICT_PATH = os.path.join(current_file_path, 'cmudict.rep')
+ CACHE_PATH = os.path.join(current_file_path, 'cmudict_cache.pickle')
+ _g2p = G2p()
+
+ arpa = {'AH0', 'S', 'AH1', 'EY2', 'AE2', 'EH0', 'OW2', 'UH0', 'NG', 'B', 'G', 'AY0', 'M', 'AA0', 'F', 'AO0', 'ER2', 'UH1', 'IY1', 'AH2', 'DH', 'IY0', 'EY1', 'IH0', 'K', 'N', 'W', 'IY2', 'T', 'AA1', 'ER1', 'EH2', 'OY0', 'UH2', 'UW1', 'Z', 'AW2', 'AW1', 'V', 'UW2', 'AA2', 'ER', 'AW0', 'UW0', 'R', 'OW1', 'EH1', 'ZH', 'AE0', 'IH2', 'IH', 'Y', 'JH', 'P', 'AY1', 'EY0', 'OY2', 'TH', 'HH', 'D', 'ER0', 'CH', 'AO1', 'AE1', 'AO2', 'OY1', 'AY2', 'IH1', 'OW0', 'L', 'SH'}
+
+
+ def replace_phs(phs):
+     rep_map = {
+         ';': ',',
+         ':': ',',
+         '\'': '-',
+         '"': '-'
+     }
+     phs_new = []
+     for ph in phs:
+         if ph in symbols:
+             phs_new.append(ph)
+         elif ph in rep_map.keys():
+             phs_new.append(rep_map[ph])
+         else:
+             print('ph not in symbols: ', ph)
+     return phs_new
+
+
+ def read_dict():
+     g2p_dict = {}
+     start_line = 49
+     with open(CMU_DICT_PATH) as f:
+         line = f.readline()
+         line_index = 1
+         while line:
+             if line_index >= start_line:
+                 line = line.strip()
+                 word_split = line.split(' ')
+                 word = word_split[0]
+
+                 syllable_split = word_split[1].split(' - ')
+                 g2p_dict[word] = []
+                 for syllable in syllable_split:
+                     phone_split = syllable.split(' ')
+                     g2p_dict[word].append(phone_split)
+
+             line_index = line_index + 1
+             line = f.readline()
+
+     return g2p_dict
+
+
+ def cache_dict(g2p_dict, file_path):
+     with open(file_path, 'wb') as pickle_file:
+         pickle.dump(g2p_dict, pickle_file)
+
+
+ def get_dict():
+     if os.path.exists(CACHE_PATH):
+         with open(CACHE_PATH, 'rb') as pickle_file:
+             g2p_dict = pickle.load(pickle_file)
+     else:
+         g2p_dict = read_dict()
+         cache_dict(g2p_dict, CACHE_PATH)
+
+     return g2p_dict
+
+
+ eng_dict = get_dict()
+
+
+ def text_normalize(text):
+     # todo: eng text normalize
+     return text.replace(";", ",")
+
+
+ def g2p(text):
+     phones = []
+     words = re.split(r"([,;.\-\?\!\s+])", text)
+     for w in words:
+         if w.upper() in eng_dict:
+             phns = eng_dict[w.upper()]
+             for ph in phns:
+                 phones += ph
+         else:
+             phone_list = list(filter(lambda p: p != " ", _g2p(w)))
+             for ph in phone_list:
+                 # ARPA and non-ARPA phones are appended alike here;
+                 # replace_phs() drops anything not in symbols
+                 phones.append(ph)
+
+     return replace_phs(phones)
+
+
+ if __name__ == "__main__":
+     # print(get_dict())
+     print(g2p("hello"))
+     print(g2p("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
+     # all_phones = set()
+     # for k, syllables in eng_dict.items():
+     #     for group in syllables:
+     #         for ph in group:
+     #             all_phones.add(ph)
+     # print(all_phones)
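
The branch order in `g2p` above matters: words present in `cmudict.rep` use its curated syllable splits, and everything else falls back to the neural `g2p_en` model. A small check of the two paths (outputs depend on the dictionary contents, so treat them as illustrative):

```
import text.english as english

print(english.g2p("hello"))   # dictionary path: phones straight from cmudict.rep
print(english.g2p("DSPGAN"))  # fallback path: phones predicted by g2p_en
```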
models/ailia-models/GPT-SoVITS/code/text/japanese.py ADDED
@@ -0,0 +1,191 @@
+ # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
+ import re
+
+ import pyopenjtalk
+
+ from text import symbols
+
+ # Regular expression matching Japanese without punctuation marks:
+ _japanese_characters = re.compile(
+     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+ )
+
+ # Regular expression matching non-Japanese characters or punctuation marks:
+ _japanese_marks = re.compile(
+     r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+ )
+
+ # List of (symbol, Japanese) pairs for marks:
+ _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
+
+
+ # List of (consonant, sokuon) pairs:
+ _real_sokuon = [
+     (re.compile("%s" % x[0]), x[1])
+     for x in [
+         (r"Q([↑↓]*[kg])", r"k#\1"),
+         (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
+         (r"Q([↑↓]*[sʃ])", r"s\1"),
+         (r"Q([↑↓]*[pb])", r"p#\1"),
+     ]
+ ]
+
+ # List of (consonant, hatsuon) pairs:
+ _real_hatsuon = [
+     (re.compile("%s" % x[0]), x[1])
+     for x in [
+         (r"N([↑↓]*[pbm])", r"m\1"),
+         (r"N([↑↓]*[ʧʥj])", r"n^\1"),
+         (r"N([↑↓]*[tdn])", r"n\1"),
+         (r"N([↑↓]*[kg])", r"ŋ\1"),
+     ]
+ ]
+
+
+ def post_replace_ph(ph):
+     rep_map = {
+         ":": ",",
+         ";": ",",
+         ",": ",",
+         "。": ".",
+         "!": "!",
+         "?": "?",
+         "\n": ".",
+         "·": ",",
+         "、": ",",
+         "...": "…",
+     }
+     if ph in rep_map.keys():
+         ph = rep_map[ph]
+     if ph in symbols:
+         return ph
+     if ph not in symbols:
+         ph = "UNK"
+     return ph
+
+
+ def symbols_to_japanese(text):
+     for regex, replacement in _symbols_to_japanese:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def preprocess_jap(text, with_prosody=False):
+     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
+     text = symbols_to_japanese(text)
+     sentences = re.split(_japanese_marks, text)
+     marks = re.findall(_japanese_marks, text)
+     text = []
+     for i, sentence in enumerate(sentences):
+         if re.match(_japanese_characters, sentence):
+             if with_prosody:
+                 text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
+             else:
+                 p = pyopenjtalk.g2p(sentence)
+                 text += p.split(" ")
+
+         if i < len(marks):
+             if marks[i] == " ":  # prevent an unexpected UNK
+                 continue
+             text += [marks[i].replace(" ", "")]
+     return text
+
+
+ def text_normalize(text):
+     # todo: jap text normalize
+     return text
+
+
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
+     """Extract phoneme + prosody symbol sequence from input full-context labels.
+
+     The algorithm is based on `Prosodic features control by symbols as input of
+     sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
+
+     Args:
+         text (str): Input text.
+         drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
+
+     Returns:
+         List[str]: List of phoneme + prosody symbols.
+
+     Examples:
+         >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+         >>> pyopenjtalk_g2p_prosody("こんにちは。")
+         ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+
+     .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+         modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+
+     """
+     labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+     N = len(labels)
+
+     phones = []
+     for n in range(N):
+         lab_curr = labels[n]
+
+         # current phoneme
+         p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+         # treat unvoiced vowels as normal vowels
+         if drop_unvoiced_vowels and p3 in "AEIOU":
+             p3 = p3.lower()
+
+         # deal with sil at the beginning and the end of text
+         if p3 == "sil":
+             assert n == 0 or n == N - 1
+             if n == 0:
+                 phones.append("^")
+             elif n == N - 1:
+                 # check question form or not
+                 e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                 if e3 == 0:
+                     phones.append("$")
+                 elif e3 == 1:
+                     phones.append("?")
+             continue
+         elif p3 == "pau":
+             phones.append("_")
+             continue
+         else:
+             phones.append(p3)
+
+         # accent type and position info (forward or backward)
+         a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+         a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+         a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+
+         # number of mora in accent phrase
+         f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+
+         a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+         # accent phrase border
+         if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+             phones.append("#")
+         # pitch falling
+         elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+             phones.append("]")
+         # pitch rising
+         elif a2 == 1 and a2_next == 2:
+             phones.append("[")
+
+     return phones
+
+
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+ def _numeric_feature_by_regex(regex, s):
+     match = re.search(regex, s)
+     if match is None:
+         return -50
+     return int(match.group(1))
+
+
+ def g2p(norm_text, with_prosody=False):
+     phones = preprocess_jap(norm_text, with_prosody)
+     phones = [post_replace_ph(i) for i in phones]
+     # todo: implement tones and word2ph
+     return phones
+
+
+ if __name__ == "__main__":
+     phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
+     print(phones)
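
The prosody markers produced by `pyopenjtalk_g2p_prosody` follow the espnet convention also referenced in `symbols.py`: "^" marks the start of the utterance, "$" and "?" its declarative or interrogative end, "_" a pause, "#" an accent-phrase border, and "[" / "]" a pitch rise or fall. A short sketch (exact output can vary with the pyopenjtalk version):

```
import text.japanese as japanese

# phoneme sequence as consumed by gpt-sovits.py
print(japanese.g2p("こんにちは。"))
# raw phoneme + prosody labels, per the docstring example:
# ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
print(japanese.pyopenjtalk_g2p_prosody("こんにちは。"))
```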
models/ailia-models/GPT-SoVITS/code/text/symbols.py ADDED
@@ -0,0 +1,401 @@
+ # punctuation = ['!', '?', '…', ",", ".", "@"]  # @ is an SP pause
+ punctuation = ["!", "?", "…", ",", "."]  # @ is an SP pause
+ punctuation.append("-")
+ pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
+ # pu_symbols = punctuation + ["SP", 'SP2', 'SP3', 'SP4', "UNK"]
+ pad = "_"
+
+ c = [
+     "AA", "EE", "OO",
+     "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m", "n",
+     "p", "q", "r", "s", "sh", "t", "w", "x", "y", "z", "zh",
+ ]
+ v = [
+     "E1", "En1", "a1", "ai1", "an1", "ang1", "ao1", "e1", "ei1", "en1",
+     "eng1", "er1", "i1", "i01", "ia1", "ian1", "iang1", "iao1", "ie1", "in1",
+     "ing1", "iong1", "ir1", "iu1", "o1", "ong1", "ou1", "u1", "ua1", "uai1",
+     "uan1", "uang1", "ui1", "un1", "uo1", "v1", "van1", "ve1", "vn1",
+     "E2", "En2", "a2", "ai2", "an2", "ang2", "ao2", "e2", "ei2", "en2",
+     "eng2", "er2", "i2", "i02", "ia2", "ian2", "iang2", "iao2", "ie2", "in2",
+     "ing2", "iong2", "ir2", "iu2", "o2", "ong2", "ou2", "u2", "ua2", "uai2",
+     "uan2", "uang2", "ui2", "un2", "uo2", "v2", "van2", "ve2", "vn2",
+     "E3", "En3", "a3", "ai3", "an3", "ang3", "ao3", "e3", "ei3", "en3",
+     "eng3", "er3", "i3", "i03", "ia3", "ian3", "iang3", "iao3", "ie3", "in3",
+     "ing3", "iong3", "ir3", "iu3", "o3", "ong3", "ou3", "u3", "ua3", "uai3",
+     "uan3", "uang3", "ui3", "un3", "uo3", "v3", "van3", "ve3", "vn3",
+     "E4", "En4", "a4", "ai4", "an4", "ang4", "ao4", "e4", "ei4", "en4",
+     "eng4", "er4", "i4", "i04", "ia4", "ian4", "iang4", "iao4", "ie4", "in4",
+     "ing4", "iong4", "ir4", "iu4", "o4", "ong4", "ou4", "u4", "ua4", "uai4",
+     "uan4", "uang4", "ui4", "un4", "uo4", "v4", "van4", "ve4", "vn4",
+     "E5", "En5", "a5", "ai5", "an5", "ang5", "ao5", "e5", "ei5", "en5",
+     "eng5", "er5", "i5", "i05", "ia5", "ian5", "iang5", "iao5", "ie5", "in5",
+     "ing5", "iong5", "ir5", "iu5", "o5", "ong5", "ou5", "u5", "ua5", "uai5",
+     "uan5", "uang5", "ui5", "un5", "uo5", "v5", "van5", "ve5", "vn5",
+ ]
+
+ v_without_tone = [
+     "E", "En", "a", "ai", "an", "ang", "ao", "e", "ei", "en",
+     "eng", "er", "i", "i0", "ia", "ian", "iang", "iao", "ie", "in",
+     "ing", "iong", "ir", "iu", "o", "ong", "ou", "u", "ua", "uai",
+     "uan", "uang", "ui", "un", "uo", "v", "van", "ve", "vn",
+ ]
+
+ # japanese
+ ja_symbols = [
+     "I", "N", "U",
+     "a", "b", "by", "ch", "cl", "d", "dy", "e", "f", "g", "gy",
+     "h", "hy", "i", "j", "k", "ky", "m", "my", "n", "ny", "o",
+     "p", "py", "r", "ry", "s", "sh", "t", "ts", "u", "v", "w",
+     "y", "z",
+     # "[",  # rising pitch
+     # "]",  # falling pitch
+     # "$",  # end marker
+     # "^",  # start marker
+ ]
+
+ arpa = {
+     "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B",
+     "G", "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2",
+     "DH", "IY0", "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1",
+     "ER1", "EH2", "OY0", "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2",
+     "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH", "AE0", "IH2",
+     "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
+     "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L",
+     "SH",
+ }
+
+ symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
+ symbols = sorted(set(symbols))
+ if __name__ == "__main__":
+     print(len(symbols))
models/ailia-models/GPT-SoVITS/source.txt ADDED
@@ -0,0 +1,26 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/gpt-sovits
+
+ [normal]
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/cnhubert.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/cnhubert.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_encoder.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_encoder.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_fsdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_fsdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/vits.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/vits.onnx.prototxt
+
+ [optimized]
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt2.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits/t2s_sdec.opt2.onnx.prototxt
models/ailia-models/GPT-SoVITS/t2s_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5aa4a075812b0b8f2ff97d1c56aad66d31dd85b2c9dccc1d39eb0f6a550195e
+ size 11055096
models/ailia-models/GPT-SoVITS/t2s_encoder.onnx.prototxt ADDED
@@ -0,0 +1,2816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ir_version: 8
2
+ producer_name: "pytorch"
3
+ producer_version: "2.1.2"
4
+ model_version: 0
5
+ graph {
6
+ name: "main_graph"
7
+ node {
8
+ output: "onnx::ReduceSum_785"
9
+ name: "Constant_0"
10
+ op_type: "Constant"
11
+ attribute {
12
+ name: "value"
13
+ t {
14
+ dims: 1
15
+ data_type: 7
16
+ data_location: 0
17
+ }
18
+ type: TENSOR
19
+ }
20
+ }
21
+ node {
22
+ output: "onnx::ReduceSum_786"
23
+ name: "Constant_1"
24
+ op_type: "Constant"
25
+ attribute {
26
+ name: "value"
27
+ t {
28
+ dims: 1
29
+ data_type: 7
30
+ data_location: 0
31
+ }
32
+ type: TENSOR
33
+ }
34
+ }
35
+ node {
36
+ input: "ssl_content"
37
+ input: "vits.ssl_proj.weight"
38
+ input: "vits.ssl_proj.bias"
39
+ output: "/ssl_proj/Conv_output_0"
40
+ name: "/ssl_proj/Conv"
41
+ op_type: "Conv"
42
+ attribute {
43
+ name: "dilations"
44
+ ints: 1
45
+ type: INTS
46
+ }
47
+ attribute {
48
+ name: "group"
49
+ i: 1
50
+ type: INT
51
+ }
52
+ attribute {
53
+ name: "kernel_shape"
54
+ ints: 2
55
+ type: INTS
56
+ }
57
+ attribute {
58
+ name: "pads"
59
+ ints: 0
60
+ ints: 0
61
+ type: INTS
62
+ }
63
+ attribute {
64
+ name: "strides"
65
+ ints: 2
66
+ type: INTS
67
+ }
68
+ }
69
+ node {
70
+ output: "/quantizer/vq/layers.0/Constant_output_0"
71
+ name: "/quantizer/vq/layers.0/Constant"
72
+ op_type: "Constant"
73
+ attribute {
74
+ name: "value"
75
+ t {
76
+ data_type: 7
77
+ data_location: 0
78
+ }
79
+ type: TENSOR
80
+ }
81
+ }
82
+ node {
83
+ input: "/ssl_proj/Conv_output_0"
84
+ output: "/quantizer/vq/layers.0/Shape_output_0"
85
+ name: "/quantizer/vq/layers.0/Shape"
86
+ op_type: "Shape"
87
+ }
88
+ node {
89
+ output: "/quantizer/vq/layers.0/Constant_1_output_0"
90
+ name: "/quantizer/vq/layers.0/Constant_1"
91
+ op_type: "Constant"
92
+ attribute {
93
+ name: "value"
94
+ t {
95
+ data_type: 7
96
+ data_location: 0
97
+ }
98
+ type: TENSOR
99
+ }
100
+ }
101
+ node {
102
+ input: "/quantizer/vq/layers.0/Shape_output_0"
103
+ input: "/quantizer/vq/layers.0/Constant_1_output_0"
104
+ output: "/quantizer/vq/layers.0/Gather_output_0"
105
+ name: "/quantizer/vq/layers.0/Gather"
106
+ op_type: "Gather"
107
+ attribute {
108
+ name: "axis"
109
+ i: 0
110
+ type: INT
111
+ }
112
+ }
113
+ node {
114
+ input: "/ssl_proj/Conv_output_0"
115
+ output: "/quantizer/vq/layers.0/Shape_1_output_0"
116
+ name: "/quantizer/vq/layers.0/Shape_1"
117
+ op_type: "Shape"
118
+ }
119
+ node {
120
+ output: "/quantizer/vq/layers.0/Constant_2_output_0"
121
+ name: "/quantizer/vq/layers.0/Constant_2"
122
+ op_type: "Constant"
123
+ attribute {
124
+ name: "value"
125
+ t {
126
+ data_type: 7
127
+ data_location: 0
128
+ }
129
+ type: TENSOR
130
+ }
131
+ }
132
+ node {
133
+ input: "/quantizer/vq/layers.0/Shape_1_output_0"
134
+ input: "/quantizer/vq/layers.0/Constant_2_output_0"
135
+ output: "/quantizer/vq/layers.0/Gather_1_output_0"
136
+ name: "/quantizer/vq/layers.0/Gather_1"
137
+ op_type: "Gather"
138
+ attribute {
139
+ name: "axis"
140
+ i: 0
141
+ type: INT
142
+ }
143
+ }
144
+ node {
145
+ input: "/ssl_proj/Conv_output_0"
146
+ output: "/quantizer/vq/layers.0/Shape_2_output_0"
147
+ name: "/quantizer/vq/layers.0/Shape_2"
148
+ op_type: "Shape"
149
+ }
150
+ node {
151
+ output: "/quantizer/vq/layers.0/Constant_3_output_0"
152
+ name: "/quantizer/vq/layers.0/Constant_3"
153
+ op_type: "Constant"
154
+ attribute {
155
+ name: "value"
156
+ t {
157
+ data_type: 7
158
+ data_location: 0
159
+ }
160
+ type: TENSOR
161
+ }
162
+ }
163
+ node {
164
+ input: "/quantizer/vq/layers.0/Shape_2_output_0"
165
+ input: "/quantizer/vq/layers.0/Constant_3_output_0"
166
+ output: "/quantizer/vq/layers.0/Gather_2_output_0"
167
+ name: "/quantizer/vq/layers.0/Gather_2"
168
+ op_type: "Gather"
169
+ attribute {
170
+ name: "axis"
171
+ i: 0
172
+ type: INT
173
+ }
174
+ }
175
+ node {
176
+ output: "/quantizer/vq/layers.0/Constant_4_output_0"
177
+ name: "/quantizer/vq/layers.0/Constant_4"
178
+ op_type: "Constant"
179
+ attribute {
180
+ name: "value"
181
+ t {
182
+ data_type: 7
183
+ data_location: 0
184
+ }
185
+ type: TENSOR
186
+ }
187
+ }
188
+ node {
189
+ input: "/quantizer/vq/layers.0/Gather_output_0"
190
+ input: "/quantizer/vq/layers.0/Constant_4_output_0"
191
+ output: "/quantizer/vq/layers.0/Div_output_0"
192
+ name: "/quantizer/vq/layers.0/Div"
193
+ op_type: "Div"
194
+ }
195
+ node {
196
+ input: "/quantizer/vq/layers.0/Div_output_0"
197
+ output: "/quantizer/vq/layers.0/Cast_output_0"
198
+ name: "/quantizer/vq/layers.0/Cast"
199
+ op_type: "Cast"
200
+ attribute {
201
+ name: "to"
202
+ i: 7
203
+ type: INT
204
+ }
205
+ }
206
+ node {
207
+ input: "/quantizer/vq/layers.0/Cast_output_0"
208
+ output: "/quantizer/vq/layers.0/Cast_1_output_0"
209
+ name: "/quantizer/vq/layers.0/Cast_1"
210
+ op_type: "Cast"
211
+ attribute {
212
+ name: "to"
213
+ i: 7
214
+ type: INT
215
+ }
216
+ }
217
+ node {
218
+ output: "/quantizer/vq/layers.0/Constant_5_output_0"
219
+ name: "/quantizer/vq/layers.0/Constant_5"
220
+ op_type: "Constant"
221
+ attribute {
222
+ name: "value"
223
+ t {
224
+ data_type: 7
225
+ data_location: 0
226
+ }
227
+ type: TENSOR
228
+ }
229
+ }
230
+ node {
231
+ input: "/quantizer/vq/layers.0/Gather_1_output_0"
232
+ input: "/quantizer/vq/layers.0/Constant_5_output_0"
233
+ output: "/quantizer/vq/layers.0/Div_1_output_0"
234
+ name: "/quantizer/vq/layers.0/Div_1"
235
+ op_type: "Div"
236
+ }
237
+ node {
238
+ input: "/quantizer/vq/layers.0/Div_1_output_0"
239
+ output: "/quantizer/vq/layers.0/Cast_2_output_0"
240
+ name: "/quantizer/vq/layers.0/Cast_2"
241
+ op_type: "Cast"
242
+ attribute {
243
+ name: "to"
244
+ i: 7
245
+ type: INT
246
+ }
247
+ }
248
+ node {
249
+ input: "/quantizer/vq/layers.0/Cast_2_output_0"
250
+ output: "/quantizer/vq/layers.0/Cast_3_output_0"
251
+ name: "/quantizer/vq/layers.0/Cast_3"
252
+ op_type: "Cast"
253
+ attribute {
254
+ name: "to"
255
+ i: 7
256
+ type: INT
257
+ }
258
+ }
259
+ node {
260
+ output: "/quantizer/vq/layers.0/Constant_6_output_0"
261
+ name: "/quantizer/vq/layers.0/Constant_6"
262
+ op_type: "Constant"
263
+ attribute {
264
+ name: "value"
265
+ t {
266
+ data_type: 7
267
+ data_location: 0
268
+ }
269
+ type: TENSOR
270
+ }
271
+ }
272
+ node {
273
+ input: "/quantizer/vq/layers.0/Gather_2_output_0"
274
+ input: "/quantizer/vq/layers.0/Constant_6_output_0"
275
+ output: "/quantizer/vq/layers.0/Div_2_output_0"
276
+ name: "/quantizer/vq/layers.0/Div_2"
277
+ op_type: "Div"
278
+ }
279
+ node {
280
+ input: "/quantizer/vq/layers.0/Div_2_output_0"
281
+ output: "/quantizer/vq/layers.0/Cast_4_output_0"
282
+ name: "/quantizer/vq/layers.0/Cast_4"
283
+ op_type: "Cast"
284
+ attribute {
285
+ name: "to"
286
+ i: 7
287
+ type: INT
288
+ }
289
+ }
290
+ node {
291
+ input: "/quantizer/vq/layers.0/Cast_4_output_0"
292
+ output: "/quantizer/vq/layers.0/Cast_5_output_0"
293
+ name: "/quantizer/vq/layers.0/Cast_5"
294
+ op_type: "Cast"
295
+ attribute {
296
+ name: "to"
297
+ i: 7
298
+ type: INT
299
+ }
300
+ }
301
+ node {
302
+ output: "onnx::Unsqueeze_812"
303
+ name: "Constant_25"
304
+ op_type: "Constant"
305
+ attribute {
306
+ name: "value"
307
+ t {
308
+ dims: 1
309
+ data_type: 7
310
+ data_location: 0
311
+ }
312
+ type: TENSOR
313
+ }
314
+ }
315
+ node {
316
+ input: "/quantizer/vq/layers.0/Cast_1_output_0"
317
+ input: "onnx::Unsqueeze_812"
318
+ output: "/quantizer/vq/layers.0/Unsqueeze_output_0"
319
+ name: "/quantizer/vq/layers.0/Unsqueeze"
320
+ op_type: "Unsqueeze"
321
+ }
322
+ node {
323
+ output: "onnx::Unsqueeze_814"
324
+ name: "Constant_27"
325
+ op_type: "Constant"
326
+ attribute {
327
+ name: "value"
328
+ t {
329
+ dims: 1
330
+ data_type: 7
331
+ data_location: 0
332
+ }
333
+ type: TENSOR
334
+ }
335
+ }
336
+ node {
337
+ input: "/quantizer/vq/layers.0/Cast_3_output_0"
338
+ input: "onnx::Unsqueeze_814"
339
+ output: "/quantizer/vq/layers.0/Unsqueeze_1_output_0"
340
+ name: "/quantizer/vq/layers.0/Unsqueeze_1"
341
+ op_type: "Unsqueeze"
342
+ }
343
+ node {
344
+ output: "onnx::Unsqueeze_816"
345
+ name: "Constant_29"
346
+ op_type: "Constant"
347
+ attribute {
348
+ name: "value"
349
+ t {
350
+ dims: 1
351
+ data_type: 7
352
+ data_location: 0
353
+ }
354
+ type: TENSOR
355
+ }
356
+ }
357
+ node {
358
+ input: "/quantizer/vq/layers.0/Cast_5_output_0"
359
+ input: "onnx::Unsqueeze_816"
360
+ output: "/quantizer/vq/layers.0/Unsqueeze_2_output_0"
361
+ name: "/quantizer/vq/layers.0/Unsqueeze_2"
362
+ op_type: "Unsqueeze"
363
+ }
364
+ node {
365
+ input: "/quantizer/vq/layers.0/Unsqueeze_output_0"
366
+ input: "/quantizer/vq/layers.0/Unsqueeze_1_output_0"
367
+ input: "/quantizer/vq/layers.0/Unsqueeze_2_output_0"
368
+ output: "/quantizer/vq/layers.0/Concat_output_0"
369
+ name: "/quantizer/vq/layers.0/Concat"
370
+ op_type: "Concat"
371
+ attribute {
372
+ name: "axis"
373
+ i: 0
374
+ type: INT
375
+ }
376
+ }
377
+ node {
378
+ input: "/ssl_proj/Conv_output_0"
379
+ input: "/quantizer/vq/layers.0/Concat_output_0"
380
+ output: "/quantizer/vq/layers.0/Reshape_output_0"
381
+ name: "/quantizer/vq/layers.0/Reshape"
382
+ op_type: "Reshape"
383
+ attribute {
384
+ name: "allowzero"
385
+ i: 0
386
+ type: INT
387
+ }
388
+ }
389
+ node {
390
+ input: "/quantizer/vq/layers.0/Reshape_output_0"
391
+ output: "/quantizer/vq/layers.0/Transpose_output_0"
392
+ name: "/quantizer/vq/layers.0/Transpose"
393
+ op_type: "Transpose"
394
+ attribute {
395
+ name: "perm"
396
+ ints: 0
397
+ ints: 2
398
+ ints: 1
399
+ type: INTS
400
+ }
401
+ }
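# [editor's note] The Reshape/Transpose pair above rearranges the /ssl_proj/Conv output
# from (B, D, T) to (B, T, D) so each time step becomes one D-dim vector for the
# codebook lookup that follows — roughly x.reshape(b, d, t).transpose(0, 2, 1) in
# NumPy terms (shape names inferred from context; the dump omits the constant values).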
402
+ node {
403
+ output: "onnx::Unsqueeze_821"
404
+ name: "Constant_34"
405
+ op_type: "Constant"
406
+ attribute {
407
+ name: "value"
408
+ t {
409
+ dims: 1
410
+ data_type: 7
411
+ data_location: 0
412
+ }
413
+ type: TENSOR
414
+ }
415
+ }
416
+ node {
417
+ input: "/quantizer/vq/layers.0/Cast_1_output_0"
418
+ input: "onnx::Unsqueeze_821"
419
+ output: "/quantizer/vq/layers.0/Unsqueeze_3_output_0"
420
+ name: "/quantizer/vq/layers.0/Unsqueeze_3"
421
+ op_type: "Unsqueeze"
422
+ }
423
+ node {
424
+ output: "onnx::Unsqueeze_823"
425
+ name: "Constant_36"
426
+ op_type: "Constant"
427
+ attribute {
428
+ name: "value"
429
+ t {
430
+ dims: 1
431
+ data_type: 7
432
+ data_location: 0
433
+ }
434
+ type: TENSOR
435
+ }
436
+ }
437
+ node {
438
+ input: "/quantizer/vq/layers.0/Cast_5_output_0"
439
+ input: "onnx::Unsqueeze_823"
440
+ output: "/quantizer/vq/layers.0/Unsqueeze_4_output_0"
441
+ name: "/quantizer/vq/layers.0/Unsqueeze_4"
442
+ op_type: "Unsqueeze"
443
+ }
444
+ node {
445
+ output: "onnx::Unsqueeze_825"
446
+ name: "Constant_38"
447
+ op_type: "Constant"
448
+ attribute {
449
+ name: "value"
450
+ t {
451
+ dims: 1
452
+ data_type: 7
453
+ data_location: 0
454
+ }
455
+ type: TENSOR
456
+ }
457
+ }
458
+ node {
459
+ input: "/quantizer/vq/layers.0/Cast_3_output_0"
460
+ input: "onnx::Unsqueeze_825"
461
+ output: "/quantizer/vq/layers.0/Unsqueeze_5_output_0"
462
+ name: "/quantizer/vq/layers.0/Unsqueeze_5"
463
+ op_type: "Unsqueeze"
464
+ }
465
+ node {
466
+ input: "/quantizer/vq/layers.0/Unsqueeze_3_output_0"
467
+ input: "/quantizer/vq/layers.0/Unsqueeze_4_output_0"
468
+ input: "/quantizer/vq/layers.0/Unsqueeze_5_output_0"
469
+ output: "/quantizer/vq/layers.0/Concat_1_output_0"
470
+ name: "/quantizer/vq/layers.0/Concat_1"
471
+ op_type: "Concat"
472
+ attribute {
473
+ name: "axis"
474
+ i: 0
475
+ type: INT
476
+ }
477
+ }
478
+ node {
479
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
480
+ input: "/quantizer/vq/layers.0/Concat_1_output_0"
481
+ output: "/quantizer/vq/layers.0/Reshape_1_output_0"
482
+ name: "/quantizer/vq/layers.0/Reshape_1"
483
+ op_type: "Reshape"
484
+ attribute {
485
+ name: "allowzero"
486
+ i: 0
487
+ type: INT
488
+ }
489
+ }
490
+ node {
491
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
492
+ output: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
493
+ name: "/quantizer/vq/layers.0/_codebook/Shape"
494
+ op_type: "Shape"
495
+ }
496
+ node {
497
+ output: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
498
+ name: "/quantizer/vq/layers.0/_codebook/Constant"
499
+ op_type: "Constant"
500
+ attribute {
501
+ name: "value"
502
+ t {
503
+ data_type: 7
504
+ data_location: 0
505
+ }
506
+ type: TENSOR
507
+ }
508
+ }
509
+ node {
510
+ input: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
511
+ input: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
512
+ output: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
513
+ name: "/quantizer/vq/layers.0/_codebook/Gather"
514
+ op_type: "Gather"
515
+ attribute {
516
+ name: "axis"
517
+ i: 0
518
+ type: INT
519
+ }
520
+ }
521
+ node {
522
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
523
+ output: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
524
+ name: "/quantizer/vq/layers.0/_codebook/Shape_1"
525
+ op_type: "Shape"
526
+ }
527
+ node {
528
+ output: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
529
+ name: "/quantizer/vq/layers.0/_codebook/Constant_1"
530
+ op_type: "Constant"
531
+ attribute {
532
+ name: "value"
533
+ t {
534
+ data_type: 7
535
+ data_location: 0
536
+ }
537
+ type: TENSOR
538
+ }
539
+ }
540
+ node {
541
+ input: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
542
+ input: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
543
+ output: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
544
+ name: "/quantizer/vq/layers.0/_codebook/Gather_1"
545
+ op_type: "Gather"
546
+ attribute {
547
+ name: "axis"
548
+ i: 0
549
+ type: INT
550
+ }
551
+ }
552
+ node {
553
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
554
+ output: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
555
+ name: "/quantizer/vq/layers.0/_codebook/Shape_2"
556
+ op_type: "Shape"
557
+ }
558
+ node {
559
+ output: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
560
+ name: "/quantizer/vq/layers.0/_codebook/Constant_2"
561
+ op_type: "Constant"
562
+ attribute {
563
+ name: "value"
564
+ t {
565
+ data_type: 7
566
+ data_location: 0
567
+ }
568
+ type: TENSOR
569
+ }
570
+ }
571
+ node {
572
+ input: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
573
+ input: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
574
+ output: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
575
+ name: "/quantizer/vq/layers.0/_codebook/Gather_2"
576
+ op_type: "Gather"
577
+ attribute {
578
+ name: "axis"
579
+ i: 0
580
+ type: INT
581
+ }
582
+ }
583
+ node {
584
+ output: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
585
+ name: "/quantizer/vq/layers.0/_codebook/Constant_3"
586
+ op_type: "Constant"
587
+ attribute {
588
+ name: "value"
589
+ t {
590
+ data_type: 7
591
+ data_location: 0
592
+ }
593
+ type: TENSOR
594
+ }
595
+ }
596
+ node {
597
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
598
+ input: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
599
+ output: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
600
+ name: "/quantizer/vq/layers.0/_codebook/Mul"
601
+ op_type: "Mul"
602
+ }
603
+ node {
604
+ input: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
605
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
606
+ output: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
607
+ name: "/quantizer/vq/layers.0/_codebook/Mul_1"
608
+ op_type: "Mul"
609
+ }
610
+ node {
611
+ output: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
612
+ name: "/quantizer/vq/layers.0/_codebook/Constant_4"
613
+ op_type: "Constant"
614
+ attribute {
615
+ name: "value"
616
+ t {
617
+ data_type: 7
618
+ data_location: 0
619
+ }
620
+ type: TENSOR
621
+ }
622
+ }
623
+ node {
624
+ input: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
625
+ input: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
626
+ output: "/quantizer/vq/layers.0/_codebook/Div_output_0"
627
+ name: "/quantizer/vq/layers.0/_codebook/Div"
628
+ op_type: "Div"
629
+ }
630
+ node {
631
+ input: "/quantizer/vq/layers.0/_codebook/Div_output_0"
632
+ output: "/quantizer/vq/layers.0/_codebook/Cast_output_0"
633
+ name: "/quantizer/vq/layers.0/_codebook/Cast"
634
+ op_type: "Cast"
635
+ attribute {
636
+ name: "to"
637
+ i: 7
638
+ type: INT
639
+ }
640
+ }
641
+ node {
642
+ input: "/quantizer/vq/layers.0/_codebook/Cast_output_0"
643
+ output: "/quantizer/vq/layers.0/_codebook/Cast_1_output_0"
644
+ name: "/quantizer/vq/layers.0/_codebook/Cast_1"
645
+ op_type: "Cast"
646
+ attribute {
647
+ name: "to"
648
+ i: 7
649
+ type: INT
650
+ }
651
+ }
652
+ node {
653
+ output: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
654
+ name: "/quantizer/vq/layers.0/_codebook/Constant_5"
655
+ op_type: "Constant"
656
+ attribute {
657
+ name: "value"
658
+ t {
659
+ data_type: 7
660
+ data_location: 0
661
+ }
662
+ type: TENSOR
663
+ }
664
+ }
665
+ node {
666
+ input: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
667
+ input: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
668
+ output: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
669
+ name: "/quantizer/vq/layers.0/_codebook/Mul_2"
670
+ op_type: "Mul"
671
+ }
672
+ node {
673
+ output: "onnx::Unsqueeze_847"
674
+ name: "Constant_60"
675
+ op_type: "Constant"
676
+ attribute {
677
+ name: "value"
678
+ t {
679
+ dims: 1
680
+ data_type: 7
681
+ data_location: 0
682
+ }
683
+ type: TENSOR
684
+ }
685
+ }
686
+ node {
687
+ input: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
688
+ input: "onnx::Unsqueeze_847"
689
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
690
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze"
691
+ op_type: "Unsqueeze"
692
+ }
693
+ node {
694
+ output: "onnx::Unsqueeze_849"
695
+ name: "Constant_62"
696
+ op_type: "Constant"
697
+ attribute {
698
+ name: "value"
699
+ t {
700
+ dims: 1
701
+ data_type: 7
702
+ data_location: 0
703
+ }
704
+ type: TENSOR
705
+ }
706
+ }
707
+ node {
708
+ input: "/quantizer/vq/layers.0/_codebook/Cast_1_output_0"
709
+ input: "onnx::Unsqueeze_849"
710
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
711
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1"
712
+ op_type: "Unsqueeze"
713
+ }
714
+ node {
715
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
716
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
717
+ output: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
718
+ name: "/quantizer/vq/layers.0/_codebook/Concat"
719
+ op_type: "Concat"
720
+ attribute {
721
+ name: "axis"
722
+ i: 0
723
+ type: INT
724
+ }
725
+ }
726
+ node {
727
+ input: "/quantizer/vq/layers.0/Reshape_1_output_0"
728
+ input: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
729
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
730
+ name: "/quantizer/vq/layers.0/_codebook/Reshape"
731
+ op_type: "Reshape"
732
+ attribute {
733
+ name: "allowzero"
734
+ i: 0
735
+ type: INT
736
+ }
737
+ }
738
+ node {
739
+ output: "onnx::Unsqueeze_853"
740
+ name: "Constant_66"
741
+ op_type: "Constant"
742
+ attribute {
743
+ name: "value"
744
+ t {
745
+ dims: 1
746
+ data_type: 7
747
+ data_location: 0
748
+ }
749
+ type: TENSOR
750
+ }
751
+ }
752
+ node {
753
+ input: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
754
+ input: "onnx::Unsqueeze_853"
755
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
756
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2"
757
+ op_type: "Unsqueeze"
758
+ }
759
+ node {
760
+ output: "onnx::Unsqueeze_855"
761
+ name: "Constant_68"
762
+ op_type: "Constant"
763
+ attribute {
764
+ name: "value"
765
+ t {
766
+ dims: 1
767
+ data_type: 7
768
+ data_location: 0
769
+ }
770
+ type: TENSOR
771
+ }
772
+ }
773
+ node {
774
+ input: "/quantizer/vq/layers.0/_codebook/Cast_1_output_0"
775
+ input: "onnx::Unsqueeze_855"
776
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
777
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3"
778
+ op_type: "Unsqueeze"
779
+ }
780
+ node {
781
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
782
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
783
+ output: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
784
+ name: "/quantizer/vq/layers.0/_codebook/Concat_1"
785
+ op_type: "Concat"
786
+ attribute {
787
+ name: "axis"
788
+ i: 0
789
+ type: INT
790
+ }
791
+ }
792
+ node {
793
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
794
+ input: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
795
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
796
+ name: "/quantizer/vq/layers.0/_codebook/Reshape_1"
797
+ op_type: "Reshape"
798
+ attribute {
799
+ name: "allowzero"
800
+ i: 0
801
+ type: INT
802
+ }
803
+ }
804
+ node {
805
+ output: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
806
+ name: "/quantizer/vq/layers.0/_codebook/Constant_6"
807
+ op_type: "Constant"
808
+ attribute {
809
+ name: "value"
810
+ t {
811
+ data_type: 1
812
+ data_location: 0
813
+ }
814
+ type: TENSOR
815
+ }
816
+ }
817
+ node {
818
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
819
+ input: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
820
+ output: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
821
+ name: "/quantizer/vq/layers.0/_codebook/Pow"
822
+ op_type: "Pow"
823
+ }
824
+ node {
825
+ input: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
826
+ input: "onnx::ReduceSum_786"
827
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
828
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum"
829
+ op_type: "ReduceSum"
830
+ attribute {
831
+ name: "keepdims"
832
+ i: 1
833
+ type: INT
834
+ }
835
+ }
836
+ node {
837
+ output: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
838
+ name: "/quantizer/vq/layers.0/_codebook/Constant_7"
839
+ op_type: "Constant"
840
+ attribute {
841
+ name: "value"
842
+ t {
843
+ data_type: 1
844
+ data_location: 0
845
+ }
846
+ type: TENSOR
847
+ }
848
+ }
849
+ node {
850
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
851
+ input: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
852
+ output: "/quantizer/vq/layers.0/_codebook/Mul_3_output_0"
853
+ name: "/quantizer/vq/layers.0/_codebook/Mul_3"
854
+ op_type: "Mul"
855
+ }
856
+ node {
857
+ input: "/quantizer/vq/layers.0/_codebook/Mul_3_output_0"
858
+ input: "onnx::MatMul_1058"
859
+ output: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
860
+ name: "/quantizer/vq/layers.0/_codebook/MatMul"
861
+ op_type: "MatMul"
862
+ }
863
+ node {
864
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
865
+ input: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
866
+ output: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
867
+ name: "/quantizer/vq/layers.0/_codebook/Sub"
868
+ op_type: "Sub"
869
+ }
870
+ node {
871
+ output: "/quantizer/vq/layers.0/_codebook/Constant_8_output_0"
872
+ name: "/quantizer/vq/layers.0/_codebook/Constant_8"
873
+ op_type: "Constant"
874
+ attribute {
875
+ name: "value"
876
+ t {
877
+ data_type: 1
878
+ data_location: 0
879
+ }
880
+ type: TENSOR
881
+ }
882
+ }
883
+ node {
884
+ input: "onnx::MatMul_1058"
885
+ input: "/quantizer/vq/layers.0/_codebook/Constant_8_output_0"
886
+ output: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
887
+ name: "/quantizer/vq/layers.0/_codebook/Pow_1"
888
+ op_type: "Pow"
889
+ }
890
+ node {
891
+ input: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
892
+ input: "onnx::ReduceSum_785"
893
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
894
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum_1"
895
+ op_type: "ReduceSum"
896
+ attribute {
897
+ name: "keepdims"
898
+ i: 1
899
+ type: INT
900
+ }
901
+ }
902
+ node {
903
+ input: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
904
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
905
+ output: "/quantizer/vq/layers.0/_codebook/Add_output_0"
906
+ name: "/quantizer/vq/layers.0/_codebook/Add"
907
+ op_type: "Add"
908
+ }
909
+ node {
910
+ input: "/quantizer/vq/layers.0/_codebook/Add_output_0"
911
+ output: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
912
+ name: "/quantizer/vq/layers.0/_codebook/Neg"
913
+ op_type: "Neg"
914
+ }
915
+ node {
916
+ input: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
917
+ output: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
918
+ name: "/quantizer/vq/layers.0/_codebook/ArgMax"
919
+ op_type: "ArgMax"
920
+ attribute {
921
+ name: "axis"
922
+ i: -1
923
+ type: INT
924
+ }
925
+ attribute {
926
+ name: "keepdims"
927
+ i: 0
928
+ type: INT
929
+ }
930
+ }
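# [editor's note] The Pow/ReduceSum/MatMul/Sub/Add/Neg/ArgMax chain above is the
# standard VQ-VAE nearest-codebook-entry lookup: squared distance
# ||x||^2 - 2*x.E^T + ||E||^2, minimized via argmax of its negation. A minimal NumPy
# sketch (names are mine; x: (N, D) flattened frames, codebook E: (K, D); the exact
# scalar constants are not shown in this text dump):
#   import numpy as np
#   def nearest_code(x, E):
#       d = (x**2).sum(-1, keepdims=True) - 2.0 * x @ E.T + (E**2).sum(-1)
#       return np.argmin(d, axis=-1)   # == argmax(-d), matching the Neg + ArgMax nodes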
931
+ node {
932
+ output: "onnx::Unsqueeze_873"
933
+ name: "Constant_85"
934
+ op_type: "Constant"
935
+ attribute {
936
+ name: "value"
937
+ t {
938
+ dims: 1
939
+ data_type: 7
940
+ data_location: 0
941
+ }
942
+ type: TENSOR
943
+ }
944
+ }
945
+ node {
946
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
947
+ input: "onnx::Unsqueeze_873"
948
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_4_output_0"
949
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_4"
950
+ op_type: "Unsqueeze"
951
+ }
952
+ node {
953
+ output: "onnx::Unsqueeze_875"
954
+ name: "Constant_87"
955
+ op_type: "Constant"
956
+ attribute {
957
+ name: "value"
958
+ t {
959
+ dims: 1
960
+ data_type: 7
961
+ data_location: 0
962
+ }
963
+ type: TENSOR
964
+ }
965
+ }
966
+ node {
967
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
968
+ input: "onnx::Unsqueeze_875"
969
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_5_output_0"
970
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_5"
971
+ op_type: "Unsqueeze"
972
+ }
973
+ node {
974
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_4_output_0"
975
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_5_output_0"
976
+ output: "/quantizer/vq/layers.0/_codebook/Concat_2_output_0"
977
+ name: "/quantizer/vq/layers.0/_codebook/Concat_2"
978
+ op_type: "Concat"
979
+ attribute {
980
+ name: "axis"
981
+ i: 0
982
+ type: INT
983
+ }
984
+ }
985
+ node {
986
+ input: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
987
+ input: "/quantizer/vq/layers.0/_codebook/Concat_2_output_0"
988
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_2_output_0"
989
+ name: "/quantizer/vq/layers.0/_codebook/Reshape_2"
990
+ op_type: "Reshape"
991
+ attribute {
992
+ name: "allowzero"
993
+ i: 0
994
+ type: INT
995
+ }
996
+ }
997
+ node {
998
+ output: "/quantizer/vq/Constant_output_0"
999
+ name: "/quantizer/vq/Constant"
1000
+ op_type: "Constant"
1001
+ attribute {
1002
+ name: "value"
1003
+ t {
1004
+ dims: 1
1005
+ data_type: 7
1006
+ data_location: 0
1007
+ }
1008
+ type: TENSOR
1009
+ }
1010
+ }
1011
+ node {
1012
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_2_output_0"
1013
+ input: "/quantizer/vq/Constant_output_0"
1014
+ output: "/quantizer/vq/Unsqueeze_output_0"
1015
+ name: "/quantizer/vq/Unsqueeze"
1016
+ op_type: "Unsqueeze"
1017
+ }
1018
+ node {
1019
+ input: "/quantizer/vq/Unsqueeze_output_0"
1020
+ output: "/quantizer/vq/Concat_output_0"
1021
+ name: "/quantizer/vq/Concat"
1022
+ op_type: "Concat"
1023
+ attribute {
1024
+ name: "axis"
1025
+ i: 0
1026
+ type: INT
1027
+ }
1028
+ }
1029
+ node {
1030
+ input: "/quantizer/vq/Concat_output_0"
1031
+ output: "/Transpose_output_0"
1032
+ name: "/Transpose"
1033
+ op_type: "Transpose"
1034
+ attribute {
1035
+ name: "perm"
1036
+ ints: 1
1037
+ ints: 0
1038
+ ints: 2
1039
+ type: INTS
1040
+ }
1041
+ }
1042
+ node {
1043
+ input: "/Transpose_output_0"
1044
+ input: "/quantizer/vq/layers.0/Constant_output_0"
1045
+ output: "/Gather_output_0"
1046
+ name: "/Gather"
1047
+ op_type: "Gather"
1048
+ attribute {
1049
+ name: "axis"
1050
+ i: 0
1051
+ type: INT
1052
+ }
1053
+ }
1054
+ node {
1055
+ input: "/Gather_output_0"
1056
+ input: "/quantizer/vq/layers.0/Constant_output_0"
1057
+ output: "/Gather_1_output_0"
1058
+ name: "/Gather_1"
1059
+ op_type: "Gather"
1060
+ attribute {
1061
+ name: "axis"
1062
+ i: 0
1063
+ type: INT
1064
+ }
1065
+ }
1066
+ node {
1067
+ input: "ref_bert"
1068
+ output: "/Transpose_1_output_0"
1069
+ name: "/Transpose_1"
1070
+ op_type: "Transpose"
1071
+ attribute {
1072
+ name: "perm"
1073
+ ints: 1
1074
+ ints: 0
1075
+ type: INTS
1076
+ }
1077
+ }
1078
+ node {
1079
+ input: "text_bert"
1080
+ output: "/Transpose_2_output_0"
1081
+ name: "/Transpose_2"
1082
+ op_type: "Transpose"
1083
+ attribute {
1084
+ name: "perm"
1085
+ ints: 1
1086
+ ints: 0
1087
+ type: INTS
1088
+ }
1089
+ }
1090
+ node {
1091
+ input: "/Transpose_1_output_0"
1092
+ input: "/Transpose_2_output_0"
1093
+ output: "/Concat_output_0"
1094
+ name: "/Concat"
1095
+ op_type: "Concat"
1096
+ attribute {
1097
+ name: "axis"
1098
+ i: 1
1099
+ type: INT
1100
+ }
1101
+ }
1102
+ node {
1103
+ input: "ref_seq"
1104
+ input: "text_seq"
1105
+ output: "/Concat_1_output_0"
1106
+ name: "/Concat_1"
1107
+ op_type: "Concat"
1108
+ attribute {
1109
+ name: "axis"
1110
+ i: 1
1111
+ type: INT
1112
+ }
1113
+ }
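# [editor's note] /Concat and /Concat_1 above join the reference and target-text
# streams along the time axis: BERT features as concat(ref_bert, text_bert) and
# phoneme id sequences as concat(ref_seq, text_seq). Everything downstream of this
# point operates on the joined sequence.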
1114
+ node {
1115
+ output: "/Constant_output_0"
1116
+ name: "/Constant"
1117
+ op_type: "Constant"
1118
+ attribute {
1119
+ name: "value"
1120
+ t {
1121
+ dims: 1
1122
+ data_type: 7
1123
+ data_location: 0
1124
+ }
1125
+ type: TENSOR
1126
+ }
1127
+ }
1128
+ node {
1129
+ input: "/Concat_output_0"
1130
+ input: "/Constant_output_0"
1131
+ output: "/Unsqueeze_output_0"
1132
+ name: "/Unsqueeze"
1133
+ op_type: "Unsqueeze"
1134
+ }
1135
+ node {
1136
+ output: "/Constant_1_output_0"
1137
+ name: "/Constant_1"
1138
+ op_type: "Constant"
1139
+ attribute {
1140
+ name: "value"
1141
+ t {
1142
+ dims: 1
1143
+ data_type: 7
1144
+ data_location: 0
1145
+ }
1146
+ type: TENSOR
1147
+ }
1148
+ }
1149
+ node {
1150
+ input: "/Gather_1_output_0"
1151
+ input: "/Constant_1_output_0"
1152
+ output: "prompts"
1153
+ name: "/Unsqueeze_1"
1154
+ op_type: "Unsqueeze"
1155
+ }
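# [editor's note] "prompts" is a graph output: the reference audio's quantized code
# indices from the VQ lookup above, with an axis added by this Unsqueeze. In the
# accompanying runner script these presumably seed the autoregressive decoder as the
# acoustic prompt (usage inferred, not stated in the graph itself).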
1156
+ node {
1157
+ input: "encoder.ar_text_embedding.word_embeddings.weight"
1158
+ input: "/Concat_1_output_0"
1159
+ output: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
1160
+ name: "/encoder/ar_text_embedding/word_embeddings/Gather"
1161
+ op_type: "Gather"
1162
+ }
1163
+ node {
1164
+ input: "/Unsqueeze_output_0"
1165
+ output: "/encoder/Transpose_output_0"
1166
+ name: "/encoder/Transpose"
1167
+ op_type: "Transpose"
1168
+ attribute {
1169
+ name: "perm"
1170
+ ints: 0
1171
+ ints: 2
1172
+ ints: 1
1173
+ type: INTS
1174
+ }
1175
+ }
1176
+ node {
1177
+ input: "/encoder/Transpose_output_0"
1178
+ input: "onnx::MatMul_1059"
1179
+ output: "/encoder/bert_proj/MatMul_output_0"
1180
+ name: "/encoder/bert_proj/MatMul"
1181
+ op_type: "MatMul"
1182
+ }
1183
+ node {
1184
+ input: "encoder.bert_proj.bias"
1185
+ input: "/encoder/bert_proj/MatMul_output_0"
1186
+ output: "/encoder/bert_proj/Add_output_0"
1187
+ name: "/encoder/bert_proj/Add"
1188
+ op_type: "Add"
1189
+ }
1190
+ node {
1191
+ input: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
1192
+ input: "/encoder/bert_proj/Add_output_0"
1193
+ output: "/encoder/Add_output_0"
1194
+ name: "/encoder/Add"
1195
+ op_type: "Add"
1196
+ }
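# [editor's note] The encoder input embedding formed above is
#   x = word_embeddings[concat(ref_seq, text_seq)] + bert_proj(concat(ref_bert, text_bert))
# i.e. a phoneme-embedding table lookup plus a linear projection of the BERT features
# (bert_proj being the MatMul + bias Add immediately above).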
1197
+ node {
1198
+ input: "/encoder/Add_output_0"
1199
+ output: "/encoder/ar_text_position/Shape_output_0"
1200
+ name: "/encoder/ar_text_position/Shape"
1201
+ op_type: "Shape"
1202
+ }
1203
+ node {
1204
+ output: "/encoder/ar_text_position/Constant_output_0"
1205
+ name: "/encoder/ar_text_position/Constant"
1206
+ op_type: "Constant"
1207
+ attribute {
1208
+ name: "value"
1209
+ t {
1210
+ data_type: 7
1211
+ data_location: 0
1212
+ }
1213
+ type: TENSOR
1214
+ }
1215
+ }
1216
+ node {
1217
+ input: "/encoder/ar_text_position/Shape_output_0"
1218
+ input: "/encoder/ar_text_position/Constant_output_0"
1219
+ output: "/encoder/ar_text_position/Gather_output_0"
1220
+ name: "/encoder/ar_text_position/Gather"
1221
+ op_type: "Gather"
1222
+ attribute {
1223
+ name: "axis"
1224
+ i: 0
1225
+ type: INT
1226
+ }
1227
+ }
1228
+ node {
1229
+ output: "/encoder/ar_text_position/Constant_1_output_0"
1230
+ name: "/encoder/ar_text_position/Constant_1"
1231
+ op_type: "Constant"
1232
+ attribute {
1233
+ name: "value"
1234
+ t {
1235
+ data_type: 1
1236
+ data_location: 0
1237
+ }
1238
+ type: TENSOR
1239
+ }
1240
+ }
1241
+ node {
1242
+ input: "/encoder/ar_text_position/Gather_output_0"
1243
+ output: "/encoder/ar_text_position/Cast_output_0"
1244
+ name: "/encoder/ar_text_position/Cast"
1245
+ op_type: "Cast"
1246
+ attribute {
1247
+ name: "to"
1248
+ i: 1
1249
+ type: INT
1250
+ }
1251
+ }
1252
+ node {
1253
+ output: "/encoder/ar_text_position/Constant_2_output_0"
1254
+ name: "/encoder/ar_text_position/Constant_2"
1255
+ op_type: "Constant"
1256
+ attribute {
1257
+ name: "value"
1258
+ t {
1259
+ data_type: 1
1260
+ data_location: 0
1261
+ }
1262
+ type: TENSOR
1263
+ }
1264
+ }
1265
+ node {
1266
+ input: "/encoder/ar_text_position/Constant_1_output_0"
1267
+ input: "/encoder/ar_text_position/Cast_output_0"
1268
+ input: "/encoder/ar_text_position/Constant_2_output_0"
1269
+ output: "/encoder/ar_text_position/Range_output_0"
1270
+ name: "/encoder/ar_text_position/Range"
1271
+ op_type: "Range"
1272
+ }
1273
+ node {
1274
+ output: "/encoder/ar_text_position/Constant_3_output_0"
1275
+ name: "/encoder/ar_text_position/Constant_3"
1276
+ op_type: "Constant"
1277
+ attribute {
1278
+ name: "value"
1279
+ t {
1280
+ dims: 1
1281
+ data_type: 7
1282
+ data_location: 0
1283
+ }
1284
+ type: TENSOR
1285
+ }
1286
+ }
1287
+ node {
1288
+ input: "/encoder/ar_text_position/Range_output_0"
1289
+ input: "/encoder/ar_text_position/Constant_3_output_0"
1290
+ output: "/encoder/ar_text_position/Unsqueeze_output_0"
1291
+ name: "/encoder/ar_text_position/Unsqueeze"
1292
+ op_type: "Unsqueeze"
1293
+ }
1294
+ node {
1295
+ output: "onnx::Unsqueeze_909"
1296
+ name: "Constant_119"
1297
+ op_type: "Constant"
1298
+ attribute {
1299
+ name: "value"
1300
+ t {
1301
+ dims: 1
1302
+ data_type: 7
1303
+ data_location: 0
1304
+ }
1305
+ type: TENSOR
1306
+ }
1307
+ }
1308
+ node {
1309
+ input: "/encoder/ar_text_position/Gather_output_0"
1310
+ input: "onnx::Unsqueeze_909"
1311
+ output: "/encoder/ar_text_position/Unsqueeze_1_output_0"
1312
+ name: "/encoder/ar_text_position/Unsqueeze_1"
1313
+ op_type: "Unsqueeze"
1314
+ }
1315
+ node {
1316
+ output: "/encoder/ar_text_position/Constant_4_output_0"
1317
+ name: "/encoder/ar_text_position/Constant_4"
1318
+ op_type: "Constant"
1319
+ attribute {
1320
+ name: "value"
1321
+ t {
1322
+ dims: 1
1323
+ data_type: 7
1324
+ data_location: 0
1325
+ }
1326
+ type: TENSOR
1327
+ }
1328
+ }
1329
+ node {
1330
+ input: "/encoder/ar_text_position/Unsqueeze_1_output_0"
1331
+ input: "/encoder/ar_text_position/Constant_4_output_0"
1332
+ output: "/encoder/ar_text_position/Concat_output_0"
1333
+ name: "/encoder/ar_text_position/Concat"
1334
+ op_type: "Concat"
1335
+ attribute {
1336
+ name: "axis"
1337
+ i: 0
1338
+ type: INT
1339
+ }
1340
+ }
1341
+ node {
1342
+ input: "/encoder/ar_text_position/Concat_output_0"
1343
+ output: "/encoder/ar_text_position/ConstantOfShape_output_0"
1344
+ name: "/encoder/ar_text_position/ConstantOfShape"
1345
+ op_type: "ConstantOfShape"
1346
+ attribute {
1347
+ name: "value"
1348
+ t {
1349
+ dims: 1
1350
+ data_type: 1
1351
+ raw_data: "\000\000\000\000"
1352
+ }
1353
+ type: TENSOR
1354
+ }
1355
+ }
1356
+ node {
1357
+ output: "/encoder/ar_text_position/Constant_5_output_0"
1358
+ name: "/encoder/ar_text_position/Constant_5"
1359
+ op_type: "Constant"
1360
+ attribute {
1361
+ name: "value"
1362
+ t {
1363
+ dims: 256
1364
+ data_type: 1
1365
+ data_location: 0
1366
+ }
1367
+ type: TENSOR
1368
+ }
1369
+ }
1370
+ node {
1371
+ input: "/encoder/ar_text_position/Unsqueeze_output_0"
1372
+ input: "/encoder/ar_text_position/Constant_5_output_0"
1373
+ output: "/encoder/ar_text_position/Mul_output_0"
1374
+ name: "/encoder/ar_text_position/Mul"
1375
+ op_type: "Mul"
1376
+ }
1377
+ node {
1378
+ input: "/encoder/ar_text_position/Mul_output_0"
1379
+ output: "/encoder/ar_text_position/Sin_output_0"
1380
+ name: "/encoder/ar_text_position/Sin"
1381
+ op_type: "Sin"
1382
+ }
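# [editor's note] ar_text_position builds a sinusoidal positional table: positions
# arange(T) are scaled by the 256 inverse-frequency constants in Constant_5 (the /Mul
# node), then passed through Sin here and Cos further below. A hedged NumPy sketch of
# the intended result (dims assumed from Constant_5's size of 256):
#   import numpy as np
#   def sin_cos_table(T, inv_freq):              # inv_freq: (256,)
#       ang = np.arange(T)[:, None] * inv_freq   # the /encoder/ar_text_position/Mul node
#       return np.sin(ang), np.cos(ang)          # later scattered into one (T, 512) table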
1383
+ node {
1384
+ output: "/encoder/ar_text_position/Constant_6_output_0"
1385
+ name: "/encoder/ar_text_position/Constant_6"
1386
+ op_type: "Constant"
1387
+ attribute {
1388
+ name: "value"
1389
+ t {
1390
+ dims: 1
1391
+ data_type: 7
1392
+ data_location: 0
1393
+ }
1394
+ type: TENSOR
1395
+ }
1396
+ }
1397
+ node {
1398
+ output: "/encoder/ar_text_position/Constant_7_output_0"
1399
+ name: "/encoder/ar_text_position/Constant_7"
1400
+ op_type: "Constant"
1401
+ attribute {
1402
+ name: "value"
1403
+ t {
1404
+ dims: 1
1405
+ data_type: 7
1406
+ data_location: 0
1407
+ }
1408
+ type: TENSOR
1409
+ }
1410
+ }
1411
+ node {
1412
+ output: "/encoder/ar_text_position/Constant_8_output_0"
1413
+ name: "/encoder/ar_text_position/Constant_8"
1414
+ op_type: "Constant"
1415
+ attribute {
1416
+ name: "value"
1417
+ t {
1418
+ dims: 1
1419
+ data_type: 7
1420
+ data_location: 0
1421
+ }
1422
+ type: TENSOR
1423
+ }
1424
+ }
1425
+ node {
1426
+ output: "/encoder/ar_text_position/Constant_9_output_0"
1427
+ name: "/encoder/ar_text_position/Constant_9"
1428
+ op_type: "Constant"
1429
+ attribute {
1430
+ name: "value"
1431
+ t {
1432
+ dims: 1
1433
+ data_type: 7
1434
+ data_location: 0
1435
+ }
1436
+ type: TENSOR
1437
+ }
1438
+ }
1439
+ node {
1440
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1441
+ input: "/encoder/ar_text_position/Constant_7_output_0"
1442
+ input: "/encoder/ar_text_position/Constant_8_output_0"
1443
+ input: "/encoder/ar_text_position/Constant_6_output_0"
1444
+ input: "/encoder/ar_text_position/Constant_9_output_0"
1445
+ output: "/encoder/ar_text_position/Slice_output_0"
1446
+ name: "/encoder/ar_text_position/Slice"
1447
+ op_type: "Slice"
1448
+ }
1449
+ node {
1450
+ input: "/encoder/ar_text_position/Slice_output_0"
1451
+ output: "/encoder/ar_text_position/Shape_1_output_0"
1452
+ name: "/encoder/ar_text_position/Shape_1"
1453
+ op_type: "Shape"
1454
+ }
1455
+ node {
1456
+ input: "/encoder/ar_text_position/Sin_output_0"
1457
+ input: "/encoder/ar_text_position/Shape_1_output_0"
1458
+ output: "/encoder/ar_text_position/Expand_output_0"
1459
+ name: "/encoder/ar_text_position/Expand"
1460
+ op_type: "Expand"
1461
+ }
1462
+ node {
1463
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1464
+ output: "/encoder/ar_text_position/Shape_2_output_0"
1465
+ name: "/encoder/ar_text_position/Shape_2"
1466
+ op_type: "Shape"
1467
+ }
1468
+ node {
1469
+ output: "/encoder/ar_text_position/Constant_10_output_0"
1470
+ name: "/encoder/ar_text_position/Constant_10"
1471
+ op_type: "Constant"
1472
+ attribute {
1473
+ name: "value"
1474
+ t {
1475
+ data_type: 7
1476
+ data_location: 0
1477
+ }
1478
+ type: TENSOR
1479
+ }
1480
+ }
1481
+ node {
1482
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1483
+ input: "/encoder/ar_text_position/Constant_10_output_0"
1484
+ output: "/encoder/ar_text_position/Gather_1_output_0"
1485
+ name: "/encoder/ar_text_position/Gather_1"
1486
+ op_type: "Gather"
1487
+ attribute {
1488
+ name: "axis"
1489
+ i: 0
1490
+ type: INT
1491
+ }
1492
+ }
1493
+ node {
1494
+ input: "/encoder/ar_text_position/Gather_1_output_0"
1495
+ output: "/encoder/ar_text_position/Cast_1_output_0"
1496
+ name: "/encoder/ar_text_position/Cast_1"
1497
+ op_type: "Cast"
1498
+ attribute {
1499
+ name: "to"
1500
+ i: 7
1501
+ type: INT
1502
+ }
1503
+ }
1504
+ node {
1505
+ output: "/encoder/ar_text_position/Constant_11_output_0"
1506
+ name: "/encoder/ar_text_position/Constant_11"
1507
+ op_type: "Constant"
1508
+ attribute {
1509
+ name: "value"
1510
+ t {
1511
+ data_type: 7
1512
+ data_location: 0
1513
+ }
1514
+ type: TENSOR
1515
+ }
1516
+ }
1517
+ node {
1518
+ output: "/encoder/ar_text_position/Constant_12_output_0"
1519
+ name: "/encoder/ar_text_position/Constant_12"
1520
+ op_type: "Constant"
1521
+ attribute {
1522
+ name: "value"
1523
+ t {
1524
+ data_type: 7
1525
+ data_location: 0
1526
+ }
1527
+ type: TENSOR
1528
+ }
1529
+ }
1530
+ node {
1531
+ input: "/encoder/ar_text_position/Constant_11_output_0"
1532
+ input: "/encoder/ar_text_position/Cast_1_output_0"
1533
+ input: "/encoder/ar_text_position/Constant_12_output_0"
1534
+ output: "/encoder/ar_text_position/Range_1_output_0"
1535
+ name: "/encoder/ar_text_position/Range_1"
1536
+ op_type: "Range"
1537
+ }
1538
+ node {
1539
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1540
+ output: "/encoder/ar_text_position/Shape_3_output_0"
1541
+ name: "/encoder/ar_text_position/Shape_3"
1542
+ op_type: "Shape"
1543
+ }
1544
+ node {
1545
+ output: "/encoder/ar_text_position/Constant_13_output_0"
1546
+ name: "/encoder/ar_text_position/Constant_13"
1547
+ op_type: "Constant"
1548
+ attribute {
1549
+ name: "value"
1550
+ t {
1551
+ data_type: 7
1552
+ data_location: 0
1553
+ }
1554
+ type: TENSOR
1555
+ }
1556
+ }
1557
+ node {
1558
+ input: "/encoder/ar_text_position/Shape_3_output_0"
1559
+ input: "/encoder/ar_text_position/Constant_13_output_0"
1560
+ output: "/encoder/ar_text_position/Gather_2_output_0"
1561
+ name: "/encoder/ar_text_position/Gather_2"
1562
+ op_type: "Gather"
1563
+ attribute {
1564
+ name: "axis"
1565
+ i: 0
1566
+ type: INT
1567
+ }
1568
+ }
1569
+ node {
1570
+ input: "/encoder/ar_text_position/Gather_2_output_0"
1571
+ output: "/encoder/ar_text_position/Cast_2_output_0"
1572
+ name: "/encoder/ar_text_position/Cast_2"
1573
+ op_type: "Cast"
1574
+ attribute {
1575
+ name: "to"
1576
+ i: 7
1577
+ type: INT
1578
+ }
1579
+ }
1580
+ node {
1581
+ output: "/encoder/ar_text_position/Constant_14_output_0"
1582
+ name: "/encoder/ar_text_position/Constant_14"
1583
+ op_type: "Constant"
1584
+ attribute {
1585
+ name: "value"
1586
+ t {
1587
+ data_type: 7
1588
+ data_location: 0
1589
+ }
1590
+ type: TENSOR
1591
+ }
1592
+ }
1593
+ node {
1594
+ output: "/encoder/ar_text_position/Constant_15_output_0"
1595
+ name: "/encoder/ar_text_position/Constant_15"
1596
+ op_type: "Constant"
1597
+ attribute {
1598
+ name: "value"
1599
+ t {
1600
+ data_type: 7
1601
+ data_location: 0
1602
+ }
1603
+ type: TENSOR
1604
+ }
1605
+ }
1606
+ node {
1607
+ input: "/encoder/ar_text_position/Constant_14_output_0"
1608
+ input: "/encoder/ar_text_position/Cast_2_output_0"
1609
+ input: "/encoder/ar_text_position/Constant_15_output_0"
1610
+ output: "/encoder/ar_text_position/Range_2_output_0"
1611
+ name: "/encoder/ar_text_position/Range_2"
1612
+ op_type: "Range"
1613
+ }
1614
+ node {
1615
+ output: "/encoder/ar_text_position/Constant_16_output_0"
1616
+ name: "/encoder/ar_text_position/Constant_16"
1617
+ op_type: "Constant"
1618
+ attribute {
1619
+ name: "value"
1620
+ t {
1621
+ dims: 1
1622
+ data_type: 7
1623
+ data_location: 0
1624
+ }
1625
+ type: TENSOR
1626
+ }
1627
+ }
1628
+ node {
1629
+ output: "/encoder/ar_text_position/Constant_17_output_0"
1630
+ name: "/encoder/ar_text_position/Constant_17"
1631
+ op_type: "Constant"
1632
+ attribute {
1633
+ name: "value"
1634
+ t {
1635
+ dims: 1
1636
+ data_type: 7
1637
+ data_location: 0
1638
+ }
1639
+ type: TENSOR
1640
+ }
1641
+ }
1642
+ node {
1643
+ output: "/encoder/ar_text_position/Constant_18_output_0"
1644
+ name: "/encoder/ar_text_position/Constant_18"
1645
+ op_type: "Constant"
1646
+ attribute {
1647
+ name: "value"
1648
+ t {
1649
+ dims: 1
1650
+ data_type: 7
1651
+ data_location: 0
1652
+ }
1653
+ type: TENSOR
1654
+ }
1655
+ }
1656
+ node {
1657
+ output: "/encoder/ar_text_position/Constant_19_output_0"
1658
+ name: "/encoder/ar_text_position/Constant_19"
1659
+ op_type: "Constant"
1660
+ attribute {
1661
+ name: "value"
1662
+ t {
1663
+ dims: 1
1664
+ data_type: 7
1665
+ data_location: 0
1666
+ }
1667
+ type: TENSOR
1668
+ }
1669
+ }
1670
+ node {
1671
+ input: "/encoder/ar_text_position/Range_2_output_0"
1672
+ input: "/encoder/ar_text_position/Constant_17_output_0"
1673
+ input: "/encoder/ar_text_position/Constant_18_output_0"
1674
+ input: "/encoder/ar_text_position/Constant_16_output_0"
1675
+ input: "/encoder/ar_text_position/Constant_19_output_0"
1676
+ output: "/encoder/ar_text_position/Slice_1_output_0"
1677
+ name: "/encoder/ar_text_position/Slice_1"
1678
+ op_type: "Slice"
1679
+ }
1680
+ node {
1681
+ output: "/encoder/ar_text_position/Constant_20_output_0"
1682
+ name: "/encoder/ar_text_position/Constant_20"
1683
+ op_type: "Constant"
1684
+ attribute {
1685
+ name: "value"
1686
+ t {
1687
+ dims: 2
1688
+ data_type: 7
1689
+ data_location: 0
1690
+ }
1691
+ type: TENSOR
1692
+ }
1693
+ }
1694
+ node {
1695
+ input: "/encoder/ar_text_position/Range_1_output_0"
1696
+ input: "/encoder/ar_text_position/Constant_20_output_0"
1697
+ output: "/encoder/ar_text_position/Reshape_output_0"
1698
+ name: "/encoder/ar_text_position/Reshape"
1699
+ op_type: "Reshape"
1700
+ attribute {
1701
+ name: "allowzero"
1702
+ i: 0
1703
+ type: INT
1704
+ }
1705
+ }
1706
+ node {
1707
+ input: "/encoder/ar_text_position/Reshape_output_0"
1708
+ input: "/encoder/ar_text_position/Slice_1_output_0"
1709
+ output: "/encoder/ar_text_position/Add_output_0"
1710
+ name: "/encoder/ar_text_position/Add"
1711
+ op_type: "Add"
1712
+ }
1713
+ node {
1714
+ input: "/encoder/ar_text_position/Add_output_0"
1715
+ output: "/encoder/ar_text_position/Shape_4_output_0"
1716
+ name: "/encoder/ar_text_position/Shape_4"
1717
+ op_type: "Shape"
1718
+ }
1719
+ node {
1720
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1721
+ output: "/encoder/ar_text_position/Shape_5_output_0"
1722
+ name: "/encoder/ar_text_position/Shape_5"
1723
+ op_type: "Shape"
1724
+ }
1725
+ node {
1726
+ input: "/encoder/ar_text_position/Shape_5_output_0"
1727
+ output: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1728
+ name: "/encoder/ar_text_position/ConstantOfShape_1"
1729
+ op_type: "ConstantOfShape"
1730
+ attribute {
1731
+ name: "value"
1732
+ t {
1733
+ dims: 1
1734
+ data_type: 7
1735
+ raw_data: "\001\000\000\000\000\000\000\000"
1736
+ }
1737
+ type: TENSOR
1738
+ }
1739
+ }
1740
+ node {
1741
+ output: "/encoder/ar_text_position/Constant_21_output_0"
1742
+ name: "/encoder/ar_text_position/Constant_21"
1743
+ op_type: "Constant"
1744
+ attribute {
1745
+ name: "value"
1746
+ t {
1747
+ data_type: 7
1748
+ data_location: 0
1749
+ }
1750
+ type: TENSOR
1751
+ }
1752
+ }
1753
+ node {
1754
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1755
+ input: "/encoder/ar_text_position/Constant_21_output_0"
1756
+ output: "/encoder/ar_text_position/Mul_1_output_0"
1757
+ name: "/encoder/ar_text_position/Mul_1"
1758
+ op_type: "Mul"
1759
+ }
1760
+ node {
1761
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1762
+ input: "/encoder/ar_text_position/Mul_1_output_0"
1763
+ output: "/encoder/ar_text_position/Equal_output_0"
1764
+ name: "/encoder/ar_text_position/Equal"
1765
+ op_type: "Equal"
1766
+ }
1767
+ node {
1768
+ input: "/encoder/ar_text_position/Equal_output_0"
1769
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1770
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1771
+ output: "/encoder/ar_text_position/Where_output_0"
1772
+ name: "/encoder/ar_text_position/Where"
1773
+ op_type: "Where"
1774
+ }
1775
+ node {
1776
+ input: "/encoder/ar_text_position/Reshape_output_0"
1777
+ input: "/encoder/ar_text_position/Where_output_0"
1778
+ output: "/encoder/ar_text_position/Expand_1_output_0"
1779
+ name: "/encoder/ar_text_position/Expand_1"
1780
+ op_type: "Expand"
1781
+ }
1782
+ node {
1783
+ output: "/encoder/ar_text_position/Constant_22_output_0"
1784
+ name: "/encoder/ar_text_position/Constant_22"
1785
+ op_type: "Constant"
1786
+ attribute {
1787
+ name: "value"
1788
+ t {
1789
+ dims: 1
1790
+ data_type: 7
1791
+ data_location: 0
1792
+ }
1793
+ type: TENSOR
1794
+ }
1795
+ }
1796
+ node {
1797
+ input: "/encoder/ar_text_position/Expand_1_output_0"
1798
+ input: "/encoder/ar_text_position/Constant_22_output_0"
1799
+ output: "/encoder/ar_text_position/Unsqueeze_2_output_0"
1800
+ name: "/encoder/ar_text_position/Unsqueeze_2"
1801
+ op_type: "Unsqueeze"
1802
+ }
1803
+ node {
1804
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1805
+ output: "/encoder/ar_text_position/Shape_6_output_0"
1806
+ name: "/encoder/ar_text_position/Shape_6"
1807
+ op_type: "Shape"
1808
+ }
1809
+ node {
1810
+ input: "/encoder/ar_text_position/Shape_6_output_0"
1811
+ output: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1812
+ name: "/encoder/ar_text_position/ConstantOfShape_2"
1813
+ op_type: "ConstantOfShape"
1814
+ attribute {
1815
+ name: "value"
1816
+ t {
1817
+ dims: 1
1818
+ data_type: 7
1819
+ raw_data: "\001\000\000\000\000\000\000\000"
1820
+ }
1821
+ type: TENSOR
1822
+ }
1823
+ }
1824
+ node {
1825
+ output: "/encoder/ar_text_position/Constant_23_output_0"
1826
+ name: "/encoder/ar_text_position/Constant_23"
1827
+ op_type: "Constant"
1828
+ attribute {
1829
+ name: "value"
1830
+ t {
1831
+ data_type: 7
1832
+ data_location: 0
1833
+ }
1834
+ type: TENSOR
1835
+ }
1836
+ }
1837
+ node {
1838
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1839
+ input: "/encoder/ar_text_position/Constant_23_output_0"
1840
+ output: "/encoder/ar_text_position/Mul_2_output_0"
1841
+ name: "/encoder/ar_text_position/Mul_2"
1842
+ op_type: "Mul"
1843
+ }
1844
+ node {
1845
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1846
+ input: "/encoder/ar_text_position/Mul_2_output_0"
1847
+ output: "/encoder/ar_text_position/Equal_1_output_0"
1848
+ name: "/encoder/ar_text_position/Equal_1"
1849
+ op_type: "Equal"
1850
+ }
1851
+ node {
1852
+ input: "/encoder/ar_text_position/Equal_1_output_0"
1853
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1854
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1855
+ output: "/encoder/ar_text_position/Where_1_output_0"
1856
+ name: "/encoder/ar_text_position/Where_1"
1857
+ op_type: "Where"
1858
+ }
1859
+ node {
1860
+ input: "/encoder/ar_text_position/Slice_1_output_0"
1861
+ input: "/encoder/ar_text_position/Where_1_output_0"
1862
+ output: "/encoder/ar_text_position/Expand_2_output_0"
1863
+ name: "/encoder/ar_text_position/Expand_2"
1864
+ op_type: "Expand"
1865
+ }
1866
+ node {
1867
+ output: "/encoder/ar_text_position/Constant_24_output_0"
1868
+ name: "/encoder/ar_text_position/Constant_24"
1869
+ op_type: "Constant"
1870
+ attribute {
1871
+ name: "value"
1872
+ t {
1873
+ dims: 1
1874
+ data_type: 7
1875
+ data_location: 0
1876
+ }
1877
+ type: TENSOR
1878
+ }
1879
+ }
1880
+ node {
1881
+ input: "/encoder/ar_text_position/Expand_2_output_0"
1882
+ input: "/encoder/ar_text_position/Constant_24_output_0"
1883
+ output: "/encoder/ar_text_position/Unsqueeze_3_output_0"
1884
+ name: "/encoder/ar_text_position/Unsqueeze_3"
1885
+ op_type: "Unsqueeze"
1886
+ }
1887
+ node {
1888
+ input: "/encoder/ar_text_position/Unsqueeze_2_output_0"
1889
+ input: "/encoder/ar_text_position/Unsqueeze_3_output_0"
1890
+ output: "/encoder/ar_text_position/Concat_1_output_0"
1891
+ name: "/encoder/ar_text_position/Concat_1"
1892
+ op_type: "Concat"
1893
+ attribute {
1894
+ name: "axis"
1895
+ i: -1
1896
+ type: INT
1897
+ }
1898
+ }
1899
+ node {
1900
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1901
+ output: "/encoder/ar_text_position/Shape_7_output_0"
1902
+ name: "/encoder/ar_text_position/Shape_7"
1903
+ op_type: "Shape"
1904
+ }
1905
+ node {
1906
+ output: "/encoder/ar_text_position/Constant_25_output_0"
1907
+ name: "/encoder/ar_text_position/Constant_25"
1908
+ op_type: "Constant"
1909
+ attribute {
1910
+ name: "value"
1911
+ t {
1912
+ dims: 1
1913
+ data_type: 7
1914
+ data_location: 0
1915
+ }
1916
+ type: TENSOR
1917
+ }
1918
+ }
1919
+ node {
1920
+ output: "/encoder/ar_text_position/Constant_26_output_0"
1921
+ name: "/encoder/ar_text_position/Constant_26"
1922
+ op_type: "Constant"
1923
+ attribute {
1924
+ name: "value"
1925
+ t {
1926
+ dims: 1
1927
+ data_type: 7
1928
+ data_location: 0
1929
+ }
1930
+ type: TENSOR
1931
+ }
1932
+ }
1933
+ node {
1934
+ output: "/encoder/ar_text_position/Constant_27_output_0"
1935
+ name: "/encoder/ar_text_position/Constant_27"
1936
+ op_type: "Constant"
1937
+ attribute {
1938
+ name: "value"
1939
+ t {
1940
+ dims: 1
1941
+ data_type: 7
1942
+ data_location: 0
1943
+ }
1944
+ type: TENSOR
1945
+ }
1946
+ }
1947
+ node {
1948
+ input: "/encoder/ar_text_position/Shape_7_output_0"
1949
+ input: "/encoder/ar_text_position/Constant_26_output_0"
1950
+ input: "/encoder/ar_text_position/Constant_27_output_0"
1951
+ input: "/encoder/ar_text_position/Constant_25_output_0"
1952
+ output: "/encoder/ar_text_position/Slice_2_output_0"
1953
+ name: "/encoder/ar_text_position/Slice_2"
1954
+ op_type: "Slice"
1955
+ }
1956
+ node {
1957
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1958
+ input: "/encoder/ar_text_position/Slice_2_output_0"
1959
+ output: "/encoder/ar_text_position/Concat_2_output_0"
1960
+ name: "/encoder/ar_text_position/Concat_2"
1961
+ op_type: "Concat"
1962
+ attribute {
1963
+ name: "axis"
1964
+ i: 0
1965
+ type: INT
1966
+ }
1967
+ }
1968
+ node {
1969
+ input: "/encoder/ar_text_position/Expand_output_0"
1970
+ input: "/encoder/ar_text_position/Concat_2_output_0"
1971
+ output: "/encoder/ar_text_position/Reshape_1_output_0"
1972
+ name: "/encoder/ar_text_position/Reshape_1"
1973
+ op_type: "Reshape"
1974
+ attribute {
1975
+ name: "allowzero"
1976
+ i: 0
1977
+ type: INT
1978
+ }
1979
+ }
1980
+ node {
1981
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1982
+ input: "/encoder/ar_text_position/Concat_1_output_0"
1983
+ input: "/encoder/ar_text_position/Reshape_1_output_0"
1984
+ output: "/encoder/ar_text_position/ScatterND_output_0"
1985
+ name: "/encoder/ar_text_position/ScatterND"
1986
+ op_type: "ScatterND"
1987
+ }
1988
+ node {
1989
+ input: "/encoder/ar_text_position/Mul_output_0"
1990
+ output: "/encoder/ar_text_position/Cos_output_0"
1991
+ name: "/encoder/ar_text_position/Cos"
1992
+ op_type: "Cos"
1993
+ }
1994
+ node {
1995
+ output: "/encoder/ar_text_position/Constant_28_output_0"
1996
+ name: "/encoder/ar_text_position/Constant_28"
1997
+ op_type: "Constant"
1998
+ attribute {
1999
+ name: "value"
2000
+ t {
2001
+ dims: 1
2002
+ data_type: 7
2003
+ data_location: 0
2004
+ }
2005
+ type: TENSOR
2006
+ }
2007
+ }
2008
+ node {
2009
+ output: "/encoder/ar_text_position/Constant_29_output_0"
2010
+ name: "/encoder/ar_text_position/Constant_29"
2011
+ op_type: "Constant"
2012
+ attribute {
2013
+ name: "value"
2014
+ t {
2015
+ dims: 1
2016
+ data_type: 7
2017
+ data_location: 0
2018
+ }
2019
+ type: TENSOR
2020
+ }
2021
+ }
2022
+ node {
2023
+ output: "/encoder/ar_text_position/Constant_30_output_0"
2024
+ name: "/encoder/ar_text_position/Constant_30"
2025
+ op_type: "Constant"
2026
+ attribute {
2027
+ name: "value"
2028
+ t {
2029
+ dims: 1
2030
+ data_type: 7
2031
+ data_location: 0
2032
+ }
2033
+ type: TENSOR
2034
+ }
2035
+ }
2036
+ node {
2037
+ output: "/encoder/ar_text_position/Constant_31_output_0"
2038
+ name: "/encoder/ar_text_position/Constant_31"
2039
+ op_type: "Constant"
2040
+ attribute {
2041
+ name: "value"
2042
+ t {
2043
+ dims: 1
2044
+ data_type: 7
2045
+ data_location: 0
2046
+ }
2047
+ type: TENSOR
2048
+ }
2049
+ }
2050
+ node {
2051
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2052
+ input: "/encoder/ar_text_position/Constant_29_output_0"
2053
+ input: "/encoder/ar_text_position/Constant_30_output_0"
2054
+ input: "/encoder/ar_text_position/Constant_28_output_0"
2055
+ input: "/encoder/ar_text_position/Constant_31_output_0"
2056
+ output: "/encoder/ar_text_position/Slice_3_output_0"
2057
+ name: "/encoder/ar_text_position/Slice_3"
2058
+ op_type: "Slice"
2059
+ }
2060
+ node {
2061
+ input: "/encoder/ar_text_position/Slice_3_output_0"
2062
+ output: "/encoder/ar_text_position/Shape_8_output_0"
2063
+ name: "/encoder/ar_text_position/Shape_8"
2064
+ op_type: "Shape"
2065
+ }
2066
+ node {
2067
+ input: "/encoder/ar_text_position/Cos_output_0"
2068
+ input: "/encoder/ar_text_position/Shape_8_output_0"
2069
+ output: "/encoder/ar_text_position/Expand_3_output_0"
2070
+ name: "/encoder/ar_text_position/Expand_3"
2071
+ op_type: "Expand"
2072
+ }
2073
+ node {
2074
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2075
+ output: "/encoder/ar_text_position/Shape_9_output_0"
2076
+ name: "/encoder/ar_text_position/Shape_9"
2077
+ op_type: "Shape"
2078
+ }
2079
+ node {
2080
+ output: "/encoder/ar_text_position/Constant_32_output_0"
2081
+ name: "/encoder/ar_text_position/Constant_32"
2082
+ op_type: "Constant"
2083
+ attribute {
2084
+ name: "value"
2085
+ t {
2086
+ data_type: 7
2087
+ data_location: 0
2088
+ }
2089
+ type: TENSOR
2090
+ }
2091
+ }
2092
+ node {
2093
+ input: "/encoder/ar_text_position/Shape_9_output_0"
2094
+ input: "/encoder/ar_text_position/Constant_32_output_0"
2095
+ output: "/encoder/ar_text_position/Gather_3_output_0"
2096
+ name: "/encoder/ar_text_position/Gather_3"
2097
+ op_type: "Gather"
2098
+ attribute {
2099
+ name: "axis"
2100
+ i: 0
2101
+ type: INT
2102
+ }
2103
+ }
2104
+ node {
2105
+ input: "/encoder/ar_text_position/Gather_3_output_0"
2106
+ output: "/encoder/ar_text_position/Cast_3_output_0"
2107
+ name: "/encoder/ar_text_position/Cast_3"
2108
+ op_type: "Cast"
2109
+ attribute {
2110
+ name: "to"
2111
+ i: 7
2112
+ type: INT
2113
+ }
2114
+ }
2115
+ node {
2116
+ output: "/encoder/ar_text_position/Constant_33_output_0"
2117
+ name: "/encoder/ar_text_position/Constant_33"
2118
+ op_type: "Constant"
2119
+ attribute {
2120
+ name: "value"
2121
+ t {
2122
+ data_type: 7
2123
+ data_location: 0
2124
+ }
2125
+ type: TENSOR
2126
+ }
2127
+ }
2128
+ node {
2129
+ output: "/encoder/ar_text_position/Constant_34_output_0"
2130
+ name: "/encoder/ar_text_position/Constant_34"
2131
+ op_type: "Constant"
2132
+ attribute {
2133
+ name: "value"
2134
+ t {
2135
+ data_type: 7
2136
+ data_location: 0
2137
+ }
2138
+ type: TENSOR
2139
+ }
2140
+ }
2141
+ node {
2142
+ input: "/encoder/ar_text_position/Constant_33_output_0"
2143
+ input: "/encoder/ar_text_position/Cast_3_output_0"
2144
+ input: "/encoder/ar_text_position/Constant_34_output_0"
2145
+ output: "/encoder/ar_text_position/Range_3_output_0"
2146
+ name: "/encoder/ar_text_position/Range_3"
2147
+ op_type: "Range"
2148
+ }
2149
+ node {
2150
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2151
+ output: "/encoder/ar_text_position/Shape_10_output_0"
2152
+ name: "/encoder/ar_text_position/Shape_10"
2153
+ op_type: "Shape"
2154
+ }
2155
+ node {
2156
+ output: "/encoder/ar_text_position/Constant_35_output_0"
2157
+ name: "/encoder/ar_text_position/Constant_35"
2158
+ op_type: "Constant"
2159
+ attribute {
2160
+ name: "value"
2161
+ t {
2162
+ data_type: 7
2163
+ data_location: 0
2164
+ }
2165
+ type: TENSOR
2166
+ }
2167
+ }
2168
+ node {
2169
+ input: "/encoder/ar_text_position/Shape_10_output_0"
2170
+ input: "/encoder/ar_text_position/Constant_35_output_0"
2171
+ output: "/encoder/ar_text_position/Gather_4_output_0"
2172
+ name: "/encoder/ar_text_position/Gather_4"
2173
+ op_type: "Gather"
2174
+ attribute {
2175
+ name: "axis"
2176
+ i: 0
2177
+ type: INT
2178
+ }
2179
+ }
2180
+ node {
2181
+ input: "/encoder/ar_text_position/Gather_4_output_0"
2182
+ output: "/encoder/ar_text_position/Cast_4_output_0"
2183
+ name: "/encoder/ar_text_position/Cast_4"
2184
+ op_type: "Cast"
2185
+ attribute {
2186
+ name: "to"
2187
+ i: 7
2188
+ type: INT
2189
+ }
2190
+ }
2191
+ node {
2192
+ output: "/encoder/ar_text_position/Constant_36_output_0"
2193
+ name: "/encoder/ar_text_position/Constant_36"
2194
+ op_type: "Constant"
2195
+ attribute {
2196
+ name: "value"
2197
+ t {
2198
+ data_type: 7
2199
+ data_location: 0
2200
+ }
2201
+ type: TENSOR
2202
+ }
2203
+ }
2204
+ node {
2205
+ output: "/encoder/ar_text_position/Constant_37_output_0"
2206
+ name: "/encoder/ar_text_position/Constant_37"
2207
+ op_type: "Constant"
2208
+ attribute {
2209
+ name: "value"
2210
+ t {
2211
+ data_type: 7
2212
+ data_location: 0
2213
+ }
2214
+ type: TENSOR
2215
+ }
2216
+ }
2217
+ node {
2218
+ input: "/encoder/ar_text_position/Constant_36_output_0"
2219
+ input: "/encoder/ar_text_position/Cast_4_output_0"
2220
+ input: "/encoder/ar_text_position/Constant_37_output_0"
2221
+ output: "/encoder/ar_text_position/Range_4_output_0"
2222
+ name: "/encoder/ar_text_position/Range_4"
2223
+ op_type: "Range"
2224
+ }
2225
+ node {
2226
+ output: "/encoder/ar_text_position/Constant_38_output_0"
2227
+ name: "/encoder/ar_text_position/Constant_38"
2228
+ op_type: "Constant"
2229
+ attribute {
2230
+ name: "value"
2231
+ t {
2232
+ dims: 1
2233
+ data_type: 7
2234
+ data_location: 0
2235
+ }
2236
+ type: TENSOR
2237
+ }
2238
+ }
2239
+ node {
2240
+ output: "/encoder/ar_text_position/Constant_39_output_0"
2241
+ name: "/encoder/ar_text_position/Constant_39"
2242
+ op_type: "Constant"
2243
+ attribute {
2244
+ name: "value"
2245
+ t {
2246
+ dims: 1
2247
+ data_type: 7
2248
+ data_location: 0
2249
+ }
2250
+ type: TENSOR
2251
+ }
2252
+ }
2253
+ node {
2254
+ output: "/encoder/ar_text_position/Constant_40_output_0"
2255
+ name: "/encoder/ar_text_position/Constant_40"
2256
+ op_type: "Constant"
2257
+ attribute {
2258
+ name: "value"
2259
+ t {
2260
+ dims: 1
2261
+ data_type: 7
2262
+ data_location: 0
2263
+ }
2264
+ type: TENSOR
2265
+ }
2266
+ }
2267
+ node {
2268
+ output: "/encoder/ar_text_position/Constant_41_output_0"
2269
+ name: "/encoder/ar_text_position/Constant_41"
2270
+ op_type: "Constant"
2271
+ attribute {
2272
+ name: "value"
2273
+ t {
2274
+ dims: 1
2275
+ data_type: 7
2276
+ data_location: 0
2277
+ }
2278
+ type: TENSOR
2279
+ }
2280
+ }
2281
+ node {
2282
+ input: "/encoder/ar_text_position/Range_4_output_0"
2283
+ input: "/encoder/ar_text_position/Constant_39_output_0"
2284
+ input: "/encoder/ar_text_position/Constant_40_output_0"
2285
+ input: "/encoder/ar_text_position/Constant_38_output_0"
2286
+ input: "/encoder/ar_text_position/Constant_41_output_0"
2287
+ output: "/encoder/ar_text_position/Slice_4_output_0"
2288
+ name: "/encoder/ar_text_position/Slice_4"
2289
+ op_type: "Slice"
2290
+ }
2291
+ node {
2292
+ output: "/encoder/ar_text_position/Constant_42_output_0"
2293
+ name: "/encoder/ar_text_position/Constant_42"
2294
+ op_type: "Constant"
2295
+ attribute {
2296
+ name: "value"
2297
+ t {
2298
+ dims: 2
2299
+ data_type: 7
2300
+ data_location: 0
2301
+ }
2302
+ type: TENSOR
2303
+ }
2304
+ }
2305
+ node {
2306
+ input: "/encoder/ar_text_position/Range_3_output_0"
2307
+ input: "/encoder/ar_text_position/Constant_42_output_0"
2308
+ output: "/encoder/ar_text_position/Reshape_2_output_0"
2309
+ name: "/encoder/ar_text_position/Reshape_2"
2310
+ op_type: "Reshape"
2311
+ attribute {
2312
+ name: "allowzero"
2313
+ i: 0
2314
+ type: INT
2315
+ }
2316
+ }
2317
+ node {
2318
+ input: "/encoder/ar_text_position/Reshape_2_output_0"
2319
+ input: "/encoder/ar_text_position/Slice_4_output_0"
2320
+ output: "/encoder/ar_text_position/Add_1_output_0"
2321
+ name: "/encoder/ar_text_position/Add_1"
2322
+ op_type: "Add"
2323
+ }
2324
+ node {
2325
+ input: "/encoder/ar_text_position/Add_1_output_0"
2326
+ output: "/encoder/ar_text_position/Shape_11_output_0"
2327
+ name: "/encoder/ar_text_position/Shape_11"
2328
+ op_type: "Shape"
2329
+ }
2330
+ node {
2331
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2332
+ output: "/encoder/ar_text_position/Shape_12_output_0"
2333
+ name: "/encoder/ar_text_position/Shape_12"
2334
+ op_type: "Shape"
2335
+ }
2336
+ node {
2337
+ input: "/encoder/ar_text_position/Shape_12_output_0"
2338
+ output: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
2339
+ name: "/encoder/ar_text_position/ConstantOfShape_3"
2340
+ op_type: "ConstantOfShape"
2341
+ attribute {
2342
+ name: "value"
2343
+ t {
2344
+ dims: 1
2345
+ data_type: 7
2346
+ raw_data: "\001\000\000\000\000\000\000\000"
2347
+ }
2348
+ type: TENSOR
2349
+ }
2350
+ }
2351
+ node {
2352
+ output: "/encoder/ar_text_position/Constant_43_output_0"
2353
+ name: "/encoder/ar_text_position/Constant_43"
2354
+ op_type: "Constant"
2355
+ attribute {
2356
+ name: "value"
2357
+ t {
2358
+ data_type: 7
2359
+ data_location: 0
2360
+ }
2361
+ type: TENSOR
2362
+ }
2363
+ }
2364
+ node {
2365
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
2366
+ input: "/encoder/ar_text_position/Constant_43_output_0"
2367
+ output: "/encoder/ar_text_position/Mul_3_output_0"
2368
+ name: "/encoder/ar_text_position/Mul_3"
2369
+ op_type: "Mul"
2370
+ }
2371
+ node {
2372
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2373
+ input: "/encoder/ar_text_position/Mul_3_output_0"
2374
+ output: "/encoder/ar_text_position/Equal_2_output_0"
2375
+ name: "/encoder/ar_text_position/Equal_2"
2376
+ op_type: "Equal"
2377
+ }
2378
+ node {
2379
+ input: "/encoder/ar_text_position/Equal_2_output_0"
2380
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
2381
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2382
+ output: "/encoder/ar_text_position/Where_2_output_0"
2383
+ name: "/encoder/ar_text_position/Where_2"
2384
+ op_type: "Where"
2385
+ }
2386
+ node {
2387
+ input: "/encoder/ar_text_position/Reshape_2_output_0"
2388
+ input: "/encoder/ar_text_position/Where_2_output_0"
2389
+ output: "/encoder/ar_text_position/Expand_4_output_0"
2390
+ name: "/encoder/ar_text_position/Expand_4"
2391
+ op_type: "Expand"
2392
+ }
2393
+ node {
2394
+ output: "/encoder/ar_text_position/Constant_44_output_0"
2395
+ name: "/encoder/ar_text_position/Constant_44"
2396
+ op_type: "Constant"
2397
+ attribute {
2398
+ name: "value"
2399
+ t {
2400
+ dims: 1
2401
+ data_type: 7
2402
+ data_location: 0
2403
+ }
2404
+ type: TENSOR
2405
+ }
2406
+ }
2407
+ node {
2408
+ input: "/encoder/ar_text_position/Expand_4_output_0"
2409
+ input: "/encoder/ar_text_position/Constant_44_output_0"
2410
+ output: "/encoder/ar_text_position/Unsqueeze_4_output_0"
2411
+ name: "/encoder/ar_text_position/Unsqueeze_4"
2412
+ op_type: "Unsqueeze"
2413
+ }
2414
+ node {
2415
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2416
+ output: "/encoder/ar_text_position/Shape_13_output_0"
2417
+ name: "/encoder/ar_text_position/Shape_13"
2418
+ op_type: "Shape"
2419
+ }
2420
+ node {
2421
+ input: "/encoder/ar_text_position/Shape_13_output_0"
2422
+ output: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
2423
+ name: "/encoder/ar_text_position/ConstantOfShape_4"
2424
+ op_type: "ConstantOfShape"
2425
+ attribute {
2426
+ name: "value"
2427
+ t {
2428
+ dims: 1
2429
+ data_type: 7
2430
+ raw_data: "\001\000\000\000\000\000\000\000"
2431
+ }
2432
+ type: TENSOR
2433
+ }
2434
+ }
2435
+ node {
2436
+ output: "/encoder/ar_text_position/Constant_45_output_0"
2437
+ name: "/encoder/ar_text_position/Constant_45"
2438
+ op_type: "Constant"
2439
+ attribute {
2440
+ name: "value"
2441
+ t {
2442
+ data_type: 7
2443
+ data_location: 0
2444
+ }
2445
+ type: TENSOR
2446
+ }
2447
+ }
2448
+ node {
2449
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
2450
+ input: "/encoder/ar_text_position/Constant_45_output_0"
2451
+ output: "/encoder/ar_text_position/Mul_4_output_0"
2452
+ name: "/encoder/ar_text_position/Mul_4"
2453
+ op_type: "Mul"
2454
+ }
2455
+ node {
2456
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2457
+ input: "/encoder/ar_text_position/Mul_4_output_0"
2458
+ output: "/encoder/ar_text_position/Equal_3_output_0"
2459
+ name: "/encoder/ar_text_position/Equal_3"
2460
+ op_type: "Equal"
2461
+ }
2462
+ node {
2463
+ input: "/encoder/ar_text_position/Equal_3_output_0"
2464
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
2465
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2466
+ output: "/encoder/ar_text_position/Where_3_output_0"
2467
+ name: "/encoder/ar_text_position/Where_3"
2468
+ op_type: "Where"
2469
+ }
2470
+ node {
2471
+ input: "/encoder/ar_text_position/Slice_4_output_0"
2472
+ input: "/encoder/ar_text_position/Where_3_output_0"
2473
+ output: "/encoder/ar_text_position/Expand_5_output_0"
2474
+ name: "/encoder/ar_text_position/Expand_5"
2475
+ op_type: "Expand"
2476
+ }
2477
+ node {
2478
+ output: "/encoder/ar_text_position/Constant_46_output_0"
2479
+ name: "/encoder/ar_text_position/Constant_46"
2480
+ op_type: "Constant"
2481
+ attribute {
2482
+ name: "value"
2483
+ t {
2484
+ dims: 1
2485
+ data_type: 7
2486
+ data_location: 0
2487
+ }
2488
+ type: TENSOR
2489
+ }
2490
+ }
2491
+ node {
2492
+ input: "/encoder/ar_text_position/Expand_5_output_0"
2493
+ input: "/encoder/ar_text_position/Constant_46_output_0"
2494
+ output: "/encoder/ar_text_position/Unsqueeze_5_output_0"
2495
+ name: "/encoder/ar_text_position/Unsqueeze_5"
2496
+ op_type: "Unsqueeze"
2497
+ }
2498
+ node {
2499
+ input: "/encoder/ar_text_position/Unsqueeze_4_output_0"
2500
+ input: "/encoder/ar_text_position/Unsqueeze_5_output_0"
2501
+ output: "/encoder/ar_text_position/Concat_3_output_0"
2502
+ name: "/encoder/ar_text_position/Concat_3"
2503
+ op_type: "Concat"
2504
+ attribute {
2505
+ name: "axis"
2506
+ i: -1
2507
+ type: INT
2508
+ }
2509
+ }
2510
+ node {
2511
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2512
+ output: "/encoder/ar_text_position/Shape_14_output_0"
2513
+ name: "/encoder/ar_text_position/Shape_14"
2514
+ op_type: "Shape"
2515
+ }
2516
+ node {
2517
+ output: "/encoder/ar_text_position/Constant_47_output_0"
2518
+ name: "/encoder/ar_text_position/Constant_47"
2519
+ op_type: "Constant"
2520
+ attribute {
2521
+ name: "value"
2522
+ t {
2523
+ dims: 1
2524
+ data_type: 7
2525
+ data_location: 0
2526
+ }
2527
+ type: TENSOR
2528
+ }
2529
+ }
2530
+ node {
2531
+ output: "/encoder/ar_text_position/Constant_48_output_0"
2532
+ name: "/encoder/ar_text_position/Constant_48"
2533
+ op_type: "Constant"
2534
+ attribute {
2535
+ name: "value"
2536
+ t {
2537
+ dims: 1
2538
+ data_type: 7
2539
+ data_location: 0
2540
+ }
2541
+ type: TENSOR
2542
+ }
2543
+ }
2544
+ node {
2545
+ output: "/encoder/ar_text_position/Constant_49_output_0"
2546
+ name: "/encoder/ar_text_position/Constant_49"
2547
+ op_type: "Constant"
2548
+ attribute {
2549
+ name: "value"
2550
+ t {
2551
+ dims: 1
2552
+ data_type: 7
2553
+ data_location: 0
2554
+ }
2555
+ type: TENSOR
2556
+ }
2557
+ }
2558
+ node {
2559
+ input: "/encoder/ar_text_position/Shape_14_output_0"
2560
+ input: "/encoder/ar_text_position/Constant_48_output_0"
2561
+ input: "/encoder/ar_text_position/Constant_49_output_0"
2562
+ input: "/encoder/ar_text_position/Constant_47_output_0"
2563
+ output: "/encoder/ar_text_position/Slice_5_output_0"
2564
+ name: "/encoder/ar_text_position/Slice_5"
2565
+ op_type: "Slice"
2566
+ }
2567
+ node {
2568
+ input: "/encoder/ar_text_position/Shape_11_output_0"
2569
+ input: "/encoder/ar_text_position/Slice_5_output_0"
2570
+ output: "/encoder/ar_text_position/Concat_4_output_0"
2571
+ name: "/encoder/ar_text_position/Concat_4"
2572
+ op_type: "Concat"
2573
+ attribute {
2574
+ name: "axis"
2575
+ i: 0
2576
+ type: INT
2577
+ }
2578
+ }
2579
+ node {
2580
+ input: "/encoder/ar_text_position/Expand_3_output_0"
2581
+ input: "/encoder/ar_text_position/Concat_4_output_0"
2582
+ output: "/encoder/ar_text_position/Reshape_3_output_0"
2583
+ name: "/encoder/ar_text_position/Reshape_3"
2584
+ op_type: "Reshape"
2585
+ attribute {
2586
+ name: "allowzero"
2587
+ i: 0
2588
+ type: INT
2589
+ }
2590
+ }
2591
+ node {
2592
+ input: "/encoder/ar_text_position/ScatterND_output_0"
2593
+ input: "/encoder/ar_text_position/Concat_3_output_0"
2594
+ input: "/encoder/ar_text_position/Reshape_3_output_0"
2595
+ output: "/encoder/ar_text_position/ScatterND_1_output_0"
2596
+ name: "/encoder/ar_text_position/ScatterND_1"
2597
+ op_type: "ScatterND"
2598
+ }
2599
+ node {
2600
+ output: "/encoder/ar_text_position/Constant_50_output_0"
2601
+ name: "/encoder/ar_text_position/Constant_50"
2602
+ op_type: "Constant"
2603
+ attribute {
2604
+ name: "value"
2605
+ t {
2606
+ dims: 1
2607
+ data_type: 7
2608
+ data_location: 0
2609
+ }
2610
+ type: TENSOR
2611
+ }
2612
+ }
2613
+ node {
2614
+ input: "/encoder/ar_text_position/ScatterND_1_output_0"
2615
+ input: "/encoder/ar_text_position/Constant_50_output_0"
2616
+ output: "/encoder/ar_text_position/Unsqueeze_6_output_0"
2617
+ name: "/encoder/ar_text_position/Unsqueeze_6"
2618
+ op_type: "Unsqueeze"
2619
+ }
2620
+ node {
2621
+ output: "/encoder/ar_text_position/Constant_51_output_0"
2622
+ name: "/encoder/ar_text_position/Constant_51"
2623
+ op_type: "Constant"
2624
+ attribute {
2625
+ name: "value"
2626
+ t {
2627
+ data_type: 1
2628
+ data_location: 0
2629
+ }
2630
+ type: TENSOR
2631
+ }
2632
+ }
2633
+ node {
2634
+ input: "/encoder/Add_output_0"
2635
+ input: "/encoder/ar_text_position/Constant_51_output_0"
2636
+ output: "/encoder/ar_text_position/Mul_5_output_0"
2637
+ name: "/encoder/ar_text_position/Mul_5"
2638
+ op_type: "Mul"
2639
+ }
2640
+ node {
2641
+ input: "encoder.ar_text_position.alpha"
2642
+ input: "/encoder/ar_text_position/Unsqueeze_6_output_0"
2643
+ output: "/encoder/ar_text_position/Mul_6_output_0"
2644
+ name: "/encoder/ar_text_position/Mul_6"
2645
+ op_type: "Mul"
2646
+ }
2647
+ node {
2648
+ input: "/encoder/ar_text_position/Mul_5_output_0"
2649
+ input: "/encoder/ar_text_position/Mul_6_output_0"
2650
+ output: "x"
2651
+ name: "/encoder/ar_text_position/Add_2"
2652
+ op_type: "Add"
2653
+ }
2654
+ initializer {
2655
+ dims: 512
2656
+ dims: 512
2657
+ data_type: 1
2658
+ name: "encoder.ar_text_embedding.word_embeddings.weight"
2659
+ }
2660
+ initializer {
2661
+ dims: 512
2662
+ data_type: 1
2663
+ name: "encoder.bert_proj.bias"
2664
+ }
2665
+ initializer {
2666
+ dims: 1
2667
+ data_type: 1
2668
+ name: "encoder.ar_text_position.alpha"
2669
+ }
2670
+ initializer {
2671
+ dims: 768
2672
+ dims: 768
2673
+ dims: 2
2674
+ data_type: 1
2675
+ name: "vits.ssl_proj.weight"
2676
+ }
2677
+ initializer {
2678
+ dims: 768
2679
+ data_type: 1
2680
+ name: "vits.ssl_proj.bias"
2681
+ }
2682
+ initializer {
2683
+ dims: 768
2684
+ dims: 1024
2685
+ data_type: 1
2686
+ name: "onnx::MatMul_1058"
2687
+ }
2688
+ initializer {
2689
+ dims: 1024
2690
+ dims: 512
2691
+ data_type: 1
2692
+ name: "onnx::MatMul_1059"
2693
+ }
2694
+ input {
2695
+ name: "ref_seq"
2696
+ type {
2697
+ tensor_type {
2698
+ elem_type: 7
2699
+ shape {
2700
+ dim {
2701
+ dim_value: 1
2702
+ }
2703
+ dim {
2704
+ dim_param: "ref_length"
2705
+ }
2706
+ }
2707
+ }
2708
+ }
2709
+ }
2710
+ input {
2711
+ name: "text_seq"
2712
+ type {
2713
+ tensor_type {
2714
+ elem_type: 7
2715
+ shape {
2716
+ dim {
2717
+ dim_value: 1
2718
+ }
2719
+ dim {
2720
+ dim_param: "text_length"
2721
+ }
2722
+ }
2723
+ }
2724
+ }
2725
+ }
2726
+ input {
2727
+ name: "ref_bert"
2728
+ type {
2729
+ tensor_type {
2730
+ elem_type: 1
2731
+ shape {
2732
+ dim {
2733
+ dim_param: "ref_length"
2734
+ }
2735
+ dim {
2736
+ dim_value: 1024
2737
+ }
2738
+ }
2739
+ }
2740
+ }
2741
+ }
2742
+ input {
2743
+ name: "text_bert"
2744
+ type {
2745
+ tensor_type {
2746
+ elem_type: 1
2747
+ shape {
2748
+ dim {
2749
+ dim_param: "text_length"
2750
+ }
2751
+ dim {
2752
+ dim_value: 1024
2753
+ }
2754
+ }
2755
+ }
2756
+ }
2757
+ }
2758
+ input {
2759
+ name: "ssl_content"
2760
+ type {
2761
+ tensor_type {
2762
+ elem_type: 1
2763
+ shape {
2764
+ dim {
2765
+ dim_value: 1
2766
+ }
2767
+ dim {
2768
+ dim_value: 768
2769
+ }
2770
+ dim {
2771
+ dim_param: "ssl_length"
2772
+ }
2773
+ }
2774
+ }
2775
+ }
2776
+ }
2777
+ output {
2778
+ name: "x"
2779
+ type {
2780
+ tensor_type {
2781
+ elem_type: 1
2782
+ shape {
2783
+ dim {
2784
+ dim_value: 1
2785
+ }
2786
+ dim {
2787
+ dim_param: "Addx_dim_1"
2788
+ }
2789
+ dim {
2790
+ dim_param: "Addx_dim_2"
2791
+ }
2792
+ }
2793
+ }
2794
+ }
2795
+ }
2796
+ output {
2797
+ name: "prompts"
2798
+ type {
2799
+ tensor_type {
2800
+ elem_type: 7
2801
+ shape {
2802
+ dim {
2803
+ dim_value: 1
2804
+ }
2805
+ dim {
2806
+ dim_param: "Unsqueezeprompts_dim_1"
2807
+ }
2808
+ }
2809
+ }
2810
+ }
2811
+ }
2812
+ }
2813
+ opset_import {
2814
+ domain: ""
2815
+ version: 16
2816
+ }
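
The declarations above pin down the encoder's interface: five inputs (`ref_seq`, `text_seq`, `ref_bert`, `text_bert`, `ssl_content`) and two outputs (`x`, `prompts`), with int64 sequences (elem_type 7) and float32 features (elem_type 1). A minimal sketch of driving the graph directly with onnxruntime, assuming `t2s_encoder.onnx` sits in the working directory; the lengths are arbitrary placeholders, and real inputs come from the phoneme frontend and cnhubert as in `gpt-sovits.py` (zero tensors only exercise the graph, they will not produce meaningful prompts):

```python
import numpy as np
import onnxruntime

# dummy lengths; shapes and dtypes follow the input declarations above
ref_len, text_len, ssl_len = 12, 8, 50

sess = onnxruntime.InferenceSession("t2s_encoder.onnx")
x, prompts = sess.run(
    None,
    {
        "ref_seq": np.zeros((1, ref_len), dtype=np.int64),        # reference phoneme ids
        "text_seq": np.zeros((1, text_len), dtype=np.int64),      # target phoneme ids
        "ref_bert": np.zeros((ref_len, 1024), dtype=np.float32),  # per-phoneme BERT features
        "text_bert": np.zeros((text_len, 1024), dtype=np.float32),
        "ssl_content": np.zeros((1, 768, ssl_len), dtype=np.float32),  # cnhubert features
    },
)
print(x.shape, prompts.shape)  # float32 encoder states, int64 semantic prompts
```
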
models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f0b326266755e811e57fcc43294e785655bc6f444339add99e484addb7ee36
3
+ size 307531566
models/ailia-models/GPT-SoVITS/t2s_fsdec.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/t2s_sdec.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5212b7e83037acb53f9c2f178394ccc04190ea9c128c19e33098db36df08a764
3
+ size 307594527
models/ailia-models/GPT-SoVITS/t2s_sdec.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:548fb3d948fab18cb73a26f85cda19fc7f1f849db3be10ec2791fcc97db4a16a
3
+ size 307182563
models/ailia-models/GPT-SoVITS/t2s_sdec.opt.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e78c512b7765844ac4fe8f2ab6822a7e0bae68856e130e75e3623c8b3c4f506
3
+ size 307138999
models/ailia-models/GPT-SoVITS/t2s_sdec.opt2.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS/vits.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec25162dac307d37b652f1897504d1e7e80abd46e77bd1bb4a8ae66c02e28623
3
+ size 162706996
models/ailia-models/GPT-SoVITS/vits.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/cnhubert.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:558e4aabf7a7d1ef8ad89c0983a4a6413f9f4489232a35b4c1d455575f6cc242
3
+ size 377745020
models/ailia-models/GPT-SoVITS2/cnhubert.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/code/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 RVC-Boss
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
models/ailia-models/GPT-SoVITS2/code/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # GPT-SoVITS V2
2
+
3
+ ### Input
4
+ - Text to synthesize, together with a reference audio clip and its transcript, for voice cloning
5
+
6
+ ### Output
7
+ The synthesized voice is written to a .wav file whose path is defined by `SAVE_WAV_PATH` in `gpt-sovits-v2.py`.
8
+
9
+ ### Requirements
10
+ This model requires pyopenjtalk for grapheme-to-phoneme (g2p) conversion.
11
+
12
+ ```
13
+ pip3 install -r requirements.txt
14
+ ```
15
+
16
+ ### Usage
17
+ The onnx and prototxt files are downloaded automatically on the first run, so an Internet connection is required at that point.
18
+
19
+ To run with the sample sentence and sample audio:
20
+ ```
21
+ python3 gpt-sovits-v2.py
22
+ ```
23
+
24
+ To run with your own audio prompt:
25
+
26
+ ```
27
+ python3 gpt-sovits-v2.py -i "ax株式会社ではAIの実用化のための技術を開発しています。" --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。"
28
+ ```
29
+
30
+ To run with English text:
31
+
32
+ ```
33
+ python3 gpt-sovits-v2.py -i "Hello world. We are testing speech synthesis." --text_language en --ref_audio reference_audio_captured_by_ax.wav --ref_text "水をマレーシアから買わなくてはならない。" --ref_language ja
34
+ ```
35
+
36
+ ### Reference
37
+ [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
38
+
39
+ ### Framework
40
+ PyTorch 2.5.0
41
+
42
+ ### Model Format
43
+ ONNX opset = 17
44
+
45
+ ### Netron
46
+
47
+ #### Normal model
48
+
49
+ - [cnhubert.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/cnhubert.onnx.prototxt)
50
+ - [t2s_encoder.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_encoder.onnx.prototxt)
51
+ - [t2s_fsdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_fsdec.onnx.prototxt)
52
+ - [t2s_sdec.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_sdec.onnx.prototxt)
53
+ - [vits.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/gpt-sovits-v2/vits.onnx.prototxt)
models/ailia-models/GPT-SoVITS2/code/gpt-sovits-v2.py ADDED
@@ -0,0 +1,632 @@
1
+ import time
2
+ import sys
3
+
4
+ # logger
5
+ from logging import getLogger # noqa: E402
6
+
7
+ import numpy as np
8
+ import soundfile
9
+ import librosa
10
+ from tqdm import tqdm
11
+
12
+ # import original modules
13
+ sys.path.append("../../util")
14
+ from arg_utils import get_base_parser, update_parser # noqa: E402
15
+ from model_utils import check_and_download_models # noqa: E402
16
+
17
+ import ailia
18
+ from text import cleaned_text_to_sequence
19
+ from text.cleaner import clean_text
20
+
21
+
22
+ logger = getLogger(__name__)
23
+
24
+ # ======================
25
+ # PARAMETERS
26
+ # ======================
27
+
28
+ REF_WAV_PATH = "reference_audio_captured_by_ax.wav"
29
+ REF_TEXT = "水をマレーシアから買わなくてはならない。"
30
+ SAVE_WAV_PATH = "output.wav"
31
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/gpt-sovits-v2/"
32
+ WEIGHT_PATH_SSL = "cnhubert.onnx"
33
+ WEIGHT_PATH_T2S_ENCODER = "t2s_encoder.onnx"
34
+ WEIGHT_PATH_T2S_FIRST_DECODER = "t2s_fsdec.onnx"
35
+ WEIGHT_PATH_T2S_STAGE_DECODER = "t2s_sdec.onnx"
36
+ WEIGHT_PATH_VITS = "vits.onnx"
37
+ MODEL_PATH_SSL = WEIGHT_PATH_SSL + ".prototxt"
38
+ MODEL_PATH_T2S_ENCODER = WEIGHT_PATH_T2S_ENCODER + ".prototxt"
39
+ MODEL_PATH_T2S_FIRST_DECODER = WEIGHT_PATH_T2S_FIRST_DECODER + ".prototxt"
40
+ MODEL_PATH_T2S_STAGE_DECODER = WEIGHT_PATH_T2S_STAGE_DECODER + ".prototxt"
41
+ MODEL_PATH_VITS = WEIGHT_PATH_VITS + ".prototxt"
42
+
43
+
44
+ # ======================
45
+ # Argument Parser Config
46
+ # ======================
47
+
48
+ parser = get_base_parser("GPT-SoVits", None, SAVE_WAV_PATH)
49
+ # overwrite
50
+ parser.add_argument(
51
+ "--input",
52
+ "-i",
53
+ metavar="TEXT",
54
+ default="ax株式会社ではAIの実用化のための技術を開発しています。",
55
+ help="input text",
56
+ )
57
+ parser.add_argument(
58
+ "--text_language", "-tl", default="ja", choices=("ja", "en"), help="[ja, en]"
59
+ )
60
+ parser.add_argument(
61
+ "--ref_audio",
62
+ "-ra",
63
+ metavar="TEXT",
64
+ default=REF_WAV_PATH,
65
+ help="ref audio",
66
+ )
67
+ parser.add_argument(
68
+ "--ref_text",
69
+ "-rt",
70
+ metavar="TEXT",
71
+ default=REF_TEXT,
72
+ help="ref text",
73
+ )
74
+ parser.add_argument(
75
+ "--ref_language", "-rl", default="ja", choices=("ja", "en"), help="[ja, en]"
76
+ )
77
+ parser.add_argument("--top_k", type=int, default=15, help="top_k")
78
+ parser.add_argument("--top_p", type=float, default=1.0, help="top_p")
79
+ parser.add_argument("--temperature", type=float, default=1.0, help="temperature")
80
+ parser.add_argument("--speed", type=float, default=1.0, help="Speech rate")
81
+ parser.add_argument("--onnx", action="store_true", help="use onnx runtime")
82
+ parser.add_argument("--profile", action="store_true", help="use profile model")
83
+ args = update_parser(parser, check_input_type=False)
84
+
85
+
86
+ splits = {
87
+ # fmt: off
88
+ ",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…",
89
+ # fmt: on
90
+ }
91
+
92
+
93
+ # ======================
94
+ # Secondary Functions
95
+ # ======================
96
+
97
+
98
+ def split(todo_text):
99
+ todo_text = todo_text.replace("……", "。").replace("——", ",")
100
+ if todo_text[-1] not in splits:
101
+ todo_text += "。"
102
+ i_split_head = i_split_tail = 0
103
+ len_text = len(todo_text)
104
+ todo_texts = []
105
+ while 1:
106
+ if i_split_head >= len_text:
107
+ break # the text always ends with punctuation, so just exit; the last segment was appended in the previous iteration
108
+ if todo_text[i_split_head] in splits:
109
+ i_split_head += 1
110
+ todo_texts.append(todo_text[i_split_tail:i_split_head])
111
+ i_split_tail = i_split_head
112
+ else:
113
+ i_split_head += 1
114
+ return todo_texts
115
+
116
+
117
+ def cut(inp):
118
+ punctuation = set(["!", "?", "…", ",", ".", "-", " "])
119
+
120
+ inp = inp.strip("\n")
121
+ inps = split(inp)
122
+ split_idx = list(range(0, len(inps), 4))
123
+ split_idx[-1] = None
124
+ if len(split_idx) > 1:
125
+ opts = []
126
+ for idx in range(len(split_idx) - 1):
127
+ opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]]))
128
+ else:
129
+ opts = [inp]
130
+ opts = [item for item in opts if not set(item).issubset(punctuation)]
131
+ return "\n".join(opts)
132
+
133
+
134
+ def process_text(texts):
135
+ _text = []
136
+ if all(text in [None, " ", "\n", ""] for text in texts):
137
+ raise ValueError("Please enter valid text.")
138
+ for text in texts:
139
+ if text in [None, " ", ""]:
140
+ pass
141
+ else:
142
+ _text.append(text)
143
+ return _text
144
+
145
+
146
+ def merge_short_text_in_array(texts, threshold):
147
+ if (len(texts)) < 2:
148
+ return texts
149
+ result = []
150
+ text = ""
151
+ for ele in texts:
152
+ text += ele
153
+ if len(text) >= threshold:
154
+ result.append(text)
155
+ text = ""
156
+ if len(text) > 0:
157
+ if len(result) == 0:
158
+ result.append(text)
159
+ else:
160
+ result[len(result) - 1] += text
161
+ return result
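+
+ # Taken together, split/cut/merge_short_text_in_array turn free text into
+ # decoder-sized chunks: cut groups four sentence segments per line (the last
+ # group absorbs any remainder) and the merge pass folds fragments shorter than
+ # the threshold into a neighbor. A rough sketch of the expected behavior,
+ # traced by hand from the code above rather than from a recorded run:
+ #
+ # >>> cut("一。二。三。四。五。六。七。八。九。")
+ # '一。二。三。四。\n五。六。七。八。九。'
+ # >>> merge_short_text_in_array(["や。", "今日はいい天気。", "短い。"], 5)
+ # ['や。今日はいい天気。短い。']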
162
+
163
+
164
+ # ======================
165
+ # Main Logic
166
+ # ======================
167
+
168
+
169
+ class T2SModel:
170
+ def __init__(self, sess_encoder, sess_fsdec, sess_sdec):
171
+ self.hz = 50
172
+ self.max_sec = 54
173
+ self.top_k = 5
174
+ self.early_stop_num = np.array([self.hz * self.max_sec])
175
+ self.sess_encoder = sess_encoder
176
+ self.sess_fsdec = sess_fsdec
177
+ self.sess_sdec = sess_sdec
178
+
179
+ def forward(
180
+ self,
181
+ ref_seq,
182
+ text_seq,
183
+ ref_bert,
184
+ text_bert,
185
+ ssl_content,
186
+ top_k=20,
187
+ top_p=0.6,
188
+ temperature=0.6,
189
+ repetition_penalty=1.35,
190
+ ):
191
+ early_stop_num = self.early_stop_num
192
+
193
+ top_k = np.array([top_k], dtype=np.int64)
194
+ top_p = np.array([top_p], dtype=np.float32)
195
+ temperature = np.array([temperature], dtype=np.float32)
196
+ repetition_penalty = np.array([repetition_penalty], dtype=np.float32)
197
+
198
+ EOS = 1024
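+ # id marking the end of the predicted semantic token sequence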
199
+
200
+ if args.benchmark:
201
+ start = int(round(time.time() * 1000))
202
+ if args.onnx:
203
+ x, prompts = self.sess_encoder.run(
204
+ None,
205
+ {
206
+ "ref_seq": ref_seq,
207
+ "text_seq": text_seq,
208
+ "ref_bert": ref_bert,
209
+ "text_bert": text_bert,
210
+ "ssl_content": ssl_content,
211
+ },
212
+ )
213
+ else:
214
+ x, prompts = self.sess_encoder.run(
215
+ {
216
+ "ref_seq": ref_seq,
217
+ "text_seq": text_seq,
218
+ "ref_bert": ref_bert,
219
+ "text_bert": text_bert,
220
+ "ssl_content": ssl_content,
221
+ }
222
+ )
223
+ if args.benchmark:
224
+ end = int(round(time.time() * 1000))
225
+ logger.info("\tsencoder processing time {} ms".format(end - start))
226
+
227
+ prefix_len = prompts.shape[1]
228
+
229
+ if args.benchmark:
230
+ start = int(round(time.time() * 1000))
231
+ if args.onnx:
232
+ y, k, v, y_emb, x_example = self.sess_fsdec.run(
233
+ None,
234
+ {
235
+ "x": x,
236
+ "prompts": prompts,
237
+ "top_k": top_k,
238
+ "top_p": top_p,
239
+ "temperature": temperature,
240
+ "repetition_penalty": repetition_penalty,
241
+ },
242
+ )
243
+ else:
244
+ y, k, v, y_emb, x_example = self.sess_fsdec.run(
245
+ {
246
+ "x": x,
247
+ "prompts": prompts,
248
+ "top_k": top_k,
249
+ "top_p": top_p,
250
+ "temperature": temperature,
251
+ "repetition_penalty": repetition_penalty,
252
+ }
253
+ )
254
+ if args.benchmark:
255
+ end = int(round(time.time() * 1000))
256
+ logger.info("\tfsdec processing time {} ms".format(end - start))
257
+
258
+ stop = False
259
+ for idx in tqdm(range(1, 1500)):
260
+ if args.benchmark:
261
+ start = int(round(time.time() * 1000))
262
+ if args.onnx:
263
+ y, k, v, y_emb, logits, samples = self.sess_sdec.run(
264
+ None,
265
+ {
266
+ "iy": y,
267
+ "ik": k,
268
+ "iv": v,
269
+ "iy_emb": y_emb,
270
+ "ix_example": x_example,
271
+ "top_k": top_k,
272
+ "top_p": top_p,
273
+ "temperature": temperature,
274
+ "repetition_penalty": repetition_penalty,
275
+ },
276
+ )
277
+ else:
278
+ COPY_INPUT_BLOB_DATA = False
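+ # when True, the k/v cache would stay inside the runtime (the previous
+ # step's output blobs are copied back to the input blobs and only the
+ # shapes re-declared); when False, k and v are round-tripped through
+ # numpy on every decoding step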
279
+ if idx == 1:
280
+ y, k, v, y_emb, logits, samples = self.sess_sdec.run(
281
+ {
282
+ "iy": y,
283
+ "ik": k,
284
+ "iv": v,
285
+ "iy_emb": y_emb,
286
+ "ix_example": x_example,
287
+ "top_k": top_k,
288
+ "top_p": top_p,
289
+ "temperature": temperature,
290
+ "repetition_penalty": repetition_penalty,
291
+ }
292
+ )
293
+ kv_base_shape = k.shape
294
+ else:
295
+ input_blob_idx = self.sess_sdec.get_input_blob_list()
296
+ output_blob_idx = self.sess_sdec.get_output_blob_list()
297
+ self.sess_sdec.set_input_blob_data(y, 0)
298
+ if COPY_INPUT_BLOB_DATA:
299
+ kv_shape = (
300
+ kv_base_shape[0],
301
+ kv_base_shape[1] + idx - 2,
302
+ kv_base_shape[2],
303
+ kv_base_shape[3],
304
+ )
305
+ self.sess_sdec.set_input_blob_shape(kv_shape, 1)
306
+ self.sess_sdec.set_input_blob_shape(kv_shape, 2)
307
+ self.sess_sdec.copy_blob_data(
308
+ input_blob_idx[1], output_blob_idx[1], self.sess_sdec
309
+ )
310
+ self.sess_sdec.copy_blob_data(
311
+ input_blob_idx[2], output_blob_idx[2], self.sess_sdec
312
+ )
313
+ else:
314
+ self.sess_sdec.set_input_blob_data(k, 1)
315
+ self.sess_sdec.set_input_blob_data(v, 2)
316
+ self.sess_sdec.set_input_blob_data(y_emb, 3)
317
+ self.sess_sdec.set_input_blob_data(x_example, 4)
318
+ self.sess_sdec.set_input_blob_data(top_k, 5)
319
+ self.sess_sdec.set_input_blob_data(top_p, 6)
320
+ self.sess_sdec.set_input_blob_data(temperature, 7)
321
+ self.sess_sdec.set_input_blob_data(repetition_penalty, 8)
322
+ self.sess_sdec.update()
323
+ y = self.sess_sdec.get_blob_data(output_blob_idx[0])
324
+ if not COPY_INPUT_BLOB_DATA:
325
+ k = self.sess_sdec.get_blob_data(output_blob_idx[1])
326
+ v = self.sess_sdec.get_blob_data(output_blob_idx[2])
327
+ y_emb = self.sess_sdec.get_blob_data(output_blob_idx[3])
328
+ logits = self.sess_sdec.get_blob_data(output_blob_idx[4])
329
+ samples = self.sess_sdec.get_blob_data(output_blob_idx[5])
330
+
331
+ if args.benchmark:
332
+ end = int(round(time.time() * 1000))
333
+ logger.info("\tsdec processing time {} ms".format(end - start))
334
+ if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
335
+ stop = True
336
+ if np.argmax(logits, axis=-1)[0] == EOS or samples[0, 0] == EOS:
337
+ stop = True
338
+ if stop:
339
+ break
340
+ y[0, -1] = 0
341
+
342
+ return y[np.newaxis, :, -idx:-1]
343
+
344
+
345
+ class GptSoVits:
346
+ def __init__(self, t2s: T2SModel, sess):
347
+ self.t2s = t2s
348
+ self.sess = sess
349
+
350
+ def forward(
351
+ self,
352
+ ref_seq,
353
+ text_seq,
354
+ ref_bert,
355
+ text_bert,
356
+ ref_audio,
357
+ ssl_content,
358
+ top_k=20,
359
+ top_p=0.6,
360
+ temperature=0.6,
361
+ repetition_penalty=1.35,
362
+ speed=1.0,
363
+ ):
364
+ pred_semantic = self.t2s.forward(
365
+ ref_seq,
366
+ text_seq,
367
+ ref_bert,
368
+ text_bert,
369
+ ssl_content,
370
+ top_k=top_k,
371
+ top_p=top_p,
372
+ temperature=temperature,
373
+ repetition_penalty=repetition_penalty,
374
+ )
375
+ speed = np.array(speed, dtype=np.float32)
376
+ if args.benchmark:
377
+ start = int(round(time.time() * 1000))
378
+ if args.onnx:
379
+ audio1 = self.sess.run(
380
+ None,
381
+ {
382
+ "text_seq": text_seq,
383
+ "pred_semantic": pred_semantic,
384
+ "ref_audio": ref_audio,
385
+ "speed": speed,
386
+ },
387
+ )
388
+ else:
389
+ audio1 = self.sess.run(
390
+ {
391
+ "text_seq": text_seq,
392
+ "pred_semantic": pred_semantic,
393
+ "ref_audio": ref_audio,
394
+ "speed": speed,
395
+ }
396
+ )
397
+ if args.benchmark:
398
+ end = int(round(time.time() * 1000))
399
+ logger.info("\tvits processing time {} ms".format(end - start))
400
+ return audio1[0]
401
+
402
+
403
+ class SSLModel:
404
+ def __init__(self, sess):
405
+ self.sess = sess
406
+
407
+ def forward(self, ref_audio_16k):
408
+ if args.benchmark:
409
+ start = int(round(time.time() * 1000))
410
+ if args.onnx:
411
+ last_hidden_state = self.sess.run(None, {"ref_audio_16k": ref_audio_16k})
412
+ else:
413
+ last_hidden_state = self.sess.run({"ref_audio_16k": ref_audio_16k})
414
+ if args.benchmark:
415
+ end = int(round(time.time() * 1000))
416
+ logger.info("\tssl processing time {} ms".format(end - start))
417
+ return last_hidden_state[0]
418
+
419
+
420
+ def get_phones_and_bert(text, language, final=False):
421
+ if language == "en":
422
+ try:
423
+ import LangSegment
424
+
425
+ LangSegment.setfilters(["en"])
426
+ formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
427
+ except ImportError:
428
+ formattext = text
429
+ else:
430
+ formattext = text
431
+ while " " in formattext:
432
+ formattext = formattext.replace(" ", " ")
433
+
434
+ phones, word2ph, norm_text = clean_text(formattext, language)
435
+ phones = cleaned_text_to_sequence(phones)
436
+ bert = np.zeros((1024, len(phones)), dtype=np.float32)
437
+
438
+ if not final and len(phones) < 6:
439
+ return get_phones_and_bert("." + text, language, final=True)
440
+
441
+ return phones, bert, norm_text
442
+
443
+
444
+ def generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits):
445
+ gpt = T2SModel(
446
+ t2s_encoder,
447
+ t2s_first_decoder,
448
+ t2s_stage_decoder,
449
+ )
450
+ gpt_sovits = GptSoVits(gpt, vits)
451
+ ssl = SSLModel(ssl)
452
+
453
+ input_audio = args.ref_audio
454
+ ref_text = args.ref_text
455
+ ref_language = args.ref_language
456
+ text = args.input
457
+ text_language = args.text_language
458
+ top_k = args.top_k
459
+ top_p = args.top_p
460
+ temperature = args.temperature
461
+ speed = args.speed
462
+
463
+ ref_text = ref_text.strip("\n")
464
+ if ref_text[-1] not in splits:
465
+ ref_text += "。" if ref_language != "en" else "."
466
+ logger.info("Actual Input Reference Text: %s" % ref_text)
467
+
468
+ text = text.strip("\n")
469
+ logger.info("Actual Input Target Text: %s" % text)
470
+
471
+ vits_hps_data_sampling_rate = 32000
472
+ zero_wav = np.zeros(int(vits_hps_data_sampling_rate * 0.3), dtype=np.float16)
473
+
474
+ ref_audio, sr = librosa.load(input_audio, sr=vits_hps_data_sampling_rate)
475
+
476
+ ref_audio_16k = librosa.resample(ref_audio, orig_sr=sr, target_sr=16000)
477
+ if ref_audio_16k.shape[0] > 160000 or ref_audio_16k.shape[0] < 48000:
478
+ logger.warning(
479
+ "Reference audio is outside the 3-10 second range, please choose another one!"
480
+ )
481
+
482
+ # pad only the hubert input
483
+ ref_audio_16k = np.concatenate([ref_audio_16k, zero_wav], axis=0)
484
+ ref_audio_16k = ref_audio_16k[np.newaxis, :]
485
+ ssl_content = ssl.forward(ref_audio_16k)
486
+
487
+ text = cut(text) # Slice once every 4 sentences
488
+ while "\n\n" in text:
489
+ text = text.replace("\n\n", "\n")
490
+ logger.info("Actual Input Target Text (after sentence segmentation): %s" % text)
491
+ texts = text.split("\n")
492
+ texts = process_text(texts)
493
+ texts = merge_short_text_in_array(texts, 5)
494
+
495
+ ref_seq, ref_bert, _ = get_phones_and_bert(ref_text, ref_language)
496
+ ref_seq = np.array(ref_seq)[np.newaxis, :]
497
+
498
+ ref_audio = ref_audio[np.newaxis, :]
499
+
500
+ audio_opt = []
501
+ for i_text, text in enumerate(texts):
502
+ # skip blank lines in the target text, which would otherwise cause errors
503
+ if len(text.strip()) == 0:
504
+ continue
505
+ if text[-1] not in splits:
506
+ text += "。" if text_language != "en" else "."
507
+
508
+ logger.info("Actual Input Target Text (per sentence): %s" % text)
509
+ text_seq, text_bert, norm_text = get_phones_and_bert(text, text_language)
510
+ text_seq = np.array(text_seq)[np.newaxis, :]
511
+ logger.info("Processed text from the frontend (per sentence): %s" % norm_text)
512
+
513
+ audio = gpt_sovits.forward(
514
+ ref_seq,
515
+ text_seq,
516
+ ref_bert.T,
517
+ text_bert.T,
518
+ ref_audio,
519
+ ssl_content,
520
+ top_k=top_k,
521
+ top_p=top_p,
522
+ temperature=temperature,
523
+ speed=speed,
524
+ )
525
+
526
+ max_audio = np.abs(audio).max()
527
+ if max_audio > 1:
528
+ audio /= max_audio
529
+ audio_opt.append(audio)
530
+ audio_opt.append(zero_wav)
531
+
532
+ audio = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
533
+
534
+ savepath = args.savepath
535
+ logger.info(f"saved at : {savepath}")
536
+ soundfile.write(savepath, audio, vits_hps_data_sampling_rate)
537
+
538
+ logger.info("Script finished successfully.")
539
+
540
+
541
+ def main():
542
+ # model files check and download
543
+ check_and_download_models(WEIGHT_PATH_SSL, MODEL_PATH_SSL, REMOTE_PATH)
544
+ check_and_download_models(
545
+ WEIGHT_PATH_T2S_ENCODER, MODEL_PATH_T2S_ENCODER, REMOTE_PATH
546
+ )
547
+ check_and_download_models(
548
+ WEIGHT_PATH_T2S_FIRST_DECODER, MODEL_PATH_T2S_FIRST_DECODER, REMOTE_PATH
549
+ )
550
+ check_and_download_models(
551
+ WEIGHT_PATH_T2S_STAGE_DECODER, MODEL_PATH_T2S_STAGE_DECODER, REMOTE_PATH
552
+ )
553
+ check_and_download_models(WEIGHT_PATH_VITS, MODEL_PATH_VITS, REMOTE_PATH)
554
+
555
+ env_id = args.env_id
556
+
557
+ if args.onnx:
558
+ import onnxruntime
559
+
560
+ ssl = onnxruntime.InferenceSession(WEIGHT_PATH_SSL)
561
+ t2s_encoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_ENCODER)
562
+ t2s_first_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_FIRST_DECODER)
563
+ t2s_stage_decoder = onnxruntime.InferenceSession(WEIGHT_PATH_T2S_STAGE_DECODER)
564
+ vits = onnxruntime.InferenceSession(WEIGHT_PATH_VITS)
565
+ else:
566
+ memory_mode = ailia.get_memory_mode(
567
+ reduce_constant=True,
568
+ ignore_input_with_initializer=True,
569
+ reduce_interstage=False,
570
+ reuse_interstage=True,
571
+ )
572
+ ssl = ailia.Net(
573
+ weight=WEIGHT_PATH_SSL,
574
+ stream=MODEL_PATH_SSL,
575
+ memory_mode=memory_mode,
576
+ env_id=env_id,
577
+ )
578
+ t2s_encoder = ailia.Net(
579
+ weight=WEIGHT_PATH_T2S_ENCODER,
580
+ stream=MODEL_PATH_T2S_ENCODER,
581
+ memory_mode=memory_mode,
582
+ env_id=env_id,
583
+ )
584
+ t2s_first_decoder = ailia.Net(
585
+ weight=WEIGHT_PATH_T2S_FIRST_DECODER,
586
+ stream=MODEL_PATH_T2S_FIRST_DECODER,
587
+ memory_mode=memory_mode,
588
+ env_id=env_id,
589
+ )
590
+ t2s_stage_decoder = ailia.Net(
591
+ weight=WEIGHT_PATH_T2S_STAGE_DECODER,
592
+ stream=MODEL_PATH_T2S_STAGE_DECODER,
593
+ memory_mode=memory_mode,
594
+ env_id=env_id,
595
+ )
596
+ vits = ailia.Net(
597
+ weight=WEIGHT_PATH_VITS,
598
+ stream=MODEL_PATH_VITS,
599
+ memory_mode=memory_mode,
600
+ env_id=env_id,
601
+ )
602
+ if args.profile:
603
+ ssl.set_profile_mode(True)
604
+ t2s_encoder.set_profile_mode(True)
605
+ t2s_first_decoder.set_profile_mode(True)
606
+ t2s_stage_decoder.set_profile_mode(True)
607
+ vits.set_profile_mode(True)
608
+
609
+ if args.benchmark:
610
+ start = int(round(time.time() * 1000))
611
+
612
+ generate_voice(ssl, t2s_encoder, t2s_first_decoder, t2s_stage_decoder, vits)
613
+
614
+ if args.benchmark:
615
+ end = int(round(time.time() * 1000))
616
+ logger.info("\ttotal processing time {} ms".format(end - start))
617
+
618
+ if args.profile:
619
+ print("ssl : ")
620
+ print(ssl.get_summary())
621
+ print("t2s_encoder : ")
622
+ print(t2s_encoder.get_summary())
623
+ print("t2s_first_decoder : ")
624
+ print(t2s_first_decoder.get_summary())
625
+ print("t2s_stage_decoder : ")
626
+ print(t2s_stage_decoder.get_summary())
627
+ print("vits : ")
628
+ print(vits.get_summary())
629
+
630
+
631
+ if __name__ == "__main__":
632
+ main()
models/ailia-models/GPT-SoVITS2/code/reference_audio_captured_by_ax.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af474a35ab4aebaadda5a20d626c44830d5987880e54e10fc645eb73d568743
3
+ size 226298
models/ailia-models/GPT-SoVITS2/code/requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ SoundFile
2
+ librosa
3
+ nltk
4
+ pyopenjtalk>=0.3.4
5
+ g2p_en
6
+ LangSegment>=0.2.0
7
+ wordsegment
models/ailia-models/GPT-SoVITS2/code/text/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from text import symbols2 as symbols_v2
2
+
3
+ _symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
4
+
5
+
6
+ def cleaned_text_to_sequence(cleaned_text, version=None):
7
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8
+ Args:
9
+ text: string to convert to a sequence
10
+ Returns:
11
+ List of integers corresponding to the symbols in the text
12
+ """
13
+ phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
14
+
15
+ return phones
models/ailia-models/GPT-SoVITS2/code/text/cleaner.py ADDED
@@ -0,0 +1,32 @@
1
+ from . import symbols2 as symbols_v2
2
+
3
+
4
+ def clean_text(text, language):
5
+ symbols = symbols_v2.symbols
6
+ language_module_map = {
7
+ # "zh": "chinese2",
8
+ "ja": "japanese",
9
+ "en": "english",
10
+ # "ko": "korean",
11
+ # "yue": "cantonese",
12
+ }
13
+
14
+ if language not in language_module_map:
15
+ language = "en"
16
+ text = " "
17
+ language_module = __import__(
18
+ "text." + language_module_map[language],
19
+ fromlist=[language_module_map[language]],
20
+ )
21
+ norm_text = language_module.text_normalize(text)
22
+ if language == "en":
23
+ phones = language_module.g2p(norm_text)
24
+ if len(phones) < 4:
25
+ phones = [","] + phones
26
+ word2ph = None
27
+ else:
28
+ phones = language_module.g2p(norm_text)
29
+ word2ph = None
30
+ phones = ["UNK" if ph not in symbols else ph for ph in phones]
31
+
32
+ return phones, word2ph, norm_text
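
`clean_text` normalizes the text, runs the language-specific g2p, and replaces any phoneme missing from the symbol table with `UNK`; `cleaned_text_to_sequence` in `text/__init__.py` then maps the symbols to integer ids. A minimal usage sketch, assuming pyopenjtalk is installed and the script is run from the `code` directory so that the `text` package is importable (the concrete ids are not shown, since the mapping depends on the full symbol table in `symbols2.py`):

```python
from text.cleaner import clean_text
from text import cleaned_text_to_sequence

phones, word2ph, norm_text = clean_text("こんにちは。", "ja")
ids = cleaned_text_to_sequence(phones)  # one integer per phoneme symbol
print(norm_text, phones, ids)
```
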
models/ailia-models/GPT-SoVITS2/code/text/cmudict-fast.rep ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/code/text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/GPT-SoVITS2/code/text/engdict-hot.rep ADDED
@@ -0,0 +1,3 @@
1
+ CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2
+ JSON JH EY1 S AH0 N
3
+ CONDA K AA1 N D AH0
models/ailia-models/GPT-SoVITS2/code/text/english.py ADDED
@@ -0,0 +1,393 @@
1
+ import os
2
+ import re
3
+ import pickle
4
+
5
+ import unicodedata
6
+ from builtins import str as unicode
7
+
8
+ import wordsegment
9
+ from g2p_en import G2p
10
+ from g2p_en.expand import normalize_numbers
11
+ from nltk.tokenize import TweetTokenizer
12
+ from nltk import pos_tag
13
+
14
+ from .symbols2 import symbols
15
+
16
+ word_tokenize = TweetTokenizer().tokenize
17
+
18
+
19
+ current_file_path = os.path.dirname(__file__)
20
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
21
+ CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
22
+ CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
23
+ CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
24
+ NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
25
+
26
+ punctuation = ["!", "?", "…", ",", ".", "-"]
27
+
28
+ arpa = {
29
+ "AH0",
30
+ "S",
31
+ "AH1",
32
+ "EY2",
33
+ "AE2",
34
+ "EH0",
35
+ "OW2",
36
+ "UH0",
37
+ "NG",
38
+ "B",
39
+ "G",
40
+ "AY0",
41
+ "M",
42
+ "AA0",
43
+ "F",
44
+ "AO0",
45
+ "ER2",
46
+ "UH1",
47
+ "IY1",
48
+ "AH2",
49
+ "DH",
50
+ "IY0",
51
+ "EY1",
52
+ "IH0",
53
+ "K",
54
+ "N",
55
+ "W",
56
+ "IY2",
57
+ "T",
58
+ "AA1",
59
+ "ER1",
60
+ "EH2",
61
+ "OY0",
62
+ "UH2",
63
+ "UW1",
64
+ "Z",
65
+ "AW2",
66
+ "AW1",
67
+ "V",
68
+ "UW2",
69
+ "AA2",
70
+ "ER",
71
+ "AW0",
72
+ "UW0",
73
+ "R",
74
+ "OW1",
75
+ "EH1",
76
+ "ZH",
77
+ "AE0",
78
+ "IH2",
79
+ "IH",
80
+ "Y",
81
+ "JH",
82
+ "P",
83
+ "AY1",
84
+ "EY0",
85
+ "OY2",
86
+ "TH",
87
+ "HH",
88
+ "D",
89
+ "ER0",
90
+ "CH",
91
+ "AO1",
92
+ "AE1",
93
+ "AO2",
94
+ "OY1",
95
+ "AY2",
96
+ "IH1",
97
+ "OW0",
98
+ "L",
99
+ "SH",
100
+ }
101
+
102
+
103
+ def replace_phs(phs):
104
+ rep_map = {"'": "-"}
105
+ phs_new = []
106
+ for ph in phs:
107
+ if ph in symbols:
108
+ phs_new.append(ph)
109
+ elif ph in rep_map.keys():
110
+ phs_new.append(rep_map[ph])
111
+ else:
112
+ print("ph not in symbols: ", ph)
113
+ return phs_new
114
+
115
+
116
+ def replace_consecutive_punctuation(text):
117
+ punctuations = "".join(re.escape(p) for p in punctuation)
118
+ pattern = f"([{punctuations}])([{punctuations}])+"
119
+ result = re.sub(pattern, r"\1", text)
120
+ return result
121
+
122
+
123
+ def read_dict():
124
+ g2p_dict = {}
125
+ start_line = 49
126
+ with open(CMU_DICT_PATH) as f:
127
+ line = f.readline()
128
+ line_index = 1
129
+ while line:
130
+ if line_index >= start_line:
131
+ line = line.strip()
132
+ word_split = line.split(" ")
133
+ word = word_split[0].lower()
134
+
135
+ syllable_split = word_split[1].split(" - ")
136
+ g2p_dict[word] = []
137
+ for syllable in syllable_split:
138
+ phone_split = syllable.split(" ")
139
+ g2p_dict[word].append(phone_split)
140
+
141
+ line_index = line_index + 1
142
+ line = f.readline()
143
+
144
+ return g2p_dict
145
+
146
+
147
+ def read_dict_new():
148
+ g2p_dict = {}
149
+ with open(CMU_DICT_PATH, encoding="utf-8") as f:
150
+ line = f.readline()
151
+ line_index = 1
152
+ while line:
153
+ if line_index >= 57:
154
+ line = line.strip()
155
+ word_split = line.split(" ")
156
+ word = word_split[0].lower()
157
+ g2p_dict[word] = [word_split[1].split(" ")]
158
+
159
+ line_index = line_index + 1
160
+ line = f.readline()
161
+
162
+ with open(CMU_DICT_FAST_PATH, encoding="utf-8") as f:
163
+ line = f.readline()
164
+ line_index = 1
165
+ while line:
166
+ if line_index >= 0:
167
+ line = line.strip()
168
+ word_split = line.split(" ")
169
+ word = word_split[0].lower()
170
+ if word not in g2p_dict:
171
+ g2p_dict[word] = [word_split[1:]]
172
+
173
+ line_index = line_index + 1
174
+ line = f.readline()
175
+
176
+ return g2p_dict
177
+
178
+
179
+ def hot_reload_hot(g2p_dict):
180
+ with open(CMU_DICT_HOT_PATH) as f:
181
+ line = f.readline()
182
+ line_index = 1
183
+ while line:
184
+ if line_index >= 0:
185
+ line = line.strip()
186
+ word_split = line.split(" ")
187
+ word = word_split[0].lower()
188
+ # custom pronunciation entries directly override the dictionary
189
+ g2p_dict[word] = [word_split[1:]]
190
+
191
+ line_index = line_index + 1
192
+ line = f.readline()
193
+
194
+ return g2p_dict
195
+
196
+
197
+ def cache_dict(g2p_dict, file_path):
198
+ with open(file_path, "wb") as pickle_file:
199
+ pickle.dump(g2p_dict, pickle_file)
200
+
201
+
202
+ def get_dict():
203
+ if os.path.exists(CACHE_PATH):
204
+ with open(CACHE_PATH, "rb") as pickle_file:
205
+ g2p_dict = pickle.load(pickle_file)
206
+ else:
207
+ g2p_dict = read_dict_new()
208
+ cache_dict(g2p_dict, CACHE_PATH)
209
+
210
+ g2p_dict = hot_reload_hot(g2p_dict)
211
+
212
+ return g2p_dict
213
+
214
+
215
+ def get_namedict():
216
+ if os.path.exists(NAMECACHE_PATH):
217
+ with open(NAMECACHE_PATH, "rb") as pickle_file:
218
+ name_dict = pickle.load(pickle_file)
219
+ else:
220
+ name_dict = {}
221
+
222
+ return name_dict
223
+
224
+
225
+ def text_normalize(text):
226
+ # todo: eng text normalize
227
+ # map Chinese punctuation to g2p_en-compatible punctuation
228
+ rep_map = {
229
+ "[;::,;]": ",",
230
+ '["’]': "'",
231
+ "。": ".",
232
+ "!": "!",
233
+ "?": "?",
234
+ }
235
+ for p, r in rep_map.items():
236
+ text = re.sub(p, r, text)
237
+
238
+ # text formatting adapted from g2p_en
239
+ # with added handling for uppercase input
240
+ text = unicode(text)
241
+ text = normalize_numbers(text)
242
+ text = "".join(
243
+ char
244
+ for char in unicodedata.normalize("NFD", text)
245
+ if unicodedata.category(char) != "Mn"
246
+ ) # Strip accents
247
+ text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
248
+ text = re.sub(r"(?i)i\.e\.", "that is", text)
249
+ text = re.sub(r"(?i)e\.g\.", "for example", text)
250
+
251
+ # avoid reference leakage caused by consecutive punctuation
252
+ text = replace_consecutive_punctuation(text)
253
+
254
+ return text
255
+
256
+
257
+ class en_G2p(G2p):
258
+ def __init__(self):
259
+ super().__init__()
260
+ # initialize the word segmenter
261
+ wordsegment.load()
262
+
263
+ # extend the outdated dictionary and add a name dictionary
264
+ self.cmu = get_dict()
265
+ self.namedict = get_namedict()
266
+
267
+ # remove a few abbreviations whose pronunciations are wrong
268
+ for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
269
+ del self.cmu[word.lower()]
270
+
271
+ # fix heteronyms
272
+ self.homograph2features["read"] = (["R", "IY1", "D"], ["R", "EH1", "D"], "VBP")
273
+ self.homograph2features["complex"] = (
274
+ ["K", "AH0", "M", "P", "L", "EH1", "K", "S"],
275
+ ["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
276
+ "JJ",
277
+ )
278
+
279
+ def __call__(self, text):
280
+ # tokenization
281
+ words = word_tokenize(text)
282
+ tokens = pos_tag(words) # tuples of (word, tag)
283
+
284
+ # steps
285
+ prons = []
286
+ for o_word, pos in tokens:
287
+ # reproduce g2p_en's lowercasing logic
288
+ word = o_word.lower()
289
+
290
+ if re.search("[a-z]", word) is None:
291
+ pron = [word]
292
+ # handle single letters first
293
+ elif len(word) == 1:
294
+ # fix the pronunciation of a standalone A; the original o_word is needed to check capitalization
295
+ if o_word == "A":
296
+ pron = ["EY1"]
297
+ else:
298
+ pron = self.cmu[word][0]
299
+ # original g2p_en heteronym handling
300
+ elif word in self.homograph2features: # Check homograph
301
+ pron1, pron2, pos1 = self.homograph2features[word]
302
+ if pos.startswith(pos1):
303
+ pron = pron1
304
+ # pos1 being longer than pos only occurs for "read"
305
+ elif len(pos) < len(pos1) and pos == pos1[: len(pos)]:
306
+ pron = pron1
307
+ else:
308
+ pron = pron2
309
+ else:
310
+ # recursive lookup and prediction
311
+ pron = self.qryword(o_word)
312
+
313
+ prons.extend(pron)
314
+ prons.extend([" "])
315
+
316
+ return prons[:-1]
317
+
318
+ def qryword(self, o_word):
319
+ word = o_word.lower()
320
+
321
+ # dictionary lookup, except for single letters
322
+ if len(word) > 1 and word in self.cmu: # lookup CMU dict
323
+ return self.cmu[word][0]
324
+
325
+ # consult the name dictionary when only the first letter is capitalized
326
+ if o_word.istitle() and word in self.namedict:
327
+ return self.namedict[word][0]
328
+
329
+ # spell out OOV words of three letters or fewer, letter by letter
330
+ if len(word) <= 3:
331
+ phones = []
332
+ for w in word:
333
+ # fix the pronunciation of a standalone a; no uppercase case occurs here
334
+ if w == "a":
335
+ phones.extend(["EY1"])
336
+ elif not w.isalpha():
337
+ phones.extend([w])
338
+ else:
339
+ phones.extend(self.cmu[w][0])
340
+ return phones
341
+
342
+ # try to split off the possessive 's
343
+ if re.match(r"^([a-z]+)('s)$", word):
344
+ phones = self.qryword(word[:-2])[:]
345
+ # after the voiceless consonants P T K F TH HH, 's is pronounced ['S']
346
+ if phones[-1] in ["P", "T", "K", "F", "TH", "HH"]:
347
+ phones.extend(["S"])
348
+ # after the sibilants S Z SH ZH CH JH, 's is pronounced ['IH1', 'Z'] or ['AH0', 'Z']
349
+ elif phones[-1] in ["S", "Z", "SH", "ZH", "CH", "JH"]:
350
+ phones.extend(["AH0", "Z"])
351
+ # after the voiced consonants B D G DH V M N NG L R W Y, 's is pronounced ['Z']
352
+ # AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
353
+ # ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2 vowel endings: 's is pronounced ['Z']
354
+ else:
355
+ phones.extend(["Z"])
356
+ return phones
357
+
358
+ # try word segmentation to handle compound words
359
+ comps = wordsegment.segment(word.lower())
360
+
361
+ # words that cannot be segmented go back to the model for prediction
362
+ if len(comps) == 1:
363
+ return self.predict(word)
364
+
365
+ # recursively process the segmented components
366
+ return [phone for comp in comps for phone in self.qryword(comp)]
367
+
368
+
369
+ _g2p = en_G2p()
370
+
371
+
372
+ def g2p(text):
373
+ # run g2p_en over the whole passage and drop returned symbols outside the ARPA set
374
+ phone_list = _g2p(text)
375
+ phones = [
376
+ ph if ph != "<unk>" else "UNK"
377
+ for ph in phone_list
378
+ if ph not in [" ", "<pad>", "UW", "</s>", "<s>"]
379
+ ]
380
+
381
+ return replace_phs(phones)
382
+
383
+
384
+ if __name__ == "__main__":
385
+ print(g2p("hello"))
386
+ print(g2p(text_normalize("e.g. I used openai's AI tool to draw a picture.")))
387
+ print(
388
+ g2p(
389
+ text_normalize(
390
+ "In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder."
391
+ )
392
+ )
393
+ )
models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/user.dict ADDED
Binary file (4.13 kB). View file
 
models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.csv ADDED
@@ -0,0 +1 @@
1
+ 主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,*
models/ailia-models/GPT-SoVITS2/code/text/ja_userdic/userdict.md5 ADDED
@@ -0,0 +1 @@
1
+ d448850ab3b6f07c4db19fd6f8181cbe
models/ailia-models/GPT-SoVITS2/code/text/japanese.py ADDED
@@ -0,0 +1,207 @@
1
+ # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
2
+ import re
3
+ import os
4
+ import hashlib
5
+
6
+ import pyopenjtalk
7
+
8
+ current_file_path = os.path.dirname(__file__)
9
+
10
+
11
+ def get_hash(fp: str) -> str:
12
+ hash_md5 = hashlib.md5()
13
+ with open(fp, "rb") as f:
14
+ for chunk in iter(lambda: f.read(4096), b""):
15
+ hash_md5.update(chunk)
16
+ return hash_md5.hexdigest()
17
+
18
+
19
+ USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
20
+ USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
21
+ USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
22
+ # if there is no user dictionary, build one; if there is, check its md5 and rebuild it when the hash differs
23
+ if os.path.exists(USERDIC_CSV_PATH):
24
+ if (
25
+ not os.path.exists(USERDIC_BIN_PATH)
26
+ or get_hash(USERDIC_CSV_PATH)
27
+ != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read()
28
+ ):
29
+ pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
30
+ with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f:
31
+ f.write(get_hash(USERDIC_CSV_PATH))
32
+
33
+ if os.path.exists(USERDIC_BIN_PATH):
34
+ pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
35
+
36
+
37
+ from text.symbols2 import punctuation
38
+
39
+ # Regular expression matching Japanese without punctuation marks:
40
+ _japanese_characters = re.compile(
41
+ r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
42
+ )
43
+
44
+ # Regular expression matching non-Japanese characters or punctuation marks:
45
+ _japanese_marks = re.compile(
46
+ r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
47
+ )
48
+
49
+ # List of (symbol, Japanese) pairs for marks:
50
+ _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
51
+
52
+
53
+ def post_replace_ph(ph):
54
+ rep_map = {
55
+ ":": ",",
56
+ ";": ",",
57
+ ",": ",",
58
+ "。": ".",
59
+ "!": "!",
60
+ "?": "?",
61
+ "\n": ".",
62
+ "·": ",",
63
+ "、": ",",
64
+ "...": "…",
65
+ }
66
+
67
+ if ph in rep_map.keys():
68
+ ph = rep_map[ph]
69
+ # if ph in symbols:
70
+ # return ph
71
+ # if ph not in symbols:
72
+ # ph = "UNK"
73
+ return ph
74
+
75
+
76
+ def replace_consecutive_punctuation(text):
77
+ punctuations = "".join(re.escape(p) for p in punctuation)
78
+ pattern = f"([{punctuations}])([{punctuations}])+"
79
+ result = re.sub(pattern, r"\1", text)
80
+ return result
81
+
82
+
83
+ def symbols_to_japanese(text):
84
+ for regex, replacement in _symbols_to_japanese:
85
+ text = re.sub(regex, replacement, text)
86
+ return text
87
+
88
+
89
+ def preprocess_jap(text, with_prosody=False):
90
+ """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
91
+ text = symbols_to_japanese(text)
92
+ sentences = re.split(_japanese_marks, text)
93
+ marks = re.findall(_japanese_marks, text)
94
+ text = []
95
+ for i, sentence in enumerate(sentences):
96
+ if re.match(_japanese_characters, sentence):
97
+ if with_prosody:
98
+ text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
99
+ else:
100
+ p = pyopenjtalk.g2p(sentence)
101
+ text += p.split(" ")
102
+
103
+ if i < len(marks):
104
+ if marks[i] == " ": # prevent an unexpected UNK
105
+ continue
106
+ text += [marks[i].replace(" ", "")]
107
+ return text
108
+
109
+
110
+ def text_normalize(text):
111
+ # todo: jap text normalize
112
+
113
+ # avoid reference leakage caused by consecutive punctuation
114
+ text = replace_consecutive_punctuation(text)
115
+ return text
116
+
117
+
118
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
119
+ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
120
+ """Extract phoneme + prosoody symbol sequence from input full-context labels.
121
+
122
+ The algorithm is based on `Prosodic features control by symbols as input of
123
+ sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
124
+
125
+ Args:
126
+ text (str): Input text.
127
+ drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
128
+
129
+ Returns:
130
+ List[str]: List of phoneme + prosody symbols.
131
+
132
+ Examples:
133
+ >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
134
+ >>> pyopenjtalk_g2p_prosody("こんにちは。")
135
+ ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
136
+
137
+ .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
138
+ modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
139
+
140
+ """
141
+ labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
142
+ N = len(labels)
143
+
144
+ phones = []
145
+ for n in range(N):
146
+ lab_curr = labels[n]
147
+
148
+ # current phoneme
149
+ p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
150
+ # treat unvoiced vowels as normal vowels
151
+ if drop_unvoiced_vowels and p3 in "AEIOU":
152
+ p3 = p3.lower()
153
+
154
+ # deal with sil at the beginning and the end of text
155
+ if p3 == "sil":
156
+ assert n == 0 or n == N - 1
157
+ if n == 0:
158
+ phones.append("^")
159
+ elif n == N - 1:
160
+ # check question form or not
161
+ e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
162
+ if e3 == 0:
163
+ phones.append("$")
164
+ elif e3 == 1:
165
+ phones.append("?")
166
+ continue
167
+ elif p3 == "pau":
168
+ phones.append("_")
169
+ continue
170
+ else:
171
+ phones.append(p3)
172
+
173
+ # accent type and position info (forward or backward)
174
+ a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
175
+ a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
176
+ a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
177
+
178
+ # number of mora in accent phrase
179
+ f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
180
+
181
+ a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
182
+ # accent phrase border
183
+ if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
184
+ phones.append("#")
185
+ # pitch falling
186
+ elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
187
+ phones.append("]")
188
+ # pitch rising
189
+ elif a2 == 1 and a2_next == 2:
190
+ phones.append("[")
191
+
192
+ return phones
193
+
194
+
195
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
196
+ def _numeric_feature_by_regex(regex, s):
197
+ match = re.search(regex, s)
198
+ if match is None:
199
+ return -50
200
+ return int(match.group(1))
201
+
202
+
203
+ def g2p(norm_text, with_prosody=True):
204
+ phones = preprocess_jap(norm_text, with_prosody)
205
+ phones = [post_replace_ph(i) for i in phones]
206
+ # todo: implement tones and word2ph
207
+ return phones
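For quick reference, the prosody markers emitted above are: "^"/"$" for utterance start/end, "?" for a question ending, "_" for a pause, "#" for an accent-phrase border, and "["/"]" for pitch rise/fall. A minimal usage sketch, not part of the committed file, assuming this module is importable as text.japanese and pyopenjtalk is installed:

    # Hypothetical usage sketch: run the prosody-aware g2p on a Japanese sentence.
    from text.japanese import g2p, text_normalize

    norm = text_normalize("こんにちは。")
    phones = g2p(norm, with_prosody=True)
    # g2p strips the leading "^" and trailing "$" markers, so the result is
    # roughly ['k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '.'].
    print(phones)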
models/ailia-models/GPT-SoVITS2/code/text/namedict_cache.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:559552094c4a6e995213e3fa586330e078ef8cb3a7a95a3109e945111cd2bfc1
+ size 760663
models/ailia-models/GPT-SoVITS2/code/text/symbols2.py ADDED
@@ -0,0 +1,785 @@
+ punctuation = ["!", "?", "…", ",", "."]  # "@" is the SP pause symbol
+ punctuation.append("-")
+ pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
+ pad = "_"
+
+ c = [
+     "AA",
+     "EE",
+     "OO",
+     "b",
+     "c",
+     "ch",
+     "d",
+     "f",
+     "g",
+     "h",
+     "j",
+     "k",
+     "l",
+     "m",
+     "n",
+     "p",
+     "q",
+     "r",
+     "s",
+     "sh",
+     "t",
+     "w",
+     "x",
+     "y",
+     "z",
+     "zh",
+ ]
+ v = [
+     "E1",
+     "En1",
+     "a1",
+     "ai1",
+     "an1",
+     "ang1",
+     "ao1",
+     "e1",
+     "ei1",
+     "en1",
+     "eng1",
+     "er1",
+     "i1",
+     "i01",
+     "ia1",
+     "ian1",
+     "iang1",
+     "iao1",
+     "ie1",
+     "in1",
+     "ing1",
+     "iong1",
+     "ir1",
+     "iu1",
+     "o1",
+     "ong1",
+     "ou1",
+     "u1",
+     "ua1",
+     "uai1",
+     "uan1",
+     "uang1",
+     "ui1",
+     "un1",
+     "uo1",
+     "v1",
+     "van1",
+     "ve1",
+     "vn1",
+     "E2",
+     "En2",
+     "a2",
+     "ai2",
+     "an2",
+     "ang2",
+     "ao2",
+     "e2",
+     "ei2",
+     "en2",
+     "eng2",
+     "er2",
+     "i2",
+     "i02",
+     "ia2",
+     "ian2",
+     "iang2",
+     "iao2",
+     "ie2",
+     "in2",
+     "ing2",
+     "iong2",
+     "ir2",
+     "iu2",
+     "o2",
+     "ong2",
+     "ou2",
+     "u2",
+     "ua2",
+     "uai2",
+     "uan2",
+     "uang2",
+     "ui2",
+     "un2",
+     "uo2",
+     "v2",
+     "van2",
+     "ve2",
+     "vn2",
+     "E3",
+     "En3",
+     "a3",
+     "ai3",
+     "an3",
+     "ang3",
+     "ao3",
+     "e3",
+     "ei3",
+     "en3",
+     "eng3",
+     "er3",
+     "i3",
+     "i03",
+     "ia3",
+     "ian3",
+     "iang3",
+     "iao3",
+     "ie3",
+     "in3",
+     "ing3",
+     "iong3",
+     "ir3",
+     "iu3",
+     "o3",
+     "ong3",
+     "ou3",
+     "u3",
+     "ua3",
+     "uai3",
+     "uan3",
+     "uang3",
+     "ui3",
+     "un3",
+     "uo3",
+     "v3",
+     "van3",
+     "ve3",
+     "vn3",
+     "E4",
+     "En4",
+     "a4",
+     "ai4",
+     "an4",
+     "ang4",
+     "ao4",
+     "e4",
+     "ei4",
+     "en4",
+     "eng4",
+     "er4",
+     "i4",
+     "i04",
+     "ia4",
+     "ian4",
+     "iang4",
+     "iao4",
+     "ie4",
+     "in4",
+     "ing4",
+     "iong4",
+     "ir4",
+     "iu4",
+     "o4",
+     "ong4",
+     "ou4",
+     "u4",
+     "ua4",
+     "uai4",
+     "uan4",
+     "uang4",
+     "ui4",
+     "un4",
+     "uo4",
+     "v4",
+     "van4",
+     "ve4",
+     "vn4",
+     "E5",
+     "En5",
+     "a5",
+     "ai5",
+     "an5",
+     "ang5",
+     "ao5",
+     "e5",
+     "ei5",
+     "en5",
+     "eng5",
+     "er5",
+     "i5",
+     "i05",
+     "ia5",
+     "ian5",
+     "iang5",
+     "iao5",
+     "ie5",
+     "in5",
+     "ing5",
+     "iong5",
+     "ir5",
+     "iu5",
+     "o5",
+     "ong5",
+     "ou5",
+     "u5",
+     "ua5",
+     "uai5",
+     "uan5",
+     "uang5",
+     "ui5",
+     "un5",
+     "uo5",
+     "v5",
+     "van5",
+     "ve5",
+     "vn5",
+ ]
+
+ v_without_tone = [
+     "E",
+     "En",
+     "a",
+     "ai",
+     "an",
+     "ang",
+     "ao",
+     "e",
+     "ei",
+     "en",
+     "eng",
+     "er",
+     "i",
+     "i0",
+     "ia",
+     "ian",
+     "iang",
+     "iao",
+     "ie",
+     "in",
+     "ing",
+     "iong",
+     "ir",
+     "iu",
+     "o",
+     "ong",
+     "ou",
+     "u",
+     "ua",
+     "uai",
+     "uan",
+     "uang",
+     "ui",
+     "un",
+     "uo",
+     "v",
+     "van",
+     "ve",
+     "vn",
+ ]
+
+ # japanese
+ ja_symbols = [
+     "I",
+     "N",
+     "U",
+     "a",
+     "b",
+     "by",
+     "ch",
+     "cl",
+     "d",
+     "dy",
+     "e",
+     "f",
+     "g",
+     "gy",
+     "h",
+     "hy",
+     "i",
+     "j",
+     "k",
+     "ky",
+     "m",
+     "my",
+     "n",
+     "ny",
+     "o",
+     "p",
+     "py",
+     "r",
+     "ry",
+     "s",
+     "sh",
+     "t",
+     "ts",
+     "u",
+     "v",
+     "w",
+     "y",
+     "z",
+     ### the two contour symbols below are appended later (at the end of symbols)
+     # "[",  # rising pitch contour
+     # "]",  # falling pitch contour
+     # "$",  # end marker
+     # "^",  # start marker
+ ]
+
+ arpa = {
+     "AH0",
+     "S",
+     "AH1",
+     "EY2",
+     "AE2",
+     "EH0",
+     "OW2",
+     "UH0",
+     "NG",
+     "B",
+     "G",
+     "AY0",
+     "M",
+     "AA0",
+     "F",
+     "AO0",
+     "ER2",
+     "UH1",
+     "IY1",
+     "AH2",
+     "DH",
+     "IY0",
+     "EY1",
+     "IH0",
+     "K",
+     "N",
+     "W",
+     "IY2",
+     "T",
+     "AA1",
+     "ER1",
+     "EH2",
+     "OY0",
+     "UH2",
+     "UW1",
+     "Z",
+     "AW2",
+     "AW1",
+     "V",
+     "UW2",
+     "AA2",
+     "ER",
+     "AW0",
+     "UW0",
+     "R",
+     "OW1",
+     "EH1",
+     "ZH",
+     "AE0",
+     "IH2",
+     "IH",
+     "Y",
+     "JH",
+     "P",
+     "AY1",
+     "EY0",
+     "OY2",
+     "TH",
+     "HH",
+     "D",
+     "ER0",
+     "CH",
+     "AO1",
+     "AE1",
+     "AO2",
+     "OY1",
+     "AY2",
+     "IH1",
+     "OW0",
+     "L",
+     "SH",
+ }
+
+ ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停"
+
+ yue_symbols = {
+     "Yeot3",
+     "Yip1",
+     "Yyu3",
+     "Yeng4",
+     "Yut5",
+     "Yaan5",
+     "Ym5",
+     "Yaan6",
+     "Yang1",
+     "Yun4",
+     "Yon2",
+     "Yui5",
+     "Yun2",
+     "Yat3",
+     "Ye",
+     "Yeot1",
+     "Yoeng5",
+     "Yoek2",
+     "Yam2",
+     "Yeon6",
+     "Yu6",
+     "Yiu3",
+     "Yaang6",
+     "Yp5",
+     "Yai4",
+     "Yoek4",
+     "Yit6",
+     "Yam5",
+     "Yoeng6",
+     "Yg1",
+     "Yk3",
+     "Yoe4",
+     "Yam3",
+     "Yc",
+     "Yyu4",
+     "Yyut1",
+     "Yiu4",
+     "Ying3",
+     "Yip3",
+     "Yaap3",
+     "Yau3",
+     "Yan4",
+     "Yau1",
+     "Yap4",
+     "Yk6",
+     "Yok3",
+     "Yai1",
+     "Yeot6",
+     "Yan2",
+     "Yoek6",
+     "Yt1",
+     "Yoi1",
+     "Yit5",
+     "Yn4",
+     "Yaau3",
+     "Yau4",
+     "Yuk6",
+     "Ys",
+     "Yuk",
+     "Yin6",
+     "Yung6",
+     "Ya",
+     "You",
+     "Yaai5",
+     "Yau5",
+     "Yoi3",
+     "Yaak3",
+     "Yaat3",
+     "Ying2",
+     "Yok5",
+     "Yeng2",
+     "Yyut3",
+     "Yam1",
+     "Yip5",
+     "You1",
+     "Yam6",
+     "Yaa5",
+     "Yi6",
+     "Yek4",
+     "Yyu2",
+     "Yuk5",
+     "Yaam1",
+     "Yang2",
+     "Yai",
+     "Yiu6",
+     "Yin4",
+     "Yok4",
+     "Yot3",
+     "Yui2",
+     "Yeoi5",
+     "Yyun6",
+     "Yyu5",
+     "Yoi5",
+     "Yeot2",
+     "Yim4",
+     "Yeoi2",
+     "Yaan1",
+     "Yang6",
+     "Yong1",
+     "Yaang4",
+     "Yung5",
+     "Yeon1",
+     "Yin2",
+     "Ya3",
+     "Yaang3",
+     "Yg",
+     "Yk2",
+     "Yaau5",
+     "Yut1",
+     "Yt5",
+     "Yip4",
+     "Yung4",
+     "Yj",
+     "Yong3",
+     "Ya1",
+     "Yg6",
+     "Yaau6",
+     "Yit3",
+     "Yun3",
+     "Ying1",
+     "Yn2",
+     "Yg4",
+     "Yl",
+     "Yp3",
+     "Yn3",
+     "Yak1",
+     "Yang5",
+     "Yoe6",
+     "You2",
+     "Yap2",
+     "Yak2",
+     "Yt3",
+     "Yot5",
+     "Yim2",
+     "Yi1",
+     "Yn6",
+     "Yaat5",
+     "Yaam3",
+     "Yoek5",
+     "Ye3",
+     "Yeon4",
+     "Yaa2",
+     "Yu3",
+     "Yim6",
+     "Ym",
+     "Yoe3",
+     "Yaai2",
+     "Ym2",
+     "Ya6",
+     "Yeng6",
+     "Yik4",
+     "Yot4",
+     "Yaai4",
+     "Yyun3",
+     "Yu1",
+     "Yoeng1",
+     "Yaap2",
+     "Yuk3",
+     "Yoek3",
+     "Yeng5",
+     "Yeoi1",
+     "Yiu2",
+     "Yok1",
+     "Yo1",
+     "Yoek1",
+     "Yoeng2",
+     "Yeon5",
+     "Yiu1",
+     "Yoeng4",
+     "Yuk2",
+     "Yat4",
+     "Yg5",
+     "Yut4",
+     "Yan6",
+     "Yin3",
+     "Yaa6",
+     "Yap1",
+     "Yg2",
+     "Yoe5",
+     "Yt4",
+     "Ya5",
+     "Yo4",
+     "Yyu1",
+     "Yak3",
+     "Yeon2",
+     "Yong4",
+     "Ym1",
+     "Ye2",
+     "Yaang5",
+     "Yoi2",
+     "Yeng3",
+     "Yn",
+     "Yyut4",
+     "Yau",
+     "Yaak2",
+     "Yaan4",
+     "Yek2",
+     "Yin1",
+     "Yi5",
+     "Yoe2",
+     "Yei5",
+     "Yaat6",
+     "Yak5",
+     "Yp6",
+     "Yok6",
+     "Yei2",
+     "Yaap1",
+     "Yyut5",
+     "Yi4",
+     "Yim1",
+     "Yk5",
+     "Ye4",
+     "Yok2",
+     "Yaam6",
+     "Yat2",
+     "Yon6",
+     "Yei3",
+     "Yyu6",
+     "Yeot5",
+     "Yk4",
+     "Yai6",
+     "Yd",
+     "Yg3",
+     "Yei6",
+     "Yau2",
+     "Yok",
+     "Yau6",
+     "Yung3",
+     "Yim5",
+     "Yut6",
+     "Yit1",
+     "Yon3",
+     "Yat1",
+     "Yaam2",
+     "Yyut2",
+     "Yui6",
+     "Yt2",
+     "Yek6",
+     "Yt",
+     "Ye6",
+     "Yang3",
+     "Ying6",
+     "Yaau1",
+     "Yeon3",
+     "Yng",
+     "Yh",
+     "Yang4",
+     "Ying5",
+     "Yaap6",
+     "Yoeng3",
+     "Yyun4",
+     "You3",
+     "Yan5",
+     "Yat5",
+     "Yot1",
+     "Yun1",
+     "Yi3",
+     "Yaa1",
+     "Yaap4",
+     "You6",
+     "Yaang2",
+     "Yaap5",
+     "Yaa3",
+     "Yaak6",
+     "Yeng1",
+     "Yaak1",
+     "Yo5",
+     "Yoi4",
+     "Yam4",
+     "Yik1",
+     "Ye1",
+     "Yai5",
+     "Yung1",
+     "Yp2",
+     "Yui4",
+     "Yaak4",
+     "Yung2",
+     "Yak4",
+     "Yaat4",
+     "Yeoi4",
+     "Yut2",
+     "Yin5",
+     "Yaau4",
+     "Yap6",
+     "Yb",
+     "Yaam4",
+     "Yw",
+     "Yut3",
+     "Yong2",
+     "Yt6",
+     "Yaai6",
+     "Yap5",
+     "Yik5",
+     "Yun6",
+     "Yaam5",
+     "Yun5",
+     "Yik3",
+     "Ya2",
+     "Yyut6",
+     "Yon4",
+     "Yk1",
+     "Yit4",
+     "Yak6",
+     "Yaan2",
+     "Yuk1",
+     "Yai2",
+     "Yik2",
+     "Yaat2",
+     "Yo3",
+     "Ykw",
+     "Yn5",
+     "Yaa",
+     "Ye5",
+     "Yu4",
+     "Yei1",
+     "Yai3",
+     "Yyun5",
+     "Yip2",
+     "Yaau2",
+     "Yiu5",
+     "Ym4",
+     "Yeoi6",
+     "Yk",
+     "Ym6",
+     "Yoe1",
+     "Yeoi3",
+     "Yon",
+     "Yuk4",
+     "Yaai3",
+     "Yaa4",
+     "Yot6",
+     "Yaang1",
+     "Yei4",
+     "Yek1",
+     "Yo",
+     "Yp",
+     "Yo6",
+     "Yp4",
+     "Yan3",
+     "Yoi",
+     "Yap3",
+     "Yek3",
+     "Yim3",
+     "Yz",
+     "Yot2",
+     "Yoi6",
+     "Yit2",
+     "Yu5",
+     "Yaan3",
+     "Yan1",
+     "Yon5",
+     "Yp1",
+     "Yong5",
+     "Ygw",
+     "Yak",
+     "Yat6",
+     "Ying4",
+     "Yu2",
+     "Yf",
+     "Ya4",
+     "Yon1",
+     "You4",
+     "Yik6",
+     "Yui1",
+     "Yaat1",
+     "Yeot4",
+     "Yi2",
+     "Yaai1",
+     "Yek5",
+     "Ym3",
+     "Yong6",
+     "You5",
+     "Yyun1",
+     "Yn1",
+     "Yo2",
+     "Yip6",
+     "Yui3",
+     "Yaak5",
+     "Yyun2",
+ }
+
+ symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
+ symbols = sorted(set(symbols))
+ symbols += ["[", "]"]  ## newly added rising/falling pitch contours for Japanese
+ symbols += sorted(list(ko_symbols))
+ symbols += sorted(
+     list(yue_symbols)
+ )  ## the newly added yue symbols all go at the end; verified that the leading "Y" introduces no duplicates, and the Korean jamo obviously cannot collide
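The symbols list above is the phoneme vocabulary shared by the ONNX models; at inference time each phoneme string is looked up as an integer ID. A minimal sketch of that mapping (illustrative only; the helper name phonemes_to_ids is not part of the repository):

    # Hypothetical helper: map phoneme strings to vocabulary IDs.
    from text.symbols2 import symbols

    _symbol_to_id = {s: i for i, s in enumerate(symbols)}

    def phonemes_to_ids(phones):
        # Fall back to the "UNK" symbol for anything outside the inventory.
        unk = _symbol_to_id["UNK"]
        return [_symbol_to_id.get(p, unk) for p in phones]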
models/ailia-models/GPT-SoVITS2/source.txt ADDED
@@ -0,0 +1,18 @@
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/gpt-sovits-v2
+
+ [normal]
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/cnhubert.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/cnhubert.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_encoder.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_encoder.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_fsdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_fsdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_sdec.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/t2s_sdec.onnx.prototxt
+
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/vits.onnx
+ https://storage.googleapis.com/ailia-models/gpt-sovits-v2/vits.onnx.prototxt
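The ONNX weights listed above can be fetched ahead of time if you do not want the sample script to download them on first run. A minimal prefetch sketch (an assumption for illustration, not the ailia tooling itself):

    # Hypothetical prefetch of the five models and their prototxt files.
    import urllib.request

    BASE = "https://storage.googleapis.com/ailia-models/gpt-sovits-v2/"
    for name in ["cnhubert.onnx", "t2s_encoder.onnx", "t2s_fsdec.onnx",
                 "t2s_sdec.onnx", "vits.onnx"]:
        for f in (name, name + ".prototxt"):
            urllib.request.urlretrieve(BASE + f, f)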
models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9986bf01f48c183efc741df4afdb38131119cef0218784c919654403604cd1da
+ size 11495994
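Once downloaded, the encoder graph can be inspected with a generic ONNX runtime to confirm its expected inputs (the ailia sample uses its own runtime; using onnxruntime here is an assumption for illustration):

    # Hypothetical inspection of the t2s_encoder inputs with onnxruntime.
    import onnxruntime as ort

    sess = ort.InferenceSession("t2s_encoder.onnx")
    for inp in sess.get_inputs():
        print(inp.name, inp.shape, inp.type)
    # Per the graph dump below, inputs include ssl_content, ref_seq,
    # text_seq, ref_bert and text_bert.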
models/ailia-models/GPT-SoVITS2/t2s_encoder.onnx.prototxt ADDED
@@ -0,0 +1,2293 @@
+ ir_version: 8
2
+ producer_name: "pytorch"
3
+ producer_version: "2.5.0"
4
+ model_version: 0
5
+ graph {
6
+ name: "main_graph"
7
+ node {
8
+ output: "onnx::ReduceSum_785"
9
+ name: "Constant_70"
10
+ op_type: "Constant"
11
+ attribute {
12
+ name: "value"
13
+ t {
14
+ dims: 1
15
+ data_type: 7
16
+ data_location: 0
17
+ }
18
+ type: TENSOR
19
+ }
20
+ }
21
+ node {
22
+ output: "onnx::ReduceSum_786"
23
+ name: "Constant_71"
24
+ op_type: "Constant"
25
+ attribute {
26
+ name: "value"
27
+ t {
28
+ dims: 1
29
+ data_type: 7
30
+ data_location: 0
31
+ }
32
+ type: TENSOR
33
+ }
34
+ }
35
+ node {
36
+ input: "ssl_content"
37
+ input: "vits.ssl_proj.weight"
38
+ input: "vits.ssl_proj.bias"
39
+ output: "/ssl_proj/Conv_output_0"
40
+ name: "/ssl_proj/Conv"
41
+ op_type: "Conv"
42
+ attribute {
43
+ name: "dilations"
44
+ ints: 1
45
+ type: INTS
46
+ }
47
+ attribute {
48
+ name: "group"
49
+ i: 1
50
+ type: INT
51
+ }
52
+ attribute {
53
+ name: "kernel_shape"
54
+ ints: 2
55
+ type: INTS
56
+ }
57
+ attribute {
58
+ name: "pads"
59
+ ints: 0
60
+ ints: 0
61
+ type: INTS
62
+ }
63
+ attribute {
64
+ name: "strides"
65
+ ints: 2
66
+ type: INTS
67
+ }
68
+ }
69
+ node {
70
+ input: "/ssl_proj/Conv_output_0"
71
+ output: "/quantizer/vq/layers.0/Transpose_output_0"
72
+ name: "/quantizer/vq/layers.0/Transpose"
73
+ op_type: "Transpose"
74
+ attribute {
75
+ name: "perm"
76
+ ints: 0
77
+ ints: 2
78
+ ints: 1
79
+ type: INTS
80
+ }
81
+ }
82
+ node {
83
+ output: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
84
+ name: "/quantizer/vq/layers.0/_codebook/Constant"
85
+ op_type: "Constant"
86
+ attribute {
87
+ name: "value"
88
+ t {
89
+ data_type: 7
90
+ data_location: 0
91
+ }
92
+ type: TENSOR
93
+ }
94
+ }
95
+ node {
96
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
97
+ output: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
98
+ name: "/quantizer/vq/layers.0/_codebook/Shape"
99
+ op_type: "Shape"
100
+ }
101
+ node {
102
+ output: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
103
+ name: "/quantizer/vq/layers.0/_codebook/Constant_1"
104
+ op_type: "Constant"
105
+ attribute {
106
+ name: "value"
107
+ t {
108
+ data_type: 7
109
+ data_location: 0
110
+ }
111
+ type: TENSOR
112
+ }
113
+ }
114
+ node {
115
+ input: "/quantizer/vq/layers.0/_codebook/Shape_output_0"
116
+ input: "/quantizer/vq/layers.0/_codebook/Constant_1_output_0"
117
+ output: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
118
+ name: "/quantizer/vq/layers.0/_codebook/Gather"
119
+ op_type: "Gather"
120
+ attribute {
121
+ name: "axis"
122
+ i: 0
123
+ type: INT
124
+ }
125
+ }
126
+ node {
127
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
128
+ output: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
129
+ name: "/quantizer/vq/layers.0/_codebook/Shape_1"
130
+ op_type: "Shape"
131
+ }
132
+ node {
133
+ output: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
134
+ name: "/quantizer/vq/layers.0/_codebook/Constant_2"
135
+ op_type: "Constant"
136
+ attribute {
137
+ name: "value"
138
+ t {
139
+ data_type: 7
140
+ data_location: 0
141
+ }
142
+ type: TENSOR
143
+ }
144
+ }
145
+ node {
146
+ input: "/quantizer/vq/layers.0/_codebook/Shape_1_output_0"
147
+ input: "/quantizer/vq/layers.0/_codebook/Constant_2_output_0"
148
+ output: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
149
+ name: "/quantizer/vq/layers.0/_codebook/Gather_1"
150
+ op_type: "Gather"
151
+ attribute {
152
+ name: "axis"
153
+ i: 0
154
+ type: INT
155
+ }
156
+ }
157
+ node {
158
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
159
+ output: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
160
+ name: "/quantizer/vq/layers.0/_codebook/Shape_2"
161
+ op_type: "Shape"
162
+ }
163
+ node {
164
+ output: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
165
+ name: "/quantizer/vq/layers.0/_codebook/Constant_3"
166
+ op_type: "Constant"
167
+ attribute {
168
+ name: "value"
169
+ t {
170
+ data_type: 7
171
+ data_location: 0
172
+ }
173
+ type: TENSOR
174
+ }
175
+ }
176
+ node {
177
+ input: "/quantizer/vq/layers.0/_codebook/Shape_2_output_0"
178
+ input: "/quantizer/vq/layers.0/_codebook/Constant_3_output_0"
179
+ output: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
180
+ name: "/quantizer/vq/layers.0/_codebook/Gather_2"
181
+ op_type: "Gather"
182
+ attribute {
183
+ name: "axis"
184
+ i: 0
185
+ type: INT
186
+ }
187
+ }
188
+ node {
189
+ output: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
190
+ name: "/quantizer/vq/layers.0/_codebook/Constant_4"
191
+ op_type: "Constant"
192
+ attribute {
193
+ name: "value"
194
+ t {
195
+ data_type: 7
196
+ data_location: 0
197
+ }
198
+ type: TENSOR
199
+ }
200
+ }
201
+ node {
202
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
203
+ input: "/quantizer/vq/layers.0/_codebook/Constant_4_output_0"
204
+ output: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
205
+ name: "/quantizer/vq/layers.0/_codebook/Mul"
206
+ op_type: "Mul"
207
+ }
208
+ node {
209
+ input: "/quantizer/vq/layers.0/_codebook/Mul_output_0"
210
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
211
+ output: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
212
+ name: "/quantizer/vq/layers.0/_codebook/Mul_1"
213
+ op_type: "Mul"
214
+ }
215
+ node {
216
+ output: "onnx::Unsqueeze_804"
217
+ name: "Constant_87"
218
+ op_type: "Constant"
219
+ attribute {
220
+ name: "value"
221
+ t {
222
+ dims: 1
223
+ data_type: 7
224
+ data_location: 0
225
+ }
226
+ type: TENSOR
227
+ }
228
+ }
229
+ node {
230
+ input: "/quantizer/vq/layers.0/_codebook/Mul_1_output_0"
231
+ input: "onnx::Unsqueeze_804"
232
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
233
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze"
234
+ op_type: "Unsqueeze"
235
+ }
236
+ node {
237
+ output: "onnx::Unsqueeze_806"
238
+ name: "Constant_89"
239
+ op_type: "Constant"
240
+ attribute {
241
+ name: "value"
242
+ t {
243
+ dims: 1
244
+ data_type: 7
245
+ data_location: 0
246
+ }
247
+ type: TENSOR
248
+ }
249
+ }
250
+ node {
251
+ input: "/quantizer/vq/layers.0/_codebook/Gather_2_output_0"
252
+ input: "onnx::Unsqueeze_806"
253
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
254
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1"
255
+ op_type: "Unsqueeze"
256
+ }
257
+ node {
258
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_output_0"
259
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_1_output_0"
260
+ output: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
261
+ name: "/quantizer/vq/layers.0/_codebook/Concat"
262
+ op_type: "Concat"
263
+ attribute {
264
+ name: "axis"
265
+ i: 0
266
+ type: INT
267
+ }
268
+ }
269
+ node {
270
+ input: "/quantizer/vq/layers.0/Transpose_output_0"
271
+ input: "/quantizer/vq/layers.0/_codebook/Concat_output_0"
272
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
273
+ name: "/quantizer/vq/layers.0/_codebook/Reshape"
274
+ op_type: "Reshape"
275
+ attribute {
276
+ name: "allowzero"
277
+ i: 0
278
+ type: INT
279
+ }
280
+ }
281
+ node {
282
+ output: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
283
+ name: "/quantizer/vq/layers.0/_codebook/Constant_5"
284
+ op_type: "Constant"
285
+ attribute {
286
+ name: "value"
287
+ t {
288
+ data_type: 1
289
+ data_location: 0
290
+ }
291
+ type: TENSOR
292
+ }
293
+ }
294
+ node {
295
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
296
+ input: "/quantizer/vq/layers.0/_codebook/Constant_5_output_0"
297
+ output: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
298
+ name: "/quantizer/vq/layers.0/_codebook/Pow"
299
+ op_type: "Pow"
300
+ }
301
+ node {
302
+ input: "/quantizer/vq/layers.0/_codebook/Pow_output_0"
303
+ input: "onnx::ReduceSum_786"
304
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
305
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum"
306
+ op_type: "ReduceSum"
307
+ attribute {
308
+ name: "keepdims"
309
+ i: 1
310
+ type: INT
311
+ }
312
+ }
313
+ node {
314
+ output: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
315
+ name: "/quantizer/vq/layers.0/_codebook/Constant_6"
316
+ op_type: "Constant"
317
+ attribute {
318
+ name: "value"
319
+ t {
320
+ data_type: 1
321
+ data_location: 0
322
+ }
323
+ type: TENSOR
324
+ }
325
+ }
326
+ node {
327
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_output_0"
328
+ input: "/quantizer/vq/layers.0/_codebook/Constant_6_output_0"
329
+ output: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
330
+ name: "/quantizer/vq/layers.0/_codebook/Mul_2"
331
+ op_type: "Mul"
332
+ }
333
+ node {
334
+ input: "/quantizer/vq/layers.0/_codebook/Mul_2_output_0"
335
+ input: "onnx::MatMul_1009"
336
+ output: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
337
+ name: "/quantizer/vq/layers.0/_codebook/MatMul"
338
+ op_type: "MatMul"
339
+ }
340
+ node {
341
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_output_0"
342
+ input: "/quantizer/vq/layers.0/_codebook/MatMul_output_0"
343
+ output: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
344
+ name: "/quantizer/vq/layers.0/_codebook/Sub"
345
+ op_type: "Sub"
346
+ }
347
+ node {
348
+ output: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
349
+ name: "/quantizer/vq/layers.0/_codebook/Constant_7"
350
+ op_type: "Constant"
351
+ attribute {
352
+ name: "value"
353
+ t {
354
+ data_type: 1
355
+ data_location: 0
356
+ }
357
+ type: TENSOR
358
+ }
359
+ }
360
+ node {
361
+ input: "onnx::MatMul_1009"
362
+ input: "/quantizer/vq/layers.0/_codebook/Constant_7_output_0"
363
+ output: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
364
+ name: "/quantizer/vq/layers.0/_codebook/Pow_1"
365
+ op_type: "Pow"
366
+ }
367
+ node {
368
+ input: "/quantizer/vq/layers.0/_codebook/Pow_1_output_0"
369
+ input: "onnx::ReduceSum_785"
370
+ output: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
371
+ name: "/quantizer/vq/layers.0/_codebook/ReduceSum_1"
372
+ op_type: "ReduceSum"
373
+ attribute {
374
+ name: "keepdims"
375
+ i: 1
376
+ type: INT
377
+ }
378
+ }
379
+ node {
380
+ input: "/quantizer/vq/layers.0/_codebook/Sub_output_0"
381
+ input: "/quantizer/vq/layers.0/_codebook/ReduceSum_1_output_0"
382
+ output: "/quantizer/vq/layers.0/_codebook/Add_output_0"
383
+ name: "/quantizer/vq/layers.0/_codebook/Add"
384
+ op_type: "Add"
385
+ }
386
+ node {
387
+ input: "/quantizer/vq/layers.0/_codebook/Add_output_0"
388
+ output: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
389
+ name: "/quantizer/vq/layers.0/_codebook/Neg"
390
+ op_type: "Neg"
391
+ }
392
+ node {
393
+ input: "/quantizer/vq/layers.0/_codebook/Neg_output_0"
394
+ output: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
395
+ name: "/quantizer/vq/layers.0/_codebook/ArgMax"
396
+ op_type: "ArgMax"
397
+ attribute {
398
+ name: "axis"
399
+ i: -1
400
+ type: INT
401
+ }
402
+ attribute {
403
+ name: "keepdims"
404
+ i: 0
405
+ type: INT
406
+ }
407
+ }
408
+ node {
409
+ output: "onnx::Unsqueeze_824"
410
+ name: "Constant_106"
411
+ op_type: "Constant"
412
+ attribute {
413
+ name: "value"
414
+ t {
415
+ dims: 1
416
+ data_type: 7
417
+ data_location: 0
418
+ }
419
+ type: TENSOR
420
+ }
421
+ }
422
+ node {
423
+ input: "/quantizer/vq/layers.0/_codebook/Gather_output_0"
424
+ input: "onnx::Unsqueeze_824"
425
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
426
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2"
427
+ op_type: "Unsqueeze"
428
+ }
429
+ node {
430
+ output: "onnx::Unsqueeze_826"
431
+ name: "Constant_108"
432
+ op_type: "Constant"
433
+ attribute {
434
+ name: "value"
435
+ t {
436
+ dims: 1
437
+ data_type: 7
438
+ data_location: 0
439
+ }
440
+ type: TENSOR
441
+ }
442
+ }
443
+ node {
444
+ input: "/quantizer/vq/layers.0/_codebook/Gather_1_output_0"
445
+ input: "onnx::Unsqueeze_826"
446
+ output: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
447
+ name: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3"
448
+ op_type: "Unsqueeze"
449
+ }
450
+ node {
451
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_2_output_0"
452
+ input: "/quantizer/vq/layers.0/_codebook/Unsqueeze_3_output_0"
453
+ output: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
454
+ name: "/quantizer/vq/layers.0/_codebook/Concat_1"
455
+ op_type: "Concat"
456
+ attribute {
457
+ name: "axis"
458
+ i: 0
459
+ type: INT
460
+ }
461
+ }
462
+ node {
463
+ input: "/quantizer/vq/layers.0/_codebook/ArgMax_output_0"
464
+ input: "/quantizer/vq/layers.0/_codebook/Concat_1_output_0"
465
+ output: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
466
+ name: "/quantizer/vq/layers.0/_codebook/Reshape_1"
467
+ op_type: "Reshape"
468
+ attribute {
469
+ name: "allowzero"
470
+ i: 0
471
+ type: INT
472
+ }
473
+ }
474
+ node {
475
+ output: "/quantizer/vq/Constant_output_0"
476
+ name: "/quantizer/vq/Constant"
477
+ op_type: "Constant"
478
+ attribute {
479
+ name: "value"
480
+ t {
481
+ dims: 1
482
+ data_type: 7
483
+ data_location: 0
484
+ }
485
+ type: TENSOR
486
+ }
487
+ }
488
+ node {
489
+ input: "/quantizer/vq/layers.0/_codebook/Reshape_1_output_0"
490
+ input: "/quantizer/vq/Constant_output_0"
491
+ output: "/quantizer/vq/Unsqueeze_output_0"
492
+ name: "/quantizer/vq/Unsqueeze"
493
+ op_type: "Unsqueeze"
494
+ }
495
+ node {
496
+ input: "/quantizer/vq/Unsqueeze_output_0"
497
+ output: "/quantizer/vq/Concat_output_0"
498
+ name: "/quantizer/vq/Concat"
499
+ op_type: "Concat"
500
+ attribute {
501
+ name: "axis"
502
+ i: 0
503
+ type: INT
504
+ }
505
+ }
506
+ node {
507
+ input: "/quantizer/vq/Concat_output_0"
508
+ output: "/Transpose_output_0"
509
+ name: "/Transpose"
510
+ op_type: "Transpose"
511
+ attribute {
512
+ name: "perm"
513
+ ints: 1
514
+ ints: 0
515
+ ints: 2
516
+ type: INTS
517
+ }
518
+ }
519
+ node {
520
+ input: "/Transpose_output_0"
521
+ input: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
522
+ output: "/Gather_output_0"
523
+ name: "/Gather"
524
+ op_type: "Gather"
525
+ attribute {
526
+ name: "axis"
527
+ i: 0
528
+ type: INT
529
+ }
530
+ }
531
+ node {
532
+ input: "/Gather_output_0"
533
+ input: "/quantizer/vq/layers.0/_codebook/Constant_output_0"
534
+ output: "/Gather_1_output_0"
535
+ name: "/Gather_1"
536
+ op_type: "Gather"
537
+ attribute {
538
+ name: "axis"
539
+ i: 0
540
+ type: INT
541
+ }
542
+ }
543
+ node {
544
+ input: "ref_bert"
545
+ output: "/Transpose_1_output_0"
546
+ name: "/Transpose_1"
547
+ op_type: "Transpose"
548
+ attribute {
549
+ name: "perm"
550
+ ints: 1
551
+ ints: 0
552
+ type: INTS
553
+ }
554
+ }
555
+ node {
556
+ input: "text_bert"
557
+ output: "/Transpose_2_output_0"
558
+ name: "/Transpose_2"
559
+ op_type: "Transpose"
560
+ attribute {
561
+ name: "perm"
562
+ ints: 1
563
+ ints: 0
564
+ type: INTS
565
+ }
566
+ }
567
+ node {
568
+ input: "/Transpose_1_output_0"
569
+ input: "/Transpose_2_output_0"
570
+ output: "/Concat_output_0"
571
+ name: "/Concat"
572
+ op_type: "Concat"
573
+ attribute {
574
+ name: "axis"
575
+ i: 1
576
+ type: INT
577
+ }
578
+ }
579
+ node {
580
+ input: "ref_seq"
581
+ input: "text_seq"
582
+ output: "/Concat_1_output_0"
583
+ name: "/Concat_1"
584
+ op_type: "Concat"
585
+ attribute {
586
+ name: "axis"
587
+ i: 1
588
+ type: INT
589
+ }
590
+ }
591
+ node {
592
+ output: "/Constant_output_0"
593
+ name: "/Constant"
594
+ op_type: "Constant"
595
+ attribute {
596
+ name: "value"
597
+ t {
598
+ dims: 1
599
+ data_type: 7
600
+ data_location: 0
601
+ }
602
+ type: TENSOR
603
+ }
604
+ }
605
+ node {
606
+ input: "/Concat_output_0"
607
+ input: "/Constant_output_0"
608
+ output: "/Unsqueeze_output_0"
609
+ name: "/Unsqueeze"
610
+ op_type: "Unsqueeze"
611
+ }
612
+ node {
613
+ output: "/Constant_1_output_0"
614
+ name: "/Constant_1"
615
+ op_type: "Constant"
616
+ attribute {
617
+ name: "value"
618
+ t {
619
+ dims: 1
620
+ data_type: 7
621
+ data_location: 0
622
+ }
623
+ type: TENSOR
624
+ }
625
+ }
626
+ node {
627
+ input: "/Gather_1_output_0"
628
+ input: "/Constant_1_output_0"
629
+ output: "prompts"
630
+ name: "/Unsqueeze_1"
631
+ op_type: "Unsqueeze"
632
+ }
633
+ node {
634
+ input: "encoder.ar_text_embedding.word_embeddings.weight"
635
+ input: "/Concat_1_output_0"
636
+ output: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
637
+ name: "/encoder/ar_text_embedding/word_embeddings/Gather"
638
+ op_type: "Gather"
639
+ }
640
+ node {
641
+ input: "/Unsqueeze_output_0"
642
+ output: "/encoder/Transpose_output_0"
643
+ name: "/encoder/Transpose"
644
+ op_type: "Transpose"
645
+ attribute {
646
+ name: "perm"
647
+ ints: 0
648
+ ints: 2
649
+ ints: 1
650
+ type: INTS
651
+ }
652
+ }
653
+ node {
654
+ input: "/encoder/Transpose_output_0"
655
+ input: "onnx::MatMul_1010"
656
+ output: "/encoder/bert_proj/MatMul_output_0"
657
+ name: "/encoder/bert_proj/MatMul"
658
+ op_type: "MatMul"
659
+ }
660
+ node {
661
+ input: "encoder.bert_proj.bias"
662
+ input: "/encoder/bert_proj/MatMul_output_0"
663
+ output: "/encoder/bert_proj/Add_output_0"
664
+ name: "/encoder/bert_proj/Add"
665
+ op_type: "Add"
666
+ }
667
+ node {
668
+ input: "/encoder/ar_text_embedding/word_embeddings/Gather_output_0"
669
+ input: "/encoder/bert_proj/Add_output_0"
670
+ output: "/encoder/Add_output_0"
671
+ name: "/encoder/Add"
672
+ op_type: "Add"
673
+ }
674
+ node {
675
+ input: "/encoder/Add_output_0"
676
+ output: "/encoder/ar_text_position/Shape_output_0"
677
+ name: "/encoder/ar_text_position/Shape"
678
+ op_type: "Shape"
679
+ }
680
+ node {
681
+ output: "/encoder/ar_text_position/Constant_output_0"
682
+ name: "/encoder/ar_text_position/Constant"
683
+ op_type: "Constant"
684
+ attribute {
685
+ name: "value"
686
+ t {
687
+ data_type: 7
688
+ data_location: 0
689
+ }
690
+ type: TENSOR
691
+ }
692
+ }
693
+ node {
694
+ input: "/encoder/ar_text_position/Shape_output_0"
695
+ input: "/encoder/ar_text_position/Constant_output_0"
696
+ output: "/encoder/ar_text_position/Gather_output_0"
697
+ name: "/encoder/ar_text_position/Gather"
698
+ op_type: "Gather"
699
+ attribute {
700
+ name: "axis"
701
+ i: 0
702
+ type: INT
703
+ }
704
+ }
705
+ node {
706
+ output: "/encoder/ar_text_position/Constant_1_output_0"
707
+ name: "/encoder/ar_text_position/Constant_1"
708
+ op_type: "Constant"
709
+ attribute {
710
+ name: "value"
711
+ t {
712
+ data_type: 1
713
+ data_location: 0
714
+ }
715
+ type: TENSOR
716
+ }
717
+ }
718
+ node {
719
+ input: "/encoder/ar_text_position/Gather_output_0"
720
+ output: "/encoder/ar_text_position/Cast_output_0"
721
+ name: "/encoder/ar_text_position/Cast"
722
+ op_type: "Cast"
723
+ attribute {
724
+ name: "to"
725
+ i: 1
726
+ type: INT
727
+ }
728
+ }
729
+ node {
730
+ output: "/encoder/ar_text_position/Constant_2_output_0"
731
+ name: "/encoder/ar_text_position/Constant_2"
732
+ op_type: "Constant"
733
+ attribute {
734
+ name: "value"
735
+ t {
736
+ data_type: 1
737
+ data_location: 0
738
+ }
739
+ type: TENSOR
740
+ }
741
+ }
742
+ node {
743
+ input: "/encoder/ar_text_position/Constant_1_output_0"
744
+ input: "/encoder/ar_text_position/Cast_output_0"
745
+ input: "/encoder/ar_text_position/Constant_2_output_0"
746
+ output: "/encoder/ar_text_position/Range_output_0"
747
+ name: "/encoder/ar_text_position/Range"
748
+ op_type: "Range"
749
+ }
750
+ node {
751
+ output: "/encoder/ar_text_position/Constant_3_output_0"
752
+ name: "/encoder/ar_text_position/Constant_3"
753
+ op_type: "Constant"
754
+ attribute {
755
+ name: "value"
756
+ t {
757
+ dims: 1
758
+ data_type: 7
759
+ data_location: 0
760
+ }
761
+ type: TENSOR
762
+ }
763
+ }
764
+ node {
765
+ input: "/encoder/ar_text_position/Range_output_0"
766
+ input: "/encoder/ar_text_position/Constant_3_output_0"
767
+ output: "/encoder/ar_text_position/Unsqueeze_output_0"
768
+ name: "/encoder/ar_text_position/Unsqueeze"
769
+ op_type: "Unsqueeze"
770
+ }
771
+ node {
772
+ output: "onnx::Unsqueeze_860"
773
+ name: "Constant_140"
774
+ op_type: "Constant"
775
+ attribute {
776
+ name: "value"
777
+ t {
778
+ dims: 1
779
+ data_type: 7
780
+ data_location: 0
781
+ }
782
+ type: TENSOR
783
+ }
784
+ }
785
+ node {
786
+ input: "/encoder/ar_text_position/Gather_output_0"
787
+ input: "onnx::Unsqueeze_860"
788
+ output: "/encoder/ar_text_position/Unsqueeze_1_output_0"
789
+ name: "/encoder/ar_text_position/Unsqueeze_1"
790
+ op_type: "Unsqueeze"
791
+ }
792
+ node {
793
+ output: "/encoder/ar_text_position/Constant_4_output_0"
794
+ name: "/encoder/ar_text_position/Constant_4"
795
+ op_type: "Constant"
796
+ attribute {
797
+ name: "value"
798
+ t {
799
+ dims: 1
800
+ data_type: 7
801
+ data_location: 0
802
+ }
803
+ type: TENSOR
804
+ }
805
+ }
806
+ node {
807
+ input: "/encoder/ar_text_position/Unsqueeze_1_output_0"
808
+ input: "/encoder/ar_text_position/Constant_4_output_0"
809
+ output: "/encoder/ar_text_position/Concat_output_0"
810
+ name: "/encoder/ar_text_position/Concat"
811
+ op_type: "Concat"
812
+ attribute {
813
+ name: "axis"
814
+ i: 0
815
+ type: INT
816
+ }
817
+ }
818
+ node {
819
+ input: "/encoder/ar_text_position/Concat_output_0"
820
+ output: "/encoder/ar_text_position/ConstantOfShape_output_0"
821
+ name: "/encoder/ar_text_position/ConstantOfShape"
822
+ op_type: "ConstantOfShape"
823
+ attribute {
824
+ name: "value"
825
+ t {
826
+ dims: 1
827
+ data_type: 1
828
+ raw_data: "\000\000\000\000"
829
+ }
830
+ type: TENSOR
831
+ }
832
+ }
833
+ node {
834
+ output: "/encoder/ar_text_position/Constant_5_output_0"
835
+ name: "/encoder/ar_text_position/Constant_5"
836
+ op_type: "Constant"
837
+ attribute {
838
+ name: "value"
839
+ t {
840
+ dims: 256
841
+ data_type: 1
842
+ data_location: 0
843
+ }
844
+ type: TENSOR
845
+ }
846
+ }
847
+ node {
848
+ input: "/encoder/ar_text_position/Unsqueeze_output_0"
849
+ input: "/encoder/ar_text_position/Constant_5_output_0"
850
+ output: "/encoder/ar_text_position/Mul_output_0"
851
+ name: "/encoder/ar_text_position/Mul"
852
+ op_type: "Mul"
853
+ }
854
+ node {
855
+ input: "/encoder/ar_text_position/Mul_output_0"
856
+ output: "/encoder/ar_text_position/Sin_output_0"
857
+ name: "/encoder/ar_text_position/Sin"
858
+ op_type: "Sin"
859
+ }
860
+ node {
861
+ output: "/encoder/ar_text_position/Constant_6_output_0"
862
+ name: "/encoder/ar_text_position/Constant_6"
863
+ op_type: "Constant"
864
+ attribute {
865
+ name: "value"
866
+ t {
867
+ dims: 1
868
+ data_type: 7
869
+ data_location: 0
870
+ }
871
+ type: TENSOR
872
+ }
873
+ }
874
+ node {
875
+ output: "/encoder/ar_text_position/Constant_7_output_0"
876
+ name: "/encoder/ar_text_position/Constant_7"
877
+ op_type: "Constant"
878
+ attribute {
879
+ name: "value"
880
+ t {
881
+ dims: 1
882
+ data_type: 7
883
+ data_location: 0
884
+ }
885
+ type: TENSOR
886
+ }
887
+ }
888
+ node {
889
+ output: "/encoder/ar_text_position/Constant_8_output_0"
890
+ name: "/encoder/ar_text_position/Constant_8"
891
+ op_type: "Constant"
892
+ attribute {
893
+ name: "value"
894
+ t {
895
+ dims: 1
896
+ data_type: 7
897
+ data_location: 0
898
+ }
899
+ type: TENSOR
900
+ }
901
+ }
902
+ node {
903
+ output: "/encoder/ar_text_position/Constant_9_output_0"
904
+ name: "/encoder/ar_text_position/Constant_9"
905
+ op_type: "Constant"
906
+ attribute {
907
+ name: "value"
908
+ t {
909
+ dims: 1
910
+ data_type: 7
911
+ data_location: 0
912
+ }
913
+ type: TENSOR
914
+ }
915
+ }
916
+ node {
917
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
918
+ input: "/encoder/ar_text_position/Constant_7_output_0"
919
+ input: "/encoder/ar_text_position/Constant_8_output_0"
920
+ input: "/encoder/ar_text_position/Constant_6_output_0"
921
+ input: "/encoder/ar_text_position/Constant_9_output_0"
922
+ output: "/encoder/ar_text_position/Slice_output_0"
923
+ name: "/encoder/ar_text_position/Slice"
924
+ op_type: "Slice"
925
+ }
926
+ node {
927
+ input: "/encoder/ar_text_position/Slice_output_0"
928
+ output: "/encoder/ar_text_position/Shape_1_output_0"
929
+ name: "/encoder/ar_text_position/Shape_1"
930
+ op_type: "Shape"
931
+ }
932
+ node {
933
+ input: "/encoder/ar_text_position/Sin_output_0"
934
+ input: "/encoder/ar_text_position/Shape_1_output_0"
935
+ output: "/encoder/ar_text_position/Expand_output_0"
936
+ name: "/encoder/ar_text_position/Expand"
937
+ op_type: "Expand"
938
+ }
939
+ node {
940
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
941
+ output: "onnx::Gather_881"
942
+ name: "Shape_155"
943
+ op_type: "Shape"
944
+ }
945
+ node {
946
+ output: "onnx::Gather_882"
947
+ name: "Constant_156"
948
+ op_type: "Constant"
949
+ attribute {
950
+ name: "value"
951
+ t {
952
+ data_type: 7
953
+ data_location: 0
954
+ }
955
+ type: TENSOR
956
+ }
957
+ }
958
+ node {
959
+ input: "onnx::Gather_881"
960
+ input: "onnx::Gather_882"
961
+ output: "onnx::Cast_883"
962
+ name: "Gather_157"
963
+ op_type: "Gather"
964
+ attribute {
965
+ name: "axis"
966
+ i: 0
967
+ type: INT
968
+ }
969
+ }
970
+ node {
971
+ input: "onnx::Cast_883"
972
+ output: "onnx::Range_884"
973
+ name: "Cast_158"
974
+ op_type: "Cast"
975
+ attribute {
976
+ name: "to"
977
+ i: 7
978
+ type: INT
979
+ }
980
+ }
981
+ node {
982
+ output: "onnx::Range_885"
983
+ name: "Constant_159"
984
+ op_type: "Constant"
985
+ attribute {
986
+ name: "value"
987
+ t {
988
+ data_type: 7
989
+ data_location: 0
990
+ }
991
+ type: TENSOR
992
+ }
993
+ }
994
+ node {
995
+ output: "onnx::Range_886"
996
+ name: "Constant_160"
997
+ op_type: "Constant"
998
+ attribute {
999
+ name: "value"
1000
+ t {
1001
+ data_type: 7
1002
+ data_location: 0
1003
+ }
1004
+ type: TENSOR
1005
+ }
1006
+ }
1007
+ node {
1008
+ input: "onnx::Range_885"
1009
+ input: "onnx::Range_884"
1010
+ input: "onnx::Range_886"
1011
+ output: "onnx::Reshape_887"
1012
+ name: "Range_161"
1013
+ op_type: "Range"
1014
+ }
1015
+ node {
1016
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
1017
+ output: "onnx::Gather_888"
1018
+ name: "Shape_162"
1019
+ op_type: "Shape"
1020
+ }
1021
+ node {
1022
+ output: "onnx::Gather_889"
1023
+ name: "Constant_163"
1024
+ op_type: "Constant"
1025
+ attribute {
1026
+ name: "value"
1027
+ t {
1028
+ data_type: 7
1029
+ data_location: 0
1030
+ }
1031
+ type: TENSOR
1032
+ }
1033
+ }
1034
+ node {
1035
+ input: "onnx::Gather_888"
1036
+ input: "onnx::Gather_889"
1037
+ output: "onnx::Cast_890"
1038
+ name: "Gather_164"
1039
+ op_type: "Gather"
1040
+ attribute {
1041
+ name: "axis"
1042
+ i: 0
1043
+ type: INT
1044
+ }
1045
+ }
1046
+ node {
1047
+ input: "onnx::Cast_890"
1048
+ output: "onnx::Range_891"
1049
+ name: "Cast_165"
1050
+ op_type: "Cast"
1051
+ attribute {
1052
+ name: "to"
1053
+ i: 7
1054
+ type: INT
1055
+ }
1056
+ }
1057
+ node {
1058
+ output: "onnx::Range_892"
1059
+ name: "Constant_166"
1060
+ op_type: "Constant"
1061
+ attribute {
1062
+ name: "value"
1063
+ t {
1064
+ data_type: 7
1065
+ data_location: 0
1066
+ }
1067
+ type: TENSOR
1068
+ }
1069
+ }
1070
+ node {
1071
+ output: "onnx::Range_893"
1072
+ name: "Constant_167"
1073
+ op_type: "Constant"
1074
+ attribute {
1075
+ name: "value"
1076
+ t {
1077
+ data_type: 7
1078
+ data_location: 0
1079
+ }
1080
+ type: TENSOR
1081
+ }
1082
+ }
1083
+ node {
1084
+ input: "onnx::Range_892"
1085
+ input: "onnx::Range_891"
1086
+ input: "onnx::Range_893"
1087
+ output: "onnx::Slice_894"
1088
+ name: "Range_168"
1089
+ op_type: "Range"
1090
+ }
1091
+ node {
1092
+ output: "/encoder/ar_text_position/Constant_10_output_0"
1093
+ name: "/encoder/ar_text_position/Constant_10"
1094
+ op_type: "Constant"
1095
+ attribute {
1096
+ name: "value"
1097
+ t {
1098
+ dims: 1
1099
+ data_type: 7
1100
+ data_location: 0
1101
+ }
1102
+ type: TENSOR
1103
+ }
1104
+ }
1105
+ node {
1106
+ output: "/encoder/ar_text_position/Constant_11_output_0"
1107
+ name: "/encoder/ar_text_position/Constant_11"
1108
+ op_type: "Constant"
1109
+ attribute {
1110
+ name: "value"
1111
+ t {
1112
+ dims: 1
1113
+ data_type: 7
1114
+ data_location: 0
1115
+ }
1116
+ type: TENSOR
1117
+ }
1118
+ }
1119
+ node {
1120
+ output: "/encoder/ar_text_position/Constant_12_output_0"
1121
+ name: "/encoder/ar_text_position/Constant_12"
1122
+ op_type: "Constant"
1123
+ attribute {
1124
+ name: "value"
1125
+ t {
1126
+ dims: 1
1127
+ data_type: 7
1128
+ data_location: 0
1129
+ }
1130
+ type: TENSOR
1131
+ }
1132
+ }
1133
+ node {
1134
+ output: "/encoder/ar_text_position/Constant_13_output_0"
1135
+ name: "/encoder/ar_text_position/Constant_13"
1136
+ op_type: "Constant"
1137
+ attribute {
1138
+ name: "value"
1139
+ t {
1140
+ dims: 1
1141
+ data_type: 7
1142
+ data_location: 0
1143
+ }
1144
+ type: TENSOR
1145
+ }
1146
+ }
1147
+ node {
1148
+ input: "onnx::Slice_894"
1149
+ input: "/encoder/ar_text_position/Constant_11_output_0"
1150
+ input: "/encoder/ar_text_position/Constant_12_output_0"
1151
+ input: "/encoder/ar_text_position/Constant_10_output_0"
1152
+ input: "/encoder/ar_text_position/Constant_13_output_0"
1153
+ output: "/encoder/ar_text_position/Slice_1_output_0"
1154
+ name: "/encoder/ar_text_position/Slice_1"
1155
+ op_type: "Slice"
1156
+ }
1157
+ node {
1158
+ output: "onnx::Reshape_905"
1159
+ name: "Constant_174"
1160
+ op_type: "Constant"
1161
+ attribute {
1162
+ name: "value"
1163
+ t {
1164
+ dims: 2
1165
+ data_type: 7
1166
+ data_location: 0
1167
+ }
1168
+ type: TENSOR
1169
+ }
1170
+ }
1171
+ node {
1172
+ input: "onnx::Reshape_887"
1173
+ input: "onnx::Reshape_905"
1174
+ output: "onnx::Add_906"
1175
+ name: "Reshape_175"
1176
+ op_type: "Reshape"
1177
+ attribute {
1178
+ name: "allowzero"
1179
+ i: 0
1180
+ type: INT
1181
+ }
1182
+ }
1183
+ node {
1184
+ input: "onnx::Add_906"
1185
+ input: "/encoder/ar_text_position/Slice_1_output_0"
1186
+ output: "/encoder/ar_text_position/Add_output_0"
1187
+ name: "/encoder/ar_text_position/Add"
1188
+ op_type: "Add"
1189
+ }
1190
+ node {
1191
+ input: "/encoder/ar_text_position/Add_output_0"
1192
+ output: "/encoder/ar_text_position/Shape_2_output_0"
1193
+ name: "/encoder/ar_text_position/Shape_2"
1194
+ op_type: "Shape"
1195
+ }
1196
+ node {
1197
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1198
+ output: "/encoder/ar_text_position/Shape_3_output_0"
1199
+ name: "/encoder/ar_text_position/Shape_3"
1200
+ op_type: "Shape"
1201
+ }
1202
+ node {
1203
+ input: "/encoder/ar_text_position/Shape_3_output_0"
1204
+ output: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1205
+ name: "/encoder/ar_text_position/ConstantOfShape_1"
1206
+ op_type: "ConstantOfShape"
1207
+ attribute {
1208
+ name: "value"
1209
+ t {
1210
+ dims: 1
1211
+ data_type: 7
1212
+ raw_data: "\001\000\000\000\000\000\000\000"
1213
+ }
1214
+ type: TENSOR
1215
+ }
1216
+ }
1217
+ node {
1218
+ output: "/encoder/ar_text_position/Constant_14_output_0"
1219
+ name: "/encoder/ar_text_position/Constant_14"
1220
+ op_type: "Constant"
1221
+ attribute {
1222
+ name: "value"
1223
+ t {
1224
+ data_type: 7
1225
+ data_location: 0
1226
+ }
1227
+ type: TENSOR
1228
+ }
1229
+ }
1230
+ node {
1231
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1232
+ input: "/encoder/ar_text_position/Constant_14_output_0"
1233
+ output: "/encoder/ar_text_position/Mul_1_output_0"
1234
+ name: "/encoder/ar_text_position/Mul_1"
1235
+ op_type: "Mul"
1236
+ }
1237
+ node {
1238
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1239
+ input: "/encoder/ar_text_position/Mul_1_output_0"
1240
+ output: "/encoder/ar_text_position/Equal_output_0"
1241
+ name: "/encoder/ar_text_position/Equal"
1242
+ op_type: "Equal"
1243
+ }
1244
+ node {
1245
+ input: "/encoder/ar_text_position/Equal_output_0"
1246
+ input: "/encoder/ar_text_position/ConstantOfShape_1_output_0"
1247
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1248
+ output: "/encoder/ar_text_position/Where_output_0"
1249
+ name: "/encoder/ar_text_position/Where"
1250
+ op_type: "Where"
1251
+ }
1252
+ node {
1253
+ input: "onnx::Add_906"
1254
+ input: "/encoder/ar_text_position/Where_output_0"
1255
+ output: "/encoder/ar_text_position/Expand_1_output_0"
1256
+ name: "/encoder/ar_text_position/Expand_1"
1257
+ op_type: "Expand"
1258
+ }
1259
+ node {
1260
+ output: "/encoder/ar_text_position/Constant_15_output_0"
1261
+ name: "/encoder/ar_text_position/Constant_15"
1262
+ op_type: "Constant"
1263
+ attribute {
1264
+ name: "value"
1265
+ t {
1266
+ dims: 1
1267
+ data_type: 7
1268
+ data_location: 0
1269
+ }
1270
+ type: TENSOR
1271
+ }
1272
+ }
1273
+ node {
1274
+ input: "/encoder/ar_text_position/Expand_1_output_0"
1275
+ input: "/encoder/ar_text_position/Constant_15_output_0"
1276
+ output: "/encoder/ar_text_position/Unsqueeze_2_output_0"
1277
+ name: "/encoder/ar_text_position/Unsqueeze_2"
1278
+ op_type: "Unsqueeze"
1279
+ }
1280
+ node {
1281
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1282
+ output: "/encoder/ar_text_position/Shape_4_output_0"
1283
+ name: "/encoder/ar_text_position/Shape_4"
1284
+ op_type: "Shape"
1285
+ }
1286
+ node {
1287
+ input: "/encoder/ar_text_position/Shape_4_output_0"
1288
+ output: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1289
+ name: "/encoder/ar_text_position/ConstantOfShape_2"
1290
+ op_type: "ConstantOfShape"
1291
+ attribute {
1292
+ name: "value"
1293
+ t {
1294
+ dims: 1
1295
+ data_type: 7
1296
+ raw_data: "\001\000\000\000\000\000\000\000"
1297
+ }
1298
+ type: TENSOR
1299
+ }
1300
+ }
1301
+ node {
1302
+ output: "/encoder/ar_text_position/Constant_16_output_0"
1303
+ name: "/encoder/ar_text_position/Constant_16"
1304
+ op_type: "Constant"
1305
+ attribute {
1306
+ name: "value"
1307
+ t {
1308
+ data_type: 7
1309
+ data_location: 0
1310
+ }
1311
+ type: TENSOR
1312
+ }
1313
+ }
1314
+ node {
1315
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1316
+ input: "/encoder/ar_text_position/Constant_16_output_0"
1317
+ output: "/encoder/ar_text_position/Mul_2_output_0"
1318
+ name: "/encoder/ar_text_position/Mul_2"
1319
+ op_type: "Mul"
1320
+ }
1321
+ node {
1322
+ input: "/encoder/ar_text_position/Shape_2_output_0"
1323
+ input: "/encoder/ar_text_position/Mul_2_output_0"
1324
+ output: "/encoder/ar_text_position/Equal_1_output_0"
1325
+ name: "/encoder/ar_text_position/Equal_1"
1326
+ op_type: "Equal"
1327
+ }
1328
+ node {
1329
+ input: "/encoder/ar_text_position/Equal_1_output_0"
1330
+ input: "/encoder/ar_text_position/ConstantOfShape_2_output_0"
1331
+ input: "/encoder/ar_text_position/Shape_2_output_0"
+ output: "/encoder/ar_text_position/Where_1_output_0"
+ name: "/encoder/ar_text_position/Where_1"
+ op_type: "Where"
+ }
+ node {
+ input: "/encoder/ar_text_position/Slice_1_output_0"
+ input: "/encoder/ar_text_position/Where_1_output_0"
+ output: "/encoder/ar_text_position/Expand_2_output_0"
+ name: "/encoder/ar_text_position/Expand_2"
+ op_type: "Expand"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_17_output_0"
+ name: "/encoder/ar_text_position/Constant_17"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_2_output_0"
+ input: "/encoder/ar_text_position/Constant_17_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_3_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_3"
+ op_type: "Unsqueeze"
+ }
+ node {
+ input: "/encoder/ar_text_position/Unsqueeze_2_output_0"
+ input: "/encoder/ar_text_position/Unsqueeze_3_output_0"
+ output: "/encoder/ar_text_position/Concat_1_output_0"
+ name: "/encoder/ar_text_position/Concat_1"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: -1
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
+ output: "/encoder/ar_text_position/Shape_5_output_0"
+ name: "/encoder/ar_text_position/Shape_5"
+ op_type: "Shape"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_18_output_0"
+ name: "/encoder/ar_text_position/Constant_18"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_19_output_0"
+ name: "/encoder/ar_text_position/Constant_19"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_20_output_0"
+ name: "/encoder/ar_text_position/Constant_20"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_5_output_0"
+ input: "/encoder/ar_text_position/Constant_19_output_0"
+ input: "/encoder/ar_text_position/Constant_20_output_0"
+ input: "/encoder/ar_text_position/Constant_18_output_0"
+ output: "/encoder/ar_text_position/Slice_2_output_0"
+ name: "/encoder/ar_text_position/Slice_2"
+ op_type: "Slice"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_2_output_0"
+ input: "/encoder/ar_text_position/Slice_2_output_0"
+ output: "/encoder/ar_text_position/Concat_2_output_0"
+ name: "/encoder/ar_text_position/Concat_2"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_output_0"
+ input: "/encoder/ar_text_position/Concat_2_output_0"
+ output: "/encoder/ar_text_position/Reshape_output_0"
+ name: "/encoder/ar_text_position/Reshape"
+ op_type: "Reshape"
+ attribute {
+ name: "allowzero"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_output_0"
+ input: "/encoder/ar_text_position/Concat_1_output_0"
+ input: "/encoder/ar_text_position/Reshape_output_0"
+ output: "/encoder/ar_text_position/ScatterND_output_0"
+ name: "/encoder/ar_text_position/ScatterND"
+ op_type: "ScatterND"
+ }
+ node {
+ input: "/encoder/ar_text_position/Mul_output_0"
+ output: "/encoder/ar_text_position/Cos_output_0"
+ name: "/encoder/ar_text_position/Cos"
+ op_type: "Cos"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_21_output_0"
+ name: "/encoder/ar_text_position/Constant_21"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_22_output_0"
+ name: "/encoder/ar_text_position/Constant_22"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_23_output_0"
+ name: "/encoder/ar_text_position/Constant_23"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_24_output_0"
+ name: "/encoder/ar_text_position/Constant_24"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ input: "/encoder/ar_text_position/Constant_22_output_0"
+ input: "/encoder/ar_text_position/Constant_23_output_0"
+ input: "/encoder/ar_text_position/Constant_21_output_0"
+ input: "/encoder/ar_text_position/Constant_24_output_0"
+ output: "/encoder/ar_text_position/Slice_3_output_0"
+ name: "/encoder/ar_text_position/Slice_3"
+ op_type: "Slice"
+ }
+ node {
+ input: "/encoder/ar_text_position/Slice_3_output_0"
+ output: "/encoder/ar_text_position/Shape_6_output_0"
+ name: "/encoder/ar_text_position/Shape_6"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Cos_output_0"
+ input: "/encoder/ar_text_position/Shape_6_output_0"
+ output: "/encoder/ar_text_position/Expand_3_output_0"
+ name: "/encoder/ar_text_position/Expand_3"
+ op_type: "Expand"
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ output: "onnx::Gather_948"
+ name: "Shape_213"
+ op_type: "Shape"
+ }
+ node {
+ output: "onnx::Gather_949"
+ name: "Constant_214"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Gather_948"
+ input: "onnx::Gather_949"
+ output: "onnx::Cast_950"
+ name: "Gather_215"
+ op_type: "Gather"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "onnx::Cast_950"
+ output: "onnx::Range_951"
+ name: "Cast_216"
+ op_type: "Cast"
+ attribute {
+ name: "to"
+ i: 7
+ type: INT
+ }
+ }
+ node {
+ output: "onnx::Range_952"
+ name: "Constant_217"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "onnx::Range_953"
+ name: "Constant_218"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Range_952"
+ input: "onnx::Range_951"
+ input: "onnx::Range_953"
+ output: "onnx::Reshape_954"
+ name: "Range_219"
+ op_type: "Range"
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ output: "onnx::Gather_955"
+ name: "Shape_220"
+ op_type: "Shape"
+ }
+ node {
+ output: "onnx::Gather_956"
+ name: "Constant_221"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Gather_955"
+ input: "onnx::Gather_956"
+ output: "onnx::Cast_957"
+ name: "Gather_222"
+ op_type: "Gather"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "onnx::Cast_957"
+ output: "onnx::Range_958"
+ name: "Cast_223"
+ op_type: "Cast"
+ attribute {
+ name: "to"
+ i: 7
+ type: INT
+ }
+ }
+ node {
+ output: "onnx::Range_959"
+ name: "Constant_224"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "onnx::Range_960"
+ name: "Constant_225"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Range_959"
+ input: "onnx::Range_958"
+ input: "onnx::Range_960"
+ output: "onnx::Slice_961"
+ name: "Range_226"
+ op_type: "Range"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_25_output_0"
+ name: "/encoder/ar_text_position/Constant_25"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_26_output_0"
+ name: "/encoder/ar_text_position/Constant_26"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_27_output_0"
+ name: "/encoder/ar_text_position/Constant_27"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_28_output_0"
+ name: "/encoder/ar_text_position/Constant_28"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Slice_961"
+ input: "/encoder/ar_text_position/Constant_26_output_0"
+ input: "/encoder/ar_text_position/Constant_27_output_0"
+ input: "/encoder/ar_text_position/Constant_25_output_0"
+ input: "/encoder/ar_text_position/Constant_28_output_0"
+ output: "/encoder/ar_text_position/Slice_4_output_0"
+ name: "/encoder/ar_text_position/Slice_4"
+ op_type: "Slice"
+ }
+ node {
+ output: "onnx::Reshape_972"
+ name: "Constant_232"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 2
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "onnx::Reshape_954"
+ input: "onnx::Reshape_972"
+ output: "onnx::Add_973"
+ name: "Reshape_233"
+ op_type: "Reshape"
+ attribute {
+ name: "allowzero"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "onnx::Add_973"
+ input: "/encoder/ar_text_position/Slice_4_output_0"
+ output: "/encoder/ar_text_position/Add_1_output_0"
+ name: "/encoder/ar_text_position/Add_1"
+ op_type: "Add"
+ }
+ node {
+ input: "/encoder/ar_text_position/Add_1_output_0"
+ output: "/encoder/ar_text_position/Shape_7_output_0"
+ name: "/encoder/ar_text_position/Shape_7"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Shape_8_output_0"
+ name: "/encoder/ar_text_position/Shape_8"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_8_output_0"
+ output: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
+ name: "/encoder/ar_text_position/ConstantOfShape_3"
+ op_type: "ConstantOfShape"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ raw_data: "\001\000\000\000\000\000\000\000"
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_29_output_0"
+ name: "/encoder/ar_text_position/Constant_29"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
+ input: "/encoder/ar_text_position/Constant_29_output_0"
+ output: "/encoder/ar_text_position/Mul_3_output_0"
+ name: "/encoder/ar_text_position/Mul_3"
+ op_type: "Mul"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ input: "/encoder/ar_text_position/Mul_3_output_0"
+ output: "/encoder/ar_text_position/Equal_2_output_0"
+ name: "/encoder/ar_text_position/Equal_2"
+ op_type: "Equal"
+ }
+ node {
+ input: "/encoder/ar_text_position/Equal_2_output_0"
+ input: "/encoder/ar_text_position/ConstantOfShape_3_output_0"
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Where_2_output_0"
+ name: "/encoder/ar_text_position/Where_2"
+ op_type: "Where"
+ }
+ node {
+ input: "onnx::Add_973"
+ input: "/encoder/ar_text_position/Where_2_output_0"
+ output: "/encoder/ar_text_position/Expand_4_output_0"
+ name: "/encoder/ar_text_position/Expand_4"
+ op_type: "Expand"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_30_output_0"
+ name: "/encoder/ar_text_position/Constant_30"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_4_output_0"
+ input: "/encoder/ar_text_position/Constant_30_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_4_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_4"
+ op_type: "Unsqueeze"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Shape_9_output_0"
+ name: "/encoder/ar_text_position/Shape_9"
+ op_type: "Shape"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_9_output_0"
+ output: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
+ name: "/encoder/ar_text_position/ConstantOfShape_4"
+ op_type: "ConstantOfShape"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ raw_data: "\001\000\000\000\000\000\000\000"
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_31_output_0"
+ name: "/encoder/ar_text_position/Constant_31"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
+ input: "/encoder/ar_text_position/Constant_31_output_0"
+ output: "/encoder/ar_text_position/Mul_4_output_0"
+ name: "/encoder/ar_text_position/Mul_4"
+ op_type: "Mul"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ input: "/encoder/ar_text_position/Mul_4_output_0"
+ output: "/encoder/ar_text_position/Equal_3_output_0"
+ name: "/encoder/ar_text_position/Equal_3"
+ op_type: "Equal"
+ }
+ node {
+ input: "/encoder/ar_text_position/Equal_3_output_0"
+ input: "/encoder/ar_text_position/ConstantOfShape_4_output_0"
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ output: "/encoder/ar_text_position/Where_3_output_0"
+ name: "/encoder/ar_text_position/Where_3"
+ op_type: "Where"
+ }
+ node {
+ input: "/encoder/ar_text_position/Slice_4_output_0"
+ input: "/encoder/ar_text_position/Where_3_output_0"
+ output: "/encoder/ar_text_position/Expand_5_output_0"
+ name: "/encoder/ar_text_position/Expand_5"
+ op_type: "Expand"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_32_output_0"
+ name: "/encoder/ar_text_position/Constant_32"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_5_output_0"
+ input: "/encoder/ar_text_position/Constant_32_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_5_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_5"
+ op_type: "Unsqueeze"
+ }
+ node {
+ input: "/encoder/ar_text_position/Unsqueeze_4_output_0"
+ input: "/encoder/ar_text_position/Unsqueeze_5_output_0"
+ output: "/encoder/ar_text_position/Concat_3_output_0"
+ name: "/encoder/ar_text_position/Concat_3"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: -1
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ output: "/encoder/ar_text_position/Shape_10_output_0"
+ name: "/encoder/ar_text_position/Shape_10"
+ op_type: "Shape"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_33_output_0"
+ name: "/encoder/ar_text_position/Constant_33"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_34_output_0"
+ name: "/encoder/ar_text_position/Constant_34"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_35_output_0"
+ name: "/encoder/ar_text_position/Constant_35"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_10_output_0"
+ input: "/encoder/ar_text_position/Constant_34_output_0"
+ input: "/encoder/ar_text_position/Constant_35_output_0"
+ input: "/encoder/ar_text_position/Constant_33_output_0"
+ output: "/encoder/ar_text_position/Slice_5_output_0"
+ name: "/encoder/ar_text_position/Slice_5"
+ op_type: "Slice"
+ }
+ node {
+ input: "/encoder/ar_text_position/Shape_7_output_0"
+ input: "/encoder/ar_text_position/Slice_5_output_0"
+ output: "/encoder/ar_text_position/Concat_4_output_0"
+ name: "/encoder/ar_text_position/Concat_4"
+ op_type: "Concat"
+ attribute {
+ name: "axis"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/Expand_3_output_0"
+ input: "/encoder/ar_text_position/Concat_4_output_0"
+ output: "/encoder/ar_text_position/Reshape_1_output_0"
+ name: "/encoder/ar_text_position/Reshape_1"
+ op_type: "Reshape"
+ attribute {
+ name: "allowzero"
+ i: 0
+ type: INT
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_output_0"
+ input: "/encoder/ar_text_position/Concat_3_output_0"
+ input: "/encoder/ar_text_position/Reshape_1_output_0"
+ output: "/encoder/ar_text_position/ScatterND_1_output_0"
+ name: "/encoder/ar_text_position/ScatterND_1"
+ op_type: "ScatterND"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_36_output_0"
+ name: "/encoder/ar_text_position/Constant_36"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ dims: 1
+ data_type: 7
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/ar_text_position/ScatterND_1_output_0"
+ input: "/encoder/ar_text_position/Constant_36_output_0"
+ output: "/encoder/ar_text_position/Unsqueeze_6_output_0"
+ name: "/encoder/ar_text_position/Unsqueeze_6"
+ op_type: "Unsqueeze"
+ }
+ node {
+ output: "/encoder/ar_text_position/Constant_37_output_0"
+ name: "/encoder/ar_text_position/Constant_37"
+ op_type: "Constant"
+ attribute {
+ name: "value"
+ t {
+ data_type: 1
+ data_location: 0
+ }
+ type: TENSOR
+ }
+ }
+ node {
+ input: "/encoder/Add_output_0"
+ input: "/encoder/ar_text_position/Constant_37_output_0"
+ output: "/encoder/ar_text_position/Mul_5_output_0"
+ name: "/encoder/ar_text_position/Mul_5"
+ op_type: "Mul"
+ }
+ node {
+ input: "encoder.ar_text_position.alpha"
+ input: "/encoder/ar_text_position/Unsqueeze_6_output_0"
+ output: "/encoder/ar_text_position/Mul_6_output_0"
+ name: "/encoder/ar_text_position/Mul_6"
+ op_type: "Mul"
+ }
+ node {
+ input: "/encoder/ar_text_position/Mul_5_output_0"
+ input: "/encoder/ar_text_position/Mul_6_output_0"
+ output: "x"
+ name: "/encoder/ar_text_position/Add_2"
+ op_type: "Add"
+ }
+ initializer {
+ dims: 732
+ dims: 512
+ data_type: 1
+ name: "encoder.ar_text_embedding.word_embeddings.weight"
+ }
+ initializer {
+ dims: 512
+ data_type: 1
+ name: "encoder.bert_proj.bias"
+ }
+ initializer {
+ dims: 1
+ data_type: 1
+ name: "encoder.ar_text_position.alpha"
+ }
+ initializer {
+ dims: 768
+ dims: 768
+ dims: 2
+ data_type: 1
+ name: "vits.ssl_proj.weight"
+ }
+ initializer {
+ dims: 768
+ data_type: 1
+ name: "vits.ssl_proj.bias"
+ }
+ initializer {
+ dims: 768
+ dims: 1024
+ data_type: 1
+ name: "onnx::MatMul_1009"
+ }
+ initializer {
+ dims: 1024
+ dims: 512
+ data_type: 1
+ name: "onnx::MatMul_1010"
+ }
+ input {
+ name: "ref_seq"
+ type {
+ tensor_type {
+ elem_type: 7
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "ref_length"
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "text_seq"
+ type {
+ tensor_type {
+ elem_type: 7
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "text_length"
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "ref_bert"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_param: "ref_length"
+ }
+ dim {
+ dim_value: 1024
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "text_bert"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_param: "text_length"
+ }
+ dim {
+ dim_value: 1024
+ }
+ }
+ }
+ }
+ }
+ input {
+ name: "ssl_content"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_value: 768
+ }
+ dim {
+ dim_param: "ssl_length"
+ }
+ }
+ }
+ }
+ }
+ output {
+ name: "x"
+ type {
+ tensor_type {
+ elem_type: 1
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "Addx_dim_1"
+ }
+ dim {
+ dim_param: "Addx_dim_2"
+ }
+ }
+ }
+ }
+ }
+ output {
+ name: "prompts"
+ type {
+ tensor_type {
+ elem_type: 7
+ shape {
+ dim {
+ dim_value: 1
+ }
+ dim {
+ dim_param: "Unsqueezeprompts_dim_1"
+ }
+ }
+ }
+ }
+ }
+ opset_import {
+ domain: ""
+ version: 17
+ }
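For reference, the input/output signature declared above (ref_seq, text_seq, ref_bert, text_bert, ssl_content -> x, prompts) can be exercised directly with onnxruntime. The sketch below is illustrative only and is not part of this repository: the model path, the dummy sequence lengths, and the zero/random placeholder tensors are all assumptions; in actual use these tensors would come from the text front-end, the BERT features, and the cnhubert model shipped alongside this encoder. ONNX elem_type 7 is int64 and elem_type 1 is float32, matching the declarations in this prototxt.

# Minimal sketch (assumed usage, not repository code) of feeding the
# t2s_encoder interface declared above with placeholder tensors.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("t2s_encoder.onnx")  # assumed local path

ref_len, text_len, ssl_len = 24, 16, 96  # arbitrary dummy lengths
feeds = {
    # Token IDs stay below 732, the row count of
    # encoder.ar_text_embedding.word_embeddings.weight above.
    "ref_seq": np.random.randint(0, 732, size=(1, ref_len), dtype=np.int64),
    "text_seq": np.random.randint(0, 732, size=(1, text_len), dtype=np.int64),
    "ref_bert": np.zeros((ref_len, 1024), dtype=np.float32),
    "text_bert": np.zeros((text_len, 1024), dtype=np.float32),
    "ssl_content": np.zeros((1, 768, ssl_len), dtype=np.float32),
}
x, prompts = sess.run(["x", "prompts"], feeds)
print(x.shape, x.dtype)            # float32, dims per "Addx_dim_*" above
print(prompts.shape, prompts.dtype)  # int64, dims per "Unsqueezeprompts_dim_1"

With placeholder inputs the outputs are numerically meaningless; the point is only that the dtypes and the dynamic dimensions (ref_length, text_length, ssl_length) line up with the graph's declared signature.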