antigravity commited on
Commit
c441d2c
·
1 Parent(s): 620bb7c

sync all fixes: prompt leakage, cross-lang, ref_cache update, and file wait logic

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +62 -9
  2. genie_tts/Audio/Audio.py +51 -51
  3. genie_tts/Audio/__pycache__/Audio.cpython-311.pyc +0 -0
  4. genie_tts/Audio/__pycache__/ReferenceAudio.cpython-311.pyc +0 -0
  5. genie_tts/Audio/__pycache__/__init__.cpython-311.pyc +0 -0
  6. genie_tts/Converter/Converter.py +11 -11
  7. genie_tts/Converter/__pycache__/Converter.cpython-311.pyc +0 -0
  8. genie_tts/Converter/__pycache__/__init__.cpython-311.pyc +0 -0
  9. genie_tts/Converter/__pycache__/load_state_dict.cpython-311.pyc +0 -0
  10. genie_tts/Converter/__pycache__/utils.cpython-311.pyc +0 -0
  11. genie_tts/Converter/load_state_dict.py +26 -26
  12. genie_tts/Converter/v2/Converter.py +146 -146
  13. genie_tts/Converter/v2/EncoderConverter.py +106 -106
  14. genie_tts/Converter/v2/T2SConverter.py +125 -125
  15. genie_tts/Converter/v2/VITSConverter.py +129 -129
  16. genie_tts/Converter/v2/__pycache__/Converter.cpython-311.pyc +0 -0
  17. genie_tts/Converter/v2/__pycache__/EncoderConverter.cpython-311.pyc +0 -0
  18. genie_tts/Converter/v2/__pycache__/T2SConverter.cpython-311.pyc +0 -0
  19. genie_tts/Converter/v2/__pycache__/VITSConverter.cpython-311.pyc +0 -0
  20. genie_tts/Converter/v2/__pycache__/__init__.cpython-311.pyc +0 -0
  21. genie_tts/Converter/v2ProPlus/Converter.py +89 -89
  22. genie_tts/Converter/v2ProPlus/PromptEncoderConverter.py +128 -128
  23. genie_tts/Converter/v2ProPlus/__pycache__/Converter.cpython-311.pyc +0 -0
  24. genie_tts/Converter/v2ProPlus/__pycache__/PromptEncoderConverter.cpython-311.pyc +0 -0
  25. genie_tts/Core/Resources.py +76 -76
  26. genie_tts/Core/__pycache__/Inference.cpython-311.pyc +0 -0
  27. genie_tts/Core/__pycache__/Resources.cpython-311.pyc +0 -0
  28. genie_tts/Core/__pycache__/TTSPlayer.cpython-311.pyc +0 -0
  29. genie_tts/Core/__pycache__/__init__.cpython-311.pyc +0 -0
  30. genie_tts/Data/v2/Keys/t2s_onnx_keys.txt +291 -291
  31. genie_tts/Data/v2/Keys/vits_onnx_keys.txt +668 -668
  32. genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt +23 -23
  33. genie_tts/Data/v2ProPlus/Keys/vits_weights.txt +650 -650
  34. genie_tts/G2P/Chinese/CorrectPronunciation.py +50 -50
  35. genie_tts/G2P/Chinese/Erhua.py +49 -49
  36. genie_tts/G2P/Chinese/Normalization/__pycache__/__init__.cpython-311.pyc +0 -0
  37. genie_tts/G2P/Chinese/Normalization/__pycache__/char_convert.cpython-311.pyc +0 -0
  38. genie_tts/G2P/Chinese/Normalization/__pycache__/chronology.cpython-311.pyc +0 -0
  39. genie_tts/G2P/Chinese/Normalization/__pycache__/constants.cpython-311.pyc +0 -0
  40. genie_tts/G2P/Chinese/Normalization/__pycache__/num.cpython-311.pyc +0 -0
  41. genie_tts/G2P/Chinese/Normalization/__pycache__/phonecode.cpython-311.pyc +0 -0
  42. genie_tts/G2P/Chinese/Normalization/__pycache__/quantifier.cpython-311.pyc +0 -0
  43. genie_tts/G2P/Chinese/Normalization/__pycache__/text_normlization.cpython-311.pyc +0 -0
  44. genie_tts/G2P/Chinese/ToneSandhi.py +354 -354
  45. genie_tts/G2P/Chinese/__pycache__/ChineseG2P.cpython-311.pyc +0 -0
  46. genie_tts/G2P/Chinese/__pycache__/CorrectPronunciation.cpython-311.pyc +0 -0
  47. genie_tts/G2P/Chinese/__pycache__/Erhua.cpython-311.pyc +0 -0
  48. genie_tts/G2P/Chinese/__pycache__/ToneSandhi.cpython-311.pyc +0 -0
  49. genie_tts/G2P/Chinese/__pycache__/__init__.cpython-311.pyc +0 -0
  50. genie_tts/G2P/English/EnglishG2P.py +296 -296
app.py CHANGED
@@ -66,6 +66,30 @@ async def load_model(character_name: str = Form(...), model_path: str = Form(...
66
  try:
67
  print(f"📦 Loading character: {character_name} from {full_path}")
68
  genie_tts.load_character(character_name, full_path, language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  return {"status": "success", "message": f"Character '{character_name}' loaded."}
70
  except Exception as e:
71
  raise HTTPException(status_code=500, detail=str(e))
@@ -76,12 +100,21 @@ async def upload_and_tts(
76
  prompt_text: str = Form(...),
77
  text: str = Form(...),
78
  language: str = Form("zh"),
 
79
  file: UploadFile = File(...)
80
  ):
81
  """
82
  上传临时参考音频并生成语音
83
  """
84
  try:
 
 
 
 
 
 
 
 
85
  ts = int(time.time() * 1000)
86
  save_path = f"/tmp/ref_{ts}.wav"
87
  os.makedirs("/tmp", exist_ok=True)
@@ -89,23 +122,37 @@ async def upload_and_tts(
89
  with open(save_path, "wb") as buffer:
90
  shutil.copyfileobj(file.file, buffer)
91
 
92
- print(f"🔥 [Custom] Using temp audio for {character_name}: {save_path}")
93
  genie_tts.set_reference_audio(character_name, save_path, prompt_text, language)
94
 
95
  out_path = f"/tmp/out_{ts}.wav"
96
- genie_tts.tts(character_name, text, save_path=out_path, play=False)
 
97
 
 
 
 
 
 
 
 
 
 
98
  def iterfile():
99
- with open(out_path, "rb") as f:
100
- yield from f
101
- try:
102
- os.remove(save_path)
103
- os.remove(out_path)
104
- except: pass
 
 
 
 
105
 
106
  return StreamingResponse(iterfile(), media_type="audio/wav")
107
  except Exception as e:
108
- print(f"❌ Error in upload/tts: {e}")
109
  raise HTTPException(status_code=500, detail=str(e))
110
 
111
  @app.post("/tts")
@@ -138,6 +185,12 @@ async def dynamic_tts(
138
  out_path = f"/tmp/out_dyn_{int(time.time())}.wav"
139
  genie_tts.tts(character_name, text, save_path=out_path, play=False, text_language=text_lang)
140
 
 
 
 
 
 
 
141
  return StreamingResponse(open(out_path, "rb"), media_type="audio/wav")
142
  except Exception as e:
143
  print(f"❌ Error: {e}")
 
66
  try:
67
  print(f"📦 Loading character: {character_name} from {full_path}")
68
  genie_tts.load_character(character_name, full_path, language)
69
+
70
+ # 自动探测参考音频配置
71
+ prompt_json_path = os.path.join(full_path, "prompt_wav.json")
72
+ ref_wav_path = os.path.join(full_path, "ref.wav")
73
+
74
+ if os.path.exists(prompt_json_path):
75
+ import json
76
+ with open(prompt_json_path, "r", encoding="utf-8") as f:
77
+ data = json.load(f)
78
+ config = data.get("default", {})
79
+ REF_CACHE[character_name] = {
80
+ "path": os.path.join(full_path, config.get("wav_path", "ref.wav")),
81
+ "text": config.get("prompt_text", ""),
82
+ "lang": config.get("prompt_lang", language)
83
+ }
84
+ print(f"📖 Loaded ref info from JSON for {character_name}")
85
+ elif os.path.exists(ref_wav_path):
86
+ REF_CACHE[character_name] = {
87
+ "path": ref_wav_path,
88
+ "text": "",
89
+ "lang": language
90
+ }
91
+ print(f"🎵 Found ref.wav for {character_name}")
92
+
93
  return {"status": "success", "message": f"Character '{character_name}' loaded."}
94
  except Exception as e:
95
  raise HTTPException(status_code=500, detail=str(e))
 
100
  prompt_text: str = Form(...),
101
  text: str = Form(...),
102
  language: str = Form("zh"),
103
+ text_lang: str = Form(None),
104
  file: UploadFile = File(...)
105
  ):
106
  """
107
  上传临时参考音频并生成语音
108
  """
109
  try:
110
+ # 🟢 确保模型已加载
111
+ if not genie_tts.model_manager.get(character_name):
112
+ print(f"⚠️ Character {character_name} not loaded, trying to load...")
113
+ char_path = os.path.join(MODELS_ROOT, character_name.lower())
114
+ if not os.path.exists(char_path):
115
+ char_path = os.path.join(MODELS_ROOT, "mzm") # 兜底逻辑
116
+ genie_tts.load_character(character_name, char_path, language)
117
+
118
  ts = int(time.time() * 1000)
119
  save_path = f"/tmp/ref_{ts}.wav"
120
  os.makedirs("/tmp", exist_ok=True)
 
122
  with open(save_path, "wb") as buffer:
123
  shutil.copyfileobj(file.file, buffer)
124
 
125
+ print(f"🔥 [Custom] Using temp audio: {save_path}")
126
  genie_tts.set_reference_audio(character_name, save_path, prompt_text, language)
127
 
128
  out_path = f"/tmp/out_{ts}.wav"
129
+ # 🟢 执行 TTS
130
+ genie_tts.tts(character_name, text, save_path=out_path, play=False, text_language=text_lang)
131
 
132
+ # 🟢 关键:强制等待文件出现(最多等5秒)
133
+ wait_time = 0
134
+ while not os.path.exists(out_path) and wait_time < 50:
135
+ time.sleep(0.1)
136
+ wait_time += 1
137
+
138
+ if not os.path.exists(out_path):
139
+ raise HTTPException(status_code=500, detail="Audio file generation timed out or failed.")
140
+
141
  def iterfile():
142
+ try:
143
+ with open(out_path, "rb") as f:
144
+ yield from f
145
+ finally:
146
+ # 给一点延迟确保读取完毕后再删除
147
+ time.sleep(1)
148
+ try:
149
+ if os.path.exists(save_path): os.remove(save_path)
150
+ if os.path.exists(out_path): os.remove(out_path)
151
+ except: pass
152
 
153
  return StreamingResponse(iterfile(), media_type="audio/wav")
154
  except Exception as e:
155
+ print(f"❌ Error in upload/tts: {str(e)}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
 
158
  @app.post("/tts")
 
185
  out_path = f"/tmp/out_dyn_{int(time.time())}.wav"
186
  genie_tts.tts(character_name, text, save_path=out_path, play=False, text_language=text_lang)
187
 
188
+ # 🟢 同样增加文件等待
189
+ wait_time = 0
190
+ while not os.path.exists(out_path) and wait_time < 50:
191
+ time.sleep(0.1)
192
+ wait_time += 1
193
+
194
  return StreamingResponse(open(out_path, "rb"), media_type="audio/wav")
195
  except Exception as e:
196
  print(f"❌ Error: {e}")
genie_tts/Audio/Audio.py CHANGED
@@ -1,51 +1,51 @@
1
- import os
2
- import soundfile as sf
3
- import soxr
4
- import numpy as np
5
- import logging
6
- from typing import Optional
7
-
8
- logger = logging.getLogger(__name__)
9
-
10
- # 音频时长建议范围 (秒)
11
- MIN_DURATION_S = 3
12
- MAX_DURATION_S = 10
13
- # 在音频末尾追加的静音时长 (秒)
14
- SILENCE_TO_APPEND_S = 0.3
15
- # 模型期望的目标采样率
16
- TARGET_SAMPLING_RATE = 16000
17
-
18
-
19
- def load_audio(
20
- audio_path: str,
21
- target_sampling_rate: int = TARGET_SAMPLING_RATE
22
- ) -> Optional[np.ndarray]:
23
- try:
24
- wav, original_sr = sf.read(audio_path, dtype='float32')
25
- if wav.ndim > 1:
26
- wav = np.mean(wav, axis=1) # 多声道转单声道。
27
- if original_sr != target_sampling_rate:
28
- wav = soxr.resample(wav, original_sr, target_sampling_rate, quality='hq') # 重采样。
29
-
30
- except Exception as e:
31
- logger.error(f"Failed to load reference audio: {audio_path}. Error: {e}")
32
- return None
33
-
34
- # 检查音频长度是否在建议范围之外
35
- min_samples = int(MIN_DURATION_S * target_sampling_rate)
36
- max_samples = int(MAX_DURATION_S * target_sampling_rate)
37
- if not (min_samples <= wav.shape[0] <= max_samples):
38
- duration = len(wav) / target_sampling_rate
39
- logger.warning(
40
- f"The reference audio '{os.path.basename(audio_path)}' has a duration of {duration:.2f} seconds, "
41
- f"which is outside the recommended range of {MIN_DURATION_S} to {MAX_DURATION_S} seconds!"
42
- )
43
-
44
- # 创建并拼接静音
45
- silence_samples = int(SILENCE_TO_APPEND_S * target_sampling_rate)
46
- silence_array = np.zeros(silence_samples, dtype=np.float32)
47
- wav_processed = np.concatenate([wav, silence_array])
48
-
49
- # 为模型输入增加批次维度
50
- # wav_processed = np.expand_dims(wav_processed, axis=0)
51
- return wav_processed
 
1
+ import os
2
+ import soundfile as sf
3
+ import soxr
4
+ import numpy as np
5
+ import logging
6
+ from typing import Optional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # 音频时长建议范围 (秒)
11
+ MIN_DURATION_S = 3
12
+ MAX_DURATION_S = 10
13
+ # 在音频末尾追加的静音时长 (秒)
14
+ SILENCE_TO_APPEND_S = 0.3
15
+ # 模型期望的目标采样率
16
+ TARGET_SAMPLING_RATE = 16000
17
+
18
+
19
+ def load_audio(
20
+ audio_path: str,
21
+ target_sampling_rate: int = TARGET_SAMPLING_RATE
22
+ ) -> Optional[np.ndarray]:
23
+ try:
24
+ wav, original_sr = sf.read(audio_path, dtype='float32')
25
+ if wav.ndim > 1:
26
+ wav = np.mean(wav, axis=1) # 多声道转单声道。
27
+ if original_sr != target_sampling_rate:
28
+ wav = soxr.resample(wav, original_sr, target_sampling_rate, quality='hq') # 重采样。
29
+
30
+ except Exception as e:
31
+ logger.error(f"Failed to load reference audio: {audio_path}. Error: {e}")
32
+ return None
33
+
34
+ # 检查音频长度是否在建议范围之外
35
+ min_samples = int(MIN_DURATION_S * target_sampling_rate)
36
+ max_samples = int(MAX_DURATION_S * target_sampling_rate)
37
+ if not (min_samples <= wav.shape[0] <= max_samples):
38
+ duration = len(wav) / target_sampling_rate
39
+ logger.warning(
40
+ f"The reference audio '{os.path.basename(audio_path)}' has a duration of {duration:.2f} seconds, "
41
+ f"which is outside the recommended range of {MIN_DURATION_S} to {MAX_DURATION_S} seconds!"
42
+ )
43
+
44
+ # 创建并拼接静音
45
+ silence_samples = int(SILENCE_TO_APPEND_S * target_sampling_rate)
46
+ silence_array = np.zeros(silence_samples, dtype=np.float32)
47
+ wav_processed = np.concatenate([wav, silence_array])
48
+
49
+ # 为模型输入增加批次维度
50
+ # wav_processed = np.expand_dims(wav_processed, axis=0)
51
+ return wav_processed
genie_tts/Audio/__pycache__/Audio.cpython-311.pyc ADDED
Binary file (2.61 kB). View file
 
genie_tts/Audio/__pycache__/ReferenceAudio.cpython-311.pyc ADDED
Binary file (4.63 kB). View file
 
genie_tts/Audio/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes). View file
 
genie_tts/Converter/Converter.py CHANGED
@@ -1,11 +1,11 @@
1
- from .v2.Converter import convert as convert_v2
2
- from .v2ProPlus.Converter import convert as convert_v2pp
3
-
4
- import os
5
-
6
-
7
- def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None:
8
- if os.path.getsize(torch_pth_path) > 150 * 1024 * 1024: # 大于 150 MB
9
- convert_v2pp(torch_ckpt_path, torch_pth_path, output_dir)
10
- else:
11
- convert_v2(torch_ckpt_path, torch_pth_path, output_dir)
 
1
+ from .v2.Converter import convert as convert_v2
2
+ from .v2ProPlus.Converter import convert as convert_v2pp
3
+
4
+ import os
5
+
6
+
7
+ def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None:
8
+ if os.path.getsize(torch_pth_path) > 150 * 1024 * 1024: # 大于 150 MB
9
+ convert_v2pp(torch_ckpt_path, torch_pth_path, output_dir)
10
+ else:
11
+ convert_v2(torch_ckpt_path, torch_pth_path, output_dir)
genie_tts/Converter/__pycache__/Converter.cpython-311.pyc ADDED
Binary file (838 Bytes). View file
 
genie_tts/Converter/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (188 Bytes). View file
 
genie_tts/Converter/__pycache__/load_state_dict.cpython-311.pyc ADDED
Binary file (1.56 kB). View file
 
genie_tts/Converter/__pycache__/utils.cpython-311.pyc ADDED
Binary file (2.33 kB). View file
 
genie_tts/Converter/load_state_dict.py CHANGED
@@ -1,26 +1,26 @@
1
- import sys
2
- import os
3
-
4
- sys.path.append(os.path.dirname(__file__))
5
-
6
- import torch
7
- from io import BytesIO
8
- import utils
9
-
10
-
11
- def load_sovits_model(pth_path: str, device: str = 'cpu'):
12
- f = open(pth_path, "rb")
13
- meta = f.read(2)
14
- if meta != b"PK":
15
- # noinspection PyTypeChecker
16
- data = b"PK" + f.read()
17
- bio = BytesIO()
18
- # noinspection PyTypeChecker
19
- bio.write(data)
20
- bio.seek(0)
21
- return torch.load(bio, map_location=device, weights_only=False)
22
- return torch.load(pth_path, map_location=device, weights_only=False)
23
-
24
-
25
- def load_gpt_model(ckpt_path: str, device: str = 'cpu'):
26
- return torch.load(ckpt_path, map_location=device, weights_only=True)
 
1
+ import sys
2
+ import os
3
+
4
+ sys.path.append(os.path.dirname(__file__))
5
+
6
+ import torch
7
+ from io import BytesIO
8
+ import utils
9
+
10
+
11
+ def load_sovits_model(pth_path: str, device: str = 'cpu'):
12
+ f = open(pth_path, "rb")
13
+ meta = f.read(2)
14
+ if meta != b"PK":
15
+ # noinspection PyTypeChecker
16
+ data = b"PK" + f.read()
17
+ bio = BytesIO()
18
+ # noinspection PyTypeChecker
19
+ bio.write(data)
20
+ bio.seek(0)
21
+ return torch.load(bio, map_location=device, weights_only=False)
22
+ return torch.load(pth_path, map_location=device, weights_only=False)
23
+
24
+
25
+ def load_gpt_model(ckpt_path: str, device: str = 'cpu'):
26
+ return torch.load(ckpt_path, map_location=device, weights_only=True)
genie_tts/Converter/v2/Converter.py CHANGED
@@ -1,146 +1,146 @@
1
- from .VITSConverter import VITSConverter
2
- from .T2SConverter import T2SModelConverter
3
- from .EncoderConverter import EncoderConverter
4
- from ...Utils.Constants import PACKAGE_NAME
5
-
6
- import logging
7
- from typing import Optional, Tuple
8
- import re
9
- import os
10
- import shutil
11
- import traceback
12
- import importlib.resources
13
- import contextlib
14
-
15
- logger = logging.getLogger()
16
-
17
- CACHE_DIR = os.path.join(os.getcwd(), "Cache")
18
- ENCODER_RESOURCE_PATH = "Data/v2/Models/t2s_encoder_fp32.onnx"
19
- STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_stage_decoder_fp32.onnx"
20
- FIRST_STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_first_stage_decoder_fp32.onnx"
21
- VITS_RESOURCE_PATH = "Data/v2/Models/vits_fp32.onnx"
22
- T2S_KEYS_RESOURCE_PATH = "Data/v2/Keys/t2s_onnx_keys.txt"
23
- VITS_KEYS_RESOURCE_PATH = "Data/v2/Keys/vits_onnx_keys.txt"
24
-
25
-
26
- def find_ckpt_and_pth(directory: str) -> Tuple[Optional[str], Optional[str]]:
27
- """
28
- 在 directory(不递归子目录)里查找:
29
- - .ckpt:从所有 .ckpt 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0),
30
- 选择 epoch 最大的那个文件(若无则为 None)
31
- - .pth :从所有 .pth 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0),
32
- 选择 epoch 最大的那个文件(若无则为 None)
33
- 若出现相同 epoch,选修改时间较新的文件以打破平手。
34
- """
35
- best_ckpt_path: Optional[str] = None
36
- best_ckpt_epoch: int = -1
37
-
38
- best_pth_path: Optional[str] = None
39
- best_pth_epoch: int = -1
40
-
41
- for filename in os.listdir(directory):
42
- full_path = os.path.join(directory, filename)
43
-
44
- if not os.path.isfile(full_path):
45
- continue
46
-
47
- # 提取 epoch
48
- m = re.search(r"e(\d+)", filename, flags=re.IGNORECASE)
49
- epoch = int(m.group(1)) if m else 0
50
-
51
- # .ckpt 文件处理
52
- if filename.lower().endswith(".ckpt"):
53
- if (
54
- epoch > best_ckpt_epoch
55
- or (
56
- epoch == best_ckpt_epoch
57
- and best_ckpt_path is not None
58
- and os.path.getmtime(full_path) > os.path.getmtime(best_ckpt_path)
59
- )
60
- ):
61
- best_ckpt_epoch = epoch
62
- best_ckpt_path = full_path
63
-
64
- # .pth 文件处理
65
- elif filename.lower().endswith(".pth"):
66
- if (
67
- epoch > best_pth_epoch
68
- or (
69
- epoch == best_pth_epoch
70
- and best_pth_path is not None
71
- and os.path.getmtime(full_path) > os.path.getmtime(best_pth_path)
72
- )
73
- ):
74
- best_pth_epoch = epoch
75
- best_pth_path = full_path
76
-
77
- return best_ckpt_path, best_pth_path
78
-
79
-
80
- def remove_folder(folder: str) -> None:
81
- try:
82
- if os.path.exists(folder):
83
- shutil.rmtree(folder)
84
- logger.info(f"🧹 Folder cleaned: {folder}")
85
- except Exception as e:
86
- logger.error(f"❌ Failed to clean folder {folder}: {e}")
87
-
88
-
89
- def convert(torch_ckpt_path: str,
90
- torch_pth_path: str,
91
- output_dir: str):
92
- # 确保缓存和输出目录存在
93
- os.makedirs(CACHE_DIR, exist_ok=True)
94
- os.makedirs(output_dir, exist_ok=True)
95
-
96
- if len(os.listdir(output_dir)) > 0:
97
- logger.warning(f"The output directory {output_dir} is not empty!")
98
-
99
- with contextlib.ExitStack() as stack:
100
- files = importlib.resources.files(PACKAGE_NAME)
101
-
102
- def enter(p):
103
- return stack.enter_context(importlib.resources.as_file(files.joinpath(p)))
104
-
105
- encoder_onnx_path = enter(ENCODER_RESOURCE_PATH)
106
- stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH)
107
- first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH)
108
- vits_onnx_path = enter(VITS_RESOURCE_PATH)
109
- t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH)
110
- vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH)
111
-
112
- converter_1 = T2SModelConverter(
113
- torch_ckpt_path=torch_ckpt_path,
114
- stage_decoder_onnx_path=str(stage_decoder_path),
115
- first_stage_decoder_onnx_path=str(first_stage_decoder_path),
116
- key_list_file=str(t2s_keys_path),
117
- output_dir=output_dir,
118
- cache_dir=CACHE_DIR,
119
- )
120
- converter_2 = VITSConverter(
121
- torch_pth_path=torch_pth_path,
122
- vits_onnx_path=str(vits_onnx_path),
123
- key_list_file=str(vits_keys_path),
124
- output_dir=output_dir,
125
- cache_dir=CACHE_DIR,
126
- )
127
- converter_3 = EncoderConverter(
128
- ckpt_path=torch_ckpt_path,
129
- pth_path=torch_pth_path,
130
- onnx_input_path=str(encoder_onnx_path),
131
- output_dir=output_dir,
132
- )
133
-
134
- try:
135
- converter_1.run_full_process()
136
- converter_2.run_full_process()
137
- converter_3.run_full_process()
138
- logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n"
139
- f"- Model Type: V2")
140
- except Exception:
141
- logger.error(f"❌ A critical error occurred during the conversion process")
142
- logger.error(traceback.format_exc())
143
- remove_folder(output_dir) # 只在失败时清理输出目录
144
- finally:
145
- # 无论成功还是失败,都尝试清理缓存目录
146
- remove_folder(CACHE_DIR)
 
1
+ from .VITSConverter import VITSConverter
2
+ from .T2SConverter import T2SModelConverter
3
+ from .EncoderConverter import EncoderConverter
4
+ from ...Utils.Constants import PACKAGE_NAME
5
+
6
+ import logging
7
+ from typing import Optional, Tuple
8
+ import re
9
+ import os
10
+ import shutil
11
+ import traceback
12
+ import importlib.resources
13
+ import contextlib
14
+
15
+ logger = logging.getLogger()
16
+
17
+ CACHE_DIR = os.path.join(os.getcwd(), "Cache")
18
+ ENCODER_RESOURCE_PATH = "Data/v2/Models/t2s_encoder_fp32.onnx"
19
+ STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_stage_decoder_fp32.onnx"
20
+ FIRST_STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_first_stage_decoder_fp32.onnx"
21
+ VITS_RESOURCE_PATH = "Data/v2/Models/vits_fp32.onnx"
22
+ T2S_KEYS_RESOURCE_PATH = "Data/v2/Keys/t2s_onnx_keys.txt"
23
+ VITS_KEYS_RESOURCE_PATH = "Data/v2/Keys/vits_onnx_keys.txt"
24
+
25
+
26
+ def find_ckpt_and_pth(directory: str) -> Tuple[Optional[str], Optional[str]]:
27
+ """
28
+ 在 directory(不递归子目录)里查找:
29
+ - .ckpt:从所有 .ckpt 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0),
30
+ 选择 epoch 最大的那个文件(若无则为 None)
31
+ - .pth :从所有 .pth 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0),
32
+ 选择 epoch 最大的那个文件(若无则为 None)
33
+ 若出现相同 epoch,选修改时间较新的文件以打破平手。
34
+ """
35
+ best_ckpt_path: Optional[str] = None
36
+ best_ckpt_epoch: int = -1
37
+
38
+ best_pth_path: Optional[str] = None
39
+ best_pth_epoch: int = -1
40
+
41
+ for filename in os.listdir(directory):
42
+ full_path = os.path.join(directory, filename)
43
+
44
+ if not os.path.isfile(full_path):
45
+ continue
46
+
47
+ # 提取 epoch
48
+ m = re.search(r"e(\d+)", filename, flags=re.IGNORECASE)
49
+ epoch = int(m.group(1)) if m else 0
50
+
51
+ # .ckpt 文件处理
52
+ if filename.lower().endswith(".ckpt"):
53
+ if (
54
+ epoch > best_ckpt_epoch
55
+ or (
56
+ epoch == best_ckpt_epoch
57
+ and best_ckpt_path is not None
58
+ and os.path.getmtime(full_path) > os.path.getmtime(best_ckpt_path)
59
+ )
60
+ ):
61
+ best_ckpt_epoch = epoch
62
+ best_ckpt_path = full_path
63
+
64
+ # .pth 文件处理
65
+ elif filename.lower().endswith(".pth"):
66
+ if (
67
+ epoch > best_pth_epoch
68
+ or (
69
+ epoch == best_pth_epoch
70
+ and best_pth_path is not None
71
+ and os.path.getmtime(full_path) > os.path.getmtime(best_pth_path)
72
+ )
73
+ ):
74
+ best_pth_epoch = epoch
75
+ best_pth_path = full_path
76
+
77
+ return best_ckpt_path, best_pth_path
78
+
79
+
80
+ def remove_folder(folder: str) -> None:
81
+ try:
82
+ if os.path.exists(folder):
83
+ shutil.rmtree(folder)
84
+ logger.info(f"🧹 Folder cleaned: {folder}")
85
+ except Exception as e:
86
+ logger.error(f"❌ Failed to clean folder {folder}: {e}")
87
+
88
+
89
+ def convert(torch_ckpt_path: str,
90
+ torch_pth_path: str,
91
+ output_dir: str):
92
+ # 确保缓存和输出目录存在
93
+ os.makedirs(CACHE_DIR, exist_ok=True)
94
+ os.makedirs(output_dir, exist_ok=True)
95
+
96
+ if len(os.listdir(output_dir)) > 0:
97
+ logger.warning(f"The output directory {output_dir} is not empty!")
98
+
99
+ with contextlib.ExitStack() as stack:
100
+ files = importlib.resources.files(PACKAGE_NAME)
101
+
102
+ def enter(p):
103
+ return stack.enter_context(importlib.resources.as_file(files.joinpath(p)))
104
+
105
+ encoder_onnx_path = enter(ENCODER_RESOURCE_PATH)
106
+ stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH)
107
+ first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH)
108
+ vits_onnx_path = enter(VITS_RESOURCE_PATH)
109
+ t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH)
110
+ vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH)
111
+
112
+ converter_1 = T2SModelConverter(
113
+ torch_ckpt_path=torch_ckpt_path,
114
+ stage_decoder_onnx_path=str(stage_decoder_path),
115
+ first_stage_decoder_onnx_path=str(first_stage_decoder_path),
116
+ key_list_file=str(t2s_keys_path),
117
+ output_dir=output_dir,
118
+ cache_dir=CACHE_DIR,
119
+ )
120
+ converter_2 = VITSConverter(
121
+ torch_pth_path=torch_pth_path,
122
+ vits_onnx_path=str(vits_onnx_path),
123
+ key_list_file=str(vits_keys_path),
124
+ output_dir=output_dir,
125
+ cache_dir=CACHE_DIR,
126
+ )
127
+ converter_3 = EncoderConverter(
128
+ ckpt_path=torch_ckpt_path,
129
+ pth_path=torch_pth_path,
130
+ onnx_input_path=str(encoder_onnx_path),
131
+ output_dir=output_dir,
132
+ )
133
+
134
+ try:
135
+ converter_1.run_full_process()
136
+ converter_2.run_full_process()
137
+ converter_3.run_full_process()
138
+ logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n"
139
+ f"- Model Type: V2")
140
+ except Exception:
141
+ logger.error(f"❌ A critical error occurred during the conversion process")
142
+ logger.error(traceback.format_exc())
143
+ remove_folder(output_dir) # 只在失败时清理输出目录
144
+ finally:
145
+ # 无论成功还是失败,都尝试清理缓存目录
146
+ remove_folder(CACHE_DIR)
genie_tts/Converter/v2/EncoderConverter.py CHANGED
@@ -1,106 +1,106 @@
1
- import torch
2
- import onnx
3
- import os
4
-
5
- from ..load_state_dict import load_gpt_model, load_sovits_model
6
-
7
-
8
- class EncoderConverter:
9
- """
10
- 一个转换器,用于为 t2s_encoder 模型创建:
11
- 1. 一个从 .ckpt 和 .pth 文件中合并而来的全精度 (fp32) .bin 权重文件。
12
- 2. 一个链接到该 .bin 文件的 ONNX 模型。
13
- """
14
-
15
- def __init__(self,
16
- ckpt_path: str,
17
- pth_path: str,
18
- onnx_input_path: str,
19
- output_dir: str,
20
- ):
21
- self.ckpt_path: str = ckpt_path
22
- self.pth_path: str = pth_path
23
- self.onnx_input_path: str = onnx_input_path
24
- self.output_dir: str = output_dir
25
-
26
- # 定义最终输出文件的路径
27
- self.output_bin_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.bin")
28
- self.output_onnx_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx")
29
-
30
- # 确保输出目录存在
31
- os.makedirs(self.output_dir, exist_ok=True)
32
-
33
- # 检查所有输入文件是否存在
34
- for path in [self.ckpt_path, self.pth_path, self.onnx_input_path]:
35
- if not os.path.exists(path):
36
- raise FileNotFoundError(f"Error: Input file not found! Path: {path}")
37
-
38
- def run_full_process(self):
39
- # 1. 定义固定的 ONNX 权重键列表 (此顺序决定了 .bin 文件的布局)
40
- onnx_keys = [
41
- "encoder.ar_text_embedding.word_embeddings.weight",
42
- "encoder.bert_proj.weight",
43
- "encoder.bert_proj.bias",
44
- "encoder.ar_text_position.alpha",
45
- "vits.ssl_proj.weight",
46
- "vits.ssl_proj.bias",
47
- "vits.quantizer.vq.layers.0._codebook.embed"
48
- ]
49
-
50
- # 2. 加载所有必要的模型和权重
51
- ckpt_state_dict = load_gpt_model(self.ckpt_path)['weight']
52
- pth_state_dict = load_sovits_model(self.pth_path)['weight']
53
- model = onnx.load(self.onnx_input_path, load_external_data=False)
54
- initializer_map = {init.name: init for init in model.graph.initializer}
55
- current_offset = 0
56
- bin_filename = os.path.basename(self.output_bin_path)
57
-
58
- # 3. 生成 .bin 文件并同步修改 ONNX 模型
59
- with open(self.output_bin_path, 'wb') as f_bin:
60
- for onnx_key in onnx_keys:
61
- source_key = ""
62
- source_dict = None
63
-
64
- if onnx_key.startswith("encoder."):
65
- source_key = "model." + onnx_key[len("encoder."):]
66
- source_dict = ckpt_state_dict
67
- elif onnx_key.startswith("vits."):
68
- source_key = onnx_key[len("vits."):]
69
- source_dict = pth_state_dict
70
-
71
- if source_dict is None:
72
- raise ValueError(
73
- f"❌ Critical error: Unable to determine the weight source for ONNX key '{onnx_key}'.")
74
- # 从源文件中提取张量
75
- tensor = source_dict.get(source_key)
76
- if tensor is None:
77
- raise ValueError(
78
- f"❌ Critical error: Key '{source_key}' (corresponding to ONNX key '{onnx_key}') not found in the source file.")
79
-
80
- # 转换为 fp32 numpy 数组并获取字节
81
- numpy_array_fp32 = tensor.to(torch.float32).cpu().numpy()
82
- tensor_bytes = numpy_array_fp32.tobytes()
83
- tensor_length = len(tensor_bytes)
84
- f_bin.write(tensor_bytes)
85
-
86
- # 在 ONNX 模型中找到对应的 initializer 并修改它
87
- if onnx_key in initializer_map:
88
- tensor_proto = initializer_map[onnx_key]
89
-
90
- tensor_proto.ClearField('raw_data')
91
- tensor_proto.data_location = onnx.TensorProto.EXTERNAL
92
- del tensor_proto.external_data[:]
93
-
94
- keys_to_set = ["location", "offset", "length"]
95
- values_to_set = [bin_filename, str(current_offset), str(tensor_length)]
96
-
97
- for k, v in zip(keys_to_set, values_to_set):
98
- entry = tensor_proto.external_data.add()
99
- entry.key = k
100
- entry.value = v
101
-
102
- # 更新下一个权重的偏移量
103
- current_offset += tensor_length
104
-
105
- # 4. 保存修改后的 ONNX 模型
106
- onnx.save(model, self.output_onnx_path)
 
1
+ import torch
2
+ import onnx
3
+ import os
4
+
5
+ from ..load_state_dict import load_gpt_model, load_sovits_model
6
+
7
+
8
+ class EncoderConverter:
9
+ """
10
+ 一个转换器,用于为 t2s_encoder 模型创建:
11
+ 1. 一个从 .ckpt 和 .pth 文件中合并而来的全精度 (fp32) .bin 权重文件。
12
+ 2. 一个链接到该 .bin 文件的 ONNX 模型。
13
+ """
14
+
15
+ def __init__(self,
16
+ ckpt_path: str,
17
+ pth_path: str,
18
+ onnx_input_path: str,
19
+ output_dir: str,
20
+ ):
21
+ self.ckpt_path: str = ckpt_path
22
+ self.pth_path: str = pth_path
23
+ self.onnx_input_path: str = onnx_input_path
24
+ self.output_dir: str = output_dir
25
+
26
+ # 定义最终输出文件的路径
27
+ self.output_bin_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.bin")
28
+ self.output_onnx_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx")
29
+
30
+ # 确保输出目录存在
31
+ os.makedirs(self.output_dir, exist_ok=True)
32
+
33
+ # 检查所有输入文件是否存在
34
+ for path in [self.ckpt_path, self.pth_path, self.onnx_input_path]:
35
+ if not os.path.exists(path):
36
+ raise FileNotFoundError(f"Error: Input file not found! Path: {path}")
37
+
38
+ def run_full_process(self):
39
+ # 1. 定义固定的 ONNX 权重键列表 (此顺序决定了 .bin 文件的布局)
40
+ onnx_keys = [
41
+ "encoder.ar_text_embedding.word_embeddings.weight",
42
+ "encoder.bert_proj.weight",
43
+ "encoder.bert_proj.bias",
44
+ "encoder.ar_text_position.alpha",
45
+ "vits.ssl_proj.weight",
46
+ "vits.ssl_proj.bias",
47
+ "vits.quantizer.vq.layers.0._codebook.embed"
48
+ ]
49
+
50
+ # 2. 加载所有必要的模型和权重
51
+ ckpt_state_dict = load_gpt_model(self.ckpt_path)['weight']
52
+ pth_state_dict = load_sovits_model(self.pth_path)['weight']
53
+ model = onnx.load(self.onnx_input_path, load_external_data=False)
54
+ initializer_map = {init.name: init for init in model.graph.initializer}
55
+ current_offset = 0
56
+ bin_filename = os.path.basename(self.output_bin_path)
57
+
58
+ # 3. 生成 .bin 文件并同步修改 ONNX 模型
59
+ with open(self.output_bin_path, 'wb') as f_bin:
60
+ for onnx_key in onnx_keys:
61
+ source_key = ""
62
+ source_dict = None
63
+
64
+ if onnx_key.startswith("encoder."):
65
+ source_key = "model." + onnx_key[len("encoder."):]
66
+ source_dict = ckpt_state_dict
67
+ elif onnx_key.startswith("vits."):
68
+ source_key = onnx_key[len("vits."):]
69
+ source_dict = pth_state_dict
70
+
71
+ if source_dict is None:
72
+ raise ValueError(
73
+ f"❌ Critical error: Unable to determine the weight source for ONNX key '{onnx_key}'.")
74
+ # 从源文件中提取张量
75
+ tensor = source_dict.get(source_key)
76
+ if tensor is None:
77
+ raise ValueError(
78
+ f"❌ Critical error: Key '{source_key}' (corresponding to ONNX key '{onnx_key}') not found in the source file.")
79
+
80
+ # 转换为 fp32 numpy 数组并获取字节
81
+ numpy_array_fp32 = tensor.to(torch.float32).cpu().numpy()
82
+ tensor_bytes = numpy_array_fp32.tobytes()
83
+ tensor_length = len(tensor_bytes)
84
+ f_bin.write(tensor_bytes)
85
+
86
+ # 在 ONNX 模型中找到对应的 initializer 并修改它
87
+ if onnx_key in initializer_map:
88
+ tensor_proto = initializer_map[onnx_key]
89
+
90
+ tensor_proto.ClearField('raw_data')
91
+ tensor_proto.data_location = onnx.TensorProto.EXTERNAL
92
+ del tensor_proto.external_data[:]
93
+
94
+ keys_to_set = ["location", "offset", "length"]
95
+ values_to_set = [bin_filename, str(current_offset), str(tensor_length)]
96
+
97
+ for k, v in zip(keys_to_set, values_to_set):
98
+ entry = tensor_proto.external_data.add()
99
+ entry.key = k
100
+ entry.value = v
101
+
102
+ # 更新下一个权重的偏移量
103
+ current_offset += tensor_length
104
+
105
+ # 4. 保存修改后的 ONNX 模型
106
+ onnx.save(model, self.output_onnx_path)
genie_tts/Converter/v2/T2SConverter.py CHANGED
@@ -1,125 +1,125 @@
1
- import torch
2
- import onnx
3
- import numpy as np
4
- import json
5
- import os
6
- from collections import OrderedDict
7
-
8
- from ..load_state_dict import load_gpt_model
9
-
10
-
11
- class T2SModelConverter:
12
- """
13
- 一个专门的转换器,用于处理 t2s (Text-to-Speech) 模型。
14
- - PyTorch 模型: .ckpt 文件
15
- - ONNX 模型: t2s_stage_decoder_fp32.onnx
16
- - 遵循特定的键名映射规则。
17
- """
18
-
19
- def __init__(self,
20
- torch_ckpt_path: str,
21
- stage_decoder_onnx_path: str,
22
- first_stage_decoder_onnx_path: str,
23
- key_list_file: str,
24
- output_dir: str,
25
- cache_dir: str,
26
- ):
27
- self.torch_ckpt_path: str = torch_ckpt_path
28
- self.stage_decoder_onnx_path: str = stage_decoder_onnx_path
29
- self.first_stage_decoder_onnx_path: str = first_stage_decoder_onnx_path
30
- self.key_list_file: str = key_list_file
31
- self.output_dir: str = output_dir
32
- self.cache_dir: str = cache_dir
33
-
34
- os.makedirs(self.output_dir, exist_ok=True)
35
- os.makedirs(self.output_dir, exist_ok=True)
36
-
37
- # 定义输出文件路径
38
- self.fp16_bin_path: str = os.path.join(self.output_dir, "t2s_shared_fp16.bin")
39
- self.index_table_path: str = os.path.join(self.cache_dir, "t2s_weights_index_fp32.json")
40
- self.relinked_encoder_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx")
41
- self.relinked_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_stage_decoder_fp32.onnx")
42
- self.relinked_first_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_first_stage_decoder_fp32.onnx")
43
- self.reconstructed_fp32_bin_path = os.path.join(self.output_dir, "t2s_shared_fp32.bin")
44
-
45
- def step1_create_fp16_bin_with_key_mapping(self):
46
- """
47
- (1) 根据特定的键映射规则,从 .ckpt 创建 fp16 .bin 和 fp32 索引。
48
- (已根据用户验证脚本的正确逻辑进行最终修正)
49
- """
50
- if not os.path.exists(self.key_list_file):
51
- raise FileNotFoundError(
52
- f"Error: Stage 1 requires the key list file, but it was not found: {self.key_list_file}")
53
-
54
- with open(self.key_list_file, 'r') as f:
55
- onnx_keys = [line.strip() for line in f.readlines()]
56
-
57
- ckpt_data = load_gpt_model(self.torch_ckpt_path)
58
- if 'weight' not in ckpt_data:
59
- raise KeyError(
60
- f"❌ Error: 'weight' key not found in the .ckpt file. Top-level keys in the file are: {list(ckpt_data.keys())}")
61
-
62
- torch_state_dict = ckpt_data['weight']
63
-
64
- index_table = OrderedDict()
65
- current_fp32_offset = 0
66
-
67
- with open(self.fp16_bin_path, 'wb') as f_bin:
68
- for onnx_key in onnx_keys:
69
- transformed_onnx_key = onnx_key.replace('transformer_encoder', 'h')
70
- torch_lookup_key = f"model.{transformed_onnx_key}"
71
- torch_tensor = torch_state_dict.get(torch_lookup_key)
72
- numpy_array_fp16 = torch_tensor.to(torch.float16).cpu().numpy()
73
- f_bin.write(numpy_array_fp16.tobytes())
74
- tensor_length_fp32 = numpy_array_fp16.nbytes * 2
75
- index_table[onnx_key] = {'offset': current_fp32_offset, 'length': tensor_length_fp32}
76
- current_fp32_offset += tensor_length_fp32
77
-
78
- with open(self.index_table_path, 'w') as f_json:
79
- json.dump(index_table, f_json, indent=4) # type: ignore
80
-
81
- def step2_relink_onnx_for_fp32(self, old_model: str, new_model: str):
82
- """
83
- (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到未来的全精度 .bin。
84
- (使用与第一个脚本相同的、更稳定的底层方法)
85
- """
86
- if not os.path.exists(self.index_table_path):
87
- raise FileNotFoundError(
88
- f"Error: Stage 2 requires the index file, but it was not found: {self.index_table_path}")
89
-
90
- # 加载描述 fp32 布局的索引表
91
- with open(self.index_table_path, 'r') as f:
92
- index_table = json.load(f)
93
-
94
- model = onnx.load_model(old_model, load_external_data=False)
95
- reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
96
-
97
- for tensor in model.graph.initializer:
98
- if tensor.name in index_table:
99
- tensor.ClearField('raw_data')
100
- tensor.data_location = onnx.TensorProto.EXTERNAL
101
- info = index_table[tensor.name]
102
- del tensor.external_data[:]
103
- keys = ["location", "offset", "length"]
104
- values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
105
-
106
- for k, v in zip(keys, values):
107
- entry = tensor.external_data.add()
108
- entry.key = k
109
- entry.value = v
110
-
111
- onnx.save(model, new_model)
112
-
113
- @staticmethod
114
- def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str):
115
- """
116
- (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。
117
- """
118
- fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16)
119
- fp32_array = fp16_array.astype(np.float32)
120
- fp32_array.tofile(output_fp32_bin_path)
121
-
122
- def run_full_process(self):
123
- self.step1_create_fp16_bin_with_key_mapping()
124
- self.step2_relink_onnx_for_fp32(self.stage_decoder_onnx_path, self.relinked_stage_decoder_path)
125
- self.step2_relink_onnx_for_fp32(self.first_stage_decoder_onnx_path, self.relinked_first_stage_decoder_path)
 
1
+ import torch
2
+ import onnx
3
+ import numpy as np
4
+ import json
5
+ import os
6
+ from collections import OrderedDict
7
+
8
+ from ..load_state_dict import load_gpt_model
9
+
10
+
11
+ class T2SModelConverter:
12
+ """
13
+ 一个专门的转换器,用于处理 t2s (Text-to-Speech) 模型。
14
+ - PyTorch 模型: .ckpt 文件
15
+ - ONNX 模型: t2s_stage_decoder_fp32.onnx
16
+ - 遵循特定的键名映射规则。
17
+ """
18
+
19
+ def __init__(self,
20
+ torch_ckpt_path: str,
21
+ stage_decoder_onnx_path: str,
22
+ first_stage_decoder_onnx_path: str,
23
+ key_list_file: str,
24
+ output_dir: str,
25
+ cache_dir: str,
26
+ ):
27
+ self.torch_ckpt_path: str = torch_ckpt_path
28
+ self.stage_decoder_onnx_path: str = stage_decoder_onnx_path
29
+ self.first_stage_decoder_onnx_path: str = first_stage_decoder_onnx_path
30
+ self.key_list_file: str = key_list_file
31
+ self.output_dir: str = output_dir
32
+ self.cache_dir: str = cache_dir
33
+
34
+ os.makedirs(self.output_dir, exist_ok=True)
35
+ os.makedirs(self.output_dir, exist_ok=True)
36
+
37
+ # 定义输出文件路径
38
+ self.fp16_bin_path: str = os.path.join(self.output_dir, "t2s_shared_fp16.bin")
39
+ self.index_table_path: str = os.path.join(self.cache_dir, "t2s_weights_index_fp32.json")
40
+ self.relinked_encoder_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx")
41
+ self.relinked_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_stage_decoder_fp32.onnx")
42
+ self.relinked_first_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_first_stage_decoder_fp32.onnx")
43
+ self.reconstructed_fp32_bin_path = os.path.join(self.output_dir, "t2s_shared_fp32.bin")
44
+
45
+ def step1_create_fp16_bin_with_key_mapping(self):
46
+ """
47
+ (1) 根据特定的键映射规则,从 .ckpt 创建 fp16 .bin 和 fp32 索引。
48
+ (已根据用户验证脚本的正确逻辑进行最终修正)
49
+ """
50
+ if not os.path.exists(self.key_list_file):
51
+ raise FileNotFoundError(
52
+ f"Error: Stage 1 requires the key list file, but it was not found: {self.key_list_file}")
53
+
54
+ with open(self.key_list_file, 'r') as f:
55
+ onnx_keys = [line.strip() for line in f.readlines()]
56
+
57
+ ckpt_data = load_gpt_model(self.torch_ckpt_path)
58
+ if 'weight' not in ckpt_data:
59
+ raise KeyError(
60
+ f"❌ Error: 'weight' key not found in the .ckpt file. Top-level keys in the file are: {list(ckpt_data.keys())}")
61
+
62
+ torch_state_dict = ckpt_data['weight']
63
+
64
+ index_table = OrderedDict()
65
+ current_fp32_offset = 0
66
+
67
+ with open(self.fp16_bin_path, 'wb') as f_bin:
68
+ for onnx_key in onnx_keys:
69
+ transformed_onnx_key = onnx_key.replace('transformer_encoder', 'h')
70
+ torch_lookup_key = f"model.{transformed_onnx_key}"
71
+ torch_tensor = torch_state_dict.get(torch_lookup_key)
72
+ numpy_array_fp16 = torch_tensor.to(torch.float16).cpu().numpy()
73
+ f_bin.write(numpy_array_fp16.tobytes())
74
+ tensor_length_fp32 = numpy_array_fp16.nbytes * 2
75
+ index_table[onnx_key] = {'offset': current_fp32_offset, 'length': tensor_length_fp32}
76
+ current_fp32_offset += tensor_length_fp32
77
+
78
+ with open(self.index_table_path, 'w') as f_json:
79
+ json.dump(index_table, f_json, indent=4) # type: ignore
80
+
81
+ def step2_relink_onnx_for_fp32(self, old_model: str, new_model: str):
82
+ """
83
+ (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到未来的全精度 .bin。
84
+ (使用与第一个脚本相同的、更稳定的底层方法)
85
+ """
86
+ if not os.path.exists(self.index_table_path):
87
+ raise FileNotFoundError(
88
+ f"Error: Stage 2 requires the index file, but it was not found: {self.index_table_path}")
89
+
90
+ # 加载描述 fp32 布局的索引表
91
+ with open(self.index_table_path, 'r') as f:
92
+ index_table = json.load(f)
93
+
94
+ model = onnx.load_model(old_model, load_external_data=False)
95
+ reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
96
+
97
+ for tensor in model.graph.initializer:
98
+ if tensor.name in index_table:
99
+ tensor.ClearField('raw_data')
100
+ tensor.data_location = onnx.TensorProto.EXTERNAL
101
+ info = index_table[tensor.name]
102
+ del tensor.external_data[:]
103
+ keys = ["location", "offset", "length"]
104
+ values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
105
+
106
+ for k, v in zip(keys, values):
107
+ entry = tensor.external_data.add()
108
+ entry.key = k
109
+ entry.value = v
110
+
111
+ onnx.save(model, new_model)
112
+
113
+ @staticmethod
114
+ def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str):
115
+ """
116
+ (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。
117
+ """
118
+ fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16)
119
+ fp32_array = fp16_array.astype(np.float32)
120
+ fp32_array.tofile(output_fp32_bin_path)
121
+
122
+ def run_full_process(self):
123
+ self.step1_create_fp16_bin_with_key_mapping()
124
+ self.step2_relink_onnx_for_fp32(self.stage_decoder_onnx_path, self.relinked_stage_decoder_path)
125
+ self.step2_relink_onnx_for_fp32(self.first_stage_decoder_onnx_path, self.relinked_first_stage_decoder_path)
genie_tts/Converter/v2/VITSConverter.py CHANGED
@@ -1,129 +1,129 @@
1
- import torch
2
- import onnx
3
- import numpy as np
4
- import json
5
- import os
6
- from collections import OrderedDict
7
-
8
- from ..load_state_dict import load_sovits_model
9
-
10
-
11
- class VITSConverter:
12
- """
13
- 一个转换器,用于从 PyTorch 模型创建:
14
- 1. 一个用于分发的半精度 (fp16) .bin 权重文件。
15
- 2. 一个与全精度 (fp32) 布局兼容的 ONNX 模型。
16
- 3. 一个可以将 fp16 .bin 文件还原为 fp32 .bin 的工具函数。
17
- """
18
-
19
- def __init__(self,
20
- torch_pth_path: str,
21
- vits_onnx_path: str,
22
- key_list_file: str,
23
- output_dir: str,
24
- cache_dir: str,
25
- ):
26
- self.torch_pth_path: str = torch_pth_path
27
- self.vits_onnx_path: str = vits_onnx_path
28
- self.key_list_file: str = key_list_file
29
- self.output_dir: str = output_dir
30
- self.cache_dir: str = cache_dir
31
- # 定义输出文件路径
32
- self.fp16_bin_path: str = os.path.join(self.output_dir, "vits_fp16.bin")
33
- self.index_table_path: str = os.path.join(self.cache_dir, "vits_weights_index_fp32.json")
34
- self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "vits_fp32.onnx")
35
- self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "vits_fp32.bin")
36
-
37
- # 确保输出目录存在
38
- os.makedirs(self.cache_dir, exist_ok=True)
39
- os.makedirs(self.output_dir, exist_ok=True)
40
-
41
- if not os.path.exists(self.key_list_file):
42
- raise FileNotFoundError(f"Error: Key list file not found! Path: {self.key_list_file}")
43
-
44
- def step1_create_fp16_bin_and_fp32_index(self):
45
- """
46
- (1) 创建一个半精度 (fp16) 的 .bin 文件,但生成一个
47
- 描述全精度 (fp32) 布局的索引表。
48
- """
49
- # 加载 key 列表
50
- with open(self.key_list_file, 'r') as f:
51
- onnx_keys = [line.strip() for line in f.readlines()]
52
-
53
- # 加载 PyTorch 模型权重
54
- torch_state_dict = load_sovits_model(self.torch_pth_path)['weight']
55
-
56
- index_table = OrderedDict()
57
- current_fp32_offset = 0
58
-
59
- with open(self.fp16_bin_path, 'wb') as f_bin:
60
- for onnx_key in onnx_keys:
61
- torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key
62
-
63
- torch_tensor = torch_state_dict.get(torch_key)
64
- if torch_tensor is None:
65
- raise ValueError(f"❌ Critical error: Key '{torch_key}' not found in the PyTorch weights")
66
-
67
- # 转换为 fp16 并写入文件
68
- torch_tensor_fp16 = torch_tensor.to(torch.float16)
69
- numpy_array_fp16 = torch_tensor_fp16.cpu().numpy()
70
- tensor_bytes_fp16 = numpy_array_fp16.tobytes()
71
- f_bin.write(tensor_bytes_fp16)
72
- tensor_length_fp32 = len(tensor_bytes_fp16) * 2
73
- index_table[onnx_key] = {
74
- 'offset': current_fp32_offset,
75
- 'length': tensor_length_fp32
76
- }
77
- current_fp32_offset += tensor_length_fp32
78
-
79
- # 保存描述 fp32 布局的索引表
80
- with open(self.index_table_path, 'w') as f_json:
81
- json.dump(index_table, f_json, indent=4) # type: ignore
82
-
83
- def step2_relink_onnx_for_fp32(self):
84
- """
85
- (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到一个
86
- 未来的、全精度的 .bin 文件。
87
- """
88
- # 加载描述 fp32 布局的索引表
89
- with open(self.index_table_path, 'r') as f:
90
- index_table = json.load(f)
91
-
92
- model = onnx.load_model(self.vits_onnx_path, load_external_data=False)
93
- reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
94
-
95
- for tensor in model.graph.initializer:
96
- if tensor.name in index_table:
97
- tensor.ClearField('raw_data')
98
- tensor.data_location = onnx.TensorProto.EXTERNAL
99
- info = index_table[tensor.name]
100
-
101
- del tensor.external_data[:]
102
-
103
- keys = ["location", "offset", "length"]
104
- values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
105
-
106
- for k, v in zip(keys, values):
107
- entry = tensor.external_data.add()
108
- entry.key = k
109
- entry.value = v
110
-
111
- # 保存修改后的、链接到 fp32 权重的 ONNX 模型
112
- onnx.save(model, self.relinked_fp32_onnx_path)
113
-
114
- @staticmethod
115
- def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str):
116
- """
117
- (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。
118
-
119
- Args:
120
- fp16_bin_path (str): 输入的半精度 .bin 文件路径。
121
- output_fp32_bin_path (str): 输出的全精度 .bin 文件路径。
122
- """
123
- fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16)
124
- fp32_array = fp16_array.astype(np.float32)
125
- fp32_array.tofile(output_fp32_bin_path)
126
-
127
- def run_full_process(self):
128
- self.step1_create_fp16_bin_and_fp32_index()
129
- self.step2_relink_onnx_for_fp32()
 
1
+ import torch
2
+ import onnx
3
+ import numpy as np
4
+ import json
5
+ import os
6
+ from collections import OrderedDict
7
+
8
+ from ..load_state_dict import load_sovits_model
9
+
10
+
11
+ class VITSConverter:
12
+ """
13
+ 一个转换器,用于从 PyTorch 模型创建:
14
+ 1. 一个用于分发的半精度 (fp16) .bin 权重文件。
15
+ 2. 一个与全精度 (fp32) 布局兼容的 ONNX 模型。
16
+ 3. 一个可以将 fp16 .bin 文件还原为 fp32 .bin 的工具函数。
17
+ """
18
+
19
+ def __init__(self,
20
+ torch_pth_path: str,
21
+ vits_onnx_path: str,
22
+ key_list_file: str,
23
+ output_dir: str,
24
+ cache_dir: str,
25
+ ):
26
+ self.torch_pth_path: str = torch_pth_path
27
+ self.vits_onnx_path: str = vits_onnx_path
28
+ self.key_list_file: str = key_list_file
29
+ self.output_dir: str = output_dir
30
+ self.cache_dir: str = cache_dir
31
+ # 定义输出文件路径
32
+ self.fp16_bin_path: str = os.path.join(self.output_dir, "vits_fp16.bin")
33
+ self.index_table_path: str = os.path.join(self.cache_dir, "vits_weights_index_fp32.json")
34
+ self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "vits_fp32.onnx")
35
+ self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "vits_fp32.bin")
36
+
37
+ # 确保输出目录存在
38
+ os.makedirs(self.cache_dir, exist_ok=True)
39
+ os.makedirs(self.output_dir, exist_ok=True)
40
+
41
+ if not os.path.exists(self.key_list_file):
42
+ raise FileNotFoundError(f"Error: Key list file not found! Path: {self.key_list_file}")
43
+
44
+ def step1_create_fp16_bin_and_fp32_index(self):
45
+ """
46
+ (1) 创建一个半精度 (fp16) 的 .bin 文件,但生成一个
47
+ 描述全精度 (fp32) 布局的索引表。
48
+ """
49
+ # 加载 key 列表
50
+ with open(self.key_list_file, 'r') as f:
51
+ onnx_keys = [line.strip() for line in f.readlines()]
52
+
53
+ # 加载 PyTorch 模型权重
54
+ torch_state_dict = load_sovits_model(self.torch_pth_path)['weight']
55
+
56
+ index_table = OrderedDict()
57
+ current_fp32_offset = 0
58
+
59
+ with open(self.fp16_bin_path, 'wb') as f_bin:
60
+ for onnx_key in onnx_keys:
61
+ torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key
62
+
63
+ torch_tensor = torch_state_dict.get(torch_key)
64
+ if torch_tensor is None:
65
+ raise ValueError(f"❌ Critical error: Key '{torch_key}' not found in the PyTorch weights")
66
+
67
+ # 转换为 fp16 并写入文件
68
+ torch_tensor_fp16 = torch_tensor.to(torch.float16)
69
+ numpy_array_fp16 = torch_tensor_fp16.cpu().numpy()
70
+ tensor_bytes_fp16 = numpy_array_fp16.tobytes()
71
+ f_bin.write(tensor_bytes_fp16)
72
+ tensor_length_fp32 = len(tensor_bytes_fp16) * 2
73
+ index_table[onnx_key] = {
74
+ 'offset': current_fp32_offset,
75
+ 'length': tensor_length_fp32
76
+ }
77
+ current_fp32_offset += tensor_length_fp32
78
+
79
+ # 保存描述 fp32 布局的索引表
80
+ with open(self.index_table_path, 'w') as f_json:
81
+ json.dump(index_table, f_json, indent=4) # type: ignore
82
+
83
+ def step2_relink_onnx_for_fp32(self):
84
+ """
85
+ (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到一个
86
+ 未来的、全精度的 .bin 文件。
87
+ """
88
+ # 加载描述 fp32 布局的索引表
89
+ with open(self.index_table_path, 'r') as f:
90
+ index_table = json.load(f)
91
+
92
+ model = onnx.load_model(self.vits_onnx_path, load_external_data=False)
93
+ reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
94
+
95
+ for tensor in model.graph.initializer:
96
+ if tensor.name in index_table:
97
+ tensor.ClearField('raw_data')
98
+ tensor.data_location = onnx.TensorProto.EXTERNAL
99
+ info = index_table[tensor.name]
100
+
101
+ del tensor.external_data[:]
102
+
103
+ keys = ["location", "offset", "length"]
104
+ values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
105
+
106
+ for k, v in zip(keys, values):
107
+ entry = tensor.external_data.add()
108
+ entry.key = k
109
+ entry.value = v
110
+
111
+ # 保存修改后的、链接到 fp32 权重的 ONNX 模型
112
+ onnx.save(model, self.relinked_fp32_onnx_path)
113
+
114
+ @staticmethod
115
+ def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str):
116
+ """
117
+ (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。
118
+
119
+ Args:
120
+ fp16_bin_path (str): 输入的半精度 .bin 文件路径。
121
+ output_fp32_bin_path (str): 输出的全精度 .bin 文件路径。
122
+ """
123
+ fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16)
124
+ fp32_array = fp16_array.astype(np.float32)
125
+ fp32_array.tofile(output_fp32_bin_path)
126
+
127
+ def run_full_process(self):
128
+ self.step1_create_fp16_bin_and_fp32_index()
129
+ self.step2_relink_onnx_for_fp32()
genie_tts/Converter/v2/__pycache__/Converter.cpython-311.pyc ADDED
Binary file (7.94 kB). View file
 
genie_tts/Converter/v2/__pycache__/EncoderConverter.cpython-311.pyc ADDED
Binary file (5.57 kB). View file
 
genie_tts/Converter/v2/__pycache__/T2SConverter.cpython-311.pyc ADDED
Binary file (9.11 kB). View file
 
genie_tts/Converter/v2/__pycache__/VITSConverter.cpython-311.pyc ADDED
Binary file (8.21 kB). View file
 
genie_tts/Converter/v2/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
genie_tts/Converter/v2ProPlus/Converter.py CHANGED
@@ -1,89 +1,89 @@
1
- import logging
2
- import traceback
3
- import os
4
- import contextlib
5
- import importlib.resources
6
-
7
- from ...Utils.Constants import PACKAGE_NAME
8
- from ..v2.VITSConverter import VITSConverter
9
- from ..v2.T2SConverter import T2SModelConverter
10
- from ..v2.EncoderConverter import EncoderConverter
11
- from ..v2.Converter import (ENCODER_RESOURCE_PATH, STAGE_DECODER_RESOURCE_PATH,
12
- FIRST_STAGE_DECODER_RESOURCE_PATH, T2S_KEYS_RESOURCE_PATH, CACHE_DIR, remove_folder)
13
- from .PromptEncoderConverter import PromptEncoderConverter
14
-
15
- logger = logging.getLogger()
16
-
17
- # 使用 V2 ProPlus 的文件。
18
- VITS_RESOURCE_PATH = "Data/v2ProPlus/Models/vits_fp32.onnx"
19
- PROMPT_ENCODER_RESOURCE_PATH = "Data/v2ProPlus/Models/prompt_encoder_fp32.onnx"
20
- VITS_KEYS_RESOURCE_PATH = "Data/v2ProPlus/Keys/vits_weights.txt"
21
- PROMPT_ENCODER_KEYS_RESOURCE_PATH = "Data/v2ProPlus/Keys/prompt_encoder_weights.txt"
22
-
23
-
24
- def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None:
25
- # 确保缓存和输出目录存在
26
- os.makedirs(CACHE_DIR, exist_ok=True)
27
- os.makedirs(output_dir, exist_ok=True)
28
-
29
- if len(os.listdir(output_dir)) > 0:
30
- logger.warning(f"The output directory {output_dir} is not empty!")
31
-
32
- with contextlib.ExitStack() as stack:
33
- files = importlib.resources.files(PACKAGE_NAME)
34
-
35
- def enter(p: str) -> str:
36
- return str(stack.enter_context(importlib.resources.as_file(files.joinpath(p))))
37
-
38
- encoder_onnx_path = enter(ENCODER_RESOURCE_PATH)
39
- stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH)
40
- first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH)
41
- vits_onnx_path = enter(VITS_RESOURCE_PATH)
42
- t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH)
43
- vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH)
44
- prompt_encoder_path = enter(PROMPT_ENCODER_RESOURCE_PATH)
45
- prompt_encoder_keys_path = enter(PROMPT_ENCODER_KEYS_RESOURCE_PATH)
46
-
47
- converter_1 = T2SModelConverter(
48
- torch_ckpt_path=torch_ckpt_path,
49
- stage_decoder_onnx_path=stage_decoder_path,
50
- first_stage_decoder_onnx_path=first_stage_decoder_path,
51
- key_list_file=t2s_keys_path,
52
- output_dir=output_dir,
53
- cache_dir=CACHE_DIR,
54
- )
55
- converter_2 = VITSConverter(
56
- torch_pth_path=torch_pth_path,
57
- vits_onnx_path=vits_onnx_path,
58
- key_list_file=vits_keys_path,
59
- output_dir=output_dir,
60
- cache_dir=CACHE_DIR,
61
- )
62
- converter_3 = EncoderConverter(
63
- ckpt_path=torch_ckpt_path,
64
- pth_path=torch_pth_path,
65
- onnx_input_path=encoder_onnx_path,
66
- output_dir=output_dir,
67
- )
68
- converter_4 = PromptEncoderConverter(
69
- torch_pth_path=torch_pth_path,
70
- prompt_encoder_onnx_path=prompt_encoder_path,
71
- key_list_file=prompt_encoder_keys_path,
72
- output_dir=output_dir,
73
- cache_dir=CACHE_DIR,
74
- )
75
-
76
- try:
77
- converter_1.run_full_process()
78
- converter_2.run_full_process()
79
- converter_3.run_full_process()
80
- converter_4.run_full_process()
81
- logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n"
82
- f"- Model Type: V2ProPlus")
83
- except Exception:
84
- logger.error(f"❌ A critical error occurred during the conversion process")
85
- logger.error(traceback.format_exc())
86
- remove_folder(output_dir) # 只在失败时清理输出目录
87
- finally:
88
- # 无论成功还是失败,都尝试清理缓存目录
89
- remove_folder(CACHE_DIR)
 
1
+ import logging
2
+ import traceback
3
+ import os
4
+ import contextlib
5
+ import importlib.resources
6
+
7
+ from ...Utils.Constants import PACKAGE_NAME
8
+ from ..v2.VITSConverter import VITSConverter
9
+ from ..v2.T2SConverter import T2SModelConverter
10
+ from ..v2.EncoderConverter import EncoderConverter
11
+ from ..v2.Converter import (ENCODER_RESOURCE_PATH, STAGE_DECODER_RESOURCE_PATH,
12
+ FIRST_STAGE_DECODER_RESOURCE_PATH, T2S_KEYS_RESOURCE_PATH, CACHE_DIR, remove_folder)
13
+ from .PromptEncoderConverter import PromptEncoderConverter
14
+
15
+ logger = logging.getLogger()
16
+
17
+ # 使用 V2 ProPlus 的文件。
18
+ VITS_RESOURCE_PATH = "Data/v2ProPlus/Models/vits_fp32.onnx"
19
+ PROMPT_ENCODER_RESOURCE_PATH = "Data/v2ProPlus/Models/prompt_encoder_fp32.onnx"
20
+ VITS_KEYS_RESOURCE_PATH = "./Data/v2ProPlus/Keys/vits_weights.txt"
21
+ PROMPT_ENCODER_KEYS_RESOURCE_PATH = "./Data/v2ProPlus/Keys/prompt_encoder_weights.txt"
22
+
23
+
24
+ def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None:
25
+ # 确保缓存和输出目录存在
26
+ os.makedirs(CACHE_DIR, exist_ok=True)
27
+ os.makedirs(output_dir, exist_ok=True)
28
+
29
+ if len(os.listdir(output_dir)) > 0:
30
+ logger.warning(f"The output directory {output_dir} is not empty!")
31
+
32
+ with contextlib.ExitStack() as stack:
33
+ files = importlib.resources.files(PACKAGE_NAME)
34
+
35
+ def enter(p: str) -> str:
36
+ return str(stack.enter_context(importlib.resources.as_file(files.joinpath(p))))
37
+
38
+ encoder_onnx_path = enter(ENCODER_RESOURCE_PATH)
39
+ stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH)
40
+ first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH)
41
+ vits_onnx_path = enter(VITS_RESOURCE_PATH)
42
+ t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH)
43
+ vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH)
44
+ prompt_encoder_path = enter(PROMPT_ENCODER_RESOURCE_PATH)
45
+ prompt_encoder_keys_path = enter(PROMPT_ENCODER_KEYS_RESOURCE_PATH)
46
+
47
+ converter_1 = T2SModelConverter(
48
+ torch_ckpt_path=torch_ckpt_path,
49
+ stage_decoder_onnx_path=stage_decoder_path,
50
+ first_stage_decoder_onnx_path=first_stage_decoder_path,
51
+ key_list_file=t2s_keys_path,
52
+ output_dir=output_dir,
53
+ cache_dir=CACHE_DIR,
54
+ )
55
+ converter_2 = VITSConverter(
56
+ torch_pth_path=torch_pth_path,
57
+ vits_onnx_path=vits_onnx_path,
58
+ key_list_file=vits_keys_path,
59
+ output_dir=output_dir,
60
+ cache_dir=CACHE_DIR,
61
+ )
62
+ converter_3 = EncoderConverter(
63
+ ckpt_path=torch_ckpt_path,
64
+ pth_path=torch_pth_path,
65
+ onnx_input_path=encoder_onnx_path,
66
+ output_dir=output_dir,
67
+ )
68
+ converter_4 = PromptEncoderConverter(
69
+ torch_pth_path=torch_pth_path,
70
+ prompt_encoder_onnx_path=prompt_encoder_path,
71
+ key_list_file=prompt_encoder_keys_path,
72
+ output_dir=output_dir,
73
+ cache_dir=CACHE_DIR,
74
+ )
75
+
76
+ try:
77
+ converter_1.run_full_process()
78
+ converter_2.run_full_process()
79
+ converter_3.run_full_process()
80
+ converter_4.run_full_process()
81
+ logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n"
82
+ f"- Model Type: V2ProPlus")
83
+ except Exception:
84
+ logger.error(f"❌ A critical error occurred during the conversion process")
85
+ logger.error(traceback.format_exc())
86
+ remove_folder(output_dir) # Clean up the output directory only on failure
87
+ finally:
88
+ # Try to clean up the cache directory whether the run succeeded or failed
89
+ remove_folder(CACHE_DIR)
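For orientation, a minimal usage sketch of the V2ProPlus convert() entry point defined above. The checkpoint, weight, and output paths are hypothetical placeholders, and the import path assumes the package layout used in this repository.

# Sketch only: run the four-stage V2ProPlus conversion (hypothetical paths).
from genie_tts.Converter.v2ProPlus.Converter import convert

convert(
    torch_ckpt_path="models/my_speaker-e15.ckpt",     # hypothetical T2S (GPT) checkpoint
    torch_pth_path="models/my_speaker_e8_s248.pth",   # hypothetical SoVITS weights
    output_dir="converted/my_speaker",                # created if missing; deleted on failure
)

On success the output directory holds the relinked ONNX graphs and fp16 weight binaries; on failure it is removed, and the cache directory is cleaned up in either case.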
genie_tts/Converter/v2ProPlus/PromptEncoderConverter.py CHANGED
@@ -1,128 +1,128 @@
1
- import torch
2
- import onnx
3
- import json
4
- import os
5
- from collections import OrderedDict
6
-
7
- from ..load_state_dict import load_sovits_model
8
-
9
-
10
- class PromptEncoderConverter:
11
- """
12
- A converter that creates, from a PyTorch model:
13
- 1. A half-precision (fp16) .bin weight file for distribution.
14
- 2. An ONNX model compatible with the full-precision (fp32) layout.
15
- 3. A utility function that can restore the fp16 .bin file to an fp32 .bin.
16
- """
17
-
18
- def __init__(self,
19
- torch_pth_path: str,
20
- prompt_encoder_onnx_path: str,
21
- key_list_file: str,
22
- output_dir: str,
23
- cache_dir: str,
24
- ):
25
- self.torch_pth_path: str = torch_pth_path
26
- self.vits_onnx_path: str = prompt_encoder_onnx_path
27
- self.key_list_file: str = key_list_file
28
- self.output_dir: str = output_dir
29
- self.cache_dir: str = cache_dir
30
- # Define the output file paths
31
- self.fp16_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp16.bin")
32
- self.index_table_path: str = os.path.join(self.cache_dir, "prompt_encoder_weights_index_fp32.json")
33
- self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.onnx")
34
- self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.bin")
35
-
36
- # Make sure the output directories exist
37
- os.makedirs(self.cache_dir, exist_ok=True)
38
- os.makedirs(self.output_dir, exist_ok=True)
39
-
40
- if not os.path.exists(self.key_list_file):
41
- raise FileNotFoundError(f"错误: Key 列表文件未找到! 路径: {self.key_list_file}")
42
-
43
- def step1_create_fp16_bin_and_fp32_index(self):
44
- """
45
- (1) Create a half-precision (fp16) .bin file, but generate an index table
46
- that describes the full-precision (fp32) layout.
47
- """
48
- # Load the key list
49
- with open(self.key_list_file, 'r') as f:
50
- onnx_keys = [line.strip() for line in f.readlines()]
51
-
52
- # Load the PyTorch model weights
53
- torch_state_dict = load_sovits_model(self.torch_pth_path)['weight']
54
-
55
- index_table = OrderedDict()
56
- # This offset is accumulated according to fp32 sizes
57
- current_fp32_offset = 0
58
-
59
- with open(self.fp16_bin_path, 'wb') as f_bin:
60
- for onnx_key in onnx_keys:
61
- torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key
62
-
63
- torch_tensor = torch_state_dict.get(torch_key)
64
- if torch_tensor is None:
65
- raise ValueError(f"❌ 严重错误: 在 PyTorch 权重中找不到 Key '{torch_key}'")
66
-
67
- # Convert to fp16 and write it to the file
68
- torch_tensor_fp16 = torch_tensor.to(torch.float16)
69
- numpy_array_fp16 = torch_tensor_fp16.cpu().numpy()
70
- tensor_bytes_fp16 = numpy_array_fp16.tobytes()
71
- f_bin.write(tensor_bytes_fp16)
72
-
73
- # Key step: compute and record the fp32 length and offset.
74
- # One fp32 = 4 bytes and one fp16 = 2 bytes, so the fp32 length is twice the fp16 length.
75
- tensor_length_fp32 = len(tensor_bytes_fp16) * 2
76
-
77
- index_table[onnx_key] = {
78
- 'offset': current_fp32_offset,
79
- 'length': tensor_length_fp32
80
- }
81
-
82
- # The offset is likewise accumulated using fp32 lengths
83
- current_fp32_offset += tensor_length_fp32
84
-
85
- # Save the index table describing the fp32 layout
86
- with open(self.index_table_path, 'w') as f_json:
87
- json.dump(index_table, f_json, indent=4) # type: ignore
88
-
89
- def step2_relink_onnx_for_fp32(self):
90
- """
91
- (2) Based on the fp32 index table, modify the ONNX model so that it links to a
92
- future, full-precision .bin file.
93
- """
94
- # Load the index table describing the fp32 layout
95
- with open(self.index_table_path, 'r') as f:
96
- index_table = json.load(f)
97
-
98
- # Load the ONNX model structure
99
- model = onnx.load_model(self.vits_onnx_path, load_external_data=False)
100
-
101
- # The name of the .bin file this ONNX model will link to
102
- reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
103
-
104
- for tensor in model.graph.initializer:
105
- if tensor.name in index_table:
106
- tensor.ClearField('raw_data')
107
- tensor.data_location = onnx.TensorProto.EXTERNAL
108
- info = index_table[tensor.name]
109
-
110
- del tensor.external_data[:]
111
-
112
- keys = ["location", "offset", "length"]
113
- values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
114
-
115
- for k, v in zip(keys, values):
116
- entry = tensor.external_data.add()
117
- entry.key = k
118
- entry.value = v
119
-
120
- # Save the modified ONNX model, which now links to the fp32 weights
121
- onnx.save(model, self.relinked_fp32_onnx_path)
122
-
123
- def run_full_process(self):
124
- """
125
- Run the core conversion steps (1 and 2) in order.
126
- """
127
- self.step1_create_fp16_bin_and_fp32_index()
128
- self.step2_relink_onnx_for_fp32()
 
1
+ import torch
2
+ import onnx
3
+ import json
4
+ import os
5
+ from collections import OrderedDict
6
+
7
+ from ..load_state_dict import load_sovits_model
8
+
9
+
10
+ class PromptEncoderConverter:
11
+ """
12
+ A converter that creates, from a PyTorch model:
13
+ 1. A half-precision (fp16) .bin weight file for distribution.
14
+ 2. An ONNX model compatible with the full-precision (fp32) layout.
15
+ 3. A utility function that can restore the fp16 .bin file to an fp32 .bin.
16
+ """
17
+
18
+ def __init__(self,
19
+ torch_pth_path: str,
20
+ prompt_encoder_onnx_path: str,
21
+ key_list_file: str,
22
+ output_dir: str,
23
+ cache_dir: str,
24
+ ):
25
+ self.torch_pth_path: str = torch_pth_path
26
+ self.vits_onnx_path: str = prompt_encoder_onnx_path
27
+ self.key_list_file: str = key_list_file
28
+ self.output_dir: str = output_dir
29
+ self.cache_dir: str = cache_dir
30
+ # Define the output file paths
31
+ self.fp16_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp16.bin")
32
+ self.index_table_path: str = os.path.join(self.cache_dir, "prompt_encoder_weights_index_fp32.json")
33
+ self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.onnx")
34
+ self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.bin")
35
+
36
+ # Make sure the output directories exist
37
+ os.makedirs(self.cache_dir, exist_ok=True)
38
+ os.makedirs(self.output_dir, exist_ok=True)
39
+
40
+ if not os.path.exists(self.key_list_file):
41
+ raise FileNotFoundError(f"错误: Key 列表文件未找到! 路径: {self.key_list_file}")
42
+
43
+ def step1_create_fp16_bin_and_fp32_index(self):
44
+ """
45
+ (1) Create a half-precision (fp16) .bin file, but generate an index table
46
+ that describes the full-precision (fp32) layout.
47
+ """
48
+ # Load the key list
49
+ with open(self.key_list_file, 'r') as f:
50
+ onnx_keys = [line.strip() for line in f.readlines()]
51
+
52
+ # Load the PyTorch model weights
53
+ torch_state_dict = load_sovits_model(self.torch_pth_path)['weight']
54
+
55
+ index_table = OrderedDict()
56
+ # This offset is accumulated according to fp32 sizes
57
+ current_fp32_offset = 0
58
+
59
+ with open(self.fp16_bin_path, 'wb') as f_bin:
60
+ for onnx_key in onnx_keys:
61
+ torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key
62
+
63
+ torch_tensor = torch_state_dict.get(torch_key)
64
+ if torch_tensor is None:
65
+ raise ValueError(f"❌ 严重错误: 在 PyTorch 权重中找不到 Key '{torch_key}'")
66
+
67
+ # Convert to fp16 and write it to the file
68
+ torch_tensor_fp16 = torch_tensor.to(torch.float16)
69
+ numpy_array_fp16 = torch_tensor_fp16.cpu().numpy()
70
+ tensor_bytes_fp16 = numpy_array_fp16.tobytes()
71
+ f_bin.write(tensor_bytes_fp16)
72
+
73
+ # Key step: compute and record the fp32 length and offset.
74
+ # One fp32 = 4 bytes and one fp16 = 2 bytes, so the fp32 length is twice the fp16 length.
75
+ tensor_length_fp32 = len(tensor_bytes_fp16) * 2
76
+
77
+ index_table[onnx_key] = {
78
+ 'offset': current_fp32_offset,
79
+ 'length': tensor_length_fp32
80
+ }
81
+
82
+ # The offset is likewise accumulated using fp32 lengths
83
+ current_fp32_offset += tensor_length_fp32
84
+
85
+ # Save the index table describing the fp32 layout
86
+ with open(self.index_table_path, 'w') as f_json:
87
+ json.dump(index_table, f_json, indent=4) # type: ignore
88
+
89
+ def step2_relink_onnx_for_fp32(self):
90
+ """
91
+ (2) Based on the fp32 index table, modify the ONNX model so that it links to a
92
+ future, full-precision .bin file.
93
+ """
94
+ # Load the index table describing the fp32 layout
95
+ with open(self.index_table_path, 'r') as f:
96
+ index_table = json.load(f)
97
+
98
+ # Load the ONNX model structure
99
+ model = onnx.load_model(self.vits_onnx_path, load_external_data=False)
100
+
101
+ # The name of the .bin file this ONNX model will link to
102
+ reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
103
+
104
+ for tensor in model.graph.initializer:
105
+ if tensor.name in index_table:
106
+ tensor.ClearField('raw_data')
107
+ tensor.data_location = onnx.TensorProto.EXTERNAL
108
+ info = index_table[tensor.name]
109
+
110
+ del tensor.external_data[:]
111
+
112
+ keys = ["location", "offset", "length"]
113
+ values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
114
+
115
+ for k, v in zip(keys, values):
116
+ entry = tensor.external_data.add()
117
+ entry.key = k
118
+ entry.value = v
119
+
120
+ # Save the modified ONNX model, which now links to the fp32 weights
121
+ onnx.save(model, self.relinked_fp32_onnx_path)
122
+
123
+ def run_full_process(self):
124
+ """
125
+ Run the core conversion steps (1 and 2) in order.
126
+ """
127
+ self.step1_create_fp16_bin_and_fp32_index()
128
+ self.step2_relink_onnx_for_fp32()
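The converter above ships fp16 bytes but indexes them with fp32 offsets and lengths, so the fp32 .bin that prompt_encoder_fp32.onnx links to can be rebuilt by upcasting the shipped stream. A minimal sketch of that restore step, assuming the file names defined above; this illustrates the idea and is not the project's actual restore utility.

import numpy as np

def restore_fp32_bin(fp16_bin_path: str, fp32_bin_path: str) -> None:
    # Tensors were written back to back as fp16, and the index table records the
    # same layout at fp32 granularity (every offset and length exactly doubled),
    # so upcasting the whole stream reproduces the byte layout the ONNX expects.
    fp16_stream = np.fromfile(fp16_bin_path, dtype=np.float16)
    fp16_stream.astype(np.float32).tofile(fp32_bin_path)

restore_fp32_bin("prompt_encoder_fp16.bin", "prompt_encoder_fp32.bin")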
genie_tts/Converter/v2ProPlus/__pycache__/Converter.cpython-311.pyc ADDED
Binary file (5.34 kB). View file
 
genie_tts/Converter/v2ProPlus/__pycache__/PromptEncoderConverter.cpython-311.pyc ADDED
Binary file (7.51 kB). View file
 
genie_tts/Core/Resources.py CHANGED
@@ -1,76 +1,76 @@
1
- import os
2
- from huggingface_hub import snapshot_download
3
-
4
-
5
- def download_genie_data() -> None:
6
- print(f"🚀 Starting download Genie-TTS resources… This may take a few moments. ⏳")
7
- snapshot_download(
8
- repo_id="High-Logic/Genie",
9
- repo_type="model",
10
- allow_patterns="GenieData/*",
11
- local_dir=".",
12
- local_dir_use_symlinks=True, # use symlinks
13
- )
14
- print("✅ Genie-TTS resources downloaded successfully.")
15
-
16
-
17
- def ensure_exists(path: str, name: str):
18
- if not os.path.exists(path):
19
- raise FileNotFoundError(
20
- f"Required directory or file '{name}' was not found at: {path}\n"
21
- f"Please download the pretrained models and place them under './GenieData', "
22
- f"or set the environment variable GENIE_DATA_DIR to the correct directory."
23
- )
24
-
25
-
26
- """
27
- The file structure is kept in sync with the Midori project.
28
- """
29
-
30
- GENIE_DATA_DIR: str = os.getenv(
31
- "GENIE_DATA_DIR",
32
- "./GenieData"
33
- )
34
-
35
- """
36
- Japanese_G2P_DIR: str = os.getenv(
37
- "Japanese_G2P_DIR",
38
- f"{GENIE_DATA_DIR}/G2P/JapaneseG2P"
39
- )
40
- """
41
-
42
- English_G2P_DIR: str = os.getenv(
43
- "English_G2P_DIR",
44
- f"{GENIE_DATA_DIR}/G2P/EnglishG2P"
45
- )
46
-
47
- Chinese_G2P_DIR: str = os.getenv(
48
- "Chinese_G2P_DIR",
49
- f"{GENIE_DATA_DIR}/G2P/ChineseG2P"
50
- )
51
-
52
- HUBERT_MODEL_DIR: str = os.getenv(
53
- "HUBERT_MODEL_DIR",
54
- f"{GENIE_DATA_DIR}/chinese-hubert-base"
55
- )
56
-
57
- SV_MODEL: str = os.getenv(
58
- "SV_MODEL",
59
- f"{GENIE_DATA_DIR}/speaker_encoder.onnx"
60
- )
61
-
62
- ROBERTA_MODEL_DIR: str = os.getenv(
63
- "ROBERTA_MODEL_DIR",
64
- f"{GENIE_DATA_DIR}/RoBERTa"
65
- )
66
-
67
- if not os.path.exists(GENIE_DATA_DIR):
68
- print("⚠️ GenieData folder not found.")
69
- choice = input("Would you like to download it automatically from HuggingFace? (y/N): ").strip().lower()
70
- if choice == "y":
71
- download_genie_data()
72
-
73
- # ---- Run directory checks ----
74
- ensure_exists(HUBERT_MODEL_DIR, "HUBERT_MODEL_DIR")
75
- ensure_exists(SV_MODEL, "SV_MODEL")
76
- # ensure_exists(ROBERTA_MODEL_DIR, "ROBERTA_MODEL_DIR")
 
1
+ import os
2
+ from huggingface_hub import snapshot_download
3
+
4
+
5
+ def download_genie_data() -> None:
6
+ print(f"🚀 Starting download Genie-TTS resources… This may take a few moments. ⏳")
7
+ snapshot_download(
8
+ repo_id="High-Logic/Genie",
9
+ repo_type="model",
10
+ allow_patterns="GenieData/*",
11
+ local_dir=".",
12
+ local_dir_use_symlinks=True, # use symlinks
13
+ )
14
+ print("✅ Genie-TTS resources downloaded successfully.")
15
+
16
+
17
+ def ensure_exists(path: str, name: str):
18
+ if not os.path.exists(path):
19
+ raise FileNotFoundError(
20
+ f"Required directory or file '{name}' was not found at: {path}\n"
21
+ f"Please download the pretrained models and place them under './GenieData', "
22
+ f"or set the environment variable GENIE_DATA_DIR to the correct directory."
23
+ )
24
+
25
+
26
+ """
27
+ The file structure is kept in sync with the Midori project.
28
+ """
29
+
30
+ GENIE_DATA_DIR: str = os.getenv(
31
+ "GENIE_DATA_DIR",
32
+ "./GenieData"
33
+ )
34
+
35
+ """
36
+ Japanese_G2P_DIR: str = os.getenv(
37
+ "Japanese_G2P_DIR",
38
+ f"{GENIE_DATA_DIR}/G2P/JapaneseG2P"
39
+ )
40
+ """
41
+
42
+ English_G2P_DIR: str = os.getenv(
43
+ "English_G2P_DIR",
44
+ f"{GENIE_DATA_DIR}/G2P/EnglishG2P"
45
+ )
46
+
47
+ Chinese_G2P_DIR: str = os.getenv(
48
+ "Chinese_G2P_DIR",
49
+ f"{GENIE_DATA_DIR}/G2P/ChineseG2P"
50
+ )
51
+
52
+ HUBERT_MODEL_DIR: str = os.getenv(
53
+ "HUBERT_MODEL_DIR",
54
+ f"{GENIE_DATA_DIR}/chinese-hubert-base"
55
+ )
56
+
57
+ SV_MODEL: str = os.getenv(
58
+ "SV_MODEL",
59
+ f"{GENIE_DATA_DIR}/speaker_encoder.onnx"
60
+ )
61
+
62
+ ROBERTA_MODEL_DIR: str = os.getenv(
63
+ "ROBERTA_MODEL_DIR",
64
+ f"{GENIE_DATA_DIR}/RoBERTa"
65
+ )
66
+
67
+ if not os.path.exists(GENIE_DATA_DIR):
68
+ print("⚠️ GenieData folder not found.")
69
+ choice = input("Would you like to download it automatically from HuggingFace? (y/N): ").strip().lower()
70
+ if choice == "y":
71
+ download_genie_data()
72
+
73
+ # ---- Run directory checks ----
74
+ ensure_exists(HUBERT_MODEL_DIR, "HUBERT_MODEL_DIR")
75
+ ensure_exists(SV_MODEL, "SV_MODEL")
76
+ # ensure_exists(ROBERTA_MODEL_DIR, "ROBERTA_MODEL_DIR")
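A small sketch of how the environment-variable overrides above can be used. The directory path is hypothetical, and the variable must be set before this module is imported, because the lookups and existence checks run at import time.

import os

# Hypothetical data location; set it before importing genie_tts.Core.Resources,
# since GENIE_DATA_DIR is read and the directory checks run at module import time.
os.environ["GENIE_DATA_DIR"] = "/data/genie"

from genie_tts.Core import Resources  # noqa: E402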
genie_tts/Core/__pycache__/Inference.cpython-311.pyc ADDED
Binary file (4.79 kB). View file
 
genie_tts/Core/__pycache__/Resources.cpython-311.pyc ADDED
Binary file (2.89 kB). View file
 
genie_tts/Core/__pycache__/TTSPlayer.cpython-311.pyc ADDED
Binary file (15 kB). View file
 
genie_tts/Core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes). View file
 
genie_tts/Data/v2/Keys/t2s_onnx_keys.txt CHANGED
@@ -1,291 +1,291 @@
1
- ar_audio_embedding.word_embeddings.weight
2
- ar_audio_position.alpha
3
- transformer_encoder.layers.0.self_attn.in_proj_weight
4
- transformer_encoder.layers.0.self_attn.in_proj_bias
5
- transformer_encoder.layers.0.self_attn.out_proj.weight
6
- transformer_encoder.layers.0.self_attn.out_proj.bias
7
- transformer_encoder.layers.0.linear1.weight
8
- transformer_encoder.layers.0.linear1.bias
9
- transformer_encoder.layers.0.linear2.weight
10
- transformer_encoder.layers.0.linear2.bias
11
- transformer_encoder.layers.0.norm1.weight
12
- transformer_encoder.layers.0.norm1.bias
13
- transformer_encoder.layers.0.norm2.weight
14
- transformer_encoder.layers.0.norm2.bias
15
- transformer_encoder.layers.1.self_attn.in_proj_weight
16
- transformer_encoder.layers.1.self_attn.in_proj_bias
17
- transformer_encoder.layers.1.self_attn.out_proj.weight
18
- transformer_encoder.layers.1.self_attn.out_proj.bias
19
- transformer_encoder.layers.1.linear1.weight
20
- transformer_encoder.layers.1.linear1.bias
21
- transformer_encoder.layers.1.linear2.weight
22
- transformer_encoder.layers.1.linear2.bias
23
- transformer_encoder.layers.1.norm1.weight
24
- transformer_encoder.layers.1.norm1.bias
25
- transformer_encoder.layers.1.norm2.weight
26
- transformer_encoder.layers.1.norm2.bias
27
- transformer_encoder.layers.2.self_attn.in_proj_weight
28
- transformer_encoder.layers.2.self_attn.in_proj_bias
29
- transformer_encoder.layers.2.self_attn.out_proj.weight
30
- transformer_encoder.layers.2.self_attn.out_proj.bias
31
- transformer_encoder.layers.2.linear1.weight
32
- transformer_encoder.layers.2.linear1.bias
33
- transformer_encoder.layers.2.linear2.weight
34
- transformer_encoder.layers.2.linear2.bias
35
- transformer_encoder.layers.2.norm1.weight
36
- transformer_encoder.layers.2.norm1.bias
37
- transformer_encoder.layers.2.norm2.weight
38
- transformer_encoder.layers.2.norm2.bias
39
- transformer_encoder.layers.3.self_attn.in_proj_weight
40
- transformer_encoder.layers.3.self_attn.in_proj_bias
41
- transformer_encoder.layers.3.self_attn.out_proj.weight
42
- transformer_encoder.layers.3.self_attn.out_proj.bias
43
- transformer_encoder.layers.3.linear1.weight
44
- transformer_encoder.layers.3.linear1.bias
45
- transformer_encoder.layers.3.linear2.weight
46
- transformer_encoder.layers.3.linear2.bias
47
- transformer_encoder.layers.3.norm1.weight
48
- transformer_encoder.layers.3.norm1.bias
49
- transformer_encoder.layers.3.norm2.weight
50
- transformer_encoder.layers.3.norm2.bias
51
- transformer_encoder.layers.4.self_attn.in_proj_weight
52
- transformer_encoder.layers.4.self_attn.in_proj_bias
53
- transformer_encoder.layers.4.self_attn.out_proj.weight
54
- transformer_encoder.layers.4.self_attn.out_proj.bias
55
- transformer_encoder.layers.4.linear1.weight
56
- transformer_encoder.layers.4.linear1.bias
57
- transformer_encoder.layers.4.linear2.weight
58
- transformer_encoder.layers.4.linear2.bias
59
- transformer_encoder.layers.4.norm1.weight
60
- transformer_encoder.layers.4.norm1.bias
61
- transformer_encoder.layers.4.norm2.weight
62
- transformer_encoder.layers.4.norm2.bias
63
- transformer_encoder.layers.5.self_attn.in_proj_weight
64
- transformer_encoder.layers.5.self_attn.in_proj_bias
65
- transformer_encoder.layers.5.self_attn.out_proj.weight
66
- transformer_encoder.layers.5.self_attn.out_proj.bias
67
- transformer_encoder.layers.5.linear1.weight
68
- transformer_encoder.layers.5.linear1.bias
69
- transformer_encoder.layers.5.linear2.weight
70
- transformer_encoder.layers.5.linear2.bias
71
- transformer_encoder.layers.5.norm1.weight
72
- transformer_encoder.layers.5.norm1.bias
73
- transformer_encoder.layers.5.norm2.weight
74
- transformer_encoder.layers.5.norm2.bias
75
- transformer_encoder.layers.6.self_attn.in_proj_weight
76
- transformer_encoder.layers.6.self_attn.in_proj_bias
77
- transformer_encoder.layers.6.self_attn.out_proj.weight
78
- transformer_encoder.layers.6.self_attn.out_proj.bias
79
- transformer_encoder.layers.6.linear1.weight
80
- transformer_encoder.layers.6.linear1.bias
81
- transformer_encoder.layers.6.linear2.weight
82
- transformer_encoder.layers.6.linear2.bias
83
- transformer_encoder.layers.6.norm1.weight
84
- transformer_encoder.layers.6.norm1.bias
85
- transformer_encoder.layers.6.norm2.weight
86
- transformer_encoder.layers.6.norm2.bias
87
- transformer_encoder.layers.7.self_attn.in_proj_weight
88
- transformer_encoder.layers.7.self_attn.in_proj_bias
89
- transformer_encoder.layers.7.self_attn.out_proj.weight
90
- transformer_encoder.layers.7.self_attn.out_proj.bias
91
- transformer_encoder.layers.7.linear1.weight
92
- transformer_encoder.layers.7.linear1.bias
93
- transformer_encoder.layers.7.linear2.weight
94
- transformer_encoder.layers.7.linear2.bias
95
- transformer_encoder.layers.7.norm1.weight
96
- transformer_encoder.layers.7.norm1.bias
97
- transformer_encoder.layers.7.norm2.weight
98
- transformer_encoder.layers.7.norm2.bias
99
- transformer_encoder.layers.8.self_attn.in_proj_weight
100
- transformer_encoder.layers.8.self_attn.in_proj_bias
101
- transformer_encoder.layers.8.self_attn.out_proj.weight
102
- transformer_encoder.layers.8.self_attn.out_proj.bias
103
- transformer_encoder.layers.8.linear1.weight
104
- transformer_encoder.layers.8.linear1.bias
105
- transformer_encoder.layers.8.linear2.weight
106
- transformer_encoder.layers.8.linear2.bias
107
- transformer_encoder.layers.8.norm1.weight
108
- transformer_encoder.layers.8.norm1.bias
109
- transformer_encoder.layers.8.norm2.weight
110
- transformer_encoder.layers.8.norm2.bias
111
- transformer_encoder.layers.9.self_attn.in_proj_weight
112
- transformer_encoder.layers.9.self_attn.in_proj_bias
113
- transformer_encoder.layers.9.self_attn.out_proj.weight
114
- transformer_encoder.layers.9.self_attn.out_proj.bias
115
- transformer_encoder.layers.9.linear1.weight
116
- transformer_encoder.layers.9.linear1.bias
117
- transformer_encoder.layers.9.linear2.weight
118
- transformer_encoder.layers.9.linear2.bias
119
- transformer_encoder.layers.9.norm1.weight
120
- transformer_encoder.layers.9.norm1.bias
121
- transformer_encoder.layers.9.norm2.weight
122
- transformer_encoder.layers.9.norm2.bias
123
- transformer_encoder.layers.10.self_attn.in_proj_weight
124
- transformer_encoder.layers.10.self_attn.in_proj_bias
125
- transformer_encoder.layers.10.self_attn.out_proj.weight
126
- transformer_encoder.layers.10.self_attn.out_proj.bias
127
- transformer_encoder.layers.10.linear1.weight
128
- transformer_encoder.layers.10.linear1.bias
129
- transformer_encoder.layers.10.linear2.weight
130
- transformer_encoder.layers.10.linear2.bias
131
- transformer_encoder.layers.10.norm1.weight
132
- transformer_encoder.layers.10.norm1.bias
133
- transformer_encoder.layers.10.norm2.weight
134
- transformer_encoder.layers.10.norm2.bias
135
- transformer_encoder.layers.11.self_attn.in_proj_weight
136
- transformer_encoder.layers.11.self_attn.in_proj_bias
137
- transformer_encoder.layers.11.self_attn.out_proj.weight
138
- transformer_encoder.layers.11.self_attn.out_proj.bias
139
- transformer_encoder.layers.11.linear1.weight
140
- transformer_encoder.layers.11.linear1.bias
141
- transformer_encoder.layers.11.linear2.weight
142
- transformer_encoder.layers.11.linear2.bias
143
- transformer_encoder.layers.11.norm1.weight
144
- transformer_encoder.layers.11.norm1.bias
145
- transformer_encoder.layers.11.norm2.weight
146
- transformer_encoder.layers.11.norm2.bias
147
- transformer_encoder.layers.12.self_attn.in_proj_weight
148
- transformer_encoder.layers.12.self_attn.in_proj_bias
149
- transformer_encoder.layers.12.self_attn.out_proj.weight
150
- transformer_encoder.layers.12.self_attn.out_proj.bias
151
- transformer_encoder.layers.12.linear1.weight
152
- transformer_encoder.layers.12.linear1.bias
153
- transformer_encoder.layers.12.linear2.weight
154
- transformer_encoder.layers.12.linear2.bias
155
- transformer_encoder.layers.12.norm1.weight
156
- transformer_encoder.layers.12.norm1.bias
157
- transformer_encoder.layers.12.norm2.weight
158
- transformer_encoder.layers.12.norm2.bias
159
- transformer_encoder.layers.13.self_attn.in_proj_weight
160
- transformer_encoder.layers.13.self_attn.in_proj_bias
161
- transformer_encoder.layers.13.self_attn.out_proj.weight
162
- transformer_encoder.layers.13.self_attn.out_proj.bias
163
- transformer_encoder.layers.13.linear1.weight
164
- transformer_encoder.layers.13.linear1.bias
165
- transformer_encoder.layers.13.linear2.weight
166
- transformer_encoder.layers.13.linear2.bias
167
- transformer_encoder.layers.13.norm1.weight
168
- transformer_encoder.layers.13.norm1.bias
169
- transformer_encoder.layers.13.norm2.weight
170
- transformer_encoder.layers.13.norm2.bias
171
- transformer_encoder.layers.14.self_attn.in_proj_weight
172
- transformer_encoder.layers.14.self_attn.in_proj_bias
173
- transformer_encoder.layers.14.self_attn.out_proj.weight
174
- transformer_encoder.layers.14.self_attn.out_proj.bias
175
- transformer_encoder.layers.14.linear1.weight
176
- transformer_encoder.layers.14.linear1.bias
177
- transformer_encoder.layers.14.linear2.weight
178
- transformer_encoder.layers.14.linear2.bias
179
- transformer_encoder.layers.14.norm1.weight
180
- transformer_encoder.layers.14.norm1.bias
181
- transformer_encoder.layers.14.norm2.weight
182
- transformer_encoder.layers.14.norm2.bias
183
- transformer_encoder.layers.15.self_attn.in_proj_weight
184
- transformer_encoder.layers.15.self_attn.in_proj_bias
185
- transformer_encoder.layers.15.self_attn.out_proj.weight
186
- transformer_encoder.layers.15.self_attn.out_proj.bias
187
- transformer_encoder.layers.15.linear1.weight
188
- transformer_encoder.layers.15.linear1.bias
189
- transformer_encoder.layers.15.linear2.weight
190
- transformer_encoder.layers.15.linear2.bias
191
- transformer_encoder.layers.15.norm1.weight
192
- transformer_encoder.layers.15.norm1.bias
193
- transformer_encoder.layers.15.norm2.weight
194
- transformer_encoder.layers.15.norm2.bias
195
- transformer_encoder.layers.16.self_attn.in_proj_weight
196
- transformer_encoder.layers.16.self_attn.in_proj_bias
197
- transformer_encoder.layers.16.self_attn.out_proj.weight
198
- transformer_encoder.layers.16.self_attn.out_proj.bias
199
- transformer_encoder.layers.16.linear1.weight
200
- transformer_encoder.layers.16.linear1.bias
201
- transformer_encoder.layers.16.linear2.weight
202
- transformer_encoder.layers.16.linear2.bias
203
- transformer_encoder.layers.16.norm1.weight
204
- transformer_encoder.layers.16.norm1.bias
205
- transformer_encoder.layers.16.norm2.weight
206
- transformer_encoder.layers.16.norm2.bias
207
- transformer_encoder.layers.17.self_attn.in_proj_weight
208
- transformer_encoder.layers.17.self_attn.in_proj_bias
209
- transformer_encoder.layers.17.self_attn.out_proj.weight
210
- transformer_encoder.layers.17.self_attn.out_proj.bias
211
- transformer_encoder.layers.17.linear1.weight
212
- transformer_encoder.layers.17.linear1.bias
213
- transformer_encoder.layers.17.linear2.weight
214
- transformer_encoder.layers.17.linear2.bias
215
- transformer_encoder.layers.17.norm1.weight
216
- transformer_encoder.layers.17.norm1.bias
217
- transformer_encoder.layers.17.norm2.weight
218
- transformer_encoder.layers.17.norm2.bias
219
- transformer_encoder.layers.18.self_attn.in_proj_weight
220
- transformer_encoder.layers.18.self_attn.in_proj_bias
221
- transformer_encoder.layers.18.self_attn.out_proj.weight
222
- transformer_encoder.layers.18.self_attn.out_proj.bias
223
- transformer_encoder.layers.18.linear1.weight
224
- transformer_encoder.layers.18.linear1.bias
225
- transformer_encoder.layers.18.linear2.weight
226
- transformer_encoder.layers.18.linear2.bias
227
- transformer_encoder.layers.18.norm1.weight
228
- transformer_encoder.layers.18.norm1.bias
229
- transformer_encoder.layers.18.norm2.weight
230
- transformer_encoder.layers.18.norm2.bias
231
- transformer_encoder.layers.19.self_attn.in_proj_weight
232
- transformer_encoder.layers.19.self_attn.in_proj_bias
233
- transformer_encoder.layers.19.self_attn.out_proj.weight
234
- transformer_encoder.layers.19.self_attn.out_proj.bias
235
- transformer_encoder.layers.19.linear1.weight
236
- transformer_encoder.layers.19.linear1.bias
237
- transformer_encoder.layers.19.linear2.weight
238
- transformer_encoder.layers.19.linear2.bias
239
- transformer_encoder.layers.19.norm1.weight
240
- transformer_encoder.layers.19.norm1.bias
241
- transformer_encoder.layers.19.norm2.weight
242
- transformer_encoder.layers.19.norm2.bias
243
- transformer_encoder.layers.20.self_attn.in_proj_weight
244
- transformer_encoder.layers.20.self_attn.in_proj_bias
245
- transformer_encoder.layers.20.self_attn.out_proj.weight
246
- transformer_encoder.layers.20.self_attn.out_proj.bias
247
- transformer_encoder.layers.20.linear1.weight
248
- transformer_encoder.layers.20.linear1.bias
249
- transformer_encoder.layers.20.linear2.weight
250
- transformer_encoder.layers.20.linear2.bias
251
- transformer_encoder.layers.20.norm1.weight
252
- transformer_encoder.layers.20.norm1.bias
253
- transformer_encoder.layers.20.norm2.weight
254
- transformer_encoder.layers.20.norm2.bias
255
- transformer_encoder.layers.21.self_attn.in_proj_weight
256
- transformer_encoder.layers.21.self_attn.in_proj_bias
257
- transformer_encoder.layers.21.self_attn.out_proj.weight
258
- transformer_encoder.layers.21.self_attn.out_proj.bias
259
- transformer_encoder.layers.21.linear1.weight
260
- transformer_encoder.layers.21.linear1.bias
261
- transformer_encoder.layers.21.linear2.weight
262
- transformer_encoder.layers.21.linear2.bias
263
- transformer_encoder.layers.21.norm1.weight
264
- transformer_encoder.layers.21.norm1.bias
265
- transformer_encoder.layers.21.norm2.weight
266
- transformer_encoder.layers.21.norm2.bias
267
- transformer_encoder.layers.22.self_attn.in_proj_weight
268
- transformer_encoder.layers.22.self_attn.in_proj_bias
269
- transformer_encoder.layers.22.self_attn.out_proj.weight
270
- transformer_encoder.layers.22.self_attn.out_proj.bias
271
- transformer_encoder.layers.22.linear1.weight
272
- transformer_encoder.layers.22.linear1.bias
273
- transformer_encoder.layers.22.linear2.weight
274
- transformer_encoder.layers.22.linear2.bias
275
- transformer_encoder.layers.22.norm1.weight
276
- transformer_encoder.layers.22.norm1.bias
277
- transformer_encoder.layers.22.norm2.weight
278
- transformer_encoder.layers.22.norm2.bias
279
- transformer_encoder.layers.23.self_attn.in_proj_weight
280
- transformer_encoder.layers.23.self_attn.in_proj_bias
281
- transformer_encoder.layers.23.self_attn.out_proj.weight
282
- transformer_encoder.layers.23.self_attn.out_proj.bias
283
- transformer_encoder.layers.23.linear1.weight
284
- transformer_encoder.layers.23.linear1.bias
285
- transformer_encoder.layers.23.linear2.weight
286
- transformer_encoder.layers.23.linear2.bias
287
- transformer_encoder.layers.23.norm1.weight
288
- transformer_encoder.layers.23.norm1.bias
289
- transformer_encoder.layers.23.norm2.weight
290
- transformer_encoder.layers.23.norm2.bias
291
- ar_predict_layer.weight
 
1
+ ar_audio_embedding.word_embeddings.weight
2
+ ar_audio_position.alpha
3
+ transformer_encoder.layers.0.self_attn.in_proj_weight
4
+ transformer_encoder.layers.0.self_attn.in_proj_bias
5
+ transformer_encoder.layers.0.self_attn.out_proj.weight
6
+ transformer_encoder.layers.0.self_attn.out_proj.bias
7
+ transformer_encoder.layers.0.linear1.weight
8
+ transformer_encoder.layers.0.linear1.bias
9
+ transformer_encoder.layers.0.linear2.weight
10
+ transformer_encoder.layers.0.linear2.bias
11
+ transformer_encoder.layers.0.norm1.weight
12
+ transformer_encoder.layers.0.norm1.bias
13
+ transformer_encoder.layers.0.norm2.weight
14
+ transformer_encoder.layers.0.norm2.bias
15
+ transformer_encoder.layers.1.self_attn.in_proj_weight
16
+ transformer_encoder.layers.1.self_attn.in_proj_bias
17
+ transformer_encoder.layers.1.self_attn.out_proj.weight
18
+ transformer_encoder.layers.1.self_attn.out_proj.bias
19
+ transformer_encoder.layers.1.linear1.weight
20
+ transformer_encoder.layers.1.linear1.bias
21
+ transformer_encoder.layers.1.linear2.weight
22
+ transformer_encoder.layers.1.linear2.bias
23
+ transformer_encoder.layers.1.norm1.weight
24
+ transformer_encoder.layers.1.norm1.bias
25
+ transformer_encoder.layers.1.norm2.weight
26
+ transformer_encoder.layers.1.norm2.bias
27
+ transformer_encoder.layers.2.self_attn.in_proj_weight
28
+ transformer_encoder.layers.2.self_attn.in_proj_bias
29
+ transformer_encoder.layers.2.self_attn.out_proj.weight
30
+ transformer_encoder.layers.2.self_attn.out_proj.bias
31
+ transformer_encoder.layers.2.linear1.weight
32
+ transformer_encoder.layers.2.linear1.bias
33
+ transformer_encoder.layers.2.linear2.weight
34
+ transformer_encoder.layers.2.linear2.bias
35
+ transformer_encoder.layers.2.norm1.weight
36
+ transformer_encoder.layers.2.norm1.bias
37
+ transformer_encoder.layers.2.norm2.weight
38
+ transformer_encoder.layers.2.norm2.bias
39
+ transformer_encoder.layers.3.self_attn.in_proj_weight
40
+ transformer_encoder.layers.3.self_attn.in_proj_bias
41
+ transformer_encoder.layers.3.self_attn.out_proj.weight
42
+ transformer_encoder.layers.3.self_attn.out_proj.bias
43
+ transformer_encoder.layers.3.linear1.weight
44
+ transformer_encoder.layers.3.linear1.bias
45
+ transformer_encoder.layers.3.linear2.weight
46
+ transformer_encoder.layers.3.linear2.bias
47
+ transformer_encoder.layers.3.norm1.weight
48
+ transformer_encoder.layers.3.norm1.bias
49
+ transformer_encoder.layers.3.norm2.weight
50
+ transformer_encoder.layers.3.norm2.bias
51
+ transformer_encoder.layers.4.self_attn.in_proj_weight
52
+ transformer_encoder.layers.4.self_attn.in_proj_bias
53
+ transformer_encoder.layers.4.self_attn.out_proj.weight
54
+ transformer_encoder.layers.4.self_attn.out_proj.bias
55
+ transformer_encoder.layers.4.linear1.weight
56
+ transformer_encoder.layers.4.linear1.bias
57
+ transformer_encoder.layers.4.linear2.weight
58
+ transformer_encoder.layers.4.linear2.bias
59
+ transformer_encoder.layers.4.norm1.weight
60
+ transformer_encoder.layers.4.norm1.bias
61
+ transformer_encoder.layers.4.norm2.weight
62
+ transformer_encoder.layers.4.norm2.bias
63
+ transformer_encoder.layers.5.self_attn.in_proj_weight
64
+ transformer_encoder.layers.5.self_attn.in_proj_bias
65
+ transformer_encoder.layers.5.self_attn.out_proj.weight
66
+ transformer_encoder.layers.5.self_attn.out_proj.bias
67
+ transformer_encoder.layers.5.linear1.weight
68
+ transformer_encoder.layers.5.linear1.bias
69
+ transformer_encoder.layers.5.linear2.weight
70
+ transformer_encoder.layers.5.linear2.bias
71
+ transformer_encoder.layers.5.norm1.weight
72
+ transformer_encoder.layers.5.norm1.bias
73
+ transformer_encoder.layers.5.norm2.weight
74
+ transformer_encoder.layers.5.norm2.bias
75
+ transformer_encoder.layers.6.self_attn.in_proj_weight
76
+ transformer_encoder.layers.6.self_attn.in_proj_bias
77
+ transformer_encoder.layers.6.self_attn.out_proj.weight
78
+ transformer_encoder.layers.6.self_attn.out_proj.bias
79
+ transformer_encoder.layers.6.linear1.weight
80
+ transformer_encoder.layers.6.linear1.bias
81
+ transformer_encoder.layers.6.linear2.weight
82
+ transformer_encoder.layers.6.linear2.bias
83
+ transformer_encoder.layers.6.norm1.weight
84
+ transformer_encoder.layers.6.norm1.bias
85
+ transformer_encoder.layers.6.norm2.weight
86
+ transformer_encoder.layers.6.norm2.bias
87
+ transformer_encoder.layers.7.self_attn.in_proj_weight
88
+ transformer_encoder.layers.7.self_attn.in_proj_bias
89
+ transformer_encoder.layers.7.self_attn.out_proj.weight
90
+ transformer_encoder.layers.7.self_attn.out_proj.bias
91
+ transformer_encoder.layers.7.linear1.weight
92
+ transformer_encoder.layers.7.linear1.bias
93
+ transformer_encoder.layers.7.linear2.weight
94
+ transformer_encoder.layers.7.linear2.bias
95
+ transformer_encoder.layers.7.norm1.weight
96
+ transformer_encoder.layers.7.norm1.bias
97
+ transformer_encoder.layers.7.norm2.weight
98
+ transformer_encoder.layers.7.norm2.bias
99
+ transformer_encoder.layers.8.self_attn.in_proj_weight
100
+ transformer_encoder.layers.8.self_attn.in_proj_bias
101
+ transformer_encoder.layers.8.self_attn.out_proj.weight
102
+ transformer_encoder.layers.8.self_attn.out_proj.bias
103
+ transformer_encoder.layers.8.linear1.weight
104
+ transformer_encoder.layers.8.linear1.bias
105
+ transformer_encoder.layers.8.linear2.weight
106
+ transformer_encoder.layers.8.linear2.bias
107
+ transformer_encoder.layers.8.norm1.weight
108
+ transformer_encoder.layers.8.norm1.bias
109
+ transformer_encoder.layers.8.norm2.weight
110
+ transformer_encoder.layers.8.norm2.bias
111
+ transformer_encoder.layers.9.self_attn.in_proj_weight
112
+ transformer_encoder.layers.9.self_attn.in_proj_bias
113
+ transformer_encoder.layers.9.self_attn.out_proj.weight
114
+ transformer_encoder.layers.9.self_attn.out_proj.bias
115
+ transformer_encoder.layers.9.linear1.weight
116
+ transformer_encoder.layers.9.linear1.bias
117
+ transformer_encoder.layers.9.linear2.weight
118
+ transformer_encoder.layers.9.linear2.bias
119
+ transformer_encoder.layers.9.norm1.weight
120
+ transformer_encoder.layers.9.norm1.bias
121
+ transformer_encoder.layers.9.norm2.weight
122
+ transformer_encoder.layers.9.norm2.bias
123
+ transformer_encoder.layers.10.self_attn.in_proj_weight
124
+ transformer_encoder.layers.10.self_attn.in_proj_bias
125
+ transformer_encoder.layers.10.self_attn.out_proj.weight
126
+ transformer_encoder.layers.10.self_attn.out_proj.bias
127
+ transformer_encoder.layers.10.linear1.weight
128
+ transformer_encoder.layers.10.linear1.bias
129
+ transformer_encoder.layers.10.linear2.weight
130
+ transformer_encoder.layers.10.linear2.bias
131
+ transformer_encoder.layers.10.norm1.weight
132
+ transformer_encoder.layers.10.norm1.bias
133
+ transformer_encoder.layers.10.norm2.weight
134
+ transformer_encoder.layers.10.norm2.bias
135
+ transformer_encoder.layers.11.self_attn.in_proj_weight
136
+ transformer_encoder.layers.11.self_attn.in_proj_bias
137
+ transformer_encoder.layers.11.self_attn.out_proj.weight
138
+ transformer_encoder.layers.11.self_attn.out_proj.bias
139
+ transformer_encoder.layers.11.linear1.weight
140
+ transformer_encoder.layers.11.linear1.bias
141
+ transformer_encoder.layers.11.linear2.weight
142
+ transformer_encoder.layers.11.linear2.bias
143
+ transformer_encoder.layers.11.norm1.weight
144
+ transformer_encoder.layers.11.norm1.bias
145
+ transformer_encoder.layers.11.norm2.weight
146
+ transformer_encoder.layers.11.norm2.bias
147
+ transformer_encoder.layers.12.self_attn.in_proj_weight
148
+ transformer_encoder.layers.12.self_attn.in_proj_bias
149
+ transformer_encoder.layers.12.self_attn.out_proj.weight
150
+ transformer_encoder.layers.12.self_attn.out_proj.bias
151
+ transformer_encoder.layers.12.linear1.weight
152
+ transformer_encoder.layers.12.linear1.bias
153
+ transformer_encoder.layers.12.linear2.weight
154
+ transformer_encoder.layers.12.linear2.bias
155
+ transformer_encoder.layers.12.norm1.weight
156
+ transformer_encoder.layers.12.norm1.bias
157
+ transformer_encoder.layers.12.norm2.weight
158
+ transformer_encoder.layers.12.norm2.bias
159
+ transformer_encoder.layers.13.self_attn.in_proj_weight
160
+ transformer_encoder.layers.13.self_attn.in_proj_bias
161
+ transformer_encoder.layers.13.self_attn.out_proj.weight
162
+ transformer_encoder.layers.13.self_attn.out_proj.bias
163
+ transformer_encoder.layers.13.linear1.weight
164
+ transformer_encoder.layers.13.linear1.bias
165
+ transformer_encoder.layers.13.linear2.weight
166
+ transformer_encoder.layers.13.linear2.bias
167
+ transformer_encoder.layers.13.norm1.weight
168
+ transformer_encoder.layers.13.norm1.bias
169
+ transformer_encoder.layers.13.norm2.weight
170
+ transformer_encoder.layers.13.norm2.bias
171
+ transformer_encoder.layers.14.self_attn.in_proj_weight
172
+ transformer_encoder.layers.14.self_attn.in_proj_bias
173
+ transformer_encoder.layers.14.self_attn.out_proj.weight
174
+ transformer_encoder.layers.14.self_attn.out_proj.bias
175
+ transformer_encoder.layers.14.linear1.weight
176
+ transformer_encoder.layers.14.linear1.bias
177
+ transformer_encoder.layers.14.linear2.weight
178
+ transformer_encoder.layers.14.linear2.bias
179
+ transformer_encoder.layers.14.norm1.weight
180
+ transformer_encoder.layers.14.norm1.bias
181
+ transformer_encoder.layers.14.norm2.weight
182
+ transformer_encoder.layers.14.norm2.bias
183
+ transformer_encoder.layers.15.self_attn.in_proj_weight
184
+ transformer_encoder.layers.15.self_attn.in_proj_bias
185
+ transformer_encoder.layers.15.self_attn.out_proj.weight
186
+ transformer_encoder.layers.15.self_attn.out_proj.bias
187
+ transformer_encoder.layers.15.linear1.weight
188
+ transformer_encoder.layers.15.linear1.bias
189
+ transformer_encoder.layers.15.linear2.weight
190
+ transformer_encoder.layers.15.linear2.bias
191
+ transformer_encoder.layers.15.norm1.weight
192
+ transformer_encoder.layers.15.norm1.bias
193
+ transformer_encoder.layers.15.norm2.weight
194
+ transformer_encoder.layers.15.norm2.bias
195
+ transformer_encoder.layers.16.self_attn.in_proj_weight
196
+ transformer_encoder.layers.16.self_attn.in_proj_bias
197
+ transformer_encoder.layers.16.self_attn.out_proj.weight
198
+ transformer_encoder.layers.16.self_attn.out_proj.bias
199
+ transformer_encoder.layers.16.linear1.weight
200
+ transformer_encoder.layers.16.linear1.bias
201
+ transformer_encoder.layers.16.linear2.weight
202
+ transformer_encoder.layers.16.linear2.bias
203
+ transformer_encoder.layers.16.norm1.weight
204
+ transformer_encoder.layers.16.norm1.bias
205
+ transformer_encoder.layers.16.norm2.weight
206
+ transformer_encoder.layers.16.norm2.bias
207
+ transformer_encoder.layers.17.self_attn.in_proj_weight
208
+ transformer_encoder.layers.17.self_attn.in_proj_bias
209
+ transformer_encoder.layers.17.self_attn.out_proj.weight
210
+ transformer_encoder.layers.17.self_attn.out_proj.bias
211
+ transformer_encoder.layers.17.linear1.weight
212
+ transformer_encoder.layers.17.linear1.bias
213
+ transformer_encoder.layers.17.linear2.weight
214
+ transformer_encoder.layers.17.linear2.bias
215
+ transformer_encoder.layers.17.norm1.weight
216
+ transformer_encoder.layers.17.norm1.bias
217
+ transformer_encoder.layers.17.norm2.weight
218
+ transformer_encoder.layers.17.norm2.bias
219
+ transformer_encoder.layers.18.self_attn.in_proj_weight
220
+ transformer_encoder.layers.18.self_attn.in_proj_bias
221
+ transformer_encoder.layers.18.self_attn.out_proj.weight
222
+ transformer_encoder.layers.18.self_attn.out_proj.bias
223
+ transformer_encoder.layers.18.linear1.weight
224
+ transformer_encoder.layers.18.linear1.bias
225
+ transformer_encoder.layers.18.linear2.weight
226
+ transformer_encoder.layers.18.linear2.bias
227
+ transformer_encoder.layers.18.norm1.weight
228
+ transformer_encoder.layers.18.norm1.bias
229
+ transformer_encoder.layers.18.norm2.weight
230
+ transformer_encoder.layers.18.norm2.bias
231
+ transformer_encoder.layers.19.self_attn.in_proj_weight
232
+ transformer_encoder.layers.19.self_attn.in_proj_bias
233
+ transformer_encoder.layers.19.self_attn.out_proj.weight
234
+ transformer_encoder.layers.19.self_attn.out_proj.bias
235
+ transformer_encoder.layers.19.linear1.weight
236
+ transformer_encoder.layers.19.linear1.bias
237
+ transformer_encoder.layers.19.linear2.weight
238
+ transformer_encoder.layers.19.linear2.bias
239
+ transformer_encoder.layers.19.norm1.weight
240
+ transformer_encoder.layers.19.norm1.bias
241
+ transformer_encoder.layers.19.norm2.weight
242
+ transformer_encoder.layers.19.norm2.bias
243
+ transformer_encoder.layers.20.self_attn.in_proj_weight
244
+ transformer_encoder.layers.20.self_attn.in_proj_bias
245
+ transformer_encoder.layers.20.self_attn.out_proj.weight
246
+ transformer_encoder.layers.20.self_attn.out_proj.bias
247
+ transformer_encoder.layers.20.linear1.weight
248
+ transformer_encoder.layers.20.linear1.bias
249
+ transformer_encoder.layers.20.linear2.weight
250
+ transformer_encoder.layers.20.linear2.bias
251
+ transformer_encoder.layers.20.norm1.weight
252
+ transformer_encoder.layers.20.norm1.bias
253
+ transformer_encoder.layers.20.norm2.weight
254
+ transformer_encoder.layers.20.norm2.bias
255
+ transformer_encoder.layers.21.self_attn.in_proj_weight
256
+ transformer_encoder.layers.21.self_attn.in_proj_bias
257
+ transformer_encoder.layers.21.self_attn.out_proj.weight
258
+ transformer_encoder.layers.21.self_attn.out_proj.bias
259
+ transformer_encoder.layers.21.linear1.weight
260
+ transformer_encoder.layers.21.linear1.bias
261
+ transformer_encoder.layers.21.linear2.weight
262
+ transformer_encoder.layers.21.linear2.bias
263
+ transformer_encoder.layers.21.norm1.weight
264
+ transformer_encoder.layers.21.norm1.bias
265
+ transformer_encoder.layers.21.norm2.weight
266
+ transformer_encoder.layers.21.norm2.bias
267
+ transformer_encoder.layers.22.self_attn.in_proj_weight
268
+ transformer_encoder.layers.22.self_attn.in_proj_bias
269
+ transformer_encoder.layers.22.self_attn.out_proj.weight
270
+ transformer_encoder.layers.22.self_attn.out_proj.bias
271
+ transformer_encoder.layers.22.linear1.weight
272
+ transformer_encoder.layers.22.linear1.bias
273
+ transformer_encoder.layers.22.linear2.weight
274
+ transformer_encoder.layers.22.linear2.bias
275
+ transformer_encoder.layers.22.norm1.weight
276
+ transformer_encoder.layers.22.norm1.bias
277
+ transformer_encoder.layers.22.norm2.weight
278
+ transformer_encoder.layers.22.norm2.bias
279
+ transformer_encoder.layers.23.self_attn.in_proj_weight
280
+ transformer_encoder.layers.23.self_attn.in_proj_bias
281
+ transformer_encoder.layers.23.self_attn.out_proj.weight
282
+ transformer_encoder.layers.23.self_attn.out_proj.bias
283
+ transformer_encoder.layers.23.linear1.weight
284
+ transformer_encoder.layers.23.linear1.bias
285
+ transformer_encoder.layers.23.linear2.weight
286
+ transformer_encoder.layers.23.linear2.bias
287
+ transformer_encoder.layers.23.norm1.weight
288
+ transformer_encoder.layers.23.norm1.bias
289
+ transformer_encoder.layers.23.norm2.weight
290
+ transformer_encoder.layers.23.norm2.bias
291
+ ar_predict_layer.weight
genie_tts/Data/v2/Keys/vits_onnx_keys.txt CHANGED
@@ -1,668 +1,668 @@
1
- vq_model.dec.cond.bias
2
- vq_model.dec.cond.weight
3
- vq_model.dec.conv_post.weight
4
- vq_model.dec.conv_pre.bias
5
- vq_model.dec.conv_pre.weight
6
- vq_model.dec.resblocks.0.convs1.0.bias
7
- vq_model.dec.resblocks.0.convs1.0.weight_g
8
- vq_model.dec.resblocks.0.convs1.0.weight_v
9
- vq_model.dec.resblocks.0.convs1.1.bias
10
- vq_model.dec.resblocks.0.convs1.1.weight_g
11
- vq_model.dec.resblocks.0.convs1.1.weight_v
12
- vq_model.dec.resblocks.0.convs1.2.bias
13
- vq_model.dec.resblocks.0.convs1.2.weight_g
14
- vq_model.dec.resblocks.0.convs1.2.weight_v
15
- vq_model.dec.resblocks.0.convs2.0.bias
16
- vq_model.dec.resblocks.0.convs2.0.weight_g
17
- vq_model.dec.resblocks.0.convs2.0.weight_v
18
- vq_model.dec.resblocks.0.convs2.1.bias
19
- vq_model.dec.resblocks.0.convs2.1.weight_g
20
- vq_model.dec.resblocks.0.convs2.1.weight_v
21
- vq_model.dec.resblocks.0.convs2.2.bias
22
- vq_model.dec.resblocks.0.convs2.2.weight_g
23
- vq_model.dec.resblocks.0.convs2.2.weight_v
24
- vq_model.dec.resblocks.1.convs1.0.bias
25
- vq_model.dec.resblocks.1.convs1.0.weight_g
26
- vq_model.dec.resblocks.1.convs1.0.weight_v
27
- vq_model.dec.resblocks.1.convs1.1.bias
28
- vq_model.dec.resblocks.1.convs1.1.weight_g
29
- vq_model.dec.resblocks.1.convs1.1.weight_v
30
- vq_model.dec.resblocks.1.convs1.2.bias
31
- vq_model.dec.resblocks.1.convs1.2.weight_g
32
- vq_model.dec.resblocks.1.convs1.2.weight_v
33
- vq_model.dec.resblocks.1.convs2.0.bias
34
- vq_model.dec.resblocks.1.convs2.0.weight_g
35
- vq_model.dec.resblocks.1.convs2.0.weight_v
36
- vq_model.dec.resblocks.1.convs2.1.bias
37
- vq_model.dec.resblocks.1.convs2.1.weight_g
38
- vq_model.dec.resblocks.1.convs2.1.weight_v
39
- vq_model.dec.resblocks.1.convs2.2.bias
40
- vq_model.dec.resblocks.1.convs2.2.weight_g
41
- vq_model.dec.resblocks.1.convs2.2.weight_v
42
- vq_model.dec.resblocks.10.convs1.0.bias
43
- vq_model.dec.resblocks.10.convs1.0.weight_g
44
- vq_model.dec.resblocks.10.convs1.0.weight_v
45
- vq_model.dec.resblocks.10.convs1.1.bias
46
- vq_model.dec.resblocks.10.convs1.1.weight_g
47
- vq_model.dec.resblocks.10.convs1.1.weight_v
48
- vq_model.dec.resblocks.10.convs1.2.bias
49
- vq_model.dec.resblocks.10.convs1.2.weight_g
50
- vq_model.dec.resblocks.10.convs1.2.weight_v
51
- vq_model.dec.resblocks.10.convs2.0.bias
52
- vq_model.dec.resblocks.10.convs2.0.weight_g
53
- vq_model.dec.resblocks.10.convs2.0.weight_v
54
- vq_model.dec.resblocks.10.convs2.1.bias
55
- vq_model.dec.resblocks.10.convs2.1.weight_g
56
- vq_model.dec.resblocks.10.convs2.1.weight_v
57
- vq_model.dec.resblocks.10.convs2.2.bias
58
- vq_model.dec.resblocks.10.convs2.2.weight_g
59
- vq_model.dec.resblocks.10.convs2.2.weight_v
60
- vq_model.dec.resblocks.11.convs1.0.bias
61
- vq_model.dec.resblocks.11.convs1.0.weight_g
62
- vq_model.dec.resblocks.11.convs1.0.weight_v
63
- vq_model.dec.resblocks.11.convs1.1.bias
64
- vq_model.dec.resblocks.11.convs1.1.weight_g
65
- vq_model.dec.resblocks.11.convs1.1.weight_v
66
- vq_model.dec.resblocks.11.convs1.2.bias
67
- vq_model.dec.resblocks.11.convs1.2.weight_g
68
- vq_model.dec.resblocks.11.convs1.2.weight_v
69
- vq_model.dec.resblocks.11.convs2.0.bias
70
- vq_model.dec.resblocks.11.convs2.0.weight_g
71
- vq_model.dec.resblocks.11.convs2.0.weight_v
72
- vq_model.dec.resblocks.11.convs2.1.bias
73
- vq_model.dec.resblocks.11.convs2.1.weight_g
74
- vq_model.dec.resblocks.11.convs2.1.weight_v
75
- vq_model.dec.resblocks.11.convs2.2.bias
76
- vq_model.dec.resblocks.11.convs2.2.weight_g
77
- vq_model.dec.resblocks.11.convs2.2.weight_v
78
- vq_model.dec.resblocks.12.convs1.0.bias
79
- vq_model.dec.resblocks.12.convs1.0.weight_g
80
- vq_model.dec.resblocks.12.convs1.0.weight_v
81
- vq_model.dec.resblocks.12.convs1.1.bias
82
- vq_model.dec.resblocks.12.convs1.1.weight_g
83
- vq_model.dec.resblocks.12.convs1.1.weight_v
84
- vq_model.dec.resblocks.12.convs1.2.bias
85
- vq_model.dec.resblocks.12.convs1.2.weight_g
86
- vq_model.dec.resblocks.12.convs1.2.weight_v
87
- vq_model.dec.resblocks.12.convs2.0.bias
88
- vq_model.dec.resblocks.12.convs2.0.weight_g
89
- vq_model.dec.resblocks.12.convs2.0.weight_v
90
- vq_model.dec.resblocks.12.convs2.1.bias
91
- vq_model.dec.resblocks.12.convs2.1.weight_g
92
- vq_model.dec.resblocks.12.convs2.1.weight_v
93
- vq_model.dec.resblocks.12.convs2.2.bias
94
- vq_model.dec.resblocks.12.convs2.2.weight_g
95
- vq_model.dec.resblocks.12.convs2.2.weight_v
96
- vq_model.dec.resblocks.13.convs1.0.bias
97
- vq_model.dec.resblocks.13.convs1.0.weight_g
98
- vq_model.dec.resblocks.13.convs1.0.weight_v
99
- vq_model.dec.resblocks.13.convs1.1.bias
100
- vq_model.dec.resblocks.13.convs1.1.weight_g
101
- vq_model.dec.resblocks.13.convs1.1.weight_v
102
- vq_model.dec.resblocks.13.convs1.2.bias
103
- vq_model.dec.resblocks.13.convs1.2.weight_g
104
- vq_model.dec.resblocks.13.convs1.2.weight_v
105
- vq_model.dec.resblocks.13.convs2.0.bias
106
- vq_model.dec.resblocks.13.convs2.0.weight_g
107
- vq_model.dec.resblocks.13.convs2.0.weight_v
108
- vq_model.dec.resblocks.13.convs2.1.bias
109
- vq_model.dec.resblocks.13.convs2.1.weight_g
110
- vq_model.dec.resblocks.13.convs2.1.weight_v
111
- vq_model.dec.resblocks.13.convs2.2.bias
112
- vq_model.dec.resblocks.13.convs2.2.weight_g
113
- vq_model.dec.resblocks.13.convs2.2.weight_v
114
- vq_model.dec.resblocks.14.convs1.0.bias
115
- vq_model.dec.resblocks.14.convs1.0.weight_g
116
- vq_model.dec.resblocks.14.convs1.0.weight_v
117
- vq_model.dec.resblocks.14.convs1.1.bias
118
- vq_model.dec.resblocks.14.convs1.1.weight_g
119
- vq_model.dec.resblocks.14.convs1.1.weight_v
120
- vq_model.dec.resblocks.14.convs1.2.bias
121
- vq_model.dec.resblocks.14.convs1.2.weight_g
122
- vq_model.dec.resblocks.14.convs1.2.weight_v
123
- vq_model.dec.resblocks.14.convs2.0.bias
124
- vq_model.dec.resblocks.14.convs2.0.weight_g
125
- vq_model.dec.resblocks.14.convs2.0.weight_v
126
- vq_model.dec.resblocks.14.convs2.1.bias
127
- vq_model.dec.resblocks.14.convs2.1.weight_g
128
- vq_model.dec.resblocks.14.convs2.1.weight_v
129
- vq_model.dec.resblocks.14.convs2.2.bias
130
- vq_model.dec.resblocks.14.convs2.2.weight_g
131
- vq_model.dec.resblocks.14.convs2.2.weight_v
132
- vq_model.dec.resblocks.2.convs1.0.bias
133
- vq_model.dec.resblocks.2.convs1.0.weight_g
134
- vq_model.dec.resblocks.2.convs1.0.weight_v
135
- vq_model.dec.resblocks.2.convs1.1.bias
136
- vq_model.dec.resblocks.2.convs1.1.weight_g
137
- vq_model.dec.resblocks.2.convs1.1.weight_v
138
- vq_model.dec.resblocks.2.convs1.2.bias
139
- vq_model.dec.resblocks.2.convs1.2.weight_g
140
- vq_model.dec.resblocks.2.convs1.2.weight_v
141
- vq_model.dec.resblocks.2.convs2.0.bias
142
- vq_model.dec.resblocks.2.convs2.0.weight_g
143
- vq_model.dec.resblocks.2.convs2.0.weight_v
144
- vq_model.dec.resblocks.2.convs2.1.bias
145
- vq_model.dec.resblocks.2.convs2.1.weight_g
146
- vq_model.dec.resblocks.2.convs2.1.weight_v
147
- vq_model.dec.resblocks.2.convs2.2.bias
148
- vq_model.dec.resblocks.2.convs2.2.weight_g
149
- vq_model.dec.resblocks.2.convs2.2.weight_v
150
- vq_model.dec.resblocks.3.convs1.0.bias
151
- vq_model.dec.resblocks.3.convs1.0.weight_g
152
- vq_model.dec.resblocks.3.convs1.0.weight_v
153
- vq_model.dec.resblocks.3.convs1.1.bias
154
- vq_model.dec.resblocks.3.convs1.1.weight_g
155
- vq_model.dec.resblocks.3.convs1.1.weight_v
156
- vq_model.dec.resblocks.3.convs1.2.bias
157
- vq_model.dec.resblocks.3.convs1.2.weight_g
158
- vq_model.dec.resblocks.3.convs1.2.weight_v
159
- vq_model.dec.resblocks.3.convs2.0.bias
160
- vq_model.dec.resblocks.3.convs2.0.weight_g
161
- vq_model.dec.resblocks.3.convs2.0.weight_v
162
- vq_model.dec.resblocks.3.convs2.1.bias
163
- vq_model.dec.resblocks.3.convs2.1.weight_g
164
- vq_model.dec.resblocks.3.convs2.1.weight_v
165
- vq_model.dec.resblocks.3.convs2.2.bias
166
- vq_model.dec.resblocks.3.convs2.2.weight_g
167
- vq_model.dec.resblocks.3.convs2.2.weight_v
168
- vq_model.dec.resblocks.4.convs1.0.bias
169
- vq_model.dec.resblocks.4.convs1.0.weight_g
170
- vq_model.dec.resblocks.4.convs1.0.weight_v
171
- vq_model.dec.resblocks.4.convs1.1.bias
172
- vq_model.dec.resblocks.4.convs1.1.weight_g
173
- vq_model.dec.resblocks.4.convs1.1.weight_v
174
- vq_model.dec.resblocks.4.convs1.2.bias
175
- vq_model.dec.resblocks.4.convs1.2.weight_g
176
- vq_model.dec.resblocks.4.convs1.2.weight_v
177
- vq_model.dec.resblocks.4.convs2.0.bias
178
- vq_model.dec.resblocks.4.convs2.0.weight_g
179
- vq_model.dec.resblocks.4.convs2.0.weight_v
180
- vq_model.dec.resblocks.4.convs2.1.bias
181
- vq_model.dec.resblocks.4.convs2.1.weight_g
182
- vq_model.dec.resblocks.4.convs2.1.weight_v
183
- vq_model.dec.resblocks.4.convs2.2.bias
184
- vq_model.dec.resblocks.4.convs2.2.weight_g
185
- vq_model.dec.resblocks.4.convs2.2.weight_v
186
- vq_model.dec.resblocks.5.convs1.0.bias
187
- vq_model.dec.resblocks.5.convs1.0.weight_g
188
- vq_model.dec.resblocks.5.convs1.0.weight_v
189
- vq_model.dec.resblocks.5.convs1.1.bias
190
- vq_model.dec.resblocks.5.convs1.1.weight_g
191
- vq_model.dec.resblocks.5.convs1.1.weight_v
192
- vq_model.dec.resblocks.5.convs1.2.bias
193
- vq_model.dec.resblocks.5.convs1.2.weight_g
194
- vq_model.dec.resblocks.5.convs1.2.weight_v
195
- vq_model.dec.resblocks.5.convs2.0.bias
196
- vq_model.dec.resblocks.5.convs2.0.weight_g
197
- vq_model.dec.resblocks.5.convs2.0.weight_v
198
- vq_model.dec.resblocks.5.convs2.1.bias
199
- vq_model.dec.resblocks.5.convs2.1.weight_g
200
- vq_model.dec.resblocks.5.convs2.1.weight_v
201
- vq_model.dec.resblocks.5.convs2.2.bias
202
- vq_model.dec.resblocks.5.convs2.2.weight_g
203
- vq_model.dec.resblocks.5.convs2.2.weight_v
204
- vq_model.dec.resblocks.6.convs1.0.bias
205
- vq_model.dec.resblocks.6.convs1.0.weight_g
206
- vq_model.dec.resblocks.6.convs1.0.weight_v
207
- vq_model.dec.resblocks.6.convs1.1.bias
208
- vq_model.dec.resblocks.6.convs1.1.weight_g
209
- vq_model.dec.resblocks.6.convs1.1.weight_v
210
- vq_model.dec.resblocks.6.convs1.2.bias
211
- vq_model.dec.resblocks.6.convs1.2.weight_g
212
- vq_model.dec.resblocks.6.convs1.2.weight_v
213
- vq_model.dec.resblocks.6.convs2.0.bias
214
- vq_model.dec.resblocks.6.convs2.0.weight_g
215
- vq_model.dec.resblocks.6.convs2.0.weight_v
216
- vq_model.dec.resblocks.6.convs2.1.bias
217
- vq_model.dec.resblocks.6.convs2.1.weight_g
218
- vq_model.dec.resblocks.6.convs2.1.weight_v
219
- vq_model.dec.resblocks.6.convs2.2.bias
220
- vq_model.dec.resblocks.6.convs2.2.weight_g
221
- vq_model.dec.resblocks.6.convs2.2.weight_v
222
- vq_model.dec.resblocks.7.convs1.0.bias
223
- vq_model.dec.resblocks.7.convs1.0.weight_g
224
- vq_model.dec.resblocks.7.convs1.0.weight_v
225
- vq_model.dec.resblocks.7.convs1.1.bias
226
- vq_model.dec.resblocks.7.convs1.1.weight_g
227
- vq_model.dec.resblocks.7.convs1.1.weight_v
228
- vq_model.dec.resblocks.7.convs1.2.bias
229
- vq_model.dec.resblocks.7.convs1.2.weight_g
230
- vq_model.dec.resblocks.7.convs1.2.weight_v
231
- vq_model.dec.resblocks.7.convs2.0.bias
232
- vq_model.dec.resblocks.7.convs2.0.weight_g
233
- vq_model.dec.resblocks.7.convs2.0.weight_v
234
- vq_model.dec.resblocks.7.convs2.1.bias
235
- vq_model.dec.resblocks.7.convs2.1.weight_g
236
- vq_model.dec.resblocks.7.convs2.1.weight_v
237
- vq_model.dec.resblocks.7.convs2.2.bias
238
- vq_model.dec.resblocks.7.convs2.2.weight_g
239
- vq_model.dec.resblocks.7.convs2.2.weight_v
240
- vq_model.dec.resblocks.8.convs1.0.bias
241
- vq_model.dec.resblocks.8.convs1.0.weight_g
242
- vq_model.dec.resblocks.8.convs1.0.weight_v
243
- vq_model.dec.resblocks.8.convs1.1.bias
244
- vq_model.dec.resblocks.8.convs1.1.weight_g
245
- vq_model.dec.resblocks.8.convs1.1.weight_v
246
- vq_model.dec.resblocks.8.convs1.2.bias
247
- vq_model.dec.resblocks.8.convs1.2.weight_g
248
- vq_model.dec.resblocks.8.convs1.2.weight_v
249
- vq_model.dec.resblocks.8.convs2.0.bias
250
- vq_model.dec.resblocks.8.convs2.0.weight_g
251
- vq_model.dec.resblocks.8.convs2.0.weight_v
252
- vq_model.dec.resblocks.8.convs2.1.bias
253
- vq_model.dec.resblocks.8.convs2.1.weight_g
254
- vq_model.dec.resblocks.8.convs2.1.weight_v
255
- vq_model.dec.resblocks.8.convs2.2.bias
256
- vq_model.dec.resblocks.8.convs2.2.weight_g
257
- vq_model.dec.resblocks.8.convs2.2.weight_v
258
- vq_model.dec.resblocks.9.convs1.0.bias
259
- vq_model.dec.resblocks.9.convs1.0.weight_g
260
- vq_model.dec.resblocks.9.convs1.0.weight_v
261
- vq_model.dec.resblocks.9.convs1.1.bias
262
- vq_model.dec.resblocks.9.convs1.1.weight_g
263
- vq_model.dec.resblocks.9.convs1.1.weight_v
264
- vq_model.dec.resblocks.9.convs1.2.bias
265
- vq_model.dec.resblocks.9.convs1.2.weight_g
266
- vq_model.dec.resblocks.9.convs1.2.weight_v
267
- vq_model.dec.resblocks.9.convs2.0.bias
268
- vq_model.dec.resblocks.9.convs2.0.weight_g
269
- vq_model.dec.resblocks.9.convs2.0.weight_v
270
- vq_model.dec.resblocks.9.convs2.1.bias
271
- vq_model.dec.resblocks.9.convs2.1.weight_g
272
- vq_model.dec.resblocks.9.convs2.1.weight_v
273
- vq_model.dec.resblocks.9.convs2.2.bias
274
- vq_model.dec.resblocks.9.convs2.2.weight_g
275
- vq_model.dec.resblocks.9.convs2.2.weight_v
276
- vq_model.dec.ups.0.bias
277
- vq_model.dec.ups.0.weight_g
278
- vq_model.dec.ups.0.weight_v
279
- vq_model.dec.ups.1.bias
280
- vq_model.dec.ups.1.weight_g
281
- vq_model.dec.ups.1.weight_v
282
- vq_model.dec.ups.2.bias
283
- vq_model.dec.ups.2.weight_g
284
- vq_model.dec.ups.2.weight_v
285
- vq_model.dec.ups.3.bias
286
- vq_model.dec.ups.3.weight_g
287
- vq_model.dec.ups.3.weight_v
288
- vq_model.dec.ups.4.bias
289
- vq_model.dec.ups.4.weight_g
290
- vq_model.dec.ups.4.weight_v
291
- vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias
292
- vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight
293
- vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias
294
- vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight
295
- vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias
296
- vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight
297
- vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias
298
- vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight
299
- vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k
300
- vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v
301
- vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias
302
- vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight
303
- vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias
304
- vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight
305
- vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias
306
- vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight
307
- vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias
308
- vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight
309
- vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k
310
- vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v
311
- vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias
312
- vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight
313
- vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias
314
- vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight
315
- vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias
316
- vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight
317
- vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias
318
- vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight
319
- vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k
320
- vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v
321
- vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias
322
- vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight
323
- vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias
324
- vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight
325
- vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias
326
- vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight
327
- vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias
328
- vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight
329
- vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias
330
- vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight
331
- vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias
332
- vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight
333
- vq_model.enc_p.encoder2.norm_layers_1.0.beta
334
- vq_model.enc_p.encoder2.norm_layers_1.0.gamma
335
- vq_model.enc_p.encoder2.norm_layers_1.1.beta
336
- vq_model.enc_p.encoder2.norm_layers_1.1.gamma
337
- vq_model.enc_p.encoder2.norm_layers_1.2.beta
338
- vq_model.enc_p.encoder2.norm_layers_1.2.gamma
339
- vq_model.enc_p.encoder2.norm_layers_2.0.beta
340
- vq_model.enc_p.encoder2.norm_layers_2.0.gamma
341
- vq_model.enc_p.encoder2.norm_layers_2.1.beta
342
- vq_model.enc_p.encoder2.norm_layers_2.1.gamma
343
- vq_model.enc_p.encoder2.norm_layers_2.2.beta
344
- vq_model.enc_p.encoder2.norm_layers_2.2.gamma
345
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias
346
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight
347
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias
348
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight
349
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias
350
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight
351
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias
352
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight
353
- vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k
354
- vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v
355
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias
356
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight
357
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias
358
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight
359
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias
360
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight
361
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias
362
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight
363
- vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k
364
- vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v
365
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias
366
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight
367
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias
368
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight
369
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias
370
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight
371
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias
372
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight
373
- vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k
374
- vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v
375
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias
376
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight
377
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias
378
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight
379
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias
380
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight
381
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias
382
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight
383
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias
384
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight
385
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias
386
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight
387
- vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta
388
- vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma
389
- vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta
390
- vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma
391
- vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta
392
- vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma
393
- vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta
394
- vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma
395
- vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta
396
- vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma
397
- vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta
398
- vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma
399
- vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias
400
- vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight
401
- vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias
402
- vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight
403
- vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias
404
- vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight
405
- vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias
406
- vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight
407
- vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k
408
- vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v
409
- vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias
410
- vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight
411
- vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias
412
- vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight
413
- vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias
414
- vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight
415
- vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias
416
- vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight
417
- vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k
418
- vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v
419
- vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias
420
- vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight
421
- vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias
422
- vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight
423
- vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias
424
- vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight
425
- vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias
426
- vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight
427
- vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k
428
- vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v
429
- vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias
430
- vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight
431
- vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias
432
- vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight
433
- vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias
434
- vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight
435
- vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias
436
- vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight
437
- vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k
438
- vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v
439
- vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias
440
- vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight
441
- vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias
442
- vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight
443
- vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias
444
- vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight
445
- vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias
446
- vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight
447
- vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k
448
- vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v
449
- vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias
450
- vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight
451
- vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias
452
- vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight
453
- vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias
454
- vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight
455
- vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias
456
- vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight
457
- vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k
458
- vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v
459
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias
460
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight
461
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias
462
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight
463
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias
464
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight
465
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias
466
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight
467
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias
468
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight
469
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias
470
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight
471
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias
472
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight
473
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias
474
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight
475
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias
476
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight
477
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias
478
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight
479
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias
480
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight
481
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias
482
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight
483
- vq_model.enc_p.encoder_text.norm_layers_1.0.beta
484
- vq_model.enc_p.encoder_text.norm_layers_1.0.gamma
485
- vq_model.enc_p.encoder_text.norm_layers_1.1.beta
486
- vq_model.enc_p.encoder_text.norm_layers_1.1.gamma
487
- vq_model.enc_p.encoder_text.norm_layers_1.2.beta
488
- vq_model.enc_p.encoder_text.norm_layers_1.2.gamma
489
- vq_model.enc_p.encoder_text.norm_layers_1.3.beta
490
- vq_model.enc_p.encoder_text.norm_layers_1.3.gamma
491
- vq_model.enc_p.encoder_text.norm_layers_1.4.beta
492
- vq_model.enc_p.encoder_text.norm_layers_1.4.gamma
493
- vq_model.enc_p.encoder_text.norm_layers_1.5.beta
494
- vq_model.enc_p.encoder_text.norm_layers_1.5.gamma
495
- vq_model.enc_p.encoder_text.norm_layers_2.0.beta
496
- vq_model.enc_p.encoder_text.norm_layers_2.0.gamma
497
- vq_model.enc_p.encoder_text.norm_layers_2.1.beta
498
- vq_model.enc_p.encoder_text.norm_layers_2.1.gamma
499
- vq_model.enc_p.encoder_text.norm_layers_2.2.beta
500
- vq_model.enc_p.encoder_text.norm_layers_2.2.gamma
501
- vq_model.enc_p.encoder_text.norm_layers_2.3.beta
502
- vq_model.enc_p.encoder_text.norm_layers_2.3.gamma
503
- vq_model.enc_p.encoder_text.norm_layers_2.4.beta
504
- vq_model.enc_p.encoder_text.norm_layers_2.4.gamma
505
- vq_model.enc_p.encoder_text.norm_layers_2.5.beta
506
- vq_model.enc_p.encoder_text.norm_layers_2.5.gamma
507
- vq_model.enc_p.mrte.c_post.bias
508
- vq_model.enc_p.mrte.c_post.weight
509
- vq_model.enc_p.mrte.c_pre.bias
510
- vq_model.enc_p.mrte.c_pre.weight
511
- vq_model.enc_p.mrte.cross_attention.conv_k.bias
512
- vq_model.enc_p.mrte.cross_attention.conv_k.weight
513
- vq_model.enc_p.mrte.cross_attention.conv_o.bias
514
- vq_model.enc_p.mrte.cross_attention.conv_o.weight
515
- vq_model.enc_p.mrte.cross_attention.conv_q.bias
516
- vq_model.enc_p.mrte.cross_attention.conv_q.weight
517
- vq_model.enc_p.mrte.cross_attention.conv_v.bias
518
- vq_model.enc_p.mrte.cross_attention.conv_v.weight
519
- vq_model.enc_p.mrte.text_pre.bias
520
- vq_model.enc_p.mrte.text_pre.weight
521
- vq_model.enc_p.proj.bias
522
- vq_model.enc_p.proj.weight
523
- vq_model.enc_p.ssl_proj.bias
524
- vq_model.enc_p.ssl_proj.weight
525
- vq_model.enc_p.text_embedding.weight
526
- vq_model.flow.flows.0.enc.cond_layer.bias
527
- vq_model.flow.flows.0.enc.cond_layer.weight_g
528
- vq_model.flow.flows.0.enc.cond_layer.weight_v
529
- vq_model.flow.flows.0.enc.in_layers.0.bias
530
- vq_model.flow.flows.0.enc.in_layers.0.weight_g
531
- vq_model.flow.flows.0.enc.in_layers.0.weight_v
532
- vq_model.flow.flows.0.enc.in_layers.1.bias
533
- vq_model.flow.flows.0.enc.in_layers.1.weight_g
534
- vq_model.flow.flows.0.enc.in_layers.1.weight_v
535
- vq_model.flow.flows.0.enc.in_layers.2.bias
536
- vq_model.flow.flows.0.enc.in_layers.2.weight_g
537
- vq_model.flow.flows.0.enc.in_layers.2.weight_v
538
- vq_model.flow.flows.0.enc.in_layers.3.bias
539
- vq_model.flow.flows.0.enc.in_layers.3.weight_g
540
- vq_model.flow.flows.0.enc.in_layers.3.weight_v
541
- vq_model.flow.flows.0.enc.res_skip_layers.0.bias
542
- vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g
543
- vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v
544
- vq_model.flow.flows.0.enc.res_skip_layers.1.bias
545
- vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g
546
- vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v
547
- vq_model.flow.flows.0.enc.res_skip_layers.2.bias
548
- vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g
549
- vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v
550
- vq_model.flow.flows.0.enc.res_skip_layers.3.bias
551
- vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g
552
- vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v
553
- vq_model.flow.flows.0.post.bias
554
- vq_model.flow.flows.0.post.weight
555
- vq_model.flow.flows.0.pre.bias
556
- vq_model.flow.flows.0.pre.weight
557
- vq_model.flow.flows.2.enc.cond_layer.bias
558
- vq_model.flow.flows.2.enc.cond_layer.weight_g
559
- vq_model.flow.flows.2.enc.cond_layer.weight_v
560
- vq_model.flow.flows.2.enc.in_layers.0.bias
561
- vq_model.flow.flows.2.enc.in_layers.0.weight_g
562
- vq_model.flow.flows.2.enc.in_layers.0.weight_v
563
- vq_model.flow.flows.2.enc.in_layers.1.bias
564
- vq_model.flow.flows.2.enc.in_layers.1.weight_g
565
- vq_model.flow.flows.2.enc.in_layers.1.weight_v
566
- vq_model.flow.flows.2.enc.in_layers.2.bias
567
- vq_model.flow.flows.2.enc.in_layers.2.weight_g
568
- vq_model.flow.flows.2.enc.in_layers.2.weight_v
569
- vq_model.flow.flows.2.enc.in_layers.3.bias
570
- vq_model.flow.flows.2.enc.in_layers.3.weight_g
571
- vq_model.flow.flows.2.enc.in_layers.3.weight_v
572
- vq_model.flow.flows.2.enc.res_skip_layers.0.bias
573
- vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g
574
- vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v
575
- vq_model.flow.flows.2.enc.res_skip_layers.1.bias
576
- vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g
577
- vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v
578
- vq_model.flow.flows.2.enc.res_skip_layers.2.bias
579
- vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g
580
- vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v
581
- vq_model.flow.flows.2.enc.res_skip_layers.3.bias
582
- vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g
583
- vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v
584
- vq_model.flow.flows.2.post.bias
585
- vq_model.flow.flows.2.post.weight
586
- vq_model.flow.flows.2.pre.bias
587
- vq_model.flow.flows.2.pre.weight
588
- vq_model.flow.flows.4.enc.cond_layer.bias
589
- vq_model.flow.flows.4.enc.cond_layer.weight_g
590
- vq_model.flow.flows.4.enc.cond_layer.weight_v
591
- vq_model.flow.flows.4.enc.in_layers.0.bias
592
- vq_model.flow.flows.4.enc.in_layers.0.weight_g
593
- vq_model.flow.flows.4.enc.in_layers.0.weight_v
594
- vq_model.flow.flows.4.enc.in_layers.1.bias
595
- vq_model.flow.flows.4.enc.in_layers.1.weight_g
596
- vq_model.flow.flows.4.enc.in_layers.1.weight_v
597
- vq_model.flow.flows.4.enc.in_layers.2.bias
598
- vq_model.flow.flows.4.enc.in_layers.2.weight_g
599
- vq_model.flow.flows.4.enc.in_layers.2.weight_v
600
- vq_model.flow.flows.4.enc.in_layers.3.bias
601
- vq_model.flow.flows.4.enc.in_layers.3.weight_g
602
- vq_model.flow.flows.4.enc.in_layers.3.weight_v
603
- vq_model.flow.flows.4.enc.res_skip_layers.0.bias
604
- vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g
605
- vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v
606
- vq_model.flow.flows.4.enc.res_skip_layers.1.bias
607
- vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g
608
- vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v
609
- vq_model.flow.flows.4.enc.res_skip_layers.2.bias
610
- vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g
611
- vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v
612
- vq_model.flow.flows.4.enc.res_skip_layers.3.bias
613
- vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g
614
- vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v
615
- vq_model.flow.flows.4.post.bias
616
- vq_model.flow.flows.4.post.weight
617
- vq_model.flow.flows.4.pre.bias
618
- vq_model.flow.flows.4.pre.weight
619
- vq_model.flow.flows.6.enc.cond_layer.bias
620
- vq_model.flow.flows.6.enc.cond_layer.weight_g
621
- vq_model.flow.flows.6.enc.cond_layer.weight_v
622
- vq_model.flow.flows.6.enc.in_layers.0.bias
623
- vq_model.flow.flows.6.enc.in_layers.0.weight_g
624
- vq_model.flow.flows.6.enc.in_layers.0.weight_v
625
- vq_model.flow.flows.6.enc.in_layers.1.bias
626
- vq_model.flow.flows.6.enc.in_layers.1.weight_g
627
- vq_model.flow.flows.6.enc.in_layers.1.weight_v
628
- vq_model.flow.flows.6.enc.in_layers.2.bias
629
- vq_model.flow.flows.6.enc.in_layers.2.weight_g
630
- vq_model.flow.flows.6.enc.in_layers.2.weight_v
631
- vq_model.flow.flows.6.enc.in_layers.3.bias
632
- vq_model.flow.flows.6.enc.in_layers.3.weight_g
633
- vq_model.flow.flows.6.enc.in_layers.3.weight_v
634
- vq_model.flow.flows.6.enc.res_skip_layers.0.bias
635
- vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g
636
- vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v
637
- vq_model.flow.flows.6.enc.res_skip_layers.1.bias
638
- vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g
639
- vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v
640
- vq_model.flow.flows.6.enc.res_skip_layers.2.bias
641
- vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g
642
- vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v
643
- vq_model.flow.flows.6.enc.res_skip_layers.3.bias
644
- vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g
645
- vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v
646
- vq_model.flow.flows.6.post.bias
647
- vq_model.flow.flows.6.post.weight
648
- vq_model.flow.flows.6.pre.bias
649
- vq_model.flow.flows.6.pre.weight
650
- vq_model.quantizer.vq.layers.0._codebook.embed
651
- vq_model.ref_enc.fc.fc.bias
652
- vq_model.ref_enc.fc.fc.weight
653
- vq_model.ref_enc.slf_attn.fc.bias
654
- vq_model.ref_enc.slf_attn.fc.weight
655
- vq_model.ref_enc.slf_attn.w_ks.bias
656
- vq_model.ref_enc.slf_attn.w_ks.weight
657
- vq_model.ref_enc.slf_attn.w_qs.bias
658
- vq_model.ref_enc.slf_attn.w_qs.weight
659
- vq_model.ref_enc.slf_attn.w_vs.bias
660
- vq_model.ref_enc.slf_attn.w_vs.weight
661
- vq_model.ref_enc.spectral.0.fc.bias
662
- vq_model.ref_enc.spectral.0.fc.weight
663
- vq_model.ref_enc.spectral.3.fc.bias
664
- vq_model.ref_enc.spectral.3.fc.weight
665
- vq_model.ref_enc.temporal.0.conv1.conv.bias
666
- vq_model.ref_enc.temporal.0.conv1.conv.weight
667
- vq_model.ref_enc.temporal.1.conv1.conv.bias
668
- vq_model.ref_enc.temporal.1.conv1.conv.weight
 
1
+ vq_model.dec.cond.bias
2
+ vq_model.dec.cond.weight
3
+ vq_model.dec.conv_post.weight
4
+ vq_model.dec.conv_pre.bias
5
+ vq_model.dec.conv_pre.weight
6
+ vq_model.dec.resblocks.0.convs1.0.bias
7
+ vq_model.dec.resblocks.0.convs1.0.weight_g
8
+ vq_model.dec.resblocks.0.convs1.0.weight_v
9
+ vq_model.dec.resblocks.0.convs1.1.bias
10
+ vq_model.dec.resblocks.0.convs1.1.weight_g
11
+ vq_model.dec.resblocks.0.convs1.1.weight_v
12
+ vq_model.dec.resblocks.0.convs1.2.bias
13
+ vq_model.dec.resblocks.0.convs1.2.weight_g
14
+ vq_model.dec.resblocks.0.convs1.2.weight_v
15
+ vq_model.dec.resblocks.0.convs2.0.bias
16
+ vq_model.dec.resblocks.0.convs2.0.weight_g
17
+ vq_model.dec.resblocks.0.convs2.0.weight_v
18
+ vq_model.dec.resblocks.0.convs2.1.bias
19
+ vq_model.dec.resblocks.0.convs2.1.weight_g
20
+ vq_model.dec.resblocks.0.convs2.1.weight_v
21
+ vq_model.dec.resblocks.0.convs2.2.bias
22
+ vq_model.dec.resblocks.0.convs2.2.weight_g
23
+ vq_model.dec.resblocks.0.convs2.2.weight_v
24
+ vq_model.dec.resblocks.1.convs1.0.bias
25
+ vq_model.dec.resblocks.1.convs1.0.weight_g
26
+ vq_model.dec.resblocks.1.convs1.0.weight_v
27
+ vq_model.dec.resblocks.1.convs1.1.bias
28
+ vq_model.dec.resblocks.1.convs1.1.weight_g
29
+ vq_model.dec.resblocks.1.convs1.1.weight_v
30
+ vq_model.dec.resblocks.1.convs1.2.bias
31
+ vq_model.dec.resblocks.1.convs1.2.weight_g
32
+ vq_model.dec.resblocks.1.convs1.2.weight_v
33
+ vq_model.dec.resblocks.1.convs2.0.bias
34
+ vq_model.dec.resblocks.1.convs2.0.weight_g
35
+ vq_model.dec.resblocks.1.convs2.0.weight_v
36
+ vq_model.dec.resblocks.1.convs2.1.bias
37
+ vq_model.dec.resblocks.1.convs2.1.weight_g
38
+ vq_model.dec.resblocks.1.convs2.1.weight_v
39
+ vq_model.dec.resblocks.1.convs2.2.bias
40
+ vq_model.dec.resblocks.1.convs2.2.weight_g
41
+ vq_model.dec.resblocks.1.convs2.2.weight_v
42
+ vq_model.dec.resblocks.10.convs1.0.bias
43
+ vq_model.dec.resblocks.10.convs1.0.weight_g
44
+ vq_model.dec.resblocks.10.convs1.0.weight_v
45
+ vq_model.dec.resblocks.10.convs1.1.bias
46
+ vq_model.dec.resblocks.10.convs1.1.weight_g
47
+ vq_model.dec.resblocks.10.convs1.1.weight_v
48
+ vq_model.dec.resblocks.10.convs1.2.bias
49
+ vq_model.dec.resblocks.10.convs1.2.weight_g
50
+ vq_model.dec.resblocks.10.convs1.2.weight_v
51
+ vq_model.dec.resblocks.10.convs2.0.bias
52
+ vq_model.dec.resblocks.10.convs2.0.weight_g
53
+ vq_model.dec.resblocks.10.convs2.0.weight_v
54
+ vq_model.dec.resblocks.10.convs2.1.bias
55
+ vq_model.dec.resblocks.10.convs2.1.weight_g
56
+ vq_model.dec.resblocks.10.convs2.1.weight_v
57
+ vq_model.dec.resblocks.10.convs2.2.bias
58
+ vq_model.dec.resblocks.10.convs2.2.weight_g
59
+ vq_model.dec.resblocks.10.convs2.2.weight_v
60
+ vq_model.dec.resblocks.11.convs1.0.bias
61
+ vq_model.dec.resblocks.11.convs1.0.weight_g
62
+ vq_model.dec.resblocks.11.convs1.0.weight_v
63
+ vq_model.dec.resblocks.11.convs1.1.bias
64
+ vq_model.dec.resblocks.11.convs1.1.weight_g
65
+ vq_model.dec.resblocks.11.convs1.1.weight_v
66
+ vq_model.dec.resblocks.11.convs1.2.bias
67
+ vq_model.dec.resblocks.11.convs1.2.weight_g
68
+ vq_model.dec.resblocks.11.convs1.2.weight_v
69
+ vq_model.dec.resblocks.11.convs2.0.bias
70
+ vq_model.dec.resblocks.11.convs2.0.weight_g
71
+ vq_model.dec.resblocks.11.convs2.0.weight_v
72
+ vq_model.dec.resblocks.11.convs2.1.bias
73
+ vq_model.dec.resblocks.11.convs2.1.weight_g
74
+ vq_model.dec.resblocks.11.convs2.1.weight_v
75
+ vq_model.dec.resblocks.11.convs2.2.bias
76
+ vq_model.dec.resblocks.11.convs2.2.weight_g
77
+ vq_model.dec.resblocks.11.convs2.2.weight_v
78
+ vq_model.dec.resblocks.12.convs1.0.bias
79
+ vq_model.dec.resblocks.12.convs1.0.weight_g
80
+ vq_model.dec.resblocks.12.convs1.0.weight_v
81
+ vq_model.dec.resblocks.12.convs1.1.bias
82
+ vq_model.dec.resblocks.12.convs1.1.weight_g
83
+ vq_model.dec.resblocks.12.convs1.1.weight_v
84
+ vq_model.dec.resblocks.12.convs1.2.bias
85
+ vq_model.dec.resblocks.12.convs1.2.weight_g
86
+ vq_model.dec.resblocks.12.convs1.2.weight_v
87
+ vq_model.dec.resblocks.12.convs2.0.bias
88
+ vq_model.dec.resblocks.12.convs2.0.weight_g
89
+ vq_model.dec.resblocks.12.convs2.0.weight_v
90
+ vq_model.dec.resblocks.12.convs2.1.bias
91
+ vq_model.dec.resblocks.12.convs2.1.weight_g
92
+ vq_model.dec.resblocks.12.convs2.1.weight_v
93
+ vq_model.dec.resblocks.12.convs2.2.bias
94
+ vq_model.dec.resblocks.12.convs2.2.weight_g
95
+ vq_model.dec.resblocks.12.convs2.2.weight_v
96
+ vq_model.dec.resblocks.13.convs1.0.bias
97
+ vq_model.dec.resblocks.13.convs1.0.weight_g
98
+ vq_model.dec.resblocks.13.convs1.0.weight_v
99
+ vq_model.dec.resblocks.13.convs1.1.bias
100
+ vq_model.dec.resblocks.13.convs1.1.weight_g
101
+ vq_model.dec.resblocks.13.convs1.1.weight_v
102
+ vq_model.dec.resblocks.13.convs1.2.bias
103
+ vq_model.dec.resblocks.13.convs1.2.weight_g
104
+ vq_model.dec.resblocks.13.convs1.2.weight_v
105
+ vq_model.dec.resblocks.13.convs2.0.bias
106
+ vq_model.dec.resblocks.13.convs2.0.weight_g
107
+ vq_model.dec.resblocks.13.convs2.0.weight_v
108
+ vq_model.dec.resblocks.13.convs2.1.bias
109
+ vq_model.dec.resblocks.13.convs2.1.weight_g
110
+ vq_model.dec.resblocks.13.convs2.1.weight_v
111
+ vq_model.dec.resblocks.13.convs2.2.bias
112
+ vq_model.dec.resblocks.13.convs2.2.weight_g
113
+ vq_model.dec.resblocks.13.convs2.2.weight_v
114
+ vq_model.dec.resblocks.14.convs1.0.bias
115
+ vq_model.dec.resblocks.14.convs1.0.weight_g
116
+ vq_model.dec.resblocks.14.convs1.0.weight_v
117
+ vq_model.dec.resblocks.14.convs1.1.bias
118
+ vq_model.dec.resblocks.14.convs1.1.weight_g
119
+ vq_model.dec.resblocks.14.convs1.1.weight_v
120
+ vq_model.dec.resblocks.14.convs1.2.bias
121
+ vq_model.dec.resblocks.14.convs1.2.weight_g
122
+ vq_model.dec.resblocks.14.convs1.2.weight_v
123
+ vq_model.dec.resblocks.14.convs2.0.bias
124
+ vq_model.dec.resblocks.14.convs2.0.weight_g
125
+ vq_model.dec.resblocks.14.convs2.0.weight_v
126
+ vq_model.dec.resblocks.14.convs2.1.bias
127
+ vq_model.dec.resblocks.14.convs2.1.weight_g
128
+ vq_model.dec.resblocks.14.convs2.1.weight_v
129
+ vq_model.dec.resblocks.14.convs2.2.bias
130
+ vq_model.dec.resblocks.14.convs2.2.weight_g
131
+ vq_model.dec.resblocks.14.convs2.2.weight_v
132
+ vq_model.dec.resblocks.2.convs1.0.bias
133
+ vq_model.dec.resblocks.2.convs1.0.weight_g
134
+ vq_model.dec.resblocks.2.convs1.0.weight_v
135
+ vq_model.dec.resblocks.2.convs1.1.bias
136
+ vq_model.dec.resblocks.2.convs1.1.weight_g
137
+ vq_model.dec.resblocks.2.convs1.1.weight_v
138
+ vq_model.dec.resblocks.2.convs1.2.bias
139
+ vq_model.dec.resblocks.2.convs1.2.weight_g
140
+ vq_model.dec.resblocks.2.convs1.2.weight_v
141
+ vq_model.dec.resblocks.2.convs2.0.bias
142
+ vq_model.dec.resblocks.2.convs2.0.weight_g
143
+ vq_model.dec.resblocks.2.convs2.0.weight_v
144
+ vq_model.dec.resblocks.2.convs2.1.bias
145
+ vq_model.dec.resblocks.2.convs2.1.weight_g
146
+ vq_model.dec.resblocks.2.convs2.1.weight_v
147
+ vq_model.dec.resblocks.2.convs2.2.bias
148
+ vq_model.dec.resblocks.2.convs2.2.weight_g
149
+ vq_model.dec.resblocks.2.convs2.2.weight_v
150
+ vq_model.dec.resblocks.3.convs1.0.bias
151
+ vq_model.dec.resblocks.3.convs1.0.weight_g
152
+ vq_model.dec.resblocks.3.convs1.0.weight_v
153
+ vq_model.dec.resblocks.3.convs1.1.bias
154
+ vq_model.dec.resblocks.3.convs1.1.weight_g
155
+ vq_model.dec.resblocks.3.convs1.1.weight_v
156
+ vq_model.dec.resblocks.3.convs1.2.bias
157
+ vq_model.dec.resblocks.3.convs1.2.weight_g
158
+ vq_model.dec.resblocks.3.convs1.2.weight_v
159
+ vq_model.dec.resblocks.3.convs2.0.bias
160
+ vq_model.dec.resblocks.3.convs2.0.weight_g
161
+ vq_model.dec.resblocks.3.convs2.0.weight_v
162
+ vq_model.dec.resblocks.3.convs2.1.bias
163
+ vq_model.dec.resblocks.3.convs2.1.weight_g
164
+ vq_model.dec.resblocks.3.convs2.1.weight_v
165
+ vq_model.dec.resblocks.3.convs2.2.bias
166
+ vq_model.dec.resblocks.3.convs2.2.weight_g
167
+ vq_model.dec.resblocks.3.convs2.2.weight_v
168
+ vq_model.dec.resblocks.4.convs1.0.bias
169
+ vq_model.dec.resblocks.4.convs1.0.weight_g
170
+ vq_model.dec.resblocks.4.convs1.0.weight_v
171
+ vq_model.dec.resblocks.4.convs1.1.bias
172
+ vq_model.dec.resblocks.4.convs1.1.weight_g
173
+ vq_model.dec.resblocks.4.convs1.1.weight_v
174
+ vq_model.dec.resblocks.4.convs1.2.bias
175
+ vq_model.dec.resblocks.4.convs1.2.weight_g
176
+ vq_model.dec.resblocks.4.convs1.2.weight_v
177
+ vq_model.dec.resblocks.4.convs2.0.bias
178
+ vq_model.dec.resblocks.4.convs2.0.weight_g
179
+ vq_model.dec.resblocks.4.convs2.0.weight_v
180
+ vq_model.dec.resblocks.4.convs2.1.bias
181
+ vq_model.dec.resblocks.4.convs2.1.weight_g
182
+ vq_model.dec.resblocks.4.convs2.1.weight_v
183
+ vq_model.dec.resblocks.4.convs2.2.bias
184
+ vq_model.dec.resblocks.4.convs2.2.weight_g
185
+ vq_model.dec.resblocks.4.convs2.2.weight_v
186
+ vq_model.dec.resblocks.5.convs1.0.bias
187
+ vq_model.dec.resblocks.5.convs1.0.weight_g
188
+ vq_model.dec.resblocks.5.convs1.0.weight_v
189
+ vq_model.dec.resblocks.5.convs1.1.bias
190
+ vq_model.dec.resblocks.5.convs1.1.weight_g
191
+ vq_model.dec.resblocks.5.convs1.1.weight_v
192
+ vq_model.dec.resblocks.5.convs1.2.bias
193
+ vq_model.dec.resblocks.5.convs1.2.weight_g
194
+ vq_model.dec.resblocks.5.convs1.2.weight_v
195
+ vq_model.dec.resblocks.5.convs2.0.bias
196
+ vq_model.dec.resblocks.5.convs2.0.weight_g
197
+ vq_model.dec.resblocks.5.convs2.0.weight_v
198
+ vq_model.dec.resblocks.5.convs2.1.bias
199
+ vq_model.dec.resblocks.5.convs2.1.weight_g
200
+ vq_model.dec.resblocks.5.convs2.1.weight_v
201
+ vq_model.dec.resblocks.5.convs2.2.bias
202
+ vq_model.dec.resblocks.5.convs2.2.weight_g
203
+ vq_model.dec.resblocks.5.convs2.2.weight_v
204
+ vq_model.dec.resblocks.6.convs1.0.bias
205
+ vq_model.dec.resblocks.6.convs1.0.weight_g
206
+ vq_model.dec.resblocks.6.convs1.0.weight_v
207
+ vq_model.dec.resblocks.6.convs1.1.bias
208
+ vq_model.dec.resblocks.6.convs1.1.weight_g
209
+ vq_model.dec.resblocks.6.convs1.1.weight_v
210
+ vq_model.dec.resblocks.6.convs1.2.bias
211
+ vq_model.dec.resblocks.6.convs1.2.weight_g
212
+ vq_model.dec.resblocks.6.convs1.2.weight_v
213
+ vq_model.dec.resblocks.6.convs2.0.bias
214
+ vq_model.dec.resblocks.6.convs2.0.weight_g
215
+ vq_model.dec.resblocks.6.convs2.0.weight_v
216
+ vq_model.dec.resblocks.6.convs2.1.bias
217
+ vq_model.dec.resblocks.6.convs2.1.weight_g
218
+ vq_model.dec.resblocks.6.convs2.1.weight_v
219
+ vq_model.dec.resblocks.6.convs2.2.bias
220
+ vq_model.dec.resblocks.6.convs2.2.weight_g
221
+ vq_model.dec.resblocks.6.convs2.2.weight_v
222
+ vq_model.dec.resblocks.7.convs1.0.bias
223
+ vq_model.dec.resblocks.7.convs1.0.weight_g
224
+ vq_model.dec.resblocks.7.convs1.0.weight_v
225
+ vq_model.dec.resblocks.7.convs1.1.bias
226
+ vq_model.dec.resblocks.7.convs1.1.weight_g
227
+ vq_model.dec.resblocks.7.convs1.1.weight_v
228
+ vq_model.dec.resblocks.7.convs1.2.bias
229
+ vq_model.dec.resblocks.7.convs1.2.weight_g
230
+ vq_model.dec.resblocks.7.convs1.2.weight_v
231
+ vq_model.dec.resblocks.7.convs2.0.bias
232
+ vq_model.dec.resblocks.7.convs2.0.weight_g
233
+ vq_model.dec.resblocks.7.convs2.0.weight_v
234
+ vq_model.dec.resblocks.7.convs2.1.bias
235
+ vq_model.dec.resblocks.7.convs2.1.weight_g
236
+ vq_model.dec.resblocks.7.convs2.1.weight_v
237
+ vq_model.dec.resblocks.7.convs2.2.bias
238
+ vq_model.dec.resblocks.7.convs2.2.weight_g
239
+ vq_model.dec.resblocks.7.convs2.2.weight_v
240
+ vq_model.dec.resblocks.8.convs1.0.bias
241
+ vq_model.dec.resblocks.8.convs1.0.weight_g
242
+ vq_model.dec.resblocks.8.convs1.0.weight_v
243
+ vq_model.dec.resblocks.8.convs1.1.bias
244
+ vq_model.dec.resblocks.8.convs1.1.weight_g
245
+ vq_model.dec.resblocks.8.convs1.1.weight_v
246
+ vq_model.dec.resblocks.8.convs1.2.bias
247
+ vq_model.dec.resblocks.8.convs1.2.weight_g
248
+ vq_model.dec.resblocks.8.convs1.2.weight_v
249
+ vq_model.dec.resblocks.8.convs2.0.bias
250
+ vq_model.dec.resblocks.8.convs2.0.weight_g
251
+ vq_model.dec.resblocks.8.convs2.0.weight_v
252
+ vq_model.dec.resblocks.8.convs2.1.bias
253
+ vq_model.dec.resblocks.8.convs2.1.weight_g
254
+ vq_model.dec.resblocks.8.convs2.1.weight_v
255
+ vq_model.dec.resblocks.8.convs2.2.bias
256
+ vq_model.dec.resblocks.8.convs2.2.weight_g
257
+ vq_model.dec.resblocks.8.convs2.2.weight_v
258
+ vq_model.dec.resblocks.9.convs1.0.bias
259
+ vq_model.dec.resblocks.9.convs1.0.weight_g
260
+ vq_model.dec.resblocks.9.convs1.0.weight_v
261
+ vq_model.dec.resblocks.9.convs1.1.bias
262
+ vq_model.dec.resblocks.9.convs1.1.weight_g
263
+ vq_model.dec.resblocks.9.convs1.1.weight_v
264
+ vq_model.dec.resblocks.9.convs1.2.bias
265
+ vq_model.dec.resblocks.9.convs1.2.weight_g
266
+ vq_model.dec.resblocks.9.convs1.2.weight_v
267
+ vq_model.dec.resblocks.9.convs2.0.bias
268
+ vq_model.dec.resblocks.9.convs2.0.weight_g
269
+ vq_model.dec.resblocks.9.convs2.0.weight_v
270
+ vq_model.dec.resblocks.9.convs2.1.bias
271
+ vq_model.dec.resblocks.9.convs2.1.weight_g
272
+ vq_model.dec.resblocks.9.convs2.1.weight_v
273
+ vq_model.dec.resblocks.9.convs2.2.bias
274
+ vq_model.dec.resblocks.9.convs2.2.weight_g
275
+ vq_model.dec.resblocks.9.convs2.2.weight_v
276
+ vq_model.dec.ups.0.bias
277
+ vq_model.dec.ups.0.weight_g
278
+ vq_model.dec.ups.0.weight_v
279
+ vq_model.dec.ups.1.bias
280
+ vq_model.dec.ups.1.weight_g
281
+ vq_model.dec.ups.1.weight_v
282
+ vq_model.dec.ups.2.bias
283
+ vq_model.dec.ups.2.weight_g
284
+ vq_model.dec.ups.2.weight_v
285
+ vq_model.dec.ups.3.bias
286
+ vq_model.dec.ups.3.weight_g
287
+ vq_model.dec.ups.3.weight_v
288
+ vq_model.dec.ups.4.bias
289
+ vq_model.dec.ups.4.weight_g
290
+ vq_model.dec.ups.4.weight_v
291
+ vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias
292
+ vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight
293
+ vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias
294
+ vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight
295
+ vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias
296
+ vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight
297
+ vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias
298
+ vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight
299
+ vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k
300
+ vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v
301
+ vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias
302
+ vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight
303
+ vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias
304
+ vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight
305
+ vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias
306
+ vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight
307
+ vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias
308
+ vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight
309
+ vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k
310
+ vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v
311
+ vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias
312
+ vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight
313
+ vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias
314
+ vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight
315
+ vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias
316
+ vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight
317
+ vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias
318
+ vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight
319
+ vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k
320
+ vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v
321
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias
322
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight
323
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias
324
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight
325
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias
326
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight
327
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias
328
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight
329
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias
330
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight
331
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias
332
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight
333
+ vq_model.enc_p.encoder2.norm_layers_1.0.beta
334
+ vq_model.enc_p.encoder2.norm_layers_1.0.gamma
335
+ vq_model.enc_p.encoder2.norm_layers_1.1.beta
336
+ vq_model.enc_p.encoder2.norm_layers_1.1.gamma
337
+ vq_model.enc_p.encoder2.norm_layers_1.2.beta
338
+ vq_model.enc_p.encoder2.norm_layers_1.2.gamma
339
+ vq_model.enc_p.encoder2.norm_layers_2.0.beta
340
+ vq_model.enc_p.encoder2.norm_layers_2.0.gamma
341
+ vq_model.enc_p.encoder2.norm_layers_2.1.beta
342
+ vq_model.enc_p.encoder2.norm_layers_2.1.gamma
343
+ vq_model.enc_p.encoder2.norm_layers_2.2.beta
344
+ vq_model.enc_p.encoder2.norm_layers_2.2.gamma
345
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias
346
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight
347
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias
348
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight
349
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias
350
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight
351
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias
352
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight
353
+ vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k
354
+ vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v
355
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias
356
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight
357
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias
358
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight
359
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias
360
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight
361
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias
362
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight
363
+ vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k
364
+ vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v
365
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias
366
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight
367
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias
368
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight
369
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias
370
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight
371
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias
372
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight
373
+ vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k
374
+ vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v
375
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias
376
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight
377
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias
378
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight
379
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias
380
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight
381
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias
382
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight
383
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias
384
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight
385
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias
386
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight
387
+ vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta
388
+ vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma
389
+ vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta
390
+ vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma
391
+ vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta
392
+ vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma
393
+ vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta
394
+ vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma
395
+ vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta
396
+ vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma
397
+ vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta
398
+ vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma
399
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias
400
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight
401
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias
402
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight
403
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias
404
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight
405
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias
406
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight
407
+ vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k
408
+ vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v
409
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias
410
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight
411
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias
412
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight
413
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias
414
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight
415
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias
416
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight
417
+ vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k
418
+ vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v
419
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias
420
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight
421
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias
422
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight
423
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias
424
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight
425
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias
426
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight
427
+ vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k
428
+ vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v
429
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias
430
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight
431
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias
432
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight
433
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias
434
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight
435
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias
436
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight
437
+ vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k
438
+ vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v
439
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias
440
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight
441
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias
442
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight
443
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias
444
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight
445
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias
446
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight
447
+ vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k
448
+ vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v
449
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias
450
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight
451
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias
452
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight
453
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias
454
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight
455
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias
456
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight
457
+ vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k
458
+ vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v
459
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias
460
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight
461
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias
462
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight
463
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias
464
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight
465
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias
466
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight
467
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias
468
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight
469
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias
470
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight
471
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias
472
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight
473
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias
474
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight
475
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias
476
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight
477
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias
478
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight
479
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias
480
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight
481
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias
482
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight
483
+ vq_model.enc_p.encoder_text.norm_layers_1.0.beta
484
+ vq_model.enc_p.encoder_text.norm_layers_1.0.gamma
485
+ vq_model.enc_p.encoder_text.norm_layers_1.1.beta
486
+ vq_model.enc_p.encoder_text.norm_layers_1.1.gamma
487
+ vq_model.enc_p.encoder_text.norm_layers_1.2.beta
488
+ vq_model.enc_p.encoder_text.norm_layers_1.2.gamma
489
+ vq_model.enc_p.encoder_text.norm_layers_1.3.beta
490
+ vq_model.enc_p.encoder_text.norm_layers_1.3.gamma
491
+ vq_model.enc_p.encoder_text.norm_layers_1.4.beta
492
+ vq_model.enc_p.encoder_text.norm_layers_1.4.gamma
493
+ vq_model.enc_p.encoder_text.norm_layers_1.5.beta
494
+ vq_model.enc_p.encoder_text.norm_layers_1.5.gamma
495
+ vq_model.enc_p.encoder_text.norm_layers_2.0.beta
496
+ vq_model.enc_p.encoder_text.norm_layers_2.0.gamma
497
+ vq_model.enc_p.encoder_text.norm_layers_2.1.beta
498
+ vq_model.enc_p.encoder_text.norm_layers_2.1.gamma
499
+ vq_model.enc_p.encoder_text.norm_layers_2.2.beta
500
+ vq_model.enc_p.encoder_text.norm_layers_2.2.gamma
501
+ vq_model.enc_p.encoder_text.norm_layers_2.3.beta
502
+ vq_model.enc_p.encoder_text.norm_layers_2.3.gamma
503
+ vq_model.enc_p.encoder_text.norm_layers_2.4.beta
504
+ vq_model.enc_p.encoder_text.norm_layers_2.4.gamma
505
+ vq_model.enc_p.encoder_text.norm_layers_2.5.beta
506
+ vq_model.enc_p.encoder_text.norm_layers_2.5.gamma
507
+ vq_model.enc_p.mrte.c_post.bias
508
+ vq_model.enc_p.mrte.c_post.weight
509
+ vq_model.enc_p.mrte.c_pre.bias
510
+ vq_model.enc_p.mrte.c_pre.weight
511
+ vq_model.enc_p.mrte.cross_attention.conv_k.bias
512
+ vq_model.enc_p.mrte.cross_attention.conv_k.weight
513
+ vq_model.enc_p.mrte.cross_attention.conv_o.bias
514
+ vq_model.enc_p.mrte.cross_attention.conv_o.weight
515
+ vq_model.enc_p.mrte.cross_attention.conv_q.bias
516
+ vq_model.enc_p.mrte.cross_attention.conv_q.weight
517
+ vq_model.enc_p.mrte.cross_attention.conv_v.bias
518
+ vq_model.enc_p.mrte.cross_attention.conv_v.weight
519
+ vq_model.enc_p.mrte.text_pre.bias
520
+ vq_model.enc_p.mrte.text_pre.weight
521
+ vq_model.enc_p.proj.bias
522
+ vq_model.enc_p.proj.weight
523
+ vq_model.enc_p.ssl_proj.bias
524
+ vq_model.enc_p.ssl_proj.weight
525
+ vq_model.enc_p.text_embedding.weight
526
+ vq_model.flow.flows.0.enc.cond_layer.bias
527
+ vq_model.flow.flows.0.enc.cond_layer.weight_g
528
+ vq_model.flow.flows.0.enc.cond_layer.weight_v
529
+ vq_model.flow.flows.0.enc.in_layers.0.bias
530
+ vq_model.flow.flows.0.enc.in_layers.0.weight_g
531
+ vq_model.flow.flows.0.enc.in_layers.0.weight_v
532
+ vq_model.flow.flows.0.enc.in_layers.1.bias
533
+ vq_model.flow.flows.0.enc.in_layers.1.weight_g
534
+ vq_model.flow.flows.0.enc.in_layers.1.weight_v
535
+ vq_model.flow.flows.0.enc.in_layers.2.bias
536
+ vq_model.flow.flows.0.enc.in_layers.2.weight_g
537
+ vq_model.flow.flows.0.enc.in_layers.2.weight_v
538
+ vq_model.flow.flows.0.enc.in_layers.3.bias
539
+ vq_model.flow.flows.0.enc.in_layers.3.weight_g
540
+ vq_model.flow.flows.0.enc.in_layers.3.weight_v
541
+ vq_model.flow.flows.0.enc.res_skip_layers.0.bias
542
+ vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g
543
+ vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v
544
+ vq_model.flow.flows.0.enc.res_skip_layers.1.bias
545
+ vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g
546
+ vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v
547
+ vq_model.flow.flows.0.enc.res_skip_layers.2.bias
548
+ vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g
549
+ vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v
550
+ vq_model.flow.flows.0.enc.res_skip_layers.3.bias
551
+ vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g
552
+ vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v
553
+ vq_model.flow.flows.0.post.bias
554
+ vq_model.flow.flows.0.post.weight
555
+ vq_model.flow.flows.0.pre.bias
556
+ vq_model.flow.flows.0.pre.weight
557
+ vq_model.flow.flows.2.enc.cond_layer.bias
558
+ vq_model.flow.flows.2.enc.cond_layer.weight_g
559
+ vq_model.flow.flows.2.enc.cond_layer.weight_v
560
+ vq_model.flow.flows.2.enc.in_layers.0.bias
561
+ vq_model.flow.flows.2.enc.in_layers.0.weight_g
562
+ vq_model.flow.flows.2.enc.in_layers.0.weight_v
563
+ vq_model.flow.flows.2.enc.in_layers.1.bias
564
+ vq_model.flow.flows.2.enc.in_layers.1.weight_g
565
+ vq_model.flow.flows.2.enc.in_layers.1.weight_v
566
+ vq_model.flow.flows.2.enc.in_layers.2.bias
567
+ vq_model.flow.flows.2.enc.in_layers.2.weight_g
568
+ vq_model.flow.flows.2.enc.in_layers.2.weight_v
569
+ vq_model.flow.flows.2.enc.in_layers.3.bias
570
+ vq_model.flow.flows.2.enc.in_layers.3.weight_g
571
+ vq_model.flow.flows.2.enc.in_layers.3.weight_v
572
+ vq_model.flow.flows.2.enc.res_skip_layers.0.bias
573
+ vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g
574
+ vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v
575
+ vq_model.flow.flows.2.enc.res_skip_layers.1.bias
576
+ vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g
577
+ vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v
578
+ vq_model.flow.flows.2.enc.res_skip_layers.2.bias
579
+ vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g
580
+ vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v
581
+ vq_model.flow.flows.2.enc.res_skip_layers.3.bias
582
+ vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g
583
+ vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v
584
+ vq_model.flow.flows.2.post.bias
585
+ vq_model.flow.flows.2.post.weight
586
+ vq_model.flow.flows.2.pre.bias
587
+ vq_model.flow.flows.2.pre.weight
588
+ vq_model.flow.flows.4.enc.cond_layer.bias
589
+ vq_model.flow.flows.4.enc.cond_layer.weight_g
590
+ vq_model.flow.flows.4.enc.cond_layer.weight_v
591
+ vq_model.flow.flows.4.enc.in_layers.0.bias
592
+ vq_model.flow.flows.4.enc.in_layers.0.weight_g
593
+ vq_model.flow.flows.4.enc.in_layers.0.weight_v
594
+ vq_model.flow.flows.4.enc.in_layers.1.bias
595
+ vq_model.flow.flows.4.enc.in_layers.1.weight_g
596
+ vq_model.flow.flows.4.enc.in_layers.1.weight_v
597
+ vq_model.flow.flows.4.enc.in_layers.2.bias
598
+ vq_model.flow.flows.4.enc.in_layers.2.weight_g
599
+ vq_model.flow.flows.4.enc.in_layers.2.weight_v
600
+ vq_model.flow.flows.4.enc.in_layers.3.bias
601
+ vq_model.flow.flows.4.enc.in_layers.3.weight_g
602
+ vq_model.flow.flows.4.enc.in_layers.3.weight_v
603
+ vq_model.flow.flows.4.enc.res_skip_layers.0.bias
604
+ vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g
605
+ vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v
606
+ vq_model.flow.flows.4.enc.res_skip_layers.1.bias
607
+ vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g
608
+ vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v
609
+ vq_model.flow.flows.4.enc.res_skip_layers.2.bias
610
+ vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g
611
+ vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v
612
+ vq_model.flow.flows.4.enc.res_skip_layers.3.bias
613
+ vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g
614
+ vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v
615
+ vq_model.flow.flows.4.post.bias
616
+ vq_model.flow.flows.4.post.weight
617
+ vq_model.flow.flows.4.pre.bias
618
+ vq_model.flow.flows.4.pre.weight
619
+ vq_model.flow.flows.6.enc.cond_layer.bias
620
+ vq_model.flow.flows.6.enc.cond_layer.weight_g
621
+ vq_model.flow.flows.6.enc.cond_layer.weight_v
622
+ vq_model.flow.flows.6.enc.in_layers.0.bias
623
+ vq_model.flow.flows.6.enc.in_layers.0.weight_g
624
+ vq_model.flow.flows.6.enc.in_layers.0.weight_v
625
+ vq_model.flow.flows.6.enc.in_layers.1.bias
626
+ vq_model.flow.flows.6.enc.in_layers.1.weight_g
627
+ vq_model.flow.flows.6.enc.in_layers.1.weight_v
628
+ vq_model.flow.flows.6.enc.in_layers.2.bias
629
+ vq_model.flow.flows.6.enc.in_layers.2.weight_g
630
+ vq_model.flow.flows.6.enc.in_layers.2.weight_v
631
+ vq_model.flow.flows.6.enc.in_layers.3.bias
632
+ vq_model.flow.flows.6.enc.in_layers.3.weight_g
633
+ vq_model.flow.flows.6.enc.in_layers.3.weight_v
634
+ vq_model.flow.flows.6.enc.res_skip_layers.0.bias
635
+ vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g
636
+ vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v
637
+ vq_model.flow.flows.6.enc.res_skip_layers.1.bias
638
+ vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g
639
+ vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v
640
+ vq_model.flow.flows.6.enc.res_skip_layers.2.bias
641
+ vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g
642
+ vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v
643
+ vq_model.flow.flows.6.enc.res_skip_layers.3.bias
644
+ vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g
645
+ vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v
646
+ vq_model.flow.flows.6.post.bias
647
+ vq_model.flow.flows.6.post.weight
648
+ vq_model.flow.flows.6.pre.bias
649
+ vq_model.flow.flows.6.pre.weight
650
+ vq_model.quantizer.vq.layers.0._codebook.embed
651
+ vq_model.ref_enc.fc.fc.bias
652
+ vq_model.ref_enc.fc.fc.weight
653
+ vq_model.ref_enc.slf_attn.fc.bias
654
+ vq_model.ref_enc.slf_attn.fc.weight
655
+ vq_model.ref_enc.slf_attn.w_ks.bias
656
+ vq_model.ref_enc.slf_attn.w_ks.weight
657
+ vq_model.ref_enc.slf_attn.w_qs.bias
658
+ vq_model.ref_enc.slf_attn.w_qs.weight
659
+ vq_model.ref_enc.slf_attn.w_vs.bias
660
+ vq_model.ref_enc.slf_attn.w_vs.weight
661
+ vq_model.ref_enc.spectral.0.fc.bias
662
+ vq_model.ref_enc.spectral.0.fc.weight
663
+ vq_model.ref_enc.spectral.3.fc.bias
664
+ vq_model.ref_enc.spectral.3.fc.weight
665
+ vq_model.ref_enc.temporal.0.conv1.conv.bias
666
+ vq_model.ref_enc.temporal.0.conv1.conv.weight
667
+ vq_model.ref_enc.temporal.1.conv1.conv.bias
668
+ vq_model.ref_enc.temporal.1.conv1.conv.weight
genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt CHANGED
@@ -1,23 +1,23 @@
1
- ref_enc.spectral.0.fc.weight
2
- ref_enc.spectral.0.fc.bias
3
- ref_enc.spectral.3.fc.weight
4
- ref_enc.spectral.3.fc.bias
5
- ref_enc.temporal.0.conv1.conv.weight
6
- ref_enc.temporal.0.conv1.conv.bias
7
- ref_enc.temporal.1.conv1.conv.weight
8
- ref_enc.temporal.1.conv1.conv.bias
9
- ref_enc.slf_attn.w_qs.weight
10
- ref_enc.slf_attn.w_qs.bias
11
- ref_enc.slf_attn.w_ks.weight
12
- ref_enc.slf_attn.w_ks.bias
13
- ref_enc.slf_attn.w_vs.weight
14
- ref_enc.slf_attn.w_vs.bias
15
- ref_enc.slf_attn.fc.weight
16
- ref_enc.slf_attn.fc.bias
17
- ref_enc.fc.fc.weight
18
- ref_enc.fc.fc.bias
19
- sv_emb.weight
20
- sv_emb.bias
21
- ge_to512.weight
22
- ge_to512.bias
23
- prelu.weight
 
1
+ ref_enc.spectral.0.fc.weight
2
+ ref_enc.spectral.0.fc.bias
3
+ ref_enc.spectral.3.fc.weight
4
+ ref_enc.spectral.3.fc.bias
5
+ ref_enc.temporal.0.conv1.conv.weight
6
+ ref_enc.temporal.0.conv1.conv.bias
7
+ ref_enc.temporal.1.conv1.conv.weight
8
+ ref_enc.temporal.1.conv1.conv.bias
9
+ ref_enc.slf_attn.w_qs.weight
10
+ ref_enc.slf_attn.w_qs.bias
11
+ ref_enc.slf_attn.w_ks.weight
12
+ ref_enc.slf_attn.w_ks.bias
13
+ ref_enc.slf_attn.w_vs.weight
14
+ ref_enc.slf_attn.w_vs.bias
15
+ ref_enc.slf_attn.fc.weight
16
+ ref_enc.slf_attn.fc.bias
17
+ ref_enc.fc.fc.weight
18
+ ref_enc.fc.fc.bias
19
+ sv_emb.weight
20
+ sv_emb.bias
21
+ ge_to512.weight
22
+ ge_to512.bias
23
+ prelu.weight
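
The manifest above enumerates, one name per line and in export order, the parameters the v2ProPlus prompt encoder converter expects to find in a checkpoint. Below is a minimal sketch of how such a key manifest could be checked against a checkpoint's state_dict before conversion; the file paths, the helper name, and the nested-"weight" handling are illustrative assumptions, not the package's actual API (the real loading logic lives in genie_tts/Converter/load_state_dict.py).

```python
# Hypothetical sketch only: compare a checkpoint's parameter names against a
# key manifest such as genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt.
# Paths, function name, and the nested-"weight" fallback are assumptions.
from pathlib import Path

import torch


def check_state_dict_against_manifest(ckpt_path: str, manifest_path: str) -> None:
    # The manifest lists one expected parameter name per line, in export order.
    expected = [
        line.strip()
        for line in Path(manifest_path).read_text().splitlines()
        if line.strip()
    ]

    state_dict = torch.load(ckpt_path, map_location="cpu")
    # Some checkpoints nest their parameters under a top-level "weight" key;
    # unwrap it if present (adjust for the checkpoint format actually in use).
    if isinstance(state_dict, dict) and "weight" in state_dict:
        state_dict = state_dict["weight"]

    expected_set = set(expected)
    missing = [k for k in expected if k not in state_dict]
    unexpected = [k for k in state_dict if k not in expected_set]
    print(f"missing: {len(missing)}, unexpected: {len(unexpected)}")


# Example (hypothetical paths):
# check_state_dict_against_manifest("prompt_encoder.pth", "prompt_encoder_weights.txt")
```

Keeping the manifest as a flat, ordered text file makes a rename or reordering of parameters show up as a plain line-level diff, as in this commit.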
genie_tts/Data/v2ProPlus/Keys/vits_weights.txt CHANGED
@@ -1,650 +1,650 @@
1
- vq_model.enc_p.ssl_proj.weight
2
- vq_model.enc_p.ssl_proj.bias
3
- vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k
4
- vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v
5
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight
6
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias
7
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight
8
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias
9
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight
10
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias
11
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight
12
- vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias
13
- vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k
14
- vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v
15
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight
16
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias
17
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight
18
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias
19
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight
20
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias
21
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight
22
- vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias
23
- vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k
24
- vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v
25
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight
26
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias
27
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight
28
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias
29
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight
30
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias
31
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight
32
- vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias
33
- vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma
34
- vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta
35
- vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma
36
- vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta
37
- vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma
38
- vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta
39
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight
40
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias
41
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight
42
- vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias
43
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight
44
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias
45
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight
46
- vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias
47
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight
48
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias
49
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight
50
- vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias
51
- vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma
52
- vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta
53
- vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma
54
- vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta
55
- vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma
56
- vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta
57
- vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k
58
- vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v
59
- vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight
60
- vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias
61
- vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight
62
- vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias
63
- vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight
64
- vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias
65
- vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight
66
- vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias
67
- vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k
68
- vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v
69
- vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight
70
- vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias
71
- vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight
72
- vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias
73
- vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight
74
- vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias
75
- vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight
76
- vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias
77
- vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k
78
- vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v
79
- vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight
80
- vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias
81
- vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight
82
- vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias
83
- vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight
84
- vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias
85
- vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight
86
- vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias
87
- vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k
88
- vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v
89
- vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight
90
- vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias
91
- vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight
92
- vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias
93
- vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight
94
- vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias
95
- vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight
96
- vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias
97
- vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k
98
- vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v
99
- vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight
100
- vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias
101
- vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight
102
- vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias
103
- vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight
104
- vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias
105
- vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight
106
- vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias
107
- vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k
108
- vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v
109
- vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight
110
- vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias
111
- vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight
112
- vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias
113
- vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight
114
- vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias
115
- vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight
116
- vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias
117
- vq_model.enc_p.encoder_text.norm_layers_1.0.gamma
118
- vq_model.enc_p.encoder_text.norm_layers_1.0.beta
119
- vq_model.enc_p.encoder_text.norm_layers_1.1.gamma
120
- vq_model.enc_p.encoder_text.norm_layers_1.1.beta
121
- vq_model.enc_p.encoder_text.norm_layers_1.2.gamma
122
- vq_model.enc_p.encoder_text.norm_layers_1.2.beta
123
- vq_model.enc_p.encoder_text.norm_layers_1.3.gamma
124
- vq_model.enc_p.encoder_text.norm_layers_1.3.beta
125
- vq_model.enc_p.encoder_text.norm_layers_1.4.gamma
126
- vq_model.enc_p.encoder_text.norm_layers_1.4.beta
127
- vq_model.enc_p.encoder_text.norm_layers_1.5.gamma
128
- vq_model.enc_p.encoder_text.norm_layers_1.5.beta
129
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight
130
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias
131
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight
132
- vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias
133
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight
134
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias
135
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight
136
- vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias
137
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight
138
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias
139
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight
140
- vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias
141
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight
142
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias
143
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight
144
- vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias
145
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight
146
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias
147
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight
148
- vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias
149
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight
150
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias
151
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight
152
- vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias
153
- vq_model.enc_p.encoder_text.norm_layers_2.0.gamma
154
- vq_model.enc_p.encoder_text.norm_layers_2.0.beta
155
- vq_model.enc_p.encoder_text.norm_layers_2.1.gamma
156
- vq_model.enc_p.encoder_text.norm_layers_2.1.beta
157
- vq_model.enc_p.encoder_text.norm_layers_2.2.gamma
158
- vq_model.enc_p.encoder_text.norm_layers_2.2.beta
159
- vq_model.enc_p.encoder_text.norm_layers_2.3.gamma
160
- vq_model.enc_p.encoder_text.norm_layers_2.3.beta
161
- vq_model.enc_p.encoder_text.norm_layers_2.4.gamma
162
- vq_model.enc_p.encoder_text.norm_layers_2.4.beta
163
- vq_model.enc_p.encoder_text.norm_layers_2.5.gamma
164
- vq_model.enc_p.encoder_text.norm_layers_2.5.beta
165
- vq_model.enc_p.text_embedding.weight
166
- vq_model.enc_p.mrte.cross_attention.conv_q.weight
167
- vq_model.enc_p.mrte.cross_attention.conv_q.bias
168
- vq_model.enc_p.mrte.cross_attention.conv_k.weight
169
- vq_model.enc_p.mrte.cross_attention.conv_k.bias
170
- vq_model.enc_p.mrte.cross_attention.conv_v.weight
171
- vq_model.enc_p.mrte.cross_attention.conv_v.bias
172
- vq_model.enc_p.mrte.cross_attention.conv_o.weight
173
- vq_model.enc_p.mrte.cross_attention.conv_o.bias
174
- vq_model.enc_p.mrte.c_pre.weight
175
- vq_model.enc_p.mrte.c_pre.bias
176
- vq_model.enc_p.mrte.text_pre.weight
177
- vq_model.enc_p.mrte.text_pre.bias
178
- vq_model.enc_p.mrte.c_post.weight
179
- vq_model.enc_p.mrte.c_post.bias
180
- vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k
181
- vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v
182
- vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight
183
- vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias
184
- vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight
185
- vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias
186
- vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight
187
- vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias
188
- vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight
189
- vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias
190
- vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k
191
- vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v
192
- vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight
193
- vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias
194
- vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight
195
- vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias
196
- vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight
197
- vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias
198
- vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight
199
- vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias
200
- vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k
201
- vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v
202
- vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight
203
- vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias
204
- vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight
205
- vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias
206
- vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight
207
- vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias
208
- vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight
209
- vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias
210
- vq_model.enc_p.encoder2.norm_layers_1.0.gamma
211
- vq_model.enc_p.encoder2.norm_layers_1.0.beta
212
- vq_model.enc_p.encoder2.norm_layers_1.1.gamma
213
- vq_model.enc_p.encoder2.norm_layers_1.1.beta
214
- vq_model.enc_p.encoder2.norm_layers_1.2.gamma
215
- vq_model.enc_p.encoder2.norm_layers_1.2.beta
216
- vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight
217
- vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias
218
- vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight
219
- vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias
220
- vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight
221
- vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias
222
- vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight
223
- vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias
224
- vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight
225
- vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias
226
- vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight
227
- vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias
228
- vq_model.enc_p.encoder2.norm_layers_2.0.gamma
229
- vq_model.enc_p.encoder2.norm_layers_2.0.beta
230
- vq_model.enc_p.encoder2.norm_layers_2.1.gamma
231
- vq_model.enc_p.encoder2.norm_layers_2.1.beta
232
- vq_model.enc_p.encoder2.norm_layers_2.2.gamma
233
- vq_model.enc_p.encoder2.norm_layers_2.2.beta
234
- vq_model.enc_p.proj.weight
235
- vq_model.enc_p.proj.bias
236
- vq_model.dec.conv_pre.weight
237
- vq_model.dec.conv_pre.bias
238
- vq_model.dec.ups.0.bias
239
- vq_model.dec.ups.0.weight_g
240
- vq_model.dec.ups.0.weight_v
241
- vq_model.dec.ups.1.bias
242
- vq_model.dec.ups.1.weight_g
243
- vq_model.dec.ups.1.weight_v
244
- vq_model.dec.ups.2.bias
245
- vq_model.dec.ups.2.weight_g
246
- vq_model.dec.ups.2.weight_v
247
- vq_model.dec.ups.3.bias
248
- vq_model.dec.ups.3.weight_g
249
- vq_model.dec.ups.3.weight_v
250
- vq_model.dec.ups.4.bias
251
- vq_model.dec.ups.4.weight_g
252
- vq_model.dec.ups.4.weight_v
253
- vq_model.dec.resblocks.0.convs1.0.bias
254
- vq_model.dec.resblocks.0.convs1.0.weight_g
255
- vq_model.dec.resblocks.0.convs1.0.weight_v
256
- vq_model.dec.resblocks.0.convs1.1.bias
257
- vq_model.dec.resblocks.0.convs1.1.weight_g
258
- vq_model.dec.resblocks.0.convs1.1.weight_v
259
- vq_model.dec.resblocks.0.convs1.2.bias
260
- vq_model.dec.resblocks.0.convs1.2.weight_g
261
- vq_model.dec.resblocks.0.convs1.2.weight_v
262
- vq_model.dec.resblocks.0.convs2.0.bias
263
- vq_model.dec.resblocks.0.convs2.0.weight_g
264
- vq_model.dec.resblocks.0.convs2.0.weight_v
265
- vq_model.dec.resblocks.0.convs2.1.bias
266
- vq_model.dec.resblocks.0.convs2.1.weight_g
267
- vq_model.dec.resblocks.0.convs2.1.weight_v
268
- vq_model.dec.resblocks.0.convs2.2.bias
269
- vq_model.dec.resblocks.0.convs2.2.weight_g
270
- vq_model.dec.resblocks.0.convs2.2.weight_v
271
- vq_model.dec.resblocks.1.convs1.0.bias
272
- vq_model.dec.resblocks.1.convs1.0.weight_g
273
- vq_model.dec.resblocks.1.convs1.0.weight_v
274
- vq_model.dec.resblocks.1.convs1.1.bias
275
- vq_model.dec.resblocks.1.convs1.1.weight_g
276
- vq_model.dec.resblocks.1.convs1.1.weight_v
277
- vq_model.dec.resblocks.1.convs1.2.bias
278
- vq_model.dec.resblocks.1.convs1.2.weight_g
279
- vq_model.dec.resblocks.1.convs1.2.weight_v
280
- vq_model.dec.resblocks.1.convs2.0.bias
281
- vq_model.dec.resblocks.1.convs2.0.weight_g
282
- vq_model.dec.resblocks.1.convs2.0.weight_v
283
- vq_model.dec.resblocks.1.convs2.1.bias
284
- vq_model.dec.resblocks.1.convs2.1.weight_g
285
- vq_model.dec.resblocks.1.convs2.1.weight_v
286
- vq_model.dec.resblocks.1.convs2.2.bias
287
- vq_model.dec.resblocks.1.convs2.2.weight_g
288
- vq_model.dec.resblocks.1.convs2.2.weight_v
289
- vq_model.dec.resblocks.2.convs1.0.bias
290
- vq_model.dec.resblocks.2.convs1.0.weight_g
291
- vq_model.dec.resblocks.2.convs1.0.weight_v
292
- vq_model.dec.resblocks.2.convs1.1.bias
293
- vq_model.dec.resblocks.2.convs1.1.weight_g
294
- vq_model.dec.resblocks.2.convs1.1.weight_v
295
- vq_model.dec.resblocks.2.convs1.2.bias
296
- vq_model.dec.resblocks.2.convs1.2.weight_g
297
- vq_model.dec.resblocks.2.convs1.2.weight_v
298
- vq_model.dec.resblocks.2.convs2.0.bias
299
- vq_model.dec.resblocks.2.convs2.0.weight_g
300
- vq_model.dec.resblocks.2.convs2.0.weight_v
301
- vq_model.dec.resblocks.2.convs2.1.bias
302
- vq_model.dec.resblocks.2.convs2.1.weight_g
303
- vq_model.dec.resblocks.2.convs2.1.weight_v
304
- vq_model.dec.resblocks.2.convs2.2.bias
305
- vq_model.dec.resblocks.2.convs2.2.weight_g
306
- vq_model.dec.resblocks.2.convs2.2.weight_v
307
- vq_model.dec.resblocks.3.convs1.0.bias
308
- vq_model.dec.resblocks.3.convs1.0.weight_g
309
- vq_model.dec.resblocks.3.convs1.0.weight_v
310
- vq_model.dec.resblocks.3.convs1.1.bias
311
- vq_model.dec.resblocks.3.convs1.1.weight_g
312
- vq_model.dec.resblocks.3.convs1.1.weight_v
313
- vq_model.dec.resblocks.3.convs1.2.bias
314
- vq_model.dec.resblocks.3.convs1.2.weight_g
315
- vq_model.dec.resblocks.3.convs1.2.weight_v
316
- vq_model.dec.resblocks.3.convs2.0.bias
317
- vq_model.dec.resblocks.3.convs2.0.weight_g
318
- vq_model.dec.resblocks.3.convs2.0.weight_v
319
- vq_model.dec.resblocks.3.convs2.1.bias
320
- vq_model.dec.resblocks.3.convs2.1.weight_g
321
- vq_model.dec.resblocks.3.convs2.1.weight_v
322
- vq_model.dec.resblocks.3.convs2.2.bias
323
- vq_model.dec.resblocks.3.convs2.2.weight_g
324
- vq_model.dec.resblocks.3.convs2.2.weight_v
325
- vq_model.dec.resblocks.4.convs1.0.bias
326
- vq_model.dec.resblocks.4.convs1.0.weight_g
327
- vq_model.dec.resblocks.4.convs1.0.weight_v
328
- vq_model.dec.resblocks.4.convs1.1.bias
329
- vq_model.dec.resblocks.4.convs1.1.weight_g
330
- vq_model.dec.resblocks.4.convs1.1.weight_v
331
- vq_model.dec.resblocks.4.convs1.2.bias
332
- vq_model.dec.resblocks.4.convs1.2.weight_g
333
- vq_model.dec.resblocks.4.convs1.2.weight_v
334
- vq_model.dec.resblocks.4.convs2.0.bias
335
- vq_model.dec.resblocks.4.convs2.0.weight_g
336
- vq_model.dec.resblocks.4.convs2.0.weight_v
337
- vq_model.dec.resblocks.4.convs2.1.bias
338
- vq_model.dec.resblocks.4.convs2.1.weight_g
339
- vq_model.dec.resblocks.4.convs2.1.weight_v
340
- vq_model.dec.resblocks.4.convs2.2.bias
341
- vq_model.dec.resblocks.4.convs2.2.weight_g
342
- vq_model.dec.resblocks.4.convs2.2.weight_v
343
- vq_model.dec.resblocks.5.convs1.0.bias
344
- vq_model.dec.resblocks.5.convs1.0.weight_g
345
- vq_model.dec.resblocks.5.convs1.0.weight_v
346
- vq_model.dec.resblocks.5.convs1.1.bias
347
- vq_model.dec.resblocks.5.convs1.1.weight_g
348
- vq_model.dec.resblocks.5.convs1.1.weight_v
349
- vq_model.dec.resblocks.5.convs1.2.bias
350
- vq_model.dec.resblocks.5.convs1.2.weight_g
351
- vq_model.dec.resblocks.5.convs1.2.weight_v
352
- vq_model.dec.resblocks.5.convs2.0.bias
353
- vq_model.dec.resblocks.5.convs2.0.weight_g
354
- vq_model.dec.resblocks.5.convs2.0.weight_v
355
- vq_model.dec.resblocks.5.convs2.1.bias
356
- vq_model.dec.resblocks.5.convs2.1.weight_g
357
- vq_model.dec.resblocks.5.convs2.1.weight_v
358
- vq_model.dec.resblocks.5.convs2.2.bias
359
- vq_model.dec.resblocks.5.convs2.2.weight_g
360
- vq_model.dec.resblocks.5.convs2.2.weight_v
361
- vq_model.dec.resblocks.6.convs1.0.bias
362
- vq_model.dec.resblocks.6.convs1.0.weight_g
363
- vq_model.dec.resblocks.6.convs1.0.weight_v
364
- vq_model.dec.resblocks.6.convs1.1.bias
365
- vq_model.dec.resblocks.6.convs1.1.weight_g
366
- vq_model.dec.resblocks.6.convs1.1.weight_v
367
- vq_model.dec.resblocks.6.convs1.2.bias
368
- vq_model.dec.resblocks.6.convs1.2.weight_g
369
- vq_model.dec.resblocks.6.convs1.2.weight_v
370
- vq_model.dec.resblocks.6.convs2.0.bias
371
- vq_model.dec.resblocks.6.convs2.0.weight_g
372
- vq_model.dec.resblocks.6.convs2.0.weight_v
373
- vq_model.dec.resblocks.6.convs2.1.bias
374
- vq_model.dec.resblocks.6.convs2.1.weight_g
375
- vq_model.dec.resblocks.6.convs2.1.weight_v
376
- vq_model.dec.resblocks.6.convs2.2.bias
377
- vq_model.dec.resblocks.6.convs2.2.weight_g
378
- vq_model.dec.resblocks.6.convs2.2.weight_v
379
- vq_model.dec.resblocks.7.convs1.0.bias
380
- vq_model.dec.resblocks.7.convs1.0.weight_g
381
- vq_model.dec.resblocks.7.convs1.0.weight_v
382
- vq_model.dec.resblocks.7.convs1.1.bias
383
- vq_model.dec.resblocks.7.convs1.1.weight_g
384
- vq_model.dec.resblocks.7.convs1.1.weight_v
385
- vq_model.dec.resblocks.7.convs1.2.bias
386
- vq_model.dec.resblocks.7.convs1.2.weight_g
387
- vq_model.dec.resblocks.7.convs1.2.weight_v
388
- vq_model.dec.resblocks.7.convs2.0.bias
389
- vq_model.dec.resblocks.7.convs2.0.weight_g
390
- vq_model.dec.resblocks.7.convs2.0.weight_v
391
- vq_model.dec.resblocks.7.convs2.1.bias
392
- vq_model.dec.resblocks.7.convs2.1.weight_g
393
- vq_model.dec.resblocks.7.convs2.1.weight_v
394
- vq_model.dec.resblocks.7.convs2.2.bias
395
- vq_model.dec.resblocks.7.convs2.2.weight_g
396
- vq_model.dec.resblocks.7.convs2.2.weight_v
397
- vq_model.dec.resblocks.8.convs1.0.bias
398
- vq_model.dec.resblocks.8.convs1.0.weight_g
399
- vq_model.dec.resblocks.8.convs1.0.weight_v
400
- vq_model.dec.resblocks.8.convs1.1.bias
401
- vq_model.dec.resblocks.8.convs1.1.weight_g
402
- vq_model.dec.resblocks.8.convs1.1.weight_v
403
- vq_model.dec.resblocks.8.convs1.2.bias
404
- vq_model.dec.resblocks.8.convs1.2.weight_g
405
- vq_model.dec.resblocks.8.convs1.2.weight_v
406
- vq_model.dec.resblocks.8.convs2.0.bias
407
- vq_model.dec.resblocks.8.convs2.0.weight_g
408
- vq_model.dec.resblocks.8.convs2.0.weight_v
409
- vq_model.dec.resblocks.8.convs2.1.bias
410
- vq_model.dec.resblocks.8.convs2.1.weight_g
411
- vq_model.dec.resblocks.8.convs2.1.weight_v
412
- vq_model.dec.resblocks.8.convs2.2.bias
413
- vq_model.dec.resblocks.8.convs2.2.weight_g
414
- vq_model.dec.resblocks.8.convs2.2.weight_v
415
- vq_model.dec.resblocks.9.convs1.0.bias
416
- vq_model.dec.resblocks.9.convs1.0.weight_g
417
- vq_model.dec.resblocks.9.convs1.0.weight_v
418
- vq_model.dec.resblocks.9.convs1.1.bias
419
- vq_model.dec.resblocks.9.convs1.1.weight_g
420
- vq_model.dec.resblocks.9.convs1.1.weight_v
421
- vq_model.dec.resblocks.9.convs1.2.bias
422
- vq_model.dec.resblocks.9.convs1.2.weight_g
423
- vq_model.dec.resblocks.9.convs1.2.weight_v
424
- vq_model.dec.resblocks.9.convs2.0.bias
425
- vq_model.dec.resblocks.9.convs2.0.weight_g
426
- vq_model.dec.resblocks.9.convs2.0.weight_v
427
- vq_model.dec.resblocks.9.convs2.1.bias
428
- vq_model.dec.resblocks.9.convs2.1.weight_g
429
- vq_model.dec.resblocks.9.convs2.1.weight_v
430
- vq_model.dec.resblocks.9.convs2.2.bias
431
- vq_model.dec.resblocks.9.convs2.2.weight_g
432
- vq_model.dec.resblocks.9.convs2.2.weight_v
433
- vq_model.dec.resblocks.10.convs1.0.bias
434
- vq_model.dec.resblocks.10.convs1.0.weight_g
435
- vq_model.dec.resblocks.10.convs1.0.weight_v
436
- vq_model.dec.resblocks.10.convs1.1.bias
437
- vq_model.dec.resblocks.10.convs1.1.weight_g
438
- vq_model.dec.resblocks.10.convs1.1.weight_v
439
- vq_model.dec.resblocks.10.convs1.2.bias
440
- vq_model.dec.resblocks.10.convs1.2.weight_g
441
- vq_model.dec.resblocks.10.convs1.2.weight_v
442
- vq_model.dec.resblocks.10.convs2.0.bias
443
- vq_model.dec.resblocks.10.convs2.0.weight_g
444
- vq_model.dec.resblocks.10.convs2.0.weight_v
445
- vq_model.dec.resblocks.10.convs2.1.bias
446
- vq_model.dec.resblocks.10.convs2.1.weight_g
447
- vq_model.dec.resblocks.10.convs2.1.weight_v
448
- vq_model.dec.resblocks.10.convs2.2.bias
449
- vq_model.dec.resblocks.10.convs2.2.weight_g
450
- vq_model.dec.resblocks.10.convs2.2.weight_v
451
- vq_model.dec.resblocks.11.convs1.0.bias
452
- vq_model.dec.resblocks.11.convs1.0.weight_g
453
- vq_model.dec.resblocks.11.convs1.0.weight_v
454
- vq_model.dec.resblocks.11.convs1.1.bias
455
- vq_model.dec.resblocks.11.convs1.1.weight_g
456
- vq_model.dec.resblocks.11.convs1.1.weight_v
457
- vq_model.dec.resblocks.11.convs1.2.bias
458
- vq_model.dec.resblocks.11.convs1.2.weight_g
459
- vq_model.dec.resblocks.11.convs1.2.weight_v
460
- vq_model.dec.resblocks.11.convs2.0.bias
461
- vq_model.dec.resblocks.11.convs2.0.weight_g
462
- vq_model.dec.resblocks.11.convs2.0.weight_v
463
- vq_model.dec.resblocks.11.convs2.1.bias
464
- vq_model.dec.resblocks.11.convs2.1.weight_g
465
- vq_model.dec.resblocks.11.convs2.1.weight_v
466
- vq_model.dec.resblocks.11.convs2.2.bias
467
- vq_model.dec.resblocks.11.convs2.2.weight_g
468
- vq_model.dec.resblocks.11.convs2.2.weight_v
469
- vq_model.dec.resblocks.12.convs1.0.bias
470
- vq_model.dec.resblocks.12.convs1.0.weight_g
471
- vq_model.dec.resblocks.12.convs1.0.weight_v
472
- vq_model.dec.resblocks.12.convs1.1.bias
473
- vq_model.dec.resblocks.12.convs1.1.weight_g
474
- vq_model.dec.resblocks.12.convs1.1.weight_v
475
- vq_model.dec.resblocks.12.convs1.2.bias
476
- vq_model.dec.resblocks.12.convs1.2.weight_g
477
- vq_model.dec.resblocks.12.convs1.2.weight_v
478
- vq_model.dec.resblocks.12.convs2.0.bias
479
- vq_model.dec.resblocks.12.convs2.0.weight_g
480
- vq_model.dec.resblocks.12.convs2.0.weight_v
481
- vq_model.dec.resblocks.12.convs2.1.bias
482
- vq_model.dec.resblocks.12.convs2.1.weight_g
483
- vq_model.dec.resblocks.12.convs2.1.weight_v
484
- vq_model.dec.resblocks.12.convs2.2.bias
485
- vq_model.dec.resblocks.12.convs2.2.weight_g
486
- vq_model.dec.resblocks.12.convs2.2.weight_v
487
- vq_model.dec.resblocks.13.convs1.0.bias
488
- vq_model.dec.resblocks.13.convs1.0.weight_g
489
- vq_model.dec.resblocks.13.convs1.0.weight_v
490
- vq_model.dec.resblocks.13.convs1.1.bias
491
- vq_model.dec.resblocks.13.convs1.1.weight_g
492
- vq_model.dec.resblocks.13.convs1.1.weight_v
493
- vq_model.dec.resblocks.13.convs1.2.bias
494
- vq_model.dec.resblocks.13.convs1.2.weight_g
495
- vq_model.dec.resblocks.13.convs1.2.weight_v
496
- vq_model.dec.resblocks.13.convs2.0.bias
497
- vq_model.dec.resblocks.13.convs2.0.weight_g
498
- vq_model.dec.resblocks.13.convs2.0.weight_v
499
- vq_model.dec.resblocks.13.convs2.1.bias
500
- vq_model.dec.resblocks.13.convs2.1.weight_g
501
- vq_model.dec.resblocks.13.convs2.1.weight_v
502
- vq_model.dec.resblocks.13.convs2.2.bias
503
- vq_model.dec.resblocks.13.convs2.2.weight_g
504
- vq_model.dec.resblocks.13.convs2.2.weight_v
505
- vq_model.dec.resblocks.14.convs1.0.bias
506
- vq_model.dec.resblocks.14.convs1.0.weight_g
507
- vq_model.dec.resblocks.14.convs1.0.weight_v
508
- vq_model.dec.resblocks.14.convs1.1.bias
509
- vq_model.dec.resblocks.14.convs1.1.weight_g
510
- vq_model.dec.resblocks.14.convs1.1.weight_v
511
- vq_model.dec.resblocks.14.convs1.2.bias
512
- vq_model.dec.resblocks.14.convs1.2.weight_g
513
- vq_model.dec.resblocks.14.convs1.2.weight_v
514
- vq_model.dec.resblocks.14.convs2.0.bias
515
- vq_model.dec.resblocks.14.convs2.0.weight_g
516
- vq_model.dec.resblocks.14.convs2.0.weight_v
517
- vq_model.dec.resblocks.14.convs2.1.bias
518
- vq_model.dec.resblocks.14.convs2.1.weight_g
519
- vq_model.dec.resblocks.14.convs2.1.weight_v
520
- vq_model.dec.resblocks.14.convs2.2.bias
521
- vq_model.dec.resblocks.14.convs2.2.weight_g
522
- vq_model.dec.resblocks.14.convs2.2.weight_v
523
- vq_model.dec.conv_post.weight
524
- vq_model.dec.cond.weight
525
- vq_model.dec.cond.bias
526
- vq_model.flow.flows.0.pre.weight
527
- vq_model.flow.flows.0.pre.bias
528
- vq_model.flow.flows.0.enc.in_layers.0.bias
529
- vq_model.flow.flows.0.enc.in_layers.0.weight_g
530
- vq_model.flow.flows.0.enc.in_layers.0.weight_v
531
- vq_model.flow.flows.0.enc.in_layers.1.bias
532
- vq_model.flow.flows.0.enc.in_layers.1.weight_g
533
- vq_model.flow.flows.0.enc.in_layers.1.weight_v
534
- vq_model.flow.flows.0.enc.in_layers.2.bias
535
- vq_model.flow.flows.0.enc.in_layers.2.weight_g
536
- vq_model.flow.flows.0.enc.in_layers.2.weight_v
537
- vq_model.flow.flows.0.enc.in_layers.3.bias
538
- vq_model.flow.flows.0.enc.in_layers.3.weight_g
539
- vq_model.flow.flows.0.enc.in_layers.3.weight_v
540
- vq_model.flow.flows.0.enc.res_skip_layers.0.bias
541
- vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g
542
- vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v
543
- vq_model.flow.flows.0.enc.res_skip_layers.1.bias
544
- vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g
545
- vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v
546
- vq_model.flow.flows.0.enc.res_skip_layers.2.bias
547
- vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g
548
- vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v
549
- vq_model.flow.flows.0.enc.res_skip_layers.3.bias
550
- vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g
551
- vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v
552
- vq_model.flow.flows.0.enc.cond_layer.bias
553
- vq_model.flow.flows.0.enc.cond_layer.weight_g
554
- vq_model.flow.flows.0.enc.cond_layer.weight_v
555
- vq_model.flow.flows.0.post.weight
556
- vq_model.flow.flows.0.post.bias
557
- vq_model.flow.flows.2.pre.weight
558
- vq_model.flow.flows.2.pre.bias
559
- vq_model.flow.flows.2.enc.in_layers.0.bias
560
- vq_model.flow.flows.2.enc.in_layers.0.weight_g
561
- vq_model.flow.flows.2.enc.in_layers.0.weight_v
562
- vq_model.flow.flows.2.enc.in_layers.1.bias
563
- vq_model.flow.flows.2.enc.in_layers.1.weight_g
564
- vq_model.flow.flows.2.enc.in_layers.1.weight_v
565
- vq_model.flow.flows.2.enc.in_layers.2.bias
566
- vq_model.flow.flows.2.enc.in_layers.2.weight_g
567
- vq_model.flow.flows.2.enc.in_layers.2.weight_v
568
- vq_model.flow.flows.2.enc.in_layers.3.bias
569
- vq_model.flow.flows.2.enc.in_layers.3.weight_g
570
- vq_model.flow.flows.2.enc.in_layers.3.weight_v
571
- vq_model.flow.flows.2.enc.res_skip_layers.0.bias
572
- vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g
573
- vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v
574
- vq_model.flow.flows.2.enc.res_skip_layers.1.bias
575
- vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g
576
- vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v
577
- vq_model.flow.flows.2.enc.res_skip_layers.2.bias
578
- vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g
579
- vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v
580
- vq_model.flow.flows.2.enc.res_skip_layers.3.bias
581
- vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g
582
- vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v
583
- vq_model.flow.flows.2.enc.cond_layer.bias
584
- vq_model.flow.flows.2.enc.cond_layer.weight_g
585
- vq_model.flow.flows.2.enc.cond_layer.weight_v
586
- vq_model.flow.flows.2.post.weight
587
- vq_model.flow.flows.2.post.bias
588
- vq_model.flow.flows.4.pre.weight
589
- vq_model.flow.flows.4.pre.bias
590
- vq_model.flow.flows.4.enc.in_layers.0.bias
591
- vq_model.flow.flows.4.enc.in_layers.0.weight_g
592
- vq_model.flow.flows.4.enc.in_layers.0.weight_v
593
- vq_model.flow.flows.4.enc.in_layers.1.bias
594
- vq_model.flow.flows.4.enc.in_layers.1.weight_g
595
- vq_model.flow.flows.4.enc.in_layers.1.weight_v
596
- vq_model.flow.flows.4.enc.in_layers.2.bias
597
- vq_model.flow.flows.4.enc.in_layers.2.weight_g
598
- vq_model.flow.flows.4.enc.in_layers.2.weight_v
599
- vq_model.flow.flows.4.enc.in_layers.3.bias
600
- vq_model.flow.flows.4.enc.in_layers.3.weight_g
601
- vq_model.flow.flows.4.enc.in_layers.3.weight_v
602
- vq_model.flow.flows.4.enc.res_skip_layers.0.bias
603
- vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g
604
- vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v
605
- vq_model.flow.flows.4.enc.res_skip_layers.1.bias
606
- vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g
607
- vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v
608
- vq_model.flow.flows.4.enc.res_skip_layers.2.bias
609
- vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g
610
- vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v
611
- vq_model.flow.flows.4.enc.res_skip_layers.3.bias
612
- vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g
613
- vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v
614
- vq_model.flow.flows.4.enc.cond_layer.bias
615
- vq_model.flow.flows.4.enc.cond_layer.weight_g
616
- vq_model.flow.flows.4.enc.cond_layer.weight_v
617
- vq_model.flow.flows.4.post.weight
618
- vq_model.flow.flows.4.post.bias
619
- vq_model.flow.flows.6.pre.weight
620
- vq_model.flow.flows.6.pre.bias
621
- vq_model.flow.flows.6.enc.in_layers.0.bias
622
- vq_model.flow.flows.6.enc.in_layers.0.weight_g
623
- vq_model.flow.flows.6.enc.in_layers.0.weight_v
624
- vq_model.flow.flows.6.enc.in_layers.1.bias
625
- vq_model.flow.flows.6.enc.in_layers.1.weight_g
626
- vq_model.flow.flows.6.enc.in_layers.1.weight_v
627
- vq_model.flow.flows.6.enc.in_layers.2.bias
628
- vq_model.flow.flows.6.enc.in_layers.2.weight_g
629
- vq_model.flow.flows.6.enc.in_layers.2.weight_v
630
- vq_model.flow.flows.6.enc.in_layers.3.bias
631
- vq_model.flow.flows.6.enc.in_layers.3.weight_g
632
- vq_model.flow.flows.6.enc.in_layers.3.weight_v
633
- vq_model.flow.flows.6.enc.res_skip_layers.0.bias
634
- vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g
635
- vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v
636
- vq_model.flow.flows.6.enc.res_skip_layers.1.bias
637
- vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g
638
- vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v
639
- vq_model.flow.flows.6.enc.res_skip_layers.2.bias
640
- vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g
641
- vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v
642
- vq_model.flow.flows.6.enc.res_skip_layers.3.bias
643
- vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g
644
- vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v
645
- vq_model.flow.flows.6.enc.cond_layer.bias
646
- vq_model.flow.flows.6.enc.cond_layer.weight_g
647
- vq_model.flow.flows.6.enc.cond_layer.weight_v
648
- vq_model.flow.flows.6.post.weight
649
- vq_model.flow.flows.6.post.bias
650
- vq_model.quantizer.vq.layers.0._codebook.embed
 
1
+ vq_model.enc_p.ssl_proj.weight
2
+ vq_model.enc_p.ssl_proj.bias
3
+ vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k
4
+ vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v
5
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight
6
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias
7
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight
8
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias
9
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight
10
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias
11
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight
12
+ vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias
13
+ vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k
14
+ vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v
15
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight
16
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias
17
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight
18
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias
19
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight
20
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias
21
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight
22
+ vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias
23
+ vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k
24
+ vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v
25
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight
26
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias
27
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight
28
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias
29
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight
30
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias
31
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight
32
+ vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias
33
+ vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma
34
+ vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta
35
+ vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma
36
+ vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta
37
+ vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma
38
+ vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta
39
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight
40
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias
41
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight
42
+ vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias
43
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight
44
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias
45
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight
46
+ vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias
47
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight
48
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias
49
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight
50
+ vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias
51
+ vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma
52
+ vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta
53
+ vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma
54
+ vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta
55
+ vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma
56
+ vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta
57
+ vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k
58
+ vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v
59
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight
60
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias
61
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight
62
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias
63
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight
64
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias
65
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight
66
+ vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias
67
+ vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k
68
+ vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v
69
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight
70
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias
71
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight
72
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias
73
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight
74
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias
75
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight
76
+ vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias
77
+ vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k
78
+ vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v
79
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight
80
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias
81
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight
82
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias
83
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight
84
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias
85
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight
86
+ vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias
87
+ vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k
88
+ vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v
89
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight
90
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias
91
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight
92
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias
93
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight
94
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias
95
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight
96
+ vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias
97
+ vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k
98
+ vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v
99
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight
100
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias
101
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight
102
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias
103
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight
104
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias
105
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight
106
+ vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias
107
+ vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k
108
+ vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v
109
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight
110
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias
111
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight
112
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias
113
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight
114
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias
115
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight
116
+ vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias
117
+ vq_model.enc_p.encoder_text.norm_layers_1.0.gamma
118
+ vq_model.enc_p.encoder_text.norm_layers_1.0.beta
119
+ vq_model.enc_p.encoder_text.norm_layers_1.1.gamma
120
+ vq_model.enc_p.encoder_text.norm_layers_1.1.beta
121
+ vq_model.enc_p.encoder_text.norm_layers_1.2.gamma
122
+ vq_model.enc_p.encoder_text.norm_layers_1.2.beta
123
+ vq_model.enc_p.encoder_text.norm_layers_1.3.gamma
124
+ vq_model.enc_p.encoder_text.norm_layers_1.3.beta
125
+ vq_model.enc_p.encoder_text.norm_layers_1.4.gamma
126
+ vq_model.enc_p.encoder_text.norm_layers_1.4.beta
127
+ vq_model.enc_p.encoder_text.norm_layers_1.5.gamma
128
+ vq_model.enc_p.encoder_text.norm_layers_1.5.beta
129
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight
130
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias
131
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight
132
+ vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias
133
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight
134
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias
135
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight
136
+ vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias
137
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight
138
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias
139
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight
140
+ vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias
141
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight
142
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias
143
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight
144
+ vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias
145
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight
146
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias
147
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight
148
+ vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias
149
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight
150
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias
151
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight
152
+ vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias
153
+ vq_model.enc_p.encoder_text.norm_layers_2.0.gamma
154
+ vq_model.enc_p.encoder_text.norm_layers_2.0.beta
155
+ vq_model.enc_p.encoder_text.norm_layers_2.1.gamma
156
+ vq_model.enc_p.encoder_text.norm_layers_2.1.beta
157
+ vq_model.enc_p.encoder_text.norm_layers_2.2.gamma
158
+ vq_model.enc_p.encoder_text.norm_layers_2.2.beta
159
+ vq_model.enc_p.encoder_text.norm_layers_2.3.gamma
160
+ vq_model.enc_p.encoder_text.norm_layers_2.3.beta
161
+ vq_model.enc_p.encoder_text.norm_layers_2.4.gamma
162
+ vq_model.enc_p.encoder_text.norm_layers_2.4.beta
163
+ vq_model.enc_p.encoder_text.norm_layers_2.5.gamma
164
+ vq_model.enc_p.encoder_text.norm_layers_2.5.beta
165
+ vq_model.enc_p.text_embedding.weight
166
+ vq_model.enc_p.mrte.cross_attention.conv_q.weight
167
+ vq_model.enc_p.mrte.cross_attention.conv_q.bias
168
+ vq_model.enc_p.mrte.cross_attention.conv_k.weight
169
+ vq_model.enc_p.mrte.cross_attention.conv_k.bias
170
+ vq_model.enc_p.mrte.cross_attention.conv_v.weight
171
+ vq_model.enc_p.mrte.cross_attention.conv_v.bias
172
+ vq_model.enc_p.mrte.cross_attention.conv_o.weight
173
+ vq_model.enc_p.mrte.cross_attention.conv_o.bias
174
+ vq_model.enc_p.mrte.c_pre.weight
175
+ vq_model.enc_p.mrte.c_pre.bias
176
+ vq_model.enc_p.mrte.text_pre.weight
177
+ vq_model.enc_p.mrte.text_pre.bias
178
+ vq_model.enc_p.mrte.c_post.weight
179
+ vq_model.enc_p.mrte.c_post.bias
180
+ vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k
181
+ vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v
182
+ vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight
183
+ vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias
184
+ vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight
185
+ vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias
186
+ vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight
187
+ vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias
188
+ vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight
189
+ vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias
190
+ vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k
191
+ vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v
192
+ vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight
193
+ vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias
194
+ vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight
195
+ vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias
196
+ vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight
197
+ vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias
198
+ vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight
199
+ vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias
200
+ vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k
201
+ vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v
202
+ vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight
203
+ vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias
204
+ vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight
205
+ vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias
206
+ vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight
207
+ vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias
208
+ vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight
209
+ vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias
210
+ vq_model.enc_p.encoder2.norm_layers_1.0.gamma
211
+ vq_model.enc_p.encoder2.norm_layers_1.0.beta
212
+ vq_model.enc_p.encoder2.norm_layers_1.1.gamma
213
+ vq_model.enc_p.encoder2.norm_layers_1.1.beta
214
+ vq_model.enc_p.encoder2.norm_layers_1.2.gamma
215
+ vq_model.enc_p.encoder2.norm_layers_1.2.beta
216
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight
217
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias
218
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight
219
+ vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias
220
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight
221
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias
222
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight
223
+ vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias
224
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight
225
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias
226
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight
227
+ vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias
228
+ vq_model.enc_p.encoder2.norm_layers_2.0.gamma
229
+ vq_model.enc_p.encoder2.norm_layers_2.0.beta
230
+ vq_model.enc_p.encoder2.norm_layers_2.1.gamma
231
+ vq_model.enc_p.encoder2.norm_layers_2.1.beta
232
+ vq_model.enc_p.encoder2.norm_layers_2.2.gamma
233
+ vq_model.enc_p.encoder2.norm_layers_2.2.beta
234
+ vq_model.enc_p.proj.weight
235
+ vq_model.enc_p.proj.bias
236
+ vq_model.dec.conv_pre.weight
237
+ vq_model.dec.conv_pre.bias
238
+ vq_model.dec.ups.0.bias
239
+ vq_model.dec.ups.0.weight_g
240
+ vq_model.dec.ups.0.weight_v
241
+ vq_model.dec.ups.1.bias
242
+ vq_model.dec.ups.1.weight_g
243
+ vq_model.dec.ups.1.weight_v
244
+ vq_model.dec.ups.2.bias
245
+ vq_model.dec.ups.2.weight_g
246
+ vq_model.dec.ups.2.weight_v
247
+ vq_model.dec.ups.3.bias
248
+ vq_model.dec.ups.3.weight_g
249
+ vq_model.dec.ups.3.weight_v
250
+ vq_model.dec.ups.4.bias
251
+ vq_model.dec.ups.4.weight_g
252
+ vq_model.dec.ups.4.weight_v
253
+ vq_model.dec.resblocks.0.convs1.0.bias
254
+ vq_model.dec.resblocks.0.convs1.0.weight_g
255
+ vq_model.dec.resblocks.0.convs1.0.weight_v
256
+ vq_model.dec.resblocks.0.convs1.1.bias
257
+ vq_model.dec.resblocks.0.convs1.1.weight_g
258
+ vq_model.dec.resblocks.0.convs1.1.weight_v
259
+ vq_model.dec.resblocks.0.convs1.2.bias
260
+ vq_model.dec.resblocks.0.convs1.2.weight_g
261
+ vq_model.dec.resblocks.0.convs1.2.weight_v
262
+ vq_model.dec.resblocks.0.convs2.0.bias
263
+ vq_model.dec.resblocks.0.convs2.0.weight_g
264
+ vq_model.dec.resblocks.0.convs2.0.weight_v
265
+ vq_model.dec.resblocks.0.convs2.1.bias
266
+ vq_model.dec.resblocks.0.convs2.1.weight_g
267
+ vq_model.dec.resblocks.0.convs2.1.weight_v
268
+ vq_model.dec.resblocks.0.convs2.2.bias
269
+ vq_model.dec.resblocks.0.convs2.2.weight_g
270
+ vq_model.dec.resblocks.0.convs2.2.weight_v
271
+ vq_model.dec.resblocks.1.convs1.0.bias
272
+ vq_model.dec.resblocks.1.convs1.0.weight_g
273
+ vq_model.dec.resblocks.1.convs1.0.weight_v
274
+ vq_model.dec.resblocks.1.convs1.1.bias
275
+ vq_model.dec.resblocks.1.convs1.1.weight_g
276
+ vq_model.dec.resblocks.1.convs1.1.weight_v
277
+ vq_model.dec.resblocks.1.convs1.2.bias
278
+ vq_model.dec.resblocks.1.convs1.2.weight_g
279
+ vq_model.dec.resblocks.1.convs1.2.weight_v
280
+ vq_model.dec.resblocks.1.convs2.0.bias
281
+ vq_model.dec.resblocks.1.convs2.0.weight_g
282
+ vq_model.dec.resblocks.1.convs2.0.weight_v
283
+ vq_model.dec.resblocks.1.convs2.1.bias
284
+ vq_model.dec.resblocks.1.convs2.1.weight_g
285
+ vq_model.dec.resblocks.1.convs2.1.weight_v
286
+ vq_model.dec.resblocks.1.convs2.2.bias
287
+ vq_model.dec.resblocks.1.convs2.2.weight_g
288
+ vq_model.dec.resblocks.1.convs2.2.weight_v
289
+ vq_model.dec.resblocks.2.convs1.0.bias
290
+ vq_model.dec.resblocks.2.convs1.0.weight_g
291
+ vq_model.dec.resblocks.2.convs1.0.weight_v
292
+ vq_model.dec.resblocks.2.convs1.1.bias
293
+ vq_model.dec.resblocks.2.convs1.1.weight_g
294
+ vq_model.dec.resblocks.2.convs1.1.weight_v
295
+ vq_model.dec.resblocks.2.convs1.2.bias
296
+ vq_model.dec.resblocks.2.convs1.2.weight_g
297
+ vq_model.dec.resblocks.2.convs1.2.weight_v
298
+ vq_model.dec.resblocks.2.convs2.0.bias
299
+ vq_model.dec.resblocks.2.convs2.0.weight_g
300
+ vq_model.dec.resblocks.2.convs2.0.weight_v
301
+ vq_model.dec.resblocks.2.convs2.1.bias
302
+ vq_model.dec.resblocks.2.convs2.1.weight_g
303
+ vq_model.dec.resblocks.2.convs2.1.weight_v
304
+ vq_model.dec.resblocks.2.convs2.2.bias
305
+ vq_model.dec.resblocks.2.convs2.2.weight_g
306
+ vq_model.dec.resblocks.2.convs2.2.weight_v
307
+ vq_model.dec.resblocks.3.convs1.0.bias
308
+ vq_model.dec.resblocks.3.convs1.0.weight_g
309
+ vq_model.dec.resblocks.3.convs1.0.weight_v
310
+ vq_model.dec.resblocks.3.convs1.1.bias
311
+ vq_model.dec.resblocks.3.convs1.1.weight_g
312
+ vq_model.dec.resblocks.3.convs1.1.weight_v
313
+ vq_model.dec.resblocks.3.convs1.2.bias
314
+ vq_model.dec.resblocks.3.convs1.2.weight_g
315
+ vq_model.dec.resblocks.3.convs1.2.weight_v
316
+ vq_model.dec.resblocks.3.convs2.0.bias
317
+ vq_model.dec.resblocks.3.convs2.0.weight_g
318
+ vq_model.dec.resblocks.3.convs2.0.weight_v
319
+ vq_model.dec.resblocks.3.convs2.1.bias
320
+ vq_model.dec.resblocks.3.convs2.1.weight_g
321
+ vq_model.dec.resblocks.3.convs2.1.weight_v
322
+ vq_model.dec.resblocks.3.convs2.2.bias
323
+ vq_model.dec.resblocks.3.convs2.2.weight_g
324
+ vq_model.dec.resblocks.3.convs2.2.weight_v
325
+ vq_model.dec.resblocks.4.convs1.0.bias
326
+ vq_model.dec.resblocks.4.convs1.0.weight_g
327
+ vq_model.dec.resblocks.4.convs1.0.weight_v
328
+ vq_model.dec.resblocks.4.convs1.1.bias
329
+ vq_model.dec.resblocks.4.convs1.1.weight_g
330
+ vq_model.dec.resblocks.4.convs1.1.weight_v
331
+ vq_model.dec.resblocks.4.convs1.2.bias
332
+ vq_model.dec.resblocks.4.convs1.2.weight_g
333
+ vq_model.dec.resblocks.4.convs1.2.weight_v
334
+ vq_model.dec.resblocks.4.convs2.0.bias
335
+ vq_model.dec.resblocks.4.convs2.0.weight_g
336
+ vq_model.dec.resblocks.4.convs2.0.weight_v
337
+ vq_model.dec.resblocks.4.convs2.1.bias
338
+ vq_model.dec.resblocks.4.convs2.1.weight_g
339
+ vq_model.dec.resblocks.4.convs2.1.weight_v
340
+ vq_model.dec.resblocks.4.convs2.2.bias
341
+ vq_model.dec.resblocks.4.convs2.2.weight_g
342
+ vq_model.dec.resblocks.4.convs2.2.weight_v
343
+ vq_model.dec.resblocks.5.convs1.0.bias
344
+ vq_model.dec.resblocks.5.convs1.0.weight_g
345
+ vq_model.dec.resblocks.5.convs1.0.weight_v
346
+ vq_model.dec.resblocks.5.convs1.1.bias
347
+ vq_model.dec.resblocks.5.convs1.1.weight_g
348
+ vq_model.dec.resblocks.5.convs1.1.weight_v
349
+ vq_model.dec.resblocks.5.convs1.2.bias
350
+ vq_model.dec.resblocks.5.convs1.2.weight_g
351
+ vq_model.dec.resblocks.5.convs1.2.weight_v
352
+ vq_model.dec.resblocks.5.convs2.0.bias
353
+ vq_model.dec.resblocks.5.convs2.0.weight_g
354
+ vq_model.dec.resblocks.5.convs2.0.weight_v
355
+ vq_model.dec.resblocks.5.convs2.1.bias
356
+ vq_model.dec.resblocks.5.convs2.1.weight_g
357
+ vq_model.dec.resblocks.5.convs2.1.weight_v
358
+ vq_model.dec.resblocks.5.convs2.2.bias
359
+ vq_model.dec.resblocks.5.convs2.2.weight_g
360
+ vq_model.dec.resblocks.5.convs2.2.weight_v
361
+ vq_model.dec.resblocks.6.convs1.0.bias
362
+ vq_model.dec.resblocks.6.convs1.0.weight_g
363
+ vq_model.dec.resblocks.6.convs1.0.weight_v
364
+ vq_model.dec.resblocks.6.convs1.1.bias
365
+ vq_model.dec.resblocks.6.convs1.1.weight_g
366
+ vq_model.dec.resblocks.6.convs1.1.weight_v
367
+ vq_model.dec.resblocks.6.convs1.2.bias
368
+ vq_model.dec.resblocks.6.convs1.2.weight_g
369
+ vq_model.dec.resblocks.6.convs1.2.weight_v
370
+ vq_model.dec.resblocks.6.convs2.0.bias
371
+ vq_model.dec.resblocks.6.convs2.0.weight_g
372
+ vq_model.dec.resblocks.6.convs2.0.weight_v
373
+ vq_model.dec.resblocks.6.convs2.1.bias
374
+ vq_model.dec.resblocks.6.convs2.1.weight_g
375
+ vq_model.dec.resblocks.6.convs2.1.weight_v
376
+ vq_model.dec.resblocks.6.convs2.2.bias
377
+ vq_model.dec.resblocks.6.convs2.2.weight_g
378
+ vq_model.dec.resblocks.6.convs2.2.weight_v
379
+ vq_model.dec.resblocks.7.convs1.0.bias
380
+ vq_model.dec.resblocks.7.convs1.0.weight_g
381
+ vq_model.dec.resblocks.7.convs1.0.weight_v
382
+ vq_model.dec.resblocks.7.convs1.1.bias
383
+ vq_model.dec.resblocks.7.convs1.1.weight_g
384
+ vq_model.dec.resblocks.7.convs1.1.weight_v
385
+ vq_model.dec.resblocks.7.convs1.2.bias
386
+ vq_model.dec.resblocks.7.convs1.2.weight_g
387
+ vq_model.dec.resblocks.7.convs1.2.weight_v
388
+ vq_model.dec.resblocks.7.convs2.0.bias
389
+ vq_model.dec.resblocks.7.convs2.0.weight_g
390
+ vq_model.dec.resblocks.7.convs2.0.weight_v
391
+ vq_model.dec.resblocks.7.convs2.1.bias
392
+ vq_model.dec.resblocks.7.convs2.1.weight_g
393
+ vq_model.dec.resblocks.7.convs2.1.weight_v
394
+ vq_model.dec.resblocks.7.convs2.2.bias
395
+ vq_model.dec.resblocks.7.convs2.2.weight_g
396
+ vq_model.dec.resblocks.7.convs2.2.weight_v
397
+ vq_model.dec.resblocks.8.convs1.0.bias
398
+ vq_model.dec.resblocks.8.convs1.0.weight_g
399
+ vq_model.dec.resblocks.8.convs1.0.weight_v
400
+ vq_model.dec.resblocks.8.convs1.1.bias
401
+ vq_model.dec.resblocks.8.convs1.1.weight_g
402
+ vq_model.dec.resblocks.8.convs1.1.weight_v
403
+ vq_model.dec.resblocks.8.convs1.2.bias
404
+ vq_model.dec.resblocks.8.convs1.2.weight_g
405
+ vq_model.dec.resblocks.8.convs1.2.weight_v
406
+ vq_model.dec.resblocks.8.convs2.0.bias
407
+ vq_model.dec.resblocks.8.convs2.0.weight_g
408
+ vq_model.dec.resblocks.8.convs2.0.weight_v
409
+ vq_model.dec.resblocks.8.convs2.1.bias
410
+ vq_model.dec.resblocks.8.convs2.1.weight_g
411
+ vq_model.dec.resblocks.8.convs2.1.weight_v
412
+ vq_model.dec.resblocks.8.convs2.2.bias
413
+ vq_model.dec.resblocks.8.convs2.2.weight_g
414
+ vq_model.dec.resblocks.8.convs2.2.weight_v
415
+ vq_model.dec.resblocks.9.convs1.0.bias
416
+ vq_model.dec.resblocks.9.convs1.0.weight_g
417
+ vq_model.dec.resblocks.9.convs1.0.weight_v
418
+ vq_model.dec.resblocks.9.convs1.1.bias
419
+ vq_model.dec.resblocks.9.convs1.1.weight_g
420
+ vq_model.dec.resblocks.9.convs1.1.weight_v
421
+ vq_model.dec.resblocks.9.convs1.2.bias
422
+ vq_model.dec.resblocks.9.convs1.2.weight_g
423
+ vq_model.dec.resblocks.9.convs1.2.weight_v
424
+ vq_model.dec.resblocks.9.convs2.0.bias
425
+ vq_model.dec.resblocks.9.convs2.0.weight_g
426
+ vq_model.dec.resblocks.9.convs2.0.weight_v
427
+ vq_model.dec.resblocks.9.convs2.1.bias
428
+ vq_model.dec.resblocks.9.convs2.1.weight_g
429
+ vq_model.dec.resblocks.9.convs2.1.weight_v
430
+ vq_model.dec.resblocks.9.convs2.2.bias
431
+ vq_model.dec.resblocks.9.convs2.2.weight_g
432
+ vq_model.dec.resblocks.9.convs2.2.weight_v
433
+ vq_model.dec.resblocks.10.convs1.0.bias
434
+ vq_model.dec.resblocks.10.convs1.0.weight_g
435
+ vq_model.dec.resblocks.10.convs1.0.weight_v
436
+ vq_model.dec.resblocks.10.convs1.1.bias
437
+ vq_model.dec.resblocks.10.convs1.1.weight_g
438
+ vq_model.dec.resblocks.10.convs1.1.weight_v
439
+ vq_model.dec.resblocks.10.convs1.2.bias
440
+ vq_model.dec.resblocks.10.convs1.2.weight_g
441
+ vq_model.dec.resblocks.10.convs1.2.weight_v
442
+ vq_model.dec.resblocks.10.convs2.0.bias
443
+ vq_model.dec.resblocks.10.convs2.0.weight_g
444
+ vq_model.dec.resblocks.10.convs2.0.weight_v
445
+ vq_model.dec.resblocks.10.convs2.1.bias
446
+ vq_model.dec.resblocks.10.convs2.1.weight_g
447
+ vq_model.dec.resblocks.10.convs2.1.weight_v
448
+ vq_model.dec.resblocks.10.convs2.2.bias
449
+ vq_model.dec.resblocks.10.convs2.2.weight_g
450
+ vq_model.dec.resblocks.10.convs2.2.weight_v
451
+ vq_model.dec.resblocks.11.convs1.0.bias
452
+ vq_model.dec.resblocks.11.convs1.0.weight_g
453
+ vq_model.dec.resblocks.11.convs1.0.weight_v
454
+ vq_model.dec.resblocks.11.convs1.1.bias
455
+ vq_model.dec.resblocks.11.convs1.1.weight_g
456
+ vq_model.dec.resblocks.11.convs1.1.weight_v
457
+ vq_model.dec.resblocks.11.convs1.2.bias
458
+ vq_model.dec.resblocks.11.convs1.2.weight_g
459
+ vq_model.dec.resblocks.11.convs1.2.weight_v
460
+ vq_model.dec.resblocks.11.convs2.0.bias
461
+ vq_model.dec.resblocks.11.convs2.0.weight_g
462
+ vq_model.dec.resblocks.11.convs2.0.weight_v
463
+ vq_model.dec.resblocks.11.convs2.1.bias
464
+ vq_model.dec.resblocks.11.convs2.1.weight_g
465
+ vq_model.dec.resblocks.11.convs2.1.weight_v
466
+ vq_model.dec.resblocks.11.convs2.2.bias
467
+ vq_model.dec.resblocks.11.convs2.2.weight_g
468
+ vq_model.dec.resblocks.11.convs2.2.weight_v
469
+ vq_model.dec.resblocks.12.convs1.0.bias
470
+ vq_model.dec.resblocks.12.convs1.0.weight_g
471
+ vq_model.dec.resblocks.12.convs1.0.weight_v
472
+ vq_model.dec.resblocks.12.convs1.1.bias
473
+ vq_model.dec.resblocks.12.convs1.1.weight_g
474
+ vq_model.dec.resblocks.12.convs1.1.weight_v
475
+ vq_model.dec.resblocks.12.convs1.2.bias
476
+ vq_model.dec.resblocks.12.convs1.2.weight_g
477
+ vq_model.dec.resblocks.12.convs1.2.weight_v
478
+ vq_model.dec.resblocks.12.convs2.0.bias
479
+ vq_model.dec.resblocks.12.convs2.0.weight_g
480
+ vq_model.dec.resblocks.12.convs2.0.weight_v
481
+ vq_model.dec.resblocks.12.convs2.1.bias
482
+ vq_model.dec.resblocks.12.convs2.1.weight_g
483
+ vq_model.dec.resblocks.12.convs2.1.weight_v
484
+ vq_model.dec.resblocks.12.convs2.2.bias
485
+ vq_model.dec.resblocks.12.convs2.2.weight_g
486
+ vq_model.dec.resblocks.12.convs2.2.weight_v
487
+ vq_model.dec.resblocks.13.convs1.0.bias
488
+ vq_model.dec.resblocks.13.convs1.0.weight_g
489
+ vq_model.dec.resblocks.13.convs1.0.weight_v
490
+ vq_model.dec.resblocks.13.convs1.1.bias
491
+ vq_model.dec.resblocks.13.convs1.1.weight_g
492
+ vq_model.dec.resblocks.13.convs1.1.weight_v
493
+ vq_model.dec.resblocks.13.convs1.2.bias
494
+ vq_model.dec.resblocks.13.convs1.2.weight_g
495
+ vq_model.dec.resblocks.13.convs1.2.weight_v
496
+ vq_model.dec.resblocks.13.convs2.0.bias
497
+ vq_model.dec.resblocks.13.convs2.0.weight_g
498
+ vq_model.dec.resblocks.13.convs2.0.weight_v
499
+ vq_model.dec.resblocks.13.convs2.1.bias
500
+ vq_model.dec.resblocks.13.convs2.1.weight_g
501
+ vq_model.dec.resblocks.13.convs2.1.weight_v
502
+ vq_model.dec.resblocks.13.convs2.2.bias
503
+ vq_model.dec.resblocks.13.convs2.2.weight_g
504
+ vq_model.dec.resblocks.13.convs2.2.weight_v
505
+ vq_model.dec.resblocks.14.convs1.0.bias
506
+ vq_model.dec.resblocks.14.convs1.0.weight_g
507
+ vq_model.dec.resblocks.14.convs1.0.weight_v
508
+ vq_model.dec.resblocks.14.convs1.1.bias
509
+ vq_model.dec.resblocks.14.convs1.1.weight_g
510
+ vq_model.dec.resblocks.14.convs1.1.weight_v
511
+ vq_model.dec.resblocks.14.convs1.2.bias
512
+ vq_model.dec.resblocks.14.convs1.2.weight_g
513
+ vq_model.dec.resblocks.14.convs1.2.weight_v
514
+ vq_model.dec.resblocks.14.convs2.0.bias
515
+ vq_model.dec.resblocks.14.convs2.0.weight_g
516
+ vq_model.dec.resblocks.14.convs2.0.weight_v
517
+ vq_model.dec.resblocks.14.convs2.1.bias
518
+ vq_model.dec.resblocks.14.convs2.1.weight_g
519
+ vq_model.dec.resblocks.14.convs2.1.weight_v
520
+ vq_model.dec.resblocks.14.convs2.2.bias
521
+ vq_model.dec.resblocks.14.convs2.2.weight_g
522
+ vq_model.dec.resblocks.14.convs2.2.weight_v
523
+ vq_model.dec.conv_post.weight
524
+ vq_model.dec.cond.weight
525
+ vq_model.dec.cond.bias
526
+ vq_model.flow.flows.0.pre.weight
527
+ vq_model.flow.flows.0.pre.bias
528
+ vq_model.flow.flows.0.enc.in_layers.0.bias
529
+ vq_model.flow.flows.0.enc.in_layers.0.weight_g
530
+ vq_model.flow.flows.0.enc.in_layers.0.weight_v
531
+ vq_model.flow.flows.0.enc.in_layers.1.bias
532
+ vq_model.flow.flows.0.enc.in_layers.1.weight_g
533
+ vq_model.flow.flows.0.enc.in_layers.1.weight_v
534
+ vq_model.flow.flows.0.enc.in_layers.2.bias
535
+ vq_model.flow.flows.0.enc.in_layers.2.weight_g
536
+ vq_model.flow.flows.0.enc.in_layers.2.weight_v
537
+ vq_model.flow.flows.0.enc.in_layers.3.bias
538
+ vq_model.flow.flows.0.enc.in_layers.3.weight_g
539
+ vq_model.flow.flows.0.enc.in_layers.3.weight_v
540
+ vq_model.flow.flows.0.enc.res_skip_layers.0.bias
541
+ vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g
542
+ vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v
543
+ vq_model.flow.flows.0.enc.res_skip_layers.1.bias
544
+ vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g
545
+ vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v
546
+ vq_model.flow.flows.0.enc.res_skip_layers.2.bias
547
+ vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g
548
+ vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v
549
+ vq_model.flow.flows.0.enc.res_skip_layers.3.bias
550
+ vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g
551
+ vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v
552
+ vq_model.flow.flows.0.enc.cond_layer.bias
553
+ vq_model.flow.flows.0.enc.cond_layer.weight_g
554
+ vq_model.flow.flows.0.enc.cond_layer.weight_v
555
+ vq_model.flow.flows.0.post.weight
556
+ vq_model.flow.flows.0.post.bias
557
+ vq_model.flow.flows.2.pre.weight
558
+ vq_model.flow.flows.2.pre.bias
559
+ vq_model.flow.flows.2.enc.in_layers.0.bias
560
+ vq_model.flow.flows.2.enc.in_layers.0.weight_g
561
+ vq_model.flow.flows.2.enc.in_layers.0.weight_v
562
+ vq_model.flow.flows.2.enc.in_layers.1.bias
563
+ vq_model.flow.flows.2.enc.in_layers.1.weight_g
564
+ vq_model.flow.flows.2.enc.in_layers.1.weight_v
565
+ vq_model.flow.flows.2.enc.in_layers.2.bias
566
+ vq_model.flow.flows.2.enc.in_layers.2.weight_g
567
+ vq_model.flow.flows.2.enc.in_layers.2.weight_v
568
+ vq_model.flow.flows.2.enc.in_layers.3.bias
569
+ vq_model.flow.flows.2.enc.in_layers.3.weight_g
570
+ vq_model.flow.flows.2.enc.in_layers.3.weight_v
571
+ vq_model.flow.flows.2.enc.res_skip_layers.0.bias
572
+ vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g
573
+ vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v
574
+ vq_model.flow.flows.2.enc.res_skip_layers.1.bias
575
+ vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g
576
+ vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v
577
+ vq_model.flow.flows.2.enc.res_skip_layers.2.bias
578
+ vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g
579
+ vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v
580
+ vq_model.flow.flows.2.enc.res_skip_layers.3.bias
581
+ vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g
582
+ vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v
583
+ vq_model.flow.flows.2.enc.cond_layer.bias
584
+ vq_model.flow.flows.2.enc.cond_layer.weight_g
585
+ vq_model.flow.flows.2.enc.cond_layer.weight_v
586
+ vq_model.flow.flows.2.post.weight
587
+ vq_model.flow.flows.2.post.bias
588
+ vq_model.flow.flows.4.pre.weight
589
+ vq_model.flow.flows.4.pre.bias
590
+ vq_model.flow.flows.4.enc.in_layers.0.bias
591
+ vq_model.flow.flows.4.enc.in_layers.0.weight_g
592
+ vq_model.flow.flows.4.enc.in_layers.0.weight_v
593
+ vq_model.flow.flows.4.enc.in_layers.1.bias
594
+ vq_model.flow.flows.4.enc.in_layers.1.weight_g
595
+ vq_model.flow.flows.4.enc.in_layers.1.weight_v
596
+ vq_model.flow.flows.4.enc.in_layers.2.bias
597
+ vq_model.flow.flows.4.enc.in_layers.2.weight_g
598
+ vq_model.flow.flows.4.enc.in_layers.2.weight_v
599
+ vq_model.flow.flows.4.enc.in_layers.3.bias
600
+ vq_model.flow.flows.4.enc.in_layers.3.weight_g
601
+ vq_model.flow.flows.4.enc.in_layers.3.weight_v
602
+ vq_model.flow.flows.4.enc.res_skip_layers.0.bias
603
+ vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g
604
+ vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v
605
+ vq_model.flow.flows.4.enc.res_skip_layers.1.bias
606
+ vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g
607
+ vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v
608
+ vq_model.flow.flows.4.enc.res_skip_layers.2.bias
609
+ vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g
610
+ vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v
611
+ vq_model.flow.flows.4.enc.res_skip_layers.3.bias
612
+ vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g
613
+ vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v
614
+ vq_model.flow.flows.4.enc.cond_layer.bias
615
+ vq_model.flow.flows.4.enc.cond_layer.weight_g
616
+ vq_model.flow.flows.4.enc.cond_layer.weight_v
617
+ vq_model.flow.flows.4.post.weight
618
+ vq_model.flow.flows.4.post.bias
619
+ vq_model.flow.flows.6.pre.weight
620
+ vq_model.flow.flows.6.pre.bias
621
+ vq_model.flow.flows.6.enc.in_layers.0.bias
622
+ vq_model.flow.flows.6.enc.in_layers.0.weight_g
623
+ vq_model.flow.flows.6.enc.in_layers.0.weight_v
624
+ vq_model.flow.flows.6.enc.in_layers.1.bias
625
+ vq_model.flow.flows.6.enc.in_layers.1.weight_g
626
+ vq_model.flow.flows.6.enc.in_layers.1.weight_v
627
+ vq_model.flow.flows.6.enc.in_layers.2.bias
628
+ vq_model.flow.flows.6.enc.in_layers.2.weight_g
629
+ vq_model.flow.flows.6.enc.in_layers.2.weight_v
630
+ vq_model.flow.flows.6.enc.in_layers.3.bias
631
+ vq_model.flow.flows.6.enc.in_layers.3.weight_g
632
+ vq_model.flow.flows.6.enc.in_layers.3.weight_v
633
+ vq_model.flow.flows.6.enc.res_skip_layers.0.bias
634
+ vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g
635
+ vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v
636
+ vq_model.flow.flows.6.enc.res_skip_layers.1.bias
637
+ vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g
638
+ vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v
639
+ vq_model.flow.flows.6.enc.res_skip_layers.2.bias
640
+ vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g
641
+ vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v
642
+ vq_model.flow.flows.6.enc.res_skip_layers.3.bias
643
+ vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g
644
+ vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v
645
+ vq_model.flow.flows.6.enc.cond_layer.bias
646
+ vq_model.flow.flows.6.enc.cond_layer.weight_g
647
+ vq_model.flow.flows.6.enc.cond_layer.weight_v
648
+ vq_model.flow.flows.6.post.weight
649
+ vq_model.flow.flows.6.post.bias
650
+ vq_model.quantizer.vq.layers.0._codebook.embed
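The block above is the tail of a plain-text manifest of decoder, flow, and quantizer parameter names (one name per line). As a hedged illustration only, the sketch below shows one way such a manifest could be checked against the keys of a loaded checkpoint; the checkpoint path, the use of torch.load, the optional "weight" nesting, and the handling of the vq_model. prefix are assumptions, not something this commit defines.

    # Minimal sketch (not part of this commit): verify that every name in the
    # manifest exists in a loaded state dict. Both paths are placeholders.
    import torch

    MANIFEST_PATH = "genie_tts/Data/v2ProPlus/Keys/vits_weights.txt"
    CHECKPOINT_PATH = "path/to/sovits_checkpoint.pth"  # hypothetical checkpoint

    with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
        expected = [line.strip() for line in f if line.strip()]

    checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")
    # Some checkpoints nest their parameters under a "weight" entry; fall back
    # to treating the loaded object itself as the state dict.
    weights = checkpoint.get("weight", checkpoint) if isinstance(checkpoint, dict) else checkpoint

    # Depending on how the converter names keys, the "vq_model." prefix used in
    # the manifest may need to be stripped before comparing.
    missing = [k for k in expected
               if k not in weights and k.removeprefix("vq_model.") not in weights]
    print(f"{len(expected)} keys listed, {len(missing)} missing from the checkpoint")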
genie_tts/G2P/Chinese/CorrectPronunciation.py CHANGED
@@ -1,50 +1,50 @@
1
- import os
2
- import pickle
3
- from typing import List, Dict, Any, Union
4
-
5
- from ...Core.Resources import Chinese_G2P_DIR
6
-
7
- # 常量定义
8
- DEFAULT_CACHE_PATH = os.path.join(Chinese_G2P_DIR, "polyphonic.pickle")
9
-
10
-
11
- class PolyphonicDictManager:
12
- _data: Dict[str, Any] = {}
13
-
14
- @classmethod
15
- def get_data(cls, path: str = DEFAULT_CACHE_PATH) -> Dict[str, Any]:
16
- if not cls._data:
17
- with open(path, "rb") as f:
18
- cls._data = pickle.load(f)
19
- return cls._data
20
-
21
-
22
- def correct_pronunciation(word: str, word_pinyin: List[str]) -> Union[List[str], str]:
23
- """
24
- 根据加载的字典修正发音,作为供外部程序调用的独立接口。
25
- 逻辑:优先查找整词修正,如果没有整词匹配,则遍历每个字符进行单字修正。
26
-
27
- Input:
28
- word (str): 原始中文字符串,例如 "银行"。
29
- word_pinyins (List[str]): 当前预测的拼音列表,例如 ['yin2', 'xing2']。
30
-
31
- Output:
32
- Union[List[str], str]: 修正后的拼音列表或字符串。
33
-
34
- Example:
35
- # 字典包含整词 {'银行': ['yin2', 'hang2']}
36
- result = correct_pronunciation("银行", ["yin2", "xing2"])
37
- # Result: ["yin2", "hang2"]
38
- """
39
- pp_dict = PolyphonicDictManager.get_data()
40
- new_word_pinyin = list(word_pinyin)
41
- # 1. 尝试整词匹配
42
- if new_pinyin := pp_dict.get(word):
43
- return new_pinyin
44
- # 2. 逐字修正
45
- for idx, w in enumerate(word):
46
- if idx >= len(new_word_pinyin):
47
- break
48
- if w_pinyin := pp_dict.get(w):
49
- new_word_pinyin[idx] = w_pinyin[0]
50
- return new_word_pinyin
 
1
+ import os
2
+ import pickle
3
+ from typing import List, Dict, Any, Union
4
+
5
+ from ...Core.Resources import Chinese_G2P_DIR
6
+
7
+ # 常量定义
8
+ DEFAULT_CACHE_PATH = os.path.join(Chinese_G2P_DIR, "polyphonic.pickle")
9
+
10
+
11
+ class PolyphonicDictManager:
12
+ _data: Dict[str, Any] = {}
13
+
14
+ @classmethod
15
+ def get_data(cls, path: str = DEFAULT_CACHE_PATH) -> Dict[str, Any]:
16
+ if not cls._data:
17
+ with open(path, "rb") as f:
18
+ cls._data = pickle.load(f)
19
+ return cls._data
20
+
21
+
22
+ def correct_pronunciation(word: str, word_pinyin: List[str]) -> Union[List[str], str]:
23
+ """
24
+ 根据加载的字典修正发音,作为供外部程序调用的独立接口。
25
+ 逻辑:优先查找整词修正,如果没有整词匹配,则遍历每个字符进行单字修正。
26
+
27
+ Input:
28
+ word (str): 原始中文字符串,例如 "银行"。
29
+ word_pinyins (List[str]): 当前预测的拼音列表,例如 ['yin2', 'xing2']。
30
+
31
+ Output:
32
+ Union[List[str], str]: 修正后的拼音列表或字符串。
33
+
34
+ Example:
35
+ # 字典包含整词 {'银行': ['yin2', 'hang2']}
36
+ result = correct_pronunciation("银行", ["yin2", "xing2"])
37
+ # Result: ["yin2", "hang2"]
38
+ """
39
+ pp_dict = PolyphonicDictManager.get_data()
40
+ new_word_pinyin = list(word_pinyin)
41
+ # 1. 尝试整词匹配
42
+ if new_pinyin := pp_dict.get(word):
43
+ return new_pinyin
44
+ # 2. 逐字修正
45
+ for idx, w in enumerate(word):
46
+ if idx >= len(new_word_pinyin):
47
+ break
48
+ if w_pinyin := pp_dict.get(w):
49
+ new_word_pinyin[idx] = w_pinyin[0]
50
+ return new_word_pinyin
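As a usage illustration of the API shown in the diff above, a minimal sketch mirroring the docstring example; the import path assumes this repository's package layout and a polyphonic.pickle that contains the relevant entries.

    # Hedged usage sketch based on the docstring of correct_pronunciation.
    from genie_tts.G2P.Chinese.CorrectPronunciation import correct_pronunciation

    # Whole-word match takes priority: if the dictionary maps "银行" to
    # ["yin2", "hang2"], the predicted ["yin2", "xing2"] is replaced outright.
    print(correct_pronunciation("银行", ["yin2", "xing2"]))

    # Without a whole-word entry, each character is looked up individually:
    # characters with an entry take its first candidate, the rest keep the
    # pinyin that was passed in.
    print(correct_pronunciation("银票", ["yin2", "piao4"]))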
genie_tts/G2P/Chinese/Erhua.py CHANGED
@@ -1,49 +1,49 @@
1
- from typing import List, Tuple, Set
2
-
3
-
4
- class ErhuaProcessor:
5
- """
6
- 处理中文G2P中的儿化音逻辑。
7
- """
8
-
9
- def __init__(self):
10
- self.must_erhua: Set[str] = {
11
- "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
12
- }
13
- self.not_erhua: Set[str] = {
14
- "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿",
15
- "妻儿", "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿",
16
- "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿",
17
- "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿",
18
- "猪儿", "猫儿", "狗儿", "少儿",
19
- }
20
-
21
- def merge_erhua(self, initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]:
22
- # 1. 修正 er1 发音为 er2 (当'儿'在词尾且发音为er1时)
23
- for i, phn in enumerate(finals):
24
- if i == len(finals) - 1 and word[i] == "儿" and phn == "er1":
25
- finals[i] = "er2"
26
- # 2. 检查是否跳过儿化处理
27
- if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}):
28
- return initials, finals
29
- # 3. 长度校验 (处理如 "……" 等长度不一致的特殊符号情况)
30
- if len(finals) != len(word):
31
- return initials, finals
32
- # 4. 执行儿化合并逻辑 (与前一个字发同音)
33
- new_initials = []
34
- new_finals = []
35
- for i, phn in enumerate(finals):
36
- # 判断是否需要合并儿化音
37
- # 条件: 是最后一个字 + 是"儿" + 发音是er2/er5 + 后两字不在非儿化表中 + 前面已有韵母
38
- if (
39
- i == len(finals) - 1
40
- and word[i] == "儿"
41
- and phn in {"er2", "er5"}
42
- and word[-2:] not in self.not_erhua
43
- and new_finals
44
- ):
45
- # 将 'er' 加上前一个字的声调
46
- phn = "er" + new_finals[-1][-1]
47
- new_initials.append(initials[i])
48
- new_finals.append(phn)
49
- return new_initials, new_finals
 
1
+ from typing import List, Tuple, Set
2
+
3
+
4
+ class ErhuaProcessor:
5
+ """
6
+ 处理中文G2P中的儿化音逻辑。
7
+ """
8
+
9
+ def __init__(self):
10
+ self.must_erhua: Set[str] = {
11
+ "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
12
+ }
13
+ self.not_erhua: Set[str] = {
14
+ "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿",
15
+ "妻儿", "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿",
16
+ "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿",
17
+ "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿",
18
+ "猪儿", "猫儿", "狗儿", "少儿",
19
+ }
20
+
21
+ def merge_erhua(self, initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]:
22
+ # 1. 修正 er1 发音为 er2 (当'儿'在词尾且发音为er1时)
23
+ for i, phn in enumerate(finals):
24
+ if i == len(finals) - 1 and word[i] == "儿" and phn == "er1":
25
+ finals[i] = "er2"
26
+ # 2. 检查是否跳过儿化处理
27
+ if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}):
28
+ return initials, finals
29
+ # 3. 长度校验 (处理如 "……" 等长度不一致的特殊符号情况)
30
+ if len(finals) != len(word):
31
+ return initials, finals
32
+ # 4. 执行儿化合并逻辑 (与前一个字发同音)
33
+ new_initials = []
34
+ new_finals = []
35
+ for i, phn in enumerate(finals):
36
+ # 判断是否需要合并儿化音
37
+ # 条件: 是最后一个字 + 是"儿" + 发音是er2/er5 + 后两字不在非儿化表中 + 前面已有韵母
38
+ if (
39
+ i == len(finals) - 1
40
+ and word[i] == "儿"
41
+ and phn in {"er2", "er5"}
42
+ and word[-2:] not in self.not_erhua
43
+ and new_finals
44
+ ):
45
+ # 将 'er' 加上前一个字的声调
46
+ phn = "er" + new_finals[-1][-1]
47
+ new_initials.append(initials[i])
48
+ new_finals.append(phn)
49
+ return new_initials, new_finals
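As a usage illustration of ErhuaProcessor.merge_erhua above, a hedged sketch; the import path assumes this repository's layout and the initials/finals values are illustrative pypinyin-style inputs.

    # Hedged usage sketch for the erhua merging logic shown above.
    from genie_tts.G2P.Chinese.Erhua import ErhuaProcessor

    processor = ErhuaProcessor()

    # "范儿" is in must_erhua, so the trailing 儿 keeps its own slot but its
    # final is rewritten to "er" plus the tone of the preceding syllable
    # (here "an4" leads to "er4").
    print(processor.merge_erhua(["f", ""], ["an4", "er2"], "范儿", "n"))

    # Words in not_erhua (e.g. "女儿") or with pos in {"a", "j", "nr"} are
    # returned unchanged.
    print(processor.merge_erhua(["n", ""], ["v3", "er2"], "女儿", "n"))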
genie_tts/G2P/Chinese/Normalization/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (204 Bytes).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/char_convert.cpython-311.pyc ADDED
Binary file (66.1 kB).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/chronology.cpython-311.pyc ADDED
Binary file (4.52 kB).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/constants.cpython-311.pyc ADDED
Binary file (2.36 kB).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/num.cpython-311.pyc ADDED
Binary file (12.8 kB).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/phonecode.cpython-311.pyc ADDED
Binary file (2.26 kB).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/quantifier.cpython-311.pyc ADDED
Binary file (1.94 kB).
 
genie_tts/G2P/Chinese/Normalization/__pycache__/text_normlization.cpython-311.pyc ADDED
Binary file (10.8 kB).
 
genie_tts/G2P/Chinese/ToneSandhi.py CHANGED
@@ -1,354 +1,354 @@
1
- # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- """
16
- 中文拼音变调(Tone Sandhi)自动处理器
17
- """
18
-
19
- from typing import List
20
- from typing import Tuple
21
- import jieba_fast as jieba
22
- from pypinyin import lazy_pinyin
23
- from pypinyin import Style
24
-
25
-
26
- class ToneSandhi:
27
- def __init__(self):
28
- self.must_neural_tone_words = {
29
- "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨",
30
- "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠",
31
- "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么",
32
- "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主",
33
- "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳",
34
- "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄",
35
- "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在",
36
- "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同",
37
- "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太",
38
- "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神",
39
- "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
40
- "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗",
41
- "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛",
42
- "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意",
43
- "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务",
44
- "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心",
45
- "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头",
46
- "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃",
47
- "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友",
48
- "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防",
49
- "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举",
50
- "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
51
- "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么",
52
- "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通",
53
- "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌",
54
- "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头",
55
- "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆",
56
- "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精",
57
- "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么",
58
- "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕",
59
- "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚",
60
- "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤",
61
- "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
62
- "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析",
63
- "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟",
64
- "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么",
65
- "人家", "亲戚", "亲家", "交��", "云彩", "事情", "买卖", "主意", "丫头", "丧气",
66
- "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司",
67
- "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用",
68
- "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾",
69
- "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜",
70
- "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记",
71
- }
72
- self.must_not_neural_tone_words = {
73
- "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人",
74
- "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈",
75
- "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳",
76
- "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青",
77
- }
78
- self.punc = ":,;。?!“”‘’':,;.?!"
79
-
80
- # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
81
- # e.g.
82
- # word: "家里"
83
- # pos: "s"
84
- # finals: ['ia1', 'i3']
85
- def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
86
- # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
87
- for j, item in enumerate(word):
88
- if (
89
- j - 1 >= 0
90
- and item == word[j - 1]
91
- and pos[0] in {"n", "v", "a"}
92
- and word not in self.must_not_neural_tone_words
93
- ):
94
- finals[j] = finals[j][:-1] + "5"
95
- ge_idx = word.find("个")
96
- if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
97
- finals[-1] = finals[-1][:-1] + "5"
98
- elif len(word) >= 1 and word[-1] in "的地得":
99
- finals[-1] = finals[-1][:-1] + "5"
100
- # e.g. 走了, 看着, 去过
101
- elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
102
- finals[-1] = finals[-1][:-1] + "5"
103
- elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
104
- finals[-1] = finals[-1][:-1] + "5"
105
- # e.g. 桌上, 地下, 家里
106
- elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
107
- finals[-1] = finals[-1][:-1] + "5"
108
- # e.g. 上来, 下去
109
- elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
110
- finals[-1] = finals[-1][:-1] + "5"
111
- # 个做量词
112
- elif (
113
- ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
114
- ) or word == "个":
115
- finals[ge_idx] = finals[ge_idx][:-1] + "5"
116
- else:
117
- if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
118
- finals[-1] = finals[-1][:-1] + "5"
119
-
120
- word_list = self._split_word(word)
121
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]]
122
- for i, word in enumerate(word_list):
123
- # conventional neural in Chinese
124
- if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
125
- finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
126
- finals = sum(finals_list, [])
127
- return finals
128
-
129
- @staticmethod
130
- def _bu_sandhi(word: str, finals: List[str]) -> List[str]:
131
- # e.g. 看不懂
132
- if len(word) == 3 and word[1] == "不":
133
- finals[1] = finals[1][:-1] + "5"
134
- else:
135
- for i, char in enumerate(word):
136
- # "不" before tone4 should be bu2, e.g. 不怕
137
- if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
138
- finals[i] = finals[i][:-1] + "2"
139
- return finals
140
-
141
- def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
142
- # "一" in number sequences, e.g. 一零零, 二一零
143
- if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]):
144
- return finals
145
- # "一" between reduplication words should be yi5, e.g. 看一看
146
- elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
147
- finals[1] = finals[1][:-1] + "5"
148
- # when "一" is ordinal word, it should be yi1
149
- elif word.startswith("第一"):
150
- finals[1] = finals[1][:-1] + "1"
151
- else:
152
- for i, char in enumerate(word):
153
- if char == "一" and i + 1 < len(word):
154
- # "一" before tone4 should be yi2, e.g. 一段
155
- if finals[i + 1][-1] == "4":
156
- finals[i] = finals[i][:-1] + "2"
157
- # "一" before non-tone4 should be yi4, e.g. 一天
158
- else:
159
- # "一" 后面如果是标点,还读一声
160
- if word[i + 1] not in self.punc:
161
- finals[i] = finals[i][:-1] + "4"
162
- return finals
163
-
164
- @staticmethod
165
- def _split_word(word: str) -> List[str]:
166
- word_list = jieba.cut_for_search(word)
167
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
168
- first_subword = word_list[0]
169
- first_begin_idx = word.find(first_subword)
170
- if first_begin_idx == 0:
171
- second_subword = word[len(first_subword):]
172
- new_word_list = [first_subword, second_subword]
173
- else:
174
- second_subword = word[: -len(first_subword)]
175
- new_word_list = [second_subword, first_subword]
176
- return new_word_list
177
-
178
- def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
179
- if len(word) == 2 and self._all_tone_three(finals):
180
- finals[0] = finals[0][:-1] + "2"
181
- elif len(word) == 3:
182
- word_list = self._split_word(word)
183
- if self._all_tone_three(finals):
184
- # disyllabic + monosyllabic, e.g. 蒙古/包
185
- if len(word_list[0]) == 2:
186
- finals[0] = finals[0][:-1] + "2"
187
- finals[1] = finals[1][:-1] + "2"
188
- # monosyllabic + disyllabic, e.g. 纸/老虎
189
- elif len(word_list[0]) == 1:
190
- finals[1] = finals[1][:-1] + "2"
191
- else:
192
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]]
193
- if len(finals_list) == 2:
194
- for i, sub in enumerate(finals_list):
195
- # e.g. 所有/人
196
- if self._all_tone_three(sub) and len(sub) == 2:
197
- finals_list[i][0] = finals_list[i][0][:-1] + "2"
198
- # e.g. 好/喜欢
199
- elif (
200
- i == 1
201
- and not self._all_tone_three(sub)
202
- and finals_list[i][0][-1] == "3"
203
- and finals_list[0][-1][-1] == "3"
204
- ):
205
- finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
206
- finals = sum(finals_list, [])
207
- # split idiom into two words whose length is 2
208
- elif len(word) == 4:
209
- finals_list = [finals[:2], finals[2:]]
210
- finals = []
211
- for sub in finals_list:
212
- if self._all_tone_three(sub):
213
- sub[0] = sub[0][:-1] + "2"
214
- finals += sub
215
-
216
- return finals
217
-
218
- @staticmethod
219
- def _all_tone_three(finals: List[str]) -> bool:
220
- # 增加 len(x) > 0 的判断,防止空字符串导致崩溃
221
- return all(len(x) > 0 and x[-1] == "3" for x in finals)
222
-
223
- @staticmethod
224
- def _merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
225
- new_seg = []
226
- last_word = ""
227
- for word, pos in seg:
228
- if last_word == "不":
229
- word = last_word + word
230
- if word != "不":
231
- new_seg.append((word, pos))
232
- last_word = word[:]
233
- if last_word == "不":
234
- new_seg.append((last_word, "d"))
235
- return new_seg
236
-
237
- @staticmethod
238
- def _merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
239
- new_seg = []
240
- i = 0
241
- # function 1
242
- while i < len(seg):
243
- word, pos = seg[i]
244
- merged = False
245
- if i - 1 >= 0 and word == "一" and i + 1 < len(seg):
246
- last = new_seg[-1] if new_seg else seg[i - 1]
247
- if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
248
- combined = last[0] + "一" + seg[i + 1][0]
249
- new_seg[-1] = [combined, last[1]]
250
- i += 2
251
- merged = True
252
- if not merged:
253
- new_seg.append([word, pos])
254
- i += 1
255
- seg = new_seg
256
- new_seg = []
257
- # function 2
258
- for word, pos in seg:
259
- if new_seg and new_seg[-1][0] == "一":
260
- new_seg[-1][0] = new_seg[-1][0] + word
261
- else:
262
- new_seg.append([word, pos])
263
- return new_seg
264
-
265
- # the first and the second words are all_tone_three
266
- def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
267
- new_seg = []
268
- sub_finals_list = [
269
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
270
- ]
271
- assert len(sub_finals_list) == len(seg)
272
- merge_last = [False] * len(seg)
273
- for i, (word, pos) in enumerate(seg):
274
- if (
275
- i - 1 >= 0
276
- and self._all_tone_three(sub_finals_list[i - 1])
277
- and self._all_tone_three(sub_finals_list[i])
278
- and not merge_last[i - 1]
279
- ):
280
- # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
281
- if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
282
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
283
- merge_last[i] = True
284
- else:
285
- new_seg.append([word, pos])
286
- else:
287
- new_seg.append([word, pos])
288
-
289
- return new_seg
290
-
291
- @staticmethod
292
- def _is_reduplication(word: str) -> bool:
293
- return len(word) == 2 and word[0] == word[1]
294
-
295
- # the last char of first word and the first char of second word is tone_three
296
- def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
297
- new_seg = []
298
- sub_finals_list = [
299
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
300
- ]
301
- assert len(sub_finals_list) == len(seg)
302
- merge_last = [False] * len(seg)
303
- for i, (word, pos) in enumerate(seg):
304
- if (
305
- i - 1 >= 0
306
- and sub_finals_list[i - 1][-1][-1] == "3"
307
- and sub_finals_list[i][0][-1] == "3"
308
- and not merge_last[i - 1]
309
- ):
310
- # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
311
- if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
312
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
313
- merge_last[i] = True
314
- else:
315
- new_seg.append([word, pos])
316
- else:
317
- new_seg.append([word, pos])
318
- return new_seg
319
-
320
- @staticmethod
321
- def _merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
322
- new_seg = []
323
- for i, (word, pos) in enumerate(seg):
324
- if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
325
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
326
- else:
327
- new_seg.append([word, pos])
328
- return new_seg
329
-
330
- @staticmethod
331
- def _merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
332
- new_seg = []
333
- for i, (word, pos) in enumerate(seg):
334
- if new_seg and word == new_seg[-1][0]:
335
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
336
- else:
337
- new_seg.append([word, pos])
338
- return new_seg
339
-
340
- def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
341
- seg = self._merge_bu(seg)
342
- seg = self._merge_yi(seg)
343
- seg = self._merge_reduplication(seg)
344
- seg = self._merge_continuous_three_tones(seg)
345
- seg = self._merge_continuous_three_tones_2(seg)
346
- seg = self._merge_er(seg)
347
- return seg
348
-
349
- def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
350
- finals = self._bu_sandhi(word, finals)
351
- finals = self._yi_sandhi(word, finals)
352
- finals = self._neural_sandhi(word, pos, finals)
353
- finals = self._three_sandhi(word, finals)
354
- return finals
 
1
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ 中文拼音变调(Tone Sandhi)自动处理器
17
+ """
18
+
19
+ from typing import List
20
+ from typing import Tuple
21
+ import jieba_fast as jieba
22
+ from pypinyin import lazy_pinyin
23
+ from pypinyin import Style
24
+
25
+
26
+ class ToneSandhi:
27
+ def __init__(self):
28
+ self.must_neural_tone_words = {
29
+ "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨",
30
+ "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠",
31
+ "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么",
32
+ "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主",
33
+ "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳",
34
+ "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄",
35
+ "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在",
36
+ "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同",
37
+ "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太",
38
+ "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神",
39
+ "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
40
+ "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗",
41
+ "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛",
42
+ "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意",
43
+ "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务",
44
+ "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心",
45
+ "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头",
46
+ "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃",
47
+ "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友",
48
+ "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防",
49
+ "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举",
50
+ "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
51
+ "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么",
52
+ "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通",
53
+ "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌",
54
+ "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头",
55
+ "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆",
56
+ "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精",
57
+ "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么",
58
+ "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕",
59
+ "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚",
60
+ "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤",
61
+ "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
62
+ "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析",
63
+ "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟",
64
+ "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么",
65
+ "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气",
66
+ "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司",
67
+ "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用",
68
+ "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾",
69
+ "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜",
70
+ "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记",
71
+ }
72
+ self.must_not_neural_tone_words = {
73
+ "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人",
74
+ "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈",
75
+ "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳",
76
+ "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青",
77
+ }
78
+ self.punc = ":,;。?!“”‘’':,;.?!"
79
+
80
+ # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
81
+ # e.g.
82
+ # word: "家里"
83
+ # pos: "s"
84
+ # finals: ['ia1', 'i3']
85
+ def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
86
+ # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
87
+ for j, item in enumerate(word):
88
+ if (
89
+ j - 1 >= 0
90
+ and item == word[j - 1]
91
+ and pos[0] in {"n", "v", "a"}
92
+ and word not in self.must_not_neural_tone_words
93
+ ):
94
+ finals[j] = finals[j][:-1] + "5"
95
+ ge_idx = word.find("个")
96
+ if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
97
+ finals[-1] = finals[-1][:-1] + "5"
98
+ elif len(word) >= 1 and word[-1] in "的地得":
99
+ finals[-1] = finals[-1][:-1] + "5"
100
+ # e.g. 走了, 看着, 去过
101
+ elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
102
+ finals[-1] = finals[-1][:-1] + "5"
103
+ elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
104
+ finals[-1] = finals[-1][:-1] + "5"
105
+ # e.g. 桌上, 地下, 家里
106
+ elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
107
+ finals[-1] = finals[-1][:-1] + "5"
108
+ # e.g. 上来, 下去
109
+ elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
110
+ finals[-1] = finals[-1][:-1] + "5"
111
+ # 个做量词
112
+ elif (
113
+ ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
114
+ ) or word == "个":
115
+ finals[ge_idx] = finals[ge_idx][:-1] + "5"
116
+ else:
117
+ if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
118
+ finals[-1] = finals[-1][:-1] + "5"
119
+
120
+ word_list = self._split_word(word)
121
+ finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]]
122
+ for i, word in enumerate(word_list):
123
+ # conventional neural in Chinese
124
+ if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
125
+ finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
126
+ finals = sum(finals_list, [])
127
+ return finals
128
+
129
+ @staticmethod
130
+ def _bu_sandhi(word: str, finals: List[str]) -> List[str]:
131
+ # e.g. 看不懂
132
+ if len(word) == 3 and word[1] == "不":
133
+ finals[1] = finals[1][:-1] + "5"
134
+ else:
135
+ for i, char in enumerate(word):
136
+ # "不" before tone4 should be bu2, e.g. 不怕
137
+ if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
138
+ finals[i] = finals[i][:-1] + "2"
139
+ return finals
140
+
141
+ def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
142
+ # "一" in number sequences, e.g. 一零零, 二一零
143
+ if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]):
144
+ return finals
145
+ # "一" between reduplication words should be yi5, e.g. 看一看
146
+ elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
147
+ finals[1] = finals[1][:-1] + "5"
148
+ # when "一" is ordinal word, it should be yi1
149
+ elif word.startswith("第一"):
150
+ finals[1] = finals[1][:-1] + "1"
151
+ else:
152
+ for i, char in enumerate(word):
153
+ if char == "一" and i + 1 < len(word):
154
+ # "一" before tone4 should be yi2, e.g. 一段
155
+ if finals[i + 1][-1] == "4":
156
+ finals[i] = finals[i][:-1] + "2"
157
+ # "一" before non-tone4 should be yi4, e.g. 一天
158
+ else:
159
+ # "一" 后面如果是标点,还读一声
160
+ if word[i + 1] not in self.punc:
161
+ finals[i] = finals[i][:-1] + "4"
162
+ return finals
163
+
164
+ @staticmethod
165
+ def _split_word(word: str) -> List[str]:
166
+ word_list = jieba.cut_for_search(word)
167
+ word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
168
+ first_subword = word_list[0]
169
+ first_begin_idx = word.find(first_subword)
170
+ if first_begin_idx == 0:
171
+ second_subword = word[len(first_subword):]
172
+ new_word_list = [first_subword, second_subword]
173
+ else:
174
+ second_subword = word[: -len(first_subword)]
175
+ new_word_list = [second_subword, first_subword]
176
+ return new_word_list
177
+
178
+ def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
179
+ if len(word) == 2 and self._all_tone_three(finals):
180
+ finals[0] = finals[0][:-1] + "2"
181
+ elif len(word) == 3:
182
+ word_list = self._split_word(word)
183
+ if self._all_tone_three(finals):
184
+ # disyllabic + monosyllabic, e.g. 蒙古/包
185
+ if len(word_list[0]) == 2:
186
+ finals[0] = finals[0][:-1] + "2"
187
+ finals[1] = finals[1][:-1] + "2"
188
+ # monosyllabic + disyllabic, e.g. 纸/老虎
189
+ elif len(word_list[0]) == 1:
190
+ finals[1] = finals[1][:-1] + "2"
191
+ else:
192
+ finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]]
193
+ if len(finals_list) == 2:
194
+ for i, sub in enumerate(finals_list):
195
+ # e.g. 所有/人
196
+ if self._all_tone_three(sub) and len(sub) == 2:
197
+ finals_list[i][0] = finals_list[i][0][:-1] + "2"
198
+ # e.g. 好/喜欢
199
+ elif (
200
+ i == 1
201
+ and not self._all_tone_three(sub)
202
+ and finals_list[i][0][-1] == "3"
203
+ and finals_list[0][-1][-1] == "3"
204
+ ):
205
+ finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
206
+ finals = sum(finals_list, [])
207
+ # split idiom into two words whose length is 2
208
+ elif len(word) == 4:
209
+ finals_list = [finals[:2], finals[2:]]
210
+ finals = []
211
+ for sub in finals_list:
212
+ if self._all_tone_three(sub):
213
+ sub[0] = sub[0][:-1] + "2"
214
+ finals += sub
215
+
216
+ return finals
217
+
218
+ @staticmethod
219
+ def _all_tone_three(finals: List[str]) -> bool:
220
+ # 增加 len(x) > 0 的判断,防止空字符串导致崩溃
221
+ return all(len(x) > 0 and x[-1] == "3" for x in finals)
222
+
223
+ @staticmethod
224
+ def _merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
225
+ new_seg = []
226
+ last_word = ""
227
+ for word, pos in seg:
228
+ if last_word == "不":
229
+ word = last_word + word
230
+ if word != "不":
231
+ new_seg.append((word, pos))
232
+ last_word = word[:]
233
+ if last_word == "不":
234
+ new_seg.append((last_word, "d"))
235
+ return new_seg
236
+
237
+ @staticmethod
238
+ def _merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
239
+ new_seg = []
240
+ i = 0
241
+ # function 1
242
+ while i < len(seg):
243
+ word, pos = seg[i]
244
+ merged = False
245
+ if i - 1 >= 0 and word == "一" and i + 1 < len(seg):
246
+ last = new_seg[-1] if new_seg else seg[i - 1]
247
+ if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
248
+ combined = last[0] + "一" + seg[i + 1][0]
249
+ new_seg[-1] = [combined, last[1]]
250
+ i += 2
251
+ merged = True
252
+ if not merged:
253
+ new_seg.append([word, pos])
254
+ i += 1
255
+ seg = new_seg
256
+ new_seg = []
257
+ # function 2
258
+ for word, pos in seg:
259
+ if new_seg and new_seg[-1][0] == "一":
260
+ new_seg[-1][0] = new_seg[-1][0] + word
261
+ else:
262
+ new_seg.append([word, pos])
263
+ return new_seg
264
+
265
+ # the first and the second words are all_tone_three
266
+ def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
267
+ new_seg = []
268
+ sub_finals_list = [
269
+ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
270
+ ]
271
+ assert len(sub_finals_list) == len(seg)
272
+ merge_last = [False] * len(seg)
273
+ for i, (word, pos) in enumerate(seg):
274
+ if (
275
+ i - 1 >= 0
276
+ and self._all_tone_three(sub_finals_list[i - 1])
277
+ and self._all_tone_three(sub_finals_list[i])
278
+ and not merge_last[i - 1]
279
+ ):
280
+ # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
281
+ if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
282
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
283
+ merge_last[i] = True
284
+ else:
285
+ new_seg.append([word, pos])
286
+ else:
287
+ new_seg.append([word, pos])
288
+
289
+ return new_seg
290
+
291
+ @staticmethod
292
+ def _is_reduplication(word: str) -> bool:
293
+ return len(word) == 2 and word[0] == word[1]
294
+
295
+ # the last char of first word and the first char of second word is tone_three
296
+ def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
297
+ new_seg = []
298
+ sub_finals_list = [
299
+ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
300
+ ]
301
+ assert len(sub_finals_list) == len(seg)
302
+ merge_last = [False] * len(seg)
303
+ for i, (word, pos) in enumerate(seg):
304
+ if (
305
+ i - 1 >= 0
306
+ and sub_finals_list[i - 1][-1][-1] == "3"
307
+ and sub_finals_list[i][0][-1] == "3"
308
+ and not merge_last[i - 1]
309
+ ):
310
+ # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
311
+ if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
312
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
313
+ merge_last[i] = True
314
+ else:
315
+ new_seg.append([word, pos])
316
+ else:
317
+ new_seg.append([word, pos])
318
+ return new_seg
319
+
320
+ @staticmethod
321
+ def _merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
322
+ new_seg = []
323
+ for i, (word, pos) in enumerate(seg):
324
+ if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
325
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
326
+ else:
327
+ new_seg.append([word, pos])
328
+ return new_seg
329
+
330
+ @staticmethod
331
+ def _merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
332
+ new_seg = []
333
+ for i, (word, pos) in enumerate(seg):
334
+ if new_seg and word == new_seg[-1][0]:
335
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
336
+ else:
337
+ new_seg.append([word, pos])
338
+ return new_seg
339
+
340
+ def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
341
+ seg = self._merge_bu(seg)
342
+ seg = self._merge_yi(seg)
343
+ seg = self._merge_reduplication(seg)
344
+ seg = self._merge_continuous_three_tones(seg)
345
+ seg = self._merge_continuous_three_tones_2(seg)
346
+ seg = self._merge_er(seg)
347
+ return seg
348
+
349
+ def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
350
+ finals = self._bu_sandhi(word, finals)
351
+ finals = self._yi_sandhi(word, finals)
352
+ finals = self._neural_sandhi(word, pos, finals)
353
+ finals = self._three_sandhi(word, finals)
354
+ return finals
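As a usage illustration of the ToneSandhi class above, a hedged sketch: the call order mirrors how PaddleSpeech-style frontends typically drive it, the import path assumes this repository's layout, and jieba_fast is assumed to mirror jieba's posseg API.

    # Hedged usage sketch: pre-merge the jieba segmentation, then apply the
    # 不 / 一 / neutral-tone / third-tone sandhi rules word by word.
    import jieba_fast.posseg as psg
    from pypinyin import lazy_pinyin, Style

    from genie_tts.G2P.Chinese.ToneSandhi import ToneSandhi

    sandhi = ToneSandhi()
    pairs = [(p.word, p.flag) for p in psg.lcut("我想看一看蒙古包")]
    pairs = sandhi.pre_merge_for_modify(pairs)  # merges 不 / 一 / reduplication / 儿 patterns

    for word, pos in pairs:
        finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        finals = sandhi.modified_tone(word, pos, finals)
        print(word, finals)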
genie_tts/G2P/Chinese/__pycache__/ChineseG2P.cpython-311.pyc ADDED
Binary file (11.9 kB).
 
genie_tts/G2P/Chinese/__pycache__/CorrectPronunciation.cpython-311.pyc ADDED
Binary file (2.99 kB).
 
genie_tts/G2P/Chinese/__pycache__/Erhua.cpython-311.pyc ADDED
Binary file (2.88 kB).
 
genie_tts/G2P/Chinese/__pycache__/ToneSandhi.cpython-311.pyc ADDED
Binary file (23.8 kB).
 
genie_tts/G2P/Chinese/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes).
 
genie_tts/G2P/English/EnglishG2P.py CHANGED
@@ -1,296 +1,296 @@
1
- import pickle
2
- import os
3
- import re
4
- from typing import List, Dict, Tuple
5
-
6
- import numpy as np
7
- import nltk
8
- from nltk.tokenize import TweetTokenizer
9
- from nltk import pos_tag
10
-
11
- from .Normalization import normalize
12
- from .WordSegment import segment_text
13
- from ..SymbolsV2 import symbols_v2, symbol_to_id_v2
14
- from ..SymbolsV2 import PUNCTUATION
15
- from ...Core.Resources import English_G2P_DIR
16
-
17
- # nltk 路径和分词器初始化
18
- nltk.data.path.append(English_G2P_DIR)
19
- word_tokenize = TweetTokenizer().tokenize
20
-
21
- # 路径定义
22
- CMU_DICT_PATH = os.path.join(English_G2P_DIR, "cmudict.rep")
23
- CMU_DICT_FAST_PATH = os.path.join(English_G2P_DIR, "cmudict-fast.rep")
24
- CMU_DICT_HOT_PATH = os.path.join(English_G2P_DIR, "engdict-hot.rep")
25
- CACHE_PATH = os.path.join(English_G2P_DIR, "engdict_cache.pickle")
26
- NAMECACHE_PATH = os.path.join(English_G2P_DIR, "namedict_cache.pickle")
27
- MODEL_PATH = os.path.join(English_G2P_DIR, "checkpoint20.npz")
28
-
29
- # 正则表达式和映射
30
- REP_MAP = {
31
- "[;::,;]": ",",
32
- '["’]': "'",
33
- "。": ".",
34
- "!": "!",
35
- "?": "?",
36
- }
37
- REP_MAP_PATTERN = re.compile("|".join(re.escape(p) for p in REP_MAP.keys()))
38
- PUNCTUATIONS_FOR_REGEX = "".join(re.escape(p) for p in PUNCTUATION)
39
- CONSECUTIVE_PUNCTUATION_PATTERN = re.compile(rf"([{PUNCTUATIONS_FOR_REGEX}\s])([{PUNCTUATIONS_FOR_REGEX}])+")
40
-
41
-
42
- # 辅助函数
43
- def _read_cmu_dict(file_path: str) -> Dict[str, List[str]]:
44
- g2p_dict = {}
45
- with open(file_path, 'r', encoding='utf-8') as f:
46
- for line in f:
47
- line = line.strip()
48
- if not line or line.startswith(';;;'): continue
49
- parts = re.split(r'\s+', line, maxsplit=1)
50
- if len(parts) < 2: continue
51
- word, pron_str = parts[0].lower(), parts[1]
52
- pron = pron_str.split(" ")
53
- word = re.sub(r'\(\d+\)$', '', word)
54
- if word not in g2p_dict: g2p_dict[word] = [pron]
55
- return g2p_dict
56
-
57
-
58
- def _load_and_cache_dict() -> Dict[str, List[List[str]]]:
59
- with open(CACHE_PATH, "rb") as f:
60
- g2p_dict = pickle.load(f)
61
- hot_dict = _read_cmu_dict(CMU_DICT_HOT_PATH)
62
- if hot_dict: g2p_dict.update(hot_dict)
63
- return g2p_dict
64
-
65
-
66
- def replace_phs(phs: List[str]) -> List[str]:
67
- rep_map = {"'": "-"}
68
- phs_new = []
69
- for ph in phs:
70
- if ph in symbols_v2:
71
- phs_new.append(ph)
72
- elif ph in rep_map:
73
- phs_new.append(rep_map[ph])
74
- return phs_new
75
-
76
-
77
- def replace_consecutive_punctuation(text: str) -> str:
78
- return CONSECUTIVE_PUNCTUATION_PATTERN.sub(r"\1", text)
79
-
80
-
81
- def text_normalize(text: str) -> str:
82
- text = REP_MAP_PATTERN.sub(lambda x: REP_MAP[x.group()], text)
83
- text = normalize(text)
84
- text = replace_consecutive_punctuation(text)
85
- return text
86
-
87
-
88
- class CleanG2p:
89
- """
90
- 一个集成了神经网络预测功能的、独立的英文G2P转换器。
91
- - 不再依赖 g2p_en 库,将模型推理逻辑直接内置。
92
- - 依赖 numpy 库进行计算。
93
- """
94
-
95
- def __init__(self):
96
- # 1. 初始化标准组件
97
- self.cmu = _load_and_cache_dict()
98
- self.namedict = self._load_name_dict()
99
- for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
100
- self.cmu.pop(word.lower(), None)
101
- self._setup_homographs()
102
-
103
- # 2. 初始化神经网络模型组件
104
- self._setup_nn_components()
105
- self._load_nn_model()
106
-
107
- def _setup_nn_components(self):
108
- """设置 G2P 神经网络所需的字母和音素表。"""
109
- self.graphemes = ["<pad>", "<unk>", "</s>"] + list("abcdefghijklmnopqrstuvwxyz")
110
- self.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1',
111
- 'AH2', 'AO0',
112
- 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2',
113
- 'B', 'CH', 'D', 'DH',
114
- 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
115
- 'EY2', 'F', 'G', 'HH',
116
- 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
117
- 'M', 'N', 'NG', 'OW0', 'OW1',
118
- 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
119
- 'UH0', 'UH1', 'UH2', 'UW',
120
- 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
121
- self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)}
122
- self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)}
123
- self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)}
124
- self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)}
125
-
126
- def _load_nn_model(self):
127
- """从 .npz 文件加载预训练的神经网络权重。"""
128
- if not os.path.exists(MODEL_PATH):
129
- raise FileNotFoundError(f"G2P model file not found at: {MODEL_PATH}. "
130
- f"Please ensure 'checkpoint20.npz' is in the correct directory.")
131
-
132
- variables = np.load(MODEL_PATH)
133
- self.enc_emb = variables["enc_emb"]
134
- self.enc_w_ih = variables["enc_w_ih"]
135
- self.enc_w_hh = variables["enc_w_hh"]
136
- self.enc_b_ih = variables["enc_b_ih"]
137
- self.enc_b_hh = variables["enc_b_hh"]
138
- self.dec_emb = variables["dec_emb"]
139
- self.dec_w_ih = variables["dec_w_ih"]
140
- self.dec_w_hh = variables["dec_w_hh"]
141
- self.dec_b_ih = variables["dec_b_ih"]
142
- self.dec_b_hh = variables["dec_b_hh"]
143
- self.fc_w = variables["fc_w"]
144
- self.fc_b = variables["fc_b"]
145
- # logger.info("G2P neural network model loaded successfully.")
146
-
147
- @staticmethod
148
- def _sigmoid(x):
149
- return 1 / (1 + np.exp(-x))
150
-
151
- def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
152
- rzn_ih = np.matmul(x, w_ih.T) + b_ih
153
- rzn_hh = np.matmul(h, w_hh.T) + b_hh
154
-         rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:]
-         rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:]
-         rz = self._sigmoid(rz_ih + rz_hh)
-         r, z = np.split(rz, 2, -1)
-         n = np.tanh(n_ih + r * n_hh)
-         h = (1 - z) * n + z * h
-         return h
-
-     def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None):
-         if h0 is None:
-             h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
-         h = h0
-         outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
-         for t in range(steps):
-             h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh)
-             outputs[:, t, ::] = h
-         return outputs
-
-     def _encode(self, word: str) -> np.ndarray:
-         chars = list(word.lower()) + ["</s>"]
-         x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
-         x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
-         return x
-
-     def predict(self, word: str) -> List[str]:
-         """Predict a word's pronunciation with the built-in neural network model."""
-         # Encoder
-         enc = self._encode(word)
-         enc = self._gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
-                         self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
-         last_hidden = enc[:, -1, :]
-
-         # Decoder
-         dec = np.take(self.dec_emb, [self.p2idx["<s>"]], axis=0)  # Start with <s>
-         h = last_hidden
-         preds = []
-         for _ in range(20):  # Max steps
-             h = self._grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh)
-             logits = np.matmul(h, self.fc_w.T) + self.fc_b
-             pred_idx = logits.argmax()
-             if pred_idx == self.p2idx["</s>"]: break
-             preds.append(pred_idx)
-             dec = np.take(self.dec_emb, [pred_idx], axis=0)
-
-         return [self.idx2p.get(idx, "<unk>") for idx in preds]
-
-     # --- Standard G2P logic ---
-
-     @staticmethod
-     def _load_name_dict() -> Dict[str, List[List[str]]]:
-         if os.path.exists(NAMECACHE_PATH):
-             with open(NAMECACHE_PATH, "rb") as f: return pickle.load(f)
-         return {}
-
-     def _setup_homographs(self):
-         self.homograph2features: Dict[str, Tuple[List[str], List[str], str]] = {
-             "read": (["R", "EH1", "D"], ["R", "IY1", "D"], "VBD"),
-             "complex": (["K", "AH0", "M", "P", "L", "EH1", "K", "S"], ["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
-                         "JJ"),
-             "lead": (["L", "IY1", "D"], ["L", "EH1", "D"], "NN"),
-             "presents": (["P", "R", "IY0", "Z", "EH1", "N", "T", "S"], ["P", "R", "EH1", "Z", "AH0", "N", "T", "S"],
-                          "VBZ"),
-         }
-
-     def __call__(self, text: str) -> List[str]:
-         original_words = word_tokenize(text)
-         normalized_text = text_normalize(text)
-         normalized_words = word_tokenize(normalized_text)
-
-         corrected_words = []
-         original_idx, normalized_idx = 0, 0
-         while original_idx < len(original_words) and normalized_idx < len(normalized_words):
-             if original_words[original_idx] == "I" and \
-                     " ".join(normalized_words[normalized_idx:normalized_idx + 2]) == "the first":
-                 corrected_words.append("I")
-                 original_idx += 1
-                 normalized_idx += 2
-             else:
-                 corrected_words.append(normalized_words[normalized_idx])
-                 original_idx += 1
-                 normalized_idx += 1
-         if normalized_idx < len(normalized_words):
-             corrected_words.extend(normalized_words[normalized_idx:])
-
-         if not corrected_words: return []
-
-         tokens = pos_tag(corrected_words)
-         prons = []
-         for o_word, pos in tokens:
-             word = o_word.lower()
-             if re.search("[a-z]", word) is None:
-                 pron = [word]
-             elif word in self.homograph2features:
-                 pron1, pron2, pos1 = self.homograph2features[word]
-                 pron = pron1 if pos.startswith(pos1) else pron2
-             else:
-                 pron = self._query_word(o_word)
-             prons.extend(pron)
-             prons.extend([" "])
-         return prons[:-1] if prons else []
-
-     def _query_word(self, o_word: str) -> List[str]:
-         word = o_word.lower()
-         if word in self.cmu:
-             if o_word == "A": return ["AH0"]
-             return self.cmu[word][0]
-         if o_word.istitle() and word in self.namedict:
-             return self.namedict[word][0]
-         if word.endswith("'s") and len(word) > 2:
-             base_pron = self._query_word(word[:-2])
-             if base_pron:
-                 last_ph = base_pron[-1]
-                 if last_ph in {"S", "Z", "SH", "ZH", "CH", "JH"}: return base_pron + ["AH0", "Z"]
-                 if last_ph in {"P", "T", "K", "F", "TH"}: return base_pron + ["S"]
-                 return base_pron + ["Z"]
-         if "-" in word and len(word) > 1:
-             parts = [p for p in word.split("-") if p]
-             if len(parts) > 1:
-                 result = [ph for part in parts for ph in self._query_word(part)]
-                 if result: return result
-         segments = segment_text(word)
-         if len(segments) > 1 and "".join(segments) == word:
-             result = [ph for segment in segments for ph in self._query_word(segment)]
-             if result: return result
-
-         return self.predict(o_word)
-
-
- _g2p_instance: CleanG2p = CleanG2p()
-
-
- def g2p(text: str) -> List[str]:
-     if _g2p_instance is None: raise RuntimeError("G2P model is not available.")
-     raw_phonemes = _g2p_instance(text)
-     undesired = {" ", "<pad>", "UW", "</s>", "<s>"}
-     phones = ["UNK" if ph == "<unk>" else ph for ph in raw_phonemes if ph not in undesired]
-     return replace_phs(phones)
-
-
- def english_to_phones(text: str) -> List[int]:
-     phones = g2p(text)
-     phones = [symbol_to_id_v2[ph] for ph in phones]
-     return phones
 
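Note on the GRU helpers above (unchanged by this commit): `_grucell` packs the reset and update gates into the first two-thirds of the fused input/hidden projections and the candidate state into the last third, then blends old and new hidden state as h = (1 - z) * n + z * h, and `_gru` simply unrolls that cell over the input steps. A minimal, self-contained NumPy sketch of the same recurrence, with tiny random weights chosen purely for illustration (the real shapes come from checkpoint20.npz, not from the values below):

import numpy as np

rng = np.random.default_rng(0)
input_dim, hidden = 4, 3  # illustrative sizes only, not the checkpoint's dimensions
w_ih = rng.standard_normal((3 * hidden, input_dim)).astype(np.float32)
w_hh = rng.standard_normal((3 * hidden, hidden)).astype(np.float32)
b_ih = np.zeros(3 * hidden, np.float32)
b_hh = np.zeros(3 * hidden, np.float32)

def gru_cell(x, h):
    # Same gate layout as _grucell: [reset | update] take 2/3 of the projection, candidate takes 1/3.
    rzn_ih = x @ w_ih.T + b_ih
    rzn_hh = h @ w_hh.T + b_hh
    rz_ih, n_ih = rzn_ih[:, :2 * hidden], rzn_ih[:, 2 * hidden:]
    rz_hh, n_hh = rzn_hh[:, :2 * hidden], rzn_hh[:, 2 * hidden:]
    r, z = np.split(1.0 / (1.0 + np.exp(-(rz_ih + rz_hh))), 2, -1)
    n = np.tanh(n_ih + r * n_hh)
    return (1 - z) * n + z * h

x = rng.standard_normal((1, input_dim)).astype(np.float32)
h = np.zeros((1, hidden), np.float32)
print(gru_cell(x, h).shape)  # -> (1, 3)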
+ import pickle
+ import os
+ import re
+ from typing import List, Dict, Tuple
+
+ import numpy as np
+ import nltk
+ from nltk.tokenize import TweetTokenizer
+ from nltk import pos_tag
+
+ from .Normalization import normalize
+ from .WordSegment import segment_text
+ from ..SymbolsV2 import symbols_v2, symbol_to_id_v2
+ from ..SymbolsV2 import PUNCTUATION
+ from ...Core.Resources import English_G2P_DIR
+
+ # nltk data path and tokenizer initialization
+ nltk.data.path.append(English_G2P_DIR)
+ word_tokenize = TweetTokenizer().tokenize
+
+ # Path definitions
+ CMU_DICT_PATH = os.path.join(English_G2P_DIR, "cmudict.rep")
+ CMU_DICT_FAST_PATH = os.path.join(English_G2P_DIR, "cmudict-fast.rep")
+ CMU_DICT_HOT_PATH = os.path.join(English_G2P_DIR, "engdict-hot.rep")
+ CACHE_PATH = os.path.join(English_G2P_DIR, "engdict_cache.pickle")
+ NAMECACHE_PATH = os.path.join(English_G2P_DIR, "namedict_cache.pickle")
+ MODEL_PATH = os.path.join(English_G2P_DIR, "checkpoint20.npz")
+
+ # Regular expressions and mappings
+ REP_MAP = {
+     "[;::,;]": ",",
+     '["’]': "'",
+     "。": ".",
+     "!": "!",
+     "?": "?",
+ }
+ REP_MAP_PATTERN = re.compile("|".join(re.escape(p) for p in REP_MAP.keys()))
+ PUNCTUATIONS_FOR_REGEX = "".join(re.escape(p) for p in PUNCTUATION)
+ CONSECUTIVE_PUNCTUATION_PATTERN = re.compile(rf"([{PUNCTUATIONS_FOR_REGEX}\s])([{PUNCTUATIONS_FOR_REGEX}])+")
+
+
+ # Helper functions
+ def _read_cmu_dict(file_path: str) -> Dict[str, List[str]]:
+     g2p_dict = {}
+     with open(file_path, 'r', encoding='utf-8') as f:
+         for line in f:
+             line = line.strip()
+             if not line or line.startswith(';;;'): continue
+             parts = re.split(r'\s+', line, maxsplit=1)
+             if len(parts) < 2: continue
+             word, pron_str = parts[0].lower(), parts[1]
+             pron = pron_str.split(" ")
+             word = re.sub(r'\(\d+\)$', '', word)
+             if word not in g2p_dict: g2p_dict[word] = [pron]
+     return g2p_dict
+
+
+ def _load_and_cache_dict() -> Dict[str, List[List[str]]]:
+     with open(CACHE_PATH, "rb") as f:
+         g2p_dict = pickle.load(f)
+     hot_dict = _read_cmu_dict(CMU_DICT_HOT_PATH)
+     if hot_dict: g2p_dict.update(hot_dict)
+     return g2p_dict
+
+
+ def replace_phs(phs: List[str]) -> List[str]:
+     rep_map = {"'": "-"}
+     phs_new = []
+     for ph in phs:
+         if ph in symbols_v2:
+             phs_new.append(ph)
+         elif ph in rep_map:
+             phs_new.append(rep_map[ph])
+     return phs_new
+
+
+ def replace_consecutive_punctuation(text: str) -> str:
+     return CONSECUTIVE_PUNCTUATION_PATTERN.sub(r"\1", text)
+
+
+ def text_normalize(text: str) -> str:
+     text = REP_MAP_PATTERN.sub(lambda x: REP_MAP[x.group()], text)
+     text = normalize(text)
+     text = replace_consecutive_punctuation(text)
+     return text
+
+
+ class CleanG2p:
+     """
+     A standalone English G2P converter with built-in neural-network prediction.
+     - No longer depends on the g2p_en library; the model inference logic is embedded directly.
+     - Uses numpy for the computation.
+     """
+
+     def __init__(self):
+         # 1. Initialize the standard components
+         self.cmu = _load_and_cache_dict()
+         self.namedict = self._load_name_dict()
+         for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
+             self.cmu.pop(word.lower(), None)
+         self._setup_homographs()
+
+         # 2. Initialize the neural network model components
+         self._setup_nn_components()
+         self._load_nn_model()
+
+     def _setup_nn_components(self):
+         """Set up the grapheme and phoneme inventories used by the G2P neural network."""
+         self.graphemes = ["<pad>", "<unk>", "</s>"] + list("abcdefghijklmnopqrstuvwxyz")
+         self.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1',
+                                                              'AH2', 'AO0',
+                                                              'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2',
+                                                              'B', 'CH', 'D', 'DH',
+                                                              'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                                              'EY2', 'F', 'G', 'HH',
+                                                              'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
+                                                              'M', 'N', 'NG', 'OW0', 'OW1',
+                                                              'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
+                                                              'UH0', 'UH1', 'UH2', 'UW',
+                                                              'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
+         self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)}
+         self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)}
+         self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)}
+         self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)}
+
+     def _load_nn_model(self):
+         """Load the pre-trained neural network weights from the .npz file."""
+         if not os.path.exists(MODEL_PATH):
+             raise FileNotFoundError(f"G2P model file not found at: {MODEL_PATH}. "
+                                     f"Please ensure 'checkpoint20.npz' is in the correct directory.")
+
+         variables = np.load(MODEL_PATH)
+         self.enc_emb = variables["enc_emb"]
+         self.enc_w_ih = variables["enc_w_ih"]
+         self.enc_w_hh = variables["enc_w_hh"]
+         self.enc_b_ih = variables["enc_b_ih"]
+         self.enc_b_hh = variables["enc_b_hh"]
+         self.dec_emb = variables["dec_emb"]
+         self.dec_w_ih = variables["dec_w_ih"]
+         self.dec_w_hh = variables["dec_w_hh"]
+         self.dec_b_ih = variables["dec_b_ih"]
+         self.dec_b_hh = variables["dec_b_hh"]
+         self.fc_w = variables["fc_w"]
+         self.fc_b = variables["fc_b"]
+         # logger.info("G2P neural network model loaded successfully.")
+
+     @staticmethod
+     def _sigmoid(x):
+         return 1 / (1 + np.exp(-x))
+
+     def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
+         rzn_ih = np.matmul(x, w_ih.T) + b_ih
+         rzn_hh = np.matmul(h, w_hh.T) + b_hh
+         rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:]
+         rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:]
+         rz = self._sigmoid(rz_ih + rz_hh)
+         r, z = np.split(rz, 2, -1)
+         n = np.tanh(n_ih + r * n_hh)
+         h = (1 - z) * n + z * h
+         return h
+
+     def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None):
+         if h0 is None:
+             h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
+         h = h0
+         outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
+         for t in range(steps):
+             h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh)
+             outputs[:, t, ::] = h
+         return outputs
+
+     def _encode(self, word: str) -> np.ndarray:
+         chars = list(word.lower()) + ["</s>"]
+         x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
+         x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
+         return x
+
+     def predict(self, word: str) -> List[str]:
+         """Predict a word's pronunciation with the built-in neural network model."""
+         # Encoder
+         enc = self._encode(word)
+         enc = self._gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
+                         self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
+         last_hidden = enc[:, -1, :]
+
+         # Decoder
+         dec = np.take(self.dec_emb, [self.p2idx["<s>"]], axis=0)  # Start with <s>
+         h = last_hidden
+         preds = []
+         for _ in range(20):  # Max steps
+             h = self._grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh)
+             logits = np.matmul(h, self.fc_w.T) + self.fc_b
+             pred_idx = logits.argmax()
+             if pred_idx == self.p2idx["</s>"]: break
+             preds.append(pred_idx)
+             dec = np.take(self.dec_emb, [pred_idx], axis=0)
+
+         return [self.idx2p.get(idx, "<unk>") for idx in preds]
+
+     # --- Standard G2P logic ---
+
+     @staticmethod
+     def _load_name_dict() -> Dict[str, List[List[str]]]:
+         if os.path.exists(NAMECACHE_PATH):
+             with open(NAMECACHE_PATH, "rb") as f: return pickle.load(f)
+         return {}
+
+     def _setup_homographs(self):
+         self.homograph2features: Dict[str, Tuple[List[str], List[str], str]] = {
+             "read": (["R", "EH1", "D"], ["R", "IY1", "D"], "VBD"),
+             "complex": (["K", "AH0", "M", "P", "L", "EH1", "K", "S"], ["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
+                         "JJ"),
+             "lead": (["L", "IY1", "D"], ["L", "EH1", "D"], "NN"),
+             "presents": (["P", "R", "IY0", "Z", "EH1", "N", "T", "S"], ["P", "R", "EH1", "Z", "AH0", "N", "T", "S"],
+                          "VBZ"),
+         }
+
+     def __call__(self, text: str) -> List[str]:
+         original_words = word_tokenize(text)
+         normalized_text = text_normalize(text)
+         normalized_words = word_tokenize(normalized_text)
+
+         corrected_words = []
+         original_idx, normalized_idx = 0, 0
+         while original_idx < len(original_words) and normalized_idx < len(normalized_words):
+             if original_words[original_idx] == "I" and \
+                     " ".join(normalized_words[normalized_idx:normalized_idx + 2]) == "the first":
+                 corrected_words.append("I")
+                 original_idx += 1
+                 normalized_idx += 2
+             else:
+                 corrected_words.append(normalized_words[normalized_idx])
+                 original_idx += 1
+                 normalized_idx += 1
+         if normalized_idx < len(normalized_words):
+             corrected_words.extend(normalized_words[normalized_idx:])
+
+         if not corrected_words: return []
+
+         tokens = pos_tag(corrected_words)
+         prons = []
+         for o_word, pos in tokens:
+             word = o_word.lower()
+             if re.search("[a-z]", word) is None:
+                 pron = [word]
+             elif word in self.homograph2features:
+                 pron1, pron2, pos1 = self.homograph2features[word]
+                 pron = pron1 if pos.startswith(pos1) else pron2
+             else:
+                 pron = self._query_word(o_word)
+             prons.extend(pron)
+             prons.extend([" "])
+         return prons[:-1] if prons else []
+
+     def _query_word(self, o_word: str) -> List[str]:
+         word = o_word.lower()
+         if word in self.cmu:
+             if o_word == "A": return ["AH0"]
+             return self.cmu[word][0]
+         if o_word.istitle() and word in self.namedict:
+             return self.namedict[word][0]
+         if word.endswith("'s") and len(word) > 2:
+             base_pron = self._query_word(word[:-2])
+             if base_pron:
+                 last_ph = base_pron[-1]
+                 if last_ph in {"S", "Z", "SH", "ZH", "CH", "JH"}: return base_pron + ["AH0", "Z"]
+                 if last_ph in {"P", "T", "K", "F", "TH"}: return base_pron + ["S"]
+                 return base_pron + ["Z"]
+         if "-" in word and len(word) > 1:
+             parts = [p for p in word.split("-") if p]
+             if len(parts) > 1:
+                 result = [ph for part in parts for ph in self._query_word(part)]
+                 if result: return result
+         segments = segment_text(word)
+         if len(segments) > 1 and "".join(segments) == word:
+             result = [ph for segment in segments for ph in self._query_word(segment)]
+             if result: return result
+
+         return self.predict(o_word)
+
+
+ _g2p_instance: CleanG2p = CleanG2p()
+
+
+ def g2p(text: str) -> List[str]:
+     if _g2p_instance is None: raise RuntimeError("G2P model is not available.")
+     raw_phonemes = _g2p_instance(text)
+     undesired = {" ", "<pad>", "UW", "</s>", "<s>"}
+     phones = ["UNK" if ph == "<unk>" else ph for ph in raw_phonemes if ph not in undesired]
+     return replace_phs(phones)
+
+
+ def english_to_phones(text: str) -> List[int]:
+     phones = g2p(text)
+     phones = [symbol_to_id_v2[ph] for ph in phones]
+     return phones
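For completeness, the module's public entry points after this change are `g2p` (text to phoneme symbols) and `english_to_phones` (text to integer symbol IDs via `symbol_to_id_v2`). A minimal usage sketch, assuming the package resources (engdict_cache.pickle, checkpoint20.npz, and the nltk data under English_G2P_DIR) are in place; the import path is a guess, since the diff does not show this module's file name:

# Hypothetical module path - adjust to wherever this file lives under genie_tts/G2P.
from genie_tts.G2P.English.EnglishG2P import g2p, english_to_phones

print(g2p("Hello world, this is a test."))  # e.g. ['HH', 'AH0', 'L', 'OW1', ...]
print(english_to_phones("Hello world"))     # the same symbols mapped through symbol_to_id_v2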