diff --git a/app.py b/app.py index 89dd5ec71dd48b5a523878dbe6e2bc930faf5cfc..3df291c341af0f20a19ec4da48f6ea37b2f4545a 100644 --- a/app.py +++ b/app.py @@ -66,6 +66,30 @@ async def load_model(character_name: str = Form(...), model_path: str = Form(... try: print(f"📦 Loading character: {character_name} from {full_path}") genie_tts.load_character(character_name, full_path, language) + + # 自动探测参考音频配置 + prompt_json_path = os.path.join(full_path, "prompt_wav.json") + ref_wav_path = os.path.join(full_path, "ref.wav") + + if os.path.exists(prompt_json_path): + import json + with open(prompt_json_path, "r", encoding="utf-8") as f: + data = json.load(f) + config = data.get("default", {}) + REF_CACHE[character_name] = { + "path": os.path.join(full_path, config.get("wav_path", "ref.wav")), + "text": config.get("prompt_text", ""), + "lang": config.get("prompt_lang", language) + } + print(f"📖 Loaded ref info from JSON for {character_name}") + elif os.path.exists(ref_wav_path): + REF_CACHE[character_name] = { + "path": ref_wav_path, + "text": "", + "lang": language + } + print(f"🎵 Found ref.wav for {character_name}") + return {"status": "success", "message": f"Character '{character_name}' loaded."} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -76,12 +100,21 @@ async def upload_and_tts( prompt_text: str = Form(...), text: str = Form(...), language: str = Form("zh"), + text_lang: str = Form(None), file: UploadFile = File(...) ): """ 上传临时参考音频并生成语音 """ try: + # 🟢 确保模型已加载 + if not genie_tts.model_manager.get(character_name): + print(f"⚠️ Character {character_name} not loaded, trying to load...") + char_path = os.path.join(MODELS_ROOT, character_name.lower()) + if not os.path.exists(char_path): + char_path = os.path.join(MODELS_ROOT, "mzm") # 兜底逻辑 + genie_tts.load_character(character_name, char_path, language) + ts = int(time.time() * 1000) save_path = f"/tmp/ref_{ts}.wav" os.makedirs("/tmp", exist_ok=True) @@ -89,23 +122,37 @@ async def upload_and_tts( with open(save_path, "wb") as buffer: shutil.copyfileobj(file.file, buffer) - print(f"🔥 [Custom] Using temp audio for {character_name}: {save_path}") + print(f"🔥 [Custom] Using temp audio: {save_path}") genie_tts.set_reference_audio(character_name, save_path, prompt_text, language) out_path = f"/tmp/out_{ts}.wav" - genie_tts.tts(character_name, text, save_path=out_path, play=False) + # 🟢 执行 TTS + genie_tts.tts(character_name, text, save_path=out_path, play=False, text_language=text_lang) + # 🟢 关键:强制等待文件出现(最多等5秒) + wait_time = 0 + while not os.path.exists(out_path) and wait_time < 50: + time.sleep(0.1) + wait_time += 1 + + if not os.path.exists(out_path): + raise HTTPException(status_code=500, detail="Audio file generation timed out or failed.") + def iterfile(): - with open(out_path, "rb") as f: - yield from f - try: - os.remove(save_path) - os.remove(out_path) - except: pass + try: + with open(out_path, "rb") as f: + yield from f + finally: + # 给一点延迟确保读取完毕后再删除 + time.sleep(1) + try: + if os.path.exists(save_path): os.remove(save_path) + if os.path.exists(out_path): os.remove(out_path) + except: pass return StreamingResponse(iterfile(), media_type="audio/wav") except Exception as e: - print(f"❌ Error in upload/tts: {e}") + print(f"❌ Error in upload/tts: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) @app.post("/tts") @@ -138,6 +185,12 @@ async def dynamic_tts( out_path = f"/tmp/out_dyn_{int(time.time())}.wav" genie_tts.tts(character_name, text, save_path=out_path, play=False, text_language=text_lang) + # 🟢 
同样增加文件等待 + wait_time = 0 + while not os.path.exists(out_path) and wait_time < 50: + time.sleep(0.1) + wait_time += 1 + return StreamingResponse(open(out_path, "rb"), media_type="audio/wav") except Exception as e: print(f"❌ Error: {e}") diff --git a/genie_tts/Audio/Audio.py b/genie_tts/Audio/Audio.py index 2c519b4da2c38bb9d88f2dfca96c6000ddb95a8a..3819c96ebc94e5d29bee980ff3e9b3c8bd93dc33 100644 --- a/genie_tts/Audio/Audio.py +++ b/genie_tts/Audio/Audio.py @@ -1,51 +1,51 @@ -import os -import soundfile as sf -import soxr -import numpy as np -import logging -from typing import Optional - -logger = logging.getLogger(__name__) - -# 音频时长建议范围 (秒) -MIN_DURATION_S = 3 -MAX_DURATION_S = 10 -# 在音频末尾追加的静音时长 (秒) -SILENCE_TO_APPEND_S = 0.3 -# 模型期望的目标采样率 -TARGET_SAMPLING_RATE = 16000 - - -def load_audio( - audio_path: str, - target_sampling_rate: int = TARGET_SAMPLING_RATE -) -> Optional[np.ndarray]: - try: - wav, original_sr = sf.read(audio_path, dtype='float32') - if wav.ndim > 1: - wav = np.mean(wav, axis=1) # 多声道转单声道。 - if original_sr != target_sampling_rate: - wav = soxr.resample(wav, original_sr, target_sampling_rate, quality='hq') # 重采样。 - - except Exception as e: - logger.error(f"Failed to load reference audio: {audio_path}. Error: {e}") - return None - - # 检查音频长度是否在建议范围之外 - min_samples = int(MIN_DURATION_S * target_sampling_rate) - max_samples = int(MAX_DURATION_S * target_sampling_rate) - if not (min_samples <= wav.shape[0] <= max_samples): - duration = len(wav) / target_sampling_rate - logger.warning( - f"The reference audio '{os.path.basename(audio_path)}' has a duration of {duration:.2f} seconds, " - f"which is outside the recommended range of {MIN_DURATION_S} to {MAX_DURATION_S} seconds!" - ) - - # 创建并拼接静音 - silence_samples = int(SILENCE_TO_APPEND_S * target_sampling_rate) - silence_array = np.zeros(silence_samples, dtype=np.float32) - wav_processed = np.concatenate([wav, silence_array]) - - # 为模型输入增加批次维度 - # wav_processed = np.expand_dims(wav_processed, axis=0) - return wav_processed +import os +import soundfile as sf +import soxr +import numpy as np +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +# 音频时长建议范围 (秒) +MIN_DURATION_S = 3 +MAX_DURATION_S = 10 +# 在音频末尾追加的静音时长 (秒) +SILENCE_TO_APPEND_S = 0.3 +# 模型期望的目标采样率 +TARGET_SAMPLING_RATE = 16000 + + +def load_audio( + audio_path: str, + target_sampling_rate: int = TARGET_SAMPLING_RATE +) -> Optional[np.ndarray]: + try: + wav, original_sr = sf.read(audio_path, dtype='float32') + if wav.ndim > 1: + wav = np.mean(wav, axis=1) # 多声道转单声道。 + if original_sr != target_sampling_rate: + wav = soxr.resample(wav, original_sr, target_sampling_rate, quality='hq') # 重采样。 + + except Exception as e: + logger.error(f"Failed to load reference audio: {audio_path}. Error: {e}") + return None + + # 检查音频长度是否在建议范围之外 + min_samples = int(MIN_DURATION_S * target_sampling_rate) + max_samples = int(MAX_DURATION_S * target_sampling_rate) + if not (min_samples <= wav.shape[0] <= max_samples): + duration = len(wav) / target_sampling_rate + logger.warning( + f"The reference audio '{os.path.basename(audio_path)}' has a duration of {duration:.2f} seconds, " + f"which is outside the recommended range of {MIN_DURATION_S} to {MAX_DURATION_S} seconds!" 
+ ) + + # 创建并拼接静音 + silence_samples = int(SILENCE_TO_APPEND_S * target_sampling_rate) + silence_array = np.zeros(silence_samples, dtype=np.float32) + wav_processed = np.concatenate([wav, silence_array]) + + # 为模型输入增加批次维度 + # wav_processed = np.expand_dims(wav_processed, axis=0) + return wav_processed diff --git a/genie_tts/Audio/__pycache__/Audio.cpython-311.pyc b/genie_tts/Audio/__pycache__/Audio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8feec30071c864483b7e21a331801203ceae1aed Binary files /dev/null and b/genie_tts/Audio/__pycache__/Audio.cpython-311.pyc differ diff --git a/genie_tts/Audio/__pycache__/ReferenceAudio.cpython-311.pyc b/genie_tts/Audio/__pycache__/ReferenceAudio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ce41a436ae82831a1927ac8425f66fb9de37b88 Binary files /dev/null and b/genie_tts/Audio/__pycache__/ReferenceAudio.cpython-311.pyc differ diff --git a/genie_tts/Audio/__pycache__/__init__.cpython-311.pyc b/genie_tts/Audio/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..140f93d8fda57a05d42a44ff5abf388d72191846 Binary files /dev/null and b/genie_tts/Audio/__pycache__/__init__.cpython-311.pyc differ diff --git a/genie_tts/Converter/Converter.py b/genie_tts/Converter/Converter.py index 6951e71aedaf4d08338f12c277c0da3e7cf917bf..d0fc7e794de0affd450df31fb64547c2d6b9a743 100644 --- a/genie_tts/Converter/Converter.py +++ b/genie_tts/Converter/Converter.py @@ -1,11 +1,11 @@ -from .v2.Converter import convert as convert_v2 -from .v2ProPlus.Converter import convert as convert_v2pp - -import os - - -def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None: - if os.path.getsize(torch_pth_path) > 150 * 1024 * 1024: # 大于 150 MB - convert_v2pp(torch_ckpt_path, torch_pth_path, output_dir) - else: - convert_v2(torch_ckpt_path, torch_pth_path, output_dir) +from .v2.Converter import convert as convert_v2 +from .v2ProPlus.Converter import convert as convert_v2pp + +import os + + +def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None: + if os.path.getsize(torch_pth_path) > 150 * 1024 * 1024: # 大于 150 MB + convert_v2pp(torch_ckpt_path, torch_pth_path, output_dir) + else: + convert_v2(torch_ckpt_path, torch_pth_path, output_dir) diff --git a/genie_tts/Converter/__pycache__/Converter.cpython-311.pyc b/genie_tts/Converter/__pycache__/Converter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98bd57b82f7dfdade427c904a50f6924c97ffd23 Binary files /dev/null and b/genie_tts/Converter/__pycache__/Converter.cpython-311.pyc differ diff --git a/genie_tts/Converter/__pycache__/__init__.cpython-311.pyc b/genie_tts/Converter/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63a605374d3f1d160e4c3fe59fe575ec3f9d9c08 Binary files /dev/null and b/genie_tts/Converter/__pycache__/__init__.cpython-311.pyc differ diff --git a/genie_tts/Converter/__pycache__/load_state_dict.cpython-311.pyc b/genie_tts/Converter/__pycache__/load_state_dict.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..621e06b8a12e5bebd19c3f53e0a8a9ff8551657c Binary files /dev/null and b/genie_tts/Converter/__pycache__/load_state_dict.cpython-311.pyc differ diff --git a/genie_tts/Converter/__pycache__/utils.cpython-311.pyc b/genie_tts/Converter/__pycache__/utils.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e89f6248c90c29b0b65e54251be25e630ba7057c Binary files /dev/null and b/genie_tts/Converter/__pycache__/utils.cpython-311.pyc differ diff --git a/genie_tts/Converter/load_state_dict.py b/genie_tts/Converter/load_state_dict.py index 952d3580c094ca9b4c74b7ac94633b7817a3cc81..ceff2c0183fdcfaa1685b3202c41f2d57f4e3262 100644 --- a/genie_tts/Converter/load_state_dict.py +++ b/genie_tts/Converter/load_state_dict.py @@ -1,26 +1,26 @@ -import sys -import os - -sys.path.append(os.path.dirname(__file__)) - -import torch -from io import BytesIO -import utils - - -def load_sovits_model(pth_path: str, device: str = 'cpu'): - f = open(pth_path, "rb") - meta = f.read(2) - if meta != b"PK": - # noinspection PyTypeChecker - data = b"PK" + f.read() - bio = BytesIO() - # noinspection PyTypeChecker - bio.write(data) - bio.seek(0) - return torch.load(bio, map_location=device, weights_only=False) - return torch.load(pth_path, map_location=device, weights_only=False) - - -def load_gpt_model(ckpt_path: str, device: str = 'cpu'): - return torch.load(ckpt_path, map_location=device, weights_only=True) +import sys +import os + +sys.path.append(os.path.dirname(__file__)) + +import torch +from io import BytesIO +import utils + + +def load_sovits_model(pth_path: str, device: str = 'cpu'): + f = open(pth_path, "rb") + meta = f.read(2) + if meta != b"PK": + # noinspection PyTypeChecker + data = b"PK" + f.read() + bio = BytesIO() + # noinspection PyTypeChecker + bio.write(data) + bio.seek(0) + return torch.load(bio, map_location=device, weights_only=False) + return torch.load(pth_path, map_location=device, weights_only=False) + + +def load_gpt_model(ckpt_path: str, device: str = 'cpu'): + return torch.load(ckpt_path, map_location=device, weights_only=True) diff --git a/genie_tts/Converter/v2/Converter.py b/genie_tts/Converter/v2/Converter.py index 39910b97c9e6cb8aeda2a4d89594dbe1106af0af..301edd060a51c3052a2b05121d9d96da729e5a79 100644 --- a/genie_tts/Converter/v2/Converter.py +++ b/genie_tts/Converter/v2/Converter.py @@ -1,146 +1,146 @@ -from .VITSConverter import VITSConverter -from .T2SConverter import T2SModelConverter -from .EncoderConverter import EncoderConverter -from ...Utils.Constants import PACKAGE_NAME - -import logging -from typing import Optional, Tuple -import re -import os -import shutil -import traceback -import importlib.resources -import contextlib - -logger = logging.getLogger() - -CACHE_DIR = os.path.join(os.getcwd(), "Cache") -ENCODER_RESOURCE_PATH = "Data/v2/Models/t2s_encoder_fp32.onnx" -STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_stage_decoder_fp32.onnx" -FIRST_STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_first_stage_decoder_fp32.onnx" -VITS_RESOURCE_PATH = "Data/v2/Models/vits_fp32.onnx" -T2S_KEYS_RESOURCE_PATH = "Data/v2/Keys/t2s_onnx_keys.txt" -VITS_KEYS_RESOURCE_PATH = "Data/v2/Keys/vits_onnx_keys.txt" - - -def find_ckpt_and_pth(directory: str) -> Tuple[Optional[str], Optional[str]]: - """ - 在 directory(不递归子目录)里查找: - - .ckpt:从所有 .ckpt 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0), - 选择 epoch 最大的那个文件(若无则为 None) - - .pth :从所有 .pth 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0), - 选择 epoch 最大的那个文件(若无则为 None) - 若出现相同 epoch,选修改时间较新的文件以打破平手。 - """ - best_ckpt_path: Optional[str] = None - best_ckpt_epoch: int = -1 - - best_pth_path: Optional[str] = None - best_pth_epoch: int = -1 - - for filename in os.listdir(directory): - full_path = os.path.join(directory, filename) - - if not os.path.isfile(full_path): - continue - - # 提取 epoch - m = re.search(r"e(\d+)", filename, 
flags=re.IGNORECASE) - epoch = int(m.group(1)) if m else 0 - - # .ckpt 文件处理 - if filename.lower().endswith(".ckpt"): - if ( - epoch > best_ckpt_epoch - or ( - epoch == best_ckpt_epoch - and best_ckpt_path is not None - and os.path.getmtime(full_path) > os.path.getmtime(best_ckpt_path) - ) - ): - best_ckpt_epoch = epoch - best_ckpt_path = full_path - - # .pth 文件处理 - elif filename.lower().endswith(".pth"): - if ( - epoch > best_pth_epoch - or ( - epoch == best_pth_epoch - and best_pth_path is not None - and os.path.getmtime(full_path) > os.path.getmtime(best_pth_path) - ) - ): - best_pth_epoch = epoch - best_pth_path = full_path - - return best_ckpt_path, best_pth_path - - -def remove_folder(folder: str) -> None: - try: - if os.path.exists(folder): - shutil.rmtree(folder) - logger.info(f"🧹 Folder cleaned: {folder}") - except Exception as e: - logger.error(f"❌ Failed to clean folder {folder}: {e}") - - -def convert(torch_ckpt_path: str, - torch_pth_path: str, - output_dir: str): - # 确保缓存和输出目录存在 - os.makedirs(CACHE_DIR, exist_ok=True) - os.makedirs(output_dir, exist_ok=True) - - if len(os.listdir(output_dir)) > 0: - logger.warning(f"The output directory {output_dir} is not empty!") - - with contextlib.ExitStack() as stack: - files = importlib.resources.files(PACKAGE_NAME) - - def enter(p): - return stack.enter_context(importlib.resources.as_file(files.joinpath(p))) - - encoder_onnx_path = enter(ENCODER_RESOURCE_PATH) - stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH) - first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH) - vits_onnx_path = enter(VITS_RESOURCE_PATH) - t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH) - vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH) - - converter_1 = T2SModelConverter( - torch_ckpt_path=torch_ckpt_path, - stage_decoder_onnx_path=str(stage_decoder_path), - first_stage_decoder_onnx_path=str(first_stage_decoder_path), - key_list_file=str(t2s_keys_path), - output_dir=output_dir, - cache_dir=CACHE_DIR, - ) - converter_2 = VITSConverter( - torch_pth_path=torch_pth_path, - vits_onnx_path=str(vits_onnx_path), - key_list_file=str(vits_keys_path), - output_dir=output_dir, - cache_dir=CACHE_DIR, - ) - converter_3 = EncoderConverter( - ckpt_path=torch_ckpt_path, - pth_path=torch_pth_path, - onnx_input_path=str(encoder_onnx_path), - output_dir=output_dir, - ) - - try: - converter_1.run_full_process() - converter_2.run_full_process() - converter_3.run_full_process() - logger.info(f"🎉 Conversion successful! 
Saved to: {os.path.abspath(output_dir)}\n" - f"- Model Type: V2") - except Exception: - logger.error(f"❌ A critical error occurred during the conversion process") - logger.error(traceback.format_exc()) - remove_folder(output_dir) # 只在失败时清理输出目录 - finally: - # 无论成功还是失败,都尝试清理缓存目录 - remove_folder(CACHE_DIR) +from .VITSConverter import VITSConverter +from .T2SConverter import T2SModelConverter +from .EncoderConverter import EncoderConverter +from ...Utils.Constants import PACKAGE_NAME + +import logging +from typing import Optional, Tuple +import re +import os +import shutil +import traceback +import importlib.resources +import contextlib + +logger = logging.getLogger() + +CACHE_DIR = os.path.join(os.getcwd(), "Cache") +ENCODER_RESOURCE_PATH = "Data/v2/Models/t2s_encoder_fp32.onnx" +STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_stage_decoder_fp32.onnx" +FIRST_STAGE_DECODER_RESOURCE_PATH = "Data/v2/Models/t2s_first_stage_decoder_fp32.onnx" +VITS_RESOURCE_PATH = "Data/v2/Models/vits_fp32.onnx" +T2S_KEYS_RESOURCE_PATH = "Data/v2/Keys/t2s_onnx_keys.txt" +VITS_KEYS_RESOURCE_PATH = "Data/v2/Keys/vits_onnx_keys.txt" + + +def find_ckpt_and_pth(directory: str) -> Tuple[Optional[str], Optional[str]]: + """ + 在 directory(不递归子目录)里查找: + - .ckpt:从所有 .ckpt 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0), + 选择 epoch 最大的那个文件(若无则为 None) + - .pth :从所有 .pth 文件名中搜索 'e{正整数}' 作为 epoch(找不到则视为 e0), + 选择 epoch 最大的那个文件(若无则为 None) + 若出现相同 epoch,选修改时间较新的文件以打破平手。 + """ + best_ckpt_path: Optional[str] = None + best_ckpt_epoch: int = -1 + + best_pth_path: Optional[str] = None + best_pth_epoch: int = -1 + + for filename in os.listdir(directory): + full_path = os.path.join(directory, filename) + + if not os.path.isfile(full_path): + continue + + # 提取 epoch + m = re.search(r"e(\d+)", filename, flags=re.IGNORECASE) + epoch = int(m.group(1)) if m else 0 + + # .ckpt 文件处理 + if filename.lower().endswith(".ckpt"): + if ( + epoch > best_ckpt_epoch + or ( + epoch == best_ckpt_epoch + and best_ckpt_path is not None + and os.path.getmtime(full_path) > os.path.getmtime(best_ckpt_path) + ) + ): + best_ckpt_epoch = epoch + best_ckpt_path = full_path + + # .pth 文件处理 + elif filename.lower().endswith(".pth"): + if ( + epoch > best_pth_epoch + or ( + epoch == best_pth_epoch + and best_pth_path is not None + and os.path.getmtime(full_path) > os.path.getmtime(best_pth_path) + ) + ): + best_pth_epoch = epoch + best_pth_path = full_path + + return best_ckpt_path, best_pth_path + + +def remove_folder(folder: str) -> None: + try: + if os.path.exists(folder): + shutil.rmtree(folder) + logger.info(f"🧹 Folder cleaned: {folder}") + except Exception as e: + logger.error(f"❌ Failed to clean folder {folder}: {e}") + + +def convert(torch_ckpt_path: str, + torch_pth_path: str, + output_dir: str): + # 确保缓存和输出目录存在 + os.makedirs(CACHE_DIR, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + if len(os.listdir(output_dir)) > 0: + logger.warning(f"The output directory {output_dir} is not empty!") + + with contextlib.ExitStack() as stack: + files = importlib.resources.files(PACKAGE_NAME) + + def enter(p): + return stack.enter_context(importlib.resources.as_file(files.joinpath(p))) + + encoder_onnx_path = enter(ENCODER_RESOURCE_PATH) + stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH) + first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH) + vits_onnx_path = enter(VITS_RESOURCE_PATH) + t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH) + vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH) + + converter_1 = T2SModelConverter( + 
torch_ckpt_path=torch_ckpt_path, + stage_decoder_onnx_path=str(stage_decoder_path), + first_stage_decoder_onnx_path=str(first_stage_decoder_path), + key_list_file=str(t2s_keys_path), + output_dir=output_dir, + cache_dir=CACHE_DIR, + ) + converter_2 = VITSConverter( + torch_pth_path=torch_pth_path, + vits_onnx_path=str(vits_onnx_path), + key_list_file=str(vits_keys_path), + output_dir=output_dir, + cache_dir=CACHE_DIR, + ) + converter_3 = EncoderConverter( + ckpt_path=torch_ckpt_path, + pth_path=torch_pth_path, + onnx_input_path=str(encoder_onnx_path), + output_dir=output_dir, + ) + + try: + converter_1.run_full_process() + converter_2.run_full_process() + converter_3.run_full_process() + logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n" + f"- Model Type: V2") + except Exception: + logger.error(f"❌ A critical error occurred during the conversion process") + logger.error(traceback.format_exc()) + remove_folder(output_dir) # 只在失败时清理输出目录 + finally: + # 无论成功还是失败,都尝试清理缓存目录 + remove_folder(CACHE_DIR) diff --git a/genie_tts/Converter/v2/EncoderConverter.py b/genie_tts/Converter/v2/EncoderConverter.py index 33ed75ba3277ec535f9c2f88304c7c07cc2752b4..84eaf47856716ecf5646d81f439ae335390ba6f8 100644 --- a/genie_tts/Converter/v2/EncoderConverter.py +++ b/genie_tts/Converter/v2/EncoderConverter.py @@ -1,106 +1,106 @@ -import torch -import onnx -import os - -from ..load_state_dict import load_gpt_model, load_sovits_model - - -class EncoderConverter: - """ - 一个转换器,用于为 t2s_encoder 模型创建: - 1. 一个从 .ckpt 和 .pth 文件中合并而来的全精度 (fp32) .bin 权重文件。 - 2. 一个链接到该 .bin 文件的 ONNX 模型。 - """ - - def __init__(self, - ckpt_path: str, - pth_path: str, - onnx_input_path: str, - output_dir: str, - ): - self.ckpt_path: str = ckpt_path - self.pth_path: str = pth_path - self.onnx_input_path: str = onnx_input_path - self.output_dir: str = output_dir - - # 定义最终输出文件的路径 - self.output_bin_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.bin") - self.output_onnx_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx") - - # 确保输出目录存在 - os.makedirs(self.output_dir, exist_ok=True) - - # 检查所有输入文件是否存在 - for path in [self.ckpt_path, self.pth_path, self.onnx_input_path]: - if not os.path.exists(path): - raise FileNotFoundError(f"Error: Input file not found! Path: {path}") - - def run_full_process(self): - # 1. 定义固定的 ONNX 权重键列表 (此顺序决定了 .bin 文件的布局) - onnx_keys = [ - "encoder.ar_text_embedding.word_embeddings.weight", - "encoder.bert_proj.weight", - "encoder.bert_proj.bias", - "encoder.ar_text_position.alpha", - "vits.ssl_proj.weight", - "vits.ssl_proj.bias", - "vits.quantizer.vq.layers.0._codebook.embed" - ] - - # 2. 加载所有必要的模型和权重 - ckpt_state_dict = load_gpt_model(self.ckpt_path)['weight'] - pth_state_dict = load_sovits_model(self.pth_path)['weight'] - model = onnx.load(self.onnx_input_path, load_external_data=False) - initializer_map = {init.name: init for init in model.graph.initializer} - current_offset = 0 - bin_filename = os.path.basename(self.output_bin_path) - - # 3. 生成 .bin 文件并同步修改 ONNX 模型 - with open(self.output_bin_path, 'wb') as f_bin: - for onnx_key in onnx_keys: - source_key = "" - source_dict = None - - if onnx_key.startswith("encoder."): - source_key = "model." 
+ onnx_key[len("encoder."):] - source_dict = ckpt_state_dict - elif onnx_key.startswith("vits."): - source_key = onnx_key[len("vits."):] - source_dict = pth_state_dict - - if source_dict is None: - raise ValueError( - f"❌ Critical error: Unable to determine the weight source for ONNX key '{onnx_key}'.") - # 从源文件中提取张量 - tensor = source_dict.get(source_key) - if tensor is None: - raise ValueError( - f"❌ Critical error: Key '{source_key}' (corresponding to ONNX key '{onnx_key}') not found in the source file.") - - # 转换为 fp32 numpy 数组并获取字节 - numpy_array_fp32 = tensor.to(torch.float32).cpu().numpy() - tensor_bytes = numpy_array_fp32.tobytes() - tensor_length = len(tensor_bytes) - f_bin.write(tensor_bytes) - - # 在 ONNX 模型中找到对应的 initializer 并修改它 - if onnx_key in initializer_map: - tensor_proto = initializer_map[onnx_key] - - tensor_proto.ClearField('raw_data') - tensor_proto.data_location = onnx.TensorProto.EXTERNAL - del tensor_proto.external_data[:] - - keys_to_set = ["location", "offset", "length"] - values_to_set = [bin_filename, str(current_offset), str(tensor_length)] - - for k, v in zip(keys_to_set, values_to_set): - entry = tensor_proto.external_data.add() - entry.key = k - entry.value = v - - # 更新下一个权重的偏移量 - current_offset += tensor_length - - # 4. 保存修改后的 ONNX 模型 - onnx.save(model, self.output_onnx_path) +import torch +import onnx +import os + +from ..load_state_dict import load_gpt_model, load_sovits_model + + +class EncoderConverter: + """ + 一个转换器,用于为 t2s_encoder 模型创建: + 1. 一个从 .ckpt 和 .pth 文件中合并而来的全精度 (fp32) .bin 权重文件。 + 2. 一个链接到该 .bin 文件的 ONNX 模型。 + """ + + def __init__(self, + ckpt_path: str, + pth_path: str, + onnx_input_path: str, + output_dir: str, + ): + self.ckpt_path: str = ckpt_path + self.pth_path: str = pth_path + self.onnx_input_path: str = onnx_input_path + self.output_dir: str = output_dir + + # 定义最终输出文件的路径 + self.output_bin_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.bin") + self.output_onnx_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx") + + # 确保输出目录存在 + os.makedirs(self.output_dir, exist_ok=True) + + # 检查所有输入文件是否存在 + for path in [self.ckpt_path, self.pth_path, self.onnx_input_path]: + if not os.path.exists(path): + raise FileNotFoundError(f"Error: Input file not found! Path: {path}") + + def run_full_process(self): + # 1. 定义固定的 ONNX 权重键列表 (此顺序决定了 .bin 文件的布局) + onnx_keys = [ + "encoder.ar_text_embedding.word_embeddings.weight", + "encoder.bert_proj.weight", + "encoder.bert_proj.bias", + "encoder.ar_text_position.alpha", + "vits.ssl_proj.weight", + "vits.ssl_proj.bias", + "vits.quantizer.vq.layers.0._codebook.embed" + ] + + # 2. 加载所有必要的模型和权重 + ckpt_state_dict = load_gpt_model(self.ckpt_path)['weight'] + pth_state_dict = load_sovits_model(self.pth_path)['weight'] + model = onnx.load(self.onnx_input_path, load_external_data=False) + initializer_map = {init.name: init for init in model.graph.initializer} + current_offset = 0 + bin_filename = os.path.basename(self.output_bin_path) + + # 3. 生成 .bin 文件并同步修改 ONNX 模型 + with open(self.output_bin_path, 'wb') as f_bin: + for onnx_key in onnx_keys: + source_key = "" + source_dict = None + + if onnx_key.startswith("encoder."): + source_key = "model." 
+ onnx_key[len("encoder."):] + source_dict = ckpt_state_dict + elif onnx_key.startswith("vits."): + source_key = onnx_key[len("vits."):] + source_dict = pth_state_dict + + if source_dict is None: + raise ValueError( + f"❌ Critical error: Unable to determine the weight source for ONNX key '{onnx_key}'.") + # 从源文件中提取张量 + tensor = source_dict.get(source_key) + if tensor is None: + raise ValueError( + f"❌ Critical error: Key '{source_key}' (corresponding to ONNX key '{onnx_key}') not found in the source file.") + + # 转换为 fp32 numpy 数组并获取字节 + numpy_array_fp32 = tensor.to(torch.float32).cpu().numpy() + tensor_bytes = numpy_array_fp32.tobytes() + tensor_length = len(tensor_bytes) + f_bin.write(tensor_bytes) + + # 在 ONNX 模型中找到对应的 initializer 并修改它 + if onnx_key in initializer_map: + tensor_proto = initializer_map[onnx_key] + + tensor_proto.ClearField('raw_data') + tensor_proto.data_location = onnx.TensorProto.EXTERNAL + del tensor_proto.external_data[:] + + keys_to_set = ["location", "offset", "length"] + values_to_set = [bin_filename, str(current_offset), str(tensor_length)] + + for k, v in zip(keys_to_set, values_to_set): + entry = tensor_proto.external_data.add() + entry.key = k + entry.value = v + + # 更新下一个权重的偏移量 + current_offset += tensor_length + + # 4. 保存修改后的 ONNX 模型 + onnx.save(model, self.output_onnx_path) diff --git a/genie_tts/Converter/v2/T2SConverter.py b/genie_tts/Converter/v2/T2SConverter.py index 6a8de6925697ec911f6a22cfa4cdb31f4f73cbac..3cc386379cc56f008592bfeecc25852feae72d61 100644 --- a/genie_tts/Converter/v2/T2SConverter.py +++ b/genie_tts/Converter/v2/T2SConverter.py @@ -1,125 +1,125 @@ -import torch -import onnx -import numpy as np -import json -import os -from collections import OrderedDict - -from ..load_state_dict import load_gpt_model - - -class T2SModelConverter: - """ - 一个专门的转换器,用于处理 t2s (Text-to-Speech) 模型。 - - PyTorch 模型: .ckpt 文件 - - ONNX 模型: t2s_stage_decoder_fp32.onnx - - 遵循特定的键名映射规则。 - """ - - def __init__(self, - torch_ckpt_path: str, - stage_decoder_onnx_path: str, - first_stage_decoder_onnx_path: str, - key_list_file: str, - output_dir: str, - cache_dir: str, - ): - self.torch_ckpt_path: str = torch_ckpt_path - self.stage_decoder_onnx_path: str = stage_decoder_onnx_path - self.first_stage_decoder_onnx_path: str = first_stage_decoder_onnx_path - self.key_list_file: str = key_list_file - self.output_dir: str = output_dir - self.cache_dir: str = cache_dir - - os.makedirs(self.output_dir, exist_ok=True) - os.makedirs(self.output_dir, exist_ok=True) - - # 定义输出文件路径 - self.fp16_bin_path: str = os.path.join(self.output_dir, "t2s_shared_fp16.bin") - self.index_table_path: str = os.path.join(self.cache_dir, "t2s_weights_index_fp32.json") - self.relinked_encoder_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx") - self.relinked_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_stage_decoder_fp32.onnx") - self.relinked_first_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_first_stage_decoder_fp32.onnx") - self.reconstructed_fp32_bin_path = os.path.join(self.output_dir, "t2s_shared_fp32.bin") - - def step1_create_fp16_bin_with_key_mapping(self): - """ - (1) 根据特定的键映射规则,从 .ckpt 创建 fp16 .bin 和 fp32 索引。 - (已根据用户验证脚本的正确逻辑进行最终修正) - """ - if not os.path.exists(self.key_list_file): - raise FileNotFoundError( - f"Error: Stage 1 requires the key list file, but it was not found: {self.key_list_file}") - - with open(self.key_list_file, 'r') as f: - onnx_keys = [line.strip() for line in f.readlines()] - - ckpt_data = 
load_gpt_model(self.torch_ckpt_path) - if 'weight' not in ckpt_data: - raise KeyError( - f"❌ Error: 'weight' key not found in the .ckpt file. Top-level keys in the file are: {list(ckpt_data.keys())}") - - torch_state_dict = ckpt_data['weight'] - - index_table = OrderedDict() - current_fp32_offset = 0 - - with open(self.fp16_bin_path, 'wb') as f_bin: - for onnx_key in onnx_keys: - transformed_onnx_key = onnx_key.replace('transformer_encoder', 'h') - torch_lookup_key = f"model.{transformed_onnx_key}" - torch_tensor = torch_state_dict.get(torch_lookup_key) - numpy_array_fp16 = torch_tensor.to(torch.float16).cpu().numpy() - f_bin.write(numpy_array_fp16.tobytes()) - tensor_length_fp32 = numpy_array_fp16.nbytes * 2 - index_table[onnx_key] = {'offset': current_fp32_offset, 'length': tensor_length_fp32} - current_fp32_offset += tensor_length_fp32 - - with open(self.index_table_path, 'w') as f_json: - json.dump(index_table, f_json, indent=4) # type: ignore - - def step2_relink_onnx_for_fp32(self, old_model: str, new_model: str): - """ - (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到未来的全精度 .bin。 - (使用与第一个脚本相同的、更稳定的底层方法) - """ - if not os.path.exists(self.index_table_path): - raise FileNotFoundError( - f"Error: Stage 2 requires the index file, but it was not found: {self.index_table_path}") - - # 加载描述 fp32 布局的索引表 - with open(self.index_table_path, 'r') as f: - index_table = json.load(f) - - model = onnx.load_model(old_model, load_external_data=False) - reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path) - - for tensor in model.graph.initializer: - if tensor.name in index_table: - tensor.ClearField('raw_data') - tensor.data_location = onnx.TensorProto.EXTERNAL - info = index_table[tensor.name] - del tensor.external_data[:] - keys = ["location", "offset", "length"] - values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])] - - for k, v in zip(keys, values): - entry = tensor.external_data.add() - entry.key = k - entry.value = v - - onnx.save(model, new_model) - - @staticmethod - def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str): - """ - (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。 - """ - fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16) - fp32_array = fp16_array.astype(np.float32) - fp32_array.tofile(output_fp32_bin_path) - - def run_full_process(self): - self.step1_create_fp16_bin_with_key_mapping() - self.step2_relink_onnx_for_fp32(self.stage_decoder_onnx_path, self.relinked_stage_decoder_path) - self.step2_relink_onnx_for_fp32(self.first_stage_decoder_onnx_path, self.relinked_first_stage_decoder_path) +import torch +import onnx +import numpy as np +import json +import os +from collections import OrderedDict + +from ..load_state_dict import load_gpt_model + + +class T2SModelConverter: + """ + 一个专门的转换器,用于处理 t2s (Text-to-Speech) 模型。 + - PyTorch 模型: .ckpt 文件 + - ONNX 模型: t2s_stage_decoder_fp32.onnx + - 遵循特定的键名映射规则。 + """ + + def __init__(self, + torch_ckpt_path: str, + stage_decoder_onnx_path: str, + first_stage_decoder_onnx_path: str, + key_list_file: str, + output_dir: str, + cache_dir: str, + ): + self.torch_ckpt_path: str = torch_ckpt_path + self.stage_decoder_onnx_path: str = stage_decoder_onnx_path + self.first_stage_decoder_onnx_path: str = first_stage_decoder_onnx_path + self.key_list_file: str = key_list_file + self.output_dir: str = output_dir + self.cache_dir: str = cache_dir + + os.makedirs(self.output_dir, exist_ok=True) + os.makedirs(self.output_dir, exist_ok=True) + + # 定义输出文件路径 + self.fp16_bin_path: str = 
os.path.join(self.output_dir, "t2s_shared_fp16.bin") + self.index_table_path: str = os.path.join(self.cache_dir, "t2s_weights_index_fp32.json") + self.relinked_encoder_path: str = os.path.join(self.output_dir, "t2s_encoder_fp32.onnx") + self.relinked_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_stage_decoder_fp32.onnx") + self.relinked_first_stage_decoder_path: str = os.path.join(self.output_dir, "t2s_first_stage_decoder_fp32.onnx") + self.reconstructed_fp32_bin_path = os.path.join(self.output_dir, "t2s_shared_fp32.bin") + + def step1_create_fp16_bin_with_key_mapping(self): + """ + (1) 根据特定的键映射规则,从 .ckpt 创建 fp16 .bin 和 fp32 索引。 + (已根据用户验证脚本的正确逻辑进行最终修正) + """ + if not os.path.exists(self.key_list_file): + raise FileNotFoundError( + f"Error: Stage 1 requires the key list file, but it was not found: {self.key_list_file}") + + with open(self.key_list_file, 'r') as f: + onnx_keys = [line.strip() for line in f.readlines()] + + ckpt_data = load_gpt_model(self.torch_ckpt_path) + if 'weight' not in ckpt_data: + raise KeyError( + f"❌ Error: 'weight' key not found in the .ckpt file. Top-level keys in the file are: {list(ckpt_data.keys())}") + + torch_state_dict = ckpt_data['weight'] + + index_table = OrderedDict() + current_fp32_offset = 0 + + with open(self.fp16_bin_path, 'wb') as f_bin: + for onnx_key in onnx_keys: + transformed_onnx_key = onnx_key.replace('transformer_encoder', 'h') + torch_lookup_key = f"model.{transformed_onnx_key}" + torch_tensor = torch_state_dict.get(torch_lookup_key) + numpy_array_fp16 = torch_tensor.to(torch.float16).cpu().numpy() + f_bin.write(numpy_array_fp16.tobytes()) + tensor_length_fp32 = numpy_array_fp16.nbytes * 2 + index_table[onnx_key] = {'offset': current_fp32_offset, 'length': tensor_length_fp32} + current_fp32_offset += tensor_length_fp32 + + with open(self.index_table_path, 'w') as f_json: + json.dump(index_table, f_json, indent=4) # type: ignore + + def step2_relink_onnx_for_fp32(self, old_model: str, new_model: str): + """ + (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到未来的全精度 .bin。 + (使用与第一个脚本相同的、更稳定的底层方法) + """ + if not os.path.exists(self.index_table_path): + raise FileNotFoundError( + f"Error: Stage 2 requires the index file, but it was not found: {self.index_table_path}") + + # 加载描述 fp32 布局的索引表 + with open(self.index_table_path, 'r') as f: + index_table = json.load(f) + + model = onnx.load_model(old_model, load_external_data=False) + reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path) + + for tensor in model.graph.initializer: + if tensor.name in index_table: + tensor.ClearField('raw_data') + tensor.data_location = onnx.TensorProto.EXTERNAL + info = index_table[tensor.name] + del tensor.external_data[:] + keys = ["location", "offset", "length"] + values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])] + + for k, v in zip(keys, values): + entry = tensor.external_data.add() + entry.key = k + entry.value = v + + onnx.save(model, new_model) + + @staticmethod + def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str): + """ + (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。 + """ + fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16) + fp32_array = fp16_array.astype(np.float32) + fp32_array.tofile(output_fp32_bin_path) + + def run_full_process(self): + self.step1_create_fp16_bin_with_key_mapping() + self.step2_relink_onnx_for_fp32(self.stage_decoder_onnx_path, self.relinked_stage_decoder_path) + self.step2_relink_onnx_for_fp32(self.first_stage_decoder_onnx_path, 
self.relinked_first_stage_decoder_path) diff --git a/genie_tts/Converter/v2/VITSConverter.py b/genie_tts/Converter/v2/VITSConverter.py index 8b04086a174d0edd3e0da75b7a76da9fc09491c6..41dedc0096c300b7c9d302586f4daff46909a298 100644 --- a/genie_tts/Converter/v2/VITSConverter.py +++ b/genie_tts/Converter/v2/VITSConverter.py @@ -1,129 +1,129 @@ -import torch -import onnx -import numpy as np -import json -import os -from collections import OrderedDict - -from ..load_state_dict import load_sovits_model - - -class VITSConverter: - """ - 一个转换器,用于从 PyTorch 模型创建: - 1. 一个用于分发的半精度 (fp16) .bin 权重文件。 - 2. 一个与全精度 (fp32) 布局兼容的 ONNX 模型。 - 3. 一个可以将 fp16 .bin 文件还原为 fp32 .bin 的工具函数。 - """ - - def __init__(self, - torch_pth_path: str, - vits_onnx_path: str, - key_list_file: str, - output_dir: str, - cache_dir: str, - ): - self.torch_pth_path: str = torch_pth_path - self.vits_onnx_path: str = vits_onnx_path - self.key_list_file: str = key_list_file - self.output_dir: str = output_dir - self.cache_dir: str = cache_dir - # 定义输出文件路径 - self.fp16_bin_path: str = os.path.join(self.output_dir, "vits_fp16.bin") - self.index_table_path: str = os.path.join(self.cache_dir, "vits_weights_index_fp32.json") - self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "vits_fp32.onnx") - self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "vits_fp32.bin") - - # 确保输出目录存在 - os.makedirs(self.cache_dir, exist_ok=True) - os.makedirs(self.output_dir, exist_ok=True) - - if not os.path.exists(self.key_list_file): - raise FileNotFoundError(f"Error: Key list file not found! Path: {self.key_list_file}") - - def step1_create_fp16_bin_and_fp32_index(self): - """ - (1) 创建一个半精度 (fp16) 的 .bin 文件,但生成一个 - 描述全精度 (fp32) 布局的索引表。 - """ - # 加载 key 列表 - with open(self.key_list_file, 'r') as f: - onnx_keys = [line.strip() for line in f.readlines()] - - # 加载 PyTorch 模型权重 - torch_state_dict = load_sovits_model(self.torch_pth_path)['weight'] - - index_table = OrderedDict() - current_fp32_offset = 0 - - with open(self.fp16_bin_path, 'wb') as f_bin: - for onnx_key in onnx_keys: - torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key - - torch_tensor = torch_state_dict.get(torch_key) - if torch_tensor is None: - raise ValueError(f"❌ Critical error: Key '{torch_key}' not found in the PyTorch weights") - - # 转换为 fp16 并写入文件 - torch_tensor_fp16 = torch_tensor.to(torch.float16) - numpy_array_fp16 = torch_tensor_fp16.cpu().numpy() - tensor_bytes_fp16 = numpy_array_fp16.tobytes() - f_bin.write(tensor_bytes_fp16) - tensor_length_fp32 = len(tensor_bytes_fp16) * 2 - index_table[onnx_key] = { - 'offset': current_fp32_offset, - 'length': tensor_length_fp32 - } - current_fp32_offset += tensor_length_fp32 - - # 保存描述 fp32 布局的索引表 - with open(self.index_table_path, 'w') as f_json: - json.dump(index_table, f_json, indent=4) # type: ignore - - def step2_relink_onnx_for_fp32(self): - """ - (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到一个 - 未来的、全精度的 .bin 文件。 - """ - # 加载描述 fp32 布局的索引表 - with open(self.index_table_path, 'r') as f: - index_table = json.load(f) - - model = onnx.load_model(self.vits_onnx_path, load_external_data=False) - reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path) - - for tensor in model.graph.initializer: - if tensor.name in index_table: - tensor.ClearField('raw_data') - tensor.data_location = onnx.TensorProto.EXTERNAL - info = index_table[tensor.name] - - del tensor.external_data[:] - - keys = ["location", "offset", "length"] - values = [reconstructed_bin_filename, 
str(info['offset']), str(info['length'])] - - for k, v in zip(keys, values): - entry = tensor.external_data.add() - entry.key = k - entry.value = v - - # 保存修改后的、链接到 fp32 权重的 ONNX 模型 - onnx.save(model, self.relinked_fp32_onnx_path) - - @staticmethod - def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str): - """ - (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。 - - Args: - fp16_bin_path (str): 输入的半精度 .bin 文件路径。 - output_fp32_bin_path (str): 输出的全精度 .bin 文件路径。 - """ - fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16) - fp32_array = fp16_array.astype(np.float32) - fp32_array.tofile(output_fp32_bin_path) - - def run_full_process(self): - self.step1_create_fp16_bin_and_fp32_index() - self.step2_relink_onnx_for_fp32() +import torch +import onnx +import numpy as np +import json +import os +from collections import OrderedDict + +from ..load_state_dict import load_sovits_model + + +class VITSConverter: + """ + 一个转换器,用于从 PyTorch 模型创建: + 1. 一个用于分发的半精度 (fp16) .bin 权重文件。 + 2. 一个与全精度 (fp32) 布局兼容的 ONNX 模型。 + 3. 一个可以将 fp16 .bin 文件还原为 fp32 .bin 的工具函数。 + """ + + def __init__(self, + torch_pth_path: str, + vits_onnx_path: str, + key_list_file: str, + output_dir: str, + cache_dir: str, + ): + self.torch_pth_path: str = torch_pth_path + self.vits_onnx_path: str = vits_onnx_path + self.key_list_file: str = key_list_file + self.output_dir: str = output_dir + self.cache_dir: str = cache_dir + # 定义输出文件路径 + self.fp16_bin_path: str = os.path.join(self.output_dir, "vits_fp16.bin") + self.index_table_path: str = os.path.join(self.cache_dir, "vits_weights_index_fp32.json") + self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "vits_fp32.onnx") + self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "vits_fp32.bin") + + # 确保输出目录存在 + os.makedirs(self.cache_dir, exist_ok=True) + os.makedirs(self.output_dir, exist_ok=True) + + if not os.path.exists(self.key_list_file): + raise FileNotFoundError(f"Error: Key list file not found! 
Path: {self.key_list_file}") + + def step1_create_fp16_bin_and_fp32_index(self): + """ + (1) 创建一个半精度 (fp16) 的 .bin 文件,但生成一个 + 描述全精度 (fp32) 布局的索引表。 + """ + # 加载 key 列表 + with open(self.key_list_file, 'r') as f: + onnx_keys = [line.strip() for line in f.readlines()] + + # 加载 PyTorch 模型权重 + torch_state_dict = load_sovits_model(self.torch_pth_path)['weight'] + + index_table = OrderedDict() + current_fp32_offset = 0 + + with open(self.fp16_bin_path, 'wb') as f_bin: + for onnx_key in onnx_keys: + torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key + + torch_tensor = torch_state_dict.get(torch_key) + if torch_tensor is None: + raise ValueError(f"❌ Critical error: Key '{torch_key}' not found in the PyTorch weights") + + # 转换为 fp16 并写入文件 + torch_tensor_fp16 = torch_tensor.to(torch.float16) + numpy_array_fp16 = torch_tensor_fp16.cpu().numpy() + tensor_bytes_fp16 = numpy_array_fp16.tobytes() + f_bin.write(tensor_bytes_fp16) + tensor_length_fp32 = len(tensor_bytes_fp16) * 2 + index_table[onnx_key] = { + 'offset': current_fp32_offset, + 'length': tensor_length_fp32 + } + current_fp32_offset += tensor_length_fp32 + + # 保存描述 fp32 布局的索引表 + with open(self.index_table_path, 'w') as f_json: + json.dump(index_table, f_json, indent=4) # type: ignore + + def step2_relink_onnx_for_fp32(self): + """ + (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到一个 + 未来的、全精度的 .bin 文件。 + """ + # 加载描述 fp32 布局的索引表 + with open(self.index_table_path, 'r') as f: + index_table = json.load(f) + + model = onnx.load_model(self.vits_onnx_path, load_external_data=False) + reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path) + + for tensor in model.graph.initializer: + if tensor.name in index_table: + tensor.ClearField('raw_data') + tensor.data_location = onnx.TensorProto.EXTERNAL + info = index_table[tensor.name] + + del tensor.external_data[:] + + keys = ["location", "offset", "length"] + values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])] + + for k, v in zip(keys, values): + entry = tensor.external_data.add() + entry.key = k + entry.value = v + + # 保存修改后的、链接到 fp32 权重的 ONNX 模型 + onnx.save(model, self.relinked_fp32_onnx_path) + + @staticmethod + def step3_reconstruct_fp32_bin_from_fp16(fp16_bin_path: str, output_fp32_bin_path: str): + """ + (3) 静态工具函数:从半精度 .bin 文件还原出全精度 .bin 文件。 + + Args: + fp16_bin_path (str): 输入的半精度 .bin 文件路径。 + output_fp32_bin_path (str): 输出的全精度 .bin 文件路径。 + """ + fp16_array = np.fromfile(fp16_bin_path, dtype=np.float16) + fp32_array = fp16_array.astype(np.float32) + fp32_array.tofile(output_fp32_bin_path) + + def run_full_process(self): + self.step1_create_fp16_bin_and_fp32_index() + self.step2_relink_onnx_for_fp32() diff --git a/genie_tts/Converter/v2/__pycache__/Converter.cpython-311.pyc b/genie_tts/Converter/v2/__pycache__/Converter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a4ffe154b36e0fb80500290bfc782cf33672ae5 Binary files /dev/null and b/genie_tts/Converter/v2/__pycache__/Converter.cpython-311.pyc differ diff --git a/genie_tts/Converter/v2/__pycache__/EncoderConverter.cpython-311.pyc b/genie_tts/Converter/v2/__pycache__/EncoderConverter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d66294d6f6ebb839a254efccd12d963b34790ff4 Binary files /dev/null and b/genie_tts/Converter/v2/__pycache__/EncoderConverter.cpython-311.pyc differ diff --git a/genie_tts/Converter/v2/__pycache__/T2SConverter.cpython-311.pyc 
b/genie_tts/Converter/v2/__pycache__/T2SConverter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7414f33a9f18071f8033f23fe76c535a9474b55c Binary files /dev/null and b/genie_tts/Converter/v2/__pycache__/T2SConverter.cpython-311.pyc differ diff --git a/genie_tts/Converter/v2/__pycache__/VITSConverter.cpython-311.pyc b/genie_tts/Converter/v2/__pycache__/VITSConverter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86a00f5771faa85cce35519fc8f341e6c640655c Binary files /dev/null and b/genie_tts/Converter/v2/__pycache__/VITSConverter.cpython-311.pyc differ diff --git a/genie_tts/Converter/v2/__pycache__/__init__.cpython-311.pyc b/genie_tts/Converter/v2/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be74ed9afd7309856f38d6340e7b3f79fab95134 Binary files /dev/null and b/genie_tts/Converter/v2/__pycache__/__init__.cpython-311.pyc differ diff --git a/genie_tts/Converter/v2ProPlus/Converter.py b/genie_tts/Converter/v2ProPlus/Converter.py index 9fbac43d89704024911e8618a13413bce834ddc2..7f275a2e96ed410c25ea2d9cc320c24d4f9babd9 100644 --- a/genie_tts/Converter/v2ProPlus/Converter.py +++ b/genie_tts/Converter/v2ProPlus/Converter.py @@ -1,89 +1,89 @@ -import logging -import traceback -import os -import contextlib -import importlib.resources - -from ...Utils.Constants import PACKAGE_NAME -from ..v2.VITSConverter import VITSConverter -from ..v2.T2SConverter import T2SModelConverter -from ..v2.EncoderConverter import EncoderConverter -from ..v2.Converter import (ENCODER_RESOURCE_PATH, STAGE_DECODER_RESOURCE_PATH, - FIRST_STAGE_DECODER_RESOURCE_PATH, T2S_KEYS_RESOURCE_PATH, CACHE_DIR, remove_folder) -from .PromptEncoderConverter import PromptEncoderConverter - -logger = logging.getLogger() - -# 使用 V2 ProPlus 的文件。 -VITS_RESOURCE_PATH = "Data/v2ProPlus/Models/vits_fp32.onnx" -PROMPT_ENCODER_RESOURCE_PATH = "Data/v2ProPlus/Models/prompt_encoder_fp32.onnx" -VITS_KEYS_RESOURCE_PATH = "Data/v2ProPlus/Keys/vits_weights.txt" -PROMPT_ENCODER_KEYS_RESOURCE_PATH = "Data/v2ProPlus/Keys/prompt_encoder_weights.txt" - - -def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None: - # 确保缓存和输出目录存在 - os.makedirs(CACHE_DIR, exist_ok=True) - os.makedirs(output_dir, exist_ok=True) - - if len(os.listdir(output_dir)) > 0: - logger.warning(f"The output directory {output_dir} is not empty!") - - with contextlib.ExitStack() as stack: - files = importlib.resources.files(PACKAGE_NAME) - - def enter(p: str) -> str: - return str(stack.enter_context(importlib.resources.as_file(files.joinpath(p)))) - - encoder_onnx_path = enter(ENCODER_RESOURCE_PATH) - stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH) - first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH) - vits_onnx_path = enter(VITS_RESOURCE_PATH) - t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH) - vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH) - prompt_encoder_path = enter(PROMPT_ENCODER_RESOURCE_PATH) - prompt_encoder_keys_path = enter(PROMPT_ENCODER_KEYS_RESOURCE_PATH) - - converter_1 = T2SModelConverter( - torch_ckpt_path=torch_ckpt_path, - stage_decoder_onnx_path=stage_decoder_path, - first_stage_decoder_onnx_path=first_stage_decoder_path, - key_list_file=t2s_keys_path, - output_dir=output_dir, - cache_dir=CACHE_DIR, - ) - converter_2 = VITSConverter( - torch_pth_path=torch_pth_path, - vits_onnx_path=vits_onnx_path, - key_list_file=vits_keys_path, - output_dir=output_dir, - cache_dir=CACHE_DIR, - ) - converter_3 
= EncoderConverter( - ckpt_path=torch_ckpt_path, - pth_path=torch_pth_path, - onnx_input_path=encoder_onnx_path, - output_dir=output_dir, - ) - converter_4 = PromptEncoderConverter( - torch_pth_path=torch_pth_path, - prompt_encoder_onnx_path=prompt_encoder_path, - key_list_file=prompt_encoder_keys_path, - output_dir=output_dir, - cache_dir=CACHE_DIR, - ) - - try: - converter_1.run_full_process() - converter_2.run_full_process() - converter_3.run_full_process() - converter_4.run_full_process() - logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n" - f"- Model Type: V2ProPlus") - except Exception: - logger.error(f"❌ A critical error occurred during the conversion process") - logger.error(traceback.format_exc()) - remove_folder(output_dir) # 只在失败时清理输出目录 - finally: - # 无论成功还是失败,都尝试清理缓存目录 - remove_folder(CACHE_DIR) +import logging +import traceback +import os +import contextlib +import importlib.resources + +from ...Utils.Constants import PACKAGE_NAME +from ..v2.VITSConverter import VITSConverter +from ..v2.T2SConverter import T2SModelConverter +from ..v2.EncoderConverter import EncoderConverter +from ..v2.Converter import (ENCODER_RESOURCE_PATH, STAGE_DECODER_RESOURCE_PATH, + FIRST_STAGE_DECODER_RESOURCE_PATH, T2S_KEYS_RESOURCE_PATH, CACHE_DIR, remove_folder) +from .PromptEncoderConverter import PromptEncoderConverter + +logger = logging.getLogger() + +# 使用 V2 ProPlus 的文件。 +VITS_RESOURCE_PATH = "Data/v2ProPlus/Models/vits_fp32.onnx" +PROMPT_ENCODER_RESOURCE_PATH = "Data/v2ProPlus/Models/prompt_encoder_fp32.onnx" +VITS_KEYS_RESOURCE_PATH = "./Data/v2ProPlus/Keys/vits_weights.txt" +PROMPT_ENCODER_KEYS_RESOURCE_PATH = "./Data/v2ProPlus/Keys/prompt_encoder_weights.txt" + + +def convert(torch_ckpt_path: str, torch_pth_path: str, output_dir: str) -> None: + # 确保缓存和输出目录存在 + os.makedirs(CACHE_DIR, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + if len(os.listdir(output_dir)) > 0: + logger.warning(f"The output directory {output_dir} is not empty!") + + with contextlib.ExitStack() as stack: + files = importlib.resources.files(PACKAGE_NAME) + + def enter(p: str) -> str: + return str(stack.enter_context(importlib.resources.as_file(files.joinpath(p)))) + + encoder_onnx_path = enter(ENCODER_RESOURCE_PATH) + stage_decoder_path = enter(STAGE_DECODER_RESOURCE_PATH) + first_stage_decoder_path = enter(FIRST_STAGE_DECODER_RESOURCE_PATH) + vits_onnx_path = enter(VITS_RESOURCE_PATH) + t2s_keys_path = enter(T2S_KEYS_RESOURCE_PATH) + vits_keys_path = enter(VITS_KEYS_RESOURCE_PATH) + prompt_encoder_path = enter(PROMPT_ENCODER_RESOURCE_PATH) + prompt_encoder_keys_path = enter(PROMPT_ENCODER_KEYS_RESOURCE_PATH) + + converter_1 = T2SModelConverter( + torch_ckpt_path=torch_ckpt_path, + stage_decoder_onnx_path=stage_decoder_path, + first_stage_decoder_onnx_path=first_stage_decoder_path, + key_list_file=t2s_keys_path, + output_dir=output_dir, + cache_dir=CACHE_DIR, + ) + converter_2 = VITSConverter( + torch_pth_path=torch_pth_path, + vits_onnx_path=vits_onnx_path, + key_list_file=vits_keys_path, + output_dir=output_dir, + cache_dir=CACHE_DIR, + ) + converter_3 = EncoderConverter( + ckpt_path=torch_ckpt_path, + pth_path=torch_pth_path, + onnx_input_path=encoder_onnx_path, + output_dir=output_dir, + ) + converter_4 = PromptEncoderConverter( + torch_pth_path=torch_pth_path, + prompt_encoder_onnx_path=prompt_encoder_path, + key_list_file=prompt_encoder_keys_path, + output_dir=output_dir, + cache_dir=CACHE_DIR, + ) + + try: + converter_1.run_full_process() + converter_2.run_full_process() 
+ converter_3.run_full_process() + converter_4.run_full_process() + logger.info(f"🎉 Conversion successful! Saved to: {os.path.abspath(output_dir)}\n" + f"- Model Type: V2ProPlus") + except Exception: + logger.error(f"❌ A critical error occurred during the conversion process") + logger.error(traceback.format_exc()) + remove_folder(output_dir) # 只在失败时清理输出目录 + finally: + # 无论成功还是失败,都尝试清理缓存目录 + remove_folder(CACHE_DIR) diff --git a/genie_tts/Converter/v2ProPlus/PromptEncoderConverter.py b/genie_tts/Converter/v2ProPlus/PromptEncoderConverter.py index 55bac8664c15b2bbaae6fcfc71f13f4feb8b4b1d..395645d87ac20615b834f00d86e8932cdf6f1d49 100644 --- a/genie_tts/Converter/v2ProPlus/PromptEncoderConverter.py +++ b/genie_tts/Converter/v2ProPlus/PromptEncoderConverter.py @@ -1,128 +1,128 @@ -import torch -import onnx -import json -import os -from collections import OrderedDict - -from ..load_state_dict import load_sovits_model - - -class PromptEncoderConverter: - """ - 一个转换器,用于从 PyTorch 模型创建: - 1. 一个用于分发的半精度 (fp16) .bin 权重文件。 - 2. 一个与全精度 (fp32) 布局兼容的 ONNX 模型。 - 3. 一个可以将 fp16 .bin 文件还原为 fp32 .bin 的工具函数。 - """ - - def __init__(self, - torch_pth_path: str, - prompt_encoder_onnx_path: str, - key_list_file: str, - output_dir: str, - cache_dir: str, - ): - self.torch_pth_path: str = torch_pth_path - self.vits_onnx_path: str = prompt_encoder_onnx_path - self.key_list_file: str = key_list_file - self.output_dir: str = output_dir - self.cache_dir: str = cache_dir - # 定义输出文件路径 - self.fp16_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp16.bin") - self.index_table_path: str = os.path.join(self.cache_dir, "prompt_encoder_weights_index_fp32.json") - self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.onnx") - self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.bin") - - # 确保输出目录存在 - os.makedirs(self.cache_dir, exist_ok=True) - os.makedirs(self.output_dir, exist_ok=True) - - if not os.path.exists(self.key_list_file): - raise FileNotFoundError(f"错误: Key 列表文件未找到! 
路径: {self.key_list_file}") - - def step1_create_fp16_bin_and_fp32_index(self): - """ - (1) 创建一个半精度 (fp16) 的 .bin 文件,但生成一个 - 描述全精度 (fp32) 布局的索引表。 - """ - # 加载 key 列表 - with open(self.key_list_file, 'r') as f: - onnx_keys = [line.strip() for line in f.readlines()] - - # 加载 PyTorch 模型权重 - torch_state_dict = load_sovits_model(self.torch_pth_path)['weight'] - - index_table = OrderedDict() - # 这个偏移量将按照 fp32 的大小进行累加 - current_fp32_offset = 0 - - with open(self.fp16_bin_path, 'wb') as f_bin: - for onnx_key in onnx_keys: - torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key - - torch_tensor = torch_state_dict.get(torch_key) - if torch_tensor is None: - raise ValueError(f"❌ 严重错误: 在 PyTorch 权重中找不到 Key '{torch_key}'") - - # 转换为 fp16 并写入文件 - torch_tensor_fp16 = torch_tensor.to(torch.float16) - numpy_array_fp16 = torch_tensor_fp16.cpu().numpy() - tensor_bytes_fp16 = numpy_array_fp16.tobytes() - f_bin.write(tensor_bytes_fp16) - - # 关键步骤:计算并记录 fp32 的长度和偏移量 - # 一个 fp32 = 4 字节, 一个 fp16 = 2 字节。所以 fp32 长度是 fp16 的两倍。 - tensor_length_fp32 = len(tensor_bytes_fp16) * 2 - - index_table[onnx_key] = { - 'offset': current_fp32_offset, - 'length': tensor_length_fp32 - } - - # 偏移量也按照 fp32 的长度进行累加 - current_fp32_offset += tensor_length_fp32 - - # 保存描述 fp32 布局的索引表 - with open(self.index_table_path, 'w') as f_json: - json.dump(index_table, f_json, indent=4) # type: ignore - - def step2_relink_onnx_for_fp32(self): - """ - (2) 根据 fp32 索引表,修改 ONNX 模型,使其链接到一个 - 未来的、全精度的 .bin 文件。 - """ - # 加载描述 fp32 布局的索引表 - with open(self.index_table_path, 'r') as f: - index_table = json.load(f) - - # 加载 ONNX 模型结构 - model = onnx.load_model(self.vits_onnx_path, load_external_data=False) - - # 这个 ONNX 模型将要链接的 .bin 文件名 - reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path) - - for tensor in model.graph.initializer: - if tensor.name in index_table: - tensor.ClearField('raw_data') - tensor.data_location = onnx.TensorProto.EXTERNAL - info = index_table[tensor.name] - - del tensor.external_data[:] - - keys = ["location", "offset", "length"] - values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])] - - for k, v in zip(keys, values): - entry = tensor.external_data.add() - entry.key = k - entry.value = v - - # 保存修改后的、链接到 fp32 权重的 ONNX 模型 - onnx.save(model, self.relinked_fp32_onnx_path) - - def run_full_process(self): - """ - 按顺序执行核心的转换步骤 (1 和 2)。 - """ - self.step1_create_fp16_bin_and_fp32_index() - self.step2_relink_onnx_for_fp32() +import torch +import onnx +import json +import os +from collections import OrderedDict + +from ..load_state_dict import load_sovits_model + + +class PromptEncoderConverter: + """ + 一个转换器,用于从 PyTorch 模型创建: + 1. 一个用于分发的半精度 (fp16) .bin 权重文件。 + 2. 一个与全精度 (fp32) 布局兼容的 ONNX 模型。 + 3. 
+    """
+
+    def __init__(self,
+                 torch_pth_path: str,
+                 prompt_encoder_onnx_path: str,
+                 key_list_file: str,
+                 output_dir: str,
+                 cache_dir: str,
+                 ):
+        self.torch_pth_path: str = torch_pth_path
+        self.vits_onnx_path: str = prompt_encoder_onnx_path
+        self.key_list_file: str = key_list_file
+        self.output_dir: str = output_dir
+        self.cache_dir: str = cache_dir
+        # Define output file paths
+        self.fp16_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp16.bin")
+        self.index_table_path: str = os.path.join(self.cache_dir, "prompt_encoder_weights_index_fp32.json")
+        self.relinked_fp32_onnx_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.onnx")
+        self.reconstructed_fp32_bin_path: str = os.path.join(self.output_dir, "prompt_encoder_fp32.bin")
+
+        # Make sure the output directories exist
+        os.makedirs(self.cache_dir, exist_ok=True)
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        if not os.path.exists(self.key_list_file):
+            raise FileNotFoundError(f"Error: key list file not found! Path: {self.key_list_file}")
+
+    def step1_create_fp16_bin_and_fp32_index(self):
+        """
+        (1) Create a half-precision (fp16) .bin file, but generate an
+        index table that describes the full-precision (fp32) layout.
+        """
+        # Load the key list
+        with open(self.key_list_file, 'r') as f:
+            onnx_keys = [line.strip() for line in f.readlines()]
+
+        # Load the PyTorch model weights
+        torch_state_dict = load_sovits_model(self.torch_pth_path)['weight']
+
+        index_table = OrderedDict()
+        # This offset accumulates in fp32 sizes
+        current_fp32_offset = 0
+
+        with open(self.fp16_bin_path, 'wb') as f_bin:
+            for onnx_key in onnx_keys:
+                torch_key = onnx_key[len("vq_model."):] if onnx_key.startswith("vq_model.") else onnx_key
+
+                torch_tensor = torch_state_dict.get(torch_key)
+                if torch_tensor is None:
+                    raise ValueError(f"❌ Critical error: key '{torch_key}' not found in the PyTorch weights")
+
+                # Convert to fp16 and write to the file
+                torch_tensor_fp16 = torch_tensor.to(torch.float16)
+                numpy_array_fp16 = torch_tensor_fp16.cpu().numpy()
+                tensor_bytes_fp16 = numpy_array_fp16.tobytes()
+                f_bin.write(tensor_bytes_fp16)
+
+                # Key step: compute and record the fp32 length and offset.
+                # One fp32 = 4 bytes and one fp16 = 2 bytes, so the fp32 length is twice the fp16 length.
+                tensor_length_fp32 = len(tensor_bytes_fp16) * 2
+
+                index_table[onnx_key] = {
+                    'offset': current_fp32_offset,
+                    'length': tensor_length_fp32
+                }
+
+                # The offset likewise advances by the fp32 length
+                current_fp32_offset += tensor_length_fp32
+
+        # Save the index table describing the fp32 layout
+        with open(self.index_table_path, 'w') as f_json:
+            json.dump(index_table, f_json, indent=4)  # type: ignore
+
+    def step2_relink_onnx_for_fp32(self):
+        """
+        (2) Using the fp32 index table, modify the ONNX model so that it
+        links to a future, full-precision .bin file.
+        """
+        # Load the index table describing the fp32 layout
+        with open(self.index_table_path, 'r') as f:
+            index_table = json.load(f)
+
+        # Load the ONNX model structure
+        model = onnx.load_model(self.vits_onnx_path, load_external_data=False)
+
+        # The .bin filename this ONNX model will link to
+        reconstructed_bin_filename = os.path.basename(self.reconstructed_fp32_bin_path)
+
+        for tensor in model.graph.initializer:
+            if tensor.name in index_table:
+                tensor.ClearField('raw_data')
+                tensor.data_location = onnx.TensorProto.EXTERNAL
+                info = index_table[tensor.name]
+
+                del tensor.external_data[:]
+
+                keys = ["location", "offset", "length"]
+                values = [reconstructed_bin_filename, str(info['offset']), str(info['length'])]
+
+                for k, v in zip(keys, values):
+                    entry = tensor.external_data.add()
+                    entry.key = k
+                    entry.value = v
+
+        # Save the modified ONNX model, linked to the fp32 weights
+        onnx.save(model, self.relinked_fp32_onnx_path)
+
+    def run_full_process(self):
+        """
+        Run the core conversion steps (1 and 2) in order.
+        """
+        self.step1_create_fp16_bin_and_fp32_index()
+        self.step2_relink_onnx_for_fp32()
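Point 3 of the docstring above refers to a utility that restores the distributed fp16 .bin to the fp32 .bin that the relinked ONNX model expects; that utility is not part of this file. Because step 1 writes every tensor back-to-back and records offsets and lengths in fp32 units, a single whole-buffer upcast reproduces exactly the layout the relinked initializers point at. A minimal sketch of such a step, assuming NumPy and the paths defined in __init__ (the name reconstruct_fp32_bin is illustrative, not taken from this patch):

    import numpy as np

    def reconstruct_fp32_bin(fp16_bin_path: str, fp32_bin_path: str) -> None:
        # Tensors are stored contiguously as fp16; upcasting the whole buffer
        # doubles every byte offset and length uniformly, which matches the
        # fp32 index table produced by step 1.
        fp16_weights = np.fromfile(fp16_bin_path, dtype=np.float16)
        fp16_weights.astype(np.float32).tofile(fp32_bin_path)

Once prompt_encoder_fp32.bin exists next to prompt_encoder_fp32.onnx, the relinked model can be loaded with its external data resolved.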
diff --git a/genie_tts/Core/Resources.py b/genie_tts/Core/Resources.py
index d049cec7c68450ed5abc59f0188838583fa053ab..71e99ead66abc53ad81ed36535a620f7d6f36e45 100644
--- a/genie_tts/Core/Resources.py
+++ b/genie_tts/Core/Resources.py
@@ -1,76 +1,76 @@
-import os
-from huggingface_hub import snapshot_download
-
-
-def download_genie_data() -> None:
-    print(f"🚀 Starting download Genie-TTS resources… This may take a few moments. ⏳")
-    snapshot_download(
-        repo_id="High-Logic/Genie",
-        repo_type="model",
-        allow_patterns="GenieData/*",
-        local_dir=".",
-        local_dir_use_symlinks=True,  # symlink
-    )
-    print("✅ Genie-TTS resources downloaded successfully.")
-
-
-def ensure_exists(path: str, name: str):
-    if not os.path.exists(path):
-        raise FileNotFoundError(
-            f"Required directory or file '{name}' was not found at: {path}\n"
-            f"Please download the pretrained models and place them under './GenieData', "
-            f"or set the environment variable GENIE_DATA_DIR to the correct directory."
-        )
-
-
-"""
-File structure kept in sync with the Midori project.
-"""
-
-GENIE_DATA_DIR: str = os.getenv(
-    "GENIE_DATA_DIR",
-    "./GenieData"
-)
-
-"""
-Japanese_G2P_DIR: str = os.getenv(
-    "Japanese_G2P_DIR",
-    f"{GENIE_DATA_DIR}/G2P/JapaneseG2P"
-)
-"""
-
-English_G2P_DIR: str = os.getenv(
-    "English_G2P_DIR",
-    f"{GENIE_DATA_DIR}/G2P/EnglishG2P"
-)
-
-Chinese_G2P_DIR: str = os.getenv(
-    "Chinese_G2P_DIR",
-    f"{GENIE_DATA_DIR}/G2P/ChineseG2P"
-)
-
-HUBERT_MODEL_DIR: str = os.getenv(
-    "HUBERT_MODEL_DIR",
-    f"{GENIE_DATA_DIR}/chinese-hubert-base"
-)
-
-SV_MODEL: str = os.getenv(
-    "SV_MODEL",
-    f"{GENIE_DATA_DIR}/speaker_encoder.onnx"
-)
-
-ROBERTA_MODEL_DIR: str = os.getenv(
-    "ROBERTA_MODEL_DIR",
-    f"{GENIE_DATA_DIR}/RoBERTa"
-)
-
-if not os.path.exists(GENIE_DATA_DIR):
-    print("⚠️ GenieData folder not found.")
-    choice = input("Would you like to download it automatically from HuggingFace? (y/N): ").strip().lower()
-    if choice == "y":
-        download_genie_data()
-
-# ---- Run directory checks ----
-ensure_exists(HUBERT_MODEL_DIR, "HUBERT_MODEL_DIR")
-ensure_exists(SV_MODEL, "SV_MODEL")
-# ensure_exists(ROBERTA_MODEL_DIR, "ROBERTA_MODEL_DIR")
+import os
+from huggingface_hub import snapshot_download
+
+
+def download_genie_data() -> None:
+    print("🚀 Starting download of Genie-TTS resources… This may take a few moments. ⏳")
⏳") + snapshot_download( + repo_id="High-Logic/Genie", + repo_type="model", + allow_patterns="GenieData/*", + local_dir=".", + local_dir_use_symlinks=True, # 软链接 + ) + print("✅ Genie-TTS resources downloaded successfully.") + + +def ensure_exists(path: str, name: str): + if not os.path.exists(path): + raise FileNotFoundError( + f"Required directory or file '{name}' was not found at: {path}\n" + f"Please download the pretrained models and place them under './GenieData', " + f"or set the environment variable GENIE_DATA_DIR to the correct directory." + ) + + +""" +文件结构与项目 Midori 同步。 +""" + +GENIE_DATA_DIR: str = os.getenv( + "GENIE_DATA_DIR", + "./GenieData" +) + +""" +Japanese_G2P_DIR: str = os.getenv( + "Japanese_G2P_DIR", + f"{GENIE_DATA_DIR}/G2P/JapaneseG2P" +) +""" + +English_G2P_DIR: str = os.getenv( + "English_G2P_DIR", + f"{GENIE_DATA_DIR}/G2P/EnglishG2P" +) + +Chinese_G2P_DIR: str = os.getenv( + "Chinese_G2P_DIR", + f"{GENIE_DATA_DIR}/G2P/ChineseG2P" +) + +HUBERT_MODEL_DIR: str = os.getenv( + "HUBERT_MODEL_DIR", + f"{GENIE_DATA_DIR}/chinese-hubert-base" +) + +SV_MODEL: str = os.getenv( + "SV_MODEL", + f"{GENIE_DATA_DIR}/speaker_encoder.onnx" +) + +ROBERTA_MODEL_DIR: str = os.getenv( + "ROBERTA_MODEL_DIR", + f"{GENIE_DATA_DIR}/RoBERTa" +) + +if not os.path.exists(GENIE_DATA_DIR): + print("⚠️ GenieData folder not found.") + choice = input("Would you like to download it automatically from HuggingFace? (y/N): ").strip().lower() + if choice == "y": + download_genie_data() + +# ---- Run directory checks ---- +ensure_exists(HUBERT_MODEL_DIR, "HUBERT_MODEL_DIR") +ensure_exists(SV_MODEL, "SV_MODEL") +# ensure_exists(ROBERTA_MODEL_DIR, "ROBERTA_MODEL_DIR") diff --git a/genie_tts/Core/__pycache__/Inference.cpython-311.pyc b/genie_tts/Core/__pycache__/Inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc408b34c0cb7f1447e50ef34fe72c694ee89476 Binary files /dev/null and b/genie_tts/Core/__pycache__/Inference.cpython-311.pyc differ diff --git a/genie_tts/Core/__pycache__/Resources.cpython-311.pyc b/genie_tts/Core/__pycache__/Resources.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40cc6cd804aa4e5079a184dacb63d9ff9468a1db Binary files /dev/null and b/genie_tts/Core/__pycache__/Resources.cpython-311.pyc differ diff --git a/genie_tts/Core/__pycache__/TTSPlayer.cpython-311.pyc b/genie_tts/Core/__pycache__/TTSPlayer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c673c0ffefc1c5f4c838879fed67cbdafae5a33d Binary files /dev/null and b/genie_tts/Core/__pycache__/TTSPlayer.cpython-311.pyc differ diff --git a/genie_tts/Core/__pycache__/__init__.cpython-311.pyc b/genie_tts/Core/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3078343b5ae3e6d139eae4f10e6c7f9e39dbcab4 Binary files /dev/null and b/genie_tts/Core/__pycache__/__init__.cpython-311.pyc differ diff --git a/genie_tts/Data/v2/Keys/t2s_onnx_keys.txt b/genie_tts/Data/v2/Keys/t2s_onnx_keys.txt index 0a0b35764b2f696e8b72b6c68809ac463d55987a..0ff9ff6adde7fca9248e2d33de4ddaa32727690b 100644 --- a/genie_tts/Data/v2/Keys/t2s_onnx_keys.txt +++ b/genie_tts/Data/v2/Keys/t2s_onnx_keys.txt @@ -1,291 +1,291 @@ -ar_audio_embedding.word_embeddings.weight -ar_audio_position.alpha -transformer_encoder.layers.0.self_attn.in_proj_weight -transformer_encoder.layers.0.self_attn.in_proj_bias -transformer_encoder.layers.0.self_attn.out_proj.weight -transformer_encoder.layers.0.self_attn.out_proj.bias 
-transformer_encoder.layers.0.linear1.weight -transformer_encoder.layers.0.linear1.bias -transformer_encoder.layers.0.linear2.weight -transformer_encoder.layers.0.linear2.bias -transformer_encoder.layers.0.norm1.weight -transformer_encoder.layers.0.norm1.bias -transformer_encoder.layers.0.norm2.weight -transformer_encoder.layers.0.norm2.bias -transformer_encoder.layers.1.self_attn.in_proj_weight -transformer_encoder.layers.1.self_attn.in_proj_bias -transformer_encoder.layers.1.self_attn.out_proj.weight -transformer_encoder.layers.1.self_attn.out_proj.bias -transformer_encoder.layers.1.linear1.weight -transformer_encoder.layers.1.linear1.bias -transformer_encoder.layers.1.linear2.weight -transformer_encoder.layers.1.linear2.bias -transformer_encoder.layers.1.norm1.weight -transformer_encoder.layers.1.norm1.bias -transformer_encoder.layers.1.norm2.weight -transformer_encoder.layers.1.norm2.bias -transformer_encoder.layers.2.self_attn.in_proj_weight -transformer_encoder.layers.2.self_attn.in_proj_bias -transformer_encoder.layers.2.self_attn.out_proj.weight -transformer_encoder.layers.2.self_attn.out_proj.bias -transformer_encoder.layers.2.linear1.weight -transformer_encoder.layers.2.linear1.bias -transformer_encoder.layers.2.linear2.weight -transformer_encoder.layers.2.linear2.bias -transformer_encoder.layers.2.norm1.weight -transformer_encoder.layers.2.norm1.bias -transformer_encoder.layers.2.norm2.weight -transformer_encoder.layers.2.norm2.bias -transformer_encoder.layers.3.self_attn.in_proj_weight -transformer_encoder.layers.3.self_attn.in_proj_bias -transformer_encoder.layers.3.self_attn.out_proj.weight -transformer_encoder.layers.3.self_attn.out_proj.bias -transformer_encoder.layers.3.linear1.weight -transformer_encoder.layers.3.linear1.bias -transformer_encoder.layers.3.linear2.weight -transformer_encoder.layers.3.linear2.bias -transformer_encoder.layers.3.norm1.weight -transformer_encoder.layers.3.norm1.bias -transformer_encoder.layers.3.norm2.weight -transformer_encoder.layers.3.norm2.bias -transformer_encoder.layers.4.self_attn.in_proj_weight -transformer_encoder.layers.4.self_attn.in_proj_bias -transformer_encoder.layers.4.self_attn.out_proj.weight -transformer_encoder.layers.4.self_attn.out_proj.bias -transformer_encoder.layers.4.linear1.weight -transformer_encoder.layers.4.linear1.bias -transformer_encoder.layers.4.linear2.weight -transformer_encoder.layers.4.linear2.bias -transformer_encoder.layers.4.norm1.weight -transformer_encoder.layers.4.norm1.bias -transformer_encoder.layers.4.norm2.weight -transformer_encoder.layers.4.norm2.bias -transformer_encoder.layers.5.self_attn.in_proj_weight -transformer_encoder.layers.5.self_attn.in_proj_bias -transformer_encoder.layers.5.self_attn.out_proj.weight -transformer_encoder.layers.5.self_attn.out_proj.bias -transformer_encoder.layers.5.linear1.weight -transformer_encoder.layers.5.linear1.bias -transformer_encoder.layers.5.linear2.weight -transformer_encoder.layers.5.linear2.bias -transformer_encoder.layers.5.norm1.weight -transformer_encoder.layers.5.norm1.bias -transformer_encoder.layers.5.norm2.weight -transformer_encoder.layers.5.norm2.bias -transformer_encoder.layers.6.self_attn.in_proj_weight -transformer_encoder.layers.6.self_attn.in_proj_bias -transformer_encoder.layers.6.self_attn.out_proj.weight -transformer_encoder.layers.6.self_attn.out_proj.bias -transformer_encoder.layers.6.linear1.weight -transformer_encoder.layers.6.linear1.bias -transformer_encoder.layers.6.linear2.weight -transformer_encoder.layers.6.linear2.bias 
-transformer_encoder.layers.6.norm1.weight -transformer_encoder.layers.6.norm1.bias -transformer_encoder.layers.6.norm2.weight -transformer_encoder.layers.6.norm2.bias -transformer_encoder.layers.7.self_attn.in_proj_weight -transformer_encoder.layers.7.self_attn.in_proj_bias -transformer_encoder.layers.7.self_attn.out_proj.weight -transformer_encoder.layers.7.self_attn.out_proj.bias -transformer_encoder.layers.7.linear1.weight -transformer_encoder.layers.7.linear1.bias -transformer_encoder.layers.7.linear2.weight -transformer_encoder.layers.7.linear2.bias -transformer_encoder.layers.7.norm1.weight -transformer_encoder.layers.7.norm1.bias -transformer_encoder.layers.7.norm2.weight -transformer_encoder.layers.7.norm2.bias -transformer_encoder.layers.8.self_attn.in_proj_weight -transformer_encoder.layers.8.self_attn.in_proj_bias -transformer_encoder.layers.8.self_attn.out_proj.weight -transformer_encoder.layers.8.self_attn.out_proj.bias -transformer_encoder.layers.8.linear1.weight -transformer_encoder.layers.8.linear1.bias -transformer_encoder.layers.8.linear2.weight -transformer_encoder.layers.8.linear2.bias -transformer_encoder.layers.8.norm1.weight -transformer_encoder.layers.8.norm1.bias -transformer_encoder.layers.8.norm2.weight -transformer_encoder.layers.8.norm2.bias -transformer_encoder.layers.9.self_attn.in_proj_weight -transformer_encoder.layers.9.self_attn.in_proj_bias -transformer_encoder.layers.9.self_attn.out_proj.weight -transformer_encoder.layers.9.self_attn.out_proj.bias -transformer_encoder.layers.9.linear1.weight -transformer_encoder.layers.9.linear1.bias -transformer_encoder.layers.9.linear2.weight -transformer_encoder.layers.9.linear2.bias -transformer_encoder.layers.9.norm1.weight -transformer_encoder.layers.9.norm1.bias -transformer_encoder.layers.9.norm2.weight -transformer_encoder.layers.9.norm2.bias -transformer_encoder.layers.10.self_attn.in_proj_weight -transformer_encoder.layers.10.self_attn.in_proj_bias -transformer_encoder.layers.10.self_attn.out_proj.weight -transformer_encoder.layers.10.self_attn.out_proj.bias -transformer_encoder.layers.10.linear1.weight -transformer_encoder.layers.10.linear1.bias -transformer_encoder.layers.10.linear2.weight -transformer_encoder.layers.10.linear2.bias -transformer_encoder.layers.10.norm1.weight -transformer_encoder.layers.10.norm1.bias -transformer_encoder.layers.10.norm2.weight -transformer_encoder.layers.10.norm2.bias -transformer_encoder.layers.11.self_attn.in_proj_weight -transformer_encoder.layers.11.self_attn.in_proj_bias -transformer_encoder.layers.11.self_attn.out_proj.weight -transformer_encoder.layers.11.self_attn.out_proj.bias -transformer_encoder.layers.11.linear1.weight -transformer_encoder.layers.11.linear1.bias -transformer_encoder.layers.11.linear2.weight -transformer_encoder.layers.11.linear2.bias -transformer_encoder.layers.11.norm1.weight -transformer_encoder.layers.11.norm1.bias -transformer_encoder.layers.11.norm2.weight -transformer_encoder.layers.11.norm2.bias -transformer_encoder.layers.12.self_attn.in_proj_weight -transformer_encoder.layers.12.self_attn.in_proj_bias -transformer_encoder.layers.12.self_attn.out_proj.weight -transformer_encoder.layers.12.self_attn.out_proj.bias -transformer_encoder.layers.12.linear1.weight -transformer_encoder.layers.12.linear1.bias -transformer_encoder.layers.12.linear2.weight -transformer_encoder.layers.12.linear2.bias -transformer_encoder.layers.12.norm1.weight -transformer_encoder.layers.12.norm1.bias -transformer_encoder.layers.12.norm2.weight 
-transformer_encoder.layers.12.norm2.bias -transformer_encoder.layers.13.self_attn.in_proj_weight -transformer_encoder.layers.13.self_attn.in_proj_bias -transformer_encoder.layers.13.self_attn.out_proj.weight -transformer_encoder.layers.13.self_attn.out_proj.bias -transformer_encoder.layers.13.linear1.weight -transformer_encoder.layers.13.linear1.bias -transformer_encoder.layers.13.linear2.weight -transformer_encoder.layers.13.linear2.bias -transformer_encoder.layers.13.norm1.weight -transformer_encoder.layers.13.norm1.bias -transformer_encoder.layers.13.norm2.weight -transformer_encoder.layers.13.norm2.bias -transformer_encoder.layers.14.self_attn.in_proj_weight -transformer_encoder.layers.14.self_attn.in_proj_bias -transformer_encoder.layers.14.self_attn.out_proj.weight -transformer_encoder.layers.14.self_attn.out_proj.bias -transformer_encoder.layers.14.linear1.weight -transformer_encoder.layers.14.linear1.bias -transformer_encoder.layers.14.linear2.weight -transformer_encoder.layers.14.linear2.bias -transformer_encoder.layers.14.norm1.weight -transformer_encoder.layers.14.norm1.bias -transformer_encoder.layers.14.norm2.weight -transformer_encoder.layers.14.norm2.bias -transformer_encoder.layers.15.self_attn.in_proj_weight -transformer_encoder.layers.15.self_attn.in_proj_bias -transformer_encoder.layers.15.self_attn.out_proj.weight -transformer_encoder.layers.15.self_attn.out_proj.bias -transformer_encoder.layers.15.linear1.weight -transformer_encoder.layers.15.linear1.bias -transformer_encoder.layers.15.linear2.weight -transformer_encoder.layers.15.linear2.bias -transformer_encoder.layers.15.norm1.weight -transformer_encoder.layers.15.norm1.bias -transformer_encoder.layers.15.norm2.weight -transformer_encoder.layers.15.norm2.bias -transformer_encoder.layers.16.self_attn.in_proj_weight -transformer_encoder.layers.16.self_attn.in_proj_bias -transformer_encoder.layers.16.self_attn.out_proj.weight -transformer_encoder.layers.16.self_attn.out_proj.bias -transformer_encoder.layers.16.linear1.weight -transformer_encoder.layers.16.linear1.bias -transformer_encoder.layers.16.linear2.weight -transformer_encoder.layers.16.linear2.bias -transformer_encoder.layers.16.norm1.weight -transformer_encoder.layers.16.norm1.bias -transformer_encoder.layers.16.norm2.weight -transformer_encoder.layers.16.norm2.bias -transformer_encoder.layers.17.self_attn.in_proj_weight -transformer_encoder.layers.17.self_attn.in_proj_bias -transformer_encoder.layers.17.self_attn.out_proj.weight -transformer_encoder.layers.17.self_attn.out_proj.bias -transformer_encoder.layers.17.linear1.weight -transformer_encoder.layers.17.linear1.bias -transformer_encoder.layers.17.linear2.weight -transformer_encoder.layers.17.linear2.bias -transformer_encoder.layers.17.norm1.weight -transformer_encoder.layers.17.norm1.bias -transformer_encoder.layers.17.norm2.weight -transformer_encoder.layers.17.norm2.bias -transformer_encoder.layers.18.self_attn.in_proj_weight -transformer_encoder.layers.18.self_attn.in_proj_bias -transformer_encoder.layers.18.self_attn.out_proj.weight -transformer_encoder.layers.18.self_attn.out_proj.bias -transformer_encoder.layers.18.linear1.weight -transformer_encoder.layers.18.linear1.bias -transformer_encoder.layers.18.linear2.weight -transformer_encoder.layers.18.linear2.bias -transformer_encoder.layers.18.norm1.weight -transformer_encoder.layers.18.norm1.bias -transformer_encoder.layers.18.norm2.weight -transformer_encoder.layers.18.norm2.bias -transformer_encoder.layers.19.self_attn.in_proj_weight 
-transformer_encoder.layers.19.self_attn.in_proj_bias -transformer_encoder.layers.19.self_attn.out_proj.weight -transformer_encoder.layers.19.self_attn.out_proj.bias -transformer_encoder.layers.19.linear1.weight -transformer_encoder.layers.19.linear1.bias -transformer_encoder.layers.19.linear2.weight -transformer_encoder.layers.19.linear2.bias -transformer_encoder.layers.19.norm1.weight -transformer_encoder.layers.19.norm1.bias -transformer_encoder.layers.19.norm2.weight -transformer_encoder.layers.19.norm2.bias -transformer_encoder.layers.20.self_attn.in_proj_weight -transformer_encoder.layers.20.self_attn.in_proj_bias -transformer_encoder.layers.20.self_attn.out_proj.weight -transformer_encoder.layers.20.self_attn.out_proj.bias -transformer_encoder.layers.20.linear1.weight -transformer_encoder.layers.20.linear1.bias -transformer_encoder.layers.20.linear2.weight -transformer_encoder.layers.20.linear2.bias -transformer_encoder.layers.20.norm1.weight -transformer_encoder.layers.20.norm1.bias -transformer_encoder.layers.20.norm2.weight -transformer_encoder.layers.20.norm2.bias -transformer_encoder.layers.21.self_attn.in_proj_weight -transformer_encoder.layers.21.self_attn.in_proj_bias -transformer_encoder.layers.21.self_attn.out_proj.weight -transformer_encoder.layers.21.self_attn.out_proj.bias -transformer_encoder.layers.21.linear1.weight -transformer_encoder.layers.21.linear1.bias -transformer_encoder.layers.21.linear2.weight -transformer_encoder.layers.21.linear2.bias -transformer_encoder.layers.21.norm1.weight -transformer_encoder.layers.21.norm1.bias -transformer_encoder.layers.21.norm2.weight -transformer_encoder.layers.21.norm2.bias -transformer_encoder.layers.22.self_attn.in_proj_weight -transformer_encoder.layers.22.self_attn.in_proj_bias -transformer_encoder.layers.22.self_attn.out_proj.weight -transformer_encoder.layers.22.self_attn.out_proj.bias -transformer_encoder.layers.22.linear1.weight -transformer_encoder.layers.22.linear1.bias -transformer_encoder.layers.22.linear2.weight -transformer_encoder.layers.22.linear2.bias -transformer_encoder.layers.22.norm1.weight -transformer_encoder.layers.22.norm1.bias -transformer_encoder.layers.22.norm2.weight -transformer_encoder.layers.22.norm2.bias -transformer_encoder.layers.23.self_attn.in_proj_weight -transformer_encoder.layers.23.self_attn.in_proj_bias -transformer_encoder.layers.23.self_attn.out_proj.weight -transformer_encoder.layers.23.self_attn.out_proj.bias -transformer_encoder.layers.23.linear1.weight -transformer_encoder.layers.23.linear1.bias -transformer_encoder.layers.23.linear2.weight -transformer_encoder.layers.23.linear2.bias -transformer_encoder.layers.23.norm1.weight -transformer_encoder.layers.23.norm1.bias -transformer_encoder.layers.23.norm2.weight -transformer_encoder.layers.23.norm2.bias -ar_predict_layer.weight +ar_audio_embedding.word_embeddings.weight +ar_audio_position.alpha +transformer_encoder.layers.0.self_attn.in_proj_weight +transformer_encoder.layers.0.self_attn.in_proj_bias +transformer_encoder.layers.0.self_attn.out_proj.weight +transformer_encoder.layers.0.self_attn.out_proj.bias +transformer_encoder.layers.0.linear1.weight +transformer_encoder.layers.0.linear1.bias +transformer_encoder.layers.0.linear2.weight +transformer_encoder.layers.0.linear2.bias +transformer_encoder.layers.0.norm1.weight +transformer_encoder.layers.0.norm1.bias +transformer_encoder.layers.0.norm2.weight +transformer_encoder.layers.0.norm2.bias +transformer_encoder.layers.1.self_attn.in_proj_weight 
+transformer_encoder.layers.1.self_attn.in_proj_bias +transformer_encoder.layers.1.self_attn.out_proj.weight +transformer_encoder.layers.1.self_attn.out_proj.bias +transformer_encoder.layers.1.linear1.weight +transformer_encoder.layers.1.linear1.bias +transformer_encoder.layers.1.linear2.weight +transformer_encoder.layers.1.linear2.bias +transformer_encoder.layers.1.norm1.weight +transformer_encoder.layers.1.norm1.bias +transformer_encoder.layers.1.norm2.weight +transformer_encoder.layers.1.norm2.bias +transformer_encoder.layers.2.self_attn.in_proj_weight +transformer_encoder.layers.2.self_attn.in_proj_bias +transformer_encoder.layers.2.self_attn.out_proj.weight +transformer_encoder.layers.2.self_attn.out_proj.bias +transformer_encoder.layers.2.linear1.weight +transformer_encoder.layers.2.linear1.bias +transformer_encoder.layers.2.linear2.weight +transformer_encoder.layers.2.linear2.bias +transformer_encoder.layers.2.norm1.weight +transformer_encoder.layers.2.norm1.bias +transformer_encoder.layers.2.norm2.weight +transformer_encoder.layers.2.norm2.bias +transformer_encoder.layers.3.self_attn.in_proj_weight +transformer_encoder.layers.3.self_attn.in_proj_bias +transformer_encoder.layers.3.self_attn.out_proj.weight +transformer_encoder.layers.3.self_attn.out_proj.bias +transformer_encoder.layers.3.linear1.weight +transformer_encoder.layers.3.linear1.bias +transformer_encoder.layers.3.linear2.weight +transformer_encoder.layers.3.linear2.bias +transformer_encoder.layers.3.norm1.weight +transformer_encoder.layers.3.norm1.bias +transformer_encoder.layers.3.norm2.weight +transformer_encoder.layers.3.norm2.bias +transformer_encoder.layers.4.self_attn.in_proj_weight +transformer_encoder.layers.4.self_attn.in_proj_bias +transformer_encoder.layers.4.self_attn.out_proj.weight +transformer_encoder.layers.4.self_attn.out_proj.bias +transformer_encoder.layers.4.linear1.weight +transformer_encoder.layers.4.linear1.bias +transformer_encoder.layers.4.linear2.weight +transformer_encoder.layers.4.linear2.bias +transformer_encoder.layers.4.norm1.weight +transformer_encoder.layers.4.norm1.bias +transformer_encoder.layers.4.norm2.weight +transformer_encoder.layers.4.norm2.bias +transformer_encoder.layers.5.self_attn.in_proj_weight +transformer_encoder.layers.5.self_attn.in_proj_bias +transformer_encoder.layers.5.self_attn.out_proj.weight +transformer_encoder.layers.5.self_attn.out_proj.bias +transformer_encoder.layers.5.linear1.weight +transformer_encoder.layers.5.linear1.bias +transformer_encoder.layers.5.linear2.weight +transformer_encoder.layers.5.linear2.bias +transformer_encoder.layers.5.norm1.weight +transformer_encoder.layers.5.norm1.bias +transformer_encoder.layers.5.norm2.weight +transformer_encoder.layers.5.norm2.bias +transformer_encoder.layers.6.self_attn.in_proj_weight +transformer_encoder.layers.6.self_attn.in_proj_bias +transformer_encoder.layers.6.self_attn.out_proj.weight +transformer_encoder.layers.6.self_attn.out_proj.bias +transformer_encoder.layers.6.linear1.weight +transformer_encoder.layers.6.linear1.bias +transformer_encoder.layers.6.linear2.weight +transformer_encoder.layers.6.linear2.bias +transformer_encoder.layers.6.norm1.weight +transformer_encoder.layers.6.norm1.bias +transformer_encoder.layers.6.norm2.weight +transformer_encoder.layers.6.norm2.bias +transformer_encoder.layers.7.self_attn.in_proj_weight +transformer_encoder.layers.7.self_attn.in_proj_bias +transformer_encoder.layers.7.self_attn.out_proj.weight +transformer_encoder.layers.7.self_attn.out_proj.bias 
+transformer_encoder.layers.7.linear1.weight +transformer_encoder.layers.7.linear1.bias +transformer_encoder.layers.7.linear2.weight +transformer_encoder.layers.7.linear2.bias +transformer_encoder.layers.7.norm1.weight +transformer_encoder.layers.7.norm1.bias +transformer_encoder.layers.7.norm2.weight +transformer_encoder.layers.7.norm2.bias +transformer_encoder.layers.8.self_attn.in_proj_weight +transformer_encoder.layers.8.self_attn.in_proj_bias +transformer_encoder.layers.8.self_attn.out_proj.weight +transformer_encoder.layers.8.self_attn.out_proj.bias +transformer_encoder.layers.8.linear1.weight +transformer_encoder.layers.8.linear1.bias +transformer_encoder.layers.8.linear2.weight +transformer_encoder.layers.8.linear2.bias +transformer_encoder.layers.8.norm1.weight +transformer_encoder.layers.8.norm1.bias +transformer_encoder.layers.8.norm2.weight +transformer_encoder.layers.8.norm2.bias +transformer_encoder.layers.9.self_attn.in_proj_weight +transformer_encoder.layers.9.self_attn.in_proj_bias +transformer_encoder.layers.9.self_attn.out_proj.weight +transformer_encoder.layers.9.self_attn.out_proj.bias +transformer_encoder.layers.9.linear1.weight +transformer_encoder.layers.9.linear1.bias +transformer_encoder.layers.9.linear2.weight +transformer_encoder.layers.9.linear2.bias +transformer_encoder.layers.9.norm1.weight +transformer_encoder.layers.9.norm1.bias +transformer_encoder.layers.9.norm2.weight +transformer_encoder.layers.9.norm2.bias +transformer_encoder.layers.10.self_attn.in_proj_weight +transformer_encoder.layers.10.self_attn.in_proj_bias +transformer_encoder.layers.10.self_attn.out_proj.weight +transformer_encoder.layers.10.self_attn.out_proj.bias +transformer_encoder.layers.10.linear1.weight +transformer_encoder.layers.10.linear1.bias +transformer_encoder.layers.10.linear2.weight +transformer_encoder.layers.10.linear2.bias +transformer_encoder.layers.10.norm1.weight +transformer_encoder.layers.10.norm1.bias +transformer_encoder.layers.10.norm2.weight +transformer_encoder.layers.10.norm2.bias +transformer_encoder.layers.11.self_attn.in_proj_weight +transformer_encoder.layers.11.self_attn.in_proj_bias +transformer_encoder.layers.11.self_attn.out_proj.weight +transformer_encoder.layers.11.self_attn.out_proj.bias +transformer_encoder.layers.11.linear1.weight +transformer_encoder.layers.11.linear1.bias +transformer_encoder.layers.11.linear2.weight +transformer_encoder.layers.11.linear2.bias +transformer_encoder.layers.11.norm1.weight +transformer_encoder.layers.11.norm1.bias +transformer_encoder.layers.11.norm2.weight +transformer_encoder.layers.11.norm2.bias +transformer_encoder.layers.12.self_attn.in_proj_weight +transformer_encoder.layers.12.self_attn.in_proj_bias +transformer_encoder.layers.12.self_attn.out_proj.weight +transformer_encoder.layers.12.self_attn.out_proj.bias +transformer_encoder.layers.12.linear1.weight +transformer_encoder.layers.12.linear1.bias +transformer_encoder.layers.12.linear2.weight +transformer_encoder.layers.12.linear2.bias +transformer_encoder.layers.12.norm1.weight +transformer_encoder.layers.12.norm1.bias +transformer_encoder.layers.12.norm2.weight +transformer_encoder.layers.12.norm2.bias +transformer_encoder.layers.13.self_attn.in_proj_weight +transformer_encoder.layers.13.self_attn.in_proj_bias +transformer_encoder.layers.13.self_attn.out_proj.weight +transformer_encoder.layers.13.self_attn.out_proj.bias +transformer_encoder.layers.13.linear1.weight +transformer_encoder.layers.13.linear1.bias +transformer_encoder.layers.13.linear2.weight 
+transformer_encoder.layers.13.linear2.bias +transformer_encoder.layers.13.norm1.weight +transformer_encoder.layers.13.norm1.bias +transformer_encoder.layers.13.norm2.weight +transformer_encoder.layers.13.norm2.bias +transformer_encoder.layers.14.self_attn.in_proj_weight +transformer_encoder.layers.14.self_attn.in_proj_bias +transformer_encoder.layers.14.self_attn.out_proj.weight +transformer_encoder.layers.14.self_attn.out_proj.bias +transformer_encoder.layers.14.linear1.weight +transformer_encoder.layers.14.linear1.bias +transformer_encoder.layers.14.linear2.weight +transformer_encoder.layers.14.linear2.bias +transformer_encoder.layers.14.norm1.weight +transformer_encoder.layers.14.norm1.bias +transformer_encoder.layers.14.norm2.weight +transformer_encoder.layers.14.norm2.bias +transformer_encoder.layers.15.self_attn.in_proj_weight +transformer_encoder.layers.15.self_attn.in_proj_bias +transformer_encoder.layers.15.self_attn.out_proj.weight +transformer_encoder.layers.15.self_attn.out_proj.bias +transformer_encoder.layers.15.linear1.weight +transformer_encoder.layers.15.linear1.bias +transformer_encoder.layers.15.linear2.weight +transformer_encoder.layers.15.linear2.bias +transformer_encoder.layers.15.norm1.weight +transformer_encoder.layers.15.norm1.bias +transformer_encoder.layers.15.norm2.weight +transformer_encoder.layers.15.norm2.bias +transformer_encoder.layers.16.self_attn.in_proj_weight +transformer_encoder.layers.16.self_attn.in_proj_bias +transformer_encoder.layers.16.self_attn.out_proj.weight +transformer_encoder.layers.16.self_attn.out_proj.bias +transformer_encoder.layers.16.linear1.weight +transformer_encoder.layers.16.linear1.bias +transformer_encoder.layers.16.linear2.weight +transformer_encoder.layers.16.linear2.bias +transformer_encoder.layers.16.norm1.weight +transformer_encoder.layers.16.norm1.bias +transformer_encoder.layers.16.norm2.weight +transformer_encoder.layers.16.norm2.bias +transformer_encoder.layers.17.self_attn.in_proj_weight +transformer_encoder.layers.17.self_attn.in_proj_bias +transformer_encoder.layers.17.self_attn.out_proj.weight +transformer_encoder.layers.17.self_attn.out_proj.bias +transformer_encoder.layers.17.linear1.weight +transformer_encoder.layers.17.linear1.bias +transformer_encoder.layers.17.linear2.weight +transformer_encoder.layers.17.linear2.bias +transformer_encoder.layers.17.norm1.weight +transformer_encoder.layers.17.norm1.bias +transformer_encoder.layers.17.norm2.weight +transformer_encoder.layers.17.norm2.bias +transformer_encoder.layers.18.self_attn.in_proj_weight +transformer_encoder.layers.18.self_attn.in_proj_bias +transformer_encoder.layers.18.self_attn.out_proj.weight +transformer_encoder.layers.18.self_attn.out_proj.bias +transformer_encoder.layers.18.linear1.weight +transformer_encoder.layers.18.linear1.bias +transformer_encoder.layers.18.linear2.weight +transformer_encoder.layers.18.linear2.bias +transformer_encoder.layers.18.norm1.weight +transformer_encoder.layers.18.norm1.bias +transformer_encoder.layers.18.norm2.weight +transformer_encoder.layers.18.norm2.bias +transformer_encoder.layers.19.self_attn.in_proj_weight +transformer_encoder.layers.19.self_attn.in_proj_bias +transformer_encoder.layers.19.self_attn.out_proj.weight +transformer_encoder.layers.19.self_attn.out_proj.bias +transformer_encoder.layers.19.linear1.weight +transformer_encoder.layers.19.linear1.bias +transformer_encoder.layers.19.linear2.weight +transformer_encoder.layers.19.linear2.bias +transformer_encoder.layers.19.norm1.weight 
+transformer_encoder.layers.19.norm1.bias +transformer_encoder.layers.19.norm2.weight +transformer_encoder.layers.19.norm2.bias +transformer_encoder.layers.20.self_attn.in_proj_weight +transformer_encoder.layers.20.self_attn.in_proj_bias +transformer_encoder.layers.20.self_attn.out_proj.weight +transformer_encoder.layers.20.self_attn.out_proj.bias +transformer_encoder.layers.20.linear1.weight +transformer_encoder.layers.20.linear1.bias +transformer_encoder.layers.20.linear2.weight +transformer_encoder.layers.20.linear2.bias +transformer_encoder.layers.20.norm1.weight +transformer_encoder.layers.20.norm1.bias +transformer_encoder.layers.20.norm2.weight +transformer_encoder.layers.20.norm2.bias +transformer_encoder.layers.21.self_attn.in_proj_weight +transformer_encoder.layers.21.self_attn.in_proj_bias +transformer_encoder.layers.21.self_attn.out_proj.weight +transformer_encoder.layers.21.self_attn.out_proj.bias +transformer_encoder.layers.21.linear1.weight +transformer_encoder.layers.21.linear1.bias +transformer_encoder.layers.21.linear2.weight +transformer_encoder.layers.21.linear2.bias +transformer_encoder.layers.21.norm1.weight +transformer_encoder.layers.21.norm1.bias +transformer_encoder.layers.21.norm2.weight +transformer_encoder.layers.21.norm2.bias +transformer_encoder.layers.22.self_attn.in_proj_weight +transformer_encoder.layers.22.self_attn.in_proj_bias +transformer_encoder.layers.22.self_attn.out_proj.weight +transformer_encoder.layers.22.self_attn.out_proj.bias +transformer_encoder.layers.22.linear1.weight +transformer_encoder.layers.22.linear1.bias +transformer_encoder.layers.22.linear2.weight +transformer_encoder.layers.22.linear2.bias +transformer_encoder.layers.22.norm1.weight +transformer_encoder.layers.22.norm1.bias +transformer_encoder.layers.22.norm2.weight +transformer_encoder.layers.22.norm2.bias +transformer_encoder.layers.23.self_attn.in_proj_weight +transformer_encoder.layers.23.self_attn.in_proj_bias +transformer_encoder.layers.23.self_attn.out_proj.weight +transformer_encoder.layers.23.self_attn.out_proj.bias +transformer_encoder.layers.23.linear1.weight +transformer_encoder.layers.23.linear1.bias +transformer_encoder.layers.23.linear2.weight +transformer_encoder.layers.23.linear2.bias +transformer_encoder.layers.23.norm1.weight +transformer_encoder.layers.23.norm1.bias +transformer_encoder.layers.23.norm2.weight +transformer_encoder.layers.23.norm2.bias +ar_predict_layer.weight diff --git a/genie_tts/Data/v2/Keys/vits_onnx_keys.txt b/genie_tts/Data/v2/Keys/vits_onnx_keys.txt index d4f0b4bbe0d8fd259d320e2f2c7967e0a92660a1..cf2afc6423f421fcbacf781f865cd2133b4cd930 100644 --- a/genie_tts/Data/v2/Keys/vits_onnx_keys.txt +++ b/genie_tts/Data/v2/Keys/vits_onnx_keys.txt @@ -1,668 +1,668 @@ -vq_model.dec.cond.bias -vq_model.dec.cond.weight -vq_model.dec.conv_post.weight -vq_model.dec.conv_pre.bias -vq_model.dec.conv_pre.weight -vq_model.dec.resblocks.0.convs1.0.bias -vq_model.dec.resblocks.0.convs1.0.weight_g -vq_model.dec.resblocks.0.convs1.0.weight_v -vq_model.dec.resblocks.0.convs1.1.bias -vq_model.dec.resblocks.0.convs1.1.weight_g -vq_model.dec.resblocks.0.convs1.1.weight_v -vq_model.dec.resblocks.0.convs1.2.bias -vq_model.dec.resblocks.0.convs1.2.weight_g -vq_model.dec.resblocks.0.convs1.2.weight_v -vq_model.dec.resblocks.0.convs2.0.bias -vq_model.dec.resblocks.0.convs2.0.weight_g -vq_model.dec.resblocks.0.convs2.0.weight_v -vq_model.dec.resblocks.0.convs2.1.bias -vq_model.dec.resblocks.0.convs2.1.weight_g -vq_model.dec.resblocks.0.convs2.1.weight_v 
-vq_model.dec.resblocks.0.convs2.2.bias -vq_model.dec.resblocks.0.convs2.2.weight_g -vq_model.dec.resblocks.0.convs2.2.weight_v -vq_model.dec.resblocks.1.convs1.0.bias -vq_model.dec.resblocks.1.convs1.0.weight_g -vq_model.dec.resblocks.1.convs1.0.weight_v -vq_model.dec.resblocks.1.convs1.1.bias -vq_model.dec.resblocks.1.convs1.1.weight_g -vq_model.dec.resblocks.1.convs1.1.weight_v -vq_model.dec.resblocks.1.convs1.2.bias -vq_model.dec.resblocks.1.convs1.2.weight_g -vq_model.dec.resblocks.1.convs1.2.weight_v -vq_model.dec.resblocks.1.convs2.0.bias -vq_model.dec.resblocks.1.convs2.0.weight_g -vq_model.dec.resblocks.1.convs2.0.weight_v -vq_model.dec.resblocks.1.convs2.1.bias -vq_model.dec.resblocks.1.convs2.1.weight_g -vq_model.dec.resblocks.1.convs2.1.weight_v -vq_model.dec.resblocks.1.convs2.2.bias -vq_model.dec.resblocks.1.convs2.2.weight_g -vq_model.dec.resblocks.1.convs2.2.weight_v -vq_model.dec.resblocks.10.convs1.0.bias -vq_model.dec.resblocks.10.convs1.0.weight_g -vq_model.dec.resblocks.10.convs1.0.weight_v -vq_model.dec.resblocks.10.convs1.1.bias -vq_model.dec.resblocks.10.convs1.1.weight_g -vq_model.dec.resblocks.10.convs1.1.weight_v -vq_model.dec.resblocks.10.convs1.2.bias -vq_model.dec.resblocks.10.convs1.2.weight_g -vq_model.dec.resblocks.10.convs1.2.weight_v -vq_model.dec.resblocks.10.convs2.0.bias -vq_model.dec.resblocks.10.convs2.0.weight_g -vq_model.dec.resblocks.10.convs2.0.weight_v -vq_model.dec.resblocks.10.convs2.1.bias -vq_model.dec.resblocks.10.convs2.1.weight_g -vq_model.dec.resblocks.10.convs2.1.weight_v -vq_model.dec.resblocks.10.convs2.2.bias -vq_model.dec.resblocks.10.convs2.2.weight_g -vq_model.dec.resblocks.10.convs2.2.weight_v -vq_model.dec.resblocks.11.convs1.0.bias -vq_model.dec.resblocks.11.convs1.0.weight_g -vq_model.dec.resblocks.11.convs1.0.weight_v -vq_model.dec.resblocks.11.convs1.1.bias -vq_model.dec.resblocks.11.convs1.1.weight_g -vq_model.dec.resblocks.11.convs1.1.weight_v -vq_model.dec.resblocks.11.convs1.2.bias -vq_model.dec.resblocks.11.convs1.2.weight_g -vq_model.dec.resblocks.11.convs1.2.weight_v -vq_model.dec.resblocks.11.convs2.0.bias -vq_model.dec.resblocks.11.convs2.0.weight_g -vq_model.dec.resblocks.11.convs2.0.weight_v -vq_model.dec.resblocks.11.convs2.1.bias -vq_model.dec.resblocks.11.convs2.1.weight_g -vq_model.dec.resblocks.11.convs2.1.weight_v -vq_model.dec.resblocks.11.convs2.2.bias -vq_model.dec.resblocks.11.convs2.2.weight_g -vq_model.dec.resblocks.11.convs2.2.weight_v -vq_model.dec.resblocks.12.convs1.0.bias -vq_model.dec.resblocks.12.convs1.0.weight_g -vq_model.dec.resblocks.12.convs1.0.weight_v -vq_model.dec.resblocks.12.convs1.1.bias -vq_model.dec.resblocks.12.convs1.1.weight_g -vq_model.dec.resblocks.12.convs1.1.weight_v -vq_model.dec.resblocks.12.convs1.2.bias -vq_model.dec.resblocks.12.convs1.2.weight_g -vq_model.dec.resblocks.12.convs1.2.weight_v -vq_model.dec.resblocks.12.convs2.0.bias -vq_model.dec.resblocks.12.convs2.0.weight_g -vq_model.dec.resblocks.12.convs2.0.weight_v -vq_model.dec.resblocks.12.convs2.1.bias -vq_model.dec.resblocks.12.convs2.1.weight_g -vq_model.dec.resblocks.12.convs2.1.weight_v -vq_model.dec.resblocks.12.convs2.2.bias -vq_model.dec.resblocks.12.convs2.2.weight_g -vq_model.dec.resblocks.12.convs2.2.weight_v -vq_model.dec.resblocks.13.convs1.0.bias -vq_model.dec.resblocks.13.convs1.0.weight_g -vq_model.dec.resblocks.13.convs1.0.weight_v -vq_model.dec.resblocks.13.convs1.1.bias -vq_model.dec.resblocks.13.convs1.1.weight_g -vq_model.dec.resblocks.13.convs1.1.weight_v 
-vq_model.dec.resblocks.13.convs1.2.bias -vq_model.dec.resblocks.13.convs1.2.weight_g -vq_model.dec.resblocks.13.convs1.2.weight_v -vq_model.dec.resblocks.13.convs2.0.bias -vq_model.dec.resblocks.13.convs2.0.weight_g -vq_model.dec.resblocks.13.convs2.0.weight_v -vq_model.dec.resblocks.13.convs2.1.bias -vq_model.dec.resblocks.13.convs2.1.weight_g -vq_model.dec.resblocks.13.convs2.1.weight_v -vq_model.dec.resblocks.13.convs2.2.bias -vq_model.dec.resblocks.13.convs2.2.weight_g -vq_model.dec.resblocks.13.convs2.2.weight_v -vq_model.dec.resblocks.14.convs1.0.bias -vq_model.dec.resblocks.14.convs1.0.weight_g -vq_model.dec.resblocks.14.convs1.0.weight_v -vq_model.dec.resblocks.14.convs1.1.bias -vq_model.dec.resblocks.14.convs1.1.weight_g -vq_model.dec.resblocks.14.convs1.1.weight_v -vq_model.dec.resblocks.14.convs1.2.bias -vq_model.dec.resblocks.14.convs1.2.weight_g -vq_model.dec.resblocks.14.convs1.2.weight_v -vq_model.dec.resblocks.14.convs2.0.bias -vq_model.dec.resblocks.14.convs2.0.weight_g -vq_model.dec.resblocks.14.convs2.0.weight_v -vq_model.dec.resblocks.14.convs2.1.bias -vq_model.dec.resblocks.14.convs2.1.weight_g -vq_model.dec.resblocks.14.convs2.1.weight_v -vq_model.dec.resblocks.14.convs2.2.bias -vq_model.dec.resblocks.14.convs2.2.weight_g -vq_model.dec.resblocks.14.convs2.2.weight_v -vq_model.dec.resblocks.2.convs1.0.bias -vq_model.dec.resblocks.2.convs1.0.weight_g -vq_model.dec.resblocks.2.convs1.0.weight_v -vq_model.dec.resblocks.2.convs1.1.bias -vq_model.dec.resblocks.2.convs1.1.weight_g -vq_model.dec.resblocks.2.convs1.1.weight_v -vq_model.dec.resblocks.2.convs1.2.bias -vq_model.dec.resblocks.2.convs1.2.weight_g -vq_model.dec.resblocks.2.convs1.2.weight_v -vq_model.dec.resblocks.2.convs2.0.bias -vq_model.dec.resblocks.2.convs2.0.weight_g -vq_model.dec.resblocks.2.convs2.0.weight_v -vq_model.dec.resblocks.2.convs2.1.bias -vq_model.dec.resblocks.2.convs2.1.weight_g -vq_model.dec.resblocks.2.convs2.1.weight_v -vq_model.dec.resblocks.2.convs2.2.bias -vq_model.dec.resblocks.2.convs2.2.weight_g -vq_model.dec.resblocks.2.convs2.2.weight_v -vq_model.dec.resblocks.3.convs1.0.bias -vq_model.dec.resblocks.3.convs1.0.weight_g -vq_model.dec.resblocks.3.convs1.0.weight_v -vq_model.dec.resblocks.3.convs1.1.bias -vq_model.dec.resblocks.3.convs1.1.weight_g -vq_model.dec.resblocks.3.convs1.1.weight_v -vq_model.dec.resblocks.3.convs1.2.bias -vq_model.dec.resblocks.3.convs1.2.weight_g -vq_model.dec.resblocks.3.convs1.2.weight_v -vq_model.dec.resblocks.3.convs2.0.bias -vq_model.dec.resblocks.3.convs2.0.weight_g -vq_model.dec.resblocks.3.convs2.0.weight_v -vq_model.dec.resblocks.3.convs2.1.bias -vq_model.dec.resblocks.3.convs2.1.weight_g -vq_model.dec.resblocks.3.convs2.1.weight_v -vq_model.dec.resblocks.3.convs2.2.bias -vq_model.dec.resblocks.3.convs2.2.weight_g -vq_model.dec.resblocks.3.convs2.2.weight_v -vq_model.dec.resblocks.4.convs1.0.bias -vq_model.dec.resblocks.4.convs1.0.weight_g -vq_model.dec.resblocks.4.convs1.0.weight_v -vq_model.dec.resblocks.4.convs1.1.bias -vq_model.dec.resblocks.4.convs1.1.weight_g -vq_model.dec.resblocks.4.convs1.1.weight_v -vq_model.dec.resblocks.4.convs1.2.bias -vq_model.dec.resblocks.4.convs1.2.weight_g -vq_model.dec.resblocks.4.convs1.2.weight_v -vq_model.dec.resblocks.4.convs2.0.bias -vq_model.dec.resblocks.4.convs2.0.weight_g -vq_model.dec.resblocks.4.convs2.0.weight_v -vq_model.dec.resblocks.4.convs2.1.bias -vq_model.dec.resblocks.4.convs2.1.weight_g -vq_model.dec.resblocks.4.convs2.1.weight_v -vq_model.dec.resblocks.4.convs2.2.bias 
-vq_model.dec.resblocks.4.convs2.2.weight_g -vq_model.dec.resblocks.4.convs2.2.weight_v -vq_model.dec.resblocks.5.convs1.0.bias -vq_model.dec.resblocks.5.convs1.0.weight_g -vq_model.dec.resblocks.5.convs1.0.weight_v -vq_model.dec.resblocks.5.convs1.1.bias -vq_model.dec.resblocks.5.convs1.1.weight_g -vq_model.dec.resblocks.5.convs1.1.weight_v -vq_model.dec.resblocks.5.convs1.2.bias -vq_model.dec.resblocks.5.convs1.2.weight_g -vq_model.dec.resblocks.5.convs1.2.weight_v -vq_model.dec.resblocks.5.convs2.0.bias -vq_model.dec.resblocks.5.convs2.0.weight_g -vq_model.dec.resblocks.5.convs2.0.weight_v -vq_model.dec.resblocks.5.convs2.1.bias -vq_model.dec.resblocks.5.convs2.1.weight_g -vq_model.dec.resblocks.5.convs2.1.weight_v -vq_model.dec.resblocks.5.convs2.2.bias -vq_model.dec.resblocks.5.convs2.2.weight_g -vq_model.dec.resblocks.5.convs2.2.weight_v -vq_model.dec.resblocks.6.convs1.0.bias -vq_model.dec.resblocks.6.convs1.0.weight_g -vq_model.dec.resblocks.6.convs1.0.weight_v -vq_model.dec.resblocks.6.convs1.1.bias -vq_model.dec.resblocks.6.convs1.1.weight_g -vq_model.dec.resblocks.6.convs1.1.weight_v -vq_model.dec.resblocks.6.convs1.2.bias -vq_model.dec.resblocks.6.convs1.2.weight_g -vq_model.dec.resblocks.6.convs1.2.weight_v -vq_model.dec.resblocks.6.convs2.0.bias -vq_model.dec.resblocks.6.convs2.0.weight_g -vq_model.dec.resblocks.6.convs2.0.weight_v -vq_model.dec.resblocks.6.convs2.1.bias -vq_model.dec.resblocks.6.convs2.1.weight_g -vq_model.dec.resblocks.6.convs2.1.weight_v -vq_model.dec.resblocks.6.convs2.2.bias -vq_model.dec.resblocks.6.convs2.2.weight_g -vq_model.dec.resblocks.6.convs2.2.weight_v -vq_model.dec.resblocks.7.convs1.0.bias -vq_model.dec.resblocks.7.convs1.0.weight_g -vq_model.dec.resblocks.7.convs1.0.weight_v -vq_model.dec.resblocks.7.convs1.1.bias -vq_model.dec.resblocks.7.convs1.1.weight_g -vq_model.dec.resblocks.7.convs1.1.weight_v -vq_model.dec.resblocks.7.convs1.2.bias -vq_model.dec.resblocks.7.convs1.2.weight_g -vq_model.dec.resblocks.7.convs1.2.weight_v -vq_model.dec.resblocks.7.convs2.0.bias -vq_model.dec.resblocks.7.convs2.0.weight_g -vq_model.dec.resblocks.7.convs2.0.weight_v -vq_model.dec.resblocks.7.convs2.1.bias -vq_model.dec.resblocks.7.convs2.1.weight_g -vq_model.dec.resblocks.7.convs2.1.weight_v -vq_model.dec.resblocks.7.convs2.2.bias -vq_model.dec.resblocks.7.convs2.2.weight_g -vq_model.dec.resblocks.7.convs2.2.weight_v -vq_model.dec.resblocks.8.convs1.0.bias -vq_model.dec.resblocks.8.convs1.0.weight_g -vq_model.dec.resblocks.8.convs1.0.weight_v -vq_model.dec.resblocks.8.convs1.1.bias -vq_model.dec.resblocks.8.convs1.1.weight_g -vq_model.dec.resblocks.8.convs1.1.weight_v -vq_model.dec.resblocks.8.convs1.2.bias -vq_model.dec.resblocks.8.convs1.2.weight_g -vq_model.dec.resblocks.8.convs1.2.weight_v -vq_model.dec.resblocks.8.convs2.0.bias -vq_model.dec.resblocks.8.convs2.0.weight_g -vq_model.dec.resblocks.8.convs2.0.weight_v -vq_model.dec.resblocks.8.convs2.1.bias -vq_model.dec.resblocks.8.convs2.1.weight_g -vq_model.dec.resblocks.8.convs2.1.weight_v -vq_model.dec.resblocks.8.convs2.2.bias -vq_model.dec.resblocks.8.convs2.2.weight_g -vq_model.dec.resblocks.8.convs2.2.weight_v -vq_model.dec.resblocks.9.convs1.0.bias -vq_model.dec.resblocks.9.convs1.0.weight_g -vq_model.dec.resblocks.9.convs1.0.weight_v -vq_model.dec.resblocks.9.convs1.1.bias -vq_model.dec.resblocks.9.convs1.1.weight_g -vq_model.dec.resblocks.9.convs1.1.weight_v -vq_model.dec.resblocks.9.convs1.2.bias -vq_model.dec.resblocks.9.convs1.2.weight_g -vq_model.dec.resblocks.9.convs1.2.weight_v 
-vq_model.dec.resblocks.9.convs2.0.bias -vq_model.dec.resblocks.9.convs2.0.weight_g -vq_model.dec.resblocks.9.convs2.0.weight_v -vq_model.dec.resblocks.9.convs2.1.bias -vq_model.dec.resblocks.9.convs2.1.weight_g -vq_model.dec.resblocks.9.convs2.1.weight_v -vq_model.dec.resblocks.9.convs2.2.bias -vq_model.dec.resblocks.9.convs2.2.weight_g -vq_model.dec.resblocks.9.convs2.2.weight_v -vq_model.dec.ups.0.bias -vq_model.dec.ups.0.weight_g -vq_model.dec.ups.0.weight_v -vq_model.dec.ups.1.bias -vq_model.dec.ups.1.weight_g -vq_model.dec.ups.1.weight_v -vq_model.dec.ups.2.bias -vq_model.dec.ups.2.weight_g -vq_model.dec.ups.2.weight_v -vq_model.dec.ups.3.bias -vq_model.dec.ups.3.weight_g -vq_model.dec.ups.3.weight_v -vq_model.dec.ups.4.bias -vq_model.dec.ups.4.weight_g -vq_model.dec.ups.4.weight_v -vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight -vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k -vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v -vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight -vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k -vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v -vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight -vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k -vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v -vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias -vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight -vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias -vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight -vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias -vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight -vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias -vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight -vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias -vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight -vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias -vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight -vq_model.enc_p.encoder2.norm_layers_1.0.beta -vq_model.enc_p.encoder2.norm_layers_1.0.gamma -vq_model.enc_p.encoder2.norm_layers_1.1.beta -vq_model.enc_p.encoder2.norm_layers_1.1.gamma -vq_model.enc_p.encoder2.norm_layers_1.2.beta -vq_model.enc_p.encoder2.norm_layers_1.2.gamma -vq_model.enc_p.encoder2.norm_layers_2.0.beta -vq_model.enc_p.encoder2.norm_layers_2.0.gamma -vq_model.enc_p.encoder2.norm_layers_2.1.beta -vq_model.enc_p.encoder2.norm_layers_2.1.gamma -vq_model.enc_p.encoder2.norm_layers_2.2.beta -vq_model.enc_p.encoder2.norm_layers_2.2.gamma 
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight -vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k -vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k -vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k -vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight -vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta -vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma -vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta -vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma -vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta -vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma -vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta -vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma -vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta -vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma -vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta -vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma -vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight 
-vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v -vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight 
-vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight -vq_model.enc_p.encoder_text.norm_layers_1.0.beta -vq_model.enc_p.encoder_text.norm_layers_1.0.gamma -vq_model.enc_p.encoder_text.norm_layers_1.1.beta -vq_model.enc_p.encoder_text.norm_layers_1.1.gamma -vq_model.enc_p.encoder_text.norm_layers_1.2.beta -vq_model.enc_p.encoder_text.norm_layers_1.2.gamma -vq_model.enc_p.encoder_text.norm_layers_1.3.beta -vq_model.enc_p.encoder_text.norm_layers_1.3.gamma -vq_model.enc_p.encoder_text.norm_layers_1.4.beta -vq_model.enc_p.encoder_text.norm_layers_1.4.gamma -vq_model.enc_p.encoder_text.norm_layers_1.5.beta -vq_model.enc_p.encoder_text.norm_layers_1.5.gamma -vq_model.enc_p.encoder_text.norm_layers_2.0.beta -vq_model.enc_p.encoder_text.norm_layers_2.0.gamma -vq_model.enc_p.encoder_text.norm_layers_2.1.beta -vq_model.enc_p.encoder_text.norm_layers_2.1.gamma -vq_model.enc_p.encoder_text.norm_layers_2.2.beta -vq_model.enc_p.encoder_text.norm_layers_2.2.gamma -vq_model.enc_p.encoder_text.norm_layers_2.3.beta -vq_model.enc_p.encoder_text.norm_layers_2.3.gamma -vq_model.enc_p.encoder_text.norm_layers_2.4.beta -vq_model.enc_p.encoder_text.norm_layers_2.4.gamma -vq_model.enc_p.encoder_text.norm_layers_2.5.beta -vq_model.enc_p.encoder_text.norm_layers_2.5.gamma -vq_model.enc_p.mrte.c_post.bias -vq_model.enc_p.mrte.c_post.weight -vq_model.enc_p.mrte.c_pre.bias -vq_model.enc_p.mrte.c_pre.weight -vq_model.enc_p.mrte.cross_attention.conv_k.bias -vq_model.enc_p.mrte.cross_attention.conv_k.weight -vq_model.enc_p.mrte.cross_attention.conv_o.bias -vq_model.enc_p.mrte.cross_attention.conv_o.weight -vq_model.enc_p.mrte.cross_attention.conv_q.bias -vq_model.enc_p.mrte.cross_attention.conv_q.weight -vq_model.enc_p.mrte.cross_attention.conv_v.bias -vq_model.enc_p.mrte.cross_attention.conv_v.weight -vq_model.enc_p.mrte.text_pre.bias -vq_model.enc_p.mrte.text_pre.weight -vq_model.enc_p.proj.bias -vq_model.enc_p.proj.weight -vq_model.enc_p.ssl_proj.bias -vq_model.enc_p.ssl_proj.weight -vq_model.enc_p.text_embedding.weight -vq_model.flow.flows.0.enc.cond_layer.bias -vq_model.flow.flows.0.enc.cond_layer.weight_g -vq_model.flow.flows.0.enc.cond_layer.weight_v -vq_model.flow.flows.0.enc.in_layers.0.bias -vq_model.flow.flows.0.enc.in_layers.0.weight_g -vq_model.flow.flows.0.enc.in_layers.0.weight_v -vq_model.flow.flows.0.enc.in_layers.1.bias -vq_model.flow.flows.0.enc.in_layers.1.weight_g -vq_model.flow.flows.0.enc.in_layers.1.weight_v -vq_model.flow.flows.0.enc.in_layers.2.bias -vq_model.flow.flows.0.enc.in_layers.2.weight_g -vq_model.flow.flows.0.enc.in_layers.2.weight_v -vq_model.flow.flows.0.enc.in_layers.3.bias -vq_model.flow.flows.0.enc.in_layers.3.weight_g -vq_model.flow.flows.0.enc.in_layers.3.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.0.bias -vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.1.bias -vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g -vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.2.bias -vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g 
-vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.3.bias -vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.0.post.bias -vq_model.flow.flows.0.post.weight -vq_model.flow.flows.0.pre.bias -vq_model.flow.flows.0.pre.weight -vq_model.flow.flows.2.enc.cond_layer.bias -vq_model.flow.flows.2.enc.cond_layer.weight_g -vq_model.flow.flows.2.enc.cond_layer.weight_v -vq_model.flow.flows.2.enc.in_layers.0.bias -vq_model.flow.flows.2.enc.in_layers.0.weight_g -vq_model.flow.flows.2.enc.in_layers.0.weight_v -vq_model.flow.flows.2.enc.in_layers.1.bias -vq_model.flow.flows.2.enc.in_layers.1.weight_g -vq_model.flow.flows.2.enc.in_layers.1.weight_v -vq_model.flow.flows.2.enc.in_layers.2.bias -vq_model.flow.flows.2.enc.in_layers.2.weight_g -vq_model.flow.flows.2.enc.in_layers.2.weight_v -vq_model.flow.flows.2.enc.in_layers.3.bias -vq_model.flow.flows.2.enc.in_layers.3.weight_g -vq_model.flow.flows.2.enc.in_layers.3.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.0.bias -vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.1.bias -vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.2.bias -vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.3.bias -vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.2.post.bias -vq_model.flow.flows.2.post.weight -vq_model.flow.flows.2.pre.bias -vq_model.flow.flows.2.pre.weight -vq_model.flow.flows.4.enc.cond_layer.bias -vq_model.flow.flows.4.enc.cond_layer.weight_g -vq_model.flow.flows.4.enc.cond_layer.weight_v -vq_model.flow.flows.4.enc.in_layers.0.bias -vq_model.flow.flows.4.enc.in_layers.0.weight_g -vq_model.flow.flows.4.enc.in_layers.0.weight_v -vq_model.flow.flows.4.enc.in_layers.1.bias -vq_model.flow.flows.4.enc.in_layers.1.weight_g -vq_model.flow.flows.4.enc.in_layers.1.weight_v -vq_model.flow.flows.4.enc.in_layers.2.bias -vq_model.flow.flows.4.enc.in_layers.2.weight_g -vq_model.flow.flows.4.enc.in_layers.2.weight_v -vq_model.flow.flows.4.enc.in_layers.3.bias -vq_model.flow.flows.4.enc.in_layers.3.weight_g -vq_model.flow.flows.4.enc.in_layers.3.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.0.bias -vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.1.bias -vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.2.bias -vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.3.bias -vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.4.post.bias -vq_model.flow.flows.4.post.weight -vq_model.flow.flows.4.pre.bias -vq_model.flow.flows.4.pre.weight -vq_model.flow.flows.6.enc.cond_layer.bias -vq_model.flow.flows.6.enc.cond_layer.weight_g -vq_model.flow.flows.6.enc.cond_layer.weight_v -vq_model.flow.flows.6.enc.in_layers.0.bias -vq_model.flow.flows.6.enc.in_layers.0.weight_g -vq_model.flow.flows.6.enc.in_layers.0.weight_v 
-vq_model.flow.flows.6.enc.in_layers.1.bias
-vq_model.flow.flows.6.enc.in_layers.1.weight_g
-vq_model.flow.flows.6.enc.in_layers.1.weight_v
-vq_model.flow.flows.6.enc.in_layers.2.bias
-vq_model.flow.flows.6.enc.in_layers.2.weight_g
-vq_model.flow.flows.6.enc.in_layers.2.weight_v
-vq_model.flow.flows.6.enc.in_layers.3.bias
-vq_model.flow.flows.6.enc.in_layers.3.weight_g
-vq_model.flow.flows.6.enc.in_layers.3.weight_v
-vq_model.flow.flows.6.enc.res_skip_layers.0.bias
-vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g
-vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v
-vq_model.flow.flows.6.enc.res_skip_layers.1.bias
-vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g
-vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v
-vq_model.flow.flows.6.enc.res_skip_layers.2.bias
-vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g
-vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v
-vq_model.flow.flows.6.enc.res_skip_layers.3.bias
-vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g
-vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v
-vq_model.flow.flows.6.post.bias
-vq_model.flow.flows.6.post.weight
-vq_model.flow.flows.6.pre.bias
-vq_model.flow.flows.6.pre.weight
-vq_model.quantizer.vq.layers.0._codebook.embed
-vq_model.ref_enc.fc.fc.bias
-vq_model.ref_enc.fc.fc.weight
-vq_model.ref_enc.slf_attn.fc.bias
-vq_model.ref_enc.slf_attn.fc.weight
-vq_model.ref_enc.slf_attn.w_ks.bias
-vq_model.ref_enc.slf_attn.w_ks.weight
-vq_model.ref_enc.slf_attn.w_qs.bias
-vq_model.ref_enc.slf_attn.w_qs.weight
-vq_model.ref_enc.slf_attn.w_vs.bias
-vq_model.ref_enc.slf_attn.w_vs.weight
-vq_model.ref_enc.spectral.0.fc.bias
-vq_model.ref_enc.spectral.0.fc.weight
-vq_model.ref_enc.spectral.3.fc.bias
-vq_model.ref_enc.spectral.3.fc.weight
-vq_model.ref_enc.temporal.0.conv1.conv.bias
-vq_model.ref_enc.temporal.0.conv1.conv.weight
-vq_model.ref_enc.temporal.1.conv1.conv.bias
-vq_model.ref_enc.temporal.1.conv1.conv.weight
+vq_model.dec.cond.bias
+vq_model.dec.cond.weight
+vq_model.dec.conv_post.weight
+vq_model.dec.conv_pre.bias
+vq_model.dec.conv_pre.weight
+vq_model.dec.resblocks.0.convs1.0.bias
+vq_model.dec.resblocks.0.convs1.0.weight_g
+vq_model.dec.resblocks.0.convs1.0.weight_v
+vq_model.dec.resblocks.0.convs1.1.bias
+vq_model.dec.resblocks.0.convs1.1.weight_g
+vq_model.dec.resblocks.0.convs1.1.weight_v
+vq_model.dec.resblocks.0.convs1.2.bias
+vq_model.dec.resblocks.0.convs1.2.weight_g
+vq_model.dec.resblocks.0.convs1.2.weight_v
+vq_model.dec.resblocks.0.convs2.0.bias
+vq_model.dec.resblocks.0.convs2.0.weight_g
+vq_model.dec.resblocks.0.convs2.0.weight_v
+vq_model.dec.resblocks.0.convs2.1.bias
+vq_model.dec.resblocks.0.convs2.1.weight_g
+vq_model.dec.resblocks.0.convs2.1.weight_v
+vq_model.dec.resblocks.0.convs2.2.bias
+vq_model.dec.resblocks.0.convs2.2.weight_g
+vq_model.dec.resblocks.0.convs2.2.weight_v
+vq_model.dec.resblocks.1.convs1.0.bias
+vq_model.dec.resblocks.1.convs1.0.weight_g
+vq_model.dec.resblocks.1.convs1.0.weight_v
+vq_model.dec.resblocks.1.convs1.1.bias
+vq_model.dec.resblocks.1.convs1.1.weight_g
+vq_model.dec.resblocks.1.convs1.1.weight_v
+vq_model.dec.resblocks.1.convs1.2.bias
+vq_model.dec.resblocks.1.convs1.2.weight_g
+vq_model.dec.resblocks.1.convs1.2.weight_v
+vq_model.dec.resblocks.1.convs2.0.bias
+vq_model.dec.resblocks.1.convs2.0.weight_g
+vq_model.dec.resblocks.1.convs2.0.weight_v
+vq_model.dec.resblocks.1.convs2.1.bias
+vq_model.dec.resblocks.1.convs2.1.weight_g
+vq_model.dec.resblocks.1.convs2.1.weight_v
+vq_model.dec.resblocks.1.convs2.2.bias
+vq_model.dec.resblocks.1.convs2.2.weight_g +vq_model.dec.resblocks.1.convs2.2.weight_v +vq_model.dec.resblocks.10.convs1.0.bias +vq_model.dec.resblocks.10.convs1.0.weight_g +vq_model.dec.resblocks.10.convs1.0.weight_v +vq_model.dec.resblocks.10.convs1.1.bias +vq_model.dec.resblocks.10.convs1.1.weight_g +vq_model.dec.resblocks.10.convs1.1.weight_v +vq_model.dec.resblocks.10.convs1.2.bias +vq_model.dec.resblocks.10.convs1.2.weight_g +vq_model.dec.resblocks.10.convs1.2.weight_v +vq_model.dec.resblocks.10.convs2.0.bias +vq_model.dec.resblocks.10.convs2.0.weight_g +vq_model.dec.resblocks.10.convs2.0.weight_v +vq_model.dec.resblocks.10.convs2.1.bias +vq_model.dec.resblocks.10.convs2.1.weight_g +vq_model.dec.resblocks.10.convs2.1.weight_v +vq_model.dec.resblocks.10.convs2.2.bias +vq_model.dec.resblocks.10.convs2.2.weight_g +vq_model.dec.resblocks.10.convs2.2.weight_v +vq_model.dec.resblocks.11.convs1.0.bias +vq_model.dec.resblocks.11.convs1.0.weight_g +vq_model.dec.resblocks.11.convs1.0.weight_v +vq_model.dec.resblocks.11.convs1.1.bias +vq_model.dec.resblocks.11.convs1.1.weight_g +vq_model.dec.resblocks.11.convs1.1.weight_v +vq_model.dec.resblocks.11.convs1.2.bias +vq_model.dec.resblocks.11.convs1.2.weight_g +vq_model.dec.resblocks.11.convs1.2.weight_v +vq_model.dec.resblocks.11.convs2.0.bias +vq_model.dec.resblocks.11.convs2.0.weight_g +vq_model.dec.resblocks.11.convs2.0.weight_v +vq_model.dec.resblocks.11.convs2.1.bias +vq_model.dec.resblocks.11.convs2.1.weight_g +vq_model.dec.resblocks.11.convs2.1.weight_v +vq_model.dec.resblocks.11.convs2.2.bias +vq_model.dec.resblocks.11.convs2.2.weight_g +vq_model.dec.resblocks.11.convs2.2.weight_v +vq_model.dec.resblocks.12.convs1.0.bias +vq_model.dec.resblocks.12.convs1.0.weight_g +vq_model.dec.resblocks.12.convs1.0.weight_v +vq_model.dec.resblocks.12.convs1.1.bias +vq_model.dec.resblocks.12.convs1.1.weight_g +vq_model.dec.resblocks.12.convs1.1.weight_v +vq_model.dec.resblocks.12.convs1.2.bias +vq_model.dec.resblocks.12.convs1.2.weight_g +vq_model.dec.resblocks.12.convs1.2.weight_v +vq_model.dec.resblocks.12.convs2.0.bias +vq_model.dec.resblocks.12.convs2.0.weight_g +vq_model.dec.resblocks.12.convs2.0.weight_v +vq_model.dec.resblocks.12.convs2.1.bias +vq_model.dec.resblocks.12.convs2.1.weight_g +vq_model.dec.resblocks.12.convs2.1.weight_v +vq_model.dec.resblocks.12.convs2.2.bias +vq_model.dec.resblocks.12.convs2.2.weight_g +vq_model.dec.resblocks.12.convs2.2.weight_v +vq_model.dec.resblocks.13.convs1.0.bias +vq_model.dec.resblocks.13.convs1.0.weight_g +vq_model.dec.resblocks.13.convs1.0.weight_v +vq_model.dec.resblocks.13.convs1.1.bias +vq_model.dec.resblocks.13.convs1.1.weight_g +vq_model.dec.resblocks.13.convs1.1.weight_v +vq_model.dec.resblocks.13.convs1.2.bias +vq_model.dec.resblocks.13.convs1.2.weight_g +vq_model.dec.resblocks.13.convs1.2.weight_v +vq_model.dec.resblocks.13.convs2.0.bias +vq_model.dec.resblocks.13.convs2.0.weight_g +vq_model.dec.resblocks.13.convs2.0.weight_v +vq_model.dec.resblocks.13.convs2.1.bias +vq_model.dec.resblocks.13.convs2.1.weight_g +vq_model.dec.resblocks.13.convs2.1.weight_v +vq_model.dec.resblocks.13.convs2.2.bias +vq_model.dec.resblocks.13.convs2.2.weight_g +vq_model.dec.resblocks.13.convs2.2.weight_v +vq_model.dec.resblocks.14.convs1.0.bias +vq_model.dec.resblocks.14.convs1.0.weight_g +vq_model.dec.resblocks.14.convs1.0.weight_v +vq_model.dec.resblocks.14.convs1.1.bias +vq_model.dec.resblocks.14.convs1.1.weight_g +vq_model.dec.resblocks.14.convs1.1.weight_v +vq_model.dec.resblocks.14.convs1.2.bias 
+vq_model.dec.resblocks.14.convs1.2.weight_g +vq_model.dec.resblocks.14.convs1.2.weight_v +vq_model.dec.resblocks.14.convs2.0.bias +vq_model.dec.resblocks.14.convs2.0.weight_g +vq_model.dec.resblocks.14.convs2.0.weight_v +vq_model.dec.resblocks.14.convs2.1.bias +vq_model.dec.resblocks.14.convs2.1.weight_g +vq_model.dec.resblocks.14.convs2.1.weight_v +vq_model.dec.resblocks.14.convs2.2.bias +vq_model.dec.resblocks.14.convs2.2.weight_g +vq_model.dec.resblocks.14.convs2.2.weight_v +vq_model.dec.resblocks.2.convs1.0.bias +vq_model.dec.resblocks.2.convs1.0.weight_g +vq_model.dec.resblocks.2.convs1.0.weight_v +vq_model.dec.resblocks.2.convs1.1.bias +vq_model.dec.resblocks.2.convs1.1.weight_g +vq_model.dec.resblocks.2.convs1.1.weight_v +vq_model.dec.resblocks.2.convs1.2.bias +vq_model.dec.resblocks.2.convs1.2.weight_g +vq_model.dec.resblocks.2.convs1.2.weight_v +vq_model.dec.resblocks.2.convs2.0.bias +vq_model.dec.resblocks.2.convs2.0.weight_g +vq_model.dec.resblocks.2.convs2.0.weight_v +vq_model.dec.resblocks.2.convs2.1.bias +vq_model.dec.resblocks.2.convs2.1.weight_g +vq_model.dec.resblocks.2.convs2.1.weight_v +vq_model.dec.resblocks.2.convs2.2.bias +vq_model.dec.resblocks.2.convs2.2.weight_g +vq_model.dec.resblocks.2.convs2.2.weight_v +vq_model.dec.resblocks.3.convs1.0.bias +vq_model.dec.resblocks.3.convs1.0.weight_g +vq_model.dec.resblocks.3.convs1.0.weight_v +vq_model.dec.resblocks.3.convs1.1.bias +vq_model.dec.resblocks.3.convs1.1.weight_g +vq_model.dec.resblocks.3.convs1.1.weight_v +vq_model.dec.resblocks.3.convs1.2.bias +vq_model.dec.resblocks.3.convs1.2.weight_g +vq_model.dec.resblocks.3.convs1.2.weight_v +vq_model.dec.resblocks.3.convs2.0.bias +vq_model.dec.resblocks.3.convs2.0.weight_g +vq_model.dec.resblocks.3.convs2.0.weight_v +vq_model.dec.resblocks.3.convs2.1.bias +vq_model.dec.resblocks.3.convs2.1.weight_g +vq_model.dec.resblocks.3.convs2.1.weight_v +vq_model.dec.resblocks.3.convs2.2.bias +vq_model.dec.resblocks.3.convs2.2.weight_g +vq_model.dec.resblocks.3.convs2.2.weight_v +vq_model.dec.resblocks.4.convs1.0.bias +vq_model.dec.resblocks.4.convs1.0.weight_g +vq_model.dec.resblocks.4.convs1.0.weight_v +vq_model.dec.resblocks.4.convs1.1.bias +vq_model.dec.resblocks.4.convs1.1.weight_g +vq_model.dec.resblocks.4.convs1.1.weight_v +vq_model.dec.resblocks.4.convs1.2.bias +vq_model.dec.resblocks.4.convs1.2.weight_g +vq_model.dec.resblocks.4.convs1.2.weight_v +vq_model.dec.resblocks.4.convs2.0.bias +vq_model.dec.resblocks.4.convs2.0.weight_g +vq_model.dec.resblocks.4.convs2.0.weight_v +vq_model.dec.resblocks.4.convs2.1.bias +vq_model.dec.resblocks.4.convs2.1.weight_g +vq_model.dec.resblocks.4.convs2.1.weight_v +vq_model.dec.resblocks.4.convs2.2.bias +vq_model.dec.resblocks.4.convs2.2.weight_g +vq_model.dec.resblocks.4.convs2.2.weight_v +vq_model.dec.resblocks.5.convs1.0.bias +vq_model.dec.resblocks.5.convs1.0.weight_g +vq_model.dec.resblocks.5.convs1.0.weight_v +vq_model.dec.resblocks.5.convs1.1.bias +vq_model.dec.resblocks.5.convs1.1.weight_g +vq_model.dec.resblocks.5.convs1.1.weight_v +vq_model.dec.resblocks.5.convs1.2.bias +vq_model.dec.resblocks.5.convs1.2.weight_g +vq_model.dec.resblocks.5.convs1.2.weight_v +vq_model.dec.resblocks.5.convs2.0.bias +vq_model.dec.resblocks.5.convs2.0.weight_g +vq_model.dec.resblocks.5.convs2.0.weight_v +vq_model.dec.resblocks.5.convs2.1.bias +vq_model.dec.resblocks.5.convs2.1.weight_g +vq_model.dec.resblocks.5.convs2.1.weight_v +vq_model.dec.resblocks.5.convs2.2.bias +vq_model.dec.resblocks.5.convs2.2.weight_g +vq_model.dec.resblocks.5.convs2.2.weight_v 
+vq_model.dec.resblocks.6.convs1.0.bias +vq_model.dec.resblocks.6.convs1.0.weight_g +vq_model.dec.resblocks.6.convs1.0.weight_v +vq_model.dec.resblocks.6.convs1.1.bias +vq_model.dec.resblocks.6.convs1.1.weight_g +vq_model.dec.resblocks.6.convs1.1.weight_v +vq_model.dec.resblocks.6.convs1.2.bias +vq_model.dec.resblocks.6.convs1.2.weight_g +vq_model.dec.resblocks.6.convs1.2.weight_v +vq_model.dec.resblocks.6.convs2.0.bias +vq_model.dec.resblocks.6.convs2.0.weight_g +vq_model.dec.resblocks.6.convs2.0.weight_v +vq_model.dec.resblocks.6.convs2.1.bias +vq_model.dec.resblocks.6.convs2.1.weight_g +vq_model.dec.resblocks.6.convs2.1.weight_v +vq_model.dec.resblocks.6.convs2.2.bias +vq_model.dec.resblocks.6.convs2.2.weight_g +vq_model.dec.resblocks.6.convs2.2.weight_v +vq_model.dec.resblocks.7.convs1.0.bias +vq_model.dec.resblocks.7.convs1.0.weight_g +vq_model.dec.resblocks.7.convs1.0.weight_v +vq_model.dec.resblocks.7.convs1.1.bias +vq_model.dec.resblocks.7.convs1.1.weight_g +vq_model.dec.resblocks.7.convs1.1.weight_v +vq_model.dec.resblocks.7.convs1.2.bias +vq_model.dec.resblocks.7.convs1.2.weight_g +vq_model.dec.resblocks.7.convs1.2.weight_v +vq_model.dec.resblocks.7.convs2.0.bias +vq_model.dec.resblocks.7.convs2.0.weight_g +vq_model.dec.resblocks.7.convs2.0.weight_v +vq_model.dec.resblocks.7.convs2.1.bias +vq_model.dec.resblocks.7.convs2.1.weight_g +vq_model.dec.resblocks.7.convs2.1.weight_v +vq_model.dec.resblocks.7.convs2.2.bias +vq_model.dec.resblocks.7.convs2.2.weight_g +vq_model.dec.resblocks.7.convs2.2.weight_v +vq_model.dec.resblocks.8.convs1.0.bias +vq_model.dec.resblocks.8.convs1.0.weight_g +vq_model.dec.resblocks.8.convs1.0.weight_v +vq_model.dec.resblocks.8.convs1.1.bias +vq_model.dec.resblocks.8.convs1.1.weight_g +vq_model.dec.resblocks.8.convs1.1.weight_v +vq_model.dec.resblocks.8.convs1.2.bias +vq_model.dec.resblocks.8.convs1.2.weight_g +vq_model.dec.resblocks.8.convs1.2.weight_v +vq_model.dec.resblocks.8.convs2.0.bias +vq_model.dec.resblocks.8.convs2.0.weight_g +vq_model.dec.resblocks.8.convs2.0.weight_v +vq_model.dec.resblocks.8.convs2.1.bias +vq_model.dec.resblocks.8.convs2.1.weight_g +vq_model.dec.resblocks.8.convs2.1.weight_v +vq_model.dec.resblocks.8.convs2.2.bias +vq_model.dec.resblocks.8.convs2.2.weight_g +vq_model.dec.resblocks.8.convs2.2.weight_v +vq_model.dec.resblocks.9.convs1.0.bias +vq_model.dec.resblocks.9.convs1.0.weight_g +vq_model.dec.resblocks.9.convs1.0.weight_v +vq_model.dec.resblocks.9.convs1.1.bias +vq_model.dec.resblocks.9.convs1.1.weight_g +vq_model.dec.resblocks.9.convs1.1.weight_v +vq_model.dec.resblocks.9.convs1.2.bias +vq_model.dec.resblocks.9.convs1.2.weight_g +vq_model.dec.resblocks.9.convs1.2.weight_v +vq_model.dec.resblocks.9.convs2.0.bias +vq_model.dec.resblocks.9.convs2.0.weight_g +vq_model.dec.resblocks.9.convs2.0.weight_v +vq_model.dec.resblocks.9.convs2.1.bias +vq_model.dec.resblocks.9.convs2.1.weight_g +vq_model.dec.resblocks.9.convs2.1.weight_v +vq_model.dec.resblocks.9.convs2.2.bias +vq_model.dec.resblocks.9.convs2.2.weight_g +vq_model.dec.resblocks.9.convs2.2.weight_v +vq_model.dec.ups.0.bias +vq_model.dec.ups.0.weight_g +vq_model.dec.ups.0.weight_v +vq_model.dec.ups.1.bias +vq_model.dec.ups.1.weight_g +vq_model.dec.ups.1.weight_v +vq_model.dec.ups.2.bias +vq_model.dec.ups.2.weight_g +vq_model.dec.ups.2.weight_v +vq_model.dec.ups.3.bias +vq_model.dec.ups.3.weight_g +vq_model.dec.ups.3.weight_v +vq_model.dec.ups.4.bias +vq_model.dec.ups.4.weight_g +vq_model.dec.ups.4.weight_v +vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias 
+vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias +vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias +vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias +vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight +vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k +vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v +vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight +vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k +vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v +vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight +vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k +vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v +vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias +vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight +vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias +vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight +vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias +vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight +vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias +vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight +vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias +vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight +vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias +vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight +vq_model.enc_p.encoder2.norm_layers_1.0.beta +vq_model.enc_p.encoder2.norm_layers_1.0.gamma +vq_model.enc_p.encoder2.norm_layers_1.1.beta +vq_model.enc_p.encoder2.norm_layers_1.1.gamma +vq_model.enc_p.encoder2.norm_layers_1.2.beta +vq_model.enc_p.encoder2.norm_layers_1.2.gamma +vq_model.enc_p.encoder2.norm_layers_2.0.beta +vq_model.enc_p.encoder2.norm_layers_2.0.gamma +vq_model.enc_p.encoder2.norm_layers_2.1.beta +vq_model.enc_p.encoder2.norm_layers_2.1.gamma +vq_model.enc_p.encoder2.norm_layers_2.2.beta +vq_model.enc_p.encoder2.norm_layers_2.2.gamma +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k +vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight 
+vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k +vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k +vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight +vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta +vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma +vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta +vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma +vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta +vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma +vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta +vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma +vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta +vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma +vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta +vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma +vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias 
+vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v +vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight +vq_model.enc_p.encoder_text.norm_layers_1.0.beta +vq_model.enc_p.encoder_text.norm_layers_1.0.gamma +vq_model.enc_p.encoder_text.norm_layers_1.1.beta +vq_model.enc_p.encoder_text.norm_layers_1.1.gamma +vq_model.enc_p.encoder_text.norm_layers_1.2.beta +vq_model.enc_p.encoder_text.norm_layers_1.2.gamma +vq_model.enc_p.encoder_text.norm_layers_1.3.beta +vq_model.enc_p.encoder_text.norm_layers_1.3.gamma 
+vq_model.enc_p.encoder_text.norm_layers_1.4.beta +vq_model.enc_p.encoder_text.norm_layers_1.4.gamma +vq_model.enc_p.encoder_text.norm_layers_1.5.beta +vq_model.enc_p.encoder_text.norm_layers_1.5.gamma +vq_model.enc_p.encoder_text.norm_layers_2.0.beta +vq_model.enc_p.encoder_text.norm_layers_2.0.gamma +vq_model.enc_p.encoder_text.norm_layers_2.1.beta +vq_model.enc_p.encoder_text.norm_layers_2.1.gamma +vq_model.enc_p.encoder_text.norm_layers_2.2.beta +vq_model.enc_p.encoder_text.norm_layers_2.2.gamma +vq_model.enc_p.encoder_text.norm_layers_2.3.beta +vq_model.enc_p.encoder_text.norm_layers_2.3.gamma +vq_model.enc_p.encoder_text.norm_layers_2.4.beta +vq_model.enc_p.encoder_text.norm_layers_2.4.gamma +vq_model.enc_p.encoder_text.norm_layers_2.5.beta +vq_model.enc_p.encoder_text.norm_layers_2.5.gamma +vq_model.enc_p.mrte.c_post.bias +vq_model.enc_p.mrte.c_post.weight +vq_model.enc_p.mrte.c_pre.bias +vq_model.enc_p.mrte.c_pre.weight +vq_model.enc_p.mrte.cross_attention.conv_k.bias +vq_model.enc_p.mrte.cross_attention.conv_k.weight +vq_model.enc_p.mrte.cross_attention.conv_o.bias +vq_model.enc_p.mrte.cross_attention.conv_o.weight +vq_model.enc_p.mrte.cross_attention.conv_q.bias +vq_model.enc_p.mrte.cross_attention.conv_q.weight +vq_model.enc_p.mrte.cross_attention.conv_v.bias +vq_model.enc_p.mrte.cross_attention.conv_v.weight +vq_model.enc_p.mrte.text_pre.bias +vq_model.enc_p.mrte.text_pre.weight +vq_model.enc_p.proj.bias +vq_model.enc_p.proj.weight +vq_model.enc_p.ssl_proj.bias +vq_model.enc_p.ssl_proj.weight +vq_model.enc_p.text_embedding.weight +vq_model.flow.flows.0.enc.cond_layer.bias +vq_model.flow.flows.0.enc.cond_layer.weight_g +vq_model.flow.flows.0.enc.cond_layer.weight_v +vq_model.flow.flows.0.enc.in_layers.0.bias +vq_model.flow.flows.0.enc.in_layers.0.weight_g +vq_model.flow.flows.0.enc.in_layers.0.weight_v +vq_model.flow.flows.0.enc.in_layers.1.bias +vq_model.flow.flows.0.enc.in_layers.1.weight_g +vq_model.flow.flows.0.enc.in_layers.1.weight_v +vq_model.flow.flows.0.enc.in_layers.2.bias +vq_model.flow.flows.0.enc.in_layers.2.weight_g +vq_model.flow.flows.0.enc.in_layers.2.weight_v +vq_model.flow.flows.0.enc.in_layers.3.bias +vq_model.flow.flows.0.enc.in_layers.3.weight_g +vq_model.flow.flows.0.enc.in_layers.3.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.0.bias +vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.1.bias +vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.2.bias +vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.3.bias +vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.0.post.bias +vq_model.flow.flows.0.post.weight +vq_model.flow.flows.0.pre.bias +vq_model.flow.flows.0.pre.weight +vq_model.flow.flows.2.enc.cond_layer.bias +vq_model.flow.flows.2.enc.cond_layer.weight_g +vq_model.flow.flows.2.enc.cond_layer.weight_v +vq_model.flow.flows.2.enc.in_layers.0.bias +vq_model.flow.flows.2.enc.in_layers.0.weight_g +vq_model.flow.flows.2.enc.in_layers.0.weight_v +vq_model.flow.flows.2.enc.in_layers.1.bias +vq_model.flow.flows.2.enc.in_layers.1.weight_g +vq_model.flow.flows.2.enc.in_layers.1.weight_v +vq_model.flow.flows.2.enc.in_layers.2.bias +vq_model.flow.flows.2.enc.in_layers.2.weight_g 
+vq_model.flow.flows.2.enc.in_layers.2.weight_v +vq_model.flow.flows.2.enc.in_layers.3.bias +vq_model.flow.flows.2.enc.in_layers.3.weight_g +vq_model.flow.flows.2.enc.in_layers.3.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.0.bias +vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.1.bias +vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.2.bias +vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.3.bias +vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.2.post.bias +vq_model.flow.flows.2.post.weight +vq_model.flow.flows.2.pre.bias +vq_model.flow.flows.2.pre.weight +vq_model.flow.flows.4.enc.cond_layer.bias +vq_model.flow.flows.4.enc.cond_layer.weight_g +vq_model.flow.flows.4.enc.cond_layer.weight_v +vq_model.flow.flows.4.enc.in_layers.0.bias +vq_model.flow.flows.4.enc.in_layers.0.weight_g +vq_model.flow.flows.4.enc.in_layers.0.weight_v +vq_model.flow.flows.4.enc.in_layers.1.bias +vq_model.flow.flows.4.enc.in_layers.1.weight_g +vq_model.flow.flows.4.enc.in_layers.1.weight_v +vq_model.flow.flows.4.enc.in_layers.2.bias +vq_model.flow.flows.4.enc.in_layers.2.weight_g +vq_model.flow.flows.4.enc.in_layers.2.weight_v +vq_model.flow.flows.4.enc.in_layers.3.bias +vq_model.flow.flows.4.enc.in_layers.3.weight_g +vq_model.flow.flows.4.enc.in_layers.3.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.0.bias +vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.1.bias +vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.2.bias +vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.3.bias +vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.4.post.bias +vq_model.flow.flows.4.post.weight +vq_model.flow.flows.4.pre.bias +vq_model.flow.flows.4.pre.weight +vq_model.flow.flows.6.enc.cond_layer.bias +vq_model.flow.flows.6.enc.cond_layer.weight_g +vq_model.flow.flows.6.enc.cond_layer.weight_v +vq_model.flow.flows.6.enc.in_layers.0.bias +vq_model.flow.flows.6.enc.in_layers.0.weight_g +vq_model.flow.flows.6.enc.in_layers.0.weight_v +vq_model.flow.flows.6.enc.in_layers.1.bias +vq_model.flow.flows.6.enc.in_layers.1.weight_g +vq_model.flow.flows.6.enc.in_layers.1.weight_v +vq_model.flow.flows.6.enc.in_layers.2.bias +vq_model.flow.flows.6.enc.in_layers.2.weight_g +vq_model.flow.flows.6.enc.in_layers.2.weight_v +vq_model.flow.flows.6.enc.in_layers.3.bias +vq_model.flow.flows.6.enc.in_layers.3.weight_g +vq_model.flow.flows.6.enc.in_layers.3.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.0.bias +vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.1.bias +vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.2.bias +vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g 
+vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v
+vq_model.flow.flows.6.enc.res_skip_layers.3.bias
+vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g
+vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v
+vq_model.flow.flows.6.post.bias
+vq_model.flow.flows.6.post.weight
+vq_model.flow.flows.6.pre.bias
+vq_model.flow.flows.6.pre.weight
+vq_model.quantizer.vq.layers.0._codebook.embed
+vq_model.ref_enc.fc.fc.bias
+vq_model.ref_enc.fc.fc.weight
+vq_model.ref_enc.slf_attn.fc.bias
+vq_model.ref_enc.slf_attn.fc.weight
+vq_model.ref_enc.slf_attn.w_ks.bias
+vq_model.ref_enc.slf_attn.w_ks.weight
+vq_model.ref_enc.slf_attn.w_qs.bias
+vq_model.ref_enc.slf_attn.w_qs.weight
+vq_model.ref_enc.slf_attn.w_vs.bias
+vq_model.ref_enc.slf_attn.w_vs.weight
+vq_model.ref_enc.spectral.0.fc.bias
+vq_model.ref_enc.spectral.0.fc.weight
+vq_model.ref_enc.spectral.3.fc.bias
+vq_model.ref_enc.spectral.3.fc.weight
+vq_model.ref_enc.temporal.0.conv1.conv.bias
+vq_model.ref_enc.temporal.0.conv1.conv.weight
+vq_model.ref_enc.temporal.1.conv1.conv.bias
+vq_model.ref_enc.temporal.1.conv1.conv.weight
diff --git a/genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt b/genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt
index 8ae9345f6b061d27362302611f0fe08c8091eb29..fbb01e9ef646f2c9472bda37d78cfd004724dd4a 100644
--- a/genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt
+++ b/genie_tts/Data/v2ProPlus/Keys/prompt_encoder_weights.txt
@@ -1,23 +1,23 @@
-ref_enc.spectral.0.fc.weight
-ref_enc.spectral.0.fc.bias
-ref_enc.spectral.3.fc.weight
-ref_enc.spectral.3.fc.bias
-ref_enc.temporal.0.conv1.conv.weight
-ref_enc.temporal.0.conv1.conv.bias
-ref_enc.temporal.1.conv1.conv.weight
-ref_enc.temporal.1.conv1.conv.bias
-ref_enc.slf_attn.w_qs.weight
-ref_enc.slf_attn.w_qs.bias
-ref_enc.slf_attn.w_ks.weight
-ref_enc.slf_attn.w_ks.bias
-ref_enc.slf_attn.w_vs.weight
-ref_enc.slf_attn.w_vs.bias
-ref_enc.slf_attn.fc.weight
-ref_enc.slf_attn.fc.bias
-ref_enc.fc.fc.weight
-ref_enc.fc.fc.bias
-sv_emb.weight
-sv_emb.bias
-ge_to512.weight
-ge_to512.bias
-prelu.weight
+ref_enc.spectral.0.fc.weight
+ref_enc.spectral.0.fc.bias
+ref_enc.spectral.3.fc.weight
+ref_enc.spectral.3.fc.bias
+ref_enc.temporal.0.conv1.conv.weight
+ref_enc.temporal.0.conv1.conv.bias
+ref_enc.temporal.1.conv1.conv.weight
+ref_enc.temporal.1.conv1.conv.bias
+ref_enc.slf_attn.w_qs.weight
+ref_enc.slf_attn.w_qs.bias
+ref_enc.slf_attn.w_ks.weight
+ref_enc.slf_attn.w_ks.bias
+ref_enc.slf_attn.w_vs.weight
+ref_enc.slf_attn.w_vs.bias
+ref_enc.slf_attn.fc.weight
+ref_enc.slf_attn.fc.bias
+ref_enc.fc.fc.weight
+ref_enc.fc.fc.bias
+sv_emb.weight
+sv_emb.bias
+ge_to512.weight
+ge_to512.bias
+prelu.weight
diff --git a/genie_tts/Data/v2ProPlus/Keys/vits_weights.txt b/genie_tts/Data/v2ProPlus/Keys/vits_weights.txt
index 4ed033cab8dfa1bd065ed90ae62e8e02b093c03b..7b6bc49c812af16ebf0035b2abbad5cc17609dfc 100644
--- a/genie_tts/Data/v2ProPlus/Keys/vits_weights.txt
+++ b/genie_tts/Data/v2ProPlus/Keys/vits_weights.txt
@@ -1,650 +1,650 @@
-vq_model.enc_p.ssl_proj.weight
-vq_model.enc_p.ssl_proj.bias
-vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k
-vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias
-vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight -vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k -vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight -vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k -vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight -vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias -vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma -vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta -vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma -vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta -vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma -vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight -vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight -vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight -vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias -vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma -vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta -vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma -vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta -vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma -vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta -vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias 
-vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias -vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k -vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v -vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias -vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight -vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias -vq_model.enc_p.encoder_text.norm_layers_1.0.gamma -vq_model.enc_p.encoder_text.norm_layers_1.0.beta -vq_model.enc_p.encoder_text.norm_layers_1.1.gamma -vq_model.enc_p.encoder_text.norm_layers_1.1.beta -vq_model.enc_p.encoder_text.norm_layers_1.2.gamma -vq_model.enc_p.encoder_text.norm_layers_1.2.beta -vq_model.enc_p.encoder_text.norm_layers_1.3.gamma -vq_model.enc_p.encoder_text.norm_layers_1.3.beta -vq_model.enc_p.encoder_text.norm_layers_1.4.gamma -vq_model.enc_p.encoder_text.norm_layers_1.4.beta -vq_model.enc_p.encoder_text.norm_layers_1.5.gamma -vq_model.enc_p.encoder_text.norm_layers_1.5.beta -vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight 
-vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias -vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight -vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias -vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight -vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias -vq_model.enc_p.encoder_text.norm_layers_2.0.gamma -vq_model.enc_p.encoder_text.norm_layers_2.0.beta -vq_model.enc_p.encoder_text.norm_layers_2.1.gamma -vq_model.enc_p.encoder_text.norm_layers_2.1.beta -vq_model.enc_p.encoder_text.norm_layers_2.2.gamma -vq_model.enc_p.encoder_text.norm_layers_2.2.beta -vq_model.enc_p.encoder_text.norm_layers_2.3.gamma -vq_model.enc_p.encoder_text.norm_layers_2.3.beta -vq_model.enc_p.encoder_text.norm_layers_2.4.gamma -vq_model.enc_p.encoder_text.norm_layers_2.4.beta -vq_model.enc_p.encoder_text.norm_layers_2.5.gamma -vq_model.enc_p.encoder_text.norm_layers_2.5.beta -vq_model.enc_p.text_embedding.weight -vq_model.enc_p.mrte.cross_attention.conv_q.weight -vq_model.enc_p.mrte.cross_attention.conv_q.bias -vq_model.enc_p.mrte.cross_attention.conv_k.weight -vq_model.enc_p.mrte.cross_attention.conv_k.bias -vq_model.enc_p.mrte.cross_attention.conv_v.weight -vq_model.enc_p.mrte.cross_attention.conv_v.bias -vq_model.enc_p.mrte.cross_attention.conv_o.weight -vq_model.enc_p.mrte.cross_attention.conv_o.bias -vq_model.enc_p.mrte.c_pre.weight -vq_model.enc_p.mrte.c_pre.bias -vq_model.enc_p.mrte.text_pre.weight -vq_model.enc_p.mrte.text_pre.bias -vq_model.enc_p.mrte.c_post.weight -vq_model.enc_p.mrte.c_post.bias -vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k -vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v -vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias -vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight -vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias -vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k -vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v -vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias -vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight -vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias -vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k -vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v -vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias -vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight -vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias -vq_model.enc_p.encoder2.norm_layers_1.0.gamma -vq_model.enc_p.encoder2.norm_layers_1.0.beta -vq_model.enc_p.encoder2.norm_layers_1.1.gamma 
-vq_model.enc_p.encoder2.norm_layers_1.1.beta -vq_model.enc_p.encoder2.norm_layers_1.2.gamma -vq_model.enc_p.encoder2.norm_layers_1.2.beta -vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight -vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias -vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight -vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias -vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight -vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias -vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight -vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias -vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight -vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias -vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight -vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias -vq_model.enc_p.encoder2.norm_layers_2.0.gamma -vq_model.enc_p.encoder2.norm_layers_2.0.beta -vq_model.enc_p.encoder2.norm_layers_2.1.gamma -vq_model.enc_p.encoder2.norm_layers_2.1.beta -vq_model.enc_p.encoder2.norm_layers_2.2.gamma -vq_model.enc_p.encoder2.norm_layers_2.2.beta -vq_model.enc_p.proj.weight -vq_model.enc_p.proj.bias -vq_model.dec.conv_pre.weight -vq_model.dec.conv_pre.bias -vq_model.dec.ups.0.bias -vq_model.dec.ups.0.weight_g -vq_model.dec.ups.0.weight_v -vq_model.dec.ups.1.bias -vq_model.dec.ups.1.weight_g -vq_model.dec.ups.1.weight_v -vq_model.dec.ups.2.bias -vq_model.dec.ups.2.weight_g -vq_model.dec.ups.2.weight_v -vq_model.dec.ups.3.bias -vq_model.dec.ups.3.weight_g -vq_model.dec.ups.3.weight_v -vq_model.dec.ups.4.bias -vq_model.dec.ups.4.weight_g -vq_model.dec.ups.4.weight_v -vq_model.dec.resblocks.0.convs1.0.bias -vq_model.dec.resblocks.0.convs1.0.weight_g -vq_model.dec.resblocks.0.convs1.0.weight_v -vq_model.dec.resblocks.0.convs1.1.bias -vq_model.dec.resblocks.0.convs1.1.weight_g -vq_model.dec.resblocks.0.convs1.1.weight_v -vq_model.dec.resblocks.0.convs1.2.bias -vq_model.dec.resblocks.0.convs1.2.weight_g -vq_model.dec.resblocks.0.convs1.2.weight_v -vq_model.dec.resblocks.0.convs2.0.bias -vq_model.dec.resblocks.0.convs2.0.weight_g -vq_model.dec.resblocks.0.convs2.0.weight_v -vq_model.dec.resblocks.0.convs2.1.bias -vq_model.dec.resblocks.0.convs2.1.weight_g -vq_model.dec.resblocks.0.convs2.1.weight_v -vq_model.dec.resblocks.0.convs2.2.bias -vq_model.dec.resblocks.0.convs2.2.weight_g -vq_model.dec.resblocks.0.convs2.2.weight_v -vq_model.dec.resblocks.1.convs1.0.bias -vq_model.dec.resblocks.1.convs1.0.weight_g -vq_model.dec.resblocks.1.convs1.0.weight_v -vq_model.dec.resblocks.1.convs1.1.bias -vq_model.dec.resblocks.1.convs1.1.weight_g -vq_model.dec.resblocks.1.convs1.1.weight_v -vq_model.dec.resblocks.1.convs1.2.bias -vq_model.dec.resblocks.1.convs1.2.weight_g -vq_model.dec.resblocks.1.convs1.2.weight_v -vq_model.dec.resblocks.1.convs2.0.bias -vq_model.dec.resblocks.1.convs2.0.weight_g -vq_model.dec.resblocks.1.convs2.0.weight_v -vq_model.dec.resblocks.1.convs2.1.bias -vq_model.dec.resblocks.1.convs2.1.weight_g -vq_model.dec.resblocks.1.convs2.1.weight_v -vq_model.dec.resblocks.1.convs2.2.bias -vq_model.dec.resblocks.1.convs2.2.weight_g -vq_model.dec.resblocks.1.convs2.2.weight_v -vq_model.dec.resblocks.2.convs1.0.bias -vq_model.dec.resblocks.2.convs1.0.weight_g -vq_model.dec.resblocks.2.convs1.0.weight_v -vq_model.dec.resblocks.2.convs1.1.bias -vq_model.dec.resblocks.2.convs1.1.weight_g -vq_model.dec.resblocks.2.convs1.1.weight_v -vq_model.dec.resblocks.2.convs1.2.bias -vq_model.dec.resblocks.2.convs1.2.weight_g -vq_model.dec.resblocks.2.convs1.2.weight_v -vq_model.dec.resblocks.2.convs2.0.bias 
-vq_model.dec.resblocks.2.convs2.0.weight_g -vq_model.dec.resblocks.2.convs2.0.weight_v -vq_model.dec.resblocks.2.convs2.1.bias -vq_model.dec.resblocks.2.convs2.1.weight_g -vq_model.dec.resblocks.2.convs2.1.weight_v -vq_model.dec.resblocks.2.convs2.2.bias -vq_model.dec.resblocks.2.convs2.2.weight_g -vq_model.dec.resblocks.2.convs2.2.weight_v -vq_model.dec.resblocks.3.convs1.0.bias -vq_model.dec.resblocks.3.convs1.0.weight_g -vq_model.dec.resblocks.3.convs1.0.weight_v -vq_model.dec.resblocks.3.convs1.1.bias -vq_model.dec.resblocks.3.convs1.1.weight_g -vq_model.dec.resblocks.3.convs1.1.weight_v -vq_model.dec.resblocks.3.convs1.2.bias -vq_model.dec.resblocks.3.convs1.2.weight_g -vq_model.dec.resblocks.3.convs1.2.weight_v -vq_model.dec.resblocks.3.convs2.0.bias -vq_model.dec.resblocks.3.convs2.0.weight_g -vq_model.dec.resblocks.3.convs2.0.weight_v -vq_model.dec.resblocks.3.convs2.1.bias -vq_model.dec.resblocks.3.convs2.1.weight_g -vq_model.dec.resblocks.3.convs2.1.weight_v -vq_model.dec.resblocks.3.convs2.2.bias -vq_model.dec.resblocks.3.convs2.2.weight_g -vq_model.dec.resblocks.3.convs2.2.weight_v -vq_model.dec.resblocks.4.convs1.0.bias -vq_model.dec.resblocks.4.convs1.0.weight_g -vq_model.dec.resblocks.4.convs1.0.weight_v -vq_model.dec.resblocks.4.convs1.1.bias -vq_model.dec.resblocks.4.convs1.1.weight_g -vq_model.dec.resblocks.4.convs1.1.weight_v -vq_model.dec.resblocks.4.convs1.2.bias -vq_model.dec.resblocks.4.convs1.2.weight_g -vq_model.dec.resblocks.4.convs1.2.weight_v -vq_model.dec.resblocks.4.convs2.0.bias -vq_model.dec.resblocks.4.convs2.0.weight_g -vq_model.dec.resblocks.4.convs2.0.weight_v -vq_model.dec.resblocks.4.convs2.1.bias -vq_model.dec.resblocks.4.convs2.1.weight_g -vq_model.dec.resblocks.4.convs2.1.weight_v -vq_model.dec.resblocks.4.convs2.2.bias -vq_model.dec.resblocks.4.convs2.2.weight_g -vq_model.dec.resblocks.4.convs2.2.weight_v -vq_model.dec.resblocks.5.convs1.0.bias -vq_model.dec.resblocks.5.convs1.0.weight_g -vq_model.dec.resblocks.5.convs1.0.weight_v -vq_model.dec.resblocks.5.convs1.1.bias -vq_model.dec.resblocks.5.convs1.1.weight_g -vq_model.dec.resblocks.5.convs1.1.weight_v -vq_model.dec.resblocks.5.convs1.2.bias -vq_model.dec.resblocks.5.convs1.2.weight_g -vq_model.dec.resblocks.5.convs1.2.weight_v -vq_model.dec.resblocks.5.convs2.0.bias -vq_model.dec.resblocks.5.convs2.0.weight_g -vq_model.dec.resblocks.5.convs2.0.weight_v -vq_model.dec.resblocks.5.convs2.1.bias -vq_model.dec.resblocks.5.convs2.1.weight_g -vq_model.dec.resblocks.5.convs2.1.weight_v -vq_model.dec.resblocks.5.convs2.2.bias -vq_model.dec.resblocks.5.convs2.2.weight_g -vq_model.dec.resblocks.5.convs2.2.weight_v -vq_model.dec.resblocks.6.convs1.0.bias -vq_model.dec.resblocks.6.convs1.0.weight_g -vq_model.dec.resblocks.6.convs1.0.weight_v -vq_model.dec.resblocks.6.convs1.1.bias -vq_model.dec.resblocks.6.convs1.1.weight_g -vq_model.dec.resblocks.6.convs1.1.weight_v -vq_model.dec.resblocks.6.convs1.2.bias -vq_model.dec.resblocks.6.convs1.2.weight_g -vq_model.dec.resblocks.6.convs1.2.weight_v -vq_model.dec.resblocks.6.convs2.0.bias -vq_model.dec.resblocks.6.convs2.0.weight_g -vq_model.dec.resblocks.6.convs2.0.weight_v -vq_model.dec.resblocks.6.convs2.1.bias -vq_model.dec.resblocks.6.convs2.1.weight_g -vq_model.dec.resblocks.6.convs2.1.weight_v -vq_model.dec.resblocks.6.convs2.2.bias -vq_model.dec.resblocks.6.convs2.2.weight_g -vq_model.dec.resblocks.6.convs2.2.weight_v -vq_model.dec.resblocks.7.convs1.0.bias -vq_model.dec.resblocks.7.convs1.0.weight_g -vq_model.dec.resblocks.7.convs1.0.weight_v 
-vq_model.dec.resblocks.7.convs1.1.bias -vq_model.dec.resblocks.7.convs1.1.weight_g -vq_model.dec.resblocks.7.convs1.1.weight_v -vq_model.dec.resblocks.7.convs1.2.bias -vq_model.dec.resblocks.7.convs1.2.weight_g -vq_model.dec.resblocks.7.convs1.2.weight_v -vq_model.dec.resblocks.7.convs2.0.bias -vq_model.dec.resblocks.7.convs2.0.weight_g -vq_model.dec.resblocks.7.convs2.0.weight_v -vq_model.dec.resblocks.7.convs2.1.bias -vq_model.dec.resblocks.7.convs2.1.weight_g -vq_model.dec.resblocks.7.convs2.1.weight_v -vq_model.dec.resblocks.7.convs2.2.bias -vq_model.dec.resblocks.7.convs2.2.weight_g -vq_model.dec.resblocks.7.convs2.2.weight_v -vq_model.dec.resblocks.8.convs1.0.bias -vq_model.dec.resblocks.8.convs1.0.weight_g -vq_model.dec.resblocks.8.convs1.0.weight_v -vq_model.dec.resblocks.8.convs1.1.bias -vq_model.dec.resblocks.8.convs1.1.weight_g -vq_model.dec.resblocks.8.convs1.1.weight_v -vq_model.dec.resblocks.8.convs1.2.bias -vq_model.dec.resblocks.8.convs1.2.weight_g -vq_model.dec.resblocks.8.convs1.2.weight_v -vq_model.dec.resblocks.8.convs2.0.bias -vq_model.dec.resblocks.8.convs2.0.weight_g -vq_model.dec.resblocks.8.convs2.0.weight_v -vq_model.dec.resblocks.8.convs2.1.bias -vq_model.dec.resblocks.8.convs2.1.weight_g -vq_model.dec.resblocks.8.convs2.1.weight_v -vq_model.dec.resblocks.8.convs2.2.bias -vq_model.dec.resblocks.8.convs2.2.weight_g -vq_model.dec.resblocks.8.convs2.2.weight_v -vq_model.dec.resblocks.9.convs1.0.bias -vq_model.dec.resblocks.9.convs1.0.weight_g -vq_model.dec.resblocks.9.convs1.0.weight_v -vq_model.dec.resblocks.9.convs1.1.bias -vq_model.dec.resblocks.9.convs1.1.weight_g -vq_model.dec.resblocks.9.convs1.1.weight_v -vq_model.dec.resblocks.9.convs1.2.bias -vq_model.dec.resblocks.9.convs1.2.weight_g -vq_model.dec.resblocks.9.convs1.2.weight_v -vq_model.dec.resblocks.9.convs2.0.bias -vq_model.dec.resblocks.9.convs2.0.weight_g -vq_model.dec.resblocks.9.convs2.0.weight_v -vq_model.dec.resblocks.9.convs2.1.bias -vq_model.dec.resblocks.9.convs2.1.weight_g -vq_model.dec.resblocks.9.convs2.1.weight_v -vq_model.dec.resblocks.9.convs2.2.bias -vq_model.dec.resblocks.9.convs2.2.weight_g -vq_model.dec.resblocks.9.convs2.2.weight_v -vq_model.dec.resblocks.10.convs1.0.bias -vq_model.dec.resblocks.10.convs1.0.weight_g -vq_model.dec.resblocks.10.convs1.0.weight_v -vq_model.dec.resblocks.10.convs1.1.bias -vq_model.dec.resblocks.10.convs1.1.weight_g -vq_model.dec.resblocks.10.convs1.1.weight_v -vq_model.dec.resblocks.10.convs1.2.bias -vq_model.dec.resblocks.10.convs1.2.weight_g -vq_model.dec.resblocks.10.convs1.2.weight_v -vq_model.dec.resblocks.10.convs2.0.bias -vq_model.dec.resblocks.10.convs2.0.weight_g -vq_model.dec.resblocks.10.convs2.0.weight_v -vq_model.dec.resblocks.10.convs2.1.bias -vq_model.dec.resblocks.10.convs2.1.weight_g -vq_model.dec.resblocks.10.convs2.1.weight_v -vq_model.dec.resblocks.10.convs2.2.bias -vq_model.dec.resblocks.10.convs2.2.weight_g -vq_model.dec.resblocks.10.convs2.2.weight_v -vq_model.dec.resblocks.11.convs1.0.bias -vq_model.dec.resblocks.11.convs1.0.weight_g -vq_model.dec.resblocks.11.convs1.0.weight_v -vq_model.dec.resblocks.11.convs1.1.bias -vq_model.dec.resblocks.11.convs1.1.weight_g -vq_model.dec.resblocks.11.convs1.1.weight_v -vq_model.dec.resblocks.11.convs1.2.bias -vq_model.dec.resblocks.11.convs1.2.weight_g -vq_model.dec.resblocks.11.convs1.2.weight_v -vq_model.dec.resblocks.11.convs2.0.bias -vq_model.dec.resblocks.11.convs2.0.weight_g -vq_model.dec.resblocks.11.convs2.0.weight_v -vq_model.dec.resblocks.11.convs2.1.bias 
-vq_model.dec.resblocks.11.convs2.1.weight_g -vq_model.dec.resblocks.11.convs2.1.weight_v -vq_model.dec.resblocks.11.convs2.2.bias -vq_model.dec.resblocks.11.convs2.2.weight_g -vq_model.dec.resblocks.11.convs2.2.weight_v -vq_model.dec.resblocks.12.convs1.0.bias -vq_model.dec.resblocks.12.convs1.0.weight_g -vq_model.dec.resblocks.12.convs1.0.weight_v -vq_model.dec.resblocks.12.convs1.1.bias -vq_model.dec.resblocks.12.convs1.1.weight_g -vq_model.dec.resblocks.12.convs1.1.weight_v -vq_model.dec.resblocks.12.convs1.2.bias -vq_model.dec.resblocks.12.convs1.2.weight_g -vq_model.dec.resblocks.12.convs1.2.weight_v -vq_model.dec.resblocks.12.convs2.0.bias -vq_model.dec.resblocks.12.convs2.0.weight_g -vq_model.dec.resblocks.12.convs2.0.weight_v -vq_model.dec.resblocks.12.convs2.1.bias -vq_model.dec.resblocks.12.convs2.1.weight_g -vq_model.dec.resblocks.12.convs2.1.weight_v -vq_model.dec.resblocks.12.convs2.2.bias -vq_model.dec.resblocks.12.convs2.2.weight_g -vq_model.dec.resblocks.12.convs2.2.weight_v -vq_model.dec.resblocks.13.convs1.0.bias -vq_model.dec.resblocks.13.convs1.0.weight_g -vq_model.dec.resblocks.13.convs1.0.weight_v -vq_model.dec.resblocks.13.convs1.1.bias -vq_model.dec.resblocks.13.convs1.1.weight_g -vq_model.dec.resblocks.13.convs1.1.weight_v -vq_model.dec.resblocks.13.convs1.2.bias -vq_model.dec.resblocks.13.convs1.2.weight_g -vq_model.dec.resblocks.13.convs1.2.weight_v -vq_model.dec.resblocks.13.convs2.0.bias -vq_model.dec.resblocks.13.convs2.0.weight_g -vq_model.dec.resblocks.13.convs2.0.weight_v -vq_model.dec.resblocks.13.convs2.1.bias -vq_model.dec.resblocks.13.convs2.1.weight_g -vq_model.dec.resblocks.13.convs2.1.weight_v -vq_model.dec.resblocks.13.convs2.2.bias -vq_model.dec.resblocks.13.convs2.2.weight_g -vq_model.dec.resblocks.13.convs2.2.weight_v -vq_model.dec.resblocks.14.convs1.0.bias -vq_model.dec.resblocks.14.convs1.0.weight_g -vq_model.dec.resblocks.14.convs1.0.weight_v -vq_model.dec.resblocks.14.convs1.1.bias -vq_model.dec.resblocks.14.convs1.1.weight_g -vq_model.dec.resblocks.14.convs1.1.weight_v -vq_model.dec.resblocks.14.convs1.2.bias -vq_model.dec.resblocks.14.convs1.2.weight_g -vq_model.dec.resblocks.14.convs1.2.weight_v -vq_model.dec.resblocks.14.convs2.0.bias -vq_model.dec.resblocks.14.convs2.0.weight_g -vq_model.dec.resblocks.14.convs2.0.weight_v -vq_model.dec.resblocks.14.convs2.1.bias -vq_model.dec.resblocks.14.convs2.1.weight_g -vq_model.dec.resblocks.14.convs2.1.weight_v -vq_model.dec.resblocks.14.convs2.2.bias -vq_model.dec.resblocks.14.convs2.2.weight_g -vq_model.dec.resblocks.14.convs2.2.weight_v -vq_model.dec.conv_post.weight -vq_model.dec.cond.weight -vq_model.dec.cond.bias -vq_model.flow.flows.0.pre.weight -vq_model.flow.flows.0.pre.bias -vq_model.flow.flows.0.enc.in_layers.0.bias -vq_model.flow.flows.0.enc.in_layers.0.weight_g -vq_model.flow.flows.0.enc.in_layers.0.weight_v -vq_model.flow.flows.0.enc.in_layers.1.bias -vq_model.flow.flows.0.enc.in_layers.1.weight_g -vq_model.flow.flows.0.enc.in_layers.1.weight_v -vq_model.flow.flows.0.enc.in_layers.2.bias -vq_model.flow.flows.0.enc.in_layers.2.weight_g -vq_model.flow.flows.0.enc.in_layers.2.weight_v -vq_model.flow.flows.0.enc.in_layers.3.bias -vq_model.flow.flows.0.enc.in_layers.3.weight_g -vq_model.flow.flows.0.enc.in_layers.3.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.0.bias -vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.1.bias -vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g 
-vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.2.bias -vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g -vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.0.enc.res_skip_layers.3.bias -vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.0.enc.cond_layer.bias -vq_model.flow.flows.0.enc.cond_layer.weight_g -vq_model.flow.flows.0.enc.cond_layer.weight_v -vq_model.flow.flows.0.post.weight -vq_model.flow.flows.0.post.bias -vq_model.flow.flows.2.pre.weight -vq_model.flow.flows.2.pre.bias -vq_model.flow.flows.2.enc.in_layers.0.bias -vq_model.flow.flows.2.enc.in_layers.0.weight_g -vq_model.flow.flows.2.enc.in_layers.0.weight_v -vq_model.flow.flows.2.enc.in_layers.1.bias -vq_model.flow.flows.2.enc.in_layers.1.weight_g -vq_model.flow.flows.2.enc.in_layers.1.weight_v -vq_model.flow.flows.2.enc.in_layers.2.bias -vq_model.flow.flows.2.enc.in_layers.2.weight_g -vq_model.flow.flows.2.enc.in_layers.2.weight_v -vq_model.flow.flows.2.enc.in_layers.3.bias -vq_model.flow.flows.2.enc.in_layers.3.weight_g -vq_model.flow.flows.2.enc.in_layers.3.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.0.bias -vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.1.bias -vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.2.bias -vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.2.enc.res_skip_layers.3.bias -vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.2.enc.cond_layer.bias -vq_model.flow.flows.2.enc.cond_layer.weight_g -vq_model.flow.flows.2.enc.cond_layer.weight_v -vq_model.flow.flows.2.post.weight -vq_model.flow.flows.2.post.bias -vq_model.flow.flows.4.pre.weight -vq_model.flow.flows.4.pre.bias -vq_model.flow.flows.4.enc.in_layers.0.bias -vq_model.flow.flows.4.enc.in_layers.0.weight_g -vq_model.flow.flows.4.enc.in_layers.0.weight_v -vq_model.flow.flows.4.enc.in_layers.1.bias -vq_model.flow.flows.4.enc.in_layers.1.weight_g -vq_model.flow.flows.4.enc.in_layers.1.weight_v -vq_model.flow.flows.4.enc.in_layers.2.bias -vq_model.flow.flows.4.enc.in_layers.2.weight_g -vq_model.flow.flows.4.enc.in_layers.2.weight_v -vq_model.flow.flows.4.enc.in_layers.3.bias -vq_model.flow.flows.4.enc.in_layers.3.weight_g -vq_model.flow.flows.4.enc.in_layers.3.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.0.bias -vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.1.bias -vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.2.bias -vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.4.enc.res_skip_layers.3.bias -vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.4.enc.cond_layer.bias -vq_model.flow.flows.4.enc.cond_layer.weight_g -vq_model.flow.flows.4.enc.cond_layer.weight_v -vq_model.flow.flows.4.post.weight -vq_model.flow.flows.4.post.bias -vq_model.flow.flows.6.pre.weight 
-vq_model.flow.flows.6.pre.bias -vq_model.flow.flows.6.enc.in_layers.0.bias -vq_model.flow.flows.6.enc.in_layers.0.weight_g -vq_model.flow.flows.6.enc.in_layers.0.weight_v -vq_model.flow.flows.6.enc.in_layers.1.bias -vq_model.flow.flows.6.enc.in_layers.1.weight_g -vq_model.flow.flows.6.enc.in_layers.1.weight_v -vq_model.flow.flows.6.enc.in_layers.2.bias -vq_model.flow.flows.6.enc.in_layers.2.weight_g -vq_model.flow.flows.6.enc.in_layers.2.weight_v -vq_model.flow.flows.6.enc.in_layers.3.bias -vq_model.flow.flows.6.enc.in_layers.3.weight_g -vq_model.flow.flows.6.enc.in_layers.3.weight_v -vq_model.flow.flows.6.enc.res_skip_layers.0.bias -vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g -vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v -vq_model.flow.flows.6.enc.res_skip_layers.1.bias -vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g -vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v -vq_model.flow.flows.6.enc.res_skip_layers.2.bias -vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g -vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v -vq_model.flow.flows.6.enc.res_skip_layers.3.bias -vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g -vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v -vq_model.flow.flows.6.enc.cond_layer.bias -vq_model.flow.flows.6.enc.cond_layer.weight_g -vq_model.flow.flows.6.enc.cond_layer.weight_v -vq_model.flow.flows.6.post.weight -vq_model.flow.flows.6.post.bias -vq_model.quantizer.vq.layers.0._codebook.embed +vq_model.enc_p.ssl_proj.weight +vq_model.enc_p.ssl_proj.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_k +vq_model.enc_p.encoder_ssl.attn_layers.0.emb_rel_v +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_q.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_k.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_v.bias +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.weight +vq_model.enc_p.encoder_ssl.attn_layers.0.conv_o.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_k +vq_model.enc_p.encoder_ssl.attn_layers.1.emb_rel_v +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_q.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_k.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_v.bias +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.weight +vq_model.enc_p.encoder_ssl.attn_layers.1.conv_o.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_k +vq_model.enc_p.encoder_ssl.attn_layers.2.emb_rel_v +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_q.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_k.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_v.bias +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.weight +vq_model.enc_p.encoder_ssl.attn_layers.2.conv_o.bias +vq_model.enc_p.encoder_ssl.norm_layers_1.0.gamma +vq_model.enc_p.encoder_ssl.norm_layers_1.0.beta +vq_model.enc_p.encoder_ssl.norm_layers_1.1.gamma +vq_model.enc_p.encoder_ssl.norm_layers_1.1.beta +vq_model.enc_p.encoder_ssl.norm_layers_1.2.gamma +vq_model.enc_p.encoder_ssl.norm_layers_1.2.beta +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.weight 
+vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_1.bias +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.weight +vq_model.enc_p.encoder_ssl.ffn_layers.0.conv_2.bias +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.weight +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_1.bias +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.weight +vq_model.enc_p.encoder_ssl.ffn_layers.1.conv_2.bias +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.weight +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_1.bias +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.weight +vq_model.enc_p.encoder_ssl.ffn_layers.2.conv_2.bias +vq_model.enc_p.encoder_ssl.norm_layers_2.0.gamma +vq_model.enc_p.encoder_ssl.norm_layers_2.0.beta +vq_model.enc_p.encoder_ssl.norm_layers_2.1.gamma +vq_model.enc_p.encoder_ssl.norm_layers_2.1.beta +vq_model.enc_p.encoder_ssl.norm_layers_2.2.gamma +vq_model.enc_p.encoder_ssl.norm_layers_2.2.beta +vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.0.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.0.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.0.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.0.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.1.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.1.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.1.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.1.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.2.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.2.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.2.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.2.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.2.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.2.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.3.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.3.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.3.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.3.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.4.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.4.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.4.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.4.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_v.bias 
+vq_model.enc_p.encoder_text.attn_layers.4.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.4.conv_o.bias +vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_k +vq_model.enc_p.encoder_text.attn_layers.5.emb_rel_v +vq_model.enc_p.encoder_text.attn_layers.5.conv_q.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_q.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_k.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_k.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_v.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_v.bias +vq_model.enc_p.encoder_text.attn_layers.5.conv_o.weight +vq_model.enc_p.encoder_text.attn_layers.5.conv_o.bias +vq_model.enc_p.encoder_text.norm_layers_1.0.gamma +vq_model.enc_p.encoder_text.norm_layers_1.0.beta +vq_model.enc_p.encoder_text.norm_layers_1.1.gamma +vq_model.enc_p.encoder_text.norm_layers_1.1.beta +vq_model.enc_p.encoder_text.norm_layers_1.2.gamma +vq_model.enc_p.encoder_text.norm_layers_1.2.beta +vq_model.enc_p.encoder_text.norm_layers_1.3.gamma +vq_model.enc_p.encoder_text.norm_layers_1.3.beta +vq_model.enc_p.encoder_text.norm_layers_1.4.gamma +vq_model.enc_p.encoder_text.norm_layers_1.4.beta +vq_model.enc_p.encoder_text.norm_layers_1.5.gamma +vq_model.enc_p.encoder_text.norm_layers_1.5.beta +vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.0.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.0.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.1.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.1.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.2.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.2.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.3.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.3.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.4.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.4.conv_2.bias +vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.weight +vq_model.enc_p.encoder_text.ffn_layers.5.conv_1.bias +vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.weight +vq_model.enc_p.encoder_text.ffn_layers.5.conv_2.bias +vq_model.enc_p.encoder_text.norm_layers_2.0.gamma +vq_model.enc_p.encoder_text.norm_layers_2.0.beta +vq_model.enc_p.encoder_text.norm_layers_2.1.gamma +vq_model.enc_p.encoder_text.norm_layers_2.1.beta +vq_model.enc_p.encoder_text.norm_layers_2.2.gamma +vq_model.enc_p.encoder_text.norm_layers_2.2.beta +vq_model.enc_p.encoder_text.norm_layers_2.3.gamma +vq_model.enc_p.encoder_text.norm_layers_2.3.beta +vq_model.enc_p.encoder_text.norm_layers_2.4.gamma +vq_model.enc_p.encoder_text.norm_layers_2.4.beta +vq_model.enc_p.encoder_text.norm_layers_2.5.gamma +vq_model.enc_p.encoder_text.norm_layers_2.5.beta +vq_model.enc_p.text_embedding.weight +vq_model.enc_p.mrte.cross_attention.conv_q.weight +vq_model.enc_p.mrte.cross_attention.conv_q.bias +vq_model.enc_p.mrte.cross_attention.conv_k.weight +vq_model.enc_p.mrte.cross_attention.conv_k.bias +vq_model.enc_p.mrte.cross_attention.conv_v.weight +vq_model.enc_p.mrte.cross_attention.conv_v.bias 
+vq_model.enc_p.mrte.cross_attention.conv_o.weight +vq_model.enc_p.mrte.cross_attention.conv_o.bias +vq_model.enc_p.mrte.c_pre.weight +vq_model.enc_p.mrte.c_pre.bias +vq_model.enc_p.mrte.text_pre.weight +vq_model.enc_p.mrte.text_pre.bias +vq_model.enc_p.mrte.c_post.weight +vq_model.enc_p.mrte.c_post.bias +vq_model.enc_p.encoder2.attn_layers.0.emb_rel_k +vq_model.enc_p.encoder2.attn_layers.0.emb_rel_v +vq_model.enc_p.encoder2.attn_layers.0.conv_q.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_q.bias +vq_model.enc_p.encoder2.attn_layers.0.conv_k.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_k.bias +vq_model.enc_p.encoder2.attn_layers.0.conv_v.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_v.bias +vq_model.enc_p.encoder2.attn_layers.0.conv_o.weight +vq_model.enc_p.encoder2.attn_layers.0.conv_o.bias +vq_model.enc_p.encoder2.attn_layers.1.emb_rel_k +vq_model.enc_p.encoder2.attn_layers.1.emb_rel_v +vq_model.enc_p.encoder2.attn_layers.1.conv_q.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_q.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_k.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_k.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_v.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_v.bias +vq_model.enc_p.encoder2.attn_layers.1.conv_o.weight +vq_model.enc_p.encoder2.attn_layers.1.conv_o.bias +vq_model.enc_p.encoder2.attn_layers.2.emb_rel_k +vq_model.enc_p.encoder2.attn_layers.2.emb_rel_v +vq_model.enc_p.encoder2.attn_layers.2.conv_q.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_q.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_k.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_k.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_v.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_v.bias +vq_model.enc_p.encoder2.attn_layers.2.conv_o.weight +vq_model.enc_p.encoder2.attn_layers.2.conv_o.bias +vq_model.enc_p.encoder2.norm_layers_1.0.gamma +vq_model.enc_p.encoder2.norm_layers_1.0.beta +vq_model.enc_p.encoder2.norm_layers_1.1.gamma +vq_model.enc_p.encoder2.norm_layers_1.1.beta +vq_model.enc_p.encoder2.norm_layers_1.2.gamma +vq_model.enc_p.encoder2.norm_layers_1.2.beta +vq_model.enc_p.encoder2.ffn_layers.0.conv_1.weight +vq_model.enc_p.encoder2.ffn_layers.0.conv_1.bias +vq_model.enc_p.encoder2.ffn_layers.0.conv_2.weight +vq_model.enc_p.encoder2.ffn_layers.0.conv_2.bias +vq_model.enc_p.encoder2.ffn_layers.1.conv_1.weight +vq_model.enc_p.encoder2.ffn_layers.1.conv_1.bias +vq_model.enc_p.encoder2.ffn_layers.1.conv_2.weight +vq_model.enc_p.encoder2.ffn_layers.1.conv_2.bias +vq_model.enc_p.encoder2.ffn_layers.2.conv_1.weight +vq_model.enc_p.encoder2.ffn_layers.2.conv_1.bias +vq_model.enc_p.encoder2.ffn_layers.2.conv_2.weight +vq_model.enc_p.encoder2.ffn_layers.2.conv_2.bias +vq_model.enc_p.encoder2.norm_layers_2.0.gamma +vq_model.enc_p.encoder2.norm_layers_2.0.beta +vq_model.enc_p.encoder2.norm_layers_2.1.gamma +vq_model.enc_p.encoder2.norm_layers_2.1.beta +vq_model.enc_p.encoder2.norm_layers_2.2.gamma +vq_model.enc_p.encoder2.norm_layers_2.2.beta +vq_model.enc_p.proj.weight +vq_model.enc_p.proj.bias +vq_model.dec.conv_pre.weight +vq_model.dec.conv_pre.bias +vq_model.dec.ups.0.bias +vq_model.dec.ups.0.weight_g +vq_model.dec.ups.0.weight_v +vq_model.dec.ups.1.bias +vq_model.dec.ups.1.weight_g +vq_model.dec.ups.1.weight_v +vq_model.dec.ups.2.bias +vq_model.dec.ups.2.weight_g +vq_model.dec.ups.2.weight_v +vq_model.dec.ups.3.bias +vq_model.dec.ups.3.weight_g +vq_model.dec.ups.3.weight_v +vq_model.dec.ups.4.bias +vq_model.dec.ups.4.weight_g +vq_model.dec.ups.4.weight_v 
+vq_model.dec.resblocks.0.convs1.0.bias +vq_model.dec.resblocks.0.convs1.0.weight_g +vq_model.dec.resblocks.0.convs1.0.weight_v +vq_model.dec.resblocks.0.convs1.1.bias +vq_model.dec.resblocks.0.convs1.1.weight_g +vq_model.dec.resblocks.0.convs1.1.weight_v +vq_model.dec.resblocks.0.convs1.2.bias +vq_model.dec.resblocks.0.convs1.2.weight_g +vq_model.dec.resblocks.0.convs1.2.weight_v +vq_model.dec.resblocks.0.convs2.0.bias +vq_model.dec.resblocks.0.convs2.0.weight_g +vq_model.dec.resblocks.0.convs2.0.weight_v +vq_model.dec.resblocks.0.convs2.1.bias +vq_model.dec.resblocks.0.convs2.1.weight_g +vq_model.dec.resblocks.0.convs2.1.weight_v +vq_model.dec.resblocks.0.convs2.2.bias +vq_model.dec.resblocks.0.convs2.2.weight_g +vq_model.dec.resblocks.0.convs2.2.weight_v +vq_model.dec.resblocks.1.convs1.0.bias +vq_model.dec.resblocks.1.convs1.0.weight_g +vq_model.dec.resblocks.1.convs1.0.weight_v +vq_model.dec.resblocks.1.convs1.1.bias +vq_model.dec.resblocks.1.convs1.1.weight_g +vq_model.dec.resblocks.1.convs1.1.weight_v +vq_model.dec.resblocks.1.convs1.2.bias +vq_model.dec.resblocks.1.convs1.2.weight_g +vq_model.dec.resblocks.1.convs1.2.weight_v +vq_model.dec.resblocks.1.convs2.0.bias +vq_model.dec.resblocks.1.convs2.0.weight_g +vq_model.dec.resblocks.1.convs2.0.weight_v +vq_model.dec.resblocks.1.convs2.1.bias +vq_model.dec.resblocks.1.convs2.1.weight_g +vq_model.dec.resblocks.1.convs2.1.weight_v +vq_model.dec.resblocks.1.convs2.2.bias +vq_model.dec.resblocks.1.convs2.2.weight_g +vq_model.dec.resblocks.1.convs2.2.weight_v +vq_model.dec.resblocks.2.convs1.0.bias +vq_model.dec.resblocks.2.convs1.0.weight_g +vq_model.dec.resblocks.2.convs1.0.weight_v +vq_model.dec.resblocks.2.convs1.1.bias +vq_model.dec.resblocks.2.convs1.1.weight_g +vq_model.dec.resblocks.2.convs1.1.weight_v +vq_model.dec.resblocks.2.convs1.2.bias +vq_model.dec.resblocks.2.convs1.2.weight_g +vq_model.dec.resblocks.2.convs1.2.weight_v +vq_model.dec.resblocks.2.convs2.0.bias +vq_model.dec.resblocks.2.convs2.0.weight_g +vq_model.dec.resblocks.2.convs2.0.weight_v +vq_model.dec.resblocks.2.convs2.1.bias +vq_model.dec.resblocks.2.convs2.1.weight_g +vq_model.dec.resblocks.2.convs2.1.weight_v +vq_model.dec.resblocks.2.convs2.2.bias +vq_model.dec.resblocks.2.convs2.2.weight_g +vq_model.dec.resblocks.2.convs2.2.weight_v +vq_model.dec.resblocks.3.convs1.0.bias +vq_model.dec.resblocks.3.convs1.0.weight_g +vq_model.dec.resblocks.3.convs1.0.weight_v +vq_model.dec.resblocks.3.convs1.1.bias +vq_model.dec.resblocks.3.convs1.1.weight_g +vq_model.dec.resblocks.3.convs1.1.weight_v +vq_model.dec.resblocks.3.convs1.2.bias +vq_model.dec.resblocks.3.convs1.2.weight_g +vq_model.dec.resblocks.3.convs1.2.weight_v +vq_model.dec.resblocks.3.convs2.0.bias +vq_model.dec.resblocks.3.convs2.0.weight_g +vq_model.dec.resblocks.3.convs2.0.weight_v +vq_model.dec.resblocks.3.convs2.1.bias +vq_model.dec.resblocks.3.convs2.1.weight_g +vq_model.dec.resblocks.3.convs2.1.weight_v +vq_model.dec.resblocks.3.convs2.2.bias +vq_model.dec.resblocks.3.convs2.2.weight_g +vq_model.dec.resblocks.3.convs2.2.weight_v +vq_model.dec.resblocks.4.convs1.0.bias +vq_model.dec.resblocks.4.convs1.0.weight_g +vq_model.dec.resblocks.4.convs1.0.weight_v +vq_model.dec.resblocks.4.convs1.1.bias +vq_model.dec.resblocks.4.convs1.1.weight_g +vq_model.dec.resblocks.4.convs1.1.weight_v +vq_model.dec.resblocks.4.convs1.2.bias +vq_model.dec.resblocks.4.convs1.2.weight_g +vq_model.dec.resblocks.4.convs1.2.weight_v +vq_model.dec.resblocks.4.convs2.0.bias +vq_model.dec.resblocks.4.convs2.0.weight_g 
+vq_model.dec.resblocks.4.convs2.0.weight_v +vq_model.dec.resblocks.4.convs2.1.bias +vq_model.dec.resblocks.4.convs2.1.weight_g +vq_model.dec.resblocks.4.convs2.1.weight_v +vq_model.dec.resblocks.4.convs2.2.bias +vq_model.dec.resblocks.4.convs2.2.weight_g +vq_model.dec.resblocks.4.convs2.2.weight_v +vq_model.dec.resblocks.5.convs1.0.bias +vq_model.dec.resblocks.5.convs1.0.weight_g +vq_model.dec.resblocks.5.convs1.0.weight_v +vq_model.dec.resblocks.5.convs1.1.bias +vq_model.dec.resblocks.5.convs1.1.weight_g +vq_model.dec.resblocks.5.convs1.1.weight_v +vq_model.dec.resblocks.5.convs1.2.bias +vq_model.dec.resblocks.5.convs1.2.weight_g +vq_model.dec.resblocks.5.convs1.2.weight_v +vq_model.dec.resblocks.5.convs2.0.bias +vq_model.dec.resblocks.5.convs2.0.weight_g +vq_model.dec.resblocks.5.convs2.0.weight_v +vq_model.dec.resblocks.5.convs2.1.bias +vq_model.dec.resblocks.5.convs2.1.weight_g +vq_model.dec.resblocks.5.convs2.1.weight_v +vq_model.dec.resblocks.5.convs2.2.bias +vq_model.dec.resblocks.5.convs2.2.weight_g +vq_model.dec.resblocks.5.convs2.2.weight_v +vq_model.dec.resblocks.6.convs1.0.bias +vq_model.dec.resblocks.6.convs1.0.weight_g +vq_model.dec.resblocks.6.convs1.0.weight_v +vq_model.dec.resblocks.6.convs1.1.bias +vq_model.dec.resblocks.6.convs1.1.weight_g +vq_model.dec.resblocks.6.convs1.1.weight_v +vq_model.dec.resblocks.6.convs1.2.bias +vq_model.dec.resblocks.6.convs1.2.weight_g +vq_model.dec.resblocks.6.convs1.2.weight_v +vq_model.dec.resblocks.6.convs2.0.bias +vq_model.dec.resblocks.6.convs2.0.weight_g +vq_model.dec.resblocks.6.convs2.0.weight_v +vq_model.dec.resblocks.6.convs2.1.bias +vq_model.dec.resblocks.6.convs2.1.weight_g +vq_model.dec.resblocks.6.convs2.1.weight_v +vq_model.dec.resblocks.6.convs2.2.bias +vq_model.dec.resblocks.6.convs2.2.weight_g +vq_model.dec.resblocks.6.convs2.2.weight_v +vq_model.dec.resblocks.7.convs1.0.bias +vq_model.dec.resblocks.7.convs1.0.weight_g +vq_model.dec.resblocks.7.convs1.0.weight_v +vq_model.dec.resblocks.7.convs1.1.bias +vq_model.dec.resblocks.7.convs1.1.weight_g +vq_model.dec.resblocks.7.convs1.1.weight_v +vq_model.dec.resblocks.7.convs1.2.bias +vq_model.dec.resblocks.7.convs1.2.weight_g +vq_model.dec.resblocks.7.convs1.2.weight_v +vq_model.dec.resblocks.7.convs2.0.bias +vq_model.dec.resblocks.7.convs2.0.weight_g +vq_model.dec.resblocks.7.convs2.0.weight_v +vq_model.dec.resblocks.7.convs2.1.bias +vq_model.dec.resblocks.7.convs2.1.weight_g +vq_model.dec.resblocks.7.convs2.1.weight_v +vq_model.dec.resblocks.7.convs2.2.bias +vq_model.dec.resblocks.7.convs2.2.weight_g +vq_model.dec.resblocks.7.convs2.2.weight_v +vq_model.dec.resblocks.8.convs1.0.bias +vq_model.dec.resblocks.8.convs1.0.weight_g +vq_model.dec.resblocks.8.convs1.0.weight_v +vq_model.dec.resblocks.8.convs1.1.bias +vq_model.dec.resblocks.8.convs1.1.weight_g +vq_model.dec.resblocks.8.convs1.1.weight_v +vq_model.dec.resblocks.8.convs1.2.bias +vq_model.dec.resblocks.8.convs1.2.weight_g +vq_model.dec.resblocks.8.convs1.2.weight_v +vq_model.dec.resblocks.8.convs2.0.bias +vq_model.dec.resblocks.8.convs2.0.weight_g +vq_model.dec.resblocks.8.convs2.0.weight_v +vq_model.dec.resblocks.8.convs2.1.bias +vq_model.dec.resblocks.8.convs2.1.weight_g +vq_model.dec.resblocks.8.convs2.1.weight_v +vq_model.dec.resblocks.8.convs2.2.bias +vq_model.dec.resblocks.8.convs2.2.weight_g +vq_model.dec.resblocks.8.convs2.2.weight_v +vq_model.dec.resblocks.9.convs1.0.bias +vq_model.dec.resblocks.9.convs1.0.weight_g +vq_model.dec.resblocks.9.convs1.0.weight_v +vq_model.dec.resblocks.9.convs1.1.bias 
+vq_model.dec.resblocks.9.convs1.1.weight_g +vq_model.dec.resblocks.9.convs1.1.weight_v +vq_model.dec.resblocks.9.convs1.2.bias +vq_model.dec.resblocks.9.convs1.2.weight_g +vq_model.dec.resblocks.9.convs1.2.weight_v +vq_model.dec.resblocks.9.convs2.0.bias +vq_model.dec.resblocks.9.convs2.0.weight_g +vq_model.dec.resblocks.9.convs2.0.weight_v +vq_model.dec.resblocks.9.convs2.1.bias +vq_model.dec.resblocks.9.convs2.1.weight_g +vq_model.dec.resblocks.9.convs2.1.weight_v +vq_model.dec.resblocks.9.convs2.2.bias +vq_model.dec.resblocks.9.convs2.2.weight_g +vq_model.dec.resblocks.9.convs2.2.weight_v +vq_model.dec.resblocks.10.convs1.0.bias +vq_model.dec.resblocks.10.convs1.0.weight_g +vq_model.dec.resblocks.10.convs1.0.weight_v +vq_model.dec.resblocks.10.convs1.1.bias +vq_model.dec.resblocks.10.convs1.1.weight_g +vq_model.dec.resblocks.10.convs1.1.weight_v +vq_model.dec.resblocks.10.convs1.2.bias +vq_model.dec.resblocks.10.convs1.2.weight_g +vq_model.dec.resblocks.10.convs1.2.weight_v +vq_model.dec.resblocks.10.convs2.0.bias +vq_model.dec.resblocks.10.convs2.0.weight_g +vq_model.dec.resblocks.10.convs2.0.weight_v +vq_model.dec.resblocks.10.convs2.1.bias +vq_model.dec.resblocks.10.convs2.1.weight_g +vq_model.dec.resblocks.10.convs2.1.weight_v +vq_model.dec.resblocks.10.convs2.2.bias +vq_model.dec.resblocks.10.convs2.2.weight_g +vq_model.dec.resblocks.10.convs2.2.weight_v +vq_model.dec.resblocks.11.convs1.0.bias +vq_model.dec.resblocks.11.convs1.0.weight_g +vq_model.dec.resblocks.11.convs1.0.weight_v +vq_model.dec.resblocks.11.convs1.1.bias +vq_model.dec.resblocks.11.convs1.1.weight_g +vq_model.dec.resblocks.11.convs1.1.weight_v +vq_model.dec.resblocks.11.convs1.2.bias +vq_model.dec.resblocks.11.convs1.2.weight_g +vq_model.dec.resblocks.11.convs1.2.weight_v +vq_model.dec.resblocks.11.convs2.0.bias +vq_model.dec.resblocks.11.convs2.0.weight_g +vq_model.dec.resblocks.11.convs2.0.weight_v +vq_model.dec.resblocks.11.convs2.1.bias +vq_model.dec.resblocks.11.convs2.1.weight_g +vq_model.dec.resblocks.11.convs2.1.weight_v +vq_model.dec.resblocks.11.convs2.2.bias +vq_model.dec.resblocks.11.convs2.2.weight_g +vq_model.dec.resblocks.11.convs2.2.weight_v +vq_model.dec.resblocks.12.convs1.0.bias +vq_model.dec.resblocks.12.convs1.0.weight_g +vq_model.dec.resblocks.12.convs1.0.weight_v +vq_model.dec.resblocks.12.convs1.1.bias +vq_model.dec.resblocks.12.convs1.1.weight_g +vq_model.dec.resblocks.12.convs1.1.weight_v +vq_model.dec.resblocks.12.convs1.2.bias +vq_model.dec.resblocks.12.convs1.2.weight_g +vq_model.dec.resblocks.12.convs1.2.weight_v +vq_model.dec.resblocks.12.convs2.0.bias +vq_model.dec.resblocks.12.convs2.0.weight_g +vq_model.dec.resblocks.12.convs2.0.weight_v +vq_model.dec.resblocks.12.convs2.1.bias +vq_model.dec.resblocks.12.convs2.1.weight_g +vq_model.dec.resblocks.12.convs2.1.weight_v +vq_model.dec.resblocks.12.convs2.2.bias +vq_model.dec.resblocks.12.convs2.2.weight_g +vq_model.dec.resblocks.12.convs2.2.weight_v +vq_model.dec.resblocks.13.convs1.0.bias +vq_model.dec.resblocks.13.convs1.0.weight_g +vq_model.dec.resblocks.13.convs1.0.weight_v +vq_model.dec.resblocks.13.convs1.1.bias +vq_model.dec.resblocks.13.convs1.1.weight_g +vq_model.dec.resblocks.13.convs1.1.weight_v +vq_model.dec.resblocks.13.convs1.2.bias +vq_model.dec.resblocks.13.convs1.2.weight_g +vq_model.dec.resblocks.13.convs1.2.weight_v +vq_model.dec.resblocks.13.convs2.0.bias +vq_model.dec.resblocks.13.convs2.0.weight_g +vq_model.dec.resblocks.13.convs2.0.weight_v +vq_model.dec.resblocks.13.convs2.1.bias 
+vq_model.dec.resblocks.13.convs2.1.weight_g +vq_model.dec.resblocks.13.convs2.1.weight_v +vq_model.dec.resblocks.13.convs2.2.bias +vq_model.dec.resblocks.13.convs2.2.weight_g +vq_model.dec.resblocks.13.convs2.2.weight_v +vq_model.dec.resblocks.14.convs1.0.bias +vq_model.dec.resblocks.14.convs1.0.weight_g +vq_model.dec.resblocks.14.convs1.0.weight_v +vq_model.dec.resblocks.14.convs1.1.bias +vq_model.dec.resblocks.14.convs1.1.weight_g +vq_model.dec.resblocks.14.convs1.1.weight_v +vq_model.dec.resblocks.14.convs1.2.bias +vq_model.dec.resblocks.14.convs1.2.weight_g +vq_model.dec.resblocks.14.convs1.2.weight_v +vq_model.dec.resblocks.14.convs2.0.bias +vq_model.dec.resblocks.14.convs2.0.weight_g +vq_model.dec.resblocks.14.convs2.0.weight_v +vq_model.dec.resblocks.14.convs2.1.bias +vq_model.dec.resblocks.14.convs2.1.weight_g +vq_model.dec.resblocks.14.convs2.1.weight_v +vq_model.dec.resblocks.14.convs2.2.bias +vq_model.dec.resblocks.14.convs2.2.weight_g +vq_model.dec.resblocks.14.convs2.2.weight_v +vq_model.dec.conv_post.weight +vq_model.dec.cond.weight +vq_model.dec.cond.bias +vq_model.flow.flows.0.pre.weight +vq_model.flow.flows.0.pre.bias +vq_model.flow.flows.0.enc.in_layers.0.bias +vq_model.flow.flows.0.enc.in_layers.0.weight_g +vq_model.flow.flows.0.enc.in_layers.0.weight_v +vq_model.flow.flows.0.enc.in_layers.1.bias +vq_model.flow.flows.0.enc.in_layers.1.weight_g +vq_model.flow.flows.0.enc.in_layers.1.weight_v +vq_model.flow.flows.0.enc.in_layers.2.bias +vq_model.flow.flows.0.enc.in_layers.2.weight_g +vq_model.flow.flows.0.enc.in_layers.2.weight_v +vq_model.flow.flows.0.enc.in_layers.3.bias +vq_model.flow.flows.0.enc.in_layers.3.weight_g +vq_model.flow.flows.0.enc.in_layers.3.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.0.bias +vq_model.flow.flows.0.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.1.bias +vq_model.flow.flows.0.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.2.bias +vq_model.flow.flows.0.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.0.enc.res_skip_layers.3.bias +vq_model.flow.flows.0.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.0.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.0.enc.cond_layer.bias +vq_model.flow.flows.0.enc.cond_layer.weight_g +vq_model.flow.flows.0.enc.cond_layer.weight_v +vq_model.flow.flows.0.post.weight +vq_model.flow.flows.0.post.bias +vq_model.flow.flows.2.pre.weight +vq_model.flow.flows.2.pre.bias +vq_model.flow.flows.2.enc.in_layers.0.bias +vq_model.flow.flows.2.enc.in_layers.0.weight_g +vq_model.flow.flows.2.enc.in_layers.0.weight_v +vq_model.flow.flows.2.enc.in_layers.1.bias +vq_model.flow.flows.2.enc.in_layers.1.weight_g +vq_model.flow.flows.2.enc.in_layers.1.weight_v +vq_model.flow.flows.2.enc.in_layers.2.bias +vq_model.flow.flows.2.enc.in_layers.2.weight_g +vq_model.flow.flows.2.enc.in_layers.2.weight_v +vq_model.flow.flows.2.enc.in_layers.3.bias +vq_model.flow.flows.2.enc.in_layers.3.weight_g +vq_model.flow.flows.2.enc.in_layers.3.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.0.bias +vq_model.flow.flows.2.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.1.bias +vq_model.flow.flows.2.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.2.bias 
+vq_model.flow.flows.2.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.2.enc.res_skip_layers.3.bias +vq_model.flow.flows.2.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.2.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.2.enc.cond_layer.bias +vq_model.flow.flows.2.enc.cond_layer.weight_g +vq_model.flow.flows.2.enc.cond_layer.weight_v +vq_model.flow.flows.2.post.weight +vq_model.flow.flows.2.post.bias +vq_model.flow.flows.4.pre.weight +vq_model.flow.flows.4.pre.bias +vq_model.flow.flows.4.enc.in_layers.0.bias +vq_model.flow.flows.4.enc.in_layers.0.weight_g +vq_model.flow.flows.4.enc.in_layers.0.weight_v +vq_model.flow.flows.4.enc.in_layers.1.bias +vq_model.flow.flows.4.enc.in_layers.1.weight_g +vq_model.flow.flows.4.enc.in_layers.1.weight_v +vq_model.flow.flows.4.enc.in_layers.2.bias +vq_model.flow.flows.4.enc.in_layers.2.weight_g +vq_model.flow.flows.4.enc.in_layers.2.weight_v +vq_model.flow.flows.4.enc.in_layers.3.bias +vq_model.flow.flows.4.enc.in_layers.3.weight_g +vq_model.flow.flows.4.enc.in_layers.3.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.0.bias +vq_model.flow.flows.4.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.1.bias +vq_model.flow.flows.4.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.2.bias +vq_model.flow.flows.4.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.4.enc.res_skip_layers.3.bias +vq_model.flow.flows.4.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.4.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.4.enc.cond_layer.bias +vq_model.flow.flows.4.enc.cond_layer.weight_g +vq_model.flow.flows.4.enc.cond_layer.weight_v +vq_model.flow.flows.4.post.weight +vq_model.flow.flows.4.post.bias +vq_model.flow.flows.6.pre.weight +vq_model.flow.flows.6.pre.bias +vq_model.flow.flows.6.enc.in_layers.0.bias +vq_model.flow.flows.6.enc.in_layers.0.weight_g +vq_model.flow.flows.6.enc.in_layers.0.weight_v +vq_model.flow.flows.6.enc.in_layers.1.bias +vq_model.flow.flows.6.enc.in_layers.1.weight_g +vq_model.flow.flows.6.enc.in_layers.1.weight_v +vq_model.flow.flows.6.enc.in_layers.2.bias +vq_model.flow.flows.6.enc.in_layers.2.weight_g +vq_model.flow.flows.6.enc.in_layers.2.weight_v +vq_model.flow.flows.6.enc.in_layers.3.bias +vq_model.flow.flows.6.enc.in_layers.3.weight_g +vq_model.flow.flows.6.enc.in_layers.3.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.0.bias +vq_model.flow.flows.6.enc.res_skip_layers.0.weight_g +vq_model.flow.flows.6.enc.res_skip_layers.0.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.1.bias +vq_model.flow.flows.6.enc.res_skip_layers.1.weight_g +vq_model.flow.flows.6.enc.res_skip_layers.1.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.2.bias +vq_model.flow.flows.6.enc.res_skip_layers.2.weight_g +vq_model.flow.flows.6.enc.res_skip_layers.2.weight_v +vq_model.flow.flows.6.enc.res_skip_layers.3.bias +vq_model.flow.flows.6.enc.res_skip_layers.3.weight_g +vq_model.flow.flows.6.enc.res_skip_layers.3.weight_v +vq_model.flow.flows.6.enc.cond_layer.bias +vq_model.flow.flows.6.enc.cond_layer.weight_g +vq_model.flow.flows.6.enc.cond_layer.weight_v +vq_model.flow.flows.6.post.weight +vq_model.flow.flows.6.post.bias +vq_model.quantizer.vq.layers.0._codebook.embed
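Note on the parameter names in the listing above: the recurring `.bias` / `.weight_g` / `.weight_v` triples are the signature of PyTorch weight normalization, which reparameterizes a layer's `weight` into a magnitude `weight_g` and a direction `weight_v` (w = g * v / ||v||); the `gamma` / `beta` pairs belong to the model's layer-norm modules, and `_codebook.embed` is the vector-quantizer codebook. A minimal sketch of where the `weight_g` / `weight_v` names come from (the layer sizes here are illustrative, not the model's actual hyperparameters):

import torch.nn as nn
from torch.nn.utils import weight_norm

# weight_norm() removes the layer's `weight` parameter and registers two
# new parameters in its place: `weight_g` (magnitude) and `weight_v` (direction).
conv = weight_norm(nn.Conv1d(192, 192, kernel_size=5, padding=2))
print([name for name, _ in conv.named_parameters()])
# ['bias', 'weight_g', 'weight_v'] -- the same triple seen in entries
# such as vq_model.dec.ups.0.{bias, weight_g, weight_v}

diff --git a/genie_tts/G2P/Chinese/CorrectPronunciation.py b/genie_tts/G2P/Chinese/CorrectPronunciation.py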
index 33d2024fd9ccea293e44afad15a2819235848f44..804b934ebd49fa390cd93e40c67ae820d15a9c2c 100644 --- a/genie_tts/G2P/Chinese/CorrectPronunciation.py +++ b/genie_tts/G2P/Chinese/CorrectPronunciation.py @@ -1,50 +1,50 @@ -import os -import pickle -from typing import List, Dict, Any, Union - -from ...Core.Resources import Chinese_G2P_DIR - -# 常量定义 -DEFAULT_CACHE_PATH = os.path.join(Chinese_G2P_DIR, "polyphonic.pickle") - - -class PolyphonicDictManager: - _data: Dict[str, Any] = {} - - @classmethod - def get_data(cls, path: str = DEFAULT_CACHE_PATH) -> Dict[str, Any]: - if not cls._data: - with open(path, "rb") as f: - cls._data = pickle.load(f) - return cls._data - - -def correct_pronunciation(word: str, word_pinyin: List[str]) -> Union[List[str], str]: - """ - 根据加载的字典修正发音,作为供外部程序调用的独立接口。 - 逻辑:优先查找整词修正,如果没有整词匹配,则遍历每个字符进行单字修正。 - - Input: - word (str): 原始中文字符串,例如 "银行"。 - word_pinyins (List[str]): 当前预测的拼音列表,例如 ['yin2', 'xing2']。 - - Output: - Union[List[str], str]: 修正后的拼音列表或字符串。 - - Example: - # 字典包含整词 {'银行': ['yin2', 'hang2']} - result = correct_pronunciation("银行", ["yin2", "xing2"]) - # Result: ["yin2", "hang2"] - """ - pp_dict = PolyphonicDictManager.get_data() - new_word_pinyin = list(word_pinyin) - # 1. 尝试整词匹配 - if new_pinyin := pp_dict.get(word): - return new_pinyin - # 2. 逐字修正 - for idx, w in enumerate(word): - if idx >= len(new_word_pinyin): - break - if w_pinyin := pp_dict.get(w): - new_word_pinyin[idx] = w_pinyin[0] - return new_word_pinyin +import os +import pickle +from typing import List, Dict, Any, Union + +from ...Core.Resources import Chinese_G2P_DIR + +# 常量定义 +DEFAULT_CACHE_PATH = os.path.join(Chinese_G2P_DIR, "polyphonic.pickle") + + +class PolyphonicDictManager: + _data: Dict[str, Any] = {} + + @classmethod + def get_data(cls, path: str = DEFAULT_CACHE_PATH) -> Dict[str, Any]: + if not cls._data: + with open(path, "rb") as f: + cls._data = pickle.load(f) + return cls._data + + +def correct_pronunciation(word: str, word_pinyin: List[str]) -> Union[List[str], str]: + """ + 根据加载的字典修正发音,作为供外部程序调用的独立接口。 + 逻辑:优先查找整词修正,如果没有整词匹配,则遍历每个字符进行单字修正。 + + Input: + word (str): 原始中文字符串,例如 "银行"。 + word_pinyins (List[str]): 当前预测的拼音列表,例如 ['yin2', 'xing2']。 + + Output: + Union[List[str], str]: 修正后的拼音列表或字符串。 + + Example: + # 字典包含整词 {'银行': ['yin2', 'hang2']} + result = correct_pronunciation("银行", ["yin2", "xing2"]) + # Result: ["yin2", "hang2"] + """ + pp_dict = PolyphonicDictManager.get_data() + new_word_pinyin = list(word_pinyin) + # 1. 尝试整词匹配 + if new_pinyin := pp_dict.get(word): + return new_pinyin + # 2. 
逐字修正 + for idx, w in enumerate(word): + if idx >= len(new_word_pinyin): + break + if w_pinyin := pp_dict.get(w): + new_word_pinyin[idx] = w_pinyin[0] + return new_word_pinyin diff --git a/genie_tts/G2P/Chinese/Erhua.py b/genie_tts/G2P/Chinese/Erhua.py index 8ccc87b895563bc03e4943427219a2495094208b..8f165773fe4502163e624e73001a275813ff321e 100644 --- a/genie_tts/G2P/Chinese/Erhua.py +++ b/genie_tts/G2P/Chinese/Erhua.py @@ -1,49 +1,49 @@ -from typing import List, Tuple, Set - - -class ErhuaProcessor: - """ - 处理中文G2P中的儿化音逻辑。 - """ - - def __init__(self): - self.must_erhua: Set[str] = { - "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" - } - self.not_erhua: Set[str] = { - "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", - "妻儿", "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", - "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", - "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", - "猪儿", "猫儿", "狗儿", "少儿", - } - - def merge_erhua(self, initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]: - # 1. 修正 er1 发音为 er2 (当'儿'在词尾且发音为er1时) - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn == "er1": - finals[i] = "er2" - # 2. 检查是否跳过儿化处理 - if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): - return initials, finals - # 3. 长度校验 (处理如 "……" 等长度不一致的特殊符号情况) - if len(finals) != len(word): - return initials, finals - # 4. 执行儿化合并逻辑 (与前一个字发同音) - new_initials = [] - new_finals = [] - for i, phn in enumerate(finals): - # 判断是否需要合并儿化音 - # 条件: 是最后一个字 + 是"儿" + 发音是er2/er5 + 后两字不在非儿化表中 + 前面已有韵母 - if ( - i == len(finals) - 1 - and word[i] == "儿" - and phn in {"er2", "er5"} - and word[-2:] not in self.not_erhua - and new_finals - ): - # 将 'er' 加上前一个字的声调 - phn = "er" + new_finals[-1][-1] - new_initials.append(initials[i]) - new_finals.append(phn) - return new_initials, new_finals +from typing import List, Tuple, Set + + +class ErhuaProcessor: + """ + 处理中文G2P中的儿化音逻辑。 + """ + + def __init__(self): + self.must_erhua: Set[str] = { + "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" + } + self.not_erhua: Set[str] = { + "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", + "妻儿", "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", + "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", + "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", + "猪儿", "猫儿", "狗儿", "少儿", + } + + def merge_erhua(self, initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]: + # 1. 修正 er1 发音为 er2 (当'儿'在词尾且发音为er1时) + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn == "er1": + finals[i] = "er2" + # 2. 检查是否跳过儿化处理 + if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): + return initials, finals + # 3. 长度校验 (处理如 "……" 等长度不一致的特殊符号情况) + if len(finals) != len(word): + return initials, finals + # 4. 
执行儿化合并逻辑 (与前一个字发同音) + new_initials = [] + new_finals = [] + for i, phn in enumerate(finals): + # 判断是否需要合并儿化音 + # 条件: 是最后一个字 + 是"儿" + 发音是er2/er5 + 后两字不在非儿化表中 + 前面已有韵母 + if ( + i == len(finals) - 1 + and word[i] == "儿" + and phn in {"er2", "er5"} + and word[-2:] not in self.not_erhua + and new_finals + ): + # 将 'er' 加上前一个字的声调 + phn = "er" + new_finals[-1][-1] + new_initials.append(initials[i]) + new_finals.append(phn) + return new_initials, new_finals
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/__init__.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a28a5edbecd11f764acad8f84bfb1ae97dc05c7f Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/__init__.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/char_convert.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/char_convert.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c423046a6d37b42a65b88ceda68c3adb8c752a6 Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/char_convert.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/chronology.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/chronology.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ae25e3fcd6b370d61ac5da6a6b7761dcb7be676 Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/chronology.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/constants.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0827c54a2486be0ab595c072483466c64980ff8 Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/constants.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/num.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/num.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acab141bcac1d0b822551759c8e4b4f63da27c90 Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/num.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/phonecode.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/phonecode.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43e0328a0752c02a318f3809a8ab14caacd18805 Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/phonecode.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/quantifier.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/quantifier.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d7d85454ecd3203bbaa0fa238a3533e6c86c67c Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/quantifier.cpython-311.pyc differ
diff --git a/genie_tts/G2P/Chinese/Normalization/__pycache__/text_normlization.cpython-311.pyc b/genie_tts/G2P/Chinese/Normalization/__pycache__/text_normlization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ca61c154b193ec28deaaad1a00690e5efb27c25 Binary files /dev/null and b/genie_tts/G2P/Chinese/Normalization/__pycache__/text_normlization.cpython-311.pyc differ
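A brief usage sketch for the two G2P helpers above, correct_pronunciation and ErhuaProcessor.merge_erhua. Assumptions are flagged in the comments: the import paths mirror the file layout in this diff, the empty initial for "儿" is assumed from the module's convention of per-character initial/final lists, and the polyphonic dictionary is assumed to contain the 银行 entry that the docstring example references.

from genie_tts.G2P.Chinese.CorrectPronunciation import correct_pronunciation
from genie_tts.G2P.Chinese.Erhua import ErhuaProcessor

# Whole-word lookup wins; otherwise characters are corrected one by one.
# Assumes polyphonic.pickle contains {'银行': ['yin2', 'hang2']}, as in the docstring.
print(correct_pronunciation("银行", ["yin2", "xing2"]))  # ['yin2', 'hang2']

proc = ErhuaProcessor()
# "范儿" is in must_erhua, so the final "er2" is re-toned to copy the tone
# of the preceding final ("an4" -> "er4"). The empty initial for "儿" is an assumption.
initials, finals = proc.merge_erhua(["f", ""], ["an4", "er2"], "范儿", "n")
print(finals)  # ['an4', 'er4']
# Words in not_erhua (e.g. "女儿") are returned unchanged.

diff --git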
a/genie_tts/G2P/Chinese/ToneSandhi.py b/genie_tts/G2P/Chinese/ToneSandhi.py index e57622b21f3abbd13a3ada62c67c96e9aa67e57e..d9076e3c3f34121c04f42a3291084096a3e0b06b 100644 --- a/genie_tts/G2P/Chinese/ToneSandhi.py +++ b/genie_tts/G2P/Chinese/ToneSandhi.py @@ -1,354 +1,354 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -中文拼音变调(Tone Sandhi)自动处理器 -""" - -from typing import List -from typing import Tuple -import jieba_fast as jieba -from pypinyin import lazy_pinyin -from pypinyin import Style - - -class ToneSandhi: - def __init__(self): - self.must_neural_tone_words = { - "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", - "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", - "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", - "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", - "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳", - "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", - "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在", - "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同", - "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太", - "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神", - "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿", - "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", - "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", - "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", - "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", - "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", - "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头", - "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃", - "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友", - "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防", - "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举", - "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实", - "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", - "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", - "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", - "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", - "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆", - "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精", - "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么", - "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕", - "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚", - "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤", - "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹", - "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", - "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", - "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", - "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", - "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司", - "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", 
"咕噜", "邋遢", "费用", - "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾", - "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜", - "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记", - } - self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", - "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈", - "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳", - "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青", - } - self.punc = ":,;。?!“”‘’':,;.?!" - - # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 - # e.g. - # word: "家里" - # pos: "s" - # finals: ['ia1', 'i3'] - def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: - # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 - for j, item in enumerate(word): - if ( - j - 1 >= 0 - and item == word[j - 1] - and pos[0] in {"n", "v", "a"} - and word not in self.must_not_neural_tone_words - ): - finals[j] = finals[j][:-1] + "5" - ge_idx = word.find("个") - if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": - finals[-1] = finals[-1][:-1] + "5" - elif len(word) >= 1 and word[-1] in "的地得": - finals[-1] = finals[-1][:-1] + "5" - # e.g. 走了, 看着, 去过 - elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: - finals[-1] = finals[-1][:-1] + "5" - elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words: - finals[-1] = finals[-1][:-1] + "5" - # e.g. 桌上, 地下, 家里 - elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: - finals[-1] = finals[-1][:-1] + "5" - # e.g. 上来, 下去 - elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": - finals[-1] = finals[-1][:-1] + "5" - # 个做量词 - elif ( - ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") - ) or word == "个": - finals[ge_idx] = finals[ge_idx][:-1] + "5" - else: - if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: - finals[-1] = finals[-1][:-1] + "5" - - word_list = self._split_word(word) - finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]] - for i, word in enumerate(word_list): - # conventional neural in Chinese - if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: - finals_list[i][-1] = finals_list[i][-1][:-1] + "5" - finals = sum(finals_list, []) - return finals - - @staticmethod - def _bu_sandhi(word: str, finals: List[str]) -> List[str]: - # e.g. 看不懂 - if len(word) == 3 and word[1] == "不": - finals[1] = finals[1][:-1] + "5" - else: - for i, char in enumerate(word): - # "不" before tone4 should be bu2, e.g. 不怕 - if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": - finals[i] = finals[i][:-1] + "2" - return finals - - def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: - # "一" in number sequences, e.g. 一零零, 二一零 - if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]): - return finals - # "一" between reduplication words should be yi5, e.g. 看一看 - elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: - finals[1] = finals[1][:-1] + "5" - # when "一" is ordinal word, it should be yi1 - elif word.startswith("第一"): - finals[1] = finals[1][:-1] + "1" - else: - for i, char in enumerate(word): - if char == "一" and i + 1 < len(word): - # "一" before tone4 should be yi2, e.g. 一段 - if finals[i + 1][-1] == "4": - finals[i] = finals[i][:-1] + "2" - # "一" before non-tone4 should be yi4, e.g. 
一天 - else: - # "一" 后面如果是标点,还读一声 - if word[i + 1] not in self.punc: - finals[i] = finals[i][:-1] + "4" - return finals - - @staticmethod - def _split_word(word: str) -> List[str]: - word_list = jieba.cut_for_search(word) - word_list = sorted(word_list, key=lambda i: len(i), reverse=False) - first_subword = word_list[0] - first_begin_idx = word.find(first_subword) - if first_begin_idx == 0: - second_subword = word[len(first_subword):] - new_word_list = [first_subword, second_subword] - else: - second_subword = word[: -len(first_subword)] - new_word_list = [second_subword, first_subword] - return new_word_list - - def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: - if len(word) == 2 and self._all_tone_three(finals): - finals[0] = finals[0][:-1] + "2" - elif len(word) == 3: - word_list = self._split_word(word) - if self._all_tone_three(finals): - # disyllabic + monosyllabic, e.g. 蒙古/包 - if len(word_list[0]) == 2: - finals[0] = finals[0][:-1] + "2" - finals[1] = finals[1][:-1] + "2" - # monosyllabic + disyllabic, e.g. 纸/老虎 - elif len(word_list[0]) == 1: - finals[1] = finals[1][:-1] + "2" - else: - finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]] - if len(finals_list) == 2: - for i, sub in enumerate(finals_list): - # e.g. 所有/人 - if self._all_tone_three(sub) and len(sub) == 2: - finals_list[i][0] = finals_list[i][0][:-1] + "2" - # e.g. 好/喜欢 - elif ( - i == 1 - and not self._all_tone_three(sub) - and finals_list[i][0][-1] == "3" - and finals_list[0][-1][-1] == "3" - ): - finals_list[0][-1] = finals_list[0][-1][:-1] + "2" - finals = sum(finals_list, []) - # split idiom into two words whose length is 2 - elif len(word) == 4: - finals_list = [finals[:2], finals[2:]] - finals = [] - for sub in finals_list: - if self._all_tone_three(sub): - sub[0] = sub[0][:-1] + "2" - finals += sub - - return finals - - @staticmethod - def _all_tone_three(finals: List[str]) -> bool: - # 增加 len(x) > 0 的判断,防止空字符串导致崩溃 - return all(len(x) > 0 and x[-1] == "3" for x in finals) - - @staticmethod - def _merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] - last_word = "" - for word, pos in seg: - if last_word == "不": - word = last_word + word - if word != "不": - new_seg.append((word, pos)) - last_word = word[:] - if last_word == "不": - new_seg.append((last_word, "d")) - return new_seg - - @staticmethod - def _merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] - i = 0 - # function 1 - while i < len(seg): - word, pos = seg[i] - merged = False - if i - 1 >= 0 and word == "一" and i + 1 < len(seg): - last = new_seg[-1] if new_seg else seg[i - 1] - if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v": - combined = last[0] + "一" + seg[i + 1][0] - new_seg[-1] = [combined, last[1]] - i += 2 - merged = True - if not merged: - new_seg.append([word, pos]) - i += 1 - seg = new_seg - new_seg = [] - # function 2 - for word, pos in seg: - if new_seg and new_seg[-1][0] == "一": - new_seg[-1][0] = new_seg[-1][0] + word - else: - new_seg.append([word, pos]) - return new_seg - - # the first and the second words are all_tone_three - def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] - sub_finals_list = [ - lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg - ] - assert len(sub_finals_list) == len(seg) - merge_last = [False] * len(seg) - for i, (word, pos) in enumerate(seg): - if ( - i - 1 >= 0 - and 
self._all_tone_three(sub_finals_list[i - 1]) - and self._all_tone_three(sub_finals_list[i]) - and not merge_last[i - 1] - ): - # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: - new_seg[-1][0] = new_seg[-1][0] + seg[i][0] - merge_last[i] = True - else: - new_seg.append([word, pos]) - else: - new_seg.append([word, pos]) - - return new_seg - - @staticmethod - def _is_reduplication(word: str) -> bool: - return len(word) == 2 and word[0] == word[1] - - # the last char of first word and the first char of second word is tone_three - def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] - sub_finals_list = [ - lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg - ] - assert len(sub_finals_list) == len(seg) - merge_last = [False] * len(seg) - for i, (word, pos) in enumerate(seg): - if ( - i - 1 >= 0 - and sub_finals_list[i - 1][-1][-1] == "3" - and sub_finals_list[i][0][-1] == "3" - and not merge_last[i - 1] - ): - # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi - if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: - new_seg[-1][0] = new_seg[-1][0] + seg[i][0] - merge_last[i] = True - else: - new_seg.append([word, pos]) - else: - new_seg.append([word, pos]) - return new_seg - - @staticmethod - def _merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] - for i, (word, pos) in enumerate(seg): - if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": - new_seg[-1][0] = new_seg[-1][0] + seg[i][0] - else: - new_seg.append([word, pos]) - return new_seg - - @staticmethod - def _merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - new_seg = [] - for i, (word, pos) in enumerate(seg): - if new_seg and word == new_seg[-1][0]: - new_seg[-1][0] = new_seg[-1][0] + seg[i][0] - else: - new_seg.append([word, pos]) - return new_seg - - def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: - seg = self._merge_bu(seg) - seg = self._merge_yi(seg) - seg = self._merge_reduplication(seg) - seg = self._merge_continuous_three_tones(seg) - seg = self._merge_continuous_three_tones_2(seg) - seg = self._merge_er(seg) - return seg - - def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: - finals = self._bu_sandhi(word, finals) - finals = self._yi_sandhi(word, finals) - finals = self._neural_sandhi(word, pos, finals) - finals = self._three_sandhi(word, finals) - return finals +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
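For orientation, the class below can be exercised directly; a minimal sketch of the third-tone rule, assuming jieba_fast and pypinyin are installed and the package is importable as genie_tts:

    from pypinyin import lazy_pinyin, Style
    from genie_tts.G2P.Chinese.ToneSandhi import ToneSandhi

    sandhi = ToneSandhi()
    # "老虎" is two consecutive third tones; _three_sandhi should raise the first to second tone.
    finals = lazy_pinyin("老虎", neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    print(finals)                                     # ['ao3', 'u3']
    print(sandhi.modified_tone("老虎", "n", finals))  # expected: ['ao2', 'u3']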
+ +""" +中文拼音变调(Tone Sandhi)自动处理器 +""" + +from typing import List +from typing import Tuple +import jieba_fast as jieba +from pypinyin import lazy_pinyin +from pypinyin import Style + + +class ToneSandhi: + def __init__(self): + self.must_neural_tone_words = { + "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", + "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", + "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", + "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", + "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳", + "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", + "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在", + "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同", + "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太", + "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神", + "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿", + "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", + "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", + "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", + "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", + "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", + "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头", + "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃", + "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友", + "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防", + "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举", + "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实", + "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", + "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", + "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", + "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", + "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆", + "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精", + "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么", + "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕", + "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚", + "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤", + "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹", + "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", + "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", + "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", + "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", + "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司", + "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用", + "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾", + "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜", + "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记", + } + self.must_not_neural_tone_words = { + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", + "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈", + "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳", + "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青", + } + self.punc = ":,;。?!“”‘’':,;.?!" + + # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 + # e.g. + # word: "家里" + # pos: "s" + # finals: ['ia1', 'i3'] + def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: + # reduplication words for n. and v. e.g. 
奶奶, 试试, 旺旺 + for j, item in enumerate(word): + if ( + j - 1 >= 0 + and item == word[j - 1] + and pos[0] in {"n", "v", "a"} + and word not in self.must_not_neural_tone_words + ): + finals[j] = finals[j][:-1] + "5" + ge_idx = word.find("个") + if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": + finals[-1] = finals[-1][:-1] + "5" + elif len(word) >= 1 and word[-1] in "的地得": + finals[-1] = finals[-1][:-1] + "5" + # e.g. 走了, 看着, 去过 + elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: + finals[-1] = finals[-1][:-1] + "5" + elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words: + finals[-1] = finals[-1][:-1] + "5" + # e.g. 桌上, 地下, 家里 + elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: + finals[-1] = finals[-1][:-1] + "5" + # e.g. 上来, 下去 + elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": + finals[-1] = finals[-1][:-1] + "5" + # 个做量词 + elif ( + ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ) or word == "个": + finals[ge_idx] = finals[ge_idx][:-1] + "5" + else: + if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: + finals[-1] = finals[-1][:-1] + "5" + + word_list = self._split_word(word) + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]] + for i, word in enumerate(word_list): + # conventional neural in Chinese + if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words: + finals_list[i][-1] = finals_list[i][-1][:-1] + "5" + finals = sum(finals_list, []) + return finals + + @staticmethod + def _bu_sandhi(word: str, finals: List[str]) -> List[str]: + # e.g. 看不懂 + if len(word) == 3 and word[1] == "不": + finals[1] = finals[1][:-1] + "5" + else: + for i, char in enumerate(word): + # "不" before tone4 should be bu2, e.g. 不怕 + if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": + finals[i] = finals[i][:-1] + "2" + return finals + + def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: + # "一" in number sequences, e.g. 一零零, 二一零 + if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]): + return finals + # "一" between reduplication words should be yi5, e.g. 看一看 + elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: + finals[1] = finals[1][:-1] + "5" + # when "一" is ordinal word, it should be yi1 + elif word.startswith("第一"): + finals[1] = finals[1][:-1] + "1" + else: + for i, char in enumerate(word): + if char == "一" and i + 1 < len(word): + # "一" before tone4 should be yi2, e.g. 一段 + if finals[i + 1][-1] == "4": + finals[i] = finals[i][:-1] + "2" + # "一" before non-tone4 should be yi4, e.g. 
一天 + else: + # "一" 后面如果是标点,还读一声 + if word[i + 1] not in self.punc: + finals[i] = finals[i][:-1] + "4" + return finals + + @staticmethod + def _split_word(word: str) -> List[str]: + word_list = jieba.cut_for_search(word) + word_list = sorted(word_list, key=lambda i: len(i), reverse=False) + first_subword = word_list[0] + first_begin_idx = word.find(first_subword) + if first_begin_idx == 0: + second_subword = word[len(first_subword):] + new_word_list = [first_subword, second_subword] + else: + second_subword = word[: -len(first_subword)] + new_word_list = [second_subword, first_subword] + return new_word_list + + def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: + if len(word) == 2 and self._all_tone_three(finals): + finals[0] = finals[0][:-1] + "2" + elif len(word) == 3: + word_list = self._split_word(word) + if self._all_tone_three(finals): + # disyllabic + monosyllabic, e.g. 蒙古/包 + if len(word_list[0]) == 2: + finals[0] = finals[0][:-1] + "2" + finals[1] = finals[1][:-1] + "2" + # monosyllabic + disyllabic, e.g. 纸/老虎 + elif len(word_list[0]) == 1: + finals[1] = finals[1][:-1] + "2" + else: + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]):]] + if len(finals_list) == 2: + for i, sub in enumerate(finals_list): + # e.g. 所有/人 + if self._all_tone_three(sub) and len(sub) == 2: + finals_list[i][0] = finals_list[i][0][:-1] + "2" + # e.g. 好/喜欢 + elif ( + i == 1 + and not self._all_tone_three(sub) + and finals_list[i][0][-1] == "3" + and finals_list[0][-1][-1] == "3" + ): + finals_list[0][-1] = finals_list[0][-1][:-1] + "2" + finals = sum(finals_list, []) + # split idiom into two words whose length is 2 + elif len(word) == 4: + finals_list = [finals[:2], finals[2:]] + finals = [] + for sub in finals_list: + if self._all_tone_three(sub): + sub[0] = sub[0][:-1] + "2" + finals += sub + + return finals + + @staticmethod + def _all_tone_three(finals: List[str]) -> bool: + # 增加 len(x) > 0 的判断,防止空字符串导致崩溃 + return all(len(x) > 0 and x[-1] == "3" for x in finals) + + @staticmethod + def _merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + last_word = "" + for word, pos in seg: + if last_word == "不": + word = last_word + word + if word != "不": + new_seg.append((word, pos)) + last_word = word[:] + if last_word == "不": + new_seg.append((last_word, "d")) + return new_seg + + @staticmethod + def _merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + i = 0 + # function 1 + while i < len(seg): + word, pos = seg[i] + merged = False + if i - 1 >= 0 and word == "一" and i + 1 < len(seg): + last = new_seg[-1] if new_seg else seg[i - 1] + if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v": + combined = last[0] + "一" + seg[i + 1][0] + new_seg[-1] = [combined, last[1]] + i += 2 + merged = True + if not merged: + new_seg.append([word, pos]) + i += 1 + seg = new_seg + new_seg = [] + # function 2 + for word, pos in seg: + if new_seg and new_seg[-1][0] == "一": + new_seg[-1][0] = new_seg[-1][0] + word + else: + new_seg.append([word, pos]) + return new_seg + + # the first and the second words are all_tone_three + def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + sub_finals_list = [ + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg + ] + assert len(sub_finals_list) == len(seg) + merge_last = [False] * len(seg) + for i, (word, pos) in enumerate(seg): + if ( + i - 1 >= 0 + and 
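+                # merge candidates: both the previous word and the current word consist entirely of third tones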
self._all_tone_three(sub_finals_list[i - 1]) + and self._all_tone_three(sub_finals_list[i]) + and not merge_last[i - 1] + ): + # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi + if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + merge_last[i] = True + else: + new_seg.append([word, pos]) + else: + new_seg.append([word, pos]) + + return new_seg + + @staticmethod + def _is_reduplication(word: str) -> bool: + return len(word) == 2 and word[0] == word[1] + + # the last char of first word and the first char of second word is tone_three + def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + sub_finals_list = [ + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg + ] + assert len(sub_finals_list) == len(seg) + merge_last = [False] * len(seg) + for i, (word, pos) in enumerate(seg): + if ( + i - 1 >= 0 + and sub_finals_list[i - 1][-1][-1] == "3" + and sub_finals_list[i][0][-1] == "3" + and not merge_last[i - 1] + ): + # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi + if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + merge_last[i] = True + else: + new_seg.append([word, pos]) + else: + new_seg.append([word, pos]) + return new_seg + + @staticmethod + def _merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + for i, (word, pos) in enumerate(seg): + if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + else: + new_seg.append([word, pos]) + return new_seg + + @staticmethod + def _merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + for i, (word, pos) in enumerate(seg): + if new_seg and word == new_seg[-1][0]: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + else: + new_seg.append([word, pos]) + return new_seg + + def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + seg = self._merge_bu(seg) + seg = self._merge_yi(seg) + seg = self._merge_reduplication(seg) + seg = self._merge_continuous_three_tones(seg) + seg = self._merge_continuous_three_tones_2(seg) + seg = self._merge_er(seg) + return seg + + def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: + finals = self._bu_sandhi(word, finals) + finals = self._yi_sandhi(word, finals) + finals = self._neural_sandhi(word, pos, finals) + finals = self._three_sandhi(word, finals) + return finals diff --git a/genie_tts/G2P/Chinese/__pycache__/ChineseG2P.cpython-311.pyc b/genie_tts/G2P/Chinese/__pycache__/ChineseG2P.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72ae0b8bdccc6a776abb8bbc5a0992f67269e729 Binary files /dev/null and b/genie_tts/G2P/Chinese/__pycache__/ChineseG2P.cpython-311.pyc differ diff --git a/genie_tts/G2P/Chinese/__pycache__/CorrectPronunciation.cpython-311.pyc b/genie_tts/G2P/Chinese/__pycache__/CorrectPronunciation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..894d795c181a32e670c41be3d08625ab0e1ba005 Binary files /dev/null and b/genie_tts/G2P/Chinese/__pycache__/CorrectPronunciation.cpython-311.pyc differ diff --git a/genie_tts/G2P/Chinese/__pycache__/Erhua.cpython-311.pyc b/genie_tts/G2P/Chinese/__pycache__/Erhua.cpython-311.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..3e57ad3d12df8ae6f6876a65123b45632c030bf5 Binary files /dev/null and b/genie_tts/G2P/Chinese/__pycache__/Erhua.cpython-311.pyc differ diff --git a/genie_tts/G2P/Chinese/__pycache__/ToneSandhi.cpython-311.pyc b/genie_tts/G2P/Chinese/__pycache__/ToneSandhi.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb79eca42d06b99312b2d5afafb5e6ddbdef240d Binary files /dev/null and b/genie_tts/G2P/Chinese/__pycache__/ToneSandhi.cpython-311.pyc differ diff --git a/genie_tts/G2P/Chinese/__pycache__/__init__.cpython-311.pyc b/genie_tts/G2P/Chinese/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffc32ed82ba66bf328b6478f2ec1459f10530179 Binary files /dev/null and b/genie_tts/G2P/Chinese/__pycache__/__init__.cpython-311.pyc differ diff --git a/genie_tts/G2P/English/EnglishG2P.py b/genie_tts/G2P/English/EnglishG2P.py index 5325385ce889018d128f9fec298de7836a0c5aeb..cad9edffdece9abce37005bc94480e8d2e2514ff 100644 --- a/genie_tts/G2P/English/EnglishG2P.py +++ b/genie_tts/G2P/English/EnglishG2P.py @@ -1,296 +1,296 @@ -import pickle -import os -import re -from typing import List, Dict, Tuple - -import numpy as np -import nltk -from nltk.tokenize import TweetTokenizer -from nltk import pos_tag - -from .Normalization import normalize -from .WordSegment import segment_text -from ..SymbolsV2 import symbols_v2, symbol_to_id_v2 -from ..SymbolsV2 import PUNCTUATION -from ...Core.Resources import English_G2P_DIR - -# nltk 路径和分词器初始化 -nltk.data.path.append(English_G2P_DIR) -word_tokenize = TweetTokenizer().tokenize - -# 路径定义 -CMU_DICT_PATH = os.path.join(English_G2P_DIR, "cmudict.rep") -CMU_DICT_FAST_PATH = os.path.join(English_G2P_DIR, "cmudict-fast.rep") -CMU_DICT_HOT_PATH = os.path.join(English_G2P_DIR, "engdict-hot.rep") -CACHE_PATH = os.path.join(English_G2P_DIR, "engdict_cache.pickle") -NAMECACHE_PATH = os.path.join(English_G2P_DIR, "namedict_cache.pickle") -MODEL_PATH = os.path.join(English_G2P_DIR, "checkpoint20.npz") - -# 正则表达式和映射 -REP_MAP = { - "[;::,;]": ",", - '["’]': "'", - "。": ".", - "!": "!", - "?": "?", -} -REP_MAP_PATTERN = re.compile("|".join(re.escape(p) for p in REP_MAP.keys())) -PUNCTUATIONS_FOR_REGEX = "".join(re.escape(p) for p in PUNCTUATION) -CONSECUTIVE_PUNCTUATION_PATTERN = re.compile(rf"([{PUNCTUATIONS_FOR_REGEX}\s])([{PUNCTUATIONS_FOR_REGEX}])+") - - -# 辅助函数 -def _read_cmu_dict(file_path: str) -> Dict[str, List[str]]: - g2p_dict = {} - with open(file_path, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if not line or line.startswith(';;;'): continue - parts = re.split(r'\s+', line, maxsplit=1) - if len(parts) < 2: continue - word, pron_str = parts[0].lower(), parts[1] - pron = pron_str.split(" ") - word = re.sub(r'\(\d+\)$', '', word) - if word not in g2p_dict: g2p_dict[word] = [pron] - return g2p_dict - - -def _load_and_cache_dict() -> Dict[str, List[List[str]]]: - with open(CACHE_PATH, "rb") as f: - g2p_dict = pickle.load(f) - hot_dict = _read_cmu_dict(CMU_DICT_HOT_PATH) - if hot_dict: g2p_dict.update(hot_dict) - return g2p_dict - - -def replace_phs(phs: List[str]) -> List[str]: - rep_map = {"'": "-"} - phs_new = [] - for ph in phs: - if ph in symbols_v2: - phs_new.append(ph) - elif ph in rep_map: - phs_new.append(rep_map[ph]) - return phs_new - - -def replace_consecutive_punctuation(text: str) -> str: - return CONSECUTIVE_PUNCTUATION_PATTERN.sub(r"\1", text) - - -def text_normalize(text: str) -> str: - text = 
REP_MAP_PATTERN.sub(lambda x: REP_MAP[x.group()], text) - text = normalize(text) - text = replace_consecutive_punctuation(text) - return text - - -class CleanG2p: - """ - 一个集成了神经网络预测功能的、独立的英文G2P转换器。 - - 不再依赖 g2p_en 库,将模型推理逻辑直接内置。 - - 依赖 numpy 库进行计算。 - """ - - def __init__(self): - # 1. 初始化标准组件 - self.cmu = _load_and_cache_dict() - self.namedict = self._load_name_dict() - for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]: - self.cmu.pop(word.lower(), None) - self._setup_homographs() - - # 2. 初始化神经网络模型组件 - self._setup_nn_components() - self._load_nn_model() - - def _setup_nn_components(self): - """设置 G2P 神经网络所需的字母和音素表。""" - self.graphemes = ["", "", ""] + list("abcdefghijklmnopqrstuvwxyz") - self.phonemes = ["", "", "", ""] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', - 'AH2', 'AO0', - 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', - 'B', 'CH', 'D', 'DH', - 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', - 'EY2', 'F', 'G', 'HH', - 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', - 'M', 'N', 'NG', 'OW0', 'OW1', - 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', - 'UH0', 'UH1', 'UH2', 'UW', - 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'] - self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)} - self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)} - self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)} - self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)} - - def _load_nn_model(self): - """从 .npz 文件加载预训练的神经网络权重。""" - if not os.path.exists(MODEL_PATH): - raise FileNotFoundError(f"G2P model file not found at: {MODEL_PATH}. " - f"Please ensure 'checkpoint20.npz' is in the correct directory.") - - variables = np.load(MODEL_PATH) - self.enc_emb = variables["enc_emb"] - self.enc_w_ih = variables["enc_w_ih"] - self.enc_w_hh = variables["enc_w_hh"] - self.enc_b_ih = variables["enc_b_ih"] - self.enc_b_hh = variables["enc_b_hh"] - self.dec_emb = variables["dec_emb"] - self.dec_w_ih = variables["dec_w_ih"] - self.dec_w_hh = variables["dec_w_hh"] - self.dec_b_ih = variables["dec_b_ih"] - self.dec_b_hh = variables["dec_b_hh"] - self.fc_w = variables["fc_w"] - self.fc_b = variables["fc_b"] - # logger.info("G2P neural network model loaded successfully.") - - @staticmethod - def _sigmoid(x): - return 1 / (1 + np.exp(-x)) - - def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh): - rzn_ih = np.matmul(x, w_ih.T) + b_ih - rzn_hh = np.matmul(h, w_hh.T) + b_hh - rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:] - rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:] - rz = self._sigmoid(rz_ih + rz_hh) - r, z = np.split(rz, 2, -1) - n = np.tanh(n_ih + r * n_hh) - h = (1 - z) * n + z * h - return h - - def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None): - if h0 is None: - h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32) - h = h0 - outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32) - for t in range(steps): - h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) - outputs[:, t, ::] = h - return outputs - - def _encode(self, word: str) -> np.ndarray: - chars = list(word.lower()) + [""] - x = [self.g2idx.get(char, self.g2idx[""]) for char in chars] - x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0) - return x - - def predict(self, word: str) -> List[str]: - """使用内置的神经网络模型预测单词的发音。""" - # Encoder - enc = self._encode(word) - enc = self._gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh, - self.enc_b_ih, self.enc_b_hh, 
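-                        # encoder pass: run the character embeddings through a single-layer GRU from a zero initial hidden state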
h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32)) - last_hidden = enc[:, -1, :] - - # Decoder - dec = np.take(self.dec_emb, [self.p2idx[""]], axis=0) # Start with - h = last_hidden - preds = [] - for _ in range(20): # Max steps - h = self._grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) - logits = np.matmul(h, self.fc_w.T) + self.fc_b - pred_idx = logits.argmax() - if pred_idx == self.p2idx[""]: break - preds.append(pred_idx) - dec = np.take(self.dec_emb, [pred_idx], axis=0) - - return [self.idx2p.get(idx, "") for idx in preds] - - # --- 标准 G2P 逻辑 --- - - @staticmethod - def _load_name_dict() -> Dict[str, List[List[str]]]: - if os.path.exists(NAMECACHE_PATH): - with open(NAMECACHE_PATH, "rb") as f: return pickle.load(f) - return {} - - def _setup_homographs(self): - self.homograph2features: Dict[str, Tuple[List[str], List[str], str]] = { - "read": (["R", "EH1", "D"], ["R", "IY1", "D"], "VBD"), - "complex": (["K", "AH0", "M", "P", "L", "EH1", "K", "S"], ["K", "AA1", "M", "P", "L", "EH0", "K", "S"], - "JJ"), - "lead": (["L", "IY1", "D"], ["L", "EH1", "D"], "NN"), - "presents": (["P", "R", "IY0", "Z", "EH1", "N", "T", "S"], ["P", "R", "EH1", "Z", "AH0", "N", "T", "S"], - "VBZ"), - } - - def __call__(self, text: str) -> List[str]: - original_words = word_tokenize(text) - normalized_text = text_normalize(text) - normalized_words = word_tokenize(normalized_text) - - corrected_words = [] - original_idx, normalized_idx = 0, 0 - while original_idx < len(original_words) and normalized_idx < len(normalized_words): - if original_words[original_idx] == "I" and \ - " ".join(normalized_words[normalized_idx:normalized_idx + 2]) == "the first": - corrected_words.append("I") - original_idx += 1 - normalized_idx += 2 - else: - corrected_words.append(normalized_words[normalized_idx]) - original_idx += 1 - normalized_idx += 1 - if normalized_idx < len(normalized_words): - corrected_words.extend(normalized_words[normalized_idx:]) - - if not corrected_words: return [] - - tokens = pos_tag(corrected_words) - prons = [] - for o_word, pos in tokens: - word = o_word.lower() - if re.search("[a-z]", word) is None: - pron = [word] - elif word in self.homograph2features: - pron1, pron2, pos1 = self.homograph2features[word] - pron = pron1 if pos.startswith(pos1) else pron2 - else: - pron = self._query_word(o_word) - prons.extend(pron) - prons.extend([" "]) - return prons[:-1] if prons else [] - - def _query_word(self, o_word: str) -> List[str]: - word = o_word.lower() - if word in self.cmu: - if o_word == "A": return ["AH0"] - return self.cmu[word][0] - if o_word.istitle() and word in self.namedict: - return self.namedict[word][0] - if word.endswith("'s") and len(word) > 2: - base_pron = self._query_word(word[:-2]) - if base_pron: - last_ph = base_pron[-1] - if last_ph in {"S", "Z", "SH", "ZH", "CH", "JH"}: return base_pron + ["AH0", "Z"] - if last_ph in {"P", "T", "K", "F", "TH"}: return base_pron + ["S"] - return base_pron + ["Z"] - if "-" in word and len(word) > 1: - parts = [p for p in word.split("-") if p] - if len(parts) > 1: - result = [ph for part in parts for ph in self._query_word(part)] - if result: return result - segments = segment_text(word) - if len(segments) > 1 and "".join(segments) == word: - result = [ph for segment in segments for ph in self._query_word(segment)] - if result: return result - - return self.predict(o_word) - - -_g2p_instance: CleanG2p = CleanG2p() - - -def g2p(text: str) -> List[str]: - if _g2p_instance is None: raise RuntimeError("G2P model is not 
available.") - raw_phonemes = _g2p_instance(text) - undesired = {" ", "", "UW", "", ""} - phones = ["UNK" if ph == "" else ph for ph in raw_phonemes if ph not in undesired] - return replace_phs(phones) - - -def english_to_phones(text: str) -> List[int]: - phones = g2p(text) - phones = [symbol_to_id_v2[ph] for ph in phones] - return phones +import pickle +import os +import re +from typing import List, Dict, Tuple + +import numpy as np +import nltk +from nltk.tokenize import TweetTokenizer +from nltk import pos_tag + +from .Normalization import normalize +from .WordSegment import segment_text +from ..SymbolsV2 import symbols_v2, symbol_to_id_v2 +from ..SymbolsV2 import PUNCTUATION +from ...Core.Resources import English_G2P_DIR + +# nltk 路径和分词器初始化 +nltk.data.path.append(English_G2P_DIR) +word_tokenize = TweetTokenizer().tokenize + +# 路径定义 +CMU_DICT_PATH = os.path.join(English_G2P_DIR, "cmudict.rep") +CMU_DICT_FAST_PATH = os.path.join(English_G2P_DIR, "cmudict-fast.rep") +CMU_DICT_HOT_PATH = os.path.join(English_G2P_DIR, "engdict-hot.rep") +CACHE_PATH = os.path.join(English_G2P_DIR, "engdict_cache.pickle") +NAMECACHE_PATH = os.path.join(English_G2P_DIR, "namedict_cache.pickle") +MODEL_PATH = os.path.join(English_G2P_DIR, "checkpoint20.npz") + +# 正则表达式和映射 +REP_MAP = { + "[;::,;]": ",", + '["’]': "'", + "。": ".", + "!": "!", + "?": "?", +} +REP_MAP_PATTERN = re.compile("|".join(re.escape(p) for p in REP_MAP.keys())) +PUNCTUATIONS_FOR_REGEX = "".join(re.escape(p) for p in PUNCTUATION) +CONSECUTIVE_PUNCTUATION_PATTERN = re.compile(rf"([{PUNCTUATIONS_FOR_REGEX}\s])([{PUNCTUATIONS_FOR_REGEX}])+") + + +# 辅助函数 +def _read_cmu_dict(file_path: str) -> Dict[str, List[str]]: + g2p_dict = {} + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if not line or line.startswith(';;;'): continue + parts = re.split(r'\s+', line, maxsplit=1) + if len(parts) < 2: continue + word, pron_str = parts[0].lower(), parts[1] + pron = pron_str.split(" ") + word = re.sub(r'\(\d+\)$', '', word) + if word not in g2p_dict: g2p_dict[word] = [pron] + return g2p_dict + + +def _load_and_cache_dict() -> Dict[str, List[List[str]]]: + with open(CACHE_PATH, "rb") as f: + g2p_dict = pickle.load(f) + hot_dict = _read_cmu_dict(CMU_DICT_HOT_PATH) + if hot_dict: g2p_dict.update(hot_dict) + return g2p_dict + + +def replace_phs(phs: List[str]) -> List[str]: + rep_map = {"'": "-"} + phs_new = [] + for ph in phs: + if ph in symbols_v2: + phs_new.append(ph) + elif ph in rep_map: + phs_new.append(rep_map[ph]) + return phs_new + + +def replace_consecutive_punctuation(text: str) -> str: + return CONSECUTIVE_PUNCTUATION_PATTERN.sub(r"\1", text) + + +def text_normalize(text: str) -> str: + text = REP_MAP_PATTERN.sub(lambda x: REP_MAP[x.group()], text) + text = normalize(text) + text = replace_consecutive_punctuation(text) + return text + + +class CleanG2p: + """ + 一个集成了神经网络预测功能的、独立的英文G2P转换器。 + - 不再依赖 g2p_en 库,将模型推理逻辑直接内置。 + - 依赖 numpy 库进行计算。 + """ + + def __init__(self): + # 1. 初始化标准组件 + self.cmu = _load_and_cache_dict() + self.namedict = self._load_name_dict() + for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]: + self.cmu.pop(word.lower(), None) + self._setup_homographs() + + # 2. 
初始化神经网络模型组件 + self._setup_nn_components() + self._load_nn_model() + + def _setup_nn_components(self): + """设置 G2P 神经网络所需的字母和音素表。""" + self.graphemes = ["", "", ""] + list("abcdefghijklmnopqrstuvwxyz") + self.phonemes = ["", "", "", ""] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', + 'AH2', 'AO0', + 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', + 'B', 'CH', 'D', 'DH', + 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', + 'EY2', 'F', 'G', 'HH', + 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', + 'M', 'N', 'NG', 'OW0', 'OW1', + 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', + 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'] + self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)} + self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)} + self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)} + self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)} + + def _load_nn_model(self): + """从 .npz 文件加载预训练的神经网络权重。""" + if not os.path.exists(MODEL_PATH): + raise FileNotFoundError(f"G2P model file not found at: {MODEL_PATH}. " + f"Please ensure 'checkpoint20.npz' is in the correct directory.") + + variables = np.load(MODEL_PATH) + self.enc_emb = variables["enc_emb"] + self.enc_w_ih = variables["enc_w_ih"] + self.enc_w_hh = variables["enc_w_hh"] + self.enc_b_ih = variables["enc_b_ih"] + self.enc_b_hh = variables["enc_b_hh"] + self.dec_emb = variables["dec_emb"] + self.dec_w_ih = variables["dec_w_ih"] + self.dec_w_hh = variables["dec_w_hh"] + self.dec_b_ih = variables["dec_b_ih"] + self.dec_b_hh = variables["dec_b_hh"] + self.fc_w = variables["fc_w"] + self.fc_b = variables["fc_b"] + # logger.info("G2P neural network model loaded successfully.") + + @staticmethod + def _sigmoid(x): + return 1 / (1 + np.exp(-x)) + + def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh): + rzn_ih = np.matmul(x, w_ih.T) + b_ih + rzn_hh = np.matmul(h, w_hh.T) + b_hh + rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:] + rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:] + rz = self._sigmoid(rz_ih + rz_hh) + r, z = np.split(rz, 2, -1) + n = np.tanh(n_ih + r * n_hh) + h = (1 - z) * n + z * h + return h + + def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None): + if h0 is None: + h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32) + h = h0 + outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32) + for t in range(steps): + h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) + outputs[:, t, ::] = h + return outputs + + def _encode(self, word: str) -> np.ndarray: + chars = list(word.lower()) + [""] + x = [self.g2idx.get(char, self.g2idx[""]) for char in chars] + x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0) + return x + + def predict(self, word: str) -> List[str]: + """使用内置的神经网络模型预测单词的发音。""" + # Encoder + enc = self._encode(word) + enc = self._gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh, + self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32)) + last_hidden = enc[:, -1, :] + + # Decoder + dec = np.take(self.dec_emb, [self.p2idx[""]], axis=0) # Start with + h = last_hidden + preds = [] + for _ in range(20): # Max steps + h = self._grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) + logits = np.matmul(h, self.fc_w.T) + self.fc_b + pred_idx = logits.argmax() + if pred_idx == self.p2idx[""]: break + preds.append(pred_idx) + dec = np.take(self.dec_emb, [pred_idx], 
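+                       # greedy decoding: embed the predicted phoneme as the next decoder input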
axis=0) + + return [self.idx2p.get(idx, "") for idx in preds] + + # --- 标准 G2P 逻辑 --- + + @staticmethod + def _load_name_dict() -> Dict[str, List[List[str]]]: + if os.path.exists(NAMECACHE_PATH): + with open(NAMECACHE_PATH, "rb") as f: return pickle.load(f) + return {} + + def _setup_homographs(self): + self.homograph2features: Dict[str, Tuple[List[str], List[str], str]] = { + "read": (["R", "EH1", "D"], ["R", "IY1", "D"], "VBD"), + "complex": (["K", "AH0", "M", "P", "L", "EH1", "K", "S"], ["K", "AA1", "M", "P", "L", "EH0", "K", "S"], + "JJ"), + "lead": (["L", "IY1", "D"], ["L", "EH1", "D"], "NN"), + "presents": (["P", "R", "IY0", "Z", "EH1", "N", "T", "S"], ["P", "R", "EH1", "Z", "AH0", "N", "T", "S"], + "VBZ"), + } + + def __call__(self, text: str) -> List[str]: + original_words = word_tokenize(text) + normalized_text = text_normalize(text) + normalized_words = word_tokenize(normalized_text) + + corrected_words = [] + original_idx, normalized_idx = 0, 0 + while original_idx < len(original_words) and normalized_idx < len(normalized_words): + if original_words[original_idx] == "I" and \ + " ".join(normalized_words[normalized_idx:normalized_idx + 2]) == "the first": + corrected_words.append("I") + original_idx += 1 + normalized_idx += 2 + else: + corrected_words.append(normalized_words[normalized_idx]) + original_idx += 1 + normalized_idx += 1 + if normalized_idx < len(normalized_words): + corrected_words.extend(normalized_words[normalized_idx:]) + + if not corrected_words: return [] + + tokens = pos_tag(corrected_words) + prons = [] + for o_word, pos in tokens: + word = o_word.lower() + if re.search("[a-z]", word) is None: + pron = [word] + elif word in self.homograph2features: + pron1, pron2, pos1 = self.homograph2features[word] + pron = pron1 if pos.startswith(pos1) else pron2 + else: + pron = self._query_word(o_word) + prons.extend(pron) + prons.extend([" "]) + return prons[:-1] if prons else [] + + def _query_word(self, o_word: str) -> List[str]: + word = o_word.lower() + if word in self.cmu: + if o_word == "A": return ["AH0"] + return self.cmu[word][0] + if o_word.istitle() and word in self.namedict: + return self.namedict[word][0] + if word.endswith("'s") and len(word) > 2: + base_pron = self._query_word(word[:-2]) + if base_pron: + last_ph = base_pron[-1] + if last_ph in {"S", "Z", "SH", "ZH", "CH", "JH"}: return base_pron + ["AH0", "Z"] + if last_ph in {"P", "T", "K", "F", "TH"}: return base_pron + ["S"] + return base_pron + ["Z"] + if "-" in word and len(word) > 1: + parts = [p for p in word.split("-") if p] + if len(parts) > 1: + result = [ph for part in parts for ph in self._query_word(part)] + if result: return result + segments = segment_text(word) + if len(segments) > 1 and "".join(segments) == word: + result = [ph for segment in segments for ph in self._query_word(segment)] + if result: return result + + return self.predict(o_word) + + +_g2p_instance: CleanG2p = CleanG2p() + + +def g2p(text: str) -> List[str]: + if _g2p_instance is None: raise RuntimeError("G2P model is not available.") + raw_phonemes = _g2p_instance(text) + undesired = {" ", "", "UW", "", ""} + phones = ["UNK" if ph == "" else ph for ph in raw_phonemes if ph not in undesired] + return replace_phs(phones) + + +def english_to_phones(text: str) -> List[int]: + phones = g2p(text) + phones = [symbol_to_id_v2[ph] for ph in phones] + return phones diff --git a/genie_tts/G2P/English/Normalization.py b/genie_tts/G2P/English/Normalization.py index 
ce5046d73caeaf8750e67a6088012d13d777737a..513a1ec770ad18ffb38479e9ee564acc32c5fed8 100644 --- a/genie_tts/G2P/English/Normalization.py +++ b/genie_tts/G2P/English/Normalization.py @@ -1,286 +1,286 @@ -import re -import unicodedata -from calendar import month_name - - -# ------------------- 核心:自实现数字转单词 (替代 inflect) ------------------- - -def _number_to_words_custom(num_str): - """一个不依赖inflect的、简化的数字到单词转换器。""" - num_str = str(num_str).strip() - if not num_str.isdigit(): return num_str - - num = int(num_str) - if num == 0: return 'zero' - - units = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] - teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", - "nineteen"] - tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] - thousands = ["", "thousand", "million", "billion", "trillion"] - - def convert_less_than_thousand(n): - if n == 0: return "" - if n < 10: return units[n] - if n < 20: return teens[n - 10] - if n < 100: return tens[n // 10] + (" " + units[n % 10] if n % 10 != 0 else "") - return units[n // 100] + " hundred" + (" " + convert_less_than_thousand(n % 100) if n % 100 != 0 else "") - - words = [] - i = 0 - if num == 0: return "zero" - while num > 0: - if num % 1000 != 0: - words.insert(0, convert_less_than_thousand(num % 1000) + " " + thousands[i]) - num //= 1000 - i += 1 - return " ".join(words).strip() - - -def _ordinal_custom(num_str): - """一个不依赖inflect的、简化的序数词转换器。""" - num = int(num_str) - if 10 <= num % 100 <= 20: - suffix = 'th' - else: - suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(num % 10, 'th') - return _number_to_words_custom(str(num)) + suffix - - -# ------------------- 初始化和常量定义 (无 inflect) ------------------- - -_measurement_map = { - "km/h": ["kilometer per hour", "kilometers per hour"], "mph": ["mile per hour", "miles per hour"], - "°C": ["degree celsius", "degrees celsius"], "°F": ["degree fahrenheit", "degrees fahrenheit"], - "tbsp": ["tablespoon", "tablespoons"], "tsp": ["teaspoon", "teaspoons"], - "km": ["kilometer", "kilometers"], "kg": ["kilogram", "kilograms"], "min": ["minute", "minutes"], - "ft": ["foot", "feet"], "cm": ["centimeter", "centimeters"], "m": ["meter", "meters"], - "L": ["liter", "liters"], "h": ["hour", "hours"], "s": ["second", "seconds"], -} - -_abbreviations = [ - (re.compile(r"\bMr\.(?=[\s,.]|\Z)", re.IGNORECASE), "Mister"), - (re.compile(r"\bMrs\.(?=[\s,.]|\Z)", re.IGNORECASE), "Missus"), - (re.compile(r"\bDr\.(?=[\s,.]|\Z)", re.IGNORECASE), "Doctor"), - (re.compile(r"\bProf\.(?=[\s,.]|\Z)", re.IGNORECASE), "Professor"), - (re.compile(r"\bSt\.(?=[\s,.]|\Z)", re.IGNORECASE), "Street"), - (re.compile(r"\bCo\.(?=[\s,.]|\Z)", re.IGNORECASE), "Company"), - (re.compile(r"\bLtd\.(?=[\s,.]|\Z)", re.IGNORECASE), "Limited"), - (re.compile(r"\be\.g\.(?=[\s,.]|\Z)", re.IGNORECASE), "for example"), - (re.compile(r"\bi\.e\.(?=[\s,.]|\Z)", re.IGNORECASE), "that is"), -] - -# ------------------- 正则表达式定义 (与原来保持一致) ------------------- -_currency_suffix_re = re.compile(r"([£$€])([\d,.]*\d)\s*(million|billion|thousand)\b", re.IGNORECASE) -_phone_re = re.compile(r"(\+?\d{1,3}-)?\b(\d{3})-(?:(\d{3})-)?(\d{4})\b") -_roman_re = re.compile(r"\b(XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II)\b", re.IGNORECASE) -_decade_re = re.compile(r"\b((?:1[89]|20)\d0)s\b") -_score_re = re.compile(r"\b(\d{1,2})-(\d{1,2})\b") -_dimension_re = re.compile(r"\b(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)(?:\s*x\s*(\d+(?:\.\d+)?))?\b") -_alphanumeric_re 
= re.compile(r"\b([a-zA-Z]+[0-9]+|[0-9]+[a-zA-Z]+)\b") -_date_re = re.compile(r"\b(0?[1-9]|1[0-2])/([0-2]?\d|3[01])/(\d{2,4})\b") -_ordinal_number_re = re.compile(r"\b(\d+)\. ") -_comma_number_re = re.compile(r"(\d[\d,]+\d)") -_currency_re = re.compile(r"([£$€])(\d*\.?\d+)|(\d*\.?\d+)\s*([£$€])") -_time_re = re.compile(r"\b([01]?\d|2[0-3]):([0-5]\d)(?::([0-5]\d))?(\s*(?:a\.?m\.?|p\.?m\.?))?\b", re.IGNORECASE) -units = "|".join(re.escape(key) for key in sorted(_measurement_map.keys(), key=len, reverse=True)) -_measurement_re = re.compile(rf"(? 0: m_word = f" oh {_number_to_words_custom(m)}" if m < 10 else f" {_number_to_words_custom(m)}" - result = f"{h_word}{m_word}".lstrip() - if s_str: result += f" and {_number_to_words_custom(int(s_str))} seconds" - if am_pm: result += ' pm' if 'p' in am_pm.lower() else ' am' - return result - - -def _expand_measurement(m): - num_str, unit = m.groups() - is_neg = num_str.startswith('-') - if is_neg: num_str = num_str[1:] - if '/' in num_str: - num_word = _expand_fraction(re.match(_fraction_re, num_str)) - is_plural = True - else: - num_word = _number_to_words_custom(num_str) - is_plural = float(num_str) != 1 - unit_word = _measurement_map[unit][1] if is_plural else _measurement_map[unit][0] - result = f"{num_word} {unit_word}" - return f"minus {result}" if is_neg else result - - -def _expand_currency(m): - symbol, amount_str = (m.group(1), m.group(2)) if m.group(1) else (m.group(4), m.group(3)) - amount_str = (amount_str or "").replace(",", "") - if amount_str.startswith('.'): amount_str = '0' + amount_str - major_map = {"$": ("dollar", "dollars"), "£": ("pound", "pounds"), "€": ("euro", "euros")} - minor_map = {"$": ("cent", "cents"), "£": ("penny", "pence"), "€": ("cent", "cents")} - major_singular, major_plural = major_map.get(symbol, ("", "")) - parts = amount_str.split('.') - major_val = int(parts[0]) if parts[0] else 0 - minor_val = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 - result = [] - if major_val > 0: - result.append(f"{_number_to_words_custom(major_val)} {major_singular if major_val == 1 else major_plural}") - if minor_val > 0: - minor_singular, minor_plural = minor_map.get(symbol, ("", "")) - result.append(f"{_number_to_words_custom(minor_val)} {minor_singular if minor_val == 1 else minor_plural}") - return " and ".join(result) or f"zero {major_plural}" - - -def _expand_decimal_number(m): - num_str = m.group(1) - parts = num_str.split('.') - integer_part = _number_to_words_custom(parts[0]) - fractional_part = ' '.join(_number_to_words_custom(digit) for digit in parts[1]) - return f"{integer_part} point {fractional_part}" - - -def _expand_date(m): - month, day, year = m.groups() - month_word = month_name[int(month)] - day_word = _ordinal_custom(day) - year_num = int(year) - if len(year) == 2: year_num += 2000 if year_num < 50 else 1900 - return f"{month_word} {day_word}, {_expand_number_positive(str(year_num))}" - - -def _expand_fraction(m): - n, d = int(m.group(1)), int(m.group(2)) - if d == 0: return m.group(0) - common_fractions = {(1, 2): "one half", (1, 4): "one quarter", (3, 4): "three quarters"} - if (n, d) in common_fractions: return common_fractions[(n, d)] - return f"{_number_to_words_custom(n)} over {_number_to_words_custom(d)}" - - -def _expand_ordinal_word(m): - return _ordinal_custom(m.group(0)[:-2]) - - -def _expand_number(m): - num_str = m.group(0) - if num_str.startswith('-'): return f"minus {_expand_number_positive(num_str[1:])}" - return _expand_number_positive(num_str) - - -def 
_expand_number_positive(num_str): - num = int(num_str) - if 2000 <= num < 2010: return f"two thousand and {_number_to_words_custom(num % 100)}" - if 1100 <= num < 2100 and num % 100 != 0: - return f"{_number_to_words_custom(num // 100)} {_number_to_words_custom(num % 100)}" - return _number_to_words_custom(num_str) - - -def _expand_acronym(m): return " ".join(m.group(0)) - - -def normalize(text): - text = "".join(char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn") - text = re.sub(r"@", " at ", text) - for regex, replacement in _abbreviations: text = regex.sub(replacement, text) - text = re.sub(_currency_suffix_re, _expand_currency_suffix, text) - text = re.sub(_phone_re, _expand_phone_number, text) - text = re.sub(_dimension_re, _expand_dimension, text) - text = re.sub(_roman_re, _expand_roman, text) - text = re.sub(_decade_re, _expand_decade, text) - text = re.sub(_score_re, _expand_score, text) - text = re.sub(_date_re, _expand_date, text) - text = re.sub(_time_re, _expand_time, text) - text = re.sub(_ordinal_number_re, _convert_ordinal, text) - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_currency_re, _expand_currency, text) - text = re.sub(_measurement_re, _expand_measurement, text) - text = re.sub(_fraction_re, _expand_fraction, text) - text = re.sub(_decimal_number_re, _expand_decimal_number, text) - text = re.sub(_ordinal_re, _expand_ordinal_word, text) - text = re.sub(_alphanumeric_re, _expand_alphanumeric, text) - text = re.sub(_acronym_re, _expand_acronym, text) - text = re.sub(_number_re, _expand_number, text) - text = text.lower() - text = re.sub(r"%", " percent", text) - domain_re = re.compile(r'\b([a-z0-9-]+)\.([a-z]{2,})\b') - while domain_re.search(text): text = domain_re.sub(r'\1 dot \2', text) - text = re.sub(r"[^a-z0-9'.,?!:;-]", " ", text) - text = re.sub(r"\s+", " ", text) - return text.strip() +import re +import unicodedata +from calendar import month_name + + +# ------------------- 核心:自实现数字转单词 (替代 inflect) ------------------- + +def _number_to_words_custom(num_str): + """一个不依赖inflect的、简化的数字到单词转换器。""" + num_str = str(num_str).strip() + if not num_str.isdigit(): return num_str + + num = int(num_str) + if num == 0: return 'zero' + + units = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] + teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", + "nineteen"] + tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] + thousands = ["", "thousand", "million", "billion", "trillion"] + + def convert_less_than_thousand(n): + if n == 0: return "" + if n < 10: return units[n] + if n < 20: return teens[n - 10] + if n < 100: return tens[n // 10] + (" " + units[n % 10] if n % 10 != 0 else "") + return units[n // 100] + " hundred" + (" " + convert_less_than_thousand(n % 100) if n % 100 != 0 else "") + + words = [] + i = 0 + if num == 0: return "zero" + while num > 0: + if num % 1000 != 0: + words.insert(0, convert_less_than_thousand(num % 1000) + " " + thousands[i]) + num //= 1000 + i += 1 + return " ".join(words).strip() + + +def _ordinal_custom(num_str): + """一个不依赖inflect的、简化的序数词转换器。""" + num = int(num_str) + if 10 <= num % 100 <= 20: + suffix = 'th' + else: + suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(num % 10, 'th') + return _number_to_words_custom(str(num)) + suffix + + +# ------------------- 初始化和常量定义 (无 inflect) ------------------- + +_measurement_map = { + "km/h": ["kilometer per hour", 
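+    # each unit symbol maps to its [singular, plural] spoken forms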
"kilometers per hour"], "mph": ["mile per hour", "miles per hour"], + "°C": ["degree celsius", "degrees celsius"], "°F": ["degree fahrenheit", "degrees fahrenheit"], + "tbsp": ["tablespoon", "tablespoons"], "tsp": ["teaspoon", "teaspoons"], + "km": ["kilometer", "kilometers"], "kg": ["kilogram", "kilograms"], "min": ["minute", "minutes"], + "ft": ["foot", "feet"], "cm": ["centimeter", "centimeters"], "m": ["meter", "meters"], + "L": ["liter", "liters"], "h": ["hour", "hours"], "s": ["second", "seconds"], +} + +_abbreviations = [ + (re.compile(r"\bMr\.(?=[\s,.]|\Z)", re.IGNORECASE), "Mister"), + (re.compile(r"\bMrs\.(?=[\s,.]|\Z)", re.IGNORECASE), "Missus"), + (re.compile(r"\bDr\.(?=[\s,.]|\Z)", re.IGNORECASE), "Doctor"), + (re.compile(r"\bProf\.(?=[\s,.]|\Z)", re.IGNORECASE), "Professor"), + (re.compile(r"\bSt\.(?=[\s,.]|\Z)", re.IGNORECASE), "Street"), + (re.compile(r"\bCo\.(?=[\s,.]|\Z)", re.IGNORECASE), "Company"), + (re.compile(r"\bLtd\.(?=[\s,.]|\Z)", re.IGNORECASE), "Limited"), + (re.compile(r"\be\.g\.(?=[\s,.]|\Z)", re.IGNORECASE), "for example"), + (re.compile(r"\bi\.e\.(?=[\s,.]|\Z)", re.IGNORECASE), "that is"), +] + +# ------------------- 正则表达式定义 (与原来保持一致) ------------------- +_currency_suffix_re = re.compile(r"([£$€])([\d,.]*\d)\s*(million|billion|thousand)\b", re.IGNORECASE) +_phone_re = re.compile(r"(\+?\d{1,3}-)?\b(\d{3})-(?:(\d{3})-)?(\d{4})\b") +_roman_re = re.compile(r"\b(XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV|III|II)\b", re.IGNORECASE) +_decade_re = re.compile(r"\b((?:1[89]|20)\d0)s\b") +_score_re = re.compile(r"\b(\d{1,2})-(\d{1,2})\b") +_dimension_re = re.compile(r"\b(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)(?:\s*x\s*(\d+(?:\.\d+)?))?\b") +_alphanumeric_re = re.compile(r"\b([a-zA-Z]+[0-9]+|[0-9]+[a-zA-Z]+)\b") +_date_re = re.compile(r"\b(0?[1-9]|1[0-2])/([0-2]?\d|3[01])/(\d{2,4})\b") +_ordinal_number_re = re.compile(r"\b(\d+)\. ") +_comma_number_re = re.compile(r"(\d[\d,]+\d)") +_currency_re = re.compile(r"([£$€])(\d*\.?\d+)|(\d*\.?\d+)\s*([£$€])") +_time_re = re.compile(r"\b([01]?\d|2[0-3]):([0-5]\d)(?::([0-5]\d))?(\s*(?:a\.?m\.?|p\.?m\.?))?\b", re.IGNORECASE) +units = "|".join(re.escape(key) for key in sorted(_measurement_map.keys(), key=len, reverse=True)) +_measurement_re = re.compile(rf"(? 
0: m_word = f" oh {_number_to_words_custom(m)}" if m < 10 else f" {_number_to_words_custom(m)}" + result = f"{h_word}{m_word}".lstrip() + if s_str: result += f" and {_number_to_words_custom(int(s_str))} seconds" + if am_pm: result += ' pm' if 'p' in am_pm.lower() else ' am' + return result + + +def _expand_measurement(m): + num_str, unit = m.groups() + is_neg = num_str.startswith('-') + if is_neg: num_str = num_str[1:] + if '/' in num_str: + num_word = _expand_fraction(re.match(_fraction_re, num_str)) + is_plural = True + else: + num_word = _number_to_words_custom(num_str) + is_plural = float(num_str) != 1 + unit_word = _measurement_map[unit][1] if is_plural else _measurement_map[unit][0] + result = f"{num_word} {unit_word}" + return f"minus {result}" if is_neg else result + + +def _expand_currency(m): + symbol, amount_str = (m.group(1), m.group(2)) if m.group(1) else (m.group(4), m.group(3)) + amount_str = (amount_str or "").replace(",", "") + if amount_str.startswith('.'): amount_str = '0' + amount_str + major_map = {"$": ("dollar", "dollars"), "£": ("pound", "pounds"), "€": ("euro", "euros")} + minor_map = {"$": ("cent", "cents"), "£": ("penny", "pence"), "€": ("cent", "cents")} + major_singular, major_plural = major_map.get(symbol, ("", "")) + parts = amount_str.split('.') + major_val = int(parts[0]) if parts[0] else 0 + minor_val = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + result = [] + if major_val > 0: + result.append(f"{_number_to_words_custom(major_val)} {major_singular if major_val == 1 else major_plural}") + if minor_val > 0: + minor_singular, minor_plural = minor_map.get(symbol, ("", "")) + result.append(f"{_number_to_words_custom(minor_val)} {minor_singular if minor_val == 1 else minor_plural}") + return " and ".join(result) or f"zero {major_plural}" + + +def _expand_decimal_number(m): + num_str = m.group(1) + parts = num_str.split('.') + integer_part = _number_to_words_custom(parts[0]) + fractional_part = ' '.join(_number_to_words_custom(digit) for digit in parts[1]) + return f"{integer_part} point {fractional_part}" + + +def _expand_date(m): + month, day, year = m.groups() + month_word = month_name[int(month)] + day_word = _ordinal_custom(day) + year_num = int(year) + if len(year) == 2: year_num += 2000 if year_num < 50 else 1900 + return f"{month_word} {day_word}, {_expand_number_positive(str(year_num))}" + + +def _expand_fraction(m): + n, d = int(m.group(1)), int(m.group(2)) + if d == 0: return m.group(0) + common_fractions = {(1, 2): "one half", (1, 4): "one quarter", (3, 4): "three quarters"} + if (n, d) in common_fractions: return common_fractions[(n, d)] + return f"{_number_to_words_custom(n)} over {_number_to_words_custom(d)}" + + +def _expand_ordinal_word(m): + return _ordinal_custom(m.group(0)[:-2]) + + +def _expand_number(m): + num_str = m.group(0) + if num_str.startswith('-'): return f"minus {_expand_number_positive(num_str[1:])}" + return _expand_number_positive(num_str) + + +def _expand_number_positive(num_str): + num = int(num_str) + if 2000 <= num < 2010: return f"two thousand and {_number_to_words_custom(num % 100)}" + if 1100 <= num < 2100 and num % 100 != 0: + return f"{_number_to_words_custom(num // 100)} {_number_to_words_custom(num % 100)}" + return _number_to_words_custom(num_str) + + +def _expand_acronym(m): return " ".join(m.group(0)) + + +def normalize(text): + text = "".join(char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn") + text = re.sub(r"@", " at ", text) + for regex, replacement 
in _abbreviations: text = regex.sub(replacement, text) + text = re.sub(_currency_suffix_re, _expand_currency_suffix, text) + text = re.sub(_phone_re, _expand_phone_number, text) + text = re.sub(_dimension_re, _expand_dimension, text) + text = re.sub(_roman_re, _expand_roman, text) + text = re.sub(_decade_re, _expand_decade, text) + text = re.sub(_score_re, _expand_score, text) + text = re.sub(_date_re, _expand_date, text) + text = re.sub(_time_re, _expand_time, text) + text = re.sub(_ordinal_number_re, _convert_ordinal, text) + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_currency_re, _expand_currency, text) + text = re.sub(_measurement_re, _expand_measurement, text) + text = re.sub(_fraction_re, _expand_fraction, text) + text = re.sub(_decimal_number_re, _expand_decimal_number, text) + text = re.sub(_ordinal_re, _expand_ordinal_word, text) + text = re.sub(_alphanumeric_re, _expand_alphanumeric, text) + text = re.sub(_acronym_re, _expand_acronym, text) + text = re.sub(_number_re, _expand_number, text) + text = text.lower() + text = re.sub(r"%", " percent", text) + domain_re = re.compile(r'\b([a-z0-9-]+)\.([a-z]{2,})\b') + while domain_re.search(text): text = domain_re.sub(r'\1 dot \2', text) + text = re.sub(r"[^a-z0-9'.,?!:;-]", " ", text) + text = re.sub(r"\s+", " ", text) + return text.strip() diff --git a/genie_tts/G2P/English/WordSegment.py b/genie_tts/G2P/English/WordSegment.py index 0359d9ca60ad76fbecc4ad2ba92d635942c6b959..26539258da8ddcae171dc0993cd13afa53136684 100644 --- a/genie_tts/G2P/English/WordSegment.py +++ b/genie_tts/G2P/English/WordSegment.py @@ -1,143 +1,143 @@ -import io -import math -import os -from typing import List, Iterator, Tuple, Dict - -from ...Core.Resources import English_G2P_DIR - - -class WordSegmenter: - """ - Contains the core logic for word segmentation, adapted from the original library. - """ - ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789') - TOTAL = 1024908267229.0 - LIMIT = 24 - - def __init__(self): - self.unigrams: Dict[str, float] = {} - self.bigrams: Dict[str, float] = {} - self.words: List[str] = [] - self.total: float = 0.0 - - def load(self, data_directory: str): - """ - Load unigram, bigram, and word counts from the specified data directory. - This is the main modification from the original library. - """ - unigrams_path = os.path.join(data_directory, 'unigrams.txt') - bigrams_path = os.path.join(data_directory, 'bigrams.txt') - words_path = os.path.join(data_directory, 'words.txt') - - for file_path in [unigrams_path, bigrams_path, words_path]: - if not os.path.exists(file_path): - raise FileNotFoundError( - f"Word segmentation data file not found: {file_path}. " - "Please ensure the data directory is correct." - ) - - self.unigrams.update(self._parse(unigrams_path)) - self.bigrams.update(self._parse(bigrams_path)) - with io.open(words_path, encoding='utf-8') as reader: - self.words.extend(reader.read().splitlines()) - - self.total = self.TOTAL - - @staticmethod - def _parse(filename: str) -> Dict[str, float]: - """Read `filename` and parse tab-separated file of word and count pairs.""" - with io.open(filename, encoding='utf-8') as reader: - # CORRECTED LINE: The generator now defines `line` before the comprehension uses it. 
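For context, `_parse` consumes plain "word<TAB>count" rows. A minimal, self-contained sketch of that format with toy data (the counts are made up; the real unigrams.txt/bigrams.txt ship with the package under English_G2P_DIR/wordsegment):

import io

# Two fake rows in the same tab-separated layout _parse expects.
sample = "the\t23135851162\nof\t13151942776\n"
with io.StringIO(sample) as reader:
    lines = (line.split('\t') for line in reader)
    counts = {word: float(number) for word, number in lines if len(word) > 0 and len(number) > 0}
print(counts)  # {'the': 23135851162.0, 'of': 13151942776.0}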
- lines = (line.split('\t') for line in reader) - return {word: float(number) for word, number in lines if len(word) > 0 and len(number) > 0} - - def score(self, word: str, previous: str = None) -> float: - """Score `word` in the context of `previous` word.""" - if previous is None: - if word in self.unigrams: - return self.unigrams[word] / self.total - return 10.0 / (self.total * 10 ** len(word)) - - bigram = f'{previous} {word}' - if bigram in self.bigrams and previous in self.unigrams: - return self.bigrams[bigram] / self.total / self.score(previous) - - return self.score(word) - - def isegment(self, text: str) -> Iterator[str]: - """Return iterator of words that is the best segmenation of `text`.""" - memo = {} - - def search(text: str, previous: str = '') -> Tuple[float, List[str]]: - if text == '': - return 0.0, [] - - def candidates() -> Iterator[Tuple[float, List[str]]]: - for prefix, suffix in self._divide(text): - prefix_score = math.log10(self.score(prefix, previous)) - - pair = (suffix, prefix) - if pair not in memo: - memo[pair] = search(suffix, prefix) - suffix_score, suffix_words = memo[pair] - - yield prefix_score + suffix_score, [prefix] + suffix_words - - return max(candidates()) - - clean_text = self._clean(text) - - # Original logic to avoid recursion limits by chunking - size = 250 - prefix = '' - if len(clean_text) > size: - for offset in range(0, len(clean_text), size): - chunk = clean_text[offset:(offset + size)] - _, chunk_words = search(prefix + chunk) - - if len(chunk_words) > 5: - prefix = ''.join(chunk_words[-5:]) - del chunk_words[-5:] - else: # handle case where chunk is small - prefix = ''.join(chunk_words) - chunk_words = [] - - for word in chunk_words: - yield word - - _, prefix_words = search(prefix) - for word in prefix_words: - yield word - else: - _, words = search(clean_text) - for word in words: - yield word - - def segment(self, text: str) -> List[str]: - """Return list of words that is the best segmenation of `text`.""" - return list(self.isegment(text)) - - def _divide(self, text: str) -> Iterator[Tuple[str, str]]: - """Yield `(prefix, suffix)` pairs from `text`.""" - for pos in range(1, min(len(text), self.LIMIT) + 1): - yield text[:pos], text[pos:] - - @classmethod - def _clean(cls, text: str) -> str: - """Return `text` lower-cased with non-alphanumeric characters removed.""" - text_lower = text.lower() - return ''.join(letter for letter in text_lower if letter in cls.ALPHABET) - - -# --- Public Interface --- -# Create a single instance to be used by the importing module. - -_segmenter = WordSegmenter() -_segmenter.load(os.path.join(English_G2P_DIR, 'wordsegment')) - - -def segment_text(text: str) -> List[str]: - """ - Public function to segment a text string into a list of words. - """ - return _segmenter.segment(text) +import io +import math +import os +from typing import List, Iterator, Tuple, Dict + +from ...Core.Resources import English_G2P_DIR + + +class WordSegmenter: + """ + Contains the core logic for word segmentation, adapted from the original library. + """ + ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789') + TOTAL = 1024908267229.0 + LIMIT = 24 + + def __init__(self): + self.unigrams: Dict[str, float] = {} + self.bigrams: Dict[str, float] = {} + self.words: List[str] = [] + self.total: float = 0.0 + + def load(self, data_directory: str): + """ + Load unigram, bigram, and word counts from the specified data directory. + This is the main modification from the original library. 
+ """ + unigrams_path = os.path.join(data_directory, 'unigrams.txt') + bigrams_path = os.path.join(data_directory, 'bigrams.txt') + words_path = os.path.join(data_directory, 'words.txt') + + for file_path in [unigrams_path, bigrams_path, words_path]: + if not os.path.exists(file_path): + raise FileNotFoundError( + f"Word segmentation data file not found: {file_path}. " + "Please ensure the data directory is correct." + ) + + self.unigrams.update(self._parse(unigrams_path)) + self.bigrams.update(self._parse(bigrams_path)) + with io.open(words_path, encoding='utf-8') as reader: + self.words.extend(reader.read().splitlines()) + + self.total = self.TOTAL + + @staticmethod + def _parse(filename: str) -> Dict[str, float]: + """Read `filename` and parse tab-separated file of word and count pairs.""" + with io.open(filename, encoding='utf-8') as reader: + # CORRECTED LINE: The generator now defines `line` before the comprehension uses it. + lines = (line.split('\t') for line in reader) + return {word: float(number) for word, number in lines if len(word) > 0 and len(number) > 0} + + def score(self, word: str, previous: str = None) -> float: + """Score `word` in the context of `previous` word.""" + if previous is None: + if word in self.unigrams: + return self.unigrams[word] / self.total + return 10.0 / (self.total * 10 ** len(word)) + + bigram = f'{previous} {word}' + if bigram in self.bigrams and previous in self.unigrams: + return self.bigrams[bigram] / self.total / self.score(previous) + + return self.score(word) + + def isegment(self, text: str) -> Iterator[str]: + """Return iterator of words that is the best segmenation of `text`.""" + memo = {} + + def search(text: str, previous: str = '') -> Tuple[float, List[str]]: + if text == '': + return 0.0, [] + + def candidates() -> Iterator[Tuple[float, List[str]]]: + for prefix, suffix in self._divide(text): + prefix_score = math.log10(self.score(prefix, previous)) + + pair = (suffix, prefix) + if pair not in memo: + memo[pair] = search(suffix, prefix) + suffix_score, suffix_words = memo[pair] + + yield prefix_score + suffix_score, [prefix] + suffix_words + + return max(candidates()) + + clean_text = self._clean(text) + + # Original logic to avoid recursion limits by chunking + size = 250 + prefix = '' + if len(clean_text) > size: + for offset in range(0, len(clean_text), size): + chunk = clean_text[offset:(offset + size)] + _, chunk_words = search(prefix + chunk) + + if len(chunk_words) > 5: + prefix = ''.join(chunk_words[-5:]) + del chunk_words[-5:] + else: # handle case where chunk is small + prefix = ''.join(chunk_words) + chunk_words = [] + + for word in chunk_words: + yield word + + _, prefix_words = search(prefix) + for word in prefix_words: + yield word + else: + _, words = search(clean_text) + for word in words: + yield word + + def segment(self, text: str) -> List[str]: + """Return list of words that is the best segmenation of `text`.""" + return list(self.isegment(text)) + + def _divide(self, text: str) -> Iterator[Tuple[str, str]]: + """Yield `(prefix, suffix)` pairs from `text`.""" + for pos in range(1, min(len(text), self.LIMIT) + 1): + yield text[:pos], text[pos:] + + @classmethod + def _clean(cls, text: str) -> str: + """Return `text` lower-cased with non-alphanumeric characters removed.""" + text_lower = text.lower() + return ''.join(letter for letter in text_lower if letter in cls.ALPHABET) + + +# --- Public Interface --- +# Create a single instance to be used by the importing module. 
+ +_segmenter = WordSegmenter() +_segmenter.load(os.path.join(English_G2P_DIR, 'wordsegment')) + + +def segment_text(text: str) -> List[str]: + """ + Public function to segment a text string into a list of words. + """ + return _segmenter.segment(text) diff --git a/genie_tts/G2P/Japanese/JapaneseG2P.py b/genie_tts/G2P/Japanese/JapaneseG2P.py index 94362fc4b62601530de7b08699e43e91eaa4e591..194d9b459666595b2b609668a5cf50f0fe3f7719 100644 --- a/genie_tts/G2P/Japanese/JapaneseG2P.py +++ b/genie_tts/G2P/Japanese/JapaneseG2P.py @@ -1,150 +1,150 @@ -# -*- coding: utf-8 -*- -""" -用于纯日语的 G2P。 -""" -import re -import pyopenjtalk -from typing import List -from ..SymbolsV2 import symbols_v2, symbol_to_id_v2 - -# 匹配连续的标点符号 -_CONSECUTIVE_PUNCTUATION_RE = re.compile(r"([,./?!~…・])\1+") - -# 匹配需要转换为日语读法的特殊符号 -_SYMBOLS_TO_JAPANESE = [ - (re.compile("%"), "パーセント"), - (re.compile("%"), "パーセント"), -] - -# 匹配日语字符(汉字、假名、全角字母数字等) -_JAPANESE_CHARACTERS_RE = re.compile( - r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" -) - -# 匹配非日语字符(标点、空格等) -_JAPANESE_MARKS_RE = re.compile( - r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" -) - - -class JapaneseG2P: - """ - 一个简化的、封装好的日语Grapheme-to-Phoneme(字素到音素)转换器。 - - 本版本假设 pyopenjtalk 库已安装,并且不使用任何用户自定义词典。 - 它专注于提供一个纯粹、高效的文本到音素转换接口。 - """ - - @staticmethod - def _text_normalize(text: str) -> str: - """对输入文本进行基础的规范化处理。""" - for regex, replacement in _SYMBOLS_TO_JAPANESE: - text = re.sub(regex, replacement, text) - text = _CONSECUTIVE_PUNCTUATION_RE.sub(r"\1", text) - text = text.lower() - return text - - @staticmethod - def _post_replace_phoneme(ph: str) -> str: - """对单个音素或标点进行后处理替换。""" - rep_map = { - ":": ",", ";": ",", ",": ",", "。": ".", - "!": "!", "?": "?", "\n": ".", "·": ",", - "、": ",", "...": "…", - } - return rep_map.get(ph, ph) - - @staticmethod - def _numeric_feature_by_regex(regex: str, s: str) -> int: - """从OpenJTalk标签中提取数值特征。""" - match = re.search(regex, s) - return int(match.group(1)) if match else -50 - - @staticmethod - def _pyopenjtalk_g2p_prosody(text: str) -> List[str]: - """使用pyopenjtalk提取音素及韵律符号。""" - labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) - phones = [] - for n, lab_curr in enumerate(labels): - p3 = re.search(r"-(.*?)\+", lab_curr).group(1) - if p3 in "AEIOU": - p3 = p3.lower() - - if p3 == "sil": - if n == 0: - phones.append("^") - elif n == len(labels) - 1: - e3 = JapaneseG2P._numeric_feature_by_regex(r"!(\d+)_", lab_curr) - phones.append("?" 
if e3 == 1 else "$") - continue - elif p3 == "pau": - phones.append("_") - continue - else: - phones.append(p3) - - a1 = JapaneseG2P._numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) - a2 = JapaneseG2P._numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) - a3 = JapaneseG2P._numeric_feature_by_regex(r"\+(\d+)/", lab_curr) - f1 = JapaneseG2P._numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) - lab_next = labels[n + 1] if n + 1 < len(labels) else "" - a2_next = JapaneseG2P._numeric_feature_by_regex(r"\+(\d+)\+", lab_next) - - if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": - phones.append("#") - elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: - phones.append("]") - elif a2 == 1 and a2_next == 2: - phones.append("[") - - return phones - - @staticmethod - def g2p(text: str, with_prosody: bool = True) -> List[str]: - """ - 将日语文本转换为音素序列。 - - Args: - text (str): 待转换的日语文本。 - with_prosody (bool): 是否在输出中包含韵律符号。默认为 True。 - - Returns: - List[str]: 音素和符号的列表。 - """ - if not text.strip(): - return [] - - # 1. 文本规范化 - norm_text = JapaneseG2P._text_normalize(text) - - # 2. 使用标点符号分割字符串,得到日语文本片段 - japanese_segments = _JAPANESE_MARKS_RE.split(norm_text) - punctuation_marks = _JAPANESE_MARKS_RE.findall(norm_text) - - phonemes = [] - for i, segment in enumerate(japanese_segments): - if segment: - if with_prosody: # 移除分析结果中句首(^)/句尾($)的符号,因为我们按片段处理 - phones = JapaneseG2P._pyopenjtalk_g2p_prosody(segment)[1:-1] - else: - phones = pyopenjtalk.g2p(segment).split(" ") - phonemes.extend(phones) - - # 将对应的标点符号添加回来 - if i < len(punctuation_marks): - mark = punctuation_marks[i].strip() - if mark: - phonemes.append(mark) - - # 3. 对最终列表中的每个元素进行后处理(主要转换全角标点) - processed_phonemes = [JapaneseG2P._post_replace_phoneme(p) for p in phonemes] - - return processed_phonemes - - -def japanese_to_phones(text: str) -> List[int]: - phones = JapaneseG2P.g2p(text) - phones = [ph for ph in phones if ph in symbols_v2] - # print(phones) - phones = [symbol_to_id_v2[ph] for ph in phones] - return phones +# -*- coding: utf-8 -*- +""" +用于纯日语的 G2P。 +""" +import re +import pyopenjtalk +from typing import List +from ..SymbolsV2 import symbols_v2, symbol_to_id_v2 + +# 匹配连续的标点符号 +_CONSECUTIVE_PUNCTUATION_RE = re.compile(r"([,./?!~…・])\1+") + +# 匹配需要转换为日语读法的特殊符号 +_SYMBOLS_TO_JAPANESE = [ + (re.compile("%"), "パーセント"), + (re.compile("%"), "パーセント"), +] + +# 匹配日语字符(汉字、假名、全角字母数字等) +_JAPANESE_CHARACTERS_RE = re.compile( + r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" +) + +# 匹配非日语字符(标点、空格等) +_JAPANESE_MARKS_RE = re.compile( + r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" +) + + +class JapaneseG2P: + """ + 一个简化的、封装好的日语Grapheme-to-Phoneme(字素到音素)转换器。 + + 本版本假设 pyopenjtalk 库已安装,并且不使用任何用户自定义词典。 + 它专注于提供一个纯粹、高效的文本到音素转换接口。 + """ + + @staticmethod + def _text_normalize(text: str) -> str: + """对输入文本进行基础的规范化处理。""" + for regex, replacement in _SYMBOLS_TO_JAPANESE: + text = re.sub(regex, replacement, text) + text = _CONSECUTIVE_PUNCTUATION_RE.sub(r"\1", text) + text = text.lower() + return text + + @staticmethod + def _post_replace_phoneme(ph: str) -> str: + """对单个音素或标点进行后处理替换。""" + rep_map = { + ":": ",", ";": ",", ",": ",", "。": ".", + "!": "!", "?": "?", "\n": ".", "·": ",", + "、": ",", "...": "…", + } + return rep_map.get(ph, ph) + + @staticmethod + def _numeric_feature_by_regex(regex: str, s: str) -> int: + """从OpenJTalk标签中提取数值特征。""" + match = re.search(regex, s) + return int(match.group(1)) if match else -50 + + @staticmethod + def 
_pyopenjtalk_g2p_prosody(text: str) -> List[str]: + """使用pyopenjtalk提取音素及韵律符号。""" + labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) + phones = [] + for n, lab_curr in enumerate(labels): + p3 = re.search(r"-(.*?)\+", lab_curr).group(1) + if p3 in "AEIOU": + p3 = p3.lower() + + if p3 == "sil": + if n == 0: + phones.append("^") + elif n == len(labels) - 1: + e3 = JapaneseG2P._numeric_feature_by_regex(r"!(\d+)_", lab_curr) + phones.append("?" if e3 == 1 else "$") + continue + elif p3 == "pau": + phones.append("_") + continue + else: + phones.append(p3) + + a1 = JapaneseG2P._numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) + a2 = JapaneseG2P._numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) + a3 = JapaneseG2P._numeric_feature_by_regex(r"\+(\d+)/", lab_curr) + f1 = JapaneseG2P._numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) + lab_next = labels[n + 1] if n + 1 < len(labels) else "" + a2_next = JapaneseG2P._numeric_feature_by_regex(r"\+(\d+)\+", lab_next) + + if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": + phones.append("#") + elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: + phones.append("]") + elif a2 == 1 and a2_next == 2: + phones.append("[") + + return phones + + @staticmethod + def g2p(text: str, with_prosody: bool = True) -> List[str]: + """ + 将日语文本转换为音素序列。 + + Args: + text (str): 待转换的日语文本。 + with_prosody (bool): 是否在输出中包含韵律符号。默认为 True。 + + Returns: + List[str]: 音素和符号的列表。 + """ + if not text.strip(): + return [] + + # 1. 文本规范化 + norm_text = JapaneseG2P._text_normalize(text) + + # 2. 使用标点符号分割字符串,得到日语文本片段 + japanese_segments = _JAPANESE_MARKS_RE.split(norm_text) + punctuation_marks = _JAPANESE_MARKS_RE.findall(norm_text) + + phonemes = [] + for i, segment in enumerate(japanese_segments): + if segment: + if with_prosody: # 移除分析结果中句首(^)/句尾($)的符号,因为我们按片段处理 + phones = JapaneseG2P._pyopenjtalk_g2p_prosody(segment)[1:-1] + else: + phones = pyopenjtalk.g2p(segment).split(" ") + phonemes.extend(phones) + + # 将对应的标点符号添加回来 + if i < len(punctuation_marks): + mark = punctuation_marks[i].strip() + if mark: + phonemes.append(mark) + + # 3. 
对最终列表中的每个元素进行后处理(主要转换全角标点)
+        processed_phonemes = [JapaneseG2P._post_replace_phoneme(p) for p in phonemes]
+
+        return processed_phonemes
+
+
+def japanese_to_phones(text: str) -> List[int]:
+    phones = JapaneseG2P.g2p(text)
+    phones = [ph for ph in phones if ph in symbols_v2]
+    # print(phones)
+    phones = [symbol_to_id_v2[ph] for ph in phones]
+    return phones
diff --git a/genie_tts/G2P/__pycache__/SymbolsV2.cpython-311.pyc b/genie_tts/G2P/__pycache__/SymbolsV2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cfd9e35f1ef5c9228bf2c6ffebc88024e3f3baf
Binary files /dev/null and b/genie_tts/G2P/__pycache__/SymbolsV2.cpython-311.pyc differ
diff --git a/genie_tts/G2P/__pycache__/__init__.cpython-311.pyc b/genie_tts/G2P/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0d8703d9de2a82812e870c42c43fc81129b7dcd
Binary files /dev/null and b/genie_tts/G2P/__pycache__/__init__.cpython-311.pyc differ
diff --git a/genie_tts/GUI/AudioPlayer.py b/genie_tts/GUI/AudioPlayer.py
index 76e615ce9985896bd2efca8a199fd2d54a649dc7..bc972bdd30360ea4ea2a13ad7292eee7fd151e29 100644
--- a/genie_tts/GUI/AudioPlayer.py
+++ b/genie_tts/GUI/AudioPlayer.py
@@ -1,94 +1,94 @@
-import sounddevice as sd
-import threading
-import queue
-from typing import Union, Optional, Callable
-import numpy as np
-import os
-import soundfile as sf
-
-
-def run_in_sub_thread(func) -> Callable[..., threading.Thread]:
-    def wrapper(*args, **kwargs) -> threading.Thread:
-        thread = threading.Thread(target=func, args=args, kwargs=kwargs)
-        thread.daemon = True
-        thread.start()
-        return thread
-
-    return wrapper
-
-
-class AudioPlayer:
-    CHUNK_SIZE: int = 1024
-
-    def __init__(self):
-        self._task_queue: queue.Queue[bytes | str] = queue.Queue()
-        self._worker_thread: Optional[threading.Thread] = None
-        self._stop_event: threading.Event = threading.Event()
-        self._start_worker()
-
-    def _start_worker(self):
-        """启动工作线程(如果它尚未运行或已关闭)。"""
-        if self._worker_thread and self._worker_thread.is_alive():
-            return
-        self._stop_event.clear()
-        self._worker_thread = self._playback_worker()
-
-    @run_in_sub_thread
-    def _playback_worker(self) -> None:
-        while not self._stop_event.is_set():
-            try:
-                task: str = self._task_queue.get(timeout=0.1)
-            except queue.Empty:
-                continue
-
-            stream = None
-            try:
-                if isinstance(task, str) and os.path.isfile(task):
-                    with sf.SoundFile(task, 'r') as f:
-                        if sd is not None:
-                            stream = sd.OutputStream(
-                                samplerate=f.samplerate,
-                                channels=f.channels,
-                                dtype='float32',
-                            )
-                            stream.start()
-                            while not self._stop_event.is_set():
-                                chunk = f.read(self.CHUNK_SIZE, dtype='float32')
-                                if not chunk.any():
-                                    break
-                                stream.write(chunk)
-            except Exception as e:
-                if isinstance(e, sf.SoundFileError):
-                    print(f"无法读取或解析音频文件: {task}, 错误: {e}")
-                else:
-                    print(f"播放时发生错误: {e}")
-            finally:
-                if stream:
-                    stream.stop()
-                    stream.close()
-                self._task_queue.task_done()
-
-    def play(self, source: Union[str, np.ndarray]):
-        """将音频源加入播放队列。"""
-        self._start_worker()
-        self._task_queue.put(source)
-
-    def stop(self):
-        """停止播放并清空播放队列。"""
-        self._stop_event.set()
-        if self._worker_thread and self._worker_thread.is_alive():
-            self._worker_thread.join()
-        self._stop_event.clear()
-
-        with self._task_queue.mutex:
-            self._task_queue.queue.clear()
-            while self._task_queue.unfinished_tasks > 0:
-                self._task_queue.task_done()
-
-    def wait(self):
-        """阻塞,直到队列中所有任务都播放完成。"""
-        self._task_queue.join()
-
-    def close(self):
-        """永久关闭播放器并释放资源。"""
-        self.stop()
+import
sounddevice as sd
+import threading
+import queue
+from typing import Union, Optional, Callable
+import numpy as np
+import os
+import soundfile as sf
+
+
+def run_in_sub_thread(func) -> Callable[..., threading.Thread]:
+    def wrapper(*args, **kwargs) -> threading.Thread:
+        thread = threading.Thread(target=func, args=args, kwargs=kwargs)
+        thread.daemon = True
+        thread.start()
+        return thread
+
+    return wrapper
+
+
+class AudioPlayer:
+    CHUNK_SIZE: int = 1024
+
+    def __init__(self):
+        self._task_queue: queue.Queue[bytes | str] = queue.Queue()
+        self._worker_thread: Optional[threading.Thread] = None
+        self._stop_event: threading.Event = threading.Event()
+        self._start_worker()
+
+    def _start_worker(self):
+        """启动工作线程(如果它尚未运行或已关闭)。"""
+        if self._worker_thread and self._worker_thread.is_alive():
+            return
+        self._stop_event.clear()
+        self._worker_thread = self._playback_worker()
+
+    @run_in_sub_thread
+    def _playback_worker(self) -> None:
+        while not self._stop_event.is_set():
+            try:
+                task: str = self._task_queue.get(timeout=0.1)
+            except queue.Empty:
+                continue
+
+            stream = None
+            try:
+                if isinstance(task, str) and os.path.isfile(task):
+                    with sf.SoundFile(task, 'r') as f:
+                        if sd is not None:
+                            stream = sd.OutputStream(
+                                samplerate=f.samplerate,
+                                channels=f.channels,
+                                dtype='float32',
+                            )
+                            stream.start()
+                            while not self._stop_event.is_set():
+                                chunk = f.read(self.CHUNK_SIZE, dtype='float32')
+                                if len(chunk) == 0:  # end of file; an all-zero (silent) chunk must not stop playback
+                                    break
+                                stream.write(chunk)
+            except Exception as e:
+                if isinstance(e, sf.SoundFileError):
+                    print(f"无法读取或解析音频文件: {task}, 错误: {e}")
+                else:
+                    print(f"播放时发生错误: {e}")
+            finally:
+                if stream:
+                    stream.stop()
+                    stream.close()
+                self._task_queue.task_done()
+
+    def play(self, source: Union[str, np.ndarray]):
+        """将音频源加入播放队列。"""
+        self._start_worker()
+        self._task_queue.put(source)
+
+    def stop(self):
+        """停止播放并清空播放队列。"""
+        self._stop_event.set()
+        if self._worker_thread and self._worker_thread.is_alive():
+            self._worker_thread.join()
+        self._stop_event.clear()
+
+        with self._task_queue.mutex:
+            self._task_queue.queue.clear()
+            while self._task_queue.unfinished_tasks > 0:
+                self._task_queue.task_done()
+
+    def wait(self):
+        """阻塞,直到队列中所有任务都播放完成。"""
+        self._task_queue.join()
+
+    def close(self):
+        """永久关闭播放器并释放资源。"""
+        self.stop()
diff --git a/genie_tts/GUI/ConverterWidget.py b/genie_tts/GUI/ConverterWidget.py
index a1d6bbf8596e1c6aebe67dfd328992dada328614..57e53222797f0cf310366b8e6a6a387198806e0a 100644
--- a/genie_tts/GUI/ConverterWidget.py
+++ b/genie_tts/GUI/ConverterWidget.py
@@ -1,204 +1,204 @@
-import sys
-import os
-import datetime
-
-from PySide6.QtWidgets import (
-    QApplication, QWidget, QVBoxLayout, QPushButton, QTextEdit, QFileDialog,
-    QListView, QTreeView, QAbstractItemView
-)
-from PySide6.QtCore import Signal, QObject, QSettings, QThread
-
-from ..Converter.Converter import convert
-from ..Converter.v2.Converter import find_ckpt_and_pth
-
-
-def get_timestamp_msg(message: str, level: str = "INFO") -> str:
-    """辅助函数:生成类似 Logging 格式的带时间戳字符串"""
-    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-    return f"{now} - {level} - {message}"
-
-
-class Worker(QObject):
-    finished = Signal()
-    log_signal = Signal(str)
-
-    def __init__(self, folders):
-        super().__init__()
-        self.folders = folders
-
-    def log(self, message: str, level: str = "INFO"):
-        """内部辅助方法,用于格式化并发送日志"""
-        formatted_msg = get_timestamp_msg(message, level)
-        self.log_signal.emit(formatted_msg)
-
-    def run(self):
-        """执行转换任务"""
-        try:
-            root_output_dir = os.path.abspath("./Output")
-            for folder in
self.folders: - character_name: str = os.path.basename(folder) - output_dir: str = os.path.join(root_output_dir, character_name) - if os.path.exists(output_dir): - self.log(f'输出文件夹 {output_dir} 已存在,将覆盖内容。', "WARNING") - torch_ckpt_path, torch_pth_path = find_ckpt_and_pth(folder) - if not torch_ckpt_path or not torch_pth_path: - self.log(f'无法处理文件夹 {folder} 。请保证文件夹内有 GPT—SOVITS V2 导出的 .pth 和 .ckpt 模型。', - "ERROR") - continue - self.log(f'正在处理 {folder} 。') - # 调用转换逻辑 - convert(torch_ckpt_path, torch_pth_path, output_dir) - self.log(f'{folder} 处理完成。') # 可选:提示完成 - os.startfile(root_output_dir) - except Exception as e: - self.log(f"任务执行过程中发生未捕获异常: {str(e)}", "ERROR") - finally: - self.finished.emit() - - -class ConverterWidget(QWidget): - def __init__(self): - super().__init__() - self.setWindowTitle('GENIE Converter (PySide6 Version)') - self.resize(1280, 720) - - self.settings = QSettings("MyCompany", "GENIE Converter") - - main_layout = QVBoxLayout(self) - main_layout.setContentsMargins(20, 20, 20, 20) - main_layout.setSpacing(15) - - self.folder_button = QPushButton('📂 选择一个或多个文件夹') - self.folder_button.setFixedHeight(40) - self.folder_button.clicked.connect(self.open_folder_dialog) - - self.log_display = QTextEdit() - self.log_display.setReadOnly(True) - - main_layout.addWidget(self.folder_button) - main_layout.addWidget(self.log_display) - - self.apply_stylesheet() - - self.thread = None - self.worker = None - - self.append_formatted_log("欢迎使用 GENIE Converter!") - self.append_formatted_log("支持将 GPT—SOVITS V2/V2ProPlus 模型导出为 GENIE 引擎所需的格式。") - self.append_formatted_log("请选择一个或多个文件夹,每个文件夹中包含一对 .pth 和 .ckpt 文件。") - self.append_formatted_log("您可以使用 Ctrl 或 Shift 键来进行多选。\n") - - def apply_stylesheet(self): - self.setStyleSheet(""" - QWidget { - background-color: #2b2b2b; - color: #f0f0f0; - font-family: 'Segoe UI', 'Microsoft YaHei', 'Arial'; - font-size: 14px; - } - QPushButton { - background-color: #007bff; - color: white; - border: none; - padding: 10px; - border-radius: 5px; - font-weight: bold; - } - QPushButton:hover { background-color: #0056b3; } - QPushButton:pressed { background-color: #004494; } - QPushButton:disabled { - background-color: #555; - color: #aaa; - } - QTextEdit { - background-color: #1e1e1e; - border: 1px solid #444; - border-radius: 5px; - padding: 8px; - font-family: 'Consolas', 'Courier New', monospace; - } - QScrollBar:vertical { - border: none; background: #2b2b2b; width: 12px; margin: 0; - } - QScrollBar::handle:vertical { - background: #555; min-height: 20px; border-radius: 6px; - } - QScrollBar::handle:vertical:hover { background: #007bff; } - QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical { height: 0px; } - """) - - def append_log(self, text: str): - """直接给主线程调用的日志打印方法""" - self.log_display.append(text) - self.scroll_to_bottom() - - def append_formatted_log(self, text: str, level="INFO"): - """给主线程调用的带格式日志方法""" - msg = get_timestamp_msg(text, level) - self.append_log(msg) - - def scroll_to_bottom(self): - self.log_display.verticalScrollBar().setValue( - self.log_display.verticalScrollBar().maximum() - ) - - def open_folder_dialog(self): - last_dir = self.settings.value("last_dir", "") - - dialog = QFileDialog(self, '请选择文件夹', str(last_dir)) - dialog.setFileMode(QFileDialog.FileMode.Directory) - dialog.setOption(QFileDialog.Option.DontUseNativeDialog, True) - - list_view = dialog.findChild(QListView, 'listView') - if list_view: - list_view.setSelectionMode(QAbstractItemView.SelectionMode.ExtendedSelection) - - tree_view = dialog.findChild(QTreeView) - 
if tree_view: - tree_view.setSelectionMode(QAbstractItemView.SelectionMode.ExtendedSelection) - - if dialog.exec(): - selected_folders = dialog.selectedFiles() - if selected_folders: - self.run_conversion_task(selected_folders) - self.settings.setValue("last_dir", os.path.dirname(selected_folders[0])) - - def run_conversion_task(self, folders): - self.folder_button.setEnabled(False) - self.folder_button.setText("🔄 转换中,请稍候...") - - self.thread = QThread() - self.worker = Worker(folders) - self.worker.moveToThread(self.thread) - - # 连接线程控制信号 - self.thread.started.connect(self.worker.run) - self.worker.finished.connect(self.on_conversion_finished) - self.worker.log_signal.connect(self.append_log) - - self.thread.start() - - def on_conversion_finished(self): - self.thread.quit() - self.thread.wait() - self.worker.deleteLater() - self.thread.deleteLater() - self.thread = None - self.worker = None - - self.folder_button.setEnabled(True) - self.folder_button.setText("📂 选择一个或多个文件夹") - self.append_formatted_log("所有任务已完成。", "INFO") - - def closeEvent(self, event): - if self.thread is not None and self.thread.isRunning(): - self.thread.quit() - self.thread.wait() - super().closeEvent(event) - - -def start_gui() -> None: - app = QApplication(sys.argv) - window = ConverterWidget() - window.show() - sys.exit(app.exec()) +import sys +import os +import datetime + +from PySide6.QtWidgets import ( + QApplication, QWidget, QVBoxLayout, QPushButton, QTextEdit, QFileDialog, + QListView, QTreeView, QAbstractItemView +) +from PySide6.QtCore import Signal, QObject, QSettings, QThread + +from ..Converter.Converter import convert +from ..Converter.v2.Converter import find_ckpt_and_pth + + +def get_timestamp_msg(message: str, level: str = "INFO") -> str: + """辅助函数:生成类似 Logging 格式的带时间戳字符串""" + now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + return f"{now} - {level} - {message}" + + +class Worker(QObject): + finished = Signal() + log_signal = Signal(str) + + def __init__(self, folders): + super().__init__() + self.folders = folders + + def log(self, message: str, level: str = "INFO"): + """内部辅助方法,用于格式化并发送日志""" + formatted_msg = get_timestamp_msg(message, level) + self.log_signal.emit(formatted_msg) + + def run(self): + """执行转换任务""" + try: + root_output_dir = os.path.abspath("./Output") + for folder in self.folders: + character_name: str = os.path.basename(folder) + output_dir: str = os.path.join(root_output_dir, character_name) + if os.path.exists(output_dir): + self.log(f'输出文件夹 {output_dir} 已存在,将覆盖内容。', "WARNING") + torch_ckpt_path, torch_pth_path = find_ckpt_and_pth(folder) + if not torch_ckpt_path or not torch_pth_path: + self.log(f'无法处理文件夹 {folder} 。请保证文件夹内有 GPT—SOVITS V2 导出的 .pth 和 .ckpt 模型。', + "ERROR") + continue + self.log(f'正在处理 {folder} 。') + # 调用转换逻辑 + convert(torch_ckpt_path, torch_pth_path, output_dir) + self.log(f'{folder} 处理完成。') # 可选:提示完成 + os.startfile(root_output_dir) + except Exception as e: + self.log(f"任务执行过程中发生未捕获异常: {str(e)}", "ERROR") + finally: + self.finished.emit() + + +class ConverterWidget(QWidget): + def __init__(self): + super().__init__() + self.setWindowTitle('GENIE Converter (PySide6 Version)') + self.resize(1280, 720) + + self.settings = QSettings("MyCompany", "GENIE Converter") + + main_layout = QVBoxLayout(self) + main_layout.setContentsMargins(20, 20, 20, 20) + main_layout.setSpacing(15) + + self.folder_button = QPushButton('📂 选择一个或多个文件夹') + self.folder_button.setFixedHeight(40) + self.folder_button.clicked.connect(self.open_folder_dialog) + + self.log_display = 
QTextEdit() + self.log_display.setReadOnly(True) + + main_layout.addWidget(self.folder_button) + main_layout.addWidget(self.log_display) + + self.apply_stylesheet() + + self.thread = None + self.worker = None + + self.append_formatted_log("欢迎使用 GENIE Converter!") + self.append_formatted_log("支持将 GPT—SOVITS V2/V2ProPlus 模型导出为 GENIE 引擎所需的格式。") + self.append_formatted_log("请选择一个或多个文件夹,每个文件夹中包含一对 .pth 和 .ckpt 文件。") + self.append_formatted_log("您可以使用 Ctrl 或 Shift 键来进行多选。\n") + + def apply_stylesheet(self): + self.setStyleSheet(""" + QWidget { + background-color: #2b2b2b; + color: #f0f0f0; + font-family: 'Segoe UI', 'Microsoft YaHei', 'Arial'; + font-size: 14px; + } + QPushButton { + background-color: #007bff; + color: white; + border: none; + padding: 10px; + border-radius: 5px; + font-weight: bold; + } + QPushButton:hover { background-color: #0056b3; } + QPushButton:pressed { background-color: #004494; } + QPushButton:disabled { + background-color: #555; + color: #aaa; + } + QTextEdit { + background-color: #1e1e1e; + border: 1px solid #444; + border-radius: 5px; + padding: 8px; + font-family: 'Consolas', 'Courier New', monospace; + } + QScrollBar:vertical { + border: none; background: #2b2b2b; width: 12px; margin: 0; + } + QScrollBar::handle:vertical { + background: #555; min-height: 20px; border-radius: 6px; + } + QScrollBar::handle:vertical:hover { background: #007bff; } + QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical { height: 0px; } + """) + + def append_log(self, text: str): + """直接给主线程调用的日志打印方法""" + self.log_display.append(text) + self.scroll_to_bottom() + + def append_formatted_log(self, text: str, level="INFO"): + """给主线程调用的带格式日志方法""" + msg = get_timestamp_msg(text, level) + self.append_log(msg) + + def scroll_to_bottom(self): + self.log_display.verticalScrollBar().setValue( + self.log_display.verticalScrollBar().maximum() + ) + + def open_folder_dialog(self): + last_dir = self.settings.value("last_dir", "") + + dialog = QFileDialog(self, '请选择文件夹', str(last_dir)) + dialog.setFileMode(QFileDialog.FileMode.Directory) + dialog.setOption(QFileDialog.Option.DontUseNativeDialog, True) + + list_view = dialog.findChild(QListView, 'listView') + if list_view: + list_view.setSelectionMode(QAbstractItemView.SelectionMode.ExtendedSelection) + + tree_view = dialog.findChild(QTreeView) + if tree_view: + tree_view.setSelectionMode(QAbstractItemView.SelectionMode.ExtendedSelection) + + if dialog.exec(): + selected_folders = dialog.selectedFiles() + if selected_folders: + self.run_conversion_task(selected_folders) + self.settings.setValue("last_dir", os.path.dirname(selected_folders[0])) + + def run_conversion_task(self, folders): + self.folder_button.setEnabled(False) + self.folder_button.setText("🔄 转换中,请稍候...") + + self.thread = QThread() + self.worker = Worker(folders) + self.worker.moveToThread(self.thread) + + # 连接线程控制信号 + self.thread.started.connect(self.worker.run) + self.worker.finished.connect(self.on_conversion_finished) + self.worker.log_signal.connect(self.append_log) + + self.thread.start() + + def on_conversion_finished(self): + self.thread.quit() + self.thread.wait() + self.worker.deleteLater() + self.thread.deleteLater() + self.thread = None + self.worker = None + + self.folder_button.setEnabled(True) + self.folder_button.setText("📂 选择一个或多个文件夹") + self.append_formatted_log("所有任务已完成。", "INFO") + + def closeEvent(self, event): + if self.thread is not None and self.thread.isRunning(): + self.thread.quit() + self.thread.wait() + super().closeEvent(event) + + +def start_gui() 
-> None: + app = QApplication(sys.argv) + window = ConverterWidget() + window.show() + sys.exit(app.exec()) diff --git a/genie_tts/GUI/GUI.py b/genie_tts/GUI/GUI.py index c2b4e4408dc8d8567ce30e4bb3ada1f494d62185..8e5716e8e87ab348551a4b9c3e42e6971c1d2af6 100644 --- a/genie_tts/GUI/GUI.py +++ b/genie_tts/GUI/GUI.py @@ -1,649 +1,649 @@ -import sys -import os -import shutil -from typing import List, Optional, TextIO, Any -import uuid - -import soundfile as sf -import numpy as np - -from PySide6.QtWidgets import ( - QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, QLineEdit, QTextEdit, - QFileDialog, QGroupBox, QScrollArea, QMessageBox, QTabWidget, QFormLayout, QFrame, -) -from PySide6.QtCore import ( - Qt, Signal, Slot, QObject, -) -from PySide6.QtGui import QTextCursor, QCloseEvent - -from ..Utils.TextSplitter import TextSplitter -from .Utils import ( - generate_output_filenames, FileSelectorWidget, FileSelectionMode, MyComboBox, sanitize_filename, - MyTextEdit -) -from .AudioPlayer import AudioPlayer -from .PresetManager import PresetManager -from .ServerManager import InferenceWorker -from .ConverterWidget import ConverterWidget - -""" -抄自 Genie CUDA Runtime -""" - -CACHE_DIR = './UserData/Cache/GenieGUI' -os.makedirs(CACHE_DIR, exist_ok=True) - - -# ==================== 后台工作线程 ==================== - -class LogRedirector(QObject): - """重定向 stdout 到 Signal""" - textWritten = Signal(str) - - def __init__(self): - super().__init__() - self._old_stdout: TextIO = sys.stdout - - def write(self, text: Any): - text = str(text) - self.textWritten.emit(text) - if self._old_stdout is not None: - self._old_stdout.write(text) - - def flush(self): - pass - - -# ==================== UI 组件实现 ==================== - -class PreviewItemWidget(QFrame): - """单条音频预览组件""" - - def __init__( - self, - index: int, - text: str, - file_path: str, - player: AudioPlayer, - parent: QWidget = None - ): - super().__init__(parent) - self.text: str = text - self.file_path: str = file_path - self.player: AudioPlayer = player - self.setFrameShape(QFrame.Shape.StyledPanel) - self.setFrameShadow(QFrame.Shadow.Raised) - - layout = QHBoxLayout(self) - layout.setContentsMargins(5, 5, 5, 5) - - # 编号 - lbl_id = QLabel(f"#{index}") - lbl_id.setFixedWidth(40) - lbl_id.setStyleSheet("font-weight: bold; color: #555;") - - # 文本 - lbl_text = QLabel(text) - lbl_text.setFixedWidth(240) - lbl_text.setToolTip(text) - - # 按钮 - 播放 - btn_play = QPushButton("▶ 播放") - btn_play.setFixedWidth(80) - btn_play.clicked.connect(self._play_audio) - - # 按钮 - 保存 - btn_save = QPushButton("⬇ 保存") - btn_save.setFixedWidth(80) - btn_save.clicked.connect(self._save_file) - - # 按钮 - 删除 (新增) - btn_del = QPushButton("🗑 删除") - btn_del.setFixedWidth(80) - btn_del.setStyleSheet("color: #ff4d4d;") # 以此区分删除按钮 - btn_del.clicked.connect(self._delete_item) - - layout.addWidget(lbl_id) - layout.addWidget(lbl_text, 1) # Stretch - layout.addWidget(btn_play) - layout.addWidget(btn_save) - layout.addWidget(btn_del) # 添加到布局 - - def _play_audio(self): - # 播放前先停止其他播放 - self.player.stop() - self.player.play(self.file_path) - - def _save_file(self): - filename = sanitize_filename(self.text) - save_path, _ = QFileDialog.getSaveFileName( - self, "保存音频", f"{filename}.wav", "WAV Audio (*.wav)" - ) - if save_path: - try: - shutil.copy(self.file_path, save_path) - QMessageBox.information(self, "成功", "文件保存成功!") - except Exception as e: - QMessageBox.critical(self, "错误", f"保存失败: {e}") - - def _delete_item(self): - """删除当前条目及对应的文件""" - # 1. 
停止播放 - self.player.stop() - - # 2. 尝试删除物理文件 (避免垃圾堆积) - try: - if os.path.exists(self.file_path): - os.remove(self.file_path) - print(f"[INFO] 已删除文件: {self.file_path}") - except Exception as e: - print(f"[WARN] 删除文件失败: {e}") - - # 3. 从界面移除自身 - self.deleteLater() - - -class LogWidget(QWidget): - """日志显示Tab""" - - def __init__(self, parent: QWidget = None): - super().__init__(parent) - layout = QVBoxLayout(self) - self.text_edit: QTextEdit = QTextEdit() - self.text_edit.setReadOnly(True) - self.text_edit.setStyleSheet( - "background-color: #1e1e1e;" - "color: #ecf0f1;" - "font-family: Consolas;" - "font-size: 12pt;" - ) - layout.addWidget(self.text_edit) - - @Slot(str) - def append_log(self, text: str): - self.text_edit.moveCursor(QTextCursor.MoveOperation.End) - self.text_edit.insertPlainText(text) - self.text_edit.moveCursor(QTextCursor.MoveOperation.End) - - -class TTSWidget(QWidget): - """TTS 主交互界面""" - - def __init__(self, player: AudioPlayer, parent: QWidget = None): - super().__init__(parent) - self.player: AudioPlayer = player - self.splitter: TextSplitter = TextSplitter() - self.current_gen_id: int = 0 - self.current_worker: Optional[InferenceWorker] = None - - main_layout = QVBoxLayout(self) - - # ---------------- 顶部:预设管理器 ---------------- - self.preset_manager = PresetManager( - presets_file='./UserData/GenieGuiConfig.json', - state_getter=self.get_ui_state, - ) - self.preset_manager.sig_load_state.connect(self.apply_ui_state) - main_layout.addWidget(self.preset_manager) - - # ---------------- 中间:滚动设置区 ---------------- - scroll = QScrollArea() - scroll.setWidgetResizable(True) - content_widget = QWidget() - - content_layout = QHBoxLayout(content_widget) - left_column_layout = QVBoxLayout() - right_column_layout = QVBoxLayout() - - # ==================== 左侧列内容 ==================== - - # 模型设置组 - group_model = QGroupBox("模型设置") - self.layout_model = QFormLayout() - self.combo_model_type = MyComboBox() - self.combo_model_type.addItems(["Genie-TTS"]) - self.combo_model_type.currentTextChanged.connect(self._update_model_ui_visibility) - self.combo_model_type.setEnabled(False) - self.file_gpt = FileSelectorWidget("gpt_path", FileSelectionMode.FILE, "Checkpoints (*.ckpt)") - self.file_vits = FileSelectorWidget("vits_path", FileSelectionMode.FILE, "Models (*.pth)") - self.file_genie = FileSelectorWidget("genie_dir", FileSelectionMode.DIRECTORY) - self.file_gpt.pathChanged.connect(self._on_gpt_path_changed) - self.file_vits.pathChanged.connect(self._on_vits_path_changed) - self.layout_model.addRow("模型类型:", self.combo_model_type) - self.layout_model.addRow("GPT模型 (.ckpt):", self.file_gpt) - self.layout_model.addRow("VITS模型 (.pth):", self.file_vits) - self.layout_model.addRow("Genie模型目录:", self.file_genie) - group_model.setLayout(self.layout_model) - # 参考音频组 - group_ref = QGroupBox("参考音频") - layout_ref = QFormLayout() - self.file_ref_audio = FileSelectorWidget("ref_audio", FileSelectionMode.FILE, "Audio (*.wav *.mp3)") - self.input_ref_text = QLineEdit() - self.input_ref_text.setPlaceholderText("请输入参考音频对应的文本...") - btn_play_ref = QPushButton("▶️") - btn_play_ref.setFixedWidth(30) - btn_play_ref.clicked.connect(self._play_ref_audio) - hbox_ref_text = QHBoxLayout() - hbox_ref_text.addWidget(self.input_ref_text) - hbox_ref_text.addWidget(btn_play_ref) - layout_ref.addRow("音频文件:", self.file_ref_audio) - layout_ref.addRow("音频文本:", hbox_ref_text) - group_ref.setLayout(layout_ref) - - left_column_layout.addWidget(group_model) - left_column_layout.addWidget(group_ref) - left_column_layout.addStretch() 
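The LogRedirector class above swaps sys.stdout for a QObject that re-emits every write as a Qt signal while still forwarding to the real stream. A stripped-down, runnable sketch of the same pattern (assumes PySide6 is installed; the lambda stands in for a log widget's append slot):

import sys
from PySide6.QtCore import QObject, Signal

class StdoutRelay(QObject):
    textWritten = Signal(str)

    def __init__(self):
        super().__init__()
        self._old = sys.stdout  # remember the real stream so output is mirrored, not swallowed

    def write(self, text):
        self.textWritten.emit(str(text))
        self._old.write(str(text))

    def flush(self):
        self._old.flush()

relay = StdoutRelay()
relay.textWritten.connect(lambda t: None)  # e.g. LogWidget.append_log in the GUI
sys.stdout = relay
print("hello")          # now reaches both the signal and the console
sys.stdout = relay._old  # restore when done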
- - # ==================== 右侧列内容 ==================== - - # === 推理设置组 === - group_infer = QGroupBox("推理参数") - layout_infer = QFormLayout() - self.combo_device = MyComboBox() - self.combo_device.addItems(["CPU"]) - self.combo_device.setEnabled(False) - self.combo_quality = MyComboBox() - self.combo_quality.addItems(["质量优先"]) - self.combo_quality.setEnabled(False) - self.combo_split = MyComboBox() - self.combo_split.addItems(["不切分", "智能切分", "按行切分"]) - self.combo_mode = MyComboBox() - self.combo_mode.addItems(["串行推理"]) - self.combo_mode.setEnabled(False) - self.combo_lang = MyComboBox() - self.combo_lang.addItems(["Chinese", "English", "Japanese"]) - layout_infer.addRow("推理设备:\n(重启生效)", self.combo_device) - layout_infer.addRow("推理需求:", self.combo_quality) - layout_infer.addRow("分句方式:", self.combo_split) - layout_infer.addRow("推理模式:", self.combo_mode) - layout_infer.addRow("目标语言:", self.combo_lang) - group_infer.setLayout(layout_infer) - - # === 自动保存组 === - group_save = QGroupBox("自动保存设置") - self.layout_save = QFormLayout() - self.combo_save_mode = MyComboBox() - self.combo_save_mode.addItems(["禁用自动保存", "保存为单个文件", "保存为多个文件"]) - self.combo_save_mode.currentIndexChanged.connect(self._update_save_ui_state) - default_out_path = os.path.join(os.path.expanduser("~"), "Desktop", "Genie 输出语音") - self.file_out_dir = FileSelectorWidget("out_dir", FileSelectionMode.DIRECTORY) - self.file_out_dir.set_path(default_out_path) - self.layout_save.addRow("保存方式:", self.combo_save_mode) - self.layout_save.addRow("输出文件夹:", self.file_out_dir) - group_save.setLayout(self.layout_save) - - right_column_layout.addWidget(group_infer) - right_column_layout.addWidget(group_save) - right_column_layout.addStretch() - - content_layout.addLayout(left_column_layout, 1) - content_layout.addLayout(right_column_layout, 1) - scroll.setWidget(content_widget) - main_layout.addWidget(scroll, 5) - - # ==================== 底部:输入控制 + 输出预览 ==================== - - # 创建底部容器 widget - bottom_widget = QWidget() - bottom_layout = QHBoxLayout(bottom_widget) - bottom_layout.setContentsMargins(0, 0, 0, 0) # 去除边距让它贴合 - - # --- 输入控制组 --- - group_input = QGroupBox("目标文本") - layout_input = QVBoxLayout() - self.text_input = MyTextEdit() - self.text_input.setPlaceholderText("请输入要合成的目标文本...") - self.text_input.setFixedHeight(300) - self.btn_start = QPushButton("开始推理") - self.btn_start.setFixedHeight(40) - self.btn_start.setStyleSheet(""" - QPushButton { - background-color: #4CAF50; - color: white; - font-weight: bold; - border-radius: 5px; - } - QPushButton:hover { background-color: #45a049; } - QPushButton:disabled { background-color: #cccccc; } - """) - self.btn_start.clicked.connect(self._start_inference) - layout_input.addWidget(self.text_input) - layout_input.addWidget(self.btn_start) - group_input.setLayout(layout_input) - - # --- 输出预览组 --- - group_preview = QGroupBox("输出音频预览") - preview_layout = QVBoxLayout() - self.preview_scroll = QScrollArea() - self.preview_scroll.setWidgetResizable(True) - self.preview_container = QWidget() - self.preview_list_layout = QVBoxLayout(self.preview_container) - self.preview_list_layout.setAlignment(Qt.AlignmentFlag.AlignTop) - self.preview_scroll.setWidget(self.preview_container) - preview_layout.addWidget(self.preview_scroll) - group_preview.setLayout(preview_layout) - - bottom_layout.addWidget(group_input, 1) - bottom_layout.addWidget(group_preview, 1) - main_layout.addWidget(bottom_widget, 3) - - self.apply_ui_state(self.preset_manager.current_preset_data) - - # ==================== 状态管理接口 (供 PresetManager 
调用) ==================== - - @property - def current_preset_name(self) -> str: - return self.preset_manager.current_preset_name - - @property - def current_preset_data(self) -> dict: - return self.preset_manager.current_preset_data - - def get_ui_state(self) -> dict: - """收集当前UI状态为字典""" - return { - "model_type": self.combo_model_type.currentText(), - "gpt_path": self.file_gpt.get_path(), - "vits_path": self.file_vits.get_path(), - "genie_dir": self.file_genie.get_path(), - "ref_audio": self.file_ref_audio.get_path(), - "ref_text": self.input_ref_text.text(), - "device": self.combo_device.currentText().lower(), - "quality": self.combo_quality.currentText(), - "split": self.combo_split.currentText(), - "mode": self.combo_mode.currentText(), - "lang": self.combo_lang.currentText(), - "save_mode": self.combo_save_mode.currentText(), - "out_dir": self.file_out_dir.get_path() - } - - @Slot(dict) - def apply_ui_state(self, data: dict) -> None: - """将字典数据应用到UI""" - - def set_combo_text(combo: MyComboBox, text: str) -> None: - index = combo.findText(text) - if index >= 0: - combo.setCurrentIndex(index) - - set_combo_text(self.combo_model_type, data.get("model_type", "")) - self.file_gpt.set_path(data.get("gpt_path", ""), block_signals=True) - self.file_vits.set_path(data.get("vits_path", ""), block_signals=True) - self.file_genie.set_path(data.get("genie_dir", "")) - self.file_ref_audio.set_path(data.get("ref_audio", "")) - self.input_ref_text.setText(data.get("ref_text", "")) - - set_combo_text(self.combo_device, data.get("device", "")) - set_combo_text(self.combo_quality, data.get("quality", "")) - set_combo_text(self.combo_split, data.get("split", "")) - set_combo_text(self.combo_mode, data.get("mode", "")) - set_combo_text(self.combo_lang, data.get("lang", "")) - set_combo_text(self.combo_save_mode, data.get("save_mode", "")) - - self.file_out_dir.set_path(data.get("out_dir", "")) - - # 确保UI显隐状态正确 - self._update_model_ui_visibility() - self._update_save_ui_state() - - # ==================== UI 逻辑处理 ==================== - - def _update_model_ui_visibility(self, *args) -> None: - """根据模型类型控制文件选择器的显隐""" - is_gpt = self.combo_model_type.currentText() == "GPT-SoVITS" - self.layout_model.setRowVisible(self.file_gpt, is_gpt) - self.layout_model.setRowVisible(self.file_vits, is_gpt) - self.layout_model.setRowVisible(self.file_genie, not is_gpt) - - @Slot(str) - def _on_gpt_path_changed(self, path: str): - if path and os.path.exists(path) and not self.file_vits.get_path(): - self._try_auto_fill_sibling(path, ".pth", self.file_vits) - - @Slot(str) - def _on_vits_path_changed(self, path: str): - if path and os.path.exists(path) and not self.file_gpt.get_path(): - self._try_auto_fill_sibling(path, ".ckpt", self.file_gpt) - - @staticmethod - def _try_auto_fill_sibling(current_path: str, target_ext: str, target_widget: FileSelectorWidget): - try: - directory = os.path.dirname(current_path) - if not os.path.exists(directory): - return - for f in os.listdir(directory): - if f.lower().endswith(target_ext.lower()): - full_path = os.path.join(directory, f) - target_widget.set_path(full_path) - print(f"[INFO] 自动关联模型文件: {full_path}") - break - except Exception as e: - print(f"[WARN] 自动关联文件失败: {e}") - - def _update_save_ui_state(self) -> None: - enabled = self.combo_save_mode.currentText() != "禁用自动保存" - self.layout_save.setRowVisible(self.file_out_dir, enabled) - - def _play_ref_audio(self) -> None: - path = self.file_ref_audio.get_path() - if os.path.exists(path): - self.player.stop() - self.player.play(path) - 
else: - QMessageBox.warning(self, "错误", "参考音频文件不存在") - - def _get_split_texts(self, text: str) -> List[str]: - method = self.combo_split.currentText() - if method == "不切分": - return [text] - elif method == "按行切分": - return [line.strip() for line in text.split('\n') if line.strip()] - elif method == "智能切分": - return self.splitter.split(text) - return [text] - - def _start_inference(self) -> None: - text = self.text_input.toPlainText().strip() - if not text: - QMessageBox.warning(self, "提示", "请输入目标文本") - return - - ref_path = self.file_ref_audio.get_path() - ref_text = self.input_ref_text.text().strip() - if not ref_path or not ref_text: - QMessageBox.warning(self, "提示", "请设置参考音频") - return - - if not self.file_genie.get_path(): - QMessageBox.warning(self, "提示", "请选择Genie模型目录") - return - - out_dir = self.file_out_dir.get_path() - save_mode = self.combo_save_mode.currentText() - if not out_dir and save_mode != "禁用自动保存": - desktop = os.path.join(os.path.expanduser("~"), "Desktop", "Genie Output") - self.file_out_dir.set_path(desktop) - print(f"[INFO] 未设置输出文件夹, 将在桌面创建!") - - self.btn_start.setEnabled(False) - self.btn_start.setText("推理中...") - self._chain_import_model() - - # ==================== 推理链式调用 ==================== - - def _chain_import_model(self) -> None: - req = { - "character_name": self.current_preset_name, - "onnx_model_dir": self.file_genie.get_path(), - "language": self.combo_lang.currentText(), - } - worker = InferenceWorker(req, mode="load_character") - worker.finished.connect(lambda s, m, d: self._on_import_finished(s, m)) - worker.start() - self.current_worker = worker - - @Slot(bool, str) - def _on_import_finished(self, success: bool, msg: str) -> None: - if not success: - self._reset_ui_state() - QMessageBox.critical(self, "模型加载失败", msg) - return - print(f"[INFO] {msg}") - self._chain_set_ref() - - def _chain_set_ref(self) -> None: - req = { - "character_name": self.current_preset_name, - "audio_path": self.file_ref_audio.get_path(), - "audio_text": self.input_ref_text.text().strip(), - "language": self.combo_lang.currentText(), - } - worker = InferenceWorker(req, mode="set_reference_audio") - worker.finished.connect(lambda s, m, d: self._on_set_ref_finished(s, m)) - worker.start() - self.current_worker = worker - - @Slot(bool, str) - def _on_set_ref_finished(self, success: bool, msg: str) -> None: - if not success: - self._reset_ui_state() - QMessageBox.critical(self, "设置参考音频失败", msg) - return - print(f"[INFO] {msg}") - self._chain_tts() - - def _chain_tts(self) -> None: - text_full = self.text_input.toPlainText().strip() - text_list = self._get_split_texts(text_full) - - print(f"[INFO] 开始串行推理, 分句结果: {text_list}") - self._process_serial_step(0, text_list, [], 32000) - - def _process_serial_step( - self, - index: int, - text_list: List[str], - audio_accumulator: List[np.ndarray], - sample_rate: int - ) -> None: - # 1. 
终止条件:所有句子处理完毕 - if index >= len(text_list): - save_mode = self.combo_save_mode.currentText() - out_dir = self.file_out_dir.get_path() - - if out_dir: - os.makedirs(out_dir, exist_ok=True) - - if audio_accumulator and save_mode != "保存为多个文件": - full_text = ''.join(text_list) - full_audio = np.concatenate(audio_accumulator, axis=0) - if save_mode == "保存为单个文件": - target_names = generate_output_filenames(folder=out_dir, original_texts=[full_text]) - save_path = os.path.join(out_dir, target_names[0]) - else: # "禁用自动保存" - save_path = os.path.join(CACHE_DIR, f"{uuid.uuid4().hex}.wav") - sf.write(save_path, data=full_audio, samplerate=sample_rate, subtype='PCM_16') - self._add_to_preview(full_text, save_path) - - print(f"\n[INFO] 串行推理全部完成,共 {len(text_list)} 句。") - self._reset_ui_state() - return - - # 2. 递归进行:发起当前句子的请求 - req = { - "character_name": self.current_preset_name, - "text": text_list[index], - } - worker = InferenceWorker(req, mode="tts") - worker.finished.connect( - lambda s, m, d: self._on_serial_step_finished(s, m, d, index, text_list, audio_accumulator) - ) - worker.start() - self.current_worker = worker - - @Slot(bool, str, object, int, object, object, object) - def _on_serial_step_finished( - self, - success: bool, - msg: str, - return_data: dict, - index: int, - text_list: List[str], - audio_accumulator: List[np.ndarray] - ) -> None: - if not success: - self._reset_ui_state() - QMessageBox.critical(self, "推理失败", f"第 {index + 1} 句出错: {msg}") - return - - sr = return_data.get("sample_rate", 32000) - audio_list = return_data.get("audio_list", []) - save_mode = self.combo_save_mode.currentText() - out_dir = self.file_out_dir.get_path() - if out_dir: - os.makedirs(out_dir, exist_ok=True) - - if audio_list: - audio_accumulator.append(audio_list[0]) - if save_mode == "保存为多个文件": - target_names = generate_output_filenames(folder=out_dir, original_texts=[text_list[index]]) - save_path = os.path.join(out_dir, target_names[0]) - sf.write(save_path, data=audio_list[0], samplerate=sr, subtype='FLOAT') - self._add_to_preview(text_list[index], save_path) - else: - print(f"[WARN] 第 {index + 1} 句返回空音频") - - # 继续处理下一句 - self._process_serial_step(index + 1, text_list, audio_accumulator, sr) - - def _add_to_preview(self, text: str, path: str) -> None: - item = PreviewItemWidget(self.current_gen_id, text, path, self.player) - self.preview_list_layout.insertWidget(0, item) - self.current_gen_id += 1 - - def _reset_ui_state(self) -> None: - self.btn_start.setEnabled(True) - self.btn_start.setText("开始推理") - - def closeEvent(self, event: QCloseEvent) -> None: - # 委托 PresetManager 处理保存逻辑 - self.preset_manager.shutdown() - super().closeEvent(event) - - -class MainWindow(QMainWindow): - def __init__(self): - super().__init__() - self.setWindowTitle("Genie TTS Inference GUI") - self.resize(1300, 900) - - # 初始化音频播放器 - self.player: AudioPlayer = AudioPlayer() - - # 初始化日志重定向 - self.log_widget: LogWidget = LogWidget() - sys.stdout = LogRedirector() - sys.stdout.textWritten.connect(self.log_widget.append_log) - - # 初始化主界面 - self.tabs: QTabWidget = QTabWidget() - self.tts_widget = TTSWidget(self.player) - self.conv_widget = ConverterWidget() - - self.tabs.addTab(self.log_widget, "GUI Log") - self.tabs.addTab(self.tts_widget, "TTS Inference") - self.tabs.addTab(self.conv_widget, "Converter") - self.tabs.setCurrentIndex(1) # 默认显示TTS页 - - self.setCentralWidget(self.tabs) - - def closeEvent(self, event: QCloseEvent) -> None: - if os.path.exists(CACHE_DIR): - shutil.rmtree(CACHE_DIR) - if hasattr(self, 'player'): - 
self.player.stop() - # 线程安全退出后,再恢复 stdout - sys.stdout = sys.__stdout__ - if hasattr(self, 'tts_widget'): - self.tts_widget.closeEvent(event) - event.accept() +import sys +import os +import shutil +from typing import List, Optional, TextIO, Any +import uuid + +import soundfile as sf +import numpy as np + +from PySide6.QtWidgets import ( + QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, QLineEdit, QTextEdit, + QFileDialog, QGroupBox, QScrollArea, QMessageBox, QTabWidget, QFormLayout, QFrame, +) +from PySide6.QtCore import ( + Qt, Signal, Slot, QObject, +) +from PySide6.QtGui import QTextCursor, QCloseEvent + +from ..Utils.TextSplitter import TextSplitter +from .Utils import ( + generate_output_filenames, FileSelectorWidget, FileSelectionMode, MyComboBox, sanitize_filename, + MyTextEdit +) +from .AudioPlayer import AudioPlayer +from .PresetManager import PresetManager +from .ServerManager import InferenceWorker +from .ConverterWidget import ConverterWidget + +""" +抄自 Genie CUDA Runtime +""" + +CACHE_DIR = './UserData/Cache/GenieGUI' +os.makedirs(CACHE_DIR, exist_ok=True) + + +# ==================== 后台工作线程 ==================== + +class LogRedirector(QObject): + """重定向 stdout 到 Signal""" + textWritten = Signal(str) + + def __init__(self): + super().__init__() + self._old_stdout: TextIO = sys.stdout + + def write(self, text: Any): + text = str(text) + self.textWritten.emit(text) + if self._old_stdout is not None: + self._old_stdout.write(text) + + def flush(self): + pass + + +# ==================== UI 组件实现 ==================== + +class PreviewItemWidget(QFrame): + """单条音频预览组件""" + + def __init__( + self, + index: int, + text: str, + file_path: str, + player: AudioPlayer, + parent: QWidget = None + ): + super().__init__(parent) + self.text: str = text + self.file_path: str = file_path + self.player: AudioPlayer = player + self.setFrameShape(QFrame.Shape.StyledPanel) + self.setFrameShadow(QFrame.Shadow.Raised) + + layout = QHBoxLayout(self) + layout.setContentsMargins(5, 5, 5, 5) + + # 编号 + lbl_id = QLabel(f"#{index}") + lbl_id.setFixedWidth(40) + lbl_id.setStyleSheet("font-weight: bold; color: #555;") + + # 文本 + lbl_text = QLabel(text) + lbl_text.setFixedWidth(240) + lbl_text.setToolTip(text) + + # 按钮 - 播放 + btn_play = QPushButton("▶ 播放") + btn_play.setFixedWidth(80) + btn_play.clicked.connect(self._play_audio) + + # 按钮 - 保存 + btn_save = QPushButton("⬇ 保存") + btn_save.setFixedWidth(80) + btn_save.clicked.connect(self._save_file) + + # 按钮 - 删除 (新增) + btn_del = QPushButton("🗑 删除") + btn_del.setFixedWidth(80) + btn_del.setStyleSheet("color: #ff4d4d;") # 以此区分删除按钮 + btn_del.clicked.connect(self._delete_item) + + layout.addWidget(lbl_id) + layout.addWidget(lbl_text, 1) # Stretch + layout.addWidget(btn_play) + layout.addWidget(btn_save) + layout.addWidget(btn_del) # 添加到布局 + + def _play_audio(self): + # 播放前先停止其他播放 + self.player.stop() + self.player.play(self.file_path) + + def _save_file(self): + filename = sanitize_filename(self.text) + save_path, _ = QFileDialog.getSaveFileName( + self, "保存音频", f"{filename}.wav", "WAV Audio (*.wav)" + ) + if save_path: + try: + shutil.copy(self.file_path, save_path) + QMessageBox.information(self, "成功", "文件保存成功!") + except Exception as e: + QMessageBox.critical(self, "错误", f"保存失败: {e}") + + def _delete_item(self): + """删除当前条目及对应的文件""" + # 1. 停止播放 + self.player.stop() + + # 2. 
尝试删除物理文件 (避免垃圾堆积) + try: + if os.path.exists(self.file_path): + os.remove(self.file_path) + print(f"[INFO] 已删除文件: {self.file_path}") + except Exception as e: + print(f"[WARN] 删除文件失败: {e}") + + # 3. 从界面移除自身 + self.deleteLater() + + +class LogWidget(QWidget): + """日志显示Tab""" + + def __init__(self, parent: QWidget = None): + super().__init__(parent) + layout = QVBoxLayout(self) + self.text_edit: QTextEdit = QTextEdit() + self.text_edit.setReadOnly(True) + self.text_edit.setStyleSheet( + "background-color: #1e1e1e;" + "color: #ecf0f1;" + "font-family: Consolas;" + "font-size: 12pt;" + ) + layout.addWidget(self.text_edit) + + @Slot(str) + def append_log(self, text: str): + self.text_edit.moveCursor(QTextCursor.MoveOperation.End) + self.text_edit.insertPlainText(text) + self.text_edit.moveCursor(QTextCursor.MoveOperation.End) + + +class TTSWidget(QWidget): + """TTS 主交互界面""" + + def __init__(self, player: AudioPlayer, parent: QWidget = None): + super().__init__(parent) + self.player: AudioPlayer = player + self.splitter: TextSplitter = TextSplitter() + self.current_gen_id: int = 0 + self.current_worker: Optional[InferenceWorker] = None + + main_layout = QVBoxLayout(self) + + # ---------------- 顶部:预设管理器 ---------------- + self.preset_manager = PresetManager( + presets_file='./UserData/GenieGuiConfig.json', + state_getter=self.get_ui_state, + ) + self.preset_manager.sig_load_state.connect(self.apply_ui_state) + main_layout.addWidget(self.preset_manager) + + # ---------------- 中间:滚动设置区 ---------------- + scroll = QScrollArea() + scroll.setWidgetResizable(True) + content_widget = QWidget() + + content_layout = QHBoxLayout(content_widget) + left_column_layout = QVBoxLayout() + right_column_layout = QVBoxLayout() + + # ==================== 左侧列内容 ==================== + + # 模型设置组 + group_model = QGroupBox("模型设置") + self.layout_model = QFormLayout() + self.combo_model_type = MyComboBox() + self.combo_model_type.addItems(["Genie-TTS"]) + self.combo_model_type.currentTextChanged.connect(self._update_model_ui_visibility) + self.combo_model_type.setEnabled(False) + self.file_gpt = FileSelectorWidget("gpt_path", FileSelectionMode.FILE, "Checkpoints (*.ckpt)") + self.file_vits = FileSelectorWidget("vits_path", FileSelectionMode.FILE, "Models (*.pth)") + self.file_genie = FileSelectorWidget("genie_dir", FileSelectionMode.DIRECTORY) + self.file_gpt.pathChanged.connect(self._on_gpt_path_changed) + self.file_vits.pathChanged.connect(self._on_vits_path_changed) + self.layout_model.addRow("模型类型:", self.combo_model_type) + self.layout_model.addRow("GPT模型 (.ckpt):", self.file_gpt) + self.layout_model.addRow("VITS模型 (.pth):", self.file_vits) + self.layout_model.addRow("Genie模型目录:", self.file_genie) + group_model.setLayout(self.layout_model) + # 参考音频组 + group_ref = QGroupBox("参考音频") + layout_ref = QFormLayout() + self.file_ref_audio = FileSelectorWidget("ref_audio", FileSelectionMode.FILE, "Audio (*.wav *.mp3)") + self.input_ref_text = QLineEdit() + self.input_ref_text.setPlaceholderText("请输入参考音频对应的文本...") + btn_play_ref = QPushButton("▶️") + btn_play_ref.setFixedWidth(30) + btn_play_ref.clicked.connect(self._play_ref_audio) + hbox_ref_text = QHBoxLayout() + hbox_ref_text.addWidget(self.input_ref_text) + hbox_ref_text.addWidget(btn_play_ref) + layout_ref.addRow("音频文件:", self.file_ref_audio) + layout_ref.addRow("音频文本:", hbox_ref_text) + group_ref.setLayout(layout_ref) + + left_column_layout.addWidget(group_model) + left_column_layout.addWidget(group_ref) + left_column_layout.addStretch() + + # ==================== 右侧列内容 
==================== + + # === 推理设置组 === + group_infer = QGroupBox("推理参数") + layout_infer = QFormLayout() + self.combo_device = MyComboBox() + self.combo_device.addItems(["CPU"]) + self.combo_device.setEnabled(False) + self.combo_quality = MyComboBox() + self.combo_quality.addItems(["质量优先"]) + self.combo_quality.setEnabled(False) + self.combo_split = MyComboBox() + self.combo_split.addItems(["不切分", "智能切分", "按行切分"]) + self.combo_mode = MyComboBox() + self.combo_mode.addItems(["串行推理"]) + self.combo_mode.setEnabled(False) + self.combo_lang = MyComboBox() + self.combo_lang.addItems(["Chinese", "English", "Japanese"]) + layout_infer.addRow("推理设备:\n(重启生效)", self.combo_device) + layout_infer.addRow("推理需求:", self.combo_quality) + layout_infer.addRow("分句方式:", self.combo_split) + layout_infer.addRow("推理模式:", self.combo_mode) + layout_infer.addRow("目标语言:", self.combo_lang) + group_infer.setLayout(layout_infer) + + # === 自动保存组 === + group_save = QGroupBox("自动保存设置") + self.layout_save = QFormLayout() + self.combo_save_mode = MyComboBox() + self.combo_save_mode.addItems(["禁用自动保存", "保存为单个文件", "保存为多个文件"]) + self.combo_save_mode.currentIndexChanged.connect(self._update_save_ui_state) + default_out_path = os.path.join(os.path.expanduser("~"), "Desktop", "Genie 输出语音") + self.file_out_dir = FileSelectorWidget("out_dir", FileSelectionMode.DIRECTORY) + self.file_out_dir.set_path(default_out_path) + self.layout_save.addRow("保存方式:", self.combo_save_mode) + self.layout_save.addRow("输出文件夹:", self.file_out_dir) + group_save.setLayout(self.layout_save) + + right_column_layout.addWidget(group_infer) + right_column_layout.addWidget(group_save) + right_column_layout.addStretch() + + content_layout.addLayout(left_column_layout, 1) + content_layout.addLayout(right_column_layout, 1) + scroll.setWidget(content_widget) + main_layout.addWidget(scroll, 5) + + # ==================== 底部:输入控制 + 输出预览 ==================== + + # 创建底部容器 widget + bottom_widget = QWidget() + bottom_layout = QHBoxLayout(bottom_widget) + bottom_layout.setContentsMargins(0, 0, 0, 0) # 去除边距让它贴合 + + # --- 输入控制组 --- + group_input = QGroupBox("目标文本") + layout_input = QVBoxLayout() + self.text_input = MyTextEdit() + self.text_input.setPlaceholderText("请输入要合成的目标文本...") + self.text_input.setFixedHeight(300) + self.btn_start = QPushButton("开始推理") + self.btn_start.setFixedHeight(40) + self.btn_start.setStyleSheet(""" + QPushButton { + background-color: #4CAF50; + color: white; + font-weight: bold; + border-radius: 5px; + } + QPushButton:hover { background-color: #45a049; } + QPushButton:disabled { background-color: #cccccc; } + """) + self.btn_start.clicked.connect(self._start_inference) + layout_input.addWidget(self.text_input) + layout_input.addWidget(self.btn_start) + group_input.setLayout(layout_input) + + # --- 输出预览组 --- + group_preview = QGroupBox("输出音频预览") + preview_layout = QVBoxLayout() + self.preview_scroll = QScrollArea() + self.preview_scroll.setWidgetResizable(True) + self.preview_container = QWidget() + self.preview_list_layout = QVBoxLayout(self.preview_container) + self.preview_list_layout.setAlignment(Qt.AlignmentFlag.AlignTop) + self.preview_scroll.setWidget(self.preview_container) + preview_layout.addWidget(self.preview_scroll) + group_preview.setLayout(preview_layout) + + bottom_layout.addWidget(group_input, 1) + bottom_layout.addWidget(group_preview, 1) + main_layout.addWidget(bottom_widget, 3) + + self.apply_ui_state(self.preset_manager.current_preset_data) + + # ==================== 状态管理接口 (供 PresetManager 调用) ==================== + + 
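Note: the preset plumbing is deliberately one callback plus one signal: PresetManager pulls UI state through the state_getter registered earlier, and pushes state back through sig_load_state into the get_ui_state/apply_ui_state pair that follows. A stripped-down, Qt-free sketch of that contract (hypothetical names throughout):

    from typing import Callable, Dict

    class TinyPresetManager:                     # hypothetical stand-in
        def __init__(self, state_getter: Callable[[], dict]):
            self.state_getter = state_getter
            self.presets: Dict[str, dict] = {"Default": {}}
            self.current = "Default"
            self.on_load: Callable[[dict], None] = lambda d: None  # ~ sig_load_state

        def switch(self, name: str) -> None:
            self.presets[self.current] = self.state_getter()  # pull before leaving
            self.current = name
            self.on_load(self.presets.setdefault(name, {}))   # push on arrival

    ui = {"lang": "Chinese"}
    mgr = TinyPresetManager(lambda: dict(ui))
    mgr.on_load = ui.update
    mgr.switch("B")  # saves Default's state, then loads (empty) B
    assert mgr.presets["Default"] == {"lang": "Chinese"}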
@property + def current_preset_name(self) -> str: + return self.preset_manager.current_preset_name + + @property + def current_preset_data(self) -> dict: + return self.preset_manager.current_preset_data + + def get_ui_state(self) -> dict: + """收集当前UI状态为字典""" + return { + "model_type": self.combo_model_type.currentText(), + "gpt_path": self.file_gpt.get_path(), + "vits_path": self.file_vits.get_path(), + "genie_dir": self.file_genie.get_path(), + "ref_audio": self.file_ref_audio.get_path(), + "ref_text": self.input_ref_text.text(), + "device": self.combo_device.currentText().lower(), + "quality": self.combo_quality.currentText(), + "split": self.combo_split.currentText(), + "mode": self.combo_mode.currentText(), + "lang": self.combo_lang.currentText(), + "save_mode": self.combo_save_mode.currentText(), + "out_dir": self.file_out_dir.get_path() + } + + @Slot(dict) + def apply_ui_state(self, data: dict) -> None: + """将字典数据应用到UI""" + + def set_combo_text(combo: MyComboBox, text: str) -> None: + index = combo.findText(text) + if index >= 0: + combo.setCurrentIndex(index) + + set_combo_text(self.combo_model_type, data.get("model_type", "")) + self.file_gpt.set_path(data.get("gpt_path", ""), block_signals=True) + self.file_vits.set_path(data.get("vits_path", ""), block_signals=True) + self.file_genie.set_path(data.get("genie_dir", "")) + self.file_ref_audio.set_path(data.get("ref_audio", "")) + self.input_ref_text.setText(data.get("ref_text", "")) + + set_combo_text(self.combo_device, data.get("device", "")) + set_combo_text(self.combo_quality, data.get("quality", "")) + set_combo_text(self.combo_split, data.get("split", "")) + set_combo_text(self.combo_mode, data.get("mode", "")) + set_combo_text(self.combo_lang, data.get("lang", "")) + set_combo_text(self.combo_save_mode, data.get("save_mode", "")) + + self.file_out_dir.set_path(data.get("out_dir", "")) + + # 确保UI显隐状态正确 + self._update_model_ui_visibility() + self._update_save_ui_state() + + # ==================== UI 逻辑处理 ==================== + + def _update_model_ui_visibility(self, *args) -> None: + """根据模型类型控制文件选择器的显隐""" + is_gpt = self.combo_model_type.currentText() == "GPT-SoVITS" + self.layout_model.setRowVisible(self.file_gpt, is_gpt) + self.layout_model.setRowVisible(self.file_vits, is_gpt) + self.layout_model.setRowVisible(self.file_genie, not is_gpt) + + @Slot(str) + def _on_gpt_path_changed(self, path: str): + if path and os.path.exists(path) and not self.file_vits.get_path(): + self._try_auto_fill_sibling(path, ".pth", self.file_vits) + + @Slot(str) + def _on_vits_path_changed(self, path: str): + if path and os.path.exists(path) and not self.file_gpt.get_path(): + self._try_auto_fill_sibling(path, ".ckpt", self.file_gpt) + + @staticmethod + def _try_auto_fill_sibling(current_path: str, target_ext: str, target_widget: FileSelectorWidget): + try: + directory = os.path.dirname(current_path) + if not os.path.exists(directory): + return + for f in os.listdir(directory): + if f.lower().endswith(target_ext.lower()): + full_path = os.path.join(directory, f) + target_widget.set_path(full_path) + print(f"[INFO] 自动关联模型文件: {full_path}") + break + except Exception as e: + print(f"[WARN] 自动关联文件失败: {e}") + + def _update_save_ui_state(self) -> None: + enabled = self.combo_save_mode.currentText() != "禁用自动保存" + self.layout_save.setRowVisible(self.file_out_dir, enabled) + + def _play_ref_audio(self) -> None: + path = self.file_ref_audio.get_path() + if os.path.exists(path): + self.player.stop() + self.player.play(path) + else: + 
QMessageBox.warning(self, "错误", "参考音频文件不存在") + + def _get_split_texts(self, text: str) -> List[str]: + method = self.combo_split.currentText() + if method == "不切分": + return [text] + elif method == "按行切分": + return [line.strip() for line in text.split('\n') if line.strip()] + elif method == "智能切分": + return self.splitter.split(text) + return [text] + + def _start_inference(self) -> None: + text = self.text_input.toPlainText().strip() + if not text: + QMessageBox.warning(self, "提示", "请输入目标文本") + return + + ref_path = self.file_ref_audio.get_path() + ref_text = self.input_ref_text.text().strip() + if not ref_path or not ref_text: + QMessageBox.warning(self, "提示", "请设置参考音频") + return + + if not self.file_genie.get_path(): + QMessageBox.warning(self, "提示", "请选择Genie模型目录") + return + + out_dir = self.file_out_dir.get_path() + save_mode = self.combo_save_mode.currentText() + if not out_dir and save_mode != "禁用自动保存": + desktop = os.path.join(os.path.expanduser("~"), "Desktop", "Genie Output") + self.file_out_dir.set_path(desktop) + print(f"[INFO] 未设置输出文件夹, 将在桌面创建!") + + self.btn_start.setEnabled(False) + self.btn_start.setText("推理中...") + self._chain_import_model() + + # ==================== 推理链式调用 ==================== + + def _chain_import_model(self) -> None: + req = { + "character_name": self.current_preset_name, + "onnx_model_dir": self.file_genie.get_path(), + "language": self.combo_lang.currentText(), + } + worker = InferenceWorker(req, mode="load_character") + worker.finished.connect(lambda s, m, d: self._on_import_finished(s, m)) + worker.start() + self.current_worker = worker + + @Slot(bool, str) + def _on_import_finished(self, success: bool, msg: str) -> None: + if not success: + self._reset_ui_state() + QMessageBox.critical(self, "模型加载失败", msg) + return + print(f"[INFO] {msg}") + self._chain_set_ref() + + def _chain_set_ref(self) -> None: + req = { + "character_name": self.current_preset_name, + "audio_path": self.file_ref_audio.get_path(), + "audio_text": self.input_ref_text.text().strip(), + "language": self.combo_lang.currentText(), + } + worker = InferenceWorker(req, mode="set_reference_audio") + worker.finished.connect(lambda s, m, d: self._on_set_ref_finished(s, m)) + worker.start() + self.current_worker = worker + + @Slot(bool, str) + def _on_set_ref_finished(self, success: bool, msg: str) -> None: + if not success: + self._reset_ui_state() + QMessageBox.critical(self, "设置参考音频失败", msg) + return + print(f"[INFO] {msg}") + self._chain_tts() + + def _chain_tts(self) -> None: + text_full = self.text_input.toPlainText().strip() + text_list = self._get_split_texts(text_full) + + print(f"[INFO] 开始串行推理, 分句结果: {text_list}") + self._process_serial_step(0, text_list, [], 32000) + + def _process_serial_step( + self, + index: int, + text_list: List[str], + audio_accumulator: List[np.ndarray], + sample_rate: int + ) -> None: + # 1. 
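Note: the serial step logic that continues below advances by re-arming a callback from each worker's finished signal instead of looping, so the GUI thread never blocks; with a real asynchronous worker each step returns to the event loop before the next one starts, so there is no unbounded recursion. The same control flow in miniature, with a plain callable standing in for InferenceWorker (hypothetical run_serial/do_job names):

    from typing import Callable, List

    def run_serial(jobs: List[str],
                   do_job: Callable[[str, Callable[[str], None]], None]) -> None:
        results: List[str] = []

        def step(index: int) -> None:
            if index >= len(jobs):              # termination: every sentence handled
                print("all done:", results)
                return

            def on_finished(out: str) -> None:  # continuation arms the next step
                results.append(out)
                step(index + 1)

            do_job(jobs[index], on_finished)

        step(0)

    # synchronous stand-in for the async worker:
    run_serial(["a", "b"], lambda text, cb: cb(text.upper()))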
终止条件:所有句子处理完毕
+        if index >= len(text_list):
+            save_mode = self.combo_save_mode.currentText()
+            out_dir = self.file_out_dir.get_path()
+
+            if out_dir:
+                os.makedirs(out_dir, exist_ok=True)
+
+            if audio_accumulator and save_mode != "保存为多个文件":
+                full_text = ''.join(text_list)
+                full_audio = np.concatenate(audio_accumulator, axis=0)
+                if save_mode == "保存为单个文件":
+                    target_names = generate_output_filenames(folder=out_dir, original_texts=[full_text])
+                    save_path = os.path.join(out_dir, target_names[0])
+                else:  # "禁用自动保存"
+                    save_path = os.path.join(CACHE_DIR, f"{uuid.uuid4().hex}.wav")
+                sf.write(save_path, data=full_audio, samplerate=sample_rate, subtype='PCM_16')
+                self._add_to_preview(full_text, save_path)
+
+            print(f"\n[INFO] 串行推理全部完成,共 {len(text_list)} 句。")
+            self._reset_ui_state()
+            return
+
+        # 2. 递归进行:发起当前句子的请求
+        req = {
+            "character_name": self.current_preset_name,
+            "text": text_list[index],
+        }
+        worker = InferenceWorker(req, mode="tts")
+        worker.finished.connect(
+            lambda s, m, d: self._on_serial_step_finished(s, m, d, index, text_list, audio_accumulator)
+        )
+        worker.start()
+        self.current_worker = worker
+
+    @Slot(bool, str, object, int, object, object)
+    def _on_serial_step_finished(
+            self,
+            success: bool,
+            msg: str,
+            return_data: dict,
+            index: int,
+            text_list: List[str],
+            audio_accumulator: List[np.ndarray]
+    ) -> None:
+        if not success:
+            self._reset_ui_state()
+            QMessageBox.critical(self, "推理失败", f"第 {index + 1} 句出错: {msg}")
+            return
+
+        sr = return_data.get("sample_rate", 32000)
+        audio_list = return_data.get("audio_list", [])
+        save_mode = self.combo_save_mode.currentText()
+        out_dir = self.file_out_dir.get_path()
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+
+        if audio_list:
+            audio_accumulator.append(audio_list[0])
+            if save_mode == "保存为多个文件":
+                target_names = generate_output_filenames(folder=out_dir, original_texts=[text_list[index]])
+                save_path = os.path.join(out_dir, target_names[0])
+                sf.write(save_path, data=audio_list[0], samplerate=sr, subtype='PCM_16')
+                self._add_to_preview(text_list[index], save_path)
+        else:
+            print(f"[WARN] 第 {index + 1} 句返回空音频")
+
+        # 继续处理下一句
+        self._process_serial_step(index + 1, text_list, audio_accumulator, sr)
+
+    def _add_to_preview(self, text: str, path: str) -> None:
+        item = PreviewItemWidget(self.current_gen_id, text, path, self.player)
+        self.preview_list_layout.insertWidget(0, item)
+        self.current_gen_id += 1
+
+    def _reset_ui_state(self) -> None:
+        self.btn_start.setEnabled(True)
+        self.btn_start.setText("开始推理")
+
+    def closeEvent(self, event: QCloseEvent) -> None:
+        # 委托 PresetManager 处理保存逻辑
+        self.preset_manager.shutdown()
+        super().closeEvent(event)
+
+
+class MainWindow(QMainWindow):
+    def __init__(self):
+        super().__init__()
+        self.setWindowTitle("Genie TTS Inference GUI")
+        self.resize(1300, 900)
+
+        # 初始化音频播放器
+        self.player: AudioPlayer = AudioPlayer()
+
+        # 初始化日志重定向
+        self.log_widget: LogWidget = LogWidget()
+        sys.stdout = LogRedirector()
+        sys.stdout.textWritten.connect(self.log_widget.append_log)
+
+        # 初始化主界面
+        self.tabs: QTabWidget = QTabWidget()
+        self.tts_widget = TTSWidget(self.player)
+        self.conv_widget = ConverterWidget()
+
+        self.tabs.addTab(self.log_widget, "GUI Log")
+        self.tabs.addTab(self.tts_widget, "TTS Inference")
+        self.tabs.addTab(self.conv_widget, "Converter")
+        self.tabs.setCurrentIndex(1)  # 默认显示TTS页
+
+        self.setCentralWidget(self.tabs)
+
+    def closeEvent(self, event: QCloseEvent) -> None:
+        if os.path.exists(CACHE_DIR):
+            shutil.rmtree(CACHE_DIR)
+        if hasattr(self, 'player'):
+            
self.player.stop() + # 线程安全退出后,再恢复 stdout + sys.stdout = sys.__stdout__ + if hasattr(self, 'tts_widget'): + self.tts_widget.closeEvent(event) + event.accept() diff --git a/genie_tts/GUI/PresetManager.py b/genie_tts/GUI/PresetManager.py index 15f25131bfcfb19f10cea7267651241c8f2be497..9f62a8bf841b127b2546d0972c569732c5b87d25 100644 --- a/genie_tts/GUI/PresetManager.py +++ b/genie_tts/GUI/PresetManager.py @@ -1,188 +1,188 @@ -import os -import json -from typing import Callable, Optional, Dict - -from PySide6.QtWidgets import ( - QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, - QComboBox, QGroupBox, QMessageBox, QInputDialog -) -from PySide6.QtCore import Signal, QSettings, Slot - - -class PresetManager(QWidget): - # 信号:通知主界面加载数据 - sig_load_state = Signal(dict) - - def __init__(self, - presets_file: str, - state_getter: Callable[[], dict], - parent: QWidget = None): - super().__init__(parent) - self.presets_file: str = presets_file - self.state_getter: Callable[[], dict] = state_getter - self.presets: Dict[str, dict] = {} - self.current_preset_name: Optional[str] = None - self._init_ui() - self._load_from_disk() - # 初始化完成后,应用一次初始状态 - self._apply_initial_preset() - - def _init_ui(self): - layout = QVBoxLayout(self) - layout.setContentsMargins(0, 0, 0, 0) - - group = QGroupBox("预设管理 (自动保存)") - h_layout = QHBoxLayout() - - self.lbl_info = QLabel("当前:") - - self.combo_presets = QComboBox() - self.combo_presets.textActivated.connect(self._on_preset_switch_triggered) - - btn_new = QPushButton("新建") - btn_new.clicked.connect(self.create_preset) - - # === 新增:重命名按钮 === - btn_rename = QPushButton("重命名") - btn_rename.clicked.connect(self.rename_preset) - # ===================== - - btn_del = QPushButton("删除") - btn_del.clicked.connect(self.delete_preset) - - h_layout.addWidget(self.lbl_info) - h_layout.addWidget(self.combo_presets, 1) - h_layout.addWidget(btn_new) - h_layout.addWidget(btn_rename) # 添加到布局 - h_layout.addWidget(btn_del) - - group.setLayout(h_layout) - layout.addWidget(group) - - @property - def current_preset_data(self) -> dict: - return self.presets.get(self.current_preset_name, {}) - - def _load_from_disk(self): - """从磁盘加载 JSON""" - if os.path.exists(self.presets_file): - try: - with open(self.presets_file, 'r', encoding='utf-8') as f: - self.presets = json.load(f) - except Exception as e: - print(f"[ERROR] 预设文件损坏: {e}") - self.presets = {} - - if not self.presets: - self.presets = {"Default": {}} - - self.combo_presets.clear() - self.combo_presets.addItems(list(self.presets.keys())) - - def _save_to_disk(self): - """写入磁盘""" - try: - os.makedirs(os.path.dirname(self.presets_file), exist_ok=True) - with open(self.presets_file, 'w', encoding='utf-8') as f: - json.dump(self.presets, f, indent=4, ensure_ascii=False) - except Exception as e: - print(f"[ERROR] 保存预设失败: {e}") - - def _apply_initial_preset(self): - """初始化时恢复上次的选择""" - last_used = QSettings("MyTTS", "GUI").value("last_preset", "Default") - if last_used not in self.presets and self.combo_presets.count() > 0: - last_used = self.combo_presets.itemText(0) - self.current_preset_name = last_used - self.combo_presets.setCurrentText(last_used) - self._load_preset_data(last_used) - - @Slot(str) - def _on_preset_switch_triggered(self, new_preset_name: str): - if new_preset_name == self.current_preset_name: - return - if self.current_preset_name: - self._save_current_state_to_memory(self.current_preset_name) - self._load_preset_data(new_preset_name) - self.current_preset_name = new_preset_name - QSettings("MyTTS", 
"GUI").setValue("last_preset", new_preset_name) - self._save_to_disk() - - def _save_current_state_to_memory(self, preset_name: str): - """调用回调获取主界面状态,并更新到内存字典""" - if self.state_getter and preset_name in self.presets: - current_data = self.state_getter() - self.presets[preset_name] = current_data - - def _load_preset_data(self, preset_name: str): - """发送信号给主界面加载数据""" - data = self.presets.get(preset_name, {}) - self.sig_load_state.emit(data) - print(f"[INFO] 已加载预设: {preset_name}") - - # ================= 公共接口 ================= - - def create_preset(self): - """新建预设""" - if self.current_preset_name: - self._save_current_state_to_memory(self.current_preset_name) - - name, ok = QInputDialog.getText(self, "新建预设", "名称:") - if ok and name: - if name in self.presets: - QMessageBox.warning(self, "警告", "预设名已存在") - return - self.presets[name] = {} - self.combo_presets.addItem(name) - self.combo_presets.setCurrentText(name) - self.current_preset_name = name - self._save_to_disk() - self._load_preset_data(name) - print(f"[INFO] 已创建预设: {name}") - - def rename_preset(self): - """重命名当前预设""" - current_name = self.current_preset_name - if not current_name: - return - - # 先保存当前状态到内存,确保重命名时带走的是最新数据 - self._save_current_state_to_memory(current_name) - - new_name, ok = QInputDialog.getText(self, "重命名预设", "新名称:", text=current_name) - if ok and new_name and new_name != current_name: - if new_name in self.presets: - QMessageBox.warning(self, "警告", "预设名已存在") - return - # 迁移数据 - self.presets[new_name] = self.presets.pop(current_name) - self.current_preset_name = new_name - # 更新下拉框显示的文本(更新当前选中的这一项) - current_index = self.combo_presets.currentIndex() - self.combo_presets.setItemText(current_index, new_name) - # 更新配置记录 - QSettings("MyTTS", "GUI").setValue("last_preset", new_name) - self._save_to_disk() - print(f"[INFO] 已重命名预设: {current_name} -> {new_name}") - - def delete_preset(self): - """删除当前预设""" - target = self.current_preset_name - if len(self.presets) <= 1: - QMessageBox.warning(self, "禁止", "至少保留一个预设") - return - - if QMessageBox.StandardButton.Yes == QMessageBox.question(self, "确认", f"删除 '{target}'?"): - del self.presets[target] - self.combo_presets.removeItem(self.combo_presets.currentIndex()) - new_name = self.combo_presets.currentText() - self.current_preset_name = new_name - self._load_preset_data(new_name) - self._save_to_disk() - print(f"[INFO] 已删除预设: {target}") - - def shutdown(self): - """关闭时触发""" - if self.current_preset_name: - self._save_current_state_to_memory(self.current_preset_name) - self._save_to_disk() +import os +import json +from typing import Callable, Optional, Dict + +from PySide6.QtWidgets import ( + QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, + QComboBox, QGroupBox, QMessageBox, QInputDialog +) +from PySide6.QtCore import Signal, QSettings, Slot + + +class PresetManager(QWidget): + # 信号:通知主界面加载数据 + sig_load_state = Signal(dict) + + def __init__(self, + presets_file: str, + state_getter: Callable[[], dict], + parent: QWidget = None): + super().__init__(parent) + self.presets_file: str = presets_file + self.state_getter: Callable[[], dict] = state_getter + self.presets: Dict[str, dict] = {} + self.current_preset_name: Optional[str] = None + self._init_ui() + self._load_from_disk() + # 初始化完成后,应用一次初始状态 + self._apply_initial_preset() + + def _init_ui(self): + layout = QVBoxLayout(self) + layout.setContentsMargins(0, 0, 0, 0) + + group = QGroupBox("预设管理 (自动保存)") + h_layout = QHBoxLayout() + + self.lbl_info = QLabel("当前:") + + self.combo_presets = QComboBox() + 
self.combo_presets.textActivated.connect(self._on_preset_switch_triggered) + + btn_new = QPushButton("新建") + btn_new.clicked.connect(self.create_preset) + + # === 新增:重命名按钮 === + btn_rename = QPushButton("重命名") + btn_rename.clicked.connect(self.rename_preset) + # ===================== + + btn_del = QPushButton("删除") + btn_del.clicked.connect(self.delete_preset) + + h_layout.addWidget(self.lbl_info) + h_layout.addWidget(self.combo_presets, 1) + h_layout.addWidget(btn_new) + h_layout.addWidget(btn_rename) # 添加到布局 + h_layout.addWidget(btn_del) + + group.setLayout(h_layout) + layout.addWidget(group) + + @property + def current_preset_data(self) -> dict: + return self.presets.get(self.current_preset_name, {}) + + def _load_from_disk(self): + """从磁盘加载 JSON""" + if os.path.exists(self.presets_file): + try: + with open(self.presets_file, 'r', encoding='utf-8') as f: + self.presets = json.load(f) + except Exception as e: + print(f"[ERROR] 预设文件损坏: {e}") + self.presets = {} + + if not self.presets: + self.presets = {"Default": {}} + + self.combo_presets.clear() + self.combo_presets.addItems(list(self.presets.keys())) + + def _save_to_disk(self): + """写入磁盘""" + try: + os.makedirs(os.path.dirname(self.presets_file), exist_ok=True) + with open(self.presets_file, 'w', encoding='utf-8') as f: + json.dump(self.presets, f, indent=4, ensure_ascii=False) + except Exception as e: + print(f"[ERROR] 保存预设失败: {e}") + + def _apply_initial_preset(self): + """初始化时恢复上次的选择""" + last_used = QSettings("MyTTS", "GUI").value("last_preset", "Default") + if last_used not in self.presets and self.combo_presets.count() > 0: + last_used = self.combo_presets.itemText(0) + self.current_preset_name = last_used + self.combo_presets.setCurrentText(last_used) + self._load_preset_data(last_used) + + @Slot(str) + def _on_preset_switch_triggered(self, new_preset_name: str): + if new_preset_name == self.current_preset_name: + return + if self.current_preset_name: + self._save_current_state_to_memory(self.current_preset_name) + self._load_preset_data(new_preset_name) + self.current_preset_name = new_preset_name + QSettings("MyTTS", "GUI").setValue("last_preset", new_preset_name) + self._save_to_disk() + + def _save_current_state_to_memory(self, preset_name: str): + """调用回调获取主界面状态,并更新到内存字典""" + if self.state_getter and preset_name in self.presets: + current_data = self.state_getter() + self.presets[preset_name] = current_data + + def _load_preset_data(self, preset_name: str): + """发送信号给主界面加载数据""" + data = self.presets.get(preset_name, {}) + self.sig_load_state.emit(data) + print(f"[INFO] 已加载预设: {preset_name}") + + # ================= 公共接口 ================= + + def create_preset(self): + """新建预设""" + if self.current_preset_name: + self._save_current_state_to_memory(self.current_preset_name) + + name, ok = QInputDialog.getText(self, "新建预设", "名称:") + if ok and name: + if name in self.presets: + QMessageBox.warning(self, "警告", "预设名已存在") + return + self.presets[name] = {} + self.combo_presets.addItem(name) + self.combo_presets.setCurrentText(name) + self.current_preset_name = name + self._save_to_disk() + self._load_preset_data(name) + print(f"[INFO] 已创建预设: {name}") + + def rename_preset(self): + """重命名当前预设""" + current_name = self.current_preset_name + if not current_name: + return + + # 先保存当前状态到内存,确保重命名时带走的是最新数据 + self._save_current_state_to_memory(current_name) + + new_name, ok = QInputDialog.getText(self, "重命名预设", "新名称:", text=current_name) + if ok and new_name and new_name != current_name: + if new_name in self.presets: + 
QMessageBox.warning(self, "警告", "预设名已存在") + return + # 迁移数据 + self.presets[new_name] = self.presets.pop(current_name) + self.current_preset_name = new_name + # 更新下拉框显示的文本(更新当前选中的这一项) + current_index = self.combo_presets.currentIndex() + self.combo_presets.setItemText(current_index, new_name) + # 更新配置记录 + QSettings("MyTTS", "GUI").setValue("last_preset", new_name) + self._save_to_disk() + print(f"[INFO] 已重命名预设: {current_name} -> {new_name}") + + def delete_preset(self): + """删除当前预设""" + target = self.current_preset_name + if len(self.presets) <= 1: + QMessageBox.warning(self, "禁止", "至少保留一个预设") + return + + if QMessageBox.StandardButton.Yes == QMessageBox.question(self, "确认", f"删除 '{target}'?"): + del self.presets[target] + self.combo_presets.removeItem(self.combo_presets.currentIndex()) + new_name = self.combo_presets.currentText() + self.current_preset_name = new_name + self._load_preset_data(new_name) + self._save_to_disk() + print(f"[INFO] 已删除预设: {target}") + + def shutdown(self): + """关闭时触发""" + if self.current_preset_name: + self._save_current_state_to_memory(self.current_preset_name) + self._save_to_disk() diff --git a/genie_tts/GUI/ServerManager.py b/genie_tts/GUI/ServerManager.py index 7928b80aba69632032eff0a25143faebe7c719c1..389216058a9075aa6b20d14f85226bbb81ee0d20 100644 --- a/genie_tts/GUI/ServerManager.py +++ b/genie_tts/GUI/ServerManager.py @@ -1,61 +1,61 @@ -from PySide6.QtCore import Signal, QThread - -from ..Utils.Shared import context -from ..Internal import load_character, set_reference_audio -from ..Core.Inference import tts_client -from ..ModelManager import model_manager - - -class InferenceWorker(QThread): - """执行推理任务的 Worker""" - finished = Signal(bool, str, object) # success, message, data - - def __init__(self, request_data: dict, mode: str): - super().__init__() - self.req: dict = request_data - self.mode: str = mode - - def run(self) -> None: - try: - if self.mode == 'load_character': - load_character( - character_name=self.req['character_name'], - onnx_model_dir=self.req['onnx_model_dir'], - language=self.req['language'], - ) - self.finished.emit(True, "导入角色完成", None) - - elif self.mode == 'set_reference_audio': - set_reference_audio( - character_name=self.req['character_name'], - audio_path=self.req['audio_path'], - audio_text=self.req['audio_text'], - language=self.req['language'], - ) - self.finished.emit(True, "设置参考音频完成", None) - - elif self.mode == 'tts': - gsv_model = model_manager.get(self.req['character_name']) - tts_client.stop_event.clear() - audio_chunk = tts_client.tts( - text=self.req['text'], - prompt_audio=context.current_prompt_audio, - encoder=gsv_model.T2S_ENCODER, - first_stage_decoder=gsv_model.T2S_FIRST_STAGE_DECODER, - stage_decoder=gsv_model.T2S_STAGE_DECODER, - vocoder=gsv_model.VITS, - prompt_encoder=gsv_model.PROMPT_ENCODER, - language=gsv_model.LANGUAGE, - ) - audio_chunk = audio_chunk.squeeze() - try: - return_data = { - "sample_rate": 32000, - "audio_list": [audio_chunk], - } - self.finished.emit(True, "推理完成", return_data) - except Exception as e: - self.finished.emit(False, f"数据解析失败: {e}", None) - - except Exception as e: - self.finished.emit(False, f"请求异常: {str(e)}", None) +from PySide6.QtCore import Signal, QThread + +from ..Utils.Shared import context +from ..Internal import load_character, set_reference_audio +from ..Core.Inference import tts_client +from ..ModelManager import model_manager + + +class InferenceWorker(QThread): + """执行推理任务的 Worker""" + finished = Signal(bool, str, object) # success, message, data + + def 
__init__(self, request_data: dict, mode: str): + super().__init__() + self.req: dict = request_data + self.mode: str = mode + + def run(self) -> None: + try: + if self.mode == 'load_character': + load_character( + character_name=self.req['character_name'], + onnx_model_dir=self.req['onnx_model_dir'], + language=self.req['language'], + ) + self.finished.emit(True, "导入角色完成", None) + + elif self.mode == 'set_reference_audio': + set_reference_audio( + character_name=self.req['character_name'], + audio_path=self.req['audio_path'], + audio_text=self.req['audio_text'], + language=self.req['language'], + ) + self.finished.emit(True, "设置参考音频完成", None) + + elif self.mode == 'tts': + gsv_model = model_manager.get(self.req['character_name']) + tts_client.stop_event.clear() + audio_chunk = tts_client.tts( + text=self.req['text'], + prompt_audio=context.current_prompt_audio, + encoder=gsv_model.T2S_ENCODER, + first_stage_decoder=gsv_model.T2S_FIRST_STAGE_DECODER, + stage_decoder=gsv_model.T2S_STAGE_DECODER, + vocoder=gsv_model.VITS, + prompt_encoder=gsv_model.PROMPT_ENCODER, + language=gsv_model.LANGUAGE, + ) + audio_chunk = audio_chunk.squeeze() + try: + return_data = { + "sample_rate": 32000, + "audio_list": [audio_chunk], + } + self.finished.emit(True, "推理完成", return_data) + except Exception as e: + self.finished.emit(False, f"数据解析失败: {e}", None) + + except Exception as e: + self.finished.emit(False, f"请求异常: {str(e)}", None) diff --git a/genie_tts/GUI/Utils.py b/genie_tts/GUI/Utils.py index c801e2b4635bcba08c40aa64926b9ad408252727..adca79cfa3f40bfdc87841779288c4ac2e725ad7 100644 --- a/genie_tts/GUI/Utils.py +++ b/genie_tts/GUI/Utils.py @@ -1,257 +1,257 @@ -import os -import re -from enum import Enum -from datetime import datetime -import socket -from typing import List - -from PySide6.QtWidgets import ( - QWidget, QHBoxLayout, QPushButton, QLineEdit, QFileDialog, QMessageBox, QComboBox, QTextEdit -) -from PySide6.QtCore import (Qt, Signal, QSettings, Property, QObject, QEvent, QMimeData) -from PySide6.QtGui import QFont - - -def sanitize_filename(filename: str, replacement: str = '') -> str: - """ - 将文本清理为合法的 Windows 文件名。 - - Args: - filename (str): 原始文件名。 - replacement (str): 非法字符的替换字符,默认为空,建议使用 "_"。 - - Returns: - str: 清理后的文件名。 - """ - # 1. 去除非法字符 - # \/:*?"<>| 是标准非法字符 - # \x00-\x1f 是控制字符 (如换行符、制表符等),Windows 也不允许 - cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', replacement, filename) - - # 2. 去除首尾的空格和点 - # Windows 文件名不能以空格或点结尾,也不能以空格开头(虽然允许但通常不推荐) - cleaned = cleaned.strip().rstrip('.') - - # 3. 处理 Windows 保留文件名 (CON, PRN, AUX, NUL, COM1-9, LPT1-9) - # 这些名字不论加什么扩展名都是非法的 (例如 con.txt 也是非法的) - reserved_names = { - "CON", "PRN", "AUX", "NUL", - "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", - "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9" - } - - # 如果文件名(全大写)是保留字,或者文件名是保留字+扩展名(如 con.txt),则加下划线前缀 - filename_upper = cleaned.upper() - file_stem = filename_upper.split('.')[0] # 获取不带后缀的主文件名 - - if filename_upper in reserved_names or file_stem in reserved_names: - cleaned = "_" + cleaned - - # 4. 处理空文件名 (如果输入全是乱码被删光了) - if not cleaned: - cleaned = "unnamed_file" - - # 5. 
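Note: InferenceWorker's only output channel is the finished(success, message, data) signal, and the GUI keeps the thread in self.current_worker so it is not garbage-collected mid-run. A minimal usage sketch, assuming a running Qt application and an already-loaded character:

    from genie_tts.GUI.ServerManager import InferenceWorker

    req = {"character_name": "default", "text": "你好"}  # example payload
    worker = InferenceWorker(req, mode="tts")

    def on_done(success: bool, msg: str, data) -> None:
        if success and data:
            print(msg, data["sample_rate"], len(data["audio_list"]))
        else:
            print("failed:", msg)

    worker.finished.connect(on_done)
    worker.start()        # run() executes off the GUI thread
    keep_alive = worker   # hold a reference until finished fires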
限制长度 (Windows API 通常限制 255 字符,但在某些路径下更短) - cleaned = truncate_text(cleaned) - - return cleaned - - -def is_port_free(port: int) -> bool: - """返回端口是否可用""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - s.bind(('', port)) - except OSError: - return False - return True - - -def find_free_port(preferred: int = 8000) -> int: - # 先尝试 preferred 端口 - if is_port_free(preferred): - return preferred - - # 否则用系统自动分配 - s = socket.socket() - s.bind(('', 0)) - port = s.getsockname()[1] - s.close() - return port - - -def truncate_text(text: str, max_len: int = 30) -> str: - """ - 按指定宽度截断文本 - """ - result = "" - cur_len = 0 - for ch in text: - char_len = 2 if ('\u4e00' <= ch <= '\u9fff') else 1 - if cur_len + char_len > max_len: - break - result += ch - cur_len += char_len - return result - - -def generate_output_filenames(folder: str, original_texts: List[str]) -> List[str]: - """ - 批量生成文件名: - 输入 original_texts 列表 - 输出等长 filenames 列表 - """ - today = datetime.now().strftime("%Y%m%d") - pattern = re.compile(rf'^\[{today}]\[(\d{{3}})]') - - # ① 查找当天现存最大编号 - max_n = 0 - if os.path.isdir(folder): - for name in os.listdir(folder): - m = pattern.match(name) - if m: - n = int(m.group(1)) - max_n = max(max_n, n) - - filenames = [] - cur_n = max_n - - # ② 依次生成新文件名 - for text in original_texts: - cur_n += 1 - n_str = f"{cur_n:03d}" - - cleaned = sanitize_filename(text) - - filename = f"[{today}][{n_str}]{cleaned}.wav" - filenames.append(filename) - - return filenames - - -# ==================== 通用组件 ==================== - -class FileSelectionMode(Enum): - FILE = 0 - DIRECTORY = 1 - - -class FileSelectorWidget(QWidget): - """一个包含行编辑和浏览按钮的复合控件,支持文件和文件夹选择。(原样引用并稍作适配)""" - pathChanged = Signal(str) - - def __init__( - self, - setting_key: str, - selection_mode: FileSelectionMode = FileSelectionMode.DIRECTORY, - file_filter: str = "All Files (*)", - parent: QWidget = None, - ): - super().__init__(parent) - self.setting_key: str = setting_key - self.selection_mode: FileSelectionMode = selection_mode - self.file_filter: str = file_filter - - # 使用 QSettings 模拟 STUDIO_SETTINGS - self.settings = QSettings("MyTTS", "GUI") - - layout = QHBoxLayout(self) - layout.setContentsMargins(0, 0, 0, 0) - layout.setSpacing(5) - - self.path_edit: QLineEdit = QLineEdit() - self.path_edit.setReadOnly(True) - self.path_edit.setPlaceholderText("未选择路径") - - self.browse_button: QPushButton = QPushButton("📁") - self.browse_button.setCursor(Qt.CursorShape.PointingHandCursor) - default_font = QFont() - default_font.setPointSize(10) - self.browse_button.setFont(default_font) - self.browse_button.setFixedSize(30, 30) - - self.clear_button: QPushButton = QPushButton("❌") - self.clear_button.setCursor(Qt.CursorShape.PointingHandCursor) - self.clear_button.setFont(default_font) - self.clear_button.setFixedSize(30, 30) - - layout.addWidget(self.path_edit) - layout.addWidget(self.browse_button) - layout.addWidget(self.clear_button) - - self.browse_button.clicked.connect(self._open_dialog) - self.clear_button.clicked.connect(self._clear_path) - self.path_edit.textChanged.connect(self.pathChanged) - - def _open_dialog(self): - path = self.path_edit.text() - if path and os.path.exists(path): - start_path = path - else: - # 【修改点】默认路径改为 Desktop - desktop_path = os.path.join(os.path.expanduser("~"), "Desktop") - start_path = str(self.settings.value( - f"last_path_{self.setting_key}", defaultValue=desktop_path - )) - - if self.selection_mode == FileSelectionMode.DIRECTORY: - selected_path = QFileDialog.getExistingDirectory( - self, 
"选择文件夹", start_path - ) - else: - selected_path, _ = QFileDialog.getOpenFileName( - self, "选择文件", start_path, self.file_filter - ) - - if selected_path: - self.set_path(selected_path) - parent_path = os.path.dirname(selected_path) - if parent_path: - self.settings.setValue( - f"last_path_{self.setting_key}", parent_path) - - def _clear_path(self): - if not self.path_edit.text(): - return - reply = QMessageBox.question(self, '确认', '您确定要清空路径吗?', - QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, - QMessageBox.StandardButton.No) - if reply == QMessageBox.StandardButton.Yes: - self.set_path("") - - def get_path(self) -> str: - text = self.path_edit.text() - # 即使路径不存在(可能是输入时),也返回文本供逻辑判断,或者严格校验 - return text - - def set_path(self, path: str, block_signals: bool = False): - if block_signals: - self.path_edit.blockSignals(True) - self.path_edit.setText(path) - if block_signals: - self.path_edit.blockSignals(False) - - path = Property(str, fget=get_path, fset=set_path, notify=pathChanged) # type: ignore - - -class WheelEventFilter(QObject): - def eventFilter(self, obj, event): - if event.type() == QEvent.Type.Wheel and isinstance(obj, QComboBox): - return True # 阻止默认滚轮行为 - return super().eventFilter(obj, event) - - -class MyComboBox(QComboBox): - def __init__(self, parent: QWidget = None): - super().__init__(parent) - self._wheelFilter = WheelEventFilter() - self.installEventFilter(self._wheelFilter) - - -class MyTextEdit(QTextEdit): - def insertFromMimeData(self, source: QMimeData) -> None: - # 仅取纯文本 - if source.hasText(): - self.insertPlainText(source.text()) - else: - super().insertFromMimeData(source) +import os +import re +from enum import Enum +from datetime import datetime +import socket +from typing import List + +from PySide6.QtWidgets import ( + QWidget, QHBoxLayout, QPushButton, QLineEdit, QFileDialog, QMessageBox, QComboBox, QTextEdit +) +from PySide6.QtCore import (Qt, Signal, QSettings, Property, QObject, QEvent, QMimeData) +from PySide6.QtGui import QFont + + +def sanitize_filename(filename: str, replacement: str = '') -> str: + """ + 将文本清理为合法的 Windows 文件名。 + + Args: + filename (str): 原始文件名。 + replacement (str): 非法字符的替换字符,默认为空,建议使用 "_"。 + + Returns: + str: 清理后的文件名。 + """ + # 1. 去除非法字符 + # \/:*?"<>| 是标准非法字符 + # \x00-\x1f 是控制字符 (如换行符、制表符等),Windows 也不允许 + cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', replacement, filename) + + # 2. 去除首尾的空格和点 + # Windows 文件名不能以空格或点结尾,也不能以空格开头(虽然允许但通常不推荐) + cleaned = cleaned.strip().rstrip('.') + + # 3. 处理 Windows 保留文件名 (CON, PRN, AUX, NUL, COM1-9, LPT1-9) + # 这些名字不论加什么扩展名都是非法的 (例如 con.txt 也是非法的) + reserved_names = { + "CON", "PRN", "AUX", "NUL", + "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", + "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9" + } + + # 如果文件名(全大写)是保留字,或者文件名是保留字+扩展名(如 con.txt),则加下划线前缀 + filename_upper = cleaned.upper() + file_stem = filename_upper.split('.')[0] # 获取不带后缀的主文件名 + + if filename_upper in reserved_names or file_stem in reserved_names: + cleaned = "_" + cleaned + + # 4. 处理空文件名 (如果输入全是乱码被删光了) + if not cleaned: + cleaned = "unnamed_file" + + # 5. 
限制长度 (Windows API 通常限制 255 字符,但在某些路径下更短) + cleaned = truncate_text(cleaned) + + return cleaned + + +def is_port_free(port: int) -> bool: + """返回端口是否可用""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind(('', port)) + except OSError: + return False + return True + + +def find_free_port(preferred: int = 8000) -> int: + # 先尝试 preferred 端口 + if is_port_free(preferred): + return preferred + + # 否则用系统自动分配 + s = socket.socket() + s.bind(('', 0)) + port = s.getsockname()[1] + s.close() + return port + + +def truncate_text(text: str, max_len: int = 30) -> str: + """ + 按指定宽度截断文本 + """ + result = "" + cur_len = 0 + for ch in text: + char_len = 2 if ('\u4e00' <= ch <= '\u9fff') else 1 + if cur_len + char_len > max_len: + break + result += ch + cur_len += char_len + return result + + +def generate_output_filenames(folder: str, original_texts: List[str]) -> List[str]: + """ + 批量生成文件名: + 输入 original_texts 列表 + 输出等长 filenames 列表 + """ + today = datetime.now().strftime("%Y%m%d") + pattern = re.compile(rf'^\[{today}]\[(\d{{3}})]') + + # ① 查找当天现存最大编号 + max_n = 0 + if os.path.isdir(folder): + for name in os.listdir(folder): + m = pattern.match(name) + if m: + n = int(m.group(1)) + max_n = max(max_n, n) + + filenames = [] + cur_n = max_n + + # ② 依次生成新文件名 + for text in original_texts: + cur_n += 1 + n_str = f"{cur_n:03d}" + + cleaned = sanitize_filename(text) + + filename = f"[{today}][{n_str}]{cleaned}.wav" + filenames.append(filename) + + return filenames + + +# ==================== 通用组件 ==================== + +class FileSelectionMode(Enum): + FILE = 0 + DIRECTORY = 1 + + +class FileSelectorWidget(QWidget): + """一个包含行编辑和浏览按钮的复合控件,支持文件和文件夹选择。(原样引用并稍作适配)""" + pathChanged = Signal(str) + + def __init__( + self, + setting_key: str, + selection_mode: FileSelectionMode = FileSelectionMode.DIRECTORY, + file_filter: str = "All Files (*)", + parent: QWidget = None, + ): + super().__init__(parent) + self.setting_key: str = setting_key + self.selection_mode: FileSelectionMode = selection_mode + self.file_filter: str = file_filter + + # 使用 QSettings 模拟 STUDIO_SETTINGS + self.settings = QSettings("MyTTS", "GUI") + + layout = QHBoxLayout(self) + layout.setContentsMargins(0, 0, 0, 0) + layout.setSpacing(5) + + self.path_edit: QLineEdit = QLineEdit() + self.path_edit.setReadOnly(True) + self.path_edit.setPlaceholderText("未选择路径") + + self.browse_button: QPushButton = QPushButton("📁") + self.browse_button.setCursor(Qt.CursorShape.PointingHandCursor) + default_font = QFont() + default_font.setPointSize(10) + self.browse_button.setFont(default_font) + self.browse_button.setFixedSize(30, 30) + + self.clear_button: QPushButton = QPushButton("❌") + self.clear_button.setCursor(Qt.CursorShape.PointingHandCursor) + self.clear_button.setFont(default_font) + self.clear_button.setFixedSize(30, 30) + + layout.addWidget(self.path_edit) + layout.addWidget(self.browse_button) + layout.addWidget(self.clear_button) + + self.browse_button.clicked.connect(self._open_dialog) + self.clear_button.clicked.connect(self._clear_path) + self.path_edit.textChanged.connect(self.pathChanged) + + def _open_dialog(self): + path = self.path_edit.text() + if path and os.path.exists(path): + start_path = path + else: + # 【修改点】默认路径改为 Desktop + desktop_path = os.path.join(os.path.expanduser("~"), "Desktop") + start_path = str(self.settings.value( + f"last_path_{self.setting_key}", defaultValue=desktop_path + )) + + if self.selection_mode == FileSelectionMode.DIRECTORY: + selected_path = QFileDialog.getExistingDirectory( + self, 
"选择文件夹", start_path + ) + else: + selected_path, _ = QFileDialog.getOpenFileName( + self, "选择文件", start_path, self.file_filter + ) + + if selected_path: + self.set_path(selected_path) + parent_path = os.path.dirname(selected_path) + if parent_path: + self.settings.setValue( + f"last_path_{self.setting_key}", parent_path) + + def _clear_path(self): + if not self.path_edit.text(): + return + reply = QMessageBox.question(self, '确认', '您确定要清空路径吗?', + QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, + QMessageBox.StandardButton.No) + if reply == QMessageBox.StandardButton.Yes: + self.set_path("") + + def get_path(self) -> str: + text = self.path_edit.text() + # 即使路径不存在(可能是输入时),也返回文本供逻辑判断,或者严格校验 + return text + + def set_path(self, path: str, block_signals: bool = False): + if block_signals: + self.path_edit.blockSignals(True) + self.path_edit.setText(path) + if block_signals: + self.path_edit.blockSignals(False) + + path = Property(str, fget=get_path, fset=set_path, notify=pathChanged) # type: ignore + + +class WheelEventFilter(QObject): + def eventFilter(self, obj, event): + if event.type() == QEvent.Type.Wheel and isinstance(obj, QComboBox): + return True # 阻止默认滚轮行为 + return super().eventFilter(obj, event) + + +class MyComboBox(QComboBox): + def __init__(self, parent: QWidget = None): + super().__init__(parent) + self._wheelFilter = WheelEventFilter() + self.installEventFilter(self._wheelFilter) + + +class MyTextEdit(QTextEdit): + def insertFromMimeData(self, source: QMimeData) -> None: + # 仅取纯文本 + if source.hasText(): + self.insertPlainText(source.text()) + else: + super().insertFromMimeData(source) diff --git a/genie_tts/GetPhonesAndBert.py b/genie_tts/GetPhonesAndBert.py index c181a948a162341643864c289fbdc81b6a281850..3594ab26e2fd1ad673c5486a08161a4ffdbcfbfd 100644 --- a/genie_tts/GetPhonesAndBert.py +++ b/genie_tts/GetPhonesAndBert.py @@ -1,34 +1,34 @@ -import numpy as np -from typing import Tuple -from .Utils.Constants import BERT_FEATURE_DIM -from .ModelManager import model_manager - - -def get_phones_and_bert(prompt_text: str, language: str = 'japanese') -> Tuple[np.ndarray, np.ndarray]: - if language.lower() == 'english': - from .G2P.English.EnglishG2P import english_to_phones - phones = english_to_phones(prompt_text) - text_bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32) - elif language.lower() == 'chinese': - from .G2P.Chinese.ChineseG2P import chinese_to_phones - text_clean, _, phones, word2ph = chinese_to_phones(prompt_text) - if model_manager.load_roberta_model(): - encoded = model_manager.roberta_tokenizer.encode(text_clean) - input_ids = np.array([encoded.ids], dtype=np.int64) - attention_mask = np.array([encoded.attention_mask], dtype=np.int64) - ort_inputs = { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'repeats': np.array(word2ph, dtype=np.int64), - } - outputs = model_manager.roberta_model.run(None, ort_inputs) - text_bert = outputs[0].astype(np.float32) - else: - text_bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32) - else: - from .G2P.Japanese.JapaneseG2P import japanese_to_phones - phones = japanese_to_phones(prompt_text) - text_bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32) - - phones_seq = np.array([phones], dtype=np.int64) - return phones_seq, text_bert +import numpy as np +from typing import Tuple +from .Utils.Constants import BERT_FEATURE_DIM +from .ModelManager import model_manager + + +def get_phones_and_bert(prompt_text: str, language: str = 'japanese') -> Tuple[np.ndarray, 
np.ndarray]: + if language.lower() == 'english': + from .G2P.English.EnglishG2P import english_to_phones + phones = english_to_phones(prompt_text) + text_bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32) + elif language.lower() == 'chinese': + from .G2P.Chinese.ChineseG2P import chinese_to_phones + text_clean, _, phones, word2ph = chinese_to_phones(prompt_text) + if model_manager.load_roberta_model(): + encoded = model_manager.roberta_tokenizer.encode(text_clean) + input_ids = np.array([encoded.ids], dtype=np.int64) + attention_mask = np.array([encoded.attention_mask], dtype=np.int64) + ort_inputs = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'repeats': np.array(word2ph, dtype=np.int64), + } + outputs = model_manager.roberta_model.run(None, ort_inputs) + text_bert = outputs[0].astype(np.float32) + else: + text_bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32) + else: + from .G2P.Japanese.JapaneseG2P import japanese_to_phones + phones = japanese_to_phones(prompt_text) + text_bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32) + + phones_seq = np.array([phones], dtype=np.int64) + return phones_seq, text_bert diff --git a/genie_tts/ModelManager.py b/genie_tts/ModelManager.py index f3e84bf65f6f378922fe88191e5ba1561a848e48..69eabbfc781e3286f35040dad42bb9f02c98cd9c 100644 --- a/genie_tts/ModelManager.py +++ b/genie_tts/ModelManager.py @@ -1,324 +1,324 @@ -""" -不再新建 .bin 文件。 -修改后内存: 6448 MB -修改前内存: 5952 MB -""" - -import gc -import logging -import os -from dataclasses import dataclass -from typing import Optional, List, Dict - -import numpy as np -import onnx -import onnxruntime -from onnxruntime import InferenceSession -from tokenizers import Tokenizer - -from .Core.Resources import (HUBERT_MODEL_DIR, SV_MODEL, ROBERTA_MODEL_DIR) -from .Utils.Utils import LRUCacheDict - -onnxruntime.set_default_logger_severity(3) -logger = logging.getLogger(__name__) - - -class GSVModelFile: - T2S_ENCODER_FP32: str = 't2s_encoder_fp32.onnx' - - T2S_FIRST_STAGE_DECODER_FP32: str = 't2s_first_stage_decoder_fp32.onnx' - T2S_FIRST_STAGE_DECODER_FP16: str = 't2s_first_stage_decoder_fp16.onnx' - T2S_STAGE_DECODER_FP32: str = 't2s_stage_decoder_fp32.onnx' - T2S_STAGE_DECODER_FP16: str = 't2s_stage_decoder_fp16.onnx' - T2S_DECODER_WEIGHT_FP16: str = 't2s_shared_fp16.bin' - - VITS_FP32: str = 'vits_fp32.onnx' - VITS_WEIGHT_FP16: str = 'vits_fp16.bin' - - PROMPT_ENCODER: str = 'prompt_encoder_fp32.onnx' - PROMPT_ENCODER_WEIGHT_FP16: str = 'prompt_encoder_fp16.bin' - - HUBERT_MODEL = os.path.join(HUBERT_MODEL_DIR, "chinese-hubert-base.onnx") - HUBERT_MODEL_WEIGHT_FP16 = os.path.join(HUBERT_MODEL_DIR, "chinese-hubert-base_weights_fp16.bin") - - ROBERTA_MODEL = os.path.join(ROBERTA_MODEL_DIR, 'RoBERTa.onnx') - ROBERTA_TOKENIZER = os.path.join(ROBERTA_MODEL_DIR, 'roberta_tokenizer') - - -@dataclass -class GSVModel: - LANGUAGE: str - T2S_ENCODER: InferenceSession - T2S_FIRST_STAGE_DECODER: InferenceSession - T2S_STAGE_DECODER: InferenceSession - VITS: InferenceSession - PROMPT_ENCODER: Optional[InferenceSession] = None - PROMPT_ENCODER_PATH: Optional[str] = None - - -def load_session_with_fp16_conversion( - onnx_path: str, - fp16_bin_path: str, - providers: List[str], - sess_options: Optional[onnxruntime.SessionOptions] = None -) -> InferenceSession: - """ - 通用函数:读取 ONNX 和 FP16 权重文件,在内存中将权重转换为 FP32, - 注入到 ONNX 模型中并加载 InferenceSession,不产生临时文件。 - """ - if not os.path.exists(onnx_path): - raise FileNotFoundError(f"ONNX Model not found: {onnx_path}") - if not 
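Note: get_phones_and_bert only computes real BERT features for Chinese (through the RoBERTa ONNX session); English and Japanese fall back to zero matrices with one BERT_FEATURE_DIM-wide row per phone, and the phone ids gain a batch dimension. A shape-level sketch of that contract (the dimension value is an assumption; the real one lives in Utils.Constants):

    import numpy as np

    BERT_FEATURE_DIM = 1024  # assumed placeholder value

    def phones_only(phones: list):
        # mirrors the non-Chinese fallback branch above
        bert = np.zeros((len(phones), BERT_FEATURE_DIM), dtype=np.float32)
        return np.array([phones], dtype=np.int64), bert

    seq, bert = phones_only([3, 17, 42])
    assert seq.shape == (1, 3) and bert.shape == (3, BERT_FEATURE_DIM)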
os.path.exists(fp16_bin_path): - raise FileNotFoundError(f"FP16 Weight file not found: {fp16_bin_path}") - - model_proto = onnx.load(onnx_path, load_external_data=False) - fp16_data = np.fromfile(fp16_bin_path, dtype=np.float16) - fp32_data = fp16_data.astype(np.float32) - fp32_bytes = fp32_data.tobytes() - - # 遍历并修补模型中的 External Data Initializers - for tensor in model_proto.graph.initializer: - # 检查该 Tensor 是否使用外部数据 - if tensor.data_location == onnx.TensorProto.EXTERNAL: - offset = 0 - length = 0 - # 解析外部数据信息 - for entry in tensor.external_data: - if entry.key == 'offset': - offset = int(entry.value) - elif entry.key == 'length': - length = int(entry.value) - - if offset + length > len(fp32_bytes): - logger.warning( - f"Tensor {tensor.name} requested a data range that exceeds the size of the provided bin file. " - f"Offset: {offset}, Length: {length}, Buffer: {len(fp32_bytes)}" - ) - continue - - tensor_data = fp32_bytes[offset: offset + length] - tensor.raw_data = tensor_data - - del tensor.external_data[:] - tensor.data_location = onnx.TensorProto.DEFAULT - - try: - session = InferenceSession( - model_proto.SerializeToString(), - providers=providers, - sess_options=sess_options - ) - return session - except Exception as e: - logger.error(f"Failed to load in-memory model {os.path.basename(onnx_path)}: {e}") - raise e - - -class ModelManager: - def __init__(self): - capacity_str = os.getenv('Max_Cached_Character_Models', '3') - self.character_to_model: Dict[str, Dict[str, Optional[InferenceSession]]] = LRUCacheDict( - capacity=int(capacity_str) - ) - self.character_to_language: Dict[str, str] = {} - self.character_model_paths: Dict[str, str] = {} - self.providers = ["CPUExecutionProvider"] - - self.cn_hubert: Optional[InferenceSession] = None - self.speaker_verification_model: Optional[InferenceSession] = None - self.roberta_model: Optional[InferenceSession] = None - self.roberta_tokenizer: Optional[Tokenizer] = None - - def load_roberta_model(self, model_path: str = GSVModelFile.ROBERTA_MODEL) -> bool: - if self.roberta_model is not None: - return True - if not os.path.exists(model_path): - # logger.warning(f'RoBERTa model does not exist: {model_path}. 
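Note: the in-memory FP16 conversion works because ONNX external-data initializers address a flat byte buffer by offset and length; the offsets in these models evidently describe the FP32 layout, so widening the half-size FP16 file back to FP32 recreates exactly the buffer they expect (this is also what the offset + length bounds check above guards). A toy demonstration of the size bookkeeping:

    import numpy as np

    w = np.arange(6, dtype=np.float32)            # stand-in for fp32 external data
    fp16_on_disk = w.astype(np.float16).tobytes()
    fp32_in_memory = np.frombuffer(fp16_on_disk, dtype=np.float16).astype(np.float32).tobytes()

    assert len(fp16_on_disk) == w.nbytes // 2     # shipped .bin is half the size
    assert len(fp32_in_memory) == w.nbytes        # .onnx offsets/lengths still line up
    # caveat: the round-trip is lossy for values beyond fp16 precision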
BERT features will not be used.') - return False - try: - self.roberta_model = onnxruntime.InferenceSession( - model_path, - providers=self.providers, - ) - self.roberta_tokenizer = Tokenizer.from_file( - os.path.join(GSVModelFile.ROBERTA_TOKENIZER, 'tokenizer.json') - ) - logger.info(f"Successfully loaded RoBERTa model.") - return True - except Exception as e: - logger.error( - f"Error: Failed to load ONNX model '{GSVModelFile.ROBERTA_MODEL}'.\n" - f"Details: {e}" - ) - return False - - def load_sv_model(self, model_path: str = SV_MODEL) -> bool: - if self.speaker_verification_model is not None: - return True - try: - self.speaker_verification_model = onnxruntime.InferenceSession( - model_path, - providers=self.providers, - ) - logger.info(f"Successfully loaded Speaker Verification model.") - return True - except Exception as e: - logger.error( - f"Error: Failed to load ONNX model '{SV_MODEL}'.\n" - f"Details: {e}" - ) - return False - - def load_cn_hubert(self, model_path: str = GSVModelFile.HUBERT_MODEL) -> bool: - if self.cn_hubert is not None: - return True - try: - # Hubert 也应用内存转换逻辑 - if model_path == GSVModelFile.HUBERT_MODEL and os.path.exists(GSVModelFile.HUBERT_MODEL_WEIGHT_FP16): - self.cn_hubert = load_session_with_fp16_conversion( - model_path, - GSVModelFile.HUBERT_MODEL_WEIGHT_FP16, - self.providers - ) - else: - self.cn_hubert = onnxruntime.InferenceSession( - model_path, - providers=self.providers, - ) - logger.info("Successfully loaded CN_HuBERT model.") - return True - except Exception as e: - logger.error( - f"Error: Failed to load ONNX model '{GSVModelFile.HUBERT_MODEL}'.\n" - f"Details: {e}" - ) - return False - - def get(self, character_name: str) -> Optional[GSVModel]: - character_name = character_name.lower() - language = self.character_to_language.get(character_name, 'Japanese') - if character_name in self.character_to_model: - model_map: dict = self.character_to_model[character_name] - # 简化获取逻辑 - t2s_first_stage_decoder = model_map.get(GSVModelFile.T2S_FIRST_STAGE_DECODER_FP32) or \ - model_map.get(GSVModelFile.T2S_FIRST_STAGE_DECODER_FP16) - t2s_stage_decoder = model_map.get(GSVModelFile.T2S_STAGE_DECODER_FP32) or \ - model_map.get(GSVModelFile.T2S_STAGE_DECODER_FP16) - prompt_encoder_path = os.path.join(self.character_model_paths[character_name], GSVModelFile.PROMPT_ENCODER) - - return GSVModel( - LANGUAGE=language, - T2S_ENCODER=model_map[GSVModelFile.T2S_ENCODER_FP32], - T2S_FIRST_STAGE_DECODER=t2s_first_stage_decoder, - T2S_STAGE_DECODER=t2s_stage_decoder, - VITS=model_map[GSVModelFile.VITS_FP32], - PROMPT_ENCODER=model_map[GSVModelFile.PROMPT_ENCODER], - PROMPT_ENCODER_PATH=prompt_encoder_path, - ) - if character_name in self.character_model_paths: - model_dir = self.character_model_paths[character_name] - if self.load_character(character_name, model_dir, language=language): - return self.get(character_name) - else: - del self.character_model_paths[character_name] - return None - return None - - def has_character(self, character_name: str) -> bool: - character_name = character_name.lower() - return character_name in self.character_model_paths - - def load_character( - self, - character_name: str, - model_dir: str, - language: str, - ) -> bool: - """ - 加载角色模型,如果需要,在内存中动态转换 FP16 权重。 - """ - character_name = character_name.lower() - if character_name in self.character_to_model: - _ = self.character_to_model[character_name] - return True - - model_dict: Dict[str, Optional[InferenceSession]] = {} - - # 定义 ONNX 文件到 FP16 Bin 文件的映射关系 - onnx_to_fp16_map = { - 
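Note: ModelManager lower-cases every character name and bounds its cache with an LRU whose capacity comes from the Max_Cached_Character_Models environment variable, read when the module-level model_manager instance is constructed at import time. Intended call pattern per the methods above (paths hypothetical):

    import os
    os.environ['Max_Cached_Character_Models'] = '2'  # must precede the first import

    from genie_tts.ModelManager import model_manager

    if model_manager.load_character('Alice', '/models/alice', language='Japanese'):
        gsv = model_manager.get('alice')   # names are lower-cased internally
        print(gsv.LANGUAGE, gsv.PROMPT_ENCODER is not None)
    model_manager.remove_character('Alice')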
GSVModelFile.T2S_FIRST_STAGE_DECODER_FP32: GSVModelFile.T2S_DECODER_WEIGHT_FP16, - GSVModelFile.T2S_STAGE_DECODER_FP32: GSVModelFile.T2S_DECODER_WEIGHT_FP16, - GSVModelFile.VITS_FP32: GSVModelFile.VITS_WEIGHT_FP16, - GSVModelFile.PROMPT_ENCODER: GSVModelFile.PROMPT_ENCODER_WEIGHT_FP16 - } - - # 确定需要加载的模型列表 - model_files_to_load = [ - GSVModelFile.T2S_ENCODER_FP32, - GSVModelFile.VITS_FP32, - GSVModelFile.PROMPT_ENCODER, - ] - - fp32_decoders = [GSVModelFile.T2S_FIRST_STAGE_DECODER_FP32, GSVModelFile.T2S_STAGE_DECODER_FP32] - model_files_to_load.extend(fp32_decoders) - - try: - for model_file in model_files_to_load: - model_path = os.path.normpath(os.path.join(model_dir, model_file)) - - # 设置 Session Options - sess_options = onnxruntime.SessionOptions() - sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - - if os.path.exists(model_path): - fp16_bin_name = onnx_to_fp16_map.get(model_file) - fp16_bin_path = os.path.join(model_dir, fp16_bin_name) if fp16_bin_name else None - - if fp16_bin_path and os.path.exists(fp16_bin_path): - model_dict[model_file] = load_session_with_fp16_conversion( - model_path, fp16_bin_path, self.providers, sess_options - ) - else: - model_dict[model_file] = onnxruntime.InferenceSession( - model_path, - providers=self.providers, - sess_options=sess_options, - ) - elif model_file == GSVModelFile.PROMPT_ENCODER: - model_dict[model_file] = None - else: - raise FileNotFoundError(f'文件 {model_path} 不存在!') - - # 日志信息 - is_v2pp = model_dict[GSVModelFile.PROMPT_ENCODER] is not None - logger.info( - f"Character {character_name.capitalize()} loaded successfully.\n" - f"- Model Path: {model_dir}\n" - f"- Model Type: {'V2ProPlus' if is_v2pp else 'V2'}" - ) - - self.character_to_model[character_name] = model_dict - self.character_to_language[character_name] = language - self.character_model_paths[character_name] = model_dir - return True - - except Exception as e: - logger.error( - f"Error: Failed to load ONNX model '{model_dir}'.\n" - f"Details: {e}" - ) - return False - - def remove_all_character(self) -> None: - self.character_to_model.clear() - gc.collect() - - def remove_character(self, character_name: str) -> None: - character_name = character_name.lower() - if character_name in self.character_to_model: - del self.character_to_model[character_name] - gc.collect() - logger.info(f"Character {character_name.capitalize()} removed successfully.") - - -model_manager: ModelManager = ModelManager() +""" +不再新建 .bin 文件。 +修改后内存: 6448 MB +修改前内存: 5952 MB +""" + +import gc +import logging +import os +from dataclasses import dataclass +from typing import Optional, List, Dict + +import numpy as np +import onnx +import onnxruntime +from onnxruntime import InferenceSession +from tokenizers import Tokenizer + +from .Core.Resources import (HUBERT_MODEL_DIR, SV_MODEL, ROBERTA_MODEL_DIR) +from .Utils.Utils import LRUCacheDict + +onnxruntime.set_default_logger_severity(3) +logger = logging.getLogger(__name__) + + +class GSVModelFile: + T2S_ENCODER_FP32: str = 't2s_encoder_fp32.onnx' + + T2S_FIRST_STAGE_DECODER_FP32: str = 't2s_first_stage_decoder_fp32.onnx' + T2S_FIRST_STAGE_DECODER_FP16: str = 't2s_first_stage_decoder_fp16.onnx' + T2S_STAGE_DECODER_FP32: str = 't2s_stage_decoder_fp32.onnx' + T2S_STAGE_DECODER_FP16: str = 't2s_stage_decoder_fp16.onnx' + T2S_DECODER_WEIGHT_FP16: str = 't2s_shared_fp16.bin' + + VITS_FP32: str = 'vits_fp32.onnx' + VITS_WEIGHT_FP16: str = 'vits_fp16.bin' + + PROMPT_ENCODER: str = 'prompt_encoder_fp32.onnx' + 
PROMPT_ENCODER_WEIGHT_FP16: str = 'prompt_encoder_fp16.bin' + + HUBERT_MODEL = os.path.join(HUBERT_MODEL_DIR, "chinese-hubert-base.onnx") + HUBERT_MODEL_WEIGHT_FP16 = os.path.join(HUBERT_MODEL_DIR, "chinese-hubert-base_weights_fp16.bin") + + ROBERTA_MODEL = os.path.join(ROBERTA_MODEL_DIR, 'RoBERTa.onnx') + ROBERTA_TOKENIZER = os.path.join(ROBERTA_MODEL_DIR, 'roberta_tokenizer') + + +@dataclass +class GSVModel: + LANGUAGE: str + T2S_ENCODER: InferenceSession + T2S_FIRST_STAGE_DECODER: InferenceSession + T2S_STAGE_DECODER: InferenceSession + VITS: InferenceSession + PROMPT_ENCODER: Optional[InferenceSession] = None + PROMPT_ENCODER_PATH: Optional[str] = None + + +def load_session_with_fp16_conversion( + onnx_path: str, + fp16_bin_path: str, + providers: List[str], + sess_options: Optional[onnxruntime.SessionOptions] = None +) -> InferenceSession: + """ + 通用函数:读取 ONNX 和 FP16 权重文件,在内存中将权重转换为 FP32, + 注入到 ONNX 模型中并加载 InferenceSession,不产生临时文件。 + """ + if not os.path.exists(onnx_path): + raise FileNotFoundError(f"ONNX Model not found: {onnx_path}") + if not os.path.exists(fp16_bin_path): + raise FileNotFoundError(f"FP16 Weight file not found: {fp16_bin_path}") + + model_proto = onnx.load(onnx_path, load_external_data=False) + fp16_data = np.fromfile(fp16_bin_path, dtype=np.float16) + fp32_data = fp16_data.astype(np.float32) + fp32_bytes = fp32_data.tobytes() + + # 遍历并修补模型中的 External Data Initializers + for tensor in model_proto.graph.initializer: + # 检查该 Tensor 是否使用外部数据 + if tensor.data_location == onnx.TensorProto.EXTERNAL: + offset = 0 + length = 0 + # 解析外部数据信息 + for entry in tensor.external_data: + if entry.key == 'offset': + offset = int(entry.value) + elif entry.key == 'length': + length = int(entry.value) + + if offset + length > len(fp32_bytes): + logger.warning( + f"Tensor {tensor.name} requested a data range that exceeds the size of the provided bin file. " + f"Offset: {offset}, Length: {length}, Buffer: {len(fp32_bytes)}" + ) + continue + + tensor_data = fp32_bytes[offset: offset + length] + tensor.raw_data = tensor_data + + del tensor.external_data[:] + tensor.data_location = onnx.TensorProto.DEFAULT + + try: + session = InferenceSession( + model_proto.SerializeToString(), + providers=providers, + sess_options=sess_options + ) + return session + except Exception as e: + logger.error(f"Failed to load in-memory model {os.path.basename(onnx_path)}: {e}") + raise e + + +class ModelManager: + def __init__(self): + capacity_str = os.getenv('Max_Cached_Character_Models', '3') + self.character_to_model: Dict[str, Dict[str, Optional[InferenceSession]]] = LRUCacheDict( + capacity=int(capacity_str) + ) + self.character_to_language: Dict[str, str] = {} + self.character_model_paths: Dict[str, str] = {} + self.providers = ["CPUExecutionProvider"] + + self.cn_hubert: Optional[InferenceSession] = None + self.speaker_verification_model: Optional[InferenceSession] = None + self.roberta_model: Optional[InferenceSession] = None + self.roberta_tokenizer: Optional[Tokenizer] = None + + def load_roberta_model(self, model_path: str = GSVModelFile.ROBERTA_MODEL) -> bool: + if self.roberta_model is not None: + return True + if not os.path.exists(model_path): + # logger.warning(f'RoBERTa model does not exist: {model_path}. 
BERT features will not be used.') + return False + try: + self.roberta_model = onnxruntime.InferenceSession( + model_path, + providers=self.providers, + ) + self.roberta_tokenizer = Tokenizer.from_file( + os.path.join(GSVModelFile.ROBERTA_TOKENIZER, 'tokenizer.json') + ) + logger.info(f"Successfully loaded RoBERTa model.") + return True + except Exception as e: + logger.error( + f"Error: Failed to load ONNX model '{GSVModelFile.ROBERTA_MODEL}'.\n" + f"Details: {e}" + ) + return False + + def load_sv_model(self, model_path: str = SV_MODEL) -> bool: + if self.speaker_verification_model is not None: + return True + try: + self.speaker_verification_model = onnxruntime.InferenceSession( + model_path, + providers=self.providers, + ) + logger.info(f"Successfully loaded Speaker Verification model.") + return True + except Exception as e: + logger.error( + f"Error: Failed to load ONNX model '{SV_MODEL}'.\n" + f"Details: {e}" + ) + return False + + def load_cn_hubert(self, model_path: str = GSVModelFile.HUBERT_MODEL) -> bool: + if self.cn_hubert is not None: + return True + try: + # Hubert 也应用内存转换逻辑 + if model_path == GSVModelFile.HUBERT_MODEL and os.path.exists(GSVModelFile.HUBERT_MODEL_WEIGHT_FP16): + self.cn_hubert = load_session_with_fp16_conversion( + model_path, + GSVModelFile.HUBERT_MODEL_WEIGHT_FP16, + self.providers + ) + else: + self.cn_hubert = onnxruntime.InferenceSession( + model_path, + providers=self.providers, + ) + logger.info("Successfully loaded CN_HuBERT model.") + return True + except Exception as e: + logger.error( + f"Error: Failed to load ONNX model '{GSVModelFile.HUBERT_MODEL}'.\n" + f"Details: {e}" + ) + return False + + def get(self, character_name: str) -> Optional[GSVModel]: + character_name = character_name.lower() + language = self.character_to_language.get(character_name, 'Japanese') + if character_name in self.character_to_model: + model_map: dict = self.character_to_model[character_name] + # 简化获取逻辑 + t2s_first_stage_decoder = model_map.get(GSVModelFile.T2S_FIRST_STAGE_DECODER_FP32) or \ + model_map.get(GSVModelFile.T2S_FIRST_STAGE_DECODER_FP16) + t2s_stage_decoder = model_map.get(GSVModelFile.T2S_STAGE_DECODER_FP32) or \ + model_map.get(GSVModelFile.T2S_STAGE_DECODER_FP16) + prompt_encoder_path = os.path.join(self.character_model_paths[character_name], GSVModelFile.PROMPT_ENCODER) + + return GSVModel( + LANGUAGE=language, + T2S_ENCODER=model_map[GSVModelFile.T2S_ENCODER_FP32], + T2S_FIRST_STAGE_DECODER=t2s_first_stage_decoder, + T2S_STAGE_DECODER=t2s_stage_decoder, + VITS=model_map[GSVModelFile.VITS_FP32], + PROMPT_ENCODER=model_map[GSVModelFile.PROMPT_ENCODER], + PROMPT_ENCODER_PATH=prompt_encoder_path, + ) + if character_name in self.character_model_paths: + model_dir = self.character_model_paths[character_name] + if self.load_character(character_name, model_dir, language=language): + return self.get(character_name) + else: + del self.character_model_paths[character_name] + return None + return None + + def has_character(self, character_name: str) -> bool: + character_name = character_name.lower() + return character_name in self.character_model_paths + + def load_character( + self, + character_name: str, + model_dir: str, + language: str, + ) -> bool: + """ + 加载角色模型,如果需要,在内存中动态转换 FP16 权重。 + """ + character_name = character_name.lower() + if character_name in self.character_to_model: + _ = self.character_to_model[character_name] + return True + + model_dict: Dict[str, Optional[InferenceSession]] = {} + + # 定义 ONNX 文件到 FP16 Bin 文件的映射关系 + onnx_to_fp16_map = { + 
GSVModelFile.T2S_FIRST_STAGE_DECODER_FP32: GSVModelFile.T2S_DECODER_WEIGHT_FP16, + GSVModelFile.T2S_STAGE_DECODER_FP32: GSVModelFile.T2S_DECODER_WEIGHT_FP16, + GSVModelFile.VITS_FP32: GSVModelFile.VITS_WEIGHT_FP16, + GSVModelFile.PROMPT_ENCODER: GSVModelFile.PROMPT_ENCODER_WEIGHT_FP16 + } + + # 确定需要加载的模型列表 + model_files_to_load = [ + GSVModelFile.T2S_ENCODER_FP32, + GSVModelFile.VITS_FP32, + GSVModelFile.PROMPT_ENCODER, + ] + + fp32_decoders = [GSVModelFile.T2S_FIRST_STAGE_DECODER_FP32, GSVModelFile.T2S_STAGE_DECODER_FP32] + model_files_to_load.extend(fp32_decoders) + + try: + for model_file in model_files_to_load: + model_path = os.path.normpath(os.path.join(model_dir, model_file)) + + # 设置 Session Options + sess_options = onnxruntime.SessionOptions() + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + + if os.path.exists(model_path): + fp16_bin_name = onnx_to_fp16_map.get(model_file) + fp16_bin_path = os.path.join(model_dir, fp16_bin_name) if fp16_bin_name else None + + if fp16_bin_path and os.path.exists(fp16_bin_path): + model_dict[model_file] = load_session_with_fp16_conversion( + model_path, fp16_bin_path, self.providers, sess_options + ) + else: + model_dict[model_file] = onnxruntime.InferenceSession( + model_path, + providers=self.providers, + sess_options=sess_options, + ) + elif model_file == GSVModelFile.PROMPT_ENCODER: + model_dict[model_file] = None + else: + raise FileNotFoundError(f'文件 {model_path} 不存在!') + + # 日志信息 + is_v2pp = model_dict[GSVModelFile.PROMPT_ENCODER] is not None + logger.info( + f"Character {character_name.capitalize()} loaded successfully.\n" + f"- Model Path: {model_dir}\n" + f"- Model Type: {'V2ProPlus' if is_v2pp else 'V2'}" + ) + + self.character_to_model[character_name] = model_dict + self.character_to_language[character_name] = language + self.character_model_paths[character_name] = model_dir + return True + + except Exception as e: + logger.error( + f"Error: Failed to load ONNX model '{model_dir}'.\n" + f"Details: {e}" + ) + return False + + def remove_all_character(self) -> None: + self.character_to_model.clear() + gc.collect() + + def remove_character(self, character_name: str) -> None: + character_name = character_name.lower() + if character_name in self.character_to_model: + del self.character_to_model[character_name] + gc.collect() + logger.info(f"Character {character_name.capitalize()} removed successfully.") + + +model_manager: ModelManager = ModelManager() diff --git a/genie_tts/PredefinedCharacter.py b/genie_tts/PredefinedCharacter.py index c8ba07dd6b4a5eb4107e8a5a61f1b6f1d4ec82c8..2f084cd7ad8ce02c305fc5ce976038a9f9d7e0a4 100644 --- a/genie_tts/PredefinedCharacter.py +++ b/genie_tts/PredefinedCharacter.py @@ -1,39 +1,39 @@ -from huggingface_hub import snapshot_download -import os -from typing import Dict - -CHARA_LANG: Dict[str, str] = { - 'mika': 'Japanese', - 'feibi': 'Chinese', - 'thirtyseven': 'English', -} -CHARA_ALIAS_MAP: Dict[str, str] = { - "mika": "mika", - "misono mika": "mika", - "圣园未花": "mika", - "未花": "mika", - "みその みか": "mika", - "feibi": "feibi", - "菲比": "feibi", - "37": "thirtyseven", - "thirtyseven": "thirtyseven", -} - - -def download_chara(chara: str, version: str = "v2ProPlus") -> str: - local_dir = os.path.join("CharacterModels", version, chara) - if os.path.exists(local_dir): - print(f"✔ Model for '{chara}' already exists locally. Skipping download.") - return local_dir - - print(f"🚀 Starting download of model for character '{chara}'. This may take a few moments... 
⏳") - remote_path = f"CharacterModels/{version}/{chara}/*" - snapshot_download( - repo_id="High-Logic/Genie", - repo_type="model", - allow_patterns=remote_path, - local_dir=".", - local_dir_use_symlinks=True, # 软链接 - ) - print(f"🎉 All model files for '{chara}' have been downloaded to '{os.path.abspath(local_dir)}' 📂") - return local_dir +from huggingface_hub import snapshot_download +import os +from typing import Dict + +CHARA_LANG: Dict[str, str] = { + 'mika': 'Japanese', + 'feibi': 'Chinese', + 'thirtyseven': 'English', +} +CHARA_ALIAS_MAP: Dict[str, str] = { + "mika": "mika", + "misono mika": "mika", + "圣园未花": "mika", + "未花": "mika", + "みその みか": "mika", + "feibi": "feibi", + "菲比": "feibi", + "37": "thirtyseven", + "thirtyseven": "thirtyseven", +} + + +def download_chara(chara: str, version: str = "v2ProPlus") -> str: + local_dir = os.path.join("CharacterModels", version, chara) + if os.path.exists(local_dir): + print(f"✔ Model for '{chara}' already exists locally. Skipping download.") + return local_dir + + print(f"🚀 Starting download of model for character '{chara}'. This may take a few moments... ⏳") + remote_path = f"CharacterModels/{version}/{chara}/*" + snapshot_download( + repo_id="High-Logic/Genie", + repo_type="model", + allow_patterns=remote_path, + local_dir=".", + local_dir_use_symlinks=True, # 软链接 + ) + print(f"🎉 All model files for '{chara}' have been downloaded to '{os.path.abspath(local_dir)}' 📂") + return local_dir diff --git a/genie_tts/Server.py b/genie_tts/Server.py index 458bccd4c2cce79f46ed85f99bebbb969821a587..499e9d86d21021fb20aabab7c8f84951bd01726d 100644 --- a/genie_tts/Server.py +++ b/genie_tts/Server.py @@ -1,169 +1,169 @@ -import asyncio -import os -from typing import AsyncIterator, Optional, Callable, Union, Dict -import logging - -import uvicorn -from fastapi import FastAPI, HTTPException -from fastapi.responses import StreamingResponse -from pydantic import BaseModel - -from .Audio.ReferenceAudio import ReferenceAudio -from .Core.TTSPlayer import tts_player -from .ModelManager import model_manager -from .Utils.Shared import context -from .Utils.Language import normalize_language - -logger = logging.getLogger(__name__) - -_reference_audios: Dict[str, dict] = {} -SUPPORTED_AUDIO_EXTS = {'.wav', '.flac', '.ogg', '.aiff', '.aif'} - -app = FastAPI() - - -class CharacterPayload(BaseModel): - character_name: str - onnx_model_dir: str - language: str - - -class UnloadCharacterPayload(BaseModel): - character_name: str - - -class ReferenceAudioPayload(BaseModel): - character_name: str - audio_path: str - audio_text: str - language: str - - -class TTSPayload(BaseModel): - character_name: str - text: str - split_sentence: bool = False - save_path: Optional[str] = None - - -@app.post("/load_character") -def load_character_endpoint(payload: CharacterPayload): - try: - model_manager.load_character( - character_name=payload.character_name, - model_dir=payload.onnx_model_dir, - language=normalize_language(payload.language), - ) - return {"status": "success", "message": f"Character '{payload.character_name}' loaded."} - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/unload_character") -def unload_character_endpoint(payload: UnloadCharacterPayload): - try: - model_manager.remove_character(character_name=payload.character_name) - return {"status": "success", "message": f"Character '{payload.character_name}' unloaded."} - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - 
-@app.post("/set_reference_audio") -def set_reference_audio_endpoint(payload: ReferenceAudioPayload): - ext = os.path.splitext(payload.audio_path)[1].lower() - if ext not in SUPPORTED_AUDIO_EXTS: - raise HTTPException( - status_code=400, - detail=f"Audio format '{ext}' is not supported. Supported formats: {SUPPORTED_AUDIO_EXTS}", - ) - _reference_audios[payload.character_name] = { - 'audio_path': payload.audio_path, - 'audio_text': payload.audio_text, - 'language': normalize_language(payload.language), - } - return {"status": "success", "message": f"Reference audio for '{payload.character_name}' set."} - - -def run_tts_in_background( - character_name: str, - text: str, - split_sentence: bool, - save_path: Optional[str], - chunk_callback: Callable[[Optional[bytes]], None] -): - try: - context.current_speaker = character_name - context.current_prompt_audio = ReferenceAudio( - prompt_wav=_reference_audios[character_name]['audio_path'], - prompt_text=_reference_audios[character_name]['audio_text'], - language=_reference_audios[character_name]['language'], - ) - tts_player.start_session( - play=False, - split=split_sentence, - save_path=save_path, - chunk_callback=chunk_callback, - ) - tts_player.feed(text) - tts_player.end_session() - tts_player.wait_for_tts_completion() - except Exception as e: - logger.error(f"Error in TTS background task: {e}", exc_info=True) - - -async def audio_stream_generator(queue: asyncio.Queue) -> AsyncIterator[bytes]: - while True: - chunk = await queue.get() - if chunk is None: - break - yield chunk - - -@app.post("/tts") -async def tts_endpoint(payload: TTSPayload): - if payload.character_name not in _reference_audios: - raise HTTPException(status_code=404, detail="Character not found or reference audio not set.") - - loop = asyncio.get_running_loop() - stream_queue: asyncio.Queue[Union[bytes, None]] = asyncio.Queue() - - def tts_chunk_callback(chunk: Optional[bytes]): - loop.call_soon_threadsafe(stream_queue.put_nowait, chunk) - - loop.run_in_executor( - None, - run_tts_in_background, - payload.character_name, - payload.text, - payload.split_sentence, - payload.save_path, - tts_chunk_callback - ) - - return StreamingResponse(audio_stream_generator(stream_queue), media_type="audio/wav") - - -@app.post("/stop") -def stop_endpoint(): - try: - tts_player.stop() - return {"status": "success", "message": "TTS stopped."} - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/clear_reference_audio_cache") -def clear_reference_audio_cache_endpoint(): - try: - ReferenceAudio.clear_cache() - return {"status": "success", "message": "Reference audio cache cleared."} - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -def start_server(host: str = "127.0.0.1", port: int = 8000, workers: int = 1): - uvicorn.run(app, host=host, port=port, workers=workers) - - -if __name__ == "__main__": - start_server(host="0.0.0.0", port=8000, workers=1) +import asyncio +import os +from typing import AsyncIterator, Optional, Callable, Union, Dict +import logging + +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.responses import StreamingResponse +from pydantic import BaseModel + +from .Audio.ReferenceAudio import ReferenceAudio +from .Core.TTSPlayer import tts_player +from .ModelManager import model_manager +from .Utils.Shared import context +from .Utils.Language import normalize_language + +logger = logging.getLogger(__name__) + +_reference_audios: Dict[str, dict] = {} +SUPPORTED_AUDIO_EXTS = 
{'.wav', '.flac', '.ogg', '.aiff', '.aif'} + +app = FastAPI() + + +class CharacterPayload(BaseModel): + character_name: str + onnx_model_dir: str + language: str + + +class UnloadCharacterPayload(BaseModel): + character_name: str + + +class ReferenceAudioPayload(BaseModel): + character_name: str + audio_path: str + audio_text: str + language: str + + +class TTSPayload(BaseModel): + character_name: str + text: str + split_sentence: bool = False + save_path: Optional[str] = None + + +@app.post("/load_character") +def load_character_endpoint(payload: CharacterPayload): + try: + model_manager.load_character( + character_name=payload.character_name, + model_dir=payload.onnx_model_dir, + language=normalize_language(payload.language), + ) + return {"status": "success", "message": f"Character '{payload.character_name}' loaded."} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/unload_character") +def unload_character_endpoint(payload: UnloadCharacterPayload): + try: + model_manager.remove_character(character_name=payload.character_name) + return {"status": "success", "message": f"Character '{payload.character_name}' unloaded."} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/set_reference_audio") +def set_reference_audio_endpoint(payload: ReferenceAudioPayload): + ext = os.path.splitext(payload.audio_path)[1].lower() + if ext not in SUPPORTED_AUDIO_EXTS: + raise HTTPException( + status_code=400, + detail=f"Audio format '{ext}' is not supported. Supported formats: {SUPPORTED_AUDIO_EXTS}", + ) + _reference_audios[payload.character_name] = { + 'audio_path': payload.audio_path, + 'audio_text': payload.audio_text, + 'language': normalize_language(payload.language), + } + return {"status": "success", "message": f"Reference audio for '{payload.character_name}' set."} + + +def run_tts_in_background( + character_name: str, + text: str, + split_sentence: bool, + save_path: Optional[str], + chunk_callback: Callable[[Optional[bytes]], None] +): + try: + context.current_speaker = character_name + context.current_prompt_audio = ReferenceAudio( + prompt_wav=_reference_audios[character_name]['audio_path'], + prompt_text=_reference_audios[character_name]['audio_text'], + language=_reference_audios[character_name]['language'], + ) + tts_player.start_session( + play=False, + split=split_sentence, + save_path=save_path, + chunk_callback=chunk_callback, + ) + tts_player.feed(text) + tts_player.end_session() + tts_player.wait_for_tts_completion() + except Exception as e: + logger.error(f"Error in TTS background task: {e}", exc_info=True) + + +async def audio_stream_generator(queue: asyncio.Queue) -> AsyncIterator[bytes]: + while True: + chunk = await queue.get() + if chunk is None: + break + yield chunk + + +@app.post("/tts") +async def tts_endpoint(payload: TTSPayload): + if payload.character_name not in _reference_audios: + raise HTTPException(status_code=404, detail="Character not found or reference audio not set.") + + loop = asyncio.get_running_loop() + stream_queue: asyncio.Queue[Union[bytes, None]] = asyncio.Queue() + + def tts_chunk_callback(chunk: Optional[bytes]): + loop.call_soon_threadsafe(stream_queue.put_nowait, chunk) + + loop.run_in_executor( + None, + run_tts_in_background, + payload.character_name, + payload.text, + payload.split_sentence, + payload.save_path, + tts_chunk_callback + ) + + return StreamingResponse(audio_stream_generator(stream_queue), media_type="audio/wav") + + +@app.post("/stop") +def 
stop_endpoint(): + try: + tts_player.stop() + return {"status": "success", "message": "TTS stopped."} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/clear_reference_audio_cache") +def clear_reference_audio_cache_endpoint(): + try: + ReferenceAudio.clear_cache() + return {"status": "success", "message": "Reference audio cache cleared."} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +def start_server(host: str = "127.0.0.1", port: int = 8000, workers: int = 1): + uvicorn.run(app, host=host, port=port, workers=workers) + + +if __name__ == "__main__": + start_server(host="0.0.0.0", port=8000, workers=1) diff --git a/genie_tts/Utils/Constants.py b/genie_tts/Utils/Constants.py index 6b134e10b7612f06b6cc548938b27cf745575d1a..9dbafa80454155805ecee50eb104431a7450626c 100644 --- a/genie_tts/Utils/Constants.py +++ b/genie_tts/Utils/Constants.py @@ -1,2 +1,2 @@ -BERT_FEATURE_DIM = 1024 -PACKAGE_NAME = "genie_tts" +BERT_FEATURE_DIM = 1024 +PACKAGE_NAME = "genie_tts" diff --git a/genie_tts/Utils/Language.py b/genie_tts/Utils/Language.py index 7d6f23aa8bdedcddd125c7400d51381ee3b5f1cc..345671a478ba6a5f3ff8a18305e95fb4941d9107 100644 --- a/genie_tts/Utils/Language.py +++ b/genie_tts/Utils/Language.py @@ -1,26 +1,26 @@ -language_map = { - # Chinese - "chinese": "Chinese", - "zh": "Chinese", - "zh-cn": "Chinese", - "zh-tw": "Chinese", - "zh-hans": "Chinese", - "zh-hant": "Chinese", - - # English - "english": "English", - "en": "English", - "en-us": "English", - "en-gb": "English", - "eng": "English", - - # Japanese - "japanese": "Japanese", - "jp": "Japanese", - "ja": "Japanese", - "nihongo": "Japanese", -} - - -def normalize_language(lang: str) -> str: - return language_map.get(lang.lower(), lang) +language_map = { + # Chinese + "chinese": "Chinese", + "zh": "Chinese", + "zh-cn": "Chinese", + "zh-tw": "Chinese", + "zh-hans": "Chinese", + "zh-hant": "Chinese", + + # English + "english": "English", + "en": "English", + "en-us": "English", + "en-gb": "English", + "eng": "English", + + # Japanese + "japanese": "Japanese", + "jp": "Japanese", + "ja": "Japanese", + "nihongo": "Japanese", +} + + +def normalize_language(lang: str) -> str: + return language_map.get(lang.lower(), lang) diff --git a/genie_tts/Utils/TextSplitter.py b/genie_tts/Utils/TextSplitter.py index b9dca1045dbad88a5ed8a9315ac4d0c4307f76b8..775aa15bd3ac1bf7b6a6e83d1842ec556292ef4f 100644 --- a/genie_tts/Utils/TextSplitter.py +++ b/genie_tts/Utils/TextSplitter.py @@ -1,123 +1,123 @@ -import re -from typing import List, Set, Pattern - - -class TextSplitter: - def __init__(self, max_len: int = 40, min_len: int = 5): - """ - 初始化文本切分器。 - - :param max_len: 软限制最大长度 (Effective Length)。超过此长度遇到分隔符时会切分。 - :param min_len: 硬限制最小长度 (Effective Length)。小于此长度遇到终止符也不会切分。 - """ - self.max_len: int = max_len - self.min_len: int = min_len - - # 1. 定义基础字符集合 - # 只要标点块中包含这些字符,就视为 Ending (终止符) - self.end_chars: Set[str] = { - '。', '!', '?', '…', - '!', '?', '.' - } - - # 2. 定义标点符号全集 (用于正则匹配和长度计算过滤) - self.all_puncts_chars: Set[str] = self.end_chars | { - ',', '、', ';', ':', '——', - ',', ';', ':', - '“', '”', '‘', '’', '"', "'", - } - - # 3. 
编译正则表达式 - # 使用非捕获组 (?:) 配合 + 号,实现贪婪匹配连续标点 - # sort + escape 确保正则安全且优先匹配长标点 - sorted_puncts: List[str] = sorted(list(self.all_puncts_chars), key=len, reverse=True) - escaped_puncts: List[str] = [re.escape(p) for p in sorted_puncts] - self.pattern: Pattern = re.compile(f"((?:{'|'.join(escaped_puncts)})+)") - - @staticmethod - def get_char_width(char: str) -> int: - """计算单字符宽度:ASCII算1,其他(中日韩)算2""" - return 1 if ord(char) < 128 else 2 - - def get_effective_len(self, text: str) -> int: - """ - 计算字符串的有效长度。 - 逻辑:跳过标点符号,仅计算内容字符。 - 例如:"你好......" -> 有效长度为 4 (你好),而不是 10。 - """ - length = 0 - for char in text: - # 如果是标点符号集合里的字符,不计入长度 - if char in self.all_puncts_chars: - continue - length += self.get_char_width(char) - return length - - def is_terminator_block(self, block: str) -> bool: - """ - 判断一个标点块是否起到结束句子的作用。 - 只要块中包含任意一个结束字符(如句号),则视为结束块。 - """ - for char in block: - if char in self.end_chars: - return True - return False - - def split(self, text: str) -> List[str]: - """核心切分逻辑""" - if not text: - return [] - - text = text.replace('\n', '') - - # 正则切分,segments 格式如: ['你好', '......', '我是谁', '?!', ''] - segments: List[str] = self.pattern.split(text) - - sentences: List[str] = [] - current_buffer: str = "" - - for segment in segments: - if not segment: - continue - - # 判断当前片段是否是标点块(通过首字符判断即可,正则保证了一致性) - is_punct_block = segment[0] in self.all_puncts_chars - - if is_punct_block: - current_buffer += segment - - # 计算缓冲区内容的【有效长度】 - eff_len = self.get_effective_len(current_buffer) - - # 判断逻辑 - if self.is_terminator_block(segment): - # Case B: 结束符号 -> 检查 min_len - if eff_len >= self.min_len: - sentences.append(current_buffer.strip()) - current_buffer = "" - # else: 有效长度太短,合并到下一句 - else: - # Case A-B: 分隔符号 -> 检查 max_len - if eff_len >= self.max_len: - sentences.append(current_buffer.strip()) - current_buffer = "" - # else: 没到最大长度,继续累积 - else: - # 纯文本 - current_buffer += segment - - # 处理残留缓冲区 - if current_buffer: - self._flush_buffer(sentences, current_buffer) - - return sentences - - def _flush_buffer(self, sentences: List[str], buffer: str): - candidate = buffer.strip() - if not candidate: - return - eff_len = self.get_effective_len(candidate) - if eff_len > 0: - sentences.append(candidate) - elif sentences: - sentences[-1] += candidate +import re +from typing import List, Set, Pattern + + +class TextSplitter: + def __init__(self, max_len: int = 40, min_len: int = 5): + """ + 初始化文本切分器。 + + :param max_len: 软限制最大长度 (Effective Length)。超过此长度遇到分隔符时会切分。 + :param min_len: 硬限制最小长度 (Effective Length)。小于此长度遇到终止符也不会切分。 + """ + self.max_len: int = max_len + self.min_len: int = min_len + + # 1. 定义基础字符集合 + # 只要标点块中包含这些字符,就视为 Ending (终止符) + self.end_chars: Set[str] = { + '。', '!', '?', '…', + '!', '?', '.' + } + + # 2. 定义标点符号全集 (用于正则匹配和长度计算过滤) + self.all_puncts_chars: Set[str] = self.end_chars | { + ',', '、', ';', ':', '——', + ',', ';', ':', + '“', '”', '‘', '’', '"', "'", + } + + # 3. 编译正则表达式 + # 使用非捕获组 (?:) 配合 + 号,实现贪婪匹配连续标点 + # sort + escape 确保正则安全且优先匹配长标点 + sorted_puncts: List[str] = sorted(list(self.all_puncts_chars), key=len, reverse=True) + escaped_puncts: List[str] = [re.escape(p) for p in sorted_puncts] + self.pattern: Pattern = re.compile(f"((?:{'|'.join(escaped_puncts)})+)") + + @staticmethod + def get_char_width(char: str) -> int: + """计算单字符宽度:ASCII算1,其他(中日韩)算2""" + return 1 if ord(char) < 128 else 2 + + def get_effective_len(self, text: str) -> int: + """ + 计算字符串的有效长度。 + 逻辑:跳过标点符号,仅计算内容字符。 + 例如:"你好......" 
-> 有效长度为 4 (你好),而不是 10。 + """ + length = 0 + for char in text: + # 如果是标点符号集合里的字符,不计入长度 + if char in self.all_puncts_chars: + continue + length += self.get_char_width(char) + return length + + def is_terminator_block(self, block: str) -> bool: + """ + 判断一个标点块是否起到结束句子的作用。 + 只要块中包含任意一个结束字符(如句号),则视为结束块。 + """ + for char in block: + if char in self.end_chars: + return True + return False + + def split(self, text: str) -> List[str]: + """核心切分逻辑""" + if not text: + return [] + + text = text.replace('\n', '') + + # 正则切分,segments 格式如: ['你好', '......', '我是谁', '?!', ''] + segments: List[str] = self.pattern.split(text) + + sentences: List[str] = [] + current_buffer: str = "" + + for segment in segments: + if not segment: + continue + + # 判断当前片段是否是标点块(通过首字符判断即可,正则保证了一致性) + is_punct_block = segment[0] in self.all_puncts_chars + + if is_punct_block: + current_buffer += segment + + # 计算缓冲区内容的【有效长度】 + eff_len = self.get_effective_len(current_buffer) + + # 判断逻辑 + if self.is_terminator_block(segment): + # Case B: 结束符号 -> 检查 min_len + if eff_len >= self.min_len: + sentences.append(current_buffer.strip()) + current_buffer = "" + # else: 有效长度太短,合并到下一句 + else: + # Case A-B: 分隔符号 -> 检查 max_len + if eff_len >= self.max_len: + sentences.append(current_buffer.strip()) + current_buffer = "" + # else: 没到最大长度,继续累积 + else: + # 纯文本 + current_buffer += segment + + # 处理残留缓冲区 + if current_buffer: + self._flush_buffer(sentences, current_buffer) + + return sentences + + def _flush_buffer(self, sentences: List[str], buffer: str): + candidate = buffer.strip() + if not candidate: + return + eff_len = self.get_effective_len(candidate) + if eff_len > 0: + sentences.append(candidate) + elif sentences: + sentences[-1] += candidate diff --git a/genie_tts/Utils/UserData.py b/genie_tts/Utils/UserData.py index 1d1bd517e02d35f683e92c3536fbc6027ab2b438..a32820c04d8753d609130f29a4693731f530dbee 100644 --- a/genie_tts/Utils/UserData.py +++ b/genie_tts/Utils/UserData.py @@ -1,44 +1,44 @@ -import json -from pathlib import Path -from typing import Dict, Any -import logging -import importlib.resources - -from .Constants import PACKAGE_NAME - -logger = logging.getLogger(__name__) - - -class UserDataManager: - def __init__(self, file_path: str = "./UserData.json"): - self.file_path = Path(file_path) - self._data: Dict[str, Any] = self._load() - - def _load(self) -> Dict[str, Any]: - if self.file_path.exists(): - try: - with self.file_path.open('r', encoding='utf-8') as f: - return json.load(f) - except (json.JSONDecodeError, IOError) as e: - logger.warning(f"Failed to read user data file {self.file_path}. Using empty configuration. Error: {e}") - - return {} - return {} - - def _save(self): - try: - with self.file_path.open('w', encoding='utf-8') as f: - json.dump(self._data, f, indent=4, ensure_ascii=False) # type: ignore - except IOError as e: - logger.warning(f"Failed to write user data file {self.file_path}. 
Error: {e}") - - def get(self, key: str, default: Any = None) -> Any: - return self._data.get(key, default) - - def set(self, key: str, value: Any): - self._data[key] = value - self._save() - - -userdata_file: str = str(importlib.resources.files(PACKAGE_NAME) / 'UserData.json') -userdata_manager = UserDataManager(file_path=userdata_file) +import json +from pathlib import Path +from typing import Dict, Any +import logging +import importlib.resources + +from .Constants import PACKAGE_NAME + +logger = logging.getLogger(__name__) + + +class UserDataManager: + def __init__(self, file_path: str = "./UserData.json"): + self.file_path = Path(file_path) + self._data: Dict[str, Any] = self._load() + + def _load(self) -> Dict[str, Any]: + if self.file_path.exists(): + try: + with self.file_path.open('r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + logger.warning(f"Failed to read user data file {self.file_path}. Using empty configuration. Error: {e}") + + return {} + return {} + + def _save(self): + try: + with self.file_path.open('w', encoding='utf-8') as f: + json.dump(self._data, f, indent=4, ensure_ascii=False) # type: ignore + except IOError as e: + logger.warning(f"Failed to write user data file {self.file_path}. Error: {e}") + + def get(self, key: str, default: Any = None) -> Any: + return self._data.get(key, default) + + def set(self, key: str, value: Any): + self._data[key] = value + self._save() + + +userdata_file: str = str(importlib.resources.files(PACKAGE_NAME) / 'UserData.json') +userdata_manager = UserDataManager(file_path=userdata_file) diff --git a/genie_tts/Utils/Utils.py b/genie_tts/Utils/Utils.py index 1453655a72e22716f4e966ef418480497f0a314c..7a5dc3aee0a8438d7c60ea1c9170f24708dd401b 100644 --- a/genie_tts/Utils/Utils.py +++ b/genie_tts/Utils/Utils.py @@ -1,28 +1,28 @@ -from collections import OrderedDict -import queue - - -class LRUCacheDict(OrderedDict): - def __init__(self, capacity): - super().__init__() - self.capacity = capacity - - def __getitem__(self, key): - value = super().__getitem__(key) - self.move_to_end(key) # 访问后移到末尾 - return value - - def __setitem__(self, key, value): - if key in self: - self.move_to_end(key) - super().__setitem__(key, value) - if len(self) > self.capacity: - self.popitem(last=False) # 删除最旧的(第一个) - - -def clear_queue(q: queue.Queue) -> None: - while not q.empty(): - try: - q.get_nowait() - except queue.Empty: - break +from collections import OrderedDict +import queue + + +class LRUCacheDict(OrderedDict): + def __init__(self, capacity): + super().__init__() + self.capacity = capacity + + def __getitem__(self, key): + value = super().__getitem__(key) + self.move_to_end(key) # 访问后移到末尾 + return value + + def __setitem__(self, key, value): + if key in self: + self.move_to_end(key) + super().__setitem__(key, value) + if len(self) > self.capacity: + self.popitem(last=False) # 删除最旧的(第一个) + + +def clear_queue(q: queue.Queue) -> None: + while not q.empty(): + try: + q.get_nowait() + except queue.Empty: + break diff --git a/genie_tts/Utils/__pycache__/Constants.cpython-311.pyc b/genie_tts/Utils/__pycache__/Constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..545d5045f6341fee86d4c69b4672e5fd65bfce5d Binary files /dev/null and b/genie_tts/Utils/__pycache__/Constants.cpython-311.pyc differ diff --git a/genie_tts/Utils/__pycache__/Language.cpython-311.pyc b/genie_tts/Utils/__pycache__/Language.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..5f5889d376989ff760e7ba39c580c32d5126c377 Binary files /dev/null and b/genie_tts/Utils/__pycache__/Language.cpython-311.pyc differ diff --git a/genie_tts/Utils/__pycache__/Shared.cpython-311.pyc b/genie_tts/Utils/__pycache__/Shared.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e806748737923a29612ec2dececb7cdf64c33ff2 Binary files /dev/null and b/genie_tts/Utils/__pycache__/Shared.cpython-311.pyc differ diff --git a/genie_tts/Utils/__pycache__/TextSplitter.cpython-311.pyc b/genie_tts/Utils/__pycache__/TextSplitter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80885a6eb7908aa4a1fb49c83c4c44af056c3de8 Binary files /dev/null and b/genie_tts/Utils/__pycache__/TextSplitter.cpython-311.pyc differ diff --git a/genie_tts/Utils/__pycache__/Utils.cpython-311.pyc b/genie_tts/Utils/__pycache__/Utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d8a9a67807edd0849ee5468d9e9b328c4a9b08f Binary files /dev/null and b/genie_tts/Utils/__pycache__/Utils.cpython-311.pyc differ diff --git a/genie_tts/Utils/__pycache__/__init__.cpython-311.pyc b/genie_tts/Utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02a8876a02344b41c35e92b723c01343a3e99035 Binary files /dev/null and b/genie_tts/Utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/genie_tts/__init__.py b/genie_tts/__init__.py index d910acdb31af65801d20206bbebfb55544af3570..b0d4fdcbcd124bdac127cc4037b95de73b4092d3 100644 --- a/genie_tts/__init__.py +++ b/genie_tts/__init__.py @@ -1,29 +1,31 @@ -from .Internal import ( - load_character, - unload_character, - set_reference_audio, - tts_async, - tts, - stop, - convert_to_onnx, - clear_reference_audio_cache, - load_predefined_character, - wait_for_playback_done, -) -from .Server import start_server -from .Core.Resources import download_genie_data - -__all__ = [ - "load_character", - "unload_character", - "set_reference_audio", - "tts_async", - "tts", - "stop", - "convert_to_onnx", - "clear_reference_audio_cache", - "start_server", - "load_predefined_character", - "wait_for_playback_done", - 'download_genie_data', -] +from .Internal import ( + load_character, + unload_character, + set_reference_audio, + tts_async, + tts, + stop, + convert_to_onnx, + clear_reference_audio_cache, + load_predefined_character, + wait_for_playback_done, +) +from .Server import start_server +from .Core.Resources import download_genie_data +from .ModelManager import model_manager + +__all__ = [ + "load_character", + "unload_character", + "set_reference_audio", + "tts_async", + "tts", + "stop", + "convert_to_onnx", + "clear_reference_audio_cache", + "start_server", + "load_predefined_character", + "wait_for_playback_done", + 'download_genie_data', + "model_manager", +] diff --git a/genie_tts/__pycache__/GetPhonesAndBert.cpython-311.pyc b/genie_tts/__pycache__/GetPhonesAndBert.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5854509ff9027dc8542f66f475b0817d88574255 Binary files /dev/null and b/genie_tts/__pycache__/GetPhonesAndBert.cpython-311.pyc differ diff --git a/genie_tts/__pycache__/Internal.cpython-311.pyc b/genie_tts/__pycache__/Internal.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d12a3b2d88c15beacc3ca945587fb579e7a0ffd Binary files /dev/null and b/genie_tts/__pycache__/Internal.cpython-311.pyc differ diff --git 
a/genie_tts/__pycache__/ModelManager.cpython-311.pyc b/genie_tts/__pycache__/ModelManager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e0c176223b24fde61940caadc612b98765f381f Binary files /dev/null and b/genie_tts/__pycache__/ModelManager.cpython-311.pyc differ diff --git a/genie_tts/__pycache__/PredefinedCharacter.cpython-311.pyc b/genie_tts/__pycache__/PredefinedCharacter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..387529a1944a268d693d20e0b632964dbc8f8d17 Binary files /dev/null and b/genie_tts/__pycache__/PredefinedCharacter.cpython-311.pyc differ diff --git a/genie_tts/__pycache__/Server.cpython-311.pyc b/genie_tts/__pycache__/Server.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6847332b0bce32e9469d20f991a34d0c2fb8ee06 Binary files /dev/null and b/genie_tts/__pycache__/Server.cpython-311.pyc differ diff --git a/genie_tts/__pycache__/__init__.cpython-311.pyc b/genie_tts/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63d8a1e31e8d271e5f3e0865a4abe8b2ebf2e877 Binary files /dev/null and b/genie_tts/__pycache__/__init__.cpython-311.pyc differ diff --git a/models/base/ref.wav b/models/base/ref.wav index 60b4426d3fd8427b875fb482f42abfd9739b4510..9a7847612f1411ae1c804b93e3b99812758d6645 100644 Binary files a/models/base/ref.wav and b/models/base/ref.wav differ diff --git a/models/god/ref.wav b/models/god/ref.wav index 9c9b6bf056f646d4f329537f718c62e3c111fa1a..b718b57dc5f9a25e7cd6be1479420468a4bce59c 100644 Binary files a/models/god/ref.wav and b/models/god/ref.wav differ
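
Usage sketch: the core of this patch is load_session_with_fp16_conversion() and the newly exported model_manager, so the snippet below shows how they are meant to be driven. It is a minimal, hedged example: the character directory ("CharacterModels/v2ProPlus/mika") mirrors the layout used by download_chara() but is only an assumption here, and the CPU provider list simply matches the ModelManager default; the file-name constants and function signatures come from ModelManager.py above.

    import os
    import onnxruntime
    from genie_tts.ModelManager import (
        GSVModelFile,
        load_session_with_fp16_conversion,
        model_manager,
    )

    # Assumed character directory, following the layout created by download_chara().
    model_dir = "CharacterModels/v2ProPlus/mika"

    # Load the VITS graph and patch its external-data initializers from the shared
    # FP16 weight file, widening them to FP32 in memory (no temporary .bin on disk).
    vits_onnx = os.path.join(model_dir, GSVModelFile.VITS_FP32)
    vits_fp16 = os.path.join(model_dir, GSVModelFile.VITS_WEIGHT_FP16)

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    if os.path.exists(vits_fp16):
        vits = load_session_with_fp16_conversion(
            vits_onnx, vits_fp16, ["CPUExecutionProvider"], sess_options
        )
    else:
        # Fall back to a plain session when no FP16 weight file ships with the model.
        vits = onnxruntime.InferenceSession(vits_onnx, providers=["CPUExecutionProvider"])

    print([inp.name for inp in vits.get_inputs()])

    # The same conversion end to end, via the manager (this is what load_character()
    # does for every model file in the directory before caching the sessions in the LRU dict):
    if model_manager.load_character("mika", model_dir, language="Japanese"):
        gsv = model_manager.get("mika")
        print(gsv.LANGUAGE, "V2ProPlus" if gsv.PROMPT_ENCODER is not None else "V2")

Because the FP32 bytes are injected into the ONNX proto before SerializeToString(), no converted weight file is ever written to disk; the before/after figures in the module's header docstring quantify the trade-off (slightly higher peak memory in exchange for not creating new .bin files).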