Approximetal committed on
Commit 34fb334 · verified · 1 Parent(s): 4d882a6

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +116 -9
  3. app.py +19 -0
  4. apt.txt +3 -0
  5. denoised_audio.wav +3 -0
  6. inference_gradio.py +603 -0
  7. lemas_tts/__init__.py +6 -0
  8. lemas_tts/api.py +252 -0
  9. lemas_tts/configs/multilingual_grl.yaml +78 -0
  10. lemas_tts/configs/multilingual_prosody.yaml +78 -0
  11. lemas_tts/infer/frontend.py +251 -0
  12. lemas_tts/infer/infer_cli.py +386 -0
  13. lemas_tts/infer/text_norm/__init__.py +0 -0
  14. lemas_tts/infer/text_norm/cn_tn.py +824 -0
  15. lemas_tts/infer/text_norm/en_tn.py +178 -0
  16. lemas_tts/infer/text_norm/gp2py.py +148 -0
  17. lemas_tts/infer/text_norm/id_tn.py +275 -0
  18. lemas_tts/infer/text_norm/jieba_dict.txt +0 -0
  19. lemas_tts/infer/text_norm/pinyin-lexicon-r.txt +4120 -0
  20. lemas_tts/infer/text_norm/symbols.py +419 -0
  21. lemas_tts/infer/text_norm/tokenizer.py +219 -0
  22. lemas_tts/infer/text_norm/txt2pinyin.py +225 -0
  23. lemas_tts/infer/utils_infer.py +651 -0
  24. lemas_tts/model/backbones/README.md +20 -0
  25. lemas_tts/model/backbones/dit.py +254 -0
  26. lemas_tts/model/backbones/ecapa_tdnn.py +931 -0
  27. lemas_tts/model/backbones/mmdit.py +189 -0
  28. lemas_tts/model/backbones/prosody_encoder.py +433 -0
  29. lemas_tts/model/backbones/unett.py +250 -0
  30. lemas_tts/model/cfm.py +899 -0
  31. lemas_tts/model/modules.py +802 -0
  32. lemas_tts/model/utils.py +190 -0
  33. lemas_tts/scripts/inference_gradio.py +584 -0
  34. requirements.txt +182 -0
  35. uvr5/gui_data/constants.py +1147 -0
  36. uvr5/lib_v5/mdxnet.py +140 -0
  37. uvr5/lib_v5/mixer.ckpt +3 -0
  38. uvr5/lib_v5/modules.py +74 -0
  39. uvr5/lib_v5/pyrb.py +92 -0
  40. uvr5/lib_v5/spec_utils.py +703 -0
  41. uvr5/lib_v5/vr_network/__init__.py +1 -0
  42. uvr5/lib_v5/vr_network/layers.py +143 -0
  43. uvr5/lib_v5/vr_network/layers_new.py +126 -0
  44. uvr5/lib_v5/vr_network/model_param_init.py +59 -0
  45. uvr5/lib_v5/vr_network/modelparams/1band_sr16000_hl512.json +19 -0
  46. uvr5/lib_v5/vr_network/modelparams/1band_sr32000_hl512.json +19 -0
  47. uvr5/lib_v5/vr_network/modelparams/1band_sr33075_hl384.json +19 -0
  48. uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json +19 -0
  49. uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl256.json +19 -0
  50. uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl512.json +19 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ denoised_audio.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,119 @@
+ # LEMAS-TTS Gradio Demo (Hugging Face Space)
+
+ This folder is a **clean, inference-only** version of LEMAS-TTS, organized for easy deployment on **Hugging Face Spaces**.
+
+ It keeps only:
+ - the inference models & configs (`lemas_tts`)
+ - pretrained checkpoints and vocab (`pretrained_models`)
+ - the bundled UVR5 denoiser (`uvr5`)
+ - a Gradio web UI (`inference_gradio.py`, `app.py`)
+
+ ---
+
+ ## 1. Features
+
+ - Zero-shot TTS: clone a voice from a reference audio clip plus its transcript (reference text)
+ - Multilingual text input (Chinese / English / ES / IT / PT / DE, etc.)
+ - Optional UVR5-based denoising of the reference audio
+ - Two custom LEMAS checkpoints:
+   - `multilingual_prosody_custom`
+   - `multilingual_acc_grl_custom`
+
+ ---
+
+ ## 2. Project Structure
+
+ ```text
+ LEMAS-TTS_gradio/
+   app.py                  # HF Space entrypoint (Gradio Blocks)
+   inference_gradio.py     # Full Gradio UI & logic
+   requirements.txt        # Minimal runtime dependencies
+
+   lemas_tts/              # Core LEMAS-TTS package (inference only)
+     api.py                # F5TTS API (used by the UI)
+     configs/              # Model configs (F5TTS / E2TTS)
+     infer/                # Inference utilities & text frontend
+     model/                # DiT backbone, utils, etc.
+
+   pretrained_models/      # All local assets needed for inference
+     ckpts/
+       F5TTS_v1_Base_vocos_custom_multilingual_prosody/model_2698000.pt
+       F5TTS_v1_Base_vocos_custom_multilingual_acc_grl/model_2680000.pt
+       prosody_encoder/...
+       vocos-mel-24khz/...
+     data/
+       multilingual_prosody_custom/vocab.txt
+       multilingual_acc_grl_custom/vocab.txt
+       test_examples/*.wav # Demo audios used in the UI
+     uvr5/
+       models/MDX_Net_Models/model_data/*.onnx, *.json
+
+   uvr5/                   # Bundled UVR5 implementation for denoising
+ ```
+
+ `lemas_tts.api.TTS` automatically resolves `pretrained_models/` based on the repo layout, so no extra path configuration is required.
+
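As a quick, untested sketch of driving that API directly (the checkpoint and vocab filenames are the ones in the tree above; `model` must name one of the yaml files under `lemas_tts/configs/`, and the espeak-ng assets configured in `inference_gradio.py` must be reachable):

```python
from lemas_tts.api import TTS, PRETRAINED_ROOT, CKPTS_ROOT

# Paths follow the tree above; adjust if your checkout differs.
tts = TTS(
    model="multilingual_grl",
    ckpt_file=str(CKPTS_ROOT / "F5TTS_v1_Base_vocos_custom_multilingual_acc_grl" / "model_2680000.pt"),
    vocab_file=str(PRETRAINED_ROOT / "data" / "multilingual_acc_grl_custom" / "vocab.txt"),
)
wav, sr, spec = tts.infer(
    ref_file=str(PRETRAINED_ROOT / "data" / "test_examples" / "en.wav"),
    ref_text="em, #1 I have a list of YouTubers, and I'm gonna be going to their houses and raiding them by.",
    gen_text="我有一份 YouTuber 名单,我打算去他们家,对他们进行突袭。",
    file_wave="out.wav",  # generated audio is written here
)
```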
+ ---
+
+ ## 3. How to Run Locally
+
+ ```bash
+ cd LEMAS-TTS_gradio
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ Then open the printed URL (default `http://127.0.0.1:7860`) in your browser.
+
  ---
- title: LEMAS TTS
- emoji: 🔥
- colorFrom: red
- colorTo: indigo
- sdk: gradio
- sdk_version: 6.2.0
- app_file: app.py
- pinned: false
+
+ ## 4. Hugging Face Space Setup
+
+ 1. Create a new Space (type: **Gradio**).
+ 2. Upload the contents of `LEMAS-TTS_gradio/` to the Space repo:
+    - `app.py`
+    - `inference_gradio.py`
+    - `requirements.txt`
+    - `lemas_tts/`
+    - `pretrained_models/`
+    - `uvr5/`
+ 3. In the Space settings, choose a GPU hardware profile (the model is heavy).
+ 4. The Space will automatically run `app.py` and launch the Gradio Blocks named `app`.
+
+ No extra arguments are needed; all paths inside the repo are relative.
+
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## 5. Usage Tips
+
+ - **Reference Text** should roughly match the reference audio in both content and language for the best voice cloning.
+ - **Denoise**:
+   - Turn it on if the reference audio is noisy; denoising runs UVR5 on the CPU.
+   - Turn it off if the reference is already clean (this saves time).
+ - **Seed**:
+   - `-1` → random seed
+   - any other integer → reproducible output (see the sketch below)
+
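A minimal sketch of how the seed field is interpreted, mirroring `infer` in `inference_gradio.py` and `TTS.infer` in `lemas_tts/api.py`:

```python
import random
import sys

def resolve_seed(seed):
    """-1 in the UI means 'random'; the API then draws a fresh seed itself."""
    if seed == -1:    # UI convention: -1 -> random
        seed = None
    if seed is None:  # API convention: None -> draw a random seed
        seed = random.randint(0, sys.maxsize)
    return seed       # the API passes this to seed_everything() and reports it back
```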
98
+
99
+ ## 6. 中文说明(简要)
100
+
101
+ 这个目录是专门为 **Hugging Face Space** 打包的 **推理版 LEMAS-TTS**:
102
+
103
+ - 只保留推理相关代码(`lemas_tts`)、预训练模型(`pretrained_models`)和 UVR5 去噪模块(`uvr5`)
104
+ - Gradio 入口为 `app.py`,内部调用 `inference_gradio.py` 里的 `app`(一个 `gr.Blocks` 界面)
105
+ - `pretrained_models/` 下已经包含:
106
+ - 自定义多语种 prosody / accent GRL 的 finetune 权重
107
+ - vocoder(`vocos-mel-24khz`)
108
+ - prosody encoder
109
+ - 以及示例语音 `test_examples/*.wav`
110
+
111
+ 在本地或 Space 中运行步骤:
112
+
113
+ ```bash
114
+ pip install -r requirements.txt
115
+ python app.py
116
+ ```
117
+
118
+ 然后在浏览器中打开提示的链接即可使用零样本 TTS Demo。
119
+
app.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Gradio entrypoint for Hugging Face Spaces.
+
+ This file simply re-exports the `app` Blocks defined in `inference_gradio.py`
+ so that Spaces can discover and launch it.
+ """
+
+ import gradio as gr  # noqa: F401
+
+ from inference_gradio import app as _app
+
+ # Expose as both `app` and `demo` for maximum compatibility
+ app = _app
+ demo = _app
+
+
+ if __name__ == "__main__":
+     app.queue(api_open=True).launch()
+
apt.txt ADDED
@@ -0,0 +1,3 @@
+ ffmpeg
+ espeak-ng
+ espeak
denoised_audio.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7d715f294233999f56d51424b1ab8e9d28aed5dfb0821b427c2fb4f4abaa3aa
+ size 1386548
inference_gradio.py ADDED
@@ -0,0 +1,603 @@
+ import gc
+ import os
+ import platform
+ import psutil
+ import tempfile
+ from glob import glob
+ import traceback
+ import click
+ import gradio as gr
+ import torch
+ import torchaudio
+ import soundfile as sf
+ from pathlib import Path
+
+ from cached_path import cached_path
+
+ from lemas_tts.api import TTS, PRETRAINED_ROOT, CKPTS_ROOT
+
+ # Global variables
+ tts_api = None
+ last_checkpoint = ""
+ last_device = ""
+ last_ema = None
+
+ # Device detection
+ device = (
+     "cuda"
+     if torch.cuda.is_available()
+     else "xpu"
+     if torch.xpu.is_available()
+     else "mps"
+     if torch.backends.mps.is_available()
+     else "cpu"
+ )
+
+ REPO_ROOT = Path(__file__).resolve().parent
+
+ # HF location for pretrained assets (used as a fallback when local files are missing)
+ HF_PRETRAINED_ROOT = "hf://LEMAS-Project/LEMAS-TTS/pretrained_models"
+
+ # 1) Point to the libespeak-ng.so bundled in this repo
+ ESPEAK_LIB = PRETRAINED_ROOT / "espeak-ng-lib" / "libespeak-ng.so"
+ os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = str(ESPEAK_LIB)
+
+ # 2) Point to the espeak-ng-data bundled in this repo
+ ESPEAK_DATA_DIR = PRETRAINED_ROOT / "espeak-ng-data"
+ os.environ["ESPEAK_DATA_PATH"] = str(ESPEAK_DATA_DIR)
+ os.environ["ESPEAKNG_DATA_PATH"] = str(ESPEAK_DATA_DIR)
+
+
+ class UVR5:
+     """Small wrapper around the bundled uvr5 implementation for denoising."""
+
+     def __init__(self, model_dir: Path, code_dir: Path):
+         self.model = self.load_model(str(model_dir), str(code_dir))
+
+     def load_model(self, model_dir: str, code_dir: str):
+         import sys
+         import json
+
+         if code_dir not in sys.path:
+             sys.path.append(code_dir)
+
+         from multiprocess_cuda_infer import ModelData, Inference
+
+         model_path = os.path.join(model_dir, "Kim_Vocal_1.onnx")
+         config_path = os.path.join(model_dir, "MDX-Net-Kim-Vocal1.json")
+         configs = json.loads(open(config_path, "r", encoding="utf-8").read())
+         model_data = ModelData(
+             model_path=model_path,
+             audio_path=model_dir,
+             result_path=model_dir,
+             device="cpu",
+             process_method="MDX-Net",
+             base_dir=model_dir,  # keep base_dir and model_dir the same (paths under `pretrained_models`)
+             **configs,
+         )
+
+         uvr5_model = Inference(model_data, "cpu")
+         uvr5_model.load_model(model_path, 1)
+         return uvr5_model
+
+     def denoise(self, audio_info):
+         print("denoise UVR5: ", audio_info)
+         input_audio = load_wav(audio_info, sr=44100, channel=2)
+         output_audio = self.model.demix_base({0: input_audio.squeeze()}, is_match_mix=False)
+         return output_audio.squeeze().T.numpy(), 44100
+
+
+ denoise_model = UVR5(
+     model_dir=PRETRAINED_ROOT / "uvr5",
+     code_dir=REPO_ROOT / "uvr5",
+ )
+
+
+ def load_wav(audio_info, sr=16000, channel=1):
+     print("load audio:", audio_info)
+     audio, raw_sr = torchaudio.load(audio_info)
+     audio = audio.T if len(audio.shape) > 1 and audio.shape[1] == 2 else audio
+     audio = audio / torch.max(torch.abs(audio))
+     audio = audio.squeeze().float()
+     if channel == 1 and len(audio.shape) == 2:  # stereo to mono
+         audio = audio.mean(dim=0, keepdim=True)
+     elif channel == 2 and len(audio.shape) == 1:
+         audio = torch.stack((audio, audio))  # mono to stereo
+     if raw_sr != sr:
+         audio = torchaudio.functional.resample(audio.squeeze(), raw_sr, sr)
+     audio = torch.clip(audio, -0.999, 0.999).squeeze()
+     return audio
+
+
+ def denoise(audio_info):
+     save_path = "./denoised_audio.wav"
+     denoised_audio, sr = denoise_model.denoise(audio_info)
+     sf.write(save_path, denoised_audio, sr, format="wav", subtype="PCM_24")
+     print("save denoised audio:", save_path)
+     return save_path
+
+
+ def cancel_denoise(audio_info):
+     return audio_info
+
+
+ def get_checkpoints_project(project_name=None, is_gradio=True):
+     """Get available checkpoint files"""
+     checkpoint_dir = [str(CKPTS_ROOT)]
+     # Remote ckpt locations on HF (used if local ckpts are not present)
+     remote_ckpts = {
+         "multilingual_grl": f"{HF_PRETRAINED_ROOT}/ckpts/multilingual_grl/multilingual_grl.safetensors",
+         "multilingual_prosody": f"{HF_PRETRAINED_ROOT}/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
+     }
+
+     if project_name is None:
+         # Look for checkpoints in the local directory
+         files_checkpoints = []
+         for path in checkpoint_dir:
+             if os.path.isdir(path):
+                 files_checkpoints.extend(glob(os.path.join(path, "**/*.pt"), recursive=True))
+                 files_checkpoints.extend(glob(os.path.join(path, "**/*.safetensors"), recursive=True))
+                 break
+         # Fallback: use HF ckpts
+         if not files_checkpoints:
+             files_checkpoints = list(remote_ckpts.values())
+     else:
+         if os.path.isdir(checkpoint_dir[0]):
+             files_checkpoints = glob(os.path.join(checkpoint_dir[0], project_name, "*.pt"))
+             files_checkpoints.extend(glob(os.path.join(checkpoint_dir[0], project_name, "*.safetensors")))
+         else:
+             ckpt = remote_ckpts.get(project_name)
+             files_checkpoints = [ckpt] if ckpt is not None else []
+     print("files_checkpoints:", project_name, files_checkpoints)
+     # Separate pretrained and regular checkpoints
+     pretrained_checkpoints = [f for f in files_checkpoints if "pretrained_" in os.path.basename(f)]
+     regular_checkpoints = [
+         f
+         for f in files_checkpoints
+         if "pretrained_" not in os.path.basename(f) and "model_last.pt" not in os.path.basename(f)
+     ]
+     last_checkpoint = [f for f in files_checkpoints if "model_last.pt" in os.path.basename(f)]
+
+     # Sort regular checkpoints by number
+     try:
+         regular_checkpoints = sorted(
+             regular_checkpoints, key=lambda x: int(os.path.basename(x).split("_")[1].split(".")[0])
+         )
+     except (IndexError, ValueError):
+         regular_checkpoints = sorted(regular_checkpoints)
+
+     # Combine in order: pretrained, regular, last
+     files_checkpoints = pretrained_checkpoints + regular_checkpoints + last_checkpoint
+
+     select_checkpoint = None if not files_checkpoints else files_checkpoints[-1]
+
+     if is_gradio:
+         return gr.update(choices=files_checkpoints, value=select_checkpoint)
+
+     return files_checkpoints, select_checkpoint
+
+
+ def get_available_projects():
+     """Get available project names from the data directory"""
+     data_paths = [
+         str(PRETRAINED_ROOT / "data"),
+     ]
+
+     project_list = []
+     for data_path in data_paths:
+         if os.path.isdir(data_path):
+             for folder in os.listdir(data_path):
+                 path_folder = os.path.join(data_path, folder)
+                 if "test" not in folder:
+                     project_list.append(folder)
+             break
+     # Fallback: if no local data dir, default to known HF projects
+     if not project_list:
+         project_list = ["multilingual_grl", "multilingual_prosody"]
+     project_list.sort()
+     print("project_list:", project_list)
+     return project_list
+
+
+ def infer(
+     project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema,
+     separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef,
+     use_prosody_encoder, seed,
+ ):
+     global last_checkpoint, last_device, tts_api, last_ema
+
+     # Resolve checkpoint path (local or HF)
+     ckpt_path = file_checkpoint
+     if isinstance(ckpt_path, str) and ckpt_path.startswith("hf://"):
+         try:
+             ckpt_resolved = str(cached_path(ckpt_path))
+         except Exception as e:
+             traceback.print_exc()
+             return None, f"Error downloading checkpoint: {str(e)}", ""
+     else:
+         ckpt_resolved = ckpt_path
+
+     if not os.path.isfile(ckpt_resolved):
+         return None, "Checkpoint not found!", ""
+
+     if denoise_audio:
+         ref_audio = denoise_audio
+
+     device_test = device  # Use the global device
+
+     if last_checkpoint != ckpt_resolved or last_device != device_test or last_ema != use_ema or tts_api is None:
+         if last_checkpoint != ckpt_resolved:
+             last_checkpoint = ckpt_resolved
+
+         if last_device != device_test:
+             last_device = device_test
+
+         if last_ema != use_ema:
+             last_ema = use_ema
+
+         # Automatically enable the prosody encoder when using the prosody checkpoint
+         use_prosody_encoder = True if "prosody" in str(ckpt_resolved) else False
+
+         # Resolve vocab file (local or HF)
+         local_vocab = Path(PRETRAINED_ROOT) / "data" / project / "vocab.txt"
+         if local_vocab.is_file():
+             vocab_file = str(local_vocab)
+         else:
+             remote_vocab_map = {
+                 "multilingual_grl": f"{HF_PRETRAINED_ROOT}/data/multilingual_grl/vocab.txt",
+                 "multilingual_prosody": f"{HF_PRETRAINED_ROOT}/data/multilingual_prosody/vocab.txt",
+             }
+             remote_vocab = remote_vocab_map.get(project)
+             if remote_vocab is None:
+                 return None, "Vocab file not found!", ""
+             try:
+                 vocab_file = str(cached_path(remote_vocab))
+             except Exception as e:
+                 traceback.print_exc()
+                 return None, f"Error downloading vocab: {str(e)}", ""
+
+         # Resolve prosody encoder config & weights
+         local_prosody_cfg = CKPTS_ROOT / "prosody_encoder" / "pretssel_cfg.json"
+         local_prosody_ckpt = CKPTS_ROOT / "prosody_encoder" / "prosody_encoder_UnitY2.pt"
+         if local_prosody_cfg.is_file():
+             prosody_cfg_path = str(local_prosody_cfg)
+         else:
+             prosody_cfg_path = str(
+                 cached_path(f"{HF_PRETRAINED_ROOT}/ckpts/prosody_encoder/pretssel_cfg.json")
+             )
+         if local_prosody_ckpt.is_file():
+             prosody_ckpt_path = str(local_prosody_ckpt)
+         else:
+             prosody_ckpt_path = str(
+                 cached_path(f"{HF_PRETRAINED_ROOT}/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt")
+             )
+
+         try:
+             tts_api = TTS(
+                 model=exp_name,
+                 ckpt_file=ckpt_resolved,
+                 vocab_file=vocab_file,
+                 device=device_test,
+                 use_ema=use_ema,
+                 frontend=frontend,
+                 use_prosody_encoder=use_prosody_encoder,
+                 prosody_cfg_path=prosody_cfg_path,
+                 prosody_ckpt_path=prosody_ckpt_path,
+             )
+         except Exception as e:
+             traceback.print_exc()
+             return None, f"Error loading model: {str(e)}", ""
+
+         print("Model loaded >>", device_test, file_checkpoint, use_ema)
+
+     if seed == -1:  # -1 used for random
+         seed = None
+
+     try:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+             tts_api.infer(
+                 ref_file=ref_audio,
+                 ref_text=ref_text.strip(),
+                 gen_text=gen_text.strip(),
+                 nfe_step=nfe_step,
+                 separate_langs=separate_langs,
+                 speed=speed,
+                 cfg_strength=cfg_strength,
+                 sway_sampling_coef=sway_sampling_coef,
+                 use_acc_grl=use_acc_grl,
+                 ref_ratio=ref_ratio,
+                 no_ref_audio=no_ref_audio,
+                 use_prosody_encoder=use_prosody_encoder,
+                 file_wave=f.name,
+                 seed=seed,
+             )
+         return f.name, f"Device: {tts_api.device}", str(tts_api.seed)
+     except Exception as e:
+         traceback.print_exc()
+         return None, f"Inference error: {str(e)}", ""
+
+
+ def get_gpu_stats():
+     """Get GPU statistics"""
+     gpu_stats = ""
+
+     if torch.cuda.is_available():
+         gpu_count = torch.cuda.device_count()
+         for i in range(gpu_count):
+             gpu_name = torch.cuda.get_device_name(i)
+             gpu_properties = torch.cuda.get_device_properties(i)
+             total_memory = gpu_properties.total_memory / (1024**3)  # in GB
+             allocated_memory = torch.cuda.memory_allocated(i) / (1024**2)  # in MB
+             reserved_memory = torch.cuda.memory_reserved(i) / (1024**2)  # in MB
+
+             gpu_stats += (
+                 f"GPU {i} Name: {gpu_name}\n"
+                 f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
+                 f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
+                 f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
+             )
+     elif torch.xpu.is_available():
+         gpu_count = torch.xpu.device_count()
+         for i in range(gpu_count):
+             gpu_name = torch.xpu.get_device_name(i)
+             gpu_properties = torch.xpu.get_device_properties(i)
+             total_memory = gpu_properties.total_memory / (1024**3)  # in GB
+             allocated_memory = torch.xpu.memory_allocated(i) / (1024**2)  # in MB
+             reserved_memory = torch.xpu.memory_reserved(i) / (1024**2)  # in MB
+
+             gpu_stats += (
+                 f"GPU {i} Name: {gpu_name}\n"
+                 f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
+                 f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
+                 f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
+             )
+     elif torch.backends.mps.is_available():
+         gpu_count = 1
+         gpu_stats += "MPS GPU\n"
+         total_memory = psutil.virtual_memory().total / (
+             1024**3
+         )  # Total system memory (MPS doesn't have its own memory)
+         allocated_memory = 0
+         reserved_memory = 0
+
+         gpu_stats += (
+             f"Total system memory: {total_memory:.2f} GB\n"
+             f"Allocated GPU memory (MPS): {allocated_memory:.2f} MB\n"
+             f"Reserved GPU memory (MPS): {reserved_memory:.2f} MB\n"
+         )
+
+     else:
+         gpu_stats = "No GPU available"
+
+     return gpu_stats
+
+
+ def get_cpu_stats():
+     """Get CPU statistics"""
+     cpu_usage = psutil.cpu_percent(interval=1)
+     memory_info = psutil.virtual_memory()
+     memory_used = memory_info.used / (1024**2)
+     memory_total = memory_info.total / (1024**2)
+     memory_percent = memory_info.percent
+
+     pid = os.getpid()
+     process = psutil.Process(pid)
+     nice_value = process.nice()
+
+     cpu_stats = (
+         f"CPU Usage: {cpu_usage:.2f}%\n"
+         f"System Memory: {memory_used:.2f} MB used / {memory_total:.2f} MB total ({memory_percent}% used)\n"
+         f"Process Priority (Nice value): {nice_value}"
+     )
+
+     return cpu_stats
+
+
+ def get_combined_stats():
+     """Get combined system stats"""
+     gpu_stats = get_gpu_stats()
+     cpu_stats = get_cpu_stats()
+     combined_stats = f"### GPU Stats\n{gpu_stats}\n\n### CPU Stats\n{cpu_stats}"
+     return combined_stats
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="LEMAS-TTS Inference") as app:
+     gr.Markdown(
+         """
+         # Zero-Shot TTS
+
+         Set seed to -1 for random generation.
+         """
+     )
+     with gr.Accordion("Model configuration", open=False):
+         # Model configuration
+         with gr.Row():
+             exp_name = gr.Radio(
+                 label="Model",
+                 choices=["multilingual_grl", "multilingual_prosody"],
+                 value="multilingual_grl",
+                 visible=False,
+             )
+         # Project selection
+         available_projects = get_available_projects()
+
+         # Get initial checkpoints
+         list_checkpoints, checkpoint_select = get_checkpoints_project(available_projects[0] if available_projects else None, False)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # load_models_btn = gr.Button(value="Load models")
+                 cm_project = gr.Dropdown(
+                     choices=available_projects,
+                     value=available_projects[0] if available_projects else None,
+                     label="Project",
+                     allow_custom_value=True,
+                     scale=4,
+                 )
+
+             with gr.Column(scale=5):
+                 cm_checkpoint = gr.Dropdown(
+                     choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True  # scale=4,
+                 )
+                 bt_checkpoint_refresh = gr.Button("Refresh", scale=1)
+
+         with gr.Row():
+             ch_use_ema = gr.Checkbox(label="Use EMA", visible=False, value=True, scale=2, info="Turning this off at an early training stage might give better results")
+             frontend = gr.Radio(label="Frontend", visible=False, choices=["phone", "char", "bpe"], value="phone", scale=3)
+             separate_langs = gr.Checkbox(label="Separate Languages", visible=False, value=True, scale=2, info="separate language tokens")
+
+         # Inference parameters
+         with gr.Row():
+             nfe_step = gr.Number(label="NFE Step", scale=1, value=64)
+             speed = gr.Slider(label="Speed", scale=3, value=1.0, minimum=0.5, maximum=1.5, step=0.1)
+             cfg_strength = gr.Slider(label="CFG Strength", scale=2, value=5.0, minimum=0.0, maximum=10.0, step=1)
+             sway_sampling_coef = gr.Slider(label="Sway Sampling Coef", scale=2, value=3, minimum=2, maximum=5, step=0.1)
+             ref_ratio = gr.Slider(label="Ref Ratio", scale=2, value=1.0, minimum=0.0, maximum=1.0, step=0.1)
+             no_ref_audio = gr.Checkbox(label="No Reference Audio", visible=False, value=False, scale=1, info="No mel condition")
+             use_acc_grl = gr.Checkbox(label="Use accent grl condition", visible=False, value=True, scale=1, info="Use accent grl condition")
+             use_prosody_encoder = gr.Checkbox(label="Use prosody encoder", visible=False, value=False, scale=1, info="Use prosody encoder")
+             seed = gr.Number(label="Random Seed", scale=1, value=-1, minimum=-1)
+
+     # Input fields
+     ref_text = gr.Textbox(label="Reference Text", placeholder="Enter the text for the reference audio...")
+     ref_audio = gr.Audio(label="Reference Audio", type="filepath", interactive=True, show_download_button=True, editable=True)
+
+     with gr.Accordion("Denoise audio (Optional / Recommended)", open=True):
+         with gr.Row():
+             denoise_btn = gr.Button(value="Denoise")
+             cancel_btn = gr.Button(value="Cancel Denoise")
+         denoise_audio = gr.Audio(label="Denoised Audio", value=None, type="filepath", interactive=True, show_download_button=True, editable=True)
+
+     gen_text = gr.Textbox(label="Text to Generate", placeholder="Enter the text you want to generate...")
+
+     # Inference button and outputs
+     with gr.Row():
+         txt_info_gpu = gr.Textbox("", label="Device Info")
+         seed_info = gr.Textbox(label="Used Random Seed")
+         check_button_infer = gr.Button("Generate Audio", variant="primary")
+
+     gen_audio = gr.Audio(label="Generated Audio", type="filepath", interactive=True, show_download_button=True, editable=True)
+
+     # Examples
+     def _resolve_example(name: str) -> str:
+         local = PRETRAINED_ROOT / "data" / "test_examples" / name
+         if local.is_file():
+             return str(local)
+         remote_map = {
+             "en.wav": f"{HF_PRETRAINED_ROOT}/data/test_examples/en.wav",
+             "es.wav": f"{HF_PRETRAINED_ROOT}/data/test_examples/es.wav",
+             "pt.wav": f"{HF_PRETRAINED_ROOT}/data/test_examples/pt.wav",
+         }
+         url = remote_map.get(name)
+         return str(cached_path(url)) if url is not None else ""
+
+     examples = gr.Examples(
+         examples=[
+             [
+                 "em, #1 I have a list of YouTubers, and I'm gonna be going to their houses and raiding them by.",
+                 _resolve_example("en.wav"),
+                 "我有一份 YouTuber 名单,我打算去他们家,对他们进行突袭。",
+             ],
+             [
+                 "Te voy a dar un tip #1 que le copia a John Rockefeller, uno de los empresarios más picudos de la historia.",
+                 _resolve_example("es.wav"),
+                 "我要给你一个从历史上最精明的商人之一约翰·洛克菲勒那里抄来的秘诀。",
+             ],
+             [
+                 "Nova, #1 dia 25 desse mês vai rolar operação the last Frontier.",
+                 _resolve_example("pt.wav"),
+                 "新消息,本月二十五日,'最后的边疆行动'将启动。",
+             ],
+         ],
+         inputs=[
+             ref_text,
+             ref_audio,
+             gen_text,
+         ],
+         outputs=[gen_audio, txt_info_gpu, seed_info],
+         fn=infer,
+         cache_examples=False,
+     )
+
+     # System Info section at the bottom
+     gr.Markdown("---")
+     gr.Markdown("## System Information")
+     with gr.Accordion("Update System Stats", open=False):
+         update_button = gr.Button("Update System Stats", scale=1)
+         output_box = gr.Textbox(label="GPU and CPU Information", lines=5, scale=5)
+
+     def update_stats():
+         return get_combined_stats()
+
+     denoise_btn.click(fn=denoise, inputs=[ref_audio], outputs=[denoise_audio])
+
+     cancel_btn.click(fn=cancel_denoise, inputs=[ref_audio], outputs=[denoise_audio])
+
+     # Event handlers
+     check_button_infer.click(
+         fn=infer,
+         inputs=[
+             cm_project,
+             cm_checkpoint,
+             exp_name,
+             ref_text,
+             ref_audio,
+             denoise_audio,
+             gen_text,
+             nfe_step,
+             ch_use_ema,
+             separate_langs,
+             frontend,
+             speed,
+             cfg_strength,
+             use_acc_grl,
+             ref_ratio,
+             no_ref_audio,
+             sway_sampling_coef,
+             use_prosody_encoder,
+             seed,
+         ],
+         outputs=[gen_audio, txt_info_gpu, seed_info],
+     )
+
+     bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
+     cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
+
+     ref_audio.change(
+         fn=lambda x: None,
+         inputs=[ref_audio],
+         outputs=[denoise_audio],
+     )
+
+     update_button.click(fn=update_stats, outputs=output_box)
+
+     # Auto-load system stats on startup
+     app.load(fn=update_stats, outputs=output_box)
+
+
+ @click.command()
+ @click.option("--port", "-p", default=7860, type=int, help="Port to run the app on")
+ @click.option("--host", "-H", default="0.0.0.0", help="Host to run the app on")
+ @click.option(
+     "--share",
+     "-s",
+     default=False,
+     is_flag=True,
+     help="Share the app via Gradio share link",
+ )
+ @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
+ def main(port, host, share, api):
+     global app
+     print("Starting LEMAS-TTS Inference Interface...")
+     print(f"Device: {device}")
+     app.queue(api_open=api).launch(
+         server_name=host,
+         server_port=port,
+         share=share,
+         show_api=api,
+         allowed_paths=[str(PRETRAINED_ROOT / "data")],
+     )
+
+
+ if __name__ == "__main__":
+     main()
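The numeric sort in `get_checkpoints_project` above assumes filenames of the form `model_<updates>.pt`. A quick illustration of the sort key (the second filename is hypothetical, added to show where a plain lexicographic sort would go wrong):

```python
import os

names = ["model_2698000.pt", "model_900000.pt"]  # second name is hypothetical
key = lambda x: int(os.path.basename(x).split("_")[1].split(".")[0])

print(sorted(names))           # ['model_2698000.pt', 'model_900000.pt']  (lexicographic, wrong order)
print(sorted(names, key=key))  # ['model_900000.pt', 'model_2698000.pt']  (sorted by update count)
```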
lemas_tts/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .api import TTS
+
+ __all__ = ["TTS"]
+
+ __version__ = "0.1.0"
+
lemas_tts/api.py ADDED
@@ -0,0 +1,252 @@
+ import random
+ import sys
+ from pathlib import Path
+ import re, regex
+ import soundfile as sf
+ import tqdm
+ from cached_path import cached_path
+ from hydra.utils import get_class
+ from omegaconf import OmegaConf
+
+ from lemas_tts.infer.utils_infer import (
+     load_model,
+     load_vocoder,
+     transcribe,
+     preprocess_ref_audio_text,
+     infer_process,
+     remove_silence_for_generated_wav,
+     save_spectrogram,
+ )
+ from lemas_tts.model.utils import seed_everything
+ from lemas_tts.model.backbones.dit import DiT
+
+
+ # Resolve repository layout so we can find pretrained assets (ckpts, vocoder, etc.)
+ THIS_FILE = Path(__file__).resolve()
+
+
+ def _find_repo_root(start: Path) -> Path:
+     """Locate the repo root by looking for a `pretrained_models` folder upwards."""
+     for p in [start, *start.parents]:
+         if (p / "pretrained_models").is_dir():
+             return p
+     cwd = Path.cwd()
+     if (cwd / "pretrained_models").is_dir():
+         return cwd
+     return start
+
+
+ REPO_ROOT = _find_repo_root(THIS_FILE)
+ # Local pretrained root (used when running from a repo / Space that bundles weights)
+ PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"
+ # Remote pretrained root on Hugging Face Hub (fallback when local files are absent)
+ HF_PRETRAINED_ROOT = "hf://LEMAS-Project/LEMAS-TTS/pretrained_models"
+ CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"
+
+
+ class TTS:
+     def __init__(
+         self,
+         model="multilingual",
+         ckpt_file="",
+         vocab_file="",
+         ode_method="euler",
+         use_ema=False,
+         vocoder_local_path=str(CKPTS_ROOT / "vocos-mel-24khz"),
+         use_prosody_encoder=False,
+         prosody_cfg_path="",
+         prosody_ckpt_path="",
+         device=None,
+         hf_cache_dir=None,
+         frontend="phone",
+     ):
+         # Load model architecture config from the bundled yaml
+         config_dir = THIS_FILE.parent / "configs"
+         model_cfg = OmegaConf.load(config_dir / f"{model}.yaml")
+         # model_cls = get_class(f"lemas_tts.model.dit.{model_cfg.model.backbone}")
+         model_arc = model_cfg.model.arch
+
+         self.mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
+         self.target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
+
+         self.ode_method = ode_method
+         self.use_ema = use_ema
+         self.langs = {"cmn": "zh", "zh": "zh", "en": "en-us", "it": "it", "es": "es", "pt": "pt-br", "fr": "fr-fr", "de": "de", "ru": "ru", "id": "id", "vi": "vi", "th": "th"}
+
+         if device is not None:
+             self.device = device
+         else:
+             import torch
+
+             self.device = (
+                 "cuda"
+                 if torch.cuda.is_available()
+                 else "xpu"
+                 if torch.xpu.is_available()
+                 else "mps"
+                 if torch.backends.mps.is_available()
+                 else "cpu"
+             )
+
+         # Load models
+         # Prefer the local vocoder directory if it exists; otherwise let `load_vocoder`
+         # fall back to downloading from the default HF repo (charactr/vocos-mel-24khz).
+         vocoder_is_local = False
+         if vocoder_local_path is not None:
+             try:
+                 vocoder_is_local = Path(vocoder_local_path).is_dir()
+             except TypeError:
+                 vocoder_is_local = False
+
+         self.vocoder = load_vocoder(
+             self.mel_spec_type, vocoder_is_local, vocoder_local_path, self.device, hf_cache_dir
+         )
+         # self.vocoder = load_vocoder(vocoder_name="vocos", is_local=True, local_path=vocoder_local_path, device=self.device)
+         if frontend is not None:
+             from lemas_tts.infer.frontend import TextNorm
+
+             self.frontend = TextNorm(dtype=frontend)
+         else:
+             self.frontend = None
+
+         self.ema_model = load_model(
+             DiT, model_arc, ckpt_file, self.mel_spec_type, vocab_file, self.ode_method, self.use_ema, self.device,
+             use_prosody_encoder=use_prosody_encoder, prosody_cfg_path=prosody_cfg_path, prosody_ckpt_path=prosody_ckpt_path,
+         )
+
+     def transcribe(self, ref_audio, language=None):
+         return transcribe(ref_audio, language)
+
+     def export_wav(self, wav, file_wave, remove_silence=False):
+         sf.write(file_wave, wav, self.target_sample_rate)
+
+         if remove_silence:
+             remove_silence_for_generated_wav(file_wave)
+
+     def export_spectrogram(self, spec, file_spec):
+         save_spectrogram(spec, file_spec)
+
+     def infer(
+         self,
+         ref_file,
+         ref_text,
+         gen_text,
+         show_info=print,
+         progress=tqdm,
+         target_rms=0.1,
+         cross_fade_duration=0.15,
+         use_acc_grl=False,
+         ref_ratio=None,
+         no_ref_audio=False,
+         cfg_strength=2,
+         nfe_step=32,
+         speed=1.0,
+         sway_sampling_coef=5,
+         separate_langs=False,
+         fix_duration=None,
+         use_prosody_encoder=True,
+         file_wave=None,
+         file_spec=None,
+         seed=None,
+     ):
+         if seed is None:
+             seed = random.randint(0, sys.maxsize)
+         seed_everything(seed)
+         self.seed = seed
+
+         ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text)
+         print("preprocess:\n", "ref_file:", ref_file, "\nref_text:", ref_text)
+         if self.frontend.dtype == "phone":
+             ref_text = self.frontend.text2phn(ref_text + ". ").replace("(cmn)", "(zh)").split("|")
+             gen_text = gen_text.split("\n")
+             gen_text = [self.frontend.text2phn(x + ". ").replace("(cmn)", "(zh)").split("|") for x in gen_text]
+
+         elif self.frontend.dtype == "char":
+             src_lang, ref_text = self.frontend.text2norm(ref_text + ". ")
+             ref_text = ["(" + src_lang.replace("cmn", "zh") + ")"] + list(ref_text)
+             gen_text = gen_text.split("\n")
+             gen_text = [self.frontend.text2norm(x + ". ") for x in gen_text]
+             gen_text = [["(" + x[0].replace("cmn", "zh") + ")"] + list(x[1]) for x in gen_text]
+         print("after frontend:\n", "ref_text:", ref_text, "\ngen_text:", gen_text)
+
+         if separate_langs:
+             ref_text = self.process_phone_list(ref_text)  # Optional
+             gen_text = [self.process_phone_list(x) for x in gen_text]
+
+         print("gen_text:", gen_text, "\nref_text:", ref_text)
+
+         wav, sr, spec = infer_process(
+             ref_file,
+             ref_text,
+             gen_text,
+             self.ema_model,
+             self.vocoder,
+             self.mel_spec_type,
+             show_info=show_info,
+             progress=progress,
+             target_rms=target_rms,
+             cross_fade_duration=cross_fade_duration,
+             nfe_step=nfe_step,
+             cfg_strength=cfg_strength,
+             sway_sampling_coef=sway_sampling_coef,
+             use_prosody_encoder=use_prosody_encoder,
+             use_acc_grl=use_acc_grl,
+             ref_ratio=ref_ratio,
+             no_ref_audio=no_ref_audio,
+             speed=speed,
+             fix_duration=fix_duration,
+             device=self.device,
+         )
+
+         if file_wave is not None:
+             self.export_wav(wav, file_wave, remove_silence=False)
+
+         if file_spec is not None:
+             self.export_spectrogram(spec, file_spec)
+
+         return wav, sr, spec
+
+     def process_phone_list(self, parts):
+         """(vocab756 version) Process a phone list: prefix every phone that carries
+         no language id with the currently active language id."""
+         puncs = {"#1", "#2", "#3", "#4", "_", "!", ",", ".", "?", '"', "'", "^", "。", ",", "?", "!"}
+         # parts = phn_str.split('|')
+         processed = []
+         current_lang = ""
+         for i in range(len(parts)):
+             part = parts[i]
+             if part.startswith("(") and part.endswith(")") and part[1:-1] in self.langs:
+                 # This token is a language id
+                 current_lang = part
+                 # processed.append(part)
+             elif part in puncs:  # alternative check: not bool(regex.search(r'\p{L}', part[0])), i.e. non-letter characters
+                 # Pause symbol or punctuation
+                 if len(processed) > 0 and processed[-1] == "_":
+                     processed.pop()
+                 elif len(processed) > 0 and processed[-1] in puncs and part == "_":
+                     continue
+                 processed.append(part)
+                 # if i < len(parts) - 1 and parts[i+1] != "_":
+                 #     processed.append("_")
+             elif current_lang is not None:
+                 # Not a language id and a language id is active: add the prefix
+                 processed.append(f"{current_lang}{part}")
+         return processed
+
+
+ if __name__ == "__main__":
+     f5tts = TTS()
+
+     wav, sr, spec = f5tts.infer(
+         ref_file=str((THIS_FILE.parent / "infer" / "examples" / "basic" / "basic_ref_en.wav").resolve()),
+         ref_text="some call me nature, others call me mother nature.",
+         gen_text=(
+             "I don't really care what you call me. I've been a silent spectator, watching species evolve, "
+             "empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture "
+             "you; ignore me and you shall face the consequences."
+         ),
+         file_wave=str((REPO_ROOT / "outputs" / "api_out.wav").resolve()),
+         file_spec=str((REPO_ROOT / "outputs" / "api_out.png").resolve()),
+         seed=None,
+     )
+
+     print("seed :", f5tts.seed)
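As a worked example of `process_phone_list` above (the phone spellings are invented for illustration; only the `(lang)` prefixing and pause handling mirror the code):

```python
# Hypothetical frontend output; "(en)" / "(zh)" are language-id tokens.
parts = ["(en)", "h", "ə", "l", "oʊ", "_", "#1", "(zh)", "n", "i3"]

# TTS.process_phone_list(parts) prefixes each plain phone with the active
# language id, drops the language-id tokens themselves, and pops the "_"
# that directly precedes the "#1" pause marker:
# ["(en)h", "(en)ə", "(en)l", "(en)oʊ", "#1", "(zh)n", "(zh)i3"]
```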
lemas_tts/configs/multilingual_grl.yaml ADDED
@@ -0,0 +1,78 @@
+ # compute_environment: LOCAL_MACHINE
+ # debug: false
+ # distributed_type: MULTI_GPU
+ # downcast_bf16: 'no'
+ # enable_cpu_affinity: true
+ # gpu_ids: all
+ # # machine_rank: 0
+ # # main_training_function: main
+ # mixed_precision: bf16
+ # num_machines: 1
+ # num_processes: 16
+ # # rdzv_backend: static
+ # same_network: true
+ # use_cpu: false
+
+
+ hydra:
+   run:
+     dir: exp/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+ datasets:
+   name: multilingual_vocab898_acc_grl_ctc_fix  # dataset name
+   batch_size_per_gpu: 40000  # frame budget per GPU; with 8 GPUs, 8 * 40000 = 320000 frames per step
+   batch_size_type: frame  # frame | sample
+   max_samples: 64  # max sequences per batch when using frame-wise batch_size; 32 for small models, 64 for base models
+   num_workers: 2
+   separate_langs: True
+
+ optim:
+   epochs: 100
+   learning_rate: 2e-5
+   num_warmup_updates: 1000  # warmup updates
+   grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+   max_grad_norm: 1.0  # gradient clipping
+   bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+ model:
+   name: multilingual  # model name
+   tokenizer: custom  # tokenizer type
+   tokenizer_path: "pretrained_models/data/multilingual_grl/vocab.txt"  # for the 'custom' tokenizer, path to the vocab.txt to use
+   audio_dir: "pretrained_models/data/multilingual_grl"
+   use_ctc_loss: True  # whether to use ctc loss
+   use_spk_enc: False
+   use_prosody_encoder: False
+   prosody_cfg_path: "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json"  # pretssel_cfg.json
+   prosody_ckpt_path: "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt"  # prosody_encoder_pretssel.pt
+
+   backbone: DiT
+   arch:
+     dim: 1024
+     depth: 22
+     heads: 16
+     ff_mult: 2
+     text_dim: 512
+     text_mask_padding: True
+     qk_norm: null  # null | rms_norm
+     conv_layers: 4
+     pe_attn_head: null
+     checkpoint_activations: False  # recompute activations to save memory, at the cost of extra compute
+   mel_spec:
+     target_sample_rate: 24000
+     n_mel_channels: 100
+     hop_length: 256
+     win_length: 1024
+     n_fft: 1024
+     mel_spec_type: vocos  # vocos | bigvgan
+   vocoder:
+     is_local: True  # use local offline ckpt or not
+     # Path in the original training environment; kept here for reference only.
+     # For the open-sourced LEMAS-TTS repo, use `pretrained_models/ckpts/vocos-mel-24khz`.
+     local_path: "pretrained_models/ckpts/vocos-mel-24khz"  # local vocoder path
+
+ ckpts:
+   logger: tensorboard  # wandb | tensorboard | null
+   log_samples: True  # infer a random sample per saved checkpoint; wip, normal to fail with extra-long samples
+   save_per_updates: 1000  # save a checkpoint every N updates
+   keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+   last_per_updates: 1000  # save the "last" checkpoint every N updates
+   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
lemas_tts/configs/multilingual_prosody.yaml ADDED
@@ -0,0 +1,78 @@
+ # compute_environment: LOCAL_MACHINE
+ # debug: false
+ # distributed_type: MULTI_GPU
+ # downcast_bf16: 'no'
+ # enable_cpu_affinity: true
+ # gpu_ids: all
+ # # machine_rank: 0
+ # # main_training_function: main
+ # mixed_precision: bf16
+ # num_machines: 1
+ # num_processes: 16
+ # # rdzv_backend: static
+ # same_network: true
+ # use_cpu: false
+
+
+ hydra:
+   run:
+     dir: exp/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+ datasets:
+   name: multilingual_vocab898_acc_grl_prosody_ctc_fix  # dataset name
+   batch_size_per_gpu: 40000  # frame budget per GPU; with 8 GPUs, 8 * 40000 = 320000 frames per step
+   batch_size_type: frame  # frame | sample
+   max_samples: 64  # max sequences per batch when using frame-wise batch_size; 32 for small models, 64 for base models
+   num_workers: 2
+   separate_langs: True
+
+ optim:
+   epochs: 100
+   learning_rate: 2e-5
+   num_warmup_updates: 1000  # warmup updates
+   grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+   max_grad_norm: 1.0  # gradient clipping
+   bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+ model:
+   name: multilingual  # model name
+   tokenizer: custom  # tokenizer type
+   tokenizer_path: "pretrained_models/data/multilingual_grl/vocab.txt"  # for the 'custom' tokenizer, path to the vocab.txt to use
+   audio_dir: "pretrained_models/data/multilingual_grl"
+   use_ctc_loss: True  # whether to use ctc loss
+   use_spk_enc: False
+   use_prosody_encoder: True
+   prosody_cfg_path: "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json"  # pretssel_cfg.json
+   prosody_ckpt_path: "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt"  # prosody_encoder_pretssel.pt
+
+   backbone: DiT
+   arch:
+     dim: 1024
+     depth: 22
+     heads: 16
+     ff_mult: 2
+     text_dim: 512
+     text_mask_padding: True
+     qk_norm: null  # null | rms_norm
+     conv_layers: 4
+     pe_attn_head: null
+     checkpoint_activations: False  # recompute activations to save memory, at the cost of extra compute
+   mel_spec:
+     target_sample_rate: 24000
+     n_mel_channels: 100
+     hop_length: 256
+     win_length: 1024
+     n_fft: 1024
+     mel_spec_type: vocos  # vocos | bigvgan
+   vocoder:
+     is_local: True  # use local offline ckpt or not
+     # Path in the original training environment; kept here for reference only.
+     # For the open-sourced LEMAS-TTS repo, use `pretrained_models/ckpts/vocos-mel-24khz`.
+     local_path: "pretrained_models/ckpts/vocos-mel-24khz"  # local vocoder path
+
+ ckpts:
+   logger: tensorboard  # wandb | tensorboard | null
+   log_samples: True  # infer a random sample per saved checkpoint; wip, normal to fail with extra-long samples
+   save_per_updates: 1000  # save a checkpoint every N updates
+   keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+   last_per_updates: 1000  # save the "last" checkpoint every N updates
+   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
lemas_tts/infer/frontend.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, regex
2
+ import langid
3
+ import uroman as ur
4
+ import jieba, zhconv
5
+ from num2words import num2words
6
+
7
+ jieba.set_dictionary(dictionary_path=os.path.join(os.path.dirname(__file__) + "/../infer/text_norm/jieba_dict.txt"))
8
+ # from pypinyin.core import Pinyin
9
+ from pypinyin import pinyin, lazy_pinyin, Style
10
+
11
+ from .text_norm.txt2pinyin import _PAUSE_SYMBOL, get_phoneme_from_char_and_pinyin
12
+ from .text_norm.cn_tn import NSWNormalizer
13
+ from .text_norm.tokenizer import TextTokenizer, txt2phone
14
+ from pypinyin.contrib.tone_convert import to_initials, to_finals_tone3
15
+ from pypinyin_dict.phrase_pinyin_data import large_pinyin # large_pinyin # cc_cedict
16
+ large_pinyin.load()
17
+
18
+ class TextNorm():
19
+ def __init__(self, dtype="phone"):
20
+ # my_pinyin = Pinyin(MyConverter())
21
+ # self.pinyin_parser = my_pinyin.pinyin
22
+ cmn_lexicon = open(os.path.join(os.path.dirname(__file__)+'/../infer/text_norm/pinyin-lexicon-r.txt'),'r', encoding="utf-8").readlines()
23
+ cmn_lexicon = [x.strip().split() for x in cmn_lexicon]
24
+ self.cmn_dict = {x[0]:x[1:] for x in cmn_lexicon}
25
+ langid.set_languages(['es','pt','zh','en','de','fr','it','ru', 'vi','id','th','ja','ko','ar'])
26
+ langs = {"en":"en-us", "it":"it", "es":"es", "pt":"pt-br", "fr":"fr-fr", "de":"de", "ru":"ru", "vi":"vi", "id":"id", "th":"th", "ja":"ja", "ko":"ko"} # "zh":"cmn", "cmn":"cmn", "ar":"ar-sa"}
27
+ text_tokenizer = {}
28
+ for k,v in langs.items():
29
+ tokenizer = TextTokenizer(language=v, backend="espeak")
30
+ lang = "zh" if k == "cmn" else k
31
+ text_tokenizer[k] = (lang, tokenizer)
32
+ self.text_tokenizer = text_tokenizer
33
+ self.cn_tn = NSWNormalizer()
34
+ self.dtype = dtype
35
+
36
+ def detect_lang(self, text):
37
+ lang, _ = langid.classify(text)[0]
38
+ return lang
39
+
40
+ def sil_type(self, time_s):
41
+ if round(time_s) < 0.4:
42
+ return ""
43
+ elif round(time_s) >= 0.4 and round(time_s) < 0.8:
44
+ return "#1"
45
+ elif round(time_s) >= 0.8 and round(time_s) < 1.5:
46
+ return "#2"
47
+ elif round(time_s) >= 1.5 and round(time_s) < 3.0:
48
+ return "#3"
49
+ elif round(time_s) >= 3.0:
50
+ return "#4"
51
+
52
+
53
+ def add_sil_raw(self, sub_list, start_time, end_time, target_transcript):
54
+ txt = []
55
+ txt_list = [x["word"] for x in sub_list]
56
+ sil = self.sil_type(sub_list[0]["start"])
57
+ if len(sil) > 0:
58
+ txt.append(sil)
59
+ txt.append(txt_list[0])
60
+ for i in range(1, len(sub_list)):
61
+ if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
62
+ txt.append(target_transcript)
63
+ target_transcript = ""
64
+ else:
65
+ sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
66
+ if len(sil) > 0:
67
+ txt.append(sil)
68
+ txt.append(txt_list[i])
69
+ return ' '.join(txt)
70
+
71
+ def add_sil(self, sub_list, start_time, end_time, target_transcript, src_lang, tar_lang):
72
+ txts = []
73
+ txt_list = [x["word"] for x in sub_list]
74
+ sil = self.sil_type(sub_list[0]["start"])
75
+ if len(sil) > 0:
76
+ txts.append([src_lang, sil])
77
+
78
+ if sub_list[0]["start"] < start_time:
79
+ txts.append([src_lang, txt_list[0]])
80
+ for i in range(1, len(sub_list)):
81
+ if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
82
+ txts.append([tar_lang, target_transcript])
83
+ target_transcript = ""
84
+ else:
85
+ sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
86
+ if len(sil) > 0:
87
+ txts.append([src_lang, sil])
88
+ txts.append([src_lang, txt_list[i]])
89
+
90
+ target_txt = [txts[0]]
91
+ for txt in txts[1:]:
92
+ if txt[1] == "":
93
+ continue
94
+ if txt[0] != target_txt[-1][0]:
95
+ target_txt.append([txt[0], ""])
96
+ target_txt[-1][-1] += " " + txt[1]
97
+
98
+ return target_txt
99
+
100
+ def replace_numbers_with_words(self, sentence, lang="en"):
101
+ sentence = re.sub(r'(\d+)', r' \1 ', sentence) # add spaces around numbers
102
+
103
+ def replace_with_words(match):
104
+ num = match.group(0)
105
+ try:
106
+ return num2words(num, lang=lang) # Convert numbers to words
107
+ except:
108
+ return num # In case num2words fails (unlikely with digits but just to be safe)
109
+ return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
110
+
111
+
112
+ def get_prompt(self, sub_list, start_time, end_time, src_lang):
113
+ txts = []
114
+ txt_list = [x["word"] for x in sub_list]
115
+
116
+ if start_time <= sub_list[0]["start"]:
117
+ sil = self.sil_type(sub_list[0]["start"])
118
+ if len(sil) > 0:
119
+ txts.append([src_lang, sil])
120
+ txts.append([src_lang, txt_list[0]])
121
+
122
+ for i in range(1, len(sub_list)):
123
+ # if sub_list[i]["start"] <= start_time and sub_list[i]["end"] <= end_time:
124
+ # txts.append([tar_lang, target_transcript])
125
+ # target_transcript = ""
126
+ if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
127
+ sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
128
+ if len(sil) > 0:
129
+ txts.append([src_lang, sil])
130
+ txts.append([src_lang, txt_list[i]])
131
+
132
+ target_txt = [txts[0]]
133
+ for txt in txts[1:]:
134
+ if txt[1] == "":
135
+ continue
136
+ if txt[0] != target_txt[-1][0]:
137
+ target_txt.append([txt[0], ""])
138
+ target_txt[-1][-1] += " " + txt[1]
139
+ return target_txt
140
+
141
+
142
+ def txt2pinyin(self, text):
143
+ txts, phonemes = [], []
144
+ texts = re.split(r"(#\d)", text)
145
+ print("before norm: ", texts)
146
+ for text in texts:
147
+ if text in {'#1', '#2', '#3', '#4'}:
148
+ txts.append(text)
149
+ phonemes.append(text)
150
+ continue
151
+ text = self.cn_tn.normalize(text.strip())
152
+
153
+ text_list = list(jieba.cut(text))
154
+ print("jieba cut: ", text, text_list)
155
+ for words in text_list:
156
+ if words in _PAUSE_SYMBOL:
157
+ # phonemes[-1] += _PAUSE_SYMBOL[words]
158
+ phonemes.append(_PAUSE_SYMBOL[words])
159
+ # phonemes.append('#1')
160
+ txts[-1] += words
161
+ elif re.search("[\u4e00-\u9fa5]+", words):
162
+ # pinyin = self.pinyin_parser(words, style=Style.TONE3, errors="ignore")
163
+ pinyin = lazy_pinyin(words, style=Style.TONE3, tone_sandhi=True, neutral_tone_with_five=True)
164
+ new_pinyin = []
165
+ for x in pinyin:
166
+ x = "".join(x)
167
+ if "#" not in x:
168
+ new_pinyin.append(x)
169
+ else:
170
+ phonemes.append(words)
171
+ continue
172
+ # new_pinyin = change_tone_in_bu_or_yi(words, new_pinyin) if len(words)>1 and words[-1] not in {"一","不"} else new_pinyin
173
+ phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)
174
+ phonemes += phoneme
175
+ txts += list(words)
176
+ elif re.search(r"[a-zA-Z]", words) or re.search(r"#[1-4]", words):
177
+ phonemes.append(words.upper())
178
+ txts.append(words.upper())
179
+ # phonemes.append("#1")
180
+ # phones = " ".join(phonemes)
181
+ return txts, phonemes
182
+
183
+
184
+ def txt2pin_phns(self, text):
185
+ text = re.sub(r'(?<! )(' + r'[^\w\s]' + r')', r' \1', text)
186
+ text = re.sub(r'\s+', ' ', text).strip()
187
+
188
+ # print(text.split(" "))
189
+ res_list = []
190
+ for txt in text.split(" "):
191
+ if txt in self.cmn_dict:
192
+ # res_list += ["(zh)" + x for x in self.cmn_dict[txt]]
193
+ res_list.append("(zh)")
194
+ res_list.append(to_initials(txt, strict=False))
195
+ res_list.append(to_finals_tone3(txt, neutral_tone_with_five=True))
196
+ elif txt == '':
197
+ continue
198
+ elif txt[0] in {"#1", "#2", "#3", "#4"} or not bool(regex.search(r'\p{L}', txt[0][0])):
199
+ if len(res_list) > 0 and res_list[-1] == "_":
200
+ res_list.pop()
201
+ res_list += [txt]
202
+ continue
203
+ else:
204
+ if len(res_list) > 0 and res_list[-1] == "_":
205
+ res_list.pop()
206
+ lang = langid.classify(txt)[0]
207
+ lang = lang if lang in self.text_tokenizer else "en"
208
+ tokenizer = self.text_tokenizer[lang][1]
209
+ ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator, strip=True, njobs=1)
210
+ phns = ipa[0] if ipa[0][0] == "(" else f"({lang})_" + ipa[0]
211
+ res_list += phns.replace("_", "|_|").split("|")
212
+
213
+ # lang = phns.split(")")[0][1:]
214
+ # phns = phns[len(lang)+3:].replace("_", "|_|")
215
+ # phns = phns.split("|")
216
+ # for i in range(len(phns)):
217
+ # if phns[i] not in {"#1", "#2", "#3", "#4", "_", ",", ".", "?", "!"}:
218
+ # phns[i] = f"({lang})" + phns[i]
219
+ # res_list += phns
220
+ res_list.append("_")
221
+ res = "|".join(res_list)
222
+ res = re.sub(r'(\|_)+', '|_', res)
223
+ return res
224
+
225
+
226
+ def text2phn(self, sentence, lang=None):
227
+ if not lang:
228
+ lang = langid.classify(sentence)[0]
229
+ if re.search("[\u4e00-\u9fa5]+", sentence):
230
+ txts, phones = self.txt2pinyin(sentence)
231
+ transcript_norm = " ".join(phones)
232
+ phones = self.txt2pin_phns(transcript_norm) # IPA mix Pinyin
233
+ else:
234
+ transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
235
+ transcript_norm = sentence
236
+ # All IPA
237
+ phones = txt2phone(self.text_tokenizer[lang][1], transcript_norm.strip().replace(".", ",").replace("。", ","))
238
+ phones = f"({lang})|" + phones if phones[0] != "(" else phones
239
+ return phones
240
+
241
+
242
+ def text2norm(self, sentence, lang=None):
243
+ if not lang:
244
+ lang = langid.classify(sentence)[0]
245
+ if re.search("[\u4e00-\u9fa5]+", sentence):
246
+ txts, phones = self.txt2pinyin(sentence)
247
+ transcript_norm = " ".join(phones)
248
+ else:
249
+ transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
250
+ transcript_norm = sentence
251
+ return (lang, transcript_norm)
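
A note on the pinyin path above: `txt2pinyin` relies on jieba word segmentation plus pypinyin's sandhi-aware conversion. A minimal standalone sketch of that core call (the `demo_g2p` wrapper is illustrative, not part of the package API):

```python
import jieba
from pypinyin import lazy_pinyin, Style

def demo_g2p(text):
    """Segment with jieba, then convert each word with tone-sandhi-aware pypinyin."""
    pinyins = []
    for word in jieba.cut(text):
        # tone_sandhi=True applies rules such as 3-3 -> 2-3 ("你好" -> ni2 hao3);
        # neutral_tone_with_five=True writes the neutral tone explicitly as tone 5.
        pinyins += lazy_pinyin(word, style=Style.TONE3,
                               tone_sandhi=True, neutral_tone_with_five=True)
    return pinyins

print(demo_g2p("你好,世界"))  # e.g. ['ni2', 'hao3', ',', 'shi4', 'jie4']
```
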
lemas_tts/infer/infer_cli.py ADDED
@@ -0,0 +1,386 @@
1
+ import argparse
2
+ import codecs
3
+ import os
4
+ import re
5
+ from datetime import datetime
6
+ from importlib.resources import files
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import tomli
12
+ from cached_path import cached_path
13
+ from hydra.utils import get_class
14
+ from omegaconf import OmegaConf
15
+
16
+ from lemas_tts.infer.utils_infer import (
17
+ mel_spec_type,
18
+ target_rms,
19
+ cross_fade_duration,
20
+ nfe_step,
21
+ cfg_strength,
22
+ sway_sampling_coef,
23
+ speed,
24
+ fix_duration,
25
+ device,
26
+ infer_process,
27
+ load_model,
28
+ load_vocoder,
29
+ preprocess_ref_audio_text,
30
+ remove_silence_for_generated_wav,
31
+ )
32
+
33
+ THIS_FILE = Path(__file__).resolve()
34
+
35
+
36
+ def _find_repo_root(start: Path) -> Path:
37
+ """Locate the repo root by looking for a `pretrained_models` folder upwards."""
38
+ for p in [start, *start.parents]:
39
+ if (p / "pretrained_models").is_dir():
40
+ return p
41
+ cwd = Path.cwd()
42
+ if (cwd / "pretrained_models").is_dir():
43
+ return cwd
44
+ return start
45
+
46
+
47
+ REPO_ROOT = _find_repo_root(THIS_FILE)
48
+ PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"
49
+ CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"
50
+
51
+
52
+ parser = argparse.ArgumentParser(
53
+ prog="python3 infer-cli.py",
54
+ description="Command-line interface for E2/F5 TTS with advanced batch processing.",
55
+ epilog="Specify options above to override one or more settings from config.",
56
+ )
57
+ parser.add_argument(
58
+ "-c",
59
+ "--config",
60
+ type=str,
61
+ default=os.path.join(files("lemas_tts").joinpath("infer/examples/basic"), "basic.toml"),
62
+ help="The configuration file, default see infer/examples/basic/basic.toml",
63
+ )
64
+
65
+
66
+ # Note: no default values provided here, so that defaults are read from the config file
67
+
68
+ parser.add_argument(
69
+ "-m",
70
+ "--model",
71
+ type=str,
72
+ help="The model name: F5TTS_v1_Base | F5TTS_Base | E2TTS_Base | etc.",
73
+ )
74
+ parser.add_argument(
75
+ "-mc",
76
+ "--model_cfg",
77
+ type=str,
78
+ help="The path to F5-TTS model config file .yaml",
79
+ )
80
+ parser.add_argument(
81
+ "-p",
82
+ "--ckpt_file",
83
+ type=str,
84
+ help="The path to model checkpoint .pt, leave blank to use default",
85
+ )
86
+ parser.add_argument(
87
+ "-v",
88
+ "--vocab_file",
89
+ type=str,
90
+ help="The path to vocab file .txt, leave blank to use default",
91
+ )
92
+ parser.add_argument(
93
+ "-r",
94
+ "--ref_audio",
95
+ type=str,
96
+ help="The reference audio file.",
97
+ )
98
+ parser.add_argument(
99
+ "-s",
100
+ "--ref_text",
101
+ type=str,
102
+ help="The transcript/subtitle for the reference audio",
103
+ )
104
+ parser.add_argument(
105
+ "-t",
106
+ "--gen_text",
107
+ type=str,
108
+ help="The text to make model synthesize a speech",
109
+ )
110
+ parser.add_argument(
111
+ "-f",
112
+ "--gen_file",
113
+ type=str,
114
+ help="The file with text to generate, will ignore --gen_text",
115
+ )
116
+ parser.add_argument(
117
+ "-o",
118
+ "--output_dir",
119
+ type=str,
120
+ help="The path to output folder",
121
+ )
122
+ parser.add_argument(
123
+ "-w",
124
+ "--output_file",
125
+ type=str,
126
+ help="The name of output file",
127
+ )
128
+ parser.add_argument(
129
+ "--save_chunk",
130
+ action="store_true",
131
+ help="To save each audio chunks during inference",
132
+ )
133
+ parser.add_argument(
134
+ "--remove_silence",
135
+ action="store_true",
136
+ help="To remove long silence found in output",
137
+ )
138
+ parser.add_argument(
139
+ "--load_vocoder_from_local",
140
+ action="store_true",
141
+ help="To load vocoder from local dir, default to ../checkpoints/vocos-mel-24khz",
142
+ )
143
+ parser.add_argument(
144
+ "--vocoder_name",
145
+ type=str,
146
+ choices=["vocos", "bigvgan"],
147
+ help=f"Used vocoder name: vocos | bigvgan, default {mel_spec_type}",
148
+ )
149
+ parser.add_argument(
150
+ "--target_rms",
151
+ type=float,
152
+ help=f"Target output speech loudness normalization value, default {target_rms}",
153
+ )
154
+ parser.add_argument(
155
+ "--cross_fade_duration",
156
+ type=float,
157
+ help=f"Duration of cross-fade between audio segments in seconds, default {cross_fade_duration}",
158
+ )
159
+ parser.add_argument(
160
+ "--nfe_step",
161
+ type=int,
162
+ help=f"The number of function evaluations (denoising steps), default {nfe_step}",
163
+ )
164
+ parser.add_argument(
165
+ "--cfg_strength",
166
+ type=float,
167
+ help=f"Classifier-free guidance strength, default {cfg_strength}",
168
+ )
169
+ parser.add_argument(
170
+ "--sway_sampling_coef",
171
+ type=float,
172
+ help=f"Sway Sampling coefficient, default {sway_sampling_coef}",
173
+ )
174
+ parser.add_argument(
175
+ "--speed",
176
+ type=float,
177
+ help=f"The speed of the generated audio, default {speed}",
178
+ )
179
+ parser.add_argument(
180
+ "--fix_duration",
181
+ type=float,
182
+ help=f"Fix the total duration (ref and gen audios) in seconds, default {fix_duration}",
183
+ )
184
+ parser.add_argument(
185
+ "--device",
186
+ type=str,
187
+ help="Specify the device to run on",
188
+ )
189
+ args = parser.parse_args()
190
+
191
+
192
+ # config file
193
+
194
+ config = tomli.load(open(args.config, "rb"))
195
+
196
+
197
+ # command-line interface parameters
198
+
199
+ model = args.model or config.get("model", "F5TTS_v1_Base")
200
+ ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
201
+ vocab_file = args.vocab_file or config.get("vocab_file", "")
202
+
203
+ ref_audio = args.ref_audio or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
204
+ ref_text = (
205
+ args.ref_text
206
+ if args.ref_text is not None
207
+ else config.get("ref_text", "Some call me nature, others call me mother nature.")
208
+ )
209
+ gen_text = args.gen_text or config.get("gen_text", "Here we generate something just for a test.")
210
+ gen_file = args.gen_file or config.get("gen_file", "")
211
+
212
+ output_dir = args.output_dir or config.get("output_dir", "tests")
213
+ output_file = args.output_file or config.get(
214
+ "output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav"
215
+ )
216
+
217
+ save_chunk = args.save_chunk or config.get("save_chunk", False)
218
+ remove_silence = args.remove_silence or config.get("remove_silence", False)
219
+ load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
220
+
221
+ vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
222
+ target_rms = args.target_rms or config.get("target_rms", target_rms)
223
+ cross_fade_duration = args.cross_fade_duration or config.get("cross_fade_duration", cross_fade_duration)
224
+ nfe_step = args.nfe_step or config.get("nfe_step", nfe_step)
225
+ cfg_strength = args.cfg_strength or config.get("cfg_strength", cfg_strength)
226
+ sway_sampling_coef = args.sway_sampling_coef or config.get("sway_sampling_coef", sway_sampling_coef)
227
+ speed = args.speed or config.get("speed", speed)
228
+ fix_duration = args.fix_duration or config.get("fix_duration", fix_duration)
229
+ device = args.device or config.get("device", device)
230
+
231
+
232
+ # patches for pip pkg user
233
+ if "infer/examples/" in ref_audio:
234
+ ref_audio = str(files("lemas_tts").joinpath(f"{ref_audio}"))
235
+ if "infer/examples/" in gen_file:
236
+ gen_file = str(files("lemas_tts").joinpath(f"{gen_file}"))
237
+ if "voices" in config:
238
+ for voice in config["voices"]:
239
+ voice_ref_audio = config["voices"][voice]["ref_audio"]
240
+ if "infer/examples/" in voice_ref_audio:
241
+ config["voices"][voice]["ref_audio"] = str(files("lemas_tts").joinpath(f"{voice_ref_audio}"))
242
+
243
+
244
+ # ignore gen_text if gen_file provided
245
+
246
+ if gen_file:
247
+ gen_text = codecs.open(gen_file, "r", "utf-8").read()
248
+
249
+
250
+ # output path
251
+
252
+ wave_path = Path(output_dir) / output_file
253
+ # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
254
+ if save_chunk:
255
+ output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
256
+ if not os.path.exists(output_chunk_dir):
257
+ os.makedirs(output_chunk_dir)
258
+
259
+
260
+ # load vocoder
261
+
262
+ if vocoder_name == "vocos":
263
+ vocoder_local_path = str(CKPTS_ROOT / "vocos-mel-24khz")
264
+ elif vocoder_name == "bigvgan":
265
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
266
+
267
+ vocoder = load_vocoder(
268
+ vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path, device=device
269
+ )
270
+
271
+
272
+ # load TTS model
273
+
274
+ model_cfg = OmegaConf.load(
275
+ args.model_cfg or config.get("model_cfg", str(files("lemas_tts").joinpath(f"configs/{model}.yaml")))
276
+ )
277
+ model_cls = get_class(f"lemas_tts.model.{model_cfg.model.backbone}")
278
+ model_arc = model_cfg.model.arch
279
+
280
+ repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"
281
+
282
+ if model != "F5TTS_Base":
283
+ assert vocoder_name == model_cfg.model.mel_spec.mel_spec_type
284
+
285
+ # override for previous models
286
+ if model == "F5TTS_Base":
287
+ if vocoder_name == "vocos":
288
+ ckpt_step = 1200000
289
+ elif vocoder_name == "bigvgan":
290
+ model = "F5TTS_Base_bigvgan"
291
+ ckpt_type = "pt"
292
+ elif model == "E2TTS_Base":
293
+ repo_name = "E2-TTS"
294
+ ckpt_step = 1200000
295
+
296
+ if not ckpt_file:
297
+ ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}"))
298
+
299
+ print(f"Using {model}...")
300
+ ema_model = load_model(
301
+ model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
302
+ )
303
+
304
+
305
+ # inference process
306
+
307
+
308
+ def main():
309
+ main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
310
+ if "voices" not in config:
311
+ voices = {"main": main_voice}
312
+ else:
313
+ voices = config["voices"]
314
+ voices["main"] = main_voice
315
+ for voice in voices:
316
+ print("Voice:", voice)
317
+ print("ref_audio ", voices[voice]["ref_audio"])
318
+ voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
319
+ voices[voice]["ref_audio"], voices[voice]["ref_text"]
320
+ )
321
+ print("ref_audio_", voices[voice]["ref_audio"], "\n\n")
322
+
323
+ generated_audio_segments = []
324
+ reg1 = r"(?=\[\w+\])"
325
+ chunks = re.split(reg1, gen_text)
326
+ reg2 = r"\[(\w+)\]"
327
+ for text in chunks:
328
+ if not text.strip():
329
+ continue
330
+ match = re.match(reg2, text)
331
+ if match:
332
+ voice = match[1]
333
+ else:
334
+ print("No voice tag found, using main.")
335
+ voice = "main"
336
+ if voice not in voices:
337
+ print(f"Voice {voice} not found, using main.")
338
+ voice = "main"
339
+ text = re.sub(reg2, "", text)
340
+ ref_audio_ = voices[voice]["ref_audio"]
341
+ ref_text_ = voices[voice]["ref_text"]
342
+ gen_text_ = text.strip()
343
+ print(f"Voice: {voice}")
344
+ audio_segment, final_sample_rate, spectrogram = infer_process(
345
+ ref_audio_,
346
+ ref_text_,
347
+ gen_text_,
348
+ ema_model,
349
+ vocoder,
350
+ mel_spec_type=vocoder_name,
351
+ target_rms=target_rms,
352
+ cross_fade_duration=cross_fade_duration,
353
+ nfe_step=nfe_step,
354
+ cfg_strength=cfg_strength,
355
+ sway_sampling_coef=sway_sampling_coef,
356
+ speed=speed,
357
+ fix_duration=fix_duration,
358
+ device=device,
359
+ )
360
+ generated_audio_segments.append(audio_segment)
361
+
362
+ if save_chunk:
363
+ if len(gen_text_) > 200:
364
+ gen_text_ = gen_text_[:200] + " ... "
365
+ sf.write(
366
+ os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
367
+ audio_segment,
368
+ final_sample_rate,
369
+ )
370
+
371
+ if generated_audio_segments:
372
+ final_wave = np.concatenate(generated_audio_segments)
373
+
374
+ if not os.path.exists(output_dir):
375
+ os.makedirs(output_dir)
376
+
377
+ with open(wave_path, "wb") as f:
378
+ sf.write(f.name, final_wave, final_sample_rate)
379
+ # Remove silence
380
+ if remove_silence:
381
+ remove_silence_for_generated_wav(f.name)
382
+ print(f.name)
383
+
384
+
385
+ if __name__ == "__main__":
386
+ main()
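
The `[voice]` handling in `main()` hinges on a zero-width lookahead split, so each tag stays attached to the chunk it introduces. A standalone sketch of that chunking:

```python
import re

gen_text = "[main] Hello there. [town] General Kenobi! [main] So uncivilized."
for chunk in re.split(r"(?=\[\w+\])", gen_text):  # split *before* each [tag]
    if not chunk.strip():
        continue
    m = re.match(r"\[(\w+)\]", chunk)
    voice = m[1] if m else "main"                 # untagged chunks use "main"
    text = re.sub(r"\[(\w+)\]", "", chunk).strip()
    print(voice, "->", text)
```
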
lemas_tts/infer/text_norm/__init__.py ADDED
File without changes
lemas_tts/infer/text_norm/cn_tn.py ADDED
@@ -0,0 +1,824 @@
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+ # Authors:
4
+ # 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
5
+ # 2019.9 Jiayu DU
6
+ #
7
+ # requirements:
8
+ # - python 3.X
9
+ # notes: python 2.X WILL fail or produce misleading results
10
+
11
+ import sys, os, argparse, codecs, string, re, unicodedata
12
+
13
+ # ================================================================================ #
14
+ # basic constant
15
+ # ================================================================================ #
16
+ CHINESE_DIGIS = u'零一二三四五六七八九'
17
+ BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
18
+ BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
19
+ SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
20
+ SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
21
+ LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
22
+ LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
23
+ SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
24
+ SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'
25
+
26
+ ZERO_ALT = u'〇'
27
+ ONE_ALT = u'幺'
28
+ TWO_ALTS = [u'两', u'兩']
29
+
30
+ POSITIVE = [u'正', u'正']
31
+ NEGATIVE = [u'负', u'負']
32
+ POINT = [u'点', u'點']
33
+ # PLUS = [u'加', u'加']
34
+ # SIL = [u'杠', u'槓']
35
+
36
+ # Chinese numbering-system types
37
+ NUMBERING_TYPES = ['low', 'mid', 'high']
38
+
39
+ CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
40
+ '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
41
+ CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
42
+ COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
43
+ '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
44
+ '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
45
+ '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
46
+ '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
47
+ '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
48
+
49
+ # punctuation information is based on the Zhon project (https://github.com/tsroten/zhon.git)
50
+ CHINESE_PUNC_STOP = '!?。。'
51
+ CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—''‛""„‟…‧﹏'
52
+ CHINESE_PUNC_OTHER = '·〈〉-'
53
+ CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER
54
+
55
+ # ================================================================================ #
56
+ # basic class
57
+ # ================================================================================ #
58
+ class ChineseChar(object):
59
+ """
60
+ A Chinese character.
61
+ Each character has a simplified and a traditional form,
62
+ e.g. simplified = '负', traditional = '負';
63
+ it can be converted to either form.
64
+ """
65
+
66
+ def __init__(self, simplified, traditional):
67
+ self.simplified = simplified
68
+ self.traditional = traditional
69
+ #self.__repr__ = self.__str__
70
+
71
+ def __str__(self):
72
+ return self.simplified or self.traditional or None
73
+
74
+ def __repr__(self):
75
+ return self.__str__()
76
+
77
+
78
+ class ChineseNumberUnit(ChineseChar):
79
+ """
80
+ A Chinese digit/unit character.
81
+ Besides the simplified and traditional forms, each character has an extra formal ("big") variant,
82
+ e.g. '陆' and '陸'
83
+ """
84
+
85
+ def __init__(self, power, simplified, traditional, big_s, big_t):
86
+ super(ChineseNumberUnit, self).__init__(simplified, traditional)
87
+ self.power = power
88
+ self.big_s = big_s
89
+ self.big_t = big_t
90
+
91
+ def __str__(self):
92
+ return '10^{}'.format(self.power)
93
+
94
+ @classmethod
95
+ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
96
+
97
+ if small_unit:
98
+ return ChineseNumberUnit(power=index + 1,
99
+ simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
100
+ elif numbering_type == NUMBERING_TYPES[0]:
101
+ return ChineseNumberUnit(power=index + 8,
102
+ simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
103
+ elif numbering_type == NUMBERING_TYPES[1]:
104
+ return ChineseNumberUnit(power=(index + 2) * 4,
105
+ simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
106
+ elif numbering_type == NUMBERING_TYPES[2]:
107
+ return ChineseNumberUnit(power=pow(2, index + 3),
108
+ simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
109
+ else:
110
+ raise ValueError(
111
+ 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
112
+
113
+
114
+ class ChineseNumberDigit(ChineseChar):
115
+ """
116
+ A Chinese digit character.
117
+ """
118
+
119
+ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
120
+ super(ChineseNumberDigit, self).__init__(simplified, traditional)
121
+ self.value = value
122
+ self.big_s = big_s
123
+ self.big_t = big_t
124
+ self.alt_s = alt_s
125
+ self.alt_t = alt_t
126
+
127
+ def __str__(self):
128
+ return str(self.value)
129
+
130
+ @classmethod
131
+ def create(cls, i, v):
132
+ return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
133
+
134
+
135
+ class ChineseMath(ChineseChar):
136
+ """
137
+ A Chinese math symbol character.
138
+ """
139
+
140
+ def __init__(self, simplified, traditional, symbol, expression=None):
141
+ super(ChineseMath, self).__init__(simplified, traditional)
142
+ self.symbol = symbol
143
+ self.expression = expression
144
+ self.big_s = simplified
145
+ self.big_t = traditional
146
+
147
+
148
+ CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
149
+
150
+
151
+ class NumberSystem(object):
152
+ """
153
+ The Chinese numbering system.
154
+ """
155
+ pass
156
+
157
+
158
+ class MathSymbol(object):
159
+ """
160
+ Math symbols used in the Chinese numbering system (traditional/simplified), e.g.
161
+ positive = ['正', '正']
162
+ negative = ['负', '負']
163
+ point = ['点', '點']
164
+ """
165
+
166
+ def __init__(self, positive, negative, point):
167
+ self.positive = positive
168
+ self.negative = negative
169
+ self.point = point
170
+
171
+ def __iter__(self):
172
+ for v in self.__dict__.values():
173
+ yield v
174
+
175
+
176
+ # class OtherSymbol(object):
177
+ # """
178
+ # 其他符号
179
+ # """
180
+ #
181
+ # def __init__(self, sil):
182
+ # self.sil = sil
183
+ #
184
+ # def __iter__(self):
185
+ # for v in self.__dict__.values():
186
+ # yield v
187
+
188
+
189
+ # ================================================================================ #
190
+ # basic utils
191
+ # ================================================================================ #
192
+ def create_system(numbering_type=NUMBERING_TYPES[1]):
193
+ """
194
+ Create the number system for the given numbering type (default: mid).
195
+ NUMBERING_TYPES = ['low', 'mid', 'high'] are the Chinese numbering-system types:
196
+ low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc.
197
+ mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
198
+ high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
199
+ Returns the corresponding number system.
200
+ """
201
+
202
+ # chinese number units of '亿' and larger
203
+ all_larger_units = zip(
204
+ LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
205
+ larger_units = [CNU.create(i, v, numbering_type, False)
206
+ for i, v in enumerate(all_larger_units)]
207
+ # chinese number units of '十, 百, 千, 万'
208
+ all_smaller_units = zip(
209
+ SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
210
+ smaller_units = [CNU.create(i, v, small_unit=True)
211
+ for i, v in enumerate(all_smaller_units)]
212
+ # digis
213
+ chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
214
+ BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
215
+ digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
216
+ digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
217
+ digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
218
+ digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
219
+
220
+ # symbols
221
+ positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
222
+ negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
223
+ point_cn = CM(POINT[0], POINT[1], '.', lambda x,
224
+ y: float(str(x) + '.' + str(y)))
225
+ # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
226
+ system = NumberSystem()
227
+ system.units = smaller_units + larger_units
228
+ system.digits = digits
229
+ system.math = MathSymbol(positive_cn, negative_cn, point_cn)
230
+ # system.symbols = OtherSymbol(sil_cn)
231
+ return system
232
+
233
+
234
+ def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
235
+
236
+ def get_symbol(char, system):
237
+ for u in system.units:
238
+ if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
239
+ return u
240
+ for d in system.digits:
241
+ if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
242
+ return d
243
+ for m in system.math:
244
+ if char in [m.traditional, m.simplified]:
245
+ return m
246
+
247
+ def string2symbols(chinese_string, system):
248
+ int_string, dec_string = chinese_string, ''
249
+ for p in [system.math.point.simplified, system.math.point.traditional]:
250
+ if p in chinese_string:
251
+ int_string, dec_string = chinese_string.split(p)
252
+ break
253
+ return [get_symbol(c, system) for c in int_string], \
254
+ [get_symbol(c, system) for c in dec_string]
255
+
256
+ def correct_symbols(integer_symbols, system):
257
+ """
258
+ 一百八 to 一百八十
259
+ 一亿一千三百万 to 一亿 一千万 三百万
260
+ """
261
+
262
+ if integer_symbols and isinstance(integer_symbols[0], CNU):
263
+ if integer_symbols[0].power == 1:
264
+ integer_symbols = [system.digits[1]] + integer_symbols
265
+
266
+ if len(integer_symbols) > 1:
267
+ if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
268
+ integer_symbols.append(
269
+ CNU(integer_symbols[-2].power - 1, None, None, None, None))
270
+
271
+ result = []
272
+ unit_count = 0
273
+ for s in integer_symbols:
274
+ if isinstance(s, CND):
275
+ result.append(s)
276
+ unit_count = 0
277
+ elif isinstance(s, CNU):
278
+ current_unit = CNU(s.power, None, None, None, None)
279
+ unit_count += 1
280
+
281
+ if unit_count == 1:
282
+ result.append(current_unit)
283
+ elif unit_count > 1:
284
+ for i in range(len(result)):
285
+ if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
286
+ result[-i - 1] = CNU(result[-i - 1].power +
287
+ current_unit.power, None, None, None, None)
288
+ return result
289
+
290
+ def compute_value(integer_symbols):
291
+ """
292
+ Compute the value.
293
+ When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
294
+ e.g. '两千万' = 2000 * 10000 not 2000 + 10000
295
+ """
296
+ value = [0]
297
+ last_power = 0
298
+ for s in integer_symbols:
299
+ if isinstance(s, CND):
300
+ value[-1] = s.value
301
+ elif isinstance(s, CNU):
302
+ value[-1] *= pow(10, s.power)
303
+ if s.power > last_power:
304
+ value[:-1] = list(map(lambda v: v *
305
+ pow(10, s.power), value[:-1]))
306
+ last_power = s.power
307
+ value.append(0)
308
+ return sum(value)
309
+
310
+ system = create_system(numbering_type)
311
+ int_part, dec_part = string2symbols(chinese_string, system)
312
+ int_part = correct_symbols(int_part, system)
313
+ int_str = str(compute_value(int_part))
314
+ dec_str = ''.join([str(d.value) for d in dec_part])
315
+ if dec_part:
316
+ return '{0}.{1}'.format(int_str, dec_str)
317
+ else:
318
+ return int_str
319
+
320
+
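
A quick round trip through `chn2num` above and `num2chn` below (a hypothetical session, assuming the module is importable under this package path):

```python
from lemas_tts.infer.text_norm.cn_tn import chn2num, num2chn

print(chn2num('两千万'))  # '20000000' (units multiply: 2000 * 10000)
print(chn2num('一百八'))  # '180' (correct_symbols promotes the trailing 八 to 八十)
print(num2chn('10260'))   # '一万零二百六十'
print(num2chn('1.5'))     # '一点五'
```
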
321
+ def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
322
+ traditional=False, alt_zero=False, alt_one=False, alt_two=True,
323
+ use_zeros=True, use_units=True):
324
+
325
+ def get_value(value_string, use_zeros=True):
326
+
327
+ striped_string = value_string.lstrip('0')
328
+
329
+ # record nothing if all zeros
330
+ if not striped_string:
331
+ return []
332
+
333
+ # record one digits
334
+ elif len(striped_string) == 1:
335
+ if use_zeros and len(value_string) != len(striped_string):
336
+ return [system.digits[0], system.digits[int(striped_string)]]
337
+ else:
338
+ return [system.digits[int(striped_string)]]
339
+
340
+ # recursively record multiple digits
341
+ else:
342
+ result_unit = next(u for u in reversed(
343
+ system.units) if u.power < len(striped_string))
344
+ result_string = value_string[:-result_unit.power]
345
+ return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])
346
+
347
+ system = create_system(numbering_type)
348
+
349
+ int_dec = number_string.split('.')
350
+ if len(int_dec) == 1:
351
+ int_string = int_dec[0]
352
+ dec_string = ""
353
+ elif len(int_dec) == 2:
354
+ int_string = int_dec[0]
355
+ dec_string = int_dec[1]
356
+ else:
357
+ raise ValueError(
358
+ "invalid input num string with more than one dot: {}".format(number_string))
359
+
360
+ if use_units and len(int_string) > 1:
361
+ result_symbols = get_value(int_string)
362
+ else:
363
+ result_symbols = [system.digits[int(c)] for c in int_string]
364
+ dec_symbols = [system.digits[int(c)] for c in dec_string]
365
+ if dec_string:
366
+ result_symbols += [system.math.point] + dec_symbols
367
+
368
+ if alt_two:
369
+ liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
370
+ system.digits[2].big_s, system.digits[2].big_t)
371
+ for i, v in enumerate(result_symbols):
372
+ if isinstance(v, CND) and v.value == 2:
373
+ next_symbol = result_symbols[i +
374
+ 1] if i < len(result_symbols) - 1 else None
375
+ previous_symbol = result_symbols[i - 1] if i > 0 else None
376
+ if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
377
+ if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
378
+ result_symbols[i] = liang
379
+
380
+ # if big is True, '两' will not be used and `alt_two` has no impact on output
381
+ if big:
382
+ attr_name = 'big_'
383
+ if traditional:
384
+ attr_name += 't'
385
+ else:
386
+ attr_name += 's'
387
+ else:
388
+ if traditional:
389
+ attr_name = 'traditional'
390
+ else:
391
+ attr_name = 'simplified'
392
+
393
+ result = ''.join([getattr(s, attr_name) for s in result_symbols])
394
+
395
+ # if not use_zeros:
396
+ # result = result.strip(getattr(system.digits[0], attr_name))
397
+
398
+ if alt_zero:
399
+ result = result.replace(
400
+ getattr(system.digits[0], attr_name), system.digits[0].alt_s)
401
+
402
+ if alt_one:
403
+ result = result.replace(
404
+ getattr(system.digits[1], attr_name), system.digits[1].alt_s)
405
+
406
+ for i, p in enumerate(POINT):
407
+ if result.startswith(p):
408
+ return CHINESE_DIGIS[0] + result
409
+
410
+ # ^10, 11, .., 19
411
+ if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
412
+ SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
413
+ result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
414
+ result = result[1:]
415
+
416
+ return result
417
+
418
+
419
+ # ================================================================================ #
420
+ # different types of rewriters
421
+ # ================================================================================ #
422
+ class Cardinal:
423
+ """
424
+ The CARDINAL class.
425
+ """
426
+
427
+ def __init__(self, cardinal=None, chntext=None):
428
+ self.cardinal = cardinal
429
+ self.chntext = chntext
430
+
431
+ def chntext2cardinal(self):
432
+ return chn2num(self.chntext)
433
+
434
+ def cardinal2chntext(self):
435
+ return num2chn(self.cardinal)
436
+
437
+ class Digit:
438
+ """
439
+ The DIGIT class.
440
+ """
441
+
442
+ def __init__(self, digit=None, chntext=None):
443
+ self.digit = digit
444
+ self.chntext = chntext
445
+
446
+ # def chntext2digit(self):
447
+ # return chn2num(self.chntext)
448
+
449
+ def digit2chntext(self):
450
+ return num2chn(self.digit, alt_two=False, use_units=False)
451
+
452
+
453
+ class TelePhone:
454
+ """
455
+ The TELEPHONE class.
456
+ """
457
+
458
+ def __init__(self, telephone=None, raw_chntext=None, chntext=None):
459
+ self.telephone = telephone
460
+ self.raw_chntext = raw_chntext
461
+ self.chntext = chntext
462
+
463
+ # def chntext2telephone(self):
464
+ # sil_parts = self.raw_chntext.split('<SIL>')
465
+ # self.telephone = '-'.join([
466
+ # str(chn2num(p)) for p in sil_parts
467
+ # ])
468
+ # return self.telephone
469
+
470
+ def telephone2chntext(self, fixed=False):
471
+
472
+ if fixed:
473
+ sil_parts = self.telephone.split('-')
474
+ self.raw_chntext = '<SIL>'.join([
475
+ num2chn(part, alt_two=False, use_units=False) for part in sil_parts
476
+ ])
477
+ self.chntext = self.raw_chntext.replace('<SIL>', '')
478
+ else:
479
+ sp_parts = self.telephone.strip('+').split()
480
+ self.raw_chntext = '<SP>'.join([
481
+ num2chn(part, alt_two=False, use_units=False) for part in sp_parts
482
+ ])
483
+ self.chntext = self.raw_chntext.replace('<SP>', '')
484
+ return self.chntext
485
+
486
+
487
+ class Fraction:
488
+ """
489
+ The FRACTION class.
490
+ """
491
+
492
+ def __init__(self, fraction=None, chntext=None):
493
+ self.fraction = fraction
494
+ self.chntext = chntext
495
+
496
+ def chntext2fraction(self):
497
+ denominator, numerator = self.chntext.split('分之')
498
+ return chn2num(numerator) + '/' + chn2num(denominator)
499
+
500
+ def fraction2chntext(self):
501
+ numerator, denominator = self.fraction.split('/')
502
+ return num2chn(denominator) + '分之' + num2chn(numerator)
503
+
504
+
505
+ class Date:
506
+ """
507
+ The DATE class.
508
+ """
509
+
510
+ def __init__(self, date=None, chntext=None):
511
+ self.date = date
512
+ self.chntext = chntext
513
+
514
+ # def chntext2date(self):
515
+ # chntext = self.chntext
516
+ # try:
517
+ # year, other = chntext.strip().split('年', maxsplit=1)
518
+ # year = Digit(chntext=year).digit2chntext() + '年'
519
+ # except ValueError:
520
+ # other = chntext
521
+ # year = ''
522
+ # if other:
523
+ # try:
524
+ # month, day = other.strip().split('月', maxsplit=1)
525
+ # month = Cardinal(chntext=month).chntext2cardinal() + '月'
526
+ # except ValueError:
527
+ # day = chntext
528
+ # month = ''
529
+ # if day:
530
+ # day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
531
+ # else:
532
+ # month = ''
533
+ # day = ''
534
+ # date = year + month + day
535
+ # self.date = date
536
+ # return self.date
537
+
538
+ def date2chntext(self):
539
+ date = self.date
540
+ try:
541
+ year, other = date.strip().split('年', 1)
542
+ year = Digit(digit=year).digit2chntext() + '年'
543
+ except ValueError:
544
+ other = date
545
+ year = ''
546
+ if other:
547
+ try:
548
+ month, day = other.strip().split('月', 1)
549
+ month = Cardinal(cardinal=month).cardinal2chntext() + '月'
550
+ except ValueError:
551
+ day = date
552
+ month = ''
553
+ if day:
554
+ day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
555
+ else:
556
+ month = ''
557
+ day = ''
558
+ chntext = year + month + day
559
+ self.chntext = chntext
560
+ return self.chntext
561
+
562
+ class Time:
563
+ """
564
+ The TIME class.
565
+ """
566
+
567
+ def __init__(self, time=None, chntext=None):
568
+ self.time = time
569
+ self.chntext = chntext
570
+
571
+ # def chntext2money(self):
572
+ # return self.money
573
+
574
+ def time2chntext(self):
575
+ time = self.time.replace('-', '至')
576
+ pattern = re.compile(r'(\d{1,2}:\d{1,2}(:)?(\d{1,2})?)')
577
+ matchers = pattern.findall(time)
578
+ if matchers:
579
+ if len(matchers[0])>2:
580
+ time = time.replace(':', '时', 1)
581
+ time = time.replace(':', '分', 1)
582
+ self.chntext = time
583
+ return self.chntext
584
+
585
+ class Money:
586
+ """
587
+ The MONEY class.
588
+ """
589
+
590
+ def __init__(self, money=None, chntext=None):
591
+ self.money = money
592
+ self.chntext = chntext
593
+
594
+ # def chntext2money(self):
595
+ # return self.money
596
+
597
+ def money2chntext(self):
598
+ money = self.money
599
+ pattern = re.compile(r'(\d+(\.\d+)?)')
600
+ matchers = pattern.findall(money)
601
+ if matchers:
602
+ for matcher in matchers:
603
+ money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
604
+ self.chntext = money
605
+ return self.chntext
606
+
607
+
608
+ class Percentage:
609
+ """
610
+ The PERCENTAGE class.
611
+ """
612
+
613
+ def __init__(self, percentage=None, chntext=None):
614
+ self.percentage = percentage
615
+ self.chntext = chntext
616
+
617
+ def chntext2percentage(self):
618
+ return chn2num(self.chntext.strip().strip('百分之')) + '%'
619
+
620
+ def percentage2chntext(self):
621
+ return '百分之' + num2chn(self.percentage.strip().strip('%'))
622
+
623
+
624
+ # ================================================================================ #
625
+ # NSW Normalizer
626
+ # ================================================================================ #
627
+ class NSWNormalizer:
628
+ def __init__(self):
629
+ self.raw_text = ' ' # '^' + raw_text + '$'
630
+ self.norm_text = ''
631
+
632
+ def _particular(self):
633
+ text = self.norm_text
634
+ pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
635
+ matchers = pattern.findall(text)
636
+ if matchers:
637
+ # print('particular')
638
+ for matcher in matchers:
639
+ text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
640
+ self.norm_text = text
641
+ return self.norm_text
642
+
643
+ def normalize(self, raw_text):
644
+ self.raw_text = '^' + raw_text + '$'
645
+ text = unicodedata.normalize("NFKC", self.raw_text)
646
+ # normalize dates
647
+ pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
648
+ matchers = pattern.findall(text)
649
+ if matchers:
650
+ #print('date')
651
+ for matcher in matchers:
652
+ text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
653
+
654
+ # normalize times
655
+ pattern = re.compile(r"\D+((\d{1,2}-)?\d{1,2}[时点:]((\d{1,2}-)?\d{1,2}[分:]((\d{1,2}-)?\d{1,2}秒)?)?)")
656
+ matchers = pattern.findall(text)
657
+ if matchers:
658
+ #print('time')
659
+ for matcher in matchers:
660
+ text = text.replace(matcher[0], Time(time=matcher[0]).time2chntext(), 1)
661
+
662
+ # normalize money amounts
663
+ pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
664
+ matchers = pattern.findall(text)
665
+ if matchers:
666
+ #print('money')
667
+ for matcher in matchers:
668
+ text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
669
+
670
+ # normalize landline/mobile phone numbers
671
+ # mobile
672
+ # http://www.jihaoba.com/news/show/13680
673
+ # China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
674
+ # China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
675
+ # China Telecom: 133, 153, 189, 180, 181, 177
676
+ pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
677
+ matchers = pattern.findall(text)
678
+ if matchers:
679
+ #print('telephone')
680
+ for matcher in matchers:
681
+ text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
682
+ # landline
683
+ pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
684
+ matchers = pattern.findall(text)
685
+ if matchers:
686
+ # print('fixed telephone')
687
+ for matcher in matchers:
688
+ text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
689
+
690
+ # normalize fractions
691
+ pattern = re.compile(r"(\d+/\d+)")
692
+ matchers = pattern.findall(text)
693
+ if matchers:
694
+ #print('fraction')
695
+ for matcher in matchers:
696
+ text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
697
+
698
+ # normalize percentages
699
+ text = text.replace('%', '%')
700
+ pattern = re.compile(r"(\d+(\.\d+)?%)")
701
+ matchers = pattern.findall(text)
702
+ if matchers:
703
+ #print('percentage')
704
+ for matcher in matchers:
705
+ text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
706
+
707
+ # normalize number + quantifier
708
+ pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
709
+ matchers = pattern.findall(text)
710
+ if matchers:
711
+ #print('cardinal+quantifier')
712
+ for matcher in matchers:
713
+ text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
714
+
715
+ # normalize digit-string IDs
716
+ pattern = re.compile(r"(\d{2,32})")
717
+ matchers = pattern.findall(text)
718
+ if matchers:
719
+ #print('digit')
720
+ for matcher in matchers:
721
+ text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
722
+
723
+ # normalize plain cardinals
724
+ pattern = re.compile(r"(\d+(\.\d+)?)")
725
+ matchers = pattern.findall(text)
726
+ if matchers:
727
+ #print('cardinal')
728
+ for matcher in matchers:
729
+ text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
730
+
731
+ self.norm_text = text
732
+ self._particular()
733
+
734
+ return self.norm_text.lstrip('^').rstrip('$')
735
+
736
+
737
+ def nsw_test_case(raw_text):
738
+ print('I:' + raw_text)
739
+ print('O:' + NSWNormalizer().normalize(raw_text))
740
+ print('')
741
+
742
+
743
+ def nsw_test():
744
+ nsw_test_case('固话:0595-23865596或23880880。')
745
+ nsw_test_case('固话:0595-23865596或23880880。')
746
+ nsw_test_case('手机:+86 19859213959或15659451527。')
747
+ nsw_test_case('分数:32477/76391。')
748
+ nsw_test_case('百分数:80.03%。')
749
+ nsw_test_case('编号:31520181154418。')
750
+ nsw_test_case('纯数:2983.07克或12345.60米。')
751
+ nsw_test_case('日期:1999年2月20日或09年3月15号。')
752
+ nsw_test_case('金钱:12块5,34.5元,20.1万')
753
+ nsw_test_case('特殊:O2O或B2C。')
754
+ nsw_test_case('3456万吨')
755
+ nsw_test_case('2938个')
756
+ nsw_test_case('938')
757
+ nsw_test_case('今天吃了115个小笼包231个馒头')
758
+ nsw_test_case('有62%的概率')
759
+
760
+
761
+ if __name__ == '__main__':
762
+ #nsw_test()
763
+
764
+ p = argparse.ArgumentParser()
765
+ p.add_argument('ifile', help='input filename, assume utf-8 encoding')
766
+ p.add_argument('ofile', help='output filename')
767
+ p.add_argument('--to_upper', action='store_true', help='convert to upper case')
768
+ p.add_argument('--to_lower', action='store_true', help='convert to lower case')
769
+ p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
770
+ p.add_argument('--log_interval', type=int, default=100000, help='log interval in number of processed lines')
771
+ args = p.parse_args()
772
+
773
+ ifile = codecs.open(args.ifile, 'r', 'utf8')
774
+ ofile = codecs.open(args.ofile, 'w+', 'utf8')
775
+
776
+ n = 0
777
+ for l in ifile:
778
+ key = ''
779
+ text = ''
780
+ if args.has_key:
781
+ cols = l.split(maxsplit=1)
782
+ key = cols[0]
783
+ if len(cols) == 2:
784
+ text = cols[1].strip()
785
+ else:
786
+ text = ''
787
+ else:
788
+ text = l.strip()
789
+
790
+ # cases
791
+ if args.to_upper and args.to_lower:
792
+ sys.stderr.write('cn_tn.py: to_upper OR to_lower?')
793
+ exit(1)
794
+ if args.to_upper:
795
+ text = text.upper()
796
+ if args.to_lower:
797
+ text = text.lower()
798
+
799
+ # NSW(Non-Standard-Word) normalization
800
+ text = NSWNormalizer(text).normalize()
801
+
802
+ # Punctuations removal
803
+ old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations
804
+ new_chars = ' ' * len(old_chars)
805
+ del_chars = ''
806
+ text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
807
+
808
+ #
809
+ if args.has_key:
810
+ ofile.write(key + '\t' + text + '\n')
811
+ else:
812
+ if text.strip() != '': # skip empty line in pure text format(without Kaldi's utt key)
813
+ ofile.write(text + '\n')
814
+
815
+ n += 1
816
+ if n % args.log_interval == 0:
817
+ sys.stderr.write("cn_tn.py: {} lines done.\n".format(n))
818
+ sys.stderr.flush()
819
+
820
+ sys.stderr.write("cn_tn.py: {} lines done in total.\n".format(n))
821
+ sys.stderr.flush()
822
+
823
+ ifile.close()
824
+ ofile.close()
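
Usage sketch for the normalizer defined above; note that `normalize()` takes the raw text as its argument (the constructor takes none):

```python
from lemas_tts.infer.text_norm.cn_tn import NSWNormalizer

norm = NSWNormalizer()
print(norm.normalize('有62%的概率'))  # -> 有百分之六十二的概率
print(norm.normalize('3456万吨'))     # -> 三千四百五十六万吨
```
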
lemas_tts/infer/text_norm/en_tn.py ADDED
@@ -0,0 +1,178 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2017 Keith Ito
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ # of this software and associated documentation files (the "Software"), to deal
6
+ # in the Software without restriction, including without limitation the rights
7
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the Software is
9
+ # furnished to do so, subject to the following conditions:
10
+
11
+ # The above copyright notice and this permission notice shall be included in
12
+ # all copies or substantial portions of the Software.
13
+
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ # THE SOFTWARE.
21
+
22
+ import re
23
+ from unidecode import unidecode
24
+ import inflect
25
+
26
+ _inflect = inflect.engine()
27
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
28
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
29
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
30
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
31
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
32
+ _number_re = re.compile(r"[0-9]+")
33
+
34
+
35
+ def _remove_commas(m):
36
+ return m.group(1).replace(",", "")
37
+
38
+
39
+ def _expand_decimal_point(m):
40
+ return m.group(1).replace(".", " point ")
41
+
42
+
43
+ def _expand_dollars(m):
44
+ match = m.group(1)
45
+ parts = match.split(".")
46
+ if len(parts) > 2:
47
+ return match + " dollars" # Unexpected format
48
+ dollars = int(parts[0]) if parts[0] else 0
49
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
50
+ if dollars and cents:
51
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
52
+ cent_unit = "cent" if cents == 1 else "cents"
53
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
54
+ elif dollars:
55
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
56
+ return "%s %s" % (dollars, dollar_unit)
57
+ elif cents:
58
+ cent_unit = "cent" if cents == 1 else "cents"
59
+ return "%s %s" % (cents, cent_unit)
60
+ else:
61
+ return "zero dollars"
62
+
63
+
64
+ def _expand_ordinal(m):
65
+ return _inflect.number_to_words(m.group(0))
66
+
67
+
68
+ def _expand_number(m):
69
+ num = int(m.group(0))
70
+ if num > 1000 and num < 3000:
71
+ if num == 2000:
72
+ return "two thousand"
73
+ elif num > 2000 and num < 2010:
74
+ return "two thousand " + _inflect.number_to_words(num % 100)
75
+ elif num % 100 == 0:
76
+ return _inflect.number_to_words(num // 100) + " hundred"
77
+ else:
78
+ return _inflect.number_to_words(
79
+ num, andword="", zero="oh", group=2
80
+ ).replace(", ", " ")
81
+ else:
82
+ return _inflect.number_to_words(num, andword="")
83
+
84
+
85
+ def normalize_numbers(text):
86
+ text = re.sub(_comma_number_re, _remove_commas, text)
87
+ text = re.sub(_pounds_re, r"\1 pounds", text)
88
+ text = re.sub(_dollars_re, _expand_dollars, text)
89
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
90
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
91
+ text = re.sub(_number_re, _expand_number, text)
92
+ return text
93
+
94
+ # Regular expression matching whitespace:
95
+ _whitespace_re = re.compile(r"\s+")
96
+
97
+ # List of (regular expression, replacement) pairs for abbreviations:
98
+ _abbreviations = [
99
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
100
+ for x in [
101
+ ("mrs", "misess"),
102
+ ("mr", "mister"),
103
+ ("dr", "doctor"),
104
+ ("st", "saint"),
105
+ ("co", "company"),
106
+ ("jr", "junior"),
107
+ ("maj", "major"),
108
+ ("gen", "general"),
109
+ ("drs", "doctors"),
110
+ ("rev", "reverend"),
111
+ ("lt", "lieutenant"),
112
+ ("hon", "honorable"),
113
+ ("sgt", "sergeant"),
114
+ ("capt", "captain"),
115
+ ("esq", "esquire"),
116
+ ("ltd", "limited"),
117
+ ("col", "colonel"),
118
+ ("ft", "fort"),
119
+ ]
120
+ ]
121
+
122
+
123
+ def expand_abbreviations(text):
124
+ for regex, replacement in _abbreviations:
125
+ text = re.sub(regex, replacement, text)
126
+ return text
127
+
128
+
129
+ def expand_numbers(text):
130
+ return normalize_numbers(text)
131
+
132
+
133
+ def lowercase(text):
134
+ return text.lower()
135
+
136
+
137
+ def collapse_whitespace(text):
138
+ return re.sub(_whitespace_re, " ", text)
139
+
140
+
141
+ def convert_to_ascii(text):
142
+ return unidecode(text)
143
+
144
+
145
+ def basic_cleaners(text):
146
+ """Basic pipeline that lowercases and collapses whitespace without transliteration."""
147
+ text = lowercase(text)
148
+ text = collapse_whitespace(text)
149
+ return text
150
+
151
+
152
+ def transliteration_cleaners(text):
153
+ """Pipeline for non-English text that transliterates to ASCII."""
154
+ text = convert_to_ascii(text)
155
+ text = lowercase(text)
156
+ text = collapse_whitespace(text)
157
+ return text
158
+
159
+
160
+ def english_cleaners(text):
161
+ """Pipeline for English text, including number and abbreviation expansion."""
162
+ text = convert_to_ascii(text)
163
+ text = lowercase(text)
164
+ text = expand_numbers(text)
165
+ text = expand_abbreviations(text)
166
+ text = collapse_whitespace(text)
167
+ return text
168
+
169
+ def read_lexicon(lex_path):
170
+ lexicon = {}
171
+ with open(lex_path) as f:
172
+ for line in f:
173
+ temp = re.split(r"\s+", line.strip("\n"))
174
+ word = temp[0]
175
+ phones = temp[1:]
176
+ if word not in lexicon:
177
+ lexicon[word] = phones
178
+ return lexicon
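
A usage sketch for the cleaners above; the expected output reflects the pipeline order (ASCII fold, lowercase, number expansion, abbreviation expansion, whitespace collapse) and should print roughly:

```python
print(english_cleaners("Dr. Smith paid $2.50 on the 3rd."))
# -> 'doctor smith paid two dollars, fifty cents on the third.'
```
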
lemas_tts/infer/text_norm/gp2py.py ADDED
@@ -0,0 +1,148 @@
1
+ import argparse
2
+ import copy
3
+ import os
4
+ from typing import List
5
+
6
+ import jieba
7
+ import pypinyin
8
+
9
+ SPECIAL_NOTES = '。?!?!.;;:,,:'
10
+
11
+
12
+ def read_vocab(file: os.PathLike) -> List[str]:
13
+ with open(file) as f:
14
+ vocab = f.read().split('\n')
15
+ vocab = [v for v in vocab if len(v) > 0 and v != '\n']
16
+ return vocab
17
+
18
+
19
+ class TextNormal:
20
+ def __init__(self,
21
+ gp_vocab_file: os.PathLike,
22
+ py_vocab_file: os.PathLike,
23
+ add_sp1=False,
24
+ fix_er=False,
25
+ add_sil=True):
26
+ if gp_vocab_file is not None:
27
+ self.gp_vocab = read_vocab(gp_vocab_file)
28
+ if py_vocab_file is not None:
29
+ self.py_vocab = read_vocab(py_vocab_file)
30
+ self.in_py_vocab = dict([(p, True) for p in self.py_vocab])
31
+ self.add_sp1 = add_sp1
32
+ self.add_sil = add_sil
33
+ self.fix_er = fix_er
34
+
35
+ # gp2idx = dict([(c, i) for i, c in enumerate(self.gp_vocab)])
36
+ # idx2gp = dict([(i, c) for i, c in enumerate(self.gp_vocab)])
37
+
38
+ def _split2sent(self, text):
39
+ new_sub = [text]
40
+ while True:
41
+ sub = copy.deepcopy(new_sub)
42
+ new_sub = []
43
+ for s in sub:
44
+ sp = False
45
+ for t in SPECIAL_NOTES:
46
+ if t in s:
47
+ new_sub += s.split(t)
48
+ sp = True
49
+ break
50
+
51
+ if not sp and len(s) > 0:
52
+ new_sub += [s]
53
+ if len(new_sub) == len(sub):
54
+ break
55
+ tokens = [a for a in text if a in SPECIAL_NOTES]
56
+
57
+ return new_sub, tokens
58
+
59
+ def _correct_tone3(self, pys: List[str]) -> List[str]:
60
+ """Fix the continuous tone3 pronunciation problem"""
61
+ for i in range(2, len(pys)):
62
+ if pys[i][-1] == '3' and pys[i - 1][-1] == '3' and pys[i - 2][-1] == '3':
63
+ pys[i - 1] = pys[i - 1][:-1] + '2' # change the middle one
64
+ for i in range(1, len(pys)):
65
+ if pys[i][-1] == '3':
66
+ if pys[i - 1][-1] == '3':
67
+ pys[i - 1] = pys[i - 1][:-1] + '2'
68
+ return pys
69
+
70
+ def _correct_tone4(self, pys: List[str]) -> List[str]:
71
+ """Fixed the problem of pronouncing 不 bu2 yao4 / bu4 neng2"""
72
+ for i in range(len(pys) - 1):
73
+ if pys[i] == 'bu4':
74
+ if pys[i + 1][-1] == '4':
75
+ pys[i] = 'bu2'
76
+ return pys
77
+
78
+ def _replace_with_sp(self, pys: List[str]) -> List[str]:
79
+ for i, p in enumerate(pys):
80
+ if p in ',,、':
81
+ pys[i] = 'sp1'
82
+ return pys
83
+
84
+ def _correct_tone5(self, pys: List[str]) -> List[str]:
85
+ for i in range(len(pys)):
86
+ if pys[i][-1] not in '1234':
87
+ pys[i] += '5'
88
+ return pys
89
+
90
+ def gp2py(self, gp_text: str) -> List[str]:
91
+
92
+ gp_sent_list, tokens = self._split2sent(gp_text)
93
+ py_sent_list = []
94
+ for sent in gp_sent_list:
95
+ pys = []
96
+ for words in list(jieba.cut(sent)):
97
+ py = pypinyin.pinyin(words, pypinyin.TONE3)
98
+ py = [p[0] for p in py]
99
+ pys += py
100
+ if self.add_sp1:
101
+ pys = self._replace_with_sp(pys)
102
+ pys = self._correct_tone3(pys)
103
+ pys = self._correct_tone4(pys)
104
+ pys = self._correct_tone5(pys)
105
+ if self.add_sil:
106
+ py_sent_list += [' '.join(['sil'] + pys + ['sil'])]
107
+ else:
108
+ py_sent_list += [' '.join(pys)]
109
+
110
+ if self.add_sil:
111
+ gp_sent_list = ['sil ' + ' '.join(list(gp)) + ' sil' for gp in gp_sent_list]
112
+ else:
113
+ gp_sent_list = [' '.join(list(gp)) for gp in gp_sent_list]
114
+
115
+ if self.fix_er:
116
+ new_py_sent_list = []
117
+ for py, gp in zip(py_sent_list, gp_sent_list):
118
+ py = self._convert_er2(py, gp)
119
+ new_py_sent_list += [py]
120
+ py_sent_list = new_py_sent_list
121
+ print(new_py_sent_list)
122
+
123
+ return py_sent_list, gp_sent_list
124
+
125
+ def _convert_er2(self, py, gp):
126
+ py2hz = dict([(p, h) for p, h in zip(py.split(), gp.split())])
127
+ py_list = py.split()
128
+ for i, p in enumerate(py_list):
129
+ if (p == 'er2' and py2hz[p] == '儿' and i > 1 and len(py_list[i - 1]) > 2 and py_list[i - 1][-1] in '1234'):
130
+
131
+ py_er = py_list[i - 1][:-1] + 'r' + py_list[i - 1][-1]
132
+
133
+ if self.in_py_vocab.get(py_er, False): # must in vocab
134
+ py_list[i - 1] = py_er
135
+ py_list[i] = 'r'
136
+ py = ' '.join(py_list)
137
+ return py
138
+
139
+
140
+ if __name__ == '__main__':
141
+ parser = argparse.ArgumentParser()
142
+ parser.add_argument('-t', '--text', type=str)
143
+ args = parser.parse_args()
144
+ text = args.text
145
+ tn = TextNormal('gp.vocab', 'py.vocab', add_sp1=True, fix_er=True)
146
+ py_list, gp_list = tn.gp2py(text)
147
+ for py, gp in zip(py_list, gp_list):
148
+ print(py + '|' + gp)
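
The third-tone handling in `_correct_tone3` is easiest to see in isolation. This standalone copy of the rule shows the two passes (the middle syllable of a 3-3-3 run first, then remaining adjacent 3-3 pairs):

```python
def correct_tone3(pys):
    pys = list(pys)
    for i in range(2, len(pys)):  # pass 1: middle syllable of a 3-3-3 run
        if pys[i][-1] == '3' and pys[i - 1][-1] == '3' and pys[i - 2][-1] == '3':
            pys[i - 1] = pys[i - 1][:-1] + '2'
    for i in range(1, len(pys)):  # pass 2: remaining adjacent 3-3 pairs
        if pys[i][-1] == '3' and pys[i - 1][-1] == '3':
            pys[i - 1] = pys[i - 1][:-1] + '2'
    return pys

print(correct_tone3(['ni3', 'hao3']))             # ['ni2', 'hao3']
print(correct_tone3(['zhan3', 'lan3', 'guan3']))  # ['zhan3', 'lan2', 'guan3']
```
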
lemas_tts/infer/text_norm/id_tn.py ADDED
@@ -0,0 +1,275 @@
1
+ # Indonesian TTS Text Normalization for YouTube subtitles
2
+ # Requirements: pip install num2words
3
+ import re
4
+ from num2words import num2words
5
+
6
+ # --- small slang map (expandable) ---
7
+ SLANG_MAP = {
8
+ "gpp": "nggak apa-apa",
9
+ "gak": "nggak", "ga": "nggak", "gk": "nggak",
10
+ "sy": "saya", "sya": "saya",
11
+ "km": "kamu",
12
+ "tp": "tapi", "tpi": "tapi",
13
+ "jd": "jadi",
14
+ "bgt": "banget",
15
+ "blm": "belum",
16
+ "trs": "terus",
17
+ "sm": "sama",
18
+ "wkwk": "wkwk", # keep as-is (laugh token) or strip later
19
+ "wkwkwk": "wkwk"
20
+ }
21
+
22
+ # emoji pattern: removes most emoji blocks
23
+ EMOJI_PATTERN = re.compile(
24
+ "["
25
+ "\U0001F600-\U0001F64F" # emoticons
26
+ "\U0001F300-\U0001F5FF" # symbols & pictographs
27
+ "\U0001F680-\U0001F6FF" # transport & map symbols
28
+ "\U0001F1E0-\U0001F1FF" # flags (iOS)
29
+ "\U00002700-\U000027BF" # dingbats
30
+ "\U000024C2-\U0001F251"
31
+ "]+", flags=re.UNICODE)
32
+
33
+ # units map
34
+ UNITS = {
35
+ "kg": "kilogram","g": "gram","km": "kilometer",
36
+ "m": "meter","cm": "sentimeter","mm": "milimeter",
37
+ "l": "liter"
38
+ }
39
+
40
+ # helper: safe num2words for Indonesian
41
+ def num_to_words_ind(num_str):
42
+ """Convert numeric string to Indonesian words.
43
+ - Handles integers and simple decimals like '1.5' (reads digits after decimal).
44
+ - Removes grouping dots in Indonesian numbers (e.g. '10.000').
45
+ """
46
+ num_str = num_str.strip()
47
+ # remove thousand separators commonly used in Indonesian (dot)
48
+ # but if decimal point (like '1,5' or '1.5'), assume '.' is decimal point (we expect '.' used)
49
+ # We'll treat commas as thousand separators too if no decimal comma present.
50
+ if re.match(r'^\d+[.,]\d+$', num_str):
51
+ # decimal number: normalize to use '.' then split
52
+ s = num_str.replace(',', '.')
53
+ left, right = s.split('.', 1)
54
+ try:
55
+ left_w = num2words(int(left), lang='id')
56
+ except Exception:  # fall back to the raw string if num2words cannot parse it
57
+ left_w = left
58
+ # read each decimal digit separately
59
+ right_w = " ".join(num2words(int(d), lang='id') for d in right if d.isdigit())
60
+ return f"{left_w} koma {right_w}"
61
+ else:
62
+ # remove non-digit separators like dots or commas used as thousand separators
63
+ cleaned = re.sub(r'[.,]', '', num_str)
64
+ try:
65
+ return num2words(int(cleaned), lang='id')
66
+ except Exception:  # fall back to the raw string if num2words cannot parse it
67
+ return num_str
68
+
+# helper: per-digit reader for phone numbers (default)
+def read_digits_per_digit(number_str, prefix_plus=False):
+    digits = re.findall(r'\d', number_str)
+    words = " ".join(num2words(int(d), lang='id') for d in digits)
+    if prefix_plus:
+        return "plus " + words
+    return words
+
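
Digits are read one by one regardless of separators; for example, under the same num2words assumption as above:

    print(read_digits_per_digit("0812-3456"))
    # 'nol delapan satu dua tiga empat lima enam'
    print(read_digits_per_digit("+62 812", prefix_plus=True))
    # 'plus enam dua delapan satu dua'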
+# noise removal rule for tokens like 'yyy6yy' or other long mixed garbage:
+def is_noise_token(tok):
+    # remove tokens that:
+    # - have length >= 4 and contain at least one digit and at least one letter
+    #   (typical ASR/keyboard noise)
+    # - or consist of a single character repeated >= 4 times (e.g. 'aaaa';
+    #   punctuation runs like '!!!!!!' are handled earlier)
+    if len(tok) < 4:
+        return False
+    if re.search(r'[A-Za-z]', tok) and re.search(r'\d', tok):
+        return True
+    if re.fullmatch(r'(.)\1{3,}', tok):  # same char repeated >= 4 times
+        return True
+    return False
+
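
A quick truth table for the rule:

    print(is_noise_token("yyy6yy"))  # True  -- letters mixed with digits, length >= 4
    print(is_noise_token("aaaa"))    # True  -- one character repeated four times
    print(is_noise_token("wkwk"))    # False -- alternating characters, no digit
    print(is_noise_token("ga"))      # False -- shorter than four characters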
+# --- New: punctuation normalization helper ---
+def punctuation_normalize(text):
+    """
+    - Replace all punctuation except . , ! ? with a comma
+    - Collapse runs of commas into a single comma
+    - Strip leading commas and ellipses
+    - Normalize the spacing after commas
+    """
+    # replace brackets, quotes, colons, semicolons, dashes, slashes, ellipses, etc. with commas
+    text = re.sub(r'[:;()\[\]{}"“”«»…—–/\\]', ',', text)
+    # collapse multiple commas into one
+    text = re.sub(r',+', ',', text)
+    # strip leading commas and ellipses
+    text = re.sub(r'^(,|\.\.\.|…)+\s*', '', text)
+    # normalize spacing around commas
+    text = re.sub(r'\s*,\s*', ', ', text)
+    # collapse extra whitespace
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+
+
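
Tracing the helper on small inputs shows the intended effect (the stray space left before '!' is tightened later by the final cleanup in normalize_id_tts):

    print(punctuation_normalize('kata "penting": ya'))  # 'kata, penting, ya'
    print(punctuation_normalize('Halo (dunia)!'))       # 'Halo, dunia, !'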
+def normalize_id_tts(text):
+    """
+    Main normalization pipeline tailored for:
+    - Indonesian YouTube subtitles (mostly ASR/MT)
+    - TTS frontend requirements:
+        * Remove emojis
+        * Keep . , ! ? as sentence/phrase delimiters
+        * Replace other punctuation with a comma
+        * Expand numbers, percents, currency, units, times, dates
+        * Remove keyboard noise like 'yyy6yy'
+        * Keep English words as-is
+        * Keep repeated words (do not collapse)
+    """
+    if not text:
+        return text
+
+    # 1) Normalize whitespace and trim
+    text = text.strip()
+    text = re.sub(r'\s+', ' ', text)
+
+    # 2) Remove emojis
+    text = EMOJI_PATTERN.sub('', text)
+
+    # 3) Protect times and dates with placeholders, then normalize punctuation
+    #    (replaces the original PUNCT_TO_COMMA pass). Protecting first keeps the
+    #    ':' in '09:30' and the '/' in '2025/11/28' from being rewritten to commas.
+    text = re.sub(r'(\d{1,2}):(\d{2})', lambda m: f"__TIME_{m.group(1)}_{m.group(2)}__", text)
+    text = re.sub(r'(\d{1,4})[\/-](\d{1,2})[\/-](\d{1,4})', lambda m: f"__DATE_{m.group(1)}_{m.group(2)}_{m.group(3)}__", text)
+    text = punctuation_normalize(text)
+
+    # restore the time/date markers
+    text = re.sub(r'__TIME_(\d{1,2})_(\d{2})__', lambda m: f"{m.group(1)}:{m.group(2)}", text)
+    text = re.sub(r'__DATE_(\d{1,4})_(\d{1,2})_(\d{1,4})__', lambda m: f"{m.group(1)}/{m.group(2)}/{m.group(3)}", text)
+
+    # 4) Tokenize loosely by spaces and punctuation
+    tokens = re.split(r'(\s+|[,.!?])', text)  # keep delimiters
+
+    out_tokens = []
+    for tok in tokens:
+        if not tok or tok.isspace():
+            out_tokens.append(tok)
+            continue
+
+        # keep punctuation .,!? as-is
+        if tok in ['.', ',', '!', '?']:
+            out_tokens.append(tok)
+            continue
+
+        # remove any remaining emojis or control chars
+        if EMOJI_PATTERN.search(tok):
+            continue
+
+        # slang normalization
+        lower_tok = tok.lower()
+        if lower_tok in SLANG_MAP:
+            out_tokens.append(SLANG_MAP[lower_tok])
+            continue
+
+        # remove noise tokens
+        if is_noise_token(tok):
+            continue
+
+        # currency: Rp 10.000 or rp10.000
+        m = re.match(r'^(Rp|rp)\s*([0-9\.,]+)$', tok)
+        if m:
+            num = m.group(2)
+            cleaned = re.sub(r'[.,]', '', num)
+            out_tokens.append(f"{num_to_words_ind(cleaned)} rupiah")
+            continue
+
+        # percent like 30%
+        m = re.match(r'^(\d+)%$', tok)
+        if m:
+            out_tokens.append(f"{num_to_words_ind(m.group(1))} persen")
+            continue
+
+        # phone numbers +62..., 0812...
+        m = re.match(r'^\+?\d[\d\-\s]{6,}\d$', tok)
+        if m:
+            prefix_plus = tok.startswith('+')
+            out_tokens.append(read_digits_per_digit(tok, prefix_plus=prefix_plus))
+            continue
+
+        # time hh:mm
+        m = re.match(r'^(\d{1,2}):(\d{2})$', tok)
+        if m:
+            h, mi = m.group(1), m.group(2)
+            h_w = num_to_words_ind(h.lstrip('0') or '0')
+            mi_w = num_to_words_ind(mi.lstrip('0') or '0')
+            out_tokens.append(f"pukul {h_w} lewat {mi_w} menit")
+            continue
+
+        # date yyyy/mm/dd or dd/mm/yyyy
+        m = re.match(r'^(\d{1,4})\/(\d{1,2})\/(\d{1,4})$', tok)
+        if m:
+            a, b, c = m.group(1), m.group(2).zfill(2), m.group(3)
+            if len(a) == 4:
+                year, month, day = a, b, c
+            elif len(c) == 4:
+                day, month, year = a, b, c
+            else:
+                day, month, year = a, b, c
+            MONTHS = {
+                "01": "Januari", "02": "Februari", "03": "Maret", "04": "April",
+                "05": "Mei", "06": "Juni", "07": "Juli", "08": "Agustus",
+                "09": "September", "10": "Oktober", "11": "November", "12": "Desember"
+            }
+            day_w = num_to_words_ind(day.lstrip('0') or '0')
+            year_w = num_to_words_ind(year)
+            month_name = MONTHS.get(month, month)
+            out_tokens.append(f"{day_w} {month_name} {year_w}")
+            continue
+
+        # units like 30kg
+        m = re.match(r'^(\d+)\s*(kg|g|km|m|cm|mm|l)$', tok, flags=re.I)
+        if m:
+            num, unit = m.group(1), m.group(2).lower()
+            unit_word = UNITS.get(unit, unit)
+            out_tokens.append(f"{num_to_words_ind(num)} {unit_word}")
+            continue
+
+        # plain integers
+        if re.fullmatch(r'\d+', tok):
+            out_tokens.append(num_to_words_ind(tok))
+            continue
+
+        # numbers with separators
+        if re.fullmatch(r'[\d\.,]+', tok) and re.search(r'[.,]', tok):
+            out_tokens.append(num_to_words_ind(tok))
+            continue
+
+        # keep English/as-is tokens
+        out_tokens.append(tok)
+
+    normalized = "".join(out_tokens)
+
+    # final cleanup: spacing around punctuation
+    normalized = re.sub(r'\s+,', ',', normalized)
+    normalized = re.sub(r',\s*', ', ', normalized)
+    normalized = re.sub(r'\s+\.', '.', normalized)
+    normalized = re.sub(r'\s+!', '!', normalized)
+    normalized = re.sub(r'\s+\?', '?', normalized)
+    normalized = re.sub(r'\s+', ' ', normalized).strip()
+
+    # comment out the next line if you do not want everything lowercased
+    normalized = normalized.lower()
+
+    return normalized
+
+# -------------------------
+# Example usage and tests
+# -------------------------
+if __name__ == "__main__":
+    examples = [
+        "kita cek Project nadi PHP pemberi harapan palsu tuh yyy6yy 46 ini ini usernya ini di bagian user",
+        "Harga Rp 10.000, diskon 30%! Buka jam 09:30 (hari 2025/11/28).",
+        "Call +62 812-3456-7890 sekarang!",
+        "angka kecil 3.14 dan 1,234 serta 1000",
+        "[musik]",
+        "... atau mungkin juga jumlah anggota keluarga mereka."
+    ]
+    for ex in examples:
+        print("IN: ", ex)
+        print("OUT:", normalize_id_tts(ex))
+        print("-" * 60)
lemas_tts/infer/text_norm/jieba_dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
lemas_tts/infer/text_norm/pinyin-lexicon-r.txt ADDED
@@ -0,0 +1,4120 @@
+a1 a1
+a2 a2
+a3 a3
+a4 a4
+a5 a5
+ai1 ai1
+ai2 ai2
+ai3 ai3
+ai4 ai4
+ai5 ai5
+an1 an1
+an2 an2
+an3 an3
+an4 an4
+an5 an5
+ang1 ang1
+ang2 ang2
+ang3 ang3
+ang4 ang4
+ang5 ang5
+ao1 ao1
+ao2 ao2
+ao3 ao3
+ao4 ao4
+ao5 ao5
+ba1 b a1
+ba2 b a2
+ba3 b a3
+ba4 b a4
+ba5 b a5
The remaining entries continue this two-column pattern — a tone-numbered pinyin syllable mapped to its initial and final phonemes, covering tones 1–5 for every syllable (`bai1 b ai1` … `zuo5 z uo5`), special spellings such as `chi1 ch iii1`, `ju1 j v1`, and `r1 er1`, and erhua variants that append an `rr` phoneme (`ar1 a1 rr` … `diur2 d iou2 rr`, where the rendered view cuts off). The diff for this file is too large to render. See raw diff.
2418
+ diur3 d iou3 rr
2419
+ diur4 d iou4 rr
2420
+ diur5 d iou5 rr
2421
+ dongr1 d ong1 rr
2422
+ dongr2 d ong2 rr
2423
+ dongr3 d ong3 rr
2424
+ dongr4 d ong4 rr
2425
+ dongr5 d ong5 rr
2426
+ dour1 d ou1 rr
2427
+ dour2 d ou2 rr
2428
+ dour3 d ou3 rr
2429
+ dour4 d ou4 rr
2430
+ dour5 d ou5 rr
2431
+ dur1 d u1 rr
2432
+ dur2 d u2 rr
2433
+ dur3 d u3 rr
2434
+ dur4 d u4 rr
2435
+ dur5 d u5 rr
2436
+ duanr1 d uan1 rr
2437
+ duanr2 d uan2 rr
2438
+ duanr3 d uan3 rr
2439
+ duanr4 d uan4 rr
2440
+ duanr5 d uan5 rr
2441
+ duir1 d uei1 rr
2442
+ duir2 d uei2 rr
2443
+ duir3 d uei3 rr
2444
+ duir4 d uei4 rr
2445
+ duir5 d uei5 rr
2446
+ dunr1 d uen1 rr
2447
+ dunr2 d uen2 rr
2448
+ dunr3 d uen3 rr
2449
+ dunr4 d uen4 rr
2450
+ dunr5 d uen5 rr
2451
+ duor1 d uo1 rr
2452
+ duor2 d uo2 rr
2453
+ duor3 d uo3 rr
2454
+ duor4 d uo4 rr
2455
+ duor5 d uo5 rr
2456
+ er1 e1 rr
2457
+ er2 e2 rr
2458
+ er3 e3 rr
2459
+ er4 e4 rr
2460
+ er5 e5 rr
2461
+ eir1 ei1 rr
2462
+ eir2 ei2 rr
2463
+ eir3 ei3 rr
2464
+ eir4 ei4 rr
2465
+ eir5 ei5 rr
2466
+ enr1 en1 rr
2467
+ enr2 en2 rr
2468
+ enr3 en3 rr
2469
+ enr4 en4 rr
2470
+ enr5 en5 rr
2471
+ engr1 eng1 rr
2472
+ engr2 eng2 rr
2473
+ engr3 eng3 rr
2474
+ engr4 eng4 rr
2475
+ engr5 eng5 rr
2476
+ far1 f a1 rr
2477
+ far2 f a2 rr
2478
+ far3 f a3 rr
2479
+ far4 f a4 rr
2480
+ far5 f a5 rr
2481
+ fanr1 f an1 rr
2482
+ fanr2 f an2 rr
2483
+ fanr3 f an3 rr
2484
+ fanr4 f an4 rr
2485
+ fanr5 f an5 rr
2486
+ fangr1 f ang1 rr
2487
+ fangr2 f ang2 rr
2488
+ fangr3 f ang3 rr
2489
+ fangr4 f ang4 rr
2490
+ fangr5 f ang5 rr
2491
+ feir1 f ei1 rr
2492
+ feir2 f ei2 rr
2493
+ feir3 f ei3 rr
2494
+ feir4 f ei4 rr
2495
+ feir5 f ei5 rr
2496
+ fenr1 f en1 rr
2497
+ fenr2 f en2 rr
2498
+ fenr3 f en3 rr
2499
+ fenr4 f en4 rr
2500
+ fenr5 f en5 rr
2501
+ fengr1 f eng1 rr
2502
+ fengr2 f eng2 rr
2503
+ fengr3 f eng3 rr
2504
+ fengr4 f eng4 rr
2505
+ fengr5 f eng5 rr
2506
+ for1 f o1 rr
2507
+ for2 f o2 rr
2508
+ for3 f o3 rr
2509
+ for4 f o4 rr
2510
+ for5 f o5 rr
2511
+ four1 f ou1 rr
2512
+ four2 f ou2 rr
2513
+ four3 f ou3 rr
2514
+ four4 f ou4 rr
2515
+ four5 f ou5 rr
2516
+ fur1 f u1 rr
2517
+ fur2 f u2 rr
2518
+ fur3 f u3 rr
2519
+ fur4 f u4 rr
2520
+ fur5 f u5 rr
2521
+ gar1 g a1 rr
2522
+ gar2 g a2 rr
2523
+ gar3 g a3 rr
2524
+ gar4 g a4 rr
2525
+ gar5 g a5 rr
2526
+ gair1 g ai1 rr
2527
+ gair2 g ai2 rr
2528
+ gair3 g ai3 rr
2529
+ gair4 g ai4 rr
2530
+ gair5 g ai5 rr
2531
+ ganr1 g an1 rr
2532
+ ganr2 g an2 rr
2533
+ ganr3 g an3 rr
2534
+ ganr4 g an4 rr
2535
+ ganr5 g an5 rr
2536
+ gangr1 g ang1 rr
2537
+ gangr2 g ang2 rr
2538
+ gangr3 g ang3 rr
2539
+ gangr4 g ang4 rr
2540
+ gangr5 g ang5 rr
2541
+ gaor1 g ao1 rr
2542
+ gaor2 g ao2 rr
2543
+ gaor3 g ao3 rr
2544
+ gaor4 g ao4 rr
2545
+ gaor5 g ao5 rr
2546
+ ger1 g e1 rr
2547
+ ger2 g e2 rr
2548
+ ger3 g e3 rr
2549
+ ger4 g e4 rr
2550
+ ger5 g e5 rr
2551
+ geir1 g ei1 rr
2552
+ geir2 g ei2 rr
2553
+ geir3 g ei3 rr
2554
+ geir4 g ei4 rr
2555
+ geir5 g ei5 rr
2556
+ genr1 g en1 rr
2557
+ genr2 g en2 rr
2558
+ genr3 g en3 rr
2559
+ genr4 g en4 rr
2560
+ genr5 g en5 rr
2561
+ gengr1 g eng1 rr
2562
+ gengr2 g eng2 rr
2563
+ gengr3 g eng3 rr
2564
+ gengr4 g eng4 rr
2565
+ gengr5 g eng5 rr
2566
+ gongr1 g ong1 rr
2567
+ gongr2 g ong2 rr
2568
+ gongr3 g ong3 rr
2569
+ gongr4 g ong4 rr
2570
+ gongr5 g ong5 rr
2571
+ gour1 g ou1 rr
2572
+ gour2 g ou2 rr
2573
+ gour3 g ou3 rr
2574
+ gour4 g ou4 rr
2575
+ gour5 g ou5 rr
2576
+ gur1 g u1 rr
2577
+ gur2 g u2 rr
2578
+ gur3 g u3 rr
2579
+ gur4 g u4 rr
2580
+ gur5 g u5 rr
2581
+ guar1 g ua1 rr
2582
+ guar2 g ua2 rr
2583
+ guar3 g ua3 rr
2584
+ guar4 g ua4 rr
2585
+ guar5 g ua5 rr
2586
+ guair1 g uai1 rr
2587
+ guair2 g uai2 rr
2588
+ guair3 g uai3 rr
2589
+ guair4 g uai4 rr
2590
+ guair5 g uai5 rr
2591
+ guanr1 g uan1 rr
2592
+ guanr2 g uan2 rr
2593
+ guanr3 g uan3 rr
2594
+ guanr4 g uan4 rr
2595
+ guanr5 g uan5 rr
2596
+ guangr1 g uang1 rr
2597
+ guangr2 g uang2 rr
2598
+ guangr3 g uang3 rr
2599
+ guangr4 g uang4 rr
2600
+ guangr5 g uang5 rr
2601
+ guir1 g uei1 rr
2602
+ guir2 g uei2 rr
2603
+ guir3 g uei3 rr
2604
+ guir4 g uei4 rr
2605
+ guir5 g uei5 rr
2606
+ gunr1 g uen1 rr
2607
+ gunr2 g uen2 rr
2608
+ gunr3 g uen3 rr
2609
+ gunr4 g uen4 rr
2610
+ gunr5 g uen5 rr
2611
+ guor1 g uo1 rr
2612
+ guor2 g uo2 rr
2613
+ guor3 g uo3 rr
2614
+ guor4 g uo4 rr
2615
+ guor5 g uo5 rr
2616
+ har1 h a1 rr
2617
+ har2 h a2 rr
2618
+ har3 h a3 rr
2619
+ har4 h a4 rr
2620
+ har5 h a5 rr
2621
+ hair1 h ai1 rr
2622
+ hair2 h ai2 rr
2623
+ hair3 h ai3 rr
2624
+ hair4 h ai4 rr
2625
+ hair5 h ai5 rr
2626
+ hanr1 h an1 rr
2627
+ hanr2 h an2 rr
2628
+ hanr3 h an3 rr
2629
+ hanr4 h an4 rr
2630
+ hanr5 h an5 rr
2631
+ hangr1 h ang1 rr
2632
+ hangr2 h ang2 rr
2633
+ hangr3 h ang3 rr
2634
+ hangr4 h ang4 rr
2635
+ hangr5 h ang5 rr
2636
+ haor1 h ao1 rr
2637
+ haor2 h ao2 rr
2638
+ haor3 h ao3 rr
2639
+ haor4 h ao4 rr
2640
+ haor5 h ao5 rr
2641
+ her1 h e1 rr
2642
+ her2 h e2 rr
2643
+ her3 h e3 rr
2644
+ her4 h e4 rr
2645
+ her5 h e5 rr
2646
+ heir1 h ei1 rr
2647
+ heir2 h ei2 rr
2648
+ heir3 h ei3 rr
2649
+ heir4 h ei4 rr
2650
+ heir5 h ei5 rr
2651
+ henr1 h en1 rr
2652
+ henr2 h en2 rr
2653
+ henr3 h en3 rr
2654
+ henr4 h en4 rr
2655
+ henr5 h en5 rr
2656
+ hengr1 h eng1 rr
2657
+ hengr2 h eng2 rr
2658
+ hengr3 h eng3 rr
2659
+ hengr4 h eng4 rr
2660
+ hengr5 h eng5 rr
2661
+ hongr1 h ong1 rr
2662
+ hongr2 h ong2 rr
2663
+ hongr3 h ong3 rr
2664
+ hongr4 h ong4 rr
2665
+ hongr5 h ong5 rr
2666
+ hour1 h ou1 rr
2667
+ hour2 h ou2 rr
2668
+ hour3 h ou3 rr
2669
+ hour4 h ou4 rr
2670
+ hour5 h ou5 rr
2671
+ hur1 h u1 rr
2672
+ hur2 h u2 rr
2673
+ hur3 h u3 rr
2674
+ hur4 h u4 rr
2675
+ hur5 h u5 rr
2676
+ huar1 h ua1 rr
2677
+ huar2 h ua2 rr
2678
+ huar3 h ua3 rr
2679
+ huar4 h ua4 rr
2680
+ huar5 h ua5 rr
2681
+ huair1 h uai1 rr
2682
+ huair2 h uai2 rr
2683
+ huair3 h uai3 rr
2684
+ huair4 h uai4 rr
2685
+ huair5 h uai5 rr
2686
+ huanr1 h uan1 rr
2687
+ huanr2 h uan2 rr
2688
+ huanr3 h uan3 rr
2689
+ huanr4 h uan4 rr
2690
+ huanr5 h uan5 rr
2691
+ huangr1 h uang1 rr
2692
+ huangr2 h uang2 rr
2693
+ huangr3 h uang3 rr
2694
+ huangr4 h uang4 rr
2695
+ huangr5 h uang5 rr
2696
+ huir1 h uei1 rr
2697
+ huir2 h uei2 rr
2698
+ huir3 h uei3 rr
2699
+ huir4 h uei4 rr
2700
+ huir5 h uei5 rr
2701
+ hunr1 h uen1 rr
2702
+ hunr2 h uen2 rr
2703
+ hunr3 h uen3 rr
2704
+ hunr4 h uen4 rr
2705
+ hunr5 h uen5 rr
2706
+ huor1 h uo1 rr
2707
+ huor2 h uo2 rr
2708
+ huor3 h uo3 rr
2709
+ huor4 h uo4 rr
2710
+ huor5 h uo5 rr
2711
+ jir1 j i1 rr
2712
+ jir2 j i2 rr
2713
+ jir3 j i3 rr
2714
+ jir4 j i4 rr
2715
+ jir5 j i5 rr
2716
+ jiar1 j ia1 rr
2717
+ jiar2 j ia2 rr
2718
+ jiar3 j ia3 rr
2719
+ jiar4 j ia4 rr
2720
+ jiar5 j ia5 rr
2721
+ jianr1 j ian1 rr
2722
+ jianr2 j ian2 rr
2723
+ jianr3 j ian3 rr
2724
+ jianr4 j ian4 rr
2725
+ jianr5 j ian5 rr
2726
+ jiangr1 j iang1 rr
2727
+ jiangr2 j iang2 rr
2728
+ jiangr3 j iang3 rr
2729
+ jiangr4 j iang4 rr
2730
+ jiangr5 j iang5 rr
2731
+ jiaor1 j iao1 rr
2732
+ jiaor2 j iao2 rr
2733
+ jiaor3 j iao3 rr
2734
+ jiaor4 j iao4 rr
2735
+ jiaor5 j iao5 rr
2736
+ jier1 j ie1 rr
2737
+ jier2 j ie2 rr
2738
+ jier3 j ie3 rr
2739
+ jier4 j ie4 rr
2740
+ jier5 j ie5 rr
2741
+ jinr1 j in1 rr
2742
+ jinr2 j in2 rr
2743
+ jinr3 j in3 rr
2744
+ jinr4 j in4 rr
2745
+ jinr5 j in5 rr
2746
+ jingr1 j ing1 rr
2747
+ jingr2 j ing2 rr
2748
+ jingr3 j ing3 rr
2749
+ jingr4 j ing4 rr
2750
+ jingr5 j ing5 rr
2751
+ jiongr1 j iong1 rr
2752
+ jiongr2 j iong2 rr
2753
+ jiongr3 j iong3 rr
2754
+ jiongr4 j iong4 rr
2755
+ jiongr5 j iong5 rr
2756
+ jiur1 j iou1 rr
2757
+ jiur2 j iou2 rr
2758
+ jiur3 j iou3 rr
2759
+ jiur4 j iou4 rr
2760
+ jiur5 j iou5 rr
2761
+ jur1 j v1 rr
2762
+ jur2 j v2 rr
2763
+ jur3 j v3 rr
2764
+ jur4 j v4 rr
2765
+ jur5 j v5 rr
2766
+ juanr1 j van1 rr
2767
+ juanr2 j van2 rr
2768
+ juanr3 j van3 rr
2769
+ juanr4 j van4 rr
2770
+ juanr5 j van5 rr
2771
+ juer1 j ve1 rr
2772
+ juer2 j ve2 rr
2773
+ juer3 j ve3 rr
2774
+ juer4 j ve4 rr
2775
+ juer5 j ve5 rr
2776
+ junr1 j vn1 rr
2777
+ junr2 j vn2 rr
2778
+ junr3 j vn3 rr
2779
+ junr4 j vn4 rr
2780
+ junr5 j vn5 rr
2781
+ kar1 k a1 rr
2782
+ kar2 k a2 rr
2783
+ kar3 k a3 rr
2784
+ kar4 k a4 rr
2785
+ kar5 k a5 rr
2786
+ kair1 k ai1 rr
2787
+ kair2 k ai2 rr
2788
+ kair3 k ai3 rr
2789
+ kair4 k ai4 rr
2790
+ kair5 k ai5 rr
2791
+ kanr1 k an1 rr
2792
+ kanr2 k an2 rr
2793
+ kanr3 k an3 rr
2794
+ kanr4 k an4 rr
2795
+ kanr5 k an5 rr
2796
+ kangr1 k ang1 rr
2797
+ kangr2 k ang2 rr
2798
+ kangr3 k ang3 rr
2799
+ kangr4 k ang4 rr
2800
+ kangr5 k ang5 rr
2801
+ kaor1 k ao1 rr
2802
+ kaor2 k ao2 rr
2803
+ kaor3 k ao3 rr
2804
+ kaor4 k ao4 rr
2805
+ kaor5 k ao5 rr
2806
+ ker1 k e1 rr
2807
+ ker2 k e2 rr
2808
+ ker3 k e3 rr
2809
+ ker4 k e4 rr
2810
+ ker5 k e5 rr
2811
+ keir1 k ei1 rr
2812
+ keir2 k ei2 rr
2813
+ keir3 k ei3 rr
2814
+ keir4 k ei4 rr
2815
+ keir5 k ei5 rr
2816
+ kenr1 k en1 rr
2817
+ kenr2 k en2 rr
2818
+ kenr3 k en3 rr
2819
+ kenr4 k en4 rr
2820
+ kenr5 k en5 rr
2821
+ kengr1 k eng1 rr
2822
+ kengr2 k eng2 rr
2823
+ kengr3 k eng3 rr
2824
+ kengr4 k eng4 rr
2825
+ kengr5 k eng5 rr
2826
+ kongr1 k ong1 rr
2827
+ kongr2 k ong2 rr
2828
+ kongr3 k ong3 rr
2829
+ kongr4 k ong4 rr
2830
+ kongr5 k ong5 rr
2831
+ kour1 k ou1 rr
2832
+ kour2 k ou2 rr
2833
+ kour3 k ou3 rr
2834
+ kour4 k ou4 rr
2835
+ kour5 k ou5 rr
2836
+ kur1 k u1 rr
2837
+ kur2 k u2 rr
2838
+ kur3 k u3 rr
2839
+ kur4 k u4 rr
2840
+ kur5 k u5 rr
2841
+ kuar1 k ua1 rr
2842
+ kuar2 k ua2 rr
2843
+ kuar3 k ua3 rr
2844
+ kuar4 k ua4 rr
2845
+ kuar5 k ua5 rr
2846
+ kuair1 k uai1 rr
2847
+ kuair2 k uai2 rr
2848
+ kuair3 k uai3 rr
2849
+ kuair4 k uai4 rr
2850
+ kuair5 k uai5 rr
2851
+ kuanr1 k uan1 rr
2852
+ kuanr2 k uan2 rr
2853
+ kuanr3 k uan3 rr
2854
+ kuanr4 k uan4 rr
2855
+ kuanr5 k uan5 rr
2856
+ kuangr1 k uang1 rr
2857
+ kuangr2 k uang2 rr
2858
+ kuangr3 k uang3 rr
2859
+ kuangr4 k uang4 rr
2860
+ kuangr5 k uang5 rr
2861
+ kuir1 k uei1 rr
2862
+ kuir2 k uei2 rr
2863
+ kuir3 k uei3 rr
2864
+ kuir4 k uei4 rr
2865
+ kuir5 k uei5 rr
2866
+ kunr1 k uen1 rr
2867
+ kunr2 k uen2 rr
2868
+ kunr3 k uen3 rr
2869
+ kunr4 k uen4 rr
2870
+ kunr5 k uen5 rr
2871
+ kuor1 k uo1 rr
2872
+ kuor2 k uo2 rr
2873
+ kuor3 k uo3 rr
2874
+ kuor4 k uo4 rr
2875
+ kuor5 k uo5 rr
2876
+ lar1 l a1 rr
2877
+ lar2 l a2 rr
2878
+ lar3 l a3 rr
2879
+ lar4 l a4 rr
2880
+ lar5 l a5 rr
2881
+ lair1 l ai1 rr
2882
+ lair2 l ai2 rr
2883
+ lair3 l ai3 rr
2884
+ lair4 l ai4 rr
2885
+ lair5 l ai5 rr
2886
+ lanr1 l an1 rr
2887
+ lanr2 l an2 rr
2888
+ lanr3 l an3 rr
2889
+ lanr4 l an4 rr
2890
+ lanr5 l an5 rr
2891
+ langr1 l ang1 rr
2892
+ langr2 l ang2 rr
2893
+ langr3 l ang3 rr
2894
+ langr4 l ang4 rr
2895
+ langr5 l ang5 rr
2896
+ laor1 l ao1 rr
2897
+ laor2 l ao2 rr
2898
+ laor3 l ao3 rr
2899
+ laor4 l ao4 rr
2900
+ laor5 l ao5 rr
2901
+ ler1 l e1 rr
2902
+ ler2 l e2 rr
2903
+ ler3 l e3 rr
2904
+ ler4 l e4 rr
2905
+ ler5 l e5 rr
2906
+ leir1 l ei1 rr
2907
+ leir2 l ei2 rr
2908
+ leir3 l ei3 rr
2909
+ leir4 l ei4 rr
2910
+ leir5 l ei5 rr
2911
+ lengr1 l eng1 rr
2912
+ lengr2 l eng2 rr
2913
+ lengr3 l eng3 rr
2914
+ lengr4 l eng4 rr
2915
+ lengr5 l eng5 rr
2916
+ lir1 l i1 rr
2917
+ lir2 l i2 rr
2918
+ lir3 l i3 rr
2919
+ lir4 l i4 rr
2920
+ lir5 l i5 rr
2921
+ liar1 l ia1 rr
2922
+ liar2 l ia2 rr
2923
+ liar3 l ia3 rr
2924
+ liar4 l ia4 rr
2925
+ liar5 l ia5 rr
2926
+ lianr1 l ian1 rr
2927
+ lianr2 l ian2 rr
2928
+ lianr3 l ian3 rr
2929
+ lianr4 l ian4 rr
2930
+ lianr5 l ian5 rr
2931
+ liangr1 l iang1 rr
2932
+ liangr2 l iang2 rr
2933
+ liangr3 l iang3 rr
2934
+ liangr4 l iang4 rr
2935
+ liangr5 l iang5 rr
2936
+ liaor1 l iao1 rr
2937
+ liaor2 l iao2 rr
2938
+ liaor3 l iao3 rr
2939
+ liaor4 l iao4 rr
2940
+ liaor5 l iao5 rr
2941
+ lier1 l ie1 rr
2942
+ lier2 l ie2 rr
2943
+ lier3 l ie3 rr
2944
+ lier4 l ie4 rr
2945
+ lier5 l ie5 rr
2946
+ linr1 l in1 rr
2947
+ linr2 l in2 rr
2948
+ linr3 l in3 rr
2949
+ linr4 l in4 rr
2950
+ linr5 l in5 rr
2951
+ lingr1 l ing1 rr
2952
+ lingr2 l ing2 rr
2953
+ lingr3 l ing3 rr
2954
+ lingr4 l ing4 rr
2955
+ lingr5 l ing5 rr
2956
+ liur1 l iou1 rr
2957
+ liur2 l iou2 rr
2958
+ liur3 l iou3 rr
2959
+ liur4 l iou4 rr
2960
+ liur5 l iou5 rr
2961
+ lor1 l o1 rr
2962
+ lor2 l o2 rr
2963
+ lor3 l o3 rr
2964
+ lor4 l o4 rr
2965
+ lor5 l o5 rr
2966
+ longr1 l ong1 rr
2967
+ longr2 l ong2 rr
2968
+ longr3 l ong3 rr
2969
+ longr4 l ong4 rr
2970
+ longr5 l ong5 rr
2971
+ lour1 l ou1 rr
2972
+ lour2 l ou2 rr
2973
+ lour3 l ou3 rr
2974
+ lour4 l ou4 rr
2975
+ lour5 l ou5 rr
2976
+ lur1 l u1 rr
2977
+ lur2 l u2 rr
2978
+ lur3 l u3 rr
2979
+ lur4 l u4 rr
2980
+ lur5 l u5 rr
2981
+ luanr1 l uan1 rr
2982
+ luanr2 l uan2 rr
2983
+ luanr3 l uan3 rr
2984
+ luanr4 l uan4 rr
2985
+ luanr5 l uan5 rr
2986
+ luer1 l ve1 rr
2987
+ luer2 l ve2 rr
2988
+ luer3 l ve3 rr
2989
+ luer4 l ve4 rr
2990
+ luer5 l ve5 rr
2991
+ lver1 l ve1 rr
2992
+ lver2 l ve2 rr
2993
+ lver3 l ve3 rr
2994
+ lver4 l ve4 rr
2995
+ lver5 l ve5 rr
2996
+ lunr1 l uen1 rr
2997
+ lunr2 l uen2 rr
2998
+ lunr3 l uen3 rr
2999
+ lunr4 l uen4 rr
3000
+ lunr5 l uen5 rr
3001
+ luor1 l uo1 rr
3002
+ luor2 l uo2 rr
3003
+ luor3 l uo3 rr
3004
+ luor4 l uo4 rr
3005
+ luor5 l uo5 rr
3006
+ lvr1 l v1 rr
3007
+ lvr2 l v2 rr
3008
+ lvr3 l v3 rr
3009
+ lvr4 l v4 rr
3010
+ lvr5 l v5 rr
3011
+ mar1 m a1 rr
3012
+ mar2 m a2 rr
3013
+ mar3 m a3 rr
3014
+ mar4 m a4 rr
3015
+ mar5 m a5 rr
3016
+ mair1 m ai1 rr
3017
+ mair2 m ai2 rr
3018
+ mair3 m ai3 rr
3019
+ mair4 m ai4 rr
3020
+ mair5 m ai5 rr
3021
+ manr1 m an1 rr
3022
+ manr2 m an2 rr
3023
+ manr3 m an3 rr
3024
+ manr4 m an4 rr
3025
+ manr5 m an5 rr
3026
+ mangr1 m ang1 rr
3027
+ mangr2 m ang2 rr
3028
+ mangr3 m ang3 rr
3029
+ mangr4 m ang4 rr
3030
+ mangr5 m ang5 rr
3031
+ maor1 m ao1 rr
3032
+ maor2 m ao2 rr
3033
+ maor3 m ao3 rr
3034
+ maor4 m ao4 rr
3035
+ maor5 m ao5 rr
3036
+ mer1 m e1 rr
3037
+ mer2 m e2 rr
3038
+ mer3 m e3 rr
3039
+ mer4 m e4 rr
3040
+ mer5 m e5 rr
3041
+ meir1 m ei1 rr
3042
+ meir2 m ei2 rr
3043
+ meir3 m ei3 rr
3044
+ meir4 m ei4 rr
3045
+ meir5 m ei5 rr
3046
+ menr1 m en1 rr
3047
+ menr2 m en2 rr
3048
+ menr3 m en3 rr
3049
+ menr4 m en4 rr
3050
+ menr5 m en5 rr
3051
+ mengr1 m eng1 rr
3052
+ mengr2 m eng2 rr
3053
+ mengr3 m eng3 rr
3054
+ mengr4 m eng4 rr
3055
+ mengr5 m eng5 rr
3056
+ mir1 m i1 rr
3057
+ mir2 m i2 rr
3058
+ mir3 m i3 rr
3059
+ mir4 m i4 rr
3060
+ mir5 m i5 rr
3061
+ mianr1 m ian1 rr
3062
+ mianr2 m ian2 rr
3063
+ mianr3 m ian3 rr
3064
+ mianr4 m ian4 rr
3065
+ mianr5 m ian5 rr
3066
+ miaor1 m iao1 rr
3067
+ miaor2 m iao2 rr
3068
+ miaor3 m iao3 rr
3069
+ miaor4 m iao4 rr
3070
+ miaor5 m iao5 rr
3071
+ mier1 m ie1 rr
3072
+ mier2 m ie2 rr
3073
+ mier3 m ie3 rr
3074
+ mier4 m ie4 rr
3075
+ mier5 m ie5 rr
3076
+ minr1 m in1 rr
3077
+ minr2 m in2 rr
3078
+ minr3 m in3 rr
3079
+ minr4 m in4 rr
3080
+ minr5 m in5 rr
3081
+ mingr1 m ing1 rr
3082
+ mingr2 m ing2 rr
3083
+ mingr3 m ing3 rr
3084
+ mingr4 m ing4 rr
3085
+ mingr5 m ing5 rr
3086
+ miur1 m iou1 rr
3087
+ miur2 m iou2 rr
3088
+ miur3 m iou3 rr
3089
+ miur4 m iou4 rr
3090
+ miur5 m iou5 rr
3091
+ mor1 m o1 rr
3092
+ mor2 m o2 rr
3093
+ mor3 m o3 rr
3094
+ mor4 m o4 rr
3095
+ mor5 m o5 rr
3096
+ mour1 m ou1 rr
3097
+ mour2 m ou2 rr
3098
+ mour3 m ou3 rr
3099
+ mour4 m ou4 rr
3100
+ mour5 m ou5 rr
3101
+ mur1 m u1 rr
3102
+ mur2 m u2 rr
3103
+ mur3 m u3 rr
3104
+ mur4 m u4 rr
3105
+ mur5 m u5 rr
3106
+ nar1 n a1 rr
3107
+ nar2 n a2 rr
3108
+ nar3 n a3 rr
3109
+ nar4 n a4 rr
3110
+ nar5 n a5 rr
3111
+ nair1 n ai1 rr
3112
+ nair2 n ai2 rr
3113
+ nair3 n ai3 rr
3114
+ nair4 n ai4 rr
3115
+ nair5 n ai5 rr
3116
+ nanr1 n an1 rr
3117
+ nanr2 n an2 rr
3118
+ nanr3 n an3 rr
3119
+ nanr4 n an4 rr
3120
+ nanr5 n an5 rr
3121
+ nangr1 n ang1 rr
3122
+ nangr2 n ang2 rr
3123
+ nangr3 n ang3 rr
3124
+ nangr4 n ang4 rr
3125
+ nangr5 n ang5 rr
3126
+ naor1 n ao1 rr
3127
+ naor2 n ao2 rr
3128
+ naor3 n ao3 rr
3129
+ naor4 n ao4 rr
3130
+ naor5 n ao5 rr
3131
+ ner1 n e1 rr
3132
+ ner2 n e2 rr
3133
+ ner3 n e3 rr
3134
+ ner4 n e4 rr
3135
+ ner5 n e5 rr
3136
+ neir1 n ei1 rr
3137
+ neir2 n ei2 rr
3138
+ neir3 n ei3 rr
3139
+ neir4 n ei4 rr
3140
+ neir5 n ei5 rr
3141
+ nenr1 n en1 rr
3142
+ nenr2 n en2 rr
3143
+ nenr3 n en3 rr
3144
+ nenr4 n en4 rr
3145
+ nenr5 n en5 rr
3146
+ nengr1 n eng1 rr
3147
+ nengr2 n eng2 rr
3148
+ nengr3 n eng3 rr
3149
+ nengr4 n eng4 rr
3150
+ nengr5 n eng5 rr
3151
+ nir1 n i1 rr
3152
+ nir2 n i2 rr
3153
+ nir3 n i3 rr
3154
+ nir4 n i4 rr
3155
+ nir5 n i5 rr
3156
+ nianr1 n ian1 rr
3157
+ nianr2 n ian2 rr
3158
+ nianr3 n ian3 rr
3159
+ nianr4 n ian4 rr
3160
+ nianr5 n ian5 rr
3161
+ niangr1 n iang1 rr
3162
+ niangr2 n iang2 rr
3163
+ niangr3 n iang3 rr
3164
+ niangr4 n iang4 rr
3165
+ niangr5 n iang5 rr
3166
+ niaor1 n iao1 rr
3167
+ niaor2 n iao2 rr
3168
+ niaor3 n iao3 rr
3169
+ niaor4 n iao4 rr
3170
+ niaor5 n iao5 rr
3171
+ nier1 n ie1 rr
3172
+ nier2 n ie2 rr
3173
+ nier3 n ie3 rr
3174
+ nier4 n ie4 rr
3175
+ nier5 n ie5 rr
3176
+ ninr1 n in1 rr
3177
+ ninr2 n in2 rr
3178
+ ninr3 n in3 rr
3179
+ ninr4 n in4 rr
3180
+ ninr5 n in5 rr
3181
+ ningr1 n ing1 rr
3182
+ ningr2 n ing2 rr
3183
+ ningr3 n ing3 rr
3184
+ ningr4 n ing4 rr
3185
+ ningr5 n ing5 rr
3186
+ niur1 n iou1 rr
3187
+ niur2 n iou2 rr
3188
+ niur3 n iou3 rr
3189
+ niur4 n iou4 rr
3190
+ niur5 n iou5 rr
3191
+ nongr1 n ong1 rr
3192
+ nongr2 n ong2 rr
3193
+ nongr3 n ong3 rr
3194
+ nongr4 n ong4 rr
3195
+ nongr5 n ong5 rr
3196
+ nour1 n ou1 rr
3197
+ nour2 n ou2 rr
3198
+ nour3 n ou3 rr
3199
+ nour4 n ou4 rr
3200
+ nour5 n ou5 rr
3201
+ nur1 n u1 rr
3202
+ nur2 n u2 rr
3203
+ nur3 n u3 rr
3204
+ nur4 n u4 rr
3205
+ nur5 n u5 rr
3206
+ nuanr1 n uan1 rr
3207
+ nuanr2 n uan2 rr
3208
+ nuanr3 n uan3 rr
3209
+ nuanr4 n uan4 rr
3210
+ nuanr5 n uan5 rr
3211
+ nuer1 n ve1 rr
3212
+ nuer2 n ve2 rr
3213
+ nuer3 n ve3 rr
3214
+ nuer4 n ve4 rr
3215
+ nuer5 n ve5 rr
3216
+ nver1 n ve1 rr
3217
+ nver2 n ve2 rr
3218
+ nver3 n ve3 rr
3219
+ nver4 n ve4 rr
3220
+ nver5 n ve5 rr
3221
+ nuor1 n uo1 rr
3222
+ nuor2 n uo2 rr
3223
+ nuor3 n uo3 rr
3224
+ nuor4 n uo4 rr
3225
+ nuor5 n uo5 rr
3226
+ nvr1 n v1 rr
3227
+ nvr2 n v2 rr
3228
+ nvr3 n v3 rr
3229
+ nvr4 n v4 rr
3230
+ nvr5 n v5 rr
3231
+ or1 o1 rr
3232
+ or2 o2 rr
3233
+ or3 o3 rr
3234
+ or4 o4 rr
3235
+ or5 o5 rr
3236
+ our1 ou1 rr
3237
+ our2 ou2 rr
3238
+ our3 ou3 rr
3239
+ our4 ou4 rr
3240
+ our5 ou5 rr
3241
+ par1 p a1 rr
3242
+ par2 p a2 rr
3243
+ par3 p a3 rr
3244
+ par4 p a4 rr
3245
+ par5 p a5 rr
3246
+ pair1 p ai1 rr
3247
+ pair2 p ai2 rr
3248
+ pair3 p ai3 rr
3249
+ pair4 p ai4 rr
3250
+ pair5 p ai5 rr
3251
+ panr1 p an1 rr
3252
+ panr2 p an2 rr
3253
+ panr3 p an3 rr
3254
+ panr4 p an4 rr
3255
+ panr5 p an5 rr
3256
+ pangr1 p ang1 rr
3257
+ pangr2 p ang2 rr
3258
+ pangr3 p ang3 rr
3259
+ pangr4 p ang4 rr
3260
+ pangr5 p ang5 rr
3261
+ paor1 p ao1 rr
3262
+ paor2 p ao2 rr
3263
+ paor3 p ao3 rr
3264
+ paor4 p ao4 rr
3265
+ paor5 p ao5 rr
3266
+ peir1 p ei1 rr
3267
+ peir2 p ei2 rr
3268
+ peir3 p ei3 rr
3269
+ peir4 p ei4 rr
3270
+ peir5 p ei5 rr
3271
+ penr1 p en1 rr
3272
+ penr2 p en2 rr
3273
+ penr3 p en3 rr
3274
+ penr4 p en4 rr
3275
+ penr5 p en5 rr
3276
+ pengr1 p eng1 rr
3277
+ pengr2 p eng2 rr
3278
+ pengr3 p eng3 rr
3279
+ pengr4 p eng4 rr
3280
+ pengr5 p eng5 rr
3281
+ pir1 p i1 rr
3282
+ pir2 p i2 rr
3283
+ pir3 p i3 rr
3284
+ pir4 p i4 rr
3285
+ pir5 p i5 rr
3286
+ pianr1 p ian1 rr
3287
+ pianr2 p ian2 rr
3288
+ pianr3 p ian3 rr
3289
+ pianr4 p ian4 rr
3290
+ pianr5 p ian5 rr
3291
+ piaor1 p iao1 rr
3292
+ piaor2 p iao2 rr
3293
+ piaor3 p iao3 rr
3294
+ piaor4 p iao4 rr
3295
+ piaor5 p iao5 rr
3296
+ pier1 p ie1 rr
3297
+ pier2 p ie2 rr
3298
+ pier3 p ie3 rr
3299
+ pier4 p ie4 rr
3300
+ pier5 p ie5 rr
3301
+ pinr1 p in1 rr
3302
+ pinr2 p in2 rr
3303
+ pinr3 p in3 rr
3304
+ pinr4 p in4 rr
3305
+ pinr5 p in5 rr
3306
+ pingr1 p ing1 rr
3307
+ pingr2 p ing2 rr
3308
+ pingr3 p ing3 rr
3309
+ pingr4 p ing4 rr
3310
+ pingr5 p ing5 rr
3311
+ por1 p o1 rr
3312
+ por2 p o2 rr
3313
+ por3 p o3 rr
3314
+ por4 p o4 rr
3315
+ por5 p o5 rr
3316
+ pour1 p ou1 rr
3317
+ pour2 p ou2 rr
3318
+ pour3 p ou3 rr
3319
+ pour4 p ou4 rr
3320
+ pour5 p ou5 rr
3321
+ pur1 p u1 rr
3322
+ pur2 p u2 rr
3323
+ pur3 p u3 rr
3324
+ pur4 p u4 rr
3325
+ pur5 p u5 rr
3326
+ qir1 q i1 rr
3327
+ qir2 q i2 rr
3328
+ qir3 q i3 rr
3329
+ qir4 q i4 rr
3330
+ qir5 q i5 rr
3331
+ qiar1 q ia1 rr
3332
+ qiar2 q ia2 rr
3333
+ qiar3 q ia3 rr
3334
+ qiar4 q ia4 rr
3335
+ qiar5 q ia5 rr
3336
+ qianr1 q ian1 rr
3337
+ qianr2 q ian2 rr
3338
+ qianr3 q ian3 rr
3339
+ qianr4 q ian4 rr
3340
+ qianr5 q ian5 rr
3341
+ qiangr1 q iang1 rr
3342
+ qiangr2 q iang2 rr
3343
+ qiangr3 q iang3 rr
3344
+ qiangr4 q iang4 rr
3345
+ qiangr5 q iang5 rr
3346
+ qiaor1 q iao1 rr
3347
+ qiaor2 q iao2 rr
3348
+ qiaor3 q iao3 rr
3349
+ qiaor4 q iao4 rr
3350
+ qiaor5 q iao5 rr
3351
+ qier1 q ie1 rr
3352
+ qier2 q ie2 rr
3353
+ qier3 q ie3 rr
3354
+ qier4 q ie4 rr
3355
+ qier5 q ie5 rr
3356
+ qinr1 q in1 rr
3357
+ qinr2 q in2 rr
3358
+ qinr3 q in3 rr
3359
+ qinr4 q in4 rr
3360
+ qinr5 q in5 rr
3361
+ qingr1 q ing1 rr
3362
+ qingr2 q ing2 rr
3363
+ qingr3 q ing3 rr
3364
+ qingr4 q ing4 rr
3365
+ qingr5 q ing5 rr
3366
+ qiongr1 q iong1 rr
3367
+ qiongr2 q iong2 rr
3368
+ qiongr3 q iong3 rr
3369
+ qiongr4 q iong4 rr
3370
+ qiongr5 q iong5 rr
3371
+ qiur1 q iou1 rr
3372
+ qiur2 q iou2 rr
3373
+ qiur3 q iou3 rr
3374
+ qiur4 q iou4 rr
3375
+ qiur5 q iou5 rr
3376
+ qur1 q v1 rr
3377
+ qur2 q v2 rr
3378
+ qur3 q v3 rr
3379
+ qur4 q v4 rr
3380
+ qur5 q v5 rr
3381
+ quanr1 q van1 rr
3382
+ quanr2 q van2 rr
3383
+ quanr3 q van3 rr
3384
+ quanr4 q van4 rr
3385
+ quanr5 q van5 rr
3386
+ quer1 q ve1 rr
3387
+ quer2 q ve2 rr
3388
+ quer3 q ve3 rr
3389
+ quer4 q ve4 rr
3390
+ quer5 q ve5 rr
3391
+ qunr1 q vn1 rr
3392
+ qunr2 q vn2 rr
3393
+ qunr3 q vn3 rr
3394
+ qunr4 q vn4 rr
3395
+ qunr5 q vn5 rr
3396
+ ranr1 r an1 rr
3397
+ ranr2 r an2 rr
3398
+ ranr3 r an3 rr
3399
+ ranr4 r an4 rr
3400
+ ranr5 r an5 rr
3401
+ rangr1 r ang1 rr
3402
+ rangr2 r ang2 rr
3403
+ rangr3 r ang3 rr
3404
+ rangr4 r ang4 rr
3405
+ rangr5 r ang5 rr
3406
+ raor1 r ao1 rr
3407
+ raor2 r ao2 rr
3408
+ raor3 r ao3 rr
3409
+ raor4 r ao4 rr
3410
+ raor5 r ao5 rr
3411
+ rer1 r e1 rr
3412
+ rer2 r e2 rr
3413
+ rer3 r e3 rr
3414
+ rer4 r e4 rr
3415
+ rer5 r e5 rr
3416
+ renr1 r en1 rr
3417
+ renr2 r en2 rr
3418
+ renr3 r en3 rr
3419
+ renr4 r en4 rr
3420
+ renr5 r en5 rr
3421
+ rengr1 r eng1 rr
3422
+ rengr2 r eng2 rr
3423
+ rengr3 r eng3 rr
3424
+ rengr4 r eng4 rr
3425
+ rengr5 r eng5 rr
3426
+ rir1 r iii1 rr
3427
+ rir2 r iii2 rr
3428
+ rir3 r iii3 rr
3429
+ rir4 r iii4 rr
3430
+ rir5 r iii5 rr
3431
+ rongr1 r ong1 rr
3432
+ rongr2 r ong2 rr
3433
+ rongr3 r ong3 rr
3434
+ rongr4 r ong4 rr
3435
+ rongr5 r ong5 rr
3436
+ rour1 r ou1 rr
3437
+ rour2 r ou2 rr
3438
+ rour3 r ou3 rr
3439
+ rour4 r ou4 rr
3440
+ rour5 r ou5 rr
3441
+ rur1 r u1 rr
3442
+ rur2 r u2 rr
3443
+ rur3 r u3 rr
3444
+ rur4 r u4 rr
3445
+ rur5 r u5 rr
3446
+ ruar1 r ua1 rr
3447
+ ruar2 r ua2 rr
3448
+ ruar3 r ua3 rr
3449
+ ruar4 r ua4 rr
3450
+ ruar5 r ua5 rr
3451
+ ruanr1 r uan1 rr
3452
+ ruanr2 r uan2 rr
3453
+ ruanr3 r uan3 rr
3454
+ ruanr4 r uan4 rr
3455
+ ruanr5 r uan5 rr
3456
+ ruir1 r uei1 rr
3457
+ ruir2 r uei2 rr
3458
+ ruir3 r uei3 rr
3459
+ ruir4 r uei4 rr
3460
+ ruir5 r uei5 rr
3461
+ runr1 r uen1 rr
3462
+ runr2 r uen2 rr
3463
+ runr3 r uen3 rr
3464
+ runr4 r uen4 rr
3465
+ runr5 r uen5 rr
3466
+ ruor1 r uo1 rr
3467
+ ruor2 r uo2 rr
3468
+ ruor3 r uo3 rr
3469
+ ruor4 r uo4 rr
3470
+ ruor5 r uo5 rr
3471
+ sar1 s a1 rr
3472
+ sar2 s a2 rr
3473
+ sar3 s a3 rr
3474
+ sar4 s a4 rr
3475
+ sar5 s a5 rr
3476
+ sair1 s ai1 rr
3477
+ sair2 s ai2 rr
3478
+ sair3 s ai3 rr
3479
+ sair4 s ai4 rr
3480
+ sair5 s ai5 rr
3481
+ sanr1 s an1 rr
3482
+ sanr2 s an2 rr
3483
+ sanr3 s an3 rr
3484
+ sanr4 s an4 rr
3485
+ sanr5 s an5 rr
3486
+ sangr1 s ang1 rr
3487
+ sangr2 s ang2 rr
3488
+ sangr3 s ang3 rr
3489
+ sangr4 s ang4 rr
3490
+ sangr5 s ang5 rr
3491
+ saor1 s ao1 rr
3492
+ saor2 s ao2 rr
3493
+ saor3 s ao3 rr
3494
+ saor4 s ao4 rr
3495
+ saor5 s ao5 rr
3496
+ ser1 s e1 rr
3497
+ ser2 s e2 rr
3498
+ ser3 s e3 rr
3499
+ ser4 s e4 rr
3500
+ ser5 s e5 rr
3501
+ senr1 s en1 rr
3502
+ senr2 s en2 rr
3503
+ senr3 s en3 rr
3504
+ senr4 s en4 rr
3505
+ senr5 s en5 rr
3506
+ sengr1 s eng1 rr
3507
+ sengr2 s eng2 rr
3508
+ sengr3 s eng3 rr
3509
+ sengr4 s eng4 rr
3510
+ sengr5 s eng5 rr
3511
+ shar1 sh a1 rr
3512
+ shar2 sh a2 rr
3513
+ shar3 sh a3 rr
3514
+ shar4 sh a4 rr
3515
+ shar5 sh a5 rr
3516
+ shair1 sh ai1 rr
3517
+ shair2 sh ai2 rr
3518
+ shair3 sh ai3 rr
3519
+ shair4 sh ai4 rr
3520
+ shair5 sh ai5 rr
3521
+ shanr1 sh an1 rr
3522
+ shanr2 sh an2 rr
3523
+ shanr3 sh an3 rr
3524
+ shanr4 sh an4 rr
3525
+ shanr5 sh an5 rr
3526
+ shangr1 sh ang1 rr
3527
+ shangr2 sh ang2 rr
3528
+ shangr3 sh ang3 rr
3529
+ shangr4 sh ang4 rr
3530
+ shangr5 sh ang5 rr
3531
+ shaor1 sh ao1 rr
3532
+ shaor2 sh ao2 rr
3533
+ shaor3 sh ao3 rr
3534
+ shaor4 sh ao4 rr
3535
+ shaor5 sh ao5 rr
3536
+ sher1 sh e1 rr
3537
+ sher2 sh e2 rr
3538
+ sher3 sh e3 rr
3539
+ sher4 sh e4 rr
3540
+ sher5 sh e5 rr
3541
+ sheir1 sh ei1 rr
3542
+ sheir2 sh ei2 rr
3543
+ sheir3 sh ei3 rr
3544
+ sheir4 sh ei4 rr
3545
+ sheir5 sh ei5 rr
3546
+ shenr1 sh en1 rr
3547
+ shenr2 sh en2 rr
3548
+ shenr3 sh en3 rr
3549
+ shenr4 sh en4 rr
3550
+ shenr5 sh en5 rr
3551
+ shengr1 sh eng1 rr
3552
+ shengr2 sh eng2 rr
3553
+ shengr3 sh eng3 rr
3554
+ shengr4 sh eng4 rr
3555
+ shengr5 sh eng5 rr
3556
+ shir1 sh iii1 rr
3557
+ shir2 sh iii2 rr
3558
+ shir3 sh iii3 rr
3559
+ shir4 sh iii4 rr
3560
+ shir5 sh iii5 rr
3561
+ shour1 sh ou1 rr
3562
+ shour2 sh ou2 rr
3563
+ shour3 sh ou3 rr
3564
+ shour4 sh ou4 rr
3565
+ shour5 sh ou5 rr
3566
+ shur1 sh u1 rr
3567
+ shur2 sh u2 rr
3568
+ shur3 sh u3 rr
3569
+ shur4 sh u4 rr
3570
+ shur5 sh u5 rr
3571
+ shuar1 sh ua1 rr
3572
+ shuar2 sh ua2 rr
3573
+ shuar3 sh ua3 rr
3574
+ shuar4 sh ua4 rr
3575
+ shuar5 sh ua5 rr
3576
+ shuair1 sh uai1 rr
3577
+ shuair2 sh uai2 rr
3578
+ shuair3 sh uai3 rr
3579
+ shuair4 sh uai4 rr
3580
+ shuair5 sh uai5 rr
3581
+ shuanr1 sh uan1 rr
3582
+ shuanr2 sh uan2 rr
3583
+ shuanr3 sh uan3 rr
3584
+ shuanr4 sh uan4 rr
3585
+ shuanr5 sh uan5 rr
3586
+ shuangr1 sh uang1 rr
3587
+ shuangr2 sh uang2 rr
3588
+ shuangr3 sh uang3 rr
3589
+ shuangr4 sh uang4 rr
3590
+ shuangr5 sh uang5 rr
3591
+ shuir1 sh uei1 rr
3592
+ shuir2 sh uei2 rr
3593
+ shuir3 sh uei3 rr
3594
+ shuir4 sh uei4 rr
3595
+ shuir5 sh uei5 rr
3596
+ shunr1 sh uen1 rr
3597
+ shunr2 sh uen2 rr
3598
+ shunr3 sh uen3 rr
3599
+ shunr4 sh uen4 rr
3600
+ shunr5 sh uen5 rr
3601
+ shuor1 sh uo1 rr
3602
+ shuor2 sh uo2 rr
3603
+ shuor3 sh uo3 rr
3604
+ shuor4 sh uo4 rr
3605
+ shuor5 sh uo5 rr
3606
+ sir1 s ii1 rr
3607
+ sir2 s ii2 rr
3608
+ sir3 s ii3 rr
3609
+ sir4 s ii4 rr
3610
+ sir5 s ii5 rr
3611
+ songr1 s ong1 rr
3612
+ songr2 s ong2 rr
3613
+ songr3 s ong3 rr
3614
+ songr4 s ong4 rr
3615
+ songr5 s ong5 rr
3616
+ sour1 s ou1 rr
3617
+ sour2 s ou2 rr
3618
+ sour3 s ou3 rr
3619
+ sour4 s ou4 rr
3620
+ sour5 s ou5 rr
3621
+ sur1 s u1 rr
3622
+ sur2 s u2 rr
3623
+ sur3 s u3 rr
3624
+ sur4 s u4 rr
3625
+ sur5 s u5 rr
3626
+ suanr1 s uan1 rr
3627
+ suanr2 s uan2 rr
3628
+ suanr3 s uan3 rr
3629
+ suanr4 s uan4 rr
3630
+ suanr5 s uan5 rr
3631
+ suir1 s uei1 rr
3632
+ suir2 s uei2 rr
3633
+ suir3 s uei3 rr
3634
+ suir4 s uei4 rr
3635
+ suir5 s uei5 rr
3636
+ sunr1 s uen1 rr
3637
+ sunr2 s uen2 rr
3638
+ sunr3 s uen3 rr
3639
+ sunr4 s uen4 rr
3640
+ sunr5 s uen5 rr
3641
+ suor1 s uo1 rr
3642
+ suor2 s uo2 rr
3643
+ suor3 s uo3 rr
3644
+ suor4 s uo4 rr
3645
+ suor5 s uo5 rr
3646
+ tar1 t a1 rr
3647
+ tar2 t a2 rr
3648
+ tar3 t a3 rr
3649
+ tar4 t a4 rr
3650
+ tar5 t a5 rr
3651
+ tair1 t ai1 rr
3652
+ tair2 t ai2 rr
3653
+ tair3 t ai3 rr
3654
+ tair4 t ai4 rr
3655
+ tair5 t ai5 rr
3656
+ tanr1 t an1 rr
3657
+ tanr2 t an2 rr
3658
+ tanr3 t an3 rr
3659
+ tanr4 t an4 rr
3660
+ tanr5 t an5 rr
3661
+ tangr1 t ang1 rr
3662
+ tangr2 t ang2 rr
3663
+ tangr3 t ang3 rr
3664
+ tangr4 t ang4 rr
3665
+ tangr5 t ang5 rr
3666
+ taor1 t ao1 rr
3667
+ taor2 t ao2 rr
3668
+ taor3 t ao3 rr
3669
+ taor4 t ao4 rr
3670
+ taor5 t ao5 rr
3671
+ ter1 t e1 rr
3672
+ ter2 t e2 rr
3673
+ ter3 t e3 rr
3674
+ ter4 t e4 rr
3675
+ ter5 t e5 rr
3676
+ teir1 t ei1 rr
3677
+ teir2 t ei2 rr
3678
+ teir3 t ei3 rr
3679
+ teir4 t ei4 rr
3680
+ teir5 t ei5 rr
3681
+ tengr1 t eng1 rr
3682
+ tengr2 t eng2 rr
3683
+ tengr3 t eng3 rr
3684
+ tengr4 t eng4 rr
3685
+ tengr5 t eng5 rr
3686
+ tir1 t i1 rr
3687
+ tir2 t i2 rr
3688
+ tir3 t i3 rr
3689
+ tir4 t i4 rr
3690
+ tir5 t i5 rr
3691
+ tianr1 t ian1 rr
3692
+ tianr2 t ian2 rr
3693
+ tianr3 t ian3 rr
3694
+ tianr4 t ian4 rr
3695
+ tianr5 t ian5 rr
3696
+ tiaor1 t iao1 rr
3697
+ tiaor2 t iao2 rr
3698
+ tiaor3 t iao3 rr
3699
+ tiaor4 t iao4 rr
3700
+ tiaor5 t iao5 rr
3701
+ tier1 t ie1 rr
3702
+ tier2 t ie2 rr
3703
+ tier3 t ie3 rr
3704
+ tier4 t ie4 rr
3705
+ tier5 t ie5 rr
3706
+ tingr1 t ing1 rr
3707
+ tingr2 t ing2 rr
3708
+ tingr3 t ing3 rr
3709
+ tingr4 t ing4 rr
3710
+ tingr5 t ing5 rr
3711
+ tongr1 t ong1 rr
3712
+ tongr2 t ong2 rr
3713
+ tongr3 t ong3 rr
3714
+ tongr4 t ong4 rr
3715
+ tongr5 t ong5 rr
3716
+ tour1 t ou1 rr
3717
+ tour2 t ou2 rr
3718
+ tour3 t ou3 rr
3719
+ tour4 t ou4 rr
3720
+ tour5 t ou5 rr
3721
+ tur1 t u1 rr
3722
+ tur2 t u2 rr
3723
+ tur3 t u3 rr
3724
+ tur4 t u4 rr
3725
+ tur5 t u5 rr
3726
+ tuanr1 t uan1 rr
3727
+ tuanr2 t uan2 rr
3728
+ tuanr3 t uan3 rr
3729
+ tuanr4 t uan4 rr
3730
+ tuanr5 t uan5 rr
3731
+ tuir1 t uei1 rr
3732
+ tuir2 t uei2 rr
3733
+ tuir3 t uei3 rr
3734
+ tuir4 t uei4 rr
3735
+ tuir5 t uei5 rr
3736
+ tunr1 t uen1 rr
3737
+ tunr2 t uen2 rr
3738
+ tunr3 t uen3 rr
3739
+ tunr4 t uen4 rr
3740
+ tunr5 t uen5 rr
3741
+ tuor1 t uo1 rr
3742
+ tuor2 t uo2 rr
3743
+ tuor3 t uo3 rr
3744
+ tuor4 t uo4 rr
3745
+ tuor5 t uo5 rr
3746
+ war1 w ua1 rr
3747
+ war2 w ua2 rr
3748
+ war3 w ua3 rr
3749
+ war4 w ua4 rr
3750
+ war5 w ua5 rr
3751
+ wair1 w uai1 rr
3752
+ wair2 w uai2 rr
3753
+ wair3 w uai3 rr
3754
+ wair4 w uai4 rr
3755
+ wair5 w uai5 rr
3756
+ wanr1 w uan1 rr
3757
+ wanr2 w uan2 rr
3758
+ wanr3 w uan3 rr
3759
+ wanr4 w uan4 rr
3760
+ wanr5 w uan5 rr
3761
+ wangr1 w uang1 rr
3762
+ wangr2 w uang2 rr
3763
+ wangr3 w uang3 rr
3764
+ wangr4 w uang4 rr
3765
+ wangr5 w uang5 rr
3766
+ weir1 w uei1 rr
3767
+ weir2 w uei2 rr
3768
+ weir3 w uei3 rr
3769
+ weir4 w uei4 rr
3770
+ weir5 w uei5 rr
3771
+ wenr1 w uen1 rr
3772
+ wenr2 w uen2 rr
3773
+ wenr3 w uen3 rr
3774
+ wenr4 w uen4 rr
3775
+ wenr5 w uen5 rr
3776
+ wengr1 w uen1 rr
3777
+ wengr2 w uen2 rr
3778
+ wengr3 w uen3 rr
3779
+ wengr4 w uen4 rr
3780
+ wengr5 w uen5 rr
3781
+ wor1 w uo1 rr
3782
+ wor2 w uo2 rr
3783
+ wor3 w uo3 rr
3784
+ wor4 w uo4 rr
3785
+ wor5 w uo5 rr
3786
+ wur1 w u1 rr
3787
+ wur2 w u2 rr
3788
+ wur3 w u3 rr
3789
+ wur4 w u4 rr
3790
+ wur5 w u5 rr
3791
+ xir1 x i1 rr
3792
+ xir2 x i2 rr
3793
+ xir3 x i3 rr
3794
+ xir4 x i4 rr
3795
+ xir5 x i5 rr
3796
+ xiar1 x ia1 rr
3797
+ xiar2 x ia2 rr
3798
+ xiar3 x ia3 rr
3799
+ xiar4 x ia4 rr
3800
+ xiar5 x ia5 rr
3801
+ xianr1 x ian1 rr
3802
+ xianr2 x ian2 rr
3803
+ xianr3 x ian3 rr
3804
+ xianr4 x ian4 rr
3805
+ xianr5 x ian5 rr
3806
+ xiangr1 x iang1 rr
3807
+ xiangr2 x iang2 rr
3808
+ xiangr3 x iang3 rr
3809
+ xiangr4 x iang4 rr
3810
+ xiangr5 x iang5 rr
3811
+ xiaor1 x iao1 rr
3812
+ xiaor2 x iao2 rr
3813
+ xiaor3 x iao3 rr
3814
+ xiaor4 x iao4 rr
3815
+ xiaor5 x iao5 rr
3816
+ xier1 x ie1 rr
3817
+ xier2 x ie2 rr
3818
+ xier3 x ie3 rr
3819
+ xier4 x ie4 rr
3820
+ xier5 x ie5 rr
3821
+ xinr1 x in1 rr
3822
+ xinr2 x in2 rr
3823
+ xinr3 x in3 rr
3824
+ xinr4 x in4 rr
3825
+ xinr5 x in5 rr
3826
+ xingr1 x ing1 rr
3827
+ xingr2 x ing2 rr
3828
+ xingr3 x ing3 rr
3829
+ xingr4 x ing4 rr
3830
+ xingr5 x ing5 rr
3831
+ xiongr1 x iong1 rr
3832
+ xiongr2 x iong2 rr
3833
+ xiongr3 x iong3 rr
3834
+ xiongr4 x iong4 rr
3835
+ xiongr5 x iong5 rr
3836
+ xiur1 x iou1 rr
3837
+ xiur2 x iou2 rr
3838
+ xiur3 x iou3 rr
3839
+ xiur4 x iou4 rr
3840
+ xiur5 x iou5 rr
3841
+ xur1 x v1 rr
3842
+ xur2 x v2 rr
3843
+ xur3 x v3 rr
3844
+ xur4 x v4 rr
3845
+ xur5 x v5 rr
3846
+ xuanr1 x van1 rr
3847
+ xuanr2 x van2 rr
3848
+ xuanr3 x van3 rr
3849
+ xuanr4 x van4 rr
3850
+ xuanr5 x van5 rr
3851
+ xuer1 x ve1 rr
3852
+ xuer2 x ve2 rr
3853
+ xuer3 x ve3 rr
3854
+ xuer4 x ve4 rr
3855
+ xuer5 x ve5 rr
3856
+ xunr1 x vn1 rr
3857
+ xunr2 x vn2 rr
3858
+ xunr3 x vn3 rr
3859
+ xunr4 x vn4 rr
3860
+ xunr5 x vn5 rr
3861
+ yar1 y ia1 rr
3862
+ yar2 y ia2 rr
3863
+ yar3 y ia3 rr
3864
+ yar4 y ia4 rr
3865
+ yar5 y ia5 rr
3866
+ yanr1 y ian1 rr
3867
+ yanr2 y ian2 rr
3868
+ yanr3 y ian3 rr
3869
+ yanr4 y ian4 rr
3870
+ yanr5 y ian5 rr
3871
+ yangr1 y iang1 rr
3872
+ yangr2 y iang2 rr
3873
+ yangr3 y iang3 rr
3874
+ yangr4 y iang4 rr
3875
+ yangr5 y iang5 rr
3876
+ yaor1 y iao1 rr
3877
+ yaor2 y iao2 rr
3878
+ yaor3 y iao3 rr
3879
+ yaor4 y iao4 rr
3880
+ yaor5 y iao5 rr
3881
+ yer1 y ie1 rr
3882
+ yer2 y ie2 rr
3883
+ yer3 y ie3 rr
3884
+ yer4 y ie4 rr
3885
+ yer5 y ie5 rr
3886
+ yir1 y i1 rr
3887
+ yir2 y i2 rr
3888
+ yir3 y i3 rr
3889
+ yir4 y i4 rr
3890
+ yir5 y i5 rr
3891
+ yinr1 y in1 rr
3892
+ yinr2 y in2 rr
3893
+ yinr3 y in3 rr
3894
+ yinr4 y in4 rr
3895
+ yinr5 y in5 rr
3896
+ yingr1 y ing1 rr
3897
+ yingr2 y ing2 rr
3898
+ yingr3 y ing3 rr
3899
+ yingr4 y ing4 rr
3900
+ yingr5 y ing5 rr
3901
+ yor1 y iou1 rr
3902
+ yor2 y iou2 rr
3903
+ yor3 y iou3 rr
3904
+ yor4 y iou4 rr
3905
+ yor5 y iou5 rr
3906
+ yongr1 y iong1 rr
3907
+ yongr2 y iong2 rr
3908
+ yongr3 y iong3 rr
3909
+ yongr4 y iong4 rr
3910
+ yongr5 y iong5 rr
3911
+ your1 y iou1 rr
3912
+ your2 y iou2 rr
3913
+ your3 y iou3 rr
3914
+ your4 y iou4 rr
3915
+ your5 y iou5 rr
3916
+ yur1 y v1 rr
3917
+ yur2 y v2 rr
3918
+ yur3 y v3 rr
3919
+ yur4 y v4 rr
3920
+ yur5 y v5 rr
3921
+ yuanr1 y van1 rr
3922
+ yuanr2 y van2 rr
3923
+ yuanr3 y van3 rr
3924
+ yuanr4 y van4 rr
3925
+ yuanr5 y van5 rr
3926
+ yuer1 y ve1 rr
3927
+ yuer2 y ve2 rr
3928
+ yuer3 y ve3 rr
3929
+ yuer4 y ve4 rr
3930
+ yuer5 y ve5 rr
3931
+ yunr1 y vn1 rr
3932
+ yunr2 y vn2 rr
3933
+ yunr3 y vn3 rr
3934
+ yunr4 y vn4 rr
3935
+ yunr5 y vn5 rr
3936
+ zar1 z a1 rr
3937
+ zar2 z a2 rr
3938
+ zar3 z a3 rr
3939
+ zar4 z a4 rr
3940
+ zar5 z a5 rr
3941
+ zair1 z ai1 rr
3942
+ zair2 z ai2 rr
3943
+ zair3 z ai3 rr
3944
+ zair4 z ai4 rr
3945
+ zair5 z ai5 rr
3946
+ zanr1 z an1 rr
3947
+ zanr2 z an2 rr
3948
+ zanr3 z an3 rr
3949
+ zanr4 z an4 rr
3950
+ zanr5 z an5 rr
3951
+ zangr1 z ang1 rr
3952
+ zangr2 z ang2 rr
3953
+ zangr3 z ang3 rr
3954
+ zangr4 z ang4 rr
3955
+ zangr5 z ang5 rr
3956
+ zaor1 z ao1 rr
3957
+ zaor2 z ao2 rr
3958
+ zaor3 z ao3 rr
3959
+ zaor4 z ao4 rr
3960
+ zaor5 z ao5 rr
3961
+ zer1 z e1 rr
3962
+ zer2 z e2 rr
3963
+ zer3 z e3 rr
3964
+ zer4 z e4 rr
3965
+ zer5 z e5 rr
3966
+ zeir1 z ei1 rr
3967
+ zeir2 z ei2 rr
3968
+ zeir3 z ei3 rr
3969
+ zeir4 z ei4 rr
3970
+ zeir5 z ei5 rr
3971
+ zenr1 z en1 rr
3972
+ zenr2 z en2 rr
3973
+ zenr3 z en3 rr
3974
+ zenr4 z en4 rr
3975
+ zenr5 z en5 rr
3976
+ zengr1 z eng1 rr
3977
+ zengr2 z eng2 rr
3978
+ zengr3 z eng3 rr
3979
+ zengr4 z eng4 rr
3980
+ zengr5 z eng5 rr
3981
+ zhar1 zh a1 rr
3982
+ zhar2 zh a2 rr
3983
+ zhar3 zh a3 rr
3984
+ zhar4 zh a4 rr
3985
+ zhar5 zh a5 rr
3986
+ zhair1 zh ai1 rr
3987
+ zhair2 zh ai2 rr
3988
+ zhair3 zh ai3 rr
3989
+ zhair4 zh ai4 rr
3990
+ zhair5 zh ai5 rr
3991
+ zhanr1 zh an1 rr
3992
+ zhanr2 zh an2 rr
3993
+ zhanr3 zh an3 rr
3994
+ zhanr4 zh an4 rr
3995
+ zhanr5 zh an5 rr
3996
+ zhangr1 zh ang1 rr
3997
+ zhangr2 zh ang2 rr
3998
+ zhangr3 zh ang3 rr
3999
+ zhangr4 zh ang4 rr
4000
+ zhangr5 zh ang5 rr
4001
+ zhaor1 zh ao1 rr
4002
+ zhaor2 zh ao2 rr
4003
+ zhaor3 zh ao3 rr
4004
+ zhaor4 zh ao4 rr
4005
+ zhaor5 zh ao5 rr
4006
+ zher1 zh e1 rr
4007
+ zher2 zh e2 rr
4008
+ zher3 zh e3 rr
4009
+ zher4 zh e4 rr
4010
+ zher5 zh e5 rr
4011
+ zheir1 zh ei1 rr
4012
+ zheir2 zh ei2 rr
4013
+ zheir3 zh ei3 rr
4014
+ zheir4 zh ei4 rr
4015
+ zheir5 zh ei5 rr
4016
+ zhenr1 zh en1 rr
4017
+ zhenr2 zh en2 rr
4018
+ zhenr3 zh en3 rr
4019
+ zhenr4 zh en4 rr
4020
+ zhenr5 zh en5 rr
4021
+ zhengr1 zh eng1 rr
4022
+ zhengr2 zh eng2 rr
4023
+ zhengr3 zh eng3 rr
4024
+ zhengr4 zh eng4 rr
4025
+ zhengr5 zh eng5 rr
4026
+ zhir1 zh iii1 rr
4027
+ zhir2 zh iii2 rr
4028
+ zhir3 zh iii3 rr
4029
+ zhir4 zh iii4 rr
4030
+ zhir5 zh iii5 rr
4031
+ zhongr1 zh ong1 rr
4032
+ zhongr2 zh ong2 rr
4033
+ zhongr3 zh ong3 rr
4034
+ zhongr4 zh ong4 rr
4035
+ zhongr5 zh ong5 rr
4036
+ zhour1 zh ou1 rr
4037
+ zhour2 zh ou2 rr
4038
+ zhour3 zh ou3 rr
4039
+ zhour4 zh ou4 rr
4040
+ zhour5 zh ou5 rr
4041
+ zhur1 zh u1 rr
4042
+ zhur2 zh u2 rr
4043
+ zhur3 zh u3 rr
4044
+ zhur4 zh u4 rr
4045
+ zhur5 zh u5 rr
4046
+ zhuar1 zh ua1 rr
4047
+ zhuar2 zh ua2 rr
4048
+ zhuar3 zh ua3 rr
4049
+ zhuar4 zh ua4 rr
4050
+ zhuar5 zh ua5 rr
4051
+ zhuair1 zh uai1 rr
4052
+ zhuair2 zh uai2 rr
4053
+ zhuair3 zh uai3 rr
4054
+ zhuair4 zh uai4 rr
4055
+ zhuair5 zh uai5 rr
4056
+ zhuanr1 zh uan1 rr
4057
+ zhuanr2 zh uan2 rr
4058
+ zhuanr3 zh uan3 rr
4059
+ zhuanr4 zh uan4 rr
4060
+ zhuanr5 zh uan5 rr
4061
+ zhuangr1 zh uang1 rr
4062
+ zhuangr2 zh uang2 rr
4063
+ zhuangr3 zh uang3 rr
4064
+ zhuangr4 zh uang4 rr
4065
+ zhuangr5 zh uang5 rr
4066
+ zhuir1 zh uei1 rr
4067
+ zhuir2 zh uei2 rr
4068
+ zhuir3 zh uei3 rr
4069
+ zhuir4 zh uei4 rr
4070
+ zhuir5 zh uei5 rr
4071
+ zhunr1 zh uen1 rr
4072
+ zhunr2 zh uen2 rr
4073
+ zhunr3 zh uen3 rr
4074
+ zhunr4 zh uen4 rr
4075
+ zhunr5 zh uen5 rr
4076
+ zhuor1 zh uo1 rr
4077
+ zhuor2 zh uo2 rr
4078
+ zhuor3 zh uo3 rr
4079
+ zhuor4 zh uo4 rr
4080
+ zhuor5 zh uo5 rr
4081
+ zir1 z ii1 rr
4082
+ zir2 z ii2 rr
4083
+ zir3 z ii3 rr
4084
+ zir4 z ii4 rr
4085
+ zir5 z ii5 rr
4086
+ zongr1 z ong1 rr
4087
+ zongr2 z ong2 rr
4088
+ zongr3 z ong3 rr
4089
+ zongr4 z ong4 rr
4090
+ zongr5 z ong5 rr
4091
+ zour1 z ou1 rr
4092
+ zour2 z ou2 rr
4093
+ zour3 z ou3 rr
4094
+ zour4 z ou4 rr
4095
+ zour5 z ou5 rr
4096
+ zur1 z u1 rr
4097
+ zur2 z u2 rr
4098
+ zur3 z u3 rr
4099
+ zur4 z u4 rr
4100
+ zur5 z u5 rr
4101
+ zuanr1 z uan1 rr
4102
+ zuanr2 z uan2 rr
4103
+ zuanr3 z uan3 rr
4104
+ zuanr4 z uan4 rr
4105
+ zuanr5 z uan5 rr
4106
+ zuir1 z uei1 rr
4107
+ zuir2 z uei2 rr
4108
+ zuir3 z uei3 rr
4109
+ zuir4 z uei4 rr
4110
+ zuir5 z uei5 rr
4111
+ zunr1 z uen1 rr
4112
+ zunr2 z uen2 rr
4113
+ zunr3 z uen3 rr
4114
+ zunr4 z uen4 rr
4115
+ zunr5 z uen5 rr
4116
+ zuor1 z uo1 rr
4117
+ zuor2 z uo2 rr
4118
+ zuor3 z uo3 rr
4119
+ zuor4 z uo4 rr
4120
+ zuor5 z uo5 rr
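Each row in the erhua block above maps a rhotacized, toned pinyin syllable (e.g. bangr2) to its initial (b), its toned final (ang2), and a trailing rr marker for the r-coloring. A minimal loading sketch, assuming only the whitespace-separated column layout visible above; the helper name load_pinyin_lexicon is hypothetical and not part of this repo:

def load_pinyin_lexicon(path="lemas_tts/infer/text_norm/pinyin-lexicon-r.txt"):
    """Load rows like "bangr2 b ang2 rr" into {syllable: [phone, ...]}."""
    lexicon = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if parts:
                # first column is the toned syllable; the rest is its phone sequence
                lexicon[parts[0]] = parts[1:]
    return lexicon

# e.g. load_pinyin_lexicon()["bangr2"] -> ["b", "ang2", "rr"]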
lemas_tts/infer/text_norm/symbols.py ADDED
@@ -0,0 +1,419 @@
1
+ pinyin_dict = {
2
+ "a": ("^", "a"),
3
+ "ai": ("^", "ai"),
4
+ "an": ("^", "an"),
5
+ "ang": ("^", "ang"),
6
+ "ao": ("^", "ao"),
7
+ "ba": ("b", "a"),
8
+ "bai": ("b", "ai"),
9
+ "ban": ("b", "an"),
10
+ "bang": ("b", "ang"),
11
+ "bao": ("b", "ao"),
12
+ "be": ("b", "e"),
13
+ "bei": ("b", "ei"),
14
+ "ben": ("b", "en"),
15
+ "beng": ("b", "eng"),
16
+ "bi": ("b", "i"),
17
+ "bian": ("b", "ian"),
18
+ "biao": ("b", "iao"),
19
+ "bie": ("b", "ie"),
20
+ "bin": ("b", "in"),
21
+ "bing": ("b", "ing"),
22
+ "bo": ("b", "o"),
23
+ "bu": ("b", "u"),
24
+ "ca": ("c", "a"),
25
+ "cai": ("c", "ai"),
26
+ "can": ("c", "an"),
27
+ "cang": ("c", "ang"),
28
+ "cao": ("c", "ao"),
29
+ "ce": ("c", "e"),
30
+ "cen": ("c", "en"),
31
+ "ceng": ("c", "eng"),
32
+ "cha": ("ch", "a"),
33
+ "chai": ("ch", "ai"),
34
+ "chan": ("ch", "an"),
35
+ "chang": ("ch", "ang"),
36
+ "chao": ("ch", "ao"),
37
+ "che": ("ch", "e"),
38
+ "chen": ("ch", "en"),
39
+ "cheng": ("ch", "eng"),
40
+ "chi": ("ch", "iii"),
41
+ "chong": ("ch", "ong"),
42
+ "chou": ("ch", "ou"),
43
+ "chu": ("ch", "u"),
44
+ "chua": ("ch", "ua"),
45
+ "chuai": ("ch", "uai"),
46
+ "chuan": ("ch", "uan"),
47
+ "chuang": ("ch", "uang"),
48
+ "chui": ("ch", "uei"),
49
+ "chun": ("ch", "uen"),
50
+ "chuo": ("ch", "uo"),
51
+ "ci": ("c", "ii"),
52
+ "cong": ("c", "ong"),
53
+ "cou": ("c", "ou"),
54
+ "cu": ("c", "u"),
55
+ "cuan": ("c", "uan"),
56
+ "cui": ("c", "uei"),
57
+ "cun": ("c", "uen"),
58
+ "cuo": ("c", "uo"),
59
+ "da": ("d", "a"),
60
+ "dai": ("d", "ai"),
61
+ "dan": ("d", "an"),
62
+ "dang": ("d", "ang"),
63
+ "dao": ("d", "ao"),
64
+ "de": ("d", "e"),
65
+ "dei": ("d", "ei"),
66
+ "den": ("d", "en"),
67
+ "deng": ("d", "eng"),
68
+ "di": ("d", "i"),
69
+ "dia": ("d", "ia"),
70
+ "dian": ("d", "ian"),
71
+ "diao": ("d", "iao"),
72
+ "die": ("d", "ie"),
73
+ "ding": ("d", "ing"),
74
+ "diu": ("d", "iou"),
75
+ "dong": ("d", "ong"),
76
+ "dou": ("d", "ou"),
77
+ "du": ("d", "u"),
78
+ "duan": ("d", "uan"),
79
+ "dui": ("d", "uei"),
80
+ "dun": ("d", "uen"),
81
+ "duo": ("d", "uo"),
82
+ "e": ("^", "e"),
83
+ "ei": ("^", "ei"),
84
+ "en": ("^", "en"),
85
+ "ng": ("^", "en"),
86
+ "eng": ("^", "eng"),
87
+ "er": ("^", "er"),
88
+ "fa": ("f", "a"),
89
+ "fan": ("f", "an"),
90
+ "fang": ("f", "ang"),
91
+ "fei": ("f", "ei"),
92
+ "fen": ("f", "en"),
93
+ "feng": ("f", "eng"),
94
+ "fo": ("f", "o"),
95
+ "fou": ("f", "ou"),
96
+ "fu": ("f", "u"),
97
+ "ga": ("g", "a"),
98
+ "gai": ("g", "ai"),
99
+ "gan": ("g", "an"),
100
+ "gang": ("g", "ang"),
101
+ "gao": ("g", "ao"),
102
+ "ge": ("g", "e"),
103
+ "gei": ("g", "ei"),
104
+ "gen": ("g", "en"),
105
+ "geng": ("g", "eng"),
106
+ "gong": ("g", "ong"),
107
+ "gou": ("g", "ou"),
108
+ "gu": ("g", "u"),
109
+ "gua": ("g", "ua"),
110
+ "guai": ("g", "uai"),
111
+ "guan": ("g", "uan"),
112
+ "guang": ("g", "uang"),
113
+ "gui": ("g", "uei"),
114
+ "gun": ("g", "uen"),
115
+ "guo": ("g", "uo"),
116
+ "ha": ("h", "a"),
117
+ "hai": ("h", "ai"),
118
+ "han": ("h", "an"),
119
+ "hang": ("h", "ang"),
120
+ "hao": ("h", "ao"),
121
+ "he": ("h", "e"),
122
+ "hei": ("h", "ei"),
123
+ "hen": ("h", "en"),
124
+ "heng": ("h", "eng"),
125
+ "hong": ("h", "ong"),
126
+ "hou": ("h", "ou"),
127
+ "hu": ("h", "u"),
128
+ "hua": ("h", "ua"),
129
+ "huai": ("h", "uai"),
130
+ "huan": ("h", "uan"),
131
+ "huang": ("h", "uang"),
132
+ "hui": ("h", "uei"),
133
+ "hun": ("h", "uen"),
134
+ "huo": ("h", "uo"),
135
+ "ji": ("j", "i"),
136
+ "jia": ("j", "ia"),
137
+ "jian": ("j", "ian"),
138
+ "jiang": ("j", "iang"),
139
+ "jiao": ("j", "iao"),
140
+ "jie": ("j", "ie"),
141
+ "jin": ("j", "in"),
142
+ "jing": ("j", "ing"),
143
+ "jiong": ("j", "iong"),
144
+ "jiu": ("j", "iou"),
145
+ "ju": ("j", "v"),
146
+ "juan": ("j", "van"),
147
+ "jue": ("j", "ve"),
148
+ "jun": ("j", "vn"),
149
+ "ka": ("k", "a"),
150
+ "kai": ("k", "ai"),
151
+ "kan": ("k", "an"),
152
+ "kang": ("k", "ang"),
153
+ "kao": ("k", "ao"),
154
+ "ke": ("k", "e"),
155
+ "kei": ("k", "ei"),
156
+ "ken": ("k", "en"),
157
+ "keng": ("k", "eng"),
158
+ "kong": ("k", "ong"),
159
+ "kou": ("k", "ou"),
160
+ "ku": ("k", "u"),
161
+ "kua": ("k", "ua"),
162
+ "kuai": ("k", "uai"),
163
+ "kuan": ("k", "uan"),
164
+ "kuang": ("k", "uang"),
165
+ "kui": ("k", "uei"),
166
+ "kun": ("k", "uen"),
167
+ "kuo": ("k", "uo"),
168
+ "la": ("l", "a"),
169
+ "lai": ("l", "ai"),
170
+ "lan": ("l", "an"),
171
+ "lang": ("l", "ang"),
172
+ "lao": ("l", "ao"),
173
+ "le": ("l", "e"),
174
+ "lei": ("l", "ei"),
175
+ "leng": ("l", "eng"),
176
+ "li": ("l", "i"),
177
+ "lia": ("l", "ia"),
178
+ "lian": ("l", "ian"),
179
+ "liang": ("l", "iang"),
180
+ "liao": ("l", "iao"),
181
+ "lie": ("l", "ie"),
182
+ "lin": ("l", "in"),
183
+ "ling": ("l", "ing"),
184
+ "liu": ("l", "iou"),
185
+ "lo": ("l", "o"),
186
+ "long": ("l", "ong"),
187
+ "lou": ("l", "ou"),
188
+ "lu": ("l", "u"),
189
+ "lv": ("l", "v"),
190
+ "luan": ("l", "uan"),
191
+ "lve": ("l", "ve"),
192
+ "lue": ("l", "ve"),
193
+ "lun": ("l", "uen"),
194
+ "luo": ("l", "uo"),
195
+ "ma": ("m", "a"),
196
+ "mai": ("m", "ai"),
197
+ "man": ("m", "an"),
198
+ "mang": ("m", "ang"),
199
+ "mao": ("m", "ao"),
200
+ "me": ("m", "e"),
201
+ "mei": ("m", "ei"),
202
+ "men": ("m", "en"),
203
+ "meng": ("m", "eng"),
204
+ "mi": ("m", "i"),
205
+ "mian": ("m", "ian"),
206
+ "miao": ("m", "iao"),
207
+ "mie": ("m", "ie"),
208
+ "min": ("m", "in"),
209
+ "ming": ("m", "ing"),
210
+ "miu": ("m", "iou"),
211
+ "mo": ("m", "o"),
212
+ "mou": ("m", "ou"),
213
+ "mu": ("m", "u"),
214
+ "na": ("n", "a"),
215
+ "nai": ("n", "ai"),
216
+ "nan": ("n", "an"),
217
+ "nang": ("n", "ang"),
218
+ "nao": ("n", "ao"),
219
+ "ne": ("n", "e"),
220
+ "nei": ("n", "ei"),
221
+ "nen": ("n", "en"),
222
+ "neng": ("n", "eng"),
223
+ "ni": ("n", "i"),
224
+ "nia": ("n", "ia"),
225
+ "nian": ("n", "ian"),
226
+ "niang": ("n", "iang"),
227
+ "niao": ("n", "iao"),
228
+ "nie": ("n", "ie"),
229
+ "nin": ("n", "in"),
230
+ "ning": ("n", "ing"),
231
+ "niu": ("n", "iou"),
232
+ "nong": ("n", "ong"),
233
+ "nou": ("n", "ou"),
234
+ "nu": ("n", "u"),
235
+ "nv": ("n", "v"),
236
+ "nuan": ("n", "uan"),
237
+ "nve": ("n", "ve"),
238
+ "nue": ("n", "ve"),
239
+ "nuo": ("n", "uo"),
240
+ "o": ("^", "o"),
241
+ "ou": ("^", "ou"),
242
+ "pa": ("p", "a"),
243
+ "pai": ("p", "ai"),
244
+ "pan": ("p", "an"),
245
+ "pang": ("p", "ang"),
246
+ "pao": ("p", "ao"),
247
+ "pe": ("p", "e"),
248
+ "pei": ("p", "ei"),
249
+ "pen": ("p", "en"),
250
+ "peng": ("p", "eng"),
251
+ "pi": ("p", "i"),
252
+ "pian": ("p", "ian"),
253
+ "piao": ("p", "iao"),
254
+ "pie": ("p", "ie"),
255
+ "pin": ("p", "in"),
256
+ "ping": ("p", "ing"),
257
+ "po": ("p", "o"),
258
+ "pou": ("p", "ou"),
259
+ "pu": ("p", "u"),
260
+ "qi": ("q", "i"),
261
+ "qia": ("q", "ia"),
262
+ "qian": ("q", "ian"),
263
+ "qiang": ("q", "iang"),
264
+ "qiao": ("q", "iao"),
265
+ "qie": ("q", "ie"),
266
+ "qin": ("q", "in"),
267
+ "qing": ("q", "ing"),
268
+ "qiong": ("q", "iong"),
269
+ "qiu": ("q", "iou"),
270
+ "qu": ("q", "v"),
271
+ "quan": ("q", "van"),
272
+ "que": ("q", "ve"),
273
+ "qun": ("q", "vn"),
274
+ "ran": ("r", "an"),
275
+ "rang": ("r", "ang"),
276
+ "rao": ("r", "ao"),
277
+ "re": ("r", "e"),
278
+ "ren": ("r", "en"),
279
+ "reng": ("r", "eng"),
280
+ "ri": ("r", "iii"),
281
+ "rong": ("r", "ong"),
282
+ "rou": ("r", "ou"),
283
+ "ru": ("r", "u"),
284
+ "rua": ("r", "ua"),
285
+ "ruan": ("r", "uan"),
286
+ "rui": ("r", "uei"),
287
+ "run": ("r", "uen"),
288
+ "ruo": ("r", "uo"),
289
+ "sa": ("s", "a"),
290
+ "sai": ("s", "ai"),
291
+ "san": ("s", "an"),
292
+ "sang": ("s", "ang"),
293
+ "sao": ("s", "ao"),
294
+ "se": ("s", "e"),
295
+ "sen": ("s", "en"),
296
+ "seng": ("s", "eng"),
297
+ "sha": ("sh", "a"),
298
+ "shai": ("sh", "ai"),
299
+ "shan": ("sh", "an"),
300
+ "shang": ("sh", "ang"),
301
+ "shao": ("sh", "ao"),
302
+ "she": ("sh", "e"),
303
+ "shei": ("sh", "ei"),
304
+ "shen": ("sh", "en"),
305
+ "sheng": ("sh", "eng"),
306
+ "shi": ("sh", "iii"),
307
+ "shou": ("sh", "ou"),
308
+ "shu": ("sh", "u"),
309
+ "shua": ("sh", "ua"),
310
+ "shuai": ("sh", "uai"),
311
+ "shuan": ("sh", "uan"),
312
+ "shuang": ("sh", "uang"),
313
+ "shui": ("sh", "uei"),
314
+ "shun": ("sh", "uen"),
315
+ "shuo": ("sh", "uo"),
316
+ "si": ("s", "ii"),
317
+ "song": ("s", "ong"),
318
+ "sou": ("s", "ou"),
319
+ "su": ("s", "u"),
320
+ "suan": ("s", "uan"),
321
+ "sui": ("s", "uei"),
322
+ "sun": ("s", "uen"),
323
+ "suo": ("s", "uo"),
324
+ "ta": ("t", "a"),
325
+ "tai": ("t", "ai"),
326
+ "tan": ("t", "an"),
327
+ "tang": ("t", "ang"),
328
+ "tao": ("t", "ao"),
329
+ "te": ("t", "e"),
330
+ "tei": ("t", "ei"),
331
+ "teng": ("t", "eng"),
332
+ "ti": ("t", "i"),
333
+ "tian": ("t", "ian"),
334
+ "tiao": ("t", "iao"),
335
+ "tie": ("t", "ie"),
336
+ "ting": ("t", "ing"),
337
+ "tong": ("t", "ong"),
338
+ "tou": ("t", "ou"),
339
+ "tu": ("t", "u"),
340
+ "tuan": ("t", "uan"),
341
+ "tui": ("t", "uei"),
342
+ "tun": ("t", "uen"),
343
+ "tuo": ("t", "uo"),
344
+ "wa": ("^", "ua"),
345
+ "wai": ("^", "uai"),
346
+ "wan": ("^", "uan"),
347
+ "wang": ("^", "uang"),
348
+ "wei": ("^", "uei"),
349
+ "wen": ("^", "uen"),
350
+ "weng": ("^", "ueng"),
351
+ "wo": ("^", "uo"),
352
+ "wu": ("^", "u"),
353
+ "xi": ("x", "i"),
354
+ "xia": ("x", "ia"),
355
+ "xian": ("x", "ian"),
356
+ "xiang": ("x", "iang"),
357
+ "xiao": ("x", "iao"),
358
+ "xie": ("x", "ie"),
359
+ "xin": ("x", "in"),
360
+ "xing": ("x", "ing"),
361
+ "xiong": ("x", "iong"),
362
+ "xiu": ("x", "iou"),
363
+ "xu": ("x", "v"),
364
+ "xuan": ("x", "van"),
365
+ "xue": ("x", "ve"),
366
+ "xun": ("x", "vn"),
367
+ "ya": ("^", "ia"),
368
+ "yan": ("^", "ian"),
369
+ "yang": ("^", "iang"),
370
+ "yao": ("^", "iao"),
371
+ "ye": ("^", "ie"),
372
+ "yi": ("^", "i"),
373
+ "yin": ("^", "in"),
374
+ "ying": ("^", "ing"),
375
+ "yo": ("^", "iou"),
376
+ "yong": ("^", "iong"),
377
+ "you": ("^", "iou"),
378
+ "yu": ("^", "v"),
379
+ "yuan": ("^", "van"),
380
+ "yue": ("^", "ve"),
381
+ "yun": ("^", "vn"),
382
+ "za": ("z", "a"),
383
+ "zai": ("z", "ai"),
384
+ "zan": ("z", "an"),
385
+ "zang": ("z", "ang"),
386
+ "zao": ("z", "ao"),
387
+ "ze": ("z", "e"),
388
+ "zei": ("z", "ei"),
389
+ "zen": ("z", "en"),
390
+ "zeng": ("z", "eng"),
391
+ "zha": ("zh", "a"),
392
+ "zhai": ("zh", "ai"),
393
+ "zhan": ("zh", "an"),
394
+ "zhang": ("zh", "ang"),
395
+ "zhao": ("zh", "ao"),
396
+ "zhe": ("zh", "e"),
397
+ "zhei": ("zh", "ei"),
398
+ "zhen": ("zh", "en"),
399
+ "zheng": ("zh", "eng"),
400
+ "zhi": ("zh", "iii"),
401
+ "zhong": ("zh", "ong"),
402
+ "zhou": ("zh", "ou"),
403
+ "zhu": ("zh", "u"),
404
+ "zhua": ("zh", "ua"),
405
+ "zhuai": ("zh", "uai"),
406
+ "zhuan": ("zh", "uan"),
407
+ "zhuang": ("zh", "uang"),
408
+ "zhui": ("zh", "uei"),
409
+ "zhun": ("zh", "uen"),
410
+ "zhuo": ("zh", "uo"),
411
+ "zi": ("z", "ii"),
412
+ "zong": ("z", "ong"),
413
+ "zou": ("z", "ou"),
414
+ "zu": ("z", "u"),
415
+ "zuan": ("z", "uan"),
416
+ "zui": ("z", "uei"),
417
+ "zun": ("z", "uen"),
418
+ "zuo": ("z", "uo"),
419
+ }
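pinyin_dict above maps each toneless pinyin syllable to an (initial, final) pair, with "^" standing in for a null initial and "v" for the umlauted u. A small sketch, assuming the dict as defined above, of how a toned syllable can be split into the initial / toned-final form used by the lexicon (the helper name is hypothetical):

def split_toned_syllable(syllable):
    """Split e.g. "zhong1" into ("zh", "ong1") via pinyin_dict."""
    base, tone = syllable[:-1], syllable[-1]
    assert tone in "12345", f"expected a trailing tone digit in {syllable!r}"
    initial, final = pinyin_dict[base]
    return initial, final + tone

# e.g. split_toned_syllable("lv3") -> ("l", "v3"); split_toned_syllable("an4") -> ("^", "an4")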
lemas_tts/infer/text_norm/tokenizer.py ADDED
@@ -0,0 +1,219 @@
1
+ # cp from https://github.com/lifeiteng/vall-e/blob/main/valle/data/tokenizer.py
+ # Copyright 2023 (authors: Feiteng Li)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ import re
+ from typing import Any, List, Pattern, Union
+
+ import torch
+ import torchaudio
+ # from lhotse.features import FeatureExtractor
+ # from lhotse.utils import Seconds, compute_num_frames
+ from phonemizer.backend import EspeakBackend
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
+ from phonemizer.punctuation import Punctuation
+ from phonemizer.separator import Separator
+
+
+ class TextTokenizer:
+     """Phonemize text with an espeak backend."""
+
+     def __init__(
+         self,
+         language="en-us",
+         backend="espeak",
+         separator=Separator(word="_", syllable="-", phone="|"),
+         preserve_punctuation=True,
+         punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
+         with_stress: bool = False,
+         tie: Union[bool, str] = False,
+         language_switch: LanguageSwitch = "keep-flags",
+         words_mismatch: WordMismatch = "ignore",
+     ) -> None:
+         phonemizer = EspeakBackend(
+             language,
+             punctuation_marks=punctuation_marks,
+             preserve_punctuation=preserve_punctuation,
+             with_stress=with_stress,
+             tie=tie,
+             language_switch=language_switch,
+             words_mismatch=words_mismatch,
+         )
+
+         self.backend = phonemizer
+         self.separator = separator
+
+     def to_list(self, phonemized: str) -> List[str]:
+         fields = []
+         for word in phonemized.split(self.separator.word):
+             # "ɐ m|iː|n?" ɹ|ɪ|z|ɜː|v; h|ɪ|z.
+             pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
+             fields.extend(
+                 [p for p in pp if p != self.separator.phone]
+                 + [self.separator.word]
+             )
+         assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
+             self.separator.phone
+         )
+         return fields[:-1]
+
+     def __call__(self, text, strip=True) -> List[str]:
+         if isinstance(text, str):
+             text = [text]
+         phones = []
+         for txt in text:
+             if txt == '':
+                 continue
+             if txt[0] == '#':  # pass "#n" prosody tags through untouched
+                 phones.append(txt)
+             else:
+                 ipa = self.backend.phonemize([txt], separator=self.separator, strip=strip, njobs=1)
+                 phones += self.to_list(ipa[0])
+         return phones
+
+
+ def tokenize_text(tokenizer: TextTokenizer, text: str) -> List[str]:
+     phonemes = tokenizer([text.strip()])
+     return phonemes  # k2symbols
+
+
+ _PAUSE_SYMBOL = {'、': ',', ',': ',', '。': ',', '!': '!', '?': '?', ':': ':'}
+
+
+ def _replace(match):
+     word = match.group(0)
+     return _PAUSE_SYMBOL[word]
+
+
+ def txt2phone(tokenizer: TextTokenizer, text: str):
+     text = re.sub('|'.join(_PAUSE_SYMBOL.keys()), _replace, text)
+     text = re.split(r"(#\d)", text)
+     phones = []
+     for txt in text:
+         if txt == '':
+             continue
+         if txt[0] == '#':
+             phones.append(txt)
+         else:
+             ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator, strip=True, njobs=1)
+             phones += tokenizer.to_list(ipa[0])
+     phones = "|".join(phones).replace("(|", "(").replace("|)", ")")
+     # phones = ["(cmn)"] + phones.split("|")
+     return phones
+
+
+ def convert_audio(wav: torch.Tensor, sr: int, target_sr: int, target_channels: int):
+     assert wav.shape[0] in [1, 2], "Audio must be mono or stereo."
+     if target_channels == 1:
+         wav = wav.mean(0, keepdim=True)
+     elif target_channels == 2:
+         *shape, _, length = wav.shape
+         wav = wav.expand(*shape, target_channels, length)
+     elif wav.shape[0] == 1:
+         wav = wav.expand(target_channels, -1)
+     wav = torchaudio.transforms.Resample(sr, target_sr)(wav)
+     return wav
+
+
+ class AudioTokenizer:
+     """EnCodec audio tokenizer."""
+
+     def __init__(
+         self,
+         device: Any = None,
+         signature=None,
+     ) -> None:
+         from audiocraft.solvers import CompressionSolver
+         model = CompressionSolver.model_from_checkpoint(signature)
+         self.sample_rate = model.sample_rate
+         self.channels = model.channels
+
+         if not device:
+             device = torch.device("cpu")
+             if torch.cuda.is_available():
+                 device = torch.device("cuda:0")
+
+         self._device = device
+
+         self.codec = model.to(device)
+
+     @property
+     def device(self):
+         return self._device
+
+     def encode(self, wav: torch.Tensor) -> torch.Tensor:
+         codes = self.codec.encode(wav.to(self.device))
+         return [(codes[0], None)]
+
+     def decode(self, frames: torch.Tensor) -> torch.Tensor:
+         frames = frames[0][0]  # [1, 4, T]
+         return self.codec.decode(frames)
+
+
+ def tokenize_audio(tokenizer: AudioTokenizer, audio, offset=-1, num_frames=-1):
+     # Load and pre-process the audio waveform
+     if isinstance(audio, str):
+         if offset != -1 and num_frames != -1:
+             wav, sr = torchaudio.load(audio, frame_offset=offset, num_frames=num_frames)
+         else:
+             wav, sr = torchaudio.load(audio)
+         wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
+         wav = wav.unsqueeze(0)
+     else:
+         wav = audio.unsqueeze(0).unsqueeze(0)
+     # Extract discrete codes from EnCodec
+     with torch.no_grad():
+         encoded_frames = tokenizer.encode(wav)
+     return encoded_frames
+
+
+ class AudioSR:
+     """Descript Audio Codec (DAC) wrapper used for audio super-resolution."""
+
+     def __init__(
+         self,
+         model_path,
+         device="cpu",
+     ) -> None:
+         import dac
+         self.codec = dac.DAC.load(model_path)
+         self.codec.to(device)
+         self.codec.eval()
+
+         self.sample_rate = self.codec.sample_rate
+         self.channels = 1
+         self._device = device
+
+     @property
+     def device(self):
+         return self._device
+
+     def encode(self, wav: torch.Tensor) -> torch.Tensor:
+         length = wav.shape[-1]
+         right_pad = math.ceil(length / self.codec.hop_length) * self.codec.hop_length - length
+         wav = torch.nn.functional.pad(wav, (0, right_pad))
+         z, codes, _, _, _ = self.codec.encode(wav.to(self._device))
+         return [(codes, z)]
+
+     def decode(self, frames: torch.Tensor) -> torch.Tensor:
+         # decode from the continuous latents rather than the discrete codes
+         z = frames[0][1]  # [1, 2048, T]
+         with torch.no_grad():
+             y = self.codec.decode(z)
+         return y
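A quick usage sketch for the tokenizer above, assuming the `espeak-ng` system package is installed (the phonemizer backend shells out to it); the sample sentences are illustrative only.

```python
from lemas_tts.infer.text_norm.tokenizer import TextTokenizer, txt2phone

tokenizer = TextTokenizer(language="en-us")

# Flat phone list; "#n" prosody tags would be passed through untouched.
print(tokenizer("Hello world."))

# Pipe-joined phone string, with pause punctuation normalized first.
print(txt2phone(tokenizer, "Hello#1 world。"))
```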
lemas_tts/infer/text_norm/txt2pinyin.py ADDED
@@ -0,0 +1,225 @@
+ import multiprocessing
+ from concurrent.futures import ProcessPoolExecutor
+ import argparse
+ import os, re
+ from tqdm import tqdm
+ from pypinyin import Style
+ from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
+ from pypinyin.converter import DefaultConverter
+ from pypinyin.core import Pinyin
+ import jieba
+ jieba.set_dictionary(os.path.join(os.path.dirname(__file__), 'jieba_dict.txt'))
+
+ from .symbols import pinyin_dict
+ from .cn_tn import NSWNormalizer
+
+
+ zh_pattern = re.compile("[\u4e00-\u9fa5]")
+ alpha_pattern = re.compile(r"[a-zA-Z]")
+
+
+ def is_zh(word):
+     match = zh_pattern.search(word)
+     return match is not None
+
+
+ def is_alpha(word):
+     match = alpha_pattern.search(word)
+     return match is not None
+
+
+ def get_phoneme_from_char_and_pinyin(chn_char, pinyin):
+     # we do not need #4, use sil to replace it
+     chn_char = chn_char.replace("#4", "")
+     char_len = len(chn_char)
+     i, j = 0, 0
+     result = []
+     while i < char_len:
+         cur_char = chn_char[i]
+         if is_zh(cur_char):
+             if pinyin[j][:-1] == 'n':  # handle the special pinyin of "嗯"
+                 pinyin[j] = 'en' + pinyin[j][-1]
+             if i < len(chn_char) - 2 and is_zh(chn_char[i:i + 3]) and pinyin[j][-1] == pinyin[j + 1][-1] == pinyin[j + 2][-1] == '3':  # tone sandhi for three consecutive third tones
+                 pinyin[j + 1] = pinyin[j + 1][:-1] + '2'
+             if i < len(chn_char) - 1 and pinyin[j][:-1] in pinyin_dict and is_zh(chn_char[i]) and is_zh(chn_char[i + 1]) and pinyin[j][-1] == pinyin[j + 1][-1] == '3':  # tone sandhi for two consecutive third tones
+                 pinyin[j] = pinyin[j][:-1] + '2'
+             if pinyin[j][:-1] not in pinyin_dict:  # handle erhua (rhotic) syllables
+                 assert chn_char[i + 1] == "儿", f"current_char : {cur_char}, next_char: {chn_char[i+1]}, cur_pinyin: {pinyin[j]}"
+                 assert pinyin[j][-2] == "r"
+                 tone = pinyin[j][-1]
+                 a = pinyin[j][:-2]
+                 # a1, a2 = pinyin_dict[a]
+                 # result += [a1, a2 + tone, "er5"]
+                 result += [a + tone, "er5"]
+                 if i + 2 < char_len and chn_char[i + 2] != "#":
+                     result.append("#0")
+                 i += 2
+                 j += 1
+             else:
+                 tone = pinyin[j][-1]
+                 a = pinyin[j][:-1]
+                 a1, a2 = pinyin_dict[a]  # a="wen" -> a1="^", a2="en"
+                 # result += [a1, a2 + tone]  # result = [zh, ong1, ^, en2]
+                 result.append(a + tone)
+                 # if i + 1 < char_len and chn_char[i + 1] != "#":  # append a #0 after each character
+                 #     result.append("#0")
+                 i += 1
+                 j += 1
+
+         # TODO support English alpha
+         # elif is_alpha(cur_char):
+         #     result += ALPHA_PHONE_DICT[cur_char.upper()]
+         #     if i + 1 < char_len and chn_char[i + 1] not in "#、,。!?:":  # append a #0 after each character
+         #         result.append("#0")
+         #     i += 1
+         #     j += 1  # baker alpha dataset "ABC" in pinyin
+         elif cur_char == "#":
+             result.append(chn_char[i:i + 2])
+             i += 2
+         elif cur_char in _PAUSE_SYMBOL:  # insert a pause at punctuation
+             result.pop()  # drop the trailing #0
+             result.append("#3")
+             i += 1
+         else:
+             # ignore the unknown char
+             i += 1
+     if result[-1] == "#0":  # drop the final #0 (instead of appending sil)
+         result = result[:-1]
+     # if result[-1] != "sil":
+     #     result.append("sil")
+     assert j == len(pinyin)
+     return result
+
+
+ # _PAUSE_SYMBOL = {'、', ',', '。', ',', '!', '!', '?', ':', ':', '《', '》', '·', '(', ')', '(', ')'}
+ _PAUSE_SYMBOL = {'.': '.', '、': ',', ',': ',', '。': '.', ',': ',', '!': '!', '!': '!', '?': '?', '?': '?', ':': ',', ':': ',', '——': ','}
+
+
+ class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
+     pass
+
+
+ def checkErHuaYin(text, GT_pinyin):
+     new_pinyin = []
+     check_pattern = re.compile(r"[\t\.\!\?\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()“”:;]+")
+     check_text = check_pattern.sub('', text)
+     if len(check_text) > len(GT_pinyin) and '儿' in check_text:
+         for i in range(len(GT_pinyin)):
+             if GT_pinyin[i][-2] == 'r' and GT_pinyin[i][:2] != 'er' and check_text[i + 1] == '儿':
+                 new_pinyin.append(GT_pinyin[i][:-2] + GT_pinyin[i][-1])
+                 new_pinyin.append('er5')
+                 replace_word = check_text[i:i + 2]
+                 replace_pattern = re.compile(replace_word)
+                 check_text = replace_pattern.sub(replace_word[:-1], check_text, count=1)
+             else:
+                 new_pinyin.append(GT_pinyin[i])
+         GT_pinyin = new_pinyin
+     return GT_pinyin
+
+
+ def change_tone_in_bu_or_yi(chars, pinyin_list):
+     location_yi = [m.start() for m in re.finditer(r'一', chars)]
+     location_bu = [m.start() for m in re.finditer(r'不', chars)]
+     for l in location_yi:
+         if 0 < l < len(chars) - 1 and chars[l - 1] == chars[l + 1]:
+             pinyin_list[l] = 'yi5'
+         elif l < len(chars) - 1 and pinyin_list[l + 1][-1] == '4':
+             pinyin_list[l] = 'yi2'
+     for l in location_bu:
+         if l < len(chars) - 1 and pinyin_list[l + 1][-1] == '4':
+             pinyin_list[l] = 'bu2'
+     return pinyin_list
+
+
+ def txt2pinyin(text, pinyin_parser):
+     phonemes = []
+     text = NSWNormalizer(text.strip()).normalize().upper()
+     texts = text.split(' ')
+     for text in texts:
+         text_list = list(jieba.cut(text))
+         for words in text_list:
+             if words in _PAUSE_SYMBOL:
+                 if phonemes:
+                     phonemes[-1] += _PAUSE_SYMBOL[words]
+             elif re.search("[\u4e00-\u9fa5]+", words):
+                 pinyin = pinyin_parser(words, style=Style.TONE3, errors="ignore")
+                 new_pinyin = []
+                 for x in pinyin:
+                     x = "".join(x)
+                     if "#" not in x:
+                         new_pinyin.append(x)
+                 new_pinyin = change_tone_in_bu_or_yi(words, new_pinyin) if len(words) > 1 and words[-1] not in {"一", "不"} else new_pinyin
+                 phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)  # phoneme sequence, e.g. [sil c e4 #0 sh iii4 #0 ^ uen2 #0 b en3 sil], as a list of strings
+                 phonemes += phoneme
+             elif re.search(r"[a-zA-Z]", words):
+                 phonemes.append(words.upper())
+     phones = " ".join(phonemes)
+     return phones
+
+
+ def process_batch(text_list, save_dir):
+     my_pinyin = Pinyin(MyConverter())
+     pinyin_parser = my_pinyin.pinyin
+
+     for text_info in tqdm(text_list):
+         try:
+             name, text = text_info
+             save_path = os.path.join(save_dir, name + ".txt")
+             phones = txt2pinyin(text, pinyin_parser)
+             open(save_path, 'w', encoding='utf-8').write(phones)
+         except Exception as e:
+             print(text_info, e)
+
+
+ def parallel_process(filenames, num_processes, save_dir):
+     with ProcessPoolExecutor(max_workers=num_processes) as executor:
+         tasks = []
+         for i in range(num_processes):
+             start = int(i * len(filenames) / num_processes)
+             end = int((i + 1) * len(filenames) / num_processes)
+             chunk = filenames[start:end]
+             tasks.append(executor.submit(process_batch, chunk, save_dir))
+
+         for task in tqdm(tasks):
+             task.result()
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--text_file", type=str, default="", help="path to input text file")
+     parser.add_argument(
+         "--save_dir", type=str, default="", help="path to output text file")
+     parser.add_argument(
+         '--workers', type=int, default=4, help='Setting the number of processes to the number of CPU cores is recommended')
+     args = parser.parse_args()
+
+     os.makedirs(args.save_dir, exist_ok=True)
+
+     filenames = open(args.text_file, 'r', encoding='utf-8').readlines()
+     filenames = [x.strip().split('\t') for x in tqdm(filenames)]
+     filenames = [[x[0], x[-1]] for x in filenames]
+     print(len(filenames))
+     multiprocessing.set_start_method("spawn", force=True)
+
+     if args.workers == 0:
+         args.workers = os.cpu_count()
+
+     parallel_process(filenames, args.workers, args.save_dir)
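A minimal usage sketch for the pipeline above (text normalization, jieba segmentation, pypinyin, tone sandhi), assuming the package is importable and `jieba_dict.txt` ships alongside the module; the sentence and the commented output are illustrative only.

```python
from pypinyin.core import Pinyin
from lemas_tts.infer.text_norm.txt2pinyin import MyConverter, txt2pinyin

pinyin_parser = Pinyin(MyConverter()).pinyin  # neutral tone rendered as "5"
print(txt2pinyin("今天天气不错。", pinyin_parser))
# e.g. "jin1 tian1 tian1 qi4 bu2 cuo4." (exact phones depend on segmentation)
```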
lemas_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,651 @@
+ # A unified script for the inference process
+ # Make adjustments inside functions, and consider both the gradio and cli scripts if the function output format needs to change
+ import os
+ import sys
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor
+
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
+ sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
+
+ import hashlib
+ import re
+ import tempfile
+ from importlib.resources import files
+
+ import matplotlib
+
+ matplotlib.use("Agg")
+
+ import matplotlib.pylab as plt
+ import numpy as np
+ import torch
+ import torchaudio
+ import tqdm
+ from huggingface_hub import hf_hub_download
+ from pydub import AudioSegment, silence
+ from transformers import pipeline
+ from vocos import Vocos
+
+ from lemas_tts.model.cfm import CFM
+ from lemas_tts.model.utils import (
+     get_tokenizer,
+     convert_char_to_pinyin,
+ )
+
+
+ def _find_repo_root(start: Path) -> Path:
+     """Locate the repo root by looking for a `pretrained_models` folder upwards."""
+     for p in [start, *start.parents]:
+         if (p / "pretrained_models").is_dir():
+             return p
+     cwd = Path.cwd()
+     if (cwd / "pretrained_models").is_dir():
+         return cwd
+     return start
+
+
+ # Resolve repository layout for pretrained assets when running from the source tree
+ THIS_FILE = Path(__file__).resolve()
+ REPO_ROOT = _find_repo_root(THIS_FILE)
+ PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"
+ CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"
+
+ _ref_audio_cache = {}
+
+ device = (
+     "cuda"
+     if torch.cuda.is_available()
+     else "xpu"
+     if torch.xpu.is_available()
+     else "mps"
+     if torch.backends.mps.is_available()
+     else "cpu"
+ )
+
+ # -----------------------------------------
+
+ target_sample_rate = 24000
+ n_mel_channels = 100
+ hop_length = 256
+ win_length = 1024
+ n_fft = 1024
+ mel_spec_type = "vocos"
+ target_rms = 0.1
+ cross_fade_duration = 0.15
+ ode_method = "euler"
+ nfe_step = 32  # 16, 32
+ cfg_strength = 3.0
+ sway_sampling_coef = 1
+ speed = 1.0
+ fix_duration = None
+
+ # -----------------------------------------
+
+
+ # chunk text into smaller pieces
+
+
+ def chunk_text(text, max_chars=135):
+     """
+     Splits the input text into chunks, each with a maximum number of characters.
+
+     Args:
+         text (str): The text to be split.
+         max_chars (int): The maximum number of characters per chunk.
+
+     Returns:
+         List[str]: A list of text chunks.
+     """
+     chunks = []
+     current_chunk = ""
+     # Split the text into sentences based on punctuation followed by whitespace
+     sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)
+
+     for sentence in sentences:
+         if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
+             current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
+         else:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
+
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     return chunks
+
+
+ # load vocoder
+ def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
+     if vocoder_name == "vocos":
+         # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
+         if is_local:
+             print(f"Load vocos from local path {local_path}")
+             config_path = f"{local_path}/config.yaml"
+             model_path = f"{local_path}/pytorch_model.bin"
+         else:
+             print("Download Vocos from huggingface charactr/vocos-mel-24khz")
+             repo_id = "charactr/vocos-mel-24khz"
+             config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
+             model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
+         vocoder = Vocos.from_hparams(config_path)
+         state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
+         from vocos.feature_extractors import EncodecFeatures
+
+         if isinstance(vocoder.feature_extractor, EncodecFeatures):
+             encodec_parameters = {
+                 "feature_extractor.encodec." + key: value
+                 for key, value in vocoder.feature_extractor.encodec.state_dict().items()
+             }
+             state_dict.update(encodec_parameters)
+         vocoder.load_state_dict(state_dict)
+         vocoder = vocoder.eval().to(device)
+     elif vocoder_name == "bigvgan":
+         try:
+             from third_party.BigVGAN import bigvgan
+         except ImportError:
+             print("You need to follow the README to init the submodule and patch the BigVGAN source code.")
+             raise
+         if is_local:
+             # download generator from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
+             vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
+         else:
+             vocoder = bigvgan.BigVGAN.from_pretrained(
+                 "nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False, cache_dir=hf_cache_dir
+             )
+
+         vocoder.remove_weight_norm()
+         vocoder = vocoder.eval().to(device)
+     return vocoder
+
+
+ # load asr pipeline
+
+ asr_pipe = None
+
+
+ def initialize_asr_pipeline(device: str = device, dtype=None):
+     if dtype is None:
+         dtype = (
+             torch.float16
+             if "cuda" in device
+             and torch.cuda.get_device_properties(device).major >= 7
+             and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+             else torch.float32
+         )
+     global asr_pipe
+     asr_pipe = pipeline(
+         "automatic-speech-recognition",
+         model="openai/whisper-large-v3-turbo",
+         torch_dtype=dtype,
+         device=device,
+     )
+
+
+ # transcribe
+
+
+ def transcribe(ref_audio, language=None):
+     global asr_pipe
+     if asr_pipe is None:
+         initialize_asr_pipeline(device=device)
+     return asr_pipe(
+         ref_audio,
+         chunk_length_s=30,
+         batch_size=128,
+         generate_kwargs={"task": "transcribe", "language": language} if language else {"task": "transcribe"},
+         return_timestamps=False,
+     )["text"].strip()
+
+
+ # load model checkpoint for inference
+
+
+ def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
+     if dtype is None:
+         dtype = (
+             torch.float16
+             if "cuda" in device
+             and torch.cuda.get_device_properties(device).major >= 7
+             and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+             else torch.float32
+         )
+     model = model.to(dtype)
+
+     ckpt_type = ckpt_path.split(".")[-1]
+     if ckpt_type == "safetensors":
+         from safetensors.torch import load_file
+
+         checkpoint = load_file(ckpt_path, device=device)
+     else:
+         checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)
+
+     if use_ema:
+         if ckpt_type == "safetensors":
+             checkpoint = {"ema_model_state_dict": checkpoint}
+         checkpoint["model_state_dict"] = {
+             k.replace("ema_model.", ""): v
+             for k, v in checkpoint["ema_model_state_dict"].items()
+             if k not in ["initted", "step"]
+         }
+
+         # patch for backward compatibility, 305e3ea
+         for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window", "ctc.proj.0.weight", "ctc.proj.0.bias", "ctc.ctc_proj.weight", "ctc.ctc_proj.bias"]:
+             if key in checkpoint["model_state_dict"]:
+                 del checkpoint["model_state_dict"][key]
+
+         model.load_state_dict(checkpoint["model_state_dict"])
+     else:
+         if ckpt_type == "safetensors":
+             checkpoint = {"model_state_dict": checkpoint}
+         model.load_state_dict(checkpoint["model_state_dict"])
+
+     del checkpoint
+     torch.cuda.empty_cache()
+
+     return model.to(device)
+
+
+ # load model for inference
+
+
+ def load_model(
+     model_cls,
+     model_cfg,
+     ckpt_path,
+     mel_spec_type=mel_spec_type,
+     vocab_file="",
+     ode_method=ode_method,
+     use_ema=True,
+     device=device,
+     use_prosody_encoder=False,
+     prosody_cfg_path="",
+     prosody_ckpt_path="",
+ ):
+     if vocab_file == "":
+         vocab_file = str(files("lemas_tts").joinpath("infer/examples/vocab.txt"))
+     tokenizer = "custom"
+
+     print("\nvocab : ", vocab_file)
+     print("token : ", tokenizer)
+     print("model : ", ckpt_path, "\n")
+
+     vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
+
+     # Resolve prosody encoder assets if requested but paths not provided
+     if use_prosody_encoder:
+         if not prosody_cfg_path:
+             prosody_cfg_path = str(CKPTS_ROOT / "prosody_encoder" / "pretssel_cfg.json")
+         if not prosody_ckpt_path:
+             prosody_ckpt_path = str(CKPTS_ROOT / "prosody_encoder" / "prosody_encoder_UnitY2.pt")
+     model = CFM(
+         transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels, use_prosody_encoder=use_prosody_encoder),
+         mel_spec_kwargs=dict(
+             n_fft=n_fft,
+             hop_length=hop_length,
+             win_length=win_length,
+             n_mel_channels=n_mel_channels,
+             target_sample_rate=target_sample_rate,
+             mel_spec_type=mel_spec_type,
+         ),
+         odeint_kwargs=dict(
+             method=ode_method,
+         ),
+         vocab_char_map=vocab_char_map,
+         use_prosody_encoder=use_prosody_encoder,
+         prosody_cfg_path=prosody_cfg_path,
+         prosody_ckpt_path=prosody_ckpt_path,
+     ).to(device)
+
+     dtype = torch.float32 if mel_spec_type == "bigvgan" else None
+     model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
+
+     return model
+
+
+ def remove_silence_edges(audio, silence_threshold=-42):
+     # Remove silence from the start
+     non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
+     audio = audio[non_silent_start_idx:]
+
+     # Remove silence from the end
+     non_silent_end_duration = audio.duration_seconds
+     for ms in reversed(audio):
+         if ms.dBFS > silence_threshold:
+             break
+         non_silent_end_duration -= 0.001
+     trimmed_audio = audio[: int(non_silent_end_duration * 1000)]
+
+     return trimmed_audio
+
+
+ # preprocess reference audio and text
+
+
+ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print):
+     show_info("Converting audio...")
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+         aseg = AudioSegment.from_file(ref_audio_orig)
+
+         if clip_short:
+             # 1. try to find long silence for clipping
+             non_silent_segs = silence.split_on_silence(
+                 aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+             )
+             non_silent_wave = AudioSegment.silent(duration=0)
+             for non_silent_seg in non_silent_segs:
+                 if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
+                     show_info("Audio is over 12s, clipping short. (1)")
+                     break
+                 non_silent_wave += non_silent_seg
+
+             # 2. try to find short silence for clipping if 1. failed
+             if len(non_silent_wave) > 12000:
+                 non_silent_segs = silence.split_on_silence(
+                     aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
+                 )
+                 non_silent_wave = AudioSegment.silent(duration=0)
+                 for non_silent_seg in non_silent_segs:
+                     if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
+                         show_info("Audio is over 12s, clipping short. (2)")
+                         break
+                     non_silent_wave += non_silent_seg
+
+             aseg = non_silent_wave
+
+             # 3. if no proper silence found for clipping
+             if len(aseg) > 12000:
+                 aseg = aseg[:12000]
+                 show_info("Audio is over 12s, clipping short. (3)")
+
+         aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
+         aseg.export(f.name, format="wav")
+         ref_audio = f.name
+
+     # Compute a hash of the reference audio file
+     with open(ref_audio, "rb") as audio_file:
+         audio_data = audio_file.read()
+         audio_hash = hashlib.md5(audio_data).hexdigest()
+
+     if not ref_text.strip():
+         global _ref_audio_cache
+         if audio_hash in _ref_audio_cache:
+             # Use cached asr transcription
+             show_info("Using cached reference text...")
+             ref_text = _ref_audio_cache[audio_hash]
+         else:
+             show_info("No reference text provided, transcribing reference audio...")
+             ref_text = transcribe(ref_audio)
+             # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweaks)
+             _ref_audio_cache[audio_hash] = ref_text
+     else:
+         show_info("Using custom reference text...")
+
+     # Ensure ref_text ends with proper sentence-ending punctuation
+     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
+         if ref_text.endswith("."):
+             ref_text += " "
+         else:
+             ref_text += ". "
+
+     print("\nref_text  ", ref_text)
+
+     return ref_audio, ref_text
+
+
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
+
+
+ def infer_process(
+     ref_audio,
+     ref_text,
+     gen_text,
+     model_obj,
+     vocoder,
+     mel_spec_type=mel_spec_type,
+     show_info=print,
+     progress=tqdm,
+     target_rms=target_rms,
+     cross_fade_duration=cross_fade_duration,
+     nfe_step=nfe_step,
+     cfg_strength=cfg_strength,
+     sway_sampling_coef=sway_sampling_coef,
+     use_acc_grl=True,
+     use_prosody_encoder=True,
+     ref_ratio=None,
+     no_ref_audio=False,
+     speed=speed,
+     fix_duration=fix_duration,
+     device=device,
+ ):
+     # Split the input text into batches
+     audio, sr = torchaudio.load(ref_audio)
+
+     if isinstance(ref_text, str):
+         max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr))
+         gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
+     else:
+         gen_text_batches = gen_text
+
+     print("ref_text:", ref_text)
+     for i, gen_text in enumerate(gen_text_batches):
+         print(f"gen_text {i}", gen_text)
+     print("\n")
+
+     show_info(f"Generating audio in {len(gen_text_batches)} batches...")
+     return next(
+         infer_batch_process(
+             (audio, sr),
+             ref_text,
+             gen_text_batches,
+             model_obj,
+             vocoder,
+             mel_spec_type=mel_spec_type,
+             progress=progress,
+             target_rms=target_rms,
+             cross_fade_duration=cross_fade_duration,
+             nfe_step=nfe_step,
+             cfg_strength=cfg_strength,
+             sway_sampling_coef=sway_sampling_coef,
+             use_acc_grl=use_acc_grl,
+             use_prosody_encoder=use_prosody_encoder,
+             ref_ratio=ref_ratio,
+             no_ref_audio=no_ref_audio,
+             speed=speed,
+             fix_duration=fix_duration,
+             device=device,
+         )
+     )
+
+
+ # infer batches
+
+
+ def infer_batch_process(
+     ref_audio,
+     ref_text,
+     gen_text_batches,
+     model_obj,
+     vocoder,
+     mel_spec_type="vocos",
+     progress=tqdm,
+     target_rms=0.1,
+     cross_fade_duration=0.15,
+     nfe_step=32,
+     cfg_strength=2.0,
+     sway_sampling_coef=-1,
+     use_acc_grl=True,
+     use_prosody_encoder=True,
+     ref_ratio=None,
+     no_ref_audio=False,
+     speed=1,
+     fix_duration=None,
+     device=None,
+     streaming=False,
+     chunk_size=2048,
+ ):
+     audio, sr = ref_audio
+     if audio.shape[0] > 1:
+         audio = torch.mean(audio, dim=0, keepdim=True)
+
+     rms = torch.sqrt(torch.mean(torch.square(audio)))
+     if rms < target_rms:
+         audio = audio * target_rms / rms
+     if sr != target_sample_rate:
+         resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+         audio = resampler(audio)
+     audio = audio.to(device)
+
+     generated_waves = []
+     spectrograms = []
+
+     if isinstance(ref_text, str):
+         if ref_text and len(ref_text[-1].encode("utf-8")) == 1:
+             ref_text = ref_text + " "
+
+     def process_batch(gen_text):
+         local_speed = speed
+
+         if isinstance(ref_text, str):
+             if len(gen_text.encode("utf-8")) < 10:
+                 local_speed = 0.3
+
+             # Prepare the text
+             text_list = [ref_text + gen_text]
+             final_text_list = convert_char_to_pinyin(text_list)
+         else:
+             final_text_list = [ref_text + gen_text]
+         print("final_text_list:", final_text_list)
+
+         ref_audio_len = audio.shape[-1] // hop_length
+         if fix_duration is not None:
+             duration = int(fix_duration * target_sample_rate / hop_length)
+         else:
+             # Calculate duration
+             ref_text_len = len(ref_text)  # .encode("utf-8")
+             gen_text_len = len(gen_text)  # .encode("utf-8")
+             duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / local_speed)
+
+         # inference
+         with torch.inference_mode():
+             generated, _ = model_obj.sample(
+                 cond=audio,
+                 text=final_text_list,
+                 duration=duration,
+                 steps=nfe_step,
+                 cfg_strength=cfg_strength,
+                 sway_sampling_coef=sway_sampling_coef,
+                 use_acc_grl=use_acc_grl,
+                 use_prosody_encoder=use_prosody_encoder,
+                 ref_ratio=ref_ratio,
+                 no_ref_audio=no_ref_audio,
+             )
+             del _
+
+             generated = generated.to(torch.float32)  # generated mel spectrogram
+             generated = generated[:, ref_audio_len:, :]
+             generated = generated.permute(0, 2, 1)
+             if mel_spec_type == "vocos":
+                 generated_wave = vocoder.decode(generated)
+             elif mel_spec_type == "bigvgan":
+                 generated_wave = vocoder(generated)
+             if rms < target_rms:
+                 generated_wave = generated_wave * rms / target_rms
+
+             # wav -> numpy
+             # generated_wave = torch.clip(generated_wave, -0.999, 0.999)
+             generated_wave = generated_wave.squeeze().cpu().numpy()
+
+             if streaming:
+                 for j in range(0, len(generated_wave), chunk_size):
+                     yield generated_wave[j : j + chunk_size], target_sample_rate
+             else:
+                 generated_cpu = generated[0].cpu().numpy()
+                 del generated
+                 yield generated_wave, generated_cpu
+
+     if streaming:
+         for gen_text in progress.tqdm(gen_text_batches) if progress is not None else gen_text_batches:
+             for chunk in process_batch(gen_text):
+                 yield chunk
+     else:
+         with ThreadPoolExecutor() as executor:
+             futures = [executor.submit(process_batch, gen_text) for gen_text in gen_text_batches]
+             for future in progress.tqdm(futures) if progress is not None else futures:
+                 result = future.result()
+                 if result:
+                     generated_wave, generated_mel_spec = next(result)
+                     generated_waves.append(generated_wave)
+                     spectrograms.append(generated_mel_spec)
+
+         if generated_waves:
+             if cross_fade_duration <= 0:
+                 # Simply concatenate
+                 final_wave = np.concatenate(generated_waves)
+             else:
+                 # Combine all generated waves with cross-fading
+                 final_wave = generated_waves[0]
+                 for i in range(1, len(generated_waves)):
+                     prev_wave = final_wave
+                     next_wave = generated_waves[i]
+
+                     # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+                     cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+                     cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+
+                     if cross_fade_samples <= 0:
+                         # No overlap possible, concatenate
+                         final_wave = np.concatenate([prev_wave, next_wave])
+                         continue
+
+                     # Overlapping parts
+                     prev_overlap = prev_wave[-cross_fade_samples:]
+                     next_overlap = next_wave[:cross_fade_samples]
+
+                     # Fade out and fade in
+                     fade_out = np.linspace(1, 0, cross_fade_samples)
+                     fade_in = np.linspace(0, 1, cross_fade_samples)
+
+                     # Cross-faded overlap
+                     cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+
+                     # Combine
+                     new_wave = np.concatenate(
+                         [prev_wave[:-cross_fade_samples], cross_faded_overlap, next_wave[cross_fade_samples:]]
+                     )
+
+                     final_wave = new_wave
+
+             # Create a combined spectrogram
+             combined_spectrogram = np.concatenate(spectrograms, axis=1)
+             final_wave = np.clip(final_wave, -0.999, 0.999)
+             yield final_wave, target_sample_rate, combined_spectrogram
+
+         else:
+             yield None, target_sample_rate, None
+
+
+ # remove silence from generated wav
+
+
+ def remove_silence_for_generated_wav(filename):
+     aseg = AudioSegment.from_file(filename)
+     non_silent_segs = silence.split_on_silence(
+         aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
+     )
+     non_silent_wave = AudioSegment.silent(duration=0)
+     for non_silent_seg in non_silent_segs:
+         non_silent_wave += non_silent_seg
+     aseg = non_silent_wave
+     aseg.export(filename, format="wav")
+
+
+ # save spectrogram
+
+
+ def save_spectrogram(spectrogram, path):
+     plt.figure(figsize=(12, 4))
+     plt.imshow(spectrogram, origin="lower", aspect="auto")
+     plt.colorbar()
+     plt.savefig(path)
+     plt.close()
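A small usage sketch for `chunk_text` above: the budget is measured in UTF-8 bytes, so CJK characters count as three each. The sample string is illustrative only.

```python
from lemas_tts.infer.utils_infer import chunk_text

for chunk in chunk_text("First sentence. Second one! 这是第三句。", max_chars=20):
    print(repr(chunk))
# Each chunk stays near the byte budget unless a single sentence already exceeds it.
```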
lemas_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
+ ## Backbones quick introduction
+
+ ### unett.py
+ - flat unet transformer
+ - structure same as in the e2-tts & voicebox papers, except using rotary pos emb
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
+
+ ### dit.py
+ - adaln-zero dit
+ - embedded timestep as condition
+ - concatted noised_input + masked_cond + embedded_text, linear proj in
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
+ - possible long skip connection (first layer to last layer)
+
+ ### mmdit.py
+ - stable diffusion 3 block structure
+ - timestep as condition
+ - left stream: text embedded and applied an abs pos emb
+ - right stream: masked_cond & noised_input concatted, with the same conv pos emb as unett
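To make the dit.py bullet concrete, the conditioning inputs are concatenated along the channel axis and linearly projected into the model width, as in this self-contained sketch (the tensor sizes are illustrative, not the model's actual config):

```python
import torch
import torch.nn as nn

mel_dim, text_dim, dim, seq_len = 100, 100, 512, 32
proj = nn.Linear(mel_dim * 2 + text_dim, dim)  # mirrors InputEmbedding.proj below

noised_input = torch.randn(1, seq_len, mel_dim)
masked_cond = torch.randn(1, seq_len, mel_dim)
embedded_text = torch.randn(1, seq_len, text_dim)

x = proj(torch.cat((noised_input, masked_cond, embedded_text), dim=-1))
print(x.shape)  # torch.Size([1, 32, 512])
```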
lemas_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,254 @@
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Optional
13
+
14
+ import torch
15
+ from torch import nn
16
+ import torch.nn.functional as F
17
+
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from lemas_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ DiTBlock,
25
+ AdaLayerNorm_Final,
26
+ precompute_freqs_cis,
27
+ get_pos_embed_indices,
28
+ )
29
+ from lemas_tts.model.backbones.ecapa_tdnn import ECAPA_TDNN
30
+
31
+ # Text embedding
32
+
33
+
34
+ class TextEmbedding(nn.Module):
35
+ def __init__(self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2):
36
+ super().__init__()
37
+ self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
38
+
39
+ self.mask_padding = mask_padding # mask filler and batch padding tokens or not
40
+
41
+ if conv_layers > 0:
42
+ self.extra_modeling = True
43
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
44
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
45
+ self.text_blocks = nn.Sequential(
46
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
47
+ )
48
+ else:
49
+ self.extra_modeling = False
50
+
51
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
52
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
53
+ text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
54
+ batch, text_len = text.shape[0], text.shape[1]
55
+ text = F.pad(text, (0, seq_len - text_len), value=0)
56
+ if self.mask_padding:
57
+ text_mask = text == 0
58
+
59
+ if drop_text: # cfg for text
60
+ text = torch.zeros_like(text)
61
+
62
+ text = self.text_embed(text) # b n -> b n d
63
+
64
+ # possible extra modeling
65
+ if self.extra_modeling:
66
+ # sinus pos emb
67
+ batch_start = torch.zeros((batch,), dtype=torch.long)
68
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
69
+ text_pos_embed = self.freqs_cis[pos_idx]
70
+ text = text + text_pos_embed
71
+
72
+ # convnextv2 blocks
73
+ if self.mask_padding:
74
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
75
+ for block in self.text_blocks:
76
+ text = block(text)
77
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
78
+ else:
79
+ text = self.text_blocks(text)
80
+
81
+ return text
82
+
83
+
84
+ # noised input audio and context mixing embedding
85
+
86
+
87
+ class InputEmbedding(nn.Module):
88
+ def __init__(self, mel_dim, text_dim, out_dim):
89
+ super().__init__()
90
+ self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
91
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
92
+
93
+ def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
94
+ if drop_audio_cond: # cfg for cond audio
95
+ cond = torch.zeros_like(cond)
96
+
97
+ x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
98
+ x = self.conv_pos_embed(x) + x
99
+ return x
100
+
101
+
102
+ # Transformer backbone using DiT blocks
103
+
104
+
105
+ class DiT(nn.Module):
106
+ def __init__(
107
+ self,
108
+ *,
109
+ dim,
110
+ depth=8,
111
+ heads=8,
112
+ dim_head=64,
113
+ dropout=0.1,
114
+ ff_mult=4,
115
+ mel_dim=100,
116
+ text_num_embeds=256,
117
+ text_dim=None,
118
+ text_mask_padding=True,
119
+ qk_norm=None,
120
+ conv_layers=0,
121
+ pe_attn_head=None,
122
+ long_skip_connection=False,
123
+ checkpoint_activations=False,
124
+ use_prosody_encoder=False,
125
+ ):
126
+ super().__init__()
127
+
128
+ self.time_embed = TimestepEmbedding(dim)
129
+ if text_dim is None:
130
+ text_dim = mel_dim
131
+ self.text_embed = TextEmbedding(
132
+ text_num_embeds, text_dim, mask_padding=text_mask_padding, conv_layers=conv_layers
133
+ )
134
+ # project prosody embeddings (512-dim) to text_dim for conditioning
135
+ self.use_prosody_encoder = use_prosody_encoder
136
+ if use_prosody_encoder:
137
+ self.prosody_text_proj = nn.Linear(512, text_dim)
138
+ else:
139
+ self.prosody_text_proj = None
140
+ self.text_cond, self.text_uncond = None, None # text cache
141
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
142
+
143
+ self.rotary_embed = RotaryEmbedding(dim_head)
144
+
145
+ self.dim = dim
146
+ self.depth = depth
147
+
148
+ self.transformer_blocks = nn.ModuleList(
149
+ [
150
+ DiTBlock(
151
+ dim=dim,
152
+ heads=heads,
153
+ dim_head=dim_head,
154
+ ff_mult=ff_mult,
155
+ dropout=dropout,
156
+ qk_norm=qk_norm,
157
+ pe_attn_head=pe_attn_head,
158
+ )
159
+ for _ in range(depth)
160
+ ]
161
+ )
162
+ self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
163
+
164
+ self.norm_out = AdaLayerNorm_Final(dim) # final modulation
165
+ self.proj_out = nn.Linear(dim, mel_dim)
166
+
167
+ self.checkpoint_activations = checkpoint_activations
168
+
169
+ self.initialize_weights()
170
+
171
+ def initialize_weights(self):
172
+ # Zero-out AdaLN layers in DiT blocks:
173
+ for block in self.transformer_blocks:
174
+ nn.init.constant_(block.attn_norm.linear.weight, 0)
175
+ nn.init.constant_(block.attn_norm.linear.bias, 0)
176
+
177
+ # Zero-out output layers:
178
+ nn.init.constant_(self.norm_out.linear.weight, 0)
179
+ nn.init.constant_(self.norm_out.linear.bias, 0)
180
+ nn.init.constant_(self.proj_out.weight, 0)
181
+ nn.init.constant_(self.proj_out.bias, 0)
182
+
183
+ def ckpt_wrapper(self, module):
184
+ # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
185
+ def ckpt_forward(*inputs):
186
+ outputs = module(*inputs)
187
+ return outputs
188
+
189
+ return ckpt_forward
190
+
191
+ def clear_cache(self):
192
+ self.text_cond, self.text_uncond = None, None
193
+
194
+ def forward(
195
+ self,
196
+ x: float["b n d"], # nosied input audio # noqa: F722
197
+ cond: float["b n d"], # masked cond audio # noqa: F722
198
+ text: int["b nt"], # text # noqa: F722
199
+ time: float["b"] | float[""], # time step # noqa: F821 F722
200
+ drop_audio_cond, # cfg for cond audio
201
+ drop_text, # cfg for text
202
+ mask: bool["b n"] | None = None, # noqa: F722
203
+ cache=False,
204
+ prosody_text: Optional[torch.Tensor] = None,
205
+ ):
206
+ batch, seq_len = x.shape[0], x.shape[1]
207
+ if time.ndim == 0:
208
+ time = time.repeat(batch)
209
+
210
+ # t: conditioning time, text: text, x: noised audio + cond audio + text
211
+ t = self.time_embed(time)
212
+ if cache:
213
+ if drop_text:
214
+ if self.text_uncond is None:
215
+ self.text_uncond = self.text_embed(text, seq_len, drop_text=True)
216
+ text_embed = self.text_uncond
217
+ else:
218
+ if self.text_cond is None:
219
+ self.text_cond = self.text_embed(text, seq_len, drop_text=False)
220
+ text_embed = self.text_cond
221
+ else:
222
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
223
+
224
+ # optional prosody conditioning on text side
225
+ if prosody_text is not None and self.use_prosody_encoder:
226
+ # prosody_text: (B, T_text, 512) -> project to text_dim and align to seq_len
227
+ pt = self.prosody_text_proj(prosody_text)
228
+ if pt.size(1) < seq_len:
229
+ pad_len = seq_len - pt.size(1)
230
+ pt = F.pad(pt, (0, 0, 0, pad_len))
231
+ elif pt.size(1) > seq_len:
232
+ pt = pt[:, :seq_len]
233
+ text_embed = text_embed + pt
234
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
235
+
236
+ rope = self.rotary_embed.forward_from_seq_len(seq_len)
237
+
238
+ if self.long_skip_connection is not None:
239
+ residual = x
240
+
241
+ for block in self.transformer_blocks:
242
+ if self.checkpoint_activations:
243
+ # https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
244
+ x = torch.utils.checkpoint.checkpoint(self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False)
245
+ else:
246
+ x = block(x, t, mask=mask, rope=rope)
247
+
248
+ if self.long_skip_connection is not None:
249
+ x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
250
+
251
+ x = self.norm_out(x, t)
252
+ output = self.proj_out(x)
253
+
254
+ return output
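A minimal forward-pass sketch for the `DiT` backbone above, using a tiny configuration and random tensors purely to illustrate the expected shapes; it assumes the bundled `lemas_tts.model.modules` blocks behave as in the F5-TTS defaults, and real checkpoints use much larger settings.

```python
import torch
from lemas_tts.model.backbones.dit import DiT

model = DiT(dim=64, depth=2, heads=2, dim_head=32, mel_dim=100, text_num_embeds=256)

b, n, nt = 2, 48, 16
x = torch.randn(b, n, 100)             # noised mel input
cond = torch.randn(b, n, 100)          # masked conditioning mel
text = torch.randint(0, 256, (b, nt))  # tokenized text, padded internally to n
time = torch.rand(b)                   # flow-matching timestep

out = model(x, cond, text, time, drop_audio_cond=False, drop_text=False)
print(out.shape)  # torch.Size([2, 48, 100])
```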
lemas_tts/model/backbones/ecapa_tdnn.py ADDED
@@ -0,0 +1,931 @@
+ """A popular speaker recognition and diarization model.
2
+
3
+ Authors
4
+ * Hwidong Na 2020
5
+ """
6
+
7
+ import math
8
+ import os
9
+ import torch # noqa: F401
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ def length_to_mask(length, max_len=None, dtype=None, device=None):
15
+ """Creates a binary mask for each sequence.
16
+
17
+ Reference: https://discuss.pytorch.org/t/how-to-generate-variable-length-mask/23397/3
18
+
19
+ Arguments
20
+ ---------
21
+ length : torch.LongTensor
22
+ Containing the length of each sequence in the batch. Must be 1D.
23
+ max_len : int
24
+ Max length for the mask, also the size of the second dimension.
25
+ dtype : torch.dtype, default: None
26
+ The dtype of the generated mask.
27
+ device: torch.device, default: None
28
+ The device to put the mask variable.
29
+
30
+ Returns
31
+ -------
32
+ mask : tensor
33
+ The binary mask.
34
+
35
+ Example
36
+ -------
37
+ >>> length=torch.Tensor([1,2,3])
38
+ >>> mask=length_to_mask(length)
39
+ >>> mask
40
+ tensor([[1., 0., 0.],
41
+ [1., 1., 0.],
42
+ [1., 1., 1.]])
43
+ """
44
+ assert len(length.shape) == 1
45
+
46
+ if max_len is None:
47
+ max_len = length.max().long().item() # using arange to generate mask
48
+ mask = torch.arange(max_len, device=length.device, dtype=length.dtype).expand(
49
+ len(length), max_len
50
+ ) < length.unsqueeze(1)
51
+
52
+ if dtype is None:
53
+ dtype = length.dtype
54
+
55
+ if device is None:
56
+ device = length.device
57
+
58
+ mask = torch.as_tensor(mask, dtype=dtype, device=device)
59
+ return mask
60
+
61
+
62
+ def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
63
+ """This function computes the number of elements to add for zero-padding.
64
+
65
+ Arguments
66
+ ---------
67
+ L_in : int
68
+ stride: int
69
+ kernel_size : int
70
+ dilation : int
71
+ """
72
+ if stride > 1:
73
+ n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
74
+ L_out = stride * (n_steps - 1) + kernel_size * dilation
75
+ padding = [kernel_size // 2, kernel_size // 2]
76
+
77
+ else:
78
+ L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
79
+
80
+ padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
81
+ return padding
82
+
83
+
84
+ class Conv1d(nn.Module):
85
+ """This function implements 1d convolution.
86
+
87
+ Arguments
88
+ ---------
89
+ out_channels : int
90
+ It is the number of output channels.
91
+ kernel_size : int
92
+ Kernel size of the convolutional filters.
93
+ input_shape : tuple
94
+ The shape of the input. Alternatively use ``in_channels``.
95
+ in_channels : int
96
+ The number of input channels. Alternatively use ``input_shape``.
97
+ stride : int
98
+ Stride factor of the convolutional filters. When the stride factor > 1,
99
+ a decimation in time is performed.
100
+ dilation : int
101
+ Dilation factor of the convolutional filters.
102
+ padding : str
103
+ (same, valid, causal). If "valid", no padding is performed.
104
+ If "same" and stride is 1, output shape is the same as the input shape.
105
+ "causal" results in causal (dilated) convolutions.
106
+ padding_mode : str
107
+ This flag specifies the type of padding. See torch.nn documentation
108
+ for more information.
109
+ skip_transpose : bool
110
+ If False, uses batch x time x channel convention of speechbrain.
111
+ If True, uses batch x channel x time convention.
112
+
113
+ Example
114
+ -------
115
+ >>> inp_tensor = torch.rand([10, 40, 16])
116
+ >>> cnn_1d = Conv1d(
117
+ ... input_shape=inp_tensor.shape, out_channels=8, kernel_size=5
118
+ ... )
119
+ >>> out_tensor = cnn_1d(inp_tensor)
120
+ >>> out_tensor.shape
121
+ torch.Size([10, 40, 8])
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ out_channels,
127
+ kernel_size,
128
+ input_shape=None,
129
+ in_channels=None,
130
+ stride=1,
131
+ dilation=1,
132
+ padding="same",
133
+ groups=1,
134
+ bias=True,
135
+ padding_mode="reflect",
136
+ skip_transpose=True,
137
+ ):
138
+ super().__init__()
139
+ self.kernel_size = kernel_size
140
+ self.stride = stride
141
+ self.dilation = dilation
142
+ self.padding = padding
143
+ self.padding_mode = padding_mode
144
+ self.unsqueeze = False
145
+ self.skip_transpose = skip_transpose
146
+
147
+ if input_shape is None and in_channels is None:
148
+ raise ValueError("Must provide one of input_shape or in_channels")
149
+
150
+ if in_channels is None:
151
+ in_channels = self._check_input_shape(input_shape)
152
+
153
+ self.conv = nn.Conv1d(
154
+ in_channels,
155
+ out_channels,
156
+ self.kernel_size,
157
+ stride=self.stride,
158
+ dilation=self.dilation,
159
+ padding=0,
160
+ groups=groups,
161
+ bias=bias,
162
+ )
163
+
164
+ def forward(self, x):
165
+ """Returns the output of the convolution.
166
+
167
+ Arguments
168
+ ---------
169
+ x : torch.Tensor (batch, time, channel)
170
+ input to convolve. 2d or 4d tensors are expected.
171
+ """
172
+
173
+ if not self.skip_transpose:
174
+ x = x.transpose(1, -1)
175
+
176
+ if self.unsqueeze:
177
+ x = x.unsqueeze(1)
178
+
179
+ if self.padding == "same":
180
+ x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride)
181
+
182
+ elif self.padding == "causal":
183
+ num_pad = (self.kernel_size - 1) * self.dilation
184
+ x = F.pad(x, (num_pad, 0))
185
+
186
+ elif self.padding == "valid":
187
+ pass
188
+
189
+ else:
190
+ raise ValueError(
191
+ "Padding must be 'same', 'valid' or 'causal'. Got " + self.padding
192
+ )
193
+
194
+ wx = self.conv(x.to(self.conv.weight.dtype))
195
+
196
+ if self.unsqueeze:
197
+ wx = wx.squeeze(1)
198
+
199
+ if not self.skip_transpose:
200
+ wx = wx.transpose(1, -1)
201
+
202
+ return wx
203
+
204
+ def _manage_padding(
205
+ self,
206
+ x,
207
+ kernel_size: int,
208
+ dilation: int,
209
+ stride: int,
210
+ ):
211
+ """This function performs zero-padding on the time axis
212
+ such that their lengths is unchanged after the convolution.
213
+
214
+ Arguments
215
+ ---------
216
+ x : torch.Tensor
217
+ Input tensor.
218
+ kernel_size : int
219
+ Size of kernel.
220
+ dilation : int
221
+ Dilation used.
222
+ stride : int
223
+ Stride.
224
+ """
225
+
226
+ # Detecting input shape
227
+ L_in = x.shape[-1]
228
+
229
+ # Time padding
230
+ padding = get_padding_elem(L_in, stride, kernel_size, dilation)
231
+
232
+ # Applying padding
233
+ x = F.pad(x, padding, mode=self.padding_mode)
234
+
235
+ return x
236
+
237
+ def _check_input_shape(self, shape):
238
+ """Checks the input shape and returns the number of input channels."""
239
+
240
+ if len(shape) == 2:
241
+ self.unsqueeze = True
242
+ in_channels = 1
243
+ elif self.skip_transpose:
244
+ in_channels = shape[1]
245
+ elif len(shape) == 3:
246
+ in_channels = shape[2]
247
+ else:
248
+ raise ValueError("conv1d expects 2d, 3d inputs. Got " + str(len(shape)))
249
+
250
+ # Kernel size must be odd
251
+ if self.kernel_size % 2 == 0:
252
+ raise ValueError(
253
+ "The field kernel size must be an odd number. Got %s."
254
+ % (self.kernel_size)
255
+ )
256
+ return in_channels
257
+
258
+
259
+ class Fp32BatchNorm(nn.Module):
260
+ def __init__(self, sync=True, *args, **kwargs):
261
+ super().__init__()
262
+
263
+ if (
264
+ not torch.distributed.is_initialized()
265
+ or torch.distributed.get_world_size() == 1
266
+ ):
267
+ sync = False
268
+
269
+ if sync:
270
+ self.bn = nn.SyncBatchNorm(*args, **kwargs)
271
+ else:
272
+ self.bn = nn.BatchNorm1d(*args, **kwargs)
273
+
274
+ self.sync = sync
275
+
276
+ def forward(self, input):
277
+ if self.bn.running_mean.dtype != torch.float:
278
+ if self.sync:
279
+ self.bn.running_mean = self.bn.running_mean.float()
280
+ self.bn.running_var = self.bn.running_var.float()
281
+ if self.bn.affine:
282
+ try:
283
+ self.bn.weight = self.bn.weight.float()
284
+ self.bn.bias = self.bn.bias.float()
285
+ except:
286
+ self.bn.float()
287
+ else:
288
+ self.bn.float()
289
+
290
+ output = self.bn(input.float())
291
+ return output.type_as(input)
292
+
293
+
294
+ class BatchNorm1d(nn.Module):
295
+ """Applies 1d batch normalization to the input tensor.
296
+
297
+ Arguments
298
+ ---------
299
+ input_shape : tuple
300
+ The expected shape of the input. Alternatively, use ``input_size``.
301
+ input_size : int
302
+ The expected size of the input. Alternatively, use ``input_shape``.
303
+ eps : float
304
+ This value is added to std deviation estimation to improve the numerical
305
+ stability.
306
+ momentum : float
307
+ It is a value used for the running_mean and running_var computation.
308
+ affine : bool
309
+ When set to True, the affine parameters are learned.
310
+ track_running_stats : bool
311
+ When set to True, this module tracks the running mean and variance,
312
+ and when set to False, this module does not track such statistics.
313
+ combine_batch_time : bool
314
+ When true, it combines the batch and time axes.
315
+
316
+
317
+ Example
318
+ -------
319
+ >>> input = torch.randn(100, 10)
320
+ >>> norm = BatchNorm1d(input_shape=input.shape)
321
+ >>> output = norm(input)
322
+ >>> output.shape
323
+ torch.Size([100, 10])
324
+ """
325
+
326
+ def __init__(
327
+ self,
328
+ input_shape=None,
329
+ input_size=None,
330
+ eps=1e-05,
331
+ momentum=0.1,
332
+ affine=True,
333
+ track_running_stats=True,
334
+ combine_batch_time=False,
335
+ skip_transpose=True,
336
+ enabled=True,
337
+ ):
338
+ super().__init__()
339
+ self.combine_batch_time = combine_batch_time
340
+ self.skip_transpose = skip_transpose
341
+
342
+ if input_size is None and skip_transpose:
343
+ input_size = input_shape[1]
344
+ elif input_size is None:
345
+ input_size = input_shape[-1]
346
+
347
+ if enabled:
348
+ self.norm = Fp32BatchNorm(
349
+ num_features=input_size,
350
+ eps=eps,
351
+ momentum=momentum,
352
+ affine=affine,
353
+ track_running_stats=track_running_stats,
354
+ )
355
+ else:
356
+ self.norm = nn.Identity()
357
+
358
+ def forward(self, x):
359
+ """Returns the normalized input tensor.
360
+
361
+ Arguments
362
+ ---------
363
+ x : torch.Tensor (batch, time, [channels])
364
+ input to normalize. 2d or 3d tensors are expected in input
365
+ 4d tensors can be used when combine_dims=True.
366
+ """
367
+ shape_or = x.shape
368
+ if self.combine_batch_time:
369
+ if x.ndim == 3:
370
+ x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
371
+ else:
372
+ x = x.reshape(shape_or[0] * shape_or[1], shape_or[3], shape_or[2])
373
+
374
+ elif not self.skip_transpose:
375
+ x = x.transpose(-1, 1)
376
+
377
+ x_n = self.norm(x)
378
+
379
+ if self.combine_batch_time:
380
+ x_n = x_n.reshape(shape_or)
381
+ elif not self.skip_transpose:
382
+ x_n = x_n.transpose(1, -1)
383
+
384
+ return x_n
385
+
386
+
387
+ class Linear(torch.nn.Module):
388
+ """Computes a linear transformation y = wx + b.
389
+
390
+ Arguments
391
+ ---------
392
+ n_neurons : int
393
+ It is the number of output neurons (i.e, the dimensionality of the
394
+ output).
395
+ bias : bool
396
+ If True, the additive bias b is adopted.
397
+ combine_dims : bool
398
+ If True and the input is 4D, combine 3rd and 4th dimensions of input.
399
+
400
+ Example
401
+ -------
402
+ >>> inputs = torch.rand(10, 50, 40)
403
+ >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
404
+ >>> output = lin_t(inputs)
405
+ >>> output.shape
406
+ torch.Size([10, 50, 100])
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ n_neurons,
412
+ input_shape=None,
413
+ input_size=None,
414
+ bias=True,
415
+ combine_dims=False,
416
+ ):
417
+ super().__init__()
418
+ self.combine_dims = combine_dims
419
+
420
+ if input_shape is None and input_size is None:
421
+ raise ValueError("Expected one of input_shape or input_size")
422
+
423
+ if input_size is None:
424
+ input_size = input_shape[-1]
425
+ if len(input_shape) == 4 and self.combine_dims:
426
+ input_size = input_shape[2] * input_shape[3]
427
+
428
+ # Weights are initialized following pytorch approach
429
+ self.w = nn.Linear(input_size, n_neurons, bias=bias)
430
+
431
+ def forward(self, x):
432
+ """Returns the linear transformation of input tensor.
433
+
434
+ Arguments
435
+ ---------
436
+ x : torch.Tensor
437
+ Input to transform linearly.
438
+ """
439
+ if x.ndim == 4 and self.combine_dims:
440
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])
441
+
442
+ wx = self.w(x)
443
+
444
+ return wx
445
+
446
+
447
+ class TDNNBlock(nn.Module):
448
+ """An implementation of TDNN.
449
+
450
+ Arguments
451
+ ---------
452
+ in_channels : int
453
+ Number of input channels.
454
+ out_channels : int
455
+ The number of output channels.
456
+ kernel_size : int
457
+ The kernel size of the TDNN blocks.
458
+ dilation : int
459
+ The dilation of the TDNN block.
460
+ activation : torch class
461
+ A class for constructing the activation layers.
462
+
463
+ Example
464
+ -------
465
+ >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
466
+ >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
467
+ >>> out_tensor = layer(inp_tensor).transpose(1, 2)
468
+ >>> out_tensor.shape
469
+ torch.Size([8, 120, 64])
470
+ """
471
+
472
+ def __init__(
473
+ self,
474
+ in_channels,
475
+ out_channels,
476
+ kernel_size,
477
+ dilation,
478
+ activation=nn.ReLU,
479
+ batch_norm=True,
480
+ ):
481
+ super(TDNNBlock, self).__init__()
482
+ self.conv = Conv1d(
483
+ in_channels=in_channels,
484
+ out_channels=out_channels,
485
+ kernel_size=kernel_size,
486
+ dilation=dilation,
487
+ )
488
+ self.activation = activation()
489
+ self.norm = BatchNorm1d(input_size=out_channels, enabled=batch_norm)
490
+
491
+ def forward(self, x):
492
+ return self.norm(self.activation(self.conv(x)))
493
+
494
+
495
+ class Res2NetBlock(torch.nn.Module):
496
+ """An implementation of Res2NetBlock w/ dilation.
497
+
498
+ Arguments
499
+ ---------
500
+ in_channels : int
501
+ The number of channels expected in the input.
502
+ out_channels : int
503
+ The number of output channels.
504
+ scale : int
505
+ The scale of the Res2Net block.
506
+ kernel_size: int
507
+ The kernel size of the Res2Net block.
508
+ dilation : int
509
+ The dilation of the Res2Net block.
510
+
511
+ Example
512
+ -------
513
+ >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
514
+ >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
515
+ >>> out_tensor = layer(inp_tensor).transpose(1, 2)
516
+ >>> out_tensor.shape
517
+ torch.Size([8, 120, 64])
518
+ """
519
+
520
+ def __init__(
521
+ self,
522
+ in_channels,
523
+ out_channels,
524
+ scale=8,
525
+ kernel_size=3,
526
+ dilation=1,
527
+ batch_norm=True,
528
+ ):
529
+ super(Res2NetBlock, self).__init__()
530
+ assert in_channels % scale == 0
531
+ assert out_channels % scale == 0
532
+
533
+ in_channel = in_channels // scale
534
+ hidden_channel = out_channels // scale
535
+
536
+ self.blocks = nn.ModuleList(
537
+ [
538
+ TDNNBlock(
539
+ in_channel,
540
+ hidden_channel,
541
+ kernel_size=kernel_size,
542
+ dilation=dilation,
543
+ batch_norm=batch_norm,
544
+ )
545
+ for i in range(scale - 1)
546
+ ]
547
+ )
548
+ self.scale = scale
549
+
550
+ def forward(self, x):
551
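+ # Res2Net hierarchical flow: the input is split channel-wise into `scale`
+ # groups; group 0 passes through unchanged, group 1 is filtered, and each
+ # later group is filtered after adding the previous group's output, so the
+ # effective receptive field grows with the group index.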
+ y = []
552
+ for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
553
+ if i == 0:
554
+ y_i = x_i
555
+ elif i == 1:
556
+ y_i = self.blocks[i - 1](x_i)
557
+ else:
558
+ y_i = self.blocks[i - 1](x_i + y_i)
559
+ y.append(y_i)
560
+ y = torch.cat(y, dim=1)
561
+ return y
562
+
563
+
564
+ class SEBlock(nn.Module):
565
+ """An implementation of squeeze-and-excitation block.
566
+
567
+ Arguments
568
+ ---------
569
+ in_channels : int
570
+ The number of input channels.
571
+ se_channels : int
572
+ The number of output channels after squeeze.
573
+ out_channels : int
574
+ The number of output channels.
575
+
576
+ Example
577
+ -------
578
+ >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
579
+ >>> se_layer = SEBlock(64, 16, 64)
580
+ >>> lengths = torch.rand((8,))
581
+ >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
582
+ >>> out_tensor.shape
583
+ torch.Size([8, 120, 64])
584
+ """
585
+
586
+ def __init__(self, in_channels, se_channels, out_channels):
587
+ super(SEBlock, self).__init__()
588
+
589
+ self.conv1 = Conv1d(
590
+ in_channels=in_channels, out_channels=se_channels, kernel_size=1
591
+ )
592
+ self.relu = torch.nn.ReLU(inplace=True)
593
+ self.conv2 = Conv1d(
594
+ in_channels=se_channels, out_channels=out_channels, kernel_size=1
595
+ )
596
+ self.sigmoid = torch.nn.Sigmoid()
597
+
598
+ def forward(self, x, lengths=None):
599
+ L = x.shape[-1]
600
+ if lengths is not None:
601
+ mask = length_to_mask(lengths * L, max_len=L, device=x.device)
602
+ mask = mask.unsqueeze(1)
603
+ total = mask.sum(dim=2, keepdim=True)
604
+ s = (x * mask).sum(dim=2, keepdim=True) / total
605
+ else:
606
+ s = x.mean(dim=2, keepdim=True)
607
+
608
+ s = self.relu(self.conv1(s))
609
+ s = self.sigmoid(self.conv2(s))
610
+
611
+ return s * x
612
+
613
+
614
+ class AttentiveStatisticsPooling(nn.Module):
615
+ """This class implements an attentive statistic pooling layer for each channel.
616
+ It returns the concatenated mean and std of the input tensor.
617
+
618
+ Arguments
619
+ ---------
620
+ channels: int
621
+ The number of input channels.
622
+ attention_channels: int
623
+ The number of attention channels.
624
+
625
+ Example
626
+ -------
627
+ >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
628
+ >>> asp_layer = AttentiveStatisticsPooling(64)
629
+ >>> lengths = torch.rand((8,))
630
+ >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
631
+ >>> out_tensor.shape
632
+ torch.Size([8, 1, 128])
633
+ """
634
+
635
+ def __init__(
636
+ self, channels, attention_channels=128, global_context=True, batch_norm=True
637
+ ):
638
+ super().__init__()
639
+
640
+ self.eps = 1e-12
641
+ self.global_context = global_context
642
+ if global_context:
643
+ self.tdnn = TDNNBlock(
644
+ channels * 3, attention_channels, 1, 1, batch_norm=batch_norm
645
+ )
646
+ else:
647
+ self.tdnn = TDNNBlock(
648
+ channels, attention_channels, 1, 1, batch_norm=batch_norm
649
+ )
650
+ self.tanh = nn.Tanh()
651
+ self.conv = Conv1d(
652
+ in_channels=attention_channels, out_channels=channels, kernel_size=1
653
+ )
654
+
655
+ def forward(self, x, lengths=None):
656
+ """Calculates mean and std for a batch (input tensor).
657
+
658
+ Arguments
659
+ ---------
660
+ x : torch.Tensor
661
+ Tensor of shape [N, C, L].
662
+ """
663
+ L = x.shape[-1]
664
+
665
+ def _compute_statistics(x, m, dim=2, eps=self.eps):
666
+ mean = (m * x).sum(dim)
667
+ std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
668
+ return mean, std
669
+
670
+ if lengths is None:
671
+ lengths = torch.ones(x.shape[0], device=x.device)
672
+
673
+ # Make binary mask of shape [N, 1, L]
674
+ mask = length_to_mask(lengths * L, max_len=L, device=x.device)
675
+ mask = mask.unsqueeze(1)
676
+
677
+ # Expand the temporal context of the pooling layer by allowing the
678
+ # self-attention to look at global properties of the utterance.
679
+ if self.global_context:
680
+ # torch.std is unstable for backward computation
681
+ # https://github.com/pytorch/pytorch/issues/4320
682
+ total = mask.sum(dim=2, keepdim=True).float()
683
+ mean, std = _compute_statistics(x, mask / total)
684
+ mean = mean.unsqueeze(2).repeat(1, 1, L)
685
+ std = std.unsqueeze(2).repeat(1, 1, L)
686
+ attn = torch.cat([x, mean, std], dim=1)
687
+ else:
688
+ attn = x
689
+
690
+ # Apply layers
691
+ attn = self.conv(self.tanh(self.tdnn(attn)))
692
+
693
+ # Filter out zero-paddings
694
+ attn = attn.masked_fill(mask == 0, float("-inf"))
695
+
696
+ attn = F.softmax(attn, dim=2)
697
+ mean, std = _compute_statistics(x, attn)
698
+ # Append mean and std of the batch
699
+ pooled_stats = torch.cat((mean, std), dim=1)
700
+ pooled_stats = pooled_stats.unsqueeze(2)
701
+
702
+ return pooled_stats
703
+
704
+
705
+ class SERes2NetBlock(nn.Module):
706
+ """An implementation of the building block in ECAPA-TDNN, i.e.,
707
+ TDNN-Res2Net-TDNN-SEBlock.
708
+
709
+ Arguments
710
+ ---------
711
+ in_channels: int
+ The number of input channels.
+ out_channels: int
712
+ The number of output channels.
713
+ res2net_scale: int
714
+ The scale of the Res2Net block.
715
+ kernel_size: int
716
+ The kernel size of the TDNN blocks.
717
+ dilation: int
718
+ The dilation of the Res2Net block.
719
+ activation : torch class
720
+ A class for constructing the activation layers.
721
+
722
+ Example
723
+ -------
724
+ >>> x = torch.rand(8, 120, 64).transpose(1, 2)
725
+ >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
726
+ >>> out = conv(x).transpose(1, 2)
727
+ >>> out.shape
728
+ torch.Size([8, 120, 64])
729
+ """
730
+
731
+ def __init__(
732
+ self,
733
+ in_channels,
734
+ out_channels,
735
+ res2net_scale=8,
736
+ se_channels=128,
737
+ kernel_size=1,
738
+ dilation=1,
739
+ activation=torch.nn.ReLU,
740
+ batch_norm=True,
741
+ ):
742
+ super().__init__()
743
+ self.out_channels = out_channels
744
+ self.tdnn1 = TDNNBlock(
745
+ in_channels,
746
+ out_channels,
747
+ kernel_size=1,
748
+ dilation=1,
749
+ activation=activation,
750
+ batch_norm=batch_norm,
751
+ )
752
+ self.res2net_block = Res2NetBlock(
753
+ out_channels,
754
+ out_channels,
755
+ res2net_scale,
756
+ kernel_size,
757
+ dilation,
758
+ batch_norm=batch_norm,
759
+ )
760
+ self.tdnn2 = TDNNBlock(
761
+ out_channels,
762
+ out_channels,
763
+ kernel_size=1,
764
+ dilation=1,
765
+ activation=activation,
766
+ batch_norm=batch_norm,
767
+ )
768
+ self.se_block = SEBlock(out_channels, se_channels, out_channels)
769
+
770
+ self.shortcut = None
771
+ if in_channels != out_channels:
772
+ self.shortcut = Conv1d(
773
+ in_channels=in_channels,
774
+ out_channels=out_channels,
775
+ kernel_size=1,
776
+ )
777
+
778
+ def forward(self, x, lengths=None):
779
+ residual = x
780
+ if self.shortcut:
781
+ residual = self.shortcut(x)
782
+
783
+ x = self.tdnn1(x)
784
+ x = self.res2net_block(x)
785
+ x = self.tdnn2(x)
786
+ x = self.se_block(x, lengths)
787
+
788
+ return x + residual
789
+
790
+
791
+ class ECAPA_TDNN(torch.nn.Module):
792
+ """An implementation of the speaker embedding model from the paper
793
+ "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
794
+ TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
795
+
796
+ Arguments
797
+ ---------
798
+ input_size : int
799
+ Expected feature dimension (number of channels) of the input.
800
+ activation : torch class
801
+ A class for constructing the activation layers.
802
+ channels : list of ints
803
+ Output channels for TDNN/SERes2Net layer.
804
+ kernel_sizes : list of ints
805
+ List of kernel sizes for each layer.
806
+ dilations : list of ints
807
+ List of dilations for kernels in each layer.
808
+ lin_neurons : int
809
+ Number of neurons in linear layers (unused here: the final projection maps back to input_size).
810
+
811
+ Example
812
+ -------
813
+ >>> input_feats = torch.rand([5, 120, 80])
814
+ >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
815
+ >>> outputs = compute_embedding(input_feats)
816
+ >>> outputs.shape
817
+ torch.Size([5, 80])
818
+ """
819
+
820
+ def __init__(
821
+ self,
822
+ input_size,
823
+ lin_neurons=192,
824
+ activation=torch.nn.ReLU,
825
+ channels=[512, 512, 512, 512, 1536],
826
+ kernel_sizes=[5, 3, 3, 3, 1],
827
+ dilations=[1, 2, 3, 4, 1],
828
+ attention_channels=128,
829
+ res2net_scale=8,
830
+ se_channels=128,
831
+ global_context=True,
832
+ batch_norm=True,
833
+ ):
834
+
835
+ super().__init__()
836
+ assert len(channels) == len(kernel_sizes)
837
+ assert len(channels) == len(dilations)
838
+ self.channels = channels
839
+ self.blocks = nn.ModuleList()
840
+
841
+ # The initial TDNN layer
842
+ self.blocks.append(
843
+ TDNNBlock(
844
+ input_size,
845
+ channels[0],
846
+ kernel_sizes[0],
847
+ dilations[0],
848
+ activation,
849
+ batch_norm=batch_norm,
850
+ )
851
+ )
852
+
853
+ # SE-Res2Net layers
854
+ for i in range(1, len(channels) - 1):
855
+ self.blocks.append(
856
+ SERes2NetBlock(
857
+ channels[i - 1],
858
+ channels[i],
859
+ res2net_scale=res2net_scale,
860
+ se_channels=se_channels,
861
+ kernel_size=kernel_sizes[i],
862
+ dilation=dilations[i],
863
+ activation=activation,
864
+ batch_norm=batch_norm,
865
+ )
866
+ )
867
+
868
+ # Multi-layer feature aggregation
869
+ self.mfa = TDNNBlock(
870
+ channels[-1],
871
+ channels[-1],
872
+ kernel_sizes[-1],
873
+ dilations[-1],
874
+ activation,
875
+ batch_norm=batch_norm,
876
+ )
877
+
878
+ # Attentive Statistical Pooling
879
+ self.asp = AttentiveStatisticsPooling(
880
+ channels[-1],
881
+ attention_channels=attention_channels,
882
+ global_context=global_context,
883
+ batch_norm=batch_norm,
884
+ )
885
+ self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2, enabled=batch_norm)
886
+
887
+ # Final linear transformation
888
+ self.fc = Conv1d(
889
+ in_channels=channels[-1] * 2,
890
+ out_channels=input_size, # lin_neurons,
891
+ kernel_size=1,
892
+ )
893
+
894
+ # @torch.cuda.amp.autocast(enabled=True, dtype=torch.float32)
895
+ def forward(self, x, lengths=None):
896
+ """Returns the embedding vector.
897
+
898
+ Arguments
899
+ ---------
900
+ x : torch.Tensor
901
+ Tensor of shape (batch, time, channel).
902
+ """
903
+ # Minimize transpose for efficiency
904
+ x = x.transpose(1, 2)
905
+
906
+ xl = []
907
+ for layer in self.blocks:
908
+ try:
909
+ x = layer(x, lengths=lengths)
910
+ except TypeError:
911
+ x = layer(x)
912
+ xl.append(x)
913
+
914
+ # Multi-layer feature aggregation
915
+ x = torch.cat(xl[1:], dim=1)
916
+ x = self.mfa(x)
917
+
918
+ # Attentive Statistical Pooling
919
+ x = self.asp(x, lengths=lengths)
920
+ x = self.asp_bn(x)
921
+
922
+ # Final linear transformation
923
+ x = self.fc(x)
924
+
925
+ x = x.squeeze(-1)
926
+ return x
927
+
928
+
929
+ if __name__ == "__main__":
930
+ model = ECAPA_TDNN(128, batch_norm=False)
931
+ # print(model)
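A minimal usage sketch of the module above (shapes follow the docstring examples; note the final projection maps back to `input_size`, so `lin_neurons` is effectively unused here):

```python
import torch
from lemas_tts.model.backbones.ecapa_tdnn import ECAPA_TDNN

model = ECAPA_TDNN(input_size=80, batch_norm=False)
feats = torch.rand(4, 120, 80)   # (batch, time, mel channels)
emb = model(feats)               # ASP pools over time; fc projects back to input_size
print(emb.shape)                 # expected: torch.Size([4, 80])
```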
lemas_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from lemas_tts.model.modules import (
18
+ TimestepEmbedding,
19
+ ConvPositionEmbedding,
20
+ MMDiTBlock,
21
+ AdaLayerNorm_Final,
22
+ precompute_freqs_cis,
23
+ get_pos_embed_indices,
24
+ )
25
+
26
+
27
+ # text embedding
28
+
29
+
30
+ class TextEmbedding(nn.Module):
31
+ def __init__(self, out_dim, text_num_embeds, mask_padding=True):
32
+ super().__init__()
33
+ self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim) # will use 0 as filler token
34
+
35
+ self.mask_padding = mask_padding # mask filler and batch padding tokens or not
36
+
37
+ self.precompute_max_pos = 1024
38
+ self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)
39
+
40
+ def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]: # noqa: F722
41
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
42
+ if self.mask_padding:
43
+ text_mask = text == 0
44
+
45
+ if drop_text: # cfg for text
46
+ text = torch.zeros_like(text)
47
+
48
+ text = self.text_embed(text) # b nt -> b nt d
49
+
50
+ # sinus pos emb
51
+ batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
52
+ batch_text_len = text.shape[1]
53
+ pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
54
+ text_pos_embed = self.freqs_cis[pos_idx]
55
+
56
+ text = text + text_pos_embed
57
+
58
+ if self.mask_padding:
59
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
60
+
61
+ return text
62
+
63
+
64
+ # noised input & masked cond audio embedding
65
+
66
+
67
+ class AudioEmbedding(nn.Module):
68
+ def __init__(self, in_dim, out_dim):
69
+ super().__init__()
70
+ self.linear = nn.Linear(2 * in_dim, out_dim)
71
+ self.conv_pos_embed = ConvPositionEmbedding(out_dim)
72
+
73
+ def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False): # noqa: F722
74
+ if drop_audio_cond:
75
+ cond = torch.zeros_like(cond)
76
+ x = torch.cat((x, cond), dim=-1)
77
+ x = self.linear(x)
78
+ x = self.conv_pos_embed(x) + x
79
+ return x
80
+
81
+
82
+ # Transformer backbone using MM-DiT blocks
83
+
84
+
85
+ class MMDiT(nn.Module):
86
+ def __init__(
87
+ self,
88
+ *,
89
+ dim,
90
+ depth=8,
91
+ heads=8,
92
+ dim_head=64,
93
+ dropout=0.1,
94
+ ff_mult=4,
95
+ mel_dim=100,
96
+ text_num_embeds=256,
97
+ text_mask_padding=True,
98
+ qk_norm=None,
99
+ ):
100
+ super().__init__()
101
+
102
+ self.time_embed = TimestepEmbedding(dim)
103
+ self.text_embed = TextEmbedding(dim, text_num_embeds, mask_padding=text_mask_padding)
104
+ self.text_cond, self.text_uncond = None, None # text cache
105
+ self.audio_embed = AudioEmbedding(mel_dim, dim)
106
+
107
+ self.rotary_embed = RotaryEmbedding(dim_head)
108
+
109
+ self.dim = dim
110
+ self.depth = depth
111
+
112
+ self.transformer_blocks = nn.ModuleList(
113
+ [
114
+ MMDiTBlock(
115
+ dim=dim,
116
+ heads=heads,
117
+ dim_head=dim_head,
118
+ dropout=dropout,
119
+ ff_mult=ff_mult,
120
+ context_pre_only=i == depth - 1,
121
+ qk_norm=qk_norm,
122
+ )
123
+ for i in range(depth)
124
+ ]
125
+ )
126
+ self.norm_out = AdaLayerNorm_Final(dim) # final modulation
127
+ self.proj_out = nn.Linear(dim, mel_dim)
128
+
129
+ self.initialize_weights()
130
+
131
+ def initialize_weights(self):
132
+ # Zero-out AdaLN layers in MMDiT blocks:
133
+ for block in self.transformer_blocks:
134
+ nn.init.constant_(block.attn_norm_x.linear.weight, 0)
135
+ nn.init.constant_(block.attn_norm_x.linear.bias, 0)
136
+ nn.init.constant_(block.attn_norm_c.linear.weight, 0)
137
+ nn.init.constant_(block.attn_norm_c.linear.bias, 0)
138
+
139
+ # Zero-out output layers:
140
+ nn.init.constant_(self.norm_out.linear.weight, 0)
141
+ nn.init.constant_(self.norm_out.linear.bias, 0)
142
+ nn.init.constant_(self.proj_out.weight, 0)
143
+ nn.init.constant_(self.proj_out.bias, 0)
144
+
145
+ def clear_cache(self):
146
+ self.text_cond, self.text_uncond = None, None
147
+
148
+ def forward(
149
+ self,
150
+ x: float["b n d"], # noised input audio # noqa: F722
151
+ cond: float["b n d"], # masked cond audio # noqa: F722
152
+ text: int["b nt"], # text # noqa: F722
153
+ time: float["b"] | float[""], # time step # noqa: F821 F722
154
+ drop_audio_cond, # cfg for cond audio
155
+ drop_text, # cfg for text
156
+ mask: bool["b n"] | None = None, # noqa: F722
157
+ cache=False,
158
+ ):
159
+ batch = x.shape[0]
160
+ if time.ndim == 0:
161
+ time = time.repeat(batch)
162
+
163
+ # t: conditioning (time), c: context (text only), x: noised input audio fused with masked cond audio
164
+ t = self.time_embed(time)
165
+ if cache:
166
+ if drop_text:
167
+ if self.text_uncond is None:
168
+ self.text_uncond = self.text_embed(text, drop_text=True)
169
+ c = self.text_uncond
170
+ else:
171
+ if self.text_cond is None:
172
+ self.text_cond = self.text_embed(text, drop_text=False)
173
+ c = self.text_cond
174
+ else:
175
+ c = self.text_embed(text, drop_text=drop_text)
176
+ x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)
177
+
178
+ seq_len = x.shape[1]
179
+ text_len = text.shape[1]
180
+ rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
181
+ rope_text = self.rotary_embed.forward_from_seq_len(text_len)
182
+
183
+ for block in self.transformer_blocks:
184
+ c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)
185
+
186
+ x = self.norm_out(x, t)
187
+ output = self.proj_out(x)
188
+
189
+ return output
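The `cache` flag above lets a sampler reuse the conditional and unconditional text embeddings across ODE steps. A hedged sketch of one classifier-free-guidance step driving this forward (`model`, the tensors, and `cfg_strength` are placeholders, not defined in this file; the guidance rule shown is the common F5-style one, an assumption here):

```python
# Two forward passes per step; text embeddings are computed once via cache=True.
pred_cond = model(x, cond, text, t, drop_audio_cond=False, drop_text=False, cache=True)
pred_null = model(x, cond, text, t, drop_audio_cond=True, drop_text=True, cache=True)
pred = pred_cond + (pred_cond - pred_null) * cfg_strength
model.clear_cache()  # drop cached text embeddings before the next utterance
```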
lemas_tts/model/backbones/prosody_encoder.py ADDED
@@ -0,0 +1,433 @@
1
+ """
2
+ Prosody encoder backbone based on the Pretssel ECAPA-TDNN architecture.
3
+
4
+ This module provides:
5
+ - ProsodyEncoder: wraps an ECAPA-TDNN model to produce utterance-level
6
+ prosody embeddings from 80-dim FBANK features.
7
+ - extract_fbank_16k: utility to compute 80-bin FBANK from 16kHz audio.
8
+
9
+ It is self-contained (no fairseq2 dependency) and can be used inside
10
+ CFM or other models as a conditioning network.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+ from typing import List, Optional, Tuple
17
+ import json
18
+
19
+ import torch
20
+ import torchaudio
21
+ from torch import Tensor
22
+ from torch import nn
23
+ from torch.nn import Conv1d, LayerNorm, Module, ModuleList, ReLU, Sigmoid, Tanh, init
24
+ import torch.nn.functional as F
25
+
26
+
27
+ AUDIO_SAMPLE_RATE = 16_000
28
+
29
+
30
+ class ECAPA_TDNN(Module):
31
+ """
32
+ ECAPA-TDNN core used in the Pretssel prosody encoder.
33
+
34
+ Expects input features of shape (B, T, C) with C=80 and returns
35
+ a normalized embedding of shape (B, embed_dim).
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ channels: List[int],
41
+ kernel_sizes: List[int],
42
+ dilations: List[int],
43
+ attention_channels: int,
44
+ res2net_scale: int,
45
+ se_channels: int,
46
+ global_context: bool,
47
+ groups: List[int],
48
+ embed_dim: int,
49
+ input_dim: int,
50
+ ):
51
+ super().__init__()
52
+ assert len(channels) == len(kernel_sizes) == len(dilations)
53
+ self.channels = channels
54
+ self.embed_dim = embed_dim
55
+ self.blocks = ModuleList()
56
+
57
+ self.blocks.append(
58
+ TDNNBlock(
59
+ input_dim,
60
+ channels[0],
61
+ kernel_sizes[0],
62
+ dilations[0],
63
+ groups[0],
64
+ )
65
+ )
66
+
67
+ for i in range(1, len(channels) - 1):
68
+ self.blocks.append(
69
+ SERes2NetBlock(
70
+ channels[i - 1],
71
+ channels[i],
72
+ res2net_scale=res2net_scale,
73
+ se_channels=se_channels,
74
+ kernel_size=kernel_sizes[i],
75
+ dilation=dilations[i],
76
+ groups=groups[i],
77
+ )
78
+ )
79
+
80
+ self.mfa = TDNNBlock(
81
+ channels[-1],
82
+ channels[-1],
83
+ kernel_sizes[-1],
84
+ dilations[-1],
85
+ groups=groups[-1],
86
+ )
87
+
88
+ self.asp = AttentiveStatisticsPooling(
89
+ channels[-1],
90
+ attention_channels=attention_channels,
91
+ global_context=global_context,
92
+ )
93
+ self.asp_norm = LayerNorm(channels[-1] * 2, eps=1e-12)
94
+
95
+ self.fc = Conv1d(
96
+ in_channels=channels[-1] * 2,
97
+ out_channels=embed_dim,
98
+ kernel_size=1,
99
+ )
100
+
101
+ self.reset_parameters()
102
+
103
+ def reset_parameters(self) -> None:
104
+ def encoder_init(m: Module) -> None:
105
+ if isinstance(m, Conv1d):
106
+ init.xavier_uniform_(m.weight, init.calculate_gain("relu"))
107
+
108
+ self.apply(encoder_init)
109
+
110
+ def forward(
111
+ self,
112
+ x: Tensor,
113
+ padding_mask: Optional[Tensor] = None,
114
+ ) -> Tensor:
115
+ # x: (B, T, C)
116
+ x = x.transpose(1, 2) # (B, C, T)
117
+
118
+ xl = []
119
+ for layer in self.blocks:
120
+ x = layer(x, padding_mask=padding_mask)
121
+ xl.append(x)
122
+
123
+ x = torch.cat(xl[1:], dim=1)
124
+ x = self.mfa(x)
125
+
126
+ x = self.asp(x, padding_mask=padding_mask)
127
+ x = self.asp_norm(x.transpose(1, 2)).transpose(1, 2)
128
+
129
+ x = self.fc(x)
130
+
131
+ x = x.transpose(1, 2).squeeze(1) # (B, embed_dim)
132
+ return F.normalize(x, dim=-1)
133
+
134
+
135
+ class TDNNBlock(Module):
136
+ def __init__(
137
+ self,
138
+ in_channels: int,
139
+ out_channels: int,
140
+ kernel_size: int,
141
+ dilation: int,
142
+ groups: int = 1,
143
+ ):
144
+ super().__init__()
145
+ self.conv = Conv1d(
146
+ in_channels=in_channels,
147
+ out_channels=out_channels,
148
+ kernel_size=kernel_size,
149
+ dilation=dilation,
150
+ padding=dilation * (kernel_size - 1) // 2,
151
+ groups=groups,
152
+ )
153
+ self.activation = ReLU()
154
+ self.norm = LayerNorm(out_channels, eps=1e-12)
155
+
156
+ def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
157
+ x = self.activation(self.conv(x))
158
+ return self.norm(x.transpose(1, 2)).transpose(1, 2)
159
+
160
+
161
+ class Res2NetBlock(Module):
162
+ def __init__(
163
+ self,
164
+ in_channels: int,
165
+ out_channels: int,
166
+ scale: int = 8,
167
+ kernel_size: int = 3,
168
+ dilation: int = 1,
169
+ ):
170
+ super().__init__()
171
+ assert in_channels % scale == 0
172
+ assert out_channels % scale == 0
173
+
174
+ in_channel = in_channels // scale
175
+ hidden_channel = out_channels // scale
176
+ self.blocks = ModuleList(
177
+ [
178
+ TDNNBlock(
179
+ in_channel,
180
+ hidden_channel,
181
+ kernel_size=kernel_size,
182
+ dilation=dilation,
183
+ )
184
+ for _ in range(scale - 1)
185
+ ]
186
+ )
187
+ self.scale = scale
188
+
189
+ def forward(self, x: Tensor) -> Tensor:
190
+ y = []
191
+ for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
192
+ if i == 0:
193
+ y_i = x_i
194
+ elif i == 1:
195
+ y_i = self.blocks[i - 1](x_i)
196
+ else:
197
+ y_i = self.blocks[i - 1](x_i + y_i)
198
+ y.append(y_i)
199
+ return torch.cat(y, dim=1)
200
+
201
+
202
+ class SEBlock(Module):
203
+ def __init__(
204
+ self,
205
+ in_channels: int,
206
+ se_channels: int,
207
+ out_channels: int,
208
+ ):
209
+ super().__init__()
210
+ self.conv1 = Conv1d(in_channels=in_channels, out_channels=se_channels, kernel_size=1)
211
+ self.relu = ReLU(inplace=True)
212
+ self.conv2 = Conv1d(in_channels=se_channels, out_channels=out_channels, kernel_size=1)
213
+ self.sigmoid = Sigmoid()
214
+
215
+ def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
216
+ if padding_mask is not None:
217
+ # padding_mask: (B, T) with 1 for valid, 0 for pad
218
+ mask = padding_mask.unsqueeze(1) # (B, 1, T)
219
+ lengths = mask.sum(dim=2, keepdim=True)
220
+ s = (x * mask).sum(dim=2, keepdim=True) / torch.clamp(lengths, min=1.0)
221
+ else:
222
+ s = x.mean(dim=2, keepdim=True)
223
+
224
+ s = self.relu(self.conv1(s))
225
+ s = self.sigmoid(self.conv2(s))
226
+ return s * x
227
+
228
+
229
+ class AttentiveStatisticsPooling(Module):
230
+ def __init__(
231
+ self, channels: int, attention_channels: int = 128, global_context: bool = True
232
+ ):
233
+ super().__init__()
234
+ self.eps = 1e-12
235
+ self.global_context = global_context
236
+ if global_context:
237
+ self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
238
+ else:
239
+ self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
240
+
241
+ self.tanh = Tanh()
242
+ self.conv = Conv1d(in_channels=attention_channels, out_channels=channels, kernel_size=1)
243
+
244
+ def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
245
+ # x: (N, C, L)
246
+ N, C, L = x.shape
247
+
248
+ def _compute_statistics(
249
+ x: Tensor, m: Tensor, dim: int = 2, eps: float = 1e-12
250
+ ) -> Tuple[Tensor, Tensor]:
251
+ mean = (m * x).sum(dim)
252
+ std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
253
+ return mean, std
254
+
255
+ if padding_mask is not None:
256
+ mask = padding_mask
257
+ else:
258
+ mask = torch.ones(N, L, device=x.device, dtype=x.dtype)
259
+ mask = mask.unsqueeze(1) # (N, 1, L)
260
+
261
+ if self.global_context:
262
+ total = mask.sum(dim=2, keepdim=True).to(x)
263
+ mean, std = _compute_statistics(x, mask / total)
264
+ mean = mean.unsqueeze(2).repeat(1, 1, L)
265
+ std = std.unsqueeze(2).repeat(1, 1, L)
266
+ attn = torch.cat([x, mean, std], dim=1)
267
+ else:
268
+ attn = x
269
+
270
+ attn = self.conv(self.tanh(self.tdnn(attn)))
271
+
272
+ attn = attn.masked_fill(mask == 0, float("-inf"))
273
+
274
+ attn = F.softmax(attn, dim=2)
275
+ mean, std = _compute_statistics(x, attn)
276
+ pooled_stats = torch.cat((mean, std), dim=1)
277
+ pooled_stats = pooled_stats.unsqueeze(2)
278
+ return pooled_stats
279
+
280
+
281
+ class SERes2NetBlock(Module):
282
+ def __init__(
283
+ self,
284
+ in_channels: int,
285
+ out_channels: int,
286
+ res2net_scale: int = 8,
287
+ se_channels: int = 128,
288
+ kernel_size: int = 1,
289
+ dilation: int = 1,
290
+ groups: int = 1,
291
+ ):
292
+ super().__init__()
293
+ self.out_channels = out_channels
294
+ self.tdnn1 = TDNNBlock(
295
+ in_channels,
296
+ out_channels,
297
+ kernel_size=1,
298
+ dilation=1,
299
+ groups=groups,
300
+ )
301
+ self.res2net_block = Res2NetBlock(
302
+ out_channels,
303
+ out_channels,
304
+ res2net_scale,
305
+ kernel_size,
306
+ dilation,
307
+ )
308
+ self.tdnn2 = TDNNBlock(
309
+ out_channels,
310
+ out_channels,
311
+ kernel_size=1,
312
+ dilation=1,
313
+ groups=groups,
314
+ )
315
+ self.se_block = SEBlock(out_channels, se_channels, out_channels)
316
+
317
+ self.shortcut = None
318
+ if in_channels != out_channels:
319
+ self.shortcut = Conv1d(
320
+ in_channels=in_channels,
321
+ out_channels=out_channels,
322
+ kernel_size=1,
323
+ )
324
+
325
+ def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
326
+ residual = x
327
+ if self.shortcut:
328
+ residual = self.shortcut(x)
329
+
330
+ x = self.tdnn1(x)
331
+ x = self.res2net_block(x)
332
+ x = self.tdnn2(x)
333
+ x = self.se_block(x, padding_mask=padding_mask)
334
+ return x + residual
335
+
336
+
337
+ def extract_fbank_16k(audio_16k: Tensor) -> Tensor:
338
+ """
339
+ Compute 80-dim FBANK features from 16kHz audio.
340
+
341
+ Args:
342
+ audio_16k: Tensor of shape (T,) or (1, T)
343
+ Returns:
344
+ fbank: Tensor of shape (T_fbank, 80)
345
+ """
346
+ if audio_16k.ndim == 1:
347
+ audio_16k = audio_16k.unsqueeze(0)
348
+
349
+ # Ensure minimum length for kaldi.fbank window (default 25ms @16k -> 400 samples)
350
+ min_len = 400
351
+
352
+ if audio_16k.shape[-1] < min_len:
353
+ repeat_times = (min_len // audio_16k.shape[-1]) + 1
354
+ audio_16k = audio_16k.repeat(1, repeat_times) if audio_16k.dim() > 1 else audio_16k.repeat(repeat_times)
355
+
356
+ fbank = torchaudio.compliance.kaldi.fbank(
357
+ audio_16k,
358
+ num_mel_bins=80,
359
+ sample_frequency=AUDIO_SAMPLE_RATE,
360
+ )
361
+ return fbank
362
+
363
+
364
+ class ProsodyEncoder(nn.Module):
365
+ """
366
+ High-level wrapper for the Pretssel prosody encoder.
367
+
368
+ Usage:
369
+ encoder = ProsodyEncoder(cfg_path, ckpt_path, freeze=True)
370
+ emb = encoder(fbank_batch) # (B, 512)
371
+ """
372
+
373
+ def __init__(self, cfg_path: Path, ckpt_path: Path, freeze: bool = True):
374
+ super().__init__()
375
+ model_cfg = self._load_pretssel_model_cfg(cfg_path)
376
+ self.encoder = self._build_prosody_encoder(model_cfg)
377
+ self._load_prosody_encoder_state(self.encoder, ckpt_path)
378
+ if freeze:
379
+ for p in self.encoder.parameters():
380
+ p.requires_grad = False
381
+
382
+ @staticmethod
383
+ def _load_pretssel_model_cfg(cfg_path: Path) -> dict:
384
+ cfg = json.loads(cfg_path.read_text())
385
+ if "model" not in cfg:
386
+ raise ValueError(f"{cfg_path} does not contain a top-level 'model' key.")
387
+ return cfg["model"]
388
+
389
+ @staticmethod
390
+ def _build_prosody_encoder(model_cfg: dict) -> ECAPA_TDNN:
391
+ encoder = ECAPA_TDNN(
392
+ channels=model_cfg["prosody_channels"],
393
+ kernel_sizes=model_cfg["prosody_kernel_sizes"],
394
+ dilations=model_cfg["prosody_dilations"],
395
+ attention_channels=model_cfg["prosody_attention_channels"],
396
+ res2net_scale=model_cfg["prosody_res2net_scale"],
397
+ se_channels=model_cfg["prosody_se_channels"],
398
+ global_context=model_cfg["prosody_global_context"],
399
+ groups=model_cfg["prosody_groups"],
400
+ embed_dim=model_cfg["prosody_embed_dim"],
401
+ input_dim=model_cfg["input_feat_per_channel"],
402
+ )
403
+ return encoder
404
+
405
+ @staticmethod
406
+ def _load_prosody_encoder_state(model: Module, ckpt_path: Path) -> None:
407
+ state = torch.load(ckpt_path, map_location="cpu")
408
+ if isinstance(state, dict):
409
+ if all(isinstance(k, str) for k in state.keys()) and (
410
+ any(k.startswith("prosody_encoder.") for k in state.keys())
411
+ or any(k.startswith("prosody_encoder_model.") for k in state.keys())
412
+ ):
413
+ state = {
414
+ k.replace("prosody_encoder_model.", "", 1).replace("prosody_encoder.", "", 1): v
415
+ for k, v in state.items()
416
+ if k.startswith("prosody_encoder.") or k.startswith("prosody_encoder_model.")
417
+ }
418
+ missing, unexpected = model.load_state_dict(state, strict=False)
419
+ if missing or unexpected:
420
+ raise RuntimeError(
421
+ f"Error loading checkpoint {ckpt_path}: missing keys={missing}, "
422
+ f"unexpected keys={unexpected}"
423
+ )
424
+
425
+ def forward(self, fbank: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
426
+ """
427
+ Args:
428
+ fbank: Tensor of shape (B, T, 80)
429
+ padding_mask: Optional tensor of shape (B, T) with 1 for valid.
430
+ Returns:
431
+ emb: Tensor of shape (B, 512)
432
+ """
433
+ return self.encoder(fbank, padding_mask=padding_mask)
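A minimal sketch tying `extract_fbank_16k` and `ProsodyEncoder` together; the config/checkpoint paths and the reference wav below are placeholders:

```python
import torchaudio
from pathlib import Path
from lemas_tts.model.backbones.prosody_encoder import (
    ProsodyEncoder, extract_fbank_16k, AUDIO_SAMPLE_RATE,
)

encoder = ProsodyEncoder(Path("pretssel_cfg.json"), Path("prosody_encoder.pt"), freeze=True)
wav, sr = torchaudio.load("reference.wav")
wav_16k = torchaudio.functional.resample(wav, sr, AUDIO_SAMPLE_RATE)
fbank = extract_fbank_16k(wav_16k[0])  # (T_fbank, 80)
emb = encoder(fbank.unsqueeze(0))      # (1, embed_dim), L2-normalized
```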
lemas_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from typing import Literal
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from lemas_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ Attention,
25
+ AttnProcessor,
26
+ FeedForward,
27
+ precompute_freqs_cis,
28
+ get_pos_embed_indices,
29
+ )
30
+
31
+
32
+ # Text embedding
33
+
34
+
35
+ class TextEmbedding(nn.Module):
36
+ def __init__(self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2):
37
+ super().__init__()
38
+ self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
39
+
40
+ self.mask_padding = mask_padding # mask filler and batch padding tokens or not
41
+
42
+ if conv_layers > 0:
43
+ self.extra_modeling = True
44
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
45
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
46
+ self.text_blocks = nn.Sequential(
47
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
48
+ )
49
+ else:
50
+ self.extra_modeling = False
51
+
52
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
53
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
54
+ text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
55
+ batch, text_len = text.shape[0], text.shape[1]
56
+ text = F.pad(text, (0, seq_len - text_len), value=0)
57
+ if self.mask_padding:
58
+ text_mask = text == 0
59
+
60
+ if drop_text: # cfg for text
61
+ text = torch.zeros_like(text)
62
+
63
+ text = self.text_embed(text) # b n -> b n d
64
+
65
+ # possible extra modeling
66
+ if self.extra_modeling:
67
+ # sinus pos emb
68
+ batch_start = torch.zeros((batch,), dtype=torch.long)
69
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
70
+ text_pos_embed = self.freqs_cis[pos_idx]
71
+ text = text + text_pos_embed
72
+
73
+ # convnextv2 blocks
74
+ if self.mask_padding:
75
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
76
+ for block in self.text_blocks:
77
+ text = block(text)
78
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
79
+ else:
80
+ text = self.text_blocks(text)
81
+
82
+ return text
83
+
84
+
85
+ # noised input audio and context mixing embedding
86
+
87
+
88
+ class InputEmbedding(nn.Module):
89
+ def __init__(self, mel_dim, text_dim, out_dim):
90
+ super().__init__()
91
+ self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
92
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
93
+
94
+ def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
95
+ if drop_audio_cond: # cfg for cond audio
96
+ cond = torch.zeros_like(cond)
97
+
98
+ x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
99
+ x = self.conv_pos_embed(x) + x
100
+ return x
101
+
102
+
103
+ # Flat UNet Transformer backbone
104
+
105
+
106
+ class UNetT(nn.Module):
107
+ def __init__(
108
+ self,
109
+ *,
110
+ dim,
111
+ depth=8,
112
+ heads=8,
113
+ dim_head=64,
114
+ dropout=0.1,
115
+ ff_mult=4,
116
+ mel_dim=100,
117
+ text_num_embeds=256,
118
+ text_dim=None,
119
+ text_mask_padding=True,
120
+ qk_norm=None,
121
+ conv_layers=0,
122
+ pe_attn_head=None,
123
+ skip_connect_type: Literal["add", "concat", "none"] = "concat",
124
+ ):
125
+ super().__init__()
126
+ assert depth % 2 == 0, "UNet-Transformer's depth should be even."
127
+
128
+ self.time_embed = TimestepEmbedding(dim)
129
+ if text_dim is None:
130
+ text_dim = mel_dim
131
+ self.text_embed = TextEmbedding(
132
+ text_num_embeds, text_dim, mask_padding=text_mask_padding, conv_layers=conv_layers
133
+ )
134
+ self.text_cond, self.text_uncond = None, None # text cache
135
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
136
+
137
+ self.rotary_embed = RotaryEmbedding(dim_head)
138
+
139
+ # transformer layers & skip connections
140
+
141
+ self.dim = dim
142
+ self.skip_connect_type = skip_connect_type
143
+ needs_skip_proj = skip_connect_type == "concat"
144
+
145
+ self.depth = depth
146
+ self.layers = nn.ModuleList([])
147
+
148
+ for idx in range(depth):
149
+ is_later_half = idx >= (depth // 2)
150
+
151
+ attn_norm = RMSNorm(dim)
152
+ attn = Attention(
153
+ processor=AttnProcessor(pe_attn_head=pe_attn_head),
154
+ dim=dim,
155
+ heads=heads,
156
+ dim_head=dim_head,
157
+ dropout=dropout,
158
+ qk_norm=qk_norm,
159
+ )
160
+
161
+ ff_norm = RMSNorm(dim)
162
+ ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
163
+
164
+ skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None
165
+
166
+ self.layers.append(
167
+ nn.ModuleList(
168
+ [
169
+ skip_proj,
170
+ attn_norm,
171
+ attn,
172
+ ff_norm,
173
+ ff,
174
+ ]
175
+ )
176
+ )
177
+
178
+ self.norm_out = RMSNorm(dim)
179
+ self.proj_out = nn.Linear(dim, mel_dim)
180
+
181
+ def clear_cache(self):
182
+ self.text_cond, self.text_uncond = None, None
183
+
184
+ def forward(
185
+ self,
186
+ x: float["b n d"], # noised input audio # noqa: F722
187
+ cond: float["b n d"], # masked cond audio # noqa: F722
188
+ text: int["b nt"], # text # noqa: F722
189
+ time: float["b"] | float[""], # time step # noqa: F821 F722
190
+ drop_audio_cond, # cfg for cond audio
191
+ drop_text, # cfg for text
192
+ mask: bool["b n"] | None = None, # noqa: F722
193
+ cache=False,
194
+ ):
195
+ batch, seq_len = x.shape[0], x.shape[1]
196
+ if time.ndim == 0:
197
+ time = time.repeat(batch)
198
+
199
+ # t: conditioning time; the text embedding and masked cond audio are fused into the noised input x below
200
+ t = self.time_embed(time)
201
+ if cache:
202
+ if drop_text:
203
+ if self.text_uncond is None:
204
+ self.text_uncond = self.text_embed(text, seq_len, drop_text=True)
205
+ text_embed = self.text_uncond
206
+ else:
207
+ if self.text_cond is None:
208
+ self.text_cond = self.text_embed(text, seq_len, drop_text=False)
209
+ text_embed = self.text_cond
210
+ else:
211
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
212
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
213
+
214
+ # prepend time t to input x, [b n d] -> [b n+1 d]
215
+ x = torch.cat([t.unsqueeze(1), x], dim=1) # pack t to x
216
+ if mask is not None:
217
+ mask = F.pad(mask, (1, 0), value=1)
218
+
219
+ rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)
220
+
221
+ # flat unet transformer
222
+ skip_connect_type = self.skip_connect_type
223
+ skips = []
224
+ for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
225
+ layer = idx + 1
226
+
227
+ # skip connection logic
228
+ is_first_half = layer <= (self.depth // 2)
229
+ is_later_half = not is_first_half
230
+
231
+ if is_first_half:
232
+ skips.append(x)
233
+
234
+ if is_later_half:
235
+ skip = skips.pop()
236
+ if skip_connect_type == "concat":
237
+ x = torch.cat((x, skip), dim=-1)
238
+ x = maybe_skip_proj(x)
239
+ elif skip_connect_type == "add":
240
+ x = x + skip
241
+
242
+ # attention and feedforward blocks
243
+ x = attn(attn_norm(x), rope=rope, mask=mask) + x
244
+ x = ff(ff_norm(x)) + x
245
+
246
+ assert len(skips) == 0
247
+
248
+ x = self.norm_out(x)[:, 1:, :] # unpack t from x
249
+
250
+ return self.proj_out(x)
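A hedged instantiation sketch of the class above; the hyperparameters are illustrative only, not taken from any config in this repo:

```python
import torch
from lemas_tts.model.backbones.unett import UNetT

model = UNetT(dim=512, depth=8, heads=8, dim_head=64, mel_dim=100,
              text_num_embeds=256, conv_layers=4, skip_connect_type="concat")
x = torch.randn(2, 300, 100)            # noised mel (b, n, d)
cond = torch.zeros_like(x)              # masked cond audio
text = torch.randint(0, 256, (2, 50))   # token ids (batch padding uses -1 upstream)
t = torch.rand(2)                       # flow time step per sample
out = model(x, cond, text, t, drop_audio_cond=False, drop_text=False)
print(out.shape)                        # torch.Size([2, 300, 100])
```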
lemas_tts/model/cfm.py ADDED
@@ -0,0 +1,899 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from random import random
13
+ import random as _random
14
+ from typing import Callable, Dict, OrderedDict
15
+ import math
16
+ from pathlib import Path
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+ import torchaudio
21
+ from torch import nn
22
+ from torch.nn.utils.rnn import pad_sequence
23
+ from torchdiffeq import odeint
24
+
25
+ from lemas_tts.model.modules import MelSpec
26
+ from lemas_tts.model.modules import MIEsitmator, AccentClassifier, grad_reverse
27
+ from lemas_tts.model.backbones.ecapa_tdnn import ECAPA_TDNN
28
+ from lemas_tts.model.backbones.prosody_encoder import ProsodyEncoder, extract_fbank_16k
29
+ from lemas_tts.model.utils import (
30
+ default,
31
+ exists,
32
+ lens_to_mask,
33
+ list_str_to_idx,
34
+ list_str_to_tensor,
35
+ mask_from_frac_lengths,
36
+ )
37
+
38
+
39
+ def clip_and_shuffle(mel, mel_len, sample_rate=24000, hop_length=256, ratio=None):
40
+ """
41
+ Randomly clip a mel-spectrogram segment and shuffle 1-second chunks to
42
+ create an accent-invariant conditioning segment.
43
+
44
+ This is an inference-time utility used by the accent GRL path.
45
+
46
+ Args:
47
+ mel: [n_mels, T]
48
+ mel_len: int, original mel length (T)
49
+ """
50
+ frames_per_second = int(sample_rate / hop_length) # ~94 frames/s at 24 kHz, hop 256 (93 after int truncation)
51
+
52
+ # ---- 1. Randomly crop 25%~75% of the original length (or ratio * length) ----
53
+ total_len = mel_len
54
+ if not ratio:
55
+ seg_len = _random.randint(int(0.25 * total_len), int(0.75 * total_len))
56
+ else:
57
+ seg_len = int(total_len * ratio)
58
+ start = _random.randint(0, max(0, total_len - seg_len))
59
+ mel_seg = mel[:, start : start + seg_len]
60
+
61
+ # ---- 2. Split into ~1-second chunks ----
62
+ n_chunks = (mel_seg.size(1) + frames_per_second - 1) // frames_per_second
63
+ chunks = []
64
+ for i in range(n_chunks):
65
+ chunk = mel_seg[:, i * frames_per_second : (i + 1) * frames_per_second]
66
+ chunks.append(chunk)
67
+
68
+ # ---- 3. Shuffle chunk order ----
69
+ _random.shuffle(chunks)
70
+ shuffled_mel = torch.cat(chunks, dim=1)
71
+
72
+ # ---- 4. Repeat random chunks until reaching original length ----
73
+ if shuffled_mel.size(1) < total_len:
74
+ repeat_chunks = []
75
+ while sum(c.size(1) for c in repeat_chunks) < total_len:
76
+ repeat_chunks.append(_random.choice(chunks))
77
+ shuffled_mel = torch.cat([shuffled_mel] + repeat_chunks, dim=1)
78
+
79
+ # ---- 5. Trim to exactly mel_len ----
80
+ shuffled_mel = shuffled_mel[:, :total_len]
81
+ assert shuffled_mel.shape == mel.shape, f"shuffled_mel.shape != mel.shape: {shuffled_mel.shape} != {mel.shape}"
82
+
83
+ return shuffled_mel
84
+
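A quick shape-level sketch of the helper above (random values, placeholder sizes):

```python
import torch

mel = torch.randn(100, 940)                     # (n_mels, T), ~10 s at 24 kHz with hop 256
shuffled = clip_and_shuffle(mel, mel.shape[1])  # random crop, 1 s chunk shuffle, pad back
assert shuffled.shape == mel.shape              # length restored by repetition + trim
```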
85
+ class CFM(nn.Module):
86
+ def __init__(
87
+ self,
88
+ transformer: nn.Module,
89
+ sigma=0.0,
90
+ odeint_kwargs: dict = dict(
91
+ # atol = 1e-5,
92
+ # rtol = 1e-5,
93
+ method="euler" # 'midpoint'
94
+ ),
95
+ audio_drop_prob=0.3,
96
+ text_drop_prob=0.1,
97
+ num_channels=None,
98
+ mel_spec_module: nn.Module | None = None,
99
+ mel_spec_kwargs: dict = dict(),
100
+ frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
101
+ vocab_char_map: dict[str, int] | None = None,
102
+ use_ctc_loss: bool = False,
103
+ use_spk_enc: bool = False,
104
+ use_prosody_encoder: bool = False,
105
+ prosody_cfg_path: str | None = None,
106
+ prosody_ckpt_path: str | None = None,
107
+ ):
108
+ super().__init__()
109
+
110
+ self.frac_lengths_mask = frac_lengths_mask
111
+
112
+ # mel spec
113
+ self.mel_spec = default(mel_spec_module, MelSpec(**mel_spec_kwargs))
114
+ num_channels = default(num_channels, self.mel_spec.n_mel_channels)
115
+ self.num_channels = num_channels
116
+
117
+ # classifier-free guidance
118
+ self.audio_drop_prob = audio_drop_prob
119
+ self.text_drop_prob = text_drop_prob
120
+
121
+ # transformer
122
+ self.transformer = transformer
123
+ dim = transformer.dim
124
+ self.dim = dim
125
+
126
+ # conditional flow related
127
+ self.sigma = sigma
128
+
129
+ # sampling related
130
+ self.odeint_kwargs = odeint_kwargs
131
+
132
+ # vocab map for tokenization
133
+ self.vocab_char_map = vocab_char_map
134
+
135
+ # Prosody encoder (Pretssel ECAPA-TDNN)
136
+ self.use_prosody_encoder = (
137
+ use_prosody_encoder and prosody_cfg_path is not None and prosody_ckpt_path is not None
138
+ )
139
+ if self.use_prosody_encoder:
140
+ cfg_path = Path(prosody_cfg_path)
141
+ ckpt_path = Path(prosody_ckpt_path)
142
+ self.prosody_encoder = ProsodyEncoder(cfg_path, ckpt_path, freeze=True)
143
+ # 512-d prosody -> mel channel dimension
144
+ self.prosody_to_mel = nn.Linear(512, self.num_channels)
145
+ self.prosody_dropout = nn.Dropout(p=0.2)
146
+ else:
147
+ self.prosody_encoder = None
148
+
149
+ # Speaker encoder
150
+ self.use_spk_enc = use_spk_enc
151
+ if use_spk_enc:
152
+ self.speaker_encoder = ECAPA_TDNN(
153
+ self.num_channels,
154
+ self.dim,
155
+ channels=[512, 512, 512, 512, 1536],
156
+ kernel_sizes=[5, 3, 3, 3, 1],
157
+ dilations=[1, 2, 3, 4, 1],
158
+ attention_channels=128,
159
+ res2net_scale=4,
160
+ se_channels=128,
161
+ global_context=True,
162
+ batch_norm=True,
163
+ )
164
+ # self.load_partial_weights(self.speaker_encoder, "/cto_labs/vistring/zhaozhiyuan/outputs/F5-TTS/pretrain/speaker.bin", device="cpu")
165
+
166
+ self.use_ctc_loss = use_ctc_loss
167
+ if use_ctc_loss:
168
+ # print("vocab_char_map:", len(vocab_char_map)+1, "dim:", dim, "mel_spec_kwargs:",mel_spec_kwargs)
169
+ self.ctc = MIEsitmator(len(self.vocab_char_map), self.num_channels, self.dim, dropout=self.text_drop_prob)
170
+
171
+ self.accent_classifier = AccentClassifier(input_dim=self.num_channels, hidden_dim=self.dim, num_accents=12)
172
+ self.accent_criterion = nn.CrossEntropyLoss()
173
+
174
+ def load_partial_weights(self, model: nn.Module,
175
+ ckpt_path: str,
176
+ device="cpu",
177
+ verbose=True) -> int:
178
+ """
179
+ Load only parameters whose shapes match the model; skip the rest.
180
+ Returns the number of successfully loaded parameters.
181
+ """
182
+ state_dict = torch.load(ckpt_path, map_location=device)
183
+ model_dict = model.state_dict()
184
+
185
+ ok_count = 0
186
+ new_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
187
+
188
+ for k, v in state_dict.items():
189
+ if k in model_dict and v.shape == model_dict[k].shape:
190
+ new_dict[k] = v
191
+ ok_count += 1
192
+ else:
193
+ if verbose:
194
+ print(f"[SKIP] {k} ckpt:{v.shape} model:{model_dict[k].shape if k in model_dict else 'N/A'}")
195
+
196
+ model_dict.update(new_dict)
197
+ model.load_state_dict(model_dict)
198
+ if verbose:
199
+ print(f"=> Loaded {ok_count}/{len(state_dict)} parameters successfully")
200
+ return ok_count
201
+
202
+ @property
203
+ def device(self):
204
+ return next(self.parameters()).device
205
+
206
+ @torch.no_grad()
207
+ def sample(
208
+ self,
209
+ cond: float["b n d"] | float["b nw"], # noqa: F722
210
+ text: int["b nt"] | list[str], # noqa: F722
211
+ duration: int | int["b"], # noqa: F821
212
+ *,
213
+ lens: int["b"] | None = None, # noqa: F821
214
+ steps=32,
215
+ cfg_strength=1.0,
216
+ sway_sampling_coef=None,
217
+ seed: int | None = None,
218
+ max_duration=4096,
219
+ vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None, # noqa: F722
220
+ no_ref_audio=False,
221
+ duplicate_test=False,
222
+ t_inter=0.1,
223
+ edit_mask=None,
224
+ use_acc_grl = True,
225
+ use_prosody_encoder = True,
226
+ ref_ratio = 1,
227
+ ):
228
+ self.eval()
229
+
230
+ # raw wave -> mel, keep a copy for prosody encoder if available
231
+ raw_audio = None
232
+ if cond.ndim == 2:
233
+ raw_audio = cond.clone() # (B, nw)
234
+ cond = self.mel_spec(cond)
235
+ cond = cond.permute(0, 2, 1)
236
+ assert cond.shape[-1] == self.num_channels
237
+
238
+ cond = cond.to(next(self.parameters()).dtype)
239
+ cond_mean = cond.mean(dim=1, keepdim=True)
240
+ batch, cond_seq_len, device = *cond.shape[:2], cond.device
241
+ if not exists(lens):
242
+ lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)
243
+
244
+ # optional global prosody conditioning at inference (one embedding per sample)
245
+ prosody_mel_cond = None
246
+ prosody_text_cond = None
247
+ prosody_embeds = None
248
+ if self.prosody_encoder is not None and raw_audio is not None and use_prosody_encoder:
249
+ embeds = []
250
+ for b in range(batch):
251
+ audio_b = raw_audio[b].unsqueeze(0) # (1, nw)
252
+ src_sr = self.mel_spec.target_sample_rate
253
+ if src_sr != 16_000:
254
+ audio_16k = torchaudio.functional.resample(
255
+ audio_b, src_sr, 16_000
256
+ ).squeeze(0)
257
+ else:
258
+ audio_16k = audio_b.squeeze(0)
259
+ fbank = extract_fbank_16k(audio_16k)
260
+ fbank = fbank.unsqueeze(0).to(device=device, dtype=cond.dtype)
261
+ emb = self.prosody_encoder(fbank, padding_mask=None)[0] # (512,)
262
+ embeds.append(emb)
263
+ prosody_embeds = torch.stack(embeds, dim=0) # (B, 512)
264
+ # broadcast along mel and text
265
+ prosody_mel_cond = prosody_embeds[:, None, :].expand(-1, cond_seq_len, -1)
266
+
267
+ if use_acc_grl:
268
+ # rand_mel = clip_and_shuffle(cond.permute(0, 2, 1).squeeze(0), cond.shape[1])
269
+ # rand_mel = rand_mel.unsqueeze(0).permute(0, 2, 1)
270
+ # assert rand_mel.shape == cond.shape, f"Shape diff: rand_mel.shape: {rand_mel.shape}, cond.shape: {cond.shape}"
271
+ # cond_grl = grad_reverse(rand_mel, lambda_=1.0)
272
+
273
+ if ref_ratio < 1:
274
+ rand_mel = clip_and_shuffle(cond.permute(0, 2, 1).squeeze(0), cond.shape[1], ratio=ref_ratio)
275
+ rand_mel = rand_mel.unsqueeze(0).permute(0, 2, 1)
276
+ assert rand_mel.shape == cond.shape, f"Shape diff: rand_mel.shape: {rand_mel.shape}, cond.shape: {cond.shape}"
277
+ cond_grl = grad_reverse(rand_mel, lambda_=1.0)
278
+ else:
279
+ cond_grl = grad_reverse(cond, lambda_=1.0)
280
+ # print("cond:", cond.shape, cond.mean(), cond.max(), cond.min(), "rand_mel:", rand_mel.mean(), rand_mel.max(), rand_mel.min(), "cond_grl:", cond_grl.mean(), cond_grl.max(), cond_grl.min())
281
+
282
+ # text
283
+
284
+ if isinstance(text, list):
285
+ if exists(self.vocab_char_map):
286
+ text = list_str_to_idx(text, self.vocab_char_map).to(device)
287
+ else:
288
+ text = list_str_to_tensor(text).to(device)
289
+ assert text.shape[0] == batch
290
+
291
+ # duration
292
+
293
+ cond_mask = lens_to_mask(lens)
294
+ if edit_mask is not None:
295
+ cond_mask = cond_mask & edit_mask
296
+
297
+ if isinstance(duration, int):
298
+ duration = torch.full((batch,), duration, device=device, dtype=torch.long)
299
+
300
+ duration = torch.maximum(
301
+ torch.maximum((text != -1).sum(dim=-1), lens) + 1, duration
302
+ ) # duration at least text/audio prompt length plus one token, so something is generated
303
+ # clamp and convert max_duration to python int for padding ops
304
+ duration = duration.clamp(max=max_duration)
305
+ max_duration = int(duration.amax().item())
306
+
307
+ # duplicate test corner for inner time step observation
308
+ if duplicate_test:
309
+ test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)
310
+
311
+ cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
312
+
313
+ if prosody_mel_cond is not None:
314
+ prosody_mel_cond = F.pad(
315
+ prosody_mel_cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0
316
+ )
317
+ prosody_mel_proj = self.prosody_to_mel(prosody_mel_cond)
318
+ cond = cond + prosody_mel_proj
319
+
320
+ if no_ref_audio:
321
+ random_cond = torch.randn_like(cond) * 0.1 + cond_mean
322
+ random_cond = random_cond / random_cond.mean(dim=1, keepdim=True) * cond_mean
323
+ print("cond:", cond.mean(), cond.max(), cond.min(), "random_cond:", random_cond.mean(), random_cond.max(), random_cond.min(), "mean_cond:", cond_mean.shape)
324
+ cond = random_cond
325
+
326
+ cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
327
+ cond_mask = cond_mask.unsqueeze(-1)
328
+
329
+ if use_acc_grl:
330
+ cond_grl = F.pad(cond_grl, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
331
+
332
+
333
+ step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond)) # allow direct control (cut cond audio) with lens passed in
334
+
335
+
336
+ if batch > 1:
337
+ mask = lens_to_mask(duration)
338
+ else: # save memory and speed up, as single inference need no mask currently
339
+ mask = None
340
+
341
+ # neural ode
342
+
343
+ def compute_sway_max(steps: int,
344
+ t_start: float = 0.0,
345
+ dtype=torch.float32,
346
+ min_ratio: float | None = None,
347
+ safety_factor: float = 0.5) -> float:
348
+ """
349
+ Compute a safe upper bound for sway_sampling_coef given steps and t_start.
350
+
351
+ - steps: number of ODE steps
352
+ - t_start: start time in [0,1)
353
+ - dtype: torch dtype (for machine eps)
354
+ - min_ratio: smallest distinguishable dt^p (if None, use conservative default)
355
+ - safety_factor: scale down the theoretical maximum to be safe
356
+ """
357
+ assert 0.0 <= t_start < 1.0
358
+ dt = (1.0 - t_start) / max(1, steps)
359
+ eps = torch.finfo(dtype).eps
360
+
361
+ if min_ratio is None:
362
+ # conservative default: ~100 * eps (float32 -> ~1e-5)
363
+ min_ratio = max(1e-9, 1e2 * float(eps))
364
+
365
+ if dt >= 0.9:
366
+ p_max = 11.0  # dt near 1 would blow up the formula below, so cap the exponent
367
+ else:
368
+ # solve dt^p >= min_ratio => p <= log(min_ratio)/log(dt)
369
+ p_max = math.log(min_ratio) / math.log(dt)
370
+
371
+ sway_max = max(0.0, p_max - 1.0)
372
+ sway_max = sway_max * float(safety_factor)
373
+ return torch.tensor(sway_max, device=device, dtype=dtype)
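+ # Worked example (float32): steps=32, t_start=0 gives dt = 1/32; with
+ # min_ratio=1e-9, p_max = ln(1e-9) / ln(1/32) ≈ 5.98, so the call below
+ # (safety_factor=0.7) yields sway_max ≈ 0.7 * (5.98 - 1) ≈ 3.49.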
374
+
375
+ # prepare text-side prosody conditioning if embeddings available
376
+ if prosody_embeds is not None:
377
+ text_len = text.shape[1]
378
+ prosody_text_cond = prosody_embeds[:, None, :].expand(-1, text_len, -1)
379
+ else:
380
+ prosody_text_cond = None
381
+
382
+ def fn(t, x):
383
+ # at each step, conditioning is fixed
384
+ # if use_spk_enc:
385
+ # mix_cond = t * cond + (1-t) * spk_emb
386
+ # step_cond = torch.where(cond_mask, mix_cond, torch.zeros_like(mix_cond))
387
+ if use_acc_grl:
388
+ step_cond = torch.where(cond_mask, cond_grl, torch.zeros_like(cond_grl))
389
+ else:
390
+ step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))
391
+
392
+ # predict flow
393
+ pred = self.transformer(
394
+ x=x,
395
+ cond=step_cond,
396
+ text=text,
397
+ time=t,
398
+ mask=mask,
399
+ drop_audio_cond=False,
400
+ drop_text=False,
401
+ cache=True,
402
+ prosody_text=prosody_text_cond,
403
+ )
404
+ if cfg_strength < 1e-5:
405
+ return pred
406
+
407
+ null_pred = self.transformer(
408
+ x=x,
409
+ cond=step_cond,
410
+ text=text,
411
+ time=t,
412
+ mask=mask,
413
+ drop_audio_cond=True,
414
+ drop_text=True,
415
+ cache=True,
416
+ prosody_text=prosody_text_cond,
417
+ )
418
+ # cfg_t = cfg_strength * torch.cos(0.5 * torch.pi * t)
419
+ # cfg_t = cfg_strength * (1 - t)
420
+ cfg_t = cfg_strength * ((1 - t) ** 2)
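+ # e.g. with cfg_strength=1.0 the guidance weight decays quadratically from
+ # 1.0 at t=0 to 0 at t=1: strong guidance early, pure conditional flow late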
421
+ # print("t:", t, "cfg_t:", cfg_t)
422
+ res = pred + (pred - null_pred) * cfg_t
423
+ # print("t:", t.item(), "\tres:", res.shape, res.mean().item(), res.max().item(), res.min().item(), "\tpred:", pred.mean().item(), pred.max().item(), pred.min().item(), "\tnull_pred:", null_pred.mean().item(), null_pred.max().item(), null_pred.min().item(), "\tcfg_t:", cfg_t.item())
424
+ res = res.clamp(-20, 20)
425
+ return res
426
+
427
+ # noise input
428
+ # to make sure batch inference result is same with different batch size, and for sure single inference
429
+ # still some difference maybe due to convolutional layers
430
+ y0 = []
431
+ for dur in duration:
432
+ if exists(seed):
433
+ torch.manual_seed(seed)
434
+ y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
435
+ y0 = pad_sequence(y0, padding_value=0, batch_first=True)
436
+
437
+ t_start = 0
438
+
439
+ # duplicate test corner for inner time step observation
440
+ if duplicate_test:
441
+ t_start = t_inter
442
+ y0 = (1 - t_start) * y0 + t_start * test_cond
443
+ steps = int(steps * (1 - t_start))
444
+
445
+ t = torch.linspace(t_start, 1, int(steps + 1), device=self.device, dtype=step_cond.dtype)
446
+
447
+ sway_max = compute_sway_max(steps, t_start=t_start, dtype=step_cond.dtype, min_ratio=1e-9, safety_factor=0.7)
448
+ if sway_sampling_coef is not None:
449
+ sway_sampling_coef = min(sway_max, sway_sampling_coef)
450
+ # t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
451
+ t = t ** (1 + sway_sampling_coef)
452
+ else:
453
+ t = t ** (1 + sway_max)
454
+ # print("t:",t, "sway_max:", sway_max, "sway_sampling_coef:", sway_sampling_coef)
455
+
456
+ trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
457
+ self.transformer.clear_cache()
458
+
459
+ sampled = trajectory[-1]
460
+ out = sampled
461
+ out = torch.where(cond_mask, cond, out)
462
+
463
+ # compute the mean of the generated (zero-padded) part of out separately, then align it with cond's mean (scale so the two means roughly match)
464
+ if no_ref_audio:
465
+ out_mean = out[:,cond_seq_len:,:].mean(dim=1, keepdim=True)
466
+ out[:,cond_seq_len:,:] = out[:,cond_seq_len:,:] - (out_mean - cond_mean)
467
+ # print("out_mean:", out_mean.shape, out_mean.mean(), "cond_mean:", cond_mean.shape, cond_mean.mean(), "out:", out[:,cond_seq_len:,:].shape, out[:,cond_seq_len:,:].mean().item(), out[:,cond_seq_len:,:].max().item(), out[:,cond_seq_len:,:].min().item())
468
+
469
+ if exists(vocoder):
470
+ out = out.permute(0, 2, 1)
471
+ out = vocoder(out)
472
+ # print("out:", out.shape, "trajectory:", trajectory.shape)
473
+ return out, trajectory
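+ # Invocation sketch (instance/vocoder names and lengths are illustrative;
+ # the keyword arguments are the ones defined by sample() above):
+ # >>> wav, _ = cfm.sample(
+ # ...     cond=ref_wave_24k,                # (1, nw) raw wave or (1, n, 100) mel
+ # ...     text=["ref transcript. target text."],
+ # ...     duration=ref_frames + gen_frames, # total length in mel frames
+ # ...     steps=32, cfg_strength=1.0, vocoder=vocoder_fn,
+ # ... )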
474
+
475
+
476
+ def info_nce_speaker(self,
477
+ e_gt: torch.Tensor,
478
+ e_pred: torch.Tensor,
479
+ temperature: float = 0.1):
480
+ """
481
+ InfoNCE loss for speaker encoder training.
482
+ e_gt and e_pred from the same sample form a positive pair; all other in-batch pairs are negatives.
483
+
484
+ Args:
485
+ temperature: temperature scaling τ
486
+
487
+ Returns:
488
+ loss: scalar tensor (supports backward)
489
+ """
490
+ B = e_gt.size(0)
491
+ # 1. L2-normalize the embeddings
492
+ e_gt = F.normalize(e_gt, dim=1)
493
+ e_pred = F.normalize(e_pred, dim=1)
494
+
495
+ # 2. B×B similarity matrix (pred vs gt)
496
+ logits = torch.einsum('bd,cd->bc', e_pred, e_gt) / temperature # [B, B]
497
+
498
+ # 3. positive labels lie exactly on the diagonal
499
+ labels = torch.arange(B, device=logits.device)
500
+
501
+ # 4. InfoNCE = cross-entropy over in-batch negatives
502
+ loss = F.cross_entropy(logits, labels)
503
+ return loss
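+ # Sanity-check sketch (names illustrative): with matched embeddings the
+ # diagonal dominates and the loss is small; for unrelated embeddings it
+ # approaches ln(B), e.g. ln(8) ≈ 2.08 for a batch of 8.
+ # >>> e = F.normalize(torch.randn(8, 192), dim=1)
+ # >>> model.info_nce_speaker(e, e)                    # near zero
+ # >>> model.info_nce_speaker(e, torch.randn(8, 192))  # ≈ 2.08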
504
+
505
+
506
+ def forward_old(
507
+ self,
508
+ batchs: Dict[str, torch.Tensor],
509
+ # inp: float["b n d"] | float["b nw"], # mel or raw wave # noqa: F722
510
+ # text: int["b nt"] | list[str], # noqa: F722
511
+ *,
512
+ # lens: int["b"] | None = None, # noqa: F821
513
+ noise_scheduler: str | None = None,
514
+ ):
515
+
516
+ inp = batchs["mel"].permute(0, 2, 1)
517
+ lens = batchs["mel_lengths"]
518
+
519
+ rand_mel = batchs["rand_mel"].permute(0, 2, 1)
520
+
521
+ text = batchs["text"]
522
+ target_text_lengths = torch.tensor([len(x) for x in text], device=inp.device)
523
+
524
+ langs = batchs["langs"]
525
+
526
+ # print("inp:", inp.shape, "rand_mel:", rand_mel.shape, "lens:", lens, "target_text_lengths:", target_text_lengths, "langs:", langs)
527
+
528
+ # handle raw wave
529
+ if inp.ndim == 2:
530
+ inp = self.mel_spec(inp)
531
+ inp = inp.permute(0, 2, 1)
532
+ assert inp.shape[-1] == self.num_channels
533
+
534
+ batch, seq_len, dtype, device, _σ1 = *inp.shape[:2], inp.dtype, self.device, self.sigma
535
+ # print("inp_shape:", inp.shape, inp.max(), inp.min(), "dtype:", dtype, "device:", device, "σ1:", _σ1)
536
+
537
+ # handle text as string
538
+ if isinstance(text, list):
539
+ if exists(self.vocab_char_map):
540
+ text = list_str_to_idx(text, self.vocab_char_map).to(device)
541
+ else:
542
+ text = list_str_to_tensor(text).to(device)
543
+ assert text.shape[0] == batch
544
+
545
+ # lens and mask
546
+ if not exists(lens):
547
+ lens = torch.full((batch,), seq_len, device=device)
548
+
549
+ mask = lens_to_mask(lens, length=seq_len) # useless here, as collate_fn will pad to max length in batch
550
+
551
+ # get a random span to mask out for training conditionally
552
+ frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
553
+ rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)
554
+
555
+ if exists(mask):
556
+ rand_span_mask &= mask
557
+
558
+ # mel is x1
559
+ x1 = inp
560
+
561
+ # x0 is gaussian noise
562
+ x0 = torch.randn_like(x1)
563
+
564
+ # time step
565
+ time = torch.rand((batch,), dtype=dtype, device=self.device)
566
+ # TODO. noise_scheduler
567
+
568
+ # sample xt (φ_t(x) in the paper)
569
+ t = time.unsqueeze(-1).unsqueeze(-1)
570
+ φ = (1 - t) * x0 + t * x1
571
+ flow = x1 - x0
572
+
573
+ # cond = torch.where(rand_span_mask[..., None], torch.zeros_like(rand_mel), rand_mel)
574
+ cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)
575
+
576
+ # print("seq_len:", seq_len, "lens:", lens)
577
+ if self.use_spk_enc: # use spk_emb with 50% probability
578
+
579
+ spk_emb = self.speaker_encoder(rand_mel, lens)
580
+ # global_emb: [batch, 1, dim] -> expand to [batch, seq_len, dim]
581
+ spk_emb = spk_emb.unsqueeze(1).expand_as(x1)
582
+ # print("spk_emb_shape:", spk_emb.shape)
583
+ # apply the mask
584
+ cond = torch.where(rand_span_mask[..., None], torch.zeros_like(spk_emb), spk_emb)
585
+ # assert cond.shape[0] == batch, "speaker encoder batch size mismatch"
586
+ # print("x1.shape:", x1.shape, "cond_shape:", cond.shape)
587
+
588
+ # draw a random scalar r and blend: cond * r + spk_emb * (1 - r)
589
+ rand_num = torch.rand((batch, 1, 1), dtype=dtype, device=self.device)
590
+ cond = cond * rand_num + spk_emb * (1 - rand_num)
591
+
592
+ cond_grl = grad_reverse(cond, lambda_=1.0)
593
+
594
+ # print("inp_shape:", inp.shape, "rand_span_mask:", rand_span_mask.shape)
595
+
596
+ # # # transformer and cfg training with a drop rate
597
+ # drop_audio_cond = random() < self.audio_drop_prob # p_drop in voicebox paper
598
+ # drop_text_cond = random() < self.text_drop_prob # p_drop in voicebox paper
599
+ drop_audio_cond = random() < self.audio_drop_prob # p_drop in voicebox paper
600
+ if random() < self.text_drop_prob: # p_uncond in voicebox paper
601
+ drop_audio_cond = True
602
+ drop_text_cond = True
603
+ else:
604
+ drop_text_cond = False
605
+
606
+ # print("drop_audio_cond:", drop_audio_cond, "drop_text_cond:", drop_text_cond)
607
+ # if want rigorously mask out padding, record in collate_fn in dataset.py, and pass in here
608
+ # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
609
+ pred = self.transformer(x=φ, cond=cond_grl, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text_cond)
610
+
611
+ # flow matching loss
612
+ pred_clamp = pred.float().clamp(-20, 20)
613
+ loss = F.mse_loss(pred_clamp, flow, reduction="none")
614
+ loss = loss[rand_span_mask] # [N]
615
+
616
+ # # # 1. global truncation: > 2 or NaN -> 0 (global)
617
+ # print("mse loss shape:", loss.shape, "loss max:", loss.max(), "loss min:", loss.min(), target_text_lengths[0])
618
+
619
+ # # 2. percentage of non-NaN values
620
+ # valid_mask = ~torch.isnan(loss)
621
+ # total_count = loss.numel() # total number of elements (all dims)
622
+ # valid_count = valid_mask.sum().item() # number of non-NaN elements
623
+ # valid_percentage = (valid_count / total_count) * 100
624
+ # print(f"mse loss: total_count: {total_count}", f"valid_count: {valid_count}", f"valid_percentage: {valid_percentage:.2f}%")
625
+
626
+ # valid_loss = loss[~torch.isnan(loss)]
627
+ loss = torch.where(torch.isnan(loss) | (loss > 300.0), 300.0, loss)
628
+ loss = loss.mean()
629
+
630
+ # loss = torch.tanh(torch.log1p(loss.mean())) # log scaling
631
+ # if len(valid_loss) > 0:
632
+ # clipped_loss = torch.clamp(valid_loss, max=150)
633
+ # loss = torch.tanh(torch.log1p(clipped_loss.mean())) # log scaling
634
+ # else:
635
+ # loss = torch.tensor(0.0, device=pred.device)
636
+
637
+
638
+ accent_logits = self.accent_classifier(cond_grl)
639
+ accent_logits_mean = accent_logits.mean(dim=1)
640
+ lang_labels = langs.to(accent_logits.device).long()
641
+ # print("langs:", lang_labels, "accent_logits:", accent_logits.shape, "accent_logits_mean:", accent_logits_mean.shape)
642
+ accent_loss = self.accent_criterion(accent_logits_mean, lang_labels)
643
+ # guard against NaN / Inf in accent_loss
644
+ if not torch.isfinite(accent_loss):
645
+ accent_loss = torch.zeros_like(accent_loss, device=accent_loss.device)
646
+ # accent_loss = torch.zeros_like(loss, device=loss.device, requires_grad=True)
647
+ loss += 0.1 * accent_loss
648
+
649
+ valid_indices = torch.where(time > 0.5)[0]
650
+ # print("torch.where(time > 0.5):", valid_indices, torch.where(time > 0.5))
651
+ if valid_indices.size(0) > 2:
652
+ # dynamically select samples that meet the condition
653
+ selected_gt = inp[valid_indices]
654
+ selected_pred = pred[valid_indices]
655
+ selected_text = text[valid_indices]
656
+ selected_lens = lens[valid_indices]
657
+ selected_target_lengths = target_text_lengths[valid_indices]
658
+ # print("pred:", selected_pred.shape, "valid_indices:", valid_indices, "lens:", selected_lens, "target_lengths:", selected_target_lengths)
659
+
660
+ if self.use_spk_enc and valid_indices.size(0) > 2:
661
+ # speaker encoder loss
662
+ e_gt = self.speaker_encoder(selected_gt, selected_lens)
663
+ e_pred = self.speaker_encoder(selected_pred, selected_lens)
664
+ spk_loss = self.info_nce_speaker(e_gt, e_pred)
665
+ if not torch.isnan(spk_loss).any(): # and spk_loss.item() > 1e-6:
666
+ loss = loss + spk_loss * 10.0
667
+ else:
668
+ spk_loss = torch.zeros_like(loss, device=loss.device, requires_grad=False)
669
+ else:
670
+ spk_loss = torch.zeros_like(loss, device=loss.device, requires_grad=False)
671
+ # print("spk_loss:", spk_loss)
672
+
673
+ # ctc loss
674
+ if self.use_ctc_loss and valid_indices.size(0) > 2:
675
+ # compute the CTC loss only when t > 0.5
676
+ ctc_loss = self.ctc(
677
+ decoder_outputs=selected_pred,
678
+ target_phones=selected_text,
679
+ decoder_lengths=selected_lens,
680
+ target_lengths=selected_target_lengths,
681
+ )
682
+ # print("loss:", loss, "ctc_loss:", ctc_loss, "time: ", time.shape, time[valid_indices].mean())
683
+ # only add the CTC loss if it contains no NaN
684
+ if not torch.isnan(ctc_loss).any() and ctc_loss.item() > 1e-6:
685
+ # ctc_scaled = torch.tanh(torch.log1p(ctc_loss))
686
+ ctc_scaled = ctc_loss
687
+ loss = loss + 0.1 * ctc_scaled
688
+ else:
689
+ ctc_scaled = torch.zeros_like(loss, device=loss.device, requires_grad=False)
690
+ # print("loss:", loss, "ctc_scaled:", ctc_scaled)
691
+ else:
692
+ ctc_scaled = torch.zeros_like(loss, device=loss.device, requires_grad=False)
693
+
694
+
695
+ # before finalizing the total loss
696
+ total_loss = loss # base flow loss + others you added
697
+ # note: we intentionally do NOT add 0.0 * pred.sum() etc. here, to avoid
698
+ # propagating NaNs from intermediate tensors into the loss scalar.
699
+
700
+ return total_loss, ctc_scaled, accent_loss, len(valid_indices), cond, pred
701
+
702
+
703
+ def forward(self, batchs: Dict[str, torch.Tensor], *, noise_scheduler: str | None = None):
704
+ """
705
+ Simplified forward version for accent-invariant flow matching.
706
+ Removes speaker encoder and CTC parts, keeps accent GRL.
707
+ """
708
+ inp = batchs["mel"].permute(0, 2, 1) # [B, T_mel, D]
709
+ lens = batchs["mel_lengths"]
710
+ text = batchs["text"]
711
+ langs = batchs["langs"]
712
+ audio_16k_list = batchs.get("audio_16k", None)
713
+ prosody_idx_list = batchs.get("prosody_idx", None)
714
+
715
+ # # ---- 4. randomly clip and shuffle segments ----
716
+ # rand_mel = [clip_and_shuffle(spec, spec.shape[-1]) for spec in batchs["mel"]]
717
+
718
+ # padded_rand_mel = []
719
+ # for spec in rand_mel:
720
+ # padding = (0, batchs["mel"].shape[-1] - spec.size(-1))
721
+ # padded_spec = F.pad(spec, padding, value=0)
722
+ # padded_rand_mel.append(padded_spec)
723
+ # rand_mel = torch.stack(padded_rand_mel).permute(0, 2, 1)
724
+ # assert rand_mel.shape == inp.shape, f"shape diff: rand_mel.shape: {rand_mel.shape}, inp.shape: {inp.shape}"
725
+
726
+ if inp.ndim == 2:
727
+ inp = self.mel_spec(inp).permute(0, 2, 1)
728
+ assert inp.shape[-1] == self.num_channels
729
+
730
+ batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, self.device
731
+
732
+ # --- handle text
733
+ if isinstance(text, list):
734
+ if exists(self.vocab_char_map):
735
+ text = list_str_to_idx(text, self.vocab_char_map).to(device)
736
+ else:
737
+ text = list_str_to_tensor(text).to(device)
738
+ assert text.shape[0] == batch
739
+ # print("text:", batchs["text"][0], text.shape, text[0], batchs["text_lengths"][0])
740
+ # --- prosody conditioning (compute embeddings per sub-utterance)
741
+ prosody_mel_cond = None
742
+ prosody_text_cond = None
743
+ if (
744
+ self.prosody_encoder is not None
745
+ and audio_16k_list is not None
746
+ and prosody_idx_list is not None
747
+ ):
748
+ # prepare zero tensors for each sample
749
+ T_mel = seq_len
750
+ T_text = text.shape[1]
751
+ prosody_mel_cond = torch.zeros(batch, T_mel, 512, device=device, dtype=dtype)
752
+ prosody_text_cond = torch.zeros(batch, T_text, 512, device=device, dtype=dtype)
753
+
754
+ # collect all segments, run encoder per segment
755
+ seg_embeds: list[Tensor] = []
756
+ seg_meta: list[tuple[int, int, int, int, int, int]] = []
757
+ for b in range(batch):
758
+ audio_b = audio_16k_list[b]
759
+ idx_list = prosody_idx_list[b]
760
+ if audio_b is None or idx_list is None:
761
+ continue
762
+ audio_b = audio_b.to(device=device, dtype=dtype)
763
+ for seg in idx_list:
764
+ text_start, text_end, mel_start, mel_end, audio_start, audio_end = seg
765
+ # clamp audio indices
766
+ audio_start = max(0, min(audio_start, audio_b.shape[0] - 1))
767
+ audio_end = max(audio_start + 1, min(audio_end, audio_b.shape[0]))
768
+ audio_seg = audio_b[audio_start:audio_end]
769
+ if audio_seg.numel() == 0:
770
+ continue
771
+ fbank = extract_fbank_16k(audio_seg) # (T_fbank, 80)
772
+ fbank = fbank.unsqueeze(0).to(device=device, dtype=dtype) # (1, T_fbank, 80)
773
+ with torch.no_grad():
774
+ emb = self.prosody_encoder(fbank, padding_mask=None)[0] # (512,)
775
+ seg_embeds.append(emb)
776
+ seg_meta.append(
777
+ (b, text_start, text_end, mel_start, mel_end)
778
+ )
779
+
780
+ if seg_embeds:
781
+ seg_embeds_tensor = torch.stack(seg_embeds, dim=0) # (N_seg, 512)
782
+ # scatter embeddings back to per-sample tensors
783
+ for emb, meta in zip(seg_embeds_tensor, seg_meta):
784
+ b, ts, te, ms, me = meta
785
+ emb_exp = emb.to(device=device, dtype=dtype)
786
+ prosody_mel_cond[b, ms:me, :] = emb_exp
787
+ prosody_text_cond[b, ts:te, :] = emb_exp
788
+
789
+ # dropout on prosody conditioning
790
+ prosody_mel_cond = self.prosody_dropout(prosody_mel_cond)
791
+ prosody_text_cond = self.prosody_dropout(prosody_text_cond)
792
+
793
+ # --- mask & random span
794
+ mask = lens_to_mask(lens, length=seq_len)
795
+ frac_lengths = torch.zeros((batch,), device=device).float().uniform_(*self.frac_lengths_mask)
796
+ rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)
797
+ if exists(mask):
798
+ rand_span_mask &= mask
799
+
800
+ # --- flow setup
801
+ x1 = inp
802
+ x0 = torch.randn_like(x1)
803
+ time = torch.rand((batch,), dtype=dtype, device=device)
804
+ t = time[:, None, None]
805
+ φ = (1 - t) * x0 + t * x1
806
+ flow = x1 - x0
807
+
808
+ # --- conditional input (masked mel) + optional prosody
809
+ cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1) # zero out the span to be generated
810
+ if prosody_mel_cond is not None:
811
+ prosody_mel_proj = self.prosody_to_mel(prosody_mel_cond) # (B, T_mel, num_channels)
812
+ # if needed, pad/crop to seq_len
813
+ if prosody_mel_proj.size(1) < seq_len:
814
+ pad_len = seq_len - prosody_mel_proj.size(1)
815
+ prosody_mel_proj = F.pad(prosody_mel_proj, (0, 0, 0, pad_len))
816
+ elif prosody_mel_proj.size(1) > seq_len:
817
+ prosody_mel_proj = prosody_mel_proj[:, :seq_len, :]
818
+ cond = cond + prosody_mel_proj
819
+
820
+ # --- Gradient reversal: encourage accent-invariant cond
821
+ cond_grl = grad_reverse(cond, lambda_=1.0)
822
+
823
+ # # --- random drop condition for CFG-like robustness
824
+ # drop_audio_cond = random() < self.audio_drop_prob
825
+ # drop_text_cond = random() < self.text_drop_prob if not drop_audio_cond else True
826
+
827
+ # safe per-batch random (tensor)
828
+ rand_for_drop = torch.rand(1, device=device)
829
+ drop_audio_cond = (rand_for_drop.item() < self.audio_drop_prob)
830
+ rand_for_text = torch.rand(1, device=device)
831
+ drop_text_cond = (rand_for_text.item() < self.text_drop_prob)
832
+
833
+ # --- main prediction
834
+ pred = self.transformer(
835
+ x=φ,
836
+ cond=cond_grl,
837
+ text=text,
838
+ time=time,
839
+ drop_audio_cond=drop_audio_cond,
840
+ drop_text=drop_text_cond,
841
+ prosody_text=prosody_text_cond,
842
+ )
843
+
844
+ # === FLOW LOSS (robust mask-weighted) ===
845
+ pred_clamp = pred.float().clamp(-20, 20)
846
+ per_elem_loss = F.mse_loss(pred_clamp, flow, reduction="none") # [B, T, D]
847
+
848
+ mask_exp = rand_span_mask.unsqueeze(-1).to(dtype=per_elem_loss.dtype) # [B, T, 1]
849
+ masked_loss = per_elem_loss * mask_exp # zeros where mask False
850
+
851
+ # total selected scalar (frames * dim)
852
+ n_selected = mask_exp.sum() * per_elem_loss.size(-1) # scalar
853
+ denom = torch.clamp(n_selected, min=1.0)
854
+
855
+ loss_sum = masked_loss.sum()
856
+ loss = loss_sum / denom
857
+ # numeric safety
858
+ loss = torch.where(torch.isnan(loss) | (loss > 300.0), torch.tensor(300.0, device=loss.device, dtype=loss.dtype), loss)
859
+
860
+ # === ACCENT LOSS ===
861
+ accent_logits = self.accent_classifier(cond_grl)
862
+ # pool across time -> [B, C]
863
+ accent_logits_mean = accent_logits.mean(dim=1)
864
+ lang_labels = langs.to(accent_logits_mean.device).long()
865
+ accent_loss = self.accent_criterion(accent_logits_mean, lang_labels)
866
+ # guard against NaN / Inf in accent_loss
867
+ if not torch.isfinite(accent_loss):
868
+ accent_loss = torch.zeros_like(accent_loss, device=accent_loss.device)
869
+
870
+ base_loss = loss + 0.1 * accent_loss
871
+
872
+ # === OPTIONAL CTC LOSS (robust, only on valid samples) ===
873
+ ctc_scaled = torch.tensor(0.0, device=device, dtype=dtype)
874
+ if getattr(self, "use_ctc_loss", False) and getattr(self, "ctc", None) is not None:
875
+ # select samples with larger t for CTC supervision (similar to forward_old)
876
+ valid_indices = torch.where(time > 0.5)[0]
877
+ if valid_indices.size(0) > 2:
878
+ selected_pred = pred[valid_indices]
879
+ selected_text = text[valid_indices]
880
+ selected_lens = lens[valid_indices]
881
+ # text was tokenized from list_str_to_idx, where padding is -1
882
+ selected_target_lengths = (selected_text != -1).sum(dim=-1)
883
+
884
+ ctc_loss = self.ctc(
885
+ decoder_outputs=selected_pred,
886
+ target_phones=selected_text,
887
+ decoder_lengths=selected_lens,
888
+ target_lengths=selected_target_lengths,
889
+ )
890
+ if torch.isfinite(ctc_loss) and ctc_loss.item() > 1e-6:
891
+ ctc_scaled = ctc_loss
892
+ base_loss = base_loss + 0.1 * ctc_scaled
893
+
894
+ total_loss = base_loss
895
+
896
+ # note: we intentionally do NOT add 0.0 * pred.sum() etc. here, to avoid
897
+ # propagating NaNs from intermediate tensors into the loss scalar.
898
+
899
+ return total_loss, accent_loss, ctc_scaled, cond, pred
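+ # Training-step sketch (optimizer and batch assembly are assumptions; the
+ # batch keys follow this forward(): mel, mel_lengths, text, langs, plus
+ # optional audio_16k / prosody_idx):
+ # >>> total, accent, ctc, cond, pred = model(batch)
+ # >>> total.backward()
+ # >>> optimizer.step(); optimizer.zero_grad()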
lemas_tts/model/modules.py ADDED
@@ -0,0 +1,802 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from librosa.filters import mel as librosa_mel_fn
19
+ from torch import nn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+ from torch.autograd import Function
22
+
23
+ # raw wav to mel spec
24
+
25
+
26
+ mel_basis_cache = {}
27
+ hann_window_cache = {}
28
+
29
+
30
+ def get_bigvgan_mel_spectrogram(
31
+ waveform,
32
+ n_fft=1024,
33
+ n_mel_channels=100,
34
+ target_sample_rate=24000,
35
+ hop_length=256,
36
+ win_length=1024,
37
+ fmin=0,
38
+ fmax=None,
39
+ center=False,
40
+ ): # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
41
+ device = waveform.device
42
+ key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
43
+
44
+ if key not in mel_basis_cache:
45
+ mel = librosa_mel_fn(sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax)
46
+ mel_basis_cache[key] = torch.from_numpy(mel).float().to(device) # TODO: why they need .float()?
47
+ hann_window_cache[key] = torch.hann_window(win_length).to(device)
48
+
49
+ mel_basis = mel_basis_cache[key]
50
+ hann_window = hann_window_cache[key]
51
+
52
+ padding = (n_fft - hop_length) // 2
53
+ waveform = torch.nn.functional.pad(waveform.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1)
54
+
55
+ spec = torch.stft(
56
+ waveform,
57
+ n_fft,
58
+ hop_length=hop_length,
59
+ win_length=win_length,
60
+ window=hann_window,
61
+ center=center,
62
+ pad_mode="reflect",
63
+ normalized=False,
64
+ onesided=True,
65
+ return_complex=True,
66
+ )
67
+ spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
68
+
69
+ mel_spec = torch.matmul(mel_basis, spec)
70
+ mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
71
+
72
+ return mel_spec
73
+
74
+
75
+ def get_vocos_mel_spectrogram(
76
+ waveform,
77
+ n_fft=1024,
78
+ n_mel_channels=100,
79
+ target_sample_rate=24000,
80
+ hop_length=256,
81
+ win_length=1024,
82
+ ):
83
+ mel_stft = torchaudio.transforms.MelSpectrogram(
84
+ sample_rate=target_sample_rate,
85
+ n_fft=n_fft,
86
+ win_length=win_length,
87
+ hop_length=hop_length,
88
+ n_mels=n_mel_channels,
89
+ power=1,
90
+ center=True,
91
+ normalized=False,
92
+ norm=None,
93
+ ).to(waveform.device)
94
+ if len(waveform.shape) == 3:
95
+ waveform = waveform.squeeze(1) # 'b 1 nw -> b nw'
96
+
97
+ assert len(waveform.shape) == 2
98
+
99
+ mel = mel_stft(waveform)
100
+ mel = mel.clamp(min=1e-5).log()
101
+ return mel
102
+
103
+
104
+ class MelSpec(nn.Module):
105
+ def __init__(
106
+ self,
107
+ n_fft=1024,
108
+ hop_length=256,
109
+ win_length=1024,
110
+ n_mel_channels=100,
111
+ target_sample_rate=24_000,
112
+ mel_spec_type="vocos",
113
+ ):
114
+ super().__init__()
115
+ assert mel_spec_type in ["vocos", "bigvgan"], "We only support two mel extraction backends: vocos or bigvgan"
116
+
117
+ self.n_fft = n_fft
118
+ self.hop_length = hop_length
119
+ self.win_length = win_length
120
+ self.n_mel_channels = n_mel_channels
121
+ self.target_sample_rate = target_sample_rate
122
+
123
+ if mel_spec_type == "vocos":
124
+ self.extractor = get_vocos_mel_spectrogram
125
+ elif mel_spec_type == "bigvgan":
126
+ self.extractor = get_bigvgan_mel_spectrogram
127
+
128
+ self.register_buffer("dummy", torch.tensor(0), persistent=False)
129
+
130
+ def forward(self, wav):
131
+ if self.dummy.device != wav.device:
132
+ self.to(wav.device)
133
+
134
+ mel = self.extractor(
135
+ waveform=wav,
136
+ n_fft=self.n_fft,
137
+ n_mel_channels=self.n_mel_channels,
138
+ target_sample_rate=self.target_sample_rate,
139
+ hop_length=self.hop_length,
140
+ win_length=self.win_length,
141
+ )
142
+
143
+ return mel
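+ # Shape sketch: 1 s of 24 kHz audio -> (batch, n_mel_channels, frames); with
+ # the vocos backend (center=True) that is 24000 // 256 + 1 = 94 frames:
+ # >>> MelSpec()(torch.randn(2, 24_000)).shape # torch.Size([2, 100, 94])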
144
+
145
+
146
+ # sinusoidal position embedding
147
+
148
+
149
+ class SinusPositionEmbedding(nn.Module):
150
+ def __init__(self, dim):
151
+ super().__init__()
152
+ self.dim = dim
153
+
154
+ def forward(self, x, scale=1000):
155
+ device = x.device
156
+ half_dim = self.dim // 2
157
+ emb = math.log(10000) / (half_dim - 1)
158
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
159
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
160
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
161
+ return emb
162
+
163
+
164
+ # convolutional position embedding
165
+
166
+
167
+ class ConvPositionEmbedding(nn.Module):
168
+ def __init__(self, dim, kernel_size=31, groups=16):
169
+ super().__init__()
170
+ assert kernel_size % 2 != 0
171
+ self.conv1d = nn.Sequential(
172
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
173
+ nn.Mish(),
174
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
175
+ nn.Mish(),
176
+ )
177
+
178
+ def forward(self, x: float["b n d"], mask: bool["b n"] | None = None): # noqa: F722
179
+ if mask is not None:
180
+ mask = mask[..., None]
181
+ x = x.masked_fill(~mask, 0.0)
182
+
183
+ x = x.permute(0, 2, 1)
184
+ x = self.conv1d(x)
185
+ out = x.permute(0, 2, 1)
186
+
187
+ if mask is not None:
188
+ out = out.masked_fill(~mask, 0.0)
189
+
190
+ return out
191
+
192
+
193
+ # rotary positional embedding related
194
+
195
+
196
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
197
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
198
+ # has some connection to NTK literature
199
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
200
+ # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
201
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
202
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
203
+ t = torch.arange(end, device=freqs.device) # type: ignore
204
+ freqs = torch.outer(t, freqs).float() # type: ignore
205
+ freqs_cos = torch.cos(freqs) # real part
206
+ freqs_sin = torch.sin(freqs) # imaginary part
207
+ return torch.cat([freqs_cos, freqs_sin], dim=-1)
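+ # e.g. precompute_freqs_cis(64, 4096) returns a (4096, 64) table whose halves
+ # are the [cos | sin] of each position's angles; rows are later selected via
+ # get_pos_embed_indices and fed to apply_rotary_pos_emb.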
208
+
209
+
210
+ def get_pos_embed_indices(start, length, max_pos, scale=1.0):
211
+ # length = length if isinstance(length, int) else length.max()
212
+ scale = scale * torch.ones_like(start, dtype=torch.float32) # in case scale is a scalar
213
+ pos = (
214
+ start.unsqueeze(1)
215
+ + (torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)).long()
216
+ )
217
+ # avoid extra long error.
218
+ pos = torch.where(pos < max_pos, pos, max_pos - 1)
219
+ return pos
220
+
221
+
222
+ # Global Response Normalization layer (from ConvNeXt-V2)
223
+
224
+
225
+ class GRN(nn.Module):
226
+ def __init__(self, dim):
227
+ super().__init__()
228
+ self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
229
+ self.beta = nn.Parameter(torch.zeros(1, 1, dim))
230
+
231
+ def forward(self, x):
232
+ Gx = torch.norm(x, p=2, dim=1, keepdim=True)
233
+ Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
234
+ return self.gamma * (x * Nx) + self.beta + x
235
+
236
+
237
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
238
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
239
+
240
+
241
+ class ConvNeXtV2Block(nn.Module):
242
+ def __init__(
243
+ self,
244
+ dim: int,
245
+ intermediate_dim: int,
246
+ dilation: int = 1,
247
+ ):
248
+ super().__init__()
249
+ padding = (dilation * (7 - 1)) // 2
250
+ self.dwconv = nn.Conv1d(
251
+ dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
252
+ ) # depthwise conv
253
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
254
+ self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
255
+ self.act = nn.GELU()
256
+ self.grn = GRN(intermediate_dim)
257
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
258
+
259
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
260
+ residual = x
261
+ x = x.transpose(1, 2) # b n d -> b d n
262
+ x = self.dwconv(x)
263
+ x = x.transpose(1, 2) # b d n -> b n d
264
+ x = self.norm(x)
265
+ x = self.pwconv1(x)
266
+ x = self.act(x)
267
+ x = self.grn(x)
268
+ x = self.pwconv2(x)
269
+ return residual + x
270
+
271
+
272
+ # RMSNorm
273
+
274
+
275
+ class RMSNorm(nn.Module):
276
+ def __init__(self, dim: int, eps: float):
277
+ super().__init__()
278
+ self.eps = eps
279
+ self.weight = nn.Parameter(torch.ones(dim))
280
+ self.native_rms_norm = tuple(int(v) for v in torch.__version__.split(".")[:2]) >= (2, 4)  # robust to versions like "2.10"
281
+
282
+ def forward(self, x):
283
+ if self.native_rms_norm:
284
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
285
+ x = x.to(self.weight.dtype)
286
+ x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
287
+ else:
288
+ variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
289
+ x = x * torch.rsqrt(variance + self.eps)
290
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
291
+ x = x.to(self.weight.dtype)
292
+ x = x * self.weight
293
+
294
+ return x
295
+
296
+
297
+ # AdaLayerNorm
298
+ # return with modulated x for attn input, and params for later mlp modulation
299
+
300
+
301
+ class AdaLayerNorm(nn.Module):
302
+ def __init__(self, dim):
303
+ super().__init__()
304
+
305
+ self.silu = nn.SiLU()
306
+ self.linear = nn.Linear(dim, dim * 6)
307
+
308
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
309
+
310
+ def forward(self, x, emb=None):
311
+ emb = self.linear(self.silu(emb))
312
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
313
+
314
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
315
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
316
+
317
+
318
+ # AdaLayerNorm for final layer
319
+ # return only with modulated x for attn input, cuz no more mlp modulation
320
+
321
+
322
+ class AdaLayerNorm_Final(nn.Module):
323
+ def __init__(self, dim):
324
+ super().__init__()
325
+
326
+ self.silu = nn.SiLU()
327
+ self.linear = nn.Linear(dim, dim * 2)
328
+
329
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
330
+
331
+ def forward(self, x, emb):
332
+ emb = self.linear(self.silu(emb))
333
+ scale, shift = torch.chunk(emb, 2, dim=1)
334
+
335
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
336
+ return x
337
+
338
+
339
+ # FeedForward
340
+
341
+
342
+ class FeedForward(nn.Module):
343
+ def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
344
+ super().__init__()
345
+ inner_dim = int(dim * mult)
346
+ dim_out = dim_out if dim_out is not None else dim
347
+
348
+ activation = nn.GELU(approximate=approximate)
349
+ project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
350
+ self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
351
+
352
+ def forward(self, x):
353
+ return self.ff(x)
354
+
355
+
356
+ # Attention with possible joint part
357
+ # modified from diffusers/src/diffusers/models/attention_processor.py
358
+
359
+
360
+ class Attention(nn.Module):
361
+ def __init__(
362
+ self,
363
+ processor: JointAttnProcessor | AttnProcessor,
364
+ dim: int,
365
+ heads: int = 8,
366
+ dim_head: int = 64,
367
+ dropout: float = 0.0,
368
+ context_dim: Optional[int] = None, # if not None -> joint attention
369
+ context_pre_only: bool = False,
370
+ qk_norm: Optional[str] = None,
371
+ ):
372
+ super().__init__()
373
+
374
+ if not hasattr(F, "scaled_dot_product_attention"):
375
+ raise ImportError("Attention requires PyTorch 2.0; please upgrade PyTorch to 2.0 to use it.")
376
+
377
+ self.processor = processor
378
+
379
+ self.dim = dim
380
+ self.heads = heads
381
+ self.inner_dim = dim_head * heads
382
+ self.dropout = dropout
383
+
384
+ self.context_dim = context_dim
385
+ self.context_pre_only = context_pre_only
386
+
387
+ self.to_q = nn.Linear(dim, self.inner_dim)
388
+ self.to_k = nn.Linear(dim, self.inner_dim)
389
+ self.to_v = nn.Linear(dim, self.inner_dim)
390
+
391
+ if qk_norm is None:
392
+ self.q_norm = None
393
+ self.k_norm = None
394
+ elif qk_norm == "rms_norm":
395
+ self.q_norm = RMSNorm(dim_head, eps=1e-6)
396
+ self.k_norm = RMSNorm(dim_head, eps=1e-6)
397
+ else:
398
+ raise ValueError(f"Unimplemented qk_norm: {qk_norm}")
399
+
400
+ if self.context_dim is not None:
401
+ self.to_q_c = nn.Linear(context_dim, self.inner_dim)
402
+ self.to_k_c = nn.Linear(context_dim, self.inner_dim)
403
+ self.to_v_c = nn.Linear(context_dim, self.inner_dim)
404
+ if qk_norm is None:
405
+ self.c_q_norm = None
406
+ self.c_k_norm = None
407
+ elif qk_norm == "rms_norm":
408
+ self.c_q_norm = RMSNorm(dim_head, eps=1e-6)
409
+ self.c_k_norm = RMSNorm(dim_head, eps=1e-6)
410
+
411
+ self.to_out = nn.ModuleList([])
412
+ self.to_out.append(nn.Linear(self.inner_dim, dim))
413
+ self.to_out.append(nn.Dropout(dropout))
414
+
415
+ if self.context_dim is not None and not self.context_pre_only:
416
+ self.to_out_c = nn.Linear(self.inner_dim, context_dim)
417
+
418
+ def forward(
419
+ self,
420
+ x: float["b n d"], # noised input x # noqa: F722
421
+ c: float["b n d"] = None, # context c # noqa: F722
422
+ mask: bool["b n"] | None = None, # noqa: F722
423
+ rope=None, # rotary position embedding for x
424
+ c_rope=None, # rotary position embedding for c
425
+ ) -> torch.Tensor:
426
+ if c is not None:
427
+ return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
428
+ else:
429
+ return self.processor(self, x, mask=mask, rope=rope)
430
+
431
+
432
+ # Attention processor
433
+
434
+
435
+ class AttnProcessor:
436
+ def __init__(
437
+ self,
438
+ pe_attn_head: int | None = None, # number of attention head to apply rope, None for all
439
+ ):
440
+ self.pe_attn_head = pe_attn_head
441
+
442
+ def __call__(
443
+ self,
444
+ attn: Attention,
445
+ x: float["b n d"], # noised input x # noqa: F722
446
+ mask: bool["b n"] | None = None, # noqa: F722
447
+ rope=None, # rotary position embedding
448
+ ) -> torch.FloatTensor:
449
+ batch_size = x.shape[0]
450
+
451
+ # `sample` projections
452
+ query = attn.to_q(x)
453
+ key = attn.to_k(x)
454
+ value = attn.to_v(x)
455
+
456
+ # attention
457
+ inner_dim = key.shape[-1]
458
+ head_dim = inner_dim // attn.heads
459
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
460
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
461
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
462
+
463
+ # qk norm
464
+ if attn.q_norm is not None:
465
+ query = attn.q_norm(query)
466
+ if attn.k_norm is not None:
467
+ key = attn.k_norm(key)
468
+
469
+ # apply rotary position embedding
470
+ if rope is not None:
471
+ freqs, xpos_scale = rope
472
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
473
+
474
+ if self.pe_attn_head is not None:
475
+ pn = self.pe_attn_head
476
+ query[:, :pn, :, :] = apply_rotary_pos_emb(query[:, :pn, :, :], freqs, q_xpos_scale)
477
+ key[:, :pn, :, :] = apply_rotary_pos_emb(key[:, :pn, :, :], freqs, k_xpos_scale)
478
+ else:
479
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
480
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
481
+
482
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
483
+ if mask is not None:
484
+ attn_mask = mask
485
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
486
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
487
+ else:
488
+ attn_mask = None
489
+
490
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
491
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
492
+ x = x.to(query.dtype)
493
+
494
+ # linear proj
495
+ x = attn.to_out[0](x)
496
+ # dropout
497
+ x = attn.to_out[1](x)
498
+
499
+ if mask is not None:
500
+ mask = mask.unsqueeze(-1)
501
+ x = x.masked_fill(~mask, 0.0)
502
+
503
+ return x
504
+
505
+
506
+ # Joint Attention processor for MM-DiT
507
+ # modified from diffusers/src/diffusers/models/attention_processor.py
508
+
509
+
510
+ class JointAttnProcessor:
511
+ def __init__(self):
512
+ pass
513
+
514
+ def __call__(
515
+ self,
516
+ attn: Attention,
517
+ x: float["b n d"], # noised input x # noqa: F722
518
+ c: float["b nt d"] = None, # context c, here text # noqa: F722
519
+ mask: bool["b n"] | None = None, # noqa: F722
520
+ rope=None, # rotary position embedding for x
521
+ c_rope=None, # rotary position embedding for c
522
+ ) -> torch.FloatTensor:
523
+ residual = x
524
+
525
+ batch_size = c.shape[0]
526
+
527
+ # `sample` projections
528
+ query = attn.to_q(x)
529
+ key = attn.to_k(x)
530
+ value = attn.to_v(x)
531
+
532
+ # `context` projections
533
+ c_query = attn.to_q_c(c)
534
+ c_key = attn.to_k_c(c)
535
+ c_value = attn.to_v_c(c)
536
+
537
+ # attention
538
+ inner_dim = key.shape[-1]
539
+ head_dim = inner_dim // attn.heads
540
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
541
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
542
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
543
+ c_query = c_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
544
+ c_key = c_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
545
+ c_value = c_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
546
+
547
+ # qk norm
548
+ if attn.q_norm is not None:
549
+ query = attn.q_norm(query)
550
+ if attn.k_norm is not None:
551
+ key = attn.k_norm(key)
552
+ if attn.c_q_norm is not None:
553
+ c_query = attn.c_q_norm(c_query)
554
+ if attn.c_k_norm is not None:
555
+ c_key = attn.c_k_norm(c_key)
556
+
557
+ # apply rope for context and noised input independently
558
+ if rope is not None:
559
+ freqs, xpos_scale = rope
560
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
561
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
562
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
563
+ if c_rope is not None:
564
+ freqs, xpos_scale = c_rope
565
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
566
+ c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
567
+ c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
568
+
569
+ # joint attention
570
+ query = torch.cat([query, c_query], dim=2)
571
+ key = torch.cat([key, c_key], dim=2)
572
+ value = torch.cat([value, c_value], dim=2)
573
+
574
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
575
+ if mask is not None:
576
+ attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
577
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
578
+ attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
579
+ else:
580
+ attn_mask = None
581
+
582
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
583
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
584
+ x = x.to(query.dtype)
585
+
586
+ # Split the attention outputs.
587
+ x, c = (
588
+ x[:, : residual.shape[1]],
589
+ x[:, residual.shape[1] :],
590
+ )
591
+
592
+ # linear proj
593
+ x = attn.to_out[0](x)
594
+ # dropout
595
+ x = attn.to_out[1](x)
596
+ if not attn.context_pre_only:
597
+ c = attn.to_out_c(c)
598
+
599
+ if mask is not None:
600
+ mask = mask.unsqueeze(-1)
601
+ x = x.masked_fill(~mask, 0.0)
602
+ # c = c.masked_fill(~mask, 0.) # no mask for c (text)
603
+
604
+ return x, c
605
+
606
+
607
+ # DiT Block
608
+
609
+
610
+ class DiTBlock(nn.Module):
611
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, qk_norm=None, pe_attn_head=None):
612
+ super().__init__()
613
+
614
+ self.attn_norm = AdaLayerNorm(dim)
615
+ self.attn = Attention(
616
+ processor=AttnProcessor(pe_attn_head=pe_attn_head),
617
+ dim=dim,
618
+ heads=heads,
619
+ dim_head=dim_head,
620
+ dropout=dropout,
621
+ qk_norm=qk_norm,
622
+ )
623
+
624
+ self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
625
+ self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
626
+
627
+ def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
628
+ # pre-norm & modulation for attention input
629
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
630
+
631
+ # attention
632
+ attn_output = self.attn(x=norm, mask=mask, rope=rope)
633
+
634
+ # process attention output for input x
635
+ x = x + gate_msa.unsqueeze(1) * attn_output
636
+
637
+ norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
638
+ ff_output = self.ff(norm)
639
+ x = x + gate_mlp.unsqueeze(1) * ff_output
640
+
641
+ return x
642
+
643
+
644
+ # MMDiT Block https://arxiv.org/abs/2403.03206
645
+
646
+
647
+ class MMDiTBlock(nn.Module):
648
+ r"""
649
+ modified from diffusers/src/diffusers/models/attention.py
650
+
651
+ notes.
652
+ _c: context related. text, cond, etc. (left part in sd3 fig2.b)
653
+ _x: noised input related. (right part)
654
+ context_pre_only: last layer only do prenorm + modulation cuz no more ffn
655
+ """
656
+
657
+ def __init__(
658
+ self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_dim=None, context_pre_only=False, qk_norm=None
659
+ ):
660
+ super().__init__()
661
+ if context_dim is None:
662
+ context_dim = dim
663
+ self.context_pre_only = context_pre_only
664
+
665
+ self.attn_norm_c = AdaLayerNorm_Final(context_dim) if context_pre_only else AdaLayerNorm(context_dim)
666
+ self.attn_norm_x = AdaLayerNorm(dim)
667
+ self.attn = Attention(
668
+ processor=JointAttnProcessor(),
669
+ dim=dim,
670
+ heads=heads,
671
+ dim_head=dim_head,
672
+ dropout=dropout,
673
+ context_dim=context_dim,
674
+ context_pre_only=context_pre_only,
675
+ qk_norm=qk_norm,
676
+ )
677
+
678
+ if not context_pre_only:
679
+ self.ff_norm_c = nn.LayerNorm(context_dim, elementwise_affine=False, eps=1e-6)
680
+ self.ff_c = FeedForward(dim=context_dim, mult=ff_mult, dropout=dropout, approximate="tanh")
681
+ else:
682
+ self.ff_norm_c = None
683
+ self.ff_c = None
684
+ self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
685
+ self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
686
+
687
+ def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
688
+ # pre-norm & modulation for attention input
689
+ if self.context_pre_only:
690
+ norm_c = self.attn_norm_c(c, t)
691
+ else:
692
+ norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
693
+ norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
694
+
695
+ # attention
696
+ x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
697
+
698
+ # process attention output for context c
699
+ if self.context_pre_only:
700
+ c = None
701
+ else: # if not last layer
702
+ c = c + c_gate_msa.unsqueeze(1) * c_attn_output
703
+
704
+ norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
705
+ c_ff_output = self.ff_c(norm_c)
706
+ c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
707
+
708
+ # process attention output for input x
709
+ x = x + x_gate_msa.unsqueeze(1) * x_attn_output
710
+
711
+ norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
712
+ x_ff_output = self.ff_x(norm_x)
713
+ x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
714
+
715
+ return c, x
716
+
717
+
718
+ # time step conditioning embedding
719
+
720
+
721
+ class TimestepEmbedding(nn.Module):
722
+ def __init__(self, dim, freq_embed_dim=256):
723
+ super().__init__()
724
+ self.time_embed = SinusPositionEmbedding(freq_embed_dim)
725
+ self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
726
+
727
+ def forward(self, timestep: float["b"]): # noqa: F821
728
+ time_hidden = self.time_embed(timestep)
729
+ time_hidden = time_hidden.to(timestep.dtype)
730
+ time = self.time_mlp(time_hidden) # b d
731
+ return time
732
+
733
+
734
+ class MIEsitmator(nn.Module):
735
+ def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
736
+ super(MIEsitmator, self).__init__()
737
+ self.proj = nn.Sequential(
738
+ torch.nn.Linear(decoder_dim, hidden_size, bias=True),
739
+ nn.ReLU(),
740
+ nn.Dropout(p=dropout)
741
+ )
742
+ self.ctc_proj = torch.nn.Linear(hidden_size, vocab_size + 1, bias=True)
743
+ self.ctc = nn.CTCLoss(blank=vocab_size, reduction='mean', zero_infinity=True)
744
+
745
+ def forward(self, decoder_outputs, target_phones, decoder_lengths, target_lengths):
746
+ out = self.proj(decoder_outputs.type(self.ctc_proj.weight.dtype))
747
+ log_probs = self.ctc_proj(out).log_softmax(dim=2)
748
+ log_probs = log_probs.transpose(1, 0)
749
+ ctc_loss = self.ctc(log_probs.float(), target_phones, decoder_lengths, target_lengths)
750
+ ctc_loss = ctc_loss / decoder_lengths.float()
751
+
752
+ # print("ctc_loss:", ctc_loss.shape, "ctc_max:", torch.max(ctc_loss), "ctc_min:", torch.min(ctc_loss), decoder_lengths[0])
753
+
754
+ # # 2. percentage of non-NaN values
755
+ # mask = ~torch.isnan(ctc_loss)
756
+ # total_count = ctc_loss.numel() # total number of elements (all dims)
757
+ # valid_count = mask.sum().item() # number of non-NaN elements
758
+ # valid_percentage = (valid_count / total_count) * 100
759
+ # print(f"ctc loss: total_count: {total_count}", f"valid_count: {valid_count}", f"valid_percentage: {valid_percentage:.2f}%")
760
+
761
+ # 3. replace NaN or values above 300 with 300
762
+ # ctc_loss = torch.where(torch.isnan(ctc_loss), 150.0, ctc_loss)
763
+ ctc_loss = torch.where((ctc_loss > 300.0) | torch.isnan(ctc_loss), 300.0, ctc_loss)
764
+ # ctc_loss = torch.nan_to_num(ctc_loss, nan=0.0, posinf=0.0, neginf=0.0)
765
+ # average by number of frames since taco_loss is averaged.
766
+ ctc_loss = ctc_loss.mean()
767
+ return ctc_loss
768
+
769
+ def inference(self, decoder_output):
770
+ out = self.proj(decoder_output.type(self.ctc_proj.weight.dtype))
771
+ log_probs = self.ctc_proj(out).log_softmax(dim=2)
772
+ log_probs = log_probs.transpose(1, 0)
773
+ return log_probs  # (T, B, vocab_size + 1); .item() would fail on a multi-element tensor
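+ # Shape sketch: decoder_outputs (B, T, D) -> log_probs (T, B, vocab_size + 1),
+ # with blank id vocab_size, matching nn.CTCLoss(blank=vocab_size) above.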
774
+
775
+
776
+ class AccentClassifier(nn.Module):
777
+ def __init__(self, input_dim, hidden_dim, num_accents, dropout=0.3):
778
+ super().__init__()
779
+ self.net = nn.Sequential(
780
+ nn.Linear(input_dim, hidden_dim),
781
+ nn.ReLU(),
782
+ nn.Dropout(dropout),
783
+ nn.Linear(hidden_dim, num_accents)
784
+ )
785
+
786
+ def forward(self, x):
787
+ return self.net(x)
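+ # e.g. AccentClassifier(input_dim=100, hidden_dim=256, num_accents=4) maps
+ # (B, T, 100) frame features to (B, T, 4) logits; callers in cfm.py mean-pool
+ # over T before the cross-entropy. (Dims here are illustrative.)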
788
+
789
+
790
+ class GradientReversalFunction(Function):
791
+ @staticmethod
792
+ def forward(ctx, x, lambda_):
793
+ ctx.lambda_ = lambda_
794
+ return x.view_as(x)
795
+
796
+ @staticmethod
797
+ def backward(ctx, grad_output):
798
+ return grad_output.neg() * ctx.lambda_, None
799
+
800
+ def grad_reverse(x, lambda_=1.0):
801
+ return GradientReversalFunction.apply(x, lambda_)
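+ # GRL sanity check: forward is the identity, backward flips and scales the sign.
+ # >>> x = torch.ones(3, requires_grad=True)
+ # >>> grad_reverse(x, lambda_=0.5).sum().backward()
+ # >>> x.grad # tensor([-0.5000, -0.5000, -0.5000])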
802
+
lemas_tts/model/utils.py ADDED
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ from collections import defaultdict
6
+ from importlib.resources import files
7
+
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ import jieba
12
+ from pypinyin import lazy_pinyin, Style
13
+ import sys
14
+
15
+ # seed everything
16
+
17
+
18
+ def seed_everything(seed=0):
19
+ random.seed(seed)
20
+ os.environ["PYTHONHASHSEED"] = str(seed)
21
+ torch.manual_seed(seed)
22
+ torch.cuda.manual_seed(seed)
23
+ torch.cuda.manual_seed_all(seed)
24
+ torch.backends.cudnn.deterministic = True
25
+ torch.backends.cudnn.benchmark = False
26
+
27
+
28
+ # helpers
29
+
30
+
31
+ def exists(v):
32
+ return v is not None
33
+
34
+
35
+ def default(v, d):
36
+ return v if exists(v) else d
37
+
38
+
39
+ # tensor helpers
40
+
41
+
42
+ def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]: # noqa: F722 F821
43
+ if not exists(length):
44
+ length = t.amax()
45
+
46
+ seq = torch.arange(length, device=t.device)
47
+ return seq[None, :] < t[:, None]
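+ # e.g. lens_to_mask(torch.tensor([2, 4])) ->
+ # tensor([[ True,  True, False, False],
+ #         [ True,  True,  True,  True]])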
48
+
49
+
50
+ def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"]): # noqa: F722 F821
51
+ max_seq_len = seq_len.max().item()
52
+ seq = torch.arange(max_seq_len, device=start.device).long()
53
+ start_mask = seq[None, :] >= start[:, None]
54
+ end_mask = seq[None, :] < end[:, None]
55
+ return start_mask & end_mask
56
+
57
+
58
+ def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]): # noqa: F722 F821
59
+ lengths = (frac_lengths * seq_len).long()
60
+ max_start = seq_len - lengths
61
+
62
+ rand = torch.rand_like(frac_lengths)
63
+ start = (max_start * rand).long().clamp(min=0)
64
+ end = start + lengths
65
+
66
+ return mask_from_start_end_indices(seq_len, start, end)
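+ # e.g. seq_len=tensor([10]), frac_lengths=tensor([0.5]) masks one random
+ # contiguous span of 5 positions somewhere inside the first 10 frames.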
67
+
68
+
69
+ def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]: # noqa: F722
70
+ if not exists(mask):
71
+ return t.mean(dim=1)
72
+
73
+ t = torch.where(mask[:, :, None], t, torch.tensor(0.0, device=t.device))
74
+ num = t.sum(dim=1)
75
+ den = mask.float().sum(dim=1)
76
+
77
+ return num / den.clamp(min=1.0)
78
+
79
+
80
+ # simple utf-8 tokenizer, since paper went character based
81
+ def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]: # noqa: F722
82
+ list_tensors = [torch.tensor([*bytes(t, "UTF-8")]) for t in text] # ByT5 style
83
+ text = pad_sequence(list_tensors, padding_value=padding_value, batch_first=True)
84
+ return text
85
+
86
+ # char tokenizer, based on custom dataset's extracted .txt file
87
+ def list_str_to_idx(
88
+ text: list[str] | list[list[str]],
89
+ vocab_char_map: dict[str, int], # {char: idx}
90
+ padding_value=-1,
91
+ ) -> int["b nt"]: # noqa: F722
92
+ list_idx_tensors = [torch.tensor([vocab_char_map.get(c, 0) for c in t]) for t in text] # pinyin or char style
93
+ text = pad_sequence(list_idx_tensors, padding_value=padding_value, batch_first=True)
94
+ return text
95
+
96
+
97
+ # Get tokenizer
98
+ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
99
+ """
100
+ tokenizer - "pinyin" do g2p for only chinese characters, need .txt vocab_file
101
+ - "char" for char-wise tokenizer, need .txt vocab_file
102
+ - "byte" for utf-8 tokenizer
103
+ - "custom" if you're directly passing in a path to the vocab.txt you want to use
104
+ vocab_size - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
105
+ - if use "char", derived from unfiltered character & symbol counts of custom dataset
106
+ - if use "byte", set to 256 (unicode byte range)
107
+ """
108
+ if tokenizer in ["pinyin", "char"]:
109
+ tokenizer_path = os.path.join(files("lemas_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
110
+ with open(tokenizer_path, "r", encoding="utf-8") as f:
111
+ vocab_char_map = {}
112
+ for i, char in enumerate(f):
113
+ vocab_char_map[char[:-1]] = i
114
+ vocab_size = len(vocab_char_map)
115
+ assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"
116
+
117
+ elif tokenizer == "byte":
118
+ vocab_char_map = None
119
+ vocab_size = 256
120
+
121
+ elif tokenizer == "custom":
122
+ with open(dataset_name, "r", encoding="utf-8") as f:
123
+ vocab_char_map = {}
124
+ for i, char in enumerate(f):
125
+ vocab_char_map[char[:-1]] = i
126
+ vocab_size = len(vocab_char_map)
127
+
128
+ return vocab_char_map, vocab_size
129
+
130
+
131
+ # convert char to pinyin
132
+ def convert_char_to_pinyin(text_list, polyphone=True):
133
+ if jieba.dt.initialized is False:
134
+ jieba.default_logger.setLevel(50) # CRITICAL
135
+ jieba.initialize()
136
+
137
+ final_text_list = []
138
+ custom_trans = str.maketrans(
139
+ {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
140
+ ) # add custom trans here, to address oov
141
+
142
+ def is_chinese(c):
143
+ return (
144
+ "\u3100" <= c <= "\u9fff" # common chinese characters
145
+ )
146
+
147
+ for text in text_list:
148
+ char_list = []
149
+ text = text.translate(custom_trans)
150
+ from lemas_tts.infer.text_norm.cn_tn import NSWNormalizer  # cn_tn lives under infer/text_norm/
151
+ text = NSWNormalizer(text.strip()).normalize()
152
+ text = list(jieba.cut(text))
153
+ for seg in text:
154
+ seg_byte_len = len(bytes(seg, "UTF-8"))
155
+ if seg_byte_len == len(seg): # if pure alphabets and symbols
156
+ if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
157
+ char_list.append(" ")
158
+ char_list.extend(seg)
159
+ elif polyphone and seg_byte_len == 3 * len(seg): # if pure east asian characters
160
+ seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
161
+ for i, c in enumerate(seg):
162
+ if is_chinese(c):
163
+ char_list.append(" ")
164
+ char_list.append(seg_[i])
165
+ else: # if mixed characters, alphabets and symbols
166
+ for c in seg:
167
+ if ord(c) < 256:
168
+ char_list.extend(c)
169
+ elif is_chinese(c):
170
+ char_list.append(" ")
171
+ char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
172
+ else:
173
+ char_list.append(c)
174
+ final_text_list.append(char_list)
175
+
176
+ return final_text_list
177
+
178
+
179
+ # filter func for dirty data with many repetitions
180
+
181
+
182
+ def repetition_found(text, length=2, tolerance=10):
183
+ pattern_count = defaultdict(int)
184
+ for i in range(len(text) - length + 1):
185
+ pattern = text[i : i + length]
186
+ pattern_count[pattern] += 1
187
+ for pattern, count in pattern_count.items():
188
+ if count > tolerance:
189
+ return True
190
+ return False
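A quick sketch of the tensor helpers above (lens_to_mask feeding maybe_masked_mean), with toy shapes chosen purely for illustration:

    import torch

    lengths = torch.tensor([2, 4])      # valid frames per sample (B,)
    mask = lens_to_mask(lengths)        # (B, 4) bool, True inside each sample's length
    feats = torch.randn(2, 4, 8)        # padded features (B, N, D)

    pooled = maybe_masked_mean(feats, mask)  # (B, D); padded frames are excluded from the mean
    assert pooled.shape == (2, 8)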
lemas_tts/scripts/inference_gradio.py ADDED
@@ -0,0 +1,584 @@
1
+ import gc
2
+ import os
3
+ import platform
4
+ import psutil
5
+ import tempfile
6
+ from glob import glob
7
+ import traceback
8
+ import click
9
+ import gradio as gr
10
+ import torch
11
+
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ # Add the local code directory so that `lemas_tts` can be imported when running this
16
+ # script directly without installing the package.
17
+ THIS_FILE = Path(__file__).resolve()
18
+ SRC_ROOT = THIS_FILE.parents[2] # .../code
19
+ sys.path.append(str(SRC_ROOT))
20
+
21
+
22
+ def _find_repo_root(start: Path) -> Path:
23
+ """Locate the repo root by looking for a `pretrained_models` folder upwards."""
24
+ for p in [start, *start.parents]:
25
+ if (p / "pretrained_models").is_dir():
26
+ return p
27
+ cwd = Path.cwd()
28
+ if (cwd / "pretrained_models").is_dir():
29
+ return cwd
30
+ return start
31
+
32
+
33
+ REPO_ROOT = _find_repo_root(THIS_FILE)
34
+ PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"
35
+ CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"
36
+ DATA_ROOT = PRETRAINED_ROOT / "data"
37
+ UVR5_CODE_DIR = REPO_ROOT / "code" / "uvr5"
38
+ UVR5_MODEL_DIR = PRETRAINED_ROOT / "uvr5" / "models" / "MDX_Net_Models" / "model_data"
39
+
40
+ from lemas_tts.api import F5TTS
41
+ import torchaudio  # torch is already imported above
42
+ import soundfile as sf
43
+
44
+ # Global variables
45
+ tts_api = None
46
+ last_checkpoint = ""
47
+ last_device = ""
48
+ last_ema = None
49
+
50
+ # Device detection
51
+ device = (
52
+ "cuda"
53
+ if torch.cuda.is_available()
54
+ else "xpu"
55
+ if hasattr(torch, "xpu") and torch.xpu.is_available()  # torch.xpu is absent on older PyTorch builds
56
+ else "mps"
57
+ if torch.backends.mps.is_available()
58
+ else "cpu"
59
+ )
60
+
61
+
62
+ class UVR5:
63
+ def __init__(self, model_dir):
64
+ code_dir = str(UVR5_CODE_DIR)
65
+ self.model = self.load_model(str(model_dir), code_dir)
66
+
67
+ def load_model(self, model_dir, code_dir):
68
+ import sys, json, os
69
+ sys.path.append(code_dir)
70
+ from multiprocess_cuda_infer import ModelData, Inference
71
+ model_path = os.path.join(model_dir, 'Kim_Vocal_1.onnx')
72
+ config_path = os.path.join(model_dir, 'MDX-Net-Kim-Vocal1.json')
73
+ configs = json.loads(open(config_path, 'r', encoding='utf-8').read())
74
+ model_data = ModelData(
75
+ model_path=model_path,
76
+ audio_path = model_dir,
77
+ result_path = model_dir,
78
+ device = 'cpu',
79
+ process_method = "MDX-Net",
80
+ base_dir=code_dir,
81
+ **configs
82
+ )
83
+
84
+ uvr5_model = Inference(model_data, 'cpu')
85
+ uvr5_model.load_model(model_path, 1)
86
+ return uvr5_model
87
+
88
+ def denoise(self, audio_info):
89
+ print("denoise UVR5: ", audio_info)
90
+ input_audio = load_wav(audio_info, sr=44100, channel=2)
91
+ output_audio = self.model.demix_base({0:input_audio.squeeze()}, is_match_mix=False)
92
+ # transform = torchaudio.transforms.Resample(44100, 16000)
93
+ # output_audio = transform(output_audio)
94
+ return output_audio.squeeze().T.numpy(), 44100
95
+
96
+
97
+ denoise_model = UVR5(UVR5_MODEL_DIR)
98
+
99
+ def load_wav(audio_info, sr=16000, channel=1):
100
+ print("load audio:", audio_info)
101
+ audio, raw_sr = torchaudio.load(audio_info)
102
+ audio = audio.T if len(audio.shape) > 1 and audio.shape[1] == 2 else audio
103
+ audio = audio / torch.max(torch.abs(audio))
104
+ audio = audio.squeeze().float()
105
+ if channel == 1 and len(audio.shape) == 2: # stereo to mono
106
+ audio = audio.mean(dim=0, keepdim=True)
107
+ elif channel == 2 and len(audio.shape) == 1:
108
+ audio = torch.stack((audio, audio)) # mono to stereo
109
+ if raw_sr != sr:
110
+ audio = torchaudio.functional.resample(audio.squeeze(), raw_sr, sr)
111
+ audio = torch.clip(audio, -0.999, 0.999).squeeze()
112
+ return audio
113
+
114
+
115
+ def denoise(audio_info):
116
+ save_path = "./denoised_audio.wav"
117
+ denoised_audio, sr = denoise_model.denoise(audio_info)
118
+ sf.write(save_path, denoised_audio, sr, format='wav', subtype='PCM_24')
119
+ print("save denoised audio:", save_path)
120
+ return save_path
121
+
122
+ def cancel_denoise(audio_info):
123
+ return audio_info
124
+
125
+
126
+ def get_checkpoints_project(project_name=None, is_gradio=True):
127
+ """Get available checkpoint files"""
128
+ checkpoint_dir = [str(CKPTS_ROOT)]
129
+ if project_name is None:
130
+ # Look for checkpoints in common locations
131
+ files_checkpoints = []
132
+ for path in checkpoint_dir:
133
+ if os.path.isdir(path):
134
+ files_checkpoints.extend(glob(os.path.join(path, "**/*.pt"), recursive=True))
135
+ files_checkpoints.extend(glob(os.path.join(path, "**/*.safetensors"), recursive=True))
136
+ break
137
+ else:
138
+ # project_name = project_name.replace("_pinyin", "").replace("_char", "")
139
+ project_name = "_".join(["F5TTS_v1_Base", "vocos", "custom", project_name.replace("_custom", "")]) if project_name != "F5TTS_v1_Base" else project_name
140
+ if os.path.isdir(checkpoint_dir[0]):
141
+ files_checkpoints = glob(os.path.join(checkpoint_dir[0], project_name, "*.pt"))
142
+ files_checkpoints.extend(glob(os.path.join(checkpoint_dir[0], project_name, "*.safetensors")))
143
+ else:
144
+ files_checkpoints = []
145
+ print("files_checkpoints:", project_name, files_checkpoints)
146
+ # Separate pretrained and regular checkpoints
147
+ pretrained_checkpoints = [f for f in files_checkpoints if "pretrained_" in os.path.basename(f)]
148
+ regular_checkpoints = [
149
+ f
150
+ for f in files_checkpoints
151
+ if "pretrained_" not in os.path.basename(f) and "model_last.pt" not in os.path.basename(f)
152
+ ]
153
+ last_checkpoint = [f for f in files_checkpoints if "model_last.pt" in os.path.basename(f)]
154
+
155
+ # Sort regular checkpoints by number
156
+ try:
157
+ regular_checkpoints = sorted(
158
+ regular_checkpoints, key=lambda x: int(os.path.basename(x).split("_")[1].split(".")[0])
159
+ )
160
+ except (IndexError, ValueError):
161
+ regular_checkpoints = sorted(regular_checkpoints)
162
+
163
+ # Combine in order: pretrained, regular, last
164
+ files_checkpoints = pretrained_checkpoints + regular_checkpoints + last_checkpoint
165
+
166
+ select_checkpoint = None if not files_checkpoints else files_checkpoints[-1]
167
+
168
+ if is_gradio:
169
+ return gr.update(choices=files_checkpoints, value=select_checkpoint)
170
+
171
+ return files_checkpoints, select_checkpoint
172
+
173
+
174
+ def get_available_projects():
175
+ """Get available project names from data directory"""
176
+ data_path = str(DATA_ROOT)
177
+
178
+ project_list = []
179
+ if os.path.isdir(data_path):
180
+ for folder in os.listdir(data_path):
181
+ if "test" in folder:
182
+ continue
183
+ project_list.append(folder)
184
+
185
+ # Fallback to a sensible default if no projects are found
186
+ if not project_list:
187
+ project_list = ["multilingual_acc_grl_custom"]
188
+
189
+ return project_list
190
+
191
+
192
+ def infer(
193
+ project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
194
+ ):
195
+ global last_checkpoint, last_device, tts_api, last_ema
196
+
197
+ if not file_checkpoint or not os.path.isfile(file_checkpoint):  # guard against None before isfile
198
+ return None, "Checkpoint not found!", ""
199
+
200
+ if denoise_audio:
201
+ ref_audio = denoise_audio
202
+
203
+ device_test = device # Use the global device
204
+
205
+ if last_checkpoint != file_checkpoint or last_device != device_test or last_ema != use_ema or tts_api is None:
206
+ if last_checkpoint != file_checkpoint:
207
+ last_checkpoint = file_checkpoint
208
+
209
+ if last_device != device_test:
210
+ last_device = device_test
211
+
212
+ if last_ema != use_ema:
213
+ last_ema = use_ema
214
+
215
+ # Try to find vocab file
216
+ vocab_file = None
217
+ possible_vocab_paths = [
218
+ str(DATA_ROOT / project / "vocab.txt"),
219
+ # legacy fallbacks for older layouts
220
+ f"./data/{project}/vocab.txt",
221
+ f"../../data/{project}/vocab.txt",
222
+ "./data/Emilia_ZH_EN_pinyin/vocab.txt",
223
+ "../../data/Emilia_ZH_EN_pinyin/vocab.txt",
224
+ ]
225
+
226
+ for path in possible_vocab_paths:
227
+ if os.path.isfile(path):
228
+ vocab_file = path
229
+ break
230
+
231
+ if vocab_file is None:
232
+ return None, "Vocab file not found!", ""
233
+
234
+ try:
235
+ tts_api = F5TTS(
236
+ model=exp_name,
237
+ ckpt_file=file_checkpoint,
238
+ vocab_file=vocab_file,
239
+ device=device_test,
240
+ use_ema=use_ema,
241
+ frontend=frontend,
242
+ use_prosody_encoder=use_prosody_encoder,
243
+ prosody_cfg_path=str(CKPTS_ROOT / "prosody_encoder" / "pretssel_cfg.json"),
244
+ prosody_ckpt_path=str(CKPTS_ROOT / "prosody_encoder" / "prosody_encoder_UnitY2.pt"),
245
+ )
246
+ except Exception as e:
247
+ traceback.print_exc()
248
+ return None, f"Error loading model: {str(e)}", ""
249
+
250
+ print("Model loaded >>", device_test, file_checkpoint, use_ema)
251
+
252
+ if seed == -1: # -1 used for random
253
+ seed = None
254
+
255
+ try:
256
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
257
+ tts_api.infer(
258
+ ref_file=ref_audio,
259
+ ref_text=ref_text.strip(),
260
+ gen_text=gen_text.strip(),
261
+ nfe_step=nfe_step,
262
+ separate_langs=separate_langs,
263
+ speed=speed,
264
+ cfg_strength=cfg_strength,
265
+ sway_sampling_coef=sway_sampling_coef,
266
+ use_acc_grl=use_acc_grl,
267
+ ref_ratio=ref_ratio,
268
+ no_ref_audio=no_ref_audio,
269
+ use_prosody_encoder=use_prosody_encoder,
270
+ file_wave=f.name,
271
+ seed=seed,
272
+ )
273
+ return f.name, f"Device: {tts_api.device}", str(tts_api.seed)
274
+ except Exception as e:
275
+ traceback.print_exc()
276
+ return None, f"Inference error: {str(e)}", ""
277
+
278
+
279
+ def get_gpu_stats():
280
+ """Get GPU statistics"""
281
+ gpu_stats = ""
282
+
283
+ if torch.cuda.is_available():
284
+ gpu_count = torch.cuda.device_count()
285
+ for i in range(gpu_count):
286
+ gpu_name = torch.cuda.get_device_name(i)
287
+ gpu_properties = torch.cuda.get_device_properties(i)
288
+ total_memory = gpu_properties.total_memory / (1024**3) # in GB
289
+ allocated_memory = torch.cuda.memory_allocated(i) / (1024**2) # in MB
290
+ reserved_memory = torch.cuda.memory_reserved(i) / (1024**2) # in MB
291
+
292
+ gpu_stats += (
293
+ f"GPU {i} Name: {gpu_name}\n"
294
+ f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
295
+ f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
296
+ f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
297
+ )
298
+ elif hasattr(torch, "xpu") and torch.xpu.is_available():  # torch.xpu is absent on older PyTorch builds
299
+ gpu_count = torch.xpu.device_count()
300
+ for i in range(gpu_count):
301
+ gpu_name = torch.xpu.get_device_name(i)
302
+ gpu_properties = torch.xpu.get_device_properties(i)
303
+ total_memory = gpu_properties.total_memory / (1024**3) # in GB
304
+ allocated_memory = torch.xpu.memory_allocated(i) / (1024**2) # in MB
305
+ reserved_memory = torch.xpu.memory_reserved(i) / (1024**2) # in MB
306
+
307
+ gpu_stats += (
308
+ f"GPU {i} Name: {gpu_name}\n"
309
+ f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
310
+ f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
311
+ f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
312
+ )
313
+ elif torch.backends.mps.is_available():
314
+ gpu_count = 1
315
+ gpu_stats += "MPS GPU\n"
316
+ total_memory = psutil.virtual_memory().total / (
317
+ 1024**3
318
+ ) # Total system memory (MPS doesn't have its own memory)
319
+ allocated_memory = 0
320
+ reserved_memory = 0
321
+
322
+ gpu_stats += (
323
+ f"Total system memory: {total_memory:.2f} GB\n"
324
+ f"Allocated GPU memory (MPS): {allocated_memory:.2f} MB\n"
325
+ f"Reserved GPU memory (MPS): {reserved_memory:.2f} MB\n"
326
+ )
327
+
328
+ else:
329
+ gpu_stats = "No GPU available"
330
+
331
+ return gpu_stats
332
+
333
+
334
+ def get_cpu_stats():
335
+ """Get CPU statistics"""
336
+ cpu_usage = psutil.cpu_percent(interval=1)
337
+ memory_info = psutil.virtual_memory()
338
+ memory_used = memory_info.used / (1024**2)
339
+ memory_total = memory_info.total / (1024**2)
340
+ memory_percent = memory_info.percent
341
+
342
+ pid = os.getpid()
343
+ process = psutil.Process(pid)
344
+ nice_value = process.nice()
345
+
346
+ cpu_stats = (
347
+ f"CPU Usage: {cpu_usage:.2f}%\n"
348
+ f"System Memory: {memory_used:.2f} MB used / {memory_total:.2f} MB total ({memory_percent}% used)\n"
349
+ f"Process Priority (Nice value): {nice_value}"
350
+ )
351
+
352
+ return cpu_stats
353
+
354
+
355
+ def get_combined_stats():
356
+ """Get combined system stats"""
357
+ gpu_stats = get_gpu_stats()
358
+ cpu_stats = get_cpu_stats()
359
+ combined_stats = f"### GPU Stats\n{gpu_stats}\n\n### CPU Stats\n{cpu_stats}"
360
+ return combined_stats
361
+
362
+
363
+ # Create Gradio interface
364
+ with gr.Blocks(title="LEMAS-TTS Inference") as app:
365
+ gr.Markdown(
366
+ """
367
+ # Zero-Shot TTS
368
+
369
+ Set seed to -1 for random generation.
370
+ """
371
+ )
372
+ with gr.Accordion("Model configuration", open=False):
373
+ # Model configuration
374
+ with gr.Row():
375
+ exp_name = gr.Radio(
376
+ label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"], value="F5TTS_v1_Base", visible=False
377
+ )
378
+ # Project selection
379
+ available_projects = get_available_projects()
380
+
381
+ # Get initial checkpoints
382
+ list_checkpoints, checkpoint_select = get_checkpoints_project(available_projects[0] if available_projects else None, False)
383
+
384
+ with gr.Row():
385
+ with gr.Column(scale=1):
386
+ # load_models_btn = gr.Button(value="Load models")
387
+ cm_project = gr.Dropdown(
388
+ choices=available_projects,
389
+ value=available_projects[0] if available_projects else None,
390
+ label="Project",
391
+ allow_custom_value=True,
392
+ scale=4
393
+ )
394
+
395
+ with gr.Column(scale=5):
396
+ cm_checkpoint = gr.Dropdown(
397
+ choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True # scale=4,
398
+ )
399
+ bt_checkpoint_refresh = gr.Button("Refresh", scale=1)
400
+
401
+ with gr.Row():
402
+ ch_use_ema = gr.Checkbox(label="Use EMA", value=True, scale=2, info="Turn off at early stage might offer better results")
403
+ frontend = gr.Radio(label="Frontend", choices=["phone", "char", "bpe"], value="phone", scale=3)
404
+ separate_langs = gr.Checkbox(label="Separate Languages", value=True, scale=2, info="separate language tokens")
405
+
406
+ # Inference parameters
407
+ with gr.Row():
408
+ nfe_step = gr.Number(label="NFE Step", scale=1, value=64)
409
+ speed = gr.Slider(label="Speed", scale=3, value=1.0, minimum=0.5, maximum=1.5, step=0.1)
410
+ cfg_strength = gr.Slider(label="CFG Strength", scale=2, value=5.0, minimum=0.0, maximum=10.0, step=1)
411
+ sway_sampling_coef = gr.Slider(label="Sway Sampling Coef", scale=2, value=3, minimum=-1, maximum=5, step=0.1)
412
+ ref_ratio = gr.Slider(label="Ref Ratio", scale=2, value=1.0, minimum=0.0, maximum=1.0, step=0.1)
413
+ no_ref_audio = gr.Checkbox(label="No Reference Audio", value=False, scale=1, info="No mel condition")
414
+ use_acc_grl = gr.Checkbox(label="Use accent grl condition", value=False, scale=1, info="Use accent grl condition")
415
+ use_prosody_encoder = gr.Checkbox(label="Use prosody encoder", value=False, scale=1, info="Use prosody encoder")
416
+ seed = gr.Number(label="Random Seed", scale=1, value=5828684826493313192, minimum=-1)
417
+
418
+
419
+ # Input fields
420
+ ref_text = gr.Textbox(label="Reference Text", placeholder="Enter the text for the reference audio...")
421
+ ref_audio = gr.Audio(label="Reference Audio", type="filepath", interactive=True, show_download_button=True, editable=True)
422
+
423
+
424
+ with gr.Row():
425
+ denoise_btn = gr.Button(value="Denoise")
426
+ cancel_btn = gr.Button(value="Cancel Denoise")
427
+ denoise_audio = gr.Audio(label="Denoised Audio", value=None, type="filepath", interactive=True, show_download_button=True, editable=True)
428
+
429
+ gen_text = gr.Textbox(label="Text to Generate", placeholder="Enter the text you want to generate...")
430
+
431
+ # Inference button and outputs
432
+ with gr.Row():
433
+ txt_info_gpu = gr.Textbox("", label="Device Info")
434
+ seed_info = gr.Textbox(label="Used Random Seed")
435
+ check_button_infer = gr.Button("Generate Audio", variant="primary")
436
+
437
+ gen_audio = gr.Audio(label="Generated Audio", type="filepath", interactive=True, show_download_button=True, editable=True)
438
+
439
+ # Examples
440
+ examples = gr.Examples(
441
+ examples=[
442
+ [
443
+ "Ich glaub, mein Schwein pfeift.",
444
+ str(DATA_ROOT / "test_examples" / "de.wav"),
445
+ "我觉得我的猪在吹口哨。",
446
+ ],
447
+ [
448
+ "em, #1 I have a list of YouTubers, and I'm gonna be going to their houses and raiding them by.",
449
+ str(DATA_ROOT / "test_examples" / "en.wav"),
450
+ "我有一份 YouTuber 名单,我打算去他们家,对他们进行突袭。",
451
+ ],
452
+ [
453
+ "Te voy a dar un tip #1 que le copia a John Rockefeller, uno de los empresarios más picudos de la historia.",
454
+ str(DATA_ROOT / "test_examples" / "es.wav"),
455
+ "我要给你一个从历史上最精明的商人之一约翰·洛克菲勒那里抄来的秘诀。",
456
+ ],
457
+ [
458
+ "Per l'amor di Dio #1 fai, #2 se pensi di non poterti fermare, fallo #1 e fallo.",
459
+ str(DATA_ROOT / "test_examples" / "it.wav"),
460
+ "看在上帝的份上,去做吧,如果你认为你无法停止,那就去做吧,继续做下去。",
461
+ ],
462
+ [
463
+ "Nova, #1 dia 25 desse mês vai rolar operação the last Frontier.",
464
+ str(DATA_ROOT / "test_examples" / "pt.wav"),
465
+ "新消息,本月二十五日,'最后的边疆行动'将启动。",
466
+ ],
467
+ # ["Good morning! #1 ",
468
+ # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_0.wav",
469
+ # " #1"
470
+ # ],
471
+ # ["Good morning! #1 ",
472
+ # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_1.wav",
473
+ # " #1",
474
+ # ],
475
+ # ["Good morning! #1 ",
476
+ # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_2.wav",
477
+ # " #1",
478
+ # ],
479
+ # ["Oh, and in case I don't see ya, #1",
480
+ # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_3.wav",
481
+ # " #1",
482
+ # ],
483
+ # ["Good afternoon, good evening, and good night. #1",
484
+ # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_4.wav",
485
+ # " #1",
486
+ # ],
487
+ ],
488
+ inputs=[
489
+ ref_text,
490
+ ref_audio,
491
+ gen_text,
492
+ ],
493
+ outputs=[gen_audio, txt_info_gpu, seed_info],
494
+ fn=infer,
495
+ cache_examples=False
496
+ )
497
+
498
+ # System Info section at the bottom
499
+ gr.Markdown("---")
500
+ gr.Markdown("## System Information")
501
+ with gr.Accordion("Update System Stats", open=False):
502
+ update_button = gr.Button("Update System Stats", scale=1)
503
+ output_box = gr.Textbox(label="GPU and CPU Information", lines=5, scale=5)
504
+
505
+ def update_stats():
506
+ return get_combined_stats()
507
+
508
+
509
+ denoise_btn.click(fn=denoise,
510
+ inputs=[ref_audio],
511
+ outputs=[denoise_audio])
512
+
513
+ cancel_btn.click(fn=cancel_denoise,
514
+ inputs=[ref_audio],
515
+ outputs=[denoise_audio])
516
+
517
+ # Event handlers
518
+ check_button_infer.click(
519
+ fn=infer,
520
+ inputs=[
521
+ cm_project,
522
+ cm_checkpoint,
523
+ exp_name,
524
+ ref_text,
525
+ ref_audio,
526
+ denoise_audio,
527
+ gen_text,
528
+ nfe_step,
529
+ ch_use_ema,
530
+ separate_langs,
531
+ frontend,
532
+ speed,
533
+ cfg_strength,
534
+ use_acc_grl,
535
+ ref_ratio,
536
+ no_ref_audio,
537
+ sway_sampling_coef,
538
+ use_prosody_encoder,
539
+ seed,
540
+ ],
541
+ outputs=[gen_audio, txt_info_gpu, seed_info],
542
+ )
543
+
544
+ bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
545
+ cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
546
+
547
+ ref_audio.change(
548
+ fn=lambda x: None,
549
+ inputs=[ref_audio],
550
+ outputs=[denoise_audio]
551
+ )
552
+
553
+ update_button.click(fn=update_stats, outputs=output_box)
554
+
555
+ # Auto-load system stats on startup
556
+ app.load(fn=update_stats, outputs=output_box)
557
+
558
+
559
+ @click.command()
560
+ @click.option("--port", "-p", default=7860, type=int, help="Port to run the app on")
561
+ @click.option("--host", "-H", default="0.0.0.0", help="Host to run the app on")
562
+ @click.option(
563
+ "--share",
564
+ "-s",
565
+ default=False,
566
+ is_flag=True,
567
+ help="Share the app via Gradio share link",
568
+ )
569
+ @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
570
+ def main(port, host, share, api):
571
+ global app
572
+ print("Starting LEMAS-TTS Inference Interface...")
573
+ print(f"Device: {device}")
574
+ app.queue(api_open=api).launch(
575
+ server_name=host,
576
+ server_port=port,
577
+ share=share,
578
+ show_api=api,
579
+ allowed_paths=[str(DATA_ROOT)],
580
+ )
581
+
582
+
583
+ if __name__ == "__main__":
584
+ main()
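Once launched with --api enabled, the app exposes Gradio's auto-generated HTTP API; a hedged sketch of inspecting it remotely with gradio_client (host/port assume the defaults in main(); check the view_api() output before calling any endpoint, since endpoint names and argument order are not guaranteed here):

    from gradio_client import Client

    client = Client("http://localhost:7860")  # default host/port from main(), adjust as needed
    client.view_api()  # prints the generated endpoints and their signatures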
requirements.txt ADDED
@@ -0,0 +1,182 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+ accelerate>=0.33.0
3
+ aiofiles==23.2.1
4
+ aiohappyeyeballs==2.6.1
5
+ aiohttp==3.13.2
6
+ aiosignal==1.4.0
7
+ annotated-doc==0.0.4
8
+ annotated-types==0.7.0
9
+ antlr4-python3-runtime==4.9.3
10
+ anyio==4.12.0
11
+ attrs==25.4.0
12
+ audioread==3.1.0
13
+ babel==2.17.0
14
+ bitsandbytes>0.37.0; platform_machine != "arm64" and platform_system != "Darwin"
15
+ boto3==1.42.16
16
+ botocore==1.42.16
17
+ brotli==1.2.0
18
+ cached_path
19
+ cachetools==6.2.4
20
+ certifi==2025.11.12
21
+ cffi==2.0.0
22
+ charset-normalizer==3.4.4
23
+ click
24
+ contourpy==1.3.2
25
+ csvw==3.7.0
26
+ cycler==0.12.1
27
+ datasets
28
+ decorator==5.2.1
29
+ dill==0.4.0
30
+ dlinfo==2.0.0
31
+ docopt==0.6.2
32
+ einops==0.8.1
33
+ einx==0.3.0
34
+ ema-pytorch==0.7.3
35
+ encodec==0.1.1
36
+ espeakng==1.0.2
37
+ espeak_phonemizer==1.3.1
38
+ fastapi==0.127.0
39
+ ffmpy==1.0.0
40
+ filelock==3.20.1
41
+ fonttools==4.61.1
42
+ frozendict==2.4.7
43
+ frozenlist==1.8.0
44
+ fsspec==2025.10.0
45
+ gitdb==4.0.12
46
+ GitPython==3.1.45
47
+ google-api-core==2.28.1
48
+ google-auth==2.45.0
49
+ google-cloud-core==2.5.0
50
+ google-cloud-storage==3.7.0
51
+ google-crc32c==1.8.0
52
+ google-resumable-media==2.8.0
53
+ googleapis-common-protos==1.72.0
54
+ gradio==5.38.0
55
+ gradio-client==1.11.0
56
+ groovy==0.1.2
57
+ h11==0.16.0
58
+ hf-xet==1.2.0
59
+ httpcore==1.0.9
60
+ httpx==0.28.1
61
+ huggingface-hub==0.36.0
62
+ hydra-core>=1.3.0
63
+ idna==3.11
64
+ isodate==0.7.2
65
+ jieba
66
+ Jinja2==3.1.6
67
+ jmespath==1.0.1
68
+ joblib==1.5.3
69
+ jsonschema==4.25.1
70
+ jsonschema-specifications==2025.9.1
71
+ kiwisolver==1.4.9
72
+ langid==1.1.6
73
+ language-tags==1.2.0
74
+ lazy_loader==0.4
75
+ librosa
76
+ llvmlite==0.42.0
77
+ loguru==0.7.3
78
+ markdown-it-py==4.0.0
79
+ MarkupSafe
80
+ matplotlib
81
+ mdurl==0.1.2
82
+ mpmath==1.3.0
83
+ msgpack==1.1.2
84
+ multidict==6.7.0
85
+ multiprocess==0.70.18
86
+ networkx==3.1
87
+ num2words==0.5.13
88
+ numba==0.59.0
89
+ numpy==1.26.0
90
+ nvidia-cublas-cu12==12.1.3.1
91
+ nvidia-cuda-cupti-cu12==12.1.105
92
+ nvidia-cuda-nvrtc-cu12==12.1.105
93
+ nvidia-cuda-runtime-cu12==12.1.105
94
+ nvidia-cudnn-cu12==8.9.2.26
95
+ nvidia-cufft-cu12==11.0.2.54
96
+ nvidia-cufile-cu12==1.11.1.6
97
+ nvidia-curand-cu12==10.3.2.106
98
+ nvidia-cusolver-cu12==11.4.5.107
99
+ nvidia-cusparse-cu12==12.1.0.106
100
+ nvidia-cusparselt-cu12==0.6.3
101
+ nvidia-nccl-cu12==2.20.5
102
+ nvidia-nvjitlink-cu12==12.6.85
103
+ nvidia-nvtx-cu12==12.1.105
104
+ omegaconf==2.3.0
105
+ onnx==1.16.0
106
+ onnxruntime
107
+ onnxruntime-gpu
108
+ orjson==3.11.5
109
+ packaging==25.0
110
+ pandas==2.3.3
111
+ phonemizer==3.3.0
112
+ pillow==11.3.0
113
+ platformdirs==4.5.1
114
+ pooch==1.8.2
115
+ propcache==0.4.1
116
+ proto-plus==1.27.0
117
+ protobuf==6.33.2
118
+ psutil==7.2.0
119
+ pyarrow==22.0.0
120
+ pyasn1==0.6.1
121
+ pyasn1_modules==0.4.2
122
+ pycparser==2.23
123
+ pydantic<=2.10.6
124
+ pydantic_core==2.27.2
125
+ pydub
126
+ py-espeak-ng==0.1.8
127
+ Pygments==2.19.2
128
+ pyparsing==3.3.1
129
+ pypinyin
130
+ pypinyin-dict
131
+ python-dateutil==2.9.0.post0
132
+ python-multipart==0.0.21
133
+ pytz==2025.2
134
+ PyYAML==6.0.3
135
+ rdflib==7.5.0
136
+ referencing==0.37.0
137
+ regex
138
+ requests==2.32.5
139
+ rfc3986==1.5.0
140
+ rich==13.9.4
141
+ rpds-py==0.30.0
142
+ rsa==4.9.1
143
+ s3transfer==0.16.0
144
+ safehttpx==0.1.7
145
+ safetensors
146
+ scikit-learn==1.7.1
147
+ scipy==1.15.3
148
+ segments==2.3.0
149
+ semantic-version==2.10.0
150
+ sentry-sdk==2.48.0
151
+ setuptools==80.9.0
152
+ shellingham==1.5.4
153
+ six==1.17.0
154
+ smmap==5.0.2
155
+ soundfile
156
+ soxr==1.0.0
157
+ starlette==0.50.0
158
+ sympy==1.14.0
159
+ termcolor==3.2.0
160
+ threadpoolctl==3.6.0
161
+ tokenizers==0.22.1
162
+ tomli
163
+ tomlkit==0.13.3
164
+ torch==2.3.1
165
+ torchaudio==2.3.1
166
+ torchdiffeq==0.2.4
167
+ tqdm>=4.65.0
168
+ transformers
169
+ transformers-stream-generator
170
+ triton==2.3.1
171
+ typer==0.16.0
172
+ typing_extensions==4.12.2
173
+ tzdata==2025.3
174
+ uritemplate==4.2.0
175
+ urllib3==2.6.2
176
+ uroman
177
+ uvicorn==0.40.0
178
+ vocos
179
+ x-transformers>=1.31.14
180
+ xxhash==3.6.0
181
+ yarl==1.22.0
182
+ zhconv
uvr5/gui_data/constants.py ADDED
@@ -0,0 +1,1147 @@
1
+ import platform
2
+
3
+ #Platform Details
4
+ OPERATING_SYSTEM = platform.system()
5
+ SYSTEM_ARCH = platform.platform()
6
+ SYSTEM_PROC = platform.processor()
7
+ ARM = 'arm'
8
+
9
+ #Main Font
10
+ MAIN_FONT_NAME = "Century Gothic"
11
+
12
+ #Model Types
13
+ VR_ARCH_TYPE = 'VR Arc'
14
+ MDX_ARCH_TYPE = 'MDX-Net'
15
+ DEMUCS_ARCH_TYPE = 'Demucs'
16
+ VR_ARCH_PM = 'VR Architecture'
17
+ ENSEMBLE_MODE = 'Ensemble Mode'
18
+ ENSEMBLE_STEM_CHECK = 'Ensemble Stem'
19
+ SECONDARY_MODEL = 'Secondary Model'
20
+ DEMUCS_6_STEM_MODEL = 'htdemucs_6s'
21
+
22
+ DEMUCS_V3_ARCH_TYPE = 'Demucs v3'
23
+ DEMUCS_V4_ARCH_TYPE = 'Demucs v4'
24
+ DEMUCS_NEWER_ARCH_TYPES = [DEMUCS_V3_ARCH_TYPE, DEMUCS_V4_ARCH_TYPE]
25
+
26
+ DEMUCS_V1 = 'v1'
27
+ DEMUCS_V2 = 'v2'
28
+ DEMUCS_V3 = 'v3'
29
+ DEMUCS_V4 = 'v4'
30
+
31
+ DEMUCS_V1_TAG = 'v1 | '
32
+ DEMUCS_V2_TAG = 'v2 | '
33
+ DEMUCS_V3_TAG = 'v3 | '
34
+ DEMUCS_V4_TAG = 'v4 | '
35
+ DEMUCS_NEWER_TAGS = [DEMUCS_V3_TAG, DEMUCS_V4_TAG]
36
+
37
+ DEMUCS_VERSION_MAPPER = {
38
+ DEMUCS_V1:DEMUCS_V1_TAG,
39
+ DEMUCS_V2:DEMUCS_V2_TAG,
40
+ DEMUCS_V3:DEMUCS_V3_TAG,
41
+ DEMUCS_V4:DEMUCS_V4_TAG}
42
+
43
+ #Download Center
44
+ DOWNLOAD_FAILED = 'Download Failed'
45
+ DOWNLOAD_STOPPED = 'Download Stopped'
46
+ DOWNLOAD_COMPLETE = 'Download Complete'
47
+ DOWNLOAD_UPDATE_COMPLETE = 'Update Download Complete'
48
+ SETTINGS_MENU_EXIT = 'exit'
49
+ NO_CONNECTION = 'No Internet Connection'
50
+ VIP_SELECTION = 'VIP:'
51
+ DEVELOPER_SELECTION = 'VIP:'
52
+ NO_NEW_MODELS = 'All Available Models Downloaded'
53
+ ENSEMBLE_PARTITION = ': '
54
+ NO_MODEL = 'No Model Selected'
55
+ CHOOSE_MODEL = 'Choose Model'
56
+ SINGLE_DOWNLOAD = 'Downloading Item 1/1...'
57
+ DOWNLOADING_ITEM = 'Downloading Item'
58
+ FILE_EXISTS = 'File already exists!'
59
+ DOWNLOADING_UPDATE = 'Downloading Update...'
60
+ DOWNLOAD_MORE = 'Download More Models'
61
+
62
+ #Menu Options
63
+
64
+ AUTO_SELECT = 'Auto'
65
+
66
+ #LINKS
67
+ DOWNLOAD_CHECKS = "https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json"
68
+ MDX_MODEL_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/mdx_model_data/model_data.json"
69
+ VR_MODEL_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/vr_model_data/model_data.json"
70
+
71
+ DEMUCS_MODEL_NAME_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/demucs_model_data/model_name_mapper.json"
72
+ MDX_MODEL_NAME_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/mdx_model_data/model_name_mapper.json"
73
+
74
+ DONATE_LINK_BMAC = "https://www.buymeacoffee.com/uvr5"
75
+ DONATE_LINK_PATREON = "https://www.patreon.com/uvr"
76
+
77
+ #DOWNLOAD REPOS
78
+ NORMAL_REPO = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
79
+ UPDATE_REPO = "https://github.com/TRvlvr/model_repo/releases/download/uvr_update_patches/"
80
+
81
+ UPDATE_MAC_ARM_REPO = "https://github.com/Anjok07/ultimatevocalremovergui/releases/download/v5.5.0/Ultimate_Vocal_Remover_v5_5_MacOS_arm64.dmg"
82
+ UPDATE_MAC_X86_64_REPO = "https://github.com/Anjok07/ultimatevocalremovergui/releases/download/v5.5.0/Ultimate_Vocal_Remover_v5_5_MacOS_x86_64.dmg"
83
+ UPDATE_LINUX_REPO = "https://github.com/Anjok07/ultimatevocalremovergui#linux-installation"
84
+ UPDATE_REPO = "https://github.com/TRvlvr/model_repo/releases/download/uvr_update_patches/"
85
+
86
+ ISSUE_LINK = 'https://github.com/Anjok07/ultimatevocalremovergui/issues/new'
87
+ VIP_REPO = b'\xf3\xc2W\x19\x1foI)\xc2\xa9\xcc\xb67(Z\xf5',\
88
+ b'gAAAAABjQAIQ-NpNMMxMedpKHHb7ze_nqB05hw0YhbOy3pFzuzDrfqumn8_qvraxEoUpZC5ZXC0gGvfDxFMqyq9VWbYKlA67SUFI_wZB6QoVyGI581vs7kaGfUqlXHIdDS6tQ_U-BfjbEAK9EU_74-R2zXjz8Xzekw=='
89
+ NO_CODE = 'incorrect_code'
90
+
91
+ #Extensions
92
+
93
+ ONNX = '.onnx'
94
+ CKPT = '.ckpt'
95
+ YAML = '.yaml'
96
+ PTH = '.pth'
97
+ TH_EXT = '.th'
98
+ JSON = '.json'
99
+
100
+ #GUI Buttons
101
+
102
+ START_PROCESSING = 'Start Processing'
103
+ WAIT_PROCESSING = 'Please wait...'
104
+ STOP_PROCESSING = 'Halting process, please wait...'
105
+ LOADING_MODELS = 'Loading models...'
106
+
107
+ #---Messages and Logs----
108
+
109
+ MISSING_MODEL = 'missing'
110
+ MODEL_PRESENT = 'present'
111
+
112
+ UNRECOGNIZED_MODEL = 'Unrecognized Model Detected', ' is an unrecognized model.\n\n' + \
113
+ 'Would you like to select the correct parameters before continuing?'
114
+
115
+ STOP_PROCESS_CONFIRM = 'Confirmation', 'You are about to stop all active processes.\n\nAre you sure you wish to continue?'
116
+ NO_ENSEMBLE_SELECTED = 'No Models Selected', 'Please select ensemble and try again.'
117
+ PICKLE_CORRU = 'File Corrupted', 'Unable to load this ensemble.\n\n' + \
118
+ 'Would you like to remove this ensemble from your list?'
119
+ DELETE_ENS_ENTRY = 'Confirm Removal', 'Are you sure you want to remove this entry?'
120
+
121
+ ALL_STEMS = 'All Stems'
122
+ VOCAL_STEM = 'Vocals'
123
+ INST_STEM = 'Instrumental'
124
+ OTHER_STEM = 'Other'
125
+ BASS_STEM = 'Bass'
126
+ DRUM_STEM = 'Drums'
127
+ GUITAR_STEM = 'Guitar'
128
+ PIANO_STEM = 'Piano'
129
+ SYNTH_STEM = 'Synthesizer'
130
+ STRINGS_STEM = 'Strings'
131
+ WOODWINDS_STEM = 'Woodwinds'
132
+ BRASS_STEM = 'Brass'
133
+ WIND_INST_STEM = 'Wind Inst'
134
+ NO_OTHER_STEM = 'No Other'
135
+ NO_BASS_STEM = 'No Bass'
136
+ NO_DRUM_STEM = 'No Drums'
137
+ NO_GUITAR_STEM = 'No Guitar'
138
+ NO_PIANO_STEM = 'No Piano'
139
+ NO_SYNTH_STEM = 'No Synthesizer'
140
+ NO_STRINGS_STEM = 'No Strings'
141
+ NO_WOODWINDS_STEM = 'No Woodwinds'
142
+ NO_WIND_INST_STEM = 'No Wind Inst'
143
+ NO_BRASS_STEM = 'No Brass'
144
+ PRIMARY_STEM = 'Primary Stem'
145
+ SECONDARY_STEM = 'Secondary Stem'
146
+
147
+ #Other Constants
148
+ DEMUCS_2_SOURCE = ["instrumental", "vocals"]
149
+ DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"]
150
+
151
+ DEMUCS_2_SOURCE_MAPPER = {
152
+ INST_STEM: 0,
153
+ VOCAL_STEM: 1}
154
+
155
+ DEMUCS_4_SOURCE_MAPPER = {
156
+ BASS_STEM: 0,
157
+ DRUM_STEM: 1,
158
+ OTHER_STEM: 2,
159
+ VOCAL_STEM: 3}
160
+
161
+ DEMUCS_6_SOURCE_MAPPER = {
162
+ BASS_STEM: 0,
163
+ DRUM_STEM: 1,
164
+ OTHER_STEM: 2,
165
+ VOCAL_STEM: 3,
166
+ GUITAR_STEM:4,
167
+ PIANO_STEM:5}
168
+
169
+ DEMUCS_4_SOURCE_LIST = [BASS_STEM, DRUM_STEM, OTHER_STEM, VOCAL_STEM]
170
+ DEMUCS_6_SOURCE_LIST = [BASS_STEM, DRUM_STEM, OTHER_STEM, VOCAL_STEM, GUITAR_STEM, PIANO_STEM]
171
+
172
+ DEMUCS_UVR_MODEL = 'UVR_Model'
173
+
174
+ CHOOSE_STEM_PAIR = 'Choose Stem Pair'
175
+
176
+ STEM_SET_MENU = (VOCAL_STEM,
177
+ INST_STEM,
178
+ OTHER_STEM,
179
+ BASS_STEM,
180
+ DRUM_STEM,
181
+ GUITAR_STEM,
182
+ PIANO_STEM,
183
+ SYNTH_STEM,
184
+ STRINGS_STEM,
185
+ WOODWINDS_STEM,
186
+ BRASS_STEM,
187
+ WIND_INST_STEM,
188
+ NO_OTHER_STEM,
189
+ NO_BASS_STEM,
190
+ NO_DRUM_STEM,
191
+ NO_GUITAR_STEM,
192
+ NO_PIANO_STEM,
193
+ NO_SYNTH_STEM,
194
+ NO_STRINGS_STEM,
195
+ NO_WOODWINDS_STEM,
196
+ NO_BRASS_STEM,
197
+ NO_WIND_INST_STEM)
198
+
199
+ STEM_PAIR_MAPPER = {
200
+ VOCAL_STEM: INST_STEM,
201
+ INST_STEM: VOCAL_STEM,
202
+ OTHER_STEM: NO_OTHER_STEM,
203
+ BASS_STEM: NO_BASS_STEM,
204
+ DRUM_STEM: NO_DRUM_STEM,
205
+ GUITAR_STEM: NO_GUITAR_STEM,
206
+ PIANO_STEM: NO_PIANO_STEM,
207
+ SYNTH_STEM: NO_SYNTH_STEM,
208
+ STRINGS_STEM: NO_STRINGS_STEM,
209
+ WOODWINDS_STEM: NO_WOODWINDS_STEM,
210
+ BRASS_STEM: NO_BRASS_STEM,
211
+ WIND_INST_STEM: NO_WIND_INST_STEM,
212
+ NO_OTHER_STEM: OTHER_STEM,
213
+ NO_BASS_STEM: BASS_STEM,
214
+ NO_DRUM_STEM: DRUM_STEM,
215
+ NO_GUITAR_STEM: GUITAR_STEM,
216
+ NO_PIANO_STEM: PIANO_STEM,
217
+ NO_SYNTH_STEM: SYNTH_STEM,
218
+ NO_STRINGS_STEM: STRINGS_STEM,
219
+ NO_WOODWINDS_STEM: WOODWINDS_STEM,
220
+ NO_BRASS_STEM: BRASS_STEM,
221
+ NO_WIND_INST_STEM: WIND_INST_STEM,
222
+ PRIMARY_STEM: SECONDARY_STEM}
223
+
224
+ NON_ACCOM_STEMS = (
225
+ VOCAL_STEM,
226
+ OTHER_STEM,
227
+ BASS_STEM,
228
+ DRUM_STEM,
229
+ GUITAR_STEM,
230
+ PIANO_STEM,
231
+ SYNTH_STEM,
232
+ STRINGS_STEM,
233
+ WOODWINDS_STEM,
234
+ BRASS_STEM,
235
+ WIND_INST_STEM)
236
+
237
+ MDX_NET_FREQ_CUT = [VOCAL_STEM, INST_STEM]
238
+
239
+ DEMUCS_4_STEM_OPTIONS = (ALL_STEMS, VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM)
240
+ DEMUCS_6_STEM_OPTIONS = (ALL_STEMS, VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM)
241
+ DEMUCS_2_STEM_OPTIONS = (VOCAL_STEM, INST_STEM)
242
+ DEMUCS_4_STEM_CHECK = (OTHER_STEM, BASS_STEM, DRUM_STEM)
243
+
244
+ #Menu Dropdowns
245
+
246
+ VOCAL_PAIR = f'{VOCAL_STEM}/{INST_STEM}'
247
+ INST_PAIR = f'{INST_STEM}/{VOCAL_STEM}'
248
+ OTHER_PAIR = f'{OTHER_STEM}/{NO_OTHER_STEM}'
249
+ DRUM_PAIR = f'{DRUM_STEM}/{NO_DRUM_STEM}'
250
+ BASS_PAIR = f'{BASS_STEM}/{NO_BASS_STEM}'
251
+ FOUR_STEM_ENSEMBLE = '4 Stem Ensemble'
252
+
253
+ ENSEMBLE_MAIN_STEM = (CHOOSE_STEM_PAIR, VOCAL_PAIR, OTHER_PAIR, DRUM_PAIR, BASS_PAIR, FOUR_STEM_ENSEMBLE)
254
+
255
+ MIN_SPEC = 'Min Spec'
256
+ MAX_SPEC = 'Max Spec'
257
+ AUDIO_AVERAGE = 'Average'
258
+
259
+ MAX_MIN = f'{MAX_SPEC}/{MIN_SPEC}'
260
+ MAX_MAX = f'{MAX_SPEC}/{MAX_SPEC}'
261
+ MAX_AVE = f'{MAX_SPEC}/{AUDIO_AVERAGE}'
262
+ MIN_MAX = f'{MIN_SPEC}/{MAX_SPEC}'
263
+ MIN_MIX = f'{MIN_SPEC}/{MIN_SPEC}'
264
+ MIN_AVE = f'{MIN_SPEC}/{AUDIO_AVERAGE}'
265
+ AVE_MAX = f'{AUDIO_AVERAGE}/{MAX_SPEC}'
266
+ AVE_MIN = f'{AUDIO_AVERAGE}/{MIN_SPEC}'
267
+ AVE_AVE = f'{AUDIO_AVERAGE}/{AUDIO_AVERAGE}'
268
+
269
+ ENSEMBLE_TYPE = (MAX_MIN, MAX_MAX, MAX_AVE, MIN_MAX, MIN_MIX, MIN_AVE, AVE_MAX, AVE_MIN, AVE_AVE)
270
+ ENSEMBLE_TYPE_4_STEM = (MAX_SPEC, MIN_SPEC, AUDIO_AVERAGE)
271
+
272
+ BATCH_MODE = 'Batch Mode'
273
+ BETA_VERSION = 'BETA'
274
+ DEF_OPT = 'Default'
275
+
276
+ CHUNKS = (AUTO_SELECT, '1', '5', '10', '15', '20',
277
+ '25', '30', '35', '40', '45', '50',
278
+ '55', '60', '65', '70', '75', '80',
279
+ '85', '90', '95', 'Full')
280
+
281
+ BATCH_SIZE = (DEF_OPT, '2', '3', '4', '5',
282
+ '6', '7', '8', '9', '10')
283
+
284
+ VOL_COMPENSATION = (AUTO_SELECT, '1.035', '1.08')
285
+
286
+ MARGIN_SIZE = ('44100', '22050', '11025')
287
+
288
+ AUDIO_TOOLS = 'Audio Tools'
289
+
290
+ MANUAL_ENSEMBLE = 'Manual Ensemble'
291
+ TIME_STRETCH = 'Time Stretch'
292
+ CHANGE_PITCH = 'Change Pitch'
293
+ ALIGN_INPUTS = 'Align Inputs'
294
+
295
+ if OPERATING_SYSTEM == 'Windows' or OPERATING_SYSTEM == 'Darwin':
296
+ AUDIO_TOOL_OPTIONS = (MANUAL_ENSEMBLE, TIME_STRETCH, CHANGE_PITCH, ALIGN_INPUTS)
297
+ else:
298
+ AUDIO_TOOL_OPTIONS = (MANUAL_ENSEMBLE, ALIGN_INPUTS)
299
+
300
+ MANUAL_ENSEMBLE_OPTIONS = (MIN_SPEC, MAX_SPEC, AUDIO_AVERAGE)
301
+
302
+ PROCESS_METHODS = (VR_ARCH_PM, MDX_ARCH_TYPE, DEMUCS_ARCH_TYPE, ENSEMBLE_MODE, AUDIO_TOOLS)
303
+
304
+ DEMUCS_SEGMENTS = ('Default', '1', '5', '10', '15', '20',
305
+ '25', '30', '35', '40', '45', '50',
306
+ '55', '60', '65', '70', '75', '80',
307
+ '85', '90', '95', '100')
308
+
309
+ DEMUCS_SHIFTS = (0, 1, 2, 3, 4, 5,
310
+ 6, 7, 8, 9, 10, 11,
311
+ 12, 13, 14, 15, 16, 17,
312
+ 18, 19, 20)
313
+
314
+ DEMUCS_OVERLAP = (0.25, 0.50, 0.75, 0.99)
315
+
316
+ VR_AGGRESSION = (1, 2, 3, 4, 5,
317
+ 6, 7, 8, 9, 10, 11,
318
+ 12, 13, 14, 15, 16, 17,
319
+ 18, 19, 20)
320
+
321
+ VR_WINDOW = ('320', '512','1024')
322
+ VR_CROP = ('256', '512', '1024')
323
+ POST_PROCESSES_THREASHOLD_VALUES = ('0.1', '0.2', '0.3')
324
+
325
+ MDX_POP_PRO = ('MDX-NET_Noise_Profile_14_kHz', 'MDX-NET_Noise_Profile_17_kHz', 'MDX-NET_Noise_Profile_Full_Band')
326
+ MDX_POP_STEMS = ('Vocals', 'Instrumental', 'Other', 'Drums', 'Bass')
327
+ MDX_POP_NFFT = ('4096', '5120', '6144', '7680', '8192', '16384')
328
+ MDX_POP_DIMF = ('2048', '3072', '4096')
329
+
330
+ SAVE_ENSEMBLE = 'Save Ensemble'
331
+ CLEAR_ENSEMBLE = 'Clear Selection(s)'
332
+ MENU_SEPARATOR = 35*'•'
333
+ CHOOSE_ENSEMBLE_OPTION = 'Choose Option'
334
+
335
+ INVALID_ENTRY = 'Invalid Input, Please Try Again'
336
+ ENSEMBLE_INPUT_RULE = '1. Only letters, numbers, spaces, and dashes allowed.\n2. No dashes or spaces at the start or end of input.'
337
+
338
+ ENSEMBLE_OPTIONS = (SAVE_ENSEMBLE, CLEAR_ENSEMBLE)
339
+ ENSEMBLE_CHECK = 'ensemble check'
340
+
341
+ SELECT_SAVED_ENSEMBLE = 'Select Saved Ensemble'
342
+ SELECT_SAVED_SETTING = 'Select Saved Setting'
343
+ ENSEMBLE_OPTION = "Ensemble Customization Options"
344
+ MDX_OPTION = "Advanced MDX-Net Options"
345
+ DEMUCS_OPTION = "Advanced Demucs Options"
346
+ VR_OPTION = "Advanced VR Options"
347
+ HELP_OPTION = "Open Information Guide"
348
+ ERROR_OPTION = "Open Error Log"
349
+ VERIFY_BEGIN = 'Verifying file '
350
+ SAMPLE_BEGIN = 'Creating Sample '
351
+ MODEL_MISSING_CHECK = 'Model Missing:'
352
+
353
+ # Audio Player
354
+
355
+ PLAYING_SONG = ": Playing"
356
+ PAUSE_SONG = ": Paused"
357
+ STOP_SONG = ": Stopped"
358
+
359
+ SELECTED_VER = 'Selected'
360
+ DETECTED_VER = 'Detected'
361
+
362
+ SAMPLE_MODE_CHECKBOX = lambda v:f'Sample Mode ({v}s)'
363
+ REMOVED_FILES = lambda r, e:f'Audio Input Verification Report:\n\nRemoved Files:\n\n{r}\n\nError Details:\n\n{e}'
364
+ ADVANCED_SETTINGS = (ENSEMBLE_OPTION, MDX_OPTION, DEMUCS_OPTION, VR_OPTION, HELP_OPTION, ERROR_OPTION)
365
+
366
+ WAV = 'WAV'
367
+ FLAC = 'FLAC'
368
+ MP3 = 'MP3'
369
+
370
+ MP3_BIT_RATES = ('96k', '128k', '160k', '224k', '256k', '320k')
371
+ WAV_TYPE = ('PCM_U8', 'PCM_16', 'PCM_24', 'PCM_32', '32-bit Float', '64-bit Float')
372
+
373
+ SELECT_SAVED_SET = 'Choose Option'
374
+ SAVE_SETTINGS = 'Save Current Settings'
375
+ RESET_TO_DEFAULT = 'Reset to Default'
376
+ RESET_FULL_TO_DEFAULT = 'Reset to Default'
377
+ RESET_PM_TO_DEFAULT = 'Reset All Application Settings to Default'
378
+
379
+ SAVE_SET_OPTIONS = (SAVE_SETTINGS, RESET_TO_DEFAULT)
380
+
381
+ TIME_PITCH = ('1.0', '2.0', '3.0', '4.0')
382
+ TIME_TEXT = '_time_stretched'
383
+ PITCH_TEXT = '_pitch_shifted'
384
+
385
+ #RegEx Input Validation
386
+
387
+ REG_PITCH = r'^[-+]?(1[0]|[0-9]([.][0-9]*)?)$'
388
+ REG_TIME = r'^[+]?(1[0]|[0-9]([.][0-9]*)?)$'
389
+ REG_COMPENSATION = r'\b^(1[0]|[0-9]([.][0-9]*)?|Auto|None)$\b'
390
+ REG_THES_POSTPORCESS = r'\b^([0]([.][0-9]{0,6})?)$\b'
391
+ REG_CHUNKS = r'\b^(200|1[0-9][0-9]|[1-9][0-9]?|Auto|Full)$\b'
392
+ REG_CHUNKS_DEMUCS = r'\b^(200|1[0-9][0-9]|[1-9][0-9]?|Auto|Full)$\b'
393
+ REG_MARGIN = r'\b^[0-9]*$\b'
394
+ REG_SEGMENTS = r'\b^(200|1[0-9][0-9]|[1-9][0-9]?|Default)$\b'
395
+ REG_SAVE_INPUT = r'\b^([a-zA-Z0-9 -]{0,25})$\b'
396
+ REG_AGGRESSION = r'^[-+]?[0-9]\d*?$'
397
+ REG_WINDOW = r'\b^[0-9]{0,4}$\b'
398
+ REG_SHIFTS = r'\b^[0-9]*$\b'
399
+ REG_BATCHES = r'\b^([0-9]*?|Default)$\b'
400
+ REG_OVERLAP = r'\b^([0]([.][0-9]{0,6})?|None)$\b'
401
+
402
+ # Sub Menu
403
+
404
+ VR_ARCH_SETTING_LOAD = 'Load for VR Arch'
405
+ MDX_SETTING_LOAD = 'Load for MDX-Net'
406
+ DEMUCS_SETTING_LOAD = 'Load for Demucs'
407
+ ALL_ARCH_SETTING_LOAD = 'Load for Full Application'
408
+
409
+ # Mappers
410
+
411
+ DEFAULT_DATA = {
412
+
413
+ 'chosen_process_method': MDX_ARCH_TYPE,
414
+ 'vr_model': CHOOSE_MODEL,
415
+ 'aggression_setting': 10,
416
+ 'window_size': 512,
417
+ 'batch_size': 4,
418
+ 'crop_size': 256,
419
+ 'is_tta': False,
420
+ 'is_output_image': False,
421
+ 'is_post_process': False,
422
+ 'is_high_end_process': False,
423
+ 'post_process_threshold': 0.2,
424
+ 'vr_voc_inst_secondary_model': NO_MODEL,
425
+ 'vr_other_secondary_model': NO_MODEL,
426
+ 'vr_bass_secondary_model': NO_MODEL,
427
+ 'vr_drums_secondary_model': NO_MODEL,
428
+ 'vr_is_secondary_model_activate': False,
429
+ 'vr_voc_inst_secondary_model_scale': 0.9,
430
+ 'vr_other_secondary_model_scale': 0.7,
431
+ 'vr_bass_secondary_model_scale': 0.5,
432
+ 'vr_drums_secondary_model_scale': 0.5,
433
+ 'demucs_model': CHOOSE_MODEL,
434
+ 'demucs_stems': ALL_STEMS,
435
+ 'segment': DEMUCS_SEGMENTS[0],
436
+ 'overlap': DEMUCS_OVERLAP[0],
437
+ 'shifts': 2,
438
+ 'chunks_demucs': CHUNKS[0],
439
+ 'margin_demucs': 44100,
440
+ 'is_chunk_demucs': False,
441
+ 'is_chunk_mdxnet': False,
442
+ 'is_primary_stem_only_Demucs': False,
443
+ 'is_secondary_stem_only_Demucs': False,
444
+ 'is_split_mode': True,
445
+ 'is_demucs_combine_stems': True,
446
+ 'demucs_voc_inst_secondary_model': NO_MODEL,
447
+ 'demucs_other_secondary_model': NO_MODEL,
448
+ 'demucs_bass_secondary_model': NO_MODEL,
449
+ 'demucs_drums_secondary_model': NO_MODEL,
450
+ 'demucs_is_secondary_model_activate': False,
451
+ 'demucs_voc_inst_secondary_model_scale': 0.9,
452
+ 'demucs_other_secondary_model_scale': 0.7,
453
+ 'demucs_bass_secondary_model_scale': 0.5,
454
+ 'demucs_drums_secondary_model_scale': 0.5,
456
+ 'demucs_pre_proc_model': NO_MODEL,
457
+ 'is_demucs_pre_proc_model_activate': False,
458
+ 'is_demucs_pre_proc_model_inst_mix': False,
459
+ 'mdx_net_model': CHOOSE_MODEL,
460
+ 'chunks': CHUNKS[0],
461
+ 'margin': 44100,
462
+ 'compensate': AUTO_SELECT,
463
+ 'is_denoise': False,
464
+ 'is_invert_spec': False,
465
+ 'is_mixer_mode': False,
466
+ 'mdx_batch_size': DEF_OPT,
467
+ 'mdx_voc_inst_secondary_model': NO_MODEL,
468
+ 'mdx_other_secondary_model': NO_MODEL,
469
+ 'mdx_bass_secondary_model': NO_MODEL,
470
+ 'mdx_drums_secondary_model': NO_MODEL,
471
+ 'mdx_is_secondary_model_activate': False,
472
+ 'mdx_voc_inst_secondary_model_scale': 0.9,
473
+ 'mdx_other_secondary_model_scale': 0.7,
474
+ 'mdx_bass_secondary_model_scale': 0.5,
475
+ 'mdx_drums_secondary_model_scale': 0.5,
476
+ 'is_save_all_outputs_ensemble': True,
477
+ 'is_append_ensemble_name': False,
478
+ 'chosen_audio_tool': AUDIO_TOOL_OPTIONS[0],
479
+ 'choose_algorithm': MANUAL_ENSEMBLE_OPTIONS[0],
480
+ 'time_stretch_rate': 2.0,
481
+ 'pitch_rate': 2.0,
482
+ 'is_gpu_conversion': False,
483
+ 'is_primary_stem_only': False,
484
+ 'is_secondary_stem_only': False,
485
+ 'is_testing_audio': False,
486
+ 'is_add_model_name': False,
487
+ 'is_accept_any_input': False,
488
+ 'is_task_complete': False,
489
+ 'is_normalization': False,
490
+ 'is_create_model_folder': False,
491
+ 'mp3_bit_set': '320k',
492
+ 'save_format': WAV,
493
+ 'wav_type_set': 'PCM_16',
494
+ 'user_code': '',
495
+ 'export_path': '',
496
+ 'input_paths': [],
497
+ 'lastDir': None,
498
+ 'export_path': '',
499
+ 'model_hash_table': None,
500
+ 'help_hints_var': False,
501
+ 'model_sample_mode': False,
502
+ 'model_sample_mode_duration': 30
503
+ }
504
+
505
+ SETTING_CHECK = ('vr_model',
506
+ 'aggression_setting',
507
+ 'window_size',
508
+ 'batch_size',
509
+ 'crop_size',
510
+ 'is_tta',
511
+ 'is_output_image',
512
+ 'is_post_process',
513
+ 'is_high_end_process',
514
+ 'post_process_threshold',
515
+ 'vr_voc_inst_secondary_model',
516
+ 'vr_other_secondary_model',
517
+ 'vr_bass_secondary_model',
518
+ 'vr_drums_secondary_model',
519
+ 'vr_is_secondary_model_activate',
520
+ 'vr_voc_inst_secondary_model_scale',
521
+ 'vr_other_secondary_model_scale',
522
+ 'vr_bass_secondary_model_scale',
523
+ 'vr_drums_secondary_model_scale',
524
+ 'demucs_model',
525
+ 'segment',
526
+ 'overlap',
527
+ 'shifts',
528
+ 'chunks_demucs',
529
+ 'margin_demucs',
530
+ 'is_chunk_demucs',
531
+ 'is_primary_stem_only_Demucs',
532
+ 'is_secondary_stem_only_Demucs',
533
+ 'is_split_mode',
534
+ 'is_demucs_combine_stems',
535
+ 'demucs_voc_inst_secondary_model',
536
+ 'demucs_other_secondary_model',
537
+ 'demucs_bass_secondary_model',
538
+ 'demucs_drums_secondary_model',
539
+ 'demucs_is_secondary_model_activate',
540
+ 'demucs_voc_inst_secondary_model_scale',
541
+ 'demucs_other_secondary_model_scale',
542
+ 'demucs_bass_secondary_model_scale',
543
+ 'demucs_drums_secondary_model_scale',
544
+ 'demucs_stems',
545
+ 'mdx_net_model',
546
+ 'chunks',
547
+ 'margin',
548
+ 'compensate',
549
+ 'is_denoise',
550
+ 'is_invert_spec',
551
+ 'mdx_batch_size',
552
+ 'mdx_voc_inst_secondary_model',
553
+ 'mdx_other_secondary_model',
554
+ 'mdx_bass_secondary_model',
555
+ 'mdx_drums_secondary_model',
556
+ 'mdx_is_secondary_model_activate',
557
+ 'mdx_voc_inst_secondary_model_scale',
558
+ 'mdx_other_secondary_model_scale',
559
+ 'mdx_bass_secondary_model_scale',
560
+ 'mdx_drums_secondary_model_scale',
561
+ 'is_save_all_outputs_ensemble',
562
+ 'is_append_ensemble_name',
563
+ 'chosen_audio_tool',
564
+ 'choose_algorithm',
565
+ 'time_stretch_rate',
566
+ 'pitch_rate',
567
+ 'is_primary_stem_only',
568
+ 'is_secondary_stem_only',
569
+ 'is_testing_audio',
570
+ 'is_add_model_name',
571
+ "is_accept_any_input",
572
+ 'is_task_complete',
573
+ 'is_create_model_folder',
574
+ 'mp3_bit_set',
575
+ 'save_format',
576
+ 'wav_type_set',
577
+ 'user_code',
578
+ 'is_gpu_conversion',
579
+ 'is_normalization',
580
+ 'help_hints_var',
581
+ 'model_sample_mode',
582
+ 'model_sample_mode_duration')
583
+
584
+ # Message Box Text
585
+
586
+ INVALID_INPUT = 'Invalid Input', 'The input is invalid.\n\nPlease verify the input still exists or is valid and try again.'
587
+ INVALID_EXPORT = 'Invalid Export Directory', 'You have selected an invalid export directory.\n\nPlease make sure the selected directory still exists.'
588
+ INVALID_ENSEMBLE = 'Not Enough Models', 'You must select 2 or more models to run an ensemble.'
589
+ INVALID_MODEL = 'No Model Chosen', 'You must select a model to continue.'
590
+ MISSING_MODEL = 'Model Missing', 'The selected model is missing or not valid.'
591
+ ERROR_OCCURED = 'Error Occured', '\n\nWould you like to open the error log for more details?\n'
592
+
593
+ # GUI Text Constants
594
+
595
+ BACK_TO_MAIN_MENU = 'Back to Main Menu'
596
+
597
+ # Help Hint Text
598
+
599
+ INTERNAL_MODEL_ATT = 'Internal model attribute. \n\n ***Do not change this setting if you are unsure!***'
600
+ STOP_HELP = 'Halts any running processes. \n A pop-up window will ask the user to confirm the action.'
601
+ SETTINGS_HELP = 'Opens the main settings guide. This window includes the \"Download Center\"'
602
+ COMMAND_TEXT_HELP = 'Provides information on the progress of the current process.'
603
+ SAVE_CURRENT_SETTINGS_HELP = 'Allows the user to open any saved settings or save the current application settings.'
604
+ CHUNKS_HELP = ('For MDX-Net, all values use the same amount of resources. Using chunks is no longer recommended.\n\n' + \
605
+ '• This option is now only for output quality.\n' + \
606
+ '• Some tracks may fare better depending on the value.\n' + \
607
+ '• Some tracks may fare worse depending on the value.\n' + \
608
+ '• Larger chunk sizes will take less time to process.\n' +\
609
+ '• Smaller chunk sizes will take more time to process.\n')
610
+ CHUNKS_DEMUCS_HELP = ('This option allows the user to reduce (or increase) RAM or V-RAM usage.\n\n' + \
611
+ '• Smaller chunk sizes use less RAM or V-RAM but can also increase processing times.\n' + \
612
+ '• Larger chunk sizes use more RAM or V-RAM but can also reduce processing times.\n' + \
613
+ '• Selecting \"Auto\" calculates an appropriate chuck size based on how much RAM or V-RAM your system has.\n' + \
614
+ '• Selecting \"Full\" will process the track as one whole chunk. (not recommended)\n' + \
615
+ '• The default selection is \"Auto\".')
616
+ MARGIN_HELP = 'Selects the frequency margins to slice the chunks from.\n\n• The recommended margin size is 44100.\n• Other values can give unpredictable results.'
617
+ AGGRESSION_SETTING_HELP = ('This option allows you to set how strong the primary stem extraction will be.\n\n' + \
618
+ '• The range is 0-100.\n' + \
619
+ '• Higher values perform deeper extractions.\n' + \
620
+ '• The default is 10 for instrumental & vocal models.\n' + \
621
+ '• Values over 10 can result in muddy-sounding instrumentals for the non-vocal models')
622
+ WINDOW_SIZE_HELP = ('The smaller your window size, the better your conversions will be. \nHowever, a smaller window means longer conversion times and heavier resource usage.\n\n' + \
623
+ 'Breakdown of the selectable window size values:\n' + \
624
+ '• 1024 - Low conversion quality, shortest conversion time, low resource usage.\n' + \
625
+ '• 512 - Average conversion quality, average conversion time, normal resource usage.\n' + \
626
+ '• 320 - Better conversion quality.')
627
+ DEMUCS_STEMS_HELP = ('Here, you can choose which stem to extract using the selected model.\n\n' +\
628
+ 'Stem Selections:\n\n' +\
629
+ '• All Stems - Saves all of the stems the model is able to extract.\n' +\
630
+ '• Vocals - Pulls vocal stem only.\n' +\
631
+ '• Other - Pulls other stem only.\n' +\
632
+ '• Bass - Pulls bass stem only.\n' +\
633
+ '• Drums - Pulls drum stem only.\n')
634
+ SEGMENT_HELP = ('This option allows the user to reduce (or increase) RAM or V-RAM usage.\n\n' + \
635
+ '• Smaller segment sizes use less RAM or V-RAM but can also increase processing times.\n' + \
636
+ '• Larger segment sizes use more RAM or V-RAM but can also reduce processing times.\n' + \
637
+ '• Selecting \"Default\" uses the recommended segment size.\n' + \
638
+ '• It is recommended that you not use segments with \"Chunking\".')
639
+ ENSEMBLE_MAIN_STEM_HELP = 'Allows the user to select the type of stems they wish to ensemble.\n\nOptions:\n\n' +\
640
+ f'• {VOCAL_PAIR} - The primary stem will be the vocals and the secondary stem will be the instrumental\n' +\
641
+ f'• {OTHER_PAIR} - The primary stem will be other and the secondary stem will be no other (the mixture without the \'other\' stem)\n' +\
642
+ f'• {BASS_PAIR} - The primary stem will be bass and the secondary stem will be no bass (the mixture without the \'bass\' stem)\n' +\
643
+ f'• {DRUM_PAIR} - The primary stem will be drums and the secondary stem will be no drums (the mixture without the \'drums\' stem)\n' +\
644
+ f'• {FOUR_STEM_ENSEMBLE} - This option will gather all the 4 stem Demucs models and ensemble all of the outputs.\n'
645
+ ENSEMBLE_TYPE_HELP = 'Allows the user to select the ensemble algorithm to be used to generate the final output.\n\nExample & Other Note:\n\n' +\
646
+ f'• {MAX_MIN} - If this option is chosen, the primary stem outputs will be processed through \nthe \'Max Spec\' algorithm, and the secondary stem will be processed through the \'Min Spec\' algorithm.\n' +\
647
+ f'• Only a single algorithm will be shown when the \'4 Stem Ensemble\' option is chosen.\n\nAlgorithm Details:\n\n' +\
648
+ f'• {MAX_SPEC} - This algorithm combines the final results and generates the highest possible output from them.\nFor example, if this algorithm were processing vocal stems, you would get the fullest possible \n' +\
649
+ 'result, making the ensembled vocal stem sound cleaner. However, it might introduce more unwanted artifacts.\n' +\
650
+ f'• {MIN_SPEC} - This algorithm combines the results and generates the lowest possible output from them.\nFor example, if this algorithm were processing instrumental stems, you would get the cleanest possible \n' +\
651
+ 'result, eliminating more unwanted artifacts. However, the result might also sound \'muddy\' and lack a fuller sound.\n' +\
652
+ f'• {AUDIO_AVERAGE} - This algorithm simply combines the results and averages all of them together. \n'
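As a rough illustration of the three algorithms (a hedged sketch, not the application's implementation), they reduce aligned magnitude spectrograms elementwise:

    import numpy as np

    def ensemble_sketch(mags: list, algorithm: str):  # mags: list of (freq, time) arrays
        stack = np.stack(mags)
        if algorithm == 'Max Spec':
            return stack.max(axis=0)   # fullest output, may carry more artifacts
        if algorithm == 'Min Spec':
            return stack.min(axis=0)   # cleanest output, can sound 'muddy'
        return stack.mean(axis=0)      # 'Average'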
653
+ ENSEMBLE_LISTBOX_HELP = 'List of all the models available for the selected main stem pair.'
654
+ IS_GPU_CONVERSION_HELP = ('When checked, the application will attempt to use your GPU (if you have one).\n' +\
655
+ 'If you do not have a GPU but have this checked, the application will default to your CPU.\n\n' +\
656
+ 'Note: CPU conversions are much slower than those processed through the GPU.')
657
+ SAVE_STEM_ONLY_HELP = 'Allows the user to save only the selected stem.'
658
+ IS_NORMALIZATION_HELP = 'Normalizes output to prevent clipping.'
659
+ CROP_SIZE_HELP = '**Only compatible with select models!**\n\n The setting should match the training crop-size value. Leave as is if unsure.'
660
+ IS_TTA_HELP = ('This option performs Test-Time-Augmentation to improve the separation quality.\n\n' +\
661
+ 'Note: Having this selected will increase the time it takes to complete a conversion')
662
+ IS_POST_PROCESS_HELP = ('This option can potentially identify leftover instrumental artifacts within the vocal outputs. \nThis option may improve the separation of some songs.\n\n' +\
663
+ 'Note: Selecting this option can adversely affect the conversion process, depending on the track. Because of this, it is only recommended as a last resort.')
664
+ IS_HIGH_END_PROCESS_HELP = 'The application will mirror the missing frequency range of the output.'
665
+ SHIFTS_HELP = ('Performs multiple predictions with random shifts of the input and averages them.\n\n' +\
666
+ '• The higher the number of shifts, the longer the prediction will take. \n- Not recommended unless you have a GPU.')
667
+ OVERLAP_HELP = 'This option controls the amount of overlap between prediction windows (for Demucs one window is 10 seconds)'
668
+ IS_CHUNK_DEMUCS_HELP = '• Enables \"Chunks\".\n• We recommend you not enable this option with \"Split Mode\" enabled or with the Demucs v4 Models.'
669
+ IS_CHUNK_MDX_NET_HELP = '• Enables \"Chunks\".\n• Using this option for MDX-Net no longer affects RAM usage.\n• Having this enabled will affect output quality, for better or worse depending on the set value.'
670
+ IS_SPLIT_MODE_HELP = ('• Enables \"Segments\". \n• We recommend you not enable this option with \"Enable Chunks\".\n' +\
671
+ '• Deselecting this option is only recommended for those with powerful PCs or if using \"Chunk\" mode instead.')
672
+ IS_DEMUCS_COMBINE_STEMS_HELP = 'The application will create the secondary stem by combining the remaining stems \ninstead of inverting the primary stem with the mixture.'
673
+ COMPENSATE_HELP = 'Compensates the audio of the primary stems to allow for a better secondary stem.'
674
+ IS_DENOISE_HELP = '• This option removes a majority of the noise generated by the MDX-Net models.\n• The conversion will take nearly twice as long with this enabled.'
675
+ CLEAR_CACHE_HELP = 'Clears any user selected model settings for previously unrecognized models.'
676
+ IS_SAVE_ALL_OUTPUTS_ENSEMBLE_HELP = 'Enabling this option will keep all individual outputs generated by an ensemble.'
677
+ IS_APPEND_ENSEMBLE_NAME_HELP = 'The application will append the ensemble name to the final output \nwhen this option is enabled.'
678
+ DONATE_HELP = 'Takes the user to an external website to donate to this project!'
679
+ IS_INVERT_SPEC_HELP = '• This option may produce a better secondary stem.\n• Inverts the primary stem with the mixture using spectrograms instead of waveforms.\n• This inversion method is slightly slower.'
680
+ IS_MIXER_MODE_HELP = '• This option may improve separations for outputs from 4-stem models.\n• Might produce more noise.\n• This option might slow down separation time.'
681
+ IS_TESTING_AUDIO_HELP = 'Appends a unique 10 digit number to output files so the user \ncan compare results with different settings.'
682
+ IS_MODEL_TESTING_AUDIO_HELP = 'Appends the model name to output files so the user \ncan compare results with different settings.'
683
+ IS_ACCEPT_ANY_INPUT_HELP = 'The application will accept any input when enabled, even if it does not have an audio format extension.\n\nThis is for experimental purposes, and having it enabled is not recommended.'
684
+ IS_TASK_COMPLETE_HELP = 'When enabled, chimes will be heard when a process completes or fails.'
685
+ IS_CREATE_MODEL_FOLDER_HELP = 'Two new directories will be generated for the outputs in \nthe export directory after each conversion.\n\n' +\
686
+ '• First directory - Named after the model.\n' +\
687
+ '• Second directory - Named after the track.\n\n' +\
688
+ '• Example: \n\n' +\
689
+ '─ Export Directory\n' +\
690
+ ' └── First Directory\n' +\
691
+ ' └── Second Directory\n' +\
692
+ ' └── Output File(s)'
693
+ DELETE_YOUR_SETTINGS_HELP = 'This menu contains your saved settings. You will be asked to \nconfirm if you wish to delete the selected setting.'
694
+ SET_STEM_NAME_HELP = 'Choose the primary stem for the selected model.'
695
+ MDX_DIM_T_SET_HELP = INTERNAL_MODEL_ATT
696
+ MDX_DIM_F_SET_HELP = INTERNAL_MODEL_ATT
697
+ MDX_N_FFT_SCALE_SET_HELP = 'Set the N_FFT size the model was trained with.'
698
+ POPUP_COMPENSATE_HELP = f'Choose the appropriate volume compensation for the selected model.\n\nReminder: {COMPENSATE_HELP}'
699
+ VR_MODEL_PARAM_HELP = 'Choose the parameters needed to run the selected model.'
700
+ CHOSEN_ENSEMBLE_HELP = 'Select a saved ensemble or save the current ensemble.\n\nDefault Selections:\n\n• Save the current ensemble.\n• Clears all current model selections.'
701
+ CHOSEN_PROCESS_METHOD_HELP = 'Here, you choose between different AI networks and algorithms to process your track.\n\n' +\
702
+ 'There are five options:\n\n' +\
703
+ '• VR Architecture - These models use magnitude spectrograms for Source Separation.\n' +\
704
+ '• MDX-Net - These models use Hybrid Spectrogram/Waveform for Source Separation.\n' +\
705
+ '• Demucs v3 - These models use Hybrid Spectrogram/Waveform for Source Separation.\n' +\
706
+ '• Ensemble Mode - Here, you can get the best results from multiple models and networks.\n' +\
707
+ '• Audio Tools - These are additional tools for added convenience.'
708
+ INPUT_FOLDER_ENTRY_HELP = 'Select Input:\n\nHere is where you select the audio file(s) you wish to process.'
709
+ INPUT_FOLDER_ENTRY_HELP_2 = 'Input Option Menu:\n\nClick here to access the input option menu.'
710
+ OUTPUT_FOLDER_ENTRY_HELP = 'Select Output:\n\nHere is where you select the directory where your processed files are to be saved.'
711
+ INPUT_FOLDER_BUTTON_HELP = 'Open Input Folder Button: \n\nOpens the directory containing the selected input audio file(s).'
712
+ OUTPUT_FOLDER_BUTTON_HELP = 'Open Output Folder Button: \n\nOpens the selected output folder.'
713
+ CHOOSE_MODEL_HELP = 'Each process method comes with its own set of options and models.\n\nHere is where you choose the model associated with the selected process method.'
714
+ FORMAT_SETTING_HELP = 'Save outputs as '
715
+ SECONDARY_MODEL_ACTIVATE_HELP = 'When enabled, the application will run an additional inference with the selected model(s) above.'
716
+ SECONDARY_MODEL_HELP = 'Choose the secondary model associated with this stem you wish to run with the current process method.'
717
+ SECONDARY_MODEL_SCALE_HELP = 'The scale determines how the final audio outputs will be averaged between the primary and secondary models.\n\nFor example:\n\n' +\
718
+ '• 10% - 10 percent of the main model result will be factored into the final result.\n' +\
719
+ '• 50% - The results from the main and secondary models will be averaged evenly.\n' +\
720
+ '• 90% - 90 percent of the main model result will be factored into the final result.'
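In other words, the scale acts as a linear-interpolation weight between the two model outputs; a minimal sketch, assuming two equal-length arrays:

    def blend(primary, secondary, scale):
        # scale = 0.9 keeps 90% of the primary model's result
        return scale * primary + (1 - scale) * secondary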
721
+ PRE_PROC_MODEL_ACTIVATE_HELP = 'The application will run an inference with the selected model above, pulling only the instrumental stem when enabled. \nFrom there, all of the non-vocal stems will be pulled from the generated instrumental.\n\nNotes:\n\n' +\
722
+ '• This option can significantly reduce vocal bleed within the non-vocal stems.\n' +\
723
+ '• It is only available in Demucs.\n' +\
724
+ '• It is only compatible with non-vocal and non-instrumental stem outputs.\n' +\
725
+ '• This will increase the total processing time.\n' +\
726
+ '• Only VR and MDX-Net Vocal or Instrumental models are selectable above.'
727
+
728
+ AUDIO_TOOLS_HELP = 'Here, you choose between different audio tools to process your track.\n\n' +\
729
+ '• Manual Ensemble - You must have 2 or more files selected as your inputs. Allows the user to run their tracks through \nthe same algorithms used in Ensemble Mode.\n' +\
730
+ '• Align Inputs - You must have exactly 2 files selected as your inputs. The second input will be aligned with the first input.\n' +\
731
+ '• Time Stretch - The user can speed up or slow down the selected inputs.\n' +\
732
+ '• Change Pitch - The user can change the pitch for the selected inputs.\n'
733
+ PRE_PROC_MODEL_INST_MIX_HELP = 'When enabled, the application will generate a third output without the selected stem and vocals.'
734
+ MODEL_SAMPLE_MODE_HELP = 'Allows the user to process only part of a track to sample settings or a model without \nrunning a full conversion.\n\nNotes:\n\n' +\
735
+ '• The number in the parentheses is the current number of seconds the generated sample will be.\n' +\
736
+ '• You can choose the number of seconds to extract from the track in the \"Additional Settings\" menu.'
737
+
738
+ POST_PROCESS_THREASHOLD_HELP = 'Allows the user to control the intensity of the \"Post Process\" option.\n\nNotes:\n\n' +\
739
+ '• Higher values potentially remove more artifacts. However, bleed might increase.\n' +\
740
+ '• Lower values limit artifact removal.'
741
+
742
+ BATCH_SIZE_HELP = 'Specify the number of batches to be processed at a time.\n\nNotes:\n\n' +\
743
+ '• Higher values mean more RAM usage but slightly faster processing times.\n' +\
744
+ '• Lower values mean less RAM usage but slightly longer processing times.\n' +\
745
+ '• Batch size value has no effect on output quality.'
746
+
747
+ # Warning Messages
748
+
749
+ STORAGE_ERROR = 'Insufficient Storage', 'There is not enough storage on the main drive to continue. Your main drive must have at least 3 GB of storage in order for this application to function properly. \n\nPlease ensure your main drive has at least 3 GB of storage and try again.\n\n'
750
+ STORAGE_WARNING = 'Available Storage Low', 'Your main drive is running low on storage. Your main drive must have at least 3 GB of storage in order for this application to function properly.\n\n'
751
+ CONFIRM_WARNING = '\nAre you sure you wish to continue?'
752
+ PROCESS_FAILED = 'Process failed, please see error log\n'
753
+ EXIT_PROCESS_ERROR = 'Active Process', 'Please stop the active process or wait for it to complete before you exit.'
754
+ EXIT_HALTED_PROCESS_ERROR = 'Halting Process', 'Please wait for the application to finish halting the process before exiting.'
755
+ EXIT_DOWNLOAD_ERROR = 'Active Download', 'Please stop the download or wait for it to complete before you exit.'
756
+ SET_TO_DEFAULT_PROCESS_ERROR = 'Active Process', 'You cannot reset all of the application settings during an active process.'
757
+ SET_TO_ANY_PROCESS_ERROR = 'Active Process', 'You cannot reset the application settings during an active process.'
758
+ RESET_ALL_TO_DEFAULT_WARNING = 'Reset Settings Confirmation', 'All application settings will be set to factory default.\n\nAre you sure you wish to continue?'
759
+ AUDIO_VERIFICATION_CHECK = lambda i, e:f'++++++++++++++++++++++++++++++++++++++++++++++++++++\n\nBroken File Removed: \n\n{i}\n\nError Details:\n\n{e}\n++++++++++++++++++++++++++++++++++++++++++++++++++++'
760
+ INVALID_ONNX_MODEL_ERROR = 'Invalid Model', 'The file selected is not a valid MDX-Net model. Please see the error log for more information.'
761
+
762
+
763
+ # Separation Text
764
+
765
+ LOADING_MODEL = 'Loading model...'
766
+ INFERENCE_STEP_1 = 'Running inference...'
767
+ INFERENCE_STEP_1_SEC = 'Running inference (secondary model)...'
768
+ INFERENCE_STEP_1_4_STEM = lambda stem:f'Running inference (secondary model for {stem})...'
769
+ INFERENCE_STEP_1_PRE = 'Running inference (pre-process model)...'
770
+ INFERENCE_STEP_2_PRE = lambda pm, m:f'Loading pre-process model ({pm}: {m})...'
771
+ INFERENCE_STEP_2_SEC = lambda pm, m:f'Loading secondary model ({pm}: {m})...'
772
+ INFERENCE_STEP_2_SEC_CACHED_MODOEL = lambda pm, m:f'Secondary model ({pm}: {m}) cache loaded.\n'
773
+ INFERENCE_STEP_2_PRE_CACHED_MODOEL = lambda pm, m:f'Pre-process model ({pm}: {m}) cache loaded.\n'
774
+ INFERENCE_STEP_2_SEC_CACHED = 'Loading cached secondary model source(s)... Done!\n'
775
+ INFERENCE_STEP_2_PRIMARY_CACHED = 'Model cache loaded.\n'
776
+ INFERENCE_STEP_2 = 'Inference complete.'
777
+ SAVING_STEM = 'Saving ', ' stem...'
778
+ SAVING_ALL_STEMS = 'Saving all stems...'
779
+ ENSEMBLING_OUTPUTS = 'Ensembling outputs...'
780
+ DONE = ' Done!\n'
781
+ ENSEMBLES_SAVED = 'Ensembled outputs saved!\n\n'
782
+ NEW_LINES = "\n\n"
783
+ NEW_LINE = "\n"
784
+ NO_LINE = ''
785
+
786
+ # Widget Placements
787
+
788
+ MAIN_ROW_Y = -15, -17
789
+ MAIN_ROW_X = -4, 21
790
+ MAIN_ROW_WIDTH = -53
791
+ MAIN_ROW_2_Y = -15, -17
792
+ MAIN_ROW_2_X = -28, 1
793
+ CHECK_BOX_Y = 0
794
+ CHECK_BOX_X = 20
795
+ CHECK_BOX_WIDTH = -50
796
+ CHECK_BOX_HEIGHT = 2
797
+ LEFT_ROW_WIDTH = -10
798
+ LABEL_HEIGHT = -5
799
+ OPTION_HEIGHT = 7
800
+ LOW_MENU_Y = 18, 16
801
+ FFMPEG_EXT = (".aac", ".aiff", ".alac" ,".flac", ".FLAC", ".mov", ".mp4", ".MP4",
802
+ ".m4a", ".M4A", ".mp2", ".mp3", "MP3", ".mpc", ".mpc8",
803
+ ".mpeg", ".ogg", ".OGG", ".tta", ".wav", ".wave", ".WAV", ".WAVE", ".wma", ".webm", ".eac3", ".mkv")
804
+
805
+ FFMPEG_MORE_EXT = (".aa", ".aac", ".ac3", ".aiff", ".alac", ".avi", ".f4v",".flac", ".flic", ".flv",
806
+ ".m4v",".mlv", ".mov", ".mp4", ".m4a", ".mp2", ".mp3", ".mp4", ".mpc", ".mpc8",
807
+ ".mpeg", ".ogg", ".tta", ".tty", ".vcd", ".wav", ".wma")
808
+ ANY_EXT = ""
809
+
810
+ # Secondary Menu Constants
811
+
812
+ VOCAL_PAIR_PLACEMENT = 1, 2, 3, 4
813
+ OTHER_PAIR_PLACEMENT = 5, 6, 7, 8
814
+ BASS_PAIR_PLACEMENT = 9, 10, 11, 12
815
+ DRUMS_PAIR_PLACEMENT = 13, 14, 15, 16
816
+
817
+ # Drag n Drop String Checks
818
+
819
+ DOUBLE_BRACKET = "} {"
820
+ RIGHT_BRACKET = "}"
821
+ LEFT_BRACKET = "{"
822
+
823
+ # Manual Downloads
824
+
825
+ VR_PLACEMENT_TEXT = 'Place models in \"models/VR_Models\" directory.'
826
+ MDX_PLACEMENT_TEXT = 'Place models in \"models/MDX_Net_Models\" directory.'
827
+ DEMUCS_PLACEMENT_TEXT = 'Place models in \"models/Demucs_Models\" directory.'
828
+ DEMUCS_V3_V4_PLACEMENT_TEXT = 'Place items in \"models/Demucs_Models/v3_v4_repo\" directory.'
829
+
830
+ FULL_DOWNLOAD_LIST_VR = {
831
+ "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth",
832
+ "VR Arch Single Model v5: 2_HP-UVR": "2_HP-UVR.pth",
833
+ "VR Arch Single Model v5: 3_HP-Vocal-UVR": "3_HP-Vocal-UVR.pth",
834
+ "VR Arch Single Model v5: 4_HP-Vocal-UVR": "4_HP-Vocal-UVR.pth",
835
+ "VR Arch Single Model v5: 5_HP-Karaoke-UVR": "5_HP-Karaoke-UVR.pth",
836
+ "VR Arch Single Model v5: 6_HP-Karaoke-UVR": "6_HP-Karaoke-UVR.pth",
837
+ "VR Arch Single Model v5: 7_HP2-UVR": "7_HP2-UVR.pth",
838
+ "VR Arch Single Model v5: 8_HP2-UVR": "8_HP2-UVR.pth",
839
+ "VR Arch Single Model v5: 9_HP2-UVR": "9_HP2-UVR.pth",
840
+ "VR Arch Single Model v5: 10_SP-UVR-2B-32000-1": "10_SP-UVR-2B-32000-1.pth",
841
+ "VR Arch Single Model v5: 11_SP-UVR-2B-32000-2": "11_SP-UVR-2B-32000-2.pth",
842
+ "VR Arch Single Model v5: 12_SP-UVR-3B-44100": "12_SP-UVR-3B-44100.pth",
843
+ "VR Arch Single Model v5: 13_SP-UVR-4B-44100-1": "13_SP-UVR-4B-44100-1.pth",
844
+ "VR Arch Single Model v5: 14_SP-UVR-4B-44100-2": "14_SP-UVR-4B-44100-2.pth",
845
+ "VR Arch Single Model v5: 15_SP-UVR-MID-44100-1": "15_SP-UVR-MID-44100-1.pth",
846
+ "VR Arch Single Model v5: 16_SP-UVR-MID-44100-2": "16_SP-UVR-MID-44100-2.pth",
847
+ "VR Arch Single Model v4: MGM_HIGHEND_v4": "MGM_HIGHEND_v4.pth",
848
+ "VR Arch Single Model v4: MGM_LOWEND_A_v4": "MGM_LOWEND_A_v4.pth",
849
+ "VR Arch Single Model v4: MGM_LOWEND_B_v4": "MGM_LOWEND_B_v4.pth",
850
+ "VR Arch Single Model v4: MGM_MAIN_v4": "MGM_MAIN_v4.pth"
851
+ }
852
+
853
+ FULL_DOWNLOAD_LIST_MDX = {
854
+ "MDX-Net Model: UVR-MDX-NET Main": "UVR_MDXNET_Main.onnx",
855
+ "MDX-Net Model: UVR-MDX-NET Inst Main": "UVR-MDX-NET-Inst_Main.onnx",
856
+ "MDX-Net Model: UVR-MDX-NET 1": "UVR_MDXNET_1_9703.onnx",
857
+ "MDX-Net Model: UVR-MDX-NET 2": "UVR_MDXNET_2_9682.onnx",
858
+ "MDX-Net Model: UVR-MDX-NET 3": "UVR_MDXNET_3_9662.onnx",
859
+ "MDX-Net Model: UVR-MDX-NET Inst 1": "UVR-MDX-NET-Inst_1.onnx",
860
+ "MDX-Net Model: UVR-MDX-NET Inst 2": "UVR-MDX-NET-Inst_2.onnx",
861
+ "MDX-Net Model: UVR-MDX-NET Inst 3": "UVR-MDX-NET-Inst_3.onnx",
862
+ "MDX-Net Model: UVR-MDX-NET Karaoke": "UVR_MDXNET_KARA.onnx",
863
+ "MDX-Net Model: UVR_MDXNET_9482": "UVR_MDXNET_9482.onnx",
864
+ "MDX-Net Model: Kim_Vocal_1": "Kim_Vocal_1.onnx",
865
+ "MDX-Net Model: kuielab_a_vocals": "kuielab_a_vocals.onnx",
866
+ "MDX-Net Model: kuielab_a_other": "kuielab_a_other.onnx",
867
+ "MDX-Net Model: kuielab_a_bass": "kuielab_a_bass.onnx",
868
+ "MDX-Net Model: kuielab_a_drums": "kuielab_a_drums.onnx",
869
+ "MDX-Net Model: kuielab_b_vocals": "kuielab_b_vocals.onnx",
870
+ "MDX-Net Model: kuielab_b_other": "kuielab_b_other.onnx",
871
+ "MDX-Net Model: kuielab_b_bass": "kuielab_b_bass.onnx",
872
+ "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx"}
873
+
874
+ FULL_DOWNLOAD_LIST_DEMUCS = {
875
+
876
+ "Demucs v4: htdemucs_ft":{
877
+ "f7e0c4bc-ba3fe64a.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th",
878
+ "d12395a8-e57c48e6.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th",
879
+ "92cfc3b6-ef3bcb9c.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th",
880
+ "04573f0d-f3cf25b2.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th",
881
+ "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml"
882
+ },
883
+
884
+ "Demucs v4: htdemucs":{
885
+ "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th",
886
+ "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml"
887
+ },
888
+
889
+ "Demucs v4: hdemucs_mmi":{
890
+ "75fc33f5-1941ce65.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/75fc33f5-1941ce65.th",
891
+ "hdemucs_mmi.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/hdemucs_mmi.yaml"
892
+ },
893
+ "Demucs v4: htdemucs_6s":{
894
+ "5c90dfd2-34c22ccb.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/5c90dfd2-34c22ccb.th",
895
+ "htdemucs_6s.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_6s.yaml"
896
+ },
897
+ "Demucs v3: mdx":{
898
+ "0d19c1c6-0f06f20e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/0d19c1c6-0f06f20e.th",
899
+ "7ecf8ec1-70f50cc9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7ecf8ec1-70f50cc9.th",
900
+ "c511e2ab-fe698775.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/c511e2ab-fe698775.th",
901
+ "7d865c68-3d5dd56b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7d865c68-3d5dd56b.th",
902
+ "mdx.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx.yaml"
903
+ },
904
+
905
+ "Demucs v3: mdx_q":{
906
+ "6b9c2ca1-3fd82607.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/6b9c2ca1-3fd82607.th",
907
+ "b72baf4e-8778635e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/b72baf4e-8778635e.th",
908
+ "42e558d4-196e0e1b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/42e558d4-196e0e1b.th",
909
+ "305bc58f-18378783.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/305bc58f-18378783.th",
910
+ "mdx_q.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx_q.yaml"
911
+ },
912
+
913
+ "Demucs v3: mdx_extra":{
914
+ "e51eebcc-c1b80bdd.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/e51eebcc-c1b80bdd.th",
915
+ "a1d90b5c-ae9d2452.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/a1d90b5c-ae9d2452.th",
916
+ "5d2d6c55-db83574e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/5d2d6c55-db83574e.th",
917
+ "cfa93e08-61801ae1.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/cfa93e08-61801ae1.th",
918
+ "mdx_extra.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx_extra.yaml"
919
+ },
920
+
921
+ "Demucs v3: mdx_extra_q": {
922
+ "83fc094f-4a16d450.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/83fc094f-4a16d450.th",
923
+ "464b36d7-e5a9386e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/464b36d7-e5a9386e.th",
924
+ "14fc6a69-a89dd0ee.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/14fc6a69-a89dd0ee.th",
925
+ "7fd6ef75-a905dd85.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7fd6ef75-a905dd85.th",
926
+ "mdx_extra_q.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx_extra_q.yaml"
927
+ },
928
+
929
+ "Demucs v3: UVR Model":{
930
+ "ebf34a2db.th": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/ebf34a2db.th",
931
+ "UVR_Demucs_Model_1.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/UVR_Demucs_Model_1.yaml"
932
+ },
933
+
934
+ "Demucs v3: repro_mdx_a":{
935
+ "9a6b4851-03af0aa6.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th",
936
+ "1ef250f1-592467ce.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th",
937
+ "fa0cb7f9-100d8bf4.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th",
938
+ "902315c2-b39ce9c9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th",
939
+ "repro_mdx_a.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a.yaml"
940
+ },
941
+
942
+ "Demucs v3: repro_mdx_a_time_only":{
943
+ "9a6b4851-03af0aa6.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th",
944
+ "1ef250f1-592467ce.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th",
945
+ "repro_mdx_a_time_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_time_only.yaml"
946
+ },
947
+
948
+ "Demucs v3: repro_mdx_a_hybrid_only":{
949
+ "fa0cb7f9-100d8bf4.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th",
950
+ "902315c2-b39ce9c9.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th",
951
+ "repro_mdx_a_hybrid_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_hybrid_only.yaml"
952
+ },
953
+
954
+ "Demucs v2: demucs": {
955
+ "demucs-e07c671f.th": "https://dl.fbaipublicfiles.com/demucs/v3.0/demucs-e07c671f.th"
956
+ },
957
+
958
+ "Demucs v2: demucs_extra": {
959
+ "demucs_extra-3646af93.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_extra-3646af93.th"
960
+ },
961
+
962
+ "Demucs v2: demucs48_hq": {
963
+ "demucs48_hq-28a1282c.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs48_hq-28a1282c.th"
964
+ },
965
+
966
+ "Demucs v2: tasnet": {
967
+ "tasnet-beb46fac.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet-beb46fac.th"
968
+ },
969
+
970
+ "Demucs v2: tasnet_extra": {
971
+ "tasnet_extra-df3777b2.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet_extra-df3777b2.th"
972
+ },
973
+
974
+ "Demucs v2: demucs_unittest": {
975
+ "demucs_unittest-09ebc15f.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_unittest-09ebc15f.th"
976
+ },
977
+
978
+ "Demucs v1: demucs": {
979
+ "demucs.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs.th"
980
+ },
981
+
982
+ "Demucs v1: demucs_extra": {
983
+ "demucs_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs_extra.th"
984
+ },
985
+
986
+ "Demucs v1: light": {
987
+ "light.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light.th"
988
+ },
989
+
990
+ "Demucs v1: light_extra": {
991
+ "light_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light_extra.th"
992
+ },
993
+
994
+ "Demucs v1: tasnet": {
995
+ "tasnet.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th"
996
+ },
997
+
998
+ "Demucs v1: tasnet_extra": {
999
+ "tasnet_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet_extra.th"
1000
+ }
1001
+ }
1002
+
1003
+ # Main Menu Labels
1004
+
1005
+ CHOOSE_PROC_METHOD_MAIN_LABEL = 'CHOOSE PROCESS METHOD'
1006
+ SELECT_SAVED_SETTINGS_MAIN_LABEL = 'SELECT SAVED SETTINGS'
1007
+ CHOOSE_MDX_MODEL_MAIN_LABEL = 'CHOOSE MDX-NET MODEL'
1008
+ BATCHES_MDX_MAIN_LABEL = 'BATCH SIZE'
1009
+ VOL_COMP_MDX_MAIN_LABEL = 'VOLUME COMPENSATION'
1010
+ SELECT_VR_MODEL_MAIN_LABEL = 'CHOOSE VR MODEL'
1011
+ AGGRESSION_SETTING_MAIN_LABEL = 'AGGRESSION SETTING'
1012
+ WINDOW_SIZE_MAIN_LABEL = 'WINDOW SIZE'
1013
+ CHOOSE_DEMUCS_MODEL_MAIN_LABEL = 'CHOOSE DEMUCS MODEL'
1014
+ CHOOSE_DEMUCS_STEMS_MAIN_LABEL = 'CHOOSE STEM(S)'
1015
+ CHOOSE_SEGMENT_MAIN_LABEL = 'SEGMENT'
1016
+ ENSEMBLE_OPTIONS_MAIN_LABEL = 'ENSEMBLE OPTIONS'
1017
+ CHOOSE_MAIN_PAIR_MAIN_LABEL = 'MAIN STEM PAIR'
1018
+ CHOOSE_ENSEMBLE_ALGORITHM_MAIN_LABEL = 'ENSEMBLE ALGORITHM'
1019
+ AVAILABLE_MODELS_MAIN_LABEL = 'AVAILABLE MODELS'
1020
+ CHOOSE_AUDIO_TOOLS_MAIN_LABEL = 'CHOOSE AUDIO TOOL'
1021
+ CHOOSE_MANUAL_ALGORITHM_MAIN_LABEL = 'CHOOSE ALGORITHM'
1022
+ CHOOSE_RATE_MAIN_LABEL = 'RATE'
1023
+ CHOOSE_SEMITONES_MAIN_LABEL = 'SEMITONES'
1024
+ GPU_CONVERSION_MAIN_LABEL = 'GPU Conversion'
1025
+
1026
+ if OPERATING_SYSTEM=="Darwin":
1027
+ LICENSE_OS_SPECIFIC_TEXT = '• This application is intended for those running macOS Catalina and above.\n' +\
1028
+ '• Application functionality for systems running macOS Mojave or lower is not guaranteed.\n' +\
1029
+ '• Application functionality for older or budget Mac systems is not guaranteed.\n\n'
1030
+ FONT_SIZE_F1 = 13
1031
+ FONT_SIZE_F2 = 11
1032
+ FONT_SIZE_F3 = 12
1033
+ FONT_SIZE_0 = 9
1034
+ FONT_SIZE_1 = 11
1035
+ FONT_SIZE_2 = 12
1036
+ FONT_SIZE_3 = 13
1037
+ FONT_SIZE_4 = 14
1038
+ FONT_SIZE_5 = 15
1039
+ FONT_SIZE_6 = 17
1040
+ HELP_HINT_CHECKBOX_WIDTH = 13
1041
+ MDX_CHECKBOXS_WIDTH = 14
1042
+ VR_CHECKBOXS_WIDTH = 14
1043
+ ENSEMBLE_CHECKBOXS_WIDTH = 18
1044
+ DEMUCS_CHECKBOXS_WIDTH = 14
1045
+ DEMUCS_PRE_CHECKBOXS_WIDTH = 20
1046
+ GEN_SETTINGS_WIDTH = 17
1047
+ MENU_COMBOBOX_WIDTH = 16
1048
+
1049
+ elif OPERATING_SYSTEM=="Linux":
1050
+ LICENSE_OS_SPECIFIC_TEXT = '• This application is intended for those running Linux Ubuntu 18.04+.\n' +\
1051
+ '• Application functionality for systems running other Linux platforms is not guaranteed.\n' +\
1052
+ '• Application functionality for older or budget systems is not guaranteed.\n\n'
1053
+ FONT_SIZE_F1 = 10
1054
+ FONT_SIZE_F2 = 8
1055
+ FONT_SIZE_F3 = 9
1056
+ FONT_SIZE_0 = 7
1057
+ FONT_SIZE_1 = 8
1058
+ FONT_SIZE_2 = 9
1059
+ FONT_SIZE_3 = 10
1060
+ FONT_SIZE_4 = 11
1061
+ FONT_SIZE_5 = 12
1062
+ FONT_SIZE_6 = 15
1063
+ HELP_HINT_CHECKBOX_WIDTH = 13
1064
+ MDX_CHECKBOXS_WIDTH = 14
1065
+ VR_CHECKBOXS_WIDTH = 16
1066
+ ENSEMBLE_CHECKBOXS_WIDTH = 25
1067
+ DEMUCS_CHECKBOXS_WIDTH = 18
1068
+ DEMUCS_PRE_CHECKBOXS_WIDTH = 27
1069
+ GEN_SETTINGS_WIDTH = 17
1070
+ MENU_COMBOBOX_WIDTH = 19
1071
+
1072
+ elif OPERATING_SYSTEM=="Windows":
1073
+ LICENSE_OS_SPECIFIC_TEXT = '• This application is intended for those running Windows 10 or higher.\n' +\
1074
+ '• Application functionality for systems running Windows 7 or lower is not guaranteed.\n' +\
1075
+ '• Application functionality for systems with Intel Pentium & Celeron CPUs is not guaranteed.\n\n'
1076
+ FONT_SIZE_F1 = 10
1077
+ FONT_SIZE_F2 = 8
1078
+ FONT_SIZE_F3 = 9
1079
+ FONT_SIZE_0 = 7
1080
+ FONT_SIZE_1 = 8
1081
+ FONT_SIZE_2 = 9
1082
+ FONT_SIZE_3 = 10
1083
+ FONT_SIZE_4 = 11
1084
+ FONT_SIZE_5 = 12
1085
+ FONT_SIZE_6 = 15
1086
+ HELP_HINT_CHECKBOX_WIDTH = 16
1087
+ MDX_CHECKBOXS_WIDTH = 16
1088
+ VR_CHECKBOXS_WIDTH = 16
1089
+ ENSEMBLE_CHECKBOXS_WIDTH = 25
1090
+ DEMUCS_CHECKBOXS_WIDTH = 18
1091
+ DEMUCS_PRE_CHECKBOXS_WIDTH = 27
1092
+ GEN_SETTINGS_WIDTH = 23
1093
+ MENU_COMBOBOX_WIDTH = 19
1094
+
1095
+
1096
+ LICENSE_TEXT = lambda a, p:f'Current Application Version: Ultimate Vocal Remover {a}\n' +\
1097
+ f'Current Patch Version: {p}\n\n' +\
1098
+ 'Copyright (c) 2022 Ultimate Vocal Remover\n\n' +\
1099
+ 'UVR is free and open-source, but MIT licensed. Please credit us if you use our\n' +\
1100
+ f'models or code for projects unrelated to UVR.\n\n{LICENSE_OS_SPECIFIC_TEXT}' +\
1101
+ 'This bundle contains the UVR interface, Python, PyTorch, and other\n' +\
1102
+ 'dependencies needed to run the application effectively.\n\n' +\
1103
+ 'Website Links: This application, System or Service(s) may contain links to\n' +\
1104
+ 'other websites and downloads, and they are solely provided to you as an\n' +\
1105
+ 'additional convenience. You understand and acknowledge that by clicking\n' +\
1106
+ 'or activating such links you are accessing a site or service outside of\n' +\
1107
+ 'this application, and that we do not screen, review, approve, or otherwise\n' +\
1108
+ 'endorse any content or information contained in these linked websites.\n' +\
1109
+ 'You acknowledge and agree that we, our affiliates and partners are not\n' +\
1110
+ 'responsible for the contents of any of these linked websites, including\n' +\
1111
+ 'the accuracy or availability of information provided by the linked websites,\n' +\
1112
+ 'and we make no representations or warranties regarding your use of\n' +\
1113
+ 'the linked websites.\n\n' +\
1114
+ 'This application is MIT Licensed\n\n' +\
1115
+ 'Permission is hereby granted, free of charge, to any person obtaining a copy\n' +\
1116
+ 'of this software and associated documentation files (the "Software"), to deal\n' +\
1117
+ 'in the Software without restriction, including without limitation the rights\n' +\
1118
+ 'to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' +\
1119
+ 'copies of the Software, and to permit persons to whom the Software is\n' +\
1120
+ 'furnished to do so, subject to the following conditions:\n\n' +\
1121
+ 'The above copyright notice and this permission notice shall be included in all\n' +\
1122
+ 'copies or substantial portions of the Software.\n\n' +\
1123
+ 'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' +\
1124
+ 'IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' +\
1125
+ 'FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' +\
1126
+ 'AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' +\
1127
+ 'LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' +\
1128
+ 'OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n' +\
1129
+ 'SOFTWARE.'
1130
+
1131
+ CHANGE_LOG_HEADER = lambda patch:f"Patch Version:\n\n{patch}"
1132
+
1133
+ #DND CONSTS
1134
+
1135
+ MAC_DND_CHECK = ('/Users/',
1136
+ '/Applications/',
1137
+ '/Library/',
1138
+ '/System/')
1139
+ LINUX_DND_CHECK = ('/home/',
1140
+ '/usr/')
1141
+ WINDOWS_DND_CHECK = ('A:', 'B:', 'C:', 'D:', 'E:', 'F:', 'G:', 'H:', 'I:', 'J:', 'K:', 'L:', 'M:', 'N:', 'O:', 'P:', 'Q:', 'R:', 'S:', 'T:', 'U:', 'V:', 'W:', 'X:', 'Y:', 'Z:')
1142
+
1143
+ WOOD_INST_MODEL_HASH = '0ec76fd9e65f81d8b4fbd13af4826ed8'
1144
+ WOOD_INST_PARAMS = {
1145
+ "vr_model_param": "4band_v3",
1146
+ "primary_stem": NO_WIND_INST_STEM
1147
+ }
uvr5/lib_v5/mdxnet.py ADDED
@@ -0,0 +1,140 @@
1
+ from abc import ABCMeta
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from pytorch_lightning import LightningModule
6
+ from .modules import TFC_TDF
7
+
8
+ dim_s = 4
9
+
10
+ class AbstractMDXNet(LightningModule):
11
+ __metaclass__ = ABCMeta
12
+
13
+ def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap):
14
+ super().__init__()
15
+ self.target_name = target_name
16
+ self.lr = lr
17
+ self.optimizer = optimizer
18
+ self.dim_c = dim_c
19
+ self.dim_f = dim_f
20
+ self.dim_t = dim_t
21
+ self.n_fft = n_fft
22
+ self.n_bins = n_fft // 2 + 1
23
+ self.hop_length = hop_length
24
+ self.window = nn.Parameter(torch.hann_window(window_length=self.n_fft, periodic=True), requires_grad=False)
25
+ self.freq_pad = nn.Parameter(torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), requires_grad=False)
26
+
27
+ def configure_optimizers(self):
28
+ if self.optimizer == 'rmsprop':
29
+ return torch.optim.RMSprop(self.parameters(), self.lr)
30
+
31
+ if self.optimizer == 'adamw':
32
+ return torch.optim.AdamW(self.parameters(), self.lr)
33
+
34
+ class ConvTDFNet(AbstractMDXNet):
35
+ def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length,
36
+ num_blocks, l, g, k, bn, bias, overlap):
37
+
38
+ super(ConvTDFNet, self).__init__(
39
+ target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap)
40
+ self.save_hyperparameters()
41
+
42
+ self.num_blocks = num_blocks
43
+ self.l = l
44
+ self.g = g
45
+ self.k = k
46
+ self.bn = bn
47
+ self.bias = bias
48
+
49
+ if optimizer == 'rmsprop':
50
+ norm = nn.BatchNorm2d
51
+
52
+ if optimizer == 'adamw':
53
+ norm = lambda input:nn.GroupNorm(2, input)
54
+
55
+ self.n = num_blocks // 2
56
+ scale = (2, 2)
57
+
58
+ self.first_conv = nn.Sequential(
59
+ nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)),
60
+ norm(g),
61
+ nn.ReLU(),
62
+ )
63
+
64
+ f = self.dim_f
65
+ c = g
66
+ self.encoding_blocks = nn.ModuleList()
67
+ self.ds = nn.ModuleList()
68
+ for i in range(self.n):
69
+ self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm))
70
+ self.ds.append(
71
+ nn.Sequential(
72
+ nn.Conv2d(in_channels=c, out_channels=c + g, kernel_size=scale, stride=scale),
73
+ norm(c + g),
74
+ nn.ReLU()
75
+ )
76
+ )
77
+ f = f // 2
78
+ c += g
79
+
80
+ self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)
81
+
82
+ self.decoding_blocks = nn.ModuleList()
83
+ self.us = nn.ModuleList()
84
+ for i in range(self.n):
85
+ self.us.append(
86
+ nn.Sequential(
87
+ nn.ConvTranspose2d(in_channels=c, out_channels=c - g, kernel_size=scale, stride=scale),
88
+ norm(c - g),
89
+ nn.ReLU()
90
+ )
91
+ )
92
+ f = f * 2
93
+ c -= g
94
+
95
+ self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm))
96
+
97
+ self.final_conv = nn.Sequential(
98
+ nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)),
99
+ )
100
+
101
+ def forward(self, x):
102
+
103
+ x = self.first_conv(x)
104
+
105
+ x = x.transpose(-1, -2)
106
+
107
+ ds_outputs = []
108
+ for i in range(self.n):
109
+ x = self.encoding_blocks[i](x)
110
+ ds_outputs.append(x)
111
+ x = self.ds[i](x)
112
+
113
+ x = self.bottleneck_block(x)
114
+
115
+ for i in range(self.n):
116
+ x = self.us[i](x)
117
+ x *= ds_outputs[-i - 1]
118
+ x = self.decoding_blocks[i](x)
119
+
120
+ x = x.transpose(-1, -2)
121
+
122
+ x = self.final_conv(x)
123
+
124
+ return x
125
+
126
+ class Mixer(nn.Module):
127
+ def __init__(self, device, mixer_path):
128
+
129
+ super(Mixer, self).__init__()
130
+
131
+ self.linear = nn.Linear((dim_s+1)*2, dim_s*2, bias=False)
132
+
133
+ self.load_state_dict(
134
+ torch.load(mixer_path, map_location=device)
135
+ )
136
+
137
+ def forward(self, x):
138
+ x = x.reshape(1,(dim_s+1)*2,-1).transpose(-1,-2)
139
+ x = self.linear(x)
140
+ return x.transpose(-1,-2).reshape(dim_s,2,-1)
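For orientation, a minimal shape-check sketch of driving this network (the hyperparameters below are illustrative only, chosen to satisfy the encoder's (2, 2) downsampling divisibility; they are not the values of any shipped model, and the import path assumes the repository root is on sys.path):

    import torch
    from uvr5.lib_v5.mdxnet import ConvTDFNet

    net = ConvTDFNet(target_name='vocals', lr=1e-4, optimizer='adamw',
                     dim_c=4, dim_f=64, dim_t=32, n_fft=2048, hop_length=512,
                     num_blocks=4, l=3, g=8, k=3, bn=4, bias=False, overlap=0)
    x = torch.randn(1, 4, 64, 32)    # (batch, dim_c, dim_f, dim_t)
    assert net(x).shape == x.shape   # U-Net output matches the input shape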
uvr5/lib_v5/mixer.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea781bd52c6a523b825fa6cdbb6189f52e318edd8b17e6fe404f76f7af8caa9c
3
+ size 1208
uvr5/lib_v5/modules.py ADDED
@@ -0,0 +1,74 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class TFC(nn.Module):
6
+ def __init__(self, c, l, k, norm):
7
+ super(TFC, self).__init__()
8
+
9
+ self.H = nn.ModuleList()
10
+ for i in range(l):
11
+ self.H.append(
12
+ nn.Sequential(
13
+ nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2),
14
+ norm(c),
15
+ nn.ReLU(),
16
+ )
17
+ )
18
+
19
+ def forward(self, x):
20
+ for h in self.H:
21
+ x = h(x)
22
+ return x
23
+
24
+
25
+ class DenseTFC(nn.Module):
26
+ def __init__(self, c, l, k, norm):
27
+ super(DenseTFC, self).__init__()
28
+
29
+ self.conv = nn.ModuleList()
30
+ for i in range(l):
31
+ self.conv.append(
32
+ nn.Sequential(
33
+ nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2),
34
+ norm(c),
35
+ nn.ReLU(),
36
+ )
37
+ )
38
+
39
+ def forward(self, x):
40
+ for layer in self.conv[:-1]:
41
+ x = torch.cat([layer(x), x], 1)
42
+ return self.conv[-1](x)
43
+
44
+
45
+ class TFC_TDF(nn.Module):
46
+ def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d):
47
+
48
+ super(TFC_TDF, self).__init__()
49
+
50
+ self.use_tdf = bn is not None
51
+
52
+ self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm)
53
+
54
+ if self.use_tdf:
55
+ if bn == 0:
56
+ self.tdf = nn.Sequential(
57
+ nn.Linear(f, f, bias=bias),
58
+ norm(c),
59
+ nn.ReLU()
60
+ )
61
+ else:
62
+ self.tdf = nn.Sequential(
63
+ nn.Linear(f, f // bn, bias=bias),
64
+ norm(c),
65
+ nn.ReLU(),
66
+ nn.Linear(f // bn, f, bias=bias),
67
+ norm(c),
68
+ nn.ReLU()
69
+ )
70
+
71
+ def forward(self, x):
72
+ x = self.tfc(x)
73
+ return x + self.tdf(x) if self.use_tdf else x
74
+
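A quick shape check of the TFC_TDF block in isolation (illustrative sizes; the import assumes the repository root is on sys.path). The TDF branch is a bottlenecked pair of Linear layers applied along the last (frequency) axis and added residually to the TFC output:

    import torch
    from uvr5.lib_v5.modules import TFC_TDF

    block = TFC_TDF(c=8, l=2, f=64, k=3, bn=4)  # bn=4 -> Linear(64, 16) bottleneck
    x = torch.randn(1, 8, 100, 64)              # (batch, channels, time, freq)
    print(block(x).shape)                       # torch.Size([1, 8, 100, 64])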
uvr5/lib_v5/pyrb.py ADDED
@@ -0,0 +1,92 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ import six
5
+ import numpy as np
6
+ import soundfile as sf
7
+ import sys
8
+
9
+ if getattr(sys, 'frozen', False):
10
+ BASE_PATH_RUB = sys._MEIPASS
11
+ else:
12
+ BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__))
13
+
14
+ __all__ = ['time_stretch', 'pitch_shift']
15
+
16
+ __RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, 'rubberband')
17
+
18
+ if six.PY2:
19
+ DEVNULL = open(os.devnull, 'w')
20
+ else:
21
+ DEVNULL = subprocess.DEVNULL
22
+
23
+ def __rubberband(y, sr, **kwargs):
24
+
25
+ assert sr > 0
26
+
27
+ # Get the input and output tempfile
28
+ fd, infile = tempfile.mkstemp(suffix='.wav')
29
+ os.close(fd)
30
+ fd, outfile = tempfile.mkstemp(suffix='.wav')
31
+ os.close(fd)
32
+
33
+ # dump the audio
34
+ sf.write(infile, y, sr)
35
+
36
+ try:
37
+ # Execute rubberband
38
+ arguments = [__RUBBERBAND_UTIL, '-q']
39
+
40
+ for key, value in six.iteritems(kwargs):
41
+ arguments.append(str(key))
42
+ arguments.append(str(value))
43
+
44
+ arguments.extend([infile, outfile])
45
+
46
+ subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL)
47
+
48
+ # Load the processed audio.
49
+ y_out, _ = sf.read(outfile, always_2d=True)
50
+
51
+ # make sure that output dimensions matches input
52
+ if y.ndim == 1:
53
+ y_out = np.squeeze(y_out)
54
+
55
+ except OSError as exc:
56
+ six.raise_from(RuntimeError('Failed to execute rubberband. '
57
+ 'Please verify that rubberband-cli '
58
+ 'is installed.'),
59
+ exc)
60
+
61
+ finally:
62
+ # Remove temp files
63
+ os.unlink(infile)
64
+ os.unlink(outfile)
65
+
66
+ return y_out
67
+
68
+ def time_stretch(y, sr, rate, rbargs=None):
69
+ if rate <= 0:
70
+ raise ValueError('rate must be strictly positive')
71
+
72
+ if rate == 1.0:
73
+ return y
74
+
75
+ if rbargs is None:
76
+ rbargs = dict()
77
+
78
+ rbargs.setdefault('--tempo', rate)
79
+
80
+ return __rubberband(y, sr, **rbargs)
81
+
82
+ def pitch_shift(y, sr, n_steps, rbargs=None):
83
+
84
+ if n_steps == 0:
85
+ return y
86
+
87
+ if rbargs is None:
88
+ rbargs = dict()
89
+
90
+ rbargs.setdefault('--pitch', n_steps)
91
+
92
+ return __rubberband(y, sr, **rbargs)
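Typical usage, as a hedged example (the input path is hypothetical, and the module shells out to the bundled rubberband binary, which must be present and executable):

    import soundfile as sf
    from uvr5.lib_v5 import pyrb

    y, sr = sf.read('input.wav')                  # hypothetical input file
    slower = pyrb.time_stretch(y, sr, rate=0.5)   # rate < 1 slows down ('--tempo 0.5')
    higher = pyrb.pitch_shift(y, sr, n_steps=2)   # up two semitones ('--pitch 2')
    sf.write('slower.wav', slower, sr)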
uvr5/lib_v5/spec_utils.py ADDED
@@ -0,0 +1,703 @@
1
+ import librosa
2
+ import numpy as np
3
+ import soundfile as sf
4
+ import math
+ import random
+ import platform
+ import traceback
11
+ OPERATING_SYSTEM = platform.system()
12
+ SYSTEM_ARCH = platform.platform()
13
+ SYSTEM_PROC = platform.processor()
14
+ ARM = 'arm'
15
+
16
+ if OPERATING_SYSTEM == 'Windows':
17
+ from pyrubberband import pyrb
18
+ else:
19
+ from . import pyrb
20
+
21
+ if OPERATING_SYSTEM == 'Darwin':
22
+ wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest"
23
+ else:
24
+ wav_resolution = "sinc_fastest"
25
+
26
+ MAX_SPEC = 'Max Spec'
27
+ MIN_SPEC = 'Min Spec'
28
+ AVERAGE = 'Average'
29
+
30
+ def crop_center(h1, h2):
31
+ h1_shape = h1.size()
32
+ h2_shape = h2.size()
33
+
34
+ if h1_shape[3] == h2_shape[3]:
35
+ return h1
36
+ elif h1_shape[3] < h2_shape[3]:
37
+ raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
38
+
39
+ s_time = (h1_shape[3] - h2_shape[3]) // 2
40
+ e_time = s_time + h2_shape[3]
41
+ h1 = h1[:, :, :, s_time:e_time]
42
+
43
+ return h1
44
+
45
+ def preprocess(X_spec):
46
+ X_mag = np.abs(X_spec)
47
+ X_phase = np.angle(X_spec)
48
+
49
+ return X_mag, X_phase
50
+
51
+ def make_padding(width, cropsize, offset):
52
+ left = offset
53
+ roi_size = cropsize - offset * 2
54
+ if roi_size == 0:
55
+ roi_size = cropsize
56
+ right = roi_size - (width % roi_size) + left
57
+
58
+ return left, right, roi_size
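A worked example of this padding arithmetic (values are illustrative):

    left, right, roi = make_padding(1000, 256, 64)
    # roi = 256 - 2 * 64 = 128; right = 128 - (1000 % 128) + 64 = 88
    # padded width 64 + 1000 + 88 = 1152 = 9 * 128, an exact multiple of roi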
59
+
60
+ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
61
+ if reverse:
62
+ wave_left = np.flip(np.asfortranarray(wave[0]))
63
+ wave_right = np.flip(np.asfortranarray(wave[1]))
64
+ elif mid_side:
65
+ wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
66
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
67
+ elif mid_side_b2:
68
+ wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
69
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
70
+ else:
71
+ wave_left = np.asfortranarray(wave[0])
72
+ wave_right = np.asfortranarray(wave[1])
73
+
74
+ spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
75
+ spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
76
+
77
+ spec = np.asfortranarray([spec_left, spec_right])
78
+
79
+ return spec
80
+
81
+ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
82
+ import threading
83
+
84
+ if reverse:
85
+ wave_left = np.flip(np.asfortranarray(wave[0]))
86
+ wave_right = np.flip(np.asfortranarray(wave[1]))
87
+ elif mid_side:
88
+ wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
89
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
90
+ elif mid_side_b2:
91
+ wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
92
+ wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
93
+ else:
94
+ wave_left = np.asfortranarray(wave[0])
95
+ wave_right = np.asfortranarray(wave[1])
96
+
97
+ def run_thread(**kwargs):
98
+ global spec_left
99
+ spec_left = librosa.stft(**kwargs)
100
+
101
+ thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
102
+ thread.start()
103
+ # print(wave_right.shape, n_fft, hop_length)
104
+ spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
105
+ thread.join()
106
+
107
+ spec = np.asfortranarray([spec_left, spec_right])
108
+
109
+ return spec
110
+
111
+ def normalize(wave, is_normalize=False):
112
+ """Save output music files"""
113
+ maxv = np.max(np.abs(wave))
114
+ if maxv > 1.0:
115
+ print(f"\nNormalization Set {is_normalize}: Input above threshold for clipping. Max:{maxv}")
116
+ if is_normalize:
117
+ print(f"The result was normalized.")
118
+ wave /= maxv
119
+ else:
120
+ print(f"The result was not normalized.")
121
+ else:
122
+ print(f"\nNormalization Set {is_normalize}: Input not above threshold for clipping. Max:{maxv}")
123
+ # stereo to mono
124
+ if wave.shape[1] < wave.shape[0]:
125
+ wave = np.mean(wave, axis=1)
126
+ else:
127
+ wave = np.mean(wave, axis=0)
128
+ return wave
129
+
130
+ def normalize_two_stem(wave, mix, is_normalize=False):
131
+ """Save output music files"""
132
+
133
+ maxv = np.abs(wave).max()
134
+ max_mix = np.abs(mix).max()
135
+
136
+ if maxv > 1.0:
137
+ print(f"\nNormalization Set {is_normalize}: Primary source above threshold for clipping. Max:{maxv}")
138
+ print(f"\nNormalization Set {is_normalize}: Mixture above threshold for clipping. Max:{max_mix}")
139
+ if is_normalize:
140
+ print(f"The result was normalized.")
141
+ wave /= maxv
142
+ mix /= maxv
143
+ else:
144
+ print(f"The result was not normalized.")
145
+ else:
146
+ print(f"\nNormalization Set {is_normalize}: Input not above threshold for clipping. Max:{maxv}")
147
+
148
+
149
+ print(f"\nNormalization Set {is_normalize}: Primary source - Max:{np.abs(wave).max()}")
150
+ print(f"\nNormalization Set {is_normalize}: Mixture - Max:{np.abs(mix).max()}")
151
+
152
+ return wave, mix
153
+
154
+ def combine_spectrograms(specs, mp):
155
+ l = min([specs[i].shape[2] for i in specs])
156
+ spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
157
+ offset = 0
158
+ bands_n = len(mp.param['band'])
159
+
160
+ for d in range(1, bands_n + 1):
161
+ h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
162
+ spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
163
+ offset += h
164
+
165
+ if offset > mp.param['bins']:
166
+ raise ValueError('Too many bins')
167
+
168
+ # lowpass filter
169
+ if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
170
+ if bands_n == 1:
171
+ spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
172
+ else:
173
+ gp = 1
174
+ for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
175
+ g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
176
+ gp = g
177
+ spec_c[:, b, :] *= g
178
+
179
+ return np.asfortranarray(spec_c)
180
+
181
+ def spectrogram_to_image(spec, mode='magnitude'):
182
+ if mode == 'magnitude':
183
+ if np.iscomplexobj(spec):
184
+ y = np.abs(spec)
185
+ else:
186
+ y = spec
187
+ y = np.log10(y ** 2 + 1e-8)
188
+ elif mode == 'phase':
189
+ if np.iscomplexobj(spec):
190
+ y = np.angle(spec)
191
+ else:
192
+ y = spec
193
+
194
+ y -= y.min()
195
+ y *= 255 / y.max()
196
+ img = np.uint8(y)
197
+
198
+ if y.ndim == 3:
199
+ img = img.transpose(1, 2, 0)
200
+ img = np.concatenate([
201
+ np.max(img, axis=2, keepdims=True), img
202
+ ], axis=2)
203
+
204
+ return img
205
+
206
+ def reduce_vocal_aggressively(X, y, softmask):
207
+ v = X - y
208
+ y_mag_tmp = np.abs(y)
209
+ v_mag_tmp = np.abs(v)
210
+
211
+ v_mask = v_mag_tmp > y_mag_tmp
212
+ y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
213
+
214
+ return y_mag * np.exp(1.j * np.angle(y))
215
+
216
+ def merge_artifacts(y_mask, thres=0.01, min_range=64, fade_size=32):
217
+ mask = y_mask
218
+
219
+ try:
220
+ if min_range < fade_size * 2:
221
+ raise ValueError('min_range must be >= fade_size * 2')
222
+
223
+ idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0]
224
+ start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
225
+ end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
226
+ artifact_idx = np.where(end_idx - start_idx > min_range)[0]
227
+ weight = np.zeros_like(y_mask)
228
+ if len(artifact_idx) > 0:
229
+ start_idx = start_idx[artifact_idx]
230
+ end_idx = end_idx[artifact_idx]
231
+ old_e = None
232
+ for s, e in zip(start_idx, end_idx):
233
+ if old_e is not None and s - old_e < fade_size:
234
+ s = old_e - fade_size * 2
235
+
236
+ if s != 0:
237
+ weight[:, :, s:s + fade_size] = np.linspace(0, 1, fade_size)
238
+ else:
239
+ s -= fade_size
240
+
241
+ if e != y_mask.shape[2]:
242
+ weight[:, :, e - fade_size:e] = np.linspace(1, 0, fade_size)
243
+ else:
244
+ e += fade_size
245
+
246
+ weight[:, :, s + fade_size:e - fade_size] = 1
247
+ old_e = e
248
+
249
+ v_mask = 1 - y_mask
250
+ y_mask += weight * v_mask
251
+
252
+ mask = y_mask
253
+ except Exception as e:
254
+ error_name = f'{type(e).__name__}'
255
+ traceback_text = ''.join(traceback.format_tb(e.__traceback__))
256
+ message = f'{error_name}: "{e}"\n{traceback_text}'
257
+ print('Post Process Failed: ', message)
258
+
259
+
260
+ return mask
261
+
262
+ def align_wave_head_and_tail(a, b):
263
+ l = min([a[0].size, b[0].size])
264
+
265
+ return a[:, :l], b[:, :l]
266
+
267
+ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse, clamp=False):
268
+ spec_left = np.asfortranarray(spec[0])
269
+ spec_right = np.asfortranarray(spec[1])
270
+
271
+ wave_left = librosa.istft(spec_left, hop_length=hop_length)
272
+ wave_right = librosa.istft(spec_right, hop_length=hop_length)
273
+
274
+ if reverse:
275
+ return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
276
+ elif mid_side:
277
+ return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
278
+ elif mid_side_b2:
279
+ return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
280
+ else:
281
+ return np.asfortranarray([wave_left, wave_right])
282
+
283
+ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
284
+ import threading
285
+
286
+ spec_left = np.asfortranarray(spec[0])
287
+ spec_right = np.asfortranarray(spec[1])
288
+
289
+ def run_thread(**kwargs):
290
+ global wave_left
291
+ wave_left = librosa.istft(**kwargs)
292
+
293
+ thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
294
+ thread.start()
295
+ wave_right = librosa.istft(spec_right, hop_length=hop_length)
296
+ thread.join()
297
+
298
+ if reverse:
299
+ return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
300
+ elif mid_side:
301
+ return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
302
+ elif mid_side_b2:
303
+ return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
304
+ else:
305
+ return np.asfortranarray([wave_left, wave_right])
306
+
307
+ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
308
+ bands_n = len(mp.param['band'])
309
+ offset = 0
310
+ # print('spec_m: ', spec_m.shape, np.max(spec_m), np.min(spec_m))
311
+ for d in range(1, bands_n + 1):
312
+ bp = mp.param['band'][d]
313
+ spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
314
+ h = bp['crop_stop'] - bp['crop_start']
315
+ spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
316
+ # print('\nbp', d, bands_n, bp)
317
+ # print('spec_s: ', spec_s.shape, np.max(spec_s), np.min(spec_s))
318
+ offset += h
319
+ if d == bands_n: # higher
320
+ # print('hpf_start: ', extra_bins_h, bp['hpf_start'])
321
+ if extra_bins_h: # if --high_end_process bypass
322
+ max_bin = bp['n_fft'] // 2
323
+ spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
324
+ # print('extra_bins_h, max_bin, extra_bins: ', extra_bins_h, max_bin, extra_bins.shape, np.max(extra_bins), np.min(extra_bins))
325
+ # print('spec_s d=4: ', spec_s.shape, np.max(spec_s), np.min(spec_s))
326
+ if bp['hpf_start'] > 0:
327
+ spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
328
+ # print('spec_s fft: ', spec_s.shape, np.max(spec_s), np.min(spec_s) )
329
+ if bands_n == 1:
330
+ wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
331
+ else:
332
+ wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
333
+ else:
334
+ sr = mp.param['band'][d+1]['sr']
335
+ if d == 1: # lower
336
+ spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'] - 1) # test
337
+ spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
338
+ wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type=wav_resolution)
339
+ else: # mid
340
+ spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
341
+ spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
342
+ wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
343
+ wave = librosa.resample(wave2, bp['sr'], sr, res_type=wav_resolution)
344
+ # print('spec to wav shape: ', d, wave.shape, np.max(wave), np.min(wave), spec_s.shape, np.max(spec_s), np.min(spec_s))
345
+ return wave
346
+
347
+ def fft_lp_filter(spec, bin_start, bin_stop):
348
+ g = 1.0
349
+ for b in range(bin_start, bin_stop):
350
+ g -= 1 / (bin_stop - bin_start)
351
+ spec[:, b, :] = g * spec[:, b, :]
352
+
353
+ spec[:, bin_stop:, :] *= 0
354
+
355
+ return spec
356
+
357
+ def fft_hp_filter(spec, bin_start, bin_stop):
358
+ g = 1.0
359
+ for b in range(bin_start, bin_stop, -1):
360
+ g -= 1 / (bin_start - bin_stop)
361
+ spec[:, b, :] = g * spec[:, b, :]
362
+
363
+ spec[:, 0:bin_stop+1, :] *= 0
364
+
365
+ return spec
366
+
367
+ def mirroring(a, spec_m, input_high_end, mp):
368
+ if 'mirroring' == a:
369
+ mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
370
+ mirror = mirror * np.exp(1.j * np.angle(input_high_end))
371
+
372
+ return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
373
+
374
+ if 'mirroring2' == a:
375
+ mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
376
+ mi = np.multiply(mirror, input_high_end * 1.7)
377
+
378
+ return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
379
+
380
+ def adjust_aggr(mask, is_non_accom_stem, aggressiveness):
381
+ aggr = aggressiveness['value']
382
+
383
+ if aggr != 0:
384
+ if is_non_accom_stem:
385
+ aggr = 1 - aggr
386
+
387
+ aggr = [aggr, aggr]
388
+
389
+ if aggressiveness['aggr_correction'] is not None:
390
+ aggr[0] += aggressiveness['aggr_correction']['left']
391
+ aggr[1] += aggressiveness['aggr_correction']['right']
392
+
393
+ for ch in range(2):
394
+ mask[ch, :aggressiveness['split_bin']] = np.power(mask[ch, :aggressiveness['split_bin']], 1 + aggr[ch] / 3)
395
+ mask[ch, aggressiveness['split_bin']:] = np.power(mask[ch, aggressiveness['split_bin']:], 1 + aggr[ch])
396
+
397
+ # if is_non_accom_stem:
398
+ # mask = (1.0 - mask)
399
+
400
+ return mask
401
+
402
+ def stft(wave, nfft, hl):
403
+ wave_left = np.asfortranarray(wave[0])
404
+ wave_right = np.asfortranarray(wave[1])
405
+ spec_left = librosa.stft(wave_left, nfft, hop_length=hl)
406
+ spec_right = librosa.stft(wave_right, nfft, hop_length=hl)
407
+ spec = np.asfortranarray([spec_left, spec_right])
408
+
409
+ return spec
410
+
411
+ def istft(spec, hl):
412
+ spec_left = np.asfortranarray(spec[0])
413
+ spec_right = np.asfortranarray(spec[1])
414
+ wave_left = librosa.istft(spec_left, hop_length=hl)
415
+ wave_right = librosa.istft(spec_right, hop_length=hl)
416
+ wave = np.asfortranarray([wave_left, wave_right])
417
+
418
+ return wave
419
+
420
+ def spec_effects(wave, algorithm='Default', value=None):
421
+ spec = [stft(wave[0],2048,1024), stft(wave[1],2048,1024)]
422
+ if algorithm == 'Min_Mag':
423
+ v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0])
424
+ wave = istft(v_spec_m,1024)
425
+ elif algorithm == 'Max_Mag':
426
+ v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0])
427
+ wave = istft(v_spec_m,1024)
428
+ elif algorithm == 'Default':
429
+ wave = (wave[1] * value) + (wave[0] * (1-value))
430
+ elif algorithm == 'Invert_p':
431
+ X_mag = np.abs(spec[0])
432
+ y_mag = np.abs(spec[1])
433
+ max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
434
+ v_spec = spec[1] - max_mag * np.exp(1.j * np.angle(spec[0]))
435
+ wave = istft(v_spec,1024)
436
+
437
+ return wave
438
+
439
+ def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024):
440
+ wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length)
441
+
442
+ if wave.ndim == 1:
443
+ wave = np.asfortranarray([wave,wave])
444
+
445
+ return wave
446
+
447
+ def wave_to_spectrogram_no_mp(wave):
448
+
449
+ spec = librosa.stft(wave, n_fft=2048, hop_length=1024)
450
+
451
+ if spec.ndim == 1:
452
+ spec = np.asfortranarray([spec,spec])
453
+
454
+ return spec
455
+
456
+ def invert_audio(specs, invert_p=True):
457
+
458
+ ln = min([specs[0].shape[2], specs[1].shape[2]])
459
+ specs[0] = specs[0][:,:,:ln]
460
+ specs[1] = specs[1][:,:,:ln]
461
+
462
+ if invert_p:
463
+ X_mag = np.abs(specs[0])
464
+ y_mag = np.abs(specs[1])
465
+ max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
466
+ v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
467
+ else:
468
+ specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
469
+ v_spec = specs[0] - specs[1]
470
+
471
+ return v_spec
472
+
473
+ def invert_stem(mixture, stem):
474
+
475
+ mixture = wave_to_spectrogram_no_mp(mixture)
476
+ stem = wave_to_spectrogram_no_mp(stem)
477
+ output = spectrogram_to_wave_no_mp(invert_audio([mixture, stem]))
478
+
479
+ return -output.T
480
+
481
+ def ensembling(a, specs):
482
+ for i in range(1, len(specs)):
483
+ if i == 1:
484
+ spec = specs[0]
485
+
486
+ ln = min([spec.shape[2], specs[i].shape[2]])
487
+ spec = spec[:,:,:ln]
488
+ specs[i] = specs[i][:,:,:ln]
489
+
490
+ if MIN_SPEC == a:
491
+ spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
492
+ if MAX_SPEC == a:
493
+ spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
494
+ if AVERAGE == a:
495
+ spec = np.where(np.abs(specs[i]) == np.abs(spec), specs[i], spec)
496
+
497
+ return spec
498
+
499
+ def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path):
500
+
501
+ wavs_ = []
502
+
503
+ if algorithm == AVERAGE:
504
+ output = average_audio(audio_input)
505
+ samplerate = 44100
506
+ else:
507
+ specs = []
508
+
509
+ for i in range(len(audio_input)):
510
+ wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100)
511
+ wavs_.append(wave)
512
+ spec = wave_to_spectrogram_no_mp(wave)
513
+ specs.append(spec)
514
+
515
+ wave_shapes = [w.shape[1] for w in wavs_]
516
+ target_shape = wavs_[wave_shapes.index(max(wave_shapes))]
517
+
518
+ output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs))
519
+ output = to_shape(output, target_shape.shape)
520
+
521
+ sf.write(save_path, normalize(output.T, is_normalization), samplerate, subtype=wav_type_set)
522
+
523
+ def to_shape(x, target_shape):
524
+ padding_list = []
525
+ for x_dim, target_dim in zip(x.shape, target_shape):
526
+ pad_value = (target_dim - x_dim)
527
+ pad_tuple = ((0, pad_value))
528
+ padding_list.append(pad_tuple)
529
+
530
+ return np.pad(x, tuple(padding_list), mode='constant')
531
+
532
+ def to_shape_minimize(x: np.ndarray, target_shape):
533
+
534
+ padding_list = []
535
+ for x_dim, target_dim in zip(x.shape, target_shape):
536
+ pad_value = (target_dim - x_dim)
537
+ pad_tuple = ((0, pad_value))
538
+ padding_list.append(pad_tuple)
539
+
540
+ return np.pad(x, tuple(padding_list), mode='constant')
541
+
542
+ def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False):
543
+
544
+ wav, sr = librosa.load(audio_file, sr=44100, mono=False)
545
+
546
+ if wav.ndim == 1:
547
+ wav = np.asfortranarray([wav,wav])
548
+
549
+ if is_pitch:
550
+ wav_1 = pyrb.pitch_shift(wav[0], sr, rate, rbargs=None)
551
+ wav_2 = pyrb.pitch_shift(wav[1], sr, rate, rbargs=None)
552
+ else:
553
+ wav_1 = pyrb.time_stretch(wav[0], sr, rate, rbargs=None)
554
+ wav_2 = pyrb.time_stretch(wav[1], sr, rate, rbargs=None)
555
+
556
+ if wav_1.shape > wav_2.shape:
557
+ wav_2 = to_shape(wav_2, wav_1.shape)
558
+ if wav_1.shape < wav_2.shape:
559
+ wav_1 = to_shape(wav_1, wav_2.shape)
560
+
561
+ wav_mix = np.asfortranarray([wav_1, wav_2])
562
+
563
+ sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set)
564
+ save_format(export_path)
565
+
566
+ def average_audio(audio):
567
+
568
+ waves = []
569
+ wave_shapes = []
570
+ final_waves = []
571
+
572
+ for i in range(len(audio)):
573
+ wave = librosa.load(audio[i], sr=44100, mono=False)
574
+ waves.append(wave[0])
575
+ wave_shapes.append(wave[0].shape[1])
576
+
577
+ wave_shapes_index = wave_shapes.index(max(wave_shapes))
578
+ target_shape = waves[wave_shapes_index]
579
+ waves.pop(wave_shapes_index)
580
+ final_waves.append(target_shape)
581
+
582
+ for n_array in waves:
583
+ wav_target = to_shape(n_array, target_shape.shape)
584
+ final_waves.append(wav_target)
585
+
586
+ waves = sum(final_waves)
587
+ waves = waves/len(audio)
588
+
589
+ return waves
590
+
591
+ def average_dual_sources(wav_1, wav_2, value):
592
+
593
+ if wav_1.shape > wav_2.shape:
594
+ wav_2 = to_shape(wav_2, wav_1.shape)
595
+ if wav_1.shape < wav_2.shape:
596
+ wav_1 = to_shape(wav_1, wav_2.shape)
597
+
598
+ wave = (wav_1 * value) + (wav_2 * (1-value))
599
+
600
+ return wave
601
+
602
+ def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray):
603
+
604
+ if wav_1.shape > wav_2.shape:
605
+ wav_2 = to_shape(wav_2, wav_1.shape)
606
+ if wav_1.shape < wav_2.shape:
607
+ ln = min([wav_1.shape[1], wav_2.shape[1]])
608
+ wav_2 = wav_2[:,:ln]
609
+
610
+ ln = min([wav_1.shape[1], wav_2.shape[1]])
611
+ wav_1 = wav_1[:,:ln]
612
+ wav_2 = wav_2[:,:ln]
613
+
614
+ return wav_2
615
+
616
+ def align_audio(file1, file2, file2_aligned, file_subtracted, wav_type_set, is_normalization, command_Text, progress_bar_main_var, save_format):
617
+ def get_diff(a, b):
618
+ corr = np.correlate(a, b, "full")
619
+ diff = corr.argmax() - (b.shape[0] - 1)
620
+ return diff
621
+
622
+ progress_bar_main_var.set(10)
623
+
624
+ # read tracks
625
+ wav1, sr1 = librosa.load(file1, sr=44100, mono=False)
626
+ wav2, sr2 = librosa.load(file2, sr=44100, mono=False)
627
+ wav1 = wav1.transpose()
628
+ wav2 = wav2.transpose()
629
+
630
+ command_Text(f"Audio file shapes: {wav1.shape} / {wav2.shape}\n")
631
+
632
+ wav2_org = wav2.copy()
633
+ progress_bar_main_var.set(20)
634
+
635
+ command_Text("Processing files... \n")
636
+
637
+ # pick random position and get diff
638
+
639
+ counts = {} # counting up for each diff value
640
+ progress = 20
641
+
642
+ check_range = 64
643
+
644
+ base = (64 / check_range)
645
+
646
+ for i in range(check_range):
647
+ index = int(random.uniform(44100 * 2, min(wav1.shape[0], wav2.shape[0]) - 44100 * 2))
648
+ shift = int(random.uniform(-22050,+22050))
649
+ samp1 = wav1[index :index +44100, 0] # currently use left channel
650
+ samp2 = wav2[index+shift:index+shift+44100, 0]
651
+ progress += 1 * base
652
+ progress_bar_main_var.set(progress)
653
+ diff = get_diff(samp1, samp2)
654
+ diff -= shift
655
+
656
+ if abs(diff) < 22050:
657
+ if not diff in counts:
658
+ counts[diff] = 0
659
+ counts[diff] += 1
660
+
661
+ # use max counted diff value
662
+ max_count = 0
663
+ est_diff = 0
664
+ for diff in counts.keys():
665
+ if counts[diff] > max_count:
666
+ max_count = counts[diff]
667
+ est_diff = diff
668
+
669
+ command_Text(f"Estimated difference is {est_diff} (count: {max_count})\n")
670
+
671
+ progress_bar_main_var.set(90)
672
+
673
+ audio_files = []
674
+
675
+ def save_aligned_audio(wav2_aligned):
676
+ command_Text(f"Aligned File 2 with File 1.\n")
677
+ command_Text(f"Saving files... ")
678
+ sf.write(file2_aligned, normalize(wav2_aligned, is_normalization), sr2, subtype=wav_type_set)
679
+ save_format(file2_aligned)
680
+ min_len = min(wav1.shape[0], wav2_aligned.shape[0])
681
+ wav_sub = wav1[:min_len] - wav2_aligned[:min_len]
682
+ audio_files.append(file2_aligned)
683
+ return min_len, wav_sub
684
+
685
+ # make aligned track 2
686
+ if est_diff > 0:
687
+ wav2_aligned = np.append(np.zeros((est_diff, 2)), wav2_org, axis=0)
688
+ min_len, wav_sub = save_aligned_audio(wav2_aligned)
689
+ elif est_diff < 0:
690
+ wav2_aligned = wav2_org[-est_diff:]
691
+ min_len, wav_sub = save_aligned_audio(wav2_aligned)
692
+ else:
693
+ command_Text(f"Audio files already aligned.\n")
694
+ command_Text(f"Saving inverted track... ")
695
+ min_len = min(wav1.shape[0], wav2.shape[0])
696
+ wav_sub = wav1[:min_len] - wav2[:min_len]
697
+
698
+ wav_sub = np.clip(wav_sub, -1, +1)
699
+
700
+ sf.write(file_subtracted, normalize(wav_sub, is_normalization), sr1, subtype=wav_type_set)
701
+ save_format(file_subtracted)
702
+
703
+ progress_bar_main_var.set(95)
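
The helpers above can be driven directly from a short script. Below is a minimal, hypothetical usage sketch for `spec_effects` (not part of the diff): the file names are placeholders, and both inputs are assumed to be stereo and the same length at 44.1 kHz, since the per-bin magnitude comparison requires matching spectrogram shapes.

```python
import librosa
import soundfile as sf

# Load two stereo stems as (2, n_samples) float arrays (placeholder paths).
inst, _ = librosa.load("instrumental.wav", sr=44100, mono=False)
voc, _ = librosa.load("vocals.wav", sr=44100, mono=False)

# 'Min_Mag' keeps, per time-frequency bin, whichever input has the smaller
# magnitude, a common way to suppress residual bleed between stems.
blend = spec_effects([inst, voc], algorithm="Min_Mag")
sf.write("blend.wav", blend.T, 44100)
```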
uvr5/lib_v5/vr_network/__init__.py ADDED
@@ -0,0 +1 @@
+ # VR init.
uvr5/lib_v5/vr_network/layers.py ADDED
@@ -0,0 +1,143 @@
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ from lib_v5 import spec_utils
+
+ class Conv2DBNActiv(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+         super(Conv2DBNActiv, self).__init__()
+         self.conv = nn.Sequential(
+             nn.Conv2d(
+                 nin, nout,
+                 kernel_size=ksize,
+                 stride=stride,
+                 padding=pad,
+                 dilation=dilation,
+                 bias=False),
+             nn.BatchNorm2d(nout),
+             activ()
+         )
+
+     def __call__(self, x):
+         return self.conv(x)
+
+ class SeperableConv2DBNActiv(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+         super(SeperableConv2DBNActiv, self).__init__()
+         self.conv = nn.Sequential(
+             nn.Conv2d(
+                 nin, nin,
+                 kernel_size=ksize,
+                 stride=stride,
+                 padding=pad,
+                 dilation=dilation,
+                 groups=nin,
+                 bias=False),
+             nn.Conv2d(
+                 nin, nout,
+                 kernel_size=1,
+                 bias=False),
+             nn.BatchNorm2d(nout),
+             activ()
+         )
+
+     def __call__(self, x):
+         return self.conv(x)
+
+
+ class Encoder(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+         super(Encoder, self).__init__()
+         self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+         self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+     def __call__(self, x):
+         skip = self.conv1(x)
+         h = self.conv2(skip)
+
+         return h, skip
+
+
+ class Decoder(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+         super(Decoder, self).__init__()
+         self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+         self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+     def __call__(self, x, skip=None):
+         x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+         if skip is not None:
+             skip = spec_utils.crop_center(skip, x)
+             x = torch.cat([x, skip], dim=1)
+         h = self.conv(x)
+
+         if self.dropout is not None:
+             h = self.dropout(h)
+
+         return h
+
+
+ class ASPPModule(nn.Module):
+
+     def __init__(self, nn_architecture, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+         super(ASPPModule, self).__init__()
+         self.conv1 = nn.Sequential(
+             nn.AdaptiveAvgPool2d((1, None)),
+             Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+         )
+
+         self.nn_architecture = nn_architecture
+         self.six_layer = [129605]
+         self.seven_layer = [537238, 537227, 33966]
+
+         extra_conv = SeperableConv2DBNActiv(
+             nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+
+         self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+         self.conv3 = SeperableConv2DBNActiv(
+             nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
+         self.conv4 = SeperableConv2DBNActiv(
+             nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
+         self.conv5 = SeperableConv2DBNActiv(
+             nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
+
+         if self.nn_architecture in self.six_layer:
+             self.conv6 = extra_conv
+             nin_x = 6
+         elif self.nn_architecture in self.seven_layer:
+             self.conv6 = extra_conv
+             self.conv7 = extra_conv
+             nin_x = 7
+         else:
+             nin_x = 5
+
+         self.bottleneck = nn.Sequential(
+             Conv2DBNActiv(nin * nin_x, nout, 1, 1, 0, activ=activ),
+             nn.Dropout2d(0.1)
+         )
+
+     def forward(self, x):
+         _, _, h, w = x.size()
+         feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+         feat2 = self.conv2(x)
+         feat3 = self.conv3(x)
+         feat4 = self.conv4(x)
+         feat5 = self.conv5(x)
+
+         if self.nn_architecture in self.six_layer:
+             feat6 = self.conv6(x)
+             out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6), dim=1)
+         elif self.nn_architecture in self.seven_layer:
+             feat6 = self.conv6(x)
+             feat7 = self.conv7(x)
+             out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
+         else:
+             out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+
+         bottle = self.bottleneck(out)
+         return bottle
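
A quick shape check for the blocks above (illustrative only; `123821` is an arbitrary `nn_architecture` id chosen so that neither the six- nor seven-branch ASPP variant is selected):

```python
import torch

enc = Encoder(2, 32, ksize=3, stride=2, pad=1)
aspp = ASPPModule(123821, 32, 64)   # falls through to the 5-branch path

x = torch.randn(1, 2, 256, 128)     # (batch, channels, freq bins, frames)
h, skip = enc(x)                    # conv2's stride=2 halves both spatial dims
out = aspp(h)
print(h.shape, skip.shape, out.shape)
# torch.Size([1, 32, 128, 64]) torch.Size([1, 32, 256, 128]) torch.Size([1, 64, 128, 64])
```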
uvr5/lib_v5/vr_network/layers_new.py ADDED
@@ -0,0 +1,126 @@
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ from lib_v5 import spec_utils
+
+ class Conv2DBNActiv(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+         super(Conv2DBNActiv, self).__init__()
+         self.conv = nn.Sequential(
+             nn.Conv2d(
+                 nin, nout,
+                 kernel_size=ksize,
+                 stride=stride,
+                 padding=pad,
+                 dilation=dilation,
+                 bias=False),
+             nn.BatchNorm2d(nout),
+             activ()
+         )
+
+     def __call__(self, x):
+         return self.conv(x)
+
+ class Encoder(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+         super(Encoder, self).__init__()
+         self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
+         self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+
+     def __call__(self, x):
+         h = self.conv1(x)
+         h = self.conv2(h)
+
+         return h
+
+
+ class Decoder(nn.Module):
+
+     def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+         super(Decoder, self).__init__()
+         self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+         # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+         self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+     def __call__(self, x, skip=None):
+         x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+
+         if skip is not None:
+             skip = spec_utils.crop_center(skip, x)
+             x = torch.cat([x, skip], dim=1)
+
+         h = self.conv1(x)
+         # h = self.conv2(h)
+
+         if self.dropout is not None:
+             h = self.dropout(h)
+
+         return h
+
+
+ class ASPPModule(nn.Module):
+
+     def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
+         super(ASPPModule, self).__init__()
+         self.conv1 = nn.Sequential(
+             nn.AdaptiveAvgPool2d((1, None)),
+             Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
+         )
+         self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
+         self.conv3 = Conv2DBNActiv(
+             nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
+         )
+         self.conv4 = Conv2DBNActiv(
+             nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
+         )
+         self.conv5 = Conv2DBNActiv(
+             nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
+         )
+         self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
+         self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+     def forward(self, x):
+         _, _, h, w = x.size()
+         feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+         feat2 = self.conv2(x)
+         feat3 = self.conv3(x)
+         feat4 = self.conv4(x)
+         feat5 = self.conv5(x)
+         out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+         out = self.bottleneck(out)
+
+         if self.dropout is not None:
+             out = self.dropout(out)
+
+         return out
+
+
+ class LSTMModule(nn.Module):
+
+     def __init__(self, nin_conv, nin_lstm, nout_lstm):
+         super(LSTMModule, self).__init__()
+         self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
+         self.lstm = nn.LSTM(
+             input_size=nin_lstm,
+             hidden_size=nout_lstm // 2,
+             bidirectional=True
+         )
+         self.dense = nn.Sequential(
+             nn.Linear(nout_lstm, nin_lstm),
+             nn.BatchNorm1d(nin_lstm),
+             nn.ReLU()
+         )
+
+     def forward(self, x):
+         N, _, nbins, nframes = x.size()
+         h = self.conv(x)[:, 0]  # N, nbins, nframes
+         h = h.permute(2, 0, 1)  # nframes, N, nbins
+         h, _ = self.lstm(h)
+         h = self.dense(h.reshape(-1, h.size()[-1]))  # nframes * N, nbins
+         h = h.reshape(nframes, N, 1, nbins)
+         h = h.permute(1, 2, 3, 0)
+
+         return h
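
`LSTMModule` collapses the channel axis to a single feature map, runs a bidirectional LSTM along the frame axis, and projects each frame back to the bin count, so the output always has one channel. A shape sketch with arbitrary sizes:

```python
import torch

m = LSTMModule(nin_conv=16, nin_lstm=128, nout_lstm=256)
x = torch.randn(4, 16, 128, 32)   # (N, channels, nbins, nframes)
y = m(x)
print(y.shape)                    # torch.Size([4, 1, 128, 32])
```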
uvr5/lib_v5/vr_network/model_param_init.py ADDED
@@ -0,0 +1,59 @@
+ import json
+ import pathlib
+
+ default_param = {}
+ default_param['bins'] = 768
+ default_param['unstable_bins'] = 9  # training only
+ default_param['reduction_bins'] = 762  # training only
+ default_param['sr'] = 44100
+ default_param['pre_filter_start'] = 757
+ default_param['pre_filter_stop'] = 768
+ default_param['band'] = {}
+
+
+ default_param['band'][1] = {
+     'sr': 11025,
+     'hl': 128,
+     'n_fft': 960,
+     'crop_start': 0,
+     'crop_stop': 245,
+     'lpf_start': 61,  # inference only
+     'res_type': 'polyphase'
+ }
+
+ default_param['band'][2] = {
+     'sr': 44100,
+     'hl': 512,
+     'n_fft': 1536,
+     'crop_start': 24,
+     'crop_stop': 547,
+     'hpf_start': 81,  # inference only
+     'res_type': 'sinc_best'
+ }
+
+
+ def int_keys(d):
+     r = {}
+     for k, v in d:
+         if k.isdigit():
+             k = int(k)
+         r[k] = v
+     return r
+
+
+ class ModelParameters(object):
+     def __init__(self, config_path=''):
+         if '.pth' == pathlib.Path(config_path).suffix:
+             import zipfile
+
+             with zipfile.ZipFile(config_path, 'r') as zip:
+                 self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys)
+         elif '.json' == pathlib.Path(config_path).suffix:
+             with open(config_path, 'r') as f:
+                 self.param = json.loads(f.read(), object_pairs_hook=int_keys)
+         else:
+             self.param = default_param
+
+         for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
+             if not k in self.param:
+                 self.param[k] = False
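
Loading one of the bundled parameter files looks like the sketch below (the path is assumed relative to the repository root; adjust to taste). `int_keys` turns the JSON's string band keys into integers, and any missing stereo/mid-side flags are back-filled with `False`:

```python
mp = ModelParameters('uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl512.json')

for d, bp in mp.param['band'].items():
    print(d, bp['sr'], bp['n_fft'], bp['hl'])     # 1 44100 2048 512
print(mp.param['mid_side'], mp.param['reverse'])  # False False
```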
uvr5/lib_v5/vr_network/modelparams/1band_sr16000_hl512.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "bins": 1024,
+     "unstable_bins": 0,
+     "reduction_bins": 0,
+     "band": {
+         "1": {
+             "sr": 16000,
+             "hl": 512,
+             "n_fft": 2048,
+             "crop_start": 0,
+             "crop_stop": 1024,
+             "hpf_start": -1,
+             "res_type": "sinc_best"
+         }
+     },
+     "sr": 16000,
+     "pre_filter_start": 1023,
+     "pre_filter_stop": 1024
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr32000_hl512.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "bins": 1024,
+     "unstable_bins": 0,
+     "reduction_bins": 0,
+     "band": {
+         "1": {
+             "sr": 32000,
+             "hl": 512,
+             "n_fft": 2048,
+             "crop_start": 0,
+             "crop_stop": 1024,
+             "hpf_start": -1,
+             "res_type": "kaiser_fast"
+         }
+     },
+     "sr": 32000,
+     "pre_filter_start": 1000,
+     "pre_filter_stop": 1021
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr33075_hl384.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "bins": 1024,
+     "unstable_bins": 0,
+     "reduction_bins": 0,
+     "band": {
+         "1": {
+             "sr": 33075,
+             "hl": 384,
+             "n_fft": 2048,
+             "crop_start": 0,
+             "crop_stop": 1024,
+             "hpf_start": -1,
+             "res_type": "sinc_best"
+         }
+     },
+     "sr": 33075,
+     "pre_filter_start": 1000,
+     "pre_filter_stop": 1021
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "bins": 1024,
+     "unstable_bins": 0,
+     "reduction_bins": 0,
+     "band": {
+         "1": {
+             "sr": 44100,
+             "hl": 1024,
+             "n_fft": 2048,
+             "crop_start": 0,
+             "crop_stop": 1024,
+             "hpf_start": -1,
+             "res_type": "sinc_best"
+         }
+     },
+     "sr": 44100,
+     "pre_filter_start": 1023,
+     "pre_filter_stop": 1024
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl256.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "bins": 256,
+     "unstable_bins": 0,
+     "reduction_bins": 0,
+     "band": {
+         "1": {
+             "sr": 44100,
+             "hl": 256,
+             "n_fft": 512,
+             "crop_start": 0,
+             "crop_stop": 256,
+             "hpf_start": -1,
+             "res_type": "sinc_best"
+         }
+     },
+     "sr": 44100,
+     "pre_filter_start": 256,
+     "pre_filter_stop": 256
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl512.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "bins": 1024,
+     "unstable_bins": 0,
+     "reduction_bins": 0,
+     "band": {
+         "1": {
+             "sr": 44100,
+             "hl": 512,
+             "n_fft": 2048,
+             "crop_start": 0,
+             "crop_stop": 1024,
+             "hpf_start": -1,
+             "res_type": "sinc_best"
+         }
+     },
+     "sr": 44100,
+     "pre_filter_start": 1023,
+     "pre_filter_stop": 1024
+ }
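
All of these single-band configs follow the same pattern: `crop_stop` equals `n_fft // 2` (the number of STFT bins kept), so the frequency resolution and hop duration follow directly from `sr`, `n_fft`, and `hl`. A quick sanity check over three of the files:

```python
for name, sr, n_fft, hl in [
    ("1band_sr16000_hl512", 16000, 2048, 512),
    ("1band_sr44100_hl512", 44100, 2048, 512),
    ("1band_sr44100_hl256", 44100, 512, 256),
]:
    print(f"{name}: {sr / n_fft:.1f} Hz per bin, hop = {1000 * hl / sr:.2f} ms")
```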