1f commited on
Commit
f65aa03
·
verified ·
1 Parent(s): 8470646

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/logger/neptune.yaml +9 -0
  2. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/logger/tensorboard.yaml +10 -0
  3. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/logger/wandb.yaml +16 -0
  4. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/cfm/default.yaml +3 -0
  5. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/decoder/default.yaml +7 -0
  6. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/encoder/default.yaml +18 -0
  7. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/matcha.yaml +15 -0
  8. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml +4 -0
  9. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/paths/default.yaml +18 -0
  10. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/cpu.yaml +5 -0
  11. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/ddp.yaml +9 -0
  12. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml +7 -0
  13. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/default.yaml +20 -0
  14. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/gpu.yaml +5 -0
  15. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/mps.yaml +5 -0
  16. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/VERSION +1 -0
  17. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/__init__.py +0 -0
  18. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/app.py +357 -0
  19. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/cli.py +418 -0
  20. r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/train.py +122 -0
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/logger/neptune.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # https://neptune.ai
2
+
3
+ neptune:
4
+ _target_: lightning.pytorch.loggers.neptune.NeptuneLogger
5
+ api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable
6
+ project: username/lightning-hydra-template
7
+ # name: ""
8
+ log_model_checkpoints: True
9
+ prefix: ""
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/logger/tensorboard.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://www.tensorflow.org/tensorboard/
2
+
3
+ tensorboard:
4
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
5
+ save_dir: "${paths.output_dir}/tensorboard/"
6
+ name: null
7
+ log_graph: False
8
+ default_hp_metric: True
9
+ prefix: ""
10
+ # version: ""
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/logger/wandb.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://wandb.ai
2
+
3
+ wandb:
4
+ _target_: lightning.pytorch.loggers.wandb.WandbLogger
5
+ # name: "" # name of the run (normally generated by wandb)
6
+ save_dir: "${paths.output_dir}"
7
+ offline: False
8
+ id: null # pass correct id to resume experiment!
9
+ anonymous: null # enable anonymous logging
10
+ project: "lightning-hydra-template"
11
+ log_model: False # upload lightning ckpts
12
+ prefix: "" # a string to put at the beginning of metric keys
13
+ # entity: "" # set to name of your wandb team
14
+ group: ""
15
+ tags: []
16
+ job_type: ""
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/cfm/default.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: CFM
2
+ solver: euler
3
+ sigma_min: 1e-4
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/decoder/default.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ channels: [256, 256]
2
+ dropout: 0.05
3
+ attention_head_dim: 64
4
+ n_blocks: 1
5
+ num_mid_blocks: 2
6
+ num_heads: 2
7
+ act_fn: snakebeta
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/encoder/default.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ encoder_type: RoPE Encoder
2
+ encoder_params:
3
+ n_feats: ${model.n_feats}
4
+ n_channels: 192
5
+ filter_channels: 768
6
+ filter_channels_dp: 256
7
+ n_heads: 2
8
+ n_layers: 6
9
+ kernel_size: 3
10
+ p_dropout: 0.1
11
+ spk_emb_dim: 64
12
+ n_spks: 1
13
+ prenet: true
14
+
15
+ duration_predictor_params:
16
+ filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp}
17
+ kernel_size: 3
18
+ p_dropout: ${model.encoder.encoder_params.p_dropout}
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/matcha.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _self_
3
+ - encoder: default.yaml
4
+ - decoder: default.yaml
5
+ - cfm: default.yaml
6
+ - optimizer: adam.yaml
7
+
8
+ _target_: matcha.models.matcha_tts.MatchaTTS
9
+ n_vocab: 178
10
+ n_spks: ${data.n_spks}
11
+ spk_emb_dim: 64
12
+ n_feats: 80
13
+ data_statistics: ${data.data_statistics}
14
+ out_size: null # Must be divisible by 4
15
+ prior_loss: true
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _target_: torch.optim.Adam
2
+ _partial_: true
3
+ lr: 1e-4
4
+ weight_decay: 0.0
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/paths/default.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # path to root directory
2
+ # this requires PROJECT_ROOT environment variable to exist
3
+ # you can replace it with "." if you want the root to be the current working directory
4
+ root_dir: ${oc.env:PROJECT_ROOT}
5
+
6
+ # path to data directory
7
+ data_dir: ${paths.root_dir}/data/
8
+
9
+ # path to logging directory
10
+ log_dir: ${paths.root_dir}/logs/
11
+
12
+ # path to output directory, created dynamically by hydra
13
+ # path generation pattern is specified in `configs/hydra/default.yaml`
14
+ # use it to store all files generated during the run, like ckpts and metrics
15
+ output_dir: ${hydra:runtime.output_dir}
16
+
17
+ # path to working directory
18
+ work_dir: ${hydra:runtime.cwd}
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/cpu.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ defaults:
2
+ - default
3
+
4
+ accelerator: cpu
5
+ devices: 1
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/ddp.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - default
3
+
4
+ strategy: ddp
5
+
6
+ accelerator: gpu
7
+ devices: [0,1]
8
+ num_nodes: 1
9
+ sync_batchnorm: True
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - default
3
+
4
+ # simulate DDP on CPU, useful for debugging
5
+ accelerator: cpu
6
+ devices: 2
7
+ strategy: ddp_spawn
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/default.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: lightning.pytorch.trainer.Trainer
2
+
3
+ default_root_dir: ${paths.output_dir}
4
+
5
+ max_epochs: -1
6
+
7
+ accelerator: gpu
8
+ devices: [0]
9
+
10
+ # mixed precision for extra speed-up
11
+ precision: 16-mixed
12
+
13
+ # perform a validation loop every N training epochs
14
+ check_val_every_n_epoch: 1
15
+
16
+ # set True to ensure deterministic results
17
+ # makes training slower but gives more reproducibility than just setting seeds
18
+ deterministic: False
19
+
20
+ gradient_clip_val: 5.0
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/gpu.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ defaults:
2
+ - default
3
+
4
+ accelerator: gpu
5
+ devices: 1
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/configs/trainer/mps.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ defaults:
2
+ - default
3
+
4
+ accelerator: mps
5
+ devices: 1
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.0.5.1
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/__init__.py ADDED
File without changes
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/app.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ from argparse import Namespace
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+ import soundfile as sf
7
+ import torch
8
+
9
+ from matcha.cli import (
10
+ MATCHA_URLS,
11
+ VOCODER_URLS,
12
+ assert_model_downloaded,
13
+ get_device,
14
+ load_matcha,
15
+ load_vocoder,
16
+ process_text,
17
+ to_waveform,
18
+ )
19
+ from matcha.utils.utils import get_user_data_dir, plot_tensor
20
+
21
+ LOCATION = Path(get_user_data_dir())
22
+
23
+ args = Namespace(
24
+ cpu=False,
25
+ model="matcha_vctk",
26
+ vocoder="hifigan_univ_v1",
27
+ spk=0,
28
+ )
29
+
30
+ CURRENTLY_LOADED_MODEL = args.model
31
+
32
+
33
def MATCHA_TTS_LOC(x):
    """Return the path of the Matcha-TTS checkpoint named ``x`` under ``LOCATION``."""
    checkpoint_file = f"{x}.ckpt"
    return LOCATION / checkpoint_file
35
+
36
+
37
def VOCODER_LOC(x):
    """Return the path of the vocoder checkpoint named ``x`` under ``LOCATION``."""
    vocoder_file = f"{x}"
    return LOCATION / vocoder_file
39
+
40
+
41
+ LOGO_URL = "https://shivammehta25.github.io/Matcha-TTS/images/logo.png"
42
+ RADIO_OPTIONS = {
43
+ "Multi Speaker (VCTK)": {
44
+ "model": "matcha_vctk",
45
+ "vocoder": "hifigan_univ_v1",
46
+ },
47
+ "Single Speaker (LJ Speech)": {
48
+ "model": "matcha_ljspeech",
49
+ "vocoder": "hifigan_T2_v1",
50
+ },
51
+ }
52
+
53
+ # Ensure all the required models are downloaded
54
+ assert_model_downloaded(MATCHA_TTS_LOC("matcha_ljspeech"), MATCHA_URLS["matcha_ljspeech"])
55
+ assert_model_downloaded(VOCODER_LOC("hifigan_T2_v1"), VOCODER_URLS["hifigan_T2_v1"])
56
+ assert_model_downloaded(MATCHA_TTS_LOC("matcha_vctk"), MATCHA_URLS["matcha_vctk"])
57
+ assert_model_downloaded(VOCODER_LOC("hifigan_univ_v1"), VOCODER_URLS["hifigan_univ_v1"])
58
+
59
+ device = get_device(args)
60
+
61
+ # Load default model
62
+ model = load_matcha(args.model, MATCHA_TTS_LOC(args.model), device)
63
+ vocoder, denoiser = load_vocoder(args.vocoder, VOCODER_LOC(args.vocoder), device)
64
+
65
+
66
def load_model(model_name, vocoder_name):
    """Load a Matcha-TTS model together with its vocoder and denoiser.

    Both checkpoints are looked up via ``MATCHA_TTS_LOC`` / ``VOCODER_LOC`` and
    loaded onto the module-level ``device``.

    Returns:
        Tuple ``(model, vocoder, denoiser)``.
    """
    acoustic_model = load_matcha(model_name, MATCHA_TTS_LOC(model_name), device)
    loaded_vocoder, loaded_denoiser = load_vocoder(vocoder_name, VOCODER_LOC(vocoder_name), device)
    return acoustic_model, loaded_vocoder, loaded_denoiser
70
+
71
+
72
def load_model_ui(model_type, textbox):
    """Switch the globally loaded model to the one chosen in the UI.

    Args:
        model_type: Key into ``RADIO_OPTIONS`` selecting the model/vocoder pair.
        textbox: Current text value; passed through unchanged.

    Returns:
        A 6-tuple: the unchanged text, an update re-enabling interaction, the
        speaker-slider update, visibility updates for the single-speaker and
        multi-speaker example rows, and the length-scale update.
    """
    global model, vocoder, denoiser, CURRENTLY_LOADED_MODEL  # pylint: disable=global-statement

    choice = RADIO_OPTIONS[model_type]
    model_name, vocoder_name = choice["model"], choice["vocoder"]

    # Only reload when the selection actually differs from what is in memory.
    if CURRENTLY_LOADED_MODEL != model_name:
        model, vocoder, denoiser = load_model(model_name, vocoder_name)
        CURRENTLY_LOADED_MODEL = model_name

    single_speaker = model_name == "matcha_ljspeech"
    if single_speaker:
        # -1 is the value synthesise_mel treats as "no speaker embedding".
        slider_value, default_length_scale = -1, 0.95
    else:
        slider_value, default_length_scale = 0, 0.85

    return (
        textbox,
        gr.update(interactive=True),
        gr.update(visible=not single_speaker, value=slider_value),
        gr.update(visible=single_speaker),
        gr.update(visible=not single_speaker),
        gr.update(value=default_length_scale),
    )
99
+
100
+
101
@torch.inference_mode()
def process_text_gradio(text):
    """Run ``process_text`` for the UI and unpack the pieces Gradio needs.

    Returns:
        Tuple of (every second entry of the phonetised text starting at index 1,
        the token tensor ``x``, the length tensor ``x_lengths``).
    """
    processed = process_text(1, text, device)
    phones = processed["x_phones"][1::2]
    return phones, processed["x"], processed["x_lengths"]
105
+
106
+
107
@torch.inference_mode()
def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk):
    """Synthesise a mel-spectrogram and waveform with the globally loaded model.

    Args:
        text: Token tensor passed to ``model.synthesise``.
        text_length: Length tensor passed to ``model.synthesise``.
        n_timesteps: Number of ODE steps.
        temperature: Sampling temperature.
        length_scale: Speaking-rate scale.
        spk: Speaker id; a negative value means no speaker tensor is passed.

    Returns:
        Tuple of (path of the temporary ``.wav`` file, result of ``plot_tensor``
        on the mel-spectrogram).
    """
    if spk >= 0:
        spk = torch.tensor([spk], device=device, dtype=torch.long)
    else:
        spk = None

    output = model.synthesise(
        text,
        text_length,
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=spk,
        length_scale=length_scale,
    )
    output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)

    # delete=False keeps the file on disk after the handle closes so the caller can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_path = wav_file.name
        sf.write(wav_path, output["waveform"], 22050, "PCM_24")

    mel_plot = plot_tensor(output["mel"].squeeze().cpu().numpy())
    return wav_path, mel_plot
123
+
124
+
125
def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scale, spk):
    """Synthesise one multi-speaker example, loading ``matcha_vctk`` first if needed.

    Returns:
        Tuple ``(phones, audio, mel_spectrogram)``.
    """
    global model, vocoder, denoiser, CURRENTLY_LOADED_MODEL  # pylint: disable=global-statement

    required_model = "matcha_vctk"
    if CURRENTLY_LOADED_MODEL != required_model:
        model, vocoder, denoiser = load_model("matcha_vctk", "hifigan_univ_v1")
        CURRENTLY_LOADED_MODEL = "matcha_vctk"

    phones, tokens, token_lengths = process_text_gradio(text)
    audio, mel_spectrogram = synthesise_mel(tokens, token_lengths, n_timesteps, mel_temp, length_scale, spk)
    return phones, audio, mel_spectrogram
135
+
136
+
137
def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1):
    """Synthesise one single-speaker example, loading ``matcha_ljspeech`` first if needed.

    ``spk`` defaults to -1, which ``synthesise_mel`` treats as "no speaker tensor".

    Returns:
        Tuple ``(phones, audio, mel_spectrogram)``.
    """
    global model, vocoder, denoiser, CURRENTLY_LOADED_MODEL  # pylint: disable=global-statement

    required_model = "matcha_ljspeech"
    if CURRENTLY_LOADED_MODEL != required_model:
        model, vocoder, denoiser = load_model("matcha_ljspeech", "hifigan_T2_v1")
        CURRENTLY_LOADED_MODEL = "matcha_ljspeech"

    phones, tokens, token_lengths = process_text_gradio(text)
    audio, mel_spectrogram = synthesise_mel(tokens, token_lengths, n_timesteps, mel_temp, length_scale, spk)
    return phones, audio, mel_spectrogram
147
+
148
+
149
+ def main():
150
+ description = """# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching
151
+ ### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)
152
+ We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis. Our method:
153
+
154
+
155
+ * Is probabilistic
156
+ * Has compact memory footprint
157
+ * Sounds highly natural
158
+ * Is very fast to synthesise from
159
+
160
+
161
+ Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS). Read our [arXiv preprint for more details](https://arxiv.org/abs/2309.03199).
162
+ Code is available in our [GitHub repository](https://github.com/shivammehta25/Matcha-TTS), along with pre-trained models.
163
+
164
+ Cached examples are available at the bottom of the page.
165
+ """
166
+
167
+ with gr.Blocks(title="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching") as demo:
168
+ processed_text = gr.State(value=None)
169
+ processed_text_len = gr.State(value=None)
170
+
171
+ with gr.Box():
172
+ with gr.Row():
173
+ gr.Markdown(description, scale=3)
174
+ with gr.Column():
175
+ gr.Image(LOGO_URL, label="Matcha-TTS logo", height=50, width=50, scale=1, show_label=False)
176
+ html = '<br><iframe width="560" height="315" src="https://www.youtube.com/embed/xmvJkz3bqw0?si=jN7ILyDsbPwJCGoa" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
177
+ gr.HTML(html)
178
+
179
+ with gr.Box():
180
+ radio_options = list(RADIO_OPTIONS.keys())
181
+ model_type = gr.Radio(
182
+ radio_options, value=radio_options[0], label="Choose a Model", interactive=True, container=False
183
+ )
184
+
185
+ with gr.Row():
186
+ gr.Markdown("# Text Input")
187
+ with gr.Row():
188
+ text = gr.Textbox(value="", lines=2, label="Text to synthesise", scale=3)
189
+ spk_slider = gr.Slider(
190
+ minimum=0, maximum=107, step=1, value=args.spk, label="Speaker ID", interactive=True, scale=1
191
+ )
192
+
193
+ with gr.Row():
194
+ gr.Markdown("### Hyper parameters")
195
+ with gr.Row():
196
+ n_timesteps = gr.Slider(
197
+ label="Number of ODE steps",
198
+ minimum=1,
199
+ maximum=100,
200
+ step=1,
201
+ value=10,
202
+ interactive=True,
203
+ )
204
+ length_scale = gr.Slider(
205
+ label="Length scale (Speaking rate)",
206
+ minimum=0.5,
207
+ maximum=1.5,
208
+ step=0.05,
209
+ value=1.0,
210
+ interactive=True,
211
+ )
212
+ mel_temp = gr.Slider(
213
+ label="Sampling temperature",
214
+ minimum=0.00,
215
+ maximum=2.001,
216
+ step=0.16675,
217
+ value=0.667,
218
+ interactive=True,
219
+ )
220
+
221
+ synth_btn = gr.Button("Synthesise")
222
+
223
+ with gr.Box():
224
+ with gr.Row():
225
+ gr.Markdown("### Phonetised text")
226
+ phonetised_text = gr.Textbox(interactive=False, scale=10, label="Phonetised text")
227
+
228
+ with gr.Box():
229
+ with gr.Row():
230
+ mel_spectrogram = gr.Image(interactive=False, label="mel spectrogram")
231
+
232
+ # with gr.Row():
233
+ audio = gr.Audio(interactive=False, label="Audio")
234
+
235
+ with gr.Row(visible=False) as example_row_lj_speech:
236
+ examples = gr.Examples( # pylint: disable=unused-variable
237
+ examples=[
238
+ [
239
+ "We propose Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up O D E-based speech synthesis.",
240
+ 50,
241
+ 0.677,
242
+ 0.95,
243
+ ],
244
+ [
245
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
246
+ 2,
247
+ 0.677,
248
+ 0.95,
249
+ ],
250
+ [
251
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
252
+ 4,
253
+ 0.677,
254
+ 0.95,
255
+ ],
256
+ [
257
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
258
+ 10,
259
+ 0.677,
260
+ 0.95,
261
+ ],
262
+ [
263
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
264
+ 50,
265
+ 0.677,
266
+ 0.95,
267
+ ],
268
+ [
269
+ "The narrative of these events is based largely on the recollections of the participants.",
270
+ 10,
271
+ 0.677,
272
+ 0.95,
273
+ ],
274
+ [
275
+ "The jury did not believe him, and the verdict was for the defendants.",
276
+ 10,
277
+ 0.677,
278
+ 0.95,
279
+ ],
280
+ ],
281
+ fn=ljspeech_example_cacher,
282
+ inputs=[text, n_timesteps, mel_temp, length_scale],
283
+ outputs=[phonetised_text, audio, mel_spectrogram],
284
+ cache_examples=True,
285
+ )
286
+
287
+ with gr.Row() as example_row_multispeaker:
288
+ multi_speaker_examples = gr.Examples( # pylint: disable=unused-variable
289
+ examples=[
290
+ [
291
+ "Hello everyone! I am speaker 0 and I am here to tell you that Matcha-TTS is amazing!",
292
+ 10,
293
+ 0.677,
294
+ 0.85,
295
+ 0,
296
+ ],
297
+ [
298
+ "Hello everyone! I am speaker 16 and I am here to tell you that Matcha-TTS is amazing!",
299
+ 10,
300
+ 0.677,
301
+ 0.85,
302
+ 16,
303
+ ],
304
+ [
305
+ "Hello everyone! I am speaker 44 and I am here to tell you that Matcha-TTS is amazing!",
306
+ 50,
307
+ 0.677,
308
+ 0.85,
309
+ 44,
310
+ ],
311
+ [
312
+ "Hello everyone! I am speaker 45 and I am here to tell you that Matcha-TTS is amazing!",
313
+ 50,
314
+ 0.677,
315
+ 0.85,
316
+ 45,
317
+ ],
318
+ [
319
+ "Hello everyone! I am speaker 58 and I am here to tell you that Matcha-TTS is amazing!",
320
+ 4,
321
+ 0.677,
322
+ 0.85,
323
+ 58,
324
+ ],
325
+ ],
326
+ fn=multispeaker_example_cacher,
327
+ inputs=[text, n_timesteps, mel_temp, length_scale, spk_slider],
328
+ outputs=[phonetised_text, audio, mel_spectrogram],
329
+ cache_examples=True,
330
+ label="Multi Speaker Examples",
331
+ )
332
+
333
+ model_type.change(lambda x: gr.update(interactive=False), inputs=[synth_btn], outputs=[synth_btn]).then(
334
+ load_model_ui,
335
+ inputs=[model_type, text],
336
+ outputs=[text, synth_btn, spk_slider, example_row_lj_speech, example_row_multispeaker, length_scale],
337
+ )
338
+
339
+ synth_btn.click(
340
+ fn=process_text_gradio,
341
+ inputs=[
342
+ text,
343
+ ],
344
+ outputs=[phonetised_text, processed_text, processed_text_len],
345
+ api_name="matcha_tts",
346
+ queue=True,
347
+ ).then(
348
+ fn=synthesise_mel,
349
+ inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale, spk_slider],
350
+ outputs=[audio, mel_spectrogram],
351
+ )
352
+
353
+ demo.queue().launch(share=True)
354
+
355
+
356
+ if __name__ == "__main__":
357
+ main()
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/cli.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import datetime as dt
3
+ import os
4
+ import warnings
5
+ from pathlib import Path
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+
12
+ from matcha.hifigan.config import v1
13
+ from matcha.hifigan.denoiser import Denoiser
14
+ from matcha.hifigan.env import AttrDict
15
+ from matcha.hifigan.models import Generator as HiFiGAN
16
+ from matcha.models.matcha_tts import MatchaTTS
17
+ from matcha.text import sequence_to_text, text_to_sequence
18
+ from matcha.utils.utils import assert_model_downloaded, get_user_data_dir, intersperse
19
+
20
+ MATCHA_URLS = {
21
+ "matcha_ljspeech": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_ljspeech.ckpt",
22
+ "matcha_vctk": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_vctk.ckpt",
23
+ }
24
+
25
+ VOCODER_URLS = {
26
+ "hifigan_T2_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/generator_v1", # Old url: https://drive.google.com/file/d/14NENd4equCBLyyCSke114Mv6YR_j_uFs/view?usp=drive_link
27
+ "hifigan_univ_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/g_02500000", # Old url: https://drive.google.com/file/d/1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW/view?usp=drive_link
28
+ }
29
+
30
+ MULTISPEAKER_MODEL = {
31
+ "matcha_vctk": {"vocoder": "hifigan_univ_v1", "speaking_rate": 0.85, "spk": 0, "spk_range": (0, 107)}
32
+ }
33
+
34
+ SINGLESPEAKER_MODEL = {"matcha_ljspeech": {"vocoder": "hifigan_T2_v1", "speaking_rate": 0.95, "spk": None}}
35
+
36
+
37
def plot_spectrogram_to_numpy(spectrogram, filename):
    """Render a mel-spectrogram as an image and save it to ``filename``.

    Args:
        spectrogram: 2-D array-like handed to ``imshow``
            (presumably channels x frames, going by the axis labels -- confirm with callers).
        filename: Destination passed to ``plt.savefig``.
    """
    fig, ax = plt.subplots(figsize=(12, 3))
    try:
        im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
        plt.colorbar(im, ax=ax)
        plt.xlabel("Frames")
        plt.ylabel("Channels")
        plt.title("Synthesised Mel-Spectrogram")
        fig.canvas.draw()
        plt.savefig(filename)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed; without this,
        # each call (one per synthesised utterance) would leak a figure.
        plt.close(fig)
46
+
47
+
48
def process_text(i: int, text: str, device: torch.device):
    """Convert raw text into the tensors used for synthesis.

    Args:
        i: Index used only as a prefix in the printed log lines.
        text: Input text.
        device: Device on which the tensors are created.

    Returns:
        dict with ``x_orig`` (the input text), ``x`` (token ids with a leading
        batch dimension), ``x_lengths`` (one-element length tensor) and
        ``x_phones`` (``sequence_to_text`` of the token ids).
    """
    print(f"[{i}] - Input text: {text}")

    # A 0 token is interspersed between the symbols produced by the text front-end.
    token_ids = intersperse(text_to_sequence(text, ["english_cleaners2"]), 0)
    x = torch.tensor(token_ids, dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)

    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(f"[{i}] - Phonetised text: {x_phones[1::2]}")

    return dict(x_orig=text, x=x, x_lengths=x_lengths, x_phones=x_phones)
60
+
61
+
62
def get_texts(args):
    """Return the list of texts to synthesise.

    A truthy ``args.text`` yields a single-element list; otherwise every line
    of ``args.file`` (read as UTF-8, line endings preserved) is returned.
    """
    if not args.text:
        with open(args.file, encoding="utf-8") as handle:
            return handle.readlines()
    return [args.text]
69
+
70
+
71
def assert_required_models_available(args):
    """Resolve the Matcha-TTS and vocoder checkpoint paths, downloading pretrained ones if needed.

    Args:
        args: Parsed arguments. Reads ``model``, ``vocoder`` and, when present,
            ``checkpoint_path``.

    Returns:
        dict with keys ``"matcha"`` and ``"vocoder"`` mapping to the checkpoint paths.
    """
    save_dir = get_user_data_dir()

    # A user-supplied checkpoint takes precedence; the pretrained model is only
    # resolved (and downloaded) when none was given. getattr also tolerates
    # argument objects that do not define ``checkpoint_path`` at all.
    custom_checkpoint = getattr(args, "checkpoint_path", None)
    if custom_checkpoint is not None:
        model_path = custom_checkpoint
    else:
        model_path = save_dir / f"{args.model}.ckpt"
        assert_model_downloaded(model_path, MATCHA_URLS[args.model])

    vocoder_path = save_dir / f"{args.vocoder}"
    assert_model_downloaded(vocoder_path, VOCODER_URLS[args.vocoder])
    return {"matcha": model_path, "vocoder": vocoder_path}
82
+
83
+
84
def load_hifigan(checkpoint_path, device):
    """Build a HiFi-GAN generator with the ``v1`` config and load its weights.

    The checkpoint is expected to hold the generator state under the
    ``"generator"`` key. The returned module is in eval mode with weight
    norm removed.
    """
    config = AttrDict(v1)
    generator = HiFiGAN(config).to(device)
    # NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint["generator"])
    generator.eval()
    generator.remove_weight_norm()
    return generator
91
+
92
+
93
def load_vocoder(vocoder_name, checkpoint_path, device):
    """Load the named vocoder and build its denoiser.

    Args:
        vocoder_name: ``"hifigan_T2_v1"`` or ``"hifigan_univ_v1"``.
        checkpoint_path: Path to the vocoder checkpoint.
        device: Device to load the vocoder onto.

    Returns:
        Tuple ``(vocoder, denoiser)``.

    Raises:
        NotImplementedError: If ``vocoder_name`` is not one of the supported names.
    """
    print(f"[!] Loading {vocoder_name}!")

    supported_hifigan = ("hifigan_T2_v1", "hifigan_univ_v1")
    if vocoder_name not in supported_hifigan:
        raise NotImplementedError(
            f"Vocoder {vocoder_name} not implemented! define a load_<<vocoder_name>> method for it"
        )

    vocoder = load_hifigan(checkpoint_path, device)
    denoiser = Denoiser(vocoder, mode="zeros")
    print(f"[+] {vocoder_name} loaded!")
    return vocoder, denoiser
106
+
107
+
108
def load_matcha(model_name, checkpoint_path, device):
    """Load a ``MatchaTTS`` checkpoint onto ``device`` and switch it to eval mode.

    ``model_name`` is used only in the printed progress messages.
    """
    print(f"[!] Loading {model_name}!")
    matcha = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)
    matcha.eval()

    print(f"[+] {model_name} loaded!")
    return matcha
115
+
116
+
117
def to_waveform(mel, vocoder, denoiser=None, denoiser_strength=0.00025):
    """Convert a mel-spectrogram to a waveform tensor on the CPU.

    Args:
        mel: Mel-spectrogram passed to ``vocoder``.
        vocoder: Callable mapping ``mel`` to an audio tensor.
        denoiser: Optional callable ``denoiser(audio, strength=...)`` applied to
            the squeezed vocoder output.
        denoiser_strength: Strength forwarded to ``denoiser``. The default keeps
            the previously hard-coded value, so existing callers are unaffected.

    Returns:
        The audio tensor, clamped to [-1, 1], moved to the CPU and squeezed.
    """
    audio = vocoder(mel).clamp(-1, 1)
    if denoiser is not None:
        audio = denoiser(audio.squeeze(), strength=denoiser_strength)

    # A single cpu()/squeeze() covers both the denoised and the raw path.
    return audio.cpu().squeeze()
123
+
124
+
125
def save_to_folder(filename: str, output: dict, folder: str):
    """Save the spectrogram plot, the mel array and the waveform of one utterance.

    Args:
        filename: Base name (without extension) used for all three files.
        output: Synthesis result; reads ``output["mel"]`` and ``output["waveform"]``.
        folder: Destination directory; created (with parents) if missing.

    Returns:
        Absolute path of the written ``.wav`` file.
    """
    folder = Path(folder)
    folder.mkdir(exist_ok=True, parents=True)

    mel = output["mel"]
    # Write the plot next to the .npy/.wav files instead of into the current
    # working directory, so all artefacts of an utterance land in ``folder``.
    plot_spectrogram_to_numpy(np.array(mel.squeeze().float().cpu()), folder / f"{filename}.png")
    np.save(folder / f"{filename}", mel.cpu().numpy())
    sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
    return folder.resolve() / f"{filename}.wav"
132
+
133
+
134
def validate_args(args):
    """Check the parsed CLI arguments and fill in model-dependent defaults.

    With no ``checkpoint_path`` the pretrained-model validators are applied;
    with a custom checkpoint only a vocoder hint is issued and the speaking
    rate falls back to 1.0.

    Returns:
        The (possibly updated) ``args``.

    Raises:
        AssertionError: If a required value is missing or out of range.
    """
    assert (
        args.text or args.file
    ), "Either text or file must be provided Matcha-T(ea)TTS need sometext to whisk the waveforms."
    assert args.temperature >= 0, "Sampling temperature cannot be negative"
    assert args.steps > 0, "Number of ODE steps must be greater than 0"

    using_custom_checkpoint = args.checkpoint_path is not None
    if using_custom_checkpoint:
        if args.vocoder != "hifigan_univ_v1":
            warnings.warn(
                "[-] Using custom model checkpoint! I would suggest passing --vocoder hifigan_univ_v1, unless the custom model is trained on LJ Speech.",
                UserWarning,
            )
        if args.speaking_rate is None:
            args.speaking_rate = 1.0
    else:
        # Pretrained models carry their own recommended vocoder / speaking rate / speaker.
        if args.model in SINGLESPEAKER_MODEL:
            args = validate_args_for_single_speaker_model(args)

        if args.model in MULTISPEAKER_MODEL:
            args = validate_args_for_multispeaker_model(args)

    if args.batched:
        assert args.batch_size > 0, "Batch size must be greater than 0"
    assert args.speaking_rate > 0, "Speaking rate must be greater than 0"

    return args
161
+
162
+
163
def validate_args_for_multispeaker_model(args):
    """Apply the ``MULTISPEAKER_MODEL`` defaults for ``args.model`` and validate the speaker id.

    Missing ``vocoder``, ``speaking_rate`` and ``spk`` values are filled from
    the model's entry; a non-recommended vocoder or a missing speaker id only
    triggers a ``UserWarning``.

    Returns:
        The updated ``args``.

    Raises:
        AssertionError: If ``args.spk`` lies outside the model's ``spk_range``.
    """
    preset = MULTISPEAKER_MODEL[args.model]

    if args.vocoder is None:
        args.vocoder = preset["vocoder"]
    elif args.vocoder != preset["vocoder"]:
        warnings.warn(
            f"[-] Using {args.model} model! I would suggest passing --vocoder {preset['vocoder']}",
            UserWarning,
        )

    if args.speaking_rate is None:
        args.speaking_rate = preset["speaking_rate"]

    spk_range = preset["spk_range"]
    if args.spk is None:
        available_spk_id = preset["spk"]
        warnings.warn(f"[!] Speaker ID not provided! Using speaker ID {available_spk_id}", UserWarning)
        args.spk = available_spk_id
    else:
        assert (
            args.spk >= spk_range[0] and args.spk <= spk_range[-1]
        ), f"Speaker ID must be between {spk_range} for this model."

    return args
186
+
187
+
188
def validate_args_for_single_speaker_model(args):
    """Apply the ``SINGLESPEAKER_MODEL`` defaults for ``args.model``.

    Missing ``vocoder`` and ``speaking_rate`` values are filled from the
    model's entry. A speaker id that differs from the entry's ``spk`` is
    overwritten with it after a ``UserWarning``.

    Returns:
        The updated ``args``.
    """
    preset = SINGLESPEAKER_MODEL[args.model]

    if args.vocoder is None:
        args.vocoder = preset["vocoder"]
    elif args.vocoder != preset["vocoder"]:
        warnings.warn(
            f"[-] Using {args.model} model! I would suggest passing --vocoder {preset['vocoder']}",
            UserWarning,
        )

    if args.speaking_rate is None:
        args.speaking_rate = preset["speaking_rate"]

    if args.spk != preset["spk"]:
        warnings.warn(f"[-] Ignoring speaker id {args.spk} for {args.model}", UserWarning)
        args.spk = preset["spk"]

    return args
205
+
206
+
207
def _build_cli_parser():
    """Create the argument parser for the Matcha-TTS command-line interface."""
    parser = argparse.ArgumentParser(
        description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching"
    )
    add = parser.add_argument

    add("--model", type=str, default="matcha_ljspeech", choices=MATCHA_URLS.keys(), help="Model to use")
    add("--checkpoint_path", type=str, default=None, help="Path to the custom model checkpoint")
    add(
        "--vocoder",
        type=str,
        default=None,
        choices=VOCODER_URLS.keys(),
        help="Vocoder to use (default: will use the one suggested with the pretrained model))",
    )
    add("--text", type=str, default=None, help="Text to synthesize")
    add("--file", type=str, default=None, help="Text file to synthesize")
    add("--spk", type=int, default=None, help="Speaker ID")
    add("--temperature", type=float, default=0.667, help="Variance of the x0 noise (default: 0.667)")
    add(
        "--speaking_rate",
        type=float,
        default=None,
        help="change the speaking rate, a higher value means slower speaking rate (default: 1.0)",
    )
    add("--steps", type=int, default=10, help="Number of ODE steps (default: 10)")
    add("--cpu", action="store_true", help="Use CPU for inference (default: use GPU if available)")
    add(
        "--denoiser_strength",
        type=float,
        default=0.00025,
        help="Strength of the vocoder bias denoiser (default: 0.00025)",
    )
    add(
        "--output_folder",
        type=str,
        default=os.getcwd(),
        help="Output folder to save results (default: current dir)",
    )
    add("--batched", action="store_true", help="Batched inference (default: False)")
    add("--batch_size", type=int, default=32, help="Batch size only useful when --batched (default: 32)")
    return parser


@torch.inference_mode()
def cli():
    """Command-line entry point: parse arguments, load the models and synthesise.

    The whole function runs under ``torch.inference_mode()``. After argument
    validation it resolves the model/vocoder paths, optionally swaps in a
    user-supplied checkpoint, and dispatches to batched or unbatched synthesis.
    """
    args = validate_args(_build_cli_parser().parse_args())
    device = get_device(args)
    print_config(args)
    paths = assert_required_models_available(args)

    # A custom checkpoint overrides the resolved acoustic-model path.
    custom_checkpoint = args.checkpoint_path
    if custom_checkpoint is not None:
        print(f"[🍵] Loading custom model from {custom_checkpoint}")
        paths["matcha"] = custom_checkpoint
        args.model = "custom_model"

    model = load_matcha(args.model, paths["matcha"], device)
    vocoder, denoiser = load_vocoder(args.vocoder, paths["vocoder"], device)

    texts = get_texts(args)

    if args.spk is None:
        spk = None
    else:
        spk = torch.tensor([args.spk], device=device, dtype=torch.long)

    # Batched synthesis is used only when requested and there is more than one text.
    if len(texts) == 1 or not args.batched:
        synthesise = unbatched_synthesis
    else:
        synthesise = batched_synthesis
    synthesise(args, device, model, vocoder, denoiser, texts, spk)
290
+
291
+
292
class BatchedSynthesisDataset(torch.utils.data.Dataset):
    """Thin ``Dataset`` wrapper around a sequence of already-processed texts."""

    def __init__(self, processed_texts):
        """Store the indexable collection of processed texts."""
        self.processed_texts = processed_texts

    def __len__(self):
        """Return the number of stored items."""
        items = self.processed_texts
        return len(items)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        items = self.processed_texts
        return items[idx]
301
+
302
+
303
def batched_collate_fn(batch):
    """Collate processed-text items into one zero-padded batch.

    Each item is a mapping with keys ``"x"`` and ``"x_lengths"``. The leading
    dimension of every ``"x"`` is squeezed away before the sequences are
    padded to a common length (batch first); the ``"x_lengths"`` tensors are
    concatenated along dimension 0.

    :param batch: Iterable of items as described above.
    :return: Dict with the padded ``"x"`` tensor and concatenated ``"x_lengths"``.
    """
    sequences = [item["x"].squeeze(0) for item in batch]
    lengths = [item["x_lengths"] for item in batch]
    return {
        "x": torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True),
        "x_lengths": torch.concat(lengths, dim=0),
    }
314
+
315
+
316
def batched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
    """Synthesise ``texts`` in mini-batches and save one result per utterance.

    Texts are processed on CPU, grouped into batches of ``args.batch_size`` by
    a ``DataLoader`` and moved to ``device`` just before synthesis. For every
    batch the model RTF and the model+vocoder RTF are printed; averages are
    printed at the end.

    Output files are numbered with a running, 0-based utterance index that
    spans all batches, so results from later batches no longer overwrite the
    files written for earlier ones.

    :param args: Parsed CLI namespace (uses ``batch_size``, ``steps``,
        ``temperature``, ``speaking_rate``, ``spk`` and ``output_folder``).
    :param device: Device the batch tensors are moved to.
    :param model: Acoustic model exposing ``synthesise``.
    :param vocoder: Vocoder passed to ``to_waveform``.
    :param denoiser: Denoiser passed to ``to_waveform``.
    :param texts: Sequence of input texts.
    :param spk: Speaker tensor or ``None``, forwarded as ``spks``.
    """
    total_rtf = []
    total_rtf_w = []
    processed_text = [process_text(i, text, "cpu") for i, text in enumerate(texts)]
    dataloader = torch.utils.data.DataLoader(
        BatchedSynthesisDataset(processed_text),
        batch_size=args.batch_size,
        collate_fn=batched_collate_fn,
        num_workers=8,
    )

    # Running index over all utterances; the loader does not shuffle, so this
    # matches the position of each text in ``texts``.
    utterance_idx = 0
    for batch_no, batch in enumerate(dataloader, start=1):
        start_t = dt.datetime.now()
        output = model.synthesise(
            batch["x"].to(device),
            batch["x_lengths"].to(device),
            n_timesteps=args.steps,
            temperature=args.temperature,
            spks=spk,
            length_scale=args.speaking_rate,
        )

        output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
        t = (dt.datetime.now() - start_t).total_seconds()
        # NOTE(review): 22050 is presumably the output sample rate in Hz - confirm.
        rtf_w = t * 22050 / (output["waveform"].shape[-1])
        print(f"[🍵-Batch: {batch_no}] Matcha-TTS RTF: {output['rtf']:.4f}")
        print(f"[🍵-Batch: {batch_no}] Matcha-TTS + VOCODER RTF: {rtf_w:.4f}")
        total_rtf.append(output["rtf"])
        total_rtf_w.append(rtf_w)

        for j in range(output["mel"].shape[0]):
            if args.spk is not None:
                base_name = f"utterance_{utterance_idx:03d}_speaker_{args.spk:03d}"
            else:
                base_name = f"utterance_{utterance_idx:03d}"
            # Trim the padding added by batching.
            # NOTE(review): 256 is presumably the vocoder hop length (samples per mel frame) - confirm.
            length = output["mel_lengths"][j]
            new_dict = {"mel": output["mel"][j][:, :length], "waveform": output["waveform"][j][: length * 256]}
            location = save_to_folder(base_name, new_dict, args.output_folder)
            print(f"[🍵-{utterance_idx}] Waveform saved: {location}")
            utterance_idx += 1

    print("=" * 100)
    print(f"[🍵] Average Matcha-TTS RTF: {np.mean(total_rtf):.4f} ± {np.std(total_rtf)}")
    print(f"[🍵] Average Matcha-TTS + VOCODER RTF: {np.mean(total_rtf_w):.4f} ± {np.std(total_rtf_w)}")
    print("[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!")
356
+
357
+
358
def unbatched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
    """Synthesise ``texts`` one at a time and save each result.

    Utterances are numbered from 1. For every text the model RTF and the
    model+vocoder RTF are printed; averages are printed after the loop.

    :param args: Parsed CLI namespace (uses ``steps``, ``temperature``,
        ``speaking_rate``, ``spk`` and ``output_folder``).
    :param device: Device passed to ``process_text``.
    :param model: Acoustic model exposing ``synthesise``.
    :param vocoder: Vocoder passed to ``to_waveform``.
    :param denoiser: Denoiser passed to ``to_waveform``.
    :param texts: Sequence of input texts.
    :param spk: Speaker tensor or ``None``, forwarded as ``spks``.
    """
    separator = "=" * 100
    model_rtfs = []
    full_rtfs = []

    for idx, raw_text in enumerate(texts, start=1):
        if args.spk is not None:
            base_name = f"utterance_{idx:03d}_speaker_{args.spk:03d}"
        else:
            base_name = f"utterance_{idx:03d}"

        print(separator)
        inputs = process_text(idx, raw_text.strip(), device)

        print(f"[🍵] Whisking Matcha-T(ea)TS for: {idx}")
        started_at = dt.datetime.now()
        output = model.synthesise(
            inputs["x"],
            inputs["x_lengths"],
            n_timesteps=args.steps,
            temperature=args.temperature,
            spks=spk,
            length_scale=args.speaking_rate,
        )
        output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)

        # Real-time factor including the vocoder stage.
        elapsed = (dt.datetime.now() - started_at).total_seconds()
        num_samples = output["waveform"].shape[-1]
        rtf_with_vocoder = elapsed * 22050 / num_samples
        print(f"[🍵-{idx}] Matcha-TTS RTF: {output['rtf']:.4f}")
        print(f"[🍵-{idx}] Matcha-TTS + VOCODER RTF: {rtf_with_vocoder:.4f}")
        model_rtfs.append(output["rtf"])
        full_rtfs.append(rtf_with_vocoder)

        saved_to = save_to_folder(base_name, output, args.output_folder)
        print(f"[+] Waveform saved: {saved_to}")

    print(separator)
    print(f"[🍵] Average Matcha-TTS RTF: {np.mean(model_rtfs):.4f} ± {np.std(model_rtfs)}")
    print(f"[🍵] Average Matcha-TTS + VOCODER RTF: {np.mean(full_rtfs):.4f} ± {np.std(full_rtfs)}")
    print("[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!")
395
+
396
+
397
def print_config(args):
    """Print the synthesis settings held by ``args`` to stdout.

    :param args: Namespace providing ``model``, ``vocoder``, ``temperature``,
        ``speaking_rate``, ``steps`` and ``spk``.
    """
    print("[!] Configurations: ")
    # (label, attribute name) pairs, in display order; attributes are read
    # one at a time, just before each line is printed.
    rows = (
        ("Model", "model"),
        ("Vocoder", "vocoder"),
        ("Temperature", "temperature"),
        ("Speaking rate", "speaking_rate"),
        ("Number of ODE steps", "steps"),
        ("Speaker", "spk"),
    )
    for label, attr_name in rows:
        print(f"\t- {label}: {getattr(args, attr_name)}")
405
+
406
+
407
def get_device(args):
    """Choose the torch device for inference and report the choice on stdout.

    CUDA is selected only when it is available and ``args.cpu`` is not set;
    otherwise the CPU is used.

    :param args: Namespace providing the boolean ``cpu`` flag.
    :return: ``torch.device("cuda")`` or ``torch.device("cpu")``.
    """
    use_gpu = torch.cuda.is_available() and not args.cpu
    if not use_gpu:
        print("[-] GPU not available or forced CPU run! Using CPU")
        return torch.device("cpu")

    print("[+] GPU Available! Using GPU")
    return torch.device("cuda")
415
+
416
+
417
+ if __name__ == "__main__":
418
+ cli()
r1-a/response_generation/Kimi-Audio/kimia_infer/models/tokenizer/glm4/third_party/Matcha-TTS/matcha/train.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional, Tuple
2
+
3
+ import hydra
4
+ import lightning as L
5
+ import rootutils
6
+ from lightning import Callback, LightningDataModule, LightningModule, Trainer
7
+ from lightning.pytorch.loggers import Logger
8
+ from omegaconf import DictConfig
9
+
10
+ from matcha import utils
11
+
12
+ rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
13
+ # ------------------------------------------------------------------------------------ #
14
+ # the setup_root above is equivalent to:
15
+ # - adding project root dir to PYTHONPATH
16
+ # (so you don't need to force user to install project as a package)
17
+ # (necessary before importing any local modules e.g. `from src import utils`)
18
+ # - setting up PROJECT_ROOT environment variable
19
+ # (which is used as a base for paths in "configs/paths/default.yaml")
20
+ # (this way all filepaths are the same no matter where you run the code)
21
+ # - loading environment variables from ".env" in root dir
22
+ #
23
+ # you can remove it if you:
24
+ # 1. either install project as a package or move entry files to project root dir
25
+ # 2. set `root_dir` to "." in "configs/paths/default.yaml"
26
+ #
27
+ # more info: https://github.com/ashleve/rootutils
28
+ # ------------------------------------------------------------------------------------ #
29
+
30
+
31
+ log = utils.get_pylogger(__name__)
32
+
33
+
34
@utils.task_wrapper
def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Trains the model. Can additionally evaluate on a testset, using best weights obtained during
    training.

    This method is wrapped in optional @task_wrapper decorator, that controls the behavior during
    failure. Useful for multiruns, saving info about the crash, etc.

    :param cfg: A DictConfig configuration composed by Hydra.
    :return: A tuple with metrics and dict with all instantiated objects.
    """
    # set seed for random number generators in pytorch, numpy and python.random
    # (compare against None so that an explicit seed of 0 is not silently ignored)
    if cfg.get("seed") is not None:
        L.seed_everything(cfg.seed, workers=True)

    log.info(f"Instantiating datamodule <{cfg.data._target_}>")  # pylint: disable=protected-access
    datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)

    log.info(f"Instantiating model <{cfg.model._target_}>")  # pylint: disable=protected-access
    model: LightningModule = hydra.utils.instantiate(cfg.model)

    log.info("Instantiating callbacks...")
    callbacks: List[Callback] = utils.instantiate_callbacks(cfg.get("callbacks"))

    log.info("Instantiating loggers...")
    logger: List[Logger] = utils.instantiate_loggers(cfg.get("logger"))

    log.info(f"Instantiating trainer <{cfg.trainer._target_}>")  # pylint: disable=protected-access
    trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger)

    object_dict = {
        "cfg": cfg,
        "datamodule": datamodule,
        "model": model,
        "callbacks": callbacks,
        "logger": logger,
        "trainer": trainer,
    }

    if logger:
        log.info("Logging hyperparameters!")
        utils.log_hyperparameters(object_dict)

    if cfg.get("train"):
        log.info("Starting training!")
        trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))

    train_metrics = trainer.callback_metrics

    if cfg.get("test"):
        log.info("Starting testing!")
        # `checkpoint_callback` is None when no ModelCheckpoint callback is configured;
        # treat that the same as "no best checkpoint" instead of raising AttributeError.
        checkpoint_callback = trainer.checkpoint_callback
        ckpt_path = checkpoint_callback.best_model_path if checkpoint_callback is not None else ""
        if ckpt_path == "":
            log.warning("Best ckpt not found! Using current weights for testing...")
            ckpt_path = None
        trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path)
        log.info(f"Best ckpt path: {ckpt_path}")

    test_metrics = trainer.callback_metrics

    # merge train and test metrics (test values win on key collisions)
    metric_dict = {**train_metrics, **test_metrics}

    return metric_dict, object_dict
98
+
99
+
100
@hydra.main(version_base="1.3", config_path="../configs", config_name="train.yaml")
def main(cfg: DictConfig) -> Optional[float]:
    """Hydra entry point: run training and report the optimized metric.

    :param cfg: DictConfig configuration composed by Hydra.
    :return: Optional[float] with optimized metric value.
    """
    # extra utilities applied to the config before training
    # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.)
    utils.extras(cfg)

    # run training; the instantiated objects are not needed here
    metrics, _objects = train(cfg)

    # look up the metric that hydra-based hyperparameter optimization targets
    target_metric = cfg.get("optimized_metric")
    return utils.get_metric_value(metric_dict=metrics, metric_name=target_metric)
119
+
120
+
121
+ if __name__ == "__main__":
122
+ main() # pylint: disable=no-value-for-parameter