optigesr committed on
Commit
53412e9
·
1 Parent(s): 73b095e

Upload tortoise_tts.py

Files changed (1)
  1. tortoise_tts.py +397 -0
tortoise_tts.py ADDED
@@ -0,0 +1,397 @@
+ #!/home/runner/tts-tortoise-gradio/venv/bin/python
+
+ import argparse
+ import os
+ import sys
+ import tempfile
+ import time
+
+ import torch
+ import torchaudio
+
+ from tortoise.api import MODELS_DIR, TextToSpeech
+ from tortoise.utils.audio import get_voices, load_voices, load_audio
+ from tortoise.utils.text import split_and_recombine_text
+
+ parser = argparse.ArgumentParser(
+     description="TorToiSe is a text-to-speech program that is capable of synthesizing speech "
+     "in multiple voices with realistic prosody and intonation."
+ )
+
+ parser.add_argument(
+     "text",
+     type=str,
+     nargs="*",
+     help="Text to speak. If omitted, text is read from stdin.",
+ )
+ parser.add_argument(
+     "-v",
+     "--voice",
+     type=str,
+     default="random",
+     metavar="VOICE",
+     dest="voice",
+     help="Selects the voice to use for generation. Use the & character to join two voices together. "
+     'Use a comma to perform inference on multiple voices. Set to "all" to use all available voices. '
+     "Note that multiple voices require the --output-dir option to be set.",
+ )
+ parser.add_argument(
+     "-V",
+     "--voices-dir",
+     metavar="VOICES_DIR",
+     type=str,
+     dest="voices_dir",
+     help="Path to a directory containing extra voices to be loaded. Use a comma to specify multiple directories.",
+ )
+ parser.add_argument(
+     "-p",
+     "--preset",
+     type=str,
+     default="fast",
+     choices=["ultra_fast", "fast", "standard", "high_quality"],
+     dest="preset",
+     help="Which voice quality preset to use.",
+ )
+ parser.add_argument(
+     "-q",
+     "--quiet",
+     default=False,
+     action="store_true",
+     dest="quiet",
+     help="Suppress all output.",
+ )
+
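+ # Exactly one output mode is required: list voices, play the audio, write a single file, or write per-segment files to a directory.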
+ output_group = parser.add_mutually_exclusive_group(required=True)
+ output_group.add_argument(
+     "-l",
+     "--list-voices",
+     default=False,
+     action="store_true",
+     dest="list_voices",
+     help="List available voices and exit.",
+ )
+ output_group.add_argument(
+     "-P",
+     "--play",
+     action="store_true",
+     dest="play",
+     help="Play the audio (requires pydub).",
+ )
+ output_group.add_argument(
+     "-o",
+     "--output",
+     type=str,
+     metavar="OUTPUT",
+     dest="output",
+     help="Save the audio to a file.",
+ )
+ output_group.add_argument(
+     "-O",
+     "--output-dir",
+     type=str,
+     metavar="OUTPUT_DIR",
+     dest="output_dir",
+     help="Save the audio to a directory as individual segments.",
+ )
+
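+ # The options in this group only take effect together with --output-dir.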
+ multi_output_group = parser.add_argument_group(
+     "multi-output options (requires --output-dir)"
+ )
+ multi_output_group.add_argument(
+     "--candidates",
+     type=int,
+     default=1,
+     help="How many output candidates to produce per voice. Note that only the first candidate is used in the combined output.",
+ )
+ multi_output_group.add_argument(
+     "--regenerate",
+     type=str,
+     default=None,
+     help="Comma-separated list of clip numbers to re-generate.",
+ )
+ multi_output_group.add_argument(
+     "--skip-existing",
+     action="store_true",
+     help="Set to skip re-generating existing clips.",
+ )
+
+ advanced_group = parser.add_argument_group("advanced options")
+ advanced_group.add_argument(
+     "--produce-debug-state",
+     default=False,
+     action="store_true",
+     help="Whether or not to produce debug_states in the current directory, which can aid in reproducing problems.",
+ )
+ advanced_group.add_argument(
+     "--seed",
+     type=int,
+     default=None,
+     help="Random seed which can be used to reproduce results.",
+ )
+ advanced_group.add_argument(
+     "--models-dir",
+     type=str,
+     default=MODELS_DIR,
+     help="Where to find pretrained model checkpoints. Tortoise automatically downloads these to "
+     "~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints.",
+ )
+ advanced_group.add_argument(
+     "--text-split",
+     type=str,
+     default=None,
+     help="How to split the text into chunks, in the format <desired_length>,<max_length>.",
+ )
+ advanced_group.add_argument(
+     "--disable-redaction",
+     default=False,
+     action="store_true",
+     help="Normally, text enclosed in brackets is automatically redacted from the spoken output "
+     "(but is still rendered by the model), which can be used for prompt engineering. "
+     "Set this flag to disable that behavior.",
+ )
+ advanced_group.add_argument(
+     "--device", type=str, default=None, help="Device to use for inference."
+ )
+ advanced_group.add_argument(
+     "--batch-size",
+     type=int,
+     default=None,
+     help="Batch size to use for inference. If omitted, the batch size is set based on available GPU memory.",
+ )
+
+ tuning_group = parser.add_argument_group("tuning options (overrides preset settings)")
+ tuning_group.add_argument(
+     "--num-autoregressive-samples",
+     type=int,
+     default=None,
+     help="Number of samples taken from the autoregressive model, all of which are filtered using CLVP. "
+     'As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great".',
+ )
+ tuning_group.add_argument(
+     "--temperature",
+     type=float,
+     default=None,
+     help="The softmax temperature of the autoregressive model.",
+ )
+ tuning_group.add_argument(
+     "--length-penalty",
+     type=float,
+     default=None,
+     help="A length penalty applied to the autoregressive decoder. Higher settings cause the model to produce more terse outputs.",
+ )
+ tuning_group.add_argument(
+     "--repetition-penalty",
+     type=float,
+     default=None,
+     help="A penalty that prevents the autoregressive decoder from repeating itself during decoding. "
+     'Can be used to reduce the incidence of long silences or "uhhhhhhs", etc.',
+ )
+ tuning_group.add_argument(
+     "--top-p",
+     type=float,
+     default=None,
+     help='P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs.',
+ )
+ tuning_group.add_argument(
+     "--max-mel-tokens",
+     type=int,
+     default=None,
+     help="Restricts the output length. 1 to 600. Each unit is 1/20 of a second.",
+ )
+ tuning_group.add_argument(
+     "--cvvp-amount",
+     type=float,
+     default=None,
+     help="How much the CVVP model should influence the output. "
+     "Increasing this can in some cases reduce the likelihood of multiple speakers.",
+ )
+ tuning_group.add_argument(
+     "--diffusion-iterations",
+     type=int,
+     default=None,
+     help="Number of diffusion steps to perform. More steps means the network has more chances to iteratively "
+     "refine the output, which should theoretically mean a higher quality output. "
+     "Generally a value above 250 is not noticeably better, however.",
+ )
+ tuning_group.add_argument(
+     "--cond-free",
+     # argparse's type=bool treats any non-empty string as True, so parse booleans explicitly.
+     type=lambda s: s.lower() in ("true", "1", "yes"),
+     default=None,
+     help="Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for "
+     "each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output "
+     "of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and "
+     "dramatically improves realism.",
+ )
+ tuning_group.add_argument(
+     "--cond-free-k",
+     type=float,
+     default=None,
+     help="Knob that determines how to balance the conditioning-free signal with the conditioning-present signal. [0,inf]. "
+     "As cond_free_k increases, the output becomes dominated by the conditioning-free signal. "
+     "Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k",
+ )
+ tuning_group.add_argument(
+     "--diffusion-temperature",
+     type=float,
+     default=None,
+     help="Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 "
+     'are the "mean" prediction of the diffusion network and will sound bland and smeared.',
+ )
+
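+ # Extra usage examples, printed after argparse's own help text (see the SystemExit handler below).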
+ usage_examples = f"""
+ Examples:
+
+ Read text using a random voice and write it to a file:
+
+     {parser.prog} -o hello.wav "Hello, how are you?"
+
+ Read text from stdin and play it using the tom voice:
+
+     echo "Say it like you mean it!" | {parser.prog} -P -v tom
+
+ Read a text file using multiple voices and save the audio clips to a directory:
+
+     {parser.prog} -O /tmp/tts-results -v tom,emma <textfile.txt
+ """
+
+ try:
+     args = parser.parse_args()
+ except SystemExit as e:
+     if e.code == 0:
+         print(usage_examples)
+     sys.exit(e.code)
+
+ extra_voice_dirs = args.voices_dir.split(",") if args.voices_dir else []
+ all_voices = sorted(get_voices(extra_voice_dirs))
+
+ if args.list_voices:
+     for v in all_voices:
+         print(v)
+     sys.exit(0)
+
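+ # A comma separates independent voices to run; "&" within an entry joins voices into a single blended voice.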
+ selected_voices = all_voices if args.voice == "all" else args.voice.split(",")
+ selected_voices = [v.split("&") if "&" in v else [v] for v in selected_voices]
+ for voices in selected_voices:
+     for v in voices:
+         if v != "random" and v not in all_voices:
+             parser.error(
+                 f"voice {v} not available, use --list-voices to see available voices."
+             )
+
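+ # Gather the text from the positional arguments, or fall back to stdin, then split it into chunks that are synthesized one clip at a time.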
+ if len(args.text) == 0:
+     text = ""
+     for line in sys.stdin:
+         text += line
+ else:
+     text = " ".join(args.text)
+ text = text.strip()
+ if args.text_split:
+     desired_length, max_length = [int(x) for x in args.text_split.split(",")]
+     if desired_length > max_length:
+         parser.error(
+             f"--text-split: desired_length ({desired_length}) must be <= max_length ({max_length})"
+         )
+     texts = split_and_recombine_text(text, desired_length, max_length)
+ else:
+     texts = split_and_recombine_text(text)
+ if len(texts) == 0:
+     parser.error("no text provided")
+
+ if args.output_dir:
+     os.makedirs(args.output_dir, exist_ok=True)
+ else:
+     if len(selected_voices) > 1:
+         parser.error("cannot have multiple voices without --output-dir")
+     if args.candidates > 1:
+         parser.error("cannot have multiple candidates without --output-dir")
+
+ # error out early if pydub isn't installed
+ if args.play:
+     try:
+         import pydub
+         import pydub.playback
+     except ImportError:
+         parser.error(
+             '--play requires pydub to be installed, which can be done with "pip install pydub"'
+         )
+
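+ # Default to a time-based seed; pass --seed to make runs reproducible.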
+ seed = int(time.time()) if args.seed is None else args.seed
+ if not args.quiet:
+     print("Loading tts...")
+ tts = TextToSpeech(
+     models_dir=args.models_dir,
+     enable_redaction=not args.disable_redaction,
+     device=args.device,
+     autoregressive_batch_size=args.batch_size,
+ )
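+ # The preset supplies baseline generation settings; any tuning flag given on the command line overrides them below.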
+ gen_settings = {
+     "use_deterministic_seed": seed,
+     "verbose": not args.quiet,
+     "k": args.candidates,
+     "preset": args.preset,
+ }
+ tuning_options = [
+     "num_autoregressive_samples",
+     "temperature",
+     "length_penalty",
+     "repetition_penalty",
+     "top_p",
+     "max_mel_tokens",
+     "cvvp_amount",
+     "diffusion_iterations",
+     "cond_free",
+     "cond_free_k",
+     "diffusion_temperature",
+ ]
+ for option in tuning_options:
+     if getattr(args, option) is not None:
+         gen_settings[option] = getattr(args, option)
+ total_clips = len(texts) * len(selected_voices)
+ regenerate_clips = (
+     [int(x) for x in args.regenerate.split(",")] if args.regenerate else None
+ )
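+ # Main synthesis loop: render every text chunk for every selected voice.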
+ for voice_idx, voice in enumerate(selected_voices):
+     audio_parts = []
+     voice_samples, conditioning_latents = load_voices(voice, extra_voice_dirs)
+     for text_idx, text in enumerate(texts):
+         clip_name = f'{"-".join(voice)}_{text_idx:02d}'
+         if args.output_dir:
+             first_clip = os.path.join(args.output_dir, f"{clip_name}_00.wav")
+             if (
+                 args.skip_existing
+                 or (regenerate_clips and text_idx not in regenerate_clips)
+             ) and os.path.exists(first_clip):
+                 audio_parts.append(load_audio(first_clip, 24000))
+                 if not args.quiet:
+                     print(f"Skipping {clip_name}")
+                 continue
+         if not args.quiet:
+             print(
+                 f"Rendering {clip_name} ({voice_idx * len(texts) + text_idx + 1} of {total_clips})..."
+             )
+             print(" " + text)
+         gen = tts.tts_with_preset(
+             text,
+             voice_samples=voice_samples,
+             conditioning_latents=conditioning_latents,
+             **gen_settings,
+         )
+         gen = gen if args.candidates > 1 else [gen]
+         for candidate_idx, audio in enumerate(gen):
+             audio = audio.squeeze(0).cpu()
+             if candidate_idx == 0:
+                 audio_parts.append(audio)
+             if args.output_dir:
+                 filename = f"{clip_name}_{candidate_idx:02d}.wav"
+                 torchaudio.save(os.path.join(args.output_dir, filename), audio, 24000)
+
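+     # Stitch the per-chunk clips into one continuous clip; only the first candidate of each chunk is used.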
+     audio = torch.cat(audio_parts, dim=-1)
+     if args.output_dir:
+         filename = f'{"-".join(voice)}_combined.wav'
+         torchaudio.save(os.path.join(args.output_dir, filename), audio, 24000)
+     elif args.output:
+         torchaudio.save(args.output, audio, 24000)
+     elif args.play:
+         f = tempfile.NamedTemporaryFile(suffix=".wav", delete=True)
+         torchaudio.save(f.name, audio, 24000)
+         pydub.playback.play(pydub.AudioSegment.from_wav(f.name))
+
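+     # Optionally snapshot everything needed to reproduce this run.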
+     if args.produce_debug_state:
+         os.makedirs("debug_states", exist_ok=True)
+         dbg_state = (seed, texts, voice_samples, conditioning_latents, args)
+         torch.save(
+             dbg_state, os.path.join("debug_states", f'debug_{"-".join(voice)}.pth')
+         )