optigesr commited on
Commit
39bc51b
·
1 Parent(s): da5f6cc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import tempfile
3
+ import gradio as gr
4
+ import numpy as np
5
+ from typing import Tuple, List
6
+
7
+ # Setup and installation
8
+ os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
9
+ sys.path.append("./tortoise-tts/")
10
+ os.system("pip install -r ./tortoise-tts/requirements.txt")
11
+ os.system("python ./tortoise-tts/setup.py install")
12
+
13
+ import torch
14
+ import torchaudio
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+
18
+ from tortoise.api import TextToSpeech
19
+ from tortoise.utils.audio import load_audio, load_voice
20
+
21
+
22
+ # Download and instantiate model
23
+ tts = TextToSpeech()
24
+
25
+
26
+ # Display parameters
27
+ VOICES = ["random","train_atkins","train_daws","train_dotrice","train_dreams","train_empire","train_grace","train_kennard","train_lescault","train_mouse","angie","applejack","daniel","deniro","emma","freeman","geralt","halle","jlaw","lj","mol","myself","pat","pat2","rainbow","snakes","tim_reynolds","tom","weaver","william"]
28
+ DEFAULT_VOICE = "random"
29
+ PRESETS = ["ultra_fast", "fast", "standard", "high_quality"]
30
+ DEFAULT_PRESET = "fast"
31
+ DEFAULT_TEXT = "Hello, world!"
32
+
33
+ README = """# TorToiSe
34
+ Tortoise is a text-to-speech model developed by James Betker. It is capable of zero-shot voice cloning from a small set of voice samples. GitHub repo: [neonbjb/tortoise-tts](https://github.com/neonbjb/tortoise-tts).
35
+
36
+ ## Usage
37
+ 1. Select a model preset and type the text to speak.
38
+ 2. Load a voice - either by choosing a preset, uploading audio files, or recording via microphone. Select the option to split audio into chunks if the clips are much longer than 10 seconds each. Follow the guidelines in the [voice customization guide](https://github.com/neonbjb/tortoise-tts#voice-customization-guide).
39
+ 3. Click **Generate**, and wait - it's called *tortoise* for a reason!
40
+ """
41
+
42
+ TORTOISE_SR_IN = 22050
43
+ TORTOISE_SR_OUT = 24000
44
+
45
def chunk_audio(t: torch.Tensor, sample_rate: int, chunk_duration_sec: int) -> List[torch.Tensor]:
    """Split a (channels, samples) tensor into consecutive fixed-length chunks.

    Every chunk spans ``chunk_duration_sec`` seconds except possibly the last,
    which holds the remainder.  Zero-width chunks are never returned, so audio
    whose length is an exact multiple of the chunk size produces no trailing
    empty tensor.
    """
    step = sample_rate * chunk_duration_sec
    pieces = []
    for start in range(0, t.shape[1], step):
        piece = t[:, start:start + step]
        if piece.shape[1] > 0:
            pieces.append(piece)
    return pieces
52
+
53
def tts_main(voice_samples: List[torch.Tensor], text: str, model_preset: str) -> str:
    """Synthesize ``text`` in the voice described by ``voice_samples``.

    Args:
        voice_samples: list of (1, n) float tensors at TORTOISE_SR_IN used as
            voice-cloning conditioning clips.
        text: the text to speak.
        model_preset: one of PRESETS, trading generation speed for quality.

    Returns:
        Path to the generated wav file (saved at TORTOISE_SR_OUT).
    """
    gen = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=None,
        preset=model_preset,
    )
    # Write to a unique temp file rather than a fixed "generated.wav" so that
    # concurrent requests cannot clobber each other's output.
    out_fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(out_fd)  # torchaudio.save opens the path itself
    torchaudio.save(out_path, gen.squeeze(0).cpu(), TORTOISE_SR_OUT)
    return out_path
62
+
63
def tts_from_preset(voice: str, text, model_preset):
    """Generate speech using one of the bundled preset voices."""
    samples, _latents = load_voice(voice)
    return tts_main(samples, text, model_preset)
66
+
67
def tts_from_files(files: List[tempfile._TemporaryFileWrapper], do_chunk, text, model_preset):
    """Generate speech conditioned on user-uploaded audio files.

    When ``do_chunk`` is set, each clip is split into 10-second pieces before
    being used as conditioning, per the tortoise voice-customization guide.
    """
    samples = [load_audio(f.name, TORTOISE_SR_IN) for f in files]
    if do_chunk:
        chunked = []
        for clip in samples:
            chunked.extend(chunk_audio(clip, TORTOISE_SR_IN, 10))
        samples = chunked
    return tts_main(samples, text, model_preset)
72
+
73
def tts_from_recording(recording: Tuple[int, np.ndarray], do_chunk, text, model_preset):
    """Generate speech conditioned on a microphone recording.

    Args:
        recording: Gradio microphone payload, a (sample_rate, samples) pair
            where samples is int16/int32/float PCM, 1-D for mono or
            (n, channels) for multi-channel.
        do_chunk: split the recording into 10-second conditioning chunks.
        text: the text to speak.
        model_preset: one of PRESETS.
    """
    sample_rate, audio = recording
    # Normalize integer PCM to floats in [-1, 1], mirroring
    # https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/audio.py#L16
    norm_fix = 1
    if audio.dtype == np.int32:
        norm_fix = 2**31
    elif audio.dtype == np.int16:
        norm_fix = 2**15
    audio = torch.FloatTensor(audio.T) / norm_fix
    if len(audio.shape) > 1:
        # convert to mono, keeping a leading channel dimension
        audio = torch.mean(audio, axis=0).unsqueeze(0)
    else:
        # BUG FIX: mono recordings arrive 1-D and previously stayed 1-D,
        # crashing chunk_audio (which indexes shape[1]) and feeding tortoise
        # the wrong rank.  Add the (1, n) channel dimension explicitly.
        audio = audio.unsqueeze(0)
    audio = torchaudio.transforms.Resample(sample_rate, TORTOISE_SR_IN)(audio)
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)
91
+
92
def tts_from_url(audio_url, start_time, end_time, do_chunk, text, model_preset):
    """Generate speech conditioned on audio downloaded from a URL via yt-dlp.

    Args:
        audio_url: URL understood by yt-dlp (untrusted user input).
        start_time: clip window start, in seconds.
        end_time: clip window end, in seconds.
        do_chunk: split the clip into 10-second conditioning chunks.
        text: the text to speak.
        model_preset: one of PRESETS.
    """
    import subprocess  # local import keeps the file's top-level imports unchanged
    # SECURITY FIX: pass the URL as an argv element with shell=False default,
    # instead of interpolating it into an os.system() shell string, which
    # allowed arbitrary command injection via a crafted URL.
    subprocess.run(
        ["yt-dlp", "-x", "--audio-format", "mp3", "--force-overwrites",
         audio_url, "-o", "audio.mp3"],
        check=True,  # fail loudly if the download fails, rather than on load_audio
    )
    audio = load_audio("audio.mp3", TORTOISE_SR_IN)
    # Keep only the requested [start_time, end_time) window.
    audio = audio[:, start_time * TORTOISE_SR_IN:end_time * TORTOISE_SR_IN]
    if do_chunk:
        voice_samples = chunk_audio(audio, TORTOISE_SR_IN, 10)
    else:
        voice_samples = [audio]
    return tts_main(voice_samples, text, model_preset)
101
+
102
+
103
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:

    gr.Markdown(README)

    # Controls shared by every tab: model preset and the text to synthesize.
    preset = gr.Dropdown(PRESETS, label="Model preset", value=DEFAULT_PRESET)
    text = gr.Textbox(label="Text to speak", value=DEFAULT_TEXT)
    do_chunk_label = "Split audio into chunks? (for audio much longer than 10 seconds.)"
    do_chunk_default = True

    # One tab per voice source; each has its own Generate button wired below.
    with gr.Tab("Choose preset voice"):
        inp1 = gr.Dropdown(VOICES, value=DEFAULT_VOICE, label="Preset voice")
        btn1 = gr.Button("Generate")

    with gr.Tab("Upload audio"):
        inp2 = gr.File(file_count="multiple")
        do_chunk2 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn2 = gr.Button("Generate")

    with gr.Tab("Record audio"):
        inp3 = gr.Audio(source="microphone")
        do_chunk3 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
        btn3 = gr.Button("Generate")

    # The YouTube tab is intentionally disabled; tts_from_url remains defined
    # in case it is re-enabled later.
    # with gr.Tab("From YouTube"):
    #   inp4 = gr.Textbox(label="URL")
    #   do_chunk4 = gr.Checkbox(label=do_chunk_label, value=do_chunk_default)
    #   start_time = gr.Number(label="Start time (seconds)", precision=0)
    #   end_time = gr.Number(label="End time (seconds)", precision=0)
    #   btn4 = gr.Button("Generate")

    # All tabs write their generated wav into this single output component.
    audio_out = gr.Audio()

    btn1.click(
        tts_from_preset,
        [inp1, text, preset],
        [audio_out],
    )
    btn2.click(
        tts_from_files,
        [inp2, do_chunk2, text, preset],
        [audio_out],
    )
    btn3.click(
        tts_from_recording,
        [inp3, do_chunk3, text, preset],
        [audio_out],
    )
    # btn4.click(
    #   tts_from_url,
    #   [inp4, start_time, end_time, do_chunk4, text, preset],
    #   [audio_out],
    # )

demo.launch()