Delete app.py
Browse files
app.py
DELETED
|
@@ -1,257 +0,0 @@
|
|
| 1 |
-
from kokoro import KModel, KPipeline
|
| 2 |
-
import gradio as gr
|
| 3 |
-
import os
|
| 4 |
-
import random
|
| 5 |
-
import torch
|
| 6 |
-
import logging
|
| 7 |
-
import soundfile as sf
|
| 8 |
-
|
| 9 |
-
# Voice cloning is optional. `encoder` stays None unless Resemblyzer can be
# imported (install via pip install resemblyzer); callers check for None.
encoder = None
try:
    from resemblyzer import VoiceEncoder, preprocess_wav

    encoder = VoiceEncoder()
except ImportError:
    # Resemblyzer is not installed: leave voice cloning disabled.
    pass
|
| 15 |
-
|
| 16 |
-
# Configuration
# NOTE(review): these are absolute, machine-specific Windows paths.
VOICE_DIR = r"D:\New folder (2)\model\voices"
OUTPUT_DIR = r"D:\New folder (2)\output_audio"
TEXT = "Hello, this is a test of the Kokoro TTS system."

# Logging: INFO and above on the root logger, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device selection: use CUDA when torch reports it, otherwise the CPU.
CUDA_AVAILABLE = torch.cuda.is_available()
if CUDA_AVAILABLE:
    device = "cuda"
else:
    device = "cpu"
logger.info(f"Using hardware: {device}")
|
| 29 |
-
|
| 30 |
-
# One model per device, keyed by "is this the GPU copy?":
# models[False] always exists (CPU); models[True] exists only with CUDA.
models = {False: KModel("hexgrad/Kokoro-82M").to("cpu").eval()}
if CUDA_AVAILABLE:
    models[True] = KModel("hexgrad/Kokoro-82M").to("cuda").eval()

# One pipeline per language code: 'a' = American English, 'b' = British
# English. Both start out bound to the CPU model.
pipelines = {
    lang_code: KPipeline(model=models[False], lang_code=lang_code, device='cpu')
    for lang_code in ('a', 'b')
}

# Custom pronunciations for "kokoro" in both pipelines. If a pipeline does
# not expose g2p.lexicon.golds, log a warning and continue.
try:
    pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
    pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
except AttributeError as e:
    logger.warning(f"Could not set custom pronunciations: {e}")
|
| 45 |
-
|
| 46 |
-
def forward_gpu(text, voice_path, speed):
    """Synthesize the first chunk of *text* on the GPU model.

    Args:
        text: Text to synthesize.
        voice_path: Path to a voice ``.pt`` file. The first character of the
            file NAME (not of the whole path) selects the pipeline, e.g.
            ``af_bella.pt`` -> ``pipelines['a']``.
        speed: Speed value forwarded to the pipeline.

    Returns:
        The audio of the first chunk the pipeline yields, or None when the
        pipeline yields nothing.

    Raises:
        KeyError: if the file name's first character is not a key of
            ``pipelines``, or no GPU model was loaded.
    """
    # The original indexed voice_path[0], which for a full path is the drive
    # letter or a path separator and never a language code.
    lang_code = os.path.basename(voice_path)[0]
    pipeline = pipelines[lang_code]
    # The pipelines are created with the CPU model; swap in the GPU one.
    pipeline.model = models[True]
    for _, _, audio in pipeline(text, voice=voice_path, speed=speed):
        return audio
    return None
|
| 55 |
-
|
| 56 |
-
def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE, clone_voice_file=None):
    """Synthesize the first chunk of *text* and return its audio and tokens.

    Args:
        text: Text to synthesize.
        voice: File name of a voice inside VOICE_DIR; its first character
            selects the pipeline ('a' or 'b').
        speed: Speed value forwarded to the pipeline.
        use_gpu: Request the GPU model; ignored when CUDA is unavailable.
        clone_voice_file: Optional path to an audio sample. Only used when
            the Resemblyzer encoder was loaded.

    Returns:
        ``((24000, samples), ps)`` for the first chunk, where the first item
        is the (rate, data) pair handed to ``gr.Audio`` and ``ps`` is the
        token string, or ``(None, "")`` when the pipeline yields nothing.

    Raises:
        FileNotFoundError: if the voice file does not exist.
        gr.Error: if generation fails on the CPU with a Gradio error.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    pipeline = pipelines[voice[0]]

    # If a clone file is provided and the encoder is available, try to clone the voice
    if clone_voice_file is not None and encoder is not None:
        try:
            # clone_voice_file is a file path (string) in Gradio with type="filepath"
            wav = preprocess_wav(clone_voice_file)
            # NOTE(review): this saves a Resemblyzer utterance embedding as if
            # it were a Kokoro voice file - confirm the pipeline can consume it.
            cloned_voice = torch.tensor(encoder.embed_utterance(wav), device=device).unsqueeze(0)
            temp_voice_path = os.path.join(VOICE_DIR, "cloned_voice.pt")
            torch.save(cloned_voice, temp_voice_path)
            voice_path = temp_voice_path
        except Exception as e:
            # Best effort: fall back to the selected stock voice.
            logger.error(f"Error cloning voice: {e}")
            voice_path = os.path.join(VOICE_DIR, voice)

    use_gpu = bool(use_gpu and CUDA_AVAILABLE)

    def _first_chunk(model):
        """Run the pipeline on *model* and return its first chunk."""
        pipeline.model = model
        for _, ps, audio in pipeline(text, voice=voice_path, speed=speed):
            # .cpu() is a no-op for CPU tensors and makes GPU output convertible.
            return (24000, audio.cpu().numpy()), ps
        return None, ""

    try:
        # The original GPU branch computed the audio and then fell through to
        # `return None, ""`; both devices now return the generated chunk.
        return _first_chunk(models[use_gpu])
    except gr.exceptions.Error as e:
        if not use_gpu:
            raise gr.Error(e)
        gr.Warning(str(e))
        gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
        return _first_chunk(models[False])
|
| 96 |
-
|
| 97 |
-
def predict(text, voice="af_bella.pt", speed=1):
    """Return only the audio part of a CPU-only `generate_first` call."""
    audio_out, _tokens = generate_first(text, voice, speed, use_gpu=False)
    return audio_out
|
| 99 |
-
|
| 100 |
-
def tokenize_first(text, voice="af_bella.pt"):
    """Return the token string of the first chunk produced for *text*.

    Raises FileNotFoundError when the voice file is missing from VOICE_DIR;
    returns "" when the pipeline yields nothing.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    chunks = pipelines[voice[0]](text, voice=voice_path)
    first_chunk = next(iter(chunks), None)
    if first_chunk is None:
        return ""
    _, ps, _ = first_chunk
    return ps
|
| 110 |
-
|
| 111 |
-
def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield ``(24000, samples)`` for every chunk the pipeline produces.

    Right after the first chunk, one extra item holding a single zero sample
    is yielded. Raises FileNotFoundError (on first iteration) when the voice
    file is missing from VOICE_DIR.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    pipeline = pipelines[voice[0]]
    on_gpu = use_gpu and CUDA_AVAILABLE
    # Bind the pipeline to the GPU or CPU model for this run.
    pipeline.model = models[True] if on_gpu else models[False]

    chunks = pipeline(text, voice=voice_path, speed=speed)
    for index, (_, _, audio) in enumerate(chunks):
        yield 24000, audio.numpy()
        if index == 0:
            yield 24000, torch.zeros(1).numpy()
|
| 129 |
-
|
| 130 |
-
# Load random quotes and sample texts.
# The file is read as UTF-8 rather than the platform default encoding, and
# blank lines are skipped so an empty string is never offered as a quote.
try:
    with open("en.txt", "r", encoding="utf-8") as r:
        random_quotes = [line.strip() for line in r if line.strip()]
except FileNotFoundError:
    random_quotes = []

# A missing or empty file both fall back to one built-in sentence, so
# random.choice() always has something to pick from.
if not random_quotes:
    random_quotes = ["Hello, this is a test of the Kokoro TTS system."]
|
| 136 |
-
|
| 137 |
-
def get_random_quote():
    """Pick one entry of the module-level `random_quotes` list at random."""
    quote = random.choice(random_quotes)
    return quote
|
| 139 |
-
|
| 140 |
-
def get_gatsby():
    """Return the stripped contents of ``gatsby5k.md`` from the working directory.

    The file is decoded as UTF-8 instead of the platform default encoding.
    When the file does not exist, a fixed placeholder message is returned.
    """
    try:
        with open("gatsby5k.md", "r", encoding="utf-8") as r:
            return r.read().strip()
    except FileNotFoundError:
        return "The Great Gatsby text not found."
|
| 146 |
-
|
| 147 |
-
def get_frankenstein():
    """Return the stripped contents of ``frankenstein5k.md`` from the working directory.

    The file is decoded as UTF-8 instead of the platform default encoding.
    When the file does not exist, a fixed placeholder message is returned.
    """
    try:
        with open("frankenstein5k.md", "r", encoding="utf-8") as r:
            return r.read().strip()
    except FileNotFoundError:
        return "Frankenstein text not found."
|
| 153 |
-
|
| 154 |
-
# Dynamically load all .pt voice files from VOICE_DIR
def load_voice_choices(voice_dir=None):
    """Build a ``{label: file name}`` mapping for the ``.pt`` files in a directory.

    Args:
        voice_dir: Directory to scan. Defaults to the module-level VOICE_DIR.

    Returns:
        A dict in sorted file-name order. Files whose two-letter prefix is
        af/am/bf/bm get a flag-and-gender label followed by the capitalized
        name after the prefix; any other ``.pt`` file is labelled
        ``"Unknown <stem>"``. A missing directory gives an empty dict, so the
        caller's "no voices found" fallback can run instead of crashing.
    """
    if voice_dir is None:
        voice_dir = VOICE_DIR
    try:
        entries = os.listdir(voice_dir)
    except FileNotFoundError:
        return {}

    # Two-letter file prefix -> label decoration.
    decorations = {
        'af': "🇺🇸 🚺 ",
        'am': "🇺🇸 🚹 ",
        'bf': "🇬🇧 🚺 ",
        'bm': "🇬🇧 🚹 ",
    }

    choices = {}
    # os.listdir() order is arbitrary; sort for a stable dropdown order.
    for voice_file in sorted(f for f in entries if f.endswith('.pt')):
        decoration = decorations.get(voice_file[:2])
        if decoration is None:
            label = f"Unknown {voice_file[:-3]}"
        else:
            # [3:-3] drops the "xx_" prefix and the ".pt" suffix.
            label = f"{decoration}{voice_file[3:-3].capitalize()}"
        choices[label] = voice_file
    return choices
|
| 173 |
-
|
| 174 |
-
CHOICES = load_voice_choices()

# Report every discovered voice, warning about any whose file is missing.
for label, voice_path in CHOICES.items():
    full_path = os.path.join(VOICE_DIR, voice_path)
    if os.path.exists(full_path):
        logger.info(f"Loaded voice: {label} ({voice_path})")
    else:
        logger.warning(f"Voice file not found: {full_path}")

# With no voices at all, fall back to a single placeholder entry.
if not CHOICES:
    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
|
| 188 |
-
|
| 189 |
-
TOKEN_NOTE = '''
|
| 190 |
-
💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
|
| 191 |
-
|
| 192 |
-
💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
|
| 193 |
-
|
| 194 |
-
⬇️ Lower stress [1 level](-1) or [2 levels](-2)
|
| 195 |
-
|
| 196 |
-
⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
|
| 197 |
-
'''
|
| 198 |
-
|
| 199 |
-
# "Generate" tab: audio player, generate button, and an accordion with the
# token output and tokenizer controls.
# NOTE(review): nesting was reconstructed from a flattened diff view — confirm
# which components belong inside the accordion.
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
    generate_btn = gr.Button("Generate", variant="primary")
    with gr.Accordion("Output Tokens", open=True):
        out_ps = gr.Textbox(interactive=False, show_label=False,
                            info="Tokens used to generate the audio, up to 510 context length.")
        tokenize_btn = gr.Button("Tokenize", variant="secondary")
        gr.Markdown(TOKEN_NOTE)
        # Hidden button; its click handler is wired to predict() in the app block.
        predict_btn = gr.Button("Predict", variant="secondary", visible=False)
|
| 208 |
-
|
| 209 |
-
# "Stream" tab: streaming audio player with Stream/Stop buttons and a note.
# NOTE(review): nesting was reconstructed from a flattened diff view — confirm
# the row and accordion membership.
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
    with gr.Row():
        stream_btn = gr.Button("Stream", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")
    with gr.Accordion("Note", open=True):
        gr.Markdown("⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream.")
        gr.DuplicateButton()
|
| 217 |
-
|
| 218 |
-
# Passed to both queue(api_open=...) and launch(show_api=...) at start-up.
API_OPEN = True
# Main layout: inputs in the left column, the Generate/Stream tabs in the
# right column, followed by the event wiring.
# NOTE(review): nesting was reconstructed from a flattened diff view — confirm
# the row/column membership of each component.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
            with gr.Row():
                # Default to af_bella.pt when present, otherwise the first listed voice.
                voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
                                    info="Quality and availability vary by language")
                # The hardware choice is only interactive when CUDA is available.
                use_gpu = gr.Dropdown(
                    [("GPU 🚀", True), ("CPU 🐌", False)],
                    value=CUDA_AVAILABLE,
                    label="Hardware",
                    info="GPU is usually faster, but may require CUDA support",
                    interactive=CUDA_AVAILABLE
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
            clone_voice_file = gr.File(label="Clone Voice Sample (Optional)", file_count="single", type="filepath")
            random_btn = gr.Button("🎲 Random Quote 💬", variant="secondary")
            with gr.Row():
                gatsby_btn = gr.Button("🥂 Gatsby 📕", variant="secondary")
                frankenstein_btn = gr.Button("💀 Frankenstein 📗", variant="secondary")
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
    # Sample-text buttons fill the input textbox.
    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text])
    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text])
    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text])
    # Generate returns both the audio and the tokens used for it.
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu, clone_voice_file],
                       outputs=[out_audio, out_ps])
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    # Keep the stream event so the Stop button can cancel it.
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
    stop_btn.click(fn=None, cancels=[stream_event])
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
|
| 250 |
-
|
| 251 |
-
# Script entry point: enable the queue and serve the app on 127.0.0.1:40001,
# opening a browser tab on start.
if __name__ == "__main__":
    app.queue(api_open=API_OPEN).launch(
        server_name="127.0.0.1",
        server_port=40001,
        show_api=API_OPEN,
        inbrowser=True
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|