|
|
import os |
|
|
import subprocess |
|
|
import sys |
|
|
|
|
|
|
|
|
# Cap OpenMP threads for native libraries (BLAS/torch load later in this
# file); must be set before those libraries are imported and initialized.
os.environ["OMP_NUM_THREADS"] = "4"
|
|
|
|
|
|
|
|
def setup_dependencies():
    """Install the development build of transformers, at most once.

    A marker file under /tmp records a previous successful install so that
    process restarts skip the slow pip step. Failures are printed but never
    raised — the app should still attempt to start with whatever is present.
    """
    marker_path = '/tmp/deps_installed'
    try:
        if os.path.exists(marker_path):
            return

        print("Installing transformers dev version...")
        pip_cmd = [
            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git",
        ]
        subprocess.check_call(pip_cmd)

        # Only written after check_call succeeds, so a failed install
        # will be retried on the next start.
        with open(marker_path, 'w') as f:
            f.write('done')

    except Exception as e:
        # Best-effort by design: report and continue.
        print(f"Dependencies setup error: {e}")
|
|
|
|
|
|
|
|
# Run the bootstrap before the heavyweight imports below, which require
# the freshly installed transformers build.
setup_dependencies()
|
|
|
|
|
import spaces |
|
|
import gradio as gr |
|
|
from util import Config, NemoAudioPlayer, KaniModel, Demo |
|
|
import numpy as np |
|
|
import torch |
|
|
|
|
|
|
|
|
# Hugging Face access token passed to KaniModel for checkpoint downloads.
# NOTE(review): may be None when HF_TOKEN is unset — presumably fine for
# public checkpoints; confirm against KaniModel's loading code.
token_ = os.getenv('HF_TOKEN')
|
|
|
|
|
|
|
|
# UI model name -> loading configuration.
# Config() with no arguments selects the base pretrained checkpoint; the
# voice variants pin a fine-tuned checkpoint and temperature=0.2
# (presumably for more deterministic output — confirm in Config docs).
models_configs = {
    'Base_pretrained_model': Config(),
    'Female_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2
    ),
    'Male_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2
    )
}
|
|
|
|
|
|
|
|
# Shared audio decoder and the pre-rendered demo clips.
player = NemoAudioPlayer(Config())
demo_examples = Demo()()

# Instantiate every configured model eagerly at import time so the first
# user request does not pay the model-loading cost.
models = {}
for name, cfg in models_configs.items():
    print(f"Loading {name}...")
    models[name] = KaniModel(cfg, player, token_)
    print(f"{name} loaded!")
print("All models loaded!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """
    Generate speech from text using the selected model on GPU.

    Parameters:
        text: Input text from the UI textbox. Gradio can deliver None
            (e.g. a cleared component), so it is guarded before .strip().
        model_choice: Key into the module-level ``models`` dict, taken
            from the dropdown.

    Returns:
        ((sample_rate, audio), time_report) on success, or
        (None, message) for invalid input or any generation failure.
    """
    # Fix: the original called text.strip() unconditionally, which raises
    # AttributeError when Gradio passes None instead of a string.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text)

        # NOTE(review): 22050 Hz is assumed to match the codec's output
        # rate — confirm against NemoAudioPlayer's configuration.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report

    except Exception as e:
        # Surface the failure in the UI rather than crashing the worker.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Built at import time so `demo` exists for the __main__ guard
# below and for the Spaces runtime, which imports this module.
# ---------------------------------------------------------------------------
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        # Left column: model choice, text input, and the generate trigger.
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice"
            )

            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio plus the timing report string
        # returned by generate_speech_gpu.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech"
            )

    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output]
    )

    gr.Markdown("## 🎯 Demo Examples")

    def play_demo(text):
        # Look up a pre-rendered clip by its caption text.
        # NOTE(review): 22050 Hz assumed to match the clips' actual rate —
        # confirm against how Demo renders them.
        return (22050, demo_examples[text])

    # One button per demo caption, in two rows of up to four.
    # `t=text` deliberately binds the loop value as a default argument,
    # avoiding Python's late-binding closure pitfall.
    with gr.Row():
        for text in list(demo_examples.keys())[:4]:
            gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output])

    with gr.Row():
        for text in list(demo_examples.keys())[4:8]:
            gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on 7860, the standard HF Spaces port;
    # show_error surfaces server-side exceptions in the browser.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )