|
|
import os

import subprocess

import sys


# Cap OpenMP worker threads before any numeric libraries (torch, numpy)
# are imported further down, presumably to avoid oversubscribing the
# Space's CPU cores. TODO(review): confirm "4" matches the hardware tier.
os.environ["OMP_NUM_THREADS"] = "4"
|
|
|
|
|
|
|
|
def setup_dependencies():
    """Install the dev build of transformers, once per machine.

    A marker file under ``/tmp`` makes repeated restarts on the same
    container skip the slow ``--force-reinstall``. Install failures are
    logged and swallowed (best-effort) so the app can still start with
    whatever transformers version is already present.

    Returns:
        None.
    """
    marker = '/tmp/deps_installed'
    try:
        # Fast path: a previous run on this machine already installed it.
        if os.path.exists(marker):
            return

        print("Installing transformers dev version...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git"
        ])

        # Write the marker only after a successful install so a failed
        # attempt is retried on the next start.
        with open(marker, 'w') as f:
            f.write('done')

    except (subprocess.CalledProcessError, OSError) as e:
        # Narrowed from a bare `except Exception` so genuine programmer
        # errors are no longer silently swallowed; pip failures and
        # filesystem errors remain best-effort.
        print(f"Dependencies setup error: {e}")
|
|
|
|
|
|
|
|
# Bootstrap dependencies before importing modules that need the dev
# transformers build.
setup_dependencies()
|
|
|
|
|
import spaces

import gradio as gr

from util import Config, NemoAudioPlayer, KaniModel, Demo

import numpy as np

import torch


# Hugging Face access token used when downloading model weights;
# None when the HF_TOKEN env var is unset (public models only then).
token_ = os.getenv('HF_TOKEN')
|
|
|
|
|
|
|
|
# Registry of selectable checkpoints: dropdown key -> Config.
# 'base' uses Config()'s default model; the others override only the
# model name with Expresso fine-tunes (female/male voices).
models_configs = {
    'base': Config(),
    'female': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
    ),
    'male': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
    )
}


# Single audio-codec player shared by every model wrapper.
player = NemoAudioPlayer(Config())
# Pre-rendered demo audio; used below as demo_examples[text], so it is
# presumably a mapping from example text to waveform — verify in util.
demo_examples = Demo()()

# Eagerly load all checkpoints at import time so the first request is
# fast. NOTE(review): this holds three models in memory at once —
# confirm headroom on the target hardware.
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")
|
|
|
|
|
|
|
|
# Runs the handler on a GPU worker allocated by Hugging Face Spaces
# (ZeroGPU). NOTE(review): semantics come from the `spaces` package.
@spaces.GPU
|
|
def generate_speech_gpu(text, model_choice):
    """Synthesize speech for ``text`` with the model named by ``model_choice``.

    Returns:
        ``((sample_rate, waveform), time_report)`` on success, or
        ``(None, user_message)`` when validation fails or generation raises.
    """
    # Guard clauses: reject blank input and a missing model selection
    # before any heavy work runs.
    if not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."

    try:
        # Log which device this worker actually got.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        print(f"Generating speech with {model_choice}...")
        waveform, _, timing = models[model_choice].run_model(text)

        # Hard-coded output sample rate; assumed to match the codec's
        # output — confirm if checkpoints change.
        rate_hz = 22050
        print("Speech generation completed!")

        return (rate_hz, waveform), timing

    except Exception as e:
        # Surface any failure to the UI as a message instead of
        # crashing the Gradio handler.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
|
|
|
|
|
|
|
|
# Gradio UI: model picker + text box on the left, audio player and
# timing report on the right. Built at import time; launched in main.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        with gr.Column(scale=1):
            # Voice choices mirror the keys of models_configs above;
            # the first key ('base') is the default selection.
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice"
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # Main action: run TTS on the GPU worker and fill both outputs.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output]
    )

    gr.Markdown("## Examples")

    def play_demo(text):
        # Serve a pre-rendered waveform for the example text instead of
        # re-running the model; demo_examples maps text -> audio.
        return (22050, demo_examples[text]), 'DEMO'

    with gr.Row():

        # (text, model) rows shown as clickable examples.
        examples = [
            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male"],
            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male"],
            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male"],
            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female"],
            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male"],
            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female"],
            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female"],
            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female"],
        ]


        # NOTE(review): with cache_examples=True Gradio calls `fn` with
        # one value per component in `inputs` (two here), but this lambda
        # accepts a single parameter — confirm the arity, and note the
        # default `t=text_input` binds the component object, not a value.
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown],
            fn=lambda t=text_input: play_demo(t),
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
|
|
|
|
|
# Entry point: bind on all interfaces at port 7860 (the port Hugging
# Face Spaces expects) and show tracebacks in the UI for debugging.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )