"""Gradio demo app for KaniTTS text-to-speech.

On import this script installs the development version of ``transformers``
(once per container, guarded by a marker file), loads every configured
KaniTTS model, and builds the Gradio UI. Running it as a script launches
the web server.
"""

import os
import subprocess
import sys

# Fix OMP_NUM_THREADS issue before any imports
os.environ["OMP_NUM_THREADS"] = "4"


# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the transformers dev build once, recording success in a marker file.

    The install is best-effort: any failure is printed and the script
    continues with whatever version is already available.
    """
    try:
        # Check if already installed
        if os.path.exists('/tmp/deps_installed'):
            return

        print("Installing transformers dev version...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git",
        ])

        # Mark as installed so later starts in the same container skip pip
        with open('/tmp/deps_installed', 'w') as f:
            f.write('done')
    except Exception as e:
        # Deliberately non-fatal: report the problem and keep going.
        print(f"Dependencies setup error: {e}")


# Run setup (must happen before the imports below)
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch

# Get HuggingFace token
token_ = os.getenv('HF_TOKEN')

# Model configurations
models_configs = {
    'base': Config(),
    'female': Config(
        model_name='nineninesix/kani-tts-450m-0.2-ft',
    ),
    'male': Config(
        model_name='nineninesix/kani-tts-450m-0.1-ft',
    ),
}

# Global variables for models (loaded once)
player = NemoAudioPlayer(Config())
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")


@spaces.GPU
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
    """Generate speech from text using the selected model on GPU.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected with a message instead of being sent to the model.
        model_choice: Key into ``models`` selecting which model to run.
        t: Sampling temperature, passed through to ``run_model``.
        top_p: Top-p value, passed through to ``run_model``.
        rp: Repetition penalty, passed through to ``run_model``.
        max_tok: Maximum number of tokens, passed through to ``run_model``.

    Returns:
        A pair ``(audio, message)``. On success ``audio`` is
        ``(sample_rate, samples)`` and ``message`` is the time report
        returned by ``run_model``. On validation failure or any error
        ``audio`` is ``None`` and ``message`` explains the problem.
    """
    # Guard against None as well as blank text so the caller gets the
    # validation message rather than an AttributeError.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        # Check GPU availability (reported in the log only)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Get selected model; an unknown key raises KeyError, handled below
        selected_model = models[model_choice]

        # Generate audio
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)

        # NOTE(review): hard-coded rate; presumably matches the player's
        # output sample rate -- confirm against util.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        print(f"Error during generation: {e}")
        return None, f"❌ Error during generation: {e}"


# Create Gradio interface
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        # Left column: inputs and generation settings
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs),
                value=next(iter(models_configs)),
                label="Selected Model",
                info="Base generates random voices",
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10,
            )

            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        # Right column: outputs
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy",
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3,
            )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output],
    )

    with gr.Row():
        # Each row: text, model, temperature, top_p, repetition penalty, max tokens
        examples = [
            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 0.6, 0.95, 1.1, 1200],
            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 0.6, 0.95, 1.1, 1200],
            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 0.6, 0.95, 1.1, 1200],
            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 0.6, 0.95, 1.1, 1200],
            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male", 0.6, 0.95, 1.1, 1200],
            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 0.6, 0.95, 1.1, 1200],
            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female", 0.6, 0.95, 1.1, 1200],
            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 0.6, 0.95, 1.1, 1200],
        ]

        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )