"""Gradio demo for the Vui text-to-speech model.

Loads a Vui checkpoint onto the GPU at startup and serves a web UI for
generating speech from text, with sample prompts and basic sampling
controls (temperature, top-k, optional top-p, max duration).
"""

import time

import gradio as gr
import numpy as np
import torch

try:
    # Hugging Face Spaces GPU scheduler; optional outside Spaces.
    import spaces
except ImportError:
    spaces = None

torch.set_float32_matmul_precision("high")

from vui.inference import render
from vui.model import Vui

AVAILABLE_MODELS = {"COHOST": Vui.COHOST}
print(f"Available models: {list(AVAILABLE_MODELS.keys())}")

# Module-level cache of the single currently loaded model.
current_model = None
current_model_name = None


def load_and_warm_model(model_name):
    """Load *model_name* onto the GPU, evicting any previously loaded model.

    Returns the cached model unchanged when it is already the active one.
    """
    global current_model, current_model_name

    if current_model_name == model_name and current_model is not None:
        print(f"Model {model_name} already loaded!")
        return current_model

    print(f"Loading model {model_name}...")

    # Release the old model's GPU memory before loading the new checkpoint.
    if current_model is not None:
        del current_model
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    model_path = AVAILABLE_MODELS[model_name]
    model = Vui.from_pretrained_inf(model_path).cuda()

    current_model = model
    current_model_name = model_name
    print(f"Model {model_name} loaded successfully!")
    return model


# Sample prompts surfaced as one-click buttons in the UI. Bracketed tags
# ([breath], [laugh], [hesitate]) are non-verbal tokens the model renders.
SAMPLE_TEXTS = [
    """Welcome to Fluxions, the podcast where... we uh explore how technology is shaping the world around us. I'm your host, Alex. [breath] And I'm Jamie um [laugh] today, we're diving into a [hesitate] topic that's transforming customer service uh voice technology for agents. That's right. We're [hesitate] talking about the AI-driven tools that are making those long, frustrating customer service calls a little more bearable, for both the customer and the agents.""",
    """Um, hey Sarah, so I just left the meeting with the, uh, rabbit focus group and they are absolutely loving the new heritage carrots! Like, I've never seen such enthusiastic thumping in my life! The purple ones are testing through the roof - apparently the flavor profile is just amazing - and they're willing to pay a premium for them! We need to, like, triple production on those immediately and maybe consider a subscription model? 
Anyway, gotta go, but let's touch base tomorrow about scaling this before the Easter rush hits!""",
    """What an absolute joke, like I'm really not enjoying this situation where I'm just forced to say things.""",
    """ So [breath] I don't know if you've been there [breath] but I'm really pissed off. Oh no! Why, what happened? Well I went to this cafe hearth, and they gave me the worst toastie I've ever had, it didn't come with salad it was just raw. Well that's awful what kind of toastie was it? It was supposed to be a chicken bacon lettuce tomatoe, but it was fucking shite, like really bad and I honestly would have preferred to eat my own shit. [laugh] well, it must have been awful for you, I'm sorry to hear that, why don't we move on to brighter topics, like the good old weather?""",
    """Right so [breath] the thing about quantum computing is, it's not just faster classical computing, right? It's a completely different paradigm. Um, you're working with qubits that can be in superposition, and when you entangle them [hesitate] that's where the magic happens. But here's what nobody tells you, the error rates are still absolutely brutal.""",
    """Oh my god, you will not believe what just happened to me at the supermarket. So I'm standing in the queue, minding my own business, and this woman just [breath] cuts right in front of me with a trolley full of stuff! And I'm standing there with like, two items. Two! [laugh] So I said excuse me, and she just looked at me like I was the problem. The audacity, honestly.""",
    """Today we're going to be looking at how to make the perfect sourdough bread. Now [breath] the key thing that most people get wrong is the hydration level. You want to be somewhere around seventy five percent for a nice open crumb. Um, and your starter needs to be really active, I'm talking like, doubling in size within four to six hours. 
If it's not doing that, don't even bother, you'll just end up with a brick.""",
    """And the winner of this year's award goes to [hesitate] oh wow, I can barely read this, um [breath] it goes to the team from Edinburgh! [laugh] I have to say, this is absolutely deserved, they have worked so incredibly hard this year and the results speak for themselves. Congratulations to everyone involved, this is a truly special moment.""",
]

# Eagerly load the default model at import time so the first request is warm.
default_model = "COHOST" if "COHOST" in AVAILABLE_MODELS else list(AVAILABLE_MODELS.keys())[0]
model = load_and_warm_model(default_model)

# Rolling in-memory log shown in the UI textbox.
log_lines = [f"Model {default_model} loaded and ready"]


def log(msg):
    """Append *msg* to the rolling log and return the last 20 lines joined."""
    log_lines.append(msg)
    return "\n".join(log_lines[-20:])


def get_log():
    """Return the last 20 log lines without adding a new entry."""
    return "\n".join(log_lines[-20:])


def _gpu_duration(text, temperature=0.5, top_k=100, top_p=None, max_duration=120):
    """Estimate the Spaces GPU reservation in seconds, clamped to [10, 120].

    Mirrors generate()'s signature because spaces.GPU calls it with the
    same arguments as the decorated function.
    """
    return max(10, min(120, len(text) * 30 // 1000))


# Use the Spaces GPU scheduler when available; otherwise a no-op decorator.
# (Binding the decorator to a name avoids relying on PEP 614 grammar.)
_gpu_decorator = spaces.GPU(duration=_gpu_duration) if spaces else (lambda f: f)


@_gpu_decorator
def generate(text, temperature=0.5, top_k=100, top_p=None, max_duration=120):
    """Synthesize speech for *text* with the currently loaded model.

    Returns a ``((sample_rate, np.ndarray), log_text)`` tuple for Gradio,
    or ``(None, log_text)`` when there is no text or no model.
    """
    if not text.strip():
        return None, log("No text provided")
    if current_model is None:
        return None, log("No model loaded")

    print(f"Generating: {text[:50]}... [{current_model_name}]")
    t1 = time.perf_counter()

    result = render(
        current_model,
        text,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_secs=max_duration,
    )
    waveform = result.cpu()
    sr = current_model.codec.config.sample_rate

    generation_time = time.perf_counter() - t1
    audio_duration = waveform.shape[-1] / sr

    # Drop the trailing 2000 samples — presumably codec tail artifacts;
    # duration is reported from the untrimmed waveform (as before).
    if waveform.shape[-1] > 2000:
        waveform = waveform[..., :-2000]

    audio_array = waveform.flatten().numpy()
    info = f"Generated {audio_duration:.1f}s in {generation_time:.1f}s ({audio_duration/generation_time:.1f}x RT) [{current_model_name}]"
    print(info)
    return (sr, audio_array), log(info)


def change_model(model_name):
    """Swap the active model; returns updated log text for the UI."""
    try:
        log(f"Loading {model_name}...")
        load_and_warm_model(model_name)
        return log(f"Loaded {model_name}")
    except Exception as e:
        return log(f"Error loading {model_name}: {e}")


with gr.Blocks(title="Vui", theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            audio_output = gr.Audio(label=None, type="numpy", autoplay=True)
            log_output = gr.Textbox(label=None, lines=4, interactive=False, value=get_log())
        with gr.Column(scale=2):
            model_dropdown = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value=default_model,
                label=None,
            )
            text_input = gr.Textbox(
                label=None,
                placeholder="Enter text to convert to speech...",
                lines=5,
                max_lines=10,
            )
            with gr.Row():
                for i, sample in enumerate(SAMPLE_TEXTS):
                    btn = gr.Button(f"Sample {i + 1}", size="sm")
                    # idx=i binds the loop variable early (late-binding pitfall).
                    btn.click(fn=lambda idx=i: SAMPLE_TEXTS[idx], outputs=text_input)
            with gr.Accordion("Settings", open=False):
                temperature = gr.Slider(0.1, 1.0, value=0.5, step=0.1, label="Temperature")
                top_k = gr.Slider(1, 200, value=100, step=1, label="Top-K")
                use_top_p = gr.Checkbox(label="Use Top-P", value=False)
                top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P", visible=False)
                max_duration = gr.Slider(5, 120, value=120, step=5, label="Max Duration (s)")
                use_top_p.change(fn=lambda x: gr.update(visible=x), inputs=use_top_p, outputs=top_p)
            generate_btn = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                elem_id="generate-btn",
            )

    model_dropdown.change(fn=change_model, inputs=model_dropdown, outputs=log_output)

    def on_generate(text, temp, k, use_p, p, duration):
        """UI adapter: resolve the optional top-p and coerce slider floats."""
        top_p_val = p if use_p else None
        return generate(text, temp, int(k), top_p_val, int(duration))

    generate_btn.click(
        fn=on_generate,
        inputs=[text_input, temperature, top_k, use_top_p, top_p, max_duration],
        outputs=[audio_output, log_output],
    )
    text_input.submit(
        fn=on_generate,
        inputs=[text_input, temperature, top_k, use_top_p, top_p, max_duration],
        outputs=[audio_output, log_output],
    )
    demo.load(fn=lambda: SAMPLE_TEXTS[1], outputs=text_input)


if __name__ == "__main__":
    demo.launch()