import os
import time

import gradio as gr

from utils import generate_dummy_audio, MOCK_LOGS


# -----------------------------------------------------------------------------
# Model Inference Wrapper
# -----------------------------------------------------------------------------
def run_vibevoice(
    text_prompt: str,
    reference_audio: str,
    speed: float,
    temperature: float,
    progress: gr.Progress = gr.Progress(),
):
    """
    Wrapper function for VibeVoice inference (currently a simulated demo).

    Args:
        text_prompt: The text to be spoken. Must contain at least one
            non-whitespace character.
        reference_audio: Path to the reference audio file for style cloning.
            May be empty/None, in which case a warning is shown and the
            log reports the reference as 'None'.
        speed: Speaking rate (only echoed in the log by this demo).
        temperature: Sampling temperature (only echoed in the log by this
            demo).
        progress: Progress tracker. Gradio injects it when it is declared
            as a parameter whose default is ``gr.Progress()``; it is not
            listed in the event's ``inputs``.

    Returns:
        A ``(audio, log_message)`` pair: the value returned by
        ``generate_dummy_audio`` and a multi-line status string.

    Raises:
        gr.Error: If ``text_prompt`` is empty or whitespace-only.
    """
    # 1. Input Validation
    # Whitespace-only text has nothing to synthesize, so reject it as well.
    if not text_prompt or not text_prompt.strip():
        raise gr.Error("Please enter text to synthesize.")

    if not reference_audio:
        # VibeVoice usually requires a reference, but we can warn if missing
        gr.Warning("No reference audio provided. Using default voice style.")

    # 2. Progress Simulation (Replace this block with actual model inference)
    # ------------------------------------------------------------------
    # Actual implementation would look like:
    # model = load_vibevoice_model()
    # audio_array = model.inference(text_prompt, reference_audio, ...)
    # return (sample_rate, audio_array), "Generation Successful"
    # ------------------------------------------------------------------
    simulated_stages = (
        (0, "Initializing VibeVoice...", 0.5),
        (0.3, "Analyzing Reference Audio Style...", 0.8),
        (0.6, "Synthesizing Speech...", 0.8),
        (0.9, "Finalizing Audio...", 0.3),
    )
    for fraction, description, pause_seconds in simulated_stages:
        progress(fraction, desc=description)
        time.sleep(pause_seconds)

    # Generate dummy audio for demonstration purposes
    output_audio_path = generate_dummy_audio(duration=3)

    log_message = (
        f"✅ Generation Complete\n"
        f"📝 Text length: {len(text_prompt)} chars\n"
        f"🎚️ Speed: {speed}x | 🌡️ Temp: {temperature}\n"
        f"🎤 Reference: {os.path.basename(reference_audio) if reference_audio else 'None'}"
    )

    return output_audio_path, log_message


# -----------------------------------------------------------------------------
# Custom Theme Definition
# -----------------------------------------------------------------------------
# Creating a professional Microsoft-inspired blue theme
# NOTE(review): "Segoe UI" is requested through GoogleFont; confirm that the
# font is actually served there, otherwise a fallback font will be used.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Segoe UI"),
    text_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
    block_shadow="*shadow_drop_lg",
)

# -----------------------------------------------------------------------------
# Gradio 6 UI Layout
# -----------------------------------------------------------------------------
# Note: No parameters in gr.Blocks() for Gradio 6
with gr.Blocks() as demo:

    # Header Section
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🗣️ Microsoft VibeVoice")
            gr.Markdown("### Zero-shot Text-to-Speech with Emotion & Style Transfer")

    with gr.Row():
        gr.Markdown(
            "Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)",
            elem_classes=["header-link"],
        )

    # Main Content
    with gr.Row():

        # Left Column: Inputs
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. Input Text")
                input_text = gr.Textbox(
                    label="Text to Speech",
                    placeholder="Enter the text you want VibeVoice to speak...",
                    lines=4,
                    max_lines=8,
                    value="The quick brown fox jumps over the lazy dog, demonstrating the amazing capabilities of modern voice synthesis.",
                )

            with gr.Group():
                gr.Markdown("### 2. Voice Reference (The 'Vibe')")
                ref_audio = gr.Audio(
                    label="Reference Audio",
                    sources=["upload", "microphone"],
                    type="filepath",
                    editable=True,
                )

            with gr.Accordion("⚙️ Advanced Settings", open=False):
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speaking Speed",
                )
                temp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature (Variance)",
                )

            generate_btn = gr.Button("Generate Speech 🎵", variant="primary", size="lg")

        # Right Column: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 3. Generated Result")

            output_audio = gr.Audio(
                label="Synthesized Audio",
                interactive=False,
                autoplay=False,
            )

            with gr.Group():
                gr.Markdown("#### Process Logs")
                # NOTE(review): newer Gradio releases may expect
                # buttons=["copy"] instead of show_copy_button; confirm
                # against the installed Gradio version.
                logs = gr.Textbox(
                    label="Status",
                    value="Ready to generate.",
                    lines=5,
                    interactive=False,
                    show_copy_button=True,
                )

    # -------------------------------------------------------------------------
    # Event Listeners
    # -------------------------------------------------------------------------
    # Note: using api_visibility="public" (Gradio 6 standard)
    # The progress tracker is not listed in `inputs`; it comes from the
    # default value of run_vibevoice's `progress` parameter.
    generate_btn.click(
        fn=run_vibevoice,
        inputs=[input_text, ref_audio, speed_slider, temp_slider],
        outputs=[output_audio, logs],
        api_visibility="public",
    )

    # Example inputs to help users get started
    gr.Examples(
        examples=[
            ["Hello! This is a test of the VibeVoice system.", None, 1.0, 0.7],
            ["Dramatic reading requires a specific cadence and tone.", None, 0.8, 0.9],
        ],
        inputs=[input_text, ref_audio, speed_slider, temp_slider],
    )

# -----------------------------------------------------------------------------
# App Launch
# -----------------------------------------------------------------------------
# Note: All app-level configs go here in Gradio 6
if __name__ == "__main__":
    # NOTE(review): footer_links is given as label/url dicts here; confirm
    # that the installed Gradio version accepts this form.
    demo.launch(
        theme=custom_theme,
        footer_links=[
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
            {"label": "VibeVoice Repo", "url": "https://github.com/microsoft/VibeVoice"},
        ],
        # Split across lines with implicit concatenation; the resulting
        # string value is unchanged.
        css=(
            " .header-link a { text-decoration: none; color: #666;"
            " font-size: 0.9em; font-weight: bold; }"
            " .header-link a:hover { color: #2563eb;"
            " text-decoration: underline; } "
        ),
    )