Spaces:

NeuralFalcon
/

anycoder-ee200fb6

Runtime error

File size: 7,149 Bytes

329d2b4

import gradio as gr
import time
import os
from utils import generate_dummy_audio, MOCK_LOGS

# -----------------------------------------------------------------------------
# Model Inference Wrapper
# -----------------------------------------------------------------------------
def run_vibevoice(
    text_prompt: str,
    reference_audio: str,
    speed: float,
    temperature: float,
    progress: gr.Progress = gr.Progress(),
):
    """
    Wrapper function for VibeVoice inference.

    The body is currently a simulation: it reports staged progress with short
    sleeps and returns whatever ``generate_dummy_audio`` produces. Replace the
    simulation block with real model inference when a model is available.

    Args:
        text_prompt: The text to be spoken.
        reference_audio: Path to the reference audio file for style cloning.
            May be falsy (e.g. ``None``) when no reference was supplied; a
            warning is shown and generation continues.
        speed: Speaking rate. Only echoed in the log by this simulation.
        temperature: Sampling temperature (creativity/variance). Only echoed
            in the log by this simulation.
        progress: Gradio progress tracker. Leave it at its default; it must
            not be listed in the event's ``inputs``.

    Returns:
        A ``(audio, log_message)`` pair: the value returned by
        ``generate_dummy_audio`` (presumably a file path -- confirm against
        ``utils``) and a multi-line status string.

    Raises:
        gr.Error: If ``text_prompt`` is empty or contains only whitespace.
    """

    # 1. Input Validation
    # Whitespace-only input has nothing to synthesize, so treat it as empty.
    if not text_prompt or not text_prompt.strip():
        raise gr.Error("Please enter text to synthesize.")

    if not reference_audio:
        # VibeVoice usually requires a reference, but we can warn if missing
        gr.Warning("No reference audio provided. Using default voice style.")

    # 2. Progress Simulation (Replace this block with actual model inference)
    # ------------------------------------------------------------------
    # Actual implementation would look like:
    # model = load_vibevoice_model()
    # audio_array = model.inference(text_prompt, reference_audio, ...)
    # return (sample_rate, audio_array), "Generation Successful"
    # ------------------------------------------------------------------

    # NOTE: the tracker comes from the `progress` default argument, which is
    # the pattern Gradio documents for reporting progress from a handler.
    simulated_stages = (
        (0, "Initializing VibeVoice...", 0.5),
        (0.3, "Analyzing Reference Audio Style...", 0.8),
        (0.6, "Synthesizing Speech...", 0.8),
        (0.9, "Finalizing Audio...", 0.3),
    )
    for fraction, description, pause_seconds in simulated_stages:
        progress(fraction, desc=description)
        time.sleep(pause_seconds)

    # Generate dummy audio for demonstration purposes
    output_audio_path = generate_dummy_audio(duration=3)

    reference_name = os.path.basename(reference_audio) if reference_audio else 'None'
    log_message = (
        f"✅ Generation Complete\n"
        f"📝 Text length: {len(text_prompt)} chars\n"
        f"🎚️ Speed: {speed}x | 🌡️ Temp: {temperature}\n"
        f"🎤 Reference: {reference_name}"
    )

    return output_audio_path, log_message

# -----------------------------------------------------------------------------
# Custom Theme Definition
# -----------------------------------------------------------------------------
# Creating a professional Microsoft-inspired blue theme
# Soft base theme with a blue primary palette and slate secondary/neutral
# palettes, large text and medium corner radius.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    # "Segoe UI" is a system font that Google Fonts does not host, so it is
    # named as a local font and followed by generic fallbacks for machines
    # where it is not installed.
    font=["Segoe UI", "ui-sans-serif", "system-ui", "sans-serif"],
    text_size="lg",
    radius_size="md",
).set(
    # "*name" values refer to other theme variables rather than literal CSS.
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
    block_shadow="*shadow_drop_lg",
)

# -----------------------------------------------------------------------------
# Gradio 6 UI Layout
# -----------------------------------------------------------------------------
# Note: app-level options (theme, css, footer links) are passed to launch() below, not to gr.Blocks()
# Builds the whole page: a header, a two-column body (inputs on the left,
# results on the right), the click handler wiring and a set of example inputs.
with gr.Blocks() as demo:

    # Header Section
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🗣️ Microsoft VibeVoice")
            gr.Markdown("### Zero-shot Text-to-Speech with Emotion & Style Transfer")

    with gr.Row():
        # The "header-link" class is the hook for the CSS passed to launch().
        gr.Markdown(
            "Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)",
            elem_classes=["header-link"]
        )

    # Main Content
    with gr.Row():

        # Left Column: Inputs
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. Input Text")
                input_text = gr.Textbox(
                    label="Text to Speech",
                    placeholder="Enter the text you want VibeVoice to speak...",
                    lines=4,
                    max_lines=8,
                    value="The quick brown fox jumps over the lazy dog, demonstrating the amazing capabilities of modern voice synthesis."
                )

            with gr.Group():
                gr.Markdown("### 2. Voice Reference (The 'Vibe')")
                # type="filepath" means the handler receives a path string
                # (or None when nothing was provided).
                ref_audio = gr.Audio(
                    label="Reference Audio",
                    sources=["upload", "microphone"],
                    type="filepath",
                    editable=True
                )

            with gr.Accordion("⚙️ Advanced Settings", open=False):
                speed_slider = gr.Slider(
                    minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                    label="Speaking Speed"
                )
                temp_slider = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.7, step=0.1,
                    label="Temperature (Variance)"
                )

            generate_btn = gr.Button("Generate Speech 🎵", variant="primary", size="lg")

        # Right Column: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 3. Generated Result")
            output_audio = gr.Audio(
                label="Synthesized Audio",
                interactive=False,
                autoplay=False
            )

            with gr.Group():
                gr.Markdown("#### Process Logs")
                # The copy control is requested through `buttons`, matching
                # the Gradio 6 style API used elsewhere in this file, instead
                # of the older `show_copy_button` flag.
                logs = gr.Textbox(
                    label="Status",
                    value="Ready to generate.",
                    lines=5,
                    interactive=False,
                    buttons=["copy"]
                )

    # -------------------------------------------------------------------------
    # Event Listeners
    # -------------------------------------------------------------------------
    # Note: using api_visibility="public" (Gradio 6 standard)
    # Input order must match run_vibevoice's positional parameters, and the
    # two outputs match its (audio, log message) return pair.
    generate_btn.click(
        fn=run_vibevoice,
        inputs=[input_text, ref_audio, speed_slider, temp_slider],
        outputs=[output_audio, logs],
        api_visibility="public"
    )

    # Example inputs to help users get started
    # Each row is [text, reference audio, speed, temperature]; no fn is given,
    # so selecting a row only fills in the input components.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the VibeVoice system.", None, 1.0, 0.7],
            ["Dramatic reading requires a specific cadence and tone.", None, 0.8, 0.9],
        ],
        inputs=[input_text, ref_audio, speed_slider, temp_slider]
    )

# -----------------------------------------------------------------------------
# App Launch
# -----------------------------------------------------------------------------
# Note: All app-level configs go here in Gradio 6
if __name__ == "__main__":
    # Extra links shown in the page footer.
    footer = [
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "VibeVoice Repo", "url": "https://github.com/microsoft/VibeVoice"},
    ]

    # Styles the header Markdown link that carries the "header-link" class:
    # muted and bold by default, blue and underlined on hover.
    header_link_css = """
        .header-link a { 
            text-decoration: none; 
            color: #666; 
            font-size: 0.9em;
            font-weight: bold;
        }
        .header-link a:hover {
            color: #2563eb;
            text-decoration: underline;
        }
        """

    # Start the app only when this file is run as a script, passing the
    # theme, footer links and CSS as launch-time options.
    demo.launch(theme=custom_theme, footer_links=footer, css=header_link_css)