# File size: 7,149 Bytes
# Revision: 329d2b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import gradio as gr
import time
import os
from utils import generate_dummy_audio, MOCK_LOGS

# -----------------------------------------------------------------------------
# Model Inference Wrapper
# -----------------------------------------------------------------------------
def run_vibevoice(
    text_prompt: str,
    reference_audio: str,
    speed: float,
    temperature: float,
    progress=gr.Progress(),
):
    """
    Wrapper function for VibeVoice inference (currently a simulated run).

    No model is invoked here: the function validates its inputs, reports a
    few timed progress steps, and returns placeholder audio produced by
    ``generate_dummy_audio`` together with a human-readable status log.

    Args:
        text_prompt: The text to be spoken. Must contain at least one
            non-whitespace character.
        reference_audio: Path to the reference audio file for style cloning.
            May be falsy (``None`` or empty) when no reference was supplied,
            in which case a warning is shown and the run continues.
        speed: Speaking rate. Only echoed in the log by this placeholder.
        temperature: Sampling temperature (creativity/variance). Only echoed
            in the log by this placeholder.
        progress: Gradio progress tracker. Callers should leave this at its
            default; declaring it as a default argument is how Gradio
            discovers the tracker and binds it to the running event.

    Returns:
        A ``(output_audio, log_message)`` tuple: the value returned by
        ``generate_dummy_audio(duration=3)`` and a multi-line summary string.

    Raises:
        gr.Error: If ``text_prompt`` is empty or consists only of whitespace.
    """

    # 1. Input validation
    # Reject whitespace-only prompts as well: there is nothing to synthesize.
    if not text_prompt or not text_prompt.strip():
        raise gr.Error("Please enter text to synthesize.")

    if not reference_audio:
        # VibeVoice usually requires a reference, but a missing one is only
        # worth a warning here; the run proceeds without it.
        gr.Warning("No reference audio provided. Using default voice style.")

    # 2. Progress simulation (replace this block with actual model inference)
    # ------------------------------------------------------------------
    # Actual implementation would look like:
    # model = load_vibevoice_model()
    # audio_array = model.inference(text_prompt, reference_audio, ...)
    # return (sample_rate, audio_array), "Generation Successful"
    # ------------------------------------------------------------------
    progress(0, desc="Initializing VibeVoice...")
    time.sleep(0.5)

    progress(0.3, desc="Analyzing Reference Audio Style...")
    time.sleep(0.8)

    progress(0.6, desc="Synthesizing Speech...")
    time.sleep(0.8)

    progress(0.9, desc="Finalizing Audio...")
    time.sleep(0.3)

    # Generate dummy audio for demonstration purposes
    output_audio_path = generate_dummy_audio(duration=3)

    # Only the file name of the reference is logged, not its full path.
    log_message = (
        f"✅ Generation Complete\n"
        f"📝 Text length: {len(text_prompt)} chars\n"
        f"🎚️ Speed: {speed}x | 🌡️ Temp: {temperature}\n"
        f"🎤 Reference: {os.path.basename(reference_audio) if reference_audio else 'None'}"
    )

    return output_audio_path, log_message

# -----------------------------------------------------------------------------
# Custom Theme Definition
# -----------------------------------------------------------------------------
# Professional Microsoft-inspired blue theme, derived from Gradio's Soft theme.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    # "Segoe UI" is a system font, not a Google Fonts family, so it is listed
    # by name (no remote stylesheet request) followed by generic fallbacks for
    # machines where it is not installed.
    font=["Segoe UI", "ui-sans-serif", "system-ui", "sans-serif"],
    text_size="lg",
    radius_size="md",
).set(
    # Slightly darker primary button that darkens further on hover.
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    # Semi-bold block titles and a larger drop shadow around blocks.
    block_title_text_weight="600",
    block_shadow="*shadow_drop_lg",
)

# -----------------------------------------------------------------------------
# Gradio 6 UI Layout
# -----------------------------------------------------------------------------
# Builds the two-column UI: inputs on the left, generated audio and logs on the
# right. gr.Blocks() is created without arguments here; the theme, CSS and
# footer links are passed to demo.launch() at the bottom of this file.
with gr.Blocks() as demo:
    
    # Header Section: app title and tagline
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🗣️ Microsoft VibeVoice")
            gr.Markdown("### Zero-shot Text-to-Speech with Emotion & Style Transfer")
            
    # Attribution link; the "header-link" class is the hook for the custom CSS
    # supplied at launch.
    with gr.Row():
        gr.Markdown(
            "Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", 
            elem_classes=["header-link"]
        )

    # Main Content
    with gr.Row():
        
        # Left Column: Inputs
        with gr.Column(scale=1):
            # Text prompt, pre-filled with a sample sentence
            with gr.Group():
                gr.Markdown("### 1. Input Text")
                input_text = gr.Textbox(
                    label="Text to Speech",
                    placeholder="Enter the text you want VibeVoice to speak...",
                    lines=4,
                    max_lines=8,
                    value="The quick brown fox jumps over the lazy dog, demonstrating the amazing capabilities of modern voice synthesis."
                )

            # Reference audio can be uploaded or recorded; type="filepath"
            # matches run_vibevoice, which treats this value as a file path.
            with gr.Group():
                gr.Markdown("### 2. Voice Reference (The 'Vibe')")
                ref_audio = gr.Audio(
                    label="Reference Audio",
                    sources=["upload", "microphone"],
                    type="filepath",
                    editable=True
                )
                
            # Optional tuning knobs, collapsed by default
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                speed_slider = gr.Slider(
                    minimum=0.5, maximum=2.0, value=1.0, step=0.1, 
                    label="Speaking Speed"
                )
                temp_slider = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.7, step=0.1, 
                    label="Temperature (Variance)"
                )

            generate_btn = gr.Button("Generate Speech 🎵", variant="primary", size="lg")

        # Right Column: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 3. Generated Result")
            # Read-only player for the first value returned by run_vibevoice
            output_audio = gr.Audio(
                label="Synthesized Audio",
                interactive=False,
                autoplay=False
            )
            
            # Read-only status box for the log message returned by run_vibevoice
            with gr.Group():
                gr.Markdown("#### Process Logs")
                logs = gr.Textbox(
                    label="Status",
                    value="Ready to generate.",
                    lines=5,
                    interactive=False,
                    show_copy_button=True
                )

    # -------------------------------------------------------------------------
    # Event Listeners
    # -------------------------------------------------------------------------
    # Clicking the button calls run_vibevoice with the four inputs, in the same
    # order as its parameters, and routes its two return values to the audio
    # player and the status box.
    # Note: using api_visibility="public" (Gradio 6 standard)
    generate_btn.click(
        fn=run_vibevoice,
        inputs=[input_text, ref_audio, speed_slider, temp_slider],
        outputs=[output_audio, logs],
        api_visibility="public"
    )
    
    # Example inputs to help users get started. Each row supplies text,
    # reference audio (None = no reference), speed and temperature for the same
    # four input components used by the click handler.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the VibeVoice system.", None, 1.0, 0.7],
            ["Dramatic reading requires a specific cadence and tone.", None, 0.8, 0.9],
        ],
        inputs=[input_text, ref_audio, speed_slider, temp_slider]
    )

# -----------------------------------------------------------------------------
# App Launch
# -----------------------------------------------------------------------------
# App-level configuration (theme, footer links, custom CSS) is supplied to
# launch() rather than to gr.Blocks(); the server only starts when this file
# is executed as a script.
if __name__ == "__main__":
    # Links passed to launch() as footer_links.
    _footer_links = [
        dict(
            label="Built with anycoder",
            url="https://huggingface.co/spaces/akhaliq/anycoder",
        ),
        dict(
            label="VibeVoice Repo",
            url="https://github.com/microsoft/VibeVoice",
        ),
    ]

    # Styles the attribution link tagged with the "header-link" class:
    # muted and bold by default, blue and underlined on hover.
    _custom_css = """
        .header-link a { 
            text-decoration: none; 
            color: #666; 
            font-size: 0.9em;
            font-weight: bold;
        }
        .header-link a:hover {
            color: #2563eb;
            text-decoration: underline;
        }
        """

    demo.launch(
        theme=custom_theme,
        footer_links=_footer_links,
        css=_custom_css,
    )