# NeuralFalcon's picture
# Upload folder using huggingface_hub
# 329d2b4 verified
import gradio as gr
import time
import os
from utils import generate_dummy_audio, MOCK_LOGS
# -----------------------------------------------------------------------------
# Model Inference Wrapper
# -----------------------------------------------------------------------------
def run_vibevoice(
    text_prompt: str,
    reference_audio: str,
    speed: float,
    temperature: float,
    progress: gr.Progress = gr.Progress(),
):
    """
    Wrapper function for VibeVoice inference (currently a simulated demo).

    Validates the inputs, reports staged progress while sleeping to mimic
    model latency, then returns the placeholder audio produced by
    ``generate_dummy_audio`` together with a status summary.

    Args:
        text_prompt: The text to be spoken.
        reference_audio: Path to the reference audio file for style cloning.
            May be falsy (``None`` or empty) when no reference was supplied.
        speed: Speaking rate. Currently only echoed in the status summary.
        temperature: Sampling temperature (creativity/variance). Currently
            only echoed in the status summary.
        progress: Progress tracker. Gradio documents that the tracker has to
            be declared as a parameter whose default is a ``gr.Progress``
            instance; it is not listed among the event's ``inputs``.

    Returns:
        A ``(output_audio_path, log_message)`` tuple: whatever
        ``generate_dummy_audio`` returned, and a multi-line status string.

    Raises:
        gr.Error: If ``text_prompt`` is empty or contains only whitespace.
    """
    # 1. Input Validation
    # A prompt made only of whitespace has nothing to synthesize either.
    if not text_prompt or not text_prompt.strip():
        raise gr.Error("Please enter text to synthesize.")
    if not reference_audio:
        # VibeVoice usually requires a reference, but we can warn if missing
        gr.Warning("No reference audio provided. Using default voice style.")

    # 2. Progress Simulation (Replace this block with actual model inference)
    # ------------------------------------------------------------------
    # Actual implementation would look like:
    # model = load_vibevoice_model()
    # audio_array = model.inference(text_prompt, reference_audio, ...)
    # return (sample_rate, audio_array), "Generation Successful"
    # ------------------------------------------------------------------
    # (completed fraction, status text, seconds to pause afterwards)
    simulated_stages = (
        (0, "Initializing VibeVoice...", 0.5),
        (0.3, "Analyzing Reference Audio Style...", 0.8),
        (0.6, "Synthesizing Speech...", 0.8),
        (0.9, "Finalizing Audio...", 0.3),
    )
    for fraction, description, pause_seconds in simulated_stages:
        progress(fraction, desc=description)
        time.sleep(pause_seconds)

    # Generate dummy audio for demonstration purposes
    output_audio_path = generate_dummy_audio(duration=3)

    # Show only the file name of the reference, or 'None' when it is missing.
    reference_name = (
        os.path.basename(reference_audio) if reference_audio else "None"
    )
    log_message = (
        "✅ Generation Complete\n"
        f"📝 Text length: {len(text_prompt)} chars\n"
        f"🎚️ Speed: {speed}x | 🌡️ Temp: {temperature}\n"
        f"🎤 Reference: {reference_name}"
    )
    return output_audio_path, log_message
# -----------------------------------------------------------------------------
# Custom Theme Definition
# -----------------------------------------------------------------------------
# Creating a professional Microsoft-inspired blue theme
# Segoe UI is a Microsoft system font that Google Fonts does not host, so it
# is requested by name (plain strings are treated as locally available fonts)
# and followed by generic fallbacks for machines that do not have it.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
    font=["Segoe UI", "system-ui", "sans-serif"],
    text_size="lg",
    radius_size="md",
).set(
    # Overrides applied on top of the Soft theme's defaults.
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
    block_shadow="*shadow_drop_lg",
)
# -----------------------------------------------------------------------------
# Gradio 6 UI Layout
# -----------------------------------------------------------------------------
# Note: No parameters in gr.Blocks() for Gradio 6
with gr.Blocks() as demo:
    # Header Section
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🗣️ Microsoft VibeVoice")
            gr.Markdown("### Zero-shot Text-to-Speech with Emotion & Style Transfer")
    with gr.Row():
        # The "header-link" class is styled by the CSS passed to demo.launch().
        gr.Markdown(
            "Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)",
            elem_classes=["header-link"],
        )

    # Main Content
    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. Input Text")
                input_text = gr.Textbox(
                    label="Text to Speech",
                    placeholder="Enter the text you want VibeVoice to speak...",
                    lines=4,
                    max_lines=8,
                    value=(
                        "The quick brown fox jumps over the lazy dog, "
                        "demonstrating the amazing capabilities of modern "
                        "voice synthesis."
                    ),
                )
            with gr.Group():
                gr.Markdown("### 2. Voice Reference (The 'Vibe')")
                # type="filepath" hands run_vibevoice a path string.
                ref_audio = gr.Audio(
                    label="Reference Audio",
                    sources=["upload", "microphone"],
                    type="filepath",
                    editable=True,
                )
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                speed_slider = gr.Slider(
                    minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                    label="Speaking Speed",
                )
                temp_slider = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.7, step=0.1,
                    label="Temperature (Variance)",
                )
            generate_btn = gr.Button(
                "Generate Speech 🎵", variant="primary", size="lg"
            )

        # Right Column: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 3. Generated Result")
            output_audio = gr.Audio(
                label="Synthesized Audio",
                interactive=False,
                autoplay=False,
            )
            with gr.Group():
                gr.Markdown("#### Process Logs")
                # NOTE(review): confirm `show_copy_button` is still accepted by
                # the Gradio version this app is deployed with.
                logs = gr.Textbox(
                    label="Status",
                    value="Ready to generate.",
                    lines=5,
                    interactive=False,
                    show_copy_button=True,
                )

    # -------------------------------------------------------------------------
    # Event Listeners
    # -------------------------------------------------------------------------
    # Note: using api_visibility="public" (Gradio 6 standard)
    # The four inputs map positionally onto run_vibevoice's first four
    # parameters; its two return values fill the audio player and status box.
    generate_btn.click(
        fn=run_vibevoice,
        inputs=[input_text, ref_audio, speed_slider, temp_slider],
        outputs=[output_audio, logs],
        api_visibility="public",
    )

    # Example inputs to help users get started
    # Each row is [text, reference audio, speed, temperature]; None leaves the
    # reference audio empty.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the VibeVoice system.", None, 1.0, 0.7],
            ["Dramatic reading requires a specific cadence and tone.", None, 0.8, 0.9],
        ],
        inputs=[input_text, ref_audio, speed_slider, temp_slider],
    )
# -----------------------------------------------------------------------------
# App Launch
# -----------------------------------------------------------------------------
# Note: All app-level configs go here in Gradio 6
if __name__ == "__main__":
    # Start the app only when this file is executed directly. The theme,
    # footer links and custom CSS are all supplied here, at launch time.
    demo.launch(
        # Styles for the "Built with anycoder" link (elements tagged with the
        # "header-link" class): muted grey by default, blue and underlined on
        # hover.
        css=(
            "\n"
            ".header-link a {\n"
            "text-decoration: none;\n"
            "color: #666;\n"
            "font-size: 0.9em;\n"
            "font-weight: bold;\n"
            "}\n"
            ".header-link a:hover {\n"
            "color: #2563eb;\n"
            "text-decoration: underline;\n"
            "}\n"
        ),
        footer_links=[
            dict(
                label="Built with anycoder",
                url="https://huggingface.co/spaces/akhaliq/anycoder",
            ),
            dict(
                label="VibeVoice Repo",
                url="https://github.com/microsoft/VibeVoice",
            ),
        ],
        theme=custom_theme,
    )