# Uploaded by gagndeep
# Upload folder using huggingface_hub
# Commit fa63eb7 (verified)
import gradio as gr
import requests
import io
import numpy as np
from pydub import AudioSegment
import tempfile
import os
# Application theme: start from the stock "Soft" theme with the colour
# palette, font and sizing used by this app ...
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
)
# ... then override the primary-button fills and the block title weight.
custom_theme = custom_theme.set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)
def _decode_audio_field(field):
    """Return raw audio bytes extracted from one element of the API "data" list.

    Accepts bytes, a base64 string (optionally prefixed with a
    ``data:...;base64,`` header), or a dict carrying such a string under the
    ``"data"`` key. Raises ``ValueError`` for anything else.
    """
    import base64
    import binascii

    if isinstance(field, (bytes, bytearray)):
        return bytes(field)
    if isinstance(field, dict):
        field = field.get("data")
    if isinstance(field, str):
        # Drop an optional "data:<mime>;base64," header; base64 text itself
        # never contains a comma, so everything after the last one is payload.
        _, _, encoded = field.rpartition(",")
        try:
            return base64.b64decode(encoded, validate=True)
        except (binascii.Error, ValueError) as exc:
            raise ValueError("Audio payload is not valid base64") from exc
    raise ValueError(f"Unsupported audio payload type: {type(field).__name__}")


def vibevoice_conversion(audio_file, speaker_id="default"):
    """
    Convert audio using the VibeVoice Realtime 0.5B model.

    Args:
        audio_file: Either a path to an audio file, or a
            ``(sample_rate, numpy_array)`` tuple.
        speaker_id: Speaker style identifier forwarded to the remote API.

    Returns:
        Path to a temporary WAV file containing the converted audio.

    Raises:
        gr.Error: If no audio is supplied, the remote request fails, or the
            response carries no usable audio.
    """
    import base64  # local import: not part of the file-level import block

    # Validate before entering the try block so the message is not re-wrapped.
    if audio_file is None:
        raise gr.Error("Please upload an audio file")

    api_url = "https://anycoderapps-vibevice-realtime-0-5b.hf.space/run/predict"
    request_timeout = 120  # seconds; without a timeout a stalled Space hangs the UI
    temp_audio_path = None

    try:
        # Reserve a temporary path for the normalised WAV input.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        if isinstance(audio_file, tuple):
            sample_rate, audio_data = audio_file
            # Multi-channel arrays are assumed to be laid out as
            # (samples, channels), so the channel count is the second axis.
            # NOTE(review): confirm against the Gradio Audio "numpy" format.
            channel_count = 1 if audio_data.ndim == 1 else audio_data.shape[1]
            audio_segment = AudioSegment(
                audio_data.tobytes(),
                frame_rate=sample_rate,
                sample_width=audio_data.dtype.itemsize,
                channels=channel_count,
            )
        else:
            audio_segment = AudioSegment.from_file(audio_file)
        audio_segment.export(temp_audio_path, format="wav")

        with open(temp_audio_path, "rb") as f:
            audio_bytes = f.read()

        # Raw bytes cannot be serialised to JSON, so send them base64-encoded.
        # NOTE(review): the exact schema expected by the remote Space is not
        # visible here; a base64 data URI is an assumption to be confirmed.
        encoded_audio = base64.b64encode(audio_bytes).decode("ascii")
        payload = {
            "data": [
                f"data:audio/wav;base64,{encoded_audio}",
                speaker_id,
            ]
        }

        response = requests.post(api_url, json=payload, timeout=request_timeout)

        if response.status_code != 200:
            raise gr.Error(f"VibeVoice API request failed with status code: {response.status_code}")

        result = response.json()
        if "data" not in result or len(result["data"]) == 0:
            raise gr.Error("No audio data received from VibeVoice API")

        # JSON cannot carry bytes, so decode whatever textual form came back.
        converted_audio_bytes = _decode_audio_field(result["data"][0])

        # The output file is intentionally kept (delete=False): its path is
        # handed back to the caller.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_converted:
            temp_converted.write(converted_audio_bytes)
            temp_converted_path = temp_converted.name
        return temp_converted_path
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred during voice conversion: {str(e)}") from e
    finally:
        # Always remove the temporary input file, even when the request fails.
        if temp_audio_path is not None:
            try:
                os.unlink(temp_audio_path)
            except OSError:
                # Best-effort cleanup: a missing temp file is not an error.
                pass
def process_audio(audio_file, speaker_id):
    """
    Convert the audio and return values for both output components.

    Args:
        audio_file: Audio input forwarded to ``vibevoice_conversion``.
        speaker_id: Speaker style forwarded to ``vibevoice_conversion``.

    Returns:
        A ``(converted_audio_path, status_message)`` tuple: one value for the
        audio output and one for the status textbox that the click handler
        binds to this function.

    Raises:
        gr.Error: If the conversion fails.
    """
    try:
        converted_audio_path = vibevoice_conversion(audio_file, speaker_id)
    except gr.Error:
        # Already user-facing; re-wrapping would only stack message prefixes.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing audio: {str(e)}") from e
    return converted_audio_path, "Voice conversion complete!"
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🎀 VibeVoice Realtime 0.5B - Voice Conversion")
    gr.Markdown("""
### Convert your voice to different styles using the VibeVoice Realtime 0.5B model
**Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
Upload an audio file and select a speaker style to convert your voice. The VibeVoice model can transform your voice while preserving the emotional content and prosody.
""")

    with gr.Row():
        # Left column: audio input, speaker style and the convert button.
        with gr.Column():
            gr.Markdown("### Input Audio")
            input_audio = gr.Audio(
                label="Upload your audio file",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav",
            )
            speaker_style = gr.Dropdown(
                choices=[
                    "default",
                    "female_1",
                    "male_1",
                    "child",
                    "elderly",
                    "emotional",
                ],
                value="default",
                label="Select Speaker Style",
            )
            # Label previously contained a mis-decoded emoji (mojibake).
            convert_btn = gr.Button("🔄 Convert Voice", variant="primary", size="lg")

        # Right column: converted audio and a read-only status line.
        with gr.Column():
            gr.Markdown("### Converted Audio")
            output_audio = gr.Audio(
                label="Converted Audio",
                type="filepath",
                format="wav",
            )
            status_text = gr.Textbox(
                label="Status",
                value="Ready to convert your voice!",
                interactive=False,
            )

    # Add examples
    # NOTE(review): these example.com URLs look like placeholders; replace
    # them with real audio samples before relying on the examples panel.
    examples = gr.Examples(
        examples=[
            ["https://example.com/sample1.wav", "female_1"],
            ["https://example.com/sample2.wav", "male_1"],
            ["https://example.com/sample3.wav", "emotional"],
        ],
        inputs=[input_audio, speaker_style],
        label="Try these examples:",
    )

    # Set up the conversion event. The handler must return one value per
    # output component: the converted audio path and the status text.
    convert_btn.click(
        fn=process_audio,
        inputs=[input_audio, speaker_style],
        outputs=[output_audio, status_text],
        api_visibility="public",
        api_name="convert_voice",
    )

    gr.Markdown("""
### About VibeVoice Realtime 0.5B
- **Model**: VibeVoice Realtime 0.5B
- **Size**: 0.5 Billion parameters
- **Features**: Real-time voice conversion with emotional preservation
- **Capabilities**: Speaker style transfer, emotional content preservation, high-quality voice conversion
### Tips for Best Results
- Use clear, high-quality audio recordings
- Speak naturally and expressively
- For best results, use audio samples of 5-15 seconds
- The model preserves emotional content and prosody from the original voice
""")
# Launch the application with custom theme and settings.
# NOTE(review): `title` and `description` do not appear to be keyword
# arguments of Blocks.launch() and would raise TypeError there - confirm
# against the installed Gradio version. The page title is therefore set on
# the Blocks object, and the description text is already shown by the
# Markdown header at the top of the page.
demo.title = "VibeVoice Realtime 0.5B - Voice Conversion"
demo.launch(
    theme=custom_theme,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "VibeVoice Model", "url": "https://huggingface.co/spaces/anycoderapps/VibeVoice-Realtime-0.5B"},
        {"label": "Gradio", "url": "https://gradio.app"},
        {"label": "Hugging Face", "url": "https://huggingface.co"},
    ],
    show_error=True,
)