"""Gradio front end that sends uploaded audio to a hosted VibeVoice endpoint.

The app collects an audio clip and a speaker style, posts them to the remote
prediction endpoint, and plays back whatever audio the endpoint returns.
"""

import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment

# Remote prediction endpoint.
# NOTE(review): the host spells "vibevice" while the footer link below points
# at the "VibeVoice-Realtime-0.5B" Space -- this looks like a typo; confirm the
# real Space URL before relying on it. Left unchanged on purpose.
VIBEVOICE_API_URL = "https://anycoderapps-vibevice-realtime-0-5b.hf.space/run/predict"

# Upper bound for the remote call so a stalled endpoint cannot block a worker
# forever (requests has no default timeout).
REQUEST_TIMEOUT_SECONDS = 120

# Create a custom theme for the application
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)


def _write_input_as_wav(audio_file, wav_path):
    """Write *audio_file* to *wav_path* as a WAV file.

    *audio_file* is either a ``(sample_rate, numpy_array)`` tuple or anything
    ``AudioSegment.from_file`` accepts (typically a file path).
    """
    if isinstance(audio_file, tuple):
        sample_rate, audio_data = audio_file
        # Gradio documents numpy audio as (samples,) for mono and
        # (samples, channels) otherwise, so the channel count is on axis 1.
        channels = 1 if audio_data.ndim == 1 else audio_data.shape[1]
        # NOTE(review): the raw buffer is handed over untouched, so this
        # presumably expects integer PCM samples -- confirm for float input.
        audio_segment = AudioSegment(
            audio_data.tobytes(),
            frame_rate=sample_rate,
            sample_width=audio_data.dtype.itemsize,
            channels=channels,
        )
    else:
        audio_segment = AudioSegment.from_file(audio_file)
    audio_segment.export(wav_path, format="wav")


def _encode_wav_for_json(wav_bytes):
    """Return *wav_bytes* as a base64 data URI so it can travel inside JSON.

    Raw ``bytes`` are not JSON serializable, so they must be text-encoded
    before being placed in the request body.
    NOTE(review): the data-URI form is an assumption about what the remote
    endpoint accepts -- confirm against the Space's API description.
    """
    encoded = base64.b64encode(wav_bytes).decode("ascii")
    return f"data:audio/wav;base64,{encoded}"


def _decode_audio_payload(payload):
    """Turn the first element of the response ``data`` list into raw bytes.

    Accepts a base64 string (with or without a ``data:...;base64,`` header) or
    a dict carrying such a string under ``"data"``.
    NOTE(review): the response schema is not visible in this file; these two
    shapes are assumptions -- confirm against the remote endpoint.

    Raises:
        gr.Error: if the payload is empty, has an unexpected type, or is not
            valid base64.
    """
    if isinstance(payload, dict):
        payload = payload.get("data")
    if not isinstance(payload, str) or not payload:
        raise gr.Error("No audio data received from VibeVoice API")
    if payload.startswith("data:") and "," in payload:
        # Drop the "data:<mime>;base64," header, keep only the encoded body.
        payload = payload.split(",", 1)[1]
    try:
        return base64.b64decode(payload, validate=True)
    except ValueError as err:  # binascii.Error is a ValueError subclass
        raise gr.Error(
            "VibeVoice API returned audio data that could not be decoded"
        ) from err


def vibevoice_conversion(audio_file, speaker_id="default"):
    """Convert audio using the remote VibeVoice endpoint.

    Args:
        audio_file: A file path or a ``(sample_rate, numpy_array)`` tuple.
        speaker_id: Speaker style identifier forwarded to the endpoint.

    Returns:
        Path of a temporary ``.wav`` file holding the converted audio. The
        file is created with ``delete=False`` and is not removed here because
        the caller (the Gradio output component) still needs to read it.

    Raises:
        gr.Error: if no audio is supplied, the request fails, or the response
            carries no usable audio.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file")

    temp_audio_path = None
    try:
        # Reserve a path for the normalised WAV; the handle is closed right
        # away so the export below can reopen the path on any platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        _write_input_as_wav(audio_file, temp_audio_path)

        with open(temp_audio_path, "rb") as f:
            audio_bytes = f.read()

        payload = {
            "data": [
                _encode_wav_for_json(audio_bytes),
                speaker_id,
            ]
        }

        response = requests.post(
            VIBEVOICE_API_URL,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

        if response.status_code != 200:
            raise gr.Error(
                f"VibeVoice API request failed with status code: {response.status_code}"
            )

        result = response.json()
        data = result.get("data") if isinstance(result, dict) else None
        if not data:
            raise gr.Error("No audio data received from VibeVoice API")

        converted_audio_bytes = _decode_audio_payload(data[0])

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_converted:
            temp_converted.write(converted_audio_bytes)
            return temp_converted.name
    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred during voice conversion: {str(e)}") from e
    finally:
        # Remove the intermediate input file on every path, not just success.
        if temp_audio_path is not None and os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)


def process_audio(audio_file, speaker_id):
    """Run the conversion and return values for both UI outputs.

    Returns:
        A ``(converted_audio_path, status_message)`` tuple, matching the two
        output components wired to the convert button.

    Raises:
        gr.Error: propagated from the conversion, or wrapping any other error.
    """
    try:
        converted_audio_path = vibevoice_conversion(audio_file, speaker_id)
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error processing audio: {str(e)}") from e
    return converted_audio_path, "Voice conversion complete!"


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 VibeVoice Realtime 0.5B - Voice Conversion")
    gr.Markdown("""
    ### Convert your voice to different styles using the VibeVoice Realtime 0.5B model

    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**

    Upload an audio file and select a speaker style to convert your voice.
    The VibeVoice model can transform your voice while preserving the emotional content and prosody.
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Audio")
            input_audio = gr.Audio(
                label="Upload your audio file",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav",
            )
            speaker_style = gr.Dropdown(
                choices=[
                    "default",
                    "female_1",
                    "male_1",
                    "child",
                    "elderly",
                    "emotional",
                ],
                value="default",
                label="Select Speaker Style",
            )
            convert_btn = gr.Button("🔄 Convert Voice", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Converted Audio")
            output_audio = gr.Audio(
                label="Converted Audio",
                type="filepath",
                format="wav",
            )
            status_text = gr.Textbox(
                label="Status",
                value="Ready to convert your voice!",
                interactive=False,
            )

    # Add examples
    # NOTE(review): these example.com URLs are placeholders -- replace with
    # real sample files before shipping.
    examples = gr.Examples(
        examples=[
            ["https://example.com/sample1.wav", "female_1"],
            ["https://example.com/sample2.wav", "male_1"],
            ["https://example.com/sample3.wav", "emotional"],
        ],
        inputs=[input_audio, speaker_style],
        label="Try these examples:",
    )

    # Set up the conversion event; process_audio returns one value per output.
    convert_btn.click(
        fn=process_audio,
        inputs=[input_audio, speaker_style],
        outputs=[output_audio, status_text],
        api_visibility="public",
        api_name="convert_voice",
    )

    gr.Markdown("""
    ### About VibeVoice Realtime 0.5B

    - **Model**: VibeVoice Realtime 0.5B
    - **Size**: 0.5 Billion parameters
    - **Features**: Real-time voice conversion with emotional preservation
    - **Capabilities**: Speaker style transfer, emotional content preservation, high-quality voice conversion

    ### Tips for Best Results

    - Use clear, high-quality audio recordings
    - Speak naturally and expressively
    - For best results, use audio samples of 5-15 seconds
    - The model preserves emotional content and prosody from the original voice
    """)

# Launch the application with custom theme and settings
# NOTE(review): `title`, `description` and dict-valued `footer_links` are
# passed through unchanged; confirm the installed Gradio version's `launch()`
# accepts each of them.
demo.launch(
    theme=custom_theme,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "VibeVoice Model", "url": "https://huggingface.co/spaces/anycoderapps/VibeVoice-Realtime-0.5B"},
        {"label": "Gradio", "url": "https://gradio.app"},
        {"label": "Hugging Face", "url": "https://huggingface.co"},
    ],
    title="VibeVoice Realtime 0.5B - Voice Conversion",
    description="Convert your voice to different styles using the VibeVoice Realtime 0.5B model",
    show_error=True,
)