import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment

# Create a custom theme for the application
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md"
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)

def vibevoice_conversion(audio_file, speaker_id="default"):
    """
    Convert audio using the VibeVoice Realtime 0.5B model
    """
    try:
        # Check if audio file is provided
        if audio_file is None:
            raise gr.Error("Please upload an audio file")

        # Create a temporary file to store the uploaded audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        # Save the uploaded audio to the temporary file
        if isinstance(audio_file, tuple):
            # If it's a tuple (sample_rate, audio_data)
            sample_rate, audio_data = audio_file
            # Convert numpy array to AudioSegment and export as WAV
            audio_segment = AudioSegment(
                audio_data.tobytes(),
                frame_rate=sample_rate,
                sample_width=audio_data.dtype.itemsize,
                channels=1 if len(audio_data.shape) == 1 else audio_data.shape[0]
            )
            audio_segment.export(temp_audio_path, format="wav")
        else:
            # If it's a file path
            audio_segment = AudioSegment.from_file(audio_file)
            audio_segment.export(temp_audio_path, format="wav")

        # Prepare the request to the VibeVoice API
        api_url = "https://anycoderapps-vibevice-realtime-0-5b.hf.space/run/predict"

        # Read the audio file as bytes
        with open(temp_audio_path, "rb") as f:
            audio_bytes = f.read()

        # Prepare the payload
        payload = {
            "data": [
                audio_bytes,
                speaker_id
            ]
        }

        # Send request to the VibeVoice API
        response = requests.post(api_url, json=payload)

        # Clean up temporary file
        os.unlink(temp_audio_path)

        if response.status_code == 200:
            result = response.json()
            if "data" in result and len(result["data"]) > 0:
                # Get the converted audio data
                converted_audio_bytes = result["data"][0]

                # Create a temporary file for the converted audio
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_converted:
                    temp_converted_path = temp_converted.name
                    temp_converted.write(converted_audio_bytes)

                # Return the converted audio file path
                return temp_converted_path
            else:
                raise gr.Error("No audio data received from VibeVoice API")
        else:
            raise gr.Error(f"VibeVoice API request failed with status code: {response.status_code}")

    except Exception as e:
        raise gr.Error(f"An error occurred during voice conversion: {str(e)}")

def process_audio(audio_file, speaker_id):
    """
    Process the audio file and return the converted audio
    """
    try:
        # Convert the audio using VibeVoice
        converted_audio_path = vibevoice_conversion(audio_file, speaker_id)

        # Return the converted audio
        return converted_audio_path

    except Exception as e:
        raise gr.Error(f"Error processing audio: {str(e)}")

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 VibeVoice Realtime 0.5B - Voice Conversion")
    gr.Markdown("""
    ### Convert your voice to different styles using the VibeVoice Realtime 0.5B model

    **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**

    Upload an audio file and select a speaker style to convert your voice. The VibeVoice model can transform your voice while preserving the emotional content and prosody.
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Audio")
            input_audio = gr.Audio(
                label="Upload your audio file",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav"
            )

            speaker_style = gr.Dropdown(
                choices=[
                    "default",
                    "female_1",
                    "male_1",
                    "child",
                    "elderly",
                    "emotional"
                ],
                value="default",
                label="Select Speaker Style"
            )

            convert_btn = gr.Button("🔄 Convert Voice", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Converted Audio")
            output_audio = gr.Audio(
                label="Converted Audio",
                type="filepath",
                format="wav"
            )

            status_text = gr.Textbox(
                label="Status",
                value="Ready to convert your voice!",
                interactive=False
            )

    # Add examples
    examples = gr.Examples(
        examples=[
            ["https://example.com/sample1.wav", "female_1"],
            ["https://example.com/sample2.wav", "male_1"],
            ["https://example.com/sample3.wav", "emotional"]
        ],
        inputs=[input_audio, speaker_style],
        label="Try these examples:"
    )

    # Set up the conversion event
    convert_btn.click(
        fn=process_audio,
        inputs=[input_audio, speaker_style],
        outputs=[output_audio, status_text],
        api_visibility="public",
        api_name="convert_voice"
    )

    gr.Markdown("""
    ### About VibeVoice Realtime 0.5B
    - **Model**: VibeVoice Realtime 0.5B
    - **Size**: 0.5 Billion parameters
    - **Features**: Real-time voice conversion with emotional preservation
    - **Capabilities**: Speaker style transfer, emotional content preservation, high-quality voice conversion

    ### Tips for Best Results
    - Use clear, high-quality audio recordings
    - Speak naturally and expressively
    - For best results, use audio samples of 5-15 seconds
    - The model preserves emotional content and prosody from the original voice
    """)

# Launch the application with custom theme and settings
demo.launch(
    theme=custom_theme,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "VibeVoice Model", "url": "https://huggingface.co/spaces/anycoderapps/VibeVoice-Realtime-0.5B"},
        {"label": "Gradio", "url": "https://gradio.app"},
        {"label": "Hugging Face", "url": "https://huggingface.co"}
    ],
    title="VibeVoice Realtime 0.5B - Voice Conversion",
    description="Convert your voice to different styles using the VibeVoice Realtime 0.5B model",
    show_error=True
)