import gradio as gr

# Hosted models are pulled in through gr.load(). The "models/" prefix is the
# source selector Gradio expects; an identifier whose first path segment is
# anything else (e.g. a Coqui "tts_models/..." name) is rejected.
# NOTE(review): the speech-recognition model id is an assumed default and the
# image model id is taken from an earlier experiment in this file -- confirm
# both are reachable with the deployment's Hugging Face credentials.
SPEECH_TO_TEXT_MODEL = "models/openai/whisper-large-v3"
TEXT_TO_IMAGE_MODEL = "models/stabilityai/stable-diffusion-3.5-large"

# Loaded apps are callable like plain functions, which lets the two models be
# chained inside a single callback. They are loaded but never launched on
# their own: launch() blocks the script, so launching one here would keep the
# Voice-to-Image interface below from ever starting.
speech_to_text = gr.load(SPEECH_TO_TEXT_MODEL)
text_to_image = gr.load(TEXT_TO_IMAGE_MODEL)


def process_audio_and_generate_image(audio_path):
    """Transcribe an uploaded audio file and generate an image from the text.

    Args:
        audio_path: Path of the uploaded audio file as supplied by the
            ``gr.Audio(type="filepath")`` input, or ``None`` when the user
            submitted the form without uploading anything.

    Returns:
        A ``(image, transcription)`` tuple, in the order of the interface
        outputs (image first, then the transcription text).

    Raises:
        gr.Error: If no audio was provided or nothing could be transcribed.
    """
    if not audio_path:
        raise gr.Error("Please upload an audio file first.")

    transcription = speech_to_text(audio_path)
    # An empty prompt would only yield an arbitrary image, so stop early.
    if not transcription:
        raise gr.Error("No speech could be transcribed from the audio file.")

    image = text_to_image(transcription)
    return image, transcription


# Gradio interface
interface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=gr.Audio(type="filepath", label="Upload an Audio File (WAV/MP3)"),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Transcription"),
    ],
    title="Voice-to-Image Generator",
    description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
)

# Launch the interface
interface.launch()