import gradio as gr

# Earlier experiments, intentionally left disabled: each line would load a
# hosted model and launch its own app before the interface below is built.
# NOTE(review): the second name has no "models/" or "spaces/" prefix like the
# first one does -- confirm it against the gr.load documentation before
# re-enabling it.
# gr.load("models/stabilityai/stable-diffusion-3.5-large").launch()
# gr.load("tts_models/multilingual/multi-dataset/xtts_v2").launch()

# Gradio interface: takes one uploaded audio file (passed to the callback as a
# file path) and displays two outputs, an image and a text box.
# NOTE(review): process_audio_and_generate_image is not defined or imported in
# the visible part of this file; it must be in scope before this statement
# runs, and is expected to return (image, transcription) in that order to
# match the `outputs` list -- confirm against its definition.
interface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=gr.Audio(type="filepath", label="Upload an Audio File (WAV/MP3)"),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Transcription"),
    ],
    title="Voice-to-Image Generator",
    description=(
        "Upload an audio file to transcribe speech to text and generate an "
        "image based on the transcription."
    ),
)

# Launch the interface only when executed as a script, so importing this
# module (e.g. from tests or tooling) does not start a web server.
if __name__ == "__main__":
    interface.launch()