"""Gradio demo: generate an image and a spoken rendition of a text prompt.

Uses the Hugging Face Inference API (no local GPU required):
- Image model: black-forest-labs/FLUX.1-schnell (fast text-to-image)
- TTS model:   facebook/mms-tts-eng (English text-to-speech)
"""
import tempfile

import gradio as gr
from huggingface_hub import InferenceClient
import torch  # NOTE(review): unused in this script — presumably a leftover; confirm before removing

# Single shared client; auth/token is picked up from the environment if configured.
client = InferenceClient()


def generate_all(text):
    """Generate an image and a TTS audio file from *text*.

    Args:
        text: The user's prompt.

    Returns:
        A (PIL image, audio filepath) tuple matching the two Gradio outputs.
    """
    # 1. Generate Image
    print(f"Generating image for: {text}")
    image = client.text_to_image(text, model="black-forest-labs/FLUX.1-schnell")

    # 2. Generate Audio (TTS)
    print(f"Generating audio for: {text}")
    # We'll use a widely available TTS model via the API
    audio_response = client.text_to_speech(text, model="facebook/mms-tts-eng")

    # Write audio bytes to a unique temp file so concurrent requests don't
    # clobber each other (a fixed "output.wav" would race under multiple users).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_response)
        audio_path = f.name

    return image, audio_path


# Create the UI
with gr.Blocks(title="AI Image & Voice Creator") as demo:
    gr.Markdown("# 🎨 AI Image & Voice Creator")
    gr.Markdown("Type a prompt below to generate an image and hear it spoken!")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Enter your prompt", placeholder="A futuristic city at sunset...")
            btn = gr.Button("Generate ✨", variant="primary")

    with gr.Row():
        output_img = gr.Image(label="Generated Image")
        output_audio = gr.Audio(label="Spoken Prompt", type="filepath")

    btn.click(fn=generate_all, inputs=input_text, outputs=[output_img, output_audio])

    gr.Examples(
        examples=["A cute robot painting a masterpiece", "A mysterious forest with glowing mushrooms"],
        inputs=input_text,
    )

if __name__ == "__main__":
    demo.launch()