"""Gradio demo: generate an image and a spoken rendition of a text prompt.

Uses the Hugging Face Inference API (no local GPU required):
- Image model: black-forest-labs/FLUX.1-schnell (fast text-to-image)
- TTS model:   facebook/mms-tts-eng (English text-to-speech)
"""
import tempfile

import gradio as gr
from huggingface_hub import InferenceClient
import torch  # NOTE(review): unused in this script — presumably a leftover; confirm before removing

# Single shared client; auth/token is picked up from the environment if configured.
client = InferenceClient()


def generate_all(text):
    """Generate an image and a TTS audio file from *text*.

    Args:
        text: The user's prompt.

    Returns:
        A (PIL image, audio filepath) tuple matching the two Gradio outputs.
    """
    # 1. Generate Image
    print(f"Generating image for: {text}")
    image = client.text_to_image(text, model="black-forest-labs/FLUX.1-schnell")

    # 2. Generate Audio (TTS)
    print(f"Generating audio for: {text}")
    # We'll use a widely available TTS model via the API
    audio_response = client.text_to_speech(text, model="facebook/mms-tts-eng")

    # Write audio bytes to a unique temp file so concurrent requests don't
    # clobber each other (a fixed "output.wav" would race under multiple users).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_response)
        audio_path = f.name

    return image, audio_path


# Create the UI
with gr.Blocks(title="AI Image & Voice Creator") as demo:
    gr.Markdown("# 🎨 AI Image & Voice Creator")
    gr.Markdown("Type a prompt below to generate an image and hear it spoken!")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Enter your prompt", placeholder="A futuristic city at sunset...")
            btn = gr.Button("Generate ✨", variant="primary")

    with gr.Row():
        output_img = gr.Image(label="Generated Image")
        output_audio = gr.Audio(label="Spoken Prompt", type="filepath")

    btn.click(fn=generate_all, inputs=input_text, outputs=[output_img, output_audio])

    gr.Examples(
        examples=["A cute robot painting a masterpiece", "A mysterious forest with glowing mushrooms"],
        inputs=input_text,
    )

if __name__ == "__main__":
    demo.launch()