# app.py — AI Image & Voice Creator (Gradio demo)
import tempfile

import gradio as gr
import torch
from huggingface_hub import InferenceClient
# Using Hugging Face Inference API for speed and no local GPU requirement
# Image Model: FLUX.1-schnell (State-of-the-art fast generation)
# TTS Model: facebook/mms-tts-eng (Simple, reliable TTS)
client = InferenceClient()
def generate_all(text):
# 1. Generate Image
print(f"Generating image for: {text}")
image = client.text_to_image(text, model="black-forest-labs/FLUX.1-schnell")
# 2. Generate Audio (TTS)
print(f"Generating audio for: {text}")
# We'll use a widely available TTS model via the API
audio_response = client.text_to_speech(text, model="facebook/mms-tts-eng")
# Save audio to a temporary file for Gradio to play
audio_path = "output.wav"
with open(audio_path, "wb") as f:
f.write(audio_response)
return image, audio_path
# Create the UI
with gr.Blocks(title="AI Image & Voice Creator") as demo:
gr.Markdown("# 🎨 AI Image & Voice Creator")
gr.Markdown("Type a prompt below to generate an image and hear it spoken!")
with gr.Row():
with gr.Column():
input_text = gr.Textbox(label="Enter your prompt", placeholder="A futuristic city at sunset...")
btn = gr.Button("Generate ✨", variant="primary")
with gr.Row():
output_img = gr.Image(label="Generated Image")
output_audio = gr.Audio(label="Spoken Prompt", type="filepath")
btn.click(fn=generate_all, inputs=input_text, outputs=[output_img, output_audio])
gr.Examples(
examples=["A cute robot painting a masterpiece", "A mysterious forest with glowing mushrooms"],
inputs=input_text
)
if __name__ == "__main__":
demo.launch()