import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from diffusers import StableDiffusionPipeline
import torch

# Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek)
# Shared text2text pipeline used by generate_prompt(); built once at import time.
prompt_generator = pipeline(
    task="text2text-generation",
    model="facebook/bart-large-cnn",
)

def generate_prompt(description: str) -> str:
    """Expand a short description into a longer image prompt.

    Args:
        description: Short, free-form description of the desired image.

    Returns:
        The ``generated_text`` of the first result produced by the
        module-level ``prompt_generator`` pipeline.
    """
    instruction = f"Expand this description into a detailed prompt for an image: {description}"
    candidates = prompt_generator(instruction, max_length=150)
    first_candidate = candidates[0]
    return first_candidate['generated_text']

# Step 2: Prompt-to-Image Generation using Stable Diffusion 2.1 base (running on CPU)
# Load the pipeline and place it on the CPU (instead of a GPU) in one expression.
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base"
).to("cpu")

def generate_image(prompt: str):
    """Render an image for ``prompt`` with the module-level Stable Diffusion pipeline.

    Args:
        prompt: Text prompt handed to the pipeline.

    Returns:
        The first entry of the pipeline result's ``images``.
    """
    rendered = stable_diffusion(prompt).images
    return rendered[0]

# Step 5: Voice Input Integration using Whisper for Speech-to-Text
# The processor and the model are loaded from the same Whisper checkpoint.
_WHISPER_CHECKPOINT = "openai/whisper-large"
processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)

def transcribe_audio(audio):
    """Transcribe recorded speech to text with Whisper.

    Args:
        audio: The value delivered by the ``gr.Audio(type="numpy")`` input: a
            ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded. ``samples`` may be mono ``(n,)`` or multi-channel
            ``(n, channels)``, as integer PCM or floating point.

    Returns:
        The decoded transcription, or an empty string when there is no audio
        to transcribe.
    """
    if audio is None:
        return ""

    sample_rate, samples = audio
    waveform = torch.as_tensor(samples)
    if waveform.numel() == 0:
        return ""

    # Whisper's feature extractor works on float waveforms, so integer PCM
    # is scaled by its full-scale value into [-1, 1).
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        full_scale = torch.iinfo(waveform.dtype).max + 1
        waveform = waveform.to(torch.float32) / full_scale

    # Down-mix (samples, channels) recordings to a single channel.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=-1)

    # Whisper checkpoints take 16 kHz input; microphones usually record at a
    # higher rate. Linear interpolation is a basic resampler that avoids
    # pulling in an extra audio dependency.
    target_rate = 16000
    if sample_rate != target_rate:
        new_length = max(1, round(waveform.numel() * target_rate / sample_rate))
        waveform = torch.nn.functional.interpolate(
            waveform[None, None, :],
            size=new_length,
            mode="linear",
            align_corners=False,
        )[0, 0]

    audio_input = processor(
        waveform.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    ).input_features
    predicted_ids = model.generate(audio_input)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)

# Step 3: Gradio Interface with Multiple Controllers (Textbox, Slider, Checkbox, Audio)
def process_input(description: str, creativity: float, include_background: bool):
    """Turn a typed description into an expanded prompt and a rendered image.

    Args:
        description: Short text description entered by the user.
        creativity: Value of the creativity slider. Accepted so the signature
            matches the interface inputs; it is not currently forwarded to
            either model.
        include_background: When true, a background clause is appended to the
            generated prompt before the image is rendered.

    Returns:
        A ``(prompt, image)`` tuple: the final prompt text and the image
        generated from it.

    Raises:
        gr.Error: If ``description`` is empty or only whitespace.
    """
    # Fail fast with a user-visible message rather than running both models
    # on an empty request.
    if not description or not description.strip():
        raise gr.Error("Please enter a description before generating an image.")

    prompt = generate_prompt(description)

    # Optionally ask for a background, depending on the checkbox.
    if include_background:
        prompt += " with a detailed, vibrant background."

    return prompt, generate_image(prompt)

def process_audio_input(audio):
    """Turn a spoken description into an expanded prompt and a rendered image.

    Args:
        audio: Value of the audio input component, or ``None`` when the user
            submitted without recording anything.

    Returns:
        A ``(prompt, image)`` tuple: the prompt generated from the
        transcription and the image generated from that prompt.

    Raises:
        gr.Error: If no audio was provided or the transcription is blank.
    """
    if audio is None:
        raise gr.Error("Please record a description before generating an image.")

    description = transcribe_audio(audio)
    # Silence or unintelligible audio can transcribe to nothing; stop here
    # instead of rendering an image from an empty description.
    if not description or not description.strip():
        raise gr.Error("No speech could be recognised in the recording. Please try again.")

    prompt = generate_prompt(description)
    return prompt, generate_image(prompt)

# Define Gradio interface
# Input components for the text tab.
text_input = gr.Textbox(
    label="Enter Description",
    placeholder="E.g., A magical treehouse in the sky",
)
creativity_slider = gr.Slider(
    label="Creativity (0 to 1)",
    minimum=0,
    maximum=1,
    step=0.1,
    value=0.7,
)
background_checkbox = gr.Checkbox(
    label="Include Background",
    value=True,
)

# Input component for the voice tab; type="numpy" hands the recording to the
# callback as array data rather than a file path.
audio_input = gr.Audio(
    label="Speak your Description",
    type="numpy",
)

# Create the text-driven interface (description, creativity slider, background checkbox)
interface = gr.Interface(
    title="Magical Image Generator",
    description="Enter a short description or speak it to generate a magical image! Adjust creativity and background options.",
    theme="huggingface",
    fn=process_input,
    # Order must match the parameters of process_input.
    inputs=[text_input, creativity_slider, background_checkbox],
    # Order must match the (prompt, image) tuple returned by process_input.
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
)

# Add audio input for voice interaction
interface_with_audio = gr.Interface(
    title="Magical Image Generator with Voice Input",
    description="Speak a short description and generate a magical image!",
    fn=process_audio_input,
    inputs=[audio_input],
    # Order must match the (prompt, image) tuple returned by process_audio_input.
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
)

# Launch the interface with multiple tabs for text and voice input
gr.TabbedInterface(
    [interface, interface_with_audio],
    # Name the tabs explicitly so users can tell the text and voice flows apart.
    tab_names=["Text Input", "Voice Input"],
).launch()