"""Gradio demo: expand a short description into a prompt and render an image.

The app exposes two tabs. The text tab takes a typed description plus a
creativity slider and a background checkbox; the voice tab transcribes a spoken
description with Whisper. Both tabs return the generated prompt and the image
produced by Stable Diffusion running on the CPU.
"""

import gradio as gr
import numpy as np
import torch
from diffusers import StableDiffusionPipeline
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

# Sample rate passed to the Whisper processor; audio recorded at any other
# rate is resampled to this before feature extraction.
WHISPER_SAMPLE_RATE = 16000

# Guidance scale used at the two ends of the creativity slider. The values are
# chosen so that the slider default (0.7) yields 7.5, which is the guidance
# scale the pipeline applies when none is passed.
_GUIDANCE_AT_MIN_CREATIVITY = 11.0
_GUIDANCE_AT_MAX_CREATIVITY = 6.0

# Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek)
prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")


def generate_prompt(description: str) -> str:
    """Return a detailed image prompt generated from a short description."""
    request = (
        "Expand this description into a detailed prompt for an image: "
        f"{description}"
    )
    return prompt_generator(request, max_length=150)[0]["generated_text"]


# Step 2: Prompt-to-Image Generation using Stable Diffusion 2.1 base (with CPU support)
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base"
)
stable_diffusion.to("cpu")  # Use CPU instead of GPU


def generate_image(prompt: str, guidance_scale: float = 7.5):
    """Render one image for ``prompt`` with Stable Diffusion.

    Args:
        prompt: Text prompt handed to the diffusion pipeline.
        guidance_scale: Classifier-free guidance strength. Lower values let
            the sampler drift further from the prompt. The default matches the
            pipeline's own default, so existing one-argument callers are
            unaffected.

    Returns:
        The first image produced by the pipeline.
    """
    return stable_diffusion(prompt, guidance_scale=guidance_scale).images[0]


# Step 5: Voice Input Integration using Whisper for Speech-to-Text
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")


def _prepare_waveform(audio) -> np.ndarray:
    """Convert Gradio audio into a mono float32 waveform at 16 kHz.

    ``audio`` is either the ``(sample_rate, samples)`` tuple emitted by
    ``gr.Audio(type="numpy")`` or a bare array that is assumed to already be
    sampled at ``WHISPER_SAMPLE_RATE``.
    """
    if isinstance(audio, tuple):
        sample_rate, samples = audio
    else:
        sample_rate, samples = WHISPER_SAMPLE_RATE, audio

    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (typically int16) is scaled into the [-1, 1] float range.
        full_scale = float(np.iinfo(samples.dtype).max)
        waveform = samples.astype(np.float32) / full_scale
    else:
        waveform = samples.astype(np.float32)

    if waveform.ndim > 1:
        # Gradio lays multi-channel audio out as (samples, channels).
        waveform = waveform.mean(axis=1)

    if sample_rate != WHISPER_SAMPLE_RATE and waveform.size:
        # Linear-interpolation resampling keeps the app free of extra audio
        # dependencies; it is adequate for speech transcription.
        target_len = max(
            1, int(round(waveform.size * WHISPER_SAMPLE_RATE / sample_rate))
        )
        source_times = np.arange(waveform.size) / sample_rate
        target_times = np.arange(target_len) / WHISPER_SAMPLE_RATE
        waveform = np.interp(target_times, source_times, waveform).astype(
            np.float32
        )

    return waveform


def transcribe_audio(audio) -> str:
    """Transcribe recorded speech to text with Whisper.

    Args:
        audio: ``(sample_rate, samples)`` tuple from ``gr.Audio(type="numpy")``
            or a bare waveform array already at 16 kHz.

    Returns:
        The decoded transcription with special tokens removed.

    Raises:
        ValueError: If ``audio`` is ``None`` (nothing was recorded).
    """
    if audio is None:
        raise ValueError("No audio was provided for transcription.")

    waveform = _prepare_waveform(audio)
    input_features = processor(
        waveform,
        sampling_rate=WHISPER_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)


def _creativity_to_guidance_scale(creativity: float) -> float:
    """Map the creativity slider (0 to 1) onto a guidance scale.

    Higher creativity lowers the guidance scale so the image may deviate more
    from the prompt. Out-of-range values are clamped to the slider bounds.
    """
    level = min(max(float(creativity), 0.0), 1.0)
    span = _GUIDANCE_AT_MAX_CREATIVITY - _GUIDANCE_AT_MIN_CREATIVITY
    return _GUIDANCE_AT_MIN_CREATIVITY + level * span


# Step 3: Gradio Interface with Multiple Controllers (Textbox, Slider, Checkbox, Audio)
def process_input(description: str, creativity: float, include_background: bool):
    """Handle the text tab: build a prompt and render it.

    Args:
        description: Short description typed by the user.
        creativity: Slider value in [0, 1]; converted to a guidance scale.
        include_background: When true, a background clause is appended to the
            generated prompt.

    Returns:
        A ``(prompt, image)`` tuple matching the interface outputs.
    """
    prompt = generate_prompt(description)

    # Optionally modify prompt based on checkbox (for background inclusion)
    if include_background:
        prompt += " with a detailed, vibrant background."

    image = generate_image(
        prompt,
        guidance_scale=_creativity_to_guidance_scale(creativity),
    )
    return prompt, image


def process_audio_input(audio):
    """Handle the voice tab: transcribe, build a prompt and render it.

    Args:
        audio: Value supplied by the ``gr.Audio`` component, or ``None`` when
            the user submitted without recording.

    Returns:
        A ``(prompt, image)`` tuple matching the interface outputs.

    Raises:
        gr.Error: If no audio was recorded, so the UI shows a readable message.
    """
    if audio is None:
        raise gr.Error("Please record a description before submitting.")

    description = transcribe_audio(audio)
    prompt = generate_prompt(description)
    image = generate_image(prompt)
    return prompt, image


# Define Gradio interface
text_input = gr.Textbox(
    label="Enter Description",
    placeholder="E.g., A magical treehouse in the sky",
)
creativity_slider = gr.Slider(
    minimum=0,
    maximum=1,
    step=0.1,
    label="Creativity (0 to 1)",
    value=0.7,
)
background_checkbox = gr.Checkbox(label="Include Background", value=True)
audio_input = gr.Audio(type="numpy", label="Speak your Description")

# Create interface for typed descriptions
interface = gr.Interface(
    fn=process_input,
    inputs=[
        text_input,
        creativity_slider,
        background_checkbox,
    ],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator",
    description="Enter a short description or speak it to generate a magical image! Adjust creativity and background options.",
    theme="huggingface",
)

# Add audio input for voice interaction
interface_with_audio = gr.Interface(
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator with Voice Input",
    description="Speak a short description and generate a magical image!",
)

# Launch the interface with multiple tabs for text and voice input
demo = gr.TabbedInterface(
    [interface, interface_with_audio],
    tab_names=["Text Input", "Voice Input"],
)
demo.launch()