Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration | |
| from diffusers import StableDiffusionPipeline | |
| import torch | |
| import numpy as np | |
# Step 1: prompt expansion. A text2text-generation pipeline backed by the
# facebook/bart-large-cnn checkpoint turns a short description into a prompt.
prompt_generator = pipeline(
    "text2text-generation",
    model="facebook/bart-large-cnn",
)
def generate_prompt(description: str) -> str:
    """Return the text the prompt model generates for a short description.

    The description is embedded in a fixed instruction sentence and passed to
    ``prompt_generator`` with ``max_length=150``; the ``generated_text`` field
    of the first result is returned.
    """
    instruction = f"Expand this description into a detailed prompt for an image: {description}"
    results = prompt_generator(instruction, max_length=150)
    return results[0]['generated_text']
# Step 2: prompt-to-image generation with the
# "stabilityai/stable-diffusion-2-1-base" checkpoint, placed on the GPU when
# CUDA is available and on the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base"
)
stable_diffusion.to(device)
def generate_image(prompt: str, creativity: float, include_background: bool):
    """Render an image for *prompt* after appending style hints.

    Args:
        prompt: Base text prompt.
        creativity: Values below 0.5 append a "simpler details" hint; any
            other value appends a "highly detailed elements" hint.
        include_background: When true, a background hint is appended as well.

    Returns:
        The first image produced by the ``stable_diffusion`` pipeline.
    """
    detail_hint = (
        " with simpler details."
        if creativity < 0.5
        else " with highly detailed elements."
    )
    background_hint = (
        " with a vibrant and detailed background." if include_background else ""
    )
    full_prompt = prompt + detail_hint + background_hint

    result = stable_diffusion(full_prompt)
    return result.images[0]
# Step 3: speech-to-text. The Whisper processor and the generation model are
# both loaded from the "openai/whisper-large" checkpoint.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large"
)
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large"
)
# Sampling rate (Hz) the audio is converted to before it reaches Whisper.
_WHISPER_SAMPLE_RATE = 16000


def _to_whisper_waveform(audio, sampling_rate, target_rate=_WHISPER_SAMPLE_RATE):
    """Return *audio* as mono float32 samples resampled to *target_rate* Hz."""
    samples = np.asarray(audio)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (e.g. int16 from a microphone) -> floats in about [-1, 1].
        samples = samples.astype(np.float32) / float(np.iinfo(samples.dtype).max)
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Assumes a channels-last layout, (samples, channels), as described in
        # the Gradio Audio docs -- TODO confirm for other callers.
        samples = samples.mean(axis=-1)

    if samples.size and sampling_rate != target_rate:
        # Plain linear interpolation: dependency-free, adequate for speech,
        # but it applies no anti-aliasing filter when downsampling.
        source_len = samples.shape[0]
        target_len = max(1, int(round(source_len * target_rate / sampling_rate)))
        source_times = np.arange(source_len) / sampling_rate
        target_times = np.arange(target_len) / target_rate
        samples = np.interp(target_times, source_times, samples).astype(np.float32)

    return samples


def transcribe_audio(audio: np.ndarray, sampling_rate: int) -> str:
    """Transcribe recorded speech to text with Whisper.

    The raw recording is first normalized to mono float32 at 16 kHz, because
    the Whisper feature extractor refuses input declared at any other rate
    and expects floating-point samples.

    Args:
        audio: Recorded samples, 1-D (mono) or 2-D (multi-channel); integer
            or floating-point dtype.
        sampling_rate: Sampling rate of *audio* in Hz.

    Returns:
        The decoded transcription, or an empty string for an empty recording.

    Raises:
        ValueError: If *sampling_rate* is missing or not positive.
    """
    if not sampling_rate or sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be a positive number, got {sampling_rate!r}")

    waveform = _to_whisper_waveform(audio, sampling_rate)
    if waveform.size == 0:
        # Nothing was recorded, so there is nothing to run the model on.
        return ""

    input_features = processor(
        waveform,
        sampling_rate=_WHISPER_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
| # Step 4: Gradio Interface with Simple Controllers (Textbox, Slider, Checkbox, Audio) | |
def process_input(description: str, creativity: float, include_background: bool):
    """Handle the text tab: expand the description, then render an image.

    Returns:
        A ``(prompt, image)`` pair: the prompt from ``generate_prompt`` and
        the image ``generate_image`` produced for it with the given
        creativity and background settings.
    """
    detailed_prompt = generate_prompt(description)
    rendered = generate_image(detailed_prompt, creativity, include_background)
    return detailed_prompt, rendered
def process_audio_input(audio, sampling_rate=None):
    """Handle the voice tab: transcribe speech, then generate prompt and image.

    The audio interface supplies a single input value. Per the Gradio Audio
    documentation, a numpy-type audio component delivers a
    ``(sample_rate, samples)`` tuple, so that form is unpacked here. Passing
    the samples and ``sampling_rate`` as two separate arguments still works.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple or the samples alone.
        sampling_rate: Sampling rate in Hz; required only when *audio* is not
            a tuple.

    Returns:
        A ``(prompt, image)`` pair generated from the transcribed description,
        using a creativity of 0.7 and a background.

    Raises:
        ValueError: If no audio was supplied or the sampling rate is unknown.
    """
    if audio is None:
        # Submitting without a recording leaves the input empty.
        raise ValueError("No audio was provided; please record a description first.")

    if sampling_rate is None and isinstance(audio, tuple) and len(audio) == 2:
        sampling_rate, audio = audio

    if sampling_rate is None:
        raise ValueError("The sampling rate of the audio input is unknown.")

    description = transcribe_audio(audio, sampling_rate)
    prompt = generate_prompt(description)
    image = generate_image(prompt, creativity=0.7, include_background=True)
    return prompt, image
# Input widgets for the text tab: a free-text description, a 0-1 creativity
# slider (default 0.7, step 0.1) and a background toggle (on by default).
text_input = gr.Textbox(
    label="Enter Description",
    placeholder="E.g., A magical treehouse in the sky",
)
creativity_slider = gr.Slider(
    minimum=0,
    maximum=1,
    step=0.1,
    label="Creativity (0 to 1)",
    value=0.7,
)
background_checkbox = gr.Checkbox(
    label="Include Background",
    value=True,
)
# Microphone input for the voice tab. Gradio 3.x spells the option
# `source="microphone"`; Gradio 4.x renamed it to `sources=[...]` and rejects
# the old keyword with a TypeError, so fall back to the new spelling.
try:
    audio_input = gr.Audio(
        type="numpy",
        label="Speak your Description",
        source="microphone",
    )
except TypeError:
    audio_input = gr.Audio(
        type="numpy",
        label="Speak your Description",
        sources=["microphone"],
    )
# Text tab: description, creativity and background controls go in; the
# generated prompt and the generated image come out.
interface = gr.Interface(
    fn=process_input,
    inputs=[text_input, creativity_slider, background_checkbox],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator",
    description="Enter a short description to generate a magical image. Adjust creativity and background options.",
    theme="huggingface",
)
# Voice tab: a single audio input goes in; the generated prompt and the
# generated image come out.
interface_with_audio = gr.Interface(
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator with Voice Input",
    description="Speak a short description to generate a magical image!",
)
# Serve both interfaces as tabs of one app: text input first, voice second.
gr.TabbedInterface(
    [interface, interface_with_audio],
).launch()