Spaces:
Runtime error
Runtime error
File size: 4,055 Bytes
94d6b3b 76362b7 94d6b3b 76362b7 7960641 94d6b3b 76362b7 94d6b3b 76362b7 94d6b3b 76362b7 94d6b3b 76362b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from diffusers import StableDiffusionPipeline
import torch
import numpy as np
# Step 1: Prompt-to-Prompt Generation using BART (or any LLM except GPT or DeepSeek).
# The pipeline is built once at import time and reused by generate_prompt().
prompt_generator = pipeline(
    task="text2text-generation",
    model="facebook/bart-large-cnn",
)
def generate_prompt(description: str) -> str:
    """Expand a short description into a longer image prompt.

    The description is embedded in a fixed instruction sentence and passed to
    the module-level ``prompt_generator`` pipeline with ``max_length=150``.

    Args:
        description: Short text describing the desired image.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    instruction = f"Expand this description into a detailed prompt for an image: {description}"
    results = prompt_generator(instruction, max_length=150)
    first_result = results[0]
    return first_result['generated_text']
# Step 2: Prompt-to-Image Generation using the Stable Diffusion 2.1 base
# checkpoint, placed on the GPU when CUDA is available and on the CPU otherwise.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base"
)
stable_diffusion.to(device)
def generate_image(prompt: str, creativity: float, include_background: bool):
    """Render an image for ``prompt`` with the module-level Stable Diffusion pipeline.

    Args:
        prompt: Base text prompt.
        creativity: Values below 0.5 append a "simpler details" hint to the
            prompt; any other value appends a "highly detailed" hint.
        include_background: When truthy, a background hint is appended too.

    Returns:
        The first image produced by the pipeline for the decorated prompt.
    """
    # The controls only steer generation through extra wording in the prompt.
    detail_hint = (
        " with simpler details."
        if creativity < 0.5
        else " with highly detailed elements."
    )
    background_hint = (
        " with a vibrant and detailed background." if include_background else ""
    )
    decorated_prompt = prompt + detail_hint + background_hint

    result = stable_diffusion(decorated_prompt)
    return result.images[0]
# Step 3: Voice Input Integration using Whisper for Speech-to-Text.
# The processor and the model are loaded from the same checkpoint.
_WHISPER_CHECKPOINT = "openai/whisper-large"
processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
def _resample_linear(samples: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
    """Resample a mono float signal to ``target_rate`` by linear interpolation."""
    # NOTE: no anti-aliasing filter is applied; this is a dependency-free
    # approximation that is adequate for short voice recordings.
    n_out = max(1, int(round(samples.shape[0] * target_rate / source_rate)))
    source_times = np.arange(samples.shape[0]) / source_rate
    target_times = np.arange(n_out) / target_rate
    return np.interp(target_times, source_times, samples).astype(np.float32)


def transcribe_audio(audio: np.ndarray, sampling_rate: int) -> str:
    """Transcribe raw audio samples to text with the module-level Whisper model.

    The samples are normalised before feature extraction: integer PCM is
    scaled to float32 in [-1, 1], multi-channel audio is averaged to mono, and
    the signal is resampled to the rate the Whisper feature extractor expects
    (the extractor rejects audio declared at any other rate).

    Args:
        audio: Samples as a 1-D array, or a 2-D array assumed to be laid out
            as ``(samples, channels)``.
        sampling_rate: Sampling rate of ``audio`` in Hz.

    Returns:
        The decoded transcription, or ``""`` when ``audio`` holds no samples.

    Raises:
        ValueError: If ``sampling_rate`` is not positive.
    """
    samples = np.asarray(audio)
    if samples.size == 0:
        # Nothing was recorded, so there is nothing to transcribe.
        return ""
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate!r}")

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (e.g. int16 from a microphone) -> float32 in [-1, 1].
        info = np.iinfo(samples.dtype)
        full_scale = float(max(abs(int(info.min)), int(info.max)))
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Downmix to mono; assumes the channel axis is the second one.
        samples = samples.mean(axis=1)

    target_rate = processor.feature_extractor.sampling_rate
    if sampling_rate != target_rate:
        samples = _resample_linear(samples, sampling_rate, target_rate)

    features = processor(
        samples, sampling_rate=target_rate, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
# Step 4: Gradio Interface with Simple Controllers (Textbox, Slider, Checkbox, Audio)
def process_input(description: str, creativity: float, include_background: bool):
    """Handle a text request: expand the description, then render the image.

    Args:
        description: Short text describing the desired image.
        creativity: Forwarded unchanged to ``generate_image``.
        include_background: Forwarded unchanged to ``generate_image``.

    Returns:
        A ``(prompt, image)`` tuple: the expanded prompt and the image
        generated from it.
    """
    detailed_prompt = generate_prompt(description)
    return detailed_prompt, generate_image(detailed_prompt, creativity, include_background)
def process_audio_input(audio, sampling_rate=None):
    """Handle a voice request: transcribe the recording, then prompt and render.

    The voice Interface wires a single ``gr.Audio(type="numpy")`` input, so this
    callback receives one ``(sample_rate, samples)`` tuple (or ``None`` when
    nothing was recorded). The two-argument form is still accepted for callers
    that already hold the samples and the rate separately.

    Args:
        audio: Either a ``(sample_rate, samples)`` pair, or the samples alone
            when ``sampling_rate`` is given.
        sampling_rate: Sampling rate in Hz; omit when ``audio`` is a pair.

    Returns:
        A ``(prompt, image)`` tuple: the prompt generated from the transcribed
        speech and the image rendered from it.

    Raises:
        ValueError: If no audio was supplied, or ``sampling_rate`` is omitted
            and ``audio`` is not a two-element tuple/list.
    """
    if audio is None:
        raise ValueError("No audio received; please record a description first.")

    if sampling_rate is None:
        if not (isinstance(audio, (tuple, list)) and len(audio) == 2):
            raise ValueError(
                "Expected a (sample_rate, samples) pair when sampling_rate is omitted."
            )
        sampling_rate, audio = audio

    # Convert audio to text
    description = transcribe_audio(audio, sampling_rate)
    # The voice tab has no slider/checkbox, so fixed generation settings are used.
    prompt = generate_prompt(description)
    image = generate_image(prompt, creativity=0.7, include_background=True)
    return prompt, image
# Define Gradio interface components shared by the interfaces below
text_input = gr.Textbox(
    placeholder="E.g., A magical treehouse in the sky",
    label="Enter Description",
)
creativity_slider = gr.Slider(
    label="Creativity (0 to 1)",
    minimum=0,
    maximum=1,
    step=0.1,
    value=0.7,
)
background_checkbox = gr.Checkbox(value=True, label="Include Background")
# NOTE(review): Gradio 4 replaced `source=` with `sources=[...]` on gr.Audio;
# confirm this keyword matches the Gradio version pinned for this app.
audio_input = gr.Audio(
    source="microphone",
    type="numpy",
    label="Speak your Description",
)
# Create Gradio interface for text input: description, creativity and
# background controls go in; the expanded prompt and the image come out.
interface = gr.Interface(
    title="Magical Image Generator",
    description="Enter a short description to generate a magical image. Adjust creativity and background options.",
    theme="huggingface",
    fn=process_input,
    inputs=[text_input, creativity_slider, background_checkbox],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
)
# Add audio input interface for voice interaction: one recording goes in;
# the generated prompt and the image come out.
interface_with_audio = gr.Interface(
    title="Magical Image Generator with Voice Input",
    description="Speak a short description to generate a magical image!",
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
)
# Launch the interface with multiple tabs for text and voice input
demo = gr.TabbedInterface([interface, interface_with_audio])
demo.launch()
|