Spaces:

hellokawei
/

image

Runtime error

App Files Files Community

image / app.py

hellokawei

Update app.py

7960641 verified 8 months ago

raw

history blame contribute delete

4.06 kB

	import gradio as gr
	from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
	from diffusers import StableDiffusionPipeline
	import torch
	import numpy as np

	# Step 1: Prompt-to-Prompt Generation.
	# A text2text-generation pipeline backed by the facebook/bart-large-cnn
	# checkpoint; it is loaded once at import time and reused for every request.
	prompt_generator = pipeline(
	    "text2text-generation",
	    model="facebook/bart-large-cnn",
	)

	def generate_prompt(description: str) -> str:
	    """Turn a short description into a longer image prompt.

	    The description is embedded in a fixed instruction sentence and passed
	    to the module-level ``prompt_generator`` pipeline with ``max_length=150``.

	    Args:
	        description: Short free-text description supplied by the user.

	    Returns:
	        The ``generated_text`` field of the first pipeline result.
	    """
	    instruction = f"Expand this description into a detailed prompt for an image: {description}"
	    results = prompt_generator(instruction, max_length=150)
	    first_result = results[0]
	    return first_result['generated_text']

	# Step 2: Prompt-to-Image Generation using the
	# stabilityai/stable-diffusion-2-1-base checkpoint (with GPU/CPU support).
	# Use CUDA when torch reports it as available, otherwise run on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# The pipeline is loaded once at import time and then moved to the chosen device.
	stable_diffusion = StableDiffusionPipeline.from_pretrained(
	    "stabilityai/stable-diffusion-2-1-base"
	)
	stable_diffusion.to(device)

	def generate_image(prompt: str, creativity: float, include_background: bool):
	    """Render an image for ``prompt`` with the module-level Stable Diffusion pipeline.

	    The prompt is extended with fixed phrases before generation:
	    a "simpler details" phrase when ``creativity`` is below 0.5, otherwise a
	    "highly detailed" phrase, followed by a background phrase when
	    ``include_background`` is true.

	    Args:
	        prompt: Base text prompt.
	        creativity: Value compared against 0.5 to pick the detail phrase.
	        include_background: Whether to append the background phrase.

	    Returns:
	        The first entry of the pipeline result's ``images`` list.
	    """
	    detail_phrase = (
	        " with simpler details."
	        if creativity < 0.5
	        else " with highly detailed elements."
	    )
	    pieces = [prompt, detail_phrase]
	    if include_background:
	        pieces.append(" with a vibrant and detailed background.")
	    full_prompt = "".join(pieces)

	    # Only the first generated image is returned to the caller.
	    result = stable_diffusion(full_prompt)
	    return result.images[0]

	# Step 3: Voice Input Integration using Whisper for Speech-to-Text.
	# Both objects come from the openai/whisper-large checkpoint and are loaded
	# once at import time: `processor` prepares audio features and decodes token
	# ids, `model` generates the token ids.
	processor = WhisperProcessor.from_pretrained(
	    "openai/whisper-large"
	)
	model = WhisperForConditionalGeneration.from_pretrained(
	    "openai/whisper-large"
	)

	def transcribe_audio(audio: np.ndarray, sampling_rate: int) -> str:
	    """Transcribe raw audio samples to text with the module-level Whisper model.

	    The samples are normalised before feature extraction because the Whisper
	    feature extractor only accepts 16 kHz input, while microphone recordings
	    usually arrive as integer PCM at the capture device's own rate:

	    * integer samples are scaled to float32 by the dtype's maximum value;
	    * 2-D input is averaged over axis 1 to obtain mono audio
	      (assumes a ``(samples, channels)`` layout -- TODO confirm for callers
	      other than Gradio's numpy audio);
	    * audio at any other rate is resampled to 16 kHz by linear interpolation
	      (no anti-aliasing filter, which is adequate for speech but not hi-fi).

	    Args:
	        audio: Array of audio samples.
	        sampling_rate: Sample rate of ``audio`` in Hz.

	    Returns:
	        The decoded transcription with special tokens removed.
	    """
	    target_rate = 16000

	    samples = np.asarray(audio)
	    if np.issubdtype(samples.dtype, np.integer):
	        # Scale integer PCM into roughly [-1, 1].
	        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
	    else:
	        samples = samples.astype(np.float32)

	    if samples.ndim == 2:
	        # Down-mix multi-channel audio to mono.
	        samples = samples.mean(axis=1)

	    if sampling_rate != target_rate and samples.size > 0:
	        source_len = samples.shape[0]
	        target_len = max(1, int(round(source_len * target_rate / sampling_rate)))
	        # Positions of the new samples, expressed in source-sample units.
	        new_positions = np.arange(target_len) * (sampling_rate / target_rate)
	        samples = np.interp(
	            new_positions, np.arange(source_len), samples
	        ).astype(np.float32)

	    audio_input = processor(
	        samples, sampling_rate=target_rate, return_tensors="pt"
	    ).input_features
	    predicted_ids = model.generate(audio_input)
	    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
	    return transcription

	# Step 4: Gradio Interface with Simple Controllers (Textbox, Slider, Checkbox, Audio)
	def process_input(description: str, creativity: float, include_background: bool):
	    """Handle a text request: expand the description, then render it.

	    Args:
	        description: Short description typed by the user.
	        creativity: Forwarded to ``generate_image``.
	        include_background: Forwarded to ``generate_image``.

	    Returns:
	        A ``(prompt, image)`` tuple: the prompt from ``generate_prompt`` and
	        the image ``generate_image`` produced for it.
	    """
	    detailed_prompt = generate_prompt(description)
	    rendered = generate_image(detailed_prompt, creativity, include_background)
	    return detailed_prompt, rendered

	def process_audio_input(audio, sampling_rate=None):
	    """Handle a voice request: transcribe, expand to a prompt, then render it.

	    The voice interface passes exactly one input value. For a numpy-typed
	    Gradio audio component that value is a ``(sample_rate, samples)`` tuple,
	    so ``sampling_rate`` is optional and the tuple is unpacked when it is
	    omitted. Passing the samples and the rate as two separate arguments
	    still works.

	    Args:
	        audio: Either a ``(sample_rate, samples)`` tuple, or the samples
	            array when ``sampling_rate`` is given separately.
	        sampling_rate: Sample rate in Hz; leave as ``None`` when ``audio``
	            is a ``(sample_rate, samples)`` tuple.

	    Returns:
	        A ``(prompt, image)`` tuple for the transcribed description.

	    Raises:
	        ValueError: If no audio was provided.
	    """
	    if audio is None:
	        # Submitting without a recording would otherwise fail with an
	        # opaque unpacking/TypeError further down.
	        raise ValueError("No audio was provided; record a description first.")

	    if sampling_rate is None:
	        sampling_rate, audio = audio

	    # Convert audio to text
	    description = transcribe_audio(audio, sampling_rate)
	    # Generate a prompt and image based on transcribed text; the voice tab
	    # has no slider/checkbox, so fixed settings are used.
	    prompt = generate_prompt(description)
	    image = generate_image(prompt, creativity=0.7, include_background=True)
	    return prompt, image

	# Input widgets for the text-driven interface: a description box, a
	# creativity slider (0 to 1 in steps of 0.1, default 0.7) and a background
	# toggle (on by default).
	text_input = gr.Textbox(
	    label="Enter Description",
	    placeholder="E.g., A magical treehouse in the sky",
	)
	creativity_slider = gr.Slider(
	    label="Creativity (0 to 1)",
	    minimum=0,
	    maximum=1,
	    step=0.1,
	    value=0.7,
	)
	background_checkbox = gr.Checkbox(
	    label="Include Background",
	    value=True,
	)

	# Microphone input delivered to the handler as numpy data.
	# The keyword that selects the microphone differs between Gradio releases:
	# older versions take `source="microphone"`, newer ones replaced it with
	# `sources=[...]` and reject the old name with a TypeError. Try the original
	# spelling first and fall back so the app starts on either.
	try:
	    audio_input = gr.Audio(
	        type="numpy", label="Speak your Description", source="microphone"
	    )
	except TypeError:
	    audio_input = gr.Audio(
	        type="numpy", label="Speak your Description", sources=["microphone"]
	    )

	# Text-driven interface: description, creativity and background controls in;
	# the generated prompt and the generated image out.
	interface = gr.Interface(
	    fn=process_input,
	    inputs=[text_input, creativity_slider, background_checkbox],
	    outputs=[
	        gr.Textbox(label="Generated Prompt"),
	        gr.Image(label="Generated Image"),
	    ],
	    title="Magical Image Generator",
	    description="Enter a short description to generate a magical image. Adjust creativity and background options.",
	    theme="huggingface",
	)

	# Voice-driven interface: a single audio input in; the generated prompt and
	# the generated image out.
	interface_with_audio = gr.Interface(
	    fn=process_audio_input,
	    inputs=[audio_input],
	    outputs=[
	        gr.Textbox(label="Generated Prompt"),
	        gr.Image(label="Generated Image"),
	    ],
	    title="Magical Image Generator with Voice Input",
	    description="Speak a short description to generate a magical image!",
	)

	# Combine the text and voice interfaces as tabs of one app and start it.
	gr.TabbedInterface(
	    [
	        interface,
	        interface_with_audio,
	    ]
	).launch()