hellokawei commited on
Commit
94d6b3b
Β·
verified Β·
1 Parent(s): f511c2c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
3
+ from diffusers import StableDiffusionPipeline
4
+ import torch
5
+
6
# Step 1: prompt expansion ("prompt-to-prompt") with BART — any LLM other
# than GPT or DeepSeek works here.
prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")
8
+
9
def generate_prompt(description: str) -> str:
    """Expand a short user description into a detailed image-generation prompt.

    Args:
        description: Short free-text description from the user.

    Returns:
        The expanded prompt text produced by the BART pipeline.
    """
    instruction = f"Expand this description into a detailed prompt for an image: {description}"
    results = prompt_generator(instruction, max_length=150)
    return results[0]["generated_text"]
13
+
14
# Step 2: prompt-to-image generation, placed on GPU when available, CPU otherwise.
# NOTE(review): the original comment said "Stable Diffusion v1.5", but the
# checkpoint actually loaded below is stabilityai/stable-diffusion-2-1-base.
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
stable_diffusion.to(device)
18
+
19
def generate_image(prompt: str, creativity: float, include_background: bool):
    """Render an image for *prompt* with the Stable Diffusion pipeline.

    Args:
        prompt: Base text prompt describing the image.
        creativity: Values below 0.5 request simpler details; otherwise
            highly detailed elements are requested.
        include_background: When True, also asks for a vibrant, detailed
            background.

    Returns:
        The first PIL image produced by the pipeline.
    """
    # The creativity knob and background flag are applied as prompt suffixes.
    detail_suffix = (
        " with simpler details." if creativity < 0.5 else " with highly detailed elements."
    )
    full_prompt = prompt + detail_suffix
    if include_background:
        full_prompt += " with a vibrant and detailed background."
    return stable_diffusion(full_prompt).images[0]
32
+
33
# Step 3: voice input — Whisper speech-to-text (processor + seq2seq model).
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
36
+
37
def transcribe_audio(audio):
    """Transcribe spoken audio to text with Whisper.

    Args:
        audio: Either the ``(sample_rate, samples)`` pair produced by
            ``gr.Audio(type="numpy")`` or a raw 1-D waveform already at
            16 kHz, float in [-1, 1].

    Returns:
        The decoded transcription string.
    """
    import numpy as np

    # BUG FIX: gr.Audio(type="numpy") yields a (sample_rate, int16 samples)
    # tuple; the original code passed that tuple straight to the processor
    # with no sampling_rate, which does not produce valid input features.
    if isinstance(audio, tuple):
        sampling_rate, waveform = audio
    else:
        sampling_rate, waveform = 16000, audio

    waveform = np.asarray(waveform, dtype=np.float32)
    # Stereo -> mono: Whisper's feature extractor expects a single channel.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    # Gradio delivers int16 PCM; normalize to floats in [-1, 1].
    if waveform.size and np.abs(waveform).max() > 1.0:
        waveform = waveform / 32768.0

    # Whisper is trained on 16 kHz audio; resample when the mic rate differs.
    if sampling_rate != 16000:
        from scipy.signal import resample

        target_len = int(round(len(waveform) * 16000 / sampling_rate))
        waveform = resample(waveform, target_len)

    input_features = processor(
        waveform, sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
43
+
44
# Step 4: Gradio interface with simple controllers (textbox, slider, checkbox, audio).
def process_input(description: str, creativity: float, include_background: bool):
    """Text pathway: expand the description into a prompt, then render it.

    Returns:
        (detailed_prompt, generated_image) tuple for the two UI outputs.
    """
    detailed_prompt = generate_prompt(description)
    rendered = generate_image(detailed_prompt, creativity, include_background)
    return detailed_prompt, rendered
53
+
54
def process_audio_input(audio):
    """Voice pathway: transcribe speech, expand it, render with default knobs.

    Uses creativity=0.7 and an included background, matching the defaults of
    the text-input controls.
    """
    spoken_description = transcribe_audio(audio)
    detailed_prompt = generate_prompt(spoken_description)
    rendered = generate_image(detailed_prompt, creativity=0.7, include_background=True)
    return detailed_prompt, rendered
61
+
62
# Shared UI controls for the two interfaces below.
text_input = gr.Textbox(label="Enter Description", placeholder="E.g., A magical treehouse in the sky")
creativity_slider = gr.Slider(minimum=0, maximum=1, step=0.1, label="Creativity (0 to 1)", value=0.7)
background_checkbox = gr.Checkbox(label="Include Background", value=True)

# Microphone input; type="numpy" delivers a (sample_rate, samples) tuple.
audio_input = gr.Audio(type="numpy", label="Speak your Description")
68
+
69
# Tab 1: text-driven generation.
interface = gr.Interface(
    fn=process_input,
    inputs=[
        text_input,
        creativity_slider,
        background_checkbox,
    ],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator",
    description="Enter a short description to generate a magical image. Adjust creativity and background options.",
    # FIX: theme="huggingface" was removed — builtin string theme names were
    # dropped in Gradio 4 and the value raised/was ignored. Pass a
    # gr.themes.* object here if a custom theme is wanted.
)
85
+
86
# Tab 2: voice-driven generation.
interface_with_audio = gr.Interface(
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator with Voice Input",
    description="Speak a short description to generate a magical image!",
)
94
+
95
# Combine the two interfaces as tabs; tab_names labels the tabs (otherwise
# Gradio falls back to generic "Tab 1"/"Tab 2" labels).
demo = gr.TabbedInterface(
    [interface, interface_with_audio],
    tab_names=["Text Input", "Voice Input"],
)

# Standard script-entry guard so importing this module doesn't start a server.
if __name__ == "__main__":
    demo.launch()