hellokawei commited on
Commit
94d6b3b
Β·
verified Β·
1 Parent(s): f511c2c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
3
+ from diffusers import StableDiffusionPipeline
4
+ import torch
5
+
6
# Step 1: prompt expansion ("prompt-to-prompt") with BART — any LLM other
# than GPT or DeepSeek works here.
prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")
8
+
9
def generate_prompt(description: str) -> str:
    """Expand a short user description into a detailed image-generation prompt.

    Args:
        description: Short free-text description from the user.

    Returns:
        The expanded prompt text produced by the BART pipeline.
    """
    instruction = f"Expand this description into a detailed prompt for an image: {description}"
    results = prompt_generator(instruction, max_length=150)
    return results[0]["generated_text"]
13
+
14
# Step 2: prompt-to-image generation, placed on GPU when available, CPU otherwise.
# NOTE(review): the original comment said "Stable Diffusion v1.5", but the
# checkpoint actually loaded below is stabilityai/stable-diffusion-2-1-base.
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
stable_diffusion.to(device)
18
+
19
def generate_image(prompt: str, creativity: float, include_background: bool):
    """Render an image for *prompt* with the Stable Diffusion pipeline.

    Args:
        prompt: Base text prompt describing the image.
        creativity: Values below 0.5 request simpler details; otherwise
            highly detailed elements are requested.
        include_background: When True, also asks for a vibrant, detailed
            background.

    Returns:
        The first PIL image produced by the pipeline.
    """
    # The creativity knob and background flag are applied as prompt suffixes.
    detail_suffix = (
        " with simpler details." if creativity < 0.5 else " with highly detailed elements."
    )
    full_prompt = prompt + detail_suffix
    if include_background:
        full_prompt += " with a vibrant and detailed background."
    return stable_diffusion(full_prompt).images[0]
32
+
33
# Step 3: voice input — Whisper speech-to-text (processor + seq2seq model).
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
36
+
37
def transcribe_audio(audio):
    """Transcribe spoken audio to text with Whisper.

    Args:
        audio: Either the ``(sample_rate, samples)`` pair produced by
            ``gr.Audio(type="numpy")`` or a raw 1-D waveform already at
            16 kHz, float in [-1, 1].

    Returns:
        The decoded transcription string.
    """
    import numpy as np

    # BUG FIX: gr.Audio(type="numpy") yields a (sample_rate, int16 samples)
    # tuple; the original code passed that tuple straight to the processor
    # with no sampling_rate, which does not produce valid input features.
    if isinstance(audio, tuple):
        sampling_rate, waveform = audio
    else:
        sampling_rate, waveform = 16000, audio

    waveform = np.asarray(waveform, dtype=np.float32)
    # Stereo -> mono: Whisper's feature extractor expects a single channel.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    # Gradio delivers int16 PCM; normalize to floats in [-1, 1].
    if waveform.size and np.abs(waveform).max() > 1.0:
        waveform = waveform / 32768.0

    # Whisper is trained on 16 kHz audio; resample when the mic rate differs.
    if sampling_rate != 16000:
        from scipy.signal import resample

        target_len = int(round(len(waveform) * 16000 / sampling_rate))
        waveform = resample(waveform, target_len)

    input_features = processor(
        waveform, sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
43
+
44
# Step 4: Gradio interface with simple controllers (textbox, slider, checkbox, audio).
def process_input(description: str, creativity: float, include_background: bool):
    """Text pathway: expand the description into a prompt, then render it.

    Returns:
        (detailed_prompt, generated_image) tuple for the two UI outputs.
    """
    detailed_prompt = generate_prompt(description)
    rendered = generate_image(detailed_prompt, creativity, include_background)
    return detailed_prompt, rendered
53
+
54
def process_audio_input(audio):
    """Voice pathway: transcribe speech, expand it, render with default knobs.

    Uses creativity=0.7 and an included background, matching the defaults of
    the text-input controls.
    """
    spoken_description = transcribe_audio(audio)
    detailed_prompt = generate_prompt(spoken_description)
    rendered = generate_image(detailed_prompt, creativity=0.7, include_background=True)
    return detailed_prompt, rendered
61
+
62
# Shared UI controls for the two interfaces below.
text_input = gr.Textbox(label="Enter Description", placeholder="E.g., A magical treehouse in the sky")
creativity_slider = gr.Slider(minimum=0, maximum=1, step=0.1, label="Creativity (0 to 1)", value=0.7)
background_checkbox = gr.Checkbox(label="Include Background", value=True)

# Microphone input; type="numpy" delivers a (sample_rate, samples) tuple.
audio_input = gr.Audio(type="numpy", label="Speak your Description")
68
+
69
# Tab 1: text-driven generation.
interface = gr.Interface(
    fn=process_input,
    inputs=[
        text_input,
        creativity_slider,
        background_checkbox,
    ],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator",
    description="Enter a short description to generate a magical image. Adjust creativity and background options.",
    # FIX: theme="huggingface" was removed — builtin string theme names were
    # dropped in Gradio 4 and the value raised/was ignored. Pass a
    # gr.themes.* object here if a custom theme is wanted.
)
85
+
86
# Tab 2: voice-driven generation.
interface_with_audio = gr.Interface(
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator with Voice Input",
    description="Speak a short description to generate a magical image!",
)
94
+
95
# Combine the two interfaces as tabs; tab_names labels the tabs (otherwise
# Gradio falls back to generic "Tab 1"/"Tab 2" labels).
demo = gr.TabbedInterface(
    [interface, interface_with_audio],
    tab_names=["Text Input", "Voice Input"],
)

# Standard script-entry guard so importing this module doesn't start a server.
if __name__ == "__main__":
    demo.launch()