lucksadasd commited on
Commit
89e0004
·
verified ·
1 Parent(s): e1467d6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
3
+ from diffusers import StableDiffusionPipeline
4
+ import torch
5
+
6
# Step 1: prompt expansion using a seq2seq model (BART).
# NOTE(review): facebook/bart-large-cnn is a summarization checkpoint — it is
# tuned to CONDENSE text, so "expanding" a short description with it may give
# weak results; consider an instruction-tuned text2text model. Flagged, not changed.
prompt_generator = pipeline("text2text-generation", model="facebook/bart-large-cnn")


def generate_prompt(description: str, max_length: int = 150) -> str:
    """Expand a short user description into a detailed image-generation prompt.

    Args:
        description: Short free-text description of the desired image.
        max_length: Upper bound (model tokens) for the generated prompt;
            previously a hard-coded constant, now parameterized with the
            same default so existing callers are unaffected.

    Returns:
        The generated prompt text.
    """
    instruction = (
        f"Expand this description into a detailed prompt for an image: {description}"
    )
    result = prompt_generator(instruction, max_length=max_length)
    return result[0]["generated_text"]
13
+
14
# Step 2: prompt-to-image generation with Stable Diffusion 2.1 (base), on CPU.
# (The original comment claimed "v1.5", but the checkpoint loaded below is
# stabilityai/stable-diffusion-2-1-base — comment corrected to match the code.)
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base"
)
stable_diffusion.to("cpu")  # run on CPU; no GPU assumed in this Space


def generate_image(prompt: str, guidance_scale: float = 7.5, num_inference_steps: int = 50):
    """Generate one image from a text prompt with Stable Diffusion.

    Args:
        prompt: The (already expanded) text prompt.
        guidance_scale: Classifier-free guidance strength; 7.5 is the
            diffusers pipeline default, so omitting it preserves the
            original behavior.
        num_inference_steps: Denoising steps; 50 is the pipeline default.

    Returns:
        A PIL.Image with the generated picture.
    """
    output = stable_diffusion(
        prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
    )
    return output.images[0]
22
+
23
# Voice input: Whisper speech-to-text.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")


def transcribe_audio(audio) -> str:
    """Transcribe recorded audio to text with Whisper.

    Args:
        audio: Either the ``(sample_rate, samples)`` tuple produced by
            ``gr.Audio(type="numpy")``, or a raw waveform array.

    Returns:
        The transcription string.

    BUG FIX: the original code passed the Gradio ``(rate, samples)`` tuple
    straight to the processor with no ``sampling_rate``, which fails at
    runtime. We unpack the tuple, convert integer PCM to float, and forward
    the true sampling rate.
    """
    if isinstance(audio, tuple):
        sampling_rate, waveform = audio
    else:
        # Raw array fallback; Whisper models are trained on 16 kHz audio —
        # assume that rate here. TODO confirm callers only pass Gradio tuples.
        sampling_rate, waveform = 16000, audio

    # Gradio delivers int16 PCM; the feature extractor expects float waveforms.
    if hasattr(waveform, "dtype") and waveform.dtype.kind == "i":
        waveform = waveform.astype("float32") / 32768.0

    input_features = processor(
        waveform, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.decode(predicted_ids[0], skip_special_tokens=True)
33
+
34
# Step 3: text-driven pipeline — description -> expanded prompt -> image.
def process_input(description: str, creativity: float, include_background: bool):
    """Turn a typed description into a (prompt, image) pair.

    Args:
        description: Short user description of the desired image.
        creativity: Slider value in [0, 1].
            NOTE(review): this argument is wired to the UI but was never used
            by the original code — TODO: thread it into generation (e.g. as a
            guidance-scale factor). Kept in the signature for compatibility.
        include_background: When True, append a background instruction to
            the prompt.

    Returns:
        Tuple of (generated prompt text, generated PIL image), or a help
        message and ``None`` image when the description is empty.
    """
    # Guard: an empty description would just ask the model to expand nothing.
    if not description or not description.strip():
        return "Please enter a description.", None

    prompt = generate_prompt(description)
    if include_background:
        prompt += " with a detailed, vibrant background."

    image = generate_image(prompt)
    return prompt, image
47
+
48
def process_audio_input(audio):
    """Turn a spoken description into a (prompt, image) pair.

    Args:
        audio: ``(sample_rate, samples)`` tuple from ``gr.Audio(type="numpy")``,
            or ``None`` when the user submitted without recording.

    Returns:
        Tuple of (generated prompt text, generated PIL image), or a help
        message and ``None`` image when no audio was provided.
    """
    # Guard: Gradio passes None when nothing was recorded; the original code
    # would crash inside the Whisper processor in that case.
    if audio is None:
        return "No audio received — please record a description.", None

    description = transcribe_audio(audio)
    prompt = generate_prompt(description)
    image = generate_image(prompt)
    return prompt, image
55
+
56
# --- Gradio UI wiring -------------------------------------------------------
# Input components for the text-driven tab.
text_input = gr.Textbox(
    label="Enter Description",
    placeholder="E.g., A magical treehouse in the sky",
)
creativity_slider = gr.Slider(
    minimum=0, maximum=1, step=0.1, label="Creativity (0 to 1)", value=0.7
)
background_checkbox = gr.Checkbox(label="Include Background", value=True)

# Input component for the voice-driven tab.
audio_input = gr.Audio(type="numpy", label="Speak your Description")

# Text-input interface.
# NOTE(review): the original passed theme="huggingface"; string themes were
# removed in Gradio 4.x and raise at startup, so the app now uses the default
# theme instead.
interface = gr.Interface(
    fn=process_input,
    inputs=[
        text_input,
        creativity_slider,
        background_checkbox,
    ],
    outputs=[
        gr.Textbox(label="Generated Prompt"),
        gr.Image(label="Generated Image"),
    ],
    title="Magical Image Generator",
    description=(
        "Enter a short description or speak it to generate a magical image! "
        "Adjust creativity and background options."
    ),
)

# Voice-input interface.
interface_with_audio = gr.Interface(
    fn=process_audio_input,
    inputs=[audio_input],
    outputs=[gr.Textbox(label="Generated Prompt"), gr.Image(label="Generated Image")],
    title="Magical Image Generator with Voice Input",
    description="Speak a short description and generate a magical image!",
)

# Launch both interfaces as named tabs (the original omitted tab_names, which
# leaves the tabs with generic labels).
gr.TabbedInterface(
    [interface, interface_with_audio],
    tab_names=["Text Input", "Voice Input"],
).launch()