ProfRom committed on
Commit
02b1c71
·
verified ·
1 Parent(s): 7004d4e

Hess - Unit 8 Assignment

Browse files
Files changed (2) hide show
  1. app.py +192 -74
  2. requirements.txt +6 -4
app.py CHANGED
@@ -1,89 +1,207 @@
1
- import torch
2
- from diffusers import StableDiffusionPipeline
3
  import gradio as gr
 
 
 
 
4
 
5
- # -------------------------------------------------------
6
- # 1. LOAD PRETRAINED TEXT-TO-IMAGE MODEL
7
- # -------------------------------------------------------
 
 
8
 
9
- model_id = "runwayml/stable-diffusion-v1-5"
 
 
10
 
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
  dtype = torch.float16 if device == "cuda" else torch.float32
13
 
14
- pipe = StableDiffusionPipeline.from_pretrained(
15
- model_id,
 
16
  torch_dtype=dtype,
17
- safety_checker=None,
18
- use_safetensors=True
19
  )
20
-
21
- pipe = pipe.to(device)
22
-
23
-
24
- # -------------------------------------------------------
25
- # 2. CORE PREDICTION FUNCTION
26
- # -------------------------------------------------------
27
-
28
- def generate_image(prompt: str,
29
- num_inference_steps: int = 25,
30
- guidance_scale: float = 7.5):
31
-
32
- if not prompt or prompt.strip() == "":
33
- prompt = "A friendly robot reading a book in a cozy library, digital art"
34
-
35
- if device == "cuda":
36
- with torch.autocast(device_type="cuda"):
37
- result = pipe(
38
- prompt,
39
- num_inference_steps=num_inference_steps,
40
- guidance_scale=guidance_scale
41
- )
42
- else:
43
- result = pipe(
44
- prompt,
45
- num_inference_steps=num_inference_steps,
46
- guidance_scale=guidance_scale
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  )
48
 
49
- return result.images[0]
50
-
51
-
52
- # -------------------------------------------------------
53
- # 3. GRADIO UI
54
- # -------------------------------------------------------
55
-
56
- prompt_input = gr.Textbox(
57
- label="Enter your image prompt",
58
- lines=2,
59
- placeholder="e.g., 'A watercolor painting of a sunrise over mountains'"
60
- )
61
-
62
- steps_slider = gr.Slider(
63
- minimum=10,
64
- maximum=40,
65
- value=25,
66
- step=1,
67
- label="Number of inference steps"
68
- )
69
-
70
- guidance_slider = gr.Slider(
71
- minimum=1.0,
72
- maximum=15.0,
73
- value=7.5,
74
- step=0.5,
75
- label="Guidance scale"
76
- )
77
 
78
- image_output = gr.Image(label="Generated image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- demo = gr.Interface(
81
- fn=generate_image,
82
- inputs=[prompt_input, steps_slider, guidance_slider],
83
- outputs=image_output,
84
- title="Multimodal Text-to-Image Generator",
85
- description="Enter a prompt to generate an image using a pretrained text-to-image model."
86
- )
87
 
88
  if __name__ == "__main__":
89
- demo.launch()
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ from diffusers import AutoPipelineForText2Image
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration
5
+ from PIL import Image
6
 
7
+ """
8
+ Multimodal Space for Assignment 8:
9
+ - Text to Image using stabilityai/sd-turbo
10
+ - Image to Text (captioning) using Salesforce BLIP
11
+ """
12
 
13
# Hugging Face model identifiers used by the two tabs of the app
TEXT_TO_IMAGE_MODEL_ID = "stabilityai/sd-turbo"
CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-base"

# Use the GPU in half precision when CUDA is available, otherwise the CPU
# in full precision
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32

# Text-to-image pipeline (SD-Turbo), loaded in the selected dtype and moved
# to the selected device
text2img_pipe = AutoPipelineForText2Image.from_pretrained(
    TEXT_TO_IMAGE_MODEL_ID, torch_dtype=dtype
).to(device)

# Memory optimization: only enabled when the resolved pipeline class
# actually provides the method
if hasattr(text2img_pipe, "enable_attention_slicing"):
    text2img_pipe.enable_attention_slicing()

# Image captioning (BLIP): processor plus model, model moved to the same device
caption_processor = BlipProcessor.from_pretrained(CAPTION_MODEL_ID)
caption_model = BlipForConditionalGeneration.from_pretrained(CAPTION_MODEL_ID)
caption_model = caption_model.to(device)
37
+
38
+
39
+ # text to image generation function
40
def generate_image(prompt, steps, guidance, seed):
    """
    Generate an image from a text prompt using the SD-Turbo pipeline.

    Args:
        prompt: Text description of the desired image. A missing or blank
            prompt is replaced with a built-in default prompt.
        steps: Number of inference steps. Coerced to ``int`` and floored at 1.
        guidance: Guidance scale. Coerced to ``float``.
        seed: Optional seed, given as an integer or an integer-like string.
            A blank, malformed, or out-of-range seed is ignored and the
            pipeline draws its own random noise.

    Returns:
        The first entry of the pipeline output's ``images`` list.
    """
    if not prompt or not prompt.strip():
        prompt = "a watercolor painting of a quiet cabin in the forest at sunrise"

    # Inclusive seed range documented for torch.Generator.manual_seed; values
    # outside it raise RuntimeError, so they are filtered out up front and
    # treated like any other invalid seed.
    min_seed = -0x8000_0000_0000_0000
    max_seed = 0xFFFF_FFFF_FFFF_FFFF

    # optional seeding for reproducibility
    generator = None
    if seed is not None and str(seed).strip() != "":
        try:
            seed_int = int(seed)
        except (TypeError, ValueError, OverflowError):
            # ignore invalid seeds and use a random generator instead
            seed_int = None
        if seed_int is not None and min_seed <= seed_int <= max_seed:
            generator = torch.Generator(device=device).manual_seed(seed_int)

    # expose a slider for steps
    # even though SD-Turbo is noted to be designed for a low number of steps (1–4);
    # floor at 1 so a zero or negative value never reaches the pipeline
    steps = max(1, int(steps))
    guidance = float(guidance)

    result = text2img_pipe(
        prompt=prompt,
        num_inference_steps=steps,
        guidance_scale=guidance,
        generator=generator,
    ).images[0]

    return result
70
+
71
+
72
+ # image to text captioning function
73
def caption_image(image, max_length, num_beams):
    """
    Generate a text caption for an uploaded image.

    Args:
        image: The uploaded image, either a PIL image or an array accepted by
            ``PIL.Image.fromarray``. ``None`` means nothing was uploaded.
        max_length: Maximum caption length passed to ``generate``; coerced
            to ``int``.
        num_beams: Beam search width passed to ``generate``; coerced to
            ``int`` and floored at 1.

    Returns:
        The decoded caption with surrounding whitespace stripped, or a short
        prompt asking the user to upload an image when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image first."

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # Normalise RGBA / greyscale / palette uploads explicitly rather than
    # relying on the processor's conversion settings.
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = caption_processor(images=image, return_tensors="pt").to(device)
    output_ids = caption_model.generate(
        **inputs,
        max_length=int(max_length),
        # a beam width below 1 is not a valid search configuration
        num_beams=max(1, int(num_beams)),
    )
    caption = caption_processor.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()
91
+
92
+
93
+ # build Gradio UI
94
+ # includes blocks and tabs
95
with gr.Blocks() as assignment8:
    # Page header shown above both tabs
    gr.Markdown(
        """
        # Multimodal Assignment 8: Text to Image and Image Captioning

        This Space demonstrates two multimodal capabilities:
        1. Text to Image generation using Stability AI's SD-Turbo model
        2. Image to Text captioning using a BLIP image captioning model
        """
    )

    # ---- Tab 1: prompt -> image ----
    with gr.Tab("Text to Image"):
        with gr.Row():
            # Left column: generation controls
            with gr.Column(scale=1):
                # taller textbox so full prompts are visible
                prompt_in = gr.Textbox(
                    label="Prompt",
                    lines=5,
                    placeholder="Describe the image you want the model to generate.",
                )
                steps_in = gr.Slider(
                    minimum=1, maximum=8, value=4, step=1,
                    label="Number of inference steps",
                )
                guidance_in = gr.Slider(
                    minimum=0.0, maximum=3.0, value=0.0, step=0.1,
                    label="Guidance scale",
                )
                # Seed is free text so it can be left blank
                seed_in = gr.Textbox(
                    label="Seed (optional, integer)",
                    placeholder="Ex. 42. Leave blank for random seed.",
                )
                generate_button = gr.Button("Generate Image")

            # Right column: generated result
            with gr.Column(scale=1):
                image_out = gr.Image(label="Generated Image")

        # Clickable examples; each row is [prompt, steps, guidance, seed].
        # Results are not precomputed (cache_examples=False).
        gr.Examples(
            examples=[
                ["a focused student working at a computer late at night", 4, 3.0, ""],
                ["a golden retriever playing in a field of flowers", 4, 0.0, "42"],
                ["a martial artist performing a high kick, dynamic motion", 5, 1.0, ""],
            ],
            inputs=[prompt_in, steps_in, guidance_in, seed_in],
            outputs=image_out,
            fn=generate_image,
            cache_examples=False,
        )

        # Button click runs generation with the current control values
        generate_button.click(
            fn=generate_image,
            inputs=[prompt_in, steps_in, guidance_in, seed_in],
            outputs=image_out,
        )

    # ---- Tab 2: image -> caption ----
    with gr.Tab("Image to Text (Captioning)"):
        with gr.Row():
            # Left column: upload and decoding controls
            with gr.Column(scale=1):
                image_in = gr.Image(type="pil", label="Upload an image")
                max_length_in = gr.Slider(
                    minimum=16, maximum=64, value=32, step=4,
                    label="Max caption length",
                )
                num_beams_in = gr.Slider(
                    minimum=1, maximum=6, value=4, step=1,
                    label="Beam search width",
                )
                caption_button = gr.Button("Generate Caption")

            # Right column: caption text
            with gr.Column(scale=1):
                caption_out = gr.Textbox(label="Generated Caption", lines=4)

        # Button click runs captioning on the uploaded image
        caption_button.click(
            fn=caption_image,
            inputs=[image_in, max_length_in, num_beams_in],
            outputs=caption_out,
        )


if __name__ == "__main__":
    assignment8.launch()
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
  gradio>=4.0.0
2
- diffusers>=0.30.0
3
- transformers>=4.40.0
4
- accelerate>=0.30.0
5
  torch
6
- safetensors
 
 
 
 
 
 
1
  gradio>=4.0.0
 
 
 
2
  torch
3
+ torchvision
4
+ Pillow
5
+ diffusers
6
+ transformers
7
+ accelerate
8
+ safetensors