Spaces:
Runtime error
Runtime error
Update min_dalle/min_dalle.py
Browse files
- min_dalle/min_dalle.py +22 -24
min_dalle/min_dalle.py
CHANGED
|
@@ -39,10 +39,9 @@ class MinDalle:
|
|
| 39 |
self.dtype = dtype
|
| 40 |
self.is_verbose = is_verbose
|
| 41 |
self.text_token_count = 64
|
| 42 |
-
|
| 43 |
-
self.
|
| 44 |
-
self.
|
| 45 |
-
self.embed_count = 2048 if is_mega else 512
|
| 46 |
self.glu_embed_count = 4096 if is_mega else 2730
|
| 47 |
self.text_vocab_count = 50272 if is_mega else 50264
|
| 48 |
self.image_vocab_count = 16415 if is_mega else 16384
|
|
@@ -238,29 +237,27 @@ class MinDalle:
|
|
| 238 |
for i in range(IMAGE_TOKEN_COUNT):
|
| 239 |
if(st.session_state.page != 0):
|
| 240 |
break
|
| 241 |
-
|
| 242 |
st.session_state.bar.progress(i/IMAGE_TOKEN_COUNT)
|
| 243 |
-
|
| 244 |
-
#torch.cuda.empty_cache()
|
| 245 |
-
#torch.cpu.empty_cache()
|
| 246 |
-
#with torch.cuda.amp.autocast(dtype=self.dtype):
|
| 247 |
-
image_tokens[i + 1], attention_state = self.decoder.forward(
|
| 248 |
-
settings=settings,
|
| 249 |
-
attention_mask=attention_mask,
|
| 250 |
-
encoder_state=encoder_state,
|
| 251 |
-
attention_state=attention_state,
|
| 252 |
-
prev_tokens=image_tokens[i],
|
| 253 |
-
token_index=token_indices[[i]]
|
| 254 |
-
)
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
)
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
def generate_image_stream(self, *args, **kwargs) -> Iterator[Image.Image]:
|
| 266 |
image_stream = self.generate_raw_image_stream(*args, **kwargs)
|
|
@@ -278,6 +275,7 @@ class MinDalle:
|
|
| 278 |
image = image.reshape([grid_size ** 2, 2 ** 8, 2 ** 8, 3])
|
| 279 |
yield image
|
| 280 |
|
|
|
|
| 281 |
def generate_image(self, *args, **kwargs) -> Image.Image:
|
| 282 |
image_stream = self.generate_image_stream(
|
| 283 |
*args, **kwargs,
|
|
|
|
| 39 |
self.dtype = dtype
|
| 40 |
self.is_verbose = is_verbose
|
| 41 |
self.text_token_count = 64
|
| 42 |
+
self.layer_count = 24 if is_mega else 12
|
| 43 |
+
self.attention_head_count = 32 if is_mega else 16
|
| 44 |
+
self.embed_count = 2048 if is_mega else 1024
|
|
|
|
| 45 |
self.glu_embed_count = 4096 if is_mega else 2730
|
| 46 |
self.text_vocab_count = 50272 if is_mega else 50264
|
| 47 |
self.image_vocab_count = 16415 if is_mega else 16384
|
|
|
|
| 237 |
for i in range(IMAGE_TOKEN_COUNT):
|
| 238 |
if(st.session_state.page != 0):
|
| 239 |
break
|
|
|
|
| 240 |
st.session_state.bar.progress(i/IMAGE_TOKEN_COUNT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
torch.cuda.empty_cache()
|
| 243 |
+
#torch.cpu.empty_cache()
|
| 244 |
+
with torch.cuda.amp.autocast(dtype=self.dtype):
|
| 245 |
+
image_tokens[i + 1], attention_state = self.decoder.forward(
|
| 246 |
+
settings=settings,
|
| 247 |
+
attention_mask=attention_mask,
|
| 248 |
+
encoder_state=encoder_state,
|
| 249 |
+
attention_state=attention_state,
|
| 250 |
+
prev_tokens=image_tokens[i],
|
| 251 |
+
token_index=token_indices[[i]]
|
| 252 |
)
|
| 253 |
+
|
| 254 |
+
with torch.cuda.amp.autocast(dtype=torch.float32):
|
| 255 |
+
if ((i + 1) % 32 == 0 and progressive_outputs) or i + 1 == 256:
|
| 256 |
+
yield self.image_grid_from_tokens(
|
| 257 |
+
image_tokens=image_tokens[1:].T,
|
| 258 |
+
is_seamless=is_seamless,
|
| 259 |
+
is_verbose=is_verbose
|
| 260 |
+
)
|
| 261 |
|
| 262 |
def generate_image_stream(self, *args, **kwargs) -> Iterator[Image.Image]:
|
| 263 |
image_stream = self.generate_raw_image_stream(*args, **kwargs)
|
|
|
|
| 275 |
image = image.reshape([grid_size ** 2, 2 ** 8, 2 ** 8, 3])
|
| 276 |
yield image
|
| 277 |
|
| 278 |
+
|
| 279 |
def generate_image(self, *args, **kwargs) -> Image.Image:
|
| 280 |
image_stream = self.generate_image_stream(
|
| 281 |
*args, **kwargs,
|