Spaces: Runtime error
Update min_dalle/min_dalle.py
Browse files
- min_dalle/min_dalle.py +17 -22
min_dalle/min_dalle.py
CHANGED
|
@@ -10,12 +10,11 @@ from typing import Iterator
|
|
| 10 |
from .text_tokenizer import TextTokenizer
|
| 11 |
from .models import DalleBartEncoder, DalleBartDecoder, VQGanDetokenizer
|
| 12 |
import streamlit as st
|
| 13 |
-
import gc
|
| 14 |
|
| 15 |
torch.set_grad_enabled(False)
|
| 16 |
torch.set_num_threads(os.cpu_count())
|
| 17 |
-
torch.backends.cudnn.enabled =
|
| 18 |
-
torch.backends.cudnn.
|
| 19 |
|
| 20 |
MIN_DALLE_REPO = 'https://huggingface.co/kuprel/min-dalle/resolve/main/'
|
| 21 |
IMAGE_TOKEN_COUNT = 256
|
|
@@ -25,7 +24,7 @@ class MinDalle:
|
|
| 25 |
def __init__(
|
| 26 |
self,
|
| 27 |
models_root: str = 'pretrained',
|
| 28 |
-
dtype: torch.dtype = torch.
|
| 29 |
device: str = None,
|
| 30 |
is_mega: bool = True,
|
| 31 |
is_reusable: bool = True,
|
|
@@ -188,7 +187,7 @@ class MinDalle:
|
|
| 188 |
if len(tokens) > self.text_token_count:
|
| 189 |
tokens = tokens[:self.text_token_count]
|
| 190 |
if is_verbose: print("{} text tokens".format(len(tokens)), tokens)
|
| 191 |
-
text_tokens = numpy.ones((2, 64), dtype=numpy.
|
| 192 |
text_tokens[0, :2] = [tokens[0], tokens[-1]]
|
| 193 |
text_tokens[1, :len(tokens)] = tokens
|
| 194 |
text_tokens = torch.tensor(
|
|
@@ -232,37 +231,33 @@ class MinDalle:
|
|
| 232 |
token_indices = torch.arange(IMAGE_TOKEN_COUNT, device=self.device)
|
| 233 |
settings = torch.tensor(
|
| 234 |
[temperature, top_k, supercondition_factor],
|
| 235 |
-
dtype=torch.
|
| 236 |
device=self.device
|
| 237 |
)
|
| 238 |
-
|
| 239 |
-
|
| 240 |
for i in range(IMAGE_TOKEN_COUNT):
|
| 241 |
-
|
| 242 |
if(st.session_state.page != 0):
|
| 243 |
break
|
| 244 |
st.session_state.bar.progress(i/IMAGE_TOKEN_COUNT)
|
| 245 |
|
| 246 |
-
|
| 247 |
#torch.cpu.empty_cache()
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
)
|
| 258 |
|
| 259 |
-
|
|
|
|
| 260 |
yield self.image_grid_from_tokens(
|
| 261 |
image_tokens=image_tokens[1:].T,
|
| 262 |
is_seamless=is_seamless,
|
| 263 |
is_verbose=is_verbose
|
| 264 |
)
|
| 265 |
-
|
| 266 |
|
| 267 |
def generate_image_stream(self, *args, **kwargs) -> Iterator[Image.Image]:
|
| 268 |
image_stream = self.generate_raw_image_stream(*args, **kwargs)
|
|
|
|
| 10 |
from .text_tokenizer import TextTokenizer
|
| 11 |
from .models import DalleBartEncoder, DalleBartDecoder, VQGanDetokenizer
|
| 12 |
import streamlit as st
|
|
|
|
| 13 |
|
| 14 |
torch.set_grad_enabled(False)
|
| 15 |
torch.set_num_threads(os.cpu_count())
|
| 16 |
+
torch.backends.cudnn.enabled = True
|
| 17 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 18 |
|
| 19 |
MIN_DALLE_REPO = 'https://huggingface.co/kuprel/min-dalle/resolve/main/'
|
| 20 |
IMAGE_TOKEN_COUNT = 256
|
|
|
|
| 24 |
def __init__(
|
| 25 |
self,
|
| 26 |
models_root: str = 'pretrained',
|
| 27 |
+
dtype: torch.dtype = torch.float32,
|
| 28 |
device: str = None,
|
| 29 |
is_mega: bool = True,
|
| 30 |
is_reusable: bool = True,
|
|
|
|
| 187 |
if len(tokens) > self.text_token_count:
|
| 188 |
tokens = tokens[:self.text_token_count]
|
| 189 |
if is_verbose: print("{} text tokens".format(len(tokens)), tokens)
|
| 190 |
+
text_tokens = numpy.ones((2, 64), dtype=numpy.int32)
|
| 191 |
text_tokens[0, :2] = [tokens[0], tokens[-1]]
|
| 192 |
text_tokens[1, :len(tokens)] = tokens
|
| 193 |
text_tokens = torch.tensor(
|
|
|
|
| 231 |
token_indices = torch.arange(IMAGE_TOKEN_COUNT, device=self.device)
|
| 232 |
settings = torch.tensor(
|
| 233 |
[temperature, top_k, supercondition_factor],
|
| 234 |
+
dtype=torch.float32,
|
| 235 |
device=self.device
|
| 236 |
)
|
|
|
|
|
|
|
| 237 |
for i in range(IMAGE_TOKEN_COUNT):
|
|
|
|
| 238 |
if(st.session_state.page != 0):
|
| 239 |
break
|
| 240 |
st.session_state.bar.progress(i/IMAGE_TOKEN_COUNT)
|
| 241 |
|
| 242 |
+
torch.cuda.empty_cache()
|
| 243 |
#torch.cpu.empty_cache()
|
| 244 |
+
with torch.cuda.amp.autocast(dtype=self.dtype):
|
| 245 |
+
image_tokens[i + 1], attention_state = self.decoder.forward(
|
| 246 |
+
settings=settings,
|
| 247 |
+
attention_mask=attention_mask,
|
| 248 |
+
encoder_state=encoder_state,
|
| 249 |
+
attention_state=attention_state,
|
| 250 |
+
prev_tokens=image_tokens[i],
|
| 251 |
+
token_index=token_indices[[i]]
|
| 252 |
+
)
|
|
|
|
| 253 |
|
| 254 |
+
# with torch.cuda.amp.autocast(dtype=torch.float32):
|
| 255 |
+
if ((i + 1) % 32 == 0 and progressive_outputs) or i + 1 == 256:
|
| 256 |
yield self.image_grid_from_tokens(
|
| 257 |
image_tokens=image_tokens[1:].T,
|
| 258 |
is_seamless=is_seamless,
|
| 259 |
is_verbose=is_verbose
|
| 260 |
)
|
|
|
|
| 261 |
|
| 262 |
def generate_image_stream(self, *args, **kwargs) -> Iterator[Image.Image]:
|
| 263 |
image_stream = self.generate_raw_image_stream(*args, **kwargs)
|