Spaces:
Runtime error
Runtime error
feat(data): super conditioning (#141)
Browse files* feat(data): online filtering
* feat(generate): super conditioning
* feat: add processor
- README.md +1 -1
- src/dalle_mini/__init__.py +3 -1
- src/dalle_mini/data.py +69 -20
- src/dalle_mini/model/__init__.py +1 -0
- src/dalle_mini/model/modeling.py +337 -1
- src/dalle_mini/model/processor.py +58 -0
- src/dalle_mini/{text.py → model/text.py} +0 -0
- tools/inference/inference_pipeline.ipynb +25 -44
- tools/train/train.py +21 -1
README.md
CHANGED
|
@@ -35,7 +35,6 @@ To generate sample predictions and understand the inference pipeline step by ste
|
|
| 35 |
Join the community on the [DALLE-Pytorch Discord](https://discord.gg/xBPBXfcFHd).
|
| 36 |
Any contribution is welcome, from reporting issues to proposing fixes/improvements or testing the model with cool prompts!
|
| 37 |
|
| 38 |
-
|
| 39 |
## Development
|
| 40 |
|
| 41 |
### Dependencies Installation
|
|
@@ -95,6 +94,7 @@ Many thanks to the people who helped make it better:
|
|
| 95 |
|
| 96 |
- the [DALLE-Pytorch](https://discord.gg/xBPBXfcFHd) and [EleutherAI](https://www.eleuther.ai/) communities for testing and exchanging cool ideas
|
| 97 |
- [Rohan Anil](https://github.com/rohan-anil) for adding Distributed Shampoo optimizer
|
|
|
|
| 98 |
|
| 99 |
## Citing DALL·E mini
|
| 100 |
|
|
|
|
| 35 |
Join the community on the [DALLE-Pytorch Discord](https://discord.gg/xBPBXfcFHd).
|
| 36 |
Any contribution is welcome, from reporting issues to proposing fixes/improvements or testing the model with cool prompts!
|
| 37 |
|
|
|
|
| 38 |
## Development
|
| 39 |
|
| 40 |
### Dependencies Installation
|
|
|
|
| 94 |
|
| 95 |
- the [DALLE-Pytorch](https://discord.gg/xBPBXfcFHd) and [EleutherAI](https://www.eleuther.ai/) communities for testing and exchanging cool ideas
|
| 96 |
- [Rohan Anil](https://github.com/rohan-anil) for adding Distributed Shampoo optimizer
|
| 97 |
+
- [Katherine Crowson](https://github.com/crowsonkb) for [super conditioning](https://twitter.com/RiversHaveWings/status/1478093658716966912)
|
| 98 |
|
| 99 |
## Citing DALL·E mini
|
| 100 |
|
src/dalle_mini/__init__.py
CHANGED
|
@@ -1 +1,3 @@
|
|
| 1 |
-
__version__ = "0.0.
|
|
|
|
|
|
|
|
|
| 1 |
+
__version__ = "0.0.3"
|
| 2 |
+
|
| 3 |
+
from .model import DalleBart, DalleBartProcessor
|
src/dalle_mini/data.py
CHANGED
|
@@ -7,7 +7,7 @@ import numpy as np
|
|
| 7 |
from braceexpand import braceexpand
|
| 8 |
from datasets import Dataset, load_dataset
|
| 9 |
|
| 10 |
-
from .text import TextNormalizer
|
| 11 |
|
| 12 |
|
| 13 |
@dataclass
|
|
@@ -28,6 +28,11 @@ class Dataset:
|
|
| 28 |
seed_dataset: int = None
|
| 29 |
shard_by_host: bool = False
|
| 30 |
blank_caption_prob: float = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
train_dataset: Dataset = field(init=False)
|
| 32 |
eval_dataset: Dataset = field(init=False)
|
| 33 |
rng_dataset: jnp.ndarray = field(init=False)
|
|
@@ -36,6 +41,7 @@ class Dataset:
|
|
| 36 |
def __post_init__(self):
|
| 37 |
self.multi_hosts = jax.process_count() > 1
|
| 38 |
# feed blank captions only in streaming mode for now
|
|
|
|
| 39 |
if self.blank_caption_prob:
|
| 40 |
assert (
|
| 41 |
self.streaming is True
|
|
@@ -107,23 +113,30 @@ class Dataset:
|
|
| 107 |
self.seed_dataset = np.random.get_state()[1][0]
|
| 108 |
self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)
|
| 109 |
|
| 110 |
-
#
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
)
|
| 128 |
|
| 129 |
# normalize text
|
|
@@ -151,6 +164,25 @@ class Dataset:
|
|
| 151 |
),
|
| 152 |
)
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
# preprocess
|
| 155 |
partial_preprocess_function = partial(
|
| 156 |
preprocess_function,
|
|
@@ -230,8 +262,8 @@ class Dataset:
|
|
| 230 |
dataset.set_epoch(epoch)
|
| 231 |
epoch += 1
|
| 232 |
for item in dataset:
|
| 233 |
-
for k
|
| 234 |
-
batch[k].append(
|
| 235 |
if len(batch[keys[0]]) == batch_size:
|
| 236 |
batch = {k: jnp.array(v) for k, v in batch.items()}
|
| 237 |
yield batch
|
|
@@ -292,6 +324,23 @@ def normalize_function(example, text_column, text_normalizer):
|
|
| 292 |
return example
|
| 293 |
|
| 294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
def preprocess_function(
|
| 296 |
examples,
|
| 297 |
tokenizer,
|
|
|
|
| 7 |
from braceexpand import braceexpand
|
| 8 |
from datasets import Dataset, load_dataset
|
| 9 |
|
| 10 |
+
from .model.text import TextNormalizer
|
| 11 |
|
| 12 |
|
| 13 |
@dataclass
|
|
|
|
| 28 |
seed_dataset: int = None
|
| 29 |
shard_by_host: bool = False
|
| 30 |
blank_caption_prob: float = 0.0
|
| 31 |
+
clip_score_column: str = "clip_score"
|
| 32 |
+
min_clip_score: float = None
|
| 33 |
+
max_clip_score: float = None
|
| 34 |
+
filter_column: str = None
|
| 35 |
+
filter_value: str = None
|
| 36 |
train_dataset: Dataset = field(init=False)
|
| 37 |
eval_dataset: Dataset = field(init=False)
|
| 38 |
rng_dataset: jnp.ndarray = field(init=False)
|
|
|
|
| 41 |
def __post_init__(self):
|
| 42 |
self.multi_hosts = jax.process_count() > 1
|
| 43 |
# feed blank captions only in streaming mode for now
|
| 44 |
+
# otherwise dataset could be cached with same blanked captions
|
| 45 |
if self.blank_caption_prob:
|
| 46 |
assert (
|
| 47 |
self.streaming is True
|
|
|
|
| 113 |
self.seed_dataset = np.random.get_state()[1][0]
|
| 114 |
self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)
|
| 115 |
|
| 116 |
+
# filter data
|
| 117 |
+
partial_filter_function = partial(
|
| 118 |
+
filter_function,
|
| 119 |
+
filter_column=self.filter_column,
|
| 120 |
+
filter_value=self.filter_value,
|
| 121 |
+
clip_score_column=self.clip_score_column,
|
| 122 |
+
min_clip_score=self.min_clip_score,
|
| 123 |
+
max_clip_score=self.max_clip_score,
|
| 124 |
+
)
|
| 125 |
+
for ds in ["train_dataset", "eval_dataset"]:
|
| 126 |
+
if hasattr(self, ds):
|
| 127 |
+
setattr(
|
| 128 |
+
self,
|
| 129 |
+
ds,
|
| 130 |
+
(
|
| 131 |
+
getattr(self, ds).filter(partial_filter_function)
|
| 132 |
+
if self.streaming
|
| 133 |
+
else getattr(self, ds).filter(
|
| 134 |
+
partial_filter_function,
|
| 135 |
+
num_proc=self.preprocessing_num_workers,
|
| 136 |
+
load_from_cache_file=not self.overwrite_cache,
|
| 137 |
+
desc="Filtering datasets",
|
| 138 |
+
)
|
| 139 |
+
),
|
| 140 |
)
|
| 141 |
|
| 142 |
# normalize text
|
|
|
|
| 164 |
),
|
| 165 |
)
|
| 166 |
|
| 167 |
+
# blank captions
|
| 168 |
+
if self.blank_caption_prob:
|
| 169 |
+
partial_blank_caption_function = partial(
|
| 170 |
+
blank_caption_function,
|
| 171 |
+
text_column=self.text_column,
|
| 172 |
+
blank_caption_prob=self.blank_caption_prob,
|
| 173 |
+
)
|
| 174 |
+
if hasattr(self, "train_dataset"):
|
| 175 |
+
self.train_dataset = (
|
| 176 |
+
self.train_dataset.map(partial_blank_caption_function)
|
| 177 |
+
if self.streaming
|
| 178 |
+
else self.train_dataset.map(
|
| 179 |
+
partial_blank_caption_function,
|
| 180 |
+
num_proc=self.preprocessing_num_workers,
|
| 181 |
+
load_from_cache_file=False,
|
| 182 |
+
desc="Blanking some captions",
|
| 183 |
+
)
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
# preprocess
|
| 187 |
partial_preprocess_function = partial(
|
| 188 |
preprocess_function,
|
|
|
|
| 262 |
dataset.set_epoch(epoch)
|
| 263 |
epoch += 1
|
| 264 |
for item in dataset:
|
| 265 |
+
for k in keys:
|
| 266 |
+
batch[k].append(item[k])
|
| 267 |
if len(batch[keys[0]]) == batch_size:
|
| 268 |
batch = {k: jnp.array(v) for k, v in batch.items()}
|
| 269 |
yield batch
|
|
|
|
| 324 |
return example
|
| 325 |
|
| 326 |
|
| 327 |
+
def filter_function(
|
| 328 |
+
example,
|
| 329 |
+
min_clip_score,
|
| 330 |
+
max_clip_score,
|
| 331 |
+
clip_score_column,
|
| 332 |
+
filter_column,
|
| 333 |
+
filter_value,
|
| 334 |
+
):
|
| 335 |
+
if min_clip_score is not None and example[clip_score_column] < min_clip_score:
|
| 336 |
+
return False
|
| 337 |
+
if max_clip_score is not None and example[clip_score_column] > max_clip_score:
|
| 338 |
+
return False
|
| 339 |
+
if filter_column is not None and example[filter_column] != filter_value:
|
| 340 |
+
return False
|
| 341 |
+
return True
|
| 342 |
+
|
| 343 |
+
|
| 344 |
def preprocess_function(
|
| 345 |
examples,
|
| 346 |
tokenizer,
|
src/dalle_mini/model/__init__.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from .configuration import DalleBartConfig
|
| 2 |
from .modeling import DalleBart
|
| 3 |
from .partitions import set_partitions
|
|
|
|
| 4 |
from .tokenizer import DalleBartTokenizer
|
|
|
|
| 1 |
from .configuration import DalleBartConfig
|
| 2 |
from .modeling import DalleBart
|
| 3 |
from .partitions import set_partitions
|
| 4 |
+
from .processor import DalleBartProcessor
|
| 5 |
from .tokenizer import DalleBartTokenizer
|
src/dalle_mini/model/modeling.py
CHANGED
|
@@ -18,8 +18,9 @@ import math
|
|
| 18 |
import os
|
| 19 |
from functools import partial
|
| 20 |
from pickle import UnpicklingError
|
| 21 |
-
from typing import Optional, Tuple, Union
|
| 22 |
|
|
|
|
| 23 |
import flax.linen as nn
|
| 24 |
import jax
|
| 25 |
import jax.numpy as jnp
|
|
@@ -39,6 +40,7 @@ from transformers.file_utils import (
|
|
| 39 |
is_offline_mode,
|
| 40 |
is_remote_url,
|
| 41 |
)
|
|
|
|
| 42 |
from transformers.modeling_flax_outputs import (
|
| 43 |
FlaxCausalLMOutputWithCrossAttentions,
|
| 44 |
FlaxSeq2SeqLMOutput,
|
|
@@ -691,6 +693,17 @@ class FlaxBartForConditionalGenerationModule(FlaxBartForConditionalGenerationMod
|
|
| 691 |
)
|
| 692 |
|
| 693 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
class DalleBart(
|
| 695 |
PretrainedFromWandbMixin, FlaxBartPreTrainedModel, FlaxBartForConditionalGeneration
|
| 696 |
):
|
|
@@ -702,6 +715,7 @@ class DalleBart(
|
|
| 702 |
- no bias in decode method
|
| 703 |
- custom prepare_inputs_for_generation using "max_length - 1" to avoid issues
|
| 704 |
related to position embedding during model.generate()
|
|
|
|
| 705 |
"""
|
| 706 |
|
| 707 |
module_class = FlaxBartForConditionalGenerationModule
|
|
@@ -872,3 +886,325 @@ class DalleBart(
|
|
| 872 |
"decoder_attention_mask": extended_attention_mask,
|
| 873 |
"decoder_position_ids": position_ids,
|
| 874 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
import os
|
| 19 |
from functools import partial
|
| 20 |
from pickle import UnpicklingError
|
| 21 |
+
from typing import Dict, Optional, Tuple, Union
|
| 22 |
|
| 23 |
+
import flax
|
| 24 |
import flax.linen as nn
|
| 25 |
import jax
|
| 26 |
import jax.numpy as jnp
|
|
|
|
| 40 |
is_offline_mode,
|
| 41 |
is_remote_url,
|
| 42 |
)
|
| 43 |
+
from transformers.generation_flax_utils import FlaxSampleOutput
|
| 44 |
from transformers.modeling_flax_outputs import (
|
| 45 |
FlaxCausalLMOutputWithCrossAttentions,
|
| 46 |
FlaxSeq2SeqLMOutput,
|
|
|
|
| 693 |
)
|
| 694 |
|
| 695 |
|
| 696 |
+
@flax.struct.dataclass
|
| 697 |
+
class SampleState:
|
| 698 |
+
cur_len: jnp.ndarray
|
| 699 |
+
sequences: jnp.ndarray
|
| 700 |
+
running_token: jnp.ndarray
|
| 701 |
+
is_sent_finished: jnp.ndarray
|
| 702 |
+
prng_key: jnp.ndarray
|
| 703 |
+
model_kwargs: Dict[str, jnp.ndarray]
|
| 704 |
+
model_kwargs_uncond: Dict[str, jnp.ndarray]
|
| 705 |
+
|
| 706 |
+
|
| 707 |
class DalleBart(
|
| 708 |
PretrainedFromWandbMixin, FlaxBartPreTrainedModel, FlaxBartForConditionalGeneration
|
| 709 |
):
|
|
|
|
| 715 |
- no bias in decode method
|
| 716 |
- custom prepare_inputs_for_generation using "max_length - 1" to avoid issues
|
| 717 |
related to position embedding during model.generate()
|
| 718 |
+
- custom generate method to allow super conditions
|
| 719 |
"""
|
| 720 |
|
| 721 |
module_class = FlaxBartForConditionalGenerationModule
|
|
|
|
| 886 |
"decoder_attention_mask": extended_attention_mask,
|
| 887 |
"decoder_position_ids": position_ids,
|
| 888 |
}
|
| 889 |
+
|
| 890 |
+
def generate(
|
| 891 |
+
self,
|
| 892 |
+
input_ids: jnp.ndarray,
|
| 893 |
+
attention_mask: Optional[jnp.ndarray] = None,
|
| 894 |
+
max_length: Optional[int] = None,
|
| 895 |
+
pad_token_id: Optional[int] = None,
|
| 896 |
+
bos_token_id: Optional[int] = None,
|
| 897 |
+
eos_token_id: Optional[int] = None,
|
| 898 |
+
decoder_start_token_id: Optional[int] = None,
|
| 899 |
+
do_sample: Optional[bool] = None,
|
| 900 |
+
prng_key: Optional[jnp.ndarray] = None,
|
| 901 |
+
top_k: Optional[int] = None,
|
| 902 |
+
top_p: Optional[float] = None,
|
| 903 |
+
temperature: Optional[float] = None,
|
| 904 |
+
num_beams: Optional[int] = None,
|
| 905 |
+
no_repeat_ngram_size: Optional[int] = None,
|
| 906 |
+
min_length: Optional[int] = None,
|
| 907 |
+
forced_bos_token_id: Optional[int] = None,
|
| 908 |
+
forced_eos_token_id: Optional[int] = None,
|
| 909 |
+
length_penalty: Optional[float] = None,
|
| 910 |
+
early_stopping: Optional[bool] = None,
|
| 911 |
+
trace: bool = True,
|
| 912 |
+
params: Optional[Dict[str, jnp.ndarray]] = None,
|
| 913 |
+
condition_scale: Optional[float] = 1.0,
|
| 914 |
+
input_ids_uncond: Optional[jnp.ndarray] = None,
|
| 915 |
+
attention_mask_uncond: Optional[jnp.ndarray] = None,
|
| 916 |
+
**model_kwargs,
|
| 917 |
+
):
|
| 918 |
+
"""Edit: Allow super conditioning."""
|
| 919 |
+
|
| 920 |
+
# set init values
|
| 921 |
+
max_length = max_length if max_length is not None else self.config.max_length
|
| 922 |
+
bos_token_id = (
|
| 923 |
+
bos_token_id if bos_token_id is not None else self.config.bos_token_id
|
| 924 |
+
)
|
| 925 |
+
pad_token_id = (
|
| 926 |
+
pad_token_id if pad_token_id is not None else self.config.pad_token_id
|
| 927 |
+
)
|
| 928 |
+
eos_token_id = (
|
| 929 |
+
eos_token_id if eos_token_id is not None else self.config.eos_token_id
|
| 930 |
+
)
|
| 931 |
+
decoder_start_token_id = (
|
| 932 |
+
decoder_start_token_id
|
| 933 |
+
if decoder_start_token_id
|
| 934 |
+
else self.config.decoder_start_token_id
|
| 935 |
+
)
|
| 936 |
+
prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
|
| 937 |
+
|
| 938 |
+
if decoder_start_token_id is None and self.config.is_encoder_decoder:
|
| 939 |
+
raise ValueError(
|
| 940 |
+
"`decoder_start_token_id` has to be defined for encoder-decoder generation."
|
| 941 |
+
)
|
| 942 |
+
|
| 943 |
+
do_sample = do_sample if do_sample is not None else self.config.do_sample
|
| 944 |
+
num_beams = num_beams if num_beams is not None else self.config.num_beams
|
| 945 |
+
|
| 946 |
+
if self.config.is_encoder_decoder:
|
| 947 |
+
# add encoder_outputs to model_kwargs
|
| 948 |
+
if model_kwargs.get("encoder_outputs") is None:
|
| 949 |
+
model_kwargs_input = dict(model_kwargs)
|
| 950 |
+
model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
|
| 951 |
+
input_ids,
|
| 952 |
+
params,
|
| 953 |
+
{"attention_mask": attention_mask, **model_kwargs_input},
|
| 954 |
+
)
|
| 955 |
+
if condition_scale != 1.0:
|
| 956 |
+
assert (
|
| 957 |
+
input_ids_uncond is not None
|
| 958 |
+
), "`input_ids_uncond` has to be defined for super conditioning."
|
| 959 |
+
assert (
|
| 960 |
+
do_sample is True
|
| 961 |
+
), "`do_sample` has to be True for super conditioning."
|
| 962 |
+
assert (
|
| 963 |
+
num_beams == 1
|
| 964 |
+
), "`num_beams` has to be 1 for super conditioning."
|
| 965 |
+
model_kwargs_uncond = (
|
| 966 |
+
self._prepare_encoder_decoder_kwargs_for_generation(
|
| 967 |
+
input_ids_uncond,
|
| 968 |
+
params,
|
| 969 |
+
{
|
| 970 |
+
"attention_mask": attention_mask_uncond,
|
| 971 |
+
**model_kwargs_input,
|
| 972 |
+
},
|
| 973 |
+
)
|
| 974 |
+
)
|
| 975 |
+
else:
|
| 976 |
+
model_kwargs_uncond = None
|
| 977 |
+
# prepare decoder_input_ids for generation
|
| 978 |
+
input_ids = (
|
| 979 |
+
jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
|
| 980 |
+
)
|
| 981 |
+
|
| 982 |
+
if not do_sample and num_beams == 1:
|
| 983 |
+
logits_processor = self._get_logits_processor(
|
| 984 |
+
no_repeat_ngram_size,
|
| 985 |
+
min_length,
|
| 986 |
+
max_length,
|
| 987 |
+
eos_token_id,
|
| 988 |
+
forced_bos_token_id,
|
| 989 |
+
forced_eos_token_id,
|
| 990 |
+
)
|
| 991 |
+
return self._greedy_search(
|
| 992 |
+
input_ids,
|
| 993 |
+
max_length,
|
| 994 |
+
pad_token_id,
|
| 995 |
+
eos_token_id,
|
| 996 |
+
logits_processor=logits_processor,
|
| 997 |
+
trace=trace,
|
| 998 |
+
params=params,
|
| 999 |
+
model_kwargs=model_kwargs,
|
| 1000 |
+
)
|
| 1001 |
+
elif do_sample and num_beams == 1:
|
| 1002 |
+
logits_warper = self._get_logits_warper(
|
| 1003 |
+
top_k=top_k, top_p=top_p, temperature=temperature
|
| 1004 |
+
)
|
| 1005 |
+
logits_processor = self._get_logits_processor(
|
| 1006 |
+
no_repeat_ngram_size,
|
| 1007 |
+
min_length,
|
| 1008 |
+
max_length,
|
| 1009 |
+
eos_token_id,
|
| 1010 |
+
forced_bos_token_id,
|
| 1011 |
+
forced_eos_token_id,
|
| 1012 |
+
)
|
| 1013 |
+
return self._sample(
|
| 1014 |
+
input_ids,
|
| 1015 |
+
max_length,
|
| 1016 |
+
pad_token_id,
|
| 1017 |
+
eos_token_id,
|
| 1018 |
+
prng_key,
|
| 1019 |
+
logits_warper=logits_warper,
|
| 1020 |
+
logits_processor=logits_processor,
|
| 1021 |
+
trace=trace,
|
| 1022 |
+
params=params,
|
| 1023 |
+
model_kwargs=model_kwargs,
|
| 1024 |
+
condition_scale=condition_scale,
|
| 1025 |
+
model_kwargs_uncond=model_kwargs_uncond,
|
| 1026 |
+
)
|
| 1027 |
+
elif not do_sample and num_beams > 1:
|
| 1028 |
+
# broadcast input_ids & encoder_outputs
|
| 1029 |
+
input_ids = self._expand_to_num_beams(input_ids, num_beams=num_beams)
|
| 1030 |
+
|
| 1031 |
+
if "encoder_outputs" in model_kwargs:
|
| 1032 |
+
model_kwargs["encoder_outputs"][
|
| 1033 |
+
"last_hidden_state"
|
| 1034 |
+
] = self._expand_to_num_beams(
|
| 1035 |
+
model_kwargs["encoder_outputs"]["last_hidden_state"],
|
| 1036 |
+
num_beams=num_beams,
|
| 1037 |
+
)
|
| 1038 |
+
|
| 1039 |
+
if "attention_mask" in model_kwargs:
|
| 1040 |
+
model_kwargs["attention_mask"] = self._expand_to_num_beams(
|
| 1041 |
+
model_kwargs["attention_mask"], num_beams=num_beams
|
| 1042 |
+
)
|
| 1043 |
+
|
| 1044 |
+
logits_processor = self._get_logits_processor(
|
| 1045 |
+
no_repeat_ngram_size,
|
| 1046 |
+
min_length,
|
| 1047 |
+
max_length,
|
| 1048 |
+
eos_token_id,
|
| 1049 |
+
forced_bos_token_id,
|
| 1050 |
+
forced_eos_token_id,
|
| 1051 |
+
)
|
| 1052 |
+
|
| 1053 |
+
return self._beam_search(
|
| 1054 |
+
input_ids,
|
| 1055 |
+
max_length,
|
| 1056 |
+
pad_token_id,
|
| 1057 |
+
eos_token_id,
|
| 1058 |
+
length_penalty=length_penalty,
|
| 1059 |
+
early_stopping=early_stopping,
|
| 1060 |
+
logits_processor=logits_processor,
|
| 1061 |
+
trace=trace,
|
| 1062 |
+
params=params,
|
| 1063 |
+
model_kwargs=model_kwargs,
|
| 1064 |
+
)
|
| 1065 |
+
else:
|
| 1066 |
+
raise NotImplementedError("`Beam sampling is currently not implemented.")
|
| 1067 |
+
|
| 1068 |
+
def _sample(
|
| 1069 |
+
self,
|
| 1070 |
+
input_ids: None,
|
| 1071 |
+
max_length: Optional[int] = None,
|
| 1072 |
+
pad_token_id: Optional[int] = None,
|
| 1073 |
+
eos_token_id: Optional[int] = None,
|
| 1074 |
+
prng_key: Optional[jnp.ndarray] = None,
|
| 1075 |
+
logits_processor=None,
|
| 1076 |
+
logits_warper=None,
|
| 1077 |
+
trace: bool = True,
|
| 1078 |
+
params: Optional[Dict[str, jnp.ndarray]] = None,
|
| 1079 |
+
model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
|
| 1080 |
+
condition_scale: float = 1.0,
|
| 1081 |
+
model_kwargs_uncond: Optional[Dict[str, jnp.ndarray]] = None,
|
| 1082 |
+
):
|
| 1083 |
+
# init values
|
| 1084 |
+
max_length = max_length if max_length is not None else self.config.max_length
|
| 1085 |
+
pad_token_id = (
|
| 1086 |
+
pad_token_id if pad_token_id is not None else self.config.pad_token_id
|
| 1087 |
+
)
|
| 1088 |
+
eos_token_id = (
|
| 1089 |
+
eos_token_id if eos_token_id is not None else self.config.eos_token_id
|
| 1090 |
+
)
|
| 1091 |
+
prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
|
| 1092 |
+
|
| 1093 |
+
batch_size, cur_len = input_ids.shape
|
| 1094 |
+
|
| 1095 |
+
eos_token_id = jnp.array(eos_token_id)
|
| 1096 |
+
pad_token_id = jnp.array(pad_token_id)
|
| 1097 |
+
cur_len = jnp.array(cur_len)
|
| 1098 |
+
|
| 1099 |
+
# per batch-item holding current token in loop.
|
| 1100 |
+
sequences = jnp.full((batch_size, max_length), pad_token_id, dtype=jnp.int32)
|
| 1101 |
+
sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0))
|
| 1102 |
+
|
| 1103 |
+
# per batch-item state bit indicating if sentence has finished.
|
| 1104 |
+
is_sent_finished = jnp.zeros((batch_size,), dtype=jnp.bool_)
|
| 1105 |
+
|
| 1106 |
+
# For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
|
| 1107 |
+
# and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
|
| 1108 |
+
model = self.decode if self.config.is_encoder_decoder else self
|
| 1109 |
+
|
| 1110 |
+
# initialize model specific kwargs
|
| 1111 |
+
model_kwargs = self.prepare_inputs_for_generation(
|
| 1112 |
+
input_ids, max_length, **model_kwargs
|
| 1113 |
+
)
|
| 1114 |
+
if condition_scale != 1.0:
|
| 1115 |
+
model_kwargs_uncond = self.prepare_inputs_for_generation(
|
| 1116 |
+
input_ids, max_length, **model_kwargs_uncond
|
| 1117 |
+
)
|
| 1118 |
+
|
| 1119 |
+
# initialize state
|
| 1120 |
+
state = SampleState(
|
| 1121 |
+
cur_len=cur_len,
|
| 1122 |
+
sequences=sequences,
|
| 1123 |
+
running_token=input_ids,
|
| 1124 |
+
is_sent_finished=is_sent_finished,
|
| 1125 |
+
prng_key=prng_key,
|
| 1126 |
+
model_kwargs=model_kwargs,
|
| 1127 |
+
model_kwargs_uncond=model_kwargs_uncond,
|
| 1128 |
+
)
|
| 1129 |
+
|
| 1130 |
+
def sample_search_cond_fn(state):
|
| 1131 |
+
"""state termination condition fn."""
|
| 1132 |
+
has_reached_max_length = state.cur_len == max_length
|
| 1133 |
+
all_sequence_finished = jnp.all(state.is_sent_finished)
|
| 1134 |
+
finish_generation = jnp.logical_or(
|
| 1135 |
+
has_reached_max_length, all_sequence_finished
|
| 1136 |
+
)
|
| 1137 |
+
return ~finish_generation
|
| 1138 |
+
|
| 1139 |
+
def sample_search_body_fn(state):
|
| 1140 |
+
"""state update fn."""
|
| 1141 |
+
prng_key, prng_key_next = jax.random.split(state.prng_key)
|
| 1142 |
+
model_outputs = model(
|
| 1143 |
+
state.running_token, params=params, **state.model_kwargs
|
| 1144 |
+
)
|
| 1145 |
+
|
| 1146 |
+
logits = model_outputs.logits[:, -1]
|
| 1147 |
+
|
| 1148 |
+
# perform super conditioning
|
| 1149 |
+
# Source: @RiversHaveWings - https://twitter.com/RiversHaveWings/status/1478093658716966912?s=20&t=xdm-wZ61Wf7OLnE_NJHZ1w
|
| 1150 |
+
if condition_scale != 1.0:
|
| 1151 |
+
model_outputs_uncond = model(
|
| 1152 |
+
state.running_token, params=params, **state.model_kwargs_uncond
|
| 1153 |
+
)
|
| 1154 |
+
logits_uncond = model_outputs_uncond.logits[:, -1]
|
| 1155 |
+
logits = logits_uncond + condition_scale * (logits - logits_uncond)
|
| 1156 |
+
else:
|
| 1157 |
+
model_outputs_uncond = None
|
| 1158 |
+
|
| 1159 |
+
# apply min_length, ...
|
| 1160 |
+
logits = logits_processor(state.sequences, logits, state.cur_len)
|
| 1161 |
+
# apply top_k, top_k, temperature
|
| 1162 |
+
logits = logits_warper(logits, logits, state.cur_len)
|
| 1163 |
+
|
| 1164 |
+
next_token = jax.random.categorical(prng_key, logits, axis=-1)
|
| 1165 |
+
|
| 1166 |
+
next_is_sent_finished = state.is_sent_finished | (
|
| 1167 |
+
next_token == eos_token_id
|
| 1168 |
+
)
|
| 1169 |
+
next_token = (
|
| 1170 |
+
next_token * ~next_is_sent_finished
|
| 1171 |
+
+ pad_token_id * next_is_sent_finished
|
| 1172 |
+
)
|
| 1173 |
+
next_token = next_token[:, None]
|
| 1174 |
+
|
| 1175 |
+
next_sequences = lax.dynamic_update_slice(
|
| 1176 |
+
state.sequences, next_token, (0, state.cur_len)
|
| 1177 |
+
)
|
| 1178 |
+
next_model_kwargs = self.update_inputs_for_generation(
|
| 1179 |
+
model_outputs, state.model_kwargs
|
| 1180 |
+
)
|
| 1181 |
+
next_model_kwargs_uncond = (
|
| 1182 |
+
self.update_inputs_for_generation(
|
| 1183 |
+
model_outputs_uncond, state.model_kwargs_uncond
|
| 1184 |
+
)
|
| 1185 |
+
if condition_scale != 1.0
|
| 1186 |
+
else None
|
| 1187 |
+
)
|
| 1188 |
+
|
| 1189 |
+
return SampleState(
|
| 1190 |
+
cur_len=state.cur_len + 1,
|
| 1191 |
+
sequences=next_sequences,
|
| 1192 |
+
running_token=next_token,
|
| 1193 |
+
is_sent_finished=next_is_sent_finished,
|
| 1194 |
+
model_kwargs=next_model_kwargs,
|
| 1195 |
+
model_kwargs_uncond=next_model_kwargs_uncond,
|
| 1196 |
+
prng_key=prng_key_next,
|
| 1197 |
+
)
|
| 1198 |
+
|
| 1199 |
+
# The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
|
| 1200 |
+
if input_ids.shape[1] > 1:
|
| 1201 |
+
state = sample_search_body_fn(state)
|
| 1202 |
+
|
| 1203 |
+
if not trace:
|
| 1204 |
+
state = self._run_loop_in_debug(
|
| 1205 |
+
sample_search_cond_fn, sample_search_body_fn, state
|
| 1206 |
+
)
|
| 1207 |
+
else:
|
| 1208 |
+
state = lax.while_loop(sample_search_cond_fn, sample_search_body_fn, state)
|
| 1209 |
+
|
| 1210 |
+
return FlaxSampleOutput(sequences=state.sequences)
|
src/dalle_mini/model/processor.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" DalleBart processor """
|
| 2 |
+
|
| 3 |
+
import jax.numpy as jnp
|
| 4 |
+
|
| 5 |
+
from .configuration import DalleBartConfig
|
| 6 |
+
from .text import TextNormalizer
|
| 7 |
+
from .tokenizer import DalleBartTokenizer
|
| 8 |
+
from .utils import PretrainedFromWandbMixin
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DalleBartProcessorBase:
|
| 12 |
+
def __init__(
|
| 13 |
+
self, tokenizer: DalleBartTokenizer, normalize_text: bool, max_text_length: int
|
| 14 |
+
):
|
| 15 |
+
self.tokenizer = tokenizer
|
| 16 |
+
self.normalize_text = normalize_text
|
| 17 |
+
self.max_text_length = max_text_length
|
| 18 |
+
if normalize_text:
|
| 19 |
+
self.text_processor = TextNormalizer()
|
| 20 |
+
# create unconditional tokens
|
| 21 |
+
uncond = self.tokenizer(
|
| 22 |
+
"",
|
| 23 |
+
return_tensors="jax",
|
| 24 |
+
padding="max_length",
|
| 25 |
+
truncation=True,
|
| 26 |
+
max_length=self.max_text_length,
|
| 27 |
+
).data
|
| 28 |
+
self.input_ids_uncond = uncond["input_ids"]
|
| 29 |
+
self.attention_mask_uncond = uncond["attention_mask"]
|
| 30 |
+
|
| 31 |
+
def __call__(self, text: str = None):
|
| 32 |
+
# check that text is not a string
|
| 33 |
+
assert not isinstance(text, str), "text must be a list of strings"
|
| 34 |
+
|
| 35 |
+
if self.normalize_text:
|
| 36 |
+
text = [self.text_processor(t) for t in text]
|
| 37 |
+
res = self.tokenizer(
|
| 38 |
+
text,
|
| 39 |
+
return_tensors="jax",
|
| 40 |
+
padding="max_length",
|
| 41 |
+
truncation=True,
|
| 42 |
+
max_length=self.max_text_length,
|
| 43 |
+
).data
|
| 44 |
+
# tokens used only with super conditioning
|
| 45 |
+
n = len(text)
|
| 46 |
+
res["input_ids_uncond"] = jnp.repeat(self.input_ids_uncond, n, axis=0)
|
| 47 |
+
res["attention_mask_uncond"] = jnp.repeat(self.attention_mask_uncond, n, axis=0)
|
| 48 |
+
return res
|
| 49 |
+
|
| 50 |
+
@classmethod
|
| 51 |
+
def from_pretrained(cls, *args, **kwargs):
|
| 52 |
+
tokenizer = DalleBartTokenizer.from_pretrained(*args, **kwargs)
|
| 53 |
+
config = DalleBartConfig.from_pretrained(*args, **kwargs)
|
| 54 |
+
return cls(tokenizer, config.normalize_text, config.max_text_length)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class DalleBartProcessor(PretrainedFromWandbMixin, DalleBartProcessorBase):
|
| 58 |
+
pass
|
src/dalle_mini/{text.py → model/text.py}
RENAMED
|
File without changes
|
tools/inference/inference_pipeline.ipynb
CHANGED
|
@@ -75,7 +75,7 @@
|
|
| 75 |
"# Model references\n",
|
| 76 |
"\n",
|
| 77 |
"# dalle-mini\n",
|
| 78 |
-
"DALLE_MODEL = \"dalle-mini/dalle-mini/model-
|
| 79 |
"DALLE_COMMIT_ID = None\n",
|
| 80 |
"\n",
|
| 81 |
"# VQGAN model\n",
|
|
@@ -126,7 +126,7 @@
|
|
| 126 |
"outputs": [],
|
| 127 |
"source": [
|
| 128 |
"# Load models & tokenizer\n",
|
| 129 |
-
"from dalle_mini
|
| 130 |
"from vqgan_jax.modeling_flax_vqgan import VQModel\n",
|
| 131 |
"from transformers import CLIPProcessor, FlaxCLIPModel\n",
|
| 132 |
"\n",
|
|
@@ -134,14 +134,13 @@
|
|
| 134 |
"model = DalleBart.from_pretrained(\n",
|
| 135 |
" DALLE_MODEL, revision=DALLE_COMMIT_ID, dtype=dtype, abstract_init=True\n",
|
| 136 |
")\n",
|
| 137 |
-
"tokenizer = DalleBartTokenizer.from_pretrained(DALLE_MODEL, revision=DALLE_COMMIT_ID)\n",
|
| 138 |
"\n",
|
| 139 |
"# Load VQGAN\n",
|
| 140 |
"vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
|
| 141 |
"\n",
|
| 142 |
"# Load CLIP\n",
|
| 143 |
"clip = FlaxCLIPModel.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)\n",
|
| 144 |
-
"
|
| 145 |
]
|
| 146 |
},
|
| 147 |
{
|
|
@@ -192,17 +191,18 @@
|
|
| 192 |
"from functools import partial\n",
|
| 193 |
"\n",
|
| 194 |
"# model inference\n",
|
| 195 |
-
"@partial(jax.pmap, axis_name=\"batch\", static_broadcasted_argnums=(3, 4))\n",
|
| 196 |
-
"def p_generate(
|
|
|
|
|
|
|
| 197 |
" return model.generate(\n",
|
| 198 |
" **tokenized_prompt,\n",
|
| 199 |
-
" do_sample=True,\n",
|
| 200 |
-
" num_beams=1,\n",
|
| 201 |
" prng_key=key,\n",
|
| 202 |
" params=params,\n",
|
| 203 |
" top_k=top_k,\n",
|
| 204 |
" top_p=top_p,\n",
|
| 205 |
-
"
|
|
|
|
| 206 |
" )\n",
|
| 207 |
"\n",
|
| 208 |
"\n",
|
|
@@ -258,7 +258,7 @@
|
|
| 258 |
"id": "rsmj0Aj5OQox"
|
| 259 |
},
|
| 260 |
"source": [
|
| 261 |
-
"Our model
|
| 262 |
]
|
| 263 |
},
|
| 264 |
{
|
|
@@ -269,9 +269,9 @@
|
|
| 269 |
},
|
| 270 |
"outputs": [],
|
| 271 |
"source": [
|
| 272 |
-
"from dalle_mini
|
| 273 |
"\n",
|
| 274 |
-
"
|
| 275 |
]
|
| 276 |
},
|
| 277 |
{
|
|
@@ -291,7 +291,7 @@
|
|
| 291 |
},
|
| 292 |
"outputs": [],
|
| 293 |
"source": [
|
| 294 |
-
"prompt = \"
|
| 295 |
]
|
| 296 |
},
|
| 297 |
{
|
|
@@ -302,34 +302,7 @@
|
|
| 302 |
},
|
| 303 |
"outputs": [],
|
| 304 |
"source": [
|
| 305 |
-
"
|
| 306 |
-
"processed_prompt"
|
| 307 |
-
]
|
| 308 |
-
},
|
| 309 |
-
{
|
| 310 |
-
"cell_type": "markdown",
|
| 311 |
-
"metadata": {
|
| 312 |
-
"id": "QUzYACWxOe5z"
|
| 313 |
-
},
|
| 314 |
-
"source": [
|
| 315 |
-
"We tokenize the prompt."
|
| 316 |
-
]
|
| 317 |
-
},
|
| 318 |
-
{
|
| 319 |
-
"cell_type": "code",
|
| 320 |
-
"execution_count": null,
|
| 321 |
-
"metadata": {
|
| 322 |
-
"id": "n8e7MvGwOe5z"
|
| 323 |
-
},
|
| 324 |
-
"outputs": [],
|
| 325 |
-
"source": [
|
| 326 |
-
"tokenized_prompt = tokenizer(\n",
|
| 327 |
-
" processed_prompt,\n",
|
| 328 |
-
" return_tensors=\"jax\",\n",
|
| 329 |
-
" padding=\"max_length\",\n",
|
| 330 |
-
" truncation=True,\n",
|
| 331 |
-
" max_length=128,\n",
|
| 332 |
-
").data\n",
|
| 333 |
"tokenized_prompt"
|
| 334 |
]
|
| 335 |
},
|
|
@@ -390,7 +363,9 @@
|
|
| 390 |
"\n",
|
| 391 |
"# We can customize top_k/top_p used for generating samples\n",
|
| 392 |
"gen_top_k = None\n",
|
| 393 |
-
"gen_top_p = None"
|
|
|
|
|
|
|
| 394 |
]
|
| 395 |
},
|
| 396 |
{
|
|
@@ -413,7 +388,13 @@
|
|
| 413 |
" key, subkey = jax.random.split(key)\n",
|
| 414 |
" # generate images\n",
|
| 415 |
" encoded_images = p_generate(\n",
|
| 416 |
-
" tokenized_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
" )\n",
|
| 418 |
" # remove BOS\n",
|
| 419 |
" encoded_images = encoded_images.sequences[..., 1:]\n",
|
|
@@ -444,7 +425,7 @@
|
|
| 444 |
"from flax.training.common_utils import shard\n",
|
| 445 |
"\n",
|
| 446 |
"# get clip scores\n",
|
| 447 |
-
"clip_inputs =
|
| 448 |
" text=[prompt] * jax.device_count(),\n",
|
| 449 |
" images=images,\n",
|
| 450 |
" return_tensors=\"np\",\n",
|
|
|
|
| 75 |
"# Model references\n",
|
| 76 |
"\n",
|
| 77 |
"# dalle-mini\n",
|
| 78 |
+
"DALLE_MODEL = \"dalle-mini/dalle-mini/model-2vm4itcx:latest\" # can be wandb artifact or 🤗 Hub or local folder or google bucket\n",
|
| 79 |
"DALLE_COMMIT_ID = None\n",
|
| 80 |
"\n",
|
| 81 |
"# VQGAN model\n",
|
|
|
|
| 126 |
"outputs": [],
|
| 127 |
"source": [
|
| 128 |
"# Load models & tokenizer\n",
|
| 129 |
+
"from dalle_mini import DalleBart, DalleBartProcessor\n",
|
| 130 |
"from vqgan_jax.modeling_flax_vqgan import VQModel\n",
|
| 131 |
"from transformers import CLIPProcessor, FlaxCLIPModel\n",
|
| 132 |
"\n",
|
|
|
|
| 134 |
"model = DalleBart.from_pretrained(\n",
|
| 135 |
" DALLE_MODEL, revision=DALLE_COMMIT_ID, dtype=dtype, abstract_init=True\n",
|
| 136 |
")\n",
|
|
|
|
| 137 |
"\n",
|
| 138 |
"# Load VQGAN\n",
|
| 139 |
"vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
|
| 140 |
"\n",
|
| 141 |
"# Load CLIP\n",
|
| 142 |
"clip = FlaxCLIPModel.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)\n",
|
| 143 |
+
"clip_processor = CLIPProcessor.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)"
|
| 144 |
]
|
| 145 |
},
|
| 146 |
{
|
|
|
|
| 191 |
"from functools import partial\n",
|
| 192 |
"\n",
|
| 193 |
"# model inference\n",
|
| 194 |
+
"@partial(jax.pmap, axis_name=\"batch\", static_broadcasted_argnums=(3, 4, 5, 6))\n",
|
| 195 |
+
"def p_generate(\n",
|
| 196 |
+
" tokenized_prompt, key, params, top_k, top_p, temperature, condition_scale\n",
|
| 197 |
+
"):\n",
|
| 198 |
" return model.generate(\n",
|
| 199 |
" **tokenized_prompt,\n",
|
|
|
|
|
|
|
| 200 |
" prng_key=key,\n",
|
| 201 |
" params=params,\n",
|
| 202 |
" top_k=top_k,\n",
|
| 203 |
" top_p=top_p,\n",
|
| 204 |
+
" temperature=temperature,\n",
|
| 205 |
+
" condition_scale=condition_scale,\n",
|
| 206 |
" )\n",
|
| 207 |
"\n",
|
| 208 |
"\n",
|
|
|
|
| 258 |
"id": "rsmj0Aj5OQox"
|
| 259 |
},
|
| 260 |
"source": [
|
| 261 |
+
"Our model requires processing prompts."
|
| 262 |
]
|
| 263 |
},
|
| 264 |
{
|
|
|
|
| 269 |
},
|
| 270 |
"outputs": [],
|
| 271 |
"source": [
|
| 272 |
+
"from dalle_mini import DalleBartProcessor\n",
|
| 273 |
"\n",
|
| 274 |
+
"processor = DalleBartProcessor.from_pretrained(DALLE_MODEL, revision=DALLE_COMMIT_ID)"
|
| 275 |
]
|
| 276 |
},
|
| 277 |
{
|
|
|
|
| 291 |
},
|
| 292 |
"outputs": [],
|
| 293 |
"source": [
|
| 294 |
+
"prompt = \"a blue table\""
|
| 295 |
]
|
| 296 |
},
|
| 297 |
{
|
|
|
|
| 302 |
},
|
| 303 |
"outputs": [],
|
| 304 |
"source": [
|
| 305 |
+
"tokenized_prompt = processor([prompt])\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
"tokenized_prompt"
|
| 307 |
]
|
| 308 |
},
|
|
|
|
| 363 |
"\n",
|
| 364 |
"# We can customize top_k/top_p used for generating samples\n",
|
| 365 |
"gen_top_k = None\n",
|
| 366 |
+
"gen_top_p = None\n",
|
| 367 |
+
"temperature = 0.85\n",
|
| 368 |
+
"cond_scale = 3.0"
|
| 369 |
]
|
| 370 |
},
|
| 371 |
{
|
|
|
|
| 388 |
" key, subkey = jax.random.split(key)\n",
|
| 389 |
" # generate images\n",
|
| 390 |
" encoded_images = p_generate(\n",
|
| 391 |
+
" tokenized_prompt,\n",
|
| 392 |
+
" shard_prng_key(subkey),\n",
|
| 393 |
+
" model.params,\n",
|
| 394 |
+
" gen_top_k,\n",
|
| 395 |
+
" gen_top_p,\n",
|
| 396 |
+
" temperature,\n",
|
| 397 |
+
" cond_scale,\n",
|
| 398 |
" )\n",
|
| 399 |
" # remove BOS\n",
|
| 400 |
" encoded_images = encoded_images.sequences[..., 1:]\n",
|
|
|
|
| 425 |
"from flax.training.common_utils import shard\n",
|
| 426 |
"\n",
|
| 427 |
"# get clip scores\n",
|
| 428 |
+
"clip_inputs = clip_processor(\n",
|
| 429 |
" text=[prompt] * jax.device_count(),\n",
|
| 430 |
" images=images,\n",
|
| 431 |
" return_tensors=\"np\",\n",
|
tools/train/train.py
CHANGED
|
@@ -103,7 +103,7 @@ class ModelArguments:
|
|
| 103 |
|
| 104 |
def __post_init__(self):
|
| 105 |
if self.tokenizer_name is None:
|
| 106 |
-
self.tokenizer_name
|
| 107 |
assert (
|
| 108 |
self.tokenizer_name is not None
|
| 109 |
), "Tokenizer name or model name/path needs to be specified"
|
|
@@ -209,6 +209,26 @@ class DataTrainingArguments:
|
|
| 209 |
"help": "Probability of removing some captions for classifier-free guidance."
|
| 210 |
},
|
| 211 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
max_train_samples: Optional[int] = field(
|
| 213 |
default=None,
|
| 214 |
metadata={
|
|
|
|
| 103 |
|
| 104 |
def __post_init__(self):
|
| 105 |
if self.tokenizer_name is None:
|
| 106 |
+
self.tokenizer_name = self.model_name_or_path
|
| 107 |
assert (
|
| 108 |
self.tokenizer_name is not None
|
| 109 |
), "Tokenizer name or model name/path needs to be specified"
|
|
|
|
| 209 |
"help": "Probability of removing some captions for classifier-free guidance."
|
| 210 |
},
|
| 211 |
)
|
| 212 |
+
clip_score_column: Optional[str] = field(
|
| 213 |
+
default="clip_score",
|
| 214 |
+
metadata={"help": "Column that containts clip score for filtering."},
|
| 215 |
+
)
|
| 216 |
+
min_clip_score: Optional[float] = field(
|
| 217 |
+
default=None,
|
| 218 |
+
metadata={"help": "Minimum clip score required."},
|
| 219 |
+
)
|
| 220 |
+
max_clip_score: Optional[float] = field(
|
| 221 |
+
default=None,
|
| 222 |
+
metadata={"help": "Maximum clip score required."},
|
| 223 |
+
)
|
| 224 |
+
filter_column: Optional[str] = field(
|
| 225 |
+
default=None,
|
| 226 |
+
metadata={"help": "Column that containts classes to be filtered."},
|
| 227 |
+
)
|
| 228 |
+
filter_value: Optional[str] = field(
|
| 229 |
+
default=None,
|
| 230 |
+
metadata={"help": "Class value to be kept during filtering."},
|
| 231 |
+
)
|
| 232 |
max_train_samples: Optional[int] = field(
|
| 233 |
default=None,
|
| 234 |
metadata={
|