Spaces:

tom-doerr
/

logo_generator

Runtime error

App Files Files Community

boris committed on Mar 17, 2022

Commit

7939874

unverified ·

1 Parent(s): 803ccbf

feat(data): super conditioning (#141)

Browse files

* feat(data): online filtering
* feat(generate): super conditioning
* feat: add processor

Files changed (9) hide show

README.md +1 -1
src/dalle_mini/__init__.py +3 -1
src/dalle_mini/data.py +69 -20
src/dalle_mini/model/__init__.py +1 -0
src/dalle_mini/model/modeling.py +337 -1
src/dalle_mini/model/processor.py +58 -0
src/dalle_mini/{text.py → model/text.py} +0 -0
tools/inference/inference_pipeline.ipynb +25 -44
tools/train/train.py +21 -1

README.md CHANGED Viewed

@@ -35,7 +35,6 @@ To generate sample predictions and understand the inference pipeline step by ste
 Join the community on the [DALLE-Pytorch Discord](https://discord.gg/xBPBXfcFHd).
 Any contribution is welcome, from reporting issues to proposing fixes/improvements or testing the model with cool prompts!
 ## Development
 ### Dependencies Installation
@@ -95,6 +94,7 @@ Many thanks to the people who helped make it better:
 - the [DALLE-Pytorch](https://discord.gg/xBPBXfcFHd) and [EleutherAI](https://www.eleuther.ai/) communities for testing and exchanging cool ideas
 - [Rohan Anil](https://github.com/rohan-anil) for adding Distributed Shampoo optimizer
 ## Citing DALL·E mini

 Join the community on the [DALLE-Pytorch Discord](https://discord.gg/xBPBXfcFHd).
 Any contribution is welcome, from reporting issues to proposing fixes/improvements or testing the model with cool prompts!
 ## Development
 ### Dependencies Installation
 - the [DALLE-Pytorch](https://discord.gg/xBPBXfcFHd) and [EleutherAI](https://www.eleuther.ai/) communities for testing and exchanging cool ideas
 - [Rohan Anil](https://github.com/rohan-anil) for adding Distributed Shampoo optimizer
+- [Katherine Crowson](https://github.com/crowsonkb) for [super conditioning](https://twitter.com/RiversHaveWings/status/1478093658716966912)
 ## Citing DALL·E mini

src/dalle_mini/__init__.py CHANGED Viewed

	@@ -1 +1,3 @@
1	- __version__ = "0.0.2"


1	+ __version__ = "0.0.3"
2	+
3	+ from .model import DalleBart, DalleBartProcessor

src/dalle_mini/data.py CHANGED Viewed

@@ -7,7 +7,7 @@ import numpy as np
 from braceexpand import braceexpand
 from datasets import Dataset, load_dataset
-from .text import TextNormalizer
 @dataclass
@@ -28,6 +28,11 @@ class Dataset:
     seed_dataset: int = None
     shard_by_host: bool = False
     blank_caption_prob: float = 0.0
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
@@ -36,6 +41,7 @@ class Dataset:
     def __post_init__(self):
         self.multi_hosts = jax.process_count() > 1
         # feed blank captions only in streaming mode for now
         if self.blank_caption_prob:
             assert (
                 self.streaming is True
@@ -107,23 +113,30 @@ class Dataset:
                 self.seed_dataset = np.random.get_state()[1][0]
             self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)
-        # blank captions
-        if self.blank_caption_prob:
-            partial_blank_caption_function = partial(
-                blank_caption_function,
-                text_column=self.text_column,
-                blank_caption_prob=self.blank_caption_prob,
-            )
-            if hasattr(self, "train_dataset"):
-                self.train_dataset = (
-                    self.train_dataset.map(partial_blank_caption_function)
-                    if self.streaming
-                    else self.train_dataset.map(
-                        partial_blank_caption_function,
-                        num_proc=self.preprocessing_num_workers,
-                        load_from_cache_file=False,
-                        desc="Blanking some captions",
-                    )
                 )
         # normalize text
@@ -151,6 +164,25 @@ class Dataset:
                         ),
                     )
         # preprocess
         partial_preprocess_function = partial(
             preprocess_function,
@@ -230,8 +262,8 @@ class Dataset:
                     dataset.set_epoch(epoch)
                     epoch += 1
                 for item in dataset:
-                    for k, v in item.items():
-                        batch[k].append(v)
                     if len(batch[keys[0]]) == batch_size:
                         batch = {k: jnp.array(v) for k, v in batch.items()}
                         yield batch
@@ -292,6 +324,23 @@ def normalize_function(example, text_column, text_normalizer):
     return example
 def preprocess_function(
     examples,
     tokenizer,

 from braceexpand import braceexpand
 from datasets import Dataset, load_dataset
+from .model.text import TextNormalizer
 @dataclass
     seed_dataset: int = None
     shard_by_host: bool = False
     blank_caption_prob: float = 0.0
+    clip_score_column: str = "clip_score"
+    min_clip_score: float = None
+    max_clip_score: float = None
+    filter_column: str = None
+    filter_value: str = None
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
     def __post_init__(self):
         self.multi_hosts = jax.process_count() > 1
         # feed blank captions only in streaming mode for now
+        # otherwise dataset could be cached with same blanked captions
         if self.blank_caption_prob:
             assert (
                 self.streaming is True
                 self.seed_dataset = np.random.get_state()[1][0]
             self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)
+        # filter data
+        partial_filter_function = partial(
+            filter_function,
+            filter_column=self.filter_column,
+            filter_value=self.filter_value,
+            clip_score_column=self.clip_score_column,
+            min_clip_score=self.min_clip_score,
+            max_clip_score=self.max_clip_score,
+        )
+        for ds in ["train_dataset", "eval_dataset"]:
+            if hasattr(self, ds):
+                setattr(
+                    self,
+                    ds,
+                    (
+                        getattr(self, ds).filter(partial_filter_function)
+                        if self.streaming
+                        else getattr(self, ds).filter(
+                            partial_filter_function,
+                            num_proc=self.preprocessing_num_workers,
+                            load_from_cache_file=not self.overwrite_cache,
+                            desc="Filtering datasets",
+                        )
+                    ),
                 )
         # normalize text
                         ),
                     )
+        # blank captions
+        if self.blank_caption_prob:
+            partial_blank_caption_function = partial(
+                blank_caption_function,
+                text_column=self.text_column,
+                blank_caption_prob=self.blank_caption_prob,
+            )
+            if hasattr(self, "train_dataset"):
+                self.train_dataset = (
+                    self.train_dataset.map(partial_blank_caption_function)
+                    if self.streaming
+                    else self.train_dataset.map(
+                        partial_blank_caption_function,
+                        num_proc=self.preprocessing_num_workers,
+                        load_from_cache_file=False,
+                        desc="Blanking some captions",
+                    )
+                )
         # preprocess
         partial_preprocess_function = partial(
             preprocess_function,
                     dataset.set_epoch(epoch)
                     epoch += 1
                 for item in dataset:
+                    for k in keys:
+                        batch[k].append(item[k])
                     if len(batch[keys[0]]) == batch_size:
                         batch = {k: jnp.array(v) for k, v in batch.items()}
                         yield batch
     return example
+def filter_function(
+    example,
+    min_clip_score,
+    max_clip_score,
+    clip_score_column,
+    filter_column,
+    filter_value,
+):
+    """Predicate deciding whether a single dataset example is kept.
+
+    Args:
+        example: Row indexed by column name.
+        min_clip_score: Lower bound on ``example[clip_score_column]``;
+            ``None`` disables the check.
+        max_clip_score: Upper bound on ``example[clip_score_column]``;
+            ``None`` disables the check.
+        clip_score_column: Name of the column holding the score. It is only
+            read when at least one of the two bounds is set.
+        filter_column: Name of a column to match against ``filter_value``;
+            ``None`` disables the check.
+        filter_value: Value ``example[filter_column]`` must equal.
+
+    Returns:
+        ``False`` as soon as one enabled check fails, ``True`` otherwise.
+    """
+    # Both bounds are inclusive: a score equal to a bound is kept.
+    if min_clip_score is not None and example[clip_score_column] < min_clip_score:
+        return False
+    if max_clip_score is not None and example[clip_score_column] > max_clip_score:
+        return False
+    # Keep only examples whose filter column equals the requested value.
+    if filter_column is not None and example[filter_column] != filter_value:
+        return False
+    return True
 def preprocess_function(
     examples,
     tokenizer,

src/dalle_mini/model/__init__.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from .configuration import DalleBartConfig
 from .modeling import DalleBart
 from .partitions import set_partitions
 from .tokenizer import DalleBartTokenizer

 from .configuration import DalleBartConfig
 from .modeling import DalleBart
 from .partitions import set_partitions
+from .processor import DalleBartProcessor
 from .tokenizer import DalleBartTokenizer

src/dalle_mini/model/modeling.py CHANGED Viewed

@@ -18,8 +18,9 @@ import math
 import os
 from functools import partial
 from pickle import UnpicklingError
-from typing import Optional, Tuple, Union
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
@@ -39,6 +40,7 @@ from transformers.file_utils import (
     is_offline_mode,
     is_remote_url,
 )
 from transformers.modeling_flax_outputs import (
     FlaxCausalLMOutputWithCrossAttentions,
     FlaxSeq2SeqLMOutput,
@@ -691,6 +693,17 @@ class FlaxBartForConditionalGenerationModule(FlaxBartForConditionalGenerationMod
         )
 class DalleBart(
     PretrainedFromWandbMixin, FlaxBartPreTrainedModel, FlaxBartForConditionalGeneration
 ):
@@ -702,6 +715,7 @@ class DalleBart(
     - no bias in decode method
     - custom prepare_inputs_for_generation using "max_length - 1" to avoid issues
       related to position embedding during model.generate()
     """
     module_class = FlaxBartForConditionalGenerationModule
@@ -872,3 +886,325 @@ class DalleBart(
             "decoder_attention_mask": extended_attention_mask,
             "decoder_position_ids": position_ids,
         }

 import os
 from functools import partial
 from pickle import UnpicklingError
+from typing import Dict, Optional, Tuple, Union
+import flax
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
     is_offline_mode,
     is_remote_url,
 )
+from transformers.generation_flax_utils import FlaxSampleOutput
 from transformers.modeling_flax_outputs import (
     FlaxCausalLMOutputWithCrossAttentions,
     FlaxSeq2SeqLMOutput,
         )
+@flax.struct.dataclass
+class SampleState:
+    """Loop state carried between iterations of the sampling loop in `_sample`."""
+
+    # Current write position in `sequences`; incremented by one per step.
+    cur_len: jnp.ndarray
+    # (batch_size, max_length) token buffer, pre-filled with the pad token.
+    sequences: jnp.ndarray
+    # Tokens fed to the model at the next step (the prompt first, then the
+    # last sampled token of each sequence).
+    running_token: jnp.ndarray
+    # Per-sequence flag, set once the EOS token has been sampled.
+    is_sent_finished: jnp.ndarray
+    # PRNG key; split at every step.
+    prng_key: jnp.ndarray
+    # Keyword arguments for the conditional model call, refreshed every step
+    # through `update_inputs_for_generation`.
+    model_kwargs: Dict[str, jnp.ndarray]
+    # Same for the unconditional call; None when condition_scale == 1.0.
+    model_kwargs_uncond: Dict[str, jnp.ndarray]
 class DalleBart(
     PretrainedFromWandbMixin, FlaxBartPreTrainedModel, FlaxBartForConditionalGeneration
 ):
     - no bias in decode method
     - custom prepare_inputs_for_generation using "max_length - 1" to avoid issues
       related to position embedding during model.generate()
+    - custom generate method to allow super conditions
     """
     module_class = FlaxBartForConditionalGenerationModule
             "decoder_attention_mask": extended_attention_mask,
             "decoder_position_ids": position_ids,
         }
+    def generate(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        decoder_start_token_id: Optional[int] = None,
+        do_sample: Optional[bool] = None,
+        prng_key: Optional[jnp.ndarray] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        temperature: Optional[float] = None,
+        num_beams: Optional[int] = None,
+        no_repeat_ngram_size: Optional[int] = None,
+        min_length: Optional[int] = None,
+        forced_bos_token_id: Optional[int] = None,
+        forced_eos_token_id: Optional[int] = None,
+        length_penalty: Optional[float] = None,
+        early_stopping: Optional[bool] = None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        condition_scale: Optional[float] = 1.0,
+        input_ids_uncond: Optional[jnp.ndarray] = None,
+        attention_mask_uncond: Optional[jnp.ndarray] = None,
+        **model_kwargs,
+    ):
+        """Edit: Allow super conditioning."""
+        # set init values
+        max_length = max_length if max_length is not None else self.config.max_length
+        bos_token_id = (
+            bos_token_id if bos_token_id is not None else self.config.bos_token_id
+        )
+        pad_token_id = (
+            pad_token_id if pad_token_id is not None else self.config.pad_token_id
+        )
+        eos_token_id = (
+            eos_token_id if eos_token_id is not None else self.config.eos_token_id
+        )
+        decoder_start_token_id = (
+            decoder_start_token_id
+            if decoder_start_token_id
+            else self.config.decoder_start_token_id
+        )
+        prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
+        if decoder_start_token_id is None and self.config.is_encoder_decoder:
+            raise ValueError(
+                "`decoder_start_token_id` has to be defined for encoder-decoder generation."
+            )
+        do_sample = do_sample if do_sample is not None else self.config.do_sample
+        num_beams = num_beams if num_beams is not None else self.config.num_beams
+        if self.config.is_encoder_decoder:
+            # add encoder_outputs to model_kwargs
+            if model_kwargs.get("encoder_outputs") is None:
+                model_kwargs_input = dict(model_kwargs)
+                model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+                    input_ids,
+                    params,
+                    {"attention_mask": attention_mask, **model_kwargs_input},
+                )
+                if condition_scale != 1.0:
+                    assert (
+                        input_ids_uncond is not None
+                    ), "`input_ids_uncond` has to be defined for super conditioning."
+                    assert (
+                        do_sample is True
+                    ), "`do_sample` has to be True for super conditioning."
+                    assert (
+                        num_beams == 1
+                    ), "`num_beams` has to be 1 for super conditioning."
+                    model_kwargs_uncond = (
+                        self._prepare_encoder_decoder_kwargs_for_generation(
+                            input_ids_uncond,
+                            params,
+                            {
+                                "attention_mask": attention_mask_uncond,
+                                **model_kwargs_input,
+                            },
+                        )
+                    )
+                else:
+                    model_kwargs_uncond = None
+            # prepare decoder_input_ids for generation
+            input_ids = (
+                jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+            )
+        if not do_sample and num_beams == 1:
+            logits_processor = self._get_logits_processor(
+                no_repeat_ngram_size,
+                min_length,
+                max_length,
+                eos_token_id,
+                forced_bos_token_id,
+                forced_eos_token_id,
+            )
+            return self._greedy_search(
+                input_ids,
+                max_length,
+                pad_token_id,
+                eos_token_id,
+                logits_processor=logits_processor,
+                trace=trace,
+                params=params,
+                model_kwargs=model_kwargs,
+            )
+        elif do_sample and num_beams == 1:
+            logits_warper = self._get_logits_warper(
+                top_k=top_k, top_p=top_p, temperature=temperature
+            )
+            logits_processor = self._get_logits_processor(
+                no_repeat_ngram_size,
+                min_length,
+                max_length,
+                eos_token_id,
+                forced_bos_token_id,
+                forced_eos_token_id,
+            )
+            return self._sample(
+                input_ids,
+                max_length,
+                pad_token_id,
+                eos_token_id,
+                prng_key,
+                logits_warper=logits_warper,
+                logits_processor=logits_processor,
+                trace=trace,
+                params=params,
+                model_kwargs=model_kwargs,
+                condition_scale=condition_scale,
+                model_kwargs_uncond=model_kwargs_uncond,
+            )
+        elif not do_sample and num_beams > 1:
+            # broadcast input_ids & encoder_outputs
+            input_ids = self._expand_to_num_beams(input_ids, num_beams=num_beams)
+            if "encoder_outputs" in model_kwargs:
+                model_kwargs["encoder_outputs"][
+                    "last_hidden_state"
+                ] = self._expand_to_num_beams(
+                    model_kwargs["encoder_outputs"]["last_hidden_state"],
+                    num_beams=num_beams,
+                )
+            if "attention_mask" in model_kwargs:
+                model_kwargs["attention_mask"] = self._expand_to_num_beams(
+                    model_kwargs["attention_mask"], num_beams=num_beams
+                )
+            logits_processor = self._get_logits_processor(
+                no_repeat_ngram_size,
+                min_length,
+                max_length,
+                eos_token_id,
+                forced_bos_token_id,
+                forced_eos_token_id,
+            )
+            return self._beam_search(
+                input_ids,
+                max_length,
+                pad_token_id,
+                eos_token_id,
+                length_penalty=length_penalty,
+                early_stopping=early_stopping,
+                logits_processor=logits_processor,
+                trace=trace,
+                params=params,
+                model_kwargs=model_kwargs,
+            )
+        else:
+            raise NotImplementedError("`Beam sampling is currently not implemented.")
+    def _sample(
+        self,
+        input_ids: None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        prng_key: Optional[jnp.ndarray] = None,
+        logits_processor=None,
+        logits_warper=None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
+        condition_scale: float = 1.0,
+        model_kwargs_uncond: Optional[Dict[str, jnp.ndarray]] = None,
+    ):
+        """Sample token sequences, optionally with super conditioning.
+
+        Starting from `input_ids`, the model (`self.decode` for
+        encoder-decoder configs) is called step by step, the next token is
+        drawn with `jax.random.categorical`, and it is written into a
+        `(batch_size, max_length)` buffer pre-filled with `pad_token_id`.
+        The loop stops when `cur_len == max_length` or every sequence has
+        sampled `eos_token_id`.
+
+        When `condition_scale != 1.0` the model is additionally run with
+        `model_kwargs_uncond` and the logits are combined as
+        `logits_uncond + condition_scale * (logits - logits_uncond)`.
+
+        Args:
+            input_ids: Decoder start tokens; its `.shape` is unpacked as
+                `(batch_size, cur_len)`, so it must be a 2-D array despite
+                the `None` annotation.
+            max_length, pad_token_id, eos_token_id: Fall back to
+                `self.config` when `None`.
+            prng_key: Sampling key; defaults to `jax.random.PRNGKey(0)`.
+            logits_processor: Callable applied as
+                `(sequences, logits, cur_len)`.
+            logits_warper: Callable applied after the processor.
+            trace: If true the loop runs through `lax.while_loop`, otherwise
+                through `self._run_loop_in_debug`.
+            params: Model parameters passed to every model call.
+            model_kwargs: Keyword arguments for the conditional model call.
+            condition_scale: Super conditioning strength; 1.0 disables it.
+            model_kwargs_uncond: Keyword arguments for the unconditional
+                model call; must be a mapping when `condition_scale != 1.0`.
+
+        Returns:
+            `FlaxSampleOutput` whose `sequences` is the filled token buffer.
+        """
+        # init values
+        max_length = max_length if max_length is not None else self.config.max_length
+        pad_token_id = (
+            pad_token_id if pad_token_id is not None else self.config.pad_token_id
+        )
+        eos_token_id = (
+            eos_token_id if eos_token_id is not None else self.config.eos_token_id
+        )
+        prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
+        batch_size, cur_len = input_ids.shape
+        eos_token_id = jnp.array(eos_token_id)
+        pad_token_id = jnp.array(pad_token_id)
+        cur_len = jnp.array(cur_len)
+        # per batch-item holding current token in loop.
+        sequences = jnp.full((batch_size, max_length), pad_token_id, dtype=jnp.int32)
+        sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0))
+        # per batch-item state bit indicating if sentence has finished.
+        is_sent_finished = jnp.zeros((batch_size,), dtype=jnp.bool_)
+        # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
+        # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
+        model = self.decode if self.config.is_encoder_decoder else self
+        # initialize model specific kwargs
+        model_kwargs = self.prepare_inputs_for_generation(
+            input_ids, max_length, **model_kwargs
+        )
+        # NOTE(review): `model_kwargs_uncond` defaults to None, and `**None`
+        # raises TypeError, so callers must supply it with super conditioning.
+        if condition_scale != 1.0:
+            model_kwargs_uncond = self.prepare_inputs_for_generation(
+                input_ids, max_length, **model_kwargs_uncond
+            )
+        # initialize state
+        state = SampleState(
+            cur_len=cur_len,
+            sequences=sequences,
+            running_token=input_ids,
+            is_sent_finished=is_sent_finished,
+            prng_key=prng_key,
+            model_kwargs=model_kwargs,
+            model_kwargs_uncond=model_kwargs_uncond,
+        )
+        def sample_search_cond_fn(state):
+            """state termination condition fn."""
+            has_reached_max_length = state.cur_len == max_length
+            all_sequence_finished = jnp.all(state.is_sent_finished)
+            finish_generation = jnp.logical_or(
+                has_reached_max_length, all_sequence_finished
+            )
+            return ~finish_generation
+        def sample_search_body_fn(state):
+            """state update fn."""
+            # one half of the key is consumed by this step, the other is carried on
+            prng_key, prng_key_next = jax.random.split(state.prng_key)
+            model_outputs = model(
+                state.running_token, params=params, **state.model_kwargs
+            )
+            logits = model_outputs.logits[:, -1]
+            # perform super conditioning
+            # Source: @RiversHaveWings - https://twitter.com/RiversHaveWings/status/1478093658716966912?s=20&t=xdm-wZ61Wf7OLnE_NJHZ1w
+            if condition_scale != 1.0:
+                model_outputs_uncond = model(
+                    state.running_token, params=params, **state.model_kwargs_uncond
+                )
+                logits_uncond = model_outputs_uncond.logits[:, -1]
+                # extrapolate from the unconditional logits towards the conditional ones
+                logits = logits_uncond + condition_scale * (logits - logits_uncond)
+            else:
+                model_outputs_uncond = None
+            # apply min_length, ...
+            logits = logits_processor(state.sequences, logits, state.cur_len)
+            # apply top_k, top_p, temperature
+            # NOTE(review): the first argument is `logits`, not `state.sequences`
+            # as in the processor call above; presumably the warpers ignore it.
+            logits = logits_warper(logits, logits, state.cur_len)
+            next_token = jax.random.categorical(prng_key, logits, axis=-1)
+            next_is_sent_finished = state.is_sent_finished | (
+                next_token == eos_token_id
+            )
+            # finished sequences (including the step that sampled EOS) get the pad token
+            next_token = (
+                next_token * ~next_is_sent_finished
+                + pad_token_id * next_is_sent_finished
+            )
+            next_token = next_token[:, None]
+            next_sequences = lax.dynamic_update_slice(
+                state.sequences, next_token, (0, state.cur_len)
+            )
+            next_model_kwargs = self.update_inputs_for_generation(
+                model_outputs, state.model_kwargs
+            )
+            next_model_kwargs_uncond = (
+                self.update_inputs_for_generation(
+                    model_outputs_uncond, state.model_kwargs_uncond
+                )
+                if condition_scale != 1.0
+                else None
+            )
+            return SampleState(
+                cur_len=state.cur_len + 1,
+                sequences=next_sequences,
+                running_token=next_token,
+                is_sent_finished=next_is_sent_finished,
+                model_kwargs=next_model_kwargs,
+                model_kwargs_uncond=next_model_kwargs_uncond,
+                prng_key=prng_key_next,
+            )
+        # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
+        if input_ids.shape[1] > 1:
+            state = sample_search_body_fn(state)
+        if not trace:
+            state = self._run_loop_in_debug(
+                sample_search_cond_fn, sample_search_body_fn, state
+            )
+        else:
+            state = lax.while_loop(sample_search_cond_fn, sample_search_body_fn, state)
+        return FlaxSampleOutput(sequences=state.sequences)

src/dalle_mini/model/processor.py ADDED Viewed

	@@ -0,0 +1,58 @@

+""" DalleBart processor """
+import jax.numpy as jnp
+from .configuration import DalleBartConfig
+from .text import TextNormalizer
+from .tokenizer import DalleBartTokenizer
+from .utils import PretrainedFromWandbMixin
+class DalleBartProcessorBase:
+    """Turn text prompts into tokenized model inputs.
+
+    Optionally normalizes each prompt, tokenizes with padding/truncation to
+    `max_text_length`, and attaches the tokens of the empty prompt used for
+    super conditioning.
+    """
+    def __init__(
+        self, tokenizer: DalleBartTokenizer, normalize_text: bool, max_text_length: int
+    ):
+        """Store the settings and pre-tokenize the empty (unconditional) prompt."""
+        self.tokenizer = tokenizer
+        self.normalize_text = normalize_text
+        self.max_text_length = max_text_length
+        # `text_processor` only exists when normalization is enabled
+        if normalize_text:
+            self.text_processor = TextNormalizer()
+        # create unconditional tokens
+        uncond = self.tokenizer(
+            "",
+            return_tensors="jax",
+            padding="max_length",
+            truncation=True,
+            max_length=self.max_text_length,
+        ).data
+        self.input_ids_uncond = uncond["input_ids"]
+        self.attention_mask_uncond = uncond["attention_mask"]
+    def __call__(self, text: str = None):
+        """Tokenize a list of prompts.
+
+        Args:
+            text: List of prompt strings. NOTE(review): the annotation says
+                `str`, but a plain string is rejected below, and the default
+                `None` is not usable because the body iterates over `text`
+                and takes its length.
+
+        Returns:
+            The tokenizer output `.data` mapping, extended with
+            `input_ids_uncond` and `attention_mask_uncond`.
+        """
+        # check that text is not a string
+        assert not isinstance(text, str), "text must be a list of strings"
+        if self.normalize_text:
+            text = [self.text_processor(t) for t in text]
+        res = self.tokenizer(
+            text,
+            return_tensors="jax",
+            padding="max_length",
+            truncation=True,
+            max_length=self.max_text_length,
+        ).data
+        # tokens used only with super conditioning
+        n = len(text)
+        # repeat the empty-prompt tokens once per prompt along axis 0
+        # (assumes the tokenizer returned a leading batch axis of size 1 — TODO confirm)
+        res["input_ids_uncond"] = jnp.repeat(self.input_ids_uncond, n, axis=0)
+        res["attention_mask_uncond"] = jnp.repeat(self.attention_mask_uncond, n, axis=0)
+        return res
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        """Build a processor from a pretrained tokenizer and model config.
+
+        The same positional and keyword arguments are forwarded to both
+        `DalleBartTokenizer.from_pretrained` and
+        `DalleBartConfig.from_pretrained`; `normalize_text` and
+        `max_text_length` are read from the loaded config.
+        """
+        tokenizer = DalleBartTokenizer.from_pretrained(*args, **kwargs)
+        config = DalleBartConfig.from_pretrained(*args, **kwargs)
+        return cls(tokenizer, config.normalize_text, config.max_text_length)
+class DalleBartProcessor(PretrainedFromWandbMixin, DalleBartProcessorBase):
+    """`DalleBartProcessorBase` combined with `PretrainedFromWandbMixin`.
+
+    Adds no behavior of its own. NOTE(review): the mixin presumably lets
+    `from_pretrained` accept wandb artifact references — confirm in `.utils`.
+    """
+    pass

src/dalle_mini/{text.py → model/text.py} RENAMED Viewed

File without changes

tools/inference/inference_pipeline.ipynb CHANGED Viewed

@@ -75,7 +75,7 @@
     "# Model references\n",
     "\n",
     "# dalle-mini\n",
-    "DALLE_MODEL = \"dalle-mini/dalle-mini/model-1reghx5l:latest\"  # can be wandb artifact or 🤗 Hub or local folder\n",
     "DALLE_COMMIT_ID = None\n",
     "\n",
     "# VQGAN model\n",
@@ -126,7 +126,7 @@
    "outputs": [],
    "source": [
     "# Load models & tokenizer\n",
-    "from dalle_mini.model import DalleBart, DalleBartTokenizer\n",
     "from vqgan_jax.modeling_flax_vqgan import VQModel\n",
     "from transformers import CLIPProcessor, FlaxCLIPModel\n",
     "\n",
@@ -134,14 +134,13 @@
     "model = DalleBart.from_pretrained(\n",
     "    DALLE_MODEL, revision=DALLE_COMMIT_ID, dtype=dtype, abstract_init=True\n",
     ")\n",
-    "tokenizer = DalleBartTokenizer.from_pretrained(DALLE_MODEL, revision=DALLE_COMMIT_ID)\n",
     "\n",
     "# Load VQGAN\n",
     "vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
     "\n",
     "# Load CLIP\n",
     "clip = FlaxCLIPModel.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)\n",
-    "processor = CLIPProcessor.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)"
    ]
   },
   {
@@ -192,17 +191,18 @@
     "from functools import partial\n",
     "\n",
     "# model inference\n",
-    "@partial(jax.pmap, axis_name=\"batch\", static_broadcasted_argnums=(3, 4))\n",
-    "def p_generate(tokenized_prompt, key, params, top_k, top_p):\n",
     "    return model.generate(\n",
     "        **tokenized_prompt,\n",
-    "        do_sample=True,\n",
-    "        num_beams=1,\n",
     "        prng_key=key,\n",
     "        params=params,\n",
     "        top_k=top_k,\n",
     "        top_p=top_p,\n",
-    "        max_length=257\n",
     "    )\n",
     "\n",
     "\n",
@@ -258,7 +258,7 @@
     "id": "rsmj0Aj5OQox"
    },
    "source": [
-    "Our model may require to normalize the prompt."
    ]
   },
   {
@@ -269,9 +269,9 @@
    },
    "outputs": [],
    "source": [
-    "from dalle_mini.text import TextNormalizer\n",
     "\n",
-    "text_normalizer = TextNormalizer() if model.config.normalize_text else None"
    ]
   },
   {
@@ -291,7 +291,7 @@
    },
    "outputs": [],
    "source": [
-    "prompt = \"view of the beach during sunset\""
    ]
   },
   {
@@ -302,34 +302,7 @@
    },
    "outputs": [],
    "source": [
-    "processed_prompt = text_normalizer(prompt) if model.config.normalize_text else prompt\n",
-    "processed_prompt"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "QUzYACWxOe5z"
-   },
-   "source": [
-    "We tokenize the prompt."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "n8e7MvGwOe5z"
-   },
-   "outputs": [],
-   "source": [
-    "tokenized_prompt = tokenizer(\n",
-    "    processed_prompt,\n",
-    "    return_tensors=\"jax\",\n",
-    "    padding=\"max_length\",\n",
-    "    truncation=True,\n",
-    "    max_length=128,\n",
-    ").data\n",
     "tokenized_prompt"
    ]
   },
@@ -390,7 +363,9 @@
     "\n",
     "# We can customize top_k/top_p used for generating samples\n",
     "gen_top_k = None\n",
-    "gen_top_p = None"
    ]
   },
   {
@@ -413,7 +388,13 @@
     "    key, subkey = jax.random.split(key)\n",
     "    # generate images\n",
     "    encoded_images = p_generate(\n",
-    "        tokenized_prompt, shard_prng_key(subkey), model.params, gen_top_k, gen_top_p\n",
     "    )\n",
     "    # remove BOS\n",
     "    encoded_images = encoded_images.sequences[..., 1:]\n",
@@ -444,7 +425,7 @@
     "from flax.training.common_utils import shard\n",
     "\n",
     "# get clip scores\n",
-    "clip_inputs = processor(\n",
     "    text=[prompt] * jax.device_count(),\n",
     "    images=images,\n",
     "    return_tensors=\"np\",\n",

     "# Model references\n",
     "\n",
     "# dalle-mini\n",
+    "DALLE_MODEL = \"dalle-mini/dalle-mini/model-2vm4itcx:latest\"  # can be wandb artifact or 🤗 Hub or local folder or google bucket\n",
     "DALLE_COMMIT_ID = None\n",
     "\n",
     "# VQGAN model\n",
    "outputs": [],
    "source": [
     "# Load models & tokenizer\n",
+    "from dalle_mini import DalleBart, DalleBartProcessor\n",
     "from vqgan_jax.modeling_flax_vqgan import VQModel\n",
     "from transformers import CLIPProcessor, FlaxCLIPModel\n",
     "\n",
     "model = DalleBart.from_pretrained(\n",
     "    DALLE_MODEL, revision=DALLE_COMMIT_ID, dtype=dtype, abstract_init=True\n",
     ")\n",
     "\n",
     "# Load VQGAN\n",
     "vqgan = VQModel.from_pretrained(VQGAN_REPO, revision=VQGAN_COMMIT_ID)\n",
     "\n",
     "# Load CLIP\n",
     "clip = FlaxCLIPModel.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)\n",
+    "clip_processor = CLIPProcessor.from_pretrained(CLIP_REPO, revision=CLIP_COMMIT_ID)"
    ]
   },
   {
     "from functools import partial\n",
     "\n",
     "# model inference\n",
+    "@partial(jax.pmap, axis_name=\"batch\", static_broadcasted_argnums=(3, 4, 5, 6))\n",
+    "def p_generate(\n",
+    "    tokenized_prompt, key, params, top_k, top_p, temperature, condition_scale\n",
+    "):\n",
     "    return model.generate(\n",
     "        **tokenized_prompt,\n",
     "        prng_key=key,\n",
     "        params=params,\n",
     "        top_k=top_k,\n",
     "        top_p=top_p,\n",
+    "        temperature=temperature,\n",
+    "        condition_scale=condition_scale,\n",
     "    )\n",
     "\n",
     "\n",
     "id": "rsmj0Aj5OQox"
    },
    "source": [
+    "Our model requires processing prompts."
    ]
   },
   {
    },
    "outputs": [],
    "source": [
+    "from dalle_mini import DalleBartProcessor\n",
     "\n",
+    "processor = DalleBartProcessor.from_pretrained(DALLE_MODEL, revision=DALLE_COMMIT_ID)"
    ]
   },
   {
    },
    "outputs": [],
    "source": [
+    "prompt = \"a blue table\""
    ]
   },
   {
    },
    "outputs": [],
    "source": [
+    "tokenized_prompt = processor([prompt])\n",
     "tokenized_prompt"
    ]
   },
     "\n",
     "# We can customize top_k/top_p used for generating samples\n",
     "gen_top_k = None\n",
+    "gen_top_p = None\n",
+    "temperature = 0.85\n",
+    "cond_scale = 3.0"
    ]
   },
   {
     "    key, subkey = jax.random.split(key)\n",
     "    # generate images\n",
     "    encoded_images = p_generate(\n",
+    "        tokenized_prompt,\n",
+    "        shard_prng_key(subkey),\n",
+    "        model.params,\n",
+    "        gen_top_k,\n",
+    "        gen_top_p,\n",
+    "        temperature,\n",
+    "        cond_scale,\n",
     "    )\n",
     "    # remove BOS\n",
     "    encoded_images = encoded_images.sequences[..., 1:]\n",
     "from flax.training.common_utils import shard\n",
     "\n",
     "# get clip scores\n",
+    "clip_inputs = clip_processor(\n",
     "    text=[prompt] * jax.device_count(),\n",
     "    images=images,\n",
     "    return_tensors=\"np\",\n",

tools/train/train.py CHANGED Viewed

@@ -103,7 +103,7 @@ class ModelArguments:
     def __post_init__(self):
         if self.tokenizer_name is None:
-            self.tokenizer_name == self.model_name_or_path
             assert (
                 self.tokenizer_name is not None
             ), "Tokenizer name or model name/path needs to be specified"
@@ -209,6 +209,26 @@ class DataTrainingArguments:
             "help": "Probability of removing some captions for classifier-free guidance."
         },
     )
     max_train_samples: Optional[int] = field(
         default=None,
         metadata={

     def __post_init__(self):
         if self.tokenizer_name is None:
+            self.tokenizer_name = self.model_name_or_path
             assert (
                 self.tokenizer_name is not None
             ), "Tokenizer name or model name/path needs to be specified"
             "help": "Probability of removing some captions for classifier-free guidance."
         },
     )
+    clip_score_column: Optional[str] = field(
+        default="clip_score",
+        metadata={"help": "Column that containts clip score for filtering."},
+    )
+    min_clip_score: Optional[float] = field(
+        default=None,
+        metadata={"help": "Minimum clip score required."},
+    )
+    max_clip_score: Optional[float] = field(
+        default=None,
+        metadata={"help": "Maximum clip score required."},
+    )
+    filter_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "Column that containts classes to be filtered."},
+    )
+    filter_value: Optional[str] = field(
+        default=None,
+        metadata={"help": "Class value to be kept during filtering."},
+    )
     max_train_samples: Optional[int] = field(
         default=None,
         metadata={