BiliSakura committed on 6 days ago

Commit

b67e8f3

verified ·

1 Parent(s): b80fa12

Upload folder using huggingface_hub

Browse files

Files changed (39) hide show

.gitattributes +1 -0
MiniT2I-B-16/demo.png +3 -0
MiniT2I-B-16/model_index.json +26 -0
MiniT2I-B-16/pipeline.py +444 -0
MiniT2I-B-16/scheduler/scheduler_config.json +7 -0
MiniT2I-B-16/text_encoder/README.md +276 -0
MiniT2I-B-16/text_encoder/config.json +28 -0
MiniT2I-B-16/text_encoder/generation_config.json +7 -0
MiniT2I-B-16/text_encoder/model.safetensors +3 -0
MiniT2I-B-16/text_encoder/special_tokens_map.json +107 -0
MiniT2I-B-16/text_encoder/spiece.model +3 -0
MiniT2I-B-16/text_encoder/tokenizer.json +0 -0
MiniT2I-B-16/text_encoder/tokenizer_config.json +113 -0
MiniT2I-B-16/tokenizer/special_tokens_map.json +107 -0
MiniT2I-B-16/tokenizer/spiece.model +3 -0
MiniT2I-B-16/tokenizer/tokenizer.json +0 -0
MiniT2I-B-16/tokenizer/tokenizer_config.json +113 -0
MiniT2I-B-16/transformer/config.json +27 -0
MiniT2I-B-16/transformer/diffusion_pytorch_model.safetensors +3 -0
MiniT2I-B-16/transformer/transformer_minit2i.py +446 -0
MiniT2I-L-16/model_index.json +26 -0
MiniT2I-L-16/pipeline.py +444 -0
MiniT2I-L-16/scheduler/scheduler_config.json +7 -0
MiniT2I-L-16/text_encoder/README.md +276 -0
MiniT2I-L-16/text_encoder/config.json +28 -0
MiniT2I-L-16/text_encoder/generation_config.json +7 -0
MiniT2I-L-16/text_encoder/model.safetensors +3 -0
MiniT2I-L-16/text_encoder/special_tokens_map.json +107 -0
MiniT2I-L-16/text_encoder/spiece.model +3 -0
MiniT2I-L-16/text_encoder/tokenizer.json +0 -0
MiniT2I-L-16/text_encoder/tokenizer_config.json +113 -0
MiniT2I-L-16/tokenizer/special_tokens_map.json +107 -0
MiniT2I-L-16/tokenizer/spiece.model +3 -0
MiniT2I-L-16/tokenizer/tokenizer.json +0 -0
MiniT2I-L-16/tokenizer/tokenizer_config.json +113 -0
MiniT2I-L-16/transformer/config.json +27 -0
MiniT2I-L-16/transformer/diffusion_pytorch_model.safetensors +3 -0
MiniT2I-L-16/transformer/transformer_minit2i.py +446 -0
README.md +156 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+MiniT2I-B-16/demo.png filter=lfs diff=lfs merge=lfs -text

MiniT2I-B-16/demo.png ADDED Viewed

Git LFS Details

SHA256: 5f7ef1590783708ce7d2ece800ad0d48e76b71260ed5b818cc999e5c2a5e0952
Pointer size: 131 Bytes
Size of remote file: 489 kB

MiniT2I-B-16/model_index.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_class_name": [
+    "pipeline",
+    "MiniT2ITextToImagePipeline"
+  ],
+  "_diffusers_version": "0.32.0",
+  "default_num_inference_steps": 100,
+  "model_type": "b16",
+  "recommended_guidance_scale": 2.5,
+  "scheduler": [
+    "diffusers",
+    "FlowMatchEulerDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "T5EncoderModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "T5Tokenizer"
+  ],
+  "transformer": [
+    "transformer_minit2i",
+    "MiniT2IMMJiTModel"
+  ]
+}

MiniT2I-B-16/pipeline.py ADDED Viewed

	@@ -0,0 +1,444 @@

+"""Hub custom pipeline: MiniT2ITextToImagePipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+os.environ.setdefault("USE_FLAX", "0")
+os.environ.setdefault("TRANSFORMERS_NO_FLAX", "1")
+import torch
+from huggingface_hub import snapshot_download
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers import logging as transformers_logging
+transformers_logging.set_verbosity_error()
+# Fallback number of denoising steps: used as the default for the pipeline's
+# `default_num_inference_steps` config entry and when a scheduler config does not
+# provide `num_inference_steps`.
+DEFAULT_NUM_INFERENCE_STEPS = 100
+# Multiplier applied to freshly sampled Gaussian noise in `prepare_latents`
+# (user-supplied latents are not rescaled).
+NOISE_INIT_SCALE = 2.0
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+        >>> model_dir = Path("./minit2i-diffusers").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe.to("cuda")
+        >>> pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     "a cinematic portrait of a robot musician",
+        ...     num_inference_steps=100,
+        ...     guidance_scale=6.0,
+        ...     generator=generator,
+        ... ).images[0]
+        >>> image.save("demo.png")
+        ```
+"""
+# Accepted spellings of each MiniT2I variant mapped to its canonical name.
+# Keys are lower-case and use "-" instead of "_" because `resolve_model_type`
+# lower-cases its input and replaces "_" with "-" before looking it up here.
+MODEL_ALIASES: Dict[str, str] = {
+    "b": "minit2i-b-16",
+    "b16": "minit2i-b-16",
+    "b-16": "minit2i-b-16",
+    "base": "minit2i-b-16",
+    "minit2i-b16": "minit2i-b-16",
+    "minit2i-b-16": "minit2i-b-16",
+    "minit2i-b/16": "minit2i-b-16",
+    "l": "minit2i-l-16",
+    "l16": "minit2i-l-16",
+    "l-16": "minit2i-l-16",
+    "large": "minit2i-l-16",
+    "minit2i-l16": "minit2i-l-16",
+    "minit2i-l-16": "minit2i-l-16",
+    "minit2i-l/16": "minit2i-l-16",
+}
+def resolve_model_type(model_type: str) -> str:
+    key = model_type.lower().replace("_", "-")
+    if key not in MODEL_ALIASES:
+        choices = ", ".join(sorted(set(MODEL_ALIASES)))
+        raise ValueError(f"Unknown model_type={model_type!r}. Expected one of: {choices}")
+    return MODEL_ALIASES[key]
+class MiniT2ITextToImagePipeline(DiffusionPipeline):
+    r"""
+    Text-to-image pipeline for MiniT2I pixel-space flow matching.
+    Parameters:
+        transformer ([`MiniT2IMMJiTModel`]):
+            MiniT2I MM-JiT transformer that predicts flow-matching velocity in pixel space.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            Flow-matching Euler scheduler. Other [`KarrasDiffusionSchedulers`] can be swapped at inference time.
+        tokenizer ([`AutoTokenizer`], *optional*):
+            Tokenizer for the text encoder.
+        text_encoder ([`T5EncoderModel`], *optional*):
+            Text encoder used to embed prompts.
+    """
+    model_cpu_offload_seq = "text_encoder->transformer"
+    _optional_components = ["tokenizer", "text_encoder"]
+    def __init__(
+        self,
+        transformer,
+        scheduler,
+        tokenizer=None,
+        text_encoder=None,
+        text_encoder_name: str = "google/flan-t5-large",
+        model_type: str = "b16",
+        repo_id_or_path: Optional[str] = None,
+        default_num_inference_steps: int = DEFAULT_NUM_INFERENCE_STEPS,
+    ):
+        super().__init__()
+        if scheduler is None:
+            scheduler = self._default_inference_scheduler()
+        self.register_modules(
+            transformer=transformer,
+            scheduler=scheduler,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+        )
+        self.register_to_config(
+            text_encoder_name=text_encoder_name,
+            model_type=model_type,
+            repo_id_or_path=repo_id_or_path,
+            default_num_inference_steps=int(default_num_inference_steps),
+        )
+        self._variant_transformers: Dict[str, MiniT2IMMJiTModel] = {}
+        self._active_model_type = resolve_model_type(model_type)
+    @staticmethod
+    def _default_inference_scheduler() -> FlowMatchEulerDiscreteScheduler:
+        return FlowMatchEulerDiscreteScheduler(
+            num_train_timesteps=1000,
+            shift=1.0,
+            stochastic_sampling=False,
+        )
+    @classmethod
+    def _load_scheduler_from_dir(
+        cls,
+        scheduler_dir: Path,
+        model_kwargs: Dict[str, Any],
+    ) -> Tuple[KarrasDiffusionSchedulers, int]:
+        config_path = scheduler_dir / "scheduler_config.json"
+        if not config_path.exists():
+            return cls._default_inference_scheduler(), DEFAULT_NUM_INFERENCE_STEPS
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+        class_name = config.get("_class_name", "")
+        default_steps = int(config.get("num_inference_steps", DEFAULT_NUM_INFERENCE_STEPS))
+        if class_name == "MiniT2IFlowMatchScheduler":
+            return cls._default_inference_scheduler(), default_steps
+        schedulers_pkg = _hf["schedulers"]
+        if hasattr(schedulers_pkg, class_name):
+            scheduler_cls = getattr(schedulers_pkg, class_name)
+            return scheduler_cls.from_pretrained(str(scheduler_dir), **model_kwargs), default_steps
+        return cls._default_inference_scheduler(), default_steps
+    @staticmethod
+    def _resolve_transformer_path(root: Path, variant_dir: str) -> Path:
+        variant_transformer = root / variant_dir / "transformer"
+        if variant_transformer.exists():
+            return variant_transformer
+        root_transformer = root / "transformer"
+        if root_transformer.exists():
+            return root_transformer
+        raise FileNotFoundError(
+            f"Could not find transformer weights under {root}. "
+            f"Tried {variant_transformer} and {root_transformer}."
+        )
+    def _get_transformer(
+        self,
+        model_type: Optional[str],
+        repo_id_or_path: Optional[str],
+        torch_dtype: Optional[torch.dtype] = None,
+        variant: Optional[str] = None,
+    ) -> MiniT2IMMJiTModel:
+        active_type = resolve_model_type(model_type or self.config.model_type)
+        if active_type == self._active_model_type and self.transformer is not None:
+            return self.transformer
+        if active_type in self._variant_transformers:
+            return self._variant_transformers[active_type]
+        repo = repo_id_or_path or self.config.repo_id_or_path
+        if repo is None:
+            raise ValueError("model_type switching requires repo_id_or_path to be set on the pipeline.")
+        root = Path(repo)
+        if not root.exists():
+            root = Path(snapshot_download(repo_id=str(repo)))
+        transformer = MiniT2IMMJiTModel.from_pretrained(
+            self._resolve_transformer_path(root, active_type),
+            torch_dtype=torch_dtype,
+            variant=variant,
+        )
+        self._variant_transformers[active_type] = transformer
+        if active_type == resolve_model_type(self.config.model_type):
+            self.transformer = transformer
+            self._active_model_type = active_type
+        return transformer
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+    ) -> Dict[str, Any]:
+        kwargs: Dict[str, Any] = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        if "generator" in step_params:
+            kwargs["generator"] = generator
+        return kwargs
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        guidance_scale: float,
+        num_inference_steps: int,
+        output_type: str,
+    ) -> None:
+        if not isinstance(prompt, str) and not (isinstance(prompt, list) and all(isinstance(p, str) for p in prompt)):
+            raise TypeError(f"`prompt` must be a string or list of strings, got {type(prompt)}.")
+        if guidance_scale < 0:
+            raise ValueError(f"`guidance_scale` must be non-negative, got {guidance_scale}.")
+        if num_inference_steps <= 0:
+            raise ValueError(f"`num_inference_steps` must be positive, got {num_inference_steps}.")
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError(f"Unsupported `output_type`: {output_type}")
+    def prepare_latents(
+        self,
+        batch_size: int,
+        image_size: int,
+        in_channels: int,
+        device: torch.device,
+        dtype: torch.dtype,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        shape = (batch_size, in_channels, image_size, image_size)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = latents * NOISE_INIT_SCALE
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+            if tuple(latents.shape) != shape:
+                raise ValueError(f"Invalid `latents` shape: {tuple(latents.shape)}. Expected {shape}.")
+        return latents
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        transformer = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        transformer = transformer or self.transformer
+        if self.tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.config.text_encoder_name)
+        if self.text_encoder is None:
+            self.text_encoder = T5EncoderModel.from_pretrained(self.config.text_encoder_name)
+        if next(self.text_encoder.parameters()).device != device:
+            self.text_encoder.to(device)
+        cfg = transformer.mmjit_config
+        tokens = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=cfg.prompt_length,
+        )
+        input_ids = tokens.input_ids.to(device)
+        attn = tokens.attention_mask.to(device)
+        text = self.text_encoder(input_ids=input_ids, attention_mask=attn).last_hidden_state
+        return text, attn
+    @staticmethod
+    def _cfg_velocity(
+        transformer,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        text: torch.Tensor,
+        mask: torch.Tensor,
+        cfg_scale: float,
+    ) -> torch.Tensor:
+        batch_size = x.shape[0]
+        doubled_x = torch.cat([x, x], dim=0)
+        doubled_t = torch.cat([t, t], dim=0)
+        doubled_text = torch.cat([text, text], dim=0)
+        null_mask = torch.zeros_like(mask)
+        doubled_mask = torch.cat([mask, null_mask], dim=0)
+        velocity = transformer.pred_velocity(doubled_x, doubled_t, doubled_text, doubled_mask)
+        cond, uncond = velocity[:batch_size], velocity[batch_size:]
+        cfg_interval = transformer.mmjit_config.cfg_interval
+        use_cfg = ((t >= cfg_interval[0]) & (t <= cfg_interval[1])).to(velocity.dtype)
+        scale = torch.where(
+            use_cfg[:, None, None, None] > 0,
+            torch.tensor(cfg_scale, device=x.device, dtype=velocity.dtype),
+            torch.tensor(1.0, device=x.device, dtype=velocity.dtype),
+        )
+        return uncond + (cond - uncond) * scale
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        guidance_scale: float = 6.0,
+        num_inference_steps: Optional[int] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        progress: bool = True,
+        model_type: Optional[str] = None,
+        repo_id_or_path: Optional[str] = None,
+        variant: Optional[str] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Generate images from text prompts with MiniT2I.
+        Args:
+            prompt (`str` or `list[str]`):
+                Text prompt or batch of prompts.
+            num_images_per_prompt (`int`, defaults to `1`):
+                Number of images to generate per prompt.
+            guidance_scale (`float`, defaults to `6.0`):
+                Classifier-free guidance scale. CFG is active when `guidance_scale != 1.0`.
+            num_inference_steps (`int`, *optional*):
+                Number of denoising steps. Defaults to the pipeline config value.
+            generator (`torch.Generator`, *optional*):
+                RNG for reproducibility.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated pixel latents with shape `(batch, channels, height, width)`.
+            output_type (`str`, defaults to `"pil"`):
+                `"pil"`, `"np"`, `"pt"`, or `"latent"`.
+            return_dict (`bool`, defaults to `True`):
+                Return [`ImagePipelineOutput`] if True.
+            progress (`bool`, defaults to `True`):
+                Whether to show a progress bar during denoising.
+            model_type (`str`, *optional*):
+                MiniT2I variant alias such as `"b16"` or `"l16"`.
+            repo_id_or_path (`str`, *optional*):
+                Hub id or local path used when switching `model_type`.
+            variant (`str`, *optional*):
+                Weight variant passed to `from_pretrained`.
+            torch_dtype (`torch.dtype`, *optional*):
+                Optional dtype override when loading a different transformer variant.
+        """
+        num_inference_steps = int(num_inference_steps or self.config.default_num_inference_steps)
+        self.check_inputs(prompt, guidance_scale, num_inference_steps, output_type)
+        transformer = self._get_transformer(model_type, repo_id_or_path, torch_dtype=torch_dtype, variant=variant)
+        device = self._execution_device
+        transformer = transformer.to(device)
+        if isinstance(prompt, str):
+            prompt_batch = [prompt] * num_images_per_prompt
+        else:
+            prompt_batch = []
+            for entry in prompt:
+                prompt_batch.extend([entry] * num_images_per_prompt)
+        batch_size = len(prompt_batch)
+        mmjit_cfg = transformer.mmjit_config
+        model_dtype = next(transformer.parameters()).dtype
+        text, attn = self._encode_prompt(prompt_batch, device, transformer=transformer)
+        text = text.to(dtype=model_dtype)
+        attn = attn.to(dtype=model_dtype)
+        if getattr(self.scheduler.config, "stochastic_sampling", False):
+            raise ValueError(
+                "MiniT2I expects deterministic FlowMatchEulerDiscreteScheduler stepping "
+                "(scheduler.config.stochastic_sampling=False)."
+            )
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        num_train_timesteps = self.scheduler.config.num_train_timesteps
+        latents = self.prepare_latents(
+            batch_size=batch_size,
+            image_size=mmjit_cfg.image_size,
+            in_channels=mmjit_cfg.in_channels,
+            device=device,
+            dtype=model_dtype,
+            generator=generator,
+            latents=latents,
+        )
+        timesteps = self.scheduler.timesteps
+        if progress:
+            timesteps = self.progress_bar(timesteps)
+        using_cfg = guidance_scale != 1.0
+        for timestep in timesteps:
+            flow_time = 1.0 - float(timestep) / num_train_timesteps
+            t = torch.full((batch_size,), flow_time, device=device, dtype=model_dtype)
+            if using_cfg:
+                velocity = self._cfg_velocity(transformer, latents, t, text, attn, guidance_scale)
+            else:
+                velocity = transformer.pred_velocity(latents, t, text, attn)
+            # MiniT2I integrates velocity from noise (t=0) to data (t=1); flip sign for
+            # FlowMatchEulerDiscreteScheduler sigma decreasing from 1 to 0.
+            latents = self.scheduler.step(-velocity, timestep, latents, **extra_step_kwargs).prev_sample
+        if output_type == "latent":
+            images = latents
+        else:
+            images = (latents.clamp(-1, 1) * 127.5 + 128.0).clamp(0, 255).to(torch.uint8)
+            if output_type == "pt":
+                images = images.float() / 255.0
+            else:
+                images = images.permute(0, 2, 3, 1).cpu().numpy()
+                if output_type == "pil":
+                    images = [Image.fromarray(image) for image in images]
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (images,)
+        return ImagePipelineOutput(images=images)

MiniT2I-B-16/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.32.0",
+  "num_train_timesteps": 1000,
+  "shift": 1.0,
+  "stochastic_sampling": false
+}

MiniT2I-B-16/text_encoder/README.md ADDED Viewed

	@@ -0,0 +1,276 @@

+---
+language:
+- en
+- fr
+- ro
+- de
+- multilingual
+widget:
+- text: "Translate to German:  My name is Arthur"
+  example_title: "Translation"
+- text: "Please answer to the following question. Who is going to be the next Ballon d'or?"
+  example_title: "Question Answering"
+- text: "Q: Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering."
+  example_title: "Logical reasoning"
+- text: "Please answer the following question. What is the boiling point of Nitrogen?"
+  example_title: "Scientific knowledge"
+- text: "Answer the following yes/no question. Can you write a whole Haiku in a single tweet?"
+  example_title: "Yes/no question"
+- text: "Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?"
+  example_title: "Reasoning task"
+- text: "Q: ( False or not False or False ) is? A: Let's think step by step"
+  example_title: "Boolean Expressions"
+- text: "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
+  example_title: "Math reasoning"
+- text: "Premise:  At my age you will probably have learnt one lesson. Hypothesis:  It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?"
+  example_title: "Premise and hypothesis"
+tags:
+- text2text-generation
+datasets:
+- svakulenk0/qrecc
+- taskmaster2
+- djaym7/wiki_dialog
+- deepmind/code_contests
+- lambada
+- gsm8k
+- aqua_rat
+- esnli
+- quasc
+- qed
+license: apache-2.0
+---
+# Model Card for FLAN-T5 large
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/flan2_architecture.jpg"
+alt="drawing" width="600"/>
+#  Table of Contents
+0. [TL;DR](#tldr)
+1. [Model Details](#model-details)
+2. [Usage](#usage)
+3. [Uses](#uses)
+4. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
+5. [Training Details](#training-details)
+6. [Evaluation](#evaluation)
+7. [Environmental Impact](#environmental-impact)
+8. [Citation](#citation)
+9. [Model Card Authors](#model-card-authors)
+# TL;DR
+If you already know T5, FLAN-T5 is just better at everything. For the same number of parameters, these models have been fine-tuned on more than 1000 additional tasks, also covering more languages.
+As mentioned in the first few lines of the abstract :
+>  Flan-PaLM 540B achieves state-of-the-art performance on several benchmarks, such as 75.2% on five-shot MMLU. We also publicly release Flan-T5 checkpoints, which achieve strong few-shot performance even compared to much larger models, such as PaLM 62B. Overall, instruction finetuning is a general method for improving the performance and usability of pretrained language models.
+**Disclaimer**: Content from **this** model card has been written by the Hugging Face team, and parts of it were copy pasted from the [T5 model card](https://huggingface.co/t5-large).
+# Model Details
+## Model Description
+- **Model type:** Language model
+- **Language(s) (NLP):** English, Spanish, Japanese, Persian, Hindi, French, Chinese, Bengali, Gujarati, German, Telugu, Italian, Arabic, Polish, Tamil, Marathi, Malayalam, Oriya, Panjabi, Portuguese, Urdu, Galician, Hebrew, Korean, Catalan, Thai, Dutch, Indonesian, Vietnamese, Bulgarian, Filipino, Central Khmer, Lao, Turkish, Russian, Croatian, Swedish, Yoruba, Kurdish, Burmese, Malay, Czech, Finnish, Somali, Tagalog, Swahili, Sinhala, Kannada, Zhuang, Igbo, Xhosa, Romanian, Haitian, Estonian, Slovak, Lithuanian, Greek, Nepali, Assamese, Norwegian
+- **License:** Apache 2.0
+- **Related Models:** [All FLAN-T5 Checkpoints](https://huggingface.co/models?search=flan-t5)
+- **Original Checkpoints:** [All Original FLAN-T5 Checkpoints](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints)
+- **Resources for more information:**
+  - [Research paper](https://arxiv.org/pdf/2210.11416.pdf)
+  - [GitHub Repo](https://github.com/google-research/t5x)
+  - [Hugging Face FLAN-T5 Docs (Similar to T5) ](https://huggingface.co/docs/transformers/model_doc/t5)
+# Usage
+Find below some example scripts on how to use the model in `transformers`:
+## Using the Pytorch model
+### Running the model on a CPU
+<details>
+<summary> Click to expand </summary>
+```python
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+### Running the model on a GPU
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install accelerate
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+### Running the model on a GPU using different precisions
+#### FP16
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install accelerate
+import torch
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+#### INT8
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install bitsandbytes accelerate
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", load_in_8bit=True)
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+# Uses
+## Direct Use and Downstream Use
+The authors write in [the original paper's model card](https://arxiv.org/pdf/2210.11416.pdf) that:
+> The primary use is research on language models, including: research on zero-shot NLP tasks and in-context few-shot learning NLP tasks, such as reasoning, and question answering; advancing fairness and safety research, and understanding limitations of current large language models
+See the [research paper](https://arxiv.org/pdf/2210.11416.pdf) for further details.
+## Out-of-Scope Use
+More information needed.
+# Bias, Risks, and Limitations
+The information in this section is copied from the model's [official model card](https://arxiv.org/pdf/2210.11416.pdf):
+> Language models, including Flan-T5, can potentially be used for language generation in a harmful way, according to Rae et al. (2021). Flan-T5 should not be used directly in any application, without a prior assessment of safety and fairness concerns specific to the application.
+## Ethical considerations and risks
+> Flan-T5 is fine-tuned on a large corpus of text data that was not filtered for explicit content or assessed for existing biases. As a result the model itself is potentially vulnerable to generating equivalently inappropriate content or replicating inherent biases in the underlying data.
+## Known Limitations
+> Flan-T5 has not been tested in real world applications.
+## Sensitive Use:
+> Flan-T5 should not be applied for any unacceptable use cases, e.g., generation of abusive speech.
+# Training Details
+## Training Data
+The model was trained on a mixture of tasks, that includes the tasks described in the table below (from the original paper, figure 2):
+![table.png](https://s3.amazonaws.com/moonup/production/uploads/1666363265279-62441d1d9fdefb55a0b7d12c.png)
+## Training Procedure
+According to the model card from the [original paper](https://arxiv.org/pdf/2210.11416.pdf):
+> These models are based on pretrained T5 (Raffel et al., 2020) and fine-tuned with instructions for better zero-shot and few-shot performance. There is one fine-tuned Flan model per T5 model size.
+The model has been trained on TPU v3 or TPU v4 pods, using [`t5x`](https://github.com/google-research/t5x) codebase together with [`jax`](https://github.com/google/jax).
+# Evaluation
+## Testing Data, Factors & Metrics
+The authors evaluated the model on various tasks covering several languages (1836 in total). See the table below for some quantitative evaluation:
+![image.png](https://s3.amazonaws.com/moonup/production/uploads/1668072995230-62441d1d9fdefb55a0b7d12c.png)
+For full details, please check the [research paper](https://arxiv.org/pdf/2210.11416.pdf).
+## Results
+For full results for FLAN-T5-Large, see the [research paper](https://arxiv.org/pdf/2210.11416.pdf), Table 3.
+# Environmental Impact
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** Google Cloud TPU Pods - TPU v3 or TPU v4  | Number of chips ≥ 4.
+- **Hours used:** More information needed
+- **Cloud Provider:** GCP
+- **Compute Region:** More information needed
+- **Carbon Emitted:** More information needed
+# Citation
+**BibTeX:**
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2210.11416,
+  doi = {10.48550/ARXIV.2210.11416},
+  url = {https://arxiv.org/abs/2210.11416},
+  author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
+  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {Scaling Instruction-Finetuned Language Models},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {Creative Commons Attribution 4.0 International}
+}
+```

MiniT2I-B-16/text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "d_ff": 2816,
+  "d_kv": 64,
+  "d_model": 1024,
+  "decoder_start_token_id": 0,
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 24,
+  "num_heads": 16,
+  "num_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.23.1",
+  "use_cache": true,
+  "vocab_size": 32128
+}

MiniT2I-B-16/text_encoder/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.27.0.dev0"
+}

MiniT2I-B-16/text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9dd06ce490f139af36e9eb77dd3758b4fd07a08a73d5a1abe5ff2591e2d388e
+size 3132668804

MiniT2I-B-16/text_encoder/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

MiniT2I-B-16/text_encoder/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

MiniT2I-B-16/text_encoder/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

MiniT2I-B-16/text_encoder/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "name_or_path": "google/t5-v1_1-large",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/younes_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-large/snapshots/314bc112b191ec17b625ba81438dc73d6c23659d/special_tokens_map.json",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

MiniT2I-B-16/tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

MiniT2I-B-16/tokenizer/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

MiniT2I-B-16/tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

MiniT2I-B-16/tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "name_or_path": "google/t5-v1_1-large",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/younes_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-large/snapshots/314bc112b191ec17b625ba81438dc73d6c23659d/special_tokens_map.json",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

MiniT2I-B-16/transformer/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_class_name": "MiniT2IMMJiTModel",
+  "_diffusers_version": "0.35.2",
+  "cfg_channels": 3,
+  "cfg_interval": [
+    0.0,
+    1.0
+  ],
+  "cond_vec_size": 768,
+  "depth_double": 17,
+  "head_dim": 64,
+  "hidden_size": 768,
+  "image_size": 512,
+  "in_channels": 3,
+  "llm": "google/flan-t5-large",
+  "mlp_ratio": 2.6666666666666665,
+  "n_T": 100,
+  "num_heads": 12,
+  "patch_size": 16,
+  "pca_channels": 128,
+  "prediction": "x",
+  "prompt_length": 256,
+  "sampler": "euler",
+  "txt_hidden_size": 768,
+  "txt_input_size": 1024,
+  "txt_preamble_depth": 2
+}

MiniT2I-B-16/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5114de36379acd45f810001d9da82d48f96559ce4428cde7a79b1e724983ced1
+size 1032534472

MiniT2I-B-16/transformer/transformer_minit2i.py ADDED Viewed

	@@ -0,0 +1,446 @@

+import math
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from torch import nn
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+# Affine modulation: returns x * (1 + scale) + shift, with `shift` and `scale` indexed as
+# [:, None, :] so each is broadcast over dimension 1 of `x`.
+# assumes x is (batch, tokens, dim) and shift/scale are (batch, dim) — TODO confirm.
+# NOTE(review): no caller of this helper is visible in this part of the file.
+def modulate(x, shift, scale):
+    return x * (1 + scale[:, None, :]) + shift[:, None, :]
+# Rotary-embedding helper: splits the last dimension of `x` into two equal contiguous halves
+# (x1, x2) and returns them recombined as (-x2, x1) along that same dimension.
+# The last dimension must be even for the reshape to succeed.
+def rotate_half(x):
+    x1, x2 = x.reshape(*x.shape[:-1], 2, -1).unbind(dim=-2)
+    return torch.cat((-x2, x1), dim=-1)
+# Root-mean-square normalisation over the last dimension with a learned per-channel gain.
+# There is no mean subtraction and no bias term.
+class RMSNorm(nn.Module):
+    # dim: size of the last dimension being normalised; eps: added under the square root.
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        # Gain initialised to ones, so the layer starts as a pure normalisation.
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x):
+        # x / sqrt(mean(x^2) + eps), computed in the dtype of `x`.
+        y = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+        return y * self.weight
+# Embeds a 1-D tensor of timesteps: sinusoidal features followed by a two-layer MLP
+# (Linear -> SiLU -> Linear) that outputs `hidden_size` channels.
+class TimestepEmbedder(nn.Module):
+    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size),
+        )
+    def forward(self, t):
+        # `t` is indexed as t[:, None], i.e. one scalar timestep per batch element.
+        half = self.frequency_embedding_size // 2
+        # Geometric frequency ladder from 1 down towards 1/10000, always built in float32.
+        freqs = torch.exp(
+            -math.log(10000.0)
+            * torch.arange(half, device=t.device, dtype=torch.float32)
+            / half
+        )
+        args = t.float()[:, None] * freqs[None]
+        # Feature width is 2 * half, so an odd frequency_embedding_size would not match the
+        # first Linear layer's input width.
+        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        # Cast to the MLP weight dtype so reduced-precision weights accept the float32 features.
+        return self.mlp(emb.to(dtype=self.mlp[0].weight.dtype))
+# Patch embedding through a low-width bottleneck: a bias-free strided convolution maps each
+# patch_size x patch_size patch to `pca_channels`, then a 1x1 convolution expands that to
+# `hidden_size`.
+class BottleneckPatchEmbed(nn.Module):
+    def __init__(self, img_size=512, patch_size=16, in_channels=3, pca_channels=128, hidden_size=1248):
+        super().__init__()
+        # img_size is stored but not read by forward().
+        self.img_size = img_size
+        self.patch_size = patch_size
+        # kernel_size == stride, so patches do not overlap.
+        self.proj1 = nn.Conv2d(in_channels, pca_channels, kernel_size=patch_size, stride=patch_size, bias=False)
+        self.proj2 = nn.Conv2d(pca_channels, hidden_size, kernel_size=1, stride=1, bias=True)
+    def forward(self, x):
+        x = self.proj2(self.proj1(x))
+        # Flatten the two spatial axes into one token axis and move channels last:
+        # (B, C, H', W') -> (B, H' * W', C).
+        return x.flatten(2).transpose(1, 2)
+# Gated feed-forward layer: w2(silu(w1(x)) * w3(x)), with no bias on any projection.
+class SwiGLUMlp(nn.Module):
+    def __init__(self, in_features: int, hidden_features: int):
+        super().__init__()
+        # Round the requested hidden width up to the next multiple of 8.
+        hidden_dim = (hidden_features + 7) // 8 * 8
+        self.w1 = nn.Linear(in_features, hidden_dim, bias=False)
+        self.w3 = nn.Linear(in_features, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, in_features, bias=False)
+    def forward(self, x):
+        # w1 feeds the SiLU gate, w3 is the linear branch, w2 projects back to in_features.
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+# 1-D rotary position embedding. It has no parameters: the angles are recomputed on every
+# call from the input's own sequence length and last-dimension size.
+class TextRotaryEmbedding1D(nn.Module):
+    def __init__(self, head_dim: int, theta: float = 10000.0):
+        super().__init__()
+        # head_dim is stored but forward() takes the width from x.shape instead.
+        self.head_dim = head_dim
+        self.theta = theta
+    def forward(self, x):
+        # x is unpacked as (batch, length, heads, dim); positions are 0 .. length - 1.
+        b, length, h, d = x.shape
+        inv = 1.0 / (self.theta ** (torch.arange(0, d, 2, device=x.device, dtype=torch.float32) / d))
+        pos = torch.arange(length, device=x.device, dtype=torch.float32)
+        angles = torch.einsum("l,f->lf", pos, inv)
+        # Duplicate the d/2 angles so that channel i and channel i + d/2 share an angle,
+        # matching the half-split convention of rotate_half().
+        angles = torch.cat([angles, angles], dim=-1)
+        # Angles are computed in float32 and only cast to x's dtype at the end.
+        cos = angles.cos().to(dtype=x.dtype)
+        sin = angles.sin().to(dtype=x.dtype)
+        return x * cos[None, :, None, :] + rotate_half(x) * sin[None, :, None, :]
+# 2-D rotary position embedding for a square grid of image tokens. It has no parameters:
+# the grid side is derived from the token count on every call.
+class VisionRotaryEmbeddingFast(nn.Module):
+    def __init__(self, head_dim: int, theta: float = 10000.0):
+        super().__init__()
+        # Half of the head width is given to each of the two grid axes.
+        self.dim = head_dim // 2
+        self.theta = theta
+    def forward(self, x):
+        # x is indexed as (batch, tokens, heads, dim); the token axis must be a perfect square.
+        length = x.shape[1]
+        side = int(math.sqrt(length))
+        if side * side != length:
+            raise ValueError(f"image token length must be square, got {length}")
+        # One frequency per pair of channels within a single axis' share of the head.
+        freqs = 1.0 / (
+            self.theta
+            ** (torch.arange(0, self.dim, 2, device=x.device, dtype=torch.float32)[: self.dim // 2] / self.dim)
+        )
+        t = torch.arange(side, device=x.device, dtype=torch.float32)
+        base = torch.einsum("l,f->lf", t, freqs)
+        # f_h varies along the first grid axis and f_w along the second; both become (side, side, F).
+        f_h, f_w = torch.broadcast_tensors(base[:, None, :], base[None, :, :])
+        angles = torch.cat([f_h, f_w], dim=-1)
+        # Duplicate for the half-split convention of rotate_half(), then flatten the grid to
+        # one row per token (first grid axis is the slow index).
+        angles = torch.cat([angles, angles], dim=-1).reshape(length, -1)
+        cos = angles.cos().to(dtype=x.dtype)
+        sin = angles.sin().to(dtype=x.dtype)
+        return x * cos[None, :, None, :] + rotate_half(x) * sin[None, :, None, :]
+# Rotary embedding for a joint sequence laid out as [text tokens, image tokens] along
+# dimension 1: the text part gets 1-D RoPE and the image part gets 2-D RoPE.
+class MultiModalRotaryEmbeddingFast(nn.Module):
+    def __init__(self, head_dim: int):
+        super().__init__()
+        self.text_rope = TextRotaryEmbedding1D(head_dim)
+        self.vision_rope = VisionRotaryEmbeddingFast(head_dim)
+    def forward(self, x, txt_len: int):
+        # The first txt_len positions are text; everything after them is treated as the
+        # square image grid (vision_rope raises ValueError if that count is not a square).
+        txt = self.text_rope(x[:, :txt_len])
+        img = self.vision_rope(x[:, txt_len:])
+        return torch.cat([txt, img], dim=1)
+# Pre-norm transformer block for the text stream only: RMSNorm -> multi-head self-attention
+# (per-head RMSNorm on q and k, 1-D RoPE) -> residual, then RMSNorm -> SwiGLU MLP -> residual.
+# It takes no timestep or conditioning input.
+class PlainTextTransformerBlock(nn.Module):
+    def __init__(self, hidden_size=1248, num_heads=24, head_dim=52, mlp_ratio=2.7):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        # The attention width is num_heads * head_dim and need not equal hidden_size.
+        inner_dim = num_heads * head_dim
+        self.norm1 = RMSNorm(hidden_size)
+        self.norm2 = RMSNorm(hidden_size)
+        self.qkv = nn.Linear(hidden_size, inner_dim * 3)
+        self.attn_proj = nn.Linear(inner_dim, hidden_size)
+        self.mlp = SwiGLUMlp(hidden_size, int(hidden_size * mlp_ratio))
+        self.q_norm = RMSNorm(head_dim)
+        self.k_norm = RMSNorm(head_dim)
+        self.rope = TextRotaryEmbedding1D(head_dim)
+    def forward(self, txt):
+        b, length, _ = txt.shape
+        # One fused projection, then split into q / k / v of shape (b, length, heads, head_dim).
+        qkv = self.qkv(self.norm1(txt)).reshape(b, length, 3, self.num_heads, self.head_dim)
+        q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
+        q = self.rope(self.q_norm(q))
+        k = self.rope(self.k_norm(k))
+        # Dense softmax attention over every position; no attention mask is applied here.
+        attn = torch.einsum("bqhd,bkhd->bhqk", q, k) * (self.head_dim ** -0.5)
+        out = torch.einsum("bhqk,bkhd->bqhd", attn.softmax(dim=-1), v).reshape(b, length, -1)
+        txt = txt + self.attn_proj(out)
+        txt = txt + self.mlp(self.norm2(txt))
+        return txt
+# Two-stream transformer block: image and text tokens keep separate norms, qkv projections,
+# output projections and MLPs, but attend jointly over the concatenated [text, image] sequence.
+# The per-head q/k RMSNorm layers and the rotary embedding are shared by both streams.
+class DoubleStreamDiTBlock(nn.Module):
+    def __init__(self, hidden_size=1248, txt_hidden_size=1248, num_heads=24, head_dim=52, mlp_ratio=2.7):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.txt_hidden_size = txt_hidden_size
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        # Both streams project to the same attention width so their heads can be concatenated.
+        inner_dim = num_heads * head_dim
+        self.img_norm1 = RMSNorm(hidden_size)
+        self.img_norm2 = RMSNorm(hidden_size)
+        self.txt_norm1 = RMSNorm(txt_hidden_size)
+        self.txt_norm2 = RMSNorm(txt_hidden_size)
+        self.img_qkv = nn.Linear(hidden_size, inner_dim * 3)
+        self.txt_qkv = nn.Linear(txt_hidden_size, inner_dim * 3)
+        self.q_norm = RMSNorm(head_dim)
+        self.k_norm = RMSNorm(head_dim)
+        self.rope = MultiModalRotaryEmbeddingFast(head_dim)
+        self.img_attn_proj = nn.Linear(inner_dim, hidden_size)
+        self.txt_attn_proj = nn.Linear(inner_dim, txt_hidden_size)
+        self.img_mlp = SwiGLUMlp(hidden_size, int(hidden_size * mlp_ratio))
+        self.txt_mlp = SwiGLUMlp(txt_hidden_size, int(txt_hidden_size * mlp_ratio))
+    # x: image tokens (b, li, hidden_size); txt: text tokens (b, lt, txt_hidden_size).
+    # NOTE(review): `vec` is accepted but never read in this method, so the conditioning
+    # vector does not influence the block as written — confirm this is intended.
+    # Returns the updated (x, txt) pair.
+    def forward(self, x, txt, vec):
+        b, li, _ = x.shape
+        lt = txt.shape[1]
+        x_norm = self.img_norm1(x)
+        txt_norm = self.txt_norm1(txt)
+        qkv_i = self.img_qkv(x_norm).reshape(b, li, 3, self.num_heads, self.head_dim)
+        qkv_t = self.txt_qkv(txt_norm).reshape(b, lt, 3, self.num_heads, self.head_dim)
+        q_i, k_i, v_i = qkv_i[:, :, 0], qkv_i[:, :, 1], qkv_i[:, :, 2]
+        q_t, k_t, v_t = qkv_t[:, :, 0], qkv_t[:, :, 1], qkv_t[:, :, 2]
+        q_i, k_i = self.q_norm(q_i), self.k_norm(k_i)
+        q_t, k_t = self.q_norm(q_t), self.k_norm(k_t)
+        # Joint sequence order is text first, then image; rope needs lt to find the boundary.
+        q = self.rope(torch.cat([q_t, q_i], dim=1), txt_len=lt)
+        k = self.rope(torch.cat([k_t, k_i], dim=1), txt_len=lt)
+        v = torch.cat([v_t, v_i], dim=1)
+        # Dense softmax attention across all text and image positions, with no mask.
+        attn = torch.einsum("bqhd,bkhd->bhqk", q, k) * (self.head_dim ** -0.5)
+        out = torch.einsum("bhqk,bkhd->bqhd", attn.softmax(dim=-1), v)
+        # Split the attention output back into its two streams before the residual adds.
+        x = x + self.img_attn_proj(out[:, lt:].reshape(b, li, -1))
+        txt = txt + self.txt_attn_proj(out[:, :lt].reshape(b, lt, -1))
+        x = x + self.img_mlp(self.img_norm2(x))
+        txt = txt + self.txt_mlp(self.txt_norm2(txt))
+        return x, txt
+# Output head: RMSNorm followed by a linear projection of every token to
+# patch_size * patch_size * out_channels values (one flattened patch per token).
+class FinalLayer(nn.Module):
+    def __init__(self, hidden_size=1248, patch_size=16, out_channels=3):
+        super().__init__()
+        self.patch_size = patch_size
+        self.out_channels = out_channels
+        self.norm_final = RMSNorm(hidden_size)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels)
+    # `vec` is accepted for call-site compatibility but is ignored by this implementation.
+    def forward(self, x, vec=None):
+        return self.linear(self.norm_final(x))
+# Fixed 2-D sine/cosine position table for a square grid_size x grid_size grid.
+# Returns a tensor of shape (grid_size * grid_size, embed_dim) in the requested dtype; each
+# grid axis contributes embed_dim // 2 channels, so embed_dim should be divisible by 4 for
+# the output width to equal embed_dim exactly.
+def get_2d_sincos_pos_embed(embed_dim, grid_size, device, dtype):
+    # Coordinates are generated in float32 regardless of the requested output dtype.
+    grid_h = torch.arange(grid_size, device=device, dtype=torch.float32)
+    grid_w = torch.arange(grid_size, device=device, dtype=torch.float32)
+    grid = torch.meshgrid(grid_w, grid_h, indexing="xy")
+    grid = torch.stack(grid, dim=0).reshape(2, 1, grid_size, grid_size)
+    # NOTE(review): with indexing="xy" the first meshgrid output (grid[0]) is built from
+    # grid_w, yet it feeds the variable named emb_h. Both axes have the same length so the
+    # shapes are unaffected; verify against trained weights before "fixing" the naming.
+    emb_h = get_1d_sincos_pos_embed(embed_dim // 2, grid[0])
+    emb_w = get_1d_sincos_pos_embed(embed_dim // 2, grid[1])
+    return torch.cat([emb_h, emb_w], dim=1).to(dtype=dtype)
+# 1-D sine/cosine embedding of the positions in `pos` (flattened to M values).
+# Returns shape (M, 2 * (embed_dim // 2)): all sine channels first, then all cosine channels.
+def get_1d_sincos_pos_embed(embed_dim, pos):
+    omega = torch.arange(embed_dim // 2, device=pos.device, dtype=torch.float32)
+    # Frequencies fall geometrically from 1 towards 1/10000.
+    omega = 1.0 / (10000 ** (omega / (embed_dim / 2.0)))
+    out = torch.einsum("m,d->md", pos.reshape(-1), omega)
+    return torch.cat([out.sin(), out.cos()], dim=1)
+# Plain-dataclass hyper-parameter bundle consumed by MMJiT and DiffusionModel.
+# The defaults mirror the defaults of MiniT2IMMJiTModel.__init__ (mlp_ratio is written here
+# as the rounded 2.6667).
+# NOTE(review): prompt_length, prediction, sampler, cfg_channels and llm are not read by any
+# code visible in this part of the file — presumably metadata for the pipeline; confirm.
+@dataclass
+class MMJiTConfig:
+    # Image side length in pixels and patch side length; their ratio is the token-grid side.
+    image_size: int = 512
+    patch_size: int = 16
+    in_channels: int = 3
+    # Width of the incoming text-encoder features.
+    txt_input_size: int = 1024
+    # Widths of the image stream, text stream and conditioning vector.
+    hidden_size: int = 768
+    txt_hidden_size: int = 768
+    cond_vec_size: int = 768
+    # Number of DoubleStreamDiTBlock layers and of text-only preamble layers.
+    depth_double: int = 17
+    txt_preamble_depth: int = 2
+    num_heads: int = 12
+    head_dim: int = 64
+    mlp_ratio: float = 2.6667
+    # Bottleneck width inside BottleneckPatchEmbed.
+    pca_channels: int = 128
+    prompt_length: int = 256
+    # Number of Euler steps taken by DiffusionModel.sample.
+    n_T: int = 100
+    prediction: str = "x"
+    sampler: str = "euler"
+    cfg_channels: int = 3
+    # Inclusive [low, high] range of t in which the guidance scale is applied.
+    cfg_interval: tuple = (0.0, 1.0)
+    llm: str = "google/flan-t5-large"
+class MMJiT(nn.Module):
+    def __init__(self, cfg: MMJiTConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.latent_img_size = cfg.image_size // cfg.patch_size
+        self.img_embedder = BottleneckPatchEmbed(
+            cfg.image_size, cfg.patch_size, cfg.in_channels, cfg.pca_channels, cfg.hidden_size
+        )
+        self.txt_embedder = nn.Linear(cfg.txt_input_size, cfg.txt_hidden_size, bias=False)
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, cfg.txt_input_size))
+        self.t_embedder = TimestepEmbedder(cfg.cond_vec_size)
+        self.pooled_embedder = nn.Linear(cfg.txt_input_size, cfg.cond_vec_size, bias=False)
+        self.txt_preamble_blocks = nn.ModuleList(
+            [
+                PlainTextTransformerBlock(cfg.txt_hidden_size, cfg.num_heads, cfg.head_dim, cfg.mlp_ratio)
+                for _ in range(cfg.txt_preamble_depth)
+            ]
+        )
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamDiTBlock(
+                    cfg.hidden_size, cfg.txt_hidden_size, cfg.num_heads, cfg.head_dim, cfg.mlp_ratio
+                )
+                for _ in range(cfg.depth_double)
+            ]
+        )
+        self.final_layer = FinalLayer(cfg.hidden_size, cfg.patch_size, cfg.in_channels)
+    def unpatchify(self, x):
+        b = x.shape[0]
+        p = self.cfg.patch_size
+        c = self.cfg.in_channels
+        h = w = int(math.sqrt(x.shape[1]))
+        x = x.reshape(b, h, w, p, p, c)
+        x = torch.einsum("nhwpqc->nchpwq", x)
+        return x.reshape(b, c, h * p, w * p)
+    def forward(self, img, t, context, attn_mask):
+        if img.ndim == 4 and img.shape[1] != self.cfg.in_channels:
+            img = img.permute(0, 3, 1, 2)
+        attn_mask = attn_mask.to(device=context.device)
+        context = torch.where(attn_mask[:, :, None] > 0.5, context, self.mask_token.to(dtype=context.dtype))
+        x = self.img_embedder(img)
+        pos = get_2d_sincos_pos_embed(self.cfg.hidden_size, self.latent_img_size, x.device, x.dtype)
+        x = x + pos[None]
+        t_vec = self.t_embedder(t)
+        txt = self.txt_embedder(context.to(dtype=self.txt_embedder.weight.dtype))
+        pooled_text = context.mean(dim=1)
+        vec = t_vec + self.pooled_embedder(pooled_text.to(dtype=self.pooled_embedder.weight.dtype))
+        for block in self.txt_preamble_blocks:
+            txt = block(txt)
+        for block in self.double_blocks:
+            x, txt = block(x, txt, vec)
+        combined = torch.cat([txt, x], dim=1)
+        out = self.final_layer(combined, vec)
+        img_out = out[:, txt.shape[1] :, :]
+        return self.unpatchify(img_out)
+# Sampling wrapper around MMJiT: turns the network output into a velocity, combines
+# conditional and unconditional predictions (classifier-free guidance) and integrates from
+# t = 0 (scaled Gaussian noise) to t = 1 with fixed-step Euler updates.
+class DiffusionModel(nn.Module):
+    def __init__(self, cfg: Optional[MMJiTConfig] = None):
+        super().__init__()
+        # Fall back to the dataclass defaults when no config is supplied.
+        self.cfg = cfg or MMJiTConfig()
+        self.net = MMJiT(self.cfg)
+    # Identity mapping from the sampler's time to the value fed to the timestep embedder.
+    def real_t_to_embed_t(self, t):
+        return t
+    # The network output is treated as the clean-image estimate x0; the velocity is
+    # (x0 - x) / (1 - t), with the denominator clamped to at least 0.05 so it stays bounded
+    # as t approaches 1. `t` is indexed as a 1-D per-sample tensor.
+    def pred_velocity(self, x, t, text, mask):
+        x0 = self.net(x, self.real_t_to_embed_t(t), text, mask)
+        return (x0 - x) / torch.clamp(1 - t[:, None, None, None], min=0.05)
+    # Classifier-free guidance in one forward pass over a doubled batch: the first half uses
+    # the real mask, the second half an all-zero mask (MMJiT.forward then replaces every text
+    # position with its mask token, which serves as the unconditional branch).
+    def cfg_velocity(self, x, t, text, mask, cfg_scale: float):
+        b = x.shape[0]
+        xx = torch.cat([x, x], dim=0)
+        tt = torch.cat([t, t], dim=0)
+        yy = torch.cat([text, text], dim=0)
+        mm = torch.cat([mask, torch.zeros_like(mask)], dim=0)
+        out = self.pred_velocity(xx, tt, yy, mm)
+        cond, uncond = out[:b], out[b:]
+        # Guidance is applied only while t lies inside cfg_interval (both ends inclusive);
+        # outside it the scale is 1.0, which reduces the result to the conditional velocity.
+        use_cfg = ((t >= self.cfg.cfg_interval[0]) & (t <= self.cfg.cfg_interval[1])).to(out.dtype)
+        scale = torch.where(
+            use_cfg[:, None, None, None] > 0,
+            torch.tensor(cfg_scale, device=x.device, dtype=out.dtype),
+            torch.tensor(1.0, device=x.device, dtype=out.dtype),
+        )
+        return uncond + (cond - uncond) * scale
+    # Generates one image per row of `text`.
+    # text / mask: text-encoder features and their attention mask (cast to the model dtype).
+    # cfg_scale:   guidance scale passed to cfg_velocity.
+    # generator:   optional torch.Generator for the initial noise.
+    # progress:    wrap the step loop in a tqdm progress bar (tqdm is imported lazily).
+    # Returns a tensor of shape (batch, in_channels, image_size, image_size).
+    @torch.no_grad()
+    def sample(self, text, mask, cfg_scale=6.0, generator=None, progress=False):
+        b = text.shape[0]
+        device = text.device
+        # Work in the dtype of the first model parameter.
+        dtype = next(self.parameters()).dtype
+        # Initial state is standard Gaussian noise multiplied by 2.
+        x = torch.randn(
+            b,
+            self.cfg.in_channels,
+            self.cfg.image_size,
+            self.cfg.image_size,
+            generator=generator,
+            device=device,
+            dtype=dtype,
+        ) * 2
+        # n_T + 1 evenly spaced grid points from 0 to 1, i.e. n_T Euler steps.
+        # NOTE(review): the grid is created in the parameter dtype, so with a low-precision
+        # dtype such as bfloat16 the grid points (and step sizes) are rounded to that
+        # precision — confirm this is acceptable.
+        timesteps = torch.linspace(0.0, 1.0, self.cfg.n_T + 1, device=device, dtype=dtype)
+        iterator = range(self.cfg.n_T)
+        if progress:
+            from tqdm.auto import tqdm
+            iterator = tqdm(iterator)
+        for i in iterator:
+            t_cur = timesteps[i].expand(b)
+            t_next = timesteps[i + 1].expand(b)
+            v = self.cfg_velocity(x, t_cur, text.to(dtype), mask.to(dtype), cfg_scale)
+            # Explicit Euler update: x <- x + dt * v.
+            x = x + (t_next - t_cur)[:, None, None, None] * v
+        return x
+# diffusers-facing wrapper: exposes the hyper-parameters through register_to_config and
+# delegates all computation to an inner DiffusionModel stored as `self.model`.
+class MiniT2IMMJiTModel(ModelMixin, ConfigMixin):
+    """MiniT2I MM-JiT transformer for pixel-space flow matching."""
+    config_name = "config.json"
+    # The keyword arguments map one-to-one onto the fields of MMJiTConfig.
+    @register_to_config
+    def __init__(
+        self,
+        image_size: int = 512,
+        patch_size: int = 16,
+        in_channels: int = 3,
+        txt_input_size: int = 1024,
+        hidden_size: int = 768,
+        txt_hidden_size: int = 768,
+        cond_vec_size: int = 768,
+        depth_double: int = 17,
+        txt_preamble_depth: int = 2,
+        num_heads: int = 12,
+        head_dim: int = 64,
+        mlp_ratio: float = 2.6666666666666665,
+        pca_channels: int = 128,
+        prompt_length: int = 256,
+        n_T: int = 100,
+        prediction: str = "x",
+        sampler: str = "euler",
+        cfg_channels: int = 3,
+        cfg_interval: tuple = (0.0, 1.0),
+        llm: str = "google/flan-t5-large",
+    ):
+        super().__init__()
+        cfg = MMJiTConfig(
+            image_size=image_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            txt_input_size=txt_input_size,
+            hidden_size=hidden_size,
+            txt_hidden_size=txt_hidden_size,
+            cond_vec_size=cond_vec_size,
+            depth_double=depth_double,
+            txt_preamble_depth=txt_preamble_depth,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            mlp_ratio=mlp_ratio,
+            pca_channels=pca_channels,
+            prompt_length=prompt_length,
+            n_T=n_T,
+            prediction=prediction,
+            sampler=sampler,
+            cfg_channels=cfg_channels,
+            # config.json stores this as a JSON list; normalise it to a tuple.
+            cfg_interval=tuple(cfg_interval),
+            llm=llm,
+        )
+        self.model = DiffusionModel(cfg)
+    # The MMJiTConfig instance held by the inner DiffusionModel.
+    @property
+    def mmjit_config(self) -> MMJiTConfig:
+        return self.model.cfg
+    # Raw network call (no velocity conversion, no guidance): see MMJiT.forward.
+    def forward(self, img, t, context, attn_mask):
+        return self.model.net(img, t, context, attn_mask)
+    # Single conditional velocity prediction: see DiffusionModel.pred_velocity.
+    def pred_velocity(self, x, t, text, mask):
+        return self.model.pred_velocity(x, t, text, mask)
+    # Full guided Euler sampling loop: see DiffusionModel.sample.
+    def sample(self, text, mask, cfg_scale=6.0, generator=None, progress=False):
+        return self.model.sample(text, mask, cfg_scale=cfg_scale, generator=generator, progress=progress)

MiniT2I-L-16/model_index.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_class_name": [
+    "pipeline",
+    "MiniT2ITextToImagePipeline"
+  ],
+  "_diffusers_version": "0.32.0",
+  "default_num_inference_steps": 100,
+  "model_type": "l16",
+  "recommended_guidance_scale": 6.0,
+  "scheduler": [
+    "diffusers",
+    "FlowMatchEulerDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "T5EncoderModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "T5Tokenizer"
+  ],
+  "transformer": [
+    "transformer_minit2i",
+    "MiniT2IMMJiTModel"
+  ]
+}

MiniT2I-L-16/pipeline.py ADDED Viewed

	@@ -0,0 +1,444 @@

+"""Hub custom pipeline: MiniT2ITextToImagePipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+os.environ.setdefault("USE_FLAX", "0")
+os.environ.setdefault("TRANSFORMERS_NO_FLAX", "1")
+import torch
+from huggingface_hub import snapshot_download
+from PIL import Image
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers import logging as transformers_logging
+transformers_logging.set_verbosity_error()
+# Fallback number of denoising steps; used as the default for the pipeline's
+# `default_num_inference_steps` argument and when a scheduler config does not specify one.
+DEFAULT_NUM_INFERENCE_STEPS = 100
+# Multiplier applied to freshly sampled noise in `prepare_latents`.
+NOISE_INIT_SCALE = 2.0
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+        >>> model_dir = Path("./minit2i-diffusers").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe.to("cuda")
+        >>> pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config)
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     "a cinematic portrait of a robot musician",
+        ...     num_inference_steps=100,
+        ...     guidance_scale=6.0,
+        ...     generator=generator,
+        ... ).images[0]
+        >>> image.save("demo.png")
+        ```
+"""
+# Accepted spellings of each model variant mapped to its canonical key. Keys are written in
+# the normalized form produced by `resolve_model_type` (lower-case, "_" replaced by "-").
+MODEL_ALIASES: Dict[str, str] = {
+    "b": "minit2i-b-16",
+    "b16": "minit2i-b-16",
+    "b-16": "minit2i-b-16",
+    "base": "minit2i-b-16",
+    "minit2i-b16": "minit2i-b-16",
+    "minit2i-b-16": "minit2i-b-16",
+    "minit2i-b/16": "minit2i-b-16",
+    "l": "minit2i-l-16",
+    "l16": "minit2i-l-16",
+    "l-16": "minit2i-l-16",
+    "large": "minit2i-l-16",
+    "minit2i-l16": "minit2i-l-16",
+    "minit2i-l-16": "minit2i-l-16",
+    "minit2i-l/16": "minit2i-l-16",
+}
+def resolve_model_type(model_type: str) -> str:
+    key = model_type.lower().replace("_", "-")
+    if key not in MODEL_ALIASES:
+        choices = ", ".join(sorted(set(MODEL_ALIASES)))
+        raise ValueError(f"Unknown model_type={model_type!r}. Expected one of: {choices}")
+    return MODEL_ALIASES[key]
+class MiniT2ITextToImagePipeline(DiffusionPipeline):
+    r"""
+    Text-to-image pipeline for MiniT2I pixel-space flow matching.
+    Parameters:
+        transformer ([`MiniT2IMMJiTModel`]):
+            MiniT2I MM-JiT transformer that predicts flow-matching velocity in pixel space.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            Flow-matching Euler scheduler. Other [`KarrasDiffusionSchedulers`] can be swapped at inference time.
+        tokenizer ([`AutoTokenizer`], *optional*):
+            Tokenizer for the text encoder.
+        text_encoder ([`T5EncoderModel`], *optional*):
+            Text encoder used to embed prompts.
+    """
+    model_cpu_offload_seq = "text_encoder->transformer"
+    _optional_components = ["tokenizer", "text_encoder"]
+    def __init__(
+        self,
+        transformer,
+        scheduler,
+        tokenizer=None,
+        text_encoder=None,
+        text_encoder_name: str = "google/flan-t5-large",
+        model_type: str = "b16",
+        repo_id_or_path: Optional[str] = None,
+        default_num_inference_steps: int = DEFAULT_NUM_INFERENCE_STEPS,
+    ):
+        r"""
+        Register the pipeline components and record the pipeline-level settings.
+        Args:
+            transformer:
+                Model registered as `self.transformer`.
+            scheduler:
+                Scheduler registered as `self.scheduler`. When `None`, the scheduler returned by
+                `_default_inference_scheduler()` is used instead.
+            tokenizer (*optional*):
+                Registered as `self.tokenizer`; may be `None`.
+            text_encoder (*optional*):
+                Registered as `self.text_encoder`; may be `None`.
+            text_encoder_name (`str`, defaults to `"google/flan-t5-large"`):
+                Stored in the pipeline config as `text_encoder_name`.
+            model_type (`str`, defaults to `"b16"`):
+                Variant alias. Stored in the config as given and also resolved to its canonical
+                key with `resolve_model_type`.
+            repo_id_or_path (`str`, *optional*):
+                Stored in the pipeline config as `repo_id_or_path`.
+            default_num_inference_steps (`int`):
+                Converted with `int()` and stored in the pipeline config.
+        Raises:
+            ValueError: If `model_type` is not a known alias (raised by `resolve_model_type`).
+        """
+        super().__init__()
+        # A missing scheduler is replaced by the built-in flow-matching default.
+        if scheduler is None:
+            scheduler = self._default_inference_scheduler()
+        self.register_modules(
+            transformer=transformer,
+            scheduler=scheduler,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+        )
+        self.register_to_config(
+            text_encoder_name=text_encoder_name,
+            model_type=model_type,
+            repo_id_or_path=repo_id_or_path,
+            default_num_inference_steps=int(default_num_inference_steps),
+        )
+        # Cache of transformers loaded on demand, keyed by canonical variant key.
+        self._variant_transformers: Dict[str, MiniT2IMMJiTModel] = {}
+        # Canonical key of the variant that `self.transformer` corresponds to.
+        self._active_model_type = resolve_model_type(model_type)
+    @staticmethod
+    def _default_inference_scheduler() -> FlowMatchEulerDiscreteScheduler:
+        return FlowMatchEulerDiscreteScheduler(
+            num_train_timesteps=1000,
+            shift=1.0,
+            stochastic_sampling=False,
+        )
+    @classmethod
+    def _load_scheduler_from_dir(
+        cls,
+        scheduler_dir: Path,
+        model_kwargs: Dict[str, Any],
+    ) -> Tuple[KarrasDiffusionSchedulers, int]:
+        config_path = scheduler_dir / "scheduler_config.json"
+        if not config_path.exists():
+            return cls._default_inference_scheduler(), DEFAULT_NUM_INFERENCE_STEPS
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+        class_name = config.get("_class_name", "")
+        default_steps = int(config.get("num_inference_steps", DEFAULT_NUM_INFERENCE_STEPS))
+        if class_name == "MiniT2IFlowMatchScheduler":
+            return cls._default_inference_scheduler(), default_steps
+        schedulers_pkg = _hf["schedulers"]
+        if hasattr(schedulers_pkg, class_name):
+            scheduler_cls = getattr(schedulers_pkg, class_name)
+            return scheduler_cls.from_pretrained(str(scheduler_dir), **model_kwargs), default_steps
+        return cls._default_inference_scheduler(), default_steps
+    @staticmethod
+    def _resolve_transformer_path(root: Path, variant_dir: str) -> Path:
+        variant_transformer = root / variant_dir / "transformer"
+        if variant_transformer.exists():
+            return variant_transformer
+        root_transformer = root / "transformer"
+        if root_transformer.exists():
+            return root_transformer
+        raise FileNotFoundError(
+            f"Could not find transformer weights under {root}. "
+            f"Tried {variant_transformer} and {root_transformer}."
+        )
+    def _get_transformer(
+        self,
+        model_type: Optional[str],
+        repo_id_or_path: Optional[str],
+        torch_dtype: Optional[torch.dtype] = None,
+        variant: Optional[str] = None,
+    ) -> MiniT2IMMJiTModel:
+        active_type = resolve_model_type(model_type or self.config.model_type)
+        if active_type == self._active_model_type and self.transformer is not None:
+            return self.transformer
+        if active_type in self._variant_transformers:
+            return self._variant_transformers[active_type]
+        repo = repo_id_or_path or self.config.repo_id_or_path
+        if repo is None:
+            raise ValueError("model_type switching requires repo_id_or_path to be set on the pipeline.")
+        root = Path(repo)
+        if not root.exists():
+            root = Path(snapshot_download(repo_id=str(repo)))
+        transformer = MiniT2IMMJiTModel.from_pretrained(
+            self._resolve_transformer_path(root, active_type),
+            torch_dtype=torch_dtype,
+            variant=variant,
+        )
+        self._variant_transformers[active_type] = transformer
+        if active_type == resolve_model_type(self.config.model_type):
+            self.transformer = transformer
+            self._active_model_type = active_type
+        return transformer
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+    ) -> Dict[str, Any]:
+        kwargs: Dict[str, Any] = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        if "generator" in step_params:
+            kwargs["generator"] = generator
+        return kwargs
+    def check_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        guidance_scale: float,
+        num_inference_steps: int,
+        output_type: str,
+    ) -> None:
+        if not isinstance(prompt, str) and not (isinstance(prompt, list) and all(isinstance(p, str) for p in prompt)):
+            raise TypeError(f"`prompt` must be a string or list of strings, got {type(prompt)}.")
+        if guidance_scale < 0:
+            raise ValueError(f"`guidance_scale` must be non-negative, got {guidance_scale}.")
+        if num_inference_steps <= 0:
+            raise ValueError(f"`num_inference_steps` must be positive, got {num_inference_steps}.")
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError(f"Unsupported `output_type`: {output_type}")
+    def prepare_latents(
+        self,
+        batch_size: int,
+        image_size: int,
+        in_channels: int,
+        device: torch.device,
+        dtype: torch.dtype,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        shape = (batch_size, in_channels, image_size, image_size)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = latents * NOISE_INIT_SCALE
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+            if tuple(latents.shape) != shape:
+                raise ValueError(f"Invalid `latents` shape: {tuple(latents.shape)}. Expected {shape}.")
+        return latents
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        transformer = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        transformer = transformer or self.transformer
+        if self.tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.config.text_encoder_name)
+        if self.text_encoder is None:
+            self.text_encoder = T5EncoderModel.from_pretrained(self.config.text_encoder_name)
+        if next(self.text_encoder.parameters()).device != device:
+            self.text_encoder.to(device)
+        cfg = transformer.mmjit_config
+        tokens = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=cfg.prompt_length,
+        )
+        input_ids = tokens.input_ids.to(device)
+        attn = tokens.attention_mask.to(device)
+        text = self.text_encoder(input_ids=input_ids, attention_mask=attn).last_hidden_state
+        return text, attn
+    @staticmethod
+    def _cfg_velocity(
+        transformer,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        text: torch.Tensor,
+        mask: torch.Tensor,
+        cfg_scale: float,
+    ) -> torch.Tensor:
+        batch_size = x.shape[0]
+        doubled_x = torch.cat([x, x], dim=0)
+        doubled_t = torch.cat([t, t], dim=0)
+        doubled_text = torch.cat([text, text], dim=0)
+        null_mask = torch.zeros_like(mask)
+        doubled_mask = torch.cat([mask, null_mask], dim=0)
+        velocity = transformer.pred_velocity(doubled_x, doubled_t, doubled_text, doubled_mask)
+        cond, uncond = velocity[:batch_size], velocity[batch_size:]
+        cfg_interval = transformer.mmjit_config.cfg_interval
+        use_cfg = ((t >= cfg_interval[0]) & (t <= cfg_interval[1])).to(velocity.dtype)
+        scale = torch.where(
+            use_cfg[:, None, None, None] > 0,
+            torch.tensor(cfg_scale, device=x.device, dtype=velocity.dtype),
+            torch.tensor(1.0, device=x.device, dtype=velocity.dtype),
+        )
+        return uncond + (cond - uncond) * scale
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        guidance_scale: float = 6.0,
+        num_inference_steps: Optional[int] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+        progress: bool = True,
+        model_type: Optional[str] = None,
+        repo_id_or_path: Optional[str] = None,
+        variant: Optional[str] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Generate images from text prompts with MiniT2I.
+        Args:
+            prompt (`str` or `list[str]`):
+                Text prompt or batch of prompts.
+            num_images_per_prompt (`int`, defaults to `1`):
+                Number of images to generate per prompt.
+            guidance_scale (`float`, defaults to `6.0`):
+                Classifier-free guidance scale. CFG is active when `guidance_scale != 1.0`.
+            num_inference_steps (`int`, *optional*):
+                Number of denoising steps. Defaults to the pipeline config value.
+            generator (`torch.Generator`, *optional*):
+                RNG for reproducibility.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated pixel latents with shape `(batch, channels, height, width)`.
+            output_type (`str`, defaults to `"pil"`):
+                `"pil"`, `"np"`, `"pt"`, or `"latent"`.
+            return_dict (`bool`, defaults to `True`):
+                Return [`ImagePipelineOutput`] if True.
+            progress (`bool`, defaults to `True`):
+                Whether to show a progress bar during denoising.
+            model_type (`str`, *optional*):
+                MiniT2I variant alias such as `"b16"` or `"l16"`.
+            repo_id_or_path (`str`, *optional*):
+                Hub id or local path used when switching `model_type`.
+            variant (`str`, *optional*):
+                Weight variant passed to `from_pretrained`.
+            torch_dtype (`torch.dtype`, *optional*):
+                Optional dtype override when loading a different transformer variant.
+        """
+        num_inference_steps = int(num_inference_steps or self.config.default_num_inference_steps)
+        self.check_inputs(prompt, guidance_scale, num_inference_steps, output_type)
+        transformer = self._get_transformer(model_type, repo_id_or_path, torch_dtype=torch_dtype, variant=variant)
+        device = self._execution_device
+        transformer = transformer.to(device)
+        if isinstance(prompt, str):
+            prompt_batch = [prompt] * num_images_per_prompt
+        else:
+            prompt_batch = []
+            for entry in prompt:
+                prompt_batch.extend([entry] * num_images_per_prompt)
+        batch_size = len(prompt_batch)
+        mmjit_cfg = transformer.mmjit_config
+        model_dtype = next(transformer.parameters()).dtype
+        text, attn = self._encode_prompt(prompt_batch, device, transformer=transformer)
+        text = text.to(dtype=model_dtype)
+        attn = attn.to(dtype=model_dtype)
+        if getattr(self.scheduler.config, "stochastic_sampling", False):
+            raise ValueError(
+                "MiniT2I expects deterministic FlowMatchEulerDiscreteScheduler stepping "
+                "(scheduler.config.stochastic_sampling=False)."
+            )
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        num_train_timesteps = self.scheduler.config.num_train_timesteps
+        latents = self.prepare_latents(
+            batch_size=batch_size,
+            image_size=mmjit_cfg.image_size,
+            in_channels=mmjit_cfg.in_channels,
+            device=device,
+            dtype=model_dtype,
+            generator=generator,
+            latents=latents,
+        )
+        timesteps = self.scheduler.timesteps
+        if progress:
+            timesteps = self.progress_bar(timesteps)
+        using_cfg = guidance_scale != 1.0
+        for timestep in timesteps:
+            flow_time = 1.0 - float(timestep) / num_train_timesteps
+            t = torch.full((batch_size,), flow_time, device=device, dtype=model_dtype)
+            if using_cfg:
+                velocity = self._cfg_velocity(transformer, latents, t, text, attn, guidance_scale)
+            else:
+                velocity = transformer.pred_velocity(latents, t, text, attn)
+            # MiniT2I integrates velocity from noise (t=0) to data (t=1); flip sign for
+            # FlowMatchEulerDiscreteScheduler sigma decreasing from 1 to 0.
+            latents = self.scheduler.step(-velocity, timestep, latents, **extra_step_kwargs).prev_sample
+        if output_type == "latent":
+            images = latents
+        else:
+            images = (latents.clamp(-1, 1) * 127.5 + 128.0).clamp(0, 255).to(torch.uint8)
+            if output_type == "pt":
+                images = images.float() / 255.0
+            else:
+                images = images.permute(0, 2, 3, 1).cpu().numpy()
+                if output_type == "pil":
+                    images = [Image.fromarray(image) for image in images]
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (images,)
+        return ImagePipelineOutput(images=images)

MiniT2I-L-16/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.32.0",
+  "num_train_timesteps": 1000,
+  "shift": 1.0,
+  "stochastic_sampling": false
+}

MiniT2I-L-16/text_encoder/README.md ADDED Viewed

	@@ -0,0 +1,276 @@

+---
+language:
+- en
+- fr
+- ro
+- de
+- multilingual
+widget:
+- text: "Translate to German:  My name is Arthur"
+  example_title: "Translation"
+- text: "Please answer to the following question. Who is going to be the next Ballon d'or?"
+  example_title: "Question Answering"
+- text: "Q: Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering."
+  example_title: "Logical reasoning"
+- text: "Please answer the following question. What is the boiling point of Nitrogen?"
+  example_title: "Scientific knowledge"
+- text: "Answer the following yes/no question. Can you write a whole Haiku in a single tweet?"
+  example_title: "Yes/no question"
+- text: "Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?"
+  example_title: "Reasoning task"
+- text: "Q: ( False or not False or False ) is? A: Let's think step by step"
+  example_title: "Boolean Expressions"
+- text: "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
+  example_title: "Math reasoning"
+- text: "Premise:  At my age you will probably have learnt one lesson. Hypothesis:  It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?"
+  example_title: "Premise and hypothesis"
+tags:
+- text2text-generation
+datasets:
+- svakulenk0/qrecc
+- taskmaster2
+- djaym7/wiki_dialog
+- deepmind/code_contests
+- lambada
+- gsm8k
+- aqua_rat
+- esnli
+- quasc
+- qed
+license: apache-2.0
+---
+# Model Card for FLAN-T5 large
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/flan2_architecture.jpg"
+alt="drawing" width="600"/>
+#  Table of Contents
+0. [TL;DR](#tldr)
+1. [Model Details](#model-details)
+2. [Usage](#usage)
+3. [Uses](#uses)
+4. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
+5. [Training Details](#training-details)
+6. [Evaluation](#evaluation)
+7. [Environmental Impact](#environmental-impact)
+8. [Citation](#citation)
+9. [Model Card Authors](#model-card-authors)
+# TL;DR
+If you already know T5, FLAN-T5 is just better at everything. For the same number of parameters, these models have been fine-tuned on more than 1000 additional tasks, also covering more languages.
+As mentioned in the first few lines of the abstract :
+>  Flan-PaLM 540B achieves state-of-the-art performance on several benchmarks, such as 75.2% on five-shot MMLU. We also publicly release Flan-T5 checkpoints, which achieve strong few-shot performance even compared to much larger models, such as PaLM 62B. Overall, instruction finetuning is a general method for improving the performance and usability of pretrained language models.
+**Disclaimer**: Content from **this** model card has been written by the Hugging Face team, and parts of it were copy pasted from the [T5 model card](https://huggingface.co/t5-large).
+# Model Details
+## Model Description
+- **Model type:** Language model
+- **Language(s) (NLP):** English, Spanish, Japanese, Persian, Hindi, French, Chinese, Bengali, Gujarati, German, Telugu, Italian, Arabic, Polish, Tamil, Marathi, Malayalam, Oriya, Panjabi, Portuguese, Urdu, Galician, Hebrew, Korean, Catalan, Thai, Dutch, Indonesian, Vietnamese, Bulgarian, Filipino, Central Khmer, Lao, Turkish, Russian, Croatian, Swedish, Yoruba, Kurdish, Burmese, Malay, Czech, Finnish, Somali, Tagalog, Swahili, Sinhala, Kannada, Zhuang, Igbo, Xhosa, Romanian, Haitian, Estonian, Slovak, Lithuanian, Greek, Nepali, Assamese, Norwegian
+- **License:** Apache 2.0
+- **Related Models:** [All FLAN-T5 Checkpoints](https://huggingface.co/models?search=flan-t5)
+- **Original Checkpoints:** [All Original FLAN-T5 Checkpoints](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints)
+- **Resources for more information:**
+  - [Research paper](https://arxiv.org/pdf/2210.11416.pdf)
+  - [GitHub Repo](https://github.com/google-research/t5x)
+  - [Hugging Face FLAN-T5 Docs (Similar to T5) ](https://huggingface.co/docs/transformers/model_doc/t5)
+# Usage
+Find below some example scripts on how to use the model in `transformers`:
+## Using the PyTorch model
+### Running the model on a CPU
+<details>
+<summary> Click to expand </summary>
+```python
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+### Running the model on a GPU
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install accelerate
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+### Running the model on a GPU using different precisions
+#### FP16
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install accelerate
+import torch
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+#### INT8
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install bitsandbytes accelerate
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
+model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", load_in_8bit=True)
+input_text = "translate English to German: How old are you?"
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+# Uses
+## Direct Use and Downstream Use
+The authors write in [the original paper's model card](https://arxiv.org/pdf/2210.11416.pdf) that:
+> The primary use is research on language models, including: research on zero-shot NLP tasks and in-context few-shot learning NLP tasks, such as reasoning, and question answering; advancing fairness and safety research, and understanding limitations of current large language models
+See the [research paper](https://arxiv.org/pdf/2210.11416.pdf) for further details.
+## Out-of-Scope Use
+More information needed.
+# Bias, Risks, and Limitations
+The information below in this section is copied from the model's [official model card](https://arxiv.org/pdf/2210.11416.pdf):
+> Language models, including Flan-T5, can potentially be used for language generation in a harmful way, according to Rae et al. (2021). Flan-T5 should not be used directly in any application, without a prior assessment of safety and fairness concerns specific to the application.
+## Ethical considerations and risks
+> Flan-T5 is fine-tuned on a large corpus of text data that was not filtered for explicit content or assessed for existing biases. As a result the model itself is potentially vulnerable to generating equivalently inappropriate content or replicating inherent biases in the underlying data.
+## Known Limitations
+> Flan-T5 has not been tested in real world applications.
+## Sensitive Use:
+> Flan-T5 should not be applied for any unacceptable use cases, e.g., generation of abusive speech.
+# Training Details
+## Training Data
+The model was trained on a mixture of tasks that includes those described in the table below (from the original paper, figure 2):
+![table.png](https://s3.amazonaws.com/moonup/production/uploads/1666363265279-62441d1d9fdefb55a0b7d12c.png)
+## Training Procedure
+According to the model card from the [original paper](https://arxiv.org/pdf/2210.11416.pdf):
+> These models are based on pretrained T5 (Raffel et al., 2020) and fine-tuned with instructions for better zero-shot and few-shot performance. There is one fine-tuned Flan model per T5 model size.
+The model has been trained on TPU v3 or TPU v4 pods, using [`t5x`](https://github.com/google-research/t5x) codebase together with [`jax`](https://github.com/google/jax).
+# Evaluation
+## Testing Data, Factors & Metrics
+The authors evaluated the model on various tasks covering several languages (1836 in total). See the table below for some quantitative evaluation:
+![image.png](https://s3.amazonaws.com/moonup/production/uploads/1668072995230-62441d1d9fdefb55a0b7d12c.png)
+For full details, please check the [research paper](https://arxiv.org/pdf/2210.11416.pdf).
+## Results
+For full results for FLAN-T5-Large, see the [research paper](https://arxiv.org/pdf/2210.11416.pdf), Table 3.
+# Environmental Impact
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** Google Cloud TPU Pods - TPU v3 or TPU v4  | Number of chips ≥ 4.
+- **Hours used:** More information needed
+- **Cloud Provider:** GCP
+- **Compute Region:** More information needed
+- **Carbon Emitted:** More information needed
+# Citation
+**BibTeX:**
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2210.11416,
+  doi = {10.48550/ARXIV.2210.11416},
+  url = {https://arxiv.org/abs/2210.11416},
+  author = {Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and Webson, Albert and Gu, Shixiang Shane and Dai, Zhuyun and Suzgun, Mirac and Chen, Xinyun and Chowdhery, Aakanksha and Narang, Sharan and Mishra, Gaurav and Yu, Adams and Zhao, Vincent and Huang, Yanping and Dai, Andrew and Yu, Hongkun and Petrov, Slav and Chi, Ed H. and Dean, Jeff and Devlin, Jacob and Roberts, Adam and Zhou, Denny and Le, Quoc V. and Wei, Jason},
+  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {Scaling Instruction-Finetuned Language Models},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {Creative Commons Attribution 4.0 International}
+}
+```

MiniT2I-L-16/text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "d_ff": 2816,
+  "d_kv": 64,
+  "d_model": 1024,
+  "decoder_start_token_id": 0,
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 24,
+  "num_heads": 16,
+  "num_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.23.1",
+  "use_cache": true,
+  "vocab_size": 32128
+}

MiniT2I-L-16/text_encoder/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.27.0.dev0"
+}

MiniT2I-L-16/text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9dd06ce490f139af36e9eb77dd3758b4fd07a08a73d5a1abe5ff2591e2d388e
+size 3132668804

MiniT2I-L-16/text_encoder/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

MiniT2I-L-16/text_encoder/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

MiniT2I-L-16/text_encoder/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

MiniT2I-L-16/text_encoder/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "name_or_path": "google/t5-v1_1-large",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/younes_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-large/snapshots/314bc112b191ec17b625ba81438dc73d6c23659d/special_tokens_map.json",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

MiniT2I-L-16/tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

MiniT2I-L-16/tokenizer/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

MiniT2I-L-16/tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

MiniT2I-L-16/tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "name_or_path": "google/t5-v1_1-large",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/younes_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-large/snapshots/314bc112b191ec17b625ba81438dc73d6c23659d/special_tokens_map.json",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

MiniT2I-L-16/transformer/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_class_name": "MiniT2IMMJiTModel",
+  "_diffusers_version": "0.35.2",
+  "cfg_channels": 3,
+  "cfg_interval": [
+    0.0,
+    1.0
+  ],
+  "cond_vec_size": 1248,
+  "depth_double": 23,
+  "head_dim": 52,
+  "hidden_size": 1248,
+  "image_size": 512,
+  "in_channels": 3,
+  "llm": "google/flan-t5-large",
+  "mlp_ratio": 2.7051282051282053,
+  "n_T": 100,
+  "num_heads": 24,
+  "patch_size": 16,
+  "pca_channels": 128,
+  "prediction": "x",
+  "prompt_length": 256,
+  "sampler": "euler",
+  "txt_hidden_size": 1248,
+  "txt_input_size": 1024,
+  "txt_preamble_depth": 2
+}

MiniT2I-L-16/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:290775640b27b7cc743c4f79a244aab2b5f460d285ea42f569144b38cbb03633
+size 3647124768

MiniT2I-L-16/transformer/transformer_minit2i.py ADDED Viewed

	@@ -0,0 +1,446 @@

+import math
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from torch import nn
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+def modulate(x, shift, scale):
+    """Apply a per-sample affine modulation: ``x * (1 + scale) + shift``.
+    ``shift`` and ``scale`` are indexed as ``[:, None, :]``, i.e. a singleton
+    axis is inserted at position 1 so they broadcast over axis 1 of ``x``.
+    """
+    gain = 1 + scale[:, None, :]
+    offset = shift[:, None, :]
+    return x * gain + offset
+def rotate_half(x):
+    """Split the last axis into two equal halves ``(a, b)`` and return ``(-b, a)``.
+    The last dimension must be even; the reshape fails otherwise.
+    """
+    halves = x.reshape(*x.shape[:-1], 2, -1)
+    first, second = halves.unbind(dim=-2)
+    return torch.cat((-second, first), dim=-1)
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last axis with a learned per-channel gain."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x):
+        # Mean of squares over the last axis; eps keeps rsqrt finite for all-zero rows.
+        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_sq + self.eps)
+        return (x * inv_rms) * self.weight
+class TimestepEmbedder(nn.Module):
+    """Embed scalar timesteps with sinusoidal features followed by a two-layer MLP.
+    Args:
+        hidden_size: output width of the MLP.
+        frequency_embedding_size: width of the sinusoidal feature vector fed to the MLP.
+    """
+    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size),
+        )
+    def forward(self, t):
+        """Return the ``(len(t), hidden_size)`` embedding of a 1-D tensor of timesteps."""
+        half = self.frequency_embedding_size // 2
+        # Geometric frequency ladder from 1 down towards 1/10000, computed in float32.
+        freqs = torch.exp(
+            -math.log(10000.0)
+            * torch.arange(half, device=t.device, dtype=torch.float32)
+            / half
+        )
+        args = t.float()[:, None] * freqs[None]
+        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if self.frequency_embedding_size % 2:
+            # cos/sin fill only 2 * half columns; an odd size would otherwise be one
+            # column short of the first Linear's input width, so pad with zeros.
+            emb = torch.cat([emb, emb.new_zeros(emb.shape[0], 1)], dim=-1)
+        # Match the MLP's parameter dtype (e.g. when the model runs in bfloat16).
+        return self.mlp(emb.to(dtype=self.mlp[0].weight.dtype))
+class BottleneckPatchEmbed(nn.Module):
+    """Turn an image into patch tokens through a two-stage (bottleneck) convolution.
+    The first convolution maps every non-overlapping ``patch_size`` x ``patch_size``
+    patch to ``pca_channels`` values; the second is a 1x1 convolution to ``hidden_size``.
+    """
+    def __init__(self, img_size=512, patch_size=16, in_channels=3, pca_channels=128, hidden_size=1248):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        # Stage 1: bias-free patchifying convolution (kernel == stride == patch_size).
+        self.proj1 = nn.Conv2d(
+            in_channels,
+            pca_channels,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False,
+        )
+        # Stage 2: pointwise convolution lifting the bottleneck to the token width.
+        self.proj2 = nn.Conv2d(pca_channels, hidden_size, kernel_size=1, stride=1, bias=True)
+    def forward(self, x):
+        bottleneck = self.proj1(x)
+        feats = self.proj2(bottleneck)
+        # Flatten the two spatial axes into one, then put the token axis before channels.
+        tokens = feats.flatten(2)
+        return tokens.transpose(1, 2)
+class SwiGLUMlp(nn.Module):
+    """Bias-free gated MLP computing ``w2(silu(w1(x)) * w3(x))``."""
+    def __init__(self, in_features: int, hidden_features: int):
+        super().__init__()
+        # Round the inner width up to the next multiple of 8.
+        hidden_dim = 8 * ((hidden_features + 7) // 8)
+        self.w1 = nn.Linear(in_features, hidden_dim, bias=False)
+        self.w3 = nn.Linear(in_features, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, in_features, bias=False)
+    def forward(self, x):
+        gate = F.silu(self.w1(x))
+        value = self.w3(x)
+        return self.w2(gate * value)
+class TextRotaryEmbedding1D(nn.Module):
+    """1-D rotary position embedding applied along axis 1 of a 4-D tensor.
+    Positions are ``0 .. length - 1`` and the channel count is read from the input's
+    last axis at call time (the stored ``head_dim`` is not consulted in ``forward``).
+    """
+    def __init__(self, head_dim: int, theta: float = 10000.0):
+        super().__init__()
+        self.head_dim = head_dim
+        self.theta = theta
+    def forward(self, x):
+        # Unpacking enforces a 4-D input; only the length and channel sizes are used.
+        _, length, _, d = x.shape
+        exponents = torch.arange(0, d, 2, device=x.device, dtype=torch.float32) / d
+        inv = 1.0 / (self.theta ** exponents)
+        pos = torch.arange(length, device=x.device, dtype=torch.float32)
+        angles = torch.einsum("l,f->lf", pos, inv)
+        # Duplicate the angles so both halves used by rotate_half share the same rotation.
+        angles = torch.cat([angles, angles], dim=-1)
+        # Angles are computed in float32 and only then cast to the input dtype.
+        cos = angles.cos().to(dtype=x.dtype)[None, :, None, :]
+        sin = angles.sin().to(dtype=x.dtype)[None, :, None, :]
+        return x * cos + rotate_half(x) * sin
+class VisionRotaryEmbeddingFast(nn.Module):
+    """2-D rotary embedding for a square grid of tokens flattened along axis 1.
+    Half of ``head_dim`` is stored as ``self.dim`` and used as the per-axis width
+    when building the frequency ladder.
+    Raises:
+        ValueError: if the number of tokens on axis 1 is not a perfect square.
+    """
+    def __init__(self, head_dim: int, theta: float = 10000.0):
+        super().__init__()
+        self.dim = head_dim // 2
+        self.theta = theta
+    def forward(self, x):
+        length = x.shape[1]
+        side = int(math.sqrt(length))
+        if side * side != length:
+            raise ValueError(f"image token length must be square, got {length}")
+        axis_dim = self.dim
+        steps = torch.arange(0, axis_dim, 2, device=x.device, dtype=torch.float32)
+        exponents = steps[: axis_dim // 2] / axis_dim
+        freqs = 1.0 / (self.theta ** exponents)
+        coords = torch.arange(side, device=x.device, dtype=torch.float32)
+        base = torch.einsum("l,f->lf", coords, freqs)
+        # f_h varies with the first grid index, f_w with the second; both are (side, side, F).
+        f_h, f_w = torch.broadcast_tensors(base[:, None, :], base[None, :, :])
+        angles = torch.cat([f_h, f_w], dim=-1)
+        # Duplicate for rotate_half's two halves, then flatten the grid to one token axis.
+        angles = torch.cat([angles, angles], dim=-1).reshape(length, -1)
+        cos = angles.cos().to(dtype=x.dtype)[None, :, None, :]
+        sin = angles.sin().to(dtype=x.dtype)[None, :, None, :]
+        return x * cos + rotate_half(x) * sin
+class MultiModalRotaryEmbeddingFast(nn.Module):
+    """Rotary embedding for a joint sequence laid out as ``[text tokens, image tokens]``.
+    The first ``txt_len`` tokens on axis 1 get the 1-D text rope; the rest get the 2-D vision rope.
+    """
+    def __init__(self, head_dim: int):
+        super().__init__()
+        self.text_rope = TextRotaryEmbedding1D(head_dim)
+        self.vision_rope = VisionRotaryEmbeddingFast(head_dim)
+    def forward(self, x, txt_len: int):
+        txt_part, img_part = x[:, :txt_len], x[:, txt_len:]
+        rotated = [self.text_rope(txt_part), self.vision_rope(img_part)]
+        return torch.cat(rotated, dim=1)
+class PlainTextTransformerBlock(nn.Module):
+    """Pre-norm self-attention + SwiGLU block operating on text tokens only.
+    Queries and keys are RMS-normalised per head and rotated with a 1-D rope before
+    a full (unmasked) softmax attention; both sub-layers are residual.
+    """
+    def __init__(self, hidden_size=1248, num_heads=24, head_dim=52, mlp_ratio=2.7):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        inner_dim = num_heads * head_dim
+        self.norm1 = RMSNorm(hidden_size)
+        self.norm2 = RMSNorm(hidden_size)
+        # Fused projection producing q, k and v in one matmul.
+        self.qkv = nn.Linear(hidden_size, inner_dim * 3)
+        self.attn_proj = nn.Linear(inner_dim, hidden_size)
+        self.mlp = SwiGLUMlp(hidden_size, int(hidden_size * mlp_ratio))
+        self.q_norm = RMSNorm(head_dim)
+        self.k_norm = RMSNorm(head_dim)
+        self.rope = TextRotaryEmbedding1D(head_dim)
+    def forward(self, txt):
+        b, length, _ = txt.shape
+        normed = self.norm1(txt)
+        qkv = self.qkv(normed).reshape(b, length, 3, self.num_heads, self.head_dim)
+        q, k, v = qkv.unbind(dim=2)
+        q = self.rope(self.q_norm(q))
+        k = self.rope(self.k_norm(k))
+        scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * (self.head_dim ** -0.5)
+        weights = scores.softmax(dim=-1)
+        mixed = torch.einsum("bhqk,bkhd->bqhd", weights, v)
+        # Merge the head and channel axes back into one feature axis.
+        txt = txt + self.attn_proj(mixed.reshape(b, length, -1))
+        txt = txt + self.mlp(self.norm2(txt))
+        return txt
+class DoubleStreamDiTBlock(nn.Module):
+    """Block with separate image/text weights and one joint attention over both streams.
+    Each stream owns its norms, QKV projection, output projection and SwiGLU MLP; the
+    per-head query/key RMSNorms and the rotary embedding are shared. ``vec`` is accepted
+    by ``forward`` but is not read by this implementation.
+    """
+    def __init__(self, hidden_size=1248, txt_hidden_size=1248, num_heads=24, head_dim=52, mlp_ratio=2.7):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.txt_hidden_size = txt_hidden_size
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        inner_dim = num_heads * head_dim
+        self.img_norm1 = RMSNorm(hidden_size)
+        self.img_norm2 = RMSNorm(hidden_size)
+        self.txt_norm1 = RMSNorm(txt_hidden_size)
+        self.txt_norm2 = RMSNorm(txt_hidden_size)
+        self.img_qkv = nn.Linear(hidden_size, inner_dim * 3)
+        self.txt_qkv = nn.Linear(txt_hidden_size, inner_dim * 3)
+        self.q_norm = RMSNorm(head_dim)
+        self.k_norm = RMSNorm(head_dim)
+        self.rope = MultiModalRotaryEmbeddingFast(head_dim)
+        self.img_attn_proj = nn.Linear(inner_dim, hidden_size)
+        self.txt_attn_proj = nn.Linear(inner_dim, txt_hidden_size)
+        self.img_mlp = SwiGLUMlp(hidden_size, int(hidden_size * mlp_ratio))
+        self.txt_mlp = SwiGLUMlp(txt_hidden_size, int(txt_hidden_size * mlp_ratio))
+    def forward(self, x, txt, vec):
+        b, li, _ = x.shape
+        lt = txt.shape[1]
+        heads, width = self.num_heads, self.head_dim
+        qkv_img = self.img_qkv(self.img_norm1(x)).reshape(b, li, 3, heads, width)
+        qkv_txt = self.txt_qkv(self.txt_norm1(txt)).reshape(b, lt, 3, heads, width)
+        q_i, k_i, v_i = qkv_img.unbind(dim=2)
+        q_t, k_t, v_t = qkv_txt.unbind(dim=2)
+        # Text tokens come first in the joint sequence; the rope splits it at txt_len.
+        q = self.rope(torch.cat([self.q_norm(q_t), self.q_norm(q_i)], dim=1), txt_len=lt)
+        k = self.rope(torch.cat([self.k_norm(k_t), self.k_norm(k_i)], dim=1), txt_len=lt)
+        v = torch.cat([v_t, v_i], dim=1)
+        scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * (self.head_dim ** -0.5)
+        joint = torch.einsum("bhqk,bkhd->bqhd", scores.softmax(dim=-1), v)
+        # Split the attended sequence back into its two streams.
+        txt_out, img_out = joint[:, :lt], joint[:, lt:]
+        x = x + self.img_attn_proj(img_out.reshape(b, li, -1))
+        txt = txt + self.txt_attn_proj(txt_out.reshape(b, lt, -1))
+        x = x + self.img_mlp(self.img_norm2(x))
+        txt = txt + self.txt_mlp(self.txt_norm2(txt))
+        return x, txt
+class FinalLayer(nn.Module):
+    """RMS-normalise tokens and project each to ``patch_size * patch_size * out_channels`` values."""
+    def __init__(self, hidden_size=1248, patch_size=16, out_channels=3):
+        super().__init__()
+        self.patch_size = patch_size
+        self.out_channels = out_channels
+        self.norm_final = RMSNorm(hidden_size)
+        patch_values = patch_size * patch_size * out_channels
+        self.linear = nn.Linear(hidden_size, patch_values)
+    def forward(self, x, vec=None):
+        # vec is part of the call signature but is not read here.
+        normed = self.norm_final(x)
+        return self.linear(normed)
+def get_2d_sincos_pos_embed(embed_dim, grid_size, device, dtype):
+    """Build a fixed 2-D sin/cos position table of shape ``(grid_size**2, embed_dim)``.
+    Each grid axis contributes ``embed_dim // 2`` channels; the table is computed in
+    float32 and cast to ``dtype`` at the end.
+    """
+    coords = torch.arange(grid_size, device=device, dtype=torch.float32)
+    # With indexing="xy" the first output varies along columns, the second along rows.
+    col_idx, row_idx = torch.meshgrid(coords, coords, indexing="xy")
+    half_dim = embed_dim // 2
+    # The column-varying grid feeds the first half of the channels, the row-varying one the second.
+    emb_h = get_1d_sincos_pos_embed(half_dim, col_idx)
+    emb_w = get_1d_sincos_pos_embed(half_dim, row_idx)
+    table = torch.cat([emb_h, emb_w], dim=1)
+    return table.to(dtype=dtype)
+def get_1d_sincos_pos_embed(embed_dim, pos):
+    """Return ``[sin | cos]`` features of shape ``(pos.numel(), 2 * (embed_dim // 2))``.
+    ``pos`` may have any shape; it is flattened before the outer product with the frequencies.
+    """
+    half = embed_dim // 2
+    omega = torch.arange(half, device=pos.device, dtype=torch.float32)
+    omega = 1.0 / (10000 ** (omega / (embed_dim / 2.0)))
+    flat_pos = pos.reshape(-1)
+    out = torch.einsum("m,d->md", flat_pos, omega)
+    return torch.cat((torch.sin(out), torch.cos(out)), dim=1)
+@dataclass
+class MMJiTConfig:
+    """Hyper-parameters consumed by ``MMJiT`` and ``DiffusionModel``."""
+    # Side length of the square image; also the noise shape drawn by DiffusionModel.sample.
+    image_size: int = 512
+    # Patch side: kernel size and stride of the patch-embedding convolution.
+    patch_size: int = 16
+    # Image channels, used for the patch embedder input and the final projection.
+    in_channels: int = 3
+    # Width of the incoming text features (txt_embedder / pooled_embedder input, mask_token size).
+    txt_input_size: int = 1024
+    # Token width of the image stream.
+    hidden_size: int = 768
+    # Token width of the text stream.
+    txt_hidden_size: int = 768
+    # Width of the timestep + pooled-text conditioning vector.
+    cond_vec_size: int = 768
+    # Number of DoubleStreamDiTBlock layers.
+    depth_double: int = 17
+    # Number of text-only PlainTextTransformerBlock layers run before the double blocks.
+    txt_preamble_depth: int = 2
+    # Attention head count and per-head width (inner attention width is num_heads * head_dim).
+    num_heads: int = 12
+    head_dim: int = 64
+    # SwiGLU hidden width as a multiple of the stream width (truncated with int(), then rounded up to a multiple of 8).
+    mlp_ratio: float = 2.6667
+    # Bottleneck channels between the two patch-embedding convolutions.
+    pca_channels: int = 128
+    # Not read by the visible module code; presumably the tokenizer padding length used by the pipeline (TODO confirm).
+    prompt_length: int = 256
+    # Number of Euler steps taken by DiffusionModel.sample.
+    n_T: int = 100
+    # prediction, sampler and cfg_channels are stored but not read by the visible module code.
+    prediction: str = "x"
+    sampler: str = "euler"
+    cfg_channels: int = 3
+    # Closed time interval (lo, hi) inside which cfg_velocity applies the guidance scale.
+    cfg_interval: tuple = (0.0, 1.0)
+    # Not read by the visible module code; presumably the text-encoder checkpoint id (TODO confirm).
+    llm: str = "google/flan-t5-large"
+class MMJiT(nn.Module):
+    """Dual-stream text/image transformer mapping a noisy image to an image-shaped prediction.
+    Text features pass through a linear embedder and ``txt_preamble_depth`` text-only blocks,
+    then text and image tokens are processed jointly by ``depth_double`` double-stream blocks.
+    """
+    def __init__(self, cfg: MMJiTConfig):
+        super().__init__()
+        self.cfg = cfg
+        # Number of patches along one image side.
+        self.latent_img_size = cfg.image_size // cfg.patch_size
+        self.img_embedder = BottleneckPatchEmbed(
+            cfg.image_size, cfg.patch_size, cfg.in_channels, cfg.pca_channels, cfg.hidden_size
+        )
+        self.txt_embedder = nn.Linear(cfg.txt_input_size, cfg.txt_hidden_size, bias=False)
+        # Learned stand-in for context positions that the attention mask marks as padding.
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, cfg.txt_input_size))
+        self.t_embedder = TimestepEmbedder(cfg.cond_vec_size)
+        self.pooled_embedder = nn.Linear(cfg.txt_input_size, cfg.cond_vec_size, bias=False)
+        self.txt_preamble_blocks = nn.ModuleList(
+            [
+                PlainTextTransformerBlock(cfg.txt_hidden_size, cfg.num_heads, cfg.head_dim, cfg.mlp_ratio)
+                for _ in range(cfg.txt_preamble_depth)
+            ]
+        )
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamDiTBlock(
+                    cfg.hidden_size, cfg.txt_hidden_size, cfg.num_heads, cfg.head_dim, cfg.mlp_ratio
+                )
+                for _ in range(cfg.depth_double)
+            ]
+        )
+        self.final_layer = FinalLayer(cfg.hidden_size, cfg.patch_size, cfg.in_channels)
+    def unpatchify(self, x):
+        """Fold ``(b, h*w, p*p*c)`` patch tokens back into a ``(b, c, h*p, w*p)`` image."""
+        b = x.shape[0]
+        p = self.cfg.patch_size
+        c = self.cfg.in_channels
+        # The token grid is assumed square; the reshape below fails otherwise.
+        h = w = int(math.sqrt(x.shape[1]))
+        x = x.reshape(b, h, w, p, p, c)
+        x = torch.einsum("nhwpqc->nchpwq", x)
+        return x.reshape(b, c, h * p, w * p)
+    def forward(self, img, t, context, attn_mask):
+        """Predict an image-shaped tensor from a noisy image, timesteps and text features.
+        Args:
+            img: 4-D image batch; if axis 1 is not ``in_channels`` it is treated as channels-last.
+            t: 1-D tensor of timesteps, one per sample.
+            context: per-token text features whose last axis is ``txt_input_size``.
+            attn_mask: ``(batch, tokens)`` mask; positions with value <= 0.5 are replaced by ``mask_token``.
+        Returns:
+            Tensor of shape ``(batch, in_channels, H, W)`` rebuilt from the image tokens.
+        """
+        if img.ndim == 4 and img.shape[1] != self.cfg.in_channels:
+            img = img.permute(0, 3, 1, 2)
+        attn_mask = attn_mask.to(device=context.device)
+        context = torch.where(attn_mask[:, :, None] > 0.5, context, self.mask_token.to(dtype=context.dtype))
+        x = self.img_embedder(img)
+        pos = get_2d_sincos_pos_embed(self.cfg.hidden_size, self.latent_img_size, x.device, x.dtype)
+        x = x + pos[None]
+        t_vec = self.t_embedder(t)
+        txt = self.txt_embedder(context.to(dtype=self.txt_embedder.weight.dtype))
+        # The pooled text vector is the mean over all positions after mask-token substitution.
+        pooled_text = context.mean(dim=1)
+        vec = t_vec + self.pooled_embedder(pooled_text.to(dtype=self.pooled_embedder.weight.dtype))
+        for block in self.txt_preamble_blocks:
+            txt = block(txt)
+        for block in self.double_blocks:
+            x, txt = block(x, txt, vec)
+        # Only image tokens are unpatchified and the final layer acts on every token
+        # independently, so the text tokens are not projected (their outputs were discarded).
+        return self.unpatchify(self.final_layer(x, vec))
+class DiffusionModel(nn.Module):
+    """Wrap ``MMJiT`` with velocity conversion, classifier-free guidance and an Euler sampler."""
+    def __init__(self, cfg: Optional[MMJiTConfig] = None):
+        super().__init__()
+        self.cfg = cfg or MMJiTConfig()
+        self.net = MMJiT(self.cfg)
+    def real_t_to_embed_t(self, t):
+        """Map the sampler time to the time fed to the network (identity here)."""
+        return t
+    def pred_velocity(self, x, t, text, mask):
+        """Run the network and convert its output ``x0`` into ``(x0 - x) / max(1 - t, 0.05)``."""
+        x0 = self.net(x, self.real_t_to_embed_t(t), text, mask)
+        # The clamp bounds the denominator as t approaches 1.
+        return (x0 - x) / torch.clamp(1 - t[:, None, None, None], min=0.05)
+    def cfg_velocity(self, x, t, text, mask, cfg_scale: float):
+        """Return the guided velocity ``uncond + (cond - uncond) * scale``.
+        Conditional and unconditional branches run as one doubled batch; the unconditional
+        half uses an all-zero mask, so every context position becomes the mask token.
+        ``scale`` is ``cfg_scale`` for samples whose ``t`` lies in ``cfg.cfg_interval`` and 1 otherwise.
+        """
+        b = x.shape[0]
+        xx = torch.cat([x, x], dim=0)
+        tt = torch.cat([t, t], dim=0)
+        yy = torch.cat([text, text], dim=0)
+        mm = torch.cat([mask, torch.zeros_like(mask)], dim=0)
+        out = self.pred_velocity(xx, tt, yy, mm)
+        cond, uncond = out[:b], out[b:]
+        use_cfg = ((t >= self.cfg.cfg_interval[0]) & (t <= self.cfg.cfg_interval[1])).to(out.dtype)
+        scale = torch.where(
+            use_cfg[:, None, None, None] > 0,
+            torch.tensor(cfg_scale, device=x.device, dtype=out.dtype),
+            torch.tensor(1.0, device=x.device, dtype=out.dtype),
+        )
+        return uncond + (cond - uncond) * scale
+    @torch.no_grad()
+    def sample(self, text, mask, cfg_scale=6.0, generator=None, progress=False):
+        """Integrate from noise at t=0 to t=1 with ``cfg.n_T`` Euler steps and return the result.
+        Args:
+            text: text features; their batch size and device define the sample batch.
+            mask: attention mask matching ``text``.
+            cfg_scale: guidance scale passed to ``cfg_velocity``.
+            generator: optional ``torch.Generator``; it may live on a different device than ``text``.
+            progress: if true, wrap the step loop in a tqdm progress bar.
+        """
+        b = text.shape[0]
+        device = text.device
+        dtype = next(self.parameters()).dtype
+        # torch.randn needs the generator on the device it samples on, so draw the noise
+        # there and move it afterwards (a no-op when the devices already match).
+        noise_device = device if generator is None else generator.device
+        x = torch.randn(
+            b,
+            self.cfg.in_channels,
+            self.cfg.image_size,
+            self.cfg.image_size,
+            generator=generator,
+            device=noise_device,
+            dtype=dtype,
+        ).to(device) * 2
+        timesteps = torch.linspace(0.0, 1.0, self.cfg.n_T + 1, device=device, dtype=dtype)
+        # The conditioning does not change between steps; cast it once.
+        text = text.to(dtype)
+        mask = mask.to(dtype)
+        iterator = range(self.cfg.n_T)
+        if progress:
+            from tqdm.auto import tqdm
+            iterator = tqdm(iterator)
+        for i in iterator:
+            t_cur = timesteps[i].expand(b)
+            t_next = timesteps[i + 1].expand(b)
+            v = self.cfg_velocity(x, t_cur, text, mask, cfg_scale)
+            x = x + (t_next - t_cur)[:, None, None, None] * v
+        return x
+class MiniT2IMMJiTModel(ModelMixin, ConfigMixin):
+    """Diffusers-loadable MiniT2I MM-JiT transformer for pixel-space flow matching.
+    The constructor arguments mirror the fields of ``MMJiTConfig`` and are recorded through
+    ``register_to_config``; the actual network lives in ``self.model`` (a ``DiffusionModel``).
+    """
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        image_size: int = 512,
+        patch_size: int = 16,
+        in_channels: int = 3,
+        txt_input_size: int = 1024,
+        hidden_size: int = 768,
+        txt_hidden_size: int = 768,
+        cond_vec_size: int = 768,
+        depth_double: int = 17,
+        txt_preamble_depth: int = 2,
+        num_heads: int = 12,
+        head_dim: int = 64,
+        mlp_ratio: float = 2.6666666666666665,
+        pca_channels: int = 128,
+        prompt_length: int = 256,
+        n_T: int = 100,
+        prediction: str = "x",
+        sampler: str = "euler",
+        cfg_channels: int = 3,
+        cfg_interval: tuple = (0.0, 1.0),
+        llm: str = "google/flan-t5-large",
+    ):
+        super().__init__()
+        settings = dict(
+            image_size=image_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            txt_input_size=txt_input_size,
+            hidden_size=hidden_size,
+            txt_hidden_size=txt_hidden_size,
+            cond_vec_size=cond_vec_size,
+            depth_double=depth_double,
+            txt_preamble_depth=txt_preamble_depth,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            mlp_ratio=mlp_ratio,
+            pca_channels=pca_channels,
+            prompt_length=prompt_length,
+            n_T=n_T,
+            prediction=prediction,
+            sampler=sampler,
+            cfg_channels=cfg_channels,
+            # Normalised to a tuple whatever sequence type was passed in.
+            cfg_interval=tuple(cfg_interval),
+            llm=llm,
+        )
+        self.model = DiffusionModel(MMJiTConfig(**settings))
+    @property
+    def mmjit_config(self) -> MMJiTConfig:
+        """The ``MMJiTConfig`` instance held by the wrapped model."""
+        wrapped = self.model
+        return wrapped.cfg
+    def forward(self, img, t, context, attn_mask):
+        """Call the underlying ``MMJiT`` network directly (no velocity conversion, no guidance)."""
+        net = self.model.net
+        return net(img, t, context, attn_mask)
+    def pred_velocity(self, x, t, text, mask):
+        """Delegate to ``DiffusionModel.pred_velocity``."""
+        wrapped = self.model
+        return wrapped.pred_velocity(x, t, text, mask)
+    def sample(self, text, mask, cfg_scale=6.0, generator=None, progress=False):
+        """Delegate to ``DiffusionModel.sample`` with the same arguments."""
+        wrapped = self.model
+        return wrapped.sample(text, mask, cfg_scale=cfg_scale, generator=generator, progress=progress)

README.md ADDED Viewed

	@@ -0,0 +1,156 @@

+---
+license: mit
+library_name: diffusers
+pipeline_tag: text-to-image
+tags:
+  - diffusers
+  - minit2i
+  - image-generation
+  - text-to-image
+  - flow-matching
+  - pixel-space
+inference: true
+widget:
+  - text: A lonely astronaut standing on a quiet beach under two moons.
+    output:
+      url: MiniT2I-B-16/demo.png
+language:
+  - en
+---
+# BiliSakura/MiniT2I-diffusers
+Self-contained MiniT2I text-to-image checkpoints for Hugging Face diffusers. Each variant folder ships its own pipeline code, component modules, bundled FLAN-T5-Large text encoder, and transformer weights.
+Converted from [`MiniT2I/MiniT2I`](https://huggingface.co/MiniT2I/MiniT2I) using [MiniT2I-diffusers](https://github.com/Bili-Sakura/Visual-Generative-Foundation-Model-Collection/tree/main/libs/MiniT2I-diffusers) in [Visual-Generative-Foundation-Model-Collection](https://github.com/Bili-Sakura/Visual-Generative-Foundation-Model-Collection).
+## Available checkpoints
+| Subfolder | Model | Params (denoiser + text encoder) | Patch | Recommended CFG |
+| --- | --- | --- | ---: | ---: |
+| [`MiniT2I-B-16/`](MiniT2I-B-16/) | MiniT2I-B/16 | 258M + 341M | 16 | 2.5 |
+| [`MiniT2I-L-16/`](MiniT2I-L-16/) | MiniT2I-L/16 | 912M + 341M | 16 | 6.0 |
+## Repo layout
+```text
+BiliSakura/MiniT2I-diffusers/
+├── README.md
+├── MiniT2I-B-16/
+│   ├── pipeline.py
+│   ├── model_index.json
+│   ├── conversion_metadata.json
+│   ├── demo.png
+│   ├── scheduler/
+│   │   └── scheduler_config.json
+│   ├── text_encoder/
+│   ├── tokenizer/
+│   └── transformer/
+│       ├── config.json
+│       ├── diffusion_pytorch_model.safetensors
+│       └── transformer_minit2i.py
+└── MiniT2I-L-16/
+    └── ...
+```
+Each variant is self-contained: load it with `custom_pipeline` pointing at that variant's own `pipeline.py`, together with `trust_remote_code=True`. MiniT2I denoises directly in RGB pixel space (no VAE).
+## Demo
+![MiniT2I-B-16 demo](MiniT2I-B-16/demo.png)
+Prompt: *"A lonely astronaut standing on a quiet beach under two moons."* — MiniT2I-B/16 at 512×512, 100 steps, `guidance_scale=2.5`, seed 42.
+## Load from Hugging Face
+```python
+import torch
+from diffusers import DiffusionPipeline
+pipe = DiffusionPipeline.from_pretrained(
+    "BiliSakura/MiniT2I-diffusers/MiniT2I-B-16",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipe(
+    "A lonely astronaut standing on a quiet beach under two moons.",
+    num_inference_steps=100,
+    guidance_scale=2.5,
+    generator=generator,
+).images[0]
+image.save("demo.png")
+```
+For MiniT2I-L/16, use `MiniT2I-L-16` and `guidance_scale=6.0`.
+## Load from a local clone
+```python
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
+model_dir = Path("./MiniT2I-B-16").resolve()
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipe(
+    "A lonely astronaut standing on a quiet beach under two moons.",
+    num_inference_steps=100,
+    guidance_scale=2.5,
+    generator=generator,
+).images[0]
+image.save("demo.png")
+```
+Load a **variant subfolder** (e.g. `./MiniT2I-B-16`), not the repo root.
+## Recommended inference settings
+| Variant | Resolution | Steps | CFG scale | `torch_dtype` |
+| --- | --- | ---: | ---: | --- |
+| `MiniT2I-B-16` | 512×512 | 100 | 2.5 | `bfloat16` |
+| `MiniT2I-L-16` | 512×512 | 100 | 6.0 | `bfloat16` |
+For GenEval / DPG-Bench evaluation, upstream configs use `guidance_scale=5.0` for both B/16 and L/16.
+## Interface notes
+- Text conditioning uses bundled `google/flan-t5-large` (`T5EncoderModel` + `T5Tokenizer`).
+- Scheduler is `FlowMatchEulerDiscreteScheduler` with 1000 training timesteps and `shift=1.0`.
+- `guidance_scale > 1.0` enables classifier-free guidance with an empty-string null prompt.
+- Output resolution is fixed at 512×512 for these exports.
+## Regenerate bundles
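+From the root of the conversion source repository (linked under "Links" below; the script is not shipped in this model repository):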
+```bash
+conda activate rsgen
+python scripts/convert_minit2i_to_bilisakura.py
+```
+## Links
+- Blog: [MiniT2I: A Minimalist Baseline for Text-to-Image Generation](https://peppaking8.github.io/#/post/minit2i)
+- Upstream checkpoints: [MiniT2I/MiniT2I](https://huggingface.co/MiniT2I/MiniT2I)
+- PyTorch/Diffusers source: [MiniT2I-diffusers](https://github.com/Bili-Sakura/Visual-Generative-Foundation-Model-Collection/tree/main/libs/MiniT2I-diffusers)
+## Citation
+```bibtex
+@misc{minit2i2026,
+  title  = {MiniT2I: A Minimalist Baseline for Text-to-Image Generation},
+  author = {Wang, Xianbang and Zhao, Hanhong and Lu, Yiyang and Zhou, Kangyang and Ma, Linrui and He, Kaiming},
+  year   = {2026},
+  url    = {https://peppaking8.github.io/#/post/minit2i}
+}
+```