BiliSakura committed 2 days ago

Commit

4968e7f

verified ·

1 Parent(s): bd7a133

Upload folder using huggingface_hub

Browse files

Files changed (40) hide show

.gitignore +1 -0
PixelFlow-256/__pycache__/pipeline.cpython-312.pyc +0 -0
PixelFlow-256/demo.png +0 -0
PixelFlow-256/model_index.json +12 -0
PixelFlow-256/pipeline.py +489 -0
PixelFlow-256/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc +0 -0
PixelFlow-256/scheduler/scheduler_config.json +7 -0
PixelFlow-256/scheduler/scheduling_pixelflow.py +135 -0
PixelFlow-256/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc +0 -0
PixelFlow-256/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc +0 -0
PixelFlow-256/transformer/config.json +16 -0
PixelFlow-256/transformer/diffusion_pytorch_model.safetensors +3 -0
PixelFlow-256/transformer/modeling_pixelflow.py +448 -0
PixelFlow-256/transformer/transformer_pixelflow.py +85 -0
PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc +0 -0
PixelFlow-T2I/model_index.json +20 -0
PixelFlow-T2I/pipeline.py +405 -0
PixelFlow-T2I/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc +0 -0
PixelFlow-T2I/scheduler/scheduler_config.json +7 -0
PixelFlow-T2I/scheduler/scheduling_pixelflow.py +135 -0
PixelFlow-T2I/text_encoder/config.json +58 -0
PixelFlow-T2I/text_encoder/generation_config.json +7 -0
PixelFlow-T2I/text_encoder/model-00001-of-00002.safetensors +3 -0
PixelFlow-T2I/text_encoder/model-00002-of-00002.safetensors +3 -0
PixelFlow-T2I/text_encoder/model.safetensors.index.json +567 -0
PixelFlow-T2I/tokenizer/special_tokens_map.json +107 -0
PixelFlow-T2I/tokenizer/spiece.model +3 -0
PixelFlow-T2I/tokenizer/tokenizer.json +0 -0
PixelFlow-T2I/tokenizer/tokenizer_config.json +113 -0
PixelFlow-T2I/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc +0 -0
PixelFlow-T2I/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc +0 -0
PixelFlow-T2I/transformer/config.json +16 -0
PixelFlow-T2I/transformer/diffusion_pytorch_model.safetensors +3 -0
PixelFlow-T2I/transformer/modeling_pixelflow.py +448 -0
PixelFlow-T2I/transformer/transformer_pixelflow.py +85 -0
README.md +110 -0
labels/__pycache__/imagenet_labels.cpython-312.pyc +0 -0
labels/id2label_cn.json +1002 -0
labels/id2label_en.json +1002 -0
labels/imagenet_labels.py +61 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

PixelFlow-256/__pycache__/pipeline.cpython-312.pyc ADDED Viewed

Binary file (14 kB). View file

PixelFlow-256/demo.png ADDED Viewed

PixelFlow-256/model_index.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "_class_name": "PixelFlowPipeline",
+  "_diffusers_version": "0.36.0",
+  "scheduler": [
+    "scheduling_pixelflow",
+    "PixelFlowScheduler"
+  ],
+  "transformer": [
+    "transformer_pixelflow",
+    "PixelFlowTransformer2DModel"
+  ]
+}

PixelFlow-256/pipeline.py ADDED Viewed

	@@ -0,0 +1,489 @@

+"""Hub custom pipeline: PixelFlowPipeline.
+Load with native Hugging Face diffusers and `trust_remote_code=True`.
+"""
+from __future__ import annotations
+import importlib
+import math
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.embeddings import get_2d_rotary_pos_embed
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+@dataclass
+class PixelFlowPipelineOutput(BaseOutput):  # value returned by PixelFlowPipeline.__call__ when return_dict=True
+    images: Union[torch.Tensor, List, np.ndarray]  # generated images; concrete type depends on the requested output_type ("pt", "pil" or "np")
+class PixelFlowPipeline(DiffusionPipeline):
+    """Pipeline for PixelFlow pixel-space flow generation (class-conditional or text-to-image)."""
+    model_cpu_offload_seq = "text_encoder->transformer"
+    _optional_components = ["text_encoder", "tokenizer"]
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
+        """Load a self-contained variant folder locally or from the Hub."""
+        repo_root = Path(__file__).resolve().parent
+        if pretrained_model_name_or_path in (None, "", "."):
+            variant = repo_root
+        elif (
+            isinstance(pretrained_model_name_or_path, str)
+            and "/" in pretrained_model_name_or_path
+            and not Path(pretrained_model_name_or_path).exists()
+        ):
+            from huggingface_hub import snapshot_download
+            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
+            if subfolder:
+                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
+            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
+            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
+        else:
+            variant = Path(pretrained_model_name_or_path)
+            if not variant.is_absolute():
+                candidate = (Path.cwd() / variant).resolve()
+                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
+            if subfolder:
+                variant = variant / subfolder
+        model_kwargs = dict(kwargs)
+        inserted: List[str] = []
+        def _load_component(folder: str, module_name: str, class_name: str):
+            comp_dir = variant / folder
+            module_path = comp_dir / f"{module_name}.py"
+            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
+            if not module_path.exists() or not has_weights:
+                return None
+            comp_path = str(comp_dir)
+            if comp_path not in sys.path:
+                sys.path.insert(0, comp_path)
+                inserted.append(comp_path)
+            module = importlib.import_module(module_name)
+            component_cls = getattr(module, class_name)
+            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
+        def _load_text_components():
+            text_encoder = None
+            tokenizer = None
+            te_dir = variant / "text_encoder"
+            tok_dir = variant / "tokenizer"
+            if te_dir.exists() and (te_dir / "config.json").exists():
+                from transformers import T5EncoderModel, T5Tokenizer
+                text_encoder = T5EncoderModel.from_pretrained(str(te_dir), **model_kwargs)
+                tokenizer = T5Tokenizer.from_pretrained(str(tok_dir))
+            return text_encoder, tokenizer
+        try:
+            transformer = _load_component("transformer", "transformer_pixelflow", "PixelFlowTransformer2DModel")
+            scheduler = _load_component("scheduler", "scheduling_pixelflow", "PixelFlowScheduler")
+            text_encoder, tokenizer = _load_text_components()
+            if scheduler is None:
+                sched_dir = variant / "scheduler"
+                if (sched_dir / "scheduling_pixelflow.py").exists():
+                    sched_path = str(sched_dir)
+                    if sched_path not in sys.path:
+                        sys.path.insert(0, sched_path)
+                        inserted.append(sched_path)
+                    scheduler = importlib.import_module("scheduling_pixelflow").PixelFlowScheduler()
+            if transformer is None:
+                raise ValueError(f"No loadable transformer found under {variant}")
+            id2label = None
+            id2label_cn = None
+            labels_dir = variant.parent / "labels"
+            if labels_dir.is_dir():
+                labels_path = str(labels_dir)
+                if labels_path not in sys.path:
+                    sys.path.insert(0, labels_path)
+                    inserted.append(labels_path)
+                from imagenet_labels import load_id2label
+                id2label = load_id2label(labels_dir, lang="en")
+                id2label_cn = load_id2label(labels_dir, lang="cn")
+            return cls(
+                transformer=transformer,
+                scheduler=scheduler,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                id2label=id2label,
+                id2label_cn=id2label_cn,
+            )
+        finally:
+            for comp_path in inserted:
+                if comp_path in sys.path:
+                    sys.path.remove(comp_path)
+    def __init__(
+        self,
+        transformer,
+        scheduler,
+        text_encoder=None,
+        tokenizer=None,
+        max_token_length: int = 512,
+        id2label: Optional[dict[int, str]] = None,
+        id2label_cn: Optional[dict[int, str]] = None,
+    ):
+        super().__init__()
+        self.register_modules(  # text_encoder / tokenizer may be None (listed in _optional_components)
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)  # used by __call__ to post-process "pil"/"np" outputs
+        self.class_cond = transformer.config.num_classes > 0  # True selects the class-label path in __call__, False the text-prompt path
+        self.max_token_length = max_token_length  # default tokenizer max_length in encode_prompt
+        self._id2label = id2label or {}  # None is stored as an empty dict
+        self._id2label_cn = id2label_cn or {}
+        self.labels = self._build_label2id(self._id2label)  # English synonym -> class id
+        self.labels_cn = self._build_label2id(self._id2label_cn)  # Chinese synonym -> class id
+    @staticmethod
+    def _build_label2id(id2label: dict[int, str]) -> dict[str, int]:  # invert an id -> "syn1, syn2, ..." mapping into synonym -> id
+        label2id: dict[str, int] = {}
+        for class_id, value in id2label.items():
+            for synonym in value.split(","):  # each value is a comma-separated list of synonyms
+                synonym = synonym.strip()
+                if synonym:  # skip empty fragments such as those produced by a trailing comma
+                    label2id[synonym] = int(class_id)  # a synonym shared by several classes keeps the last class id seen
+        return dict(sorted(label2id.items()))  # keys in sorted order
+    @property
+    def id2label(self) -> dict[int, str]:
+        """ImageNet class id to English label string (comma-separated synonyms)."""
+        return self._id2label  # the dict stored by __init__ (empty when no mapping was supplied)
+    @property
+    def id2label_cn(self) -> dict[int, str]:
+        """ImageNet class id to Chinese label string (comma-separated synonyms)."""
+        return self._id2label_cn  # the dict stored by __init__ (empty when no mapping was supplied)
+    def get_label_ids(self, label: Union[str, List[str]], lang: str = "en") -> List[int]:
+        r"""
+        Map ImageNet label strings to class ids.
+        Args:
+            label (`str` or `list[str]`):
+                One or more label strings. Each string must match a synonym in `id2label` (English)
+                or `id2label_cn` (Chinese).
+            lang (`str`, *optional*, defaults to `"en"`):
+                `"en"` uses English synonyms; `"cn"` uses Chinese synonyms.
+        Returns:
+            `list[int]`: Class ids for [`~PixelFlowPipeline.__call__`].
+        """
+        if lang not in ("en", "cn"):
+            raise ValueError(f"`lang` must be 'en' or 'cn', got {lang!r}.")
+        label2id = self.labels if lang == "en" else self.labels_cn
+        if not label2id:  # empty when no id2label mapping was passed to __init__
+            raise ValueError(
+                f"No {lang} labels loaded. Ensure `labels/id2label_{lang}.json` exists next to the variant folder."
+            )
+        if isinstance(label, str):
+            label = [label]  # treat a single string as a one-element list
+        missing = [item for item in label if item not in label2id]  # exact match only: no case folding or trimming
+        if missing:
+            preview = ", ".join(list(label2id.keys())[:8])  # show a few valid labels in the error message
+            raise ValueError(
+                f"Unknown label(s) for lang={lang!r}: {missing}. Example valid labels: {preview}, ..."
+            )
+        return [label2id[item] for item in label]  # ids in the same order as the input labels
+    def _normalize_class_labels(
+        self,
+        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
+    ) -> Optional[Union[int, List[int], torch.Tensor]]:
+        if class_labels is None:
+            return None
+        if isinstance(class_labels, str):
+            return self.get_label_ids(class_labels)[0]
+        if isinstance(class_labels, list) and class_labels and isinstance(class_labels[0], str):
+            if all(label in self.labels for label in class_labels):
+                return self.get_label_ids(class_labels, lang="en")
+            if all(label in self.labels_cn for label in class_labels):
+                return self.get_label_ids(class_labels, lang="cn")
+            raise ValueError(
+                "Could not resolve string `class_labels`. Use English synonyms from `pipe.labels` "
+                "or Chinese synonyms from `pipe.labels_cn`."
+            )
+        return class_labels
+    def sample_block_noise(self, bs, ch, height, width, eps=1e-6):  # noise of shape (bs, ch, height, width), correlated inside each 2x2 pixel block
+        gamma = self.scheduler.gamma  # off-diagonal covariance between the 4 pixels of one block
+        dist = torch.distributions.multivariate_normal.MultivariateNormal(
+            torch.zeros(4),
+            torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),  # 1 + eps on the diagonal, gamma elsewhere
+        )
+        block_number = bs * ch * (height // 2) * (width // 2)  # number of 2x2 blocks; odd height/width are floored
+        noise = torch.stack([dist.sample() for _ in range(block_number)])  # NOTE(review): one Python-level draw per block from the global RNG; no `generator` is used here
+        noise = rearrange(  # lay each 4-vector out as a 2x2 patch (p rows, q cols) of the output image
+            noise,
+            "(b c h w) (p q) -> b c (h p) (w q)",
+            b=bs,
+            c=ch,
+            h=height // 2,
+            w=width // 2,
+            p=2,
+            q=2,
+        )
+        return noise
+    def _stage_guidance_scale(self, stage_idx: int) -> float:  # guidance scale applied during stage `stage_idx`
+        if not self.class_cond:
+            return self._guidance_scale_value  # text-conditional checkpoints use the same scale in every stage
+        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}  # per-stage fraction; only stage indices 0-3 are defined, other values raise KeyError
+        return (self._guidance_scale_value - 1) * scale_dict[stage_idx] + 1  # interpolates from 1 (stage 0) to the full scale (stage 3)
+    @property
+    def do_classifier_free_guidance(self) -> bool:  # reads the scale stored by the most recent __call__
+        return self._guidance_scale_value > 0  # NOTE(review): threshold is 0 here, while __call__ additionally requires > 1.0 before asking encode_prompt for negative embeddings
+    @torch.no_grad()
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Union[str, List[str]] = "",
+        max_length: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.text_encoder is None or self.tokenizer is None:
+            raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        batch_size = len(prompt)
+        max_length = max_length or self.max_token_length
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(device)
+        prompt_attention_mask = text_inputs.attention_mask.to(device)
+        prompt_embeds = self.text_encoder(
+            text_input_ids,
+            attention_mask=prompt_attention_mask,
+        )[0]
+        dtype = self.text_encoder.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
+        if do_classifier_free_guidance:
+            if isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif isinstance(negative_prompt, list):
+                if len(negative_prompt) != batch_size:
+                    raise ValueError(
+                        f"Negative prompt list length ({len(negative_prompt)}) must match prompt batch ({batch_size})."
+                    )
+                uncond_tokens = negative_prompt
+            else:
+                raise ValueError("Negative prompt must be a string or list of strings.")
+            uncond_inputs = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=prompt_embeds.shape[1],
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_input_ids = uncond_inputs.input_ids.to(device)
+            negative_prompt_attention_mask = uncond_inputs.attention_mask.to(device)
+            negative_prompt_embeds = self.text_encoder(
+                negative_input_ids,
+                attention_mask=negative_prompt_attention_mask,
+            )[0]
+            seq_len_neg = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len_neg, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+        return prompt_embeds, prompt_attention_mask
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Union[int, List[int]] = 10,
+        guidance_scale: float = 4.0,
+        shift: float = 1.0,
+        negative_prompt: Union[str, List[str]] = "",
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[PixelFlowPipelineOutput, Tuple]:
+        if height is None:
+            height = int(self.transformer.config.sample_size)
+        if width is None:
+            width = int(self.transformer.config.sample_size)
+        device = self._execution_device
+        self._guidance_scale_value = guidance_scale
+        if isinstance(num_inference_steps, int):
+            num_inference_steps = [num_inference_steps] * self.scheduler.num_stages
+        prompt_attention_mask = None
+        if self.class_cond:
+            if class_labels is None:
+                raise ValueError("`class_labels` are required for class-conditional PixelFlow checkpoints.")
+            class_labels = self._normalize_class_labels(class_labels)
+            if isinstance(class_labels, int):
+                class_labels = [class_labels]
+            if not torch.is_tensor(class_labels):
+                class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
+            else:
+                class_labels = class_labels.to(device=device, dtype=torch.long)
+            batch_size = class_labels.shape[0]
+            prompt_embeds = class_labels
+            negative_prompt_embeds = torch.full_like(prompt_embeds, self.transformer.config.num_classes)
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        else:
+            if prompt is None:
+                raise ValueError("`prompt` is required for text-to-image PixelFlow checkpoints.")
+            if isinstance(prompt, str):
+                prompt = [prompt]
+            batch_size = len(prompt)
+            prompt_embeds, prompt_attention_mask = self.encode_prompt(
+                prompt,
+                device,
+                num_images_per_prompt=num_images_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance and guidance_scale > 1.0,
+                negative_prompt=negative_prompt,
+            )
+        init_factor = 2 ** (self.scheduler.num_stages - 1)
+        height, width = height // init_factor, width // init_factor
+        latents = randn_tensor(
+            (batch_size * num_images_per_prompt, 3, height, width),
+            generator=generator,
+            device=device,
+            dtype=torch.float32,
+        )
+        for stage_idx in range(self.scheduler.num_stages):
+            self.scheduler.set_timesteps(num_inference_steps[stage_idx], stage_idx, device=device, shift=shift)
+            timesteps = self.scheduler.Timesteps
+            if stage_idx > 0:
+                height, width = height * 2, width * 2
+                latents = F.interpolate(latents, size=(height, width), mode="nearest")
+                original_start_t = self.scheduler.original_start_t[stage_idx]
+                gamma = self.scheduler.gamma
+                alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
+                beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
+                noise = self.sample_block_noise(*latents.shape)
+                noise = noise.to(device=device, dtype=latents.dtype)
+                latents = alpha * latents + beta * noise
+            size_tensor = torch.tensor([latents.shape[-1] // self.transformer.patch_size], dtype=torch.int32, device=device)
+            pos_embed = get_2d_rotary_pos_embed(
+                embed_dim=self.transformer.attention_head_dim,
+                crops_coords=((0, 0), (latents.shape[-1] // self.transformer.patch_size, latents.shape[-1] // self.transformer.patch_size)),
+                grid_size=(latents.shape[-1] // self.transformer.patch_size, latents.shape[-1] // self.transformer.patch_size),
+                device=device,
+                output_type="pt",
+            )
+            rope_pos = torch.stack(pos_embed, -1)
+            autocast_enabled = device.type == "cuda"
+            autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
+            for timestep in timesteps:
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
+                    if self.class_cond:
+                        noise_pred = self.transformer(
+                            latent_model_input,
+                            timestep=timestep_batch,
+                            class_labels=prompt_embeds,
+                            latent_size=size_tensor,
+                            pos_embed=rope_pos,
+                        ).sample
+                    else:
+                        noise_pred = self.transformer(
+                            latent_model_input,
+                            encoder_hidden_states=prompt_embeds,
+                            encoder_attention_mask=prompt_attention_mask,
+                            timestep=timestep_batch,
+                            latent_size=size_tensor,
+                            pos_embed=rope_pos,
+                        ).sample
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self._stage_guidance_scale(stage_idx) * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
+        image = (latents / 2 + 0.5).clamp(0, 1)
+        if output_type == "pt":
+            pass
+        elif output_type in ("pil", "np"):
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        else:
+            raise ValueError(f"Unsupported output_type: {output_type}")
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return PixelFlowPipelineOutput(images=image)

PixelFlow-256/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc ADDED Viewed

Binary file (7.76 kB). View file

PixelFlow-256/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_class_name": "PixelFlowScheduler",
+  "_diffusers_version": "0.36.0",
+  "gamma": -0.3333333333333333,
+  "num_stages": 4,
+  "num_train_timesteps": 1000
+}

PixelFlow-256/scheduler/scheduling_pixelflow.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+from diffusers.utils import BaseOutput
+def cal_rectify_ratio(start_t, gamma):  # factor the scheduler multiplies a stage's start time by (stages > 0)
+    return 1 / (math.sqrt(1 - (1 / gamma)) * (1 - start_t) + start_t)  # equals 1 when start_t == 1; gamma must not be 0 and must keep 1 - 1/gamma >= 0
+@dataclass
+class PixelFlowSchedulerOutput(BaseOutput):  # value returned by PixelFlowScheduler.step when return_dict=True
+    prev_sample: torch.FloatTensor  # sample after one scheduler step
+class PixelFlowScheduler(SchedulerMixin, ConfigMixin):
+    """Cascade flow scheduler for PixelFlow multi-stage pixel-space generation."""
+    order = 1
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        num_stages: int = 4,
+        gamma: float = -1 / 3,
+    ):
+        assert num_stages > 0, f"num_stages must be positive, got {num_stages}"  # NOTE(review): asserts are skipped under `python -O`
+        self.num_stages = num_stages
+        self.gamma = gamma
+        self.Timesteps = torch.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=torch.float32)  # 0, 1, ..., num_train_timesteps - 1
+        self.t = self.Timesteps / num_train_timesteps  # same grid scaled into [0, 1)
+        self.stage_range = [x / num_stages for x in range(num_stages + 1)]  # equally spaced stage boundaries 0, 1/S, ..., 1
+        self.original_start_t = {}  # stage -> start time before rectification
+        self.start_t, self.end_t = {}, {}  # stage -> (rectified) start time / end time
+        self.t_window_per_stage = {}  # stage -> normalized time grid inside the stage
+        self.Timesteps_per_stage = {}  # stage -> grid of Timesteps values spanned by the stage
+        stage_distance = []  # end_t - start_t of every stage
+        for stage_idx in range(num_stages):
+            start_idx = max(int(num_train_timesteps * self.stage_range[stage_idx]), 0)
+            end_idx = min(int(num_train_timesteps * self.stage_range[stage_idx + 1]), num_train_timesteps)
+            start_t = self.t[start_idx].item()
+            end_t = self.t[end_idx].item() if end_idx < num_train_timesteps else 1.0  # self.t has no entry at index num_train_timesteps, so the last stage ends at 1.0
+            self.original_start_t[stage_idx] = start_t
+            if stage_idx > 0:
+                start_t *= cal_rectify_ratio(start_t, gamma)  # stage 0 keeps its start time unchanged
+            self.start_t[stage_idx] = start_t
+            self.end_t[stage_idx] = end_t
+            stage_distance.append(end_t - start_t)
+        total_stage_distance = sum(stage_distance)
+        t_within_stage = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float64)[:-1]  # [0, 1) grid, endpoint 1 dropped
+        for stage_idx in range(num_stages):
+            start_ratio = 0.0 if stage_idx == 0 else sum(stage_distance[:stage_idx]) / total_stage_distance  # cumulative share of the stages before this one
+            end_ratio = 1.0 if stage_idx == num_stages - 1 else sum(stage_distance[:stage_idx + 1]) / total_stage_distance
+            Timestep_start = self.Timesteps[int(num_train_timesteps * start_ratio)]
+            Timestep_end = self.Timesteps[min(int(num_train_timesteps * end_ratio), num_train_timesteps - 1)]  # index clamped to the last valid entry
+            self.t_window_per_stage[stage_idx] = t_within_stage  # every stage shares the same normalized window
+            if stage_idx == num_stages - 1:
+                self.Timesteps_per_stage[stage_idx] = torch.linspace(  # last stage keeps its end point
+                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps, dtype=torch.float64
+                )
+            else:
+                self.Timesteps_per_stage[stage_idx] = torch.linspace(  # earlier stages drop their end point
+                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps + 1, dtype=torch.float64
+                )[:-1]
+        self._step_index = None
+        self.Timesteps = None  # cleared here; populated again by set_timesteps
+    @staticmethod
+    def time_linear_to_Timesteps(t, t_start, t_end, T_start, T_end):  # linear map sending t_start -> T_start and t_end -> T_end
+        k = (T_end - T_start) / (t_end - t_start)  # slope
+        b = T_start - t_start * k  # intercept
+        return k * t + b
+    def set_timesteps(self, num_inference_steps, stage_index, device=None, shift=1.0):  # prepare self.Timesteps and self.t for one stage
+        self.num_inference_steps = num_inference_steps
+        self._step_index = None  # restart step counting for the new stage
+        stage_T_start = self.Timesteps_per_stage[stage_index][0].item()
+        stage_T_end = self.Timesteps_per_stage[stage_index][-1].item()
+        t_start = self.t_window_per_stage[stage_index][0].item()
+        t_end = self.t_window_per_stage[stage_index][-1].item()
+        t = np.linspace(t_start, t_end, num_inference_steps, dtype=np.float64)
+        t = t / (shift + (1 - shift) * t)  # identity when shift == 1.0
+        Timesteps = self.time_linear_to_Timesteps(t, t_start, t_end, stage_T_start, stage_T_end)
+        self.Timesteps = torch.from_numpy(Timesteps).to(device=device)  # values the pipeline passes to the model as `timestep`
+        self.t = torch.from_numpy(np.append(t, 1.0)).to(device=device, dtype=torch.float64)  # trailing 1.0 lets step() read t[i + 1] at the last step
+    def step(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[PixelFlowSchedulerOutput, SchedulerOutput, Tuple[torch.Tensor, ...]]:
+        if self._step_index is None:
+            self._step_index = 0  # first step after set_timesteps
+        sample = sample.to(torch.float32)
+        t = self.t[self._step_index].float()
+        t_next = self.t[self._step_index + 1].float()
+        prev_sample = sample + (t_next - t) * model_output  # move the sample along model_output by the time increment
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample.to(model_output.dtype),)
+        return PixelFlowSchedulerOutput(prev_sample=prev_sample.to(model_output.dtype))  # result is cast back to the dtype of model_output
+    @property
+    def step_index(self) -> Optional[int]:
+        return self._step_index  # None until the first step() after set_timesteps

PixelFlow-256/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc ADDED Viewed

Binary file (24.1 kB). View file

PixelFlow-256/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc ADDED Viewed

Binary file (3.84 kB). View file

PixelFlow-256/transformer/config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_class_name": "PixelFlowTransformer2DModel",
+  "_diffusers_version": "0.36.0",
+  "attention_bias": true,
+  "attention_head_dim": 72,
+  "cross_attention_dim": null,
+  "depth": 28,
+  "dropout": 0.0,
+  "in_channels": 3,
+  "init_weights": false,
+  "num_attention_heads": 16,
+  "num_classes": 1000,
+  "out_channels": 3,
+  "patch_size": 4,
+  "sample_size": 256
+}

PixelFlow-256/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6524ee52cec42041ff72b19d8b606f1c6196cbd9c623202d57af908280b3703
+size 2706502480

PixelFlow-256/transformer/modeling_pixelflow.py ADDED Viewed

	@@ -0,0 +1,448 @@

+from typing import Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import warnings
+from diffusers.models.embeddings import LabelEmbedding, TimestepEmbedding, Timesteps
+try:
+    from flash_attn import flash_attn_varlen_func
+except ImportError:
+    warnings.warn("`flash-attn` is not installed. Training mode may not work properly.", UserWarning)
+    flash_attn_varlen_func = None
+def apply_rotary_emb(  # apply rotary position embedding to x using stacked (cos, sin) frequencies
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+) -> Tuple[torch.Tensor, torch.Tensor]:  # NOTE(review): a single tensor is returned, not a tuple, and a tuple `freqs_cis` has no `.unbind`
+    cos, sin = freqs_cis.unbind(-1)  # last axis of freqs_cis holds (cos, sin)
+    cos = cos[None, None]  # add two leading broadcast dims
+    sin = sin[None, None]
+    cos, sin = cos.to(x.device), sin.to(x.device)
+    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # split the last dim into consecutive (real, imag) pairs
+    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)  # flatten(3) assumes x is 4-D — TODO confirm layout with caller
+    out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)  # computed in float32, cast back to the input dtype
+    return out
+class PatchEmbed(nn.Module):  # patch embedding backed by a strided Conv2d
+    def __init__(self, patch_size, in_channels, embed_dim, bias=True):
+        super().__init__()
+        self.proj = nn.Conv2d(in_channels, embed_dim, patch_size, patch_size, bias=bias)  # kernel size == stride == patch_size (non-overlapping patches)
+    def forward_unfold(self, x):  # presumably x already holds flattened patches in its last dim — TODO confirm with caller
+        out_unfold = x.matmul(self.proj.weight.view(self.proj.weight.size(0), -1).t())  # conv weight flattened to (embed_dim, -1), used as a linear layer
+        if self.proj.bias is not None:
+            out_unfold += self.proj.bias.to(out_unfold.dtype)
+        return out_unfold
+    def forward(self, x):
+        if self.training:
+            return self.forward_unfold(x)  # training mode takes the matmul path
+        out = self.proj(x)
+        out = out.flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, H*W, C)
+        return out
+class AdaLayerNorm(nn.Module):
+    def __init__(self, embedding_dim):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
+    def forward(self, x, timestep, seqlen_list=None):
+        input_dtype = x.dtype
+        emb = self.linear(self.silu(timestep))
+        if seqlen_list is not None:
+            emb = torch.cat([one_emb[None].expand(repeat_time, -1) for one_emb, repeat_time in zip(emb, seqlen_list)])
+        else:
+            emb = emb.unsqueeze(1)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.float().chunk(6, dim=-1)
+        x = self.norm(x).float() * (1 + scale_msa) + shift_msa
+        return x.to(input_dtype), gate_msa, shift_mlp, scale_mlp, gate_mlp
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, inner_dim=None, bias=True):
+        super().__init__()
+        inner_dim = int(dim * mult) if inner_dim is None else inner_dim
+        dim_out = dim_out if dim_out is not None else dim
+        self.fc1 = nn.Linear(dim, inner_dim, bias=bias)
+        self.fc2 = nn.Linear(inner_dim, dim_out, bias=bias)
+    def forward(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = F.gelu(hidden_states, approximate="tanh")
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x):
+        output = x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
+        return (self.weight * output).to(x.dtype)
+class Attention(nn.Module):
+    def __init__(self, q_dim, kv_dim=None, heads=8, head_dim=64, dropout=0.0, bias=False):
+        super().__init__()
+        self.q_dim = q_dim
+        self.kv_dim = kv_dim if kv_dim is not None else q_dim
+        self.inner_dim = head_dim * heads
+        self.dropout = dropout
+        self.head_dim = head_dim
+        self.num_heads = heads
+        self.q_proj = nn.Linear(self.q_dim, self.inner_dim, bias=bias)
+        self.k_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
+        self.v_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
+        self.o_proj = nn.Linear(self.inner_dim, self.q_dim, bias=bias)
+        self.q_norm = RMSNorm(self.inner_dim)
+        self.k_norm = RMSNorm(self.inner_dim)
+    def prepare_attention_mask(self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3):
+        head_size = self.num_heads
+        if attention_mask is None:
+            return attention_mask
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
+            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+        return attention_mask
+    def forward(
+        self,
+        inputs_q,
+        inputs_kv,
+        attention_mask=None,
+        cross_attention=False,
+        rope_pos_embed=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        max_seqlen_q=None,
+        max_seqlen_k=None,
+    ):
+        inputs_kv = inputs_q if inputs_kv is None else inputs_kv
+        query_states = self.q_proj(inputs_q)
+        key_states = self.k_proj(inputs_kv)
+        value_states = self.v_proj(inputs_kv)
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+        if max_seqlen_q is None:
+            assert not self.training, "PixelFlow needs sequence packing for training"
+            bsz, q_len, _ = inputs_q.shape
+            _, kv_len, _ = inputs_kv.shape
+            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+            key_states = key_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+            query_states = apply_rotary_emb(query_states, rope_pos_embed)
+            if not cross_attention:
+                key_states = apply_rotary_emb(key_states, rope_pos_embed)
+            if attention_mask is not None:
+                attention_mask = self.prepare_attention_mask(attention_mask, kv_len, bsz)
+                attention_mask = attention_mask.view(bsz, self.num_heads, -1, attention_mask.shape[-1])
+            attn_output = F.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                dropout_p=self.dropout if self.training else 0.0,
+                is_causal=False,
+            )
+            attn_output = attn_output.transpose(1, 2).contiguous()
+            attn_output = attn_output.view(bsz, q_len, self.inner_dim)
+            attn_output = self.o_proj(attn_output)
+            return attn_output
+        query_states = query_states.view(-1, self.num_heads, self.head_dim)
+        key_states = key_states.view(-1, self.num_heads, self.head_dim)
+        value_states = value_states.view(-1, self.num_heads, self.head_dim)
+        query_states = apply_rotary_emb(query_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
+        if not cross_attention:
+            key_states = apply_rotary_emb(key_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
+        attn_output = flash_attn_varlen_func(
+            query_states,
+            key_states,
+            value_states,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
+        )
+        attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
+        attn_output = self.o_proj(attn_output)
+        return attn_output
+class TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_attention_heads,
+        attention_head_dim,
+        dropout=0.0,
+        cross_attention_dim=None,
+        attention_bias=False,
+    ):
+        super().__init__()
+        self.norm1 = AdaLayerNorm(dim)
+        self.attn1 = Attention(
+            q_dim=dim,
+            kv_dim=None,
+            heads=num_attention_heads,
+            head_dim=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+        )
+        if cross_attention_dim is not None:
+            self.norm2 = RMSNorm(dim, eps=1e-6)
+            self.attn2 = Attention(
+                q_dim=dim,
+                kv_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                head_dim=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+            )
+        else:
+            self.attn2 = None
+        self.norm3 = RMSNorm(dim, eps=1e-6)
+        self.mlp = FeedForward(dim)
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        timestep=None,
+        rope_pos_embed=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        seqlen_list_q=None,
+        seqlen_list_k=None,
+    ):
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, timestep, seqlen_list_q)
+        attn_output = self.attn1(
+            inputs_q=norm_hidden_states,
+            inputs_kv=None,
+            attention_mask=None,
+            cross_attention=False,
+            rope_pos_embed=rope_pos_embed,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_q,
+            max_seqlen_q=max(seqlen_list_q) if seqlen_list_q is not None else None,
+            max_seqlen_k=max(seqlen_list_q) if seqlen_list_q is not None else None,
+        )
+        attn_output = (gate_msa * attn_output.float()).to(attn_output.dtype)
+        hidden_states = attn_output + hidden_states
+        if self.attn2 is not None:
+            norm_hidden_states = self.norm2(hidden_states)
+            attn_output = self.attn2(
+                inputs_q=norm_hidden_states,
+                inputs_kv=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                cross_attention=True,
+                rope_pos_embed=rope_pos_embed,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max(seqlen_list_q) if seqlen_list_q is not None else None,
+                max_seqlen_k=max(seqlen_list_k) if seqlen_list_k is not None else None,
+            )
+            hidden_states = hidden_states + attn_output
+        norm_hidden_states = self.norm3(hidden_states)
+        norm_hidden_states = (norm_hidden_states.float() * (1 + scale_mlp) + shift_mlp).to(norm_hidden_states.dtype)
+        ff_output = self.mlp(norm_hidden_states)
+        ff_output = (gate_mlp * ff_output.float()).to(ff_output.dtype)
+        hidden_states = ff_output + hidden_states
+        return hidden_states
+class PixelFlowModel(torch.nn.Module):
+    """Transformer backbone mapping pixel patches plus conditioning to per-patch outputs.
+    Patches are embedded with ``PatchEmbed``; a single conditioning vector is built from
+    the timestep, the ``latent_size`` value and (when ``num_classes > 0``) a class label;
+    ``depth`` ``TransformerBlock`` layers are applied; each token is finally projected to
+    ``patch_size * patch_size * out_channels`` values.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        num_attention_heads,
+        attention_head_dim,
+        depth,
+        patch_size,
+        dropout=0.0,
+        cross_attention_dim=None,
+        attention_bias=True,
+        num_classes=0,
+        init_weights=True,
+    ):
+        """Build the embedders, the transformer stack and the output head.
+        Args:
+            in_channels: Channel count of the input handed to the patch embedding.
+            out_channels: Channel count of the output.
+            num_attention_heads: Heads per attention module.
+            attention_head_dim: Feature size per head; width is heads * head_dim.
+            depth: Number of transformer blocks.
+            patch_size: Side length of a square patch.
+            dropout: Dropout value forwarded to every block.
+            cross_attention_dim: Width of ``encoder_hidden_states``; ``None`` disables
+                cross-attention in the blocks.
+            attention_bias: Whether attention projections carry a bias.
+            num_classes: Number of class labels; ``0`` disables the class embedder.
+            init_weights: Run ``initialize_from_scratch`` after construction.
+        """
+        super().__init__()
+        self.patch_size = patch_size
+        self.attention_head_dim = attention_head_dim
+        self.num_classes = num_classes
+        self.out_channels = out_channels
+        embed_dim = num_attention_heads * attention_head_dim
+        self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)
+        # One sinusoidal projection is shared by the timestep and latent-size embedders (see forward).
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
+        self.latent_size_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
+        if self.num_classes > 0:
+            # NOTE(review): dropout_prob=0.1 is presumably label dropout for classifier-free
+            # guidance as implemented by diffusers' LabelEmbedding — confirm against diffusers.
+            self.class_embedder = LabelEmbedding(num_classes, embed_dim, dropout_prob=0.1)
+        self.transformer_blocks = nn.ModuleList(
+            [
+                TransformerBlock(
+                    embed_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout,
+                    cross_attention_dim,
+                    attention_bias,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.norm_out = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+        # proj_out_1 predicts the final (shift, scale); proj_out_2 maps tokens to patch values.
+        self.proj_out_1 = nn.Linear(embed_dim, 2 * embed_dim)
+        self.proj_out_2 = nn.Linear(embed_dim, patch_size * patch_size * out_channels)
+        if init_weights:
+            self.initialize_from_scratch()
+    def initialize_from_scratch(self):
+        """Initialise all weights for training from scratch.
+        Every ``nn.Linear`` first gets Xavier-uniform weights and zero bias; the patch
+        embedding, the conditioning embedders, the adaptive-norm projections and the
+        output head are then overridden as shown below.
+        """
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialise the conv kernel through a 2-D view of its data, as if it were a linear layer.
+        w = self.patch_embed.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.patch_embed.proj.bias, 0)
+        nn.init.normal_(self.timestep_embedder.linear_1.weight, std=0.02)
+        nn.init.normal_(self.timestep_embedder.linear_2.weight, std=0.02)
+        nn.init.normal_(self.latent_size_embedder.linear_1.weight, std=0.02)
+        nn.init.normal_(self.latent_size_embedder.linear_2.weight, std=0.02)
+        if self.num_classes > 0:
+            nn.init.normal_(self.class_embedder.embedding_table.weight, std=0.02)
+        # Zeroing the modulation projection makes every shift, scale and gate start at 0,
+        # so the gated self-attention and MLP branches contribute nothing initially.
+        for block in self.transformer_blocks:
+            nn.init.constant_(block.norm1.linear.weight, 0)
+            nn.init.constant_(block.norm1.linear.bias, 0)
+        # The output head is zeroed as well, so the initial prediction is all zeros.
+        nn.init.constant_(self.proj_out_1.weight, 0)
+        nn.init.constant_(self.proj_out_1.bias, 0)
+        nn.init.constant_(self.proj_out_2.weight, 0)
+        nn.init.constant_(self.proj_out_2.bias, 0)
+    def forward(
+        self,
+        hidden_states,
+        encoder_hidden_states=None,
+        class_labels=None,
+        timestep=None,
+        latent_size=None,
+        encoder_attention_mask=None,
+        pos_embed=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        seqlen_list_q=None,
+        seqlen_list_k=None,
+    ):
+        """Predict the model output for ``hidden_states``.
+        Args:
+            hidden_states: Input pixels (or, in training mode, the input consumed by
+                ``PatchEmbed.forward_unfold``).
+            encoder_hidden_states: Optional cross-attention context.
+            class_labels: Class indices; used only when ``num_classes > 0``.
+            timestep: Timestep values fed through ``time_proj``.
+            latent_size: Size values fed through the same ``time_proj``.
+            encoder_attention_mask: Optional mask; a 2-D mask is turned into an
+                additive bias with a singleton axis inserted at dim 1.
+            pos_embed: Rotary table forwarded to every block as ``rope_pos_embed``.
+            cu_seqlens_q, cu_seqlens_k, seqlen_list_q, seqlen_list_k: Sequence-packing
+                metadata forwarded to every block.
+        Returns:
+            In training mode a 2-D tensor with one flattened patch per row; otherwise
+            an image-shaped tensor ``(N, out_channels, H, W)``.
+        """
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            # Convert a keep-mask (1 = attend) into an additive bias: 0 where kept, -10000 where masked.
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # Remember the spatial size before patchification; it is needed to un-patchify at the end.
+        orig_height, orig_width = hidden_states.shape[-2], hidden_states.shape[-1]
+        hidden_states = hidden_states.to(torch.float32)
+        hidden_states = self.patch_embed(hidden_states)
+        timesteps_proj = self.time_proj(timestep)
+        conditioning = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))
+        if self.num_classes > 0:
+            class_embed = self.class_embedder(class_labels)
+            conditioning += class_embed
+        latent_size_proj = self.time_proj(latent_size)
+        latent_size_embed = self.latent_size_embedder(latent_size_proj.to(dtype=hidden_states.dtype))
+        conditioning += latent_size_embed
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                timestep=conditioning,
+                rope_pos_embed=pos_embed,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                seqlen_list_q=seqlen_list_q,
+                seqlen_list_k=seqlen_list_k,
+            )
+        # Final adaptive modulation predicted from the same conditioning vector.
+        shift, scale = self.proj_out_1(F.silu(conditioning)).float().chunk(2, dim=1)
+        if seqlen_list_q is None:
+            shift = shift.unsqueeze(1)
+            scale = scale.unsqueeze(1)
+        else:
+            # Packed sequences: repeat each sample's row once per token of that sample.
+            shift = torch.cat([shift_i[None].expand(ri, -1) for shift_i, ri in zip(shift, seqlen_list_q)])
+            scale = torch.cat([scale_i[None].expand(ri, -1) for scale_i, ri in zip(scale, seqlen_list_q)])
+        hidden_states = (self.norm_out(hidden_states).float() * (1 + scale) + shift).to(hidden_states.dtype)
+        hidden_states = self.proj_out_2(hidden_states)
+        if self.training:
+            # Each row is one patch: (p, p, C) -> (C, p, p) -> flat vector.
+            hidden_states = hidden_states.reshape(hidden_states.shape[0], self.patch_size, self.patch_size, self.out_channels)
+            hidden_states = hidden_states.permute(0, 3, 1, 2).flatten(1)
+            return hidden_states
+        # Un-patchify: (N, h*w, p*p*C) -> (N, C, h*p, w*p).
+        height, width = orig_height // self.patch_size, orig_width // self.patch_size
+        hidden_states = hidden_states.reshape(shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels))
+        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+        output = hidden_states.reshape(shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size))
+        return output

PixelFlow-256/transformer/transformer_pixelflow.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from modeling_pixelflow import PixelFlowModel
+@dataclass
+class PixelFlowTransformerOutput(BaseOutput):
+    """Output container holding the tensor predicted by the PixelFlow transformer."""
+    # Predicted tensor returned by the model.
+    sample: torch.FloatTensor
+class PixelFlowTransformer2DModel(ModelMixin, ConfigMixin):
+    """PixelFlow transformer for class-conditional pixel-space flow generation."""
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 72,
+        depth: int = 28,
+        patch_size: int = 4,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = True,
+        num_classes: int = 1000,
+        sample_size: int = 256,
+        init_weights: bool = True,
+    ):
+        super().__init__()
+        self.model = PixelFlowModel(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            num_attention_heads=num_attention_heads,
+            attention_head_dim=attention_head_dim,
+            depth=depth,
+            patch_size=patch_size,
+            dropout=dropout,
+            cross_attention_dim=cross_attention_dim,
+            attention_bias=attention_bias,
+            num_classes=num_classes,
+            init_weights=init_weights,
+        )
+    @property
+    def patch_size(self) -> int:
+        return self.model.patch_size
+    @property
+    def attention_head_dim(self) -> int:
+        return self.model.attention_head_dim
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: Optional[torch.Tensor] = None,
+        class_labels: Optional[torch.Tensor] = None,
+        latent_size: Optional[torch.Tensor] = None,
+        pos_embed: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[PixelFlowTransformerOutput, Transformer2DModelOutput, Tuple[torch.Tensor, ...]]:
+        output = self.model(
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            class_labels=class_labels,
+            timestep=timestep,
+            latent_size=latent_size,
+            encoder_attention_mask=encoder_attention_mask,
+            pos_embed=pos_embed,
+        )
+        if not return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)

PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc ADDED Viewed

Binary file (20.2 kB). View file

PixelFlow-T2I/model_index.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "_class_name": "PixelFlowPipeline",
+  "_diffusers_version": "0.36.0",
+  "scheduler": [
+    "scheduling_pixelflow",
+    "PixelFlowScheduler"
+  ],
+  "transformer": [
+    "transformer_pixelflow",
+    "PixelFlowTransformer2DModel"
+  ],
+  "text_encoder": [
+    "transformers",
+    "T5EncoderModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "T5Tokenizer"
+  ]
+}

PixelFlow-T2I/pipeline.py ADDED Viewed

	@@ -0,0 +1,405 @@

+"""Hub custom pipeline: PixelFlowPipeline.
+Load with native Hugging Face diffusers and `trust_remote_code=True`.
+"""
+from __future__ import annotations
+import importlib
+import math
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.embeddings import get_2d_rotary_pos_embed
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+@dataclass
+class PixelFlowPipelineOutput(BaseOutput):
+    """Return container of ``PixelFlowPipeline``."""
+    # Generated images as a tensor, a list, or a NumPy array.
+    images: Union[torch.Tensor, List, np.ndarray]
+class PixelFlowPipeline(DiffusionPipeline):
+    """Pipeline for PixelFlow pixel-space flow generation (class-conditional or text-to-image)."""
+    model_cpu_offload_seq = "text_encoder->transformer"
+    _optional_components = ["text_encoder", "tokenizer"]
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
+        """Load a self-contained variant folder locally or from the Hub.
+        Args:
+            pretrained_model_name_or_path: ``None``, ``""`` or ``"."`` selects the folder
+                containing this file; a string containing ``/`` that is not an existing
+                path is treated as a Hub repo id; anything else is a local path.
+            subfolder: Optional variant subfolder inside the repo or local path.
+            **kwargs: ``hub_kwargs`` (a dict) is forwarded to ``snapshot_download`` on
+                the Hub branch; all remaining keyword arguments are forwarded to the
+                ``from_pretrained`` call of the transformer, scheduler and text encoder.
+        Returns:
+            A pipeline instance built from the loaded components.
+        Raises:
+            ValueError: If no loadable transformer is found in the variant folder.
+        """
+        repo_root = Path(__file__).resolve().parent
+        if pretrained_model_name_or_path in (None, "", "."):
+            variant = repo_root
+        elif (
+            isinstance(pretrained_model_name_or_path, str)
+            and "/" in pretrained_model_name_or_path
+            and not Path(pretrained_model_name_or_path).exists()
+        ):
+            from huggingface_hub import snapshot_download
+            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
+            if subfolder:
+                # Restrict the download to the requested variant unless the caller set patterns.
+                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
+            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
+            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
+        else:
+            variant = Path(pretrained_model_name_or_path)
+            if not variant.is_absolute():
+                # Relative paths are tried against the working directory first, then this file's folder.
+                candidate = (Path.cwd() / variant).resolve()
+                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
+            if subfolder:
+                variant = variant / subfolder
+        model_kwargs = dict(kwargs)
+        # Directories temporarily added to sys.path; removed again in the finally block.
+        inserted: List[str] = []
+        def _load_component(folder: str, module_name: str, class_name: str):
+            """Import ``class_name`` from ``<variant>/<folder>/<module_name>.py`` and load it."""
+            comp_dir = variant / folder
+            module_path = comp_dir / f"{module_name}.py"
+            # Despite its name, this only checks that a config file exists, not weight files.
+            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
+            if not module_path.exists() or not has_weights:
+                return None
+            comp_path = str(comp_dir)
+            if comp_path not in sys.path:
+                sys.path.insert(0, comp_path)
+                inserted.append(comp_path)
+            # NOTE(review): import_module caches by bare module name in sys.modules, so loading a
+            # second variant in the same process presumably reuses the first variant's module — confirm.
+            module = importlib.import_module(module_name)
+            component_cls = getattr(module, class_name)
+            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
+        def _load_text_components():
+            """Load the bundled T5 encoder and tokenizer, or ``(None, None)`` if absent."""
+            text_encoder = None
+            tokenizer = None
+            te_dir = variant / "text_encoder"
+            tok_dir = variant / "tokenizer"
+            if te_dir.exists() and (te_dir / "config.json").exists():
+                from transformers import T5EncoderModel, T5Tokenizer
+                text_encoder = T5EncoderModel.from_pretrained(str(te_dir), **model_kwargs)
+                # The tokenizer folder is not checked for existence before loading.
+                tokenizer = T5Tokenizer.from_pretrained(str(tok_dir))
+            return text_encoder, tokenizer
+        def _load_text_encoder_name() -> str:
+            """Return the text-encoder name from ``conversion_metadata.json`` or the default."""
+            metadata_path = variant / "conversion_metadata.json"
+            if metadata_path.exists():
+                import json
+                metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+                if metadata.get("text_encoder"):
+                    return metadata["text_encoder"]
+            return "google/flan-t5-xl"
+        try:
+            transformer = _load_component("transformer", "transformer_pixelflow", "PixelFlowTransformer2DModel")
+            scheduler = _load_component("scheduler", "scheduling_pixelflow", "PixelFlowScheduler")
+            text_encoder, tokenizer = _load_text_components()
+            if scheduler is None:
+                # No scheduler config: fall back to a default-constructed scheduler if its module exists.
+                sched_dir = variant / "scheduler"
+                if (sched_dir / "scheduling_pixelflow.py").exists():
+                    sched_path = str(sched_dir)
+                    if sched_path not in sys.path:
+                        sys.path.insert(0, sched_path)
+                        inserted.append(sched_path)
+                    scheduler = importlib.import_module("scheduling_pixelflow").PixelFlowScheduler()
+            if transformer is None:
+                raise ValueError(f"No loadable transformer found under {variant}")
+            # Text-conditioned checkpoint without bundled text components: load the encoder by name.
+            if (
+                text_encoder is None
+                and tokenizer is None
+                and transformer.config.num_classes == 0
+                and transformer.config.cross_attention_dim is not None
+            ):
+                from transformers import T5EncoderModel, T5Tokenizer
+                text_encoder_name = _load_text_encoder_name()
+                text_encoder = T5EncoderModel.from_pretrained(text_encoder_name, **model_kwargs)
+                tokenizer = T5Tokenizer.from_pretrained(text_encoder_name)
+            return cls(
+                transformer=transformer,
+                scheduler=scheduler,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+            )
+        finally:
+            # Undo the sys.path changes whether loading succeeded or failed.
+            for comp_path in inserted:
+                if comp_path in sys.path:
+                    sys.path.remove(comp_path)
+    def __init__(self, transformer, scheduler, text_encoder=None, tokenizer=None, max_token_length: int = 512):
+        super().__init__()
+        self.register_modules(
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
+        self.class_cond = transformer.config.num_classes > 0
+        self.max_token_length = max_token_length
+    def sample_block_noise(self, bs, ch, height, width, eps=1e-6):
+        gamma = self.scheduler.gamma
+        dist = torch.distributions.multivariate_normal.MultivariateNormal(
+            torch.zeros(4),
+            torch.eye(4) * (1 - gamma) + torch.ones(4, 4) * gamma + eps * torch.eye(4),
+        )
+        block_number = bs * ch * (height // 2) * (width // 2)
+        noise = torch.stack([dist.sample() for _ in range(block_number)])
+        noise = rearrange(
+            noise,
+            "(b c h w) (p q) -> b c (h p) (w q)",
+            b=bs,
+            c=ch,
+            h=height // 2,
+            w=width // 2,
+            p=2,
+            q=2,
+        )
+        return noise
+    def _stage_guidance_scale(self, stage_idx: int) -> float:
+        if not self.class_cond:
+            return self._guidance_scale_value
+        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}
+        return (self._guidance_scale_value - 1) * scale_dict[stage_idx] + 1
+    @property
+    def do_classifier_free_guidance(self) -> bool:
+        """Whether the guidance scale stored by ``__call__`` is strictly positive."""
+        # NOTE(review): the text-to-image branch of __call__ additionally requires
+        # guidance_scale > 1.0 before encoding a negative prompt — confirm the intended threshold.
+        return self._guidance_scale_value > 0
+    @torch.no_grad()
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Union[str, List[str]] = "",
+        max_length: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.text_encoder is None or self.tokenizer is None:
+            raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        batch_size = len(prompt)
+        max_length = max_length or self.max_token_length
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(device)
+        prompt_attention_mask = text_inputs.attention_mask.to(device)
+        prompt_embeds = self.text_encoder(
+            text_input_ids,
+            attention_mask=prompt_attention_mask,
+        )[0]
+        dtype = self.text_encoder.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
+        if do_classifier_free_guidance:
+            if isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif isinstance(negative_prompt, list):
+                if len(negative_prompt) != batch_size:
+                    raise ValueError(
+                        f"Negative prompt list length ({len(negative_prompt)}) must match prompt batch ({batch_size})."
+                    )
+                uncond_tokens = negative_prompt
+            else:
+                raise ValueError("Negative prompt must be a string or list of strings.")
+            uncond_inputs = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=prompt_embeds.shape[1],
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_input_ids = uncond_inputs.input_ids.to(device)
+            negative_prompt_attention_mask = uncond_inputs.attention_mask.to(device)
+            negative_prompt_embeds = self.text_encoder(
+                negative_input_ids,
+                attention_mask=negative_prompt_attention_mask,
+            )[0]
+            seq_len_neg = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len_neg, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1).repeat(num_images_per_prompt, 1)
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+        return prompt_embeds, prompt_attention_mask
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        class_labels: Optional[Union[int, List[int], torch.Tensor]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: Union[int, List[int]] = 10,
+        guidance_scale: float = 4.0,
+        shift: float = 1.0,
+        negative_prompt: Union[str, List[str]] = "",
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[PixelFlowPipelineOutput, Tuple]:
+        """Run the multi-stage sampling loop and return the generated images.
+
+        The sample starts as Gaussian noise at ``1 / 2 ** (num_stages - 1)`` of the
+        requested size. Every stage after the first doubles the spatial size with
+        nearest-neighbour interpolation, mixes in fresh block noise, and then
+        integrates that stage's timesteps with ``self.scheduler.step``.
+
+        Args:
+            prompt: Text prompt(s); required when ``self.class_cond`` is false.
+            class_labels: Class id(s); required when ``self.class_cond`` is true.
+            height: Output height; defaults to ``transformer.config.sample_size``.
+            width: Output width; defaults to ``transformer.config.sample_size``.
+            num_inference_steps: Steps per stage. An int is reused for every
+                stage; a sequence must hold at least one entry per stage.
+            guidance_scale: Stored as ``self._guidance_scale_value``. For text
+                prompts guidance is only applied when
+                ``self.do_classifier_free_guidance`` is true and this value is
+                greater than 1.0.
+            shift: Forwarded to ``self.scheduler.set_timesteps``.
+            negative_prompt: Forwarded to ``self.encode_prompt``.
+            num_images_per_prompt: Number of samples per prompt / class label.
+            generator: RNG used for the initial noise. The per-stage block noise
+                is drawn by ``self.sample_block_noise`` without this generator.
+            output_type: ``"pt"`` returns the clamped tensor, ``"pil"`` and
+                ``"np"`` go through ``self.image_processor.postprocess``.
+            return_dict: When false, return a one-element tuple instead of a
+                ``PixelFlowPipelineOutput``.
+
+        Returns:
+            ``PixelFlowPipelineOutput(images=...)`` or ``(images,)``.
+
+        Raises:
+            ValueError: If the required conditioning input is missing, if
+                ``num_inference_steps`` has fewer entries than there are stages,
+                or if ``output_type`` is not supported.
+        """
+        if height is None:
+            height = int(self.transformer.config.sample_size)
+        if width is None:
+            width = int(self.transformer.config.sample_size)
+        device = self._execution_device
+        self._guidance_scale_value = guidance_scale
+        num_stages = self.scheduler.num_stages
+        if isinstance(num_inference_steps, int):
+            num_inference_steps = [num_inference_steps] * num_stages
+        elif len(num_inference_steps) < num_stages:
+            # Fail before any sampling work instead of with an IndexError mid-generation.
+            raise ValueError(
+                "`num_inference_steps` must provide one entry per stage: "
+                f"expected {num_stages}, got {len(num_inference_steps)}."
+            )
+        prompt_attention_mask = None
+        if self.class_cond:
+            if class_labels is None:
+                raise ValueError("`class_labels` are required for class-conditional PixelFlow checkpoints.")
+            if isinstance(class_labels, int):
+                class_labels = [class_labels]
+            if not torch.is_tensor(class_labels):
+                class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
+            else:
+                class_labels = class_labels.to(device=device, dtype=torch.long)
+            if class_labels.ndim == 0:
+                # A scalar tensor is treated like a single int label.
+                class_labels = class_labels.unsqueeze(0)
+            batch_size = class_labels.shape[0]
+            # The latents hold batch_size * num_images_per_prompt samples, so every
+            # label is repeated to match (all samples of one label stay contiguous,
+            # mirroring the repeat/view layout used for text embeddings).
+            prompt_embeds = class_labels.repeat_interleave(num_images_per_prompt, dim=0)
+            do_cfg = self.do_classifier_free_guidance
+            if do_cfg:
+                # The index ``num_classes`` is used as the unconditional label.
+                negative_prompt_embeds = torch.full_like(prompt_embeds, self.transformer.config.num_classes)
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        else:
+            if prompt is None:
+                raise ValueError("`prompt` is required for text-to-image PixelFlow checkpoints.")
+            if isinstance(prompt, str):
+                prompt = [prompt]
+            batch_size = len(prompt)
+            # Use one flag for both the embeddings and the sampling loop so the
+            # latent batch is only doubled when the embeddings were doubled too.
+            do_cfg = self.do_classifier_free_guidance and guidance_scale > 1.0
+            prompt_embeds, prompt_attention_mask = self.encode_prompt(
+                prompt,
+                device,
+                num_images_per_prompt=num_images_per_prompt,
+                do_classifier_free_guidance=do_cfg,
+                negative_prompt=negative_prompt,
+            )
+        # The first stage runs at the coarsest size; each later stage doubles it.
+        init_factor = 2 ** (num_stages - 1)
+        height, width = height // init_factor, width // init_factor
+        latents = randn_tensor(
+            (batch_size * num_images_per_prompt, 3, height, width),
+            generator=generator,
+            device=device,
+            dtype=torch.float32,
+        )
+        patch_size = self.transformer.patch_size
+        autocast_enabled = device.type == "cuda"
+        autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
+        for stage_idx in range(num_stages):
+            self.scheduler.set_timesteps(num_inference_steps[stage_idx], stage_idx, device=device, shift=shift)
+            timesteps = self.scheduler.Timesteps
+            if stage_idx > 0:
+                height, width = height * 2, width * 2
+                latents = F.interpolate(latents, size=(height, width), mode="nearest")
+                # Blend the upsampled sample with block noise so it matches the
+                # (rectified) start point of this stage.
+                original_start_t = self.scheduler.original_start_t[stage_idx]
+                gamma = self.scheduler.gamma
+                alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
+                beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)
+                noise = self.sample_block_noise(*latents.shape)
+                noise = noise.to(device=device, dtype=latents.dtype)
+                latents = alpha * latents + beta * noise
+            # NOTE(review): the patch grid is derived from the width only, so
+            # non-square sizes are presumably unsupported - confirm.
+            grid = latents.shape[-1] // patch_size
+            size_tensor = torch.tensor([grid], dtype=torch.int32, device=device)
+            pos_embed = get_2d_rotary_pos_embed(
+                embed_dim=self.transformer.attention_head_dim,
+                crops_coords=((0, 0), (grid, grid)),
+                grid_size=(grid, grid),
+                device=device,
+                output_type="pt",
+            )
+            rope_pos = torch.stack(pos_embed, -1)
+            for timestep in timesteps:
+                # With guidance the batch is [unconditional, conditional].
+                latent_model_input = torch.cat([latents] * 2) if do_cfg else latents
+                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
+                    if self.class_cond:
+                        noise_pred = self.transformer(
+                            latent_model_input,
+                            timestep=timestep_batch,
+                            class_labels=prompt_embeds,
+                            latent_size=size_tensor,
+                            pos_embed=rope_pos,
+                        ).sample
+                    else:
+                        noise_pred = self.transformer(
+                            latent_model_input,
+                            encoder_hidden_states=prompt_embeds,
+                            encoder_attention_mask=prompt_attention_mask,
+                            timestep=timestep_batch,
+                            latent_size=size_tensor,
+                            pos_embed=rope_pos,
+                        ).sample
+                if do_cfg:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self._stage_guidance_scale(stage_idx) * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample
+        # Map the sample from [-1, 1] to [0, 1].
+        image = (latents / 2 + 0.5).clamp(0, 1)
+        if output_type == "pt":
+            pass
+        elif output_type in ("pil", "np"):
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        else:
+            raise ValueError(f"Unsupported output_type: {output_type}")
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return PixelFlowPipelineOutput(images=image)

PixelFlow-T2I/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc ADDED Viewed

Binary file (7.76 kB). View file

PixelFlow-T2I/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_class_name": "PixelFlowScheduler",
+  "_diffusers_version": "0.36.0",
+  "gamma": -0.3333333333333333,
+  "num_stages": 4,
+  "num_train_timesteps": 1000
+}

PixelFlow-T2I/scheduler/scheduling_pixelflow.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+from diffusers.utils import BaseOutput
+def cal_rectify_ratio(start_t, gamma):
+    """Return ``1 / (sqrt(1 - 1 / gamma) * (1 - start_t) + start_t)``.
+
+    The scheduler multiplies the start time of every stage after the first by
+    this factor.
+    """
+    noise_scale = math.sqrt(1 - (1 / gamma))
+    denominator = noise_scale * (1 - start_t) + start_t
+    return 1 / denominator
+@dataclass
+class PixelFlowSchedulerOutput(BaseOutput):
+    """Return value of ``PixelFlowScheduler.step``."""
+    # Sample after one update step, cast to the dtype of the model output.
+    prev_sample: torch.FloatTensor
+class PixelFlowScheduler(SchedulerMixin, ConfigMixin):
+    """Cascade flow scheduler for PixelFlow multi-stage pixel-space generation.
+
+    The training timeline is split into ``num_stages`` equal windows. The
+    constructor precomputes, for every stage, its start/end time and the range
+    of training timesteps assigned to it. ``set_timesteps`` builds the
+    inference schedule of a single stage and ``step`` advances the sample with
+    one explicit Euler update.
+
+    Args:
+        num_train_timesteps: Number of discrete training timesteps.
+        num_stages: Number of stages; must be positive.
+        gamma: Parameter passed to ``cal_rectify_ratio`` when rescaling the
+            start time of every stage after the first.
+
+    Raises:
+        ValueError: If ``num_stages`` is not positive.
+    """
+    order = 1
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        num_stages: int = 4,
+        gamma: float = -1 / 3,
+    ):
+        # Raise explicitly: an ``assert`` would be stripped under ``python -O``.
+        if not num_stages > 0:
+            raise ValueError(f"num_stages must be positive, got {num_stages}")
+        self.num_stages = num_stages
+        self.gamma = gamma
+        # Training timesteps 0..N-1 and their normalised times; both attributes
+        # are replaced later (``Timesteps`` is reset below, ``t`` in ``set_timesteps``).
+        self.Timesteps = torch.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=torch.float32)
+        self.t = self.Timesteps / num_train_timesteps
+        # Stage boundaries as fractions of the timeline: 0, 1/S, 2/S, ..., 1.
+        self.stage_range = [x / num_stages for x in range(num_stages + 1)]
+        self.original_start_t = {}
+        self.start_t, self.end_t = {}, {}
+        self.t_window_per_stage = {}
+        self.Timesteps_per_stage = {}
+        stage_distance = []
+        for stage_idx in range(num_stages):
+            start_idx = max(int(num_train_timesteps * self.stage_range[stage_idx]), 0)
+            end_idx = min(int(num_train_timesteps * self.stage_range[stage_idx + 1]), num_train_timesteps)
+            start_t = self.t[start_idx].item()
+            # The last stage ends exactly at 1.0, which lies outside ``self.t``.
+            end_t = self.t[end_idx].item() if end_idx < num_train_timesteps else 1.0
+            self.original_start_t[stage_idx] = start_t
+            if stage_idx > 0:
+                start_t *= cal_rectify_ratio(start_t, gamma)
+            self.start_t[stage_idx] = start_t
+            self.end_t[stage_idx] = end_t
+            stage_distance.append(end_t - start_t)
+        total_stage_distance = sum(stage_distance)
+        # Within a stage, time runs over [0, 1) in N uniform steps.
+        t_within_stage = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float64)[:-1]
+        for stage_idx in range(num_stages):
+            # Each stage gets a share of the training timesteps proportional to
+            # the time distance it covers.
+            start_ratio = 0.0 if stage_idx == 0 else sum(stage_distance[:stage_idx]) / total_stage_distance
+            end_ratio = 1.0 if stage_idx == num_stages - 1 else sum(stage_distance[:stage_idx + 1]) / total_stage_distance
+            Timestep_start = self.Timesteps[int(num_train_timesteps * start_ratio)]
+            Timestep_end = self.Timesteps[min(int(num_train_timesteps * end_ratio), num_train_timesteps - 1)]
+            self.t_window_per_stage[stage_idx] = t_within_stage
+            if stage_idx == num_stages - 1:
+                # The final stage includes its end timestep.
+                self.Timesteps_per_stage[stage_idx] = torch.linspace(
+                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps, dtype=torch.float64
+                )
+            else:
+                # Earlier stages exclude the end timestep.
+                self.Timesteps_per_stage[stage_idx] = torch.linspace(
+                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps + 1, dtype=torch.float64
+                )[:-1]
+        self._step_index = None
+        # No inference schedule exists until ``set_timesteps`` is called.
+        self.Timesteps = None
+    @staticmethod
+    def time_linear_to_Timesteps(t, t_start, t_end, T_start, T_end):
+        """Map ``t`` linearly so that ``t_start -> T_start`` and ``t_end -> T_end``."""
+        k = (T_end - T_start) / (t_end - t_start)
+        b = T_start - t_start * k
+        return k * t + b
+    def set_timesteps(self, num_inference_steps, stage_index, device=None, shift=1.0):
+        """Build the inference schedule for one stage and reset the step counter.
+
+        Sets ``self.Timesteps`` (``num_inference_steps`` model timesteps) and
+        ``self.t`` (the matching times plus a final ``1.0``, so ``step`` always
+        has a next time to integrate towards).
+
+        Args:
+            num_inference_steps: Number of steps to take in this stage.
+            stage_index: Index of the stage whose schedule is built.
+            device: Device the schedule tensors are moved to.
+            shift: Time-shift factor; ``1.0`` leaves the uniform spacing unchanged.
+        """
+        self.num_inference_steps = num_inference_steps
+        self._step_index = None
+        stage_T_start = self.Timesteps_per_stage[stage_index][0].item()
+        stage_T_end = self.Timesteps_per_stage[stage_index][-1].item()
+        t_start = self.t_window_per_stage[stage_index][0].item()
+        t_end = self.t_window_per_stage[stage_index][-1].item()
+        t = np.linspace(t_start, t_end, num_inference_steps, dtype=np.float64)
+        # Warp the uniform grid; the expression reduces to ``t`` when shift == 1.
+        t = t / (shift + (1 - shift) * t)
+        Timesteps = self.time_linear_to_Timesteps(t, t_start, t_end, stage_T_start, stage_T_end)
+        self.Timesteps = torch.from_numpy(Timesteps).to(device=device)
+        self.t = torch.from_numpy(np.append(t, 1.0)).to(device=device, dtype=torch.float64)
+    def step(
+        self,
+        model_output: torch.Tensor,
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[PixelFlowSchedulerOutput, SchedulerOutput, Tuple[torch.Tensor, ...]]:
+        """Advance ``sample`` by one Euler step along ``model_output``.
+
+        Computes ``sample + (t_next - t) * model_output`` with the sample held in
+        float32, where ``t`` and ``t_next`` are consecutive entries of ``self.t``,
+        then advances the internal step counter.
+
+        Args:
+            model_output: Model prediction for the current step.
+            sample: Current sample.
+            return_dict: When false, return a one-element tuple instead of a
+                ``PixelFlowSchedulerOutput``.
+
+        Returns:
+            The updated sample, cast to the dtype of ``model_output``.
+        """
+        if self._step_index is None:
+            self._step_index = 0
+        sample = sample.to(torch.float32)
+        t = self.t[self._step_index].float()
+        t_next = self.t[self._step_index + 1].float()
+        prev_sample = sample + (t_next - t) * model_output
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample.to(model_output.dtype),)
+        return PixelFlowSchedulerOutput(prev_sample=prev_sample.to(model_output.dtype))
+    @property
+    def step_index(self) -> Optional[int]:
+        """Index of the next step within the current stage, or ``None`` before the first ``step``."""
+        return self._step_index

PixelFlow-T2I/text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "d_ff": 5120,
+  "d_kv": 64,
+  "d_model": 2048,
+  "decoder_start_token_id": 0,
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 24,
+  "num_heads": 32,
+  "num_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 200,
+      "min_length": 30,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4,
+      "prefix": "summarize: "
+    },
+    "translation_en_to_de": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to German: "
+    },
+    "translation_en_to_fr": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to French: "
+    },
+    "translation_en_to_ro": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to Romanian: "
+    }
+  },
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.24.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32128
+}

PixelFlow-T2I/text_encoder/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.27.0.dev0"
+}

PixelFlow-T2I/text_encoder/model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99196ddfbe886e8ef860f52de979df64890edfc792c3d94ce0502991f347dd18
+size 9449619912

PixelFlow-T2I/text_encoder/model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c677ddeb21009b6efd97146f37fc3a0396707fb5e63ade7aff64884dce9806
+size 1949477672

PixelFlow-T2I/text_encoder/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,567 @@

+{
+    "metadata": {
+        "total_size": 11925413888
+    },
+    "weight_map": {
+        "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.0.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.1.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.10.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.11.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.12.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.13.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.14.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.15.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.16.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.17.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.18.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.19.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.2.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.20.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.21.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.22.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.23.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.3.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.4.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.5.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.6.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.7.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.8.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "decoder.block.9.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "decoder.embed_tokens.weight": "model-00001-of-00002.safetensors",
+        "decoder.final_layer_norm.weight": "model-00002-of-00002.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.18.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.19.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.20.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.21.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.22.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.23.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
+        "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
+        "encoder.embed_tokens.weight": "model-00001-of-00002.safetensors",
+        "encoder.final_layer_norm.weight": "model-00001-of-00002.safetensors",
+        "lm_head.weight": "model-00002-of-00002.safetensors",
+        "shared.weight": "model-00001-of-00002.safetensors"
+    }
+}

PixelFlow-T2I/tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

PixelFlow-T2I/tokenizer/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

PixelFlow-T2I/tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

PixelFlow-T2I/tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "name_or_path": "google/t5-v1_1-small",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/arthur_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-small/snapshots/fb7e6cba609f7bab11c614294bc04f82f613c7b1/special_tokens_map.json",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

PixelFlow-T2I/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc ADDED Viewed

Binary file (24.1 kB). View file

PixelFlow-T2I/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc ADDED Viewed

Binary file (3.83 kB). View file

PixelFlow-T2I/transformer/config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_class_name": "PixelFlowTransformer2DModel",
+  "_diffusers_version": "0.36.0",
+  "attention_bias": true,
+  "attention_head_dim": 72,
+  "cross_attention_dim": 2048,
+  "depth": 28,
+  "dropout": 0.0,
+  "in_channels": 3,
+  "init_weights": false,
+  "num_attention_heads": 16,
+  "num_classes": 0,
+  "out_channels": 3,
+  "patch_size": 4,
+  "sample_size": 1024
+}

PixelFlow-T2I/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a62a11eef9c84f80ff482e996311666546c032270a7ce024684c455cf800251c
+size 3528583392

PixelFlow-T2I/transformer/modeling_pixelflow.py ADDED Viewed

	@@ -0,0 +1,448 @@

+from typing import Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import warnings
+from diffusers.models.embeddings import LabelEmbedding, TimestepEmbedding, Timesteps
+try:
+    from flash_attn import flash_attn_varlen_func
+except ImportError:
+    warnings.warn("`flash-attn` is not installed. Training mode may not work properly.", UserWarning)
+    flash_attn_varlen_func = None
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """Apply a rotary position embedding to ``x``.

    Consecutive pairs along the last dimension of ``x`` are treated as
    ``(real, imag)`` components and rotated using the cosine/sine values
    stored in ``freqs_cis``.

    Args:
        x: Tensor to rotate. Its last dimension must be even. Callers in this
            file pass 4-D tensors laid out as ``(batch, heads, seq, head_dim)``.
        freqs_cis: Tensor whose last dimension has size 2 and holds
            ``(cos, sin)``. After that dimension is removed and two leading
            singleton dimensions are prepended, it must broadcast against ``x``.

    Returns:
        A single tensor with the dtype of ``x`` (the arithmetic is done in
        float32 and cast back).
    """
    cos, sin = freqs_cis.unbind(-1)
    # Prepend two singleton dims so the tables broadcast over the leading dims of x.
    cos = cos[None, None].to(x.device)
    sin = sin[None, None].to(x.device)
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    # (a, b) -> (-b, a): the 90-degree rotated copy, re-interleaved to x's layout.
    # flatten(-2) merges the (pairs, 2) dims regardless of how many leading dims x has.
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
class PatchEmbed(nn.Module):
    """Embed image patches with a convolution whose kernel and stride equal the patch size.

    In eval mode the input is an image batch and the convolution is applied
    directly. In training mode the input is expected to be already-unfolded
    patches (one flattened patch per row), which are projected with the same
    convolution weights via a matrix multiply.
    """

    def __init__(self, patch_size, in_channels, embed_dim, bias=True):
        super().__init__()
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=bias,
        )

    def forward_unfold(self, x):
        """Project flattened patches ``x`` (last dim = in_channels * patch * patch)."""
        kernel = self.proj.weight
        # Flatten each output filter to a row so the conv becomes a plain matmul.
        flat_kernel = kernel.view(kernel.size(0), -1)
        projected = x.matmul(flat_kernel.t())
        bias = self.proj.bias
        if bias is not None:
            projected += bias.to(projected.dtype)
        return projected

    def forward(self, x):
        """Return patch tokens; dispatches on ``self.training`` (see class docstring)."""
        if self.training:
            return self.forward_unfold(x)
        feature_map = self.proj(x)
        # (B, C, H', W') -> (B, H' * W', C)
        return feature_map.flatten(2).transpose(1, 2)
class AdaLayerNorm(nn.Module):
    """Adaptive layer norm that derives six modulation tensors from a conditioning vector.

    The conditioning embedding is passed through SiLU and a linear layer, then
    split into ``shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp``.
    Only the first two are applied here; the rest are returned for the caller.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, timestep, seqlen_list=None):
        """Normalise and modulate ``x``.

        Args:
            x: Hidden states to normalise.
            timestep: Conditioning embedding, one row per sample.
            seqlen_list: Optional per-sample token counts. When given, each
                sample's modulation row is repeated that many times and the
                results are concatenated (packed layout); otherwise a singleton
                sequence dimension is inserted for broadcasting.

        Returns:
            ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)`` where
            ``modulated_x`` has the dtype of ``x`` and the rest are float32.
        """
        input_dtype = x.dtype
        modulation = self.linear(self.silu(timestep))
        if seqlen_list is None:
            modulation = modulation.unsqueeze(1)
        else:
            per_token = []
            for sample_mod, n_tokens in zip(modulation, seqlen_list):
                per_token.append(sample_mod[None].expand(n_tokens, -1))
            modulation = torch.cat(per_token)
        chunks = modulation.float().chunk(6, dim=-1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
        normed = self.norm(x).float() * (1 + scale_msa) + shift_msa
        return normed.to(input_dtype), gate_msa, shift_mlp, scale_mlp, gate_mlp
class FeedForward(nn.Module):
    """Two-layer MLP with a tanh-approximated GELU in between.

    Args:
        dim: Input feature size.
        dim_out: Output feature size; defaults to ``dim``.
        mult: Hidden-size multiplier, used only when ``inner_dim`` is not given.
        inner_dim: Explicit hidden size; overrides ``mult``.
        bias: Whether both linear layers carry a bias.
    """

    def __init__(self, dim, dim_out=None, mult=4, inner_dim=None, bias=True):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim
        self.fc1 = nn.Linear(dim, inner_dim, bias=bias)
        self.fc2 = nn.Linear(inner_dim, dim_out, bias=bias)

    def forward(self, hidden_states):
        """Return ``fc2(gelu_tanh(fc1(hidden_states)))``."""
        activated = F.gelu(self.fc1(hidden_states), approximate="tanh")
        return self.fc2(activated)
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale.

    The statistics are computed in float32 and the result is cast back to the
    input dtype.
    """

    def __init__(self, dim: int, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Return ``weight * x / sqrt(mean(x**2) + eps)`` in the dtype of ``x``."""
        x_fp32 = x.float()
        mean_square = x_fp32.pow(2).mean(-1, keepdim=True)
        normed = x_fp32 * torch.rsqrt(mean_square + self.eps)
        return (self.weight * normed).to(x.dtype)
class Attention(nn.Module):
    """Multi-head attention with RMS-normalised queries/keys and rotary embeddings.

    ``forward`` has two execution paths:

    * dense path (``max_seqlen_q is None``): inputs are 3-D
      ``(batch, seq, dim)`` and attention runs through
      ``F.scaled_dot_product_attention``. Only allowed outside training.
    * packed path (``max_seqlen_q`` given): projections are viewed as
      ``(total_tokens, heads, head_dim)`` and attention runs through
      ``flash_attn_varlen_func`` using the cumulative sequence lengths.

    Args:
        q_dim: Feature size of the query input (and of the output).
        kv_dim: Feature size of the key/value input; defaults to ``q_dim``.
        heads: Number of attention heads.
        head_dim: Size of each head.
        dropout: Attention dropout probability (applied only in training on
            the dense path).
        bias: Whether the four projection layers carry a bias.
    """

    def __init__(self, q_dim, kv_dim=None, heads=8, head_dim=64, dropout=0.0, bias=False):
        super().__init__()
        self.q_dim = q_dim
        self.kv_dim = kv_dim if kv_dim is not None else q_dim
        self.inner_dim = head_dim * heads
        self.dropout = dropout
        self.head_dim = head_dim
        self.num_heads = heads
        self.q_proj = nn.Linear(self.q_dim, self.inner_dim, bias=bias)
        self.k_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.v_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.o_proj = nn.Linear(self.inner_dim, self.q_dim, bias=bias)
        # Q/K are normalised over the full inner dimension, before the head split.
        self.q_norm = RMSNorm(self.inner_dim)
        self.k_norm = RMSNorm(self.inner_dim)

    def prepare_attention_mask(self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3):
        """Pad and replicate ``attention_mask`` so it covers every head.

        Args:
            attention_mask: Mask to prepare, or ``None`` (returned unchanged).
            target_length: Expected size of the mask's last dimension.
            batch_size: Batch size, used to decide whether heads still need
                to be replicated when ``out_dim == 3``.
            out_dim: ``3`` replicates heads along dim 0; ``4`` inserts a head
                dimension at dim 1 and replicates along it.

        Returns:
            The prepared mask, or ``None`` when no mask was given.
        """
        head_size = self.num_heads
        if attention_mask is None:
            return attention_mask
        current_length: int = attention_mask.shape[-1]
        if current_length != target_length:
            # NOTE(review): this appends `target_length` zero columns, so the
            # result is `current_length + target_length` wide rather than
            # `target_length`. Kept as-is; confirm whether padding by the
            # difference was intended before changing it.
            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
        if out_dim == 3:
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
        return attention_mask

    def forward(
        self,
        inputs_q,
        inputs_kv,
        attention_mask=None,
        cross_attention=False,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        max_seqlen_q=None,
        max_seqlen_k=None,
    ):
        """Run attention and the output projection.

        Args:
            inputs_q: Query-side hidden states.
            inputs_kv: Key/value-side hidden states; ``None`` means
                self-attention over ``inputs_q``.
            attention_mask: Optional mask, used on the dense path only.
            cross_attention: When true, rotary embeddings are applied to the
                queries only (keys are left unrotated).
            rope_pos_embed: Rotary table forwarded to ``apply_rotary_emb``.
            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k: Packing
                metadata forwarded to ``flash_attn_varlen_func``. Passing
                ``max_seqlen_q`` selects the packed path.

        Returns:
            Attention output projected back to ``q_dim``.

        Raises:
            ImportError: If the packed path is requested but ``flash-attn``
                could not be imported.
            AssertionError: If the dense path is used while in training mode.
        """
        # Fail fast with an actionable message instead of the opaque
        # "'NoneType' object is not callable" raised at the call site below.
        if max_seqlen_q is not None and flash_attn_varlen_func is None:
            raise ImportError(
                "The packed (variable-length) attention path requires `flash-attn`, "
                "which is not installed. Install flash-attn or call the model "
                "without sequence packing (max_seqlen_q=None)."
            )

        inputs_kv = inputs_q if inputs_kv is None else inputs_kv
        query_states = self.q_proj(inputs_q)
        key_states = self.k_proj(inputs_kv)
        value_states = self.v_proj(inputs_kv)
        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        if max_seqlen_q is None:
            # Dense path: padded (batch, seq, dim) tensors.
            assert not self.training, "PixelFlow needs sequence packing for training"
            bsz, q_len, _ = inputs_q.shape
            _, kv_len, _ = inputs_kv.shape
            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
            key_states = key_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
            query_states = apply_rotary_emb(query_states, rope_pos_embed)
            if not cross_attention:
                key_states = apply_rotary_emb(key_states, rope_pos_embed)
            if attention_mask is not None:
                attention_mask = self.prepare_attention_mask(attention_mask, kv_len, bsz)
                # (batch * heads, q, kv) -> (batch, heads, q, kv) for SDPA.
                attention_mask = attention_mask.view(bsz, self.num_heads, -1, attention_mask.shape[-1])
            attn_output = F.scaled_dot_product_attention(
                query_states,
                key_states,
                value_states,
                attn_mask=attention_mask,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=False,
            )
            attn_output = attn_output.transpose(1, 2).contiguous()
            attn_output = attn_output.view(bsz, q_len, self.inner_dim)
            attn_output = self.o_proj(attn_output)
            return attn_output

        # Packed path: all samples concatenated along the token dimension.
        query_states = query_states.view(-1, self.num_heads, self.head_dim)
        key_states = key_states.view(-1, self.num_heads, self.head_dim)
        value_states = value_states.view(-1, self.num_heads, self.head_dim)
        # apply_rotary_emb is fed a (1, heads, tokens, head_dim) view, so permute in and out.
        query_states = apply_rotary_emb(query_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
        if not cross_attention:
            key_states = apply_rotary_emb(key_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
        )
        attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)
        return attn_output
class TransformerBlock(nn.Module):
    """Transformer block: modulated self-attention, optional cross-attention, modulated MLP.

    The conditioning vector is turned into shift/scale/gate tensors by
    ``norm1`` (an ``AdaLayerNorm``); these modulate both the self-attention
    branch and the MLP branch. Cross-attention is only built when
    ``cross_attention_dim`` is given.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=False,
    ):
        super().__init__()
        shared_attn_kwargs = dict(
            heads=num_attention_heads,
            head_dim=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
        )
        self.norm1 = AdaLayerNorm(dim)
        self.attn1 = Attention(q_dim=dim, kv_dim=None, **shared_attn_kwargs)
        if cross_attention_dim is None:
            self.attn2 = None
        else:
            self.norm2 = RMSNorm(dim, eps=1e-6)
            self.attn2 = Attention(q_dim=dim, kv_dim=cross_attention_dim, **shared_attn_kwargs)
        self.norm3 = RMSNorm(dim, eps=1e-6)
        self.mlp = FeedForward(dim)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Apply the block to ``hidden_states`` and return the updated states.

        ``timestep`` is the conditioning embedding consumed by ``norm1``. The
        ``cu_seqlens_*`` / ``seqlen_list_*`` arguments carry sequence-packing
        metadata; when ``seqlen_list_q`` is ``None`` the attention layers take
        their dense path.
        """
        normed, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
            hidden_states, timestep, seqlen_list_q
        )
        # Longest query sequence; reused by both attention calls.
        max_len_q = None if seqlen_list_q is None else max(seqlen_list_q)

        # --- gated self-attention (keys share the query packing metadata) ---
        self_attn_out = self.attn1(
            inputs_q=normed,
            inputs_kv=None,
            attention_mask=None,
            cross_attention=False,
            rope_pos_embed=rope_pos_embed,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_q,
            max_seqlen_q=max_len_q,
            max_seqlen_k=max_len_q,
        )
        self_attn_out = (gate_msa * self_attn_out.float()).to(self_attn_out.dtype)
        hidden_states = self_attn_out + hidden_states

        # --- optional cross-attention over the encoder states (ungated) ---
        if self.attn2 is not None:
            max_len_k = None if seqlen_list_k is None else max(seqlen_list_k)
            cross_attn_out = self.attn2(
                inputs_q=self.norm2(hidden_states),
                inputs_kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                cross_attention=True,
                rope_pos_embed=rope_pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_len_q,
                max_seqlen_k=max_len_k,
            )
            hidden_states = hidden_states + cross_attn_out

        # --- modulated, gated MLP ---
        mlp_in = self.norm3(hidden_states)
        mlp_in = (mlp_in.float() * (1 + scale_mlp) + shift_mlp).to(mlp_in.dtype)
        mlp_out = self.mlp(mlp_in)
        mlp_out = (gate_mlp * mlp_out.float()).to(mlp_out.dtype)
        return mlp_out + hidden_states
class PixelFlowModel(torch.nn.Module):
    """PixelFlow backbone: patch embedding, conditioned transformer blocks, patch-wise output head.

    Conditioning is the sum of a timestep embedding, a latent-size embedding
    and, when ``num_classes > 0``, a class-label embedding. Cross-attention to
    ``encoder_hidden_states`` is enabled by passing ``cross_attention_dim``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        num_attention_heads,
        attention_head_dim,
        depth,
        patch_size,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=True,
        num_classes=0,
        init_weights=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.attention_head_dim = attention_head_dim
        self.num_classes = num_classes
        self.out_channels = out_channels

        embed_dim = num_attention_heads * attention_head_dim
        self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)

        # One sinusoidal projector is shared by the timestep and latent-size embedders.
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        self.latent_size_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        if self.num_classes > 0:
            self.class_embedder = LabelEmbedding(num_classes, embed_dim, dropout_prob=0.1)

        blocks = []
        for _ in range(depth):
            blocks.append(
                TransformerBlock(
                    embed_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout,
                    cross_attention_dim,
                    attention_bias,
                )
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        self.norm_out = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out_1 = nn.Linear(embed_dim, 2 * embed_dim)
        self.proj_out_2 = nn.Linear(embed_dim, patch_size * patch_size * out_channels)

        if init_weights:
            self.initialize_from_scratch()

    def initialize_from_scratch(self):
        """Initialise all weights for training from scratch.

        Linear layers get Xavier-uniform weights and zero biases; the patch
        convolution is treated like a linear layer; embedder weights are drawn
        from N(0, 0.02); the per-block modulation layers and both output
        projections are zeroed.
        """

        def _xavier_linear(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_xavier_linear)

        # Initialise the conv kernel through a flattened 2-D view of its storage.
        conv_weight = self.patch_embed.proj.weight.data
        nn.init.xavier_uniform_(conv_weight.view([conv_weight.shape[0], -1]))
        nn.init.constant_(self.patch_embed.proj.bias, 0)

        for embedder in (self.timestep_embedder, self.latent_size_embedder):
            nn.init.normal_(embedder.linear_1.weight, std=0.02)
            nn.init.normal_(embedder.linear_2.weight, std=0.02)
        if self.num_classes > 0:
            nn.init.normal_(self.class_embedder.embedding_table.weight, std=0.02)

        for block in self.transformer_blocks:
            nn.init.constant_(block.norm1.linear.weight, 0)
            nn.init.constant_(block.norm1.linear.bias, 0)

        for head in (self.proj_out_1, self.proj_out_2):
            nn.init.constant_(head.weight, 0)
            nn.init.constant_(head.bias, 0)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        class_labels=None,
        timestep=None,
        latent_size=None,
        encoder_attention_mask=None,
        pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Run the backbone on ``hidden_states``.

        In eval mode the result is reshaped to an image tensor
        ``(N, out_channels, H, W)`` using the input's last two dimensions. In
        training mode each row is returned as a flattened
        ``out_channels * patch_size * patch_size`` patch.

        ``seqlen_list_q`` / ``seqlen_list_k`` and ``cu_seqlens_q`` /
        ``cu_seqlens_k`` carry sequence-packing metadata and are forwarded to
        every block; ``pos_embed`` is forwarded as the rotary table.
        """
        # A 2-D keep-mask becomes an additive bias with a singleton query dim.
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            keep = encoder_attention_mask.to(hidden_states.dtype)
            encoder_attention_mask = ((1 - keep) * -10000.0).unsqueeze(1)

        orig_height, orig_width = hidden_states.shape[-2], hidden_states.shape[-1]
        patch = self.patch_size

        tokens = self.patch_embed(hidden_states.to(torch.float32))

        timestep_feats = self.time_proj(timestep)
        conditioning = self.timestep_embedder(timestep_feats.to(dtype=tokens.dtype))
        if self.num_classes > 0:
            conditioning += self.class_embedder(class_labels)
        size_feats = self.time_proj(latent_size)
        conditioning += self.latent_size_embedder(size_feats.to(dtype=tokens.dtype))

        for block in self.transformer_blocks:
            tokens = block(
                tokens,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                timestep=conditioning,
                rope_pos_embed=pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                seqlen_list_q=seqlen_list_q,
                seqlen_list_k=seqlen_list_k,
            )

        def _per_token(per_sample):
            # Dense layout: broadcast over the sequence dim.
            if seqlen_list_q is None:
                return per_sample.unsqueeze(1)
            # Packed layout: repeat each sample's row once per token.
            return torch.cat(
                [row[None].expand(count, -1) for row, count in zip(per_sample, seqlen_list_q)]
            )

        shift, scale = self.proj_out_1(F.silu(conditioning)).float().chunk(2, dim=1)
        shift = _per_token(shift)
        scale = _per_token(scale)

        tokens = (self.norm_out(tokens).float() * (1 + scale) + shift).to(tokens.dtype)
        tokens = self.proj_out_2(tokens)

        if self.training:
            tokens = tokens.reshape(tokens.shape[0], patch, patch, self.out_channels)
            return tokens.permute(0, 3, 1, 2).flatten(1)

        # Un-patchify: (N, h*w, p*p*c) -> (N, c, h*p, w*p).
        height, width = orig_height // patch, orig_width // patch
        tokens = tokens.reshape(shape=(-1, height, width, patch, patch, self.out_channels))
        tokens = tokens.permute(0, 5, 1, 3, 2, 4)  # same axis order as "nhwpqc->nchpwq"
        return tokens.reshape(shape=(-1, self.out_channels, height * patch, width * patch))

PixelFlow-T2I/transformer/transformer_pixelflow.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from modeling_pixelflow import PixelFlowModel
@dataclass
class PixelFlowTransformerOutput(BaseOutput):
    """Output container holding the tensor predicted by the PixelFlow transformer."""

    sample: torch.FloatTensor
class PixelFlowTransformer2DModel(ModelMixin, ConfigMixin):
    """Diffusers wrapper around ``PixelFlowModel`` for pixel-space flow generation.

    The wrapped backbone supports class conditioning (``num_classes > 0``) and
    cross-attention conditioning (``cross_attention_dim`` set). All constructor
    arguments are registered in the model config; ``sample_size`` is stored in
    the config only and is not passed to the backbone.
    """

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        num_attention_heads: int = 16,
        attention_head_dim: int = 72,
        depth: int = 28,
        patch_size: int = 4,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = True,
        num_classes: int = 1000,
        sample_size: int = 256,
        init_weights: bool = True,
    ):
        super().__init__()
        backbone_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            depth=depth,
            patch_size=patch_size,
            dropout=dropout,
            cross_attention_dim=cross_attention_dim,
            attention_bias=attention_bias,
            num_classes=num_classes,
            init_weights=init_weights,
        )
        self.model = PixelFlowModel(**backbone_kwargs)

    @property
    def patch_size(self) -> int:
        """Patch size of the wrapped backbone."""
        return self.model.patch_size

    @property
    def attention_head_dim(self) -> int:
        """Per-head dimension of the wrapped backbone."""
        return self.model.attention_head_dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: Optional[torch.Tensor] = None,
        class_labels: Optional[torch.Tensor] = None,
        latent_size: Optional[torch.Tensor] = None,
        pos_embed: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[PixelFlowTransformerOutput, Transformer2DModelOutput, Tuple[torch.Tensor, ...]]:
        """Forward all conditioning inputs to the backbone.

        Sequence-packing arguments of the backbone are not exposed here, so
        they keep their ``None`` defaults.

        Returns:
            ``Transformer2DModelOutput(sample=...)`` when ``return_dict`` is
            true, otherwise a 1-tuple containing the sample tensor.
        """
        sample = self.model(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            class_labels=class_labels,
            timestep=timestep,
            latent_size=latent_size,
            encoder_attention_mask=encoder_attention_mask,
            pos_embed=pos_embed,
        )
        if return_dict:
            return Transformer2DModelOutput(sample=sample)
        return (sample,)

README.md ADDED Viewed

	@@ -0,0 +1,110 @@

+---
+license: mit
+library_name: diffusers
+pipeline_tag: text-to-image
+tags:
+- diffusers
+- pixelflow
+- image-generation
+- class-conditional
+- flow-matching
+widget:
+- output:
+    url: PixelFlow-256/demo.png
+language:
+- en
+---
+# BiliSakura/PixelFlow-diffusers
+Self-contained PixelFlow checkpoints for Hugging Face diffusers. Each subfolder ships its own `pipeline.py`, component modules, and weights.
+## Available checkpoints
+| Subfolder | Task | Resolution | Params |
+| --- | --- | ---: | ---: |
+| [`PixelFlow-256/`](PixelFlow-256/) | class-to-image | 256×256 | 677M |
+| [`PixelFlow-T2I/`](PixelFlow-T2I/) | text-to-image | 1024×1024 | 882M |
+## ImageNet class labels
+For class-conditional [`PixelFlow-256/`](PixelFlow-256/), ImageNet-1k labels live in shared [`labels/`](labels/) at the repo root:
+| File | Direction | Value format |
+| --- | --- | --- |
+| `labels/id2label_en.json` | id → English | comma-separated synonyms, e.g. `"207": "golden retriever"` |
+| `labels/id2label_cn.json` | id → Chinese | comma-separated synonyms, e.g. `"207": "金毛猎犬"` |
+After `PixelFlowPipeline.from_pretrained(...)`, the pipeline exposes:
+- `pipe.id2label` / `pipe.id2label_cn` — inspect id → label correspondence
+- `pipe.labels` / `pipe.labels_cn` — reverse maps (synonym → id)
+- `pipe.get_label_ids("golden retriever")` or `pipe.get_label_ids("金毛猎犬", lang="cn")`
+- `pipe(class_labels="golden retriever", ...)` — string labels resolved automatically
+## Demo
+![PixelFlow-256 demo](PixelFlow-256/demo.png)
+## Load from a local clone
+```python
+import sys
+from pathlib import Path
+repo = Path("BiliSakura/PixelFlow-diffusers").resolve()
+variant = "PixelFlow-256"
+sys.path.insert(0, str(repo / variant))
+from pipeline import PixelFlowPipeline
+pipe = PixelFlowPipeline.from_pretrained(str(repo / variant))
+pipe.to("cuda")
+images = pipe(
+    class_labels=207,
+    num_inference_steps=[10, 10, 10, 10],
+    guidance_scale=4.0,
+).images
+# Human-readable ImageNet labels (English or Chinese)
+print(pipe.id2label[207])          # "golden retriever"
+print(pipe.id2label_cn[207])       # "金毛猎犬"
+pipe.get_label_ids("golden retriever")  # [207]
+pipe.get_label_ids("金毛猎犬", lang="cn")  # [207]
+images = pipe(class_labels="golden retriever", num_inference_steps=[10, 10, 10, 10]).images
+```
+### Text-to-image (`PixelFlow-T2I`)
+Uses [`google/flan-t5-xl`](https://huggingface.co/google/flan-t5-xl) as the text encoder (loaded from Hugging Face at runtime, not bundled in the repo).
+```python
+variant = "PixelFlow-T2I"
+sys.path.insert(0, str(repo / variant))
+from pipeline import PixelFlowPipeline
+pipe = PixelFlowPipeline.from_pretrained(str(repo / variant))
+pipe.to("cuda")
+images = pipe(
+    prompt="A golden retriever playing in a sunny garden",
+    num_inference_steps=[10, 10, 10, 10],
+    guidance_scale=4.0,
+).images
+```
+## Conversion
+```bash
+python scripts/convert_pixelflow_to_diffusers.py \
+  --checkpoint models/raw/PixelFlow/c2i/model.pt \
+  --config models/raw/PixelFlow/c2i/config.yaml \
+  --output models/BiliSakura/PixelFlow-diffusers/PixelFlow-256
+python scripts/convert_pixelflow_to_diffusers.py \
+  --checkpoint models/raw/PixelFlow/t2i/model.pt \
+  --config models/raw/PixelFlow/t2i/config.yaml \
+  --output models/BiliSakura/PixelFlow-diffusers/PixelFlow-T2I \
+  --skip-text-encoder
+```

labels/__pycache__/imagenet_labels.cpython-312.pyc ADDED Viewed

Binary file (3.24 kB). View file

labels/id2label_cn.json ADDED Viewed

	@@ -0,0 +1,1002 @@

+{
+  "0": "丁鲷",
+  "1": "金鱼",
+  "2": "大白鲨",
+  "3": "虎鲨",
+  "4": "锤头鲨",
+  "5": "电鳐",
+  "6": "黄貂鱼",
+  "7": "公鸡",
+  "8": "母鸡",
+  "9": "鸵鸟",
+  "10": "燕雀",
+  "11": "金翅雀",
+  "12": "家朱雀",
+  "13": "灯芯草雀",
+  "14": "靛蓝雀,靛蓝鸟",
+  "15": "蓝鹀",
+  "16": "夜莺",
+  "17": "松鸦",
+  "18": "喜鹊",
+  "19": "山雀",
+  "20": "河鸟",
+  "21": "鸢（猛禽）",
+  "22": "秃头鹰",
+  "23": "秃鹫",
+  "24": "大灰猫头鹰",
+  "25": "欧洲火蝾螈",
+  "26": "普通蝾螈",
+  "27": "水蜥",
+  "28": "斑点蝾螈",
+  "29": "蝾螈,泥狗",
+  "30": "牛蛙",
+  "31": "树蛙",
+  "32": "尾蛙,铃蟾蜍,肋蟾蜍,尾蟾蜍",
+  "33": "红海龟",
+  "34": "皮革龟",
+  "35": "泥龟",
+  "36": "淡水龟",
+  "37": "箱龟",
+  "38": "带状壁虎",
+  "39": "普通鬣蜥",
+  "40": "美国变色龙",
+  "41": "鞭尾蜥蜴",
+  "42": "飞龙科蜥蜴",
+  "43": "褶边蜥蜴",
+  "44": "鳄鱼蜥蜴",
+  "45": "毒蜥",
+  "46": "绿蜥蜴",
+  "47": "非洲变色龙",
+  "48": "科莫多蜥蜴",
+  "49": "非洲鳄,尼罗河鳄鱼",
+  "50": "美国鳄鱼,鳄鱼",
+  "51": "三角龙",
+  "52": "雷蛇,蠕虫蛇",
+  "53": "环蛇,环颈蛇",
+  "54": "希腊蛇",
+  "55": "绿蛇,草蛇",
+  "56": "国王蛇",
+  "57": "袜带蛇,草蛇",
+  "58": "水蛇",
+  "59": "藤蛇",
+  "60": "夜蛇",
+  "61": "大蟒蛇",
+  "62": "岩石蟒蛇,岩蛇,蟒蛇",
+  "63": "印度眼镜蛇",
+  "64": "绿曼巴",
+  "65": "海蛇",
+  "66": "角腹蛇",
+  "67": "菱纹响尾蛇",
+  "68": "角响尾蛇",
+  "69": "三叶虫",
+  "70": "盲蜘蛛",
+  "71": "蝎子",
+  "72": "黑金花园蜘蛛",
+  "73": "谷仓蜘蛛",
+  "74": "花园蜘蛛",
+  "75": "黑寡妇蜘蛛",
+  "76": "狼蛛",
+  "77": "狼蜘蛛,狩猎蜘蛛",
+  "78": "壁虱",
+  "79": "蜈蚣",
+  "80": "黑松鸡",
+  "81": "松鸡,雷鸟",
+  "82": "披肩鸡,披肩榛鸡",
+  "83": "草原鸡,草原松鸡",
+  "84": "孔雀",
+  "85": "鹌鹑",
+  "86": "鹧鸪",
+  "87": "非洲灰鹦鹉",
+  "88": "金刚鹦鹉",
+  "89": "硫冠鹦鹉",
+  "90": "短尾鹦鹉",
+  "91": "褐翅鸦鹃",
+  "92": "食蜂鸟",
+  "93": "犀鸟",
+  "94": "蜂鸟",
+  "95": "鹟䴕",
+  "96": "巨嘴鸟",
+  "97": "野鸭",
+  "98": "红胸秋沙鸭",
+  "99": "鹅",
+  "100": "黑天鹅",
+  "101": "大象",
+  "102": "针鼹鼠",
+  "103": "鸭嘴兽",
+  "104": "沙袋鼠",
+  "105": "考拉,考拉熊",
+  "106": "袋熊",
+  "107": "水母",
+  "108": "海葵",
+  "109": "脑珊瑚",
+  "110": "扁形虫,扁虫",
+  "111": "线虫,蛔虫",
+  "112": "海螺",
+  "113": "蜗牛",
+  "114": "鼻涕虫",
+  "115": "海蛞蝓",
+  "116": "石鳖",
+  "117": "鹦鹉螺",
+  "118": "珍宝蟹",
+  "119": "石蟹",
+  "120": "招潮蟹",
+  "121": "帝王蟹,阿拉斯加蟹,阿拉斯加帝王蟹",
+  "122": "美国龙虾,缅因州龙虾",
+  "123": "大螯虾",
+  "124": "小龙虾",
+  "125": "寄居蟹",
+  "126": "等足目动物(明虾和螃蟹近亲)",
+  "127": "白鹳",
+  "128": "黑鹳",
+  "129": "鹭",
+  "130": "火烈鸟",
+  "131": "小蓝鹭",
+  "132": "美国鹭,大白鹭",
+  "133": "麻鸦",
+  "134": "鹤",
+  "135": "秧鹤",
+  "136": "欧洲水鸡,紫水鸡",
+  "137": "沼泽泥母鸡,水母鸡",
+  "138": "鸨",
+  "139": "红翻石鹬",
+  "140": "红背鹬,黑腹滨鹬",
+  "141": "红脚鹬",
+  "142": "半蹼鹬",
+  "143": "蛎鹬",
+  "144": "鹈鹕",
+  "145": "国王企鹅",
+  "146": "信天翁,大海鸟",
+  "147": "灰鲸",
+  "148": "杀人鲸,逆戟鲸,虎鲸",
+  "149": "海牛",
+  "150": "海狮",
+  "151": "奇瓦瓦",
+  "152": "日本猎犬",
+  "153": "马尔济斯犬",
+  "154": "狮子狗",
+  "155": "西施犬",
+  "156": "布莱尼姆猎犬",
+  "157": "巴比狗",
+  "158": "玩具犬",
+  "159": "罗得西亚长背猎狗",
+  "160": "阿富汗猎犬",
+  "161": "猎犬",
+  "162": "比格犬,猎兔犬",
+  "163": "侦探犬",
+  "164": "蓝色快狗",
+  "165": "黑褐猎浣熊犬",
+  "166": "沃克猎犬",
+  "167": "英国猎狐犬",
+  "168": "美洲赤狗",
+  "169": "俄罗斯猎狼犬",
+  "170": "爱尔兰猎狼犬",
+  "171": "意大利灰狗",
+  "172": "惠比特犬",
+  "173": "依比沙猎犬",
+  "174": "挪威猎犬",
+  "175": "奥达猎犬,水獭猎犬",
+  "176": "沙克犬,瞪羚猎犬",
+  "177": "苏格兰猎鹿犬,猎鹿犬",
+  "178": "威玛猎犬",
+  "179": "斯塔福德郡牛头梗,斯塔福德郡斗牛梗",
+  "180": "美国斯塔福德郡梗,美国比特斗牛梗,斗牛梗",
+  "181": "贝德灵顿梗",
+  "182": "边境梗",
+  "183": "凯丽蓝梗",
+  "184": "爱尔兰梗",
+  "185": "诺福克梗",
+  "186": "诺维奇梗",
+  "187": "约克郡梗",
+  "188": "刚毛猎狐梗",
+  "189": "莱克兰梗",
+  "190": "锡利哈姆梗",
+  "191": "艾尔谷犬",
+  "192": "凯恩梗",
+  "193": "澳大利亚梗",
+  "194": "丹迪丁蒙梗",
+  "195": "波士顿梗",
+  "196": "迷你雪纳瑞犬",
+  "197": "巨型雪纳瑞犬",
+  "198": "标准雪纳瑞犬",
+  "199": "苏格兰梗",
+  "200": "西藏梗,菊花狗",
+  "201": "丝毛梗",
+  "202": "软毛麦色梗",
+  "203": "西高地白梗",
+  "204": "拉萨阿普索犬",
+  "205": "平毛寻回犬",
+  "206": "卷毛寻回犬",
+  "207": "金毛猎犬",
+  "208": "拉布拉多猎犬",
+  "209": "乞沙比克猎犬",
+  "210": "德国短毛猎犬",
+  "211": "维兹拉犬",
+  "212": "英国谍犬",
+  "213": "爱尔兰雪达犬,红色猎犬",
+  "214": "戈登雪达犬",
+  "215": "布列塔尼犬猎犬",
+  "216": "黄毛,黄毛猎犬",
+  "217": "英国史宾格犬",
+  "218": "威尔士史宾格犬",
+  "219": "可卡犬,英国可卡犬",
+  "220": "萨塞克斯猎犬",
+  "221": "爱尔兰水猎犬",
+  "222": "哥威斯犬",
+  "223": "舒柏奇犬",
+  "224": "比利时牧羊犬",
+  "225": "马里努阿犬",
+  "226": "伯瑞犬",
+  "227": "凯尔皮犬",
+  "228": "匈牙利牧羊犬",
+  "229": "老英国牧羊犬",
+  "230": "喜乐蒂牧羊犬",
+  "231": "牧羊犬",
+  "232": "边境牧羊犬",
+  "233": "法兰德斯牧牛狗",
+  "234": "罗特韦尔犬",
+  "235": "德国牧羊犬,德国警犬,阿尔萨斯",
+  "236": "多伯曼犬,杜宾犬",
+  "237": "迷你杜宾犬",
+  "238": "大瑞士山地犬",
+  "239": "伯恩山犬",
+  "240": "Appenzeller狗",
+  "241": "EntleBucher狗",
+  "242": "拳师狗",
+  "243": "斗牛獒",
+  "244": "藏獒",
+  "245": "法国斗牛犬",
+  "246": "大丹犬",
+  "247": "圣伯纳德狗",
+  "248": "爱斯基摩犬,哈士奇",
+  "249": "雪橇犬,阿拉斯加爱斯基摩狗",
+  "250": "哈士奇",
+  "251": "达尔马提亚,教练车狗",
+  "252": "狮毛狗",
+  "253": "巴辛吉狗",
+  "254": "哈巴狗,狮子狗",
+  "255": "莱昂贝格狗",
+  "256": "纽芬兰岛狗",
+  "257": "大白熊犬",
+  "258": "萨摩耶犬",
+  "259": "博美犬",
+  "260": "松狮,松狮",
+  "261": "荷兰卷尾狮毛狗",
+  "262": "布鲁塞尔格林芬犬",
+  "263": "彭布洛克威尔士科基犬",
+  "264": "威尔士柯基犬",
+  "265": "玩具贵宾犬",
+  "266": "迷你贵宾犬",
+  "267": "标准贵宾犬",
+  "268": "墨西哥无毛犬",
+  "269": "灰狼",
+  "270": "白狼,北极狼",
+  "271": "红狼,鬃狼",
+  "272": "狼,草原狼,刷狼,郊狼",
+  "273": "澳洲野狗,澳大利亚野犬",
+  "274": "豺",
+  "275": "非洲猎犬,土狼犬",
+  "276": "鬣狗",
+  "277": "红狐狸",
+  "278": "沙狐",
+  "279": "北极狐狸,白狐狸",
+  "280": "灰狐狸",
+  "281": "虎斑猫",
+  "282": "山猫,虎猫",
+  "283": "波斯猫",
+  "284": "暹罗猫",
+  "285": "埃及猫",
+  "286": "美洲狮,美洲豹",
+  "287": "猞猁,山猫",
+  "288": "豹子",
+  "289": "雪豹",
+  "290": "美洲虎",
+  "291": "狮子",
+  "292": "老虎",
+  "293": "猎豹",
+  "294": "棕熊",
+  "295": "美洲黑熊",
+  "296": "冰熊,北极熊",
+  "297": "懒熊",
+  "298": "獴",
+  "299": "猫鼬,海猫",
+  "300": "虎甲虫",
+  "301": "瓢虫",
+  "302": "土鳖虫",
+  "303": "天牛",
+  "304": "龟甲虫",
+  "305": "粪甲虫",
+  "306": "犀牛甲虫",
+  "307": "象甲",
+  "308": "苍蝇",
+  "309": "蜜蜂",
+  "310": "蚂蚁",
+  "311": "蚱蜢",
+  "312": "蟋蟀",
+  "313": "竹节虫",
+  "314": "蟑螂",
+  "315": "螳螂",
+  "316": "蝉",
+  "317": "叶蝉",
+  "318": "草蜻蛉",
+  "319": "蜻蜓",
+  "320": "豆娘,蜻蛉",
+  "321": "优红蛱蝶",
+  "322": "小环蝴蝶",
+  "323": "君主蝴蝶,大斑蝶",
+  "324": "菜粉蝶",
+  "325": "白蝴蝶",
+  "326": "灰蝶",
+  "327": "海星",
+  "328": "海胆",
+  "329": "海参,海黄瓜",
+  "330": "野兔",
+  "331": "兔",
+  "332": "安哥拉兔",
+  "333": "仓鼠",
+  "334": "刺猬,豪猪",
+  "335": "黑松鼠",
+  "336": "土拨鼠",
+  "337": "海狸",
+  "338": "豚鼠,豚鼠",
+  "339": "栗色马",
+  "340": "斑马",
+  "341": "猪",
+  "342": "野猪",
+  "343": "疣猪",
+  "344": "河马",
+  "345": "牛",
+  "346": "水牛,亚洲水牛",
+  "347": "野牛",
+  "348": "公羊",
+  "349": "大角羊,洛矶山大角羊",
+  "350": "山羊",
+  "351": "狷羚",
+  "352": "黑斑羚",
+  "353": "瞪羚",
+  "354": "阿拉伯单峰骆驼,骆驼",
+  "355": "羊驼",
+  "356": "黄鼠狼",
+  "357": "水貂",
+  "358": "臭猫",
+  "359": "黑足鼬",
+  "360": "水獭",
+  "361": "臭鼬,木猫",
+  "362": "獾",
+  "363": "犰狳",
+  "364": "树懒",
+  "365": "猩猩,婆罗洲猩猩",
+  "366": "大猩猩",
+  "367": "黑猩猩",
+  "368": "长臂猿",
+  "369": "合趾猿长臂猿,合趾猿",
+  "370": "长尾猴",
+  "371": "赤猴",
+  "372": "狒狒",
+  "373": "恒河猴,猕猴",
+  "374": "白头叶猴",
+  "375": "疣猴",
+  "376": "长鼻猴",
+  "377": "狨（美洲产小型长尾猴）",
+  "378": "卷尾猴",
+  "379": "吼猴",
+  "380": "伶猴",
+  "381": "蜘蛛猴",
+  "382": "松鼠猴",
+  "383": "马达加斯加环尾狐猴,鼠狐猴",
+  "384": "大狐猴,马达加斯加大狐猴",
+  "385": "印度大象,亚洲象",
+  "386": "非洲象,非洲象",
+  "387": "小熊猫",
+  "388": "大熊猫",
+  "389": "杖鱼",
+  "390": "鳗鱼",
+  "391": "银鲑,银鲑鱼",
+  "392": "三色刺蝶鱼",
+  "393": "海葵鱼",
+  "394": "鲟鱼",
+  "395": "雀鳝",
+  "396": "狮子鱼",
+  "397": "河豚",
+  "398": "算盘",
+  "399": "长袍",
+  "400": "学位袍",
+  "401": "手风琴",
+  "402": "原声吉他",
+  "403": "航空母舰",
+  "404": "客机",
+  "405": "飞艇",
+  "406": "祭坛",
+  "407": "救护车",
+  "408": "水陆两用车",
+  "409": "模拟时钟",
+  "410": "蜂房",
+  "411": "围裙",
+  "412": "垃圾桶",
+  "413": "攻击步枪,枪",
+  "414": "背包",
+  "415": "面包店,面包铺",
+  "416": "平衡木",
+  "417": "热气球",
+  "418": "圆珠笔",
+  "419": "创可贴",
+  "420": "班卓琴",
+  "421": "栏杆,楼梯扶手",
+  "422": "杠铃",
+  "423": "理发师的椅子",
+  "424": "理发店",
+  "425": "牲口棚",
+  "426": "晴雨表",
+  "427": "圆筒",
+  "428": "园地小车,手推车",
+  "429": "棒球",
+  "430": "篮球",
+  "431": "婴儿床",
+  "432": "巴松管,低音管",
+  "433": "游泳帽",
+  "434": "沐浴毛巾",
+  "435": "浴缸,澡盆",
+  "436": "沙滩车,旅行车",
+  "437": "灯塔",
+  "438": "高脚杯",
+  "439": "熊皮高帽",
+  "440": "啤酒瓶",
+  "441": "啤酒杯",
+  "442": "钟塔",
+  "443": "（小儿用的）围嘴",
+  "444": "串联自行车",
+  "445": "比基尼",
+  "446": "装订册",
+  "447": "双筒望远镜",
+  "448": "鸟舍",
+  "449": "船库",
+  "450": "雪橇",
+  "451": "饰扣式领带",
+  "452": "阔边女帽",
+  "453": "书橱",
+  "454": "书店,书摊",
+  "455": "瓶盖",
+  "456": "弓箭",
+  "457": "蝴蝶结领结",
+  "458": "铜制牌位",
+  "459": "奶罩",
+  "460": "防波堤,海堤",
+  "461": "铠甲",
+  "462": "扫帚",
+  "463": "桶",
+  "464": "扣环",
+  "465": "防弹背心",
+  "466": "动车,子弹头列车",
+  "467": "肉铺,肉菜市场",
+  "468": "出租车",
+  "469": "大锅",
+  "470": "蜡烛",
+  "471": "大炮",
+  "472": "独木舟",
+  "473": "开瓶器,开罐器",
+  "474": "开衫",
+  "475": "车镜",
+  "476": "旋转木马",
+  "477": "木匠的工具包,工具包",
+  "478": "纸箱",
+  "479": "车轮",
+  "480": "取款机,自动取款机",
+  "481": "盒式录音带",
+  "482": "卡带播放器",
+  "483": "城堡",
+  "484": "双体船",
+  "485": "CD播放器",
+  "486": "大提琴",
+  "487": "移动电话,手机",
+  "488": "铁链",
+  "489": "围栏",
+  "490": "链甲",
+  "491": "电锯,油锯",
+  "492": "箱子",
+  "493": "衣柜,洗脸台",
+  "494": "编钟,钟,锣",
+  "495": "中国橱柜",
+  "496": "圣诞袜",
+  "497": "教堂,教堂建筑",
+  "498": "电影院,剧场",
+  "499": "切肉刀,菜刀",
+  "500": "悬崖屋",
+  "501": "斗篷",
+  "502": "木屐,木鞋",
+  "503": "鸡尾酒调酒器",
+  "504": "咖啡杯",
+  "505": "咖啡壶",
+  "506": "螺旋结构（楼梯）",
+  "507": "组合锁",
+  "508": "电脑键盘,键盘",
+  "509": "糖果,糖果店",
+  "510": "集装箱船",
+  "511": "敞篷车",
+  "512": "开瓶器,瓶螺杆",
+  "513": "短号,喇叭",
+  "514": "牛仔靴",
+  "515": "牛仔帽",
+  "516": "摇篮",
+  "517": "起重机",
+  "518": "头盔",
+  "519": "板条箱",
+  "520": "小儿床",
+  "521": "砂锅",
+  "522": "槌球",
+  "523": "拐杖",
+  "524": "胸甲",
+  "525": "大坝,堤防",
+  "526": "书桌",
+  "527": "台式电脑",
+  "528": "有线电话",
+  "529": "尿布湿",
+  "530": "数字时钟",
+  "531": "数字手表",
+  "532": "餐桌板",
+  "533": "抹布",
+  "534": "洗碗机,洗碟机",
+  "535": "盘式制动器",
+  "536": "码头,船坞,码头设施",
+  "537": "狗拉雪橇",
+  "538": "圆顶",
+  "539": "门垫,垫子",
+  "540": "钻井平台,海上钻井",
+  "541": "鼓,乐器,鼓膜",
+  "542": "鼓槌",
+  "543": "哑铃",
+  "544": "荷兰烤箱",
+  "545": "电风扇,鼓风机",
+  "546": "电吉他",
+  "547": "电力机车",
+  "548": "电视,电视柜",
+  "549": "信封",
+  "550": "浓缩咖啡机",
+  "551": "扑面粉",
+  "552": "女用长围巾",
+  "553": "文件,文件柜,档案柜",
+  "554": "消防船",
+  "555": "消防车",
+  "556": "火炉栏",
+  "557": "旗杆",
+  "558": "长笛",
+  "559": "折叠椅",
+  "560": "橄榄球头盔",
+  "561": "叉车",
+  "562": "喷泉",
+  "563": "钢笔",
+  "564": "有四根帷柱的床",
+  "565": "运货车厢",
+  "566": "圆号,喇叭",
+  "567": "煎锅",
+  "568": "裘皮大衣",
+  "569": "垃圾车",
+  "570": "防毒面具,呼吸器",
+  "571": "汽油泵",
+  "572": "高脚杯",
+  "573": "卡丁车",
+  "574": "高尔夫球",
+  "575": "高尔夫球车",
+  "576": "狭长小船",
+  "577": "锣",
+  "578": "礼服",
+  "579": "钢琴",
+  "580": "温室,苗圃",
+  "581": "散热器格栅",
+  "582": "杂货店,食品市场",
+  "583": "断头台",
+  "584": "小发夹",
+  "585": "头发喷雾",
+  "586": "半履带装甲车",
+  "587": "锤子",
+  "588": "大篮子",
+  "589": "手摇鼓风机,吹风机",
+  "590": "手提电脑",
+  "591": "手帕",
+  "592": "硬盘",
+  "593": "口琴,口风琴",
+  "594": "竖琴",
+  "595": "收割机",
+  "596": "斧头",
+  "597": "手枪皮套",
+  "598": "家庭影院",
+  "599": "蜂窝",
+  "600": "钩爪",
+  "601": "衬裙",
+  "602": "单杠",
+  "603": "马车",
+  "604": "沙漏",
+  "605": "iPod",
+  "606": "熨斗",
+  "607": "南瓜灯笼",
+  "608": "牛仔裤,蓝色牛仔裤",
+  "609": "吉普车",
+  "610": "运动衫,T恤",
+  "611": "拼图",
+  "612": "人力车",
+  "613": "操纵杆",
+  "614": "和服",
+  "615": "护膝",
+  "616": "蝴蝶结",
+  "617": "大褂,实验室外套",
+  "618": "长柄勺",
+  "619": "灯罩",
+  "620": "笔记本电脑",
+  "621": "割草机",
+  "622": "镜头盖",
+  "623": "开信刀,裁纸刀",
+  "624": "图书馆",
+  "625": "救生艇",
+  "626": "点火器,打火机",
+  "627": "豪华轿车",
+  "628": "远洋班轮",
+  "629": "唇膏,口红",
+  "630": "平底便鞋",
+  "631": "洗剂",
+  "632": "扬声器",
+  "633": "放大镜",
+  "634": "锯木厂",
+  "635": "磁罗盘",
+  "636": "邮袋",
+  "637": "信箱",
+  "638": "女游泳衣",
+  "639": "有肩带浴衣",
+  "640": "窨井盖",
+  "641": "沙球（一种打击乐器）",
+  "642": "马林巴木琴",
+  "643": "面膜",
+  "644": "火柴",
+  "645": "花柱",
+  "646": "迷宫",
+  "647": "量杯",
+  "648": "药箱",
+  "649": "巨石,巨石结构",
+  "650": "麦克风",
+  "651": "微波炉",
+  "652": "军装",
+  "653": "奶桶",
+  "654": "迷你巴士",
+  "655": "迷你裙",
+  "656": "面包车",
+  "657": "导弹",
+  "658": "连指手套",
+  "659": "搅拌钵",
+  "660": "活动房屋（由汽车拖拉的）",
+  "661": "T型发动机小汽车",
+  "662": "调制解调器",
+  "663": "修道院",
+  "664": "显示器",
+  "665": "电瓶车",
+  "666": "砂浆",
+  "667": "学士",
+  "668": "清真寺",
+  "669": "蚊帐",
+  "670": "摩托车",
+  "671": "山地自行车",
+  "672": "登山帐",
+  "673": "鼠标,电脑鼠标",
+  "674": "捕鼠器",
+  "675": "搬家车",
+  "676": "口套",
+  "677": "钉子",
+  "678": "颈托",
+  "679": "项链",
+  "680": "乳头（瓶）",
+  "681": "笔记本,笔记本电脑",
+  "682": "方尖碑",
+  "683": "双簧管",
+  "684": "陶笛,卵形笛",
+  "685": "里程表",
+  "686": "滤油器",
+  "687": "风琴,管风琴",
+  "688": "示波器",
+  "689": "罩裙",
+  "690": "牛车",
+  "691": "氧气面罩",
+  "692": "包装",
+  "693": "船桨",
+  "694": "明轮,桨轮",
+  "695": "挂锁,扣锁",
+  "696": "画笔",
+  "697": "睡衣",
+  "698": "宫殿",
+  "699": "排箫,鸣管",
+  "700": "纸巾",
+  "701": "降落伞",
+  "702": "双杠",
+  "703": "公园长椅",
+  "704": "停车收费表,停车计时器",
+  "705": "客车,教练车",
+  "706": "露台,阳台",
+  "707": "付费电话",
+  "708": "基座,基脚",
+  "709": "铅笔盒",
+  "710": "卷笔刀",
+  "711": "香水（瓶）",
+  "712": "培养皿",
+  "713": "复印机",
+  "714": "拨弦片,拨子",
+  "715": "尖顶头盔",
+  "716": "栅栏,栅栏",
+  "717": "皮卡,皮卡车",
+  "718": "桥墩",
+  "719": "存钱罐",
+  "720": "药瓶",
+  "721": "枕头",
+  "722": "乒乓球",
+  "723": "风车",
+  "724": "海盗船",
+  "725": "水罐",
+  "726": "木工刨",
+  "727": "天文馆",
+  "728": "塑料袋",
+  "729": "板架",
+  "730": "犁型铲雪机",
+  "731": "手压皮碗泵",
+  "732": "宝丽来相机",
+  "733": "电线杆",
+  "734": "警车,巡逻车",
+  "735": "雨披",
+  "736": "台球桌",
+  "737": "充气饮料瓶",
+  "738": "花盆",
+  "739": "陶工旋盘",
+  "740": "电钻",
+  "741": "祈祷垫,地毯",
+  "742": "打印机",
+  "743": "监狱",
+  "744": "炮弹,导弹",
+  "745": "投影仪",
+  "746": "冰球",
+  "747": "沙包,吊球",
+  "748": "钱包",
+  "749": "羽管笔",
+  "750": "被子",
+  "751": "赛车",
+  "752": "球拍",
+  "753": "散热器",
+  "754": "收音机",
+  "755": "射电望远镜,无线电反射器",
+  "756": "雨桶",
+  "757": "休闲车,房车",
+  "758": "卷轴,卷筒",
+  "759": "反射式照相机",
+  "760": "冰箱,冰柜",
+  "761": "遥控器",
+  "762": "餐厅,饮食店,食堂",
+  "763": "左轮手枪",
+  "764": "步枪",
+  "765": "摇椅",
+  "766": "电转烤肉架",
+  "767": "橡皮",
+  "768": "橄榄球",
+  "769": "直尺",
+  "770": "跑步鞋",
+  "771": "保险柜",
+  "772": "安全别针",
+  "773": "盐瓶（调味用）",
+  "774": "凉鞋",
+  "775": "纱笼,围裙",
+  "776": "萨克斯管",
+  "777": "剑鞘",
+  "778": "秤,称重机",
+  "779": "校车",
+  "780": "帆船",
+  "781": "记分牌",
+  "782": "屏幕",
+  "783": "螺丝",
+  "784": "螺丝刀",
+  "785": "安全带",
+  "786": "缝纫机",
+  "787": "盾牌,盾牌",
+  "788": "皮鞋店,鞋店",
+  "789": "障子",
+  "790": "购物篮",
+  "791": "购物车",
+  "792": "铁锹",
+  "793": "浴帽",
+  "794": "浴帘",
+  "795": "滑雪板",
+  "796": "滑雪面罩",
+  "797": "睡袋",
+  "798": "滑尺",
+  "799": "滑动门",
+  "800": "角子老虎机",
+  "801": "潜水通气管",
+  "802": "雪橇",
+  "803": "扫雪机,扫雪机",
+  "804": "皂液器",
+  "805": "足球",
+  "806": "袜子",
+  "807": "碟式太阳能,太阳能集热器,太阳能炉",
+  "808": "宽边帽",
+  "809": "汤碗",
+  "810": "空格键",
+  "811": "空间加热器",
+  "812": "航天飞机",
+  "813": "铲（搅拌或涂敷用的）",
+  "814": "快艇",
+  "815": "蜘蛛网",
+  "816": "纺锤,纱锭",
+  "817": "跑车",
+  "818": "聚光灯",
+  "819": "舞台",
+  "820": "蒸汽机车",
+  "821": "钢拱桥",
+  "822": "钢滚筒",
+  "823": "听诊器",
+  "824": "女用披肩",
+  "825": "石头墙",
+  "826": "秒表",
+  "827": "火炉",
+  "828": "过滤器",
+  "829": "有轨电车,电车",
+  "830": "担架",
+  "831": "沙发床",
+  "832": "佛塔",
+  "833": "潜艇,潜水艇",
+  "834": "套装,衣服",
+  "835": "日晷",
+  "836": "太阳镜",
+  "837": "太阳镜,墨镜",
+  "838": "防晒霜,防晒剂",
+  "839": "悬索桥",
+  "840": "拖把",
+  "841": "运动衫",
+  "842": "游泳裤",
+  "843": "秋千",
+  "844": "开关,电器开关",
+  "845": "注射器",
+  "846": "台灯",
+  "847": "坦克,装甲战车,装甲战斗车辆",
+  "848": "磁带播放器",
+  "849": "茶壶",
+  "850": "泰迪,泰迪熊",
+  "851": "电视",
+  "852": "网球",
+  "853": "茅草,茅草屋顶",
+  "854": "幕布,剧院的帷幕",
+  "855": "顶针",
+  "856": "脱粒机",
+  "857": "宝座",
+  "858": "瓦屋顶",
+  "859": "烤面包机",
+  "860": "烟草店,烟草",
+  "861": "马桶",
+  "862": "火炬",
+  "863": "图腾柱",
+  "864": "拖车,牵引车,清障车",
+  "865": "玩具店",
+  "866": "拖拉机",
+  "867": "拖车,铰接式卡车",
+  "868": "托盘",
+  "869": "风衣",
+  "870": "三轮车",
+  "871": "三体船",
+  "872": "三脚架",
+  "873": "凯旋门",
+  "874": "无轨电车",
+  "875": "长号",
+  "876": "浴盆,浴缸",
+  "877": "旋转式栅门",
+  "878": "打字机键盘",
+  "879": "伞",
+  "880": "独轮车",
+  "881": "直立式钢琴",
+  "882": "真空吸尘器",
+  "883": "花瓶",
+  "884": "拱顶",
+  "885": "天鹅绒",
+  "886": "自动售货机",
+  "887": "祭服",
+  "888": "高架桥",
+  "889": "小提琴,小提琴",
+  "890": "排球",
+  "891": "松饼机",
+  "892": "挂钟",
+  "893": "钱包,皮夹",
+  "894": "衣柜,壁橱",
+  "895": "军用飞机",
+  "896": "洗脸盆,洗手盆",
+  "897": "洗衣机,自动洗衣机",
+  "898": "水瓶",
+  "899": "水壶",
+  "900": "水塔",
+  "901": "威士忌壶",
+  "902": "哨子",
+  "903": "假发",
+  "904": "纱窗",
+  "905": "百叶窗",
+  "906": "温莎领带",
+  "907": "葡萄酒瓶",
+  "908": "飞机翅膀,飞机",
+  "909": "炒菜锅",
+  "910": "木制的勺子",
+  "911": "毛织品,羊绒",
+  "912": "栅栏,围栏",
+  "913": "沉船",
+  "914": "双桅船",
+  "915": "蒙古包",
+  "916": "网站,互联网网站",
+  "917": "漫画",
+  "918": "纵横字谜",
+  "919": "路标",
+  "920": "交通信号灯",
+  "921": "防尘罩,书皮",
+  "922": "菜单",
+  "923": "盘子",
+  "924": "鳄梨酱",
+  "925": "清汤",
+  "926": "罐焖土豆烧肉",
+  "927": "蛋糕",
+  "928": "冰淇淋",
+  "929": "雪糕,冰棍,冰棒",
+  "930": "法式面包",
+  "931": "百吉饼",
+  "932": "椒盐脆饼",
+  "933": "芝士汉堡",
+  "934": "热狗",
+  "935": "土豆泥",
+  "936": "结球甘蓝",
+  "937": "西兰花",
+  "938": "菜花",
+  "939": "绿皮密生西葫芦",
+  "940": "西葫芦",
+  "941": "小青南瓜",
+  "942": "南瓜",
+  "943": "黄瓜",
+  "944": "朝鲜蓟",
+  "945": "甜椒",
+  "946": "刺棘蓟",
+  "947": "蘑菇",
+  "948": "绿苹果",
+  "949": "草莓",
+  "950": "橘子",
+  "951": "柠檬",
+  "952": "无花果",
+  "953": "菠萝",
+  "954": "香蕉",
+  "955": "菠萝蜜",
+  "956": "蛋奶冻苹果",
+  "957": "石榴",
+  "958": "干草",
+  "959": "烤面条加干酪沙司",
+  "960": "巧克力酱,巧克力糖浆",
+  "961": "面团",
+  "962": "瑞士肉包,肉饼",
+  "963": "披萨,披萨饼",
+  "964": "馅饼",
+  "965": "卷饼",
+  "966": "红葡萄酒",
+  "967": "意大利浓咖啡",
+  "968": "杯子",
+  "969": "蛋酒",
+  "970": "高山",
+  "971": "泡泡",
+  "972": "悬崖",
+  "973": "珊瑚礁",
+  "974": "间歇泉",
+  "975": "湖边,湖岸",
+  "976": "海角",
+  "977": "沙洲,沙坝",
+  "978": "海滨,海岸",
+  "979": "峡谷",
+  "980": "火山",
+  "981": "棒球,棒球运动员",
+  "982": "新郎",
+  "983": "潜水员",
+  "984": "油菜",
+  "985": "雏菊",
+  "986": "杓兰",
+  "987": "玉米",
+  "988": "橡子",
+  "989": "玫瑰果",
+  "990": "七叶树果实",
+  "991": "珊瑚菌",
+  "992": "木耳",
+  "993": "鹿花菌",
+  "994": "鬼笔菌",
+  "995": "地星（菌类）",
+  "996": "多叶奇果菌",
+  "997": "牛肝菌",
+  "998": "玉米穗",
+  "999": "卫生纸"
+}

labels/id2label_en.json ADDED Viewed

	@@ -0,0 +1,1002 @@

+{
+  "0": "tench, Tinca tinca",
+  "1": "goldfish, Carassius auratus",
+  "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
+  "3": "tiger shark, Galeocerdo cuvieri",
+  "4": "hammerhead, hammerhead shark",
+  "5": "electric ray, crampfish, numbfish, torpedo",
+  "6": "stingray",
+  "7": "cock",
+  "8": "hen",
+  "9": "ostrich, Struthio camelus",
+  "10": "brambling, Fringilla montifringilla",
+  "11": "goldfinch, Carduelis carduelis",
+  "12": "house finch, linnet, Carpodacus mexicanus",
+  "13": "junco, snowbird",
+  "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+  "15": "robin, American robin, Turdus migratorius",
+  "16": "bulbul",
+  "17": "jay",
+  "18": "magpie",
+  "19": "chickadee",
+  "20": "water ouzel, dipper",
+  "21": "kite",
+  "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
+  "23": "vulture",
+  "24": "great grey owl, great gray owl, Strix nebulosa",
+  "25": "European fire salamander, Salamandra salamandra",
+  "26": "common newt, Triturus vulgaris",
+  "27": "eft",
+  "28": "spotted salamander, Ambystoma maculatum",
+  "29": "axolotl, mud puppy, Ambystoma mexicanum",
+  "30": "bullfrog, Rana catesbeiana",
+  "31": "tree frog, tree-frog",
+  "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+  "33": "loggerhead, loggerhead turtle, Caretta caretta",
+  "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+  "35": "mud turtle",
+  "36": "terrapin",
+  "37": "box turtle, box tortoise",
+  "38": "banded gecko",
+  "39": "common iguana, iguana, Iguana iguana",
+  "40": "American chameleon, anole, Anolis carolinensis",
+  "41": "whiptail, whiptail lizard",
+  "42": "agama",
+  "43": "frilled lizard, Chlamydosaurus kingi",
+  "44": "alligator lizard",
+  "45": "Gila monster, Heloderma suspectum",
+  "46": "green lizard, Lacerta viridis",
+  "47": "African chameleon, Chamaeleo chamaeleon",
+  "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
+  "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
+  "50": "American alligator, Alligator mississipiensis",
+  "51": "triceratops",
+  "52": "thunder snake, worm snake, Carphophis amoenus",
+  "53": "ringneck snake, ring-necked snake, ring snake",
+  "54": "hognose snake, puff adder, sand viper",
+  "55": "green snake, grass snake",
+  "56": "king snake, kingsnake",
+  "57": "garter snake, grass snake",
+  "58": "water snake",
+  "59": "vine snake",
+  "60": "night snake, Hypsiglena torquata",
+  "61": "boa constrictor, Constrictor constrictor",
+  "62": "rock python, rock snake, Python sebae",
+  "63": "Indian cobra, Naja naja",
+  "64": "green mamba",
+  "65": "sea snake",
+  "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+  "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+  "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
+  "69": "trilobite",
+  "70": "harvestman, daddy longlegs, Phalangium opilio",
+  "71": "scorpion",
+  "72": "black and gold garden spider, Argiope aurantia",
+  "73": "barn spider, Araneus cavaticus",
+  "74": "garden spider, Aranea diademata",
+  "75": "black widow, Latrodectus mactans",
+  "76": "tarantula",
+  "77": "wolf spider, hunting spider",
+  "78": "tick",
+  "79": "centipede",
+  "80": "black grouse",
+  "81": "ptarmigan",
+  "82": "ruffed grouse, partridge, Bonasa umbellus",
+  "83": "prairie chicken, prairie grouse, prairie fowl",
+  "84": "peacock",
+  "85": "quail",
+  "86": "partridge",
+  "87": "African grey, African gray, Psittacus erithacus",
+  "88": "macaw",
+  "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+  "90": "lorikeet",
+  "91": "coucal",
+  "92": "bee eater",
+  "93": "hornbill",
+  "94": "hummingbird",
+  "95": "jacamar",
+  "96": "toucan",
+  "97": "drake",
+  "98": "red-breasted merganser, Mergus serrator",
+  "99": "goose",
+  "100": "black swan, Cygnus atratus",
+  "101": "tusker",
+  "102": "echidna, spiny anteater, anteater",
+  "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
+  "104": "wallaby, brush kangaroo",
+  "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+  "106": "wombat",
+  "107": "jellyfish",
+  "108": "sea anemone, anemone",
+  "109": "brain coral",
+  "110": "flatworm, platyhelminth",
+  "111": "nematode, nematode worm, roundworm",
+  "112": "conch",
+  "113": "snail",
+  "114": "slug",
+  "115": "sea slug, nudibranch",
+  "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+  "117": "chambered nautilus, pearly nautilus, nautilus",
+  "118": "Dungeness crab, Cancer magister",
+  "119": "rock crab, Cancer irroratus",
+  "120": "fiddler crab",
+  "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
+  "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+  "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+  "124": "crayfish, crawfish, crawdad, crawdaddy",
+  "125": "hermit crab",
+  "126": "isopod",
+  "127": "white stork, Ciconia ciconia",
+  "128": "black stork, Ciconia nigra",
+  "129": "spoonbill",
+  "130": "flamingo",
+  "131": "little blue heron, Egretta caerulea",
+  "132": "American egret, great white heron, Egretta albus",
+  "133": "bittern",
+  "134": "crane",
+  "135": "limpkin, Aramus pictus",
+  "136": "European gallinule, Porphyrio porphyrio",
+  "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
+  "138": "bustard",
+  "139": "ruddy turnstone, Arenaria interpres",
+  "140": "red-backed sandpiper, dunlin, Erolia alpina",
+  "141": "redshank, Tringa totanus",
+  "142": "dowitcher",
+  "143": "oystercatcher, oyster catcher",
+  "144": "pelican",
+  "145": "king penguin, Aptenodytes patagonica",
+  "146": "albatross, mollymawk",
+  "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
+  "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+  "149": "dugong, Dugong dugon",
+  "150": "sea lion",
+  "151": "Chihuahua",
+  "152": "Japanese spaniel",
+  "153": "Maltese dog, Maltese terrier, Maltese",
+  "154": "Pekinese, Pekingese, Peke",
+  "155": "Shih-Tzu",
+  "156": "Blenheim spaniel",
+  "157": "papillon",
+  "158": "toy terrier",
+  "159": "Rhodesian ridgeback",
+  "160": "Afghan hound, Afghan",
+  "161": "basset, basset hound",
+  "162": "beagle",
+  "163": "bloodhound, sleuthhound",
+  "164": "bluetick",
+  "165": "black-and-tan coonhound",
+  "166": "Walker hound, Walker foxhound",
+  "167": "English foxhound",
+  "168": "redbone",
+  "169": "borzoi, Russian wolfhound",
+  "170": "Irish wolfhound",
+  "171": "Italian greyhound",
+  "172": "whippet",
+  "173": "Ibizan hound, Ibizan Podenco",
+  "174": "Norwegian elkhound, elkhound",
+  "175": "otterhound, otter hound",
+  "176": "Saluki, gazelle hound",
+  "177": "Scottish deerhound, deerhound",
+  "178": "Weimaraner",
+  "179": "Staffordshire bullterrier, Staffordshire bull terrier",
+  "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
+  "181": "Bedlington terrier",
+  "182": "Border terrier",
+  "183": "Kerry blue terrier",
+  "184": "Irish terrier",
+  "185": "Norfolk terrier",
+  "186": "Norwich terrier",
+  "187": "Yorkshire terrier",
+  "188": "wire-haired fox terrier",
+  "189": "Lakeland terrier",
+  "190": "Sealyham terrier, Sealyham",
+  "191": "Airedale, Airedale terrier",
+  "192": "cairn, cairn terrier",
+  "193": "Australian terrier",
+  "194": "Dandie Dinmont, Dandie Dinmont terrier",
+  "195": "Boston bull, Boston terrier",
+  "196": "miniature schnauzer",
+  "197": "giant schnauzer",
+  "198": "standard schnauzer",
+  "199": "Scotch terrier, Scottish terrier, Scottie",
+  "200": "Tibetan terrier, chrysanthemum dog",
+  "201": "silky terrier, Sydney silky",
+  "202": "soft-coated wheaten terrier",
+  "203": "West Highland white terrier",
+  "204": "Lhasa, Lhasa apso",
+  "205": "flat-coated retriever",
+  "206": "curly-coated retriever",
+  "207": "golden retriever",
+  "208": "Labrador retriever",
+  "209": "Chesapeake Bay retriever",
+  "210": "German short-haired pointer",
+  "211": "vizsla, Hungarian pointer",
+  "212": "English setter",
+  "213": "Irish setter, red setter",
+  "214": "Gordon setter",
+  "215": "Brittany spaniel",
+  "216": "clumber, clumber spaniel",
+  "217": "English springer, English springer spaniel",
+  "218": "Welsh springer spaniel",
+  "219": "cocker spaniel, English cocker spaniel, cocker",
+  "220": "Sussex spaniel",
+  "221": "Irish water spaniel",
+  "222": "kuvasz",
+  "223": "schipperke",
+  "224": "groenendael",
+  "225": "malinois",
+  "226": "briard",
+  "227": "kelpie",
+  "228": "komondor",
+  "229": "Old English sheepdog, bobtail",
+  "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
+  "231": "collie",
+  "232": "Border collie",
+  "233": "Bouvier des Flandres, Bouviers des Flandres",
+  "234": "Rottweiler",
+  "235": "German shepherd, German shepherd dog, German police dog, alsatian",
+  "236": "Doberman, Doberman pinscher",
+  "237": "miniature pinscher",
+  "238": "Greater Swiss Mountain dog",
+  "239": "Bernese mountain dog",
+  "240": "Appenzeller",
+  "241": "EntleBucher",
+  "242": "boxer",
+  "243": "bull mastiff",
+  "244": "Tibetan mastiff",
+  "245": "French bulldog",
+  "246": "Great Dane",
+  "247": "Saint Bernard, St Bernard",
+  "248": "Eskimo dog, husky",
+  "249": "malamute, malemute, Alaskan malamute",
+  "250": "Siberian husky",
+  "251": "dalmatian, coach dog, carriage dog",
+  "252": "affenpinscher, monkey pinscher, monkey dog",
+  "253": "basenji",
+  "254": "pug, pug-dog",
+  "255": "Leonberg",
+  "256": "Newfoundland, Newfoundland dog",
+  "257": "Great Pyrenees",
+  "258": "Samoyed, Samoyede",
+  "259": "Pomeranian",
+  "260": "chow, chow chow",
+  "261": "keeshond",
+  "262": "Brabancon griffon",
+  "263": "Pembroke, Pembroke Welsh corgi",
+  "264": "Cardigan, Cardigan Welsh corgi",
+  "265": "toy poodle",
+  "266": "miniature poodle",
+  "267": "standard poodle",
+  "268": "Mexican hairless",
+  "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
+  "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
+  "271": "red wolf, maned wolf, Canis rufus, Canis niger",
+  "272": "coyote, prairie wolf, brush wolf, Canis latrans",
+  "273": "dingo, warrigal, warragal, Canis dingo",
+  "274": "dhole, Cuon alpinus",
+  "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+  "276": "hyena, hyaena",
+  "277": "red fox, Vulpes vulpes",
+  "278": "kit fox, Vulpes macrotis",
+  "279": "Arctic fox, white fox, Alopex lagopus",
+  "280": "grey fox, gray fox, Urocyon cinereoargenteus",
+  "281": "tabby, tabby cat",
+  "282": "tiger cat",
+  "283": "Persian cat",
+  "284": "Siamese cat, Siamese",
+  "285": "Egyptian cat",
+  "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+  "287": "lynx, catamount",
+  "288": "leopard, Panthera pardus",
+  "289": "snow leopard, ounce, Panthera uncia",
+  "290": "jaguar, panther, Panthera onca, Felis onca",
+  "291": "lion, king of beasts, Panthera leo",
+  "292": "tiger, Panthera tigris",
+  "293": "cheetah, chetah, Acinonyx jubatus",
+  "294": "brown bear, bruin, Ursus arctos",
+  "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
+  "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+  "297": "sloth bear, Melursus ursinus, Ursus ursinus",
+  "298": "mongoose",
+  "299": "meerkat, mierkat",
+  "300": "tiger beetle",
+  "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+  "302": "ground beetle, carabid beetle",
+  "303": "long-horned beetle, longicorn, longicorn beetle",
+  "304": "leaf beetle, chrysomelid",
+  "305": "dung beetle",
+  "306": "rhinoceros beetle",
+  "307": "weevil",
+  "308": "fly",
+  "309": "bee",
+  "310": "ant, emmet, pismire",
+  "311": "grasshopper, hopper",
+  "312": "cricket",
+  "313": "walking stick, walkingstick, stick insect",
+  "314": "cockroach, roach",
+  "315": "mantis, mantid",
+  "316": "cicada, cicala",
+  "317": "leafhopper",
+  "318": "lacewing, lacewing fly",
+  "319": "dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+  "320": "damselfly",
+  "321": "admiral",
+  "322": "ringlet, ringlet butterfly",
+  "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+  "324": "cabbage butterfly",
+  "325": "sulphur butterfly, sulfur butterfly",
+  "326": "lycaenid, lycaenid butterfly",
+  "327": "starfish, sea star",
+  "328": "sea urchin",
+  "329": "sea cucumber, holothurian",
+  "330": "wood rabbit, cottontail, cottontail rabbit",
+  "331": "hare",
+  "332": "Angora, Angora rabbit",
+  "333": "hamster",
+  "334": "porcupine, hedgehog",
+  "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
+  "336": "marmot",
+  "337": "beaver",
+  "338": "guinea pig, Cavia cobaya",
+  "339": "sorrel",
+  "340": "zebra",
+  "341": "hog, pig, grunter, squealer, Sus scrofa",
+  "342": "wild boar, boar, Sus scrofa",
+  "343": "warthog",
+  "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+  "345": "ox",
+  "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+  "347": "bison",
+  "348": "ram, tup",
+  "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
+  "350": "ibex, Capra ibex",
+  "351": "hartebeest",
+  "352": "impala, Aepyceros melampus",
+  "353": "gazelle",
+  "354": "Arabian camel, dromedary, Camelus dromedarius",
+  "355": "llama",
+  "356": "weasel",
+  "357": "mink",
+  "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
+  "359": "black-footed ferret, ferret, Mustela nigripes",
+  "360": "otter",
+  "361": "skunk, polecat, wood pussy",
+  "362": "badger",
+  "363": "armadillo",
+  "364": "three-toed sloth, ai, Bradypus tridactylus",
+  "365": "orangutan, orang, orangutang, Pongo pygmaeus",
+  "366": "gorilla, Gorilla gorilla",
+  "367": "chimpanzee, chimp, Pan troglodytes",
+  "368": "gibbon, Hylobates lar",
+  "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+  "370": "guenon, guenon monkey",
+  "371": "patas, hussar monkey, Erythrocebus patas",
+  "372": "baboon",
+  "373": "macaque",
+  "374": "langur",
+  "375": "colobus, colobus monkey",
+  "376": "proboscis monkey, Nasalis larvatus",
+  "377": "marmoset",
+  "378": "capuchin, ringtail, Cebus capucinus",
+  "379": "howler monkey, howler",
+  "380": "titi, titi monkey",
+  "381": "spider monkey, Ateles geoffroyi",
+  "382": "squirrel monkey, Saimiri sciureus",
+  "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
+  "384": "indri, indris, Indri indri, Indri brevicaudatus",
+  "385": "Indian elephant, Elephas maximus",
+  "386": "African elephant, Loxodonta africana",
+  "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+  "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+  "389": "barracouta, snoek",
+  "390": "eel",
+  "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+  "392": "rock beauty, Holocanthus tricolor",
+  "393": "anemone fish",
+  "394": "sturgeon",
+  "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
+  "396": "lionfish",
+  "397": "puffer, pufferfish, blowfish, globefish",
+  "398": "abacus",
+  "399": "abaya",
+  "400": "academic gown, academic robe, judge robe",
+  "401": "accordion, piano accordion, squeeze box",
+  "402": "acoustic guitar",
+  "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
+  "404": "airliner",
+  "405": "airship, dirigible",
+  "406": "altar",
+  "407": "ambulance",
+  "408": "amphibian, amphibious vehicle",
+  "409": "analog clock",
+  "410": "apiary, bee house",
+  "411": "apron",
+  "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
+  "413": "assault rifle, assault gun",
+  "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
+  "415": "bakery, bakeshop, bakehouse",
+  "416": "balance beam, beam",
+  "417": "balloon",
+  "418": "ballpoint, ballpoint pen, ballpen, Biro",
+  "419": "Band Aid",
+  "420": "banjo",
+  "421": "bannister, banister, balustrade, balusters, handrail",
+  "422": "barbell",
+  "423": "barber chair",
+  "424": "barbershop",
+  "425": "barn",
+  "426": "barometer",
+  "427": "barrel, cask",
+  "428": "barrow, garden cart, lawn cart, wheelbarrow",
+  "429": "baseball",
+  "430": "basketball",
+  "431": "bassinet",
+  "432": "bassoon",
+  "433": "bathing cap, swimming cap",
+  "434": "bath towel",
+  "435": "bathtub, bathing tub, bath, tub",
+  "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
+  "437": "beacon, lighthouse, beacon light, pharos",
+  "438": "beaker",
+  "439": "bearskin, busby, shako",
+  "440": "beer bottle",
+  "441": "beer glass",
+  "442": "bell cote, bell cot",
+  "443": "bib",
+  "444": "bicycle-built-for-two, tandem bicycle, tandem",
+  "445": "bikini, two-piece",
+  "446": "binder, ring-binder",
+  "447": "binoculars, field glasses, opera glasses",
+  "448": "birdhouse",
+  "449": "boathouse",
+  "450": "bobsled, bobsleigh, bob",
+  "451": "bolo tie, bolo, bola tie, bola",
+  "452": "bonnet, poke bonnet",
+  "453": "bookcase",
+  "454": "bookshop, bookstore, bookstall",
+  "455": "bottlecap",
+  "456": "bow",
+  "457": "bow tie, bow-tie, bowtie",
+  "458": "brass, memorial tablet, plaque",
+  "459": "brassiere, bra, bandeau",
+  "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+  "461": "breastplate, aegis, egis",
+  "462": "broom",
+  "463": "bucket, pail",
+  "464": "buckle",
+  "465": "bulletproof vest",
+  "466": "bullet train, bullet",
+  "467": "butcher shop, meat market",
+  "468": "cab, hack, taxi, taxicab",
+  "469": "caldron, cauldron",
+  "470": "candle, taper, wax light",
+  "471": "cannon",
+  "472": "canoe",
+  "473": "can opener, tin opener",
+  "474": "cardigan",
+  "475": "car mirror",
+  "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
+  "477": "carpenters kit, tool kit",
+  "478": "carton",
+  "479": "car wheel",
+  "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
+  "481": "cassette",
+  "482": "cassette player",
+  "483": "castle",
+  "484": "catamaran",
+  "485": "CD player",
+  "486": "cello, violoncello",
+  "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+  "488": "chain",
+  "489": "chainlink fence",
+  "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
+  "491": "chain saw, chainsaw",
+  "492": "chest",
+  "493": "chiffonier, commode",
+  "494": "chime, bell, gong",
+  "495": "china cabinet, china closet",
+  "496": "Christmas stocking",
+  "497": "church, church building",
+  "498": "cinema, movie theater, movie theatre, movie house, picture palace",
+  "499": "cleaver, meat cleaver, chopper",
+  "500": "cliff dwelling",
+  "501": "cloak",
+  "502": "clog, geta, patten, sabot",
+  "503": "cocktail shaker",
+  "504": "coffee mug",
+  "505": "coffeepot",
+  "506": "coil, spiral, volute, whorl, helix",
+  "507": "combination lock",
+  "508": "computer keyboard, keypad",
+  "509": "confectionery, confectionary, candy store",
+  "510": "container ship, containership, container vessel",
+  "511": "convertible",
+  "512": "corkscrew, bottle screw",
+  "513": "cornet, horn, trumpet, trump",
+  "514": "cowboy boot",
+  "515": "cowboy hat, ten-gallon hat",
+  "516": "cradle",
+  "517": "crane",
+  "518": "crash helmet",
+  "519": "crate",
+  "520": "crib, cot",
+  "521": "Crock Pot",
+  "522": "croquet ball",
+  "523": "crutch",
+  "524": "cuirass",
+  "525": "dam, dike, dyke",
+  "526": "desk",
+  "527": "desktop computer",
+  "528": "dial telephone, dial phone",
+  "529": "diaper, nappy, napkin",
+  "530": "digital clock",
+  "531": "digital watch",
+  "532": "dining table, board",
+  "533": "dishrag, dishcloth",
+  "534": "dishwasher, dish washer, dishwashing machine",
+  "535": "disk brake, disc brake",
+  "536": "dock, dockage, docking facility",
+  "537": "dogsled, dog sled, dog sleigh",
+  "538": "dome",
+  "539": "doormat, welcome mat",
+  "540": "drilling platform, offshore rig",
+  "541": "drum, membranophone, tympan",
+  "542": "drumstick",
+  "543": "dumbbell",
+  "544": "Dutch oven",
+  "545": "electric fan, blower",
+  "546": "electric guitar",
+  "547": "electric locomotive",
+  "548": "entertainment center",
+  "549": "envelope",
+  "550": "espresso maker",
+  "551": "face powder",
+  "552": "feather boa, boa",
+  "553": "file, file cabinet, filing cabinet",
+  "554": "fireboat",
+  "555": "fire engine, fire truck",
+  "556": "fire screen, fireguard",
+  "557": "flagpole, flagstaff",
+  "558": "flute, transverse flute",
+  "559": "folding chair",
+  "560": "football helmet",
+  "561": "forklift",
+  "562": "fountain",
+  "563": "fountain pen",
+  "564": "four-poster",
+  "565": "freight car",
+  "566": "French horn, horn",
+  "567": "frying pan, frypan, skillet",
+  "568": "fur coat",
+  "569": "garbage truck, dustcart",
+  "570": "gasmask, respirator, gas helmet",
+  "571": "gas pump, gasoline pump, petrol pump, island dispenser",
+  "572": "goblet",
+  "573": "go-kart",
+  "574": "golf ball",
+  "575": "golfcart, golf cart",
+  "576": "gondola",
+  "577": "gong, tam-tam",
+  "578": "gown",
+  "579": "grand piano, grand",
+  "580": "greenhouse, nursery, glasshouse",
+  "581": "grille, radiator grille",
+  "582": "grocery store, grocery, food market, market",
+  "583": "guillotine",
+  "584": "hair slide",
+  "585": "hair spray",
+  "586": "half track",
+  "587": "hammer",
+  "588": "hamper",
+  "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+  "590": "hand-held computer, hand-held microcomputer",
+  "591": "handkerchief, hankie, hanky, hankey",
+  "592": "hard disc, hard disk, fixed disk",
+  "593": "harmonica, mouth organ, harp, mouth harp",
+  "594": "harp",
+  "595": "harvester, reaper",
+  "596": "hatchet",
+  "597": "holster",
+  "598": "home theater, home theatre",
+  "599": "honeycomb",
+  "600": "hook, claw",
+  "601": "hoopskirt, crinoline",
+  "602": "horizontal bar, high bar",
+  "603": "horse cart, horse-cart",
+  "604": "hourglass",
+  "605": "iPod",
+  "606": "iron, smoothing iron",
+  "607": "jack-o-lantern",
+  "608": "jean, blue jean, denim",
+  "609": "jeep, landrover",
+  "610": "jersey, T-shirt, tee shirt",
+  "611": "jigsaw puzzle",
+  "612": "jinrikisha, ricksha, rickshaw",
+  "613": "joystick",
+  "614": "kimono",
+  "615": "knee pad",
+  "616": "knot",
+  "617": "lab coat, laboratory coat",
+  "618": "ladle",
+  "619": "lampshade, lamp shade",
+  "620": "laptop, laptop computer",
+  "621": "lawn mower, mower",
+  "622": "lens cap, lens cover",
+  "623": "letter opener, paper knife, paperknife",
+  "624": "library",
+  "625": "lifeboat",
+  "626": "lighter, light, igniter, ignitor",
+  "627": "limousine, limo",
+  "628": "liner, ocean liner",
+  "629": "lipstick, lip rouge",
+  "630": "Loafer",
+  "631": "lotion",
+  "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+  "633": "loupe, jewelers loupe",
+  "634": "lumbermill, sawmill",
+  "635": "magnetic compass",
+  "636": "mailbag, postbag",
+  "637": "mailbox, letter box",
+  "638": "maillot",
+  "639": "maillot, tank suit",
+  "640": "manhole cover",
+  "641": "maraca",
+  "642": "marimba, xylophone",
+  "643": "mask",
+  "644": "matchstick",
+  "645": "maypole",
+  "646": "maze, labyrinth",
+  "647": "measuring cup",
+  "648": "medicine chest, medicine cabinet",
+  "649": "megalith, megalithic structure",
+  "650": "microphone, mike",
+  "651": "microwave, microwave oven",
+  "652": "military uniform",
+  "653": "milk can",
+  "654": "minibus",
+  "655": "miniskirt, mini",
+  "656": "minivan",
+  "657": "missile",
+  "658": "mitten",
+  "659": "mixing bowl",
+  "660": "mobile home, manufactured home",
+  "661": "Model T",
+  "662": "modem",
+  "663": "monastery",
+  "664": "monitor",
+  "665": "moped",
+  "666": "mortar",
+  "667": "mortarboard",
+  "668": "mosque",
+  "669": "mosquito net",
+  "670": "motor scooter, scooter",
+  "671": "mountain bike, all-terrain bike, off-roader",
+  "672": "mountain tent",
+  "673": "mouse, computer mouse",
+  "674": "mousetrap",
+  "675": "moving van",
+  "676": "muzzle",
+  "677": "nail",
+  "678": "neck brace",
+  "679": "necklace",
+  "680": "nipple",
+  "681": "notebook, notebook computer",
+  "682": "obelisk",
+  "683": "oboe, hautboy, hautbois",
+  "684": "ocarina, sweet potato",
+  "685": "odometer, hodometer, mileometer, milometer",
+  "686": "oil filter",
+  "687": "organ, pipe organ",
+  "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+  "689": "overskirt",
+  "690": "oxcart",
+  "691": "oxygen mask",
+  "692": "packet",
+  "693": "paddle, boat paddle",
+  "694": "paddlewheel, paddle wheel",
+  "695": "padlock",
+  "696": "paintbrush",
+  "697": "pajama, pyjama, pjs, jammies",
+  "698": "palace",
+  "699": "panpipe, pandean pipe, syrinx",
+  "700": "paper towel",
+  "701": "parachute, chute",
+  "702": "parallel bars, bars",
+  "703": "park bench",
+  "704": "parking meter",
+  "705": "passenger car, coach, carriage",
+  "706": "patio, terrace",
+  "707": "pay-phone, pay-station",
+  "708": "pedestal, plinth, footstall",
+  "709": "pencil box, pencil case",
+  "710": "pencil sharpener",
+  "711": "perfume, essence",
+  "712": "Petri dish",
+  "713": "photocopier",
+  "714": "pick, plectrum, plectron",
+  "715": "pickelhaube",
+  "716": "picket fence, paling",
+  "717": "pickup, pickup truck",
+  "718": "pier",
+  "719": "piggy bank, penny bank",
+  "720": "pill bottle",
+  "721": "pillow",
+  "722": "ping-pong ball",
+  "723": "pinwheel",
+  "724": "pirate, pirate ship",
+  "725": "pitcher, ewer",
+  "726": "plane, carpenters plane, woodworking plane",
+  "727": "planetarium",
+  "728": "plastic bag",
+  "729": "plate rack",
+  "730": "plow, plough",
+  "731": "plunger, plumbers helper",
+  "732": "Polaroid camera, Polaroid Land camera",
+  "733": "pole",
+  "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+  "735": "poncho",
+  "736": "pool table, billiard table, snooker table",
+  "737": "pop bottle, soda bottle",
+  "738": "pot, flowerpot",
+  "739": "potters wheel",
+  "740": "power drill",
+  "741": "prayer rug, prayer mat",
+  "742": "printer",
+  "743": "prison, prison house",
+  "744": "projectile, missile",
+  "745": "projector",
+  "746": "puck, hockey puck",
+  "747": "punching bag, punch bag, punching ball, punchball",
+  "748": "purse",
+  "749": "quill, quill pen",
+  "750": "quilt, comforter, comfort, puff",
+  "751": "racer, race car, racing car",
+  "752": "racket, racquet",
+  "753": "radiator",
+  "754": "radio, wireless",
+  "755": "radio telescope, radio reflector",
+  "756": "rain barrel",
+  "757": "recreational vehicle, RV, R.V.",
+  "758": "reel",
+  "759": "reflex camera",
+  "760": "refrigerator, icebox",
+  "761": "remote control, remote",
+  "762": "restaurant, eating house, eating place, eatery",
+  "763": "revolver, six-gun, six-shooter",
+  "764": "rifle",
+  "765": "rocking chair, rocker",
+  "766": "rotisserie",
+  "767": "rubber eraser, rubber, pencil eraser",
+  "768": "rugby ball",
+  "769": "rule, ruler",
+  "770": "running shoe",
+  "771": "safe",
+  "772": "safety pin",
+  "773": "saltshaker, salt shaker",
+  "774": "sandal",
+  "775": "sarong",
+  "776": "sax, saxophone",
+  "777": "scabbard",
+  "778": "scale, weighing machine",
+  "779": "school bus",
+  "780": "schooner",
+  "781": "scoreboard",
+  "782": "screen, CRT screen",
+  "783": "screw",
+  "784": "screwdriver",
+  "785": "seat belt, seatbelt",
+  "786": "sewing machine",
+  "787": "shield, buckler",
+  "788": "shoe shop, shoe-shop, shoe store",
+  "789": "shoji",
+  "790": "shopping basket",
+  "791": "shopping cart",
+  "792": "shovel",
+  "793": "shower cap",
+  "794": "shower curtain",
+  "795": "ski",
+  "796": "ski mask",
+  "797": "sleeping bag",
+  "798": "slide rule, slipstick",
+  "799": "sliding door",
+  "800": "slot, one-armed bandit",
+  "801": "snorkel",
+  "802": "snowmobile",
+  "803": "snowplow, snowplough",
+  "804": "soap dispenser",
+  "805": "soccer ball",
+  "806": "sock",
+  "807": "solar dish, solar collector, solar furnace",
+  "808": "sombrero",
+  "809": "soup bowl",
+  "810": "space bar",
+  "811": "space heater",
+  "812": "space shuttle",
+  "813": "spatula",
+  "814": "speedboat",
+  "815": "spider web, spiders web",
+  "816": "spindle",
+  "817": "sports car, sport car",
+  "818": "spotlight, spot",
+  "819": "stage",
+  "820": "steam locomotive",
+  "821": "steel arch bridge",
+  "822": "steel drum",
+  "823": "stethoscope",
+  "824": "stole",
+  "825": "stone wall",
+  "826": "stopwatch, stop watch",
+  "827": "stove",
+  "828": "strainer",
+  "829": "streetcar, tram, tramcar, trolley, trolley car",
+  "830": "stretcher",
+  "831": "studio couch, day bed",
+  "832": "stupa, tope",
+  "833": "submarine, pigboat, sub, U-boat",
+  "834": "suit, suit of clothes",
+  "835": "sundial",
+  "836": "sunglass",
+  "837": "sunglasses, dark glasses, shades",
+  "838": "sunscreen, sunblock, sun blocker",
+  "839": "suspension bridge",
+  "840": "swab, swob, mop",
+  "841": "sweatshirt",
+  "842": "swimming trunks, bathing trunks",
+  "843": "swing",
+  "844": "switch, electric switch, electrical switch",
+  "845": "syringe",
+  "846": "table lamp",
+  "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
+  "848": "tape player",
+  "849": "teapot",
+  "850": "teddy, teddy bear",
+  "851": "television, television system",
+  "852": "tennis ball",
+  "853": "thatch, thatched roof",
+  "854": "theater curtain, theatre curtain",
+  "855": "thimble",
+  "856": "thresher, thrasher, threshing machine",
+  "857": "throne",
+  "858": "tile roof",
+  "859": "toaster",
+  "860": "tobacco shop, tobacconist shop, tobacconist",
+  "861": "toilet seat",
+  "862": "torch",
+  "863": "totem pole",
+  "864": "tow truck, tow car, wrecker",
+  "865": "toyshop",
+  "866": "tractor",
+  "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+  "868": "tray",
+  "869": "trench coat",
+  "870": "tricycle, trike, velocipede",
+  "871": "trimaran",
+  "872": "tripod",
+  "873": "triumphal arch",
+  "874": "trolleybus, trolley coach, trackless trolley",
+  "875": "trombone",
+  "876": "tub, vat",
+  "877": "turnstile",
+  "878": "typewriter keyboard",
+  "879": "umbrella",
+  "880": "unicycle, monocycle",
+  "881": "upright, upright piano",
+  "882": "vacuum, vacuum cleaner",
+  "883": "vase",
+  "884": "vault",
+  "885": "velvet",
+  "886": "vending machine",
+  "887": "vestment",
+  "888": "viaduct",
+  "889": "violin, fiddle",
+  "890": "volleyball",
+  "891": "waffle iron",
+  "892": "wall clock",
+  "893": "wallet, billfold, notecase, pocketbook",
+  "894": "wardrobe, closet, press",
+  "895": "warplane, military plane",
+  "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+  "897": "washer, automatic washer, washing machine",
+  "898": "water bottle",
+  "899": "water jug",
+  "900": "water tower",
+  "901": "whiskey jug",
+  "902": "whistle",
+  "903": "wig",
+  "904": "window screen",
+  "905": "window shade",
+  "906": "Windsor tie",
+  "907": "wine bottle",
+  "908": "wing",
+  "909": "wok",
+  "910": "wooden spoon",
+  "911": "wool, woolen, woollen",
+  "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
+  "913": "wreck",
+  "914": "yawl",
+  "915": "yurt",
+  "916": "web site, website, internet site, site",
+  "917": "comic book",
+  "918": "crossword puzzle, crossword",
+  "919": "street sign",
+  "920": "traffic light, traffic signal, stoplight",
+  "921": "book jacket, dust cover, dust jacket, dust wrapper",
+  "922": "menu",
+  "923": "plate",
+  "924": "guacamole",
+  "925": "consomme",
+  "926": "hot pot, hotpot",
+  "927": "trifle",
+  "928": "ice cream, icecream",
+  "929": "ice lolly, lolly, lollipop, popsicle",
+  "930": "French loaf",
+  "931": "bagel, beigel",
+  "932": "pretzel",
+  "933": "cheeseburger",
+  "934": "hotdog, hot dog, red hot",
+  "935": "mashed potato",
+  "936": "head cabbage",
+  "937": "broccoli",
+  "938": "cauliflower",
+  "939": "zucchini, courgette",
+  "940": "spaghetti squash",
+  "941": "acorn squash",
+  "942": "butternut squash",
+  "943": "cucumber, cuke",
+  "944": "artichoke, globe artichoke",
+  "945": "bell pepper",
+  "946": "cardoon",
+  "947": "mushroom",
+  "948": "Granny Smith",
+  "949": "strawberry",
+  "950": "orange",
+  "951": "lemon",
+  "952": "fig",
+  "953": "pineapple, ananas",
+  "954": "banana",
+  "955": "jackfruit, jak, jack",
+  "956": "custard apple",
+  "957": "pomegranate",
+  "958": "hay",
+  "959": "carbonara",
+  "960": "chocolate sauce, chocolate syrup",
+  "961": "dough",
+  "962": "meat loaf, meatloaf",
+  "963": "pizza, pizza pie",
+  "964": "potpie",
+  "965": "burrito",
+  "966": "red wine",
+  "967": "espresso",
+  "968": "cup",
+  "969": "eggnog",
+  "970": "alp",
+  "971": "bubble",
+  "972": "cliff, drop, drop-off",
+  "973": "coral reef",
+  "974": "geyser",
+  "975": "lakeside, lakeshore",
+  "976": "promontory, headland, head, foreland",
+  "977": "sandbar, sand bar",
+  "978": "seashore, coast, seacoast, sea-coast",
+  "979": "valley, vale",
+  "980": "volcano",
+  "981": "ballplayer, baseball player",
+  "982": "groom, bridegroom",
+  "983": "scuba diver",
+  "984": "rapeseed",
+  "985": "daisy",
+  "986": "yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
+  "987": "corn",
+  "988": "acorn",
+  "989": "hip, rose hip, rosehip",
+  "990": "buckeye, horse chestnut, conker",
+  "991": "coral fungus",
+  "992": "agaric",
+  "993": "gyromitra",
+  "994": "stinkhorn, carrion fungus",
+  "995": "earthstar",
+  "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+  "997": "bolete",
+  "998": "ear, spike, capitulum",
+  "999": "toilet tissue, toilet paper, bathroom tissue"
+}

labels/imagenet_labels.py ADDED Viewed

	@@ -0,0 +1,61 @@

+"""ImageNet-1k class labels for ADM class-conditional generation.
+Labels are stored as Hugging Face-style ``id2label`` JSON maps (string keys ``"0"``–``"999"``).
+Each value is a comma-separated list of synonyms for that class id.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Literal
# Language codes for which a label file is expected next to this module.
Language = Literal["en", "cn"]

# Directory containing this module; used when no ``labels_dir`` is given.
_LABELS_DIR = Path(__file__).resolve().parent

# File name of the ``id2label`` JSON map for each supported language code.
_LABEL_FILENAMES: dict[str, str] = {
    "en": "id2label_en.json",
    "cn": "id2label_cn.json",
}


def load_id2label(
    labels_dir: Path | str | None = None,
    lang: Language = "en",
) -> dict[int, str]:
    """Load ``id2label`` from ``id2label_en.json`` or ``id2label_cn.json``.

    Args:
        labels_dir: Directory holding the label JSON files. Defaults to the
            directory that contains this module.
        lang: Which label file to read: ``"en"`` or ``"cn"``.

    Returns:
        A dict mapping each class id (JSON string key converted with
        ``int``) to the value stored for it in the JSON file.

    Raises:
        ValueError: If ``lang`` is not a supported language code.
        FileNotFoundError: If the selected label file does not exist.
    """
    # Reject unknown codes explicitly instead of silently falling back to the
    # Chinese file for anything that is not exactly "en".
    filename = _LABEL_FILENAMES.get(lang)
    if filename is None:
        supported = ", ".join(repr(code) for code in _LABEL_FILENAMES)
        raise ValueError(f"Unsupported lang={lang!r}; expected one of: {supported}")

    root = Path(labels_dir) if labels_dir is not None else _LABELS_DIR
    path = root / filename
    if not path.exists():
        raise FileNotFoundError(f"ImageNet label file not found: {path}")

    raw = json.loads(path.read_text(encoding="utf-8"))
    # JSON object keys are always strings; callers expect integer class ids.
    return {int(key): value for key, value in raw.items()}
def build_label2id(id2label: dict[int, str]) -> dict[str, int]:
    """Invert an ``id2label`` dict into a synonym -> class id map (DiT-style).

    Each value is split on commas; every piece is stripped of surrounding
    whitespace and empty pieces are dropped. When the same synonym appears
    under several class ids, the id encountered last in iteration order
    wins. The returned dict is ordered by synonym.
    """
    synonym_to_id = {
        name: int(class_id)
        for class_id, joined in id2label.items()
        for name in (piece.strip() for piece in joined.split(","))
        if name
    }
    ordered = sorted(synonym_to_id.items())
    return dict(ordered)
def resolve_label_ids(
    labels: str | list[str],
    label2id: dict[str, int],
    *,
    lang: Language = "en",
) -> list[int]:
    """Translate label string(s) into class ids using ``label2id``.

    Args:
        labels: A single label or a list of labels; a bare string is treated
            as a one-element list.
        label2id: Lookup table from label string to class id.
        lang: Language code, used only in the error message.

    Returns:
        The class ids for ``labels``, in the same order.

    Raises:
        ValueError: If any label is not a key of ``label2id``; the message
            lists the unknown labels and up to eight valid examples.
    """
    names = [labels] if isinstance(labels, str) else labels
    unknown = [name for name in names if name not in label2id]
    if not unknown:
        return [label2id[name] for name in names]

    # Show only a handful of valid keys so the message stays readable.
    sample = ", ".join(list(label2id)[:8])
    message = (
        f"Unknown label(s) for lang={lang!r}: {unknown}. "
        f"Example valid labels: {sample}, ..."
    )
    raise ValueError(message)