BiliSakura commited on 6 days ago

Commit

9dc3cb9

verified ·

1 Parent(s): 26e1caf

Upload folder using huggingface_hub

Browse files

Files changed (45) hide show

.gitattributes +3 -0
DeCo-XL-16-256/decoder/__pycache__/decoder_deco.cpython-312.pyc +0 -0
DeCo-XL-16-256/decoder/config.json +9 -0
DeCo-XL-16-256/decoder/decoder_deco.py +163 -0
DeCo-XL-16-256/decoder/diffusion_pytorch_model.safetensors +3 -0
DeCo-XL-16-256/model_index.json +1021 -0
DeCo-XL-16-256/pipeline.py +268 -0
DeCo-XL-16-256/scheduler/scheduler_config.json +8 -0
DeCo-XL-16-256/scheduler/scheduling_deco_flow_match_euler_discrete.py +82 -0
DeCo-XL-16-256/transformer/__pycache__/transformer_deco.cpython-312.pyc +0 -0
DeCo-XL-16-256/transformer/config.json +22 -0
DeCo-XL-16-256/transformer/diffusion_pytorch_model.safetensors +3 -0
DeCo-XL-16-256/transformer/transformer_deco.py +332 -0
DeCo-XL-16-512/decoder/__pycache__/decoder_deco.cpython-312.pyc +0 -0
DeCo-XL-16-512/decoder/config.json +8 -0
DeCo-XL-16-512/decoder/decoder_deco.py +163 -0
DeCo-XL-16-512/decoder/diffusion_pytorch_model.safetensors +3 -0
DeCo-XL-16-512/decoder/diffusion_pytorch_model.safetensors.bak +3 -0
DeCo-XL-16-512/demo.png +3 -0
DeCo-XL-16-512/model_index.json +1021 -0
DeCo-XL-16-512/pipeline.py +268 -0
DeCo-XL-16-512/scheduler/scheduler_config.json +8 -0
DeCo-XL-16-512/scheduler/scheduling_deco_flow_match_euler_discrete.py +82 -0
DeCo-XL-16-512/transformer/__pycache__/transformer_deco.cpython-312.pyc +0 -0
DeCo-XL-16-512/transformer/config.json +21 -0
DeCo-XL-16-512/transformer/diffusion_pytorch_model.safetensors +3 -0
DeCo-XL-16-512/transformer/diffusion_pytorch_model.safetensors.bak +3 -0
DeCo-XL-16-512/transformer/transformer_deco.py +332 -0
DeCo-XXL-16-512-t2i/decoder/__pycache__/decoder_deco.cpython-312.pyc +0 -0
DeCo-XXL-16-512-t2i/decoder/config.json +8 -0
DeCo-XXL-16-512-t2i/decoder/decoder_deco.py +177 -0
DeCo-XXL-16-512-t2i/decoder/diffusion_pytorch_model.safetensors +3 -0
DeCo-XXL-16-512-t2i/model_index.json +27 -0
DeCo-XXL-16-512-t2i/pipeline.py +291 -0
DeCo-XXL-16-512-t2i/scheduler/scheduler_config.json +13 -0
DeCo-XXL-16-512-t2i/scheduler/scheduling_deco_flow_match_adam_discrete.py +200 -0
DeCo-XXL-16-512-t2i/scheduler/scheduling_deco_flow_match_euler_discrete.py +82 -0
DeCo-XXL-16-512-t2i/scripts/run_t2i_demo.py +47 -0
DeCo-XXL-16-512-t2i/scripts/test_t2i_load.py +53 -0
DeCo-XXL-16-512-t2i/transformer/__pycache__/transformer_deco_t2i.cpython-312.pyc +0 -0
DeCo-XXL-16-512-t2i/transformer/config.json +21 -0
DeCo-XXL-16-512-t2i/transformer/diffusion_pytorch_model.safetensors +3 -0
DeCo-XXL-16-512-t2i/transformer/transformer_deco_t2i.py +411 -0
README.md +109 -0
t2i_DeCo.ckpt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+DeCo-XL-16-512/decoder/diffusion_pytorch_model.safetensors.bak filter=lfs diff=lfs merge=lfs -text
+DeCo-XL-16-512/demo.png filter=lfs diff=lfs merge=lfs -text
+DeCo-XL-16-512/transformer/diffusion_pytorch_model.safetensors.bak filter=lfs diff=lfs merge=lfs -text

DeCo-XL-16-256/decoder/__pycache__/decoder_deco.cpython-312.pyc ADDED Viewed

Binary file (10.9 kB). View file

DeCo-XL-16-256/decoder/config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_class_name": "DeCoPatchDecoderModel",
+  "hidden_size_x": 32,
+  "in_channels": 3,
+  "max_freqs": 8,
+  "num_res_blocks": 3,
+  "patch_size": 16,
+  "z_channels": 1152
+}

DeCo-XL-16-256/decoder/decoder_deco.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return x * (1 + scale) + shift
+class NerfEmbedder(nn.Module):
+    def __init__(self, in_channels: int, hidden_size_input: int, max_freqs: int):
+        super().__init__()
+        self.max_freqs = max_freqs
+        self.embedder = nn.Sequential(nn.Linear(in_channels + max_freqs**2, hidden_size_input, bias=True))
+    @lru_cache
+    def fetch_pos(self, patch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
+        pos_x = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
+        pos_y = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
+        pos_y, pos_x = torch.meshgrid(pos_y, pos_x, indexing="ij")
+        freqs = torch.linspace(0, self.max_freqs, self.max_freqs, dtype=dtype, device=device)
+        freqs_x = freqs[None, :, None]
+        freqs_y = freqs[None, None, :]
+        coeffs = (1 + freqs_x * freqs_y) ** -1
+        dct = (
+            torch.cos(pos_x.reshape(-1, 1, 1) * freqs_x * torch.pi)
+            * torch.cos(pos_y.reshape(-1, 1, 1) * freqs_y * torch.pi)
+            * coeffs
+        ).view(1, -1, self.max_freqs**2)
+        return dct
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        batch_size, patch_tokens, _ = inputs.shape
+        patch_size = int(patch_tokens**0.5)
+        dct = self.fetch_pos(patch_size, inputs.device, inputs.dtype).repeat(batch_size, 1, 1)
+        return self.embedder(torch.cat([inputs, dct], dim=-1))
+class ResBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.in_ln = nn.LayerNorm(channels, eps=1e-6)
+        self.mlp = nn.Sequential(
+            nn.Linear(channels, channels, bias=True),
+            nn.SiLU(),
+            nn.Linear(channels, channels, bias=True),
+        )
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(channels, 3 * channels, bias=True))
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
+        return x + gate_mlp * self.mlp(_modulate(self.in_ln(x), shift_mlp, scale_mlp))
+class DecoderFinalLayer(nn.Module):
+    def __init__(self, model_channels: int, out_channels: int):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(model_channels, out_channels, bias=True)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(self.norm_final(x))
+class SimpleMLPAdaLN(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        out_channels: int,
+        z_channels: int,
+        num_res_blocks: int,
+        patch_size: int,
+        grad_checkpointing: bool = False,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.grad_checkpointing = grad_checkpointing
+        self.cond_embed = nn.Linear(z_channels, patch_size**2 * model_channels)
+        self.input_proj = nn.Linear(in_channels, model_channels)
+        self.res_blocks = nn.ModuleList([ResBlock(model_channels) for _ in range(num_res_blocks)])
+        self.final_layer = DecoderFinalLayer(model_channels, out_channels)
+        self._init_weights()
+    def _init_weights(self) -> None:
+        for block in self.res_blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+        x = self.input_proj(x)
+        y = self.cond_embed(c).reshape(c.shape[0], self.patch_size**2, -1)
+        for block in self.res_blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(block, x, y)
+            else:
+                x = block(x, y)
+        return self.final_layer(x)
+@dataclass
+class DeCoPatchDecoderOutput(BaseOutput):
+    """Return container of `DeCoPatchDecoderModel.forward` when `return_dict=True`."""
+    # Decoder result for every patch passed to the forward call.
+    sample: torch.Tensor
+class DeCoPatchDecoderModel(ModelMixin, ConfigMixin):
+    """Per-patch RGB decoder for DeCo (NerfEmbedder + AdaLN MLP)."""
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        hidden_size_x: int = 32,
+        z_channels: int = 1152,
+        num_res_blocks: int = 3,
+        patch_size: int = 16,
+        max_freqs: int = 8,
+    ):
+        super().__init__()
+        self.x_embedder = NerfEmbedder(in_channels, hidden_size_x, max_freqs=max_freqs)
+        self.dec_net = SimpleMLPAdaLN(
+            in_channels=hidden_size_x,
+            model_channels=hidden_size_x,
+            out_channels=in_channels,
+            z_channels=z_channels,
+            num_res_blocks=num_res_blocks,
+            patch_size=patch_size,
+        )
+    def forward(
+        self,
+        patch_pixels: torch.Tensor,
+        conditioning: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[DeCoPatchDecoderOutput, tuple[torch.Tensor]]:
+        """
+        Args:
+            patch_pixels (`torch.Tensor`):
+                Flattened patch pixels of shape `(batch * num_patches, patch_size ** 2, in_channels)`.
+            conditioning (`torch.Tensor`):
+                Per-patch conditioning of shape `(batch * num_patches, z_channels)`.
+        """
+        output = self.dec_net(self.x_embedder(patch_pixels), conditioning)
+        if not return_dict:
+            return (output,)
+        return DeCoPatchDecoderOutput(sample=output)

DeCo-XL-16-256/decoder/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2de852e7fd141788fe391192901098dd2c5f5196e7a6e988391a1b7be002e5f6
+size 37862236

DeCo-XL-16-256/model_index.json ADDED Viewed

	@@ -0,0 +1,1021 @@

+{
+  "_class_name": [
+    "pipeline",
+    "DeCoPipeline"
+  ],
+  "_diffusers_version": "0.31.0",
+  "decoder": [
+    "decoder_deco",
+    "DeCoPatchDecoderModel"
+  ],
+  "id2label": {
+    "0": "tench, Tinca tinca",
+    "1": "goldfish, Carassius auratus",
+    "10": "brambling, Fringilla montifringilla",
+    "100": "black swan, Cygnus atratus",
+    "101": "tusker",
+    "102": "echidna, spiny anteater, anteater",
+    "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
+    "104": "wallaby, brush kangaroo",
+    "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+    "106": "wombat",
+    "107": "jellyfish",
+    "108": "sea anemone, anemone",
+    "109": "brain coral",
+    "11": "goldfinch, Carduelis carduelis",
+    "110": "flatworm, platyhelminth",
+    "111": "nematode, nematode worm, roundworm",
+    "112": "conch",
+    "113": "snail",
+    "114": "slug",
+    "115": "sea slug, nudibranch",
+    "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+    "117": "chambered nautilus, pearly nautilus, nautilus",
+    "118": "Dungeness crab, Cancer magister",
+    "119": "rock crab, Cancer irroratus",
+    "12": "house finch, linnet, Carpodacus mexicanus",
+    "120": "fiddler crab",
+    "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
+    "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+    "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+    "124": "crayfish, crawfish, crawdad, crawdaddy",
+    "125": "hermit crab",
+    "126": "isopod",
+    "127": "white stork, Ciconia ciconia",
+    "128": "black stork, Ciconia nigra",
+    "129": "spoonbill",
+    "13": "junco, snowbird",
+    "130": "flamingo",
+    "131": "little blue heron, Egretta caerulea",
+    "132": "American egret, great white heron, Egretta albus",
+    "133": "bittern",
+    "134": "crane",
+    "135": "limpkin, Aramus pictus",
+    "136": "European gallinule, Porphyrio porphyrio",
+    "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
+    "138": "bustard",
+    "139": "ruddy turnstone, Arenaria interpres",
+    "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+    "140": "red-backed sandpiper, dunlin, Erolia alpina",
+    "141": "redshank, Tringa totanus",
+    "142": "dowitcher",
+    "143": "oystercatcher, oyster catcher",
+    "144": "pelican",
+    "145": "king penguin, Aptenodytes patagonica",
+    "146": "albatross, mollymawk",
+    "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
+    "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+    "149": "dugong, Dugong dugon",
+    "15": "robin, American robin, Turdus migratorius",
+    "150": "sea lion",
+    "151": "Chihuahua",
+    "152": "Japanese spaniel",
+    "153": "Maltese dog, Maltese terrier, Maltese",
+    "154": "Pekinese, Pekingese, Peke",
+    "155": "Shih-Tzu",
+    "156": "Blenheim spaniel",
+    "157": "papillon",
+    "158": "toy terrier",
+    "159": "Rhodesian ridgeback",
+    "16": "bulbul",
+    "160": "Afghan hound, Afghan",
+    "161": "basset, basset hound",
+    "162": "beagle",
+    "163": "bloodhound, sleuthhound",
+    "164": "bluetick",
+    "165": "black-and-tan coonhound",
+    "166": "Walker hound, Walker foxhound",
+    "167": "English foxhound",
+    "168": "redbone",
+    "169": "borzoi, Russian wolfhound",
+    "17": "jay",
+    "170": "Irish wolfhound",
+    "171": "Italian greyhound",
+    "172": "whippet",
+    "173": "Ibizan hound, Ibizan Podenco",
+    "174": "Norwegian elkhound, elkhound",
+    "175": "otterhound, otter hound",
+    "176": "Saluki, gazelle hound",
+    "177": "Scottish deerhound, deerhound",
+    "178": "Weimaraner",
+    "179": "Staffordshire bullterrier, Staffordshire bull terrier",
+    "18": "magpie",
+    "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
+    "181": "Bedlington terrier",
+    "182": "Border terrier",
+    "183": "Kerry blue terrier",
+    "184": "Irish terrier",
+    "185": "Norfolk terrier",
+    "186": "Norwich terrier",
+    "187": "Yorkshire terrier",
+    "188": "wire-haired fox terrier",
+    "189": "Lakeland terrier",
+    "19": "chickadee",
+    "190": "Sealyham terrier, Sealyham",
+    "191": "Airedale, Airedale terrier",
+    "192": "cairn, cairn terrier",
+    "193": "Australian terrier",
+    "194": "Dandie Dinmont, Dandie Dinmont terrier",
+    "195": "Boston bull, Boston terrier",
+    "196": "miniature schnauzer",
+    "197": "giant schnauzer",
+    "198": "standard schnauzer",
+    "199": "Scotch terrier, Scottish terrier, Scottie",
+    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
+    "20": "water ouzel, dipper",
+    "200": "Tibetan terrier, chrysanthemum dog",
+    "201": "silky terrier, Sydney silky",
+    "202": "soft-coated wheaten terrier",
+    "203": "West Highland white terrier",
+    "204": "Lhasa, Lhasa apso",
+    "205": "flat-coated retriever",
+    "206": "curly-coated retriever",
+    "207": "golden retriever",
+    "208": "Labrador retriever",
+    "209": "Chesapeake Bay retriever",
+    "21": "kite",
+    "210": "German short-haired pointer",
+    "211": "vizsla, Hungarian pointer",
+    "212": "English setter",
+    "213": "Irish setter, red setter",
+    "214": "Gordon setter",
+    "215": "Brittany spaniel",
+    "216": "clumber, clumber spaniel",
+    "217": "English springer, English springer spaniel",
+    "218": "Welsh springer spaniel",
+    "219": "cocker spaniel, English cocker spaniel, cocker",
+    "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
+    "220": "Sussex spaniel",
+    "221": "Irish water spaniel",
+    "222": "kuvasz",
+    "223": "schipperke",
+    "224": "groenendael",
+    "225": "malinois",
+    "226": "briard",
+    "227": "kelpie",
+    "228": "komondor",
+    "229": "Old English sheepdog, bobtail",
+    "23": "vulture",
+    "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
+    "231": "collie",
+    "232": "Border collie",
+    "233": "Bouvier des Flandres, Bouviers des Flandres",
+    "234": "Rottweiler",
+    "235": "German shepherd, German shepherd dog, German police dog, alsatian",
+    "236": "Doberman, Doberman pinscher",
+    "237": "miniature pinscher",
+    "238": "Greater Swiss Mountain dog",
+    "239": "Bernese mountain dog",
+    "24": "great grey owl, great gray owl, Strix nebulosa",
+    "240": "Appenzeller",
+    "241": "EntleBucher",
+    "242": "boxer",
+    "243": "bull mastiff",
+    "244": "Tibetan mastiff",
+    "245": "French bulldog",
+    "246": "Great Dane",
+    "247": "Saint Bernard, St Bernard",
+    "248": "Eskimo dog, husky",
+    "249": "malamute, malemute, Alaskan malamute",
+    "25": "European fire salamander, Salamandra salamandra",
+    "250": "Siberian husky",
+    "251": "dalmatian, coach dog, carriage dog",
+    "252": "affenpinscher, monkey pinscher, monkey dog",
+    "253": "basenji",
+    "254": "pug, pug-dog",
+    "255": "Leonberg",
+    "256": "Newfoundland, Newfoundland dog",
+    "257": "Great Pyrenees",
+    "258": "Samoyed, Samoyede",
+    "259": "Pomeranian",
+    "26": "common newt, Triturus vulgaris",
+    "260": "chow, chow chow",
+    "261": "keeshond",
+    "262": "Brabancon griffon",
+    "263": "Pembroke, Pembroke Welsh corgi",
+    "264": "Cardigan, Cardigan Welsh corgi",
+    "265": "toy poodle",
+    "266": "miniature poodle",
+    "267": "standard poodle",
+    "268": "Mexican hairless",
+    "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
+    "27": "eft",
+    "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
+    "271": "red wolf, maned wolf, Canis rufus, Canis niger",
+    "272": "coyote, prairie wolf, brush wolf, Canis latrans",
+    "273": "dingo, warrigal, warragal, Canis dingo",
+    "274": "dhole, Cuon alpinus",
+    "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+    "276": "hyena, hyaena",
+    "277": "red fox, Vulpes vulpes",
+    "278": "kit fox, Vulpes macrotis",
+    "279": "Arctic fox, white fox, Alopex lagopus",
+    "28": "spotted salamander, Ambystoma maculatum",
+    "280": "grey fox, gray fox, Urocyon cinereoargenteus",
+    "281": "tabby, tabby cat",
+    "282": "tiger cat",
+    "283": "Persian cat",
+    "284": "Siamese cat, Siamese",
+    "285": "Egyptian cat",
+    "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+    "287": "lynx, catamount",
+    "288": "leopard, Panthera pardus",
+    "289": "snow leopard, ounce, Panthera uncia",
+    "29": "axolotl, mud puppy, Ambystoma mexicanum",
+    "290": "jaguar, panther, Panthera onca, Felis onca",
+    "291": "lion, king of beasts, Panthera leo",
+    "292": "tiger, Panthera tigris",
+    "293": "cheetah, chetah, Acinonyx jubatus",
+    "294": "brown bear, bruin, Ursus arctos",
+    "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
+    "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+    "297": "sloth bear, Melursus ursinus, Ursus ursinus",
+    "298": "mongoose",
+    "299": "meerkat, mierkat",
+    "3": "tiger shark, Galeocerdo cuvieri",
+    "30": "bullfrog, Rana catesbeiana",
+    "300": "tiger beetle",
+    "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+    "302": "ground beetle, carabid beetle",
+    "303": "long-horned beetle, longicorn, longicorn beetle",
+    "304": "leaf beetle, chrysomelid",
+    "305": "dung beetle",
+    "306": "rhinoceros beetle",
+    "307": "weevil",
+    "308": "fly",
+    "309": "bee",
+    "31": "tree frog, tree-frog",
+    "310": "ant, emmet, pismire",
+    "311": "grasshopper, hopper",
+    "312": "cricket",
+    "313": "walking stick, walkingstick, stick insect",
+    "314": "cockroach, roach",
+    "315": "mantis, mantid",
+    "316": "cicada, cicala",
+    "317": "leafhopper",
+    "318": "lacewing, lacewing fly",
+    "319": "dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+    "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+    "320": "damselfly",
+    "321": "admiral",
+    "322": "ringlet, ringlet butterfly",
+    "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+    "324": "cabbage butterfly",
+    "325": "sulphur butterfly, sulfur butterfly",
+    "326": "lycaenid, lycaenid butterfly",
+    "327": "starfish, sea star",
+    "328": "sea urchin",
+    "329": "sea cucumber, holothurian",
+    "33": "loggerhead, loggerhead turtle, Caretta caretta",
+    "330": "wood rabbit, cottontail, cottontail rabbit",
+    "331": "hare",
+    "332": "Angora, Angora rabbit",
+    "333": "hamster",
+    "334": "porcupine, hedgehog",
+    "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
+    "336": "marmot",
+    "337": "beaver",
+    "338": "guinea pig, Cavia cobaya",
+    "339": "sorrel",
+    "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+    "340": "zebra",
+    "341": "hog, pig, grunter, squealer, Sus scrofa",
+    "342": "wild boar, boar, Sus scrofa",
+    "343": "warthog",
+    "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+    "345": "ox",
+    "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+    "347": "bison",
+    "348": "ram, tup",
+    "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
+    "35": "mud turtle",
+    "350": "ibex, Capra ibex",
+    "351": "hartebeest",
+    "352": "impala, Aepyceros melampus",
+    "353": "gazelle",
+    "354": "Arabian camel, dromedary, Camelus dromedarius",
+    "355": "llama",
+    "356": "weasel",
+    "357": "mink",
+    "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
+    "359": "black-footed ferret, ferret, Mustela nigripes",
+    "36": "terrapin",
+    "360": "otter",
+    "361": "skunk, polecat, wood pussy",
+    "362": "badger",
+    "363": "armadillo",
+    "364": "three-toed sloth, ai, Bradypus tridactylus",
+    "365": "orangutan, orang, orangutang, Pongo pygmaeus",
+    "366": "gorilla, Gorilla gorilla",
+    "367": "chimpanzee, chimp, Pan troglodytes",
+    "368": "gibbon, Hylobates lar",
+    "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+    "37": "box turtle, box tortoise",
+    "370": "guenon, guenon monkey",
+    "371": "patas, hussar monkey, Erythrocebus patas",
+    "372": "baboon",
+    "373": "macaque",
+    "374": "langur",
+    "375": "colobus, colobus monkey",
+    "376": "proboscis monkey, Nasalis larvatus",
+    "377": "marmoset",
+    "378": "capuchin, ringtail, Cebus capucinus",
+    "379": "howler monkey, howler",
+    "38": "banded gecko",
+    "380": "titi, titi monkey",
+    "381": "spider monkey, Ateles geoffroyi",
+    "382": "squirrel monkey, Saimiri sciureus",
+    "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
+    "384": "indri, indris, Indri indri, Indri brevicaudatus",
+    "385": "Indian elephant, Elephas maximus",
+    "386": "African elephant, Loxodonta africana",
+    "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+    "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+    "389": "barracouta, snoek",
+    "39": "common iguana, iguana, Iguana iguana",
+    "390": "eel",
+    "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+    "392": "rock beauty, Holocanthus tricolor",
+    "393": "anemone fish",
+    "394": "sturgeon",
+    "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
+    "396": "lionfish",
+    "397": "puffer, pufferfish, blowfish, globefish",
+    "398": "abacus",
+    "399": "abaya",
+    "4": "hammerhead, hammerhead shark",
+    "40": "American chameleon, anole, Anolis carolinensis",
+    "400": "academic gown, academic robe, judge robe",
+    "401": "accordion, piano accordion, squeeze box",
+    "402": "acoustic guitar",
+    "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
+    "404": "airliner",
+    "405": "airship, dirigible",
+    "406": "altar",
+    "407": "ambulance",
+    "408": "amphibian, amphibious vehicle",
+    "409": "analog clock",
+    "41": "whiptail, whiptail lizard",
+    "410": "apiary, bee house",
+    "411": "apron",
+    "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
+    "413": "assault rifle, assault gun",
+    "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
+    "415": "bakery, bakeshop, bakehouse",
+    "416": "balance beam, beam",
+    "417": "balloon",
+    "418": "ballpoint, ballpoint pen, ballpen, Biro",
+    "419": "Band Aid",
+    "42": "agama",
+    "420": "banjo",
+    "421": "bannister, banister, balustrade, balusters, handrail",
+    "422": "barbell",
+    "423": "barber chair",
+    "424": "barbershop",
+    "425": "barn",
+    "426": "barometer",
+    "427": "barrel, cask",
+    "428": "barrow, garden cart, lawn cart, wheelbarrow",
+    "429": "baseball",
+    "43": "frilled lizard, Chlamydosaurus kingi",
+    "430": "basketball",
+    "431": "bassinet",
+    "432": "bassoon",
+    "433": "bathing cap, swimming cap",
+    "434": "bath towel",
+    "435": "bathtub, bathing tub, bath, tub",
+    "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
+    "437": "beacon, lighthouse, beacon light, pharos",
+    "438": "beaker",
+    "439": "bearskin, busby, shako",
+    "44": "alligator lizard",
+    "440": "beer bottle",
+    "441": "beer glass",
+    "442": "bell cote, bell cot",
+    "443": "bib",
+    "444": "bicycle-built-for-two, tandem bicycle, tandem",
+    "445": "bikini, two-piece",
+    "446": "binder, ring-binder",
+    "447": "binoculars, field glasses, opera glasses",
+    "448": "birdhouse",
+    "449": "boathouse",
+    "45": "Gila monster, Heloderma suspectum",
+    "450": "bobsled, bobsleigh, bob",
+    "451": "bolo tie, bolo, bola tie, bola",
+    "452": "bonnet, poke bonnet",
+    "453": "bookcase",
+    "454": "bookshop, bookstore, bookstall",
+    "455": "bottlecap",
+    "456": "bow",
+    "457": "bow tie, bow-tie, bowtie",
+    "458": "brass, memorial tablet, plaque",
+    "459": "brassiere, bra, bandeau",
+    "46": "green lizard, Lacerta viridis",
+    "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+    "461": "breastplate, aegis, egis",
+    "462": "broom",
+    "463": "bucket, pail",
+    "464": "buckle",
+    "465": "bulletproof vest",
+    "466": "bullet train, bullet",
+    "467": "butcher shop, meat market",
+    "468": "cab, hack, taxi, taxicab",
+    "469": "caldron, cauldron",
+    "47": "African chameleon, Chamaeleo chamaeleon",
+    "470": "candle, taper, wax light",
+    "471": "cannon",
+    "472": "canoe",
+    "473": "can opener, tin opener",
+    "474": "cardigan",
+    "475": "car mirror",
+    "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
+    "477": "carpenters kit, tool kit",
+    "478": "carton",
+    "479": "car wheel",
+    "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
+    "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
+    "481": "cassette",
+    "482": "cassette player",
+    "483": "castle",
+    "484": "catamaran",
+    "485": "CD player",
+    "486": "cello, violoncello",
+    "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+    "488": "chain",
+    "489": "chainlink fence",
+    "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
+    "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
+    "491": "chain saw, chainsaw",
+    "492": "chest",
+    "493": "chiffonier, commode",
+    "494": "chime, bell, gong",
+    "495": "china cabinet, china closet",
+    "496": "Christmas stocking",
+    "497": "church, church building",
+    "498": "cinema, movie theater, movie theatre, movie house, picture palace",
+    "499": "cleaver, meat cleaver, chopper",
+    "5": "electric ray, crampfish, numbfish, torpedo",
+    "50": "American alligator, Alligator mississipiensis",
+    "500": "cliff dwelling",
+    "501": "cloak",
+    "502": "clog, geta, patten, sabot",
+    "503": "cocktail shaker",
+    "504": "coffee mug",
+    "505": "coffeepot",
+    "506": "coil, spiral, volute, whorl, helix",
+    "507": "combination lock",
+    "508": "computer keyboard, keypad",
+    "509": "confectionery, confectionary, candy store",
+    "51": "triceratops",
+    "510": "container ship, containership, container vessel",
+    "511": "convertible",
+    "512": "corkscrew, bottle screw",
+    "513": "cornet, horn, trumpet, trump",
+    "514": "cowboy boot",
+    "515": "cowboy hat, ten-gallon hat",
+    "516": "cradle",
+    "517": "crane",
+    "518": "crash helmet",
+    "519": "crate",
+    "52": "thunder snake, worm snake, Carphophis amoenus",
+    "520": "crib, cot",
+    "521": "Crock Pot",
+    "522": "croquet ball",
+    "523": "crutch",
+    "524": "cuirass",
+    "525": "dam, dike, dyke",
+    "526": "desk",
+    "527": "desktop computer",
+    "528": "dial telephone, dial phone",
+    "529": "diaper, nappy, napkin",
+    "53": "ringneck snake, ring-necked snake, ring snake",
+    "530": "digital clock",
+    "531": "digital watch",
+    "532": "dining table, board",
+    "533": "dishrag, dishcloth",
+    "534": "dishwasher, dish washer, dishwashing machine",
+    "535": "disk brake, disc brake",
+    "536": "dock, dockage, docking facility",
+    "537": "dogsled, dog sled, dog sleigh",
+    "538": "dome",
+    "539": "doormat, welcome mat",
+    "54": "hognose snake, puff adder, sand viper",
+    "540": "drilling platform, offshore rig",
+    "541": "drum, membranophone, tympan",
+    "542": "drumstick",
+    "543": "dumbbell",
+    "544": "Dutch oven",
+    "545": "electric fan, blower",
+    "546": "electric guitar",
+    "547": "electric locomotive",
+    "548": "entertainment center",
+    "549": "envelope",
+    "55": "green snake, grass snake",
+    "550": "espresso maker",
+    "551": "face powder",
+    "552": "feather boa, boa",
+    "553": "file, file cabinet, filing cabinet",
+    "554": "fireboat",
+    "555": "fire engine, fire truck",
+    "556": "fire screen, fireguard",
+    "557": "flagpole, flagstaff",
+    "558": "flute, transverse flute",
+    "559": "folding chair",
+    "56": "king snake, kingsnake",
+    "560": "football helmet",
+    "561": "forklift",
+    "562": "fountain",
+    "563": "fountain pen",
+    "564": "four-poster",
+    "565": "freight car",
+    "566": "French horn, horn",
+    "567": "frying pan, frypan, skillet",
+    "568": "fur coat",
+    "569": "garbage truck, dustcart",
+    "57": "garter snake, grass snake",
+    "570": "gasmask, respirator, gas helmet",
+    "571": "gas pump, gasoline pump, petrol pump, island dispenser",
+    "572": "goblet",
+    "573": "go-kart",
+    "574": "golf ball",
+    "575": "golfcart, golf cart",
+    "576": "gondola",
+    "577": "gong, tam-tam",
+    "578": "gown",
+    "579": "grand piano, grand",
+    "58": "water snake",
+    "580": "greenhouse, nursery, glasshouse",
+    "581": "grille, radiator grille",
+    "582": "grocery store, grocery, food market, market",
+    "583": "guillotine",
+    "584": "hair slide",
+    "585": "hair spray",
+    "586": "half track",
+    "587": "hammer",
+    "588": "hamper",
+    "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+    "59": "vine snake",
+    "590": "hand-held computer, hand-held microcomputer",
+    "591": "handkerchief, hankie, hanky, hankey",
+    "592": "hard disc, hard disk, fixed disk",
+    "593": "harmonica, mouth organ, harp, mouth harp",
+    "594": "harp",
+    "595": "harvester, reaper",
+    "596": "hatchet",
+    "597": "holster",
+    "598": "home theater, home theatre",
+    "599": "honeycomb",
+    "6": "stingray",
+    "60": "night snake, Hypsiglena torquata",
+    "600": "hook, claw",
+    "601": "hoopskirt, crinoline",
+    "602": "horizontal bar, high bar",
+    "603": "horse cart, horse-cart",
+    "604": "hourglass",
+    "605": "iPod",
+    "606": "iron, smoothing iron",
+    "607": "jack-o-lantern",
+    "608": "jean, blue jean, denim",
+    "609": "jeep, landrover",
+    "61": "boa constrictor, Constrictor constrictor",
+    "610": "jersey, T-shirt, tee shirt",
+    "611": "jigsaw puzzle",
+    "612": "jinrikisha, ricksha, rickshaw",
+    "613": "joystick",
+    "614": "kimono",
+    "615": "knee pad",
+    "616": "knot",
+    "617": "lab coat, laboratory coat",
+    "618": "ladle",
+    "619": "lampshade, lamp shade",
+    "62": "rock python, rock snake, Python sebae",
+    "620": "laptop, laptop computer",
+    "621": "lawn mower, mower",
+    "622": "lens cap, lens cover",
+    "623": "letter opener, paper knife, paperknife",
+    "624": "library",
+    "625": "lifeboat",
+    "626": "lighter, light, igniter, ignitor",
+    "627": "limousine, limo",
+    "628": "liner, ocean liner",
+    "629": "lipstick, lip rouge",
+    "63": "Indian cobra, Naja naja",
+    "630": "Loafer",
+    "631": "lotion",
+    "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+    "633": "loupe, jewelers loupe",
+    "634": "lumbermill, sawmill",
+    "635": "magnetic compass",
+    "636": "mailbag, postbag",
+    "637": "mailbox, letter box",
+    "638": "maillot",
+    "639": "maillot, tank suit",
+    "64": "green mamba",
+    "640": "manhole cover",
+    "641": "maraca",
+    "642": "marimba, xylophone",
+    "643": "mask",
+    "644": "matchstick",
+    "645": "maypole",
+    "646": "maze, labyrinth",
+    "647": "measuring cup",
+    "648": "medicine chest, medicine cabinet",
+    "649": "megalith, megalithic structure",
+    "65": "sea snake",
+    "650": "microphone, mike",
+    "651": "microwave, microwave oven",
+    "652": "military uniform",
+    "653": "milk can",
+    "654": "minibus",
+    "655": "miniskirt, mini",
+    "656": "minivan",
+    "657": "missile",
+    "658": "mitten",
+    "659": "mixing bowl",
+    "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+    "660": "mobile home, manufactured home",
+    "661": "Model T",
+    "662": "modem",
+    "663": "monastery",
+    "664": "monitor",
+    "665": "moped",
+    "666": "mortar",
+    "667": "mortarboard",
+    "668": "mosque",
+    "669": "mosquito net",
+    "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+    "670": "motor scooter, scooter",
+    "671": "mountain bike, all-terrain bike, off-roader",
+    "672": "mountain tent",
+    "673": "mouse, computer mouse",
+    "674": "mousetrap",
+    "675": "moving van",
+    "676": "muzzle",
+    "677": "nail",
+    "678": "neck brace",
+    "679": "necklace",
+    "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
+    "680": "nipple",
+    "681": "notebook, notebook computer",
+    "682": "obelisk",
+    "683": "oboe, hautboy, hautbois",
+    "684": "ocarina, sweet potato",
+    "685": "odometer, hodometer, mileometer, milometer",
+    "686": "oil filter",
+    "687": "organ, pipe organ",
+    "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+    "689": "overskirt",
+    "69": "trilobite",
+    "690": "oxcart",
+    "691": "oxygen mask",
+    "692": "packet",
+    "693": "paddle, boat paddle",
+    "694": "paddlewheel, paddle wheel",
+    "695": "padlock",
+    "696": "paintbrush",
+    "697": "pajama, pyjama, pjs, jammies",
+    "698": "palace",
+    "699": "panpipe, pandean pipe, syrinx",
+    "7": "cock",
+    "70": "harvestman, daddy longlegs, Phalangium opilio",
+    "700": "paper towel",
+    "701": "parachute, chute",
+    "702": "parallel bars, bars",
+    "703": "park bench",
+    "704": "parking meter",
+    "705": "passenger car, coach, carriage",
+    "706": "patio, terrace",
+    "707": "pay-phone, pay-station",
+    "708": "pedestal, plinth, footstall",
+    "709": "pencil box, pencil case",
+    "71": "scorpion",
+    "710": "pencil sharpener",
+    "711": "perfume, essence",
+    "712": "Petri dish",
+    "713": "photocopier",
+    "714": "pick, plectrum, plectron",
+    "715": "pickelhaube",
+    "716": "picket fence, paling",
+    "717": "pickup, pickup truck",
+    "718": "pier",
+    "719": "piggy bank, penny bank",
+    "72": "black and gold garden spider, Argiope aurantia",
+    "720": "pill bottle",
+    "721": "pillow",
+    "722": "ping-pong ball",
+    "723": "pinwheel",
+    "724": "pirate, pirate ship",
+    "725": "pitcher, ewer",
+    "726": "plane, carpenters plane, woodworking plane",
+    "727": "planetarium",
+    "728": "plastic bag",
+    "729": "plate rack",
+    "73": "barn spider, Araneus cavaticus",
+    "730": "plow, plough",
+    "731": "plunger, plumbers helper",
+    "732": "Polaroid camera, Polaroid Land camera",
+    "733": "pole",
+    "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+    "735": "poncho",
+    "736": "pool table, billiard table, snooker table",
+    "737": "pop bottle, soda bottle",
+    "738": "pot, flowerpot",
+    "739": "potters wheel",
+    "74": "garden spider, Aranea diademata",
+    "740": "power drill",
+    "741": "prayer rug, prayer mat",
+    "742": "printer",
+    "743": "prison, prison house",
+    "744": "projectile, missile",
+    "745": "projector",
+    "746": "puck, hockey puck",
+    "747": "punching bag, punch bag, punching ball, punchball",
+    "748": "purse",
+    "749": "quill, quill pen",
+    "75": "black widow, Latrodectus mactans",
+    "750": "quilt, comforter, comfort, puff",
+    "751": "racer, race car, racing car",
+    "752": "racket, racquet",
+    "753": "radiator",
+    "754": "radio, wireless",
+    "755": "radio telescope, radio reflector",
+    "756": "rain barrel",
+    "757": "recreational vehicle, RV, R.V.",
+    "758": "reel",
+    "759": "reflex camera",
+    "76": "tarantula",
+    "760": "refrigerator, icebox",
+    "761": "remote control, remote",
+    "762": "restaurant, eating house, eating place, eatery",
+    "763": "revolver, six-gun, six-shooter",
+    "764": "rifle",
+    "765": "rocking chair, rocker",
+    "766": "rotisserie",
+    "767": "rubber eraser, rubber, pencil eraser",
+    "768": "rugby ball",
+    "769": "rule, ruler",
+    "77": "wolf spider, hunting spider",
+    "770": "running shoe",
+    "771": "safe",
+    "772": "safety pin",
+    "773": "saltshaker, salt shaker",
+    "774": "sandal",
+    "775": "sarong",
+    "776": "sax, saxophone",
+    "777": "scabbard",
+    "778": "scale, weighing machine",
+    "779": "school bus",
+    "78": "tick",
+    "780": "schooner",
+    "781": "scoreboard",
+    "782": "screen, CRT screen",
+    "783": "screw",
+    "784": "screwdriver",
+    "785": "seat belt, seatbelt",
+    "786": "sewing machine",
+    "787": "shield, buckler",
+    "788": "shoe shop, shoe-shop, shoe store",
+    "789": "shoji",
+    "79": "centipede",
+    "790": "shopping basket",
+    "791": "shopping cart",
+    "792": "shovel",
+    "793": "shower cap",
+    "794": "shower curtain",
+    "795": "ski",
+    "796": "ski mask",
+    "797": "sleeping bag",
+    "798": "slide rule, slipstick",
+    "799": "sliding door",
+    "8": "hen",
+    "80": "black grouse",
+    "800": "slot, one-armed bandit",
+    "801": "snorkel",
+    "802": "snowmobile",
+    "803": "snowplow, snowplough",
+    "804": "soap dispenser",
+    "805": "soccer ball",
+    "806": "sock",
+    "807": "solar dish, solar collector, solar furnace",
+    "808": "sombrero",
+    "809": "soup bowl",
+    "81": "ptarmigan",
+    "810": "space bar",
+    "811": "space heater",
+    "812": "space shuttle",
+    "813": "spatula",
+    "814": "speedboat",
+    "815": "spider web, spiders web",
+    "816": "spindle",
+    "817": "sports car, sport car",
+    "818": "spotlight, spot",
+    "819": "stage",
+    "82": "ruffed grouse, partridge, Bonasa umbellus",
+    "820": "steam locomotive",
+    "821": "steel arch bridge",
+    "822": "steel drum",
+    "823": "stethoscope",
+    "824": "stole",
+    "825": "stone wall",
+    "826": "stopwatch, stop watch",
+    "827": "stove",
+    "828": "strainer",
+    "829": "streetcar, tram, tramcar, trolley, trolley car",
+    "83": "prairie chicken, prairie grouse, prairie fowl",
+    "830": "stretcher",
+    "831": "studio couch, day bed",
+    "832": "stupa, tope",
+    "833": "submarine, pigboat, sub, U-boat",
+    "834": "suit, suit of clothes",
+    "835": "sundial",
+    "836": "sunglass",
+    "837": "sunglasses, dark glasses, shades",
+    "838": "sunscreen, sunblock, sun blocker",
+    "839": "suspension bridge",
+    "84": "peacock",
+    "840": "swab, swob, mop",
+    "841": "sweatshirt",
+    "842": "swimming trunks, bathing trunks",
+    "843": "swing",
+    "844": "switch, electric switch, electrical switch",
+    "845": "syringe",
+    "846": "table lamp",
+    "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
+    "848": "tape player",
+    "849": "teapot",
+    "85": "quail",
+    "850": "teddy, teddy bear",
+    "851": "television, television system",
+    "852": "tennis ball",
+    "853": "thatch, thatched roof",
+    "854": "theater curtain, theatre curtain",
+    "855": "thimble",
+    "856": "thresher, thrasher, threshing machine",
+    "857": "throne",
+    "858": "tile roof",
+    "859": "toaster",
+    "86": "partridge",
+    "860": "tobacco shop, tobacconist shop, tobacconist",
+    "861": "toilet seat",
+    "862": "torch",
+    "863": "totem pole",
+    "864": "tow truck, tow car, wrecker",
+    "865": "toyshop",
+    "866": "tractor",
+    "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+    "868": "tray",
+    "869": "trench coat",
+    "87": "African grey, African gray, Psittacus erithacus",
+    "870": "tricycle, trike, velocipede",
+    "871": "trimaran",
+    "872": "tripod",
+    "873": "triumphal arch",
+    "874": "trolleybus, trolley coach, trackless trolley",
+    "875": "trombone",
+    "876": "tub, vat",
+    "877": "turnstile",
+    "878": "typewriter keyboard",
+    "879": "umbrella",
+    "88": "macaw",
+    "880": "unicycle, monocycle",
+    "881": "upright, upright piano",
+    "882": "vacuum, vacuum cleaner",
+    "883": "vase",
+    "884": "vault",
+    "885": "velvet",
+    "886": "vending machine",
+    "887": "vestment",
+    "888": "viaduct",
+    "889": "violin, fiddle",
+    "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+    "890": "volleyball",
+    "891": "waffle iron",
+    "892": "wall clock",
+    "893": "wallet, billfold, notecase, pocketbook",
+    "894": "wardrobe, closet, press",
+    "895": "warplane, military plane",
+    "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+    "897": "washer, automatic washer, washing machine",
+    "898": "water bottle",
+    "899": "water jug",
+    "9": "ostrich, Struthio camelus",
+    "90": "lorikeet",
+    "900": "water tower",
+    "901": "whiskey jug",
+    "902": "whistle",
+    "903": "wig",
+    "904": "window screen",
+    "905": "window shade",
+    "906": "Windsor tie",
+    "907": "wine bottle",
+    "908": "wing",
+    "909": "wok",
+    "91": "coucal",
+    "910": "wooden spoon",
+    "911": "wool, woolen, woollen",
+    "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
+    "913": "wreck",
+    "914": "yawl",
+    "915": "yurt",
+    "916": "web site, website, internet site, site",
+    "917": "comic book",
+    "918": "crossword puzzle, crossword",
+    "919": "street sign",
+    "92": "bee eater",
+    "920": "traffic light, traffic signal, stoplight",
+    "921": "book jacket, dust cover, dust jacket, dust wrapper",
+    "922": "menu",
+    "923": "plate",
+    "924": "guacamole",
+    "925": "consomme",
+    "926": "hot pot, hotpot",
+    "927": "trifle",
+    "928": "ice cream, icecream",
+    "929": "ice lolly, lolly, lollipop, popsicle",
+    "93": "hornbill",
+    "930": "French loaf",
+    "931": "bagel, beigel",
+    "932": "pretzel",
+    "933": "cheeseburger",
+    "934": "hotdog, hot dog, red hot",
+    "935": "mashed potato",
+    "936": "head cabbage",
+    "937": "broccoli",
+    "938": "cauliflower",
+    "939": "zucchini, courgette",
+    "94": "hummingbird",
+    "940": "spaghetti squash",
+    "941": "acorn squash",
+    "942": "butternut squash",
+    "943": "cucumber, cuke",
+    "944": "artichoke, globe artichoke",
+    "945": "bell pepper",
+    "946": "cardoon",
+    "947": "mushroom",
+    "948": "Granny Smith",
+    "949": "strawberry",
+    "95": "jacamar",
+    "950": "orange",
+    "951": "lemon",
+    "952": "fig",
+    "953": "pineapple, ananas",
+    "954": "banana",
+    "955": "jackfruit, jak, jack",
+    "956": "custard apple",
+    "957": "pomegranate",
+    "958": "hay",
+    "959": "carbonara",
+    "96": "toucan",
+    "960": "chocolate sauce, chocolate syrup",
+    "961": "dough",
+    "962": "meat loaf, meatloaf",
+    "963": "pizza, pizza pie",
+    "964": "potpie",
+    "965": "burrito",
+    "966": "red wine",
+    "967": "espresso",
+    "968": "cup",
+    "969": "eggnog",
+    "97": "drake",
+    "970": "alp",
+    "971": "bubble",
+    "972": "cliff, drop, drop-off",
+    "973": "coral reef",
+    "974": "geyser",
+    "975": "lakeside, lakeshore",
+    "976": "promontory, headland, head, foreland",
+    "977": "sandbar, sand bar",
+    "978": "seashore, coast, seacoast, sea-coast",
+    "979": "valley, vale",
+    "98": "red-breasted merganser, Mergus serrator",
+    "980": "volcano",
+    "981": "ballplayer, baseball player",
+    "982": "groom, bridegroom",
+    "983": "scuba diver",
+    "984": "rapeseed",
+    "985": "daisy",
+    "986": "yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
+    "987": "corn",
+    "988": "acorn",
+    "989": "hip, rose hip, rosehip",
+    "99": "goose",
+    "990": "buckeye, horse chestnut, conker",
+    "991": "coral fungus",
+    "992": "agaric",
+    "993": "gyromitra",
+    "994": "stinkhorn, carrion fungus",
+    "995": "earthstar",
+    "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+    "997": "bolete",
+    "998": "ear, spike, capitulum",
+    "999": "toilet tissue, toilet paper, bathroom tissue"
+  },
+  "scheduler": [
+    "scheduling_deco_flow_match_euler_discrete",
+    "DeCoFlowMatchEulerDiscreteScheduler"
+  ],
+  "transformer": [
+    "transformer_deco",
+    "DeCoTransformer2DModel"
+  ]
+}

DeCo-XL-16-256/pipeline.py ADDED Viewed

	@@ -0,0 +1,268 @@

+"""Hub custom pipeline: DeCoPipeline (class-conditioned c2i).
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.utils.torch_utils import randn_tensor
+# Usage example for `DeCoPipeline`, kept as a module-level string. It shows loading a local
+# model directory with `custom_pipeline` pointing at this file, looking up labels, and sampling.
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+        >>> model_dir = Path("./DeCo-XL-16-512").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe.to("cuda")
+        >>> print(pipe.id2label[207])
+        >>> print(pipe.get_label_ids("golden retriever"))
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     class_labels="golden retriever",
+        ...     num_inference_steps=100,
+        ...     guidance_scale=5.0,
+        ...     generator=generator,
+        ... ).images[0]
+        ```
+"""
+class DeCoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for class-conditional image generation with DeCo.
+    Parameters:
+        transformer ([`DeCoTransformer2DModel`]):
+            Class-conditional DeCo transformer.
+        scheduler ([`DeCoFlowMatchEulerDiscreteScheduler`]):
+            Flow-matching Euler scheduler for DeCo.
+        decoder ([`DeCoPatchDecoderModel`]):
+            Per-patch RGB decoder (NerfEmbedder + AdaLN MLP).
+        id2label (`dict[int, str]`, *optional*):
+            ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
+    """
+    model_cpu_offload_seq = "transformer->decoder"
+    def __init__(
+        self,
+        transformer,
+        scheduler,
+        decoder,
+        id2label: Optional[Dict[Union[int, str], str]] = None,
+    ):
+        super().__init__()
+        self.register_modules(transformer=transformer, scheduler=scheduler, decoder=decoder)
+        self._id2label = self._normalize_id2label(id2label)
+        self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = True
+    @staticmethod
+    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
+        if not id2label:
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
+        if not model_index_path.exists():
+            return {}
+        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
+        id2label = raw.get("id2label")
+        if not isinstance(id2label, dict):
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    @staticmethod
+    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
+        label2id: Dict[str, int] = {}
+        for class_id, value in id2label.items():
+            for synonym in value.split(","):
+                synonym = synonym.strip()
+                if synonym:
+                    label2id[synonym] = int(class_id)
+        return dict(sorted(label2id.items()))
+    @property
+    def id2label(self) -> Dict[int, str]:
+        r"""ImageNet class id to English label string (comma-separated synonyms)."""
+        self._ensure_labels_loaded()
+        return self._id2label
+    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        r"""
+        Map ImageNet label strings to class ids.
+        Args:
+            label (`str` or `list[str]`):
+                One or more English label strings. Each string must match a synonym in `id2label`.
+        """
+        self._ensure_labels_loaded()
+        label2id = self.labels
+        if not label2id:
+            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
+        if isinstance(label, str):
+            label = [label]
+        missing = [item for item in label if item not in label2id]
+        if missing:
+            preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
+        return [label2id[item] for item in label]
+    def _normalize_class_labels(
+        self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
+    ) -> torch.LongTensor:
+        if torch.is_tensor(class_labels):
+            return class_labels.to(device=self._execution_device, dtype=torch.long).reshape(-1)
+        if isinstance(class_labels, int):
+            class_label_ids = [class_labels]
+        elif isinstance(class_labels, str):
+            class_label_ids = self.get_label_ids(class_labels)
+        elif class_labels and isinstance(class_labels[0], str):
+            class_label_ids = self.get_label_ids(class_labels)
+        else:
+            class_label_ids = list(class_labels)
+        return torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
+    def _default_sample_size(self) -> int:
+        return int(getattr(self.transformer.config, "sample_size", 256))
+    @torch.no_grad()
+    def __call__(
+        self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
+        batch_size: Optional[int] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 1.0,
+        generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Generate class-conditional images with DeCo.
+        Args:
+            class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`):
+                ImageNet class indices or human-readable English label strings.
+            batch_size (`int`, *optional*):
+                Number of images to generate. Defaults to the number of class labels. When a single
+                class label is provided, repeats it to match `batch_size`.
+            height (`int`, *optional*):
+                Output image height in pixels. Defaults to `transformer.config.sample_size`.
+            width (`int`, *optional*):
+                Output image width in pixels. Defaults to `transformer.config.sample_size`.
+            num_inference_steps (`int`, defaults to `50`):
+                Number of denoising steps.
+            guidance_scale (`float`, defaults to `1.0`):
+                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
+            generator (`torch.Generator`, *optional*):
+                RNG for reproducibility.
+            output_type (`str`, defaults to `"pil"`):
+                `"pil"`, `"np"`, or `"latent"`.
+            return_dict (`bool`, defaults to `True`):
+                Return [`ImagePipelineOutput`] if True.
+        """
+        device = self._execution_device
+        dtype = next(self.transformer.parameters()).dtype
+        do_cfg = guidance_scale is not None and float(guidance_scale) > 1.0
+        sample_size = self._default_sample_size()
+        height = int(height if height is not None else sample_size)
+        width = int(width if width is not None else sample_size)
+        class_labels = self._normalize_class_labels(class_labels)
+        if batch_size is None:
+            batch_size = int(class_labels.numel())
+        elif class_labels.numel() == 1 and batch_size > 1:
+            class_labels = class_labels.repeat(batch_size)
+        elif class_labels.numel() != batch_size:
+            raise ValueError("class_labels batch size must match batch_size")
+        if do_cfg:
+            null_label = int(self.transformer.config.num_classes)
+            uncond_labels = torch.full((batch_size,), null_label, device=device, dtype=torch.long)
+        latents = randn_tensor(
+            (batch_size, int(self.transformer.config.in_channels), height, width),
+            generator=generator,
+            device=device,
+            dtype=dtype,
+        )
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps[:-1]
+        for timestep in self.progress_bar(timesteps):
+            latent_model_input = self.scheduler.scale_model_input(latents, timestep)
+            if do_cfg:
+                latent_model_input = torch.cat([latent_model_input, latent_model_input], dim=0)
+                model_output = self.transformer(
+                    latent_model_input,
+                    timestep,
+                    class_labels=torch.cat([uncond_labels, class_labels], dim=0),
+                    decoder=self.decoder,
+                ).sample
+                model_output_uncond, model_output_cond = model_output.chunk(2)
+                model_output = model_output_uncond + float(guidance_scale) * (model_output_cond - model_output_uncond)
+            else:
+                model_output = self.transformer(
+                    latent_model_input, timestep, class_labels=class_labels, decoder=self.decoder
+                ).sample
+            latents = self.scheduler.step(model_output, timestep, latents).prev_sample
+        image = latents
+        if output_type == "latent":
+            if not return_dict:
+                return (image,)
+            return ImagePipelineOutput(images=image)
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+        elif output_type != "np":
+            raise ValueError("output_type must be one of {'pil', 'np', 'latent'}")
+        if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)

DeCo-XL-16-256/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_class_name": "DeCoFlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.31.0",
+  "last_step": null,
+  "num_train_timesteps": 1000,
+  "prediction_type": "v_prediction",
+  "shift": 1.0
+}

DeCo-XL-16-256/scheduler/scheduling_deco_flow_match_euler_discrete.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from __future__ import annotations
+from typing import Optional, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+def _shift_respace_fn(t: torch.Tensor, shift: float = 1.0) -> torch.Tensor:
+    return t / (t + (1 - t) * shift)
+class DeCoFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    config_name = "scheduler_config.json"
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+        last_step: Optional[float] = None,
+        prediction_type: str = "v_prediction",
+    ):
+        self.timesteps = torch.tensor([], dtype=torch.float32)
+        self.num_inference_steps: Optional[int] = None
+        self._step_index: int = 0
+    @property
+    def init_noise_sigma(self) -> float:
+        return 1.0
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None):
+        if num_inference_steps <= 0:
+            raise ValueError("num_inference_steps must be > 0")
+        self.num_inference_steps = int(num_inference_steps)
+        last_step = self.config.last_step
+        if last_step is None:
+            last_step = 1.0 / float(self.num_inference_steps)
+        base_timesteps = torch.linspace(0.0, 1.0 - float(last_step), self.num_inference_steps, dtype=torch.float32)
+        base_timesteps = torch.cat([base_timesteps, torch.tensor([1.0], dtype=torch.float32)], dim=0)
+        timesteps = _shift_respace_fn(base_timesteps, shift=float(self.config.shift))
+        if device is not None:
+            timesteps = timesteps.to(device)
+        self.timesteps = timesteps
+        self._step_index = 0
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return sample
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[torch.Tensor, float],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ):
+        if self.num_inference_steps is None or self.timesteps.numel() == 0:
+            raise ValueError("Call set_timesteps before step")
+        step_index = min(self._step_index, len(self.timesteps) - 2)
+        dt = (self.timesteps[step_index + 1] - self.timesteps[step_index]).to(device=sample.device, dtype=sample.dtype)
+        prev_sample = sample + model_output * dt
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample,)
+        return SchedulerOutput(prev_sample=prev_sample)
+    def add_noise(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        if timesteps.ndim == 0:
+            timesteps = timesteps[None]
+        t = timesteps.to(device=original_samples.device, dtype=original_samples.dtype).view(-1, 1, 1, 1)
+        return t * original_samples + (1.0 - t) * noise

DeCo-XL-16-256/transformer/__pycache__/transformer_deco.cpython-312.pyc ADDED Viewed

Binary file (23.2 kB). View file

DeCo-XL-16-256/transformer/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "_class_name": "DeCoTransformer2DModel",
+  "conditioning_type": "class",
+  "decoder_hidden_size": 64,
+  "deep_supervision": 0,
+  "hidden_size": 1152,
+  "hidden_size_x": 32,
+  "in_channels": 3,
+  "learn_sigma": true,
+  "nerf_mlpratio": 4,
+  "num_blocks": 31,
+  "num_classes": 1000,
+  "num_cond_blocks": 28,
+  "num_decoder_blocks": 4,
+  "num_encoder_blocks": 18,
+  "num_groups": 16,
+  "num_text_blocks": 4,
+  "patch_size": 16,
+  "sample_size": 256,
+  "txt_embed_dim": 1024,
+  "txt_max_length": 100
+}

DeCo-XL-16-256/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75c24fe14dde1f4def9b52ab7211252b7baa344f09d7a3da7b95a5033ccfb824
+size 2691309848

DeCo-XL-16-256/transformer/transformer_deco.py ADDED Viewed

	@@ -0,0 +1,332 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.functional import scaled_dot_product_attention
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from diffusers.models.normalization import RMSNorm
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return x * (1 + scale) + shift
+class PatchEmbed(nn.Module):
+    def __init__(self, in_chans: int, embed_dim: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.proj(x)
+class TimestepEmbedder(nn.Module):
+    """Sinusoidal timestep embedding with checkpoint-compatible `mlp` module names."""
+    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 10) -> torch.Tensor:
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
+        )
+        args = t[..., None].float() * freqs[None, ...]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding.to(t.dtype)
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        return self.mlp(t_freq)
+class DeCoSwiGLU(nn.Module):
+    """SwiGLU MLP with w1/w2/w3 layout matching official DeCo checkpoints."""
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float = 10000.0, scale: float = 16.0) -> torch.Tensor:
+    x_pos = torch.linspace(0, scale, width)
+    y_pos = torch.linspace(0, scale, height)
+    y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij")
+    y_pos = y_pos.reshape(-1)
+    x_pos = x_pos.reshape(-1)
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    x_freqs = torch.outer(x_pos, freqs).float()
+    y_freqs = torch.outer(y_pos, freqs).float()
+    x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)
+    y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)
+    freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1)
+    return freqs_cis.reshape(height * width, -1)
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    freqs_cis = freqs_cis[None, :, None, :]
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+class LabelEmbedder(nn.Module):
+    def __init__(self, num_classes: int, hidden_size: int):
+        super().__init__()
+        self.embedding_table = nn.Embedding(num_classes, hidden_size)
+    def forward(self, labels: torch.Tensor) -> torch.Tensor:
+        return self.embedding_table(labels)
+class RAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        qk_norm: bool = True,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.q_norm = RMSNorm(self.head_dim, eps=1e-6) if qk_norm else nn.Identity()
+        self.k_norm = RMSNorm(self.head_dim, eps=1e-6) if qk_norm else nn.Identity()
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: torch.Tensor, pos: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size, num_tokens, channels = x.shape
+        qkv = self.qkv(x).reshape(batch_size, num_tokens, 3, self.num_heads, self.head_dim).permute(2, 0, 1, 3, 4)
+        query, key, value = qkv[0], qkv[1], qkv[2]
+        query = self.q_norm(query)
+        key = self.k_norm(key)
+        query, key = apply_rotary_emb(query, key, freqs_cis=pos)
+        query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        x = scaled_dot_product_attention(query, key, value, attn_mask=mask, dropout_p=0.0)
+        x = x.transpose(1, 2).reshape(batch_size, num_tokens, channels)
+        return self.proj_drop(self.proj(x))
+class FlattenDiTBlock(nn.Module):
+    def __init__(self, hidden_size: int, groups: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.norm1 = RMSNorm(hidden_size, eps=1e-6)
+        self.attn = RAttention(hidden_size, num_heads=groups, qkv_bias=False)
+        self.norm2 = RMSNorm(hidden_size, eps=1e-6)
+        self.mlp = DeCoSwiGLU(hidden_size, int(hidden_size * mlp_ratio))
+        self.adaLN_modulation = nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+    def forward(self, x: torch.Tensor, c: torch.Tensor, pos: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
+        x = x + gate_msa * self.attn(_modulate(self.norm1(x), shift_msa, scale_msa), pos, mask=mask)
+        return x + gate_mlp * self.mlp(_modulate(self.norm2(x), shift_mlp, scale_mlp))
+@dataclass
+class DeCoTransformer2DModelOutput(BaseOutput):
+    """Output container returned by `DeCoTransformer2DModel.forward` when `return_dict=True`."""
+    # Tensor produced by the backbone, folded back to the spatial size of the input sample.
+    sample: torch.Tensor
+class _DeCoTransformerBackbone(nn.Module):
+    """Class-conditioned DeCo conditioning trunk. Checkpoint weights live under the `backbone.` prefix.
+
+    The trunk patchifies the input, runs `num_cond_blocks` `FlattenDiTBlock`s conditioned on the
+    timestep and class label, and hands one conditioning vector per patch to an externally supplied
+    patch decoder. `learn_sigma` and `deep_supervision` are stored but not read anywhere in this class.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        patch_size: int,
+        num_groups: int,
+        hidden_size: int,
+        num_cond_blocks: int,
+        num_classes: int,
+        learn_sigma: bool,
+        deep_supervision: int,
+    ):
+        super().__init__()
+        self.learn_sigma = learn_sigma
+        self.deep_supervision = deep_supervision
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.hidden_size = hidden_size
+        self.num_groups = num_groups
+        self.num_cond_blocks = num_cond_blocks
+        # Each patch is a flattened vector of in_channels * patch_size**2 values.
+        self.s_embedder = PatchEmbed(in_channels * patch_size**2, hidden_size, bias=True)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        # One extra embedding row beyond `num_classes` (presumably the null/unconditional label — confirm with the pipeline).
+        self.y_embedder = LabelEmbedder(num_classes + 1, hidden_size)
+        self.blocks = nn.ModuleList([FlattenDiTBlock(hidden_size, num_groups) for _ in range(num_cond_blocks)])
+        # Plain dict cache of rotary factors keyed by (grid_height, grid_width); not a buffer, so it is not saved in the state dict.
+        self.precompute_pos: dict[tuple[int, int], torch.Tensor] = {}
+        self._init_weights()
+    def _init_weights(self) -> None:
+        """Initialise the patch, label and timestep embedders (blocks keep their default init)."""
+        weight = self.s_embedder.proj.weight.data
+        # The view shares storage with the weight, so the in-place init updates the parameter itself.
+        nn.init.xavier_uniform_(weight.view([weight.shape[0], -1]))
+        nn.init.constant_(self.s_embedder.proj.bias, 0)
+        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
+        # Indices 0 and 2 are the two Linear layers of the timestep MLP (index 1 is SiLU).
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+    def fetch_pos(self, height: int, width: int, device: torch.device) -> torch.Tensor:
+        """Return rotary factors for a `height` x `width` patch grid on `device`, computing them on first use."""
+        key = (height, width)
+        if key not in self.precompute_pos:
+            # Per-head dimension is hidden_size // num_groups (num_groups is used as the head count).
+            self.precompute_pos[key] = precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width)
+        # The cached tensor is stored as created; `.to(device)` is applied on every call.
+        return self.precompute_pos[key].to(device)
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        y: torch.Tensor,
+        decoder: nn.Module,
+        s: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Run the conditioning trunk and the per-patch decoder.
+
+        Args:
+            x: 4-D input `(batch, in_channels, height, width)`.
+            t: Timesteps; flattened before embedding.
+            y: Integer class labels.
+            decoder: Module called as `decoder(patch_pixels, conditioning)`; its `.sample` attribute is used.
+            s: Optional precomputed per-patch conditioning. When given, the transformer blocks are skipped
+                and `s` is used as-is.
+            mask: Optional attention mask forwarded to every block.
+
+        Returns:
+            Tensor folded back to spatial size `(height, width)`.
+        """
+        batch_size, _, height, width = x.shape
+        pos = self.fetch_pos(height // self.patch_size, width // self.patch_size, x.device)
+        # Non-overlapping patches: (batch, C * p * p, L) -> (batch, L, C * p * p).
+        x = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
+        t = self.t_embedder(t.view(-1)).view(batch_size, -1, self.hidden_size)
+        y = self.y_embedder(y).view(batch_size, 1, self.hidden_size)
+        c = F.silu(t + y)
+        if s is None:
+            s = self.s_embedder(x)
+            for block in self.blocks:
+                s = block(s, c, pos, mask)
+            # The timestep embedding is mixed in again before the result is handed to the decoder.
+            s = F.silu(t + s)
+        batch_size, length, _ = s.shape
+        # Every patch becomes its own batch entry: (batch * L, p * p, in_channels).
+        patch_pixels = x.reshape(batch_size * length, self.in_channels, self.patch_size**2).transpose(1, 2)
+        conditioning = s.view(batch_size * length, self.hidden_size)
+        decoded = decoder(patch_pixels, conditioning).sample
+        # Back to channel-major patch vectors so `F.fold` can reassemble the image.
+        x = decoded.transpose(1, 2).reshape(batch_size, length, -1)
+        return F.fold(
+            x.transpose(1, 2).contiguous(),
+            (height, width),
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+class DeCoTransformer2DModel(ModelMixin, ConfigMixin):
+    """Class-conditioned DeCo transformer (c2i) for Diffusers pipelines.
+
+    Thin wrapper around `_DeCoTransformerBackbone`: it validates inputs, normalises the timestep and
+    wraps the result in `DeCoTransformer2DModelOutput`. The patch decoder is not owned by this model
+    and must be passed to `forward` on every call.
+    """
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        patch_size: int = 2,
+        num_groups: int = 12,
+        hidden_size: int = 1152,
+        hidden_size_x: int = 64,
+        num_blocks: int = 18,
+        num_cond_blocks: int = 4,
+        num_classes: int = 1000,
+        learn_sigma: bool = True,
+        deep_supervision: int = 0,
+        sample_size: int = 256,
+        # Deprecated config keys kept for backward-compatible hub configs.
+        conditioning_type: str = "class",
+        nerf_mlpratio: int = 4,
+        decoder_hidden_size: int = 64,
+        num_encoder_blocks: int = 18,
+        num_decoder_blocks: int = 4,
+        num_text_blocks: int = 4,
+        txt_embed_dim: int = 1024,
+        txt_max_length: int = 100,
+    ):
+        super().__init__()
+        # These arguments are accepted only so existing configs load; they do not affect the model.
+        # (`num_blocks` and `sample_size` are likewise not passed to the backbone.)
+        del hidden_size_x, nerf_mlpratio, decoder_hidden_size, num_encoder_blocks, num_decoder_blocks
+        del num_text_blocks, txt_embed_dim, txt_max_length
+        if conditioning_type != "class":
+            raise ValueError("DeCoTransformer2DModel only supports class conditioning (c2i).")
+        self.backbone = _DeCoTransformerBackbone(
+            in_channels=in_channels,
+            patch_size=patch_size,
+            num_groups=num_groups,
+            hidden_size=hidden_size,
+            num_cond_blocks=num_cond_blocks,
+            num_classes=num_classes,
+            learn_sigma=learn_sigma,
+            deep_supervision=deep_supervision,
+        )
+    @property
+    def in_channels(self) -> int:
+        """Number of input channels, read from the registered config."""
+        return int(self.config.in_channels)
+    def _prepare_timestep(
+        self, timestep: Union[torch.Tensor, float, int], batch_size: int, sample: torch.Tensor
+    ) -> torch.Tensor:
+        """Return `timestep` as a tensor on `sample`'s device and dtype with at least one dimension.
+
+        Python scalars and 0-dim tensors become 1-element tensors; a single value is repeated to
+        `batch_size` entries when the batch is larger than one.
+        """
+        if not isinstance(timestep, torch.Tensor):
+            timestep = torch.tensor([timestep], device=sample.device, dtype=sample.dtype)
+        timestep = timestep.to(device=sample.device, dtype=sample.dtype)
+        if timestep.ndim == 0:
+            timestep = timestep[None]
+        if timestep.shape[0] == 1 and batch_size > 1:
+            timestep = timestep.repeat(batch_size)
+        return timestep
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        class_labels: Optional[torch.Tensor] = None,
+        decoder: Optional[nn.Module] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DeCoTransformer2DModelOutput, tuple[torch.Tensor]]:
+        """Run the backbone on `sample` for the given timestep and class labels.
+
+        Args:
+            sample: Input tensor; its first dimension is the batch size.
+            timestep: Scalar or tensor timestep, normalised by `_prepare_timestep`.
+            class_labels: Required class ids; moved to `sample`'s device and cast to `torch.long`.
+            decoder: Required patch decoder module forwarded to the backbone.
+            encoder_hidden_states: Must be `None`; present only to reject text-conditioning calls explicitly.
+            return_dict: When false, a 1-tuple is returned instead of the output dataclass.
+
+        Raises:
+            ValueError: If `encoder_hidden_states` is given, or `class_labels` or `decoder` is missing.
+        """
+        if encoder_hidden_states is not None:
+            raise ValueError("encoder_hidden_states is not supported; use class_labels for c2i DeCo models.")
+        if class_labels is None:
+            raise ValueError("class_labels must be provided for class-conditioned DeCo models.")
+        if decoder is None:
+            raise ValueError("decoder must be provided; load DeCoPatchDecoderModel as a separate pipeline component.")
+        batch_size = sample.shape[0]
+        t = self._prepare_timestep(timestep=timestep, batch_size=batch_size, sample=sample)
+        output = self.backbone(
+            sample,
+            t,
+            class_labels.to(device=sample.device, dtype=torch.long),
+            decoder=decoder,
+        )
+        if not return_dict:
+            return (output,)
+        return DeCoTransformer2DModelOutput(sample=output)

DeCo-XL-16-512/decoder/__pycache__/decoder_deco.cpython-312.pyc ADDED Viewed

Binary file (10.9 kB). View file

DeCo-XL-16-512/decoder/config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "in_channels": 3,
+  "hidden_size_x": 32,
+  "z_channels": 1152,
+  "max_freqs": 8,
+  "num_res_blocks": 3,
+  "patch_size": 16
+}

DeCo-XL-16-512/decoder/decoder_deco.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return x * (1 + scale) + shift
+class NerfEmbedder(nn.Module):
+    def __init__(self, in_channels: int, hidden_size_input: int, max_freqs: int):
+        super().__init__()
+        self.max_freqs = max_freqs
+        self.embedder = nn.Sequential(nn.Linear(in_channels + max_freqs**2, hidden_size_input, bias=True))
+    @lru_cache
+    def fetch_pos(self, patch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
+        pos_x = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
+        pos_y = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
+        pos_y, pos_x = torch.meshgrid(pos_y, pos_x, indexing="ij")
+        freqs = torch.linspace(0, self.max_freqs, self.max_freqs, dtype=dtype, device=device)
+        freqs_x = freqs[None, :, None]
+        freqs_y = freqs[None, None, :]
+        coeffs = (1 + freqs_x * freqs_y) ** -1
+        dct = (
+            torch.cos(pos_x.reshape(-1, 1, 1) * freqs_x * torch.pi)
+            * torch.cos(pos_y.reshape(-1, 1, 1) * freqs_y * torch.pi)
+            * coeffs
+        ).view(1, -1, self.max_freqs**2)
+        return dct
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        batch_size, patch_tokens, _ = inputs.shape
+        patch_size = int(patch_tokens**0.5)
+        dct = self.fetch_pos(patch_size, inputs.device, inputs.dtype).repeat(batch_size, 1, 1)
+        return self.embedder(torch.cat([inputs, dct], dim=-1))
+class ResBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.in_ln = nn.LayerNorm(channels, eps=1e-6)
+        self.mlp = nn.Sequential(
+            nn.Linear(channels, channels, bias=True),
+            nn.SiLU(),
+            nn.Linear(channels, channels, bias=True),
+        )
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(channels, 3 * channels, bias=True))
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
+        return x + gate_mlp * self.mlp(_modulate(self.in_ln(x), shift_mlp, scale_mlp))
+class DecoderFinalLayer(nn.Module):
+    def __init__(self, model_channels: int, out_channels: int):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(model_channels, out_channels, bias=True)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(self.norm_final(x))
+class SimpleMLPAdaLN(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        out_channels: int,
+        z_channels: int,
+        num_res_blocks: int,
+        patch_size: int,
+        grad_checkpointing: bool = False,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.grad_checkpointing = grad_checkpointing
+        self.cond_embed = nn.Linear(z_channels, patch_size**2 * model_channels)
+        self.input_proj = nn.Linear(in_channels, model_channels)
+        self.res_blocks = nn.ModuleList([ResBlock(model_channels) for _ in range(num_res_blocks)])
+        self.final_layer = DecoderFinalLayer(model_channels, out_channels)
+        self._init_weights()
+    def _init_weights(self) -> None:
+        for block in self.res_blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+        x = self.input_proj(x)
+        y = self.cond_embed(c).reshape(c.shape[0], self.patch_size**2, -1)
+        for block in self.res_blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(block, x, y)
+            else:
+                x = block(x, y)
+        return self.final_layer(x)
+@dataclass
+class DeCoPatchDecoderOutput(BaseOutput):
+    """Output container returned by `DeCoPatchDecoderModel.forward` when `return_dict=True`."""
+    # Raw output of the decoder network for every patch passed in.
+    sample: torch.Tensor
+class DeCoPatchDecoderModel(ModelMixin, ConfigMixin):
+    """Per-patch RGB decoder for DeCo (NerfEmbedder + AdaLN MLP).
+
+    Patch pixels are embedded by `x_embedder` and then decoded by `dec_net` under a per-patch
+    conditioning vector. The decoder emits `in_channels` values per pixel.
+    """
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        hidden_size_x: int = 32,
+        z_channels: int = 1152,
+        num_res_blocks: int = 3,
+        patch_size: int = 16,
+        max_freqs: int = 8,
+    ):
+        super().__init__()
+        self.x_embedder = NerfEmbedder(in_channels, hidden_size_x, max_freqs=max_freqs)
+        # `hidden_size_x` is both the embedder output width and the MLP working width.
+        self.dec_net = SimpleMLPAdaLN(
+            in_channels=hidden_size_x,
+            model_channels=hidden_size_x,
+            out_channels=in_channels,
+            z_channels=z_channels,
+            num_res_blocks=num_res_blocks,
+            patch_size=patch_size,
+        )
+    def forward(
+        self,
+        patch_pixels: torch.Tensor,
+        conditioning: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[DeCoPatchDecoderOutput, tuple[torch.Tensor]]:
+        """
+        Decode every patch independently.
+
+        Args:
+            patch_pixels (`torch.Tensor`):
+                Flattened patch pixels of shape `(batch * num_patches, patch_size ** 2, in_channels)`.
+            conditioning (`torch.Tensor`):
+                Per-patch conditioning of shape `(batch * num_patches, z_channels)`.
+            return_dict (`bool`):
+                When false, return a 1-tuple holding the output tensor instead of `DeCoPatchDecoderOutput`.
+
+        Returns:
+            `DeCoPatchDecoderOutput` whose `sample` is the decoder network output, or a 1-tuple with that tensor.
+        """
+        output = self.dec_net(self.x_embedder(patch_pixels), conditioning)
+        if not return_dict:
+            return (output,)
+        return DeCoPatchDecoderOutput(sample=output)

DeCo-XL-16-512/decoder/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ca6476afbc38d431cc503a810567d5d30075c57e9209567b1b12279d749b5a8
+size 37862236

DeCo-XL-16-512/decoder/diffusion_pytorch_model.safetensors.bak ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ca6476afbc38d431cc503a810567d5d30075c57e9209567b1b12279d749b5a8
+size 37862236

DeCo-XL-16-512/demo.png ADDED Viewed

Git LFS Details

SHA256: af1ae9ea3f293d2f531f437c93c28df86e648f28fbdba5ec3ce65724cf480822
Pointer size: 131 Bytes
Size of remote file: 504 kB

DeCo-XL-16-512/model_index.json ADDED Viewed

	@@ -0,0 +1,1021 @@

+{
+  "_class_name": [
+    "pipeline",
+    "DeCoPipeline"
+  ],
+  "_diffusers_version": "0.31.0",
+  "transformer": [
+    "transformer_deco",
+    "DeCoTransformer2DModel"
+  ],
+  "scheduler": [
+    "scheduling_deco_flow_match_euler_discrete",
+    "DeCoFlowMatchEulerDiscreteScheduler"
+  ],
+  "decoder": [
+    "decoder_deco",
+    "DeCoPatchDecoderModel"
+  ],
+  "id2label": {
+    "0": "tench, Tinca tinca",
+    "1": "goldfish, Carassius auratus",
+    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
+    "3": "tiger shark, Galeocerdo cuvieri",
+    "4": "hammerhead, hammerhead shark",
+    "5": "electric ray, crampfish, numbfish, torpedo",
+    "6": "stingray",
+    "7": "cock",
+    "8": "hen",
+    "9": "ostrich, Struthio camelus",
+    "10": "brambling, Fringilla montifringilla",
+    "11": "goldfinch, Carduelis carduelis",
+    "12": "house finch, linnet, Carpodacus mexicanus",
+    "13": "junco, snowbird",
+    "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
+    "15": "robin, American robin, Turdus migratorius",
+    "16": "bulbul",
+    "17": "jay",
+    "18": "magpie",
+    "19": "chickadee",
+    "20": "water ouzel, dipper",
+    "21": "kite",
+    "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
+    "23": "vulture",
+    "24": "great grey owl, great gray owl, Strix nebulosa",
+    "25": "European fire salamander, Salamandra salamandra",
+    "26": "common newt, Triturus vulgaris",
+    "27": "eft",
+    "28": "spotted salamander, Ambystoma maculatum",
+    "29": "axolotl, mud puppy, Ambystoma mexicanum",
+    "30": "bullfrog, Rana catesbeiana",
+    "31": "tree frog, tree-frog",
+    "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
+    "33": "loggerhead, loggerhead turtle, Caretta caretta",
+    "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
+    "35": "mud turtle",
+    "36": "terrapin",
+    "37": "box turtle, box tortoise",
+    "38": "banded gecko",
+    "39": "common iguana, iguana, Iguana iguana",
+    "40": "American chameleon, anole, Anolis carolinensis",
+    "41": "whiptail, whiptail lizard",
+    "42": "agama",
+    "43": "frilled lizard, Chlamydosaurus kingi",
+    "44": "alligator lizard",
+    "45": "Gila monster, Heloderma suspectum",
+    "46": "green lizard, Lacerta viridis",
+    "47": "African chameleon, Chamaeleo chamaeleon",
+    "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
+    "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
+    "50": "American alligator, Alligator mississipiensis",
+    "51": "triceratops",
+    "52": "thunder snake, worm snake, Carphophis amoenus",
+    "53": "ringneck snake, ring-necked snake, ring snake",
+    "54": "hognose snake, puff adder, sand viper",
+    "55": "green snake, grass snake",
+    "56": "king snake, kingsnake",
+    "57": "garter snake, grass snake",
+    "58": "water snake",
+    "59": "vine snake",
+    "60": "night snake, Hypsiglena torquata",
+    "61": "boa constrictor, Constrictor constrictor",
+    "62": "rock python, rock snake, Python sebae",
+    "63": "Indian cobra, Naja naja",
+    "64": "green mamba",
+    "65": "sea snake",
+    "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
+    "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
+    "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
+    "69": "trilobite",
+    "70": "harvestman, daddy longlegs, Phalangium opilio",
+    "71": "scorpion",
+    "72": "black and gold garden spider, Argiope aurantia",
+    "73": "barn spider, Araneus cavaticus",
+    "74": "garden spider, Aranea diademata",
+    "75": "black widow, Latrodectus mactans",
+    "76": "tarantula",
+    "77": "wolf spider, hunting spider",
+    "78": "tick",
+    "79": "centipede",
+    "80": "black grouse",
+    "81": "ptarmigan",
+    "82": "ruffed grouse, partridge, Bonasa umbellus",
+    "83": "prairie chicken, prairie grouse, prairie fowl",
+    "84": "peacock",
+    "85": "quail",
+    "86": "partridge",
+    "87": "African grey, African gray, Psittacus erithacus",
+    "88": "macaw",
+    "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
+    "90": "lorikeet",
+    "91": "coucal",
+    "92": "bee eater",
+    "93": "hornbill",
+    "94": "hummingbird",
+    "95": "jacamar",
+    "96": "toucan",
+    "97": "drake",
+    "98": "red-breasted merganser, Mergus serrator",
+    "99": "goose",
+    "100": "black swan, Cygnus atratus",
+    "101": "tusker",
+    "102": "echidna, spiny anteater, anteater",
+    "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
+    "104": "wallaby, brush kangaroo",
+    "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
+    "106": "wombat",
+    "107": "jellyfish",
+    "108": "sea anemone, anemone",
+    "109": "brain coral",
+    "110": "flatworm, platyhelminth",
+    "111": "nematode, nematode worm, roundworm",
+    "112": "conch",
+    "113": "snail",
+    "114": "slug",
+    "115": "sea slug, nudibranch",
+    "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
+    "117": "chambered nautilus, pearly nautilus, nautilus",
+    "118": "Dungeness crab, Cancer magister",
+    "119": "rock crab, Cancer irroratus",
+    "120": "fiddler crab",
+    "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
+    "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
+    "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
+    "124": "crayfish, crawfish, crawdad, crawdaddy",
+    "125": "hermit crab",
+    "126": "isopod",
+    "127": "white stork, Ciconia ciconia",
+    "128": "black stork, Ciconia nigra",
+    "129": "spoonbill",
+    "130": "flamingo",
+    "131": "little blue heron, Egretta caerulea",
+    "132": "American egret, great white heron, Egretta albus",
+    "133": "bittern",
+    "134": "crane",
+    "135": "limpkin, Aramus pictus",
+    "136": "European gallinule, Porphyrio porphyrio",
+    "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
+    "138": "bustard",
+    "139": "ruddy turnstone, Arenaria interpres",
+    "140": "red-backed sandpiper, dunlin, Erolia alpina",
+    "141": "redshank, Tringa totanus",
+    "142": "dowitcher",
+    "143": "oystercatcher, oyster catcher",
+    "144": "pelican",
+    "145": "king penguin, Aptenodytes patagonica",
+    "146": "albatross, mollymawk",
+    "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
+    "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
+    "149": "dugong, Dugong dugon",
+    "150": "sea lion",
+    "151": "Chihuahua",
+    "152": "Japanese spaniel",
+    "153": "Maltese dog, Maltese terrier, Maltese",
+    "154": "Pekinese, Pekingese, Peke",
+    "155": "Shih-Tzu",
+    "156": "Blenheim spaniel",
+    "157": "papillon",
+    "158": "toy terrier",
+    "159": "Rhodesian ridgeback",
+    "160": "Afghan hound, Afghan",
+    "161": "basset, basset hound",
+    "162": "beagle",
+    "163": "bloodhound, sleuthhound",
+    "164": "bluetick",
+    "165": "black-and-tan coonhound",
+    "166": "Walker hound, Walker foxhound",
+    "167": "English foxhound",
+    "168": "redbone",
+    "169": "borzoi, Russian wolfhound",
+    "170": "Irish wolfhound",
+    "171": "Italian greyhound",
+    "172": "whippet",
+    "173": "Ibizan hound, Ibizan Podenco",
+    "174": "Norwegian elkhound, elkhound",
+    "175": "otterhound, otter hound",
+    "176": "Saluki, gazelle hound",
+    "177": "Scottish deerhound, deerhound",
+    "178": "Weimaraner",
+    "179": "Staffordshire bullterrier, Staffordshire bull terrier",
+    "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
+    "181": "Bedlington terrier",
+    "182": "Border terrier",
+    "183": "Kerry blue terrier",
+    "184": "Irish terrier",
+    "185": "Norfolk terrier",
+    "186": "Norwich terrier",
+    "187": "Yorkshire terrier",
+    "188": "wire-haired fox terrier",
+    "189": "Lakeland terrier",
+    "190": "Sealyham terrier, Sealyham",
+    "191": "Airedale, Airedale terrier",
+    "192": "cairn, cairn terrier",
+    "193": "Australian terrier",
+    "194": "Dandie Dinmont, Dandie Dinmont terrier",
+    "195": "Boston bull, Boston terrier",
+    "196": "miniature schnauzer",
+    "197": "giant schnauzer",
+    "198": "standard schnauzer",
+    "199": "Scotch terrier, Scottish terrier, Scottie",
+    "200": "Tibetan terrier, chrysanthemum dog",
+    "201": "silky terrier, Sydney silky",
+    "202": "soft-coated wheaten terrier",
+    "203": "West Highland white terrier",
+    "204": "Lhasa, Lhasa apso",
+    "205": "flat-coated retriever",
+    "206": "curly-coated retriever",
+    "207": "golden retriever",
+    "208": "Labrador retriever",
+    "209": "Chesapeake Bay retriever",
+    "210": "German short-haired pointer",
+    "211": "vizsla, Hungarian pointer",
+    "212": "English setter",
+    "213": "Irish setter, red setter",
+    "214": "Gordon setter",
+    "215": "Brittany spaniel",
+    "216": "clumber, clumber spaniel",
+    "217": "English springer, English springer spaniel",
+    "218": "Welsh springer spaniel",
+    "219": "cocker spaniel, English cocker spaniel, cocker",
+    "220": "Sussex spaniel",
+    "221": "Irish water spaniel",
+    "222": "kuvasz",
+    "223": "schipperke",
+    "224": "groenendael",
+    "225": "malinois",
+    "226": "briard",
+    "227": "kelpie",
+    "228": "komondor",
+    "229": "Old English sheepdog, bobtail",
+    "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
+    "231": "collie",
+    "232": "Border collie",
+    "233": "Bouvier des Flandres, Bouviers des Flandres",
+    "234": "Rottweiler",
+    "235": "German shepherd, German shepherd dog, German police dog, alsatian",
+    "236": "Doberman, Doberman pinscher",
+    "237": "miniature pinscher",
+    "238": "Greater Swiss Mountain dog",
+    "239": "Bernese mountain dog",
+    "240": "Appenzeller",
+    "241": "EntleBucher",
+    "242": "boxer",
+    "243": "bull mastiff",
+    "244": "Tibetan mastiff",
+    "245": "French bulldog",
+    "246": "Great Dane",
+    "247": "Saint Bernard, St Bernard",
+    "248": "Eskimo dog, husky",
+    "249": "malamute, malemute, Alaskan malamute",
+    "250": "Siberian husky",
+    "251": "dalmatian, coach dog, carriage dog",
+    "252": "affenpinscher, monkey pinscher, monkey dog",
+    "253": "basenji",
+    "254": "pug, pug-dog",
+    "255": "Leonberg",
+    "256": "Newfoundland, Newfoundland dog",
+    "257": "Great Pyrenees",
+    "258": "Samoyed, Samoyede",
+    "259": "Pomeranian",
+    "260": "chow, chow chow",
+    "261": "keeshond",
+    "262": "Brabancon griffon",
+    "263": "Pembroke, Pembroke Welsh corgi",
+    "264": "Cardigan, Cardigan Welsh corgi",
+    "265": "toy poodle",
+    "266": "miniature poodle",
+    "267": "standard poodle",
+    "268": "Mexican hairless",
+    "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
+    "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
+    "271": "red wolf, maned wolf, Canis rufus, Canis niger",
+    "272": "coyote, prairie wolf, brush wolf, Canis latrans",
+    "273": "dingo, warrigal, warragal, Canis dingo",
+    "274": "dhole, Cuon alpinus",
+    "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
+    "276": "hyena, hyaena",
+    "277": "red fox, Vulpes vulpes",
+    "278": "kit fox, Vulpes macrotis",
+    "279": "Arctic fox, white fox, Alopex lagopus",
+    "280": "grey fox, gray fox, Urocyon cinereoargenteus",
+    "281": "tabby, tabby cat",
+    "282": "tiger cat",
+    "283": "Persian cat",
+    "284": "Siamese cat, Siamese",
+    "285": "Egyptian cat",
+    "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
+    "287": "lynx, catamount",
+    "288": "leopard, Panthera pardus",
+    "289": "snow leopard, ounce, Panthera uncia",
+    "290": "jaguar, panther, Panthera onca, Felis onca",
+    "291": "lion, king of beasts, Panthera leo",
+    "292": "tiger, Panthera tigris",
+    "293": "cheetah, chetah, Acinonyx jubatus",
+    "294": "brown bear, bruin, Ursus arctos",
+    "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
+    "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
+    "297": "sloth bear, Melursus ursinus, Ursus ursinus",
+    "298": "mongoose",
+    "299": "meerkat, mierkat",
+    "300": "tiger beetle",
+    "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
+    "302": "ground beetle, carabid beetle",
+    "303": "long-horned beetle, longicorn, longicorn beetle",
+    "304": "leaf beetle, chrysomelid",
+    "305": "dung beetle",
+    "306": "rhinoceros beetle",
+    "307": "weevil",
+    "308": "fly",
+    "309": "bee",
+    "310": "ant, emmet, pismire",
+    "311": "grasshopper, hopper",
+    "312": "cricket",
+    "313": "walking stick, walkingstick, stick insect",
+    "314": "cockroach, roach",
+    "315": "mantis, mantid",
+    "316": "cicada, cicala",
+    "317": "leafhopper",
+    "318": "lacewing, lacewing fly",
+    "319": "dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
+    "320": "damselfly",
+    "321": "admiral",
+    "322": "ringlet, ringlet butterfly",
+    "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
+    "324": "cabbage butterfly",
+    "325": "sulphur butterfly, sulfur butterfly",
+    "326": "lycaenid, lycaenid butterfly",
+    "327": "starfish, sea star",
+    "328": "sea urchin",
+    "329": "sea cucumber, holothurian",
+    "330": "wood rabbit, cottontail, cottontail rabbit",
+    "331": "hare",
+    "332": "Angora, Angora rabbit",
+    "333": "hamster",
+    "334": "porcupine, hedgehog",
+    "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
+    "336": "marmot",
+    "337": "beaver",
+    "338": "guinea pig, Cavia cobaya",
+    "339": "sorrel",
+    "340": "zebra",
+    "341": "hog, pig, grunter, squealer, Sus scrofa",
+    "342": "wild boar, boar, Sus scrofa",
+    "343": "warthog",
+    "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
+    "345": "ox",
+    "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
+    "347": "bison",
+    "348": "ram, tup",
+    "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
+    "350": "ibex, Capra ibex",
+    "351": "hartebeest",
+    "352": "impala, Aepyceros melampus",
+    "353": "gazelle",
+    "354": "Arabian camel, dromedary, Camelus dromedarius",
+    "355": "llama",
+    "356": "weasel",
+    "357": "mink",
+    "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
+    "359": "black-footed ferret, ferret, Mustela nigripes",
+    "360": "otter",
+    "361": "skunk, polecat, wood pussy",
+    "362": "badger",
+    "363": "armadillo",
+    "364": "three-toed sloth, ai, Bradypus tridactylus",
+    "365": "orangutan, orang, orangutang, Pongo pygmaeus",
+    "366": "gorilla, Gorilla gorilla",
+    "367": "chimpanzee, chimp, Pan troglodytes",
+    "368": "gibbon, Hylobates lar",
+    "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
+    "370": "guenon, guenon monkey",
+    "371": "patas, hussar monkey, Erythrocebus patas",
+    "372": "baboon",
+    "373": "macaque",
+    "374": "langur",
+    "375": "colobus, colobus monkey",
+    "376": "proboscis monkey, Nasalis larvatus",
+    "377": "marmoset",
+    "378": "capuchin, ringtail, Cebus capucinus",
+    "379": "howler monkey, howler",
+    "380": "titi, titi monkey",
+    "381": "spider monkey, Ateles geoffroyi",
+    "382": "squirrel monkey, Saimiri sciureus",
+    "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
+    "384": "indri, indris, Indri indri, Indri brevicaudatus",
+    "385": "Indian elephant, Elephas maximus",
+    "386": "African elephant, Loxodonta africana",
+    "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
+    "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
+    "389": "barracouta, snoek",
+    "390": "eel",
+    "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
+    "392": "rock beauty, Holocanthus tricolor",
+    "393": "anemone fish",
+    "394": "sturgeon",
+    "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
+    "396": "lionfish",
+    "397": "puffer, pufferfish, blowfish, globefish",
+    "398": "abacus",
+    "399": "abaya",
+    "400": "academic gown, academic robe, judge robe",
+    "401": "accordion, piano accordion, squeeze box",
+    "402": "acoustic guitar",
+    "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
+    "404": "airliner",
+    "405": "airship, dirigible",
+    "406": "altar",
+    "407": "ambulance",
+    "408": "amphibian, amphibious vehicle",
+    "409": "analog clock",
+    "410": "apiary, bee house",
+    "411": "apron",
+    "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
+    "413": "assault rifle, assault gun",
+    "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
+    "415": "bakery, bakeshop, bakehouse",
+    "416": "balance beam, beam",
+    "417": "balloon",
+    "418": "ballpoint, ballpoint pen, ballpen, Biro",
+    "419": "Band Aid",
+    "420": "banjo",
+    "421": "bannister, banister, balustrade, balusters, handrail",
+    "422": "barbell",
+    "423": "barber chair",
+    "424": "barbershop",
+    "425": "barn",
+    "426": "barometer",
+    "427": "barrel, cask",
+    "428": "barrow, garden cart, lawn cart, wheelbarrow",
+    "429": "baseball",
+    "430": "basketball",
+    "431": "bassinet",
+    "432": "bassoon",
+    "433": "bathing cap, swimming cap",
+    "434": "bath towel",
+    "435": "bathtub, bathing tub, bath, tub",
+    "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
+    "437": "beacon, lighthouse, beacon light, pharos",
+    "438": "beaker",
+    "439": "bearskin, busby, shako",
+    "440": "beer bottle",
+    "441": "beer glass",
+    "442": "bell cote, bell cot",
+    "443": "bib",
+    "444": "bicycle-built-for-two, tandem bicycle, tandem",
+    "445": "bikini, two-piece",
+    "446": "binder, ring-binder",
+    "447": "binoculars, field glasses, opera glasses",
+    "448": "birdhouse",
+    "449": "boathouse",
+    "450": "bobsled, bobsleigh, bob",
+    "451": "bolo tie, bolo, bola tie, bola",
+    "452": "bonnet, poke bonnet",
+    "453": "bookcase",
+    "454": "bookshop, bookstore, bookstall",
+    "455": "bottlecap",
+    "456": "bow",
+    "457": "bow tie, bow-tie, bowtie",
+    "458": "brass, memorial tablet, plaque",
+    "459": "brassiere, bra, bandeau",
+    "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
+    "461": "breastplate, aegis, egis",
+    "462": "broom",
+    "463": "bucket, pail",
+    "464": "buckle",
+    "465": "bulletproof vest",
+    "466": "bullet train, bullet",
+    "467": "butcher shop, meat market",
+    "468": "cab, hack, taxi, taxicab",
+    "469": "caldron, cauldron",
+    "470": "candle, taper, wax light",
+    "471": "cannon",
+    "472": "canoe",
+    "473": "can opener, tin opener",
+    "474": "cardigan",
+    "475": "car mirror",
+    "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
+    "477": "carpenters kit, tool kit",
+    "478": "carton",
+    "479": "car wheel",
+    "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
+    "481": "cassette",
+    "482": "cassette player",
+    "483": "castle",
+    "484": "catamaran",
+    "485": "CD player",
+    "486": "cello, violoncello",
+    "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
+    "488": "chain",
+    "489": "chainlink fence",
+    "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
+    "491": "chain saw, chainsaw",
+    "492": "chest",
+    "493": "chiffonier, commode",
+    "494": "chime, bell, gong",
+    "495": "china cabinet, china closet",
+    "496": "Christmas stocking",
+    "497": "church, church building",
+    "498": "cinema, movie theater, movie theatre, movie house, picture palace",
+    "499": "cleaver, meat cleaver, chopper",
+    "500": "cliff dwelling",
+    "501": "cloak",
+    "502": "clog, geta, patten, sabot",
+    "503": "cocktail shaker",
+    "504": "coffee mug",
+    "505": "coffeepot",
+    "506": "coil, spiral, volute, whorl, helix",
+    "507": "combination lock",
+    "508": "computer keyboard, keypad",
+    "509": "confectionery, confectionary, candy store",
+    "510": "container ship, containership, container vessel",
+    "511": "convertible",
+    "512": "corkscrew, bottle screw",
+    "513": "cornet, horn, trumpet, trump",
+    "514": "cowboy boot",
+    "515": "cowboy hat, ten-gallon hat",
+    "516": "cradle",
+    "517": "crane",
+    "518": "crash helmet",
+    "519": "crate",
+    "520": "crib, cot",
+    "521": "Crock Pot",
+    "522": "croquet ball",
+    "523": "crutch",
+    "524": "cuirass",
+    "525": "dam, dike, dyke",
+    "526": "desk",
+    "527": "desktop computer",
+    "528": "dial telephone, dial phone",
+    "529": "diaper, nappy, napkin",
+    "530": "digital clock",
+    "531": "digital watch",
+    "532": "dining table, board",
+    "533": "dishrag, dishcloth",
+    "534": "dishwasher, dish washer, dishwashing machine",
+    "535": "disk brake, disc brake",
+    "536": "dock, dockage, docking facility",
+    "537": "dogsled, dog sled, dog sleigh",
+    "538": "dome",
+    "539": "doormat, welcome mat",
+    "540": "drilling platform, offshore rig",
+    "541": "drum, membranophone, tympan",
+    "542": "drumstick",
+    "543": "dumbbell",
+    "544": "Dutch oven",
+    "545": "electric fan, blower",
+    "546": "electric guitar",
+    "547": "electric locomotive",
+    "548": "entertainment center",
+    "549": "envelope",
+    "550": "espresso maker",
+    "551": "face powder",
+    "552": "feather boa, boa",
+    "553": "file, file cabinet, filing cabinet",
+    "554": "fireboat",
+    "555": "fire engine, fire truck",
+    "556": "fire screen, fireguard",
+    "557": "flagpole, flagstaff",
+    "558": "flute, transverse flute",
+    "559": "folding chair",
+    "560": "football helmet",
+    "561": "forklift",
+    "562": "fountain",
+    "563": "fountain pen",
+    "564": "four-poster",
+    "565": "freight car",
+    "566": "French horn, horn",
+    "567": "frying pan, frypan, skillet",
+    "568": "fur coat",
+    "569": "garbage truck, dustcart",
+    "570": "gasmask, respirator, gas helmet",
+    "571": "gas pump, gasoline pump, petrol pump, island dispenser",
+    "572": "goblet",
+    "573": "go-kart",
+    "574": "golf ball",
+    "575": "golfcart, golf cart",
+    "576": "gondola",
+    "577": "gong, tam-tam",
+    "578": "gown",
+    "579": "grand piano, grand",
+    "580": "greenhouse, nursery, glasshouse",
+    "581": "grille, radiator grille",
+    "582": "grocery store, grocery, food market, market",
+    "583": "guillotine",
+    "584": "hair slide",
+    "585": "hair spray",
+    "586": "half track",
+    "587": "hammer",
+    "588": "hamper",
+    "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
+    "590": "hand-held computer, hand-held microcomputer",
+    "591": "handkerchief, hankie, hanky, hankey",
+    "592": "hard disc, hard disk, fixed disk",
+    "593": "harmonica, mouth organ, harp, mouth harp",
+    "594": "harp",
+    "595": "harvester, reaper",
+    "596": "hatchet",
+    "597": "holster",
+    "598": "home theater, home theatre",
+    "599": "honeycomb",
+    "600": "hook, claw",
+    "601": "hoopskirt, crinoline",
+    "602": "horizontal bar, high bar",
+    "603": "horse cart, horse-cart",
+    "604": "hourglass",
+    "605": "iPod",
+    "606": "iron, smoothing iron",
+    "607": "jack-o-lantern",
+    "608": "jean, blue jean, denim",
+    "609": "jeep, landrover",
+    "610": "jersey, T-shirt, tee shirt",
+    "611": "jigsaw puzzle",
+    "612": "jinrikisha, ricksha, rickshaw",
+    "613": "joystick",
+    "614": "kimono",
+    "615": "knee pad",
+    "616": "knot",
+    "617": "lab coat, laboratory coat",
+    "618": "ladle",
+    "619": "lampshade, lamp shade",
+    "620": "laptop, laptop computer",
+    "621": "lawn mower, mower",
+    "622": "lens cap, lens cover",
+    "623": "letter opener, paper knife, paperknife",
+    "624": "library",
+    "625": "lifeboat",
+    "626": "lighter, light, igniter, ignitor",
+    "627": "limousine, limo",
+    "628": "liner, ocean liner",
+    "629": "lipstick, lip rouge",
+    "630": "Loafer",
+    "631": "lotion",
+    "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
+    "633": "loupe, jewelers loupe",
+    "634": "lumbermill, sawmill",
+    "635": "magnetic compass",
+    "636": "mailbag, postbag",
+    "637": "mailbox, letter box",
+    "638": "maillot",
+    "639": "maillot, tank suit",
+    "640": "manhole cover",
+    "641": "maraca",
+    "642": "marimba, xylophone",
+    "643": "mask",
+    "644": "matchstick",
+    "645": "maypole",
+    "646": "maze, labyrinth",
+    "647": "measuring cup",
+    "648": "medicine chest, medicine cabinet",
+    "649": "megalith, megalithic structure",
+    "650": "microphone, mike",
+    "651": "microwave, microwave oven",
+    "652": "military uniform",
+    "653": "milk can",
+    "654": "minibus",
+    "655": "miniskirt, mini",
+    "656": "minivan",
+    "657": "missile",
+    "658": "mitten",
+    "659": "mixing bowl",
+    "660": "mobile home, manufactured home",
+    "661": "Model T",
+    "662": "modem",
+    "663": "monastery",
+    "664": "monitor",
+    "665": "moped",
+    "666": "mortar",
+    "667": "mortarboard",
+    "668": "mosque",
+    "669": "mosquito net",
+    "670": "motor scooter, scooter",
+    "671": "mountain bike, all-terrain bike, off-roader",
+    "672": "mountain tent",
+    "673": "mouse, computer mouse",
+    "674": "mousetrap",
+    "675": "moving van",
+    "676": "muzzle",
+    "677": "nail",
+    "678": "neck brace",
+    "679": "necklace",
+    "680": "nipple",
+    "681": "notebook, notebook computer",
+    "682": "obelisk",
+    "683": "oboe, hautboy, hautbois",
+    "684": "ocarina, sweet potato",
+    "685": "odometer, hodometer, mileometer, milometer",
+    "686": "oil filter",
+    "687": "organ, pipe organ",
+    "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
+    "689": "overskirt",
+    "690": "oxcart",
+    "691": "oxygen mask",
+    "692": "packet",
+    "693": "paddle, boat paddle",
+    "694": "paddlewheel, paddle wheel",
+    "695": "padlock",
+    "696": "paintbrush",
+    "697": "pajama, pyjama, pjs, jammies",
+    "698": "palace",
+    "699": "panpipe, pandean pipe, syrinx",
+    "700": "paper towel",
+    "701": "parachute, chute",
+    "702": "parallel bars, bars",
+    "703": "park bench",
+    "704": "parking meter",
+    "705": "passenger car, coach, carriage",
+    "706": "patio, terrace",
+    "707": "pay-phone, pay-station",
+    "708": "pedestal, plinth, footstall",
+    "709": "pencil box, pencil case",
+    "710": "pencil sharpener",
+    "711": "perfume, essence",
+    "712": "Petri dish",
+    "713": "photocopier",
+    "714": "pick, plectrum, plectron",
+    "715": "pickelhaube",
+    "716": "picket fence, paling",
+    "717": "pickup, pickup truck",
+    "718": "pier",
+    "719": "piggy bank, penny bank",
+    "720": "pill bottle",
+    "721": "pillow",
+    "722": "ping-pong ball",
+    "723": "pinwheel",
+    "724": "pirate, pirate ship",
+    "725": "pitcher, ewer",
+    "726": "plane, carpenters plane, woodworking plane",
+    "727": "planetarium",
+    "728": "plastic bag",
+    "729": "plate rack",
+    "730": "plow, plough",
+    "731": "plunger, plumbers helper",
+    "732": "Polaroid camera, Polaroid Land camera",
+    "733": "pole",
+    "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
+    "735": "poncho",
+    "736": "pool table, billiard table, snooker table",
+    "737": "pop bottle, soda bottle",
+    "738": "pot, flowerpot",
+    "739": "potters wheel",
+    "740": "power drill",
+    "741": "prayer rug, prayer mat",
+    "742": "printer",
+    "743": "prison, prison house",
+    "744": "projectile, missile",
+    "745": "projector",
+    "746": "puck, hockey puck",
+    "747": "punching bag, punch bag, punching ball, punchball",
+    "748": "purse",
+    "749": "quill, quill pen",
+    "750": "quilt, comforter, comfort, puff",
+    "751": "racer, race car, racing car",
+    "752": "racket, racquet",
+    "753": "radiator",
+    "754": "radio, wireless",
+    "755": "radio telescope, radio reflector",
+    "756": "rain barrel",
+    "757": "recreational vehicle, RV, R.V.",
+    "758": "reel",
+    "759": "reflex camera",
+    "760": "refrigerator, icebox",
+    "761": "remote control, remote",
+    "762": "restaurant, eating house, eating place, eatery",
+    "763": "revolver, six-gun, six-shooter",
+    "764": "rifle",
+    "765": "rocking chair, rocker",
+    "766": "rotisserie",
+    "767": "rubber eraser, rubber, pencil eraser",
+    "768": "rugby ball",
+    "769": "rule, ruler",
+    "770": "running shoe",
+    "771": "safe",
+    "772": "safety pin",
+    "773": "saltshaker, salt shaker",
+    "774": "sandal",
+    "775": "sarong",
+    "776": "sax, saxophone",
+    "777": "scabbard",
+    "778": "scale, weighing machine",
+    "779": "school bus",
+    "780": "schooner",
+    "781": "scoreboard",
+    "782": "screen, CRT screen",
+    "783": "screw",
+    "784": "screwdriver",
+    "785": "seat belt, seatbelt",
+    "786": "sewing machine",
+    "787": "shield, buckler",
+    "788": "shoe shop, shoe-shop, shoe store",
+    "789": "shoji",
+    "790": "shopping basket",
+    "791": "shopping cart",
+    "792": "shovel",
+    "793": "shower cap",
+    "794": "shower curtain",
+    "795": "ski",
+    "796": "ski mask",
+    "797": "sleeping bag",
+    "798": "slide rule, slipstick",
+    "799": "sliding door",
+    "800": "slot, one-armed bandit",
+    "801": "snorkel",
+    "802": "snowmobile",
+    "803": "snowplow, snowplough",
+    "804": "soap dispenser",
+    "805": "soccer ball",
+    "806": "sock",
+    "807": "solar dish, solar collector, solar furnace",
+    "808": "sombrero",
+    "809": "soup bowl",
+    "810": "space bar",
+    "811": "space heater",
+    "812": "space shuttle",
+    "813": "spatula",
+    "814": "speedboat",
+    "815": "spider web, spiders web",
+    "816": "spindle",
+    "817": "sports car, sport car",
+    "818": "spotlight, spot",
+    "819": "stage",
+    "820": "steam locomotive",
+    "821": "steel arch bridge",
+    "822": "steel drum",
+    "823": "stethoscope",
+    "824": "stole",
+    "825": "stone wall",
+    "826": "stopwatch, stop watch",
+    "827": "stove",
+    "828": "strainer",
+    "829": "streetcar, tram, tramcar, trolley, trolley car",
+    "830": "stretcher",
+    "831": "studio couch, day bed",
+    "832": "stupa, tope",
+    "833": "submarine, pigboat, sub, U-boat",
+    "834": "suit, suit of clothes",
+    "835": "sundial",
+    "836": "sunglass",
+    "837": "sunglasses, dark glasses, shades",
+    "838": "sunscreen, sunblock, sun blocker",
+    "839": "suspension bridge",
+    "840": "swab, swob, mop",
+    "841": "sweatshirt",
+    "842": "swimming trunks, bathing trunks",
+    "843": "swing",
+    "844": "switch, electric switch, electrical switch",
+    "845": "syringe",
+    "846": "table lamp",
+    "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
+    "848": "tape player",
+    "849": "teapot",
+    "850": "teddy, teddy bear",
+    "851": "television, television system",
+    "852": "tennis ball",
+    "853": "thatch, thatched roof",
+    "854": "theater curtain, theatre curtain",
+    "855": "thimble",
+    "856": "thresher, thrasher, threshing machine",
+    "857": "throne",
+    "858": "tile roof",
+    "859": "toaster",
+    "860": "tobacco shop, tobacconist shop, tobacconist",
+    "861": "toilet seat",
+    "862": "torch",
+    "863": "totem pole",
+    "864": "tow truck, tow car, wrecker",
+    "865": "toyshop",
+    "866": "tractor",
+    "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
+    "868": "tray",
+    "869": "trench coat",
+    "870": "tricycle, trike, velocipede",
+    "871": "trimaran",
+    "872": "tripod",
+    "873": "triumphal arch",
+    "874": "trolleybus, trolley coach, trackless trolley",
+    "875": "trombone",
+    "876": "tub, vat",
+    "877": "turnstile",
+    "878": "typewriter keyboard",
+    "879": "umbrella",
+    "880": "unicycle, monocycle",
+    "881": "upright, upright piano",
+    "882": "vacuum, vacuum cleaner",
+    "883": "vase",
+    "884": "vault",
+    "885": "velvet",
+    "886": "vending machine",
+    "887": "vestment",
+    "888": "viaduct",
+    "889": "violin, fiddle",
+    "890": "volleyball",
+    "891": "waffle iron",
+    "892": "wall clock",
+    "893": "wallet, billfold, notecase, pocketbook",
+    "894": "wardrobe, closet, press",
+    "895": "warplane, military plane",
+    "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
+    "897": "washer, automatic washer, washing machine",
+    "898": "water bottle",
+    "899": "water jug",
+    "900": "water tower",
+    "901": "whiskey jug",
+    "902": "whistle",
+    "903": "wig",
+    "904": "window screen",
+    "905": "window shade",
+    "906": "Windsor tie",
+    "907": "wine bottle",
+    "908": "wing",
+    "909": "wok",
+    "910": "wooden spoon",
+    "911": "wool, woolen, woollen",
+    "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
+    "913": "wreck",
+    "914": "yawl",
+    "915": "yurt",
+    "916": "web site, website, internet site, site",
+    "917": "comic book",
+    "918": "crossword puzzle, crossword",
+    "919": "street sign",
+    "920": "traffic light, traffic signal, stoplight",
+    "921": "book jacket, dust cover, dust jacket, dust wrapper",
+    "922": "menu",
+    "923": "plate",
+    "924": "guacamole",
+    "925": "consomme",
+    "926": "hot pot, hotpot",
+    "927": "trifle",
+    "928": "ice cream, icecream",
+    "929": "ice lolly, lolly, lollipop, popsicle",
+    "930": "French loaf",
+    "931": "bagel, beigel",
+    "932": "pretzel",
+    "933": "cheeseburger",
+    "934": "hotdog, hot dog, red hot",
+    "935": "mashed potato",
+    "936": "head cabbage",
+    "937": "broccoli",
+    "938": "cauliflower",
+    "939": "zucchini, courgette",
+    "940": "spaghetti squash",
+    "941": "acorn squash",
+    "942": "butternut squash",
+    "943": "cucumber, cuke",
+    "944": "artichoke, globe artichoke",
+    "945": "bell pepper",
+    "946": "cardoon",
+    "947": "mushroom",
+    "948": "Granny Smith",
+    "949": "strawberry",
+    "950": "orange",
+    "951": "lemon",
+    "952": "fig",
+    "953": "pineapple, ananas",
+    "954": "banana",
+    "955": "jackfruit, jak, jack",
+    "956": "custard apple",
+    "957": "pomegranate",
+    "958": "hay",
+    "959": "carbonara",
+    "960": "chocolate sauce, chocolate syrup",
+    "961": "dough",
+    "962": "meat loaf, meatloaf",
+    "963": "pizza, pizza pie",
+    "964": "potpie",
+    "965": "burrito",
+    "966": "red wine",
+    "967": "espresso",
+    "968": "cup",
+    "969": "eggnog",
+    "970": "alp",
+    "971": "bubble",
+    "972": "cliff, drop, drop-off",
+    "973": "coral reef",
+    "974": "geyser",
+    "975": "lakeside, lakeshore",
+    "976": "promontory, headland, head, foreland",
+    "977": "sandbar, sand bar",
+    "978": "seashore, coast, seacoast, sea-coast",
+    "979": "valley, vale",
+    "980": "volcano",
+    "981": "ballplayer, baseball player",
+    "982": "groom, bridegroom",
+    "983": "scuba diver",
+    "984": "rapeseed",
+    "985": "daisy",
+    "986": "yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
+    "987": "corn",
+    "988": "acorn",
+    "989": "hip, rose hip, rosehip",
+    "990": "buckeye, horse chestnut, conker",
+    "991": "coral fungus",
+    "992": "agaric",
+    "993": "gyromitra",
+    "994": "stinkhorn, carrion fungus",
+    "995": "earthstar",
+    "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
+    "997": "bolete",
+    "998": "ear, spike, capitulum",
+    "999": "toilet tissue, toilet paper, bathroom tissue"
+  }
+}

DeCo-XL-16-512/pipeline.py ADDED Viewed

	@@ -0,0 +1,268 @@

+"""Hub custom pipeline: DeCoPipeline (class-conditioned c2i).
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.utils.torch_utils import randn_tensor
+# Usage snippet: load this pipeline from a local checkout of the model folder,
+# look up a class id / label, and sample one image.
+# NOTE(review): not referenced in the visible part of this file - presumably kept
+# for documentation or consumed by a docstring helper; confirm before removing.
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from pathlib import Path
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+        >>> model_dir = Path("./DeCo-XL-16-512").resolve()
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     trust_remote_code=True,
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe.to("cuda")
+        >>> print(pipe.id2label[207])
+        >>> print(pipe.get_label_ids("golden retriever"))
+        >>> generator = torch.Generator(device="cuda").manual_seed(42)
+        >>> image = pipe(
+        ...     class_labels="golden retriever",
+        ...     num_inference_steps=100,
+        ...     guidance_scale=5.0,
+        ...     generator=generator,
+        ... ).images[0]
+        ```
+"""
+class DeCoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for class-conditional image generation with DeCo.
+    Parameters:
+        transformer ([`DeCoTransformer2DModel`]):
+            Class-conditional DeCo transformer.
+        scheduler ([`DeCoFlowMatchEulerDiscreteScheduler`]):
+            Flow-matching Euler scheduler for DeCo.
+        decoder ([`DeCoPatchDecoderModel`]):
+            Per-patch RGB decoder (NerfEmbedder + AdaLN MLP).
+        id2label (`dict[int, str]`, *optional*):
+            ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
+    """
+    model_cpu_offload_seq = "transformer->decoder"
+    def __init__(
+        self,
+        transformer,
+        scheduler,
+        decoder,
+        id2label: Optional[Dict[Union[int, str], str]] = None,
+    ):
+        super().__init__()
+        self.register_modules(transformer=transformer, scheduler=scheduler, decoder=decoder)
+        self._id2label = self._normalize_id2label(id2label)
+        self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = True
+    @staticmethod
+    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
+        if not id2label:
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
+        if not model_index_path.exists():
+            return {}
+        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
+        id2label = raw.get("id2label")
+        if not isinstance(id2label, dict):
+            return {}
+        return {int(key): value for key, value in id2label.items()}
+    @staticmethod
+    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
+        label2id: Dict[str, int] = {}
+        for class_id, value in id2label.items():
+            for synonym in value.split(","):
+                synonym = synonym.strip()
+                if synonym:
+                    label2id[synonym] = int(class_id)
+        return dict(sorted(label2id.items()))
+    @property
+    def id2label(self) -> Dict[int, str]:
+        r"""ImageNet class id to English label string (comma-separated synonyms)."""
+        self._ensure_labels_loaded()
+        return self._id2label
+    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        r"""
+        Map ImageNet label strings to class ids.
+        Args:
+            label (`str` or `list[str]`):
+                One or more English label strings. Each string must match a synonym in `id2label`.
+        """
+        self._ensure_labels_loaded()
+        label2id = self.labels
+        if not label2id:
+            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
+        if isinstance(label, str):
+            label = [label]
+        missing = [item for item in label if item not in label2id]
+        if missing:
+            preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
+        return [label2id[item] for item in label]
+    def _normalize_class_labels(
+        self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
+    ) -> torch.LongTensor:
+        if torch.is_tensor(class_labels):
+            return class_labels.to(device=self._execution_device, dtype=torch.long).reshape(-1)
+        if isinstance(class_labels, int):
+            class_label_ids = [class_labels]
+        elif isinstance(class_labels, str):
+            class_label_ids = self.get_label_ids(class_labels)
+        elif class_labels and isinstance(class_labels[0], str):
+            class_label_ids = self.get_label_ids(class_labels)
+        else:
+            class_label_ids = list(class_labels)
+        return torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
+    def _default_sample_size(self) -> int:
+        return int(getattr(self.transformer.config, "sample_size", 256))
+    @torch.no_grad()
+    def __call__(
+        self,
+        class_labels: Union[int, str, List[Union[int, str]], torch.LongTensor],
+        batch_size: Optional[int] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 1.0,
+        generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Generate class-conditional images with DeCo.
+        Args:
+            class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`):
+                ImageNet class indices or human-readable English label strings.
+            batch_size (`int`, *optional*):
+                Number of images to generate. Defaults to the number of class labels. When a single
+                class label is provided, repeats it to match `batch_size`.
+            height (`int`, *optional*):
+                Output image height in pixels. Defaults to `transformer.config.sample_size`.
+            width (`int`, *optional*):
+                Output image width in pixels. Defaults to `transformer.config.sample_size`.
+            num_inference_steps (`int`, defaults to `50`):
+                Number of denoising steps.
+            guidance_scale (`float`, defaults to `1.0`):
+                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
+            generator (`torch.Generator`, *optional*):
+                RNG for reproducibility.
+            output_type (`str`, defaults to `"pil"`):
+                `"pil"`, `"np"`, or `"latent"`.
+            return_dict (`bool`, defaults to `True`):
+                Return [`ImagePipelineOutput`] if True.
+        """
+        device = self._execution_device
+        dtype = next(self.transformer.parameters()).dtype
+        do_cfg = guidance_scale is not None and float(guidance_scale) > 1.0
+        sample_size = self._default_sample_size()
+        height = int(height if height is not None else sample_size)
+        width = int(width if width is not None else sample_size)
+        class_labels = self._normalize_class_labels(class_labels)
+        if batch_size is None:
+            batch_size = int(class_labels.numel())
+        elif class_labels.numel() == 1 and batch_size > 1:
+            class_labels = class_labels.repeat(batch_size)
+        elif class_labels.numel() != batch_size:
+            raise ValueError("class_labels batch size must match batch_size")
+        if do_cfg:
+            null_label = int(self.transformer.config.num_classes)
+            uncond_labels = torch.full((batch_size,), null_label, device=device, dtype=torch.long)
+        latents = randn_tensor(
+            (batch_size, int(self.transformer.config.in_channels), height, width),
+            generator=generator,
+            device=device,
+            dtype=dtype,
+        )
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps[:-1]
+        for timestep in self.progress_bar(timesteps):
+            latent_model_input = self.scheduler.scale_model_input(latents, timestep)
+            if do_cfg:
+                latent_model_input = torch.cat([latent_model_input, latent_model_input], dim=0)
+                model_output = self.transformer(
+                    latent_model_input,
+                    timestep,
+                    class_labels=torch.cat([uncond_labels, class_labels], dim=0),
+                    decoder=self.decoder,
+                ).sample
+                model_output_uncond, model_output_cond = model_output.chunk(2)
+                model_output = model_output_uncond + float(guidance_scale) * (model_output_cond - model_output_uncond)
+            else:
+                model_output = self.transformer(
+                    latent_model_input, timestep, class_labels=class_labels, decoder=self.decoder
+                ).sample
+            latents = self.scheduler.step(model_output, timestep, latents).prev_sample
+        image = latents
+        if output_type == "latent":
+            if not return_dict:
+                return (image,)
+            return ImagePipelineOutput(images=image)
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+        elif output_type != "np":
+            raise ValueError("output_type must be one of {'pil', 'np', 'latent'}")
+        if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)

DeCo-XL-16-512/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_class_name": "DeCoFlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.31.0",
+  "num_train_timesteps": 1000,
+  "shift": 1.0,
+  "last_step": null,
+  "prediction_type": "v_prediction"
+}

DeCo-XL-16-512/scheduler/scheduling_deco_flow_match_euler_discrete.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from __future__ import annotations
+from typing import Optional, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+def _shift_respace_fn(t: torch.Tensor, shift: float = 1.0) -> torch.Tensor:
+    return t / (t + (1 - t) * shift)
+class DeCoFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    config_name = "scheduler_config.json"
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+        last_step: Optional[float] = None,
+        prediction_type: str = "v_prediction",
+    ):
+        self.timesteps = torch.tensor([], dtype=torch.float32)
+        self.num_inference_steps: Optional[int] = None
+        self._step_index: int = 0
+    @property
+    def init_noise_sigma(self) -> float:
+        return 1.0
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None):
+        if num_inference_steps <= 0:
+            raise ValueError("num_inference_steps must be > 0")
+        self.num_inference_steps = int(num_inference_steps)
+        last_step = self.config.last_step
+        if last_step is None:
+            last_step = 1.0 / float(self.num_inference_steps)
+        base_timesteps = torch.linspace(0.0, 1.0 - float(last_step), self.num_inference_steps, dtype=torch.float32)
+        base_timesteps = torch.cat([base_timesteps, torch.tensor([1.0], dtype=torch.float32)], dim=0)
+        timesteps = _shift_respace_fn(base_timesteps, shift=float(self.config.shift))
+        if device is not None:
+            timesteps = timesteps.to(device)
+        self.timesteps = timesteps
+        self._step_index = 0
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return sample
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[torch.Tensor, float],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ):
+        if self.num_inference_steps is None or self.timesteps.numel() == 0:
+            raise ValueError("Call set_timesteps before step")
+        step_index = min(self._step_index, len(self.timesteps) - 2)
+        dt = (self.timesteps[step_index + 1] - self.timesteps[step_index]).to(device=sample.device, dtype=sample.dtype)
+        prev_sample = sample + model_output * dt
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample,)
+        return SchedulerOutput(prev_sample=prev_sample)
+    def add_noise(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        if timesteps.ndim == 0:
+            timesteps = timesteps[None]
+        t = timesteps.to(device=original_samples.device, dtype=original_samples.dtype).view(-1, 1, 1, 1)
+        return t * original_samples + (1.0 - t) * noise

DeCo-XL-16-512/transformer/__pycache__/transformer_deco.cpython-312.pyc ADDED Viewed

Binary file (23.2 kB). View file

DeCo-XL-16-512/transformer/config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "sample_size": 512,
+  "conditioning_type": "class",
+  "decoder_hidden_size": 64,
+  "deep_supervision": 0,
+  "hidden_size": 1152,
+  "hidden_size_x": 32,
+  "in_channels": 3,
+  "learn_sigma": true,
+  "nerf_mlpratio": 4,
+  "num_blocks": 31,
+  "num_classes": 1000,
+  "num_cond_blocks": 28,
+  "num_decoder_blocks": 4,
+  "num_encoder_blocks": 18,
+  "num_groups": 16,
+  "num_text_blocks": 4,
+  "patch_size": 16,
+  "txt_embed_dim": 1024,
+  "txt_max_length": 100
+}

DeCo-XL-16-512/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0468009ed0ab700db3cbf906ae88f6ae19ac6548ddef7f8f2a8f0195c2fe33f
+size 2691309848

DeCo-XL-16-512/transformer/diffusion_pytorch_model.safetensors.bak ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d5ae272eea0747e306bcef99cc32b014cc1180f7fc1462cdb3e6e27ee0ffd3e
+size 2691309848

DeCo-XL-16-512/transformer/transformer_deco.py ADDED Viewed

	@@ -0,0 +1,332 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.functional import scaled_dot_product_attention
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from diffusers.models.normalization import RMSNorm
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return x * (1 + scale) + shift
+class PatchEmbed(nn.Module):
+    def __init__(self, in_chans: int, embed_dim: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.proj(x)
+class TimestepEmbedder(nn.Module):
+    """Sinusoidal timestep embedding with checkpoint-compatible `mlp` module names."""
+    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 10) -> torch.Tensor:
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
+        )
+        args = t[..., None].float() * freqs[None, ...]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding.to(t.dtype)
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        return self.mlp(t_freq)
+class DeCoSwiGLU(nn.Module):
+    """SwiGLU MLP with w1/w2/w3 layout matching official DeCo checkpoints."""
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float = 10000.0, scale: float = 16.0) -> torch.Tensor:
+    x_pos = torch.linspace(0, scale, width)
+    y_pos = torch.linspace(0, scale, height)
+    y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij")
+    y_pos = y_pos.reshape(-1)
+    x_pos = x_pos.reshape(-1)
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    x_freqs = torch.outer(x_pos, freqs).float()
+    y_freqs = torch.outer(y_pos, freqs).float()
+    x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)
+    y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)
+    freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1)
+    return freqs_cis.reshape(height * width, -1)
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    freqs_cis = freqs_cis[None, :, None, :]
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+class LabelEmbedder(nn.Module):
+    def __init__(self, num_classes: int, hidden_size: int):
+        super().__init__()
+        self.embedding_table = nn.Embedding(num_classes, hidden_size)
+    def forward(self, labels: torch.Tensor) -> torch.Tensor:
+        return self.embedding_table(labels)
+class RAttention(nn.Module):
+    """Multi-head self-attention with optional RMS-normalised queries/keys and rotary position factors."""
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        qk_norm: bool = True,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        # One fused projection produces queries, keys and values together.
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        # Per-head normalisation of q and k; identity when `qk_norm` is False.
+        self.q_norm = RMSNorm(self.head_dim, eps=1e-6) if qk_norm else nn.Identity()
+        self.k_norm = RMSNorm(self.head_dim, eps=1e-6) if qk_norm else nn.Identity()
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: torch.Tensor, pos: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Attend over the token axis of `x`.
+
+        Args:
+            x: Input of shape `(batch, tokens, channels)`.
+            pos: Rotary factors passed to `apply_rotary_emb` as `freqs_cis`.
+            mask: Optional attention mask forwarded to `scaled_dot_product_attention`.
+
+        Returns:
+            Tensor of shape `(batch, tokens, channels)`.
+        """
+        batch_size, num_tokens, channels = x.shape
+        # After reshape + permute: (3, batch, tokens, heads, head_dim).
+        qkv = self.qkv(x).reshape(batch_size, num_tokens, 3, self.num_heads, self.head_dim).permute(2, 0, 1, 3, 4)
+        query, key, value = qkv[0], qkv[1], qkv[2]
+        query = self.q_norm(query)
+        key = self.k_norm(key)
+        # Rotary factors are applied to q and k only; values are left untouched.
+        query, key = apply_rotary_emb(query, key, freqs_cis=pos)
+        # Move heads in front of tokens: (batch, heads, tokens, head_dim).
+        query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        x = scaled_dot_product_attention(query, key, value, attn_mask=mask, dropout_p=0.0)
+        # Merge heads back into the channel axis.
+        x = x.transpose(1, 2).reshape(batch_size, num_tokens, channels)
+        return self.proj_drop(self.proj(x))
+class FlattenDiTBlock(nn.Module):
+    def __init__(self, hidden_size: int, groups: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.norm1 = RMSNorm(hidden_size, eps=1e-6)
+        self.attn = RAttention(hidden_size, num_heads=groups, qkv_bias=False)
+        self.norm2 = RMSNorm(hidden_size, eps=1e-6)
+        self.mlp = DeCoSwiGLU(hidden_size, int(hidden_size * mlp_ratio))
+        self.adaLN_modulation = nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+    def forward(self, x: torch.Tensor, c: torch.Tensor, pos: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
+        x = x + gate_msa * self.attn(_modulate(self.norm1(x), shift_msa, scale_msa), pos, mask=mask)
+        return x + gate_mlp * self.mlp(_modulate(self.norm2(x), shift_mlp, scale_mlp))
+@dataclass
+class DeCoTransformer2DModelOutput(BaseOutput):
+    """Output container returned by `DeCoTransformer2DModel.forward` when `return_dict=True`."""
+    # Tensor returned by the backbone for the given input sample.
+    sample: torch.Tensor
+class _DeCoTransformerBackbone(nn.Module):
+    """Class-conditioned DeCo conditioning trunk. Checkpoint weights live under the `backbone.` prefix.
+
+    The trunk patchifies the input, runs `num_cond_blocks` transformer blocks to get one
+    conditioning vector per patch, and hands the patch pixels plus that conditioning to an
+    externally supplied decoder module.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        patch_size: int,
+        num_groups: int,
+        hidden_size: int,
+        num_cond_blocks: int,
+        num_classes: int,
+        learn_sigma: bool,
+        deep_supervision: int,
+    ):
+        super().__init__()
+        # `learn_sigma` and `deep_supervision` are stored but not read anywhere in this class.
+        self.learn_sigma = learn_sigma
+        self.deep_supervision = deep_supervision
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.hidden_size = hidden_size
+        self.num_groups = num_groups
+        self.num_cond_blocks = num_cond_blocks
+        self.s_embedder = PatchEmbed(in_channels * patch_size**2, hidden_size, bias=True)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        # One extra embedding row beyond `num_classes`; index `num_classes` is then a valid label.
+        self.y_embedder = LabelEmbedder(num_classes + 1, hidden_size)
+        self.blocks = nn.ModuleList([FlattenDiTBlock(hidden_size, num_groups) for _ in range(num_cond_blocks)])
+        # Cache of rotary factors keyed by patch-grid size (rows, cols).
+        self.precompute_pos: dict[tuple[int, int], torch.Tensor] = {}
+        self._init_weights()
+    def _init_weights(self) -> None:
+        """Re-initialise the patch, label and timestep embedders."""
+        weight = self.s_embedder.proj.weight.data
+        nn.init.xavier_uniform_(weight.view([weight.shape[0], -1]))
+        nn.init.constant_(self.s_embedder.proj.bias, 0)
+        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+    def fetch_pos(self, height: int, width: int, device: torch.device) -> torch.Tensor:
+        """Return rotary factors for a `height x width` patch grid, moved to `device`."""
+        key = (height, width)
+        if key not in self.precompute_pos:
+            # Computed once per grid size; the per-head width is hidden_size // num_groups.
+            self.precompute_pos[key] = precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width)
+        return self.precompute_pos[key].to(device)
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        y: torch.Tensor,
+        decoder: nn.Module,
+        s: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Run the trunk and the decoder on an image batch.
+
+        Args:
+            x: Input of shape `(batch, channels, height, width)`.
+            t: Timesteps; flattened before embedding.
+            y: Integer class labels, one per batch element.
+            decoder: Module called as `decoder(patch_pixels, conditioning)`; its `.sample` is used.
+            s: Optional precomputed per-patch conditioning; when given, the blocks are skipped.
+            mask: Optional attention mask forwarded to every block.
+
+        Returns:
+            Tensor folded back to spatial size `(height, width)`.
+        """
+        batch_size, _, height, width = x.shape
+        pos = self.fetch_pos(height // self.patch_size, width // self.patch_size, x.device)
+        # Non-overlapping patches: (batch, num_patches, channels * patch_size**2).
+        x = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
+        t = self.t_embedder(t.view(-1)).view(batch_size, -1, self.hidden_size)
+        y = self.y_embedder(y).view(batch_size, 1, self.hidden_size)
+        c = F.silu(t + y)
+        if s is None:
+            s = self.s_embedder(x)
+            for block in self.blocks:
+                s = block(s, c, pos, mask)
+            # The timestep embedding is added again before the conditioning reaches the decoder.
+            s = F.silu(t + s)
+        batch_size, length, _ = s.shape
+        # Each patch becomes its own decoder sample: (batch * num_patches, patch_size**2, channels).
+        patch_pixels = x.reshape(batch_size * length, self.in_channels, self.patch_size**2).transpose(1, 2)
+        conditioning = s.view(batch_size * length, self.hidden_size)
+        decoded = decoder(patch_pixels, conditioning).sample
+        x = decoded.transpose(1, 2).reshape(batch_size, length, -1)
+        # Inverse of the unfold above: reassemble patches into an image.
+        return F.fold(
+            x.transpose(1, 2).contiguous(),
+            (height, width),
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+class DeCoTransformer2DModel(ModelMixin, ConfigMixin):
+    """Class-conditioned DeCo transformer (c2i) for Diffusers pipelines."""
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 4,
+        patch_size: int = 2,
+        num_groups: int = 12,
+        hidden_size: int = 1152,
+        hidden_size_x: int = 64,
+        num_blocks: int = 18,
+        num_cond_blocks: int = 4,
+        num_classes: int = 1000,
+        learn_sigma: bool = True,
+        deep_supervision: int = 0,
+        sample_size: int = 256,
+        # Deprecated config keys kept for backward-compatible hub configs.
+        conditioning_type: str = "class",
+        nerf_mlpratio: int = 4,
+        decoder_hidden_size: int = 64,
+        num_encoder_blocks: int = 18,
+        num_decoder_blocks: int = 4,
+        num_text_blocks: int = 4,
+        txt_embed_dim: int = 1024,
+        txt_max_length: int = 100,
+    ):
+        super().__init__()
+        del hidden_size_x, nerf_mlpratio, decoder_hidden_size, num_encoder_blocks, num_decoder_blocks
+        del num_text_blocks, txt_embed_dim, txt_max_length
+        if conditioning_type != "class":
+            raise ValueError("DeCoTransformer2DModel only supports class conditioning (c2i).")
+        self.backbone = _DeCoTransformerBackbone(
+            in_channels=in_channels,
+            patch_size=patch_size,
+            num_groups=num_groups,
+            hidden_size=hidden_size,
+            num_cond_blocks=num_cond_blocks,
+            num_classes=num_classes,
+            learn_sigma=learn_sigma,
+            deep_supervision=deep_supervision,
+        )
+    @property
+    def in_channels(self) -> int:
+        return int(self.config.in_channels)
+    def _prepare_timestep(
+        self, timestep: Union[torch.Tensor, float, int], batch_size: int, sample: torch.Tensor
+    ) -> torch.Tensor:
+        if not isinstance(timestep, torch.Tensor):
+            timestep = torch.tensor([timestep], device=sample.device, dtype=sample.dtype)
+        timestep = timestep.to(device=sample.device, dtype=sample.dtype)
+        if timestep.ndim == 0:
+            timestep = timestep[None]
+        if timestep.shape[0] == 1 and batch_size > 1:
+            timestep = timestep.repeat(batch_size)
+        return timestep
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        class_labels: Optional[torch.Tensor] = None,
+        decoder: Optional[nn.Module] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DeCoTransformer2DModelOutput, tuple[torch.Tensor]]:
+        if encoder_hidden_states is not None:
+            raise ValueError("encoder_hidden_states is not supported; use class_labels for c2i DeCo models.")
+        if class_labels is None:
+            raise ValueError("class_labels must be provided for class-conditioned DeCo models.")
+        if decoder is None:
+            raise ValueError("decoder must be provided; load DeCoPatchDecoderModel as a separate pipeline component.")
+        batch_size = sample.shape[0]
+        t = self._prepare_timestep(timestep=timestep, batch_size=batch_size, sample=sample)
+        output = self.backbone(
+            sample,
+            t,
+            class_labels.to(device=sample.device, dtype=torch.long),
+            decoder=decoder,
+        )
+        if not return_dict:
+            return (output,)
+        return DeCoTransformer2DModelOutput(sample=output)

DeCo-XXL-16-512-t2i/decoder/__pycache__/decoder_deco.cpython-312.pyc ADDED Viewed

Binary file (12 kB). View file

DeCo-XXL-16-512-t2i/decoder/config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "in_channels": 3,
+  "hidden_size_x": 32,
+  "z_channels": 1536,
+  "max_freqs": 8,
+  "num_res_blocks": 3,
+  "patch_size": 16
+}

DeCo-XXL-16-512-t2i/decoder/decoder_deco.py ADDED Viewed

	@@ -0,0 +1,177 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return x * (1 + scale) + shift
+def _precompute_freqs_cis_ex2d(
+    dim: int,
+    height: int,
+    width: int,
+    theta: float = 10000.0,
+    scale: float = 1.0,
+) -> torch.Tensor:
+    """Match Zehong-Ma/DeCo `precompute_freqs_cis_ex2d` used by NerfEmbedder."""
+    if isinstance(scale, float):
+        scale = (scale, scale)
+    x_pos = torch.linspace(0, height * scale[0], width)
+    y_pos = torch.linspace(0, width * scale[1], height)
+    y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij")
+    y_pos = y_pos.reshape(-1)
+    x_pos = x_pos.reshape(-1)
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    x_freqs = torch.outer(x_pos, freqs).float()
+    y_freqs = torch.outer(y_pos, freqs).float()
+    x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)
+    y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)
+    freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1)
+    return freqs_cis.reshape(height * width, -1)
+class NerfEmbedder(nn.Module):
+    def __init__(self, in_channels: int, hidden_size_input: int, max_freqs: int):
+        super().__init__()
+        self.max_freqs = max_freqs
+        self.embedder = nn.Sequential(nn.Linear(in_channels + max_freqs**2, hidden_size_input, bias=True))
+    @lru_cache
+    def fetch_pos(self, patch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
+        pos = _precompute_freqs_cis_ex2d(self.max_freqs**2 * 2, patch_size, patch_size)
+        # Official code casts complex cis to real when concatenating with patch pixels.
+        return pos[None, :, :].to(device=device, dtype=dtype)
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        batch_size, patch_tokens, _ = inputs.shape
+        patch_size = int(patch_tokens**0.5)
+        dct = self.fetch_pos(patch_size, inputs.device, inputs.dtype).repeat(batch_size, 1, 1)
+        return self.embedder(torch.cat([inputs, dct], dim=-1))
+class ResBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.in_ln = nn.LayerNorm(channels, eps=1e-6)
+        self.mlp = nn.Sequential(
+            nn.Linear(channels, channels, bias=True),
+            nn.SiLU(),
+            nn.Linear(channels, channels, bias=True),
+        )
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(channels, 3 * channels, bias=True))
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
+        return x + gate_mlp * self.mlp(_modulate(self.in_ln(x), shift_mlp, scale_mlp))
+class DecoderFinalLayer(nn.Module):
+    def __init__(self, model_channels: int, out_channels: int):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(model_channels, out_channels, bias=True)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(self.norm_final(x))
+class SimpleMLPAdaLN(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        out_channels: int,
+        z_channels: int,
+        num_res_blocks: int,
+        patch_size: int,
+        grad_checkpointing: bool = False,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.grad_checkpointing = grad_checkpointing
+        self.cond_embed = nn.Linear(z_channels, patch_size**2 * model_channels)
+        self.input_proj = nn.Linear(in_channels, model_channels)
+        self.res_blocks = nn.ModuleList([ResBlock(model_channels) for _ in range(num_res_blocks)])
+        self.final_layer = DecoderFinalLayer(model_channels, out_channels)
+        self._init_weights()
+    def _init_weights(self) -> None:
+        for block in self.res_blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+        x = self.input_proj(x)
+        y = self.cond_embed(c).reshape(c.shape[0], self.patch_size**2, -1)
+        for block in self.res_blocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(block, x, y)
+            else:
+                x = block(x, y)
+        return self.final_layer(x)
+@dataclass
+class DeCoPatchDecoderOutput(BaseOutput):
+    """Output container returned by `DeCoPatchDecoderModel.forward` when `return_dict=True`."""
+    # Tensor returned by the decoder network for the given patches.
+    sample: torch.Tensor
+class DeCoPatchDecoderModel(ModelMixin, ConfigMixin):
+    """Per-patch RGB decoder for DeCo (NerfEmbedder + AdaLN MLP)."""
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        hidden_size_x: int = 32,
+        z_channels: int = 1152,
+        num_res_blocks: int = 3,
+        patch_size: int = 16,
+        max_freqs: int = 8,
+    ):
+        super().__init__()
+        self.x_embedder = NerfEmbedder(in_channels, hidden_size_x, max_freqs=max_freqs)
+        self.dec_net = SimpleMLPAdaLN(
+            in_channels=hidden_size_x,
+            model_channels=hidden_size_x,
+            out_channels=in_channels,
+            z_channels=z_channels,
+            num_res_blocks=num_res_blocks,
+            patch_size=patch_size,
+        )
+    def forward(
+        self,
+        patch_pixels: torch.Tensor,
+        conditioning: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[DeCoPatchDecoderOutput, tuple[torch.Tensor]]:
+        """
+        Args:
+            patch_pixels (`torch.Tensor`):
+                Flattened patch pixels of shape `(batch * num_patches, patch_size ** 2, in_channels)`.
+            conditioning (`torch.Tensor`):
+                Per-patch conditioning of shape `(batch * num_patches, z_channels)`.
+        """
+        output = self.dec_net(self.x_embedder(patch_pixels), conditioning)
+        if not return_dict:
+            return (output,)
+        return DeCoPatchDecoderOutput(sample=output)

DeCo-XXL-16-512-t2i/decoder/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2468211dbc3dd72c7ebd9d7d86913442c9a1dc93fec03cfa135a658d84d5fd5e
+size 50445148

DeCo-XXL-16-512-t2i/model_index.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_class_name": [
+    "pipeline",
+    "DeCoT2IPipeline"
+  ],
+  "_diffusers_version": "0.31.0",
+  "transformer": [
+    "transformer_deco_t2i",
+    "DeCoT2ITransformer2DModel"
+  ],
+  "decoder": [
+    "decoder_deco",
+    "DeCoPatchDecoderModel"
+  ],
+  "scheduler": [
+    "scheduling_deco_flow_match_adam_discrete",
+    "DeCoFlowMatchAdamDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "Qwen3Model"
+  ],
+  "tokenizer": [
+    "transformers",
+    "Qwen2Tokenizer"
+  ]
+}

DeCo-XXL-16-512-t2i/pipeline.py ADDED Viewed

	@@ -0,0 +1,291 @@

+"""Hub custom pipeline: DeCoT2IPipeline (text-to-image, 512×512).
+Sampling matches official DeCo AdamLMSampler:
+https://github.com/Zehong-Ma/DeCo/blob/main/src/diffusion/flow_matching/adam_sampling.py
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+import torch
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.utils.torch_utils import randn_tensor
+DEFAULT_TEXT_ENCODER_REPO = "Qwen/Qwen3-1.7B"
+class DeCoT2IPipeline(DiffusionPipeline):
+    model_cpu_offload_seq = "text_encoder->transformer->decoder"
+    _optional_components = ["text_encoder", "tokenizer"]
+    def __init__(
+        self,
+        transformer,
+        scheduler,
+        decoder,
+        text_encoder=None,
+        tokenizer=None,
+    ):
+        super().__init__()
+        self.register_modules(
+            transformer=transformer,
+            scheduler=scheduler,
+            decoder=decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        pipe = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        if pipe.text_encoder is None or pipe.tokenizer is None:
+            model_dir = Path(getattr(pipe.config, "_name_or_path", pretrained_model_name_or_path)).resolve()
+            pipe._load_text_encoder(model_dir, **kwargs)
+        return pipe
+    @staticmethod
+    def _resolve_text_encoder_path(model_dir: Path) -> Path:
+        hint = model_dir / "text_encoder_pretrained_model_name_or_path.txt"
+        if hint.exists():
+            raw = hint.read_text(encoding="utf-8").strip().splitlines()[0].strip()
+            path = Path(raw)
+            if not path.is_absolute():
+                path = (model_dir / path).resolve()
+            if path.exists():
+                return path
+        local = model_dir / "text_encoder"
+        if local.exists():
+            return local.resolve()
+        return Path(DEFAULT_TEXT_ENCODER_REPO)
+    def _load_text_encoder(self, model_dir: Path, **kwargs) -> None:
+        from transformers import Qwen2Tokenizer, Qwen3Model
+        text_path = self._resolve_text_encoder_path(model_dir)
+        load_kwargs = {
+            k: kwargs[k]
+            for k in ("torch_dtype", "device_map", "local_files_only", "revision", "cache_dir")
+            if k in kwargs
+        }
+        text_encoder = Qwen3Model.from_pretrained(str(text_path), **load_kwargs)
+        tokenizer = Qwen2Tokenizer.from_pretrained(
+            str(text_path),
+            max_length=self.txt_max_length,
+            padding_side="right",
+            **{k: v for k, v in load_kwargs.items() if k in ("local_files_only", "revision", "cache_dir")},
+        )
+        self.register_modules(text_encoder=text_encoder, tokenizer=tokenizer)
+    @property
+    def txt_embed_dim(self) -> int:
+        return int(getattr(self.transformer.config, "txt_embed_dim", 2048))
+    @property
+    def txt_max_length(self) -> int:
+        return int(getattr(self.transformer.config, "txt_max_length", 128))
+    @staticmethod
+    def _effective_guidance_scale(
+        timestep: Union[torch.Tensor, float],
+        guidance_scale: float,
+        do_cfg: bool,
+        guidance_interval_min: float,
+        guidance_interval_max: float,
+    ) -> float:
+        """Match official AdamLMSampler: CFG when t > min and t < max."""
+        if not do_cfg:
+            return 1.0
+        t = float(timestep)
+        if t > guidance_interval_min and t < guidance_interval_max:
+            return float(guidance_scale)
+        return 1.0
+    @staticmethod
+    def _fp_to_uint8(image: torch.Tensor) -> torch.Tensor:
+        return torch.clip_((image + 1) * 127.5 + 0.5, 0, 255).to(torch.uint8)
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.text_encoder is None or self.tokenizer is None:
+            raise ValueError("text_encoder and tokenizer must be loaded for t2i inference.")
+        device = device or self._execution_device
+        dtype = dtype or next(self.text_encoder.parameters()).dtype
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        batch_size = len(prompt)
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        def _encode(texts: List[str]) -> torch.Tensor:
+            tokenized = self.tokenizer(
+                texts,
+                truncation=True,
+                max_length=self.txt_max_length,
+                padding="max_length",
+                return_tensors="pt",
+            )
+            input_ids = tokenized.input_ids.to(device)
+            attention_mask = tokenized.attention_mask.to(device)
+            outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
+            hidden = outputs[0]
+            embed_dim = self.txt_embed_dim
+            if hidden.shape[-1] < embed_dim:
+                pad = torch.zeros(
+                    hidden.shape[0],
+                    hidden.shape[1],
+                    embed_dim - hidden.shape[-1],
+                    device=hidden.device,
+                    dtype=hidden.dtype,
+                )
+                hidden = torch.cat([hidden, pad], dim=-1)
+            elif hidden.shape[-1] > embed_dim:
+                hidden = hidden[:, :, :embed_dim]
+            return hidden.to(dtype=dtype)
+        return _encode(prompt), _encode(negative_prompt)
+    def _default_sample_size(self) -> int:
+        return int(getattr(self.transformer.config, "sample_size", 512))
+    @torch.no_grad()
+    @torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=torch.cuda.is_available())
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 25,
+        guidance_scale: float = 4.0,
+        timeshift: Optional[float] = None,
+        order: Optional[int] = None,
+        guidance_interval_min: Optional[float] = None,
+        guidance_interval_max: Optional[float] = None,
+        generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        device = self._execution_device
+        dtype = next(self.transformer.parameters()).dtype
+        do_cfg = guidance_scale is not None and float(guidance_scale) > 1.0
+        if prompt_embeds is not None:
+            batch_size = int(prompt_embeds.shape[0])
+        elif prompt is None:
+            raise ValueError("Either `prompt` or `prompt_embeds` must be provided.")
+        elif isinstance(prompt, str):
+            batch_size = 1
+        else:
+            batch_size = len(prompt)
+        sample_size = self._default_sample_size()
+        height = int(height if height is not None else sample_size)
+        width = int(width if width is not None else sample_size)
+        height = height // 16 * 16
+        width = width // 16 * 16
+        interval_min = (
+            float(guidance_interval_min)
+            if guidance_interval_min is not None
+            else float(getattr(self.scheduler.config, "guidance_interval_min", 0.0))
+        )
+        interval_max = (
+            float(guidance_interval_max)
+            if guidance_interval_max is not None
+            else float(getattr(self.scheduler.config, "guidance_interval_max", 1.0))
+        )
+        if prompt_embeds is None:
+            prompt_embeds, negative_embeds = self.encode_prompt(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                device=device,
+                dtype=dtype,
+            )
+        else:
+            negative_embeds = negative_prompt_embeds
+            if negative_embeds is None:
+                negative_embeds = torch.zeros_like(prompt_embeds)
+        # Official DeCo t2i: float32 noise on CPU, then move to device (see app.py / GenEval).
+        noise_shape = (batch_size, int(self.transformer.config.in_channels), height, width)
+        if generator is not None:
+            gen_device = getattr(generator, "device", None)
+            if gen_device is not None and str(gen_device).startswith("cuda"):
+                latents = randn_tensor(
+                    noise_shape, generator=generator, device=device, dtype=torch.float32
+                )
+            else:
+                latents = randn_tensor(
+                    noise_shape, generator=generator, device="cpu", dtype=torch.float32
+                ).to(device)
+        else:
+            latents = randn_tensor(noise_shape, device="cpu", dtype=torch.float32).to(device)
+        set_kwargs = {
+            "num_inference_steps": num_inference_steps,
+            "guidance_scale": guidance_scale,
+            "device": device,
+        }
+        if timeshift is not None:
+            set_kwargs["timeshift"] = timeshift
+        if order is not None:
+            set_kwargs["order"] = order
+        self.scheduler.set_timesteps(**set_kwargs)
+        cfg_condition = torch.cat([negative_embeds, prompt_embeds], dim=0)
+        pred_trajectory: list[torch.Tensor] = []
+        t_cur = torch.zeros(batch_size, device=device, dtype=torch.float32)
+        timedeltas = self.scheduler._timedeltas
+        solver_coeffs = self.scheduler._solver_coeffs
+        for i in self.progress_bar(range(len(timedeltas))):
+            cfg_x = torch.cat([latents, latents], dim=0)
+            cfg_t = t_cur.repeat(2)
+            out = self.transformer(cfg_x, cfg_t, encoder_hidden_states=cfg_condition, decoder=self.decoder).sample
+            if do_cfg and t_cur[0] > interval_min and t_cur[0] < interval_max:
+                cfg_scale = float(guidance_scale)
+            else:
+                cfg_scale = 1.0
+            uncond, cond = out.chunk(2, dim=0)
+            out = uncond + cfg_scale * (cond - uncond)
+            pred_trajectory.append(out)
+            combined = torch.zeros_like(out)
+            order = len(solver_coeffs[i])
+            for j in range(order):
+                combined = combined + solver_coeffs[i][j] * pred_trajectory[-order:][j]
+            latents = latents + combined * timedeltas[i]
+            t_cur = t_cur + timedeltas[i]
+        if output_type == "latent":
+            if not return_dict:
+                return (latents,)
+            return ImagePipelineOutput(images=latents)
+        images_uint8 = self._fp_to_uint8(latents.float()).permute(0, 2, 3, 1).cpu().numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(images_uint8)
+        elif output_type == "np":
+            image = images_uint8
+        else:
+            raise ValueError("output_type must be one of {'pil', 'np', 'latent'}")
+        if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)

DeCo-XXL-16-512-t2i/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_class_name": "DeCoFlowMatchAdamDiscreteScheduler",
+  "_diffusers_version": "0.31.0",
+  "num_train_timesteps": 1000,
+  "num_inference_steps": 25,
+  "guidance_scale": 4.0,
+  "timeshift": 3.0,
+  "order": 2,
+  "guidance_interval_min": 0.0,
+  "guidance_interval_max": 1.0,
+  "last_step": null,
+  "prediction_type": "v_prediction"
+}

DeCo-XXL-16-512-t2i/scheduler/scheduling_deco_flow_match_adam_discrete.py ADDED Viewed

	@@ -0,0 +1,200 @@

+"""Flow-matching AdamLM scheduler matching Zehong-Ma/DeCo AdamLMSampler."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, List, Optional, Tuple, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+@dataclass
+class DeCoFlowMatchAdamSchedulerOutput:
+    """Return value of `DeCoFlowMatchAdamDiscreteScheduler.step` when `return_dict` is true."""
+    # Sample after applying one solver update.
+    prev_sample: torch.Tensor
+class DeCoFlowMatchAdamDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """AdamLM multi-step flow-matching sampler (order=2 by default for t2i)."""
+    config_name = "scheduler_config.json"
+    order = 1
+    init_noise_sigma = 1.0
+    @staticmethod
+    def _lagrange_coeffs(order: int, pre_ts: torch.Tensor, t_start: torch.Tensor, t_end: torch.Tensor) -> List[float]:
+        """Return normalized multi-step weights for one solver step.
+        Each weight is the integral over [t_start, t_end] of the Lagrange basis polynomial
+        built on the last `order` entries of `pre_ts`, divided by the sum of all such
+        integrals, so the returned weights add up to 1. Weights are ordered like the
+        nodes, i.e. oldest timestep first.
+        Raises:
+            ValueError: If `order` is not 1, 2 or 3.
+        """
+        # Pull plain Python floats so the closed-form integrals below run on the host.
+        ts = [float(v) for v in pre_ts[-order:].tolist()]
+        a = float(t_start)
+        b = float(t_end)
+        if order == 1:
+            return [1.0]
+        if order == 2:
+            t1, t2 = ts
+            # Integral of (t - t2) / (t1 - t2) and of (t - t1) / (t2 - t1) from a to b.
+            int1 = 0.5 / (t1 - t2) * ((b - t2) ** 2 - (a - t2) ** 2)
+            int2 = 0.5 / (t2 - t1) * ((b - t1) ** 2 - (a - t1) ** 2)
+            total = int1 + int2
+            return [int1 / total, int2 / total]
+        if order == 3:
+            t1, t2, t3 = ts
+            # Each term is the antiderivative of (t - u)(t - v) evaluated at b minus a,
+            # divided by the basis denominator (t_k - u)(t_k - v).
+            int1_denom = (t1 - t2) * (t1 - t3)
+            int1 = ((1 / 3) * b**3 - 0.5 * (t2 + t3) * b**2 + (t2 * t3) * b) - (
+                (1 / 3) * a**3 - 0.5 * (t2 + t3) * a**2 + (t2 * t3) * a
+            )
+            int1 = int1 / int1_denom
+            int2_denom = (t2 - t1) * (t2 - t3)
+            int2 = ((1 / 3) * b**3 - 0.5 * (t1 + t3) * b**2 + (t1 * t3) * b) - (
+                (1 / 3) * a**3 - 0.5 * (t1 + t3) * a**2 + (t1 * t3) * a
+            )
+            int2 = int2 / int2_denom
+            int3_denom = (t3 - t1) * (t3 - t2)
+            int3 = ((1 / 3) * b**3 - 0.5 * (t1 + t2) * b**2 + (t1 * t2) * b) - (
+                (1 / 3) * a**3 - 0.5 * (t1 + t2) * a**2 + (t1 * t2) * a
+            )
+            int3 = int3 / int3_denom
+            total = int1 + int2 + int3
+            return [int1 / total, int2 / total, int3 / total]
+        raise ValueError(f"Unsupported solver order: {order}.")
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        num_inference_steps: int = 25,
+        guidance_scale: float = 4.0,
+        timeshift: float = 3.0,
+        order: int = 2,
+        guidance_interval_min: float = 0.0,
+        guidance_interval_max: float = 1.0,
+        last_step: Optional[float] = None,
+        prediction_type: str = "v_prediction",
+    ) -> None:
+        """Store the sampler settings and clear all per-run state.
+        `num_train_timesteps` and `prediction_type` are not read in this body;
+        presumably they are only recorded in the config by `register_to_config`.
+        """
+        self.num_inference_steps = int(num_inference_steps)
+        self.guidance_scale = float(guidance_scale)
+        self.timeshift = float(timeshift)
+        # Instance attribute; shadows the class-level `order = 1`.
+        self.order = int(order)
+        self.guidance_interval_min = float(guidance_interval_min)
+        self.guidance_interval_max = float(guidance_interval_max)
+        # None means "derive from the step count" (see _build_solver_state).
+        self.last_step = last_step
+        self._reset_state()
+    def _reset_state(self) -> None:
+        self.timesteps: Optional[torch.Tensor] = None
+        self._timedeltas: Optional[torch.Tensor] = None
+        self._solver_coeffs: Optional[List[List[float]]] = None
+        self._model_outputs: List[torch.Tensor] = []
+        self._step_index = 0
+    @staticmethod
+    def _shift_respace_fn(t: torch.Tensor, shift: float = 3.0) -> torch.Tensor:
+        return t / (t + (1 - t) * shift)
+    def _build_solver_state(
+        self,
+        num_inference_steps: int,
+        timeshift: float,
+        device: Optional[Union[str, torch.device]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[float]]]:
+        last_step = self.last_step
+        if last_step is None:
+            last_step = 1.0 / float(num_inference_steps)
+        endpoints = torch.linspace(0.0, 1.0 - float(last_step), int(num_inference_steps), dtype=torch.float32)
+        endpoints = torch.cat([endpoints, torch.tensor([1.0], dtype=torch.float32)], dim=0)
+        timesteps = self._shift_respace_fn(endpoints, timeshift).to(device=device)
+        timedeltas = (timesteps[1:] - timesteps[:-1]).to(device=device)
+        solver_coeffs: List[List[float]] = [[] for _ in range(int(num_inference_steps))]
+        for i in range(int(num_inference_steps)):
+            order = min(self.order, i + 1)
+            pre_ts = timesteps[: i + 1]
+            coeffs = self._lagrange_coeffs(order, pre_ts, pre_ts[i], timesteps[i + 1])
+            solver_coeffs[i] = coeffs
+        return timesteps[:-1], timedeltas, solver_coeffs
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = None,
+        timeshift: Optional[float] = None,
+        guidance_scale: Optional[float] = None,
+        order: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        if num_inference_steps is not None:
+            self.num_inference_steps = int(num_inference_steps)
+        if timeshift is not None:
+            self.timeshift = float(timeshift)
+        else:
+            self.timeshift = float(getattr(self.config, "timeshift", self.timeshift))
+        if guidance_scale is not None:
+            self.guidance_scale = float(guidance_scale)
+        if order is not None:
+            self.order = int(order)
+        else:
+            self.order = int(getattr(self.config, "order", self.order))
+        timesteps, timedeltas, solver_coeffs = self._build_solver_state(
+            self.num_inference_steps,
+            self.timeshift,
+            device=device,
+        )
+        self.timesteps = timesteps
+        self._timedeltas = timedeltas
+        self._solver_coeffs = solver_coeffs
+        self._model_outputs = []
+        self._step_index = 0
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Return `sample` unchanged; `timestep` is ignored."""
+        return sample
+    def classifier_free_guidance(
+        self,
+        model_output: torch.Tensor,
+        guidance_scale: Optional[float] = None,
+    ) -> torch.Tensor:
+        if model_output.shape[0] % 2 != 0:
+            raise ValueError("Classifier-free guidance expects concatenated unconditional/conditional batches.")
+        scale = self.guidance_scale if guidance_scale is None else float(guidance_scale)
+        uncond, cond = model_output.chunk(2, dim=0)
+        return uncond + scale * (cond - uncond)
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+        **kwargs: Any,
+    ) -> Union[DeCoFlowMatchAdamSchedulerOutput, Tuple[torch.Tensor]]:
+        del timestep, kwargs
+        if self.timesteps is None or self._timedeltas is None or self._solver_coeffs is None:
+            raise RuntimeError("`set_timesteps` must be called before `step`.")
+        if self._step_index >= len(self._solver_coeffs):
+            raise RuntimeError("Scheduler step index exceeded configured timesteps.")
+        coeffs = self._solver_coeffs[self._step_index]
+        self._model_outputs.append(model_output)
+        order = len(coeffs)
+        pred = torch.zeros_like(model_output)
+        recent = self._model_outputs[-order:]
+        for coeff, output in zip(coeffs, recent):
+            pred = pred + coeff * output
+        prev_sample = sample + pred * self._timedeltas[self._step_index]
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample,)
+        return DeCoFlowMatchAdamSchedulerOutput(prev_sample=prev_sample)
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor,
+    ) -> torch.Tensor:
+        """Interpolate between noise and data: t * original_samples + (1 - t) * noise."""
+        # Reshape to (N, 1, 1, 1) so each time broadcasts over one sample;
+        # assumes 4-D sample tensors — TODO confirm against callers.
+        alpha = timesteps.view(-1, 1, 1, 1)
+        sigma = (1.0 - timesteps).view(-1, 1, 1, 1)
+        return alpha * original_samples + sigma * noise

DeCo-XXL-16-512-t2i/scheduler/scheduling_deco_flow_match_euler_discrete.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from __future__ import annotations
+from typing import Optional, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
+def _shift_respace_fn(t: torch.Tensor, shift: float = 1.0) -> torch.Tensor:
+    """Warp times with t / (t + (1 - t) * shift); the default shift of 1.0 leaves `t` unchanged."""
+    return t / (t + (1 - t) * shift)
+class DeCoFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    config_name = "scheduler_config.json"
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+        last_step: Optional[float] = None,
+        prediction_type: str = "v_prediction",
+    ):
+        self.timesteps = torch.tensor([], dtype=torch.float32)
+        self.num_inference_steps: Optional[int] = None
+        self._step_index: int = 0
+    @property
+    def init_noise_sigma(self) -> float:
+        return 1.0
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None):
+        if num_inference_steps <= 0:
+            raise ValueError("num_inference_steps must be > 0")
+        self.num_inference_steps = int(num_inference_steps)
+        last_step = self.config.last_step
+        if last_step is None:
+            last_step = 1.0 / float(self.num_inference_steps)
+        base_timesteps = torch.linspace(0.0, 1.0 - float(last_step), self.num_inference_steps, dtype=torch.float32)
+        base_timesteps = torch.cat([base_timesteps, torch.tensor([1.0], dtype=torch.float32)], dim=0)
+        timesteps = _shift_respace_fn(base_timesteps, shift=float(self.config.shift))
+        if device is not None:
+            timesteps = timesteps.to(device)
+        self.timesteps = timesteps
+        self._step_index = 0
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return sample
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[torch.Tensor, float],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+    ):
+        if self.num_inference_steps is None or self.timesteps.numel() == 0:
+            raise ValueError("Call set_timesteps before step")
+        step_index = min(self._step_index, len(self.timesteps) - 2)
+        dt = (self.timesteps[step_index + 1] - self.timesteps[step_index]).to(device=sample.device, dtype=sample.dtype)
+        prev_sample = sample + model_output * dt
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample,)
+        return SchedulerOutput(prev_sample=prev_sample)
+    def add_noise(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        if timesteps.ndim == 0:
+            timesteps = timesteps[None]
+        t = timesteps.to(device=original_samples.device, dtype=original_samples.dtype).view(-1, 1, 1, 1)
+        return t * original_samples + (1.0 - t) * noise

DeCo-XXL-16-512-t2i/scripts/run_t2i_demo.py ADDED Viewed

	@@ -0,0 +1,47 @@

+#!/usr/bin/env python3
+"""Full t2i demo: load local Qwen text encoder and save demo.png."""
+from __future__ import annotations
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
+MODEL_DIR = Path(__file__).resolve().parents[1]
+def main() -> None:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.bfloat16 if device == "cuda" else torch.float32
+    pipe = DiffusionPipeline.from_pretrained(
+        str(MODEL_DIR),
+        local_files_only=True,
+        custom_pipeline=str(MODEL_DIR / "pipeline.py"),
+        trust_remote_code=True,
+        torch_dtype=dtype,
+    )
+    print("text_encoder:", type(pipe.text_encoder).__name__)
+    pipe.to(device)
+    prompt = "a golden retriever playing in the snow, high quality photograph"
+    # Official DeCo uses CPU generator for reproducible noise (app.py / GenEval).
+    generator = torch.Generator(device="cpu").manual_seed(42)
+    print("generating...", prompt)
+    result = pipe(
+        prompt=prompt,
+        negative_prompt="Unrealistic, JPEG artifacts.",
+        num_inference_steps=25,
+        guidance_scale=4.0,
+        generator=generator,
+        output_type="pil",
+    )
+    image = result.images[0]
+    out_path = MODEL_DIR / "demo.png"
+    image.save(out_path)
+    print("saved", out_path, image.size)
+if __name__ == "__main__":
+    main()

DeCo-XXL-16-512-t2i/scripts/test_t2i_load.py ADDED Viewed

	@@ -0,0 +1,53 @@

+#!/usr/bin/env python3
+"""Smoke test: load converted DeCo-XXL-16-512-t2i and run one transformer forward pass with dummy text embeddings."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+import torch
+from diffusers import DiffusionPipeline
+MODEL_DIR = Path(__file__).resolve().parents[1]
+def main() -> None:
+    pipe = DiffusionPipeline.from_pretrained(
+        str(MODEL_DIR),
+        local_files_only=True,
+        custom_pipeline=str(MODEL_DIR / "pipeline.py"),
+        trust_remote_code=True,
+        torch_dtype=torch.float32,
+    )
+    assert pipe.decoder is not None and pipe.transformer is not None
+    batch_size = 1
+    seq_len = int(pipe.transformer.config.txt_max_length)
+    embed_dim = int(pipe.transformer.config.txt_embed_dim)
+    hidden = torch.randn(batch_size, seq_len, embed_dim)
+    torch.manual_seed(0)
+    with torch.inference_mode():
+        result = pipe.transformer(
+            torch.randn(batch_size, 3, 512, 512),
+            0.5,
+            encoder_hidden_states=hidden,
+            decoder=pipe.decoder,
+            return_dict=True,
+        )
+    out = result.sample
+    assert out.shape == (batch_size, 3, 512, 512)
+    print("transformer:", type(pipe.transformer).__name__)
+    print("decoder:", type(pipe.decoder).__name__)
+    print("output shape:", tuple(out.shape))
+    print("ok")
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as exc:
+        print(f"FAILED: {exc}", file=sys.stderr)
+        raise

DeCo-XXL-16-512-t2i/transformer/__pycache__/transformer_deco_t2i.cpython-312.pyc ADDED Viewed

Binary file (29.7 kB). View file

DeCo-XXL-16-512-t2i/transformer/config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "sample_size": 512,
+  "conditioning_type": "text",
+  "decoder_hidden_size": 32,
+  "deep_supervision": 0,
+  "hidden_size": 1536,
+  "hidden_size_x": 32,
+  "in_channels": 3,
+  "learn_sigma": true,
+  "nerf_mlpratio": 4,
+  "num_blocks": 19,
+  "num_classes": 0,
+  "num_cond_blocks": 16,
+  "num_decoder_blocks": 3,
+  "num_encoder_blocks": 16,
+  "num_groups": 24,
+  "num_text_blocks": 4,
+  "patch_size": 16,
+  "txt_embed_dim": 2048,
+  "txt_max_length": 128
+}

DeCo-XXL-16-512-t2i/transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2dacd4b933da1f5326035b1f63d21d127aa72f598d97eb07ca57b54c0dab9b08
+size 4484623152

DeCo-XXL-16-512-t2i/transformer/transformer_deco_t2i.py ADDED Viewed

	@@ -0,0 +1,411 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.functional import scaled_dot_product_attention
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+class RMSNorm(nn.Module):
+    """Match Zehong-Ma/DeCo `src.models.layers.rmsnorm.RMSNorm` (not diffusers variant)."""
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return (self.weight * hidden_states).to(input_dtype)
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    """Apply the affine modulation x * (1 + scale) + shift."""
+    return x * (1 + scale) + shift
+class PatchEmbed(nn.Module):
+    def __init__(self, in_chans: int, embed_dim: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.proj(x)
+class TimestepEmbedder(nn.Module):
+    """Sinusoidal timestep embedding with checkpoint-compatible `mlp` module names."""
+    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
+        super().__init__()
+        # Two-layer MLP (Linear -> SiLU -> Linear) mapping the sinusoidal features to hidden_size.
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 10) -> torch.Tensor:
+        """Return `[cos(t * f), sin(t * f)]` features of width `dim` for each entry of `t`.
+        Frequencies decay geometrically from 1 down towards 1 / max_period. When `dim`
+        is odd, one zero column is appended.
+        """
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
+        )
+        # Outer product of timesteps and frequencies, computed in float32.
+        args = t[..., None].float() * freqs[None, ...]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            # The 2-D slice below assumes `t` is 1-D — TODO confirm against callers.
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        # NOTE(review): the cast targets the dtype of `t`; an integer `t` would truncate
+        # the features — presumably callers pass floating-point timesteps.
+        return embedding.to(t.dtype)
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        """Embed timesteps `t` and project them through the MLP."""
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        return self.mlp(t_freq)
+class DeCoSwiGLU(nn.Module):
+    """SwiGLU MLP with w1/w2/w3 layout matching official DeCo checkpoints."""
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+def precompute_freqs_cis_2d(
+    dim: int,
+    height: int,
+    width: int,
+    theta: float = 10000.0,
+    scale: float = 1.0,
+) -> torch.Tensor:
+    """Official t2i uses `precompute_freqs_cis_ex2d` (aliased as precompute_freqs_cis_2d)."""
+    if isinstance(scale, float):
+        scale = (scale, scale)
+    x_pos = torch.linspace(0, height * scale[0], width)
+    y_pos = torch.linspace(0, width * scale[1], height)
+    y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij")
+    y_pos = y_pos.reshape(-1)
+    x_pos = x_pos.reshape(-1)
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    x_freqs = torch.outer(x_pos, freqs).float()
+    y_freqs = torch.outer(y_pos, freqs).float()
+    x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)
+    y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)
+    freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1)
+    return freqs_cis.reshape(height * width, -1)
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Rotate query and key features by the complex factors in `freqs_cis`.
+    Consecutive pairs along the last axis are treated as (real, imaginary) parts,
+    multiplied by `freqs_cis`, and flattened back. Outputs keep the input dtypes.
+    """
+    # Add two leading broadcast axes; assumes inputs are (batch, heads, tokens, head_dim)
+    # and freqs_cis is (tokens, head_dim // 2) — TODO confirm against callers.
+    freqs_cis = freqs_cis[None, None, :, :]
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+class DeCoT2ISwiGLU(nn.Module):
+    """Official DeCo-XXL t2i SwiGLU (w12/w3), distinct from c2i w1/w2/w3 layout."""
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        self.w12 = nn.Linear(dim, hidden_dim * 2, bias=False)
+        self.w3 = nn.Linear(hidden_dim, dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x1, x2 = self.w12(x).chunk(2, dim=-1)
+        return self.w3(F.silu(x1) * x2)
+def _modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    """Apply the affine modulation x * (1 + scale) + shift."""
+    # NOTE(review): this repeats, with an identical body, the `_modulate` defined earlier in this module.
+    return x * (1 + scale) + shift
+class TextEmbedder(nn.Module):
+    def __init__(self, in_channels: int, embed_dim: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(in_channels, embed_dim, bias=bias)
+        self.norm = RMSNorm(embed_dim, eps=1e-6)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.norm(self.proj(x))
+class CrossAttention(nn.Module):
+    """Joint attention: queries from `x` attend over keys/values from both `x` and `y`.
+    Queries and `x` keys are RMS-normalized per head and rotated with `pos`; `y` keys
+    are normalized with the same `k_norm` but not rotated.
+    """
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_drop: float = 0.0):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        # Fused q/k/v projection for x and fused k/v projection for y.
+        self.qkv_x = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.kv_y = nn.Linear(dim, dim * 2, bias=qkv_bias)
+        self.q_norm = RMSNorm(self.head_dim, eps=1e-6)
+        self.k_norm = RMSNorm(self.head_dim, eps=1e-6)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: torch.Tensor, y: torch.Tensor, pos: torch.Tensor) -> torch.Tensor:
+        """Return attended features with the same (batch, tokens, channels) shape as `x`."""
+        batch_size, num_tokens, channels = x.shape
+        # (B, N, 3*C) -> (3, B, heads, N, head_dim)
+        qkv_x = self.qkv_x(x).reshape(batch_size, num_tokens, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        query, key_x, value_x = qkv_x[0], qkv_x[1], qkv_x[2]
+        query = self.q_norm(query.contiguous())
+        key_x = self.k_norm(key_x.contiguous())
+        query, key_x = apply_rotary_emb(query, key_x, freqs_cis=pos)
+        # (B, M, 2*C) -> (2, B, heads, M, head_dim); M is inferred from y.
+        kv_y = self.kv_y(y).reshape(batch_size, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        key_y, value_y = kv_y[0], kv_y[1]
+        key_y = self.k_norm(key_y.contiguous())
+        # Concatenate along the token axis so x tokens and y tokens are attended jointly.
+        key = torch.cat([key_x, key_y], dim=2)
+        value = torch.cat([value_x, value_y], dim=2)
+        query = query.view(batch_size, self.num_heads, -1, self.head_dim)
+        key = key.view(batch_size, self.num_heads, -1, self.head_dim).contiguous()
+        value = value.view(batch_size, self.num_heads, -1, self.head_dim).contiguous()
+        out = scaled_dot_product_attention(query, key, value, dropout_p=0.0)
+        # (B, heads, N, head_dim) -> (B, N, C)
+        out = out.transpose(1, 2).reshape(batch_size, num_tokens, channels)
+        return self.proj_drop(self.proj(out))
+class TextRefineAttention(nn.Module):
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_drop: float = 0.0):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.q_norm = RMSNorm(self.head_dim, eps=1e-6)
+        self.k_norm = RMSNorm(self.head_dim, eps=1e-6)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, num_tokens, channels = x.shape
+        qkv = self.qkv(x).reshape(batch_size, num_tokens, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        query, key, value = qkv[0], qkv[1], qkv[2]
+        query = self.q_norm(query.contiguous())
+        key = self.k_norm(key.contiguous())
+        query = query.view(batch_size, self.num_heads, -1, self.head_dim)
+        key = key.view(batch_size, self.num_heads, -1, self.head_dim).contiguous()
+        value = value.view(batch_size, self.num_heads, -1, self.head_dim).contiguous()
+        out = scaled_dot_product_attention(query, key, value, dropout_p=0.0)
+        out = out.transpose(1, 2).reshape(batch_size, num_tokens, channels)
+        return self.proj_drop(self.proj(out))
+class T2IFlattenDiTBlock(nn.Module):
+    def __init__(self, hidden_size: int, groups: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.norm1 = RMSNorm(hidden_size, eps=1e-6)
+        self.attn = CrossAttention(hidden_size, num_heads=groups, qkv_bias=False)
+        self.norm2 = RMSNorm(hidden_size, eps=1e-6)
+        self.mlp = DeCoT2ISwiGLU(hidden_size, int(hidden_size * mlp_ratio))
+        self.adaLN_modulation = nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+    def forward(self, x: torch.Tensor, y: torch.Tensor, c: torch.Tensor, pos: torch.Tensor) -> torch.Tensor:
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
+        x = x + gate_msa * self.attn(_modulate(self.norm1(x), shift_msa, scale_msa), y, pos)
+        return x + gate_mlp * self.mlp(_modulate(self.norm2(x), shift_mlp, scale_mlp))
+class TextRefineBlock(nn.Module):
+    def __init__(self, hidden_size: int, groups: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.norm1 = RMSNorm(hidden_size, eps=1e-6)
+        self.attn = TextRefineAttention(hidden_size, num_heads=groups, qkv_bias=False)
+        self.norm2 = RMSNorm(hidden_size, eps=1e-6)
+        self.mlp = DeCoT2ISwiGLU(hidden_size, int(hidden_size * mlp_ratio))
+        self.adaLN_modulation = nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
+        x = x + gate_msa * self.attn(_modulate(self.norm1(x), shift_msa, scale_msa))
+        return x + gate_mlp * self.mlp(_modulate(self.norm2(x), shift_mlp, scale_mlp))
+@dataclass
+class DeCoT2ITransformer2DModelOutput(BaseOutput):
+    """Return container used by ``DeCoT2ITransformer2DModel.forward`` when ``return_dict`` is true."""
+    # Tensor returned by the backbone for the given sample, timestep and text inputs.
+    sample: torch.Tensor
+class _DeCoT2ITransformerBackbone(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        patch_size: int,
+        num_groups: int,
+        hidden_size: int,
+        num_encoder_blocks: int,
+        num_text_blocks: int,
+        txt_embed_dim: int,
+        txt_max_length: int,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.hidden_size = hidden_size
+        self.num_groups = num_groups
+        self.num_encoder_blocks = num_encoder_blocks
+        self.txt_max_length = txt_max_length
+        self.s_embedder = PatchEmbed(in_channels * patch_size**2, hidden_size, bias=True)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        self.y_embedder = TextEmbedder(txt_embed_dim, hidden_size, bias=True)
+        self.y_pos_embedding = nn.Parameter(torch.randn(1, txt_max_length, hidden_size))
+        self.blocks = nn.ModuleList(
+            [T2IFlattenDiTBlock(hidden_size, num_groups) for _ in range(num_encoder_blocks)]
+        )
+        self.text_refine_blocks = nn.ModuleList(
+            [TextRefineBlock(hidden_size, num_groups) for _ in range(num_text_blocks)]
+        )
+        self.precompute_pos: dict[tuple[int, int], torch.Tensor] = {}
+        self._init_weights()
+    def _init_weights(self) -> None:
+        weight = self.s_embedder.proj.weight.data
+        nn.init.xavier_uniform_(weight.view([weight.shape[0], -1]))
+        nn.init.constant_(self.s_embedder.proj.bias, 0)
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+    def fetch_pos(self, height: int, width: int, device: torch.device) -> torch.Tensor:
+        key = (height, width)
+        if key not in self.precompute_pos:
+            self.precompute_pos[key] = precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width)
+        return self.precompute_pos[key].to(device)
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        decoder: nn.Module,
+    ) -> torch.Tensor:
+        batch_size, _, height, width = x.shape
+        pos = self.fetch_pos(height // self.patch_size, width // self.patch_size, x.device)
+        x = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
+        t = self.t_embedder(t.view(-1)).view(batch_size, -1, self.hidden_size)
+        y = self.y_embedder(encoder_hidden_states) + self.y_pos_embedding.to(encoder_hidden_states.dtype)
+        condition = F.silu(t)
+        for block in self.text_refine_blocks:
+            y = block(y, condition)
+        s = self.s_embedder(x)
+        for block in self.blocks:
+            s = block(s, y, condition, pos)
+        s = F.silu(t + s)
+        batch_size, length, _ = s.shape
+        patch_pixels = x.reshape(batch_size * length, self.in_channels, self.patch_size**2).transpose(1, 2)
+        conditioning = s.view(batch_size * length, self.hidden_size)
+        decoded = decoder(patch_pixels, conditioning).sample
+        x = decoded.transpose(1, 2).reshape(batch_size, length, -1)
+        return F.fold(
+            x.transpose(1, 2).contiguous(),
+            (height, width),
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+class DeCoT2ITransformer2DModel(ModelMixin, ConfigMixin):
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        patch_size: int = 16,
+        num_groups: int = 24,
+        hidden_size: int = 1536,
+        hidden_size_x: int = 32,
+        num_blocks: int = 19,
+        num_encoder_blocks: int = 16,
+        num_decoder_blocks: int = 3,
+        num_text_blocks: int = 4,
+        num_cond_blocks: int = 16,
+        num_classes: int = 0,
+        learn_sigma: bool = True,
+        deep_supervision: int = 0,
+        sample_size: int = 512,
+        conditioning_type: str = "text",
+        nerf_mlpratio: int = 4,
+        decoder_hidden_size: int = 32,
+        txt_embed_dim: int = 2048,
+        txt_max_length: int = 128,
+    ):
+        super().__init__()
+        del hidden_size_x, nerf_mlpratio, num_blocks, num_cond_blocks, num_classes, learn_sigma, deep_supervision
+        if conditioning_type != "text":
+            raise ValueError("DeCoT2ITransformer2DModel only supports text conditioning (t2i).")
+        self.backbone = _DeCoT2ITransformerBackbone(
+            in_channels=in_channels,
+            patch_size=patch_size,
+            num_groups=num_groups,
+            hidden_size=hidden_size,
+            num_encoder_blocks=num_encoder_blocks,
+            txt_embed_dim=txt_embed_dim,
+            txt_max_length=txt_max_length,
+            num_text_blocks=num_text_blocks,
+        )
+    @property
+    def in_channels(self) -> int:
+        return int(self.config.in_channels)
+    def _prepare_timestep(
+        self, timestep: Union[torch.Tensor, float, int], batch_size: int, sample: torch.Tensor
+    ) -> torch.Tensor:
+        if not isinstance(timestep, torch.Tensor):
+            timestep = torch.tensor([timestep], device=sample.device, dtype=sample.dtype)
+        timestep = timestep.to(device=sample.device, dtype=sample.dtype)
+        if timestep.ndim == 0:
+            timestep = timestep[None]
+        if timestep.shape[0] == 1 and batch_size > 1:
+            timestep = timestep.repeat(batch_size)
+        return timestep
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        decoder: Optional[nn.Module] = None,
+        class_labels: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DeCoT2ITransformer2DModelOutput, tuple[torch.Tensor]]:
+        if class_labels is not None:
+            raise ValueError("class_labels are not supported; use encoder_hidden_states for t2i DeCo models.")
+        if encoder_hidden_states is None:
+            raise ValueError("encoder_hidden_states must be provided for text-conditioned DeCo models.")
+        if decoder is None:
+            raise ValueError("decoder must be provided; load DeCoPatchDecoderModel as a separate pipeline component.")
+        batch_size = sample.shape[0]
+        t = self._prepare_timestep(timestep=timestep, batch_size=batch_size, sample=sample)
+        output = self.backbone(sample, t, encoder_hidden_states, decoder=decoder)
+        if not return_dict:
+            return (output,)
+        return DeCoT2ITransformer2DModelOutput(sample=output)

README.md ADDED Viewed

	@@ -0,0 +1,109 @@

+---
+library_name: diffusers
+pipeline_tag: unconditional-image-generation
+tags:
+  - diffusers
+  - deco
+  - image-generation
+  - class-conditional
+  - imagenet
+license: mit
+inference: true
+widget:
+  - text: golden retriever
+    output:
+      url: DeCo-XL-16-512/demo.png
+language:
+  - en
+---
+# DeCo-diffusers
+Diffusers-ready checkpoints for **DeCo** (Decoupled Conditioning), converted for local/offline use.
+This root folder is a model collection that contains:
+- `DeCo-XL-16-256`
+- `DeCo-XL-16-512`
+- `DeCo-XXL-16-512-t2i` (text-to-image; requires `Qwen/Qwen3-1.7B` text encoder)
+Each subfolder is a self-contained Diffusers model repo with:
+- `pipeline.py`
+- `transformer/transformer_deco.py`
+- `scheduler/scheduling_deco_flow_match_euler_discrete.py`
+- `transformer/diffusion_pytorch_model.safetensors`
+- `decoder/decoder_deco.py`
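+The class-conditional variants (`DeCo-XL-16-256`, `DeCo-XL-16-512`) embed English `id2label` directly in `model_index.json` (DiT-style), so class labels can be passed as
+ImageNet ids or English synonym strings. `DeCo-XXL-16-512-t2i` is text-conditioned and does not use class labels.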
+- `pipe.id2label` — id → English label (comma-separated synonyms)
+- `pipe.get_label_ids("golden retriever")` — English label → id
+## Demo
+![DeCo-XL-16-512 demo](DeCo-XL-16-512/demo.png)
+Class-conditional sample (ImageNet class **207**, golden retriever), `DeCo-XL/16` at 512×512, 100 steps, CFG 5.0, seed 42.
+## Model Paths
+Use paths relative to this root README:
+| Model | Resolution | Source checkpoint | Local path |
+| --- | ---: | --- | --- |
+| DeCo-XL/16 | 256×256 | `imagenet256_epoch800.ckpt` (EMA) | `./DeCo-XL-16-256` |
+| DeCo-XL/16 | 512×512 | `imagenet512_epoch340.ckpt` (EMA) | `./DeCo-XL-16-512` |
+| DeCo-XXL/16 | 512×512 t2i | `t2i_DeCo.ckpt` (EMA) | `./DeCo-XXL-16-512-t2i` |
+## Inference Demo (Diffusers)
+### 1) Load a local subfolder checkpoint
+```python
+import torch
+from diffusers import DiffusionPipeline
+model_path = "./DeCo-XL-16-512"  # change to ./DeCo-XL-16-256 for 256px
+device = "cuda" if torch.cuda.is_available() else "cpu"
+pipe = DiffusionPipeline.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+).to(device)
+generator = torch.Generator(device=device).manual_seed(42)
+# ImageNet class example: 207 = golden retriever
+print(pipe.id2label[207])
+print(pipe.get_label_ids("golden retriever"))  # [207]
+result = pipe(
+    class_labels="golden retriever",
+    num_inference_steps=100,
+    guidance_scale=5.0,  # use 3.2 for DeCo-XL-16-256
+    generator=generator,
+)
+image = result.images[0]
+image.save("deco_xl_512_demo.png")
+```
+### 2) Quick variant switch (256 model)
+```python
+model_path = "./DeCo-XL-16-256"
+pipe = DiffusionPipeline.from_pretrained(model_path, trust_remote_code=True).to(device)
+image = pipe(
+    class_labels=207,
+    num_inference_steps=100,
+    guidance_scale=3.2,
+    generator=generator,
+).images[0]
+image.save("deco_xl_256_demo.png")
+```
+Integer class ids, batched labels, and optional `batch_size` for repeating a single label are also supported.

t2i_DeCo.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:433f55684b19c446d9fad4591f840fcdf9770a668c383ea910da86362651492f
+size 4558758567