BiliSakura committed on
Commit
4968e7f
·
verified ·
1 Parent(s): bd7a133

Upload folder using huggingface_hub

Browse files
Files changed (40) hide show
  1. .gitignore +1 -0
  2. PixelFlow-256/__pycache__/pipeline.cpython-312.pyc +0 -0
  3. PixelFlow-256/demo.png +0 -0
  4. PixelFlow-256/model_index.json +12 -0
  5. PixelFlow-256/pipeline.py +489 -0
  6. PixelFlow-256/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc +0 -0
  7. PixelFlow-256/scheduler/scheduler_config.json +7 -0
  8. PixelFlow-256/scheduler/scheduling_pixelflow.py +135 -0
  9. PixelFlow-256/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc +0 -0
  10. PixelFlow-256/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc +0 -0
  11. PixelFlow-256/transformer/config.json +16 -0
  12. PixelFlow-256/transformer/diffusion_pytorch_model.safetensors +3 -0
  13. PixelFlow-256/transformer/modeling_pixelflow.py +448 -0
  14. PixelFlow-256/transformer/transformer_pixelflow.py +85 -0
  15. PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc +0 -0
  16. PixelFlow-T2I/model_index.json +20 -0
  17. PixelFlow-T2I/pipeline.py +405 -0
  18. PixelFlow-T2I/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc +0 -0
  19. PixelFlow-T2I/scheduler/scheduler_config.json +7 -0
  20. PixelFlow-T2I/scheduler/scheduling_pixelflow.py +135 -0
  21. PixelFlow-T2I/text_encoder/config.json +58 -0
  22. PixelFlow-T2I/text_encoder/generation_config.json +7 -0
  23. PixelFlow-T2I/text_encoder/model-00001-of-00002.safetensors +3 -0
  24. PixelFlow-T2I/text_encoder/model-00002-of-00002.safetensors +3 -0
  25. PixelFlow-T2I/text_encoder/model.safetensors.index.json +567 -0
  26. PixelFlow-T2I/tokenizer/special_tokens_map.json +107 -0
  27. PixelFlow-T2I/tokenizer/spiece.model +3 -0
  28. PixelFlow-T2I/tokenizer/tokenizer.json +0 -0
  29. PixelFlow-T2I/tokenizer/tokenizer_config.json +113 -0
  30. PixelFlow-T2I/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc +0 -0
  31. PixelFlow-T2I/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc +0 -0
  32. PixelFlow-T2I/transformer/config.json +16 -0
  33. PixelFlow-T2I/transformer/diffusion_pytorch_model.safetensors +3 -0
  34. PixelFlow-T2I/transformer/modeling_pixelflow.py +448 -0
  35. PixelFlow-T2I/transformer/transformer_pixelflow.py +85 -0
  36. README.md +110 -0
  37. labels/__pycache__/imagenet_labels.cpython-312.pyc +0 -0
  38. labels/id2label_cn.json +1002 -0
  39. labels/id2label_en.json +1002 -0
  40. labels/imagenet_labels.py +61 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
PixelFlow-256/__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (14 kB). View file
 
PixelFlow-256/demo.png ADDED
PixelFlow-256/model_index.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PixelFlowPipeline",
3
+ "_diffusers_version": "0.36.0",
4
+ "scheduler": [
5
+ "scheduling_pixelflow",
6
+ "PixelFlowScheduler"
7
+ ],
8
+ "transformer": [
9
+ "transformer_pixelflow",
10
+ "PixelFlowTransformer2DModel"
11
+ ]
12
+ }
PixelFlow-256/pipeline.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hub custom pipeline: PixelFlowPipeline.
2
+
3
+ Load with native Hugging Face diffusers and `trust_remote_code=True`.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import importlib
9
+ import math
10
+ import sys
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import List, Optional, Tuple, Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from einops import rearrange
19
+
20
+ from diffusers.image_processor import VaeImageProcessor
21
+ from diffusers.models.embeddings import get_2d_rotary_pos_embed
22
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
23
+ from diffusers.utils import BaseOutput
24
+ from diffusers.utils.torch_utils import randn_tensor
25
+
26
+
27
@dataclass
class PixelFlowPipelineOutput(BaseOutput):
    """Container returned by `PixelFlowPipeline.__call__` when `return_dict=True`."""

    # Generated images. `__call__` stores a tensor clamped to [0, 1] for
    # `output_type="pt"`, or whatever `VaeImageProcessor.postprocess` returns
    # for `output_type="pil"` / `"np"`.
    images: Union[torch.Tensor, List, np.ndarray]
30
+
31
+
32
class PixelFlowPipeline(DiffusionPipeline):
    """Pipeline for PixelFlow pixel-space flow generation (class-conditional or text-to-image).

    The checkpoint is treated as class-conditional when the transformer config
    reports `num_classes > 0`; otherwise a T5 text encoder and tokenizer are
    required and generation is driven by `prompt`.
    """

    model_cpu_offload_seq = "text_encoder->transformer"
    _optional_components = ["text_encoder", "tokenizer"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
        """Load a self-contained variant folder locally or from the Hub.

        Args:
            pretrained_model_name_or_path: `None`, `""` or `"."` load from the
                folder containing this file. A string containing `/` that is not
                an existing path is treated as a Hub repo id and downloaded with
                `snapshot_download`. Anything else is a local path, resolved
                against the current directory first, then this file's folder.
            subfolder: Optional variant folder inside the repo or local path.
            **kwargs: Forwarded to every component's `from_pretrained`. The
                optional `hub_kwargs` dict is forwarded to `snapshot_download`
                only.

        Returns:
            A constructed pipeline instance.

        Raises:
            ValueError: If no loadable transformer is found in the variant folder.
        """
        repo_root = Path(__file__).resolve().parent
        # Pop unconditionally so Hub-only options never leak into the component
        # loaders when a local path is used.
        hub_kwargs = dict(kwargs.pop("hub_kwargs", None) or {})

        if pretrained_model_name_or_path in (None, "", "."):
            variant = repo_root
        elif (
            isinstance(pretrained_model_name_or_path, str)
            and "/" in pretrained_model_name_or_path
            and not Path(pretrained_model_name_or_path).exists()
        ):
            from huggingface_hub import snapshot_download

            if subfolder:
                # The label files live in a sibling `labels/` folder, so they
                # must be fetched together with the variant subfolder.
                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**", "labels/**"])
            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
        else:
            variant = Path(pretrained_model_name_or_path)
            if not variant.is_absolute():
                candidate = (Path.cwd() / variant).resolve()
                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
            if subfolder:
                variant = variant / subfolder

        model_kwargs = dict(kwargs)
        inserted: List[str] = []

        def _ensure_on_path(directory: Path) -> None:
            # Temporarily expose a component folder to the import system; every
            # entry recorded here is removed again in the `finally` below.
            entry = str(directory)
            if entry not in sys.path:
                sys.path.insert(0, entry)
                inserted.append(entry)

        def _load_component(folder: str, module_name: str, class_name: str):
            # A component is loadable only if both its module and a config file exist.
            comp_dir = variant / folder
            module_path = comp_dir / f"{module_name}.py"
            has_config = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
            if not module_path.exists() or not has_config:
                return None

            _ensure_on_path(comp_dir)
            module = importlib.import_module(module_name)
            component_cls = getattr(module, class_name)
            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)

        def _load_text_components():
            # Text components are optional: class-conditional variants ship none.
            te_dir = variant / "text_encoder"
            tok_dir = variant / "tokenizer"
            if not (te_dir.exists() and (te_dir / "config.json").exists()):
                return None, None

            from transformers import T5EncoderModel, T5Tokenizer

            text_encoder = T5EncoderModel.from_pretrained(str(te_dir), **model_kwargs)
            tokenizer = T5Tokenizer.from_pretrained(str(tok_dir))
            return text_encoder, tokenizer

        try:
            transformer = _load_component("transformer", "transformer_pixelflow", "PixelFlowTransformer2DModel")
            scheduler = _load_component("scheduler", "scheduling_pixelflow", "PixelFlowScheduler")
            text_encoder, tokenizer = _load_text_components()

            if scheduler is None:
                # No scheduler config: fall back to the scheduler's defaults.
                sched_dir = variant / "scheduler"
                if (sched_dir / "scheduling_pixelflow.py").exists():
                    _ensure_on_path(sched_dir)
                    scheduler = importlib.import_module("scheduling_pixelflow").PixelFlowScheduler()

            if transformer is None:
                raise ValueError(f"No loadable transformer found under {variant}")

            id2label = None
            id2label_cn = None
            labels_dir = variant.parent / "labels"
            if labels_dir.is_dir() and (labels_dir / "imagenet_labels.py").exists():
                _ensure_on_path(labels_dir)
                from imagenet_labels import load_id2label

                id2label = load_id2label(labels_dir, lang="en")
                id2label_cn = load_id2label(labels_dir, lang="cn")

            return cls(
                transformer=transformer,
                scheduler=scheduler,
                text_encoder=text_encoder,
                tokenizer=tokenizer,
                id2label=id2label,
                id2label_cn=id2label_cn,
            )
        finally:
            for comp_path in inserted:
                if comp_path in sys.path:
                    sys.path.remove(comp_path)

    def __init__(
        self,
        transformer,
        scheduler,
        text_encoder=None,
        tokenizer=None,
        max_token_length: int = 512,
        id2label: Optional[dict[int, str]] = None,
        id2label_cn: Optional[dict[int, str]] = None,
    ):
        """Register the components and build the label lookup tables.

        Args:
            transformer: Denoising transformer; its config decides class vs. text mode.
            scheduler: Multi-stage scheduler exposing `num_stages` and `gamma`.
            text_encoder: Optional text encoder (text-to-image only).
            tokenizer: Optional tokenizer (text-to-image only).
            max_token_length: Default tokenizer padding/truncation length.
            id2label: Optional class id -> comma-separated English synonyms.
            id2label_cn: Optional class id -> comma-separated Chinese synonyms.
        """
        super().__init__()
        self.register_modules(
            transformer=transformer,
            scheduler=scheduler,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
        # Tolerate a missing or null `num_classes` instead of failing on `None > 0`.
        num_classes = getattr(transformer.config, "num_classes", 0) or 0
        self.class_cond = num_classes > 0
        self.max_token_length = max_token_length
        # Defined up front so `do_classifier_free_guidance` is readable before
        # the first `__call__`, which overwrites it.
        self._guidance_scale_value = 0.0

        self._id2label = id2label or {}
        self._id2label_cn = id2label_cn or {}
        self.labels = self._build_label2id(self._id2label)
        self.labels_cn = self._build_label2id(self._id2label_cn)

    @staticmethod
    def _build_label2id(id2label: dict[int, str]) -> dict[str, int]:
        """Invert `id2label`, giving every comma-separated synonym its own key.

        If the same synonym appears under several ids, the last one iterated wins.
        The result is sorted by synonym.
        """
        label2id: dict[str, int] = {}
        for class_id, value in id2label.items():
            for synonym in value.split(","):
                synonym = synonym.strip()
                if synonym:
                    label2id[synonym] = int(class_id)
        return dict(sorted(label2id.items()))

    @property
    def id2label(self) -> dict[int, str]:
        """ImageNet class id to English label string (comma-separated synonyms)."""
        return self._id2label

    @property
    def id2label_cn(self) -> dict[int, str]:
        """ImageNet class id to Chinese label string (comma-separated synonyms)."""
        return self._id2label_cn

    def get_label_ids(self, label: Union[str, List[str]], lang: str = "en") -> List[int]:
        r"""
        Map ImageNet label strings to class ids.

        Args:
            label (`str` or `list[str]`):
                One or more label strings. Each string must match a synonym in `id2label` (English)
                or `id2label_cn` (Chinese).
            lang (`str`, *optional*, defaults to `"en"`):
                `"en"` uses English synonyms; `"cn"` uses Chinese synonyms.

        Returns:
            `list[int]`: Class ids for [`~PixelFlowPipeline.__call__`].

        Raises:
            ValueError: If `lang` is unsupported, no labels are loaded for it, or a label is unknown.
        """
        if lang not in ("en", "cn"):
            raise ValueError(f"`lang` must be 'en' or 'cn', got {lang!r}.")

        label2id = self.labels if lang == "en" else self.labels_cn
        if not label2id:
            raise ValueError(
                f"No {lang} labels loaded. Ensure `labels/id2label_{lang}.json` exists next to the variant folder."
            )

        if isinstance(label, str):
            label = [label]

        missing = [item for item in label if item not in label2id]
        if missing:
            preview = ", ".join(list(label2id.keys())[:8])
            raise ValueError(
                f"Unknown label(s) for lang={lang!r}: {missing}. Example valid labels: {preview}, ..."
            )
        return [label2id[item] for item in label]

    def _resolve_label_strings(self, labels: List[str]) -> List[int]:
        """Resolve label strings to ids, trying English first and then Chinese."""
        if not self.labels and not self.labels_cn:
            # Nothing loaded at all: let `get_label_ids` raise its explanatory error.
            return self.get_label_ids(labels, lang="en")
        if all(label in self.labels for label in labels):
            return self.get_label_ids(labels, lang="en")
        if all(label in self.labels_cn for label in labels):
            return self.get_label_ids(labels, lang="cn")
        raise ValueError(
            "Could not resolve string `class_labels`. Use English synonyms from `pipe.labels` "
            "or Chinese synonyms from `pipe.labels_cn`."
        )

    def _normalize_class_labels(
        self,
        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
    ) -> Optional[Union[int, List[int], torch.Tensor]]:
        """Convert string class labels to integer ids; pass everything else through.

        A single string and a list of strings use the same English-then-Chinese
        lookup. Lists that mix ints and strings are resolved element by element.
        """
        if class_labels is None:
            return None

        if isinstance(class_labels, str):
            return self._resolve_label_strings([class_labels])[0]

        if isinstance(class_labels, (list, tuple)) and any(isinstance(item, str) for item in class_labels):
            if all(isinstance(item, str) for item in class_labels):
                return self._resolve_label_strings(list(class_labels))
            return [
                self._resolve_label_strings([item])[0] if isinstance(item, str) else int(item)
                for item in class_labels
            ]

        return class_labels

    def sample_block_noise(self, bs, ch, height, width, eps=1e-6):
        """Sample noise that is correlated inside every 2x2 pixel block.

        Each block is one draw from a 4-D zero-mean Gaussian with covariance
        `(1 - gamma) * I + gamma * 1 1^T + eps * I`, where `gamma` comes from the
        scheduler and `eps` keeps the matrix positive definite.

        Args:
            bs: Batch size.
            ch: Number of channels.
            height: Output height; `height // 2` blocks are produced per column.
            width: Output width; `width // 2` blocks are produced per row.
            eps: Diagonal jitter added to the covariance.

        Returns:
            A CPU float tensor of shape `(bs, ch, 2 * (height // 2), 2 * (width // 2))`.

        NOTE: samples come from the global torch RNG; the `generator` passed to
        `__call__` is not used here.
        """
        gamma = self.scheduler.gamma
        identity = torch.eye(4)
        covariance = identity * (1 - gamma) + torch.ones(4, 4) * gamma + eps * identity
        dist = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(4), covariance)

        block_number = bs * ch * (height // 2) * (width // 2)
        # One vectorised draw instead of `block_number` Python-level draws.
        noise = dist.sample((block_number,))
        return rearrange(
            noise,
            "(b c h w) (p q) -> b c (h p) (w q)",
            b=bs,
            c=ch,
            h=height // 2,
            w=width // 2,
            p=2,
            q=2,
        )

    def _stage_guidance_scale(self, stage_idx: int) -> float:
        """Return the guidance scale to apply at `stage_idx`.

        Text-to-image checkpoints use the raw scale at every stage.
        Class-conditional checkpoints interpolate between 1 and the requested
        scale with a fixed per-stage weight; the table only covers stages 0-3.
        """
        if not self.class_cond:
            return self._guidance_scale_value
        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}
        return (self._guidance_scale_value - 1) * scale_dict[stage_idx] + 1

    @property
    def do_classifier_free_guidance(self) -> bool:
        """Whether the last requested guidance scale enables classifier-free guidance."""
        return self._guidance_scale_value > 0

    def _encode_text(
        self,
        texts: List[str],
        max_length: int,
        device: torch.device,
        num_images_per_prompt: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Tokenize and encode `texts`, repeating every row `num_images_per_prompt` times."""
        inputs = self.tokenizer(
            texts,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        embeds = self.text_encoder(input_ids, attention_mask=attention_mask)[0]
        embeds = embeds.to(dtype=self.text_encoder.dtype, device=device)

        # Embeddings and masks must be expanded the same way (p0, p0, ..., p1, p1, ...)
        # so that every mask row stays paired with its own prompt.
        embeds = embeds.repeat_interleave(num_images_per_prompt, dim=0)
        attention_mask = attention_mask.repeat_interleave(num_images_per_prompt, dim=0)
        return embeds, attention_mask

    @torch.no_grad()
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        device: torch.device,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Union[str, List[str]] = "",
        max_length: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode prompts (and optionally negative prompts) with the text encoder.

        Args:
            prompt: One prompt or a list of prompts.
            device: Device for token ids and the returned tensors.
            num_images_per_prompt: How many times each prompt row is repeated.
            do_classifier_free_guidance: When true, negative-prompt rows are
                concatenated in front of the prompt rows.
            negative_prompt: One string applied to all prompts, or one per prompt.
            max_length: Token length; defaults to `self.max_token_length`.

        Returns:
            `(prompt_embeds, prompt_attention_mask)`. With guidance enabled the
            batch dimension is doubled, negative rows first.

        Raises:
            ValueError: If text components are missing or `negative_prompt` is malformed.
        """
        if self.text_encoder is None or self.tokenizer is None:
            raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")

        if isinstance(prompt, str):
            prompt = [prompt]
        batch_size = len(prompt)
        max_length = max_length or self.max_token_length

        prompt_embeds, prompt_attention_mask = self._encode_text(
            prompt, max_length, device, num_images_per_prompt
        )

        if do_classifier_free_guidance:
            if isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif isinstance(negative_prompt, list):
                if len(negative_prompt) != batch_size:
                    raise ValueError(
                        f"Negative prompt list length ({len(negative_prompt)}) must match prompt batch ({batch_size})."
                    )
                uncond_tokens = negative_prompt
            else:
                raise ValueError("Negative prompt must be a string or list of strings.")

            negative_prompt_embeds, negative_prompt_attention_mask = self._encode_text(
                uncond_tokens, prompt_embeds.shape[1], device, num_images_per_prompt
            )
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

        return prompt_embeds, prompt_attention_mask

    @torch.no_grad()
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: Union[int, List[int]] = 10,
        guidance_scale: float = 4.0,
        shift: float = 1.0,
        negative_prompt: Union[str, List[str]] = "",
        num_images_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        output_type: str = "pil",
        return_dict: bool = True,
    ) -> Union[PixelFlowPipelineOutput, Tuple]:
        """Generate images with the multi-stage PixelFlow sampler.

        Args:
            prompt: Text prompt(s); required for text-to-image checkpoints.
            class_labels: Class id(s) or label string(s); required for
                class-conditional checkpoints.
            height: Output height; defaults to the transformer's `sample_size`.
            width: Output width; defaults to the transformer's `sample_size`.
            num_inference_steps: Steps per stage, as one int for all stages or a
                list with one entry per scheduler stage.
            guidance_scale: Classifier-free guidance scale; guidance is applied
                whenever it is greater than 0.
            shift: Timestep shift forwarded to `scheduler.set_timesteps`.
            negative_prompt: Negative prompt(s) for text-to-image guidance.
            num_images_per_prompt: Images generated per prompt or class label.
            generator: RNG for the initial noise.
            output_type: `"pil"`, `"np"` or `"pt"`.
            return_dict: Return a `PixelFlowPipelineOutput` instead of a tuple.

        Returns:
            `PixelFlowPipelineOutput`, or `(images,)` when `return_dict=False`.

        Raises:
            ValueError: On missing conditioning, an unsupported `output_type`, or
                a `num_inference_steps` list whose length differs from the stage count.
        """
        # Validate cheap arguments before any expensive sampling work.
        if output_type not in ("pt", "pil", "np"):
            raise ValueError(f"Unsupported output_type: {output_type}")

        if height is None:
            height = int(self.transformer.config.sample_size)
        if width is None:
            width = int(self.transformer.config.sample_size)

        device = self._execution_device
        self._guidance_scale_value = guidance_scale
        # Single source of truth: conditioning and latents are doubled by the same flag.
        do_cfg = self.do_classifier_free_guidance

        num_stages = self.scheduler.num_stages
        if isinstance(num_inference_steps, int):
            num_inference_steps = [num_inference_steps] * num_stages
        elif len(num_inference_steps) != num_stages:
            raise ValueError(
                f"`num_inference_steps` must be an int or a list of length {num_stages}, "
                f"got a list of length {len(num_inference_steps)}."
            )

        prompt_attention_mask = None
        if self.class_cond:
            if class_labels is None:
                raise ValueError("`class_labels` are required for class-conditional PixelFlow checkpoints.")
            class_labels = self._normalize_class_labels(class_labels)
            if isinstance(class_labels, int):
                class_labels = [class_labels]
            if torch.is_tensor(class_labels):
                class_labels = class_labels.to(device=device, dtype=torch.long)
            else:
                class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
            class_labels = class_labels.reshape(-1)

            batch_size = class_labels.shape[0]
            # Expand labels so they line up with the `batch_size * num_images_per_prompt` latents.
            prompt_embeds = class_labels.repeat_interleave(num_images_per_prompt, dim=0)
            if do_cfg:
                # The index `num_classes` is used as the unconditional label.
                negative_prompt_embeds = torch.full_like(prompt_embeds, self.transformer.config.num_classes)
                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        else:
            if prompt is None:
                raise ValueError("`prompt` is required for text-to-image PixelFlow checkpoints.")
            if isinstance(prompt, str):
                prompt = [prompt]
            batch_size = len(prompt)
            prompt_embeds, prompt_attention_mask = self.encode_prompt(
                prompt,
                device,
                num_images_per_prompt=num_images_per_prompt,
                do_classifier_free_guidance=do_cfg,
                negative_prompt=negative_prompt,
            )

        # Stage 0 runs at the lowest resolution; every later stage doubles it.
        init_factor = 2 ** (num_stages - 1)
        height, width = height // init_factor, width // init_factor
        latents = randn_tensor(
            (batch_size * num_images_per_prompt, 3, height, width),
            generator=generator,
            device=device,
            dtype=torch.float32,
        )

        autocast_enabled = device.type == "cuda"
        autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32

        for stage_idx in range(num_stages):
            self.scheduler.set_timesteps(num_inference_steps[stage_idx], stage_idx, device=device, shift=shift)
            timesteps = self.scheduler.Timesteps

            if stage_idx > 0:
                # Upsample, then re-noise with block-correlated noise so the
                # sample matches the start of the new stage.
                height, width = height * 2, width * 2
                latents = F.interpolate(latents, size=(height, width), mode="nearest")
                original_start_t = self.scheduler.original_start_t[stage_idx]
                gamma = self.scheduler.gamma
                alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
                beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)

                noise = self.sample_block_noise(*latents.shape)
                noise = noise.to(device=device, dtype=latents.dtype)
                latents = alpha * latents + beta * noise

            # NOTE: the patch grid is derived from the width only (square images assumed).
            grid = latents.shape[-1] // self.transformer.patch_size
            size_tensor = torch.tensor([grid], dtype=torch.int32, device=device)
            pos_embed = get_2d_rotary_pos_embed(
                embed_dim=self.transformer.attention_head_dim,
                crops_coords=((0, 0), (grid, grid)),
                grid_size=(grid, grid),
                device=device,
                output_type="pt",
            )
            rope_pos = torch.stack(pos_embed, -1)
            stage_scale = self._stage_guidance_scale(stage_idx)

            for timestep in timesteps:
                latent_model_input = torch.cat([latents] * 2) if do_cfg else latents
                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
                    if self.class_cond:
                        noise_pred = self.transformer(
                            latent_model_input,
                            timestep=timestep_batch,
                            class_labels=prompt_embeds,
                            latent_size=size_tensor,
                            pos_embed=rope_pos,
                        ).sample
                    else:
                        noise_pred = self.transformer(
                            latent_model_input,
                            encoder_hidden_states=prompt_embeds,
                            encoder_attention_mask=prompt_attention_mask,
                            timestep=timestep_batch,
                            latent_size=size_tensor,
                            pos_embed=rope_pos,
                        ).sample

                if do_cfg:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + stage_scale * (noise_pred_text - noise_pred_uncond)

                latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample

        # Map from [-1, 1] to [0, 1].
        image = (latents / 2 + 0.5).clamp(0, 1)
        if output_type in ("pil", "np"):
            image = self.image_processor.postprocess(image, output_type=output_type)

        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return PixelFlowPipelineOutput(images=image)
PixelFlow-256/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc ADDED
Binary file (7.76 kB). View file
 
PixelFlow-256/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PixelFlowScheduler",
3
+ "_diffusers_version": "0.36.0",
4
+ "gamma": -0.3333333333333333,
5
+ "num_stages": 4,
6
+ "num_train_timesteps": 1000
7
+ }
PixelFlow-256/scheduler/scheduling_pixelflow.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
10
+ from diffusers.utils import BaseOutput
11
+
12
+
13
def cal_rectify_ratio(start_t, gamma):
    """Return the factor used to rescale a stage's start time.

    Computes `1 / (sqrt(1 - 1/gamma) * (1 - start_t) + start_t)`.
    """
    noise_gain = math.sqrt(1 - (1 / gamma))
    denominator = noise_gain * (1 - start_t) + start_t
    return 1 / denominator
15
+
16
+
17
@dataclass
class PixelFlowSchedulerOutput(BaseOutput):
    """Return value of `PixelFlowScheduler.step` when `return_dict=True`."""

    # Sample after one update `sample + (t_next - t) * model_output`,
    # cast to the dtype of `model_output`.
    prev_sample: torch.FloatTensor
20
+
21
+
22
class PixelFlowScheduler(SchedulerMixin, ConfigMixin):
    """Cascade flow scheduler for PixelFlow multi-stage pixel-space generation.

    The constructor precomputes, for each stage, the flow-time window and the
    range of model timesteps assigned to it. `set_timesteps` then selects one
    stage and `step` advances a sample by one explicit Euler update.
    """

    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_stages: int = 4,
        gamma: float = -1 / 3,
    ):
        assert num_stages > 0, f"num_stages must be positive, got {num_stages}"
        self.num_stages = num_stages
        self.gamma = gamma

        self.Timesteps = torch.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=torch.float32)
        self.t = self.Timesteps / num_train_timesteps
        # Stage boundaries as fractions of the whole trajectory.
        self.stage_range = [x / num_stages for x in range(num_stages + 1)]

        self.original_start_t = {}
        self.start_t, self.end_t = {}, {}
        self.t_window_per_stage = {}
        self.Timesteps_per_stage = {}
        stage_distance = []

        # Pass 1: start/end flow time of every stage. Stages after the first
        # have their start time rescaled by `cal_rectify_ratio`.
        for stage_idx in range(num_stages):
            lower_idx = max(int(num_train_timesteps * self.stage_range[stage_idx]), 0)
            upper_idx = min(int(num_train_timesteps * self.stage_range[stage_idx + 1]), num_train_timesteps)

            stage_start = self.t[lower_idx].item()
            if upper_idx < num_train_timesteps:
                stage_end = self.t[upper_idx].item()
            else:
                stage_end = 1.0

            self.original_start_t[stage_idx] = stage_start

            if stage_idx > 0:
                stage_start *= cal_rectify_ratio(stage_start, gamma)

            self.start_t[stage_idx] = stage_start
            self.end_t[stage_idx] = stage_end
            stage_distance.append(stage_end - stage_start)

        total_stage_distance = sum(stage_distance)
        t_within_stage = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float64)[:-1]

        # Pass 2: give every stage a share of the model-timestep axis
        # proportional to the flow-time distance it covers.
        last_stage = num_stages - 1
        for stage_idx in range(num_stages):
            if stage_idx == 0:
                start_ratio = 0.0
            else:
                start_ratio = sum(stage_distance[:stage_idx]) / total_stage_distance
            if stage_idx == last_stage:
                end_ratio = 1.0
            else:
                end_ratio = sum(stage_distance[:stage_idx + 1]) / total_stage_distance

            Timestep_start = self.Timesteps[int(num_train_timesteps * start_ratio)]
            Timestep_end = self.Timesteps[min(int(num_train_timesteps * end_ratio), num_train_timesteps - 1)]

            self.t_window_per_stage[stage_idx] = t_within_stage

            if stage_idx == last_stage:
                # The final stage includes its end point.
                stage_timesteps = torch.linspace(
                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps, dtype=torch.float64
                )
            else:
                # Earlier stages leave their end point out.
                stage_timesteps = torch.linspace(
                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps + 1, dtype=torch.float64
                )[:-1]
            self.Timesteps_per_stage[stage_idx] = stage_timesteps

        self._step_index = None
        self.Timesteps = None

    @staticmethod
    def time_linear_to_Timesteps(t, t_start, t_end, T_start, T_end):
        """Map `t` linearly so that `t_start -> T_start` and `t_end -> T_end`."""
        k = (T_end - T_start) / (t_end - t_start)
        b = T_start - t_start * k
        return k * t + b

    def set_timesteps(self, num_inference_steps, stage_index, device=None, shift=1.0):
        """Prepare `self.Timesteps` and `self.t` for one stage.

        Args:
            num_inference_steps: Number of steps to run in this stage.
            stage_index: Stage whose precomputed ranges are used.
            device: Device for the produced tensors.
            shift: Time-shift factor applied as `t / (shift + (1 - shift) * t)`.
        """
        self.num_inference_steps = num_inference_steps
        self._step_index = None

        stage_timesteps = self.Timesteps_per_stage[stage_index]
        stage_T_start = stage_timesteps[0].item()
        stage_T_end = stage_timesteps[-1].item()

        stage_window = self.t_window_per_stage[stage_index]
        t_start = stage_window[0].item()
        t_end = stage_window[-1].item()

        t = np.linspace(t_start, t_end, num_inference_steps, dtype=np.float64)
        t = t / (shift + (1 - shift) * t)

        Timesteps = self.time_linear_to_Timesteps(t, t_start, t_end, stage_T_start, stage_T_end)
        self.Timesteps = torch.from_numpy(Timesteps).to(device=device)

        # A trailing 1.0 gives the last step a target time.
        self.t = torch.from_numpy(np.append(t, 1.0)).to(device=device, dtype=torch.float64)

    def step(
        self,
        model_output: torch.Tensor,
        sample: torch.Tensor,
        return_dict: bool = True,
    ) -> Union[PixelFlowSchedulerOutput, SchedulerOutput, Tuple[torch.Tensor, ...]]:
        """Advance `sample` by one Euler step along `model_output`.

        The update is computed in float32 and the result is cast to
        `model_output.dtype`. The internal step counter is incremented.

        Returns:
            `PixelFlowSchedulerOutput`, or a 1-tuple when `return_dict=False`.
        """
        if self._step_index is None:
            self._step_index = 0

        sample = sample.to(torch.float32)
        t = self.t[self._step_index].float()
        t_next = self.t[self._step_index + 1].float()

        prev_sample = sample + (t_next - t) * model_output
        self._step_index += 1

        prev_sample = prev_sample.to(model_output.dtype)
        if return_dict:
            return PixelFlowSchedulerOutput(prev_sample=prev_sample)
        return (prev_sample,)

    @property
    def step_index(self) -> Optional[int]:
        """Index of the next step in the current stage, or `None` before the first `step`."""
        return self._step_index
PixelFlow-256/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc ADDED
Binary file (24.1 kB). View file
 
PixelFlow-256/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc ADDED
Binary file (3.84 kB). View file
 
PixelFlow-256/transformer/config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PixelFlowTransformer2DModel",
3
+ "_diffusers_version": "0.36.0",
4
+ "attention_bias": true,
5
+ "attention_head_dim": 72,
6
+ "cross_attention_dim": null,
7
+ "depth": 28,
8
+ "dropout": 0.0,
9
+ "in_channels": 3,
10
+ "init_weights": false,
11
+ "num_attention_heads": 16,
12
+ "num_classes": 1000,
13
+ "out_channels": 3,
14
+ "patch_size": 4,
15
+ "sample_size": 256
16
+ }
PixelFlow-256/transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6524ee52cec42041ff72b19d8b606f1c6196cbd9c623202d57af908280b3703
3
+ size 2706502480
PixelFlow-256/transformer/modeling_pixelflow.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import warnings
7
+
8
+ from diffusers.models.embeddings import LabelEmbedding, TimestepEmbedding, Timesteps
9
+
10
+ try:
11
+ from flash_attn import flash_attn_varlen_func
12
+ except ImportError:
13
+ warnings.warn("`flash-attn` is not installed. Training mode may not work properly.", UserWarning)
14
+ flash_attn_varlen_func = None
15
+
16
+
17
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
) -> torch.Tensor:
    """Apply rotary position embeddings to `x`.

    Args:
        x: 4-D tensor whose last dimension is even; adjacent pairs along that
            dimension are treated as (real, imaginary) components.
        freqs_cis: Either a tensor whose last dimension stacks `(cos, sin)`, or
            a `(cos, sin)` tuple. Both tables get two leading singleton
            dimensions and are then broadcast against `x`.

    Returns:
        The rotated tensor, with the same shape and dtype as `x`.
    """
    if isinstance(freqs_cis, (tuple, list)):
        cos, sin = freqs_cis
    else:
        cos, sin = freqs_cis.unbind(-1)
    cos = cos[None, None].to(x.device)
    sin = sin[None, None].to(x.device)

    # Rotate each (real, imag) pair by 90 degrees: (a, b) -> (-b, a).
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)

    # Computed in float32 for accuracy, then cast back to the input dtype.
    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
31
+
32
+
33
class PatchEmbed(nn.Module):
    """Project image patches to embedding vectors with a strided convolution.

    In eval mode the input is an image batch and the convolution is applied
    directly. In training mode the input is expected to be already-flattened
    patches, and the convolution weights are applied as a matrix product.
    """

    def __init__(self, patch_size, in_channels, embed_dim, bias=True):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, embed_dim, patch_size, patch_size, bias=bias)

    def forward_unfold(self, x):
        """Apply the projection to flattened patches (last dim = C * patch * patch)."""
        kernel = self.proj.weight
        flat_kernel = kernel.view(kernel.size(0), -1)
        projected = x.matmul(flat_kernel.t())
        bias = self.proj.bias
        if bias is not None:
            projected += bias.to(projected.dtype)
        return projected

    def forward(self, x):
        """Return patch embeddings; eval-mode output has shape (batch, patches, embed_dim)."""
        if self.training:
            return self.forward_unfold(x)
        feature_map = self.proj(x)
        # (B, D, H', W') -> (B, H' * W', D)
        return feature_map.flatten(2).transpose(1, 2)
50
+
51
+
52
class AdaLayerNorm(nn.Module):
    """Layer norm whose shift/scale come from a conditioning embedding.

    A single linear layer maps the conditioning vector to six chunks: shift,
    scale and gate for the attention branch, then shift, scale and gate for
    the MLP branch. Only the attention shift/scale are applied here; the other
    four chunks are returned for the caller to use.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, timestep, seqlen_list=None):
        """Normalise and modulate ``x``.

        When ``seqlen_list`` is given, row ``i`` of the conditioning is
        repeated ``seqlen_list[i]`` times and the rows are concatenated (packed
        layout); otherwise a singleton axis is inserted at position 1 so the
        conditioning broadcasts over it.
        """
        input_dtype = x.dtype
        emb = self.linear(self.silu(timestep))

        if seqlen_list is None:
            emb = emb.unsqueeze(1)
        else:
            rows = [row[None].expand(count, -1) for row, count in zip(emb, seqlen_list)]
            emb = torch.cat(rows)

        chunks = emb.float().chunk(6, dim=-1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
        modulated = self.norm(x).float() * (1 + scale_msa) + shift_msa
        return modulated.to(input_dtype), gate_msa, shift_mlp, scale_mlp, gate_mlp
72
+
73
+
74
class FeedForward(nn.Module):
    """Two-layer MLP with a tanh-approximated GELU in between.

    ``inner_dim`` defaults to ``int(dim * mult)`` and ``dim_out`` defaults to
    ``dim``.
    """

    def __init__(self, dim, dim_out=None, mult=4, inner_dim=None, bias=True):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim
        self.fc1 = nn.Linear(dim, inner_dim, bias=bias)
        self.fc2 = nn.Linear(inner_dim, dim_out, bias=bias)

    def forward(self, hidden_states):
        """Return ``fc2(gelu_tanh(fc1(hidden_states)))``."""
        expanded = self.fc1(hidden_states)
        activated = F.gelu(expanded, approximate="tanh")
        return self.fc2(activated)
87
+
88
+
89
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain."""

    def __init__(self, dim: int, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Scale ``x`` by the reciprocal RMS of its last axis, then by ``weight``.

        The statistics are computed in float32; the result is cast back to the
        dtype of ``x``.
        """
        x32 = x.float()
        mean_square = x32.pow(2).mean(-1, keepdim=True)
        output = x32 * torch.rsqrt(mean_square + self.eps)
        return (self.weight * output).to(x.dtype)
98
+
99
+
100
class Attention(nn.Module):
    """Multi-head attention with RMS-normalised queries/keys and rotary embeddings.

    Two execution paths exist:

    * Dense path (``max_seqlen_q is None``): inputs are ``(batch, seq, dim)``
      and ``F.scaled_dot_product_attention`` is used. Only allowed in eval mode.
    * Packed path (``max_seqlen_q`` given): inputs are token-packed
      ``(total_tokens, dim)`` and ``flash_attn_varlen_func`` is used with the
      supplied cumulative sequence lengths.
    """

    def __init__(self, q_dim, kv_dim=None, heads=8, head_dim=64, dropout=0.0, bias=False):
        super().__init__()
        self.q_dim = q_dim
        self.kv_dim = kv_dim if kv_dim is not None else q_dim
        self.inner_dim = head_dim * heads
        self.dropout = dropout
        self.head_dim = head_dim
        self.num_heads = heads

        self.q_proj = nn.Linear(self.q_dim, self.inner_dim, bias=bias)
        self.k_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.v_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.o_proj = nn.Linear(self.inner_dim, self.q_dim, bias=bias)
        # Query/key norms act on the full projected width, before the head split.
        self.q_norm = RMSNorm(self.inner_dim)
        self.k_norm = RMSNorm(self.inner_dim)

    def prepare_attention_mask(self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3):
        """Pad an additive mask to ``target_length`` and expand it across heads.

        Args:
            attention_mask: Additive mask whose last axis indexes keys, or
                ``None`` (returned unchanged).
            target_length: Required size of the key axis.
            batch_size: Batch size, used to decide whether the head expansion
                is still needed when ``out_dim == 3``.
            out_dim: ``3`` repeats along dim 0 (``batch * heads`` leading axis);
                ``4`` inserts a head axis at position 1.

        Returns:
            The padded/expanded mask, or ``None``.
        """
        head_size = self.num_heads
        if attention_mask is None:
            return attention_mask

        current_length: int = attention_mask.shape[-1]
        # Pad only the missing key positions. Padding by `target_length` itself
        # (as before) produced `current_length + target_length` columns, which
        # can never line up with the keys.
        missing_length = target_length - current_length
        if missing_length > 0:
            attention_mask = F.pad(attention_mask, (0, missing_length), value=0.0)

        if out_dim == 3:
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

        return attention_mask

    def forward(
        self,
        inputs_q,
        inputs_kv,
        attention_mask=None,
        cross_attention=False,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        max_seqlen_q=None,
        max_seqlen_k=None,
    ):
        """Run self- or cross-attention.

        Args:
            inputs_q: Query-side hidden states.
            inputs_kv: Key/value-side hidden states; ``None`` reuses ``inputs_q``.
            attention_mask: Optional additive mask (dense path only).
            cross_attention: When ``True`` rotary embeddings are applied to the
                queries only, not to the keys.
            rope_pos_embed: Rotary table forwarded to ``apply_rotary_emb``.
            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k: Packed-path
                arguments forwarded to ``flash_attn_varlen_func``.

        Returns:
            The attention output after the output projection.

        Raises:
            ImportError: If the packed path is requested but ``flash-attn``
                could not be imported.
        """
        inputs_kv = inputs_q if inputs_kv is None else inputs_kv

        query_states = self.q_proj(inputs_q)
        key_states = self.k_proj(inputs_kv)
        value_states = self.v_proj(inputs_kv)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        if max_seqlen_q is None:
            assert not self.training, "PixelFlow needs sequence packing for training"

            bsz, q_len, _ = inputs_q.shape
            _, kv_len, _ = inputs_kv.shape

            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
            key_states = key_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)

            query_states = apply_rotary_emb(query_states, rope_pos_embed)
            if not cross_attention:
                key_states = apply_rotary_emb(key_states, rope_pos_embed)

            if attention_mask is not None:
                attention_mask = self.prepare_attention_mask(attention_mask, kv_len, bsz)
                # (batch * heads, q, kv) -> (batch, heads, q, kv) for SDPA.
                attention_mask = attention_mask.view(bsz, self.num_heads, -1, attention_mask.shape[-1])

            attn_output = F.scaled_dot_product_attention(
                query_states,
                key_states,
                value_states,
                attn_mask=attention_mask,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=False,
            )

            attn_output = attn_output.transpose(1, 2).contiguous()
            attn_output = attn_output.view(bsz, q_len, self.inner_dim)
            attn_output = self.o_proj(attn_output)
            return attn_output

        # Packed path: the module-level import fallback leaves the kernel as
        # None, so fail with an actionable message instead of "NoneType is not
        # callable".
        if flash_attn_varlen_func is None:
            raise ImportError(
                "`flash-attn` is required for packed (variable-length) attention; "
                "install it or call the model without sequence packing in eval mode."
            )

        query_states = query_states.view(-1, self.num_heads, self.head_dim)
        key_states = key_states.view(-1, self.num_heads, self.head_dim)
        value_states = value_states.view(-1, self.num_heads, self.head_dim)

        # apply_rotary_emb expects a 4-D (1, heads, tokens, head_dim) layout, so
        # move heads first, add a batch axis, rotate, then undo both.
        query_states = apply_rotary_emb(query_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
        if not cross_attention:
            key_states = apply_rotary_emb(key_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)

        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
        )

        attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)
        return attn_output
209
+
210
+
211
class TransformerBlock(nn.Module):
    """One PixelFlow block: gated self-attention, optional cross-attention, gated MLP.

    ``norm1`` produces both the modulated input for self-attention and the
    shift/scale/gate values that are later applied around the MLP. The
    cross-attention sub-layer exists only when ``cross_attention_dim`` is given.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=False,
    ):
        super().__init__()
        shared_attn_kwargs = dict(
            heads=num_attention_heads,
            head_dim=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
        )

        self.norm1 = AdaLayerNorm(dim)
        self.attn1 = Attention(q_dim=dim, kv_dim=None, **shared_attn_kwargs)

        if cross_attention_dim is None:
            self.attn2 = None
        else:
            self.norm2 = RMSNorm(dim, eps=1e-6)
            self.attn2 = Attention(q_dim=dim, kv_dim=cross_attention_dim, **shared_attn_kwargs)

        self.norm3 = RMSNorm(dim, eps=1e-6)
        self.mlp = FeedForward(dim)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Apply the block to ``hidden_states`` and return the updated states.

        ``timestep`` is the conditioning embedding consumed by ``norm1``. The
        ``cu_seqlens_*`` / ``seqlen_list_*`` arguments are only relevant for
        the packed layout; when ``seqlen_list_q`` is ``None`` the attention
        layers receive ``max_seqlen_q=None``.
        """
        max_len_q = None if seqlen_list_q is None else max(seqlen_list_q)

        # --- self-attention, modulated and gated by the conditioning ---
        modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, timestep, seqlen_list_q)
        self_attn = self.attn1(
            inputs_q=modulated,
            inputs_kv=None,
            attention_mask=None,
            cross_attention=False,
            rope_pos_embed=rope_pos_embed,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_q,
            max_seqlen_q=max_len_q,
            max_seqlen_k=max_len_q,
        )
        self_attn = (gate_msa * self_attn.float()).to(self_attn.dtype)
        hidden_states = self_attn + hidden_states

        # --- optional cross-attention (ungated residual) ---
        if self.attn2 is not None:
            max_len_k = None if seqlen_list_k is None else max(seqlen_list_k)
            cross_attn = self.attn2(
                inputs_q=self.norm2(hidden_states),
                inputs_kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                cross_attention=True,
                rope_pos_embed=rope_pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_len_q,
                max_seqlen_k=max_len_k,
            )
            hidden_states = hidden_states + cross_attn

        # --- MLP, modulated and gated with the values produced by norm1 ---
        normed = self.norm3(hidden_states)
        normed = (normed.float() * (1 + scale_mlp) + shift_mlp).to(normed.dtype)
        mlp_out = self.mlp(normed)
        mlp_out = (gate_mlp * mlp_out.float()).to(mlp_out.dtype)
        return mlp_out + hidden_states
299
+
300
+
301
class PixelFlowModel(torch.nn.Module):
    """Transformer backbone that maps pixel patches to per-patch predictions.

    The model embeds image patches, builds a conditioning vector from the
    timestep, the ``latent_size`` value and (when ``num_classes > 0``) a class
    label, runs a stack of ``TransformerBlock`` layers, and projects the tokens
    back to ``patch_size * patch_size * out_channels`` values each.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        num_attention_heads,
        attention_head_dim,
        depth,
        patch_size,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=True,
        num_classes=0,
        init_weights=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.attention_head_dim = attention_head_dim
        self.num_classes = num_classes
        self.out_channels = out_channels

        # Model width is fully determined by the attention geometry.
        embed_dim = num_attention_heads * attention_head_dim
        self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)

        # One sinusoidal projection is shared by the timestep and the
        # latent-size inputs; each has its own embedding MLP.
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        self.latent_size_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        if self.num_classes > 0:
            self.class_embedder = LabelEmbedding(num_classes, embed_dim, dropout_prob=0.1)

        self.transformer_blocks = nn.ModuleList(
            [
                TransformerBlock(
                    embed_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout,
                    cross_attention_dim,
                    attention_bias,
                )
                for _ in range(depth)
            ]
        )

        # Output head: parameter-free norm, conditioning -> (shift, scale),
        # then a linear map back to patch pixels.
        self.norm_out = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out_1 = nn.Linear(embed_dim, 2 * embed_dim)
        self.proj_out_2 = nn.Linear(embed_dim, patch_size * patch_size * out_channels)

        if init_weights:
            self.initialize_from_scratch()

    def initialize_from_scratch(self):
        """Initialise all weights for training from scratch.

        A generic Xavier/zero-bias init is applied to every ``nn.Linear``
        first; the specific overrides below must run afterwards.
        """
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Treat the conv kernel as a 2-D matrix (out_channels x rest) for Xavier.
        w = self.patch_embed.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.patch_embed.proj.bias, 0)

        nn.init.normal_(self.timestep_embedder.linear_1.weight, std=0.02)
        nn.init.normal_(self.timestep_embedder.linear_2.weight, std=0.02)
        nn.init.normal_(self.latent_size_embedder.linear_1.weight, std=0.02)
        nn.init.normal_(self.latent_size_embedder.linear_2.weight, std=0.02)

        if self.num_classes > 0:
            nn.init.normal_(self.class_embedder.embedding_table.weight, std=0.02)

        # Zero the modulation layers of every block ...
        for block in self.transformer_blocks:
            nn.init.constant_(block.norm1.linear.weight, 0)
            nn.init.constant_(block.norm1.linear.bias, 0)

        # ... and both output projections, so the initial output is all zeros.
        nn.init.constant_(self.proj_out_1.weight, 0)
        nn.init.constant_(self.proj_out_1.bias, 0)
        nn.init.constant_(self.proj_out_2.weight, 0)
        nn.init.constant_(self.proj_out_2.bias, 0)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        class_labels=None,
        timestep=None,
        latent_size=None,
        encoder_attention_mask=None,
        pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Run the backbone.

        Args:
            hidden_states: Model input; cast to float32 and passed to
                ``patch_embed``. Its last two axes are read as height and width.
            encoder_hidden_states: Optional cross-attention context, forwarded
                to every block.
            class_labels: Labels for ``class_embedder``; used only when
                ``num_classes > 0``.
            timestep: Input to the shared sinusoidal projection.
            latent_size: Second input to the same projection, embedded by
                ``latent_size_embedder`` and added to the conditioning.
            encoder_attention_mask: Optional mask; a 2-D mask is converted to
                an additive bias below.
            pos_embed: Rotary table forwarded to the blocks as ``rope_pos_embed``.
            cu_seqlens_q, cu_seqlens_k, seqlen_list_q, seqlen_list_k: Packed
                layout arguments forwarded to the blocks.

        Returns:
            In training mode, the per-token output reshaped to
            ``(tokens, out_channels * patch_size * patch_size)``. Otherwise a
            ``(batch, out_channels, H, W)`` image tensor reassembled from the
            patches.
        """
        # 2-D keep-mask (1 = keep) -> additive bias (0 keep, -10000 drop) with
        # a singleton axis inserted at position 1.
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # Remember the input spatial size before patchification; it is needed
        # to un-patchify in eval mode.
        orig_height, orig_width = hidden_states.shape[-2], hidden_states.shape[-1]
        hidden_states = hidden_states.to(torch.float32)
        hidden_states = self.patch_embed(hidden_states)

        timesteps_proj = self.time_proj(timestep)
        conditioning = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))

        if self.num_classes > 0:
            class_embed = self.class_embedder(class_labels)
            conditioning += class_embed

        latent_size_proj = self.time_proj(latent_size)
        latent_size_embed = self.latent_size_embedder(latent_size_proj.to(dtype=hidden_states.dtype))
        conditioning += latent_size_embed

        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                timestep=conditioning,
                rope_pos_embed=pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                seqlen_list_q=seqlen_list_q,
                seqlen_list_k=seqlen_list_k,
            )

        # Final modulation: broadcast per-sample shift/scale over tokens, either
        # via a singleton axis or by repeating rows per packed sequence length.
        shift, scale = self.proj_out_1(F.silu(conditioning)).float().chunk(2, dim=1)
        if seqlen_list_q is None:
            shift = shift.unsqueeze(1)
            scale = scale.unsqueeze(1)
        else:
            shift = torch.cat([shift_i[None].expand(ri, -1) for shift_i, ri in zip(shift, seqlen_list_q)])
            scale = torch.cat([scale_i[None].expand(ri, -1) for scale_i, ri in zip(scale, seqlen_list_q)])

        hidden_states = (self.norm_out(hidden_states).float() * (1 + scale) + shift).to(hidden_states.dtype)
        hidden_states = self.proj_out_2(hidden_states)
        if self.training:
            # Each token is (p, p, c); reorder to channel-first and flatten.
            hidden_states = hidden_states.reshape(hidden_states.shape[0], self.patch_size, self.patch_size, self.out_channels)
            hidden_states = hidden_states.permute(0, 3, 1, 2).flatten(1)
            return hidden_states

        # Eval: reassemble the patch grid into a full image.
        height, width = orig_height // self.patch_size, orig_width // self.patch_size
        hidden_states = hidden_states.reshape(shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels))
        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
        output = hidden_states.reshape(shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size))

        return output
PixelFlow-256/transformer/transformer_pixelflow.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple, Union
3
+
4
+ import torch
5
+
6
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
7
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from diffusers.utils import BaseOutput
10
+
11
+ from modeling_pixelflow import PixelFlowModel
12
+
13
+
14
@dataclass
class PixelFlowTransformerOutput(BaseOutput):
    """Output container holding a single ``sample`` tensor."""

    # Tensor produced by the transformer.
    sample: torch.FloatTensor
17
+
18
+
19
class PixelFlowTransformer2DModel(ModelMixin, ConfigMixin):
    """Diffusers-style wrapper around ``PixelFlowModel``.

    The constructor arguments are recorded in the model config through
    ``register_to_config`` and forwarded to the wrapped backbone, which is
    stored as ``self.model``. ``sample_size`` is recorded in the config only;
    it is not passed to the backbone.
    """

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        num_attention_heads: int = 16,
        attention_head_dim: int = 72,
        depth: int = 28,
        patch_size: int = 4,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = True,
        num_classes: int = 1000,
        sample_size: int = 256,
        init_weights: bool = True,
    ):
        super().__init__()
        backbone_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            depth=depth,
            patch_size=patch_size,
            dropout=dropout,
            cross_attention_dim=cross_attention_dim,
            attention_bias=attention_bias,
            num_classes=num_classes,
            init_weights=init_weights,
        )
        self.model = PixelFlowModel(**backbone_kwargs)

    @property
    def patch_size(self) -> int:
        """Patch size of the wrapped backbone."""
        return self.model.patch_size

    @property
    def attention_head_dim(self) -> int:
        """Per-head attention width of the wrapped backbone."""
        return self.model.attention_head_dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: Optional[torch.Tensor] = None,
        class_labels: Optional[torch.Tensor] = None,
        latent_size: Optional[torch.Tensor] = None,
        pos_embed: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[PixelFlowTransformerOutput, Transformer2DModelOutput, Tuple[torch.Tensor, ...]]:
        """Forward all inputs to the backbone and wrap its output.

        Returns a ``Transformer2DModelOutput`` when ``return_dict`` is true,
        otherwise a one-element tuple containing the output tensor.
        """
        sample = self.model(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            class_labels=class_labels,
            timestep=timestep,
            latent_size=latent_size,
            encoder_attention_mask=encoder_attention_mask,
            pos_embed=pos_embed,
        )

        if return_dict:
            return Transformer2DModelOutput(sample=sample)
        return (sample,)
PixelFlow-T2I/__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (20.2 kB). View file
 
PixelFlow-T2I/model_index.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PixelFlowPipeline",
3
+ "_diffusers_version": "0.36.0",
4
+ "scheduler": [
5
+ "scheduling_pixelflow",
6
+ "PixelFlowScheduler"
7
+ ],
8
+ "transformer": [
9
+ "transformer_pixelflow",
10
+ "PixelFlowTransformer2DModel"
11
+ ],
12
+ "text_encoder": [
13
+ "transformers",
14
+ "T5EncoderModel"
15
+ ],
16
+ "tokenizer": [
17
+ "transformers",
18
+ "T5Tokenizer"
19
+ ]
20
+ }
PixelFlow-T2I/pipeline.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hub custom pipeline: PixelFlowPipeline.
2
+
3
+ Load with native Hugging Face diffusers and `trust_remote_code=True`.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import importlib
9
+ import math
10
+ import sys
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import List, Optional, Tuple, Union
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from einops import rearrange
19
+
20
+ from diffusers.image_processor import VaeImageProcessor
21
+ from diffusers.models.embeddings import get_2d_rotary_pos_embed
22
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
23
+ from diffusers.utils import BaseOutput
24
+ from diffusers.utils.torch_utils import randn_tensor
25
+
26
+
27
@dataclass
class PixelFlowPipelineOutput(BaseOutput):
    """Output container returned by ``PixelFlowPipeline.__call__``."""

    # Generated images: a tensor for output_type="pt", otherwise whatever the
    # image processor's postprocess returns for "pil" / "np".
    images: Union[torch.Tensor, List, np.ndarray]
30
+
31
+
32
class PixelFlowPipeline(DiffusionPipeline):
    """Pipeline for PixelFlow pixel-space flow generation (class-conditional or text-to-image).

    Generation runs through ``scheduler.num_stages`` resolution stages. Stage 0
    starts from Gaussian noise at the coarsest resolution; every later stage
    upsamples the current sample by 2x, re-injects block-correlated noise and
    continues denoising.
    """

    model_cpu_offload_seq = "text_encoder->transformer"
    _optional_components = ["text_encoder", "tokenizer"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
        """Load a self-contained variant folder locally or from the Hub.

        Args:
            pretrained_model_name_or_path: ``None``/``""``/``"."`` for the folder
                containing this file, a local path, or a Hub repo id (a string
                containing ``/`` that does not exist locally).
            subfolder: Optional variant sub-directory inside the repo/folder.
            **kwargs: Forwarded to every component's ``from_pretrained``. An
                optional ``hub_kwargs`` dict is popped and passed to
                ``snapshot_download`` instead.

        Raises:
            ValueError: If no loadable transformer is found.
        """
        repo_root = Path(__file__).resolve().parent

        if pretrained_model_name_or_path in (None, "", "."):
            variant = repo_root
        elif (
            isinstance(pretrained_model_name_or_path, str)
            and "/" in pretrained_model_name_or_path
            and not Path(pretrained_model_name_or_path).exists()
        ):
            from huggingface_hub import snapshot_download

            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
            if subfolder:
                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
        else:
            variant = Path(pretrained_model_name_or_path)
            if not variant.is_absolute():
                # Prefer the working directory, fall back to this file's folder.
                candidate = (Path.cwd() / variant).resolve()
                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
            if subfolder:
                variant = variant / subfolder

        model_kwargs = dict(kwargs)
        # sys.path entries added while importing component modules; removed in
        # the `finally` block below.
        inserted: List[str] = []

        def _load_component(folder: str, module_name: str, class_name: str):
            comp_dir = variant / folder
            module_path = comp_dir / f"{module_name}.py"
            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
            if not module_path.exists() or not has_weights:
                return None

            comp_path = str(comp_dir)
            if comp_path not in sys.path:
                sys.path.insert(0, comp_path)
                inserted.append(comp_path)

            module = importlib.import_module(module_name)
            component_cls = getattr(module, class_name)
            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)

        def _load_text_components():
            text_encoder = None
            tokenizer = None
            te_dir = variant / "text_encoder"
            tok_dir = variant / "tokenizer"
            if te_dir.exists() and (te_dir / "config.json").exists():
                from transformers import T5EncoderModel, T5Tokenizer

                text_encoder = T5EncoderModel.from_pretrained(str(te_dir), **model_kwargs)
                tokenizer = T5Tokenizer.from_pretrained(str(tok_dir))
            return text_encoder, tokenizer

        def _load_text_encoder_name() -> str:
            metadata_path = variant / "conversion_metadata.json"
            if metadata_path.exists():
                import json

                metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
                if metadata.get("text_encoder"):
                    return metadata["text_encoder"]
            return "google/flan-t5-xl"

        try:
            transformer = _load_component("transformer", "transformer_pixelflow", "PixelFlowTransformer2DModel")
            scheduler = _load_component("scheduler", "scheduling_pixelflow", "PixelFlowScheduler")
            text_encoder, tokenizer = _load_text_components()

            if scheduler is None:
                # No scheduler config on disk: build one with default arguments
                # if the scheduler module itself is present.
                sched_dir = variant / "scheduler"
                if (sched_dir / "scheduling_pixelflow.py").exists():
                    sched_path = str(sched_dir)
                    if sched_path not in sys.path:
                        sys.path.insert(0, sched_path)
                        inserted.append(sched_path)
                    scheduler = importlib.import_module("scheduling_pixelflow").PixelFlowScheduler()

            if transformer is None:
                raise ValueError(f"No loadable transformer found under {variant}")

            if (
                text_encoder is None
                and tokenizer is None
                and transformer.config.num_classes == 0
                and transformer.config.cross_attention_dim is not None
            ):
                # Text-conditioned checkpoint shipped without a text encoder:
                # fetch the one named in the metadata (or the default).
                from transformers import T5EncoderModel, T5Tokenizer

                text_encoder_name = _load_text_encoder_name()
                text_encoder = T5EncoderModel.from_pretrained(text_encoder_name, **model_kwargs)
                tokenizer = T5Tokenizer.from_pretrained(text_encoder_name)

            return cls(
                transformer=transformer,
                scheduler=scheduler,
                text_encoder=text_encoder,
                tokenizer=tokenizer,
            )
        finally:
            for comp_path in inserted:
                if comp_path in sys.path:
                    sys.path.remove(comp_path)

    def __init__(self, transformer, scheduler, text_encoder=None, tokenizer=None, max_token_length: int = 512):
        super().__init__()
        self.register_modules(
            transformer=transformer,
            scheduler=scheduler,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
        self.class_cond = transformer.config.num_classes > 0
        self.max_token_length = max_token_length

    def sample_block_noise(self, bs, ch, height, width, eps=1e-6, generator=None, device=None):
        """Sample noise that is correlated inside every 2x2 pixel block.

        Each 2x2 block is a draw from a zero-mean 4-D Gaussian with covariance
        ``(1 - gamma) * I + gamma * 1 1^T + eps * I`` (``gamma`` comes from the
        scheduler). The draw is computed as white noise times the Cholesky
        factor in one vectorised step instead of one Python-level sample per
        block.

        Args:
            bs, ch, height, width: Shape of the returned noise; ``height`` and
                ``width`` are expected to be even.
            eps: Diagonal jitter that keeps the covariance positive definite.
            generator: Optional ``torch.Generator`` (or list of generators, one
                per batch element) for reproducible sampling.
            device: Device of the returned tensor; defaults to CPU.

        Returns:
            Float32 tensor of shape ``(bs, ch, height, width)``.
        """
        gamma = self.scheduler.gamma
        # Factorise in float64: the smallest eigenvalue is only ~eps for the
        # default gamma, which is fragile in float32.
        covariance = (
            torch.eye(4, dtype=torch.float64) * (1 - gamma)
            + torch.ones(4, 4, dtype=torch.float64) * gamma
            + eps * torch.eye(4, dtype=torch.float64)
        )
        scale_tril = torch.linalg.cholesky(covariance).to(torch.float32)

        white = randn_tensor(
            (bs, ch, height // 2, width // 2, 4),
            generator=generator,
            device=torch.device(device) if device is not None else torch.device("cpu"),
            dtype=torch.float32,
        )
        # z @ L^T has covariance L L^T, i.e. the requested block covariance.
        noise = white @ scale_tril.to(white.device).T
        noise = rearrange(noise, "b c h w (p q) -> b c (h p) (w q)", p=2, q=2)
        return noise

    def _stage_guidance_scale(self, stage_idx: int) -> float:
        """Return the guidance scale to use at ``stage_idx``.

        Text-to-image checkpoints use the user value unchanged; class-conditional
        checkpoints ramp from 1 towards the user value over the four stages.
        """
        if not self.class_cond:
            return self._guidance_scale_value
        scale_dict = {0: 0, 1: 1 / 6, 2: 2 / 3, 3: 1}
        return (self._guidance_scale_value - 1) * scale_dict[stage_idx] + 1

    @property
    def do_classifier_free_guidance(self) -> bool:
        return self._guidance_scale_value > 0

    @torch.no_grad()
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        device: torch.device,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Union[str, List[str]] = "",
        max_length: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Tokenise and encode prompts (and negative prompts for CFG).

        Returns:
            ``(prompt_embeds, prompt_attention_mask)``. Rows are ordered so that
            all images of prompt 0 come first, then prompt 1, and so on. With
            classifier-free guidance the negative rows are concatenated in
            front of the positive ones.

        Raises:
            ValueError: If the text encoder/tokenizer is missing or
                ``negative_prompt`` has the wrong type or length.
        """
        if self.text_encoder is None or self.tokenizer is None:
            raise ValueError("Text-to-image generation requires `text_encoder` and `tokenizer`.")

        if isinstance(prompt, str):
            prompt = [prompt]
        batch_size = len(prompt)
        max_length = max_length or self.max_token_length

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids.to(device)
        prompt_attention_mask = text_inputs.attention_mask.to(device)
        prompt_embeds = self.text_encoder(
            text_input_ids,
            attention_mask=prompt_attention_mask,
        )[0]

        dtype = self.text_encoder.dtype
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
        # Embeddings and masks must be duplicated in the SAME order. Previously
        # the embeddings were interleaved per prompt while the masks were tiled,
        # so masks were paired with the wrong prompts whenever both the batch
        # and num_images_per_prompt exceeded 1.
        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        prompt_attention_mask = prompt_attention_mask.repeat_interleave(num_images_per_prompt, dim=0)

        if do_classifier_free_guidance:
            if isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif isinstance(negative_prompt, list):
                if len(negative_prompt) != batch_size:
                    raise ValueError(
                        f"Negative prompt list length ({len(negative_prompt)}) must match prompt batch ({batch_size})."
                    )
                uncond_tokens = negative_prompt
            else:
                raise ValueError("Negative prompt must be a string or list of strings.")

            uncond_inputs = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=prompt_embeds.shape[1],
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            negative_input_ids = uncond_inputs.input_ids.to(device)
            negative_prompt_attention_mask = uncond_inputs.attention_mask.to(device)
            negative_prompt_embeds = self.text_encoder(
                negative_input_ids,
                attention_mask=negative_prompt_attention_mask,
            )[0]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
            negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat_interleave(
                num_images_per_prompt, dim=0
            )

            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

        return prompt_embeds, prompt_attention_mask

    @torch.no_grad()
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        class_labels: Optional[Union[int, List[int], torch.Tensor]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: Union[int, List[int]] = 10,
        guidance_scale: float = 4.0,
        shift: float = 1.0,
        negative_prompt: Union[str, List[str]] = "",
        num_images_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        output_type: str = "pil",
        return_dict: bool = True,
    ) -> Union[PixelFlowPipelineOutput, Tuple]:
        """Generate images.

        Args:
            prompt: Text prompt(s); required for text-to-image checkpoints.
            class_labels: Class id(s); required for class-conditional checkpoints.
            height, width: Output size; default to ``transformer.config.sample_size``.
            num_inference_steps: Steps per stage, either one int used for every
                stage or a list with one entry per stage.
            guidance_scale: Classifier-free guidance scale; guidance is enabled
                when it is greater than 0.
            shift: Forwarded to ``scheduler.set_timesteps``.
            negative_prompt: Negative prompt(s) for text-to-image guidance.
            num_images_per_prompt: Number of images per prompt / class label.
            generator: RNG(s) for the initial noise and the per-stage block noise.
            output_type: ``"pil"``, ``"np"`` or ``"pt"``.
            return_dict: Return a ``PixelFlowPipelineOutput`` instead of a tuple.

        Raises:
            ValueError: On missing conditioning, a ``num_inference_steps`` list of
                the wrong length, an unusable resolution, or an unknown
                ``output_type``.
        """
        if height is None:
            height = int(self.transformer.config.sample_size)
        if width is None:
            width = int(self.transformer.config.sample_size)

        device = self._execution_device
        self._guidance_scale_value = guidance_scale

        num_stages = self.scheduler.num_stages
        if isinstance(num_inference_steps, int):
            num_inference_steps = [num_inference_steps] * num_stages
        elif len(num_inference_steps) != num_stages:
            raise ValueError(
                f"`num_inference_steps` must be an int or a list of length {num_stages}, "
                f"got {len(num_inference_steps)} entries."
            )

        prompt_attention_mask = None
        if self.class_cond:
            if class_labels is None:
                raise ValueError("`class_labels` are required for class-conditional PixelFlow checkpoints.")
            if isinstance(class_labels, int):
                class_labels = [class_labels]
            if not torch.is_tensor(class_labels):
                class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
            else:
                class_labels = class_labels.to(device=device, dtype=torch.long)
            # Accept 0-d tensors as a single label.
            class_labels = class_labels.reshape(-1)

            batch_size = class_labels.shape[0]
            # Repeat per requested image so the label batch matches the latent
            # batch (batch_size * num_images_per_prompt).
            prompt_embeds = class_labels.repeat_interleave(num_images_per_prompt)
            negative_prompt_embeds = torch.full_like(prompt_embeds, self.transformer.config.num_classes)
            if self.do_classifier_free_guidance:
                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        else:
            if prompt is None:
                raise ValueError("`prompt` is required for text-to-image PixelFlow checkpoints.")
            if isinstance(prompt, str):
                prompt = [prompt]
            batch_size = len(prompt)
            # Use the same CFG switch as the denoising loop below; a stricter
            # condition here left the embeddings un-doubled while the latents
            # were doubled for 0 < guidance_scale <= 1.
            prompt_embeds, prompt_attention_mask = self.encode_prompt(
                prompt,
                device,
                num_images_per_prompt=num_images_per_prompt,
                do_classifier_free_guidance=self.do_classifier_free_guidance,
                negative_prompt=negative_prompt,
            )

        patch_size = self.transformer.patch_size
        init_factor = 2 ** (num_stages - 1)
        height, width = height // init_factor, width // init_factor
        if height <= 0 or width <= 0 or height % patch_size != 0 or width % patch_size != 0:
            raise ValueError(
                f"The stage-0 resolution ({height}x{width}) must be a positive multiple of the "
                f"patch size ({patch_size}); choose `height`/`width` accordingly."
            )
        latents = randn_tensor(
            (batch_size * num_images_per_prompt, 3, height, width),
            generator=generator,
            device=device,
            dtype=torch.float32,
        )

        for stage_idx in range(num_stages):
            self.scheduler.set_timesteps(num_inference_steps[stage_idx], stage_idx, device=device, shift=shift)
            timesteps = self.scheduler.Timesteps

            if stage_idx > 0:
                # Upsample 2x and blend in fresh block-correlated noise.
                height, width = height * 2, width * 2
                latents = F.interpolate(latents, size=(height, width), mode="nearest")
                original_start_t = self.scheduler.original_start_t[stage_idx]
                gamma = self.scheduler.gamma
                alpha = 1 / (math.sqrt(1 - (1 / gamma)) * (1 - original_start_t) + original_start_t)
                beta = alpha * (1 - original_start_t) / math.sqrt(-gamma)

                noise = self.sample_block_noise(*latents.shape, generator=generator, device=device)
                noise = noise.to(device=device, dtype=latents.dtype)
                latents = alpha * latents + beta * noise

            # Token grid of the current stage; rows and columns are tracked
            # separately so non-square sizes yield a table matching the token count.
            grid_h = latents.shape[-2] // patch_size
            grid_w = latents.shape[-1] // patch_size
            size_tensor = torch.tensor([grid_w], dtype=torch.int32, device=device)
            pos_embed = get_2d_rotary_pos_embed(
                embed_dim=self.transformer.attention_head_dim,
                crops_coords=((0, 0), (grid_h, grid_w)),
                grid_size=(grid_h, grid_w),
                device=device,
                output_type="pt",
            )
            rope_pos = torch.stack(pos_embed, -1)

            autocast_enabled = device.type == "cuda"
            autocast_dtype = torch.bfloat16 if autocast_enabled else torch.float32
            for timestep in timesteps:
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                timestep_batch = timestep.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
                with torch.autocast(device.type, enabled=autocast_enabled, dtype=autocast_dtype):
                    if self.class_cond:
                        noise_pred = self.transformer(
                            latent_model_input,
                            timestep=timestep_batch,
                            class_labels=prompt_embeds,
                            latent_size=size_tensor,
                            pos_embed=rope_pos,
                        ).sample
                    else:
                        noise_pred = self.transformer(
                            latent_model_input,
                            encoder_hidden_states=prompt_embeds,
                            encoder_attention_mask=prompt_attention_mask,
                            timestep=timestep_batch,
                            latent_size=size_tensor,
                            pos_embed=rope_pos,
                        ).sample

                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self._stage_guidance_scale(stage_idx) * (
                        noise_pred_text - noise_pred_uncond
                    )

                latents = self.scheduler.step(model_output=noise_pred, sample=latents).prev_sample

        # Map from [-1, 1] to [0, 1].
        image = (latents / 2 + 0.5).clamp(0, 1)

        if output_type == "pt":
            pass
        elif output_type in ("pil", "np"):
            image = self.image_processor.postprocess(image, output_type=output_type)
        else:
            raise ValueError(f"Unsupported output_type: {output_type}")

        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return PixelFlowPipelineOutput(images=image)
PixelFlow-T2I/scheduler/__pycache__/scheduling_pixelflow.cpython-312.pyc ADDED
Binary file (7.76 kB). View file
 
PixelFlow-T2I/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PixelFlowScheduler",
3
+ "_diffusers_version": "0.36.0",
4
+ "gamma": -0.3333333333333333,
5
+ "num_stages": 4,
6
+ "num_train_timesteps": 1000
7
+ }
PixelFlow-T2I/scheduler/scheduling_pixelflow.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin, SchedulerOutput
10
+ from diffusers.utils import BaseOutput
11
+
12
+
13
def cal_rectify_ratio(start_t, gamma):
    """Return ``1 / (sqrt(1 - 1 / gamma) * (1 - start_t) + start_t)``.

    ``PixelFlowScheduler.__init__`` multiplies the nominal start time of every
    stage after the first by this factor.

    Args:
        start_t: Nominal start time of the stage.
        gamma: Scheduler ``gamma`` setting. It must be non-zero, and
            ``1 - 1 / gamma`` must be non-negative because it is passed to
            ``math.sqrt``.
            NOTE(review): presumably the noise-correlation parameter of the
            cascade re-noising step -- confirm against the PixelFlow reference.

    Returns:
        The multiplicative correction for ``start_t``.
    """
    noise_scale = math.sqrt(1 - (1 / gamma))
    denominator = noise_scale * (1 - start_t) + start_t
    return 1 / denominator
15
+
16
+
17
@dataclass
class PixelFlowSchedulerOutput(BaseOutput):
    """Return container produced by ``PixelFlowScheduler.step``."""

    # Sample after one scheduler update; ``step`` casts it to the dtype of
    # the model output before storing it here.
    prev_sample: torch.FloatTensor
20
+
21
+
22
class PixelFlowScheduler(SchedulerMixin, ConfigMixin):
    """Cascade flow scheduler for PixelFlow multi-stage pixel-space generation.

    The schedule is divided into ``num_stages`` stages. ``__init__``
    precomputes, for every stage, a start/end time and the slice of the global
    ``Timesteps`` axis assigned to it. ``set_timesteps`` builds the inference
    grid for one stage, and ``step`` advances a sample along that grid one
    update at a time.
    """

    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        num_stages: int = 4,
        gamma: float = -1 / 3,
    ):
        """Precompute the per-stage time and Timestep tables.

        Args:
            num_train_timesteps: Number of points on the training time grid.
            num_stages: Number of cascade stages; must be positive.
            gamma: Value forwarded to ``cal_rectify_ratio`` to rescale the
                start time of every stage after the first.

        Raises:
            ValueError: If ``num_stages`` is not positive.
        """
        # Explicit exception instead of ``assert``: assertions are stripped
        # under ``python -O``, after which ``num_stages == 0`` divides by zero
        # below and negative values silently build empty stage tables.
        if not num_stages > 0:
            raise ValueError(f"num_stages must be positive, got {num_stages}")
        self.num_stages = num_stages
        self.gamma = gamma

        # Global training grid: Timesteps 0..N-1 and the matching times T / N.
        self.Timesteps = torch.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=torch.float32)
        self.t = self.Timesteps / num_train_timesteps
        # Stage boundaries as equally spaced fractions 0, 1/S, ..., 1.
        self.stage_range = [x / num_stages for x in range(num_stages + 1)]

        self.original_start_t = {}
        self.start_t, self.end_t = {}, {}
        self.t_window_per_stage = {}
        self.Timesteps_per_stage = {}
        stage_distance = []

        # Pass 1: start/end time of each stage and the time span it covers.
        for stage_idx in range(num_stages):
            start_idx = max(int(num_train_timesteps * self.stage_range[stage_idx]), 0)
            end_idx = min(int(num_train_timesteps * self.stage_range[stage_idx + 1]), num_train_timesteps)

            start_t = self.t[start_idx].item()
            # The grid stops at (N - 1) / N, so the final boundary is pinned to 1.0.
            end_t = self.t[end_idx].item() if end_idx < num_train_timesteps else 1.0

            # Keep the unrectified start time before it is rescaled.
            self.original_start_t[stage_idx] = start_t

            if stage_idx > 0:
                start_t *= cal_rectify_ratio(start_t, gamma)

            self.start_t[stage_idx] = start_t
            self.end_t[stage_idx] = end_t
            stage_distance.append(end_t - start_t)

        total_stage_distance = sum(stage_distance)
        # Every stage shares the same local window: N points covering [0, 1).
        t_within_stage = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float64)[:-1]

        # Pass 2: give each stage a share of the Timestep axis proportional to
        # the time span computed above.
        for stage_idx in range(num_stages):
            start_ratio = 0.0 if stage_idx == 0 else sum(stage_distance[:stage_idx]) / total_stage_distance
            end_ratio = 1.0 if stage_idx == num_stages - 1 else sum(stage_distance[:stage_idx + 1]) / total_stage_distance

            Timestep_start = self.Timesteps[int(num_train_timesteps * start_ratio)]
            Timestep_end = self.Timesteps[min(int(num_train_timesteps * end_ratio), num_train_timesteps - 1)]

            self.t_window_per_stage[stage_idx] = t_within_stage

            if stage_idx == num_stages - 1:
                # The last stage includes its end Timestep.
                self.Timesteps_per_stage[stage_idx] = torch.linspace(
                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps, dtype=torch.float64
                )
            else:
                # Earlier stages exclude their end Timestep (half-open interval).
                self.Timesteps_per_stage[stage_idx] = torch.linspace(
                    Timestep_start.item(), Timestep_end.item(), num_train_timesteps + 1, dtype=torch.float64
                )[:-1]

        self._step_index = None
        # The global grid was only needed to build the tables above;
        # ``set_timesteps`` stores the per-stage inference Timesteps here.
        self.Timesteps = None

    @staticmethod
    def time_linear_to_Timesteps(t, t_start, t_end, T_start, T_end):
        """Map ``t`` linearly so that ``t_start -> T_start`` and ``t_end -> T_end``.

        Args:
            t: Value(s) to map; ``set_timesteps`` passes a NumPy array.
            t_start: Lower end of the source interval.
            t_end: Upper end of the source interval; must differ from
                ``t_start`` because the slope divides by their difference.
            T_start: Target value for ``t_start``.
            T_end: Target value for ``t_end``.

        Returns:
            ``k * t + b`` with slope ``k`` and offset ``b`` fixed by the two
            anchor points.
        """
        k = (T_end - T_start) / (t_end - t_start)
        b = T_start - t_start * k
        return k * t + b

    def set_timesteps(
        self,
        num_inference_steps: int,
        stage_index: int,
        device: Optional[Union[str, torch.device]] = None,
        shift: float = 1.0,
    ):
        """Build the inference grid for one stage and reset the step counter.

        Args:
            num_inference_steps: Number of grid points (and ``step`` calls)
                for this stage.
            stage_index: Key of the stage in the tables built by ``__init__``.
            device: Device the resulting tensors are moved to.
            shift: Warps the grid as ``t / (shift + (1 - shift) * t)``; the
                default ``1.0`` leaves it unchanged.
        """
        self.num_inference_steps = num_inference_steps
        self._step_index = None

        stage_T_start = self.Timesteps_per_stage[stage_index][0].item()
        stage_T_end = self.Timesteps_per_stage[stage_index][-1].item()

        t_start = self.t_window_per_stage[stage_index][0].item()
        t_end = self.t_window_per_stage[stage_index][-1].item()

        t = np.linspace(t_start, t_end, num_inference_steps, dtype=np.float64)
        t = t / (shift + (1 - shift) * t)

        Timesteps = self.time_linear_to_Timesteps(t, t_start, t_end, stage_T_start, stage_T_end)
        self.Timesteps = torch.from_numpy(Timesteps).to(device=device)

        # Append the terminal time 1.0 so ``step`` can read ``t_next`` on the
        # final update as well.
        self.t = torch.from_numpy(np.append(t, 1.0)).to(device=device, dtype=torch.float64)

    def step(
        self,
        model_output: torch.Tensor,
        sample: torch.Tensor,
        return_dict: bool = True,
    ) -> Union[PixelFlowSchedulerOutput, SchedulerOutput, Tuple[torch.Tensor, ...]]:
        """Advance ``sample`` by one first-order update along the current grid.

        Computes ``sample + (t_next - t) * model_output`` using consecutive
        entries of ``self.t`` and then increments the internal step index.

        NOTE(review): ``self.t`` is only the per-stage inference grid after
        ``set_timesteps`` has run; before that it still holds the training
        grid built in ``__init__``.

        Args:
            model_output: Model prediction for the current sample.
            sample: Current sample; cast to float32 for the update.
            return_dict: Return a ``PixelFlowSchedulerOutput`` when true,
                otherwise a one-element tuple.

        Returns:
            The updated sample, cast back to ``model_output.dtype``, wrapped
            according to ``return_dict``.
        """
        if self._step_index is None:
            self._step_index = 0

        sample = sample.to(torch.float32)
        t = self.t[self._step_index].float()
        t_next = self.t[self._step_index + 1].float()

        prev_sample = sample + (t_next - t) * model_output
        self._step_index += 1

        if not return_dict:
            return (prev_sample.to(model_output.dtype),)

        return PixelFlowSchedulerOutput(prev_sample=prev_sample.to(model_output.dtype))

    @property
    def step_index(self) -> Optional[int]:
        """Index of the next update, or ``None`` before the first ``step`` call."""
        return self._step_index
PixelFlow-T2I/text_encoder/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "T5ForConditionalGeneration"
4
+ ],
5
+ "d_ff": 5120,
6
+ "d_kv": 64,
7
+ "d_model": 2048,
8
+ "decoder_start_token_id": 0,
9
+ "dropout_rate": 0.1,
10
+ "eos_token_id": 1,
11
+ "feed_forward_proj": "gated-gelu",
12
+ "initializer_factor": 1.0,
13
+ "is_encoder_decoder": true,
14
+ "layer_norm_epsilon": 1e-06,
15
+ "model_type": "t5",
16
+ "n_positions": 512,
17
+ "num_decoder_layers": 24,
18
+ "num_heads": 32,
19
+ "num_layers": 24,
20
+ "output_past": true,
21
+ "pad_token_id": 0,
22
+ "relative_attention_max_distance": 128,
23
+ "relative_attention_num_buckets": 32,
24
+ "task_specific_params": {
25
+ "summarization": {
26
+ "early_stopping": true,
27
+ "length_penalty": 2.0,
28
+ "max_length": 200,
29
+ "min_length": 30,
30
+ "no_repeat_ngram_size": 3,
31
+ "num_beams": 4,
32
+ "prefix": "summarize: "
33
+ },
34
+ "translation_en_to_de": {
35
+ "early_stopping": true,
36
+ "max_length": 300,
37
+ "num_beams": 4,
38
+ "prefix": "translate English to German: "
39
+ },
40
+ "translation_en_to_fr": {
41
+ "early_stopping": true,
42
+ "max_length": 300,
43
+ "num_beams": 4,
44
+ "prefix": "translate English to French: "
45
+ },
46
+ "translation_en_to_ro": {
47
+ "early_stopping": true,
48
+ "max_length": 300,
49
+ "num_beams": 4,
50
+ "prefix": "translate English to Romanian: "
51
+ }
52
+ },
53
+ "tie_word_embeddings": false,
54
+ "torch_dtype": "float32",
55
+ "transformers_version": "4.24.0.dev0",
56
+ "use_cache": true,
57
+ "vocab_size": 32128
58
+ }
PixelFlow-T2I/text_encoder/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.27.0.dev0"
7
+ }
PixelFlow-T2I/text_encoder/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99196ddfbe886e8ef860f52de979df64890edfc792c3d94ce0502991f347dd18
3
+ size 9449619912
PixelFlow-T2I/text_encoder/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0c677ddeb21009b6efd97146f37fc3a0396707fb5e63ade7aff64884dce9806
3
+ size 1949477672
PixelFlow-T2I/text_encoder/model.safetensors.index.json ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 11925413888
4
+ },
5
+ "weight_map": {
6
+ "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
7
+ "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
8
+ "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
9
+ "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00002.safetensors",
10
+ "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
11
+ "decoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
12
+ "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
13
+ "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
14
+ "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
15
+ "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
16
+ "decoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
17
+ "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
18
+ "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
19
+ "decoder.block.0.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
20
+ "decoder.block.0.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
21
+ "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
22
+ "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
23
+ "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
24
+ "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
25
+ "decoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
26
+ "decoder.block.1.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
27
+ "decoder.block.1.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
28
+ "decoder.block.1.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
29
+ "decoder.block.1.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
30
+ "decoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
31
+ "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
32
+ "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
33
+ "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
34
+ "decoder.block.1.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
35
+ "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
36
+ "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
37
+ "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
38
+ "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
39
+ "decoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
40
+ "decoder.block.10.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
41
+ "decoder.block.10.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
42
+ "decoder.block.10.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
43
+ "decoder.block.10.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
44
+ "decoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
45
+ "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
46
+ "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
47
+ "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
48
+ "decoder.block.10.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
49
+ "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
50
+ "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
51
+ "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
52
+ "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
53
+ "decoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
54
+ "decoder.block.11.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
55
+ "decoder.block.11.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
56
+ "decoder.block.11.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
57
+ "decoder.block.11.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
58
+ "decoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
59
+ "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
60
+ "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
61
+ "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
62
+ "decoder.block.11.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
63
+ "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
64
+ "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
65
+ "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
66
+ "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
67
+ "decoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
68
+ "decoder.block.12.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
69
+ "decoder.block.12.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
70
+ "decoder.block.12.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
71
+ "decoder.block.12.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
72
+ "decoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
73
+ "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
74
+ "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
75
+ "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
76
+ "decoder.block.12.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
77
+ "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
78
+ "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
79
+ "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
80
+ "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
81
+ "decoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
82
+ "decoder.block.13.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
83
+ "decoder.block.13.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
84
+ "decoder.block.13.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
85
+ "decoder.block.13.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
86
+ "decoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
87
+ "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
88
+ "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
89
+ "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
90
+ "decoder.block.13.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
91
+ "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
92
+ "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
93
+ "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
94
+ "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
95
+ "decoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
96
+ "decoder.block.14.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
97
+ "decoder.block.14.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
98
+ "decoder.block.14.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
99
+ "decoder.block.14.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
100
+ "decoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
101
+ "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
102
+ "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
103
+ "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
104
+ "decoder.block.14.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
105
+ "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
106
+ "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
107
+ "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
108
+ "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
109
+ "decoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
110
+ "decoder.block.15.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
111
+ "decoder.block.15.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
112
+ "decoder.block.15.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
113
+ "decoder.block.15.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
114
+ "decoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
115
+ "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
116
+ "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
117
+ "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
118
+ "decoder.block.15.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
119
+ "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
120
+ "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
121
+ "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
122
+ "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
123
+ "decoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
124
+ "decoder.block.16.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
125
+ "decoder.block.16.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
126
+ "decoder.block.16.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
127
+ "decoder.block.16.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
128
+ "decoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
129
+ "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
130
+ "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
131
+ "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
132
+ "decoder.block.16.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
133
+ "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
134
+ "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
135
+ "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
136
+ "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
137
+ "decoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
138
+ "decoder.block.17.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
139
+ "decoder.block.17.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
140
+ "decoder.block.17.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
141
+ "decoder.block.17.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
142
+ "decoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
143
+ "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
144
+ "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
145
+ "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
146
+ "decoder.block.17.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
147
+ "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
148
+ "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
149
+ "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
150
+ "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
151
+ "decoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
152
+ "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
153
+ "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
154
+ "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
155
+ "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
156
+ "decoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
157
+ "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
158
+ "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
159
+ "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
160
+ "decoder.block.18.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
161
+ "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
162
+ "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
163
+ "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
164
+ "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
165
+ "decoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
166
+ "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
167
+ "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
168
+ "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
169
+ "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
170
+ "decoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
171
+ "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
172
+ "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
173
+ "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
174
+ "decoder.block.19.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
175
+ "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
176
+ "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
177
+ "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
178
+ "decoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
179
+ "decoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
180
+ "decoder.block.2.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
181
+ "decoder.block.2.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
182
+ "decoder.block.2.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
183
+ "decoder.block.2.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
184
+ "decoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
185
+ "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
186
+ "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
187
+ "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
188
+ "decoder.block.2.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
189
+ "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
190
+ "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
191
+ "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
192
+ "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
193
+ "decoder.block.20.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
194
+ "decoder.block.20.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
195
+ "decoder.block.20.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
196
+ "decoder.block.20.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
197
+ "decoder.block.20.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
198
+ "decoder.block.20.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
199
+ "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
200
+ "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
201
+ "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
202
+ "decoder.block.20.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
203
+ "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
204
+ "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
205
+ "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
206
+ "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
207
+ "decoder.block.21.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
208
+ "decoder.block.21.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
209
+ "decoder.block.21.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
210
+ "decoder.block.21.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
211
+ "decoder.block.21.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
212
+ "decoder.block.21.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
213
+ "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
214
+ "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
215
+ "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
216
+ "decoder.block.21.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
217
+ "decoder.block.22.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
218
+ "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
219
+ "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
220
+ "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
221
+ "decoder.block.22.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
222
+ "decoder.block.22.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
223
+ "decoder.block.22.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
224
+ "decoder.block.22.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
225
+ "decoder.block.22.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
226
+ "decoder.block.22.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
227
+ "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
228
+ "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
229
+ "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
230
+ "decoder.block.22.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
231
+ "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00002-of-00002.safetensors",
232
+ "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00002-of-00002.safetensors",
233
+ "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00002-of-00002.safetensors",
234
+ "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00002-of-00002.safetensors",
235
+ "decoder.block.23.layer.0.layer_norm.weight": "model-00002-of-00002.safetensors",
236
+ "decoder.block.23.layer.1.EncDecAttention.k.weight": "model-00002-of-00002.safetensors",
237
+ "decoder.block.23.layer.1.EncDecAttention.o.weight": "model-00002-of-00002.safetensors",
238
+ "decoder.block.23.layer.1.EncDecAttention.q.weight": "model-00002-of-00002.safetensors",
239
+ "decoder.block.23.layer.1.EncDecAttention.v.weight": "model-00002-of-00002.safetensors",
240
+ "decoder.block.23.layer.1.layer_norm.weight": "model-00002-of-00002.safetensors",
241
+ "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00002.safetensors",
242
+ "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00002.safetensors",
243
+ "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00002-of-00002.safetensors",
244
+ "decoder.block.23.layer.2.layer_norm.weight": "model-00002-of-00002.safetensors",
245
+ "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
246
+ "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
247
+ "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
248
+ "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
249
+ "decoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
250
+ "decoder.block.3.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
251
+ "decoder.block.3.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
252
+ "decoder.block.3.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
253
+ "decoder.block.3.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
254
+ "decoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
255
+ "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
256
+ "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
257
+ "decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
258
+ "decoder.block.3.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
259
+ "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
260
+ "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
261
+ "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
262
+ "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
263
+ "decoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
264
+ "decoder.block.4.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
265
+ "decoder.block.4.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
266
+ "decoder.block.4.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
267
+ "decoder.block.4.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
268
+ "decoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
269
+ "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
270
+ "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
271
+ "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
272
+ "decoder.block.4.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
273
+ "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
274
+ "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
275
+ "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
276
+ "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
277
+ "decoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
278
+ "decoder.block.5.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
279
+ "decoder.block.5.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
280
+ "decoder.block.5.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
281
+ "decoder.block.5.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
282
+ "decoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
283
+ "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
284
+ "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
285
+ "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
286
+ "decoder.block.5.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
287
+ "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
288
+ "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
289
+ "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
290
+ "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
291
+ "decoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
292
+ "decoder.block.6.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
293
+ "decoder.block.6.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
294
+ "decoder.block.6.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
295
+ "decoder.block.6.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
296
+ "decoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
297
+ "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
298
+ "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
299
+ "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
300
+ "decoder.block.6.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
301
+ "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
302
+ "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
303
+ "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
304
+ "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
305
+ "decoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
306
+ "decoder.block.7.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
307
+ "decoder.block.7.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
308
+ "decoder.block.7.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
309
+ "decoder.block.7.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
310
+ "decoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
311
+ "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
312
+ "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
313
+ "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
314
+ "decoder.block.7.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
315
+ "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
316
+ "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
317
+ "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
318
+ "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
319
+ "decoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
320
+ "decoder.block.8.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
321
+ "decoder.block.8.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
322
+ "decoder.block.8.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
323
+ "decoder.block.8.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
324
+ "decoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
325
+ "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
326
+ "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
327
+ "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
328
+ "decoder.block.8.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
329
+ "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
330
+ "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
331
+ "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
332
+ "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
333
+ "decoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
334
+ "decoder.block.9.layer.1.EncDecAttention.k.weight": "model-00001-of-00002.safetensors",
335
+ "decoder.block.9.layer.1.EncDecAttention.o.weight": "model-00001-of-00002.safetensors",
336
+ "decoder.block.9.layer.1.EncDecAttention.q.weight": "model-00001-of-00002.safetensors",
337
+ "decoder.block.9.layer.1.EncDecAttention.v.weight": "model-00001-of-00002.safetensors",
338
+ "decoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
339
+ "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
340
+ "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
341
+ "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
342
+ "decoder.block.9.layer.2.layer_norm.weight": "model-00001-of-00002.safetensors",
343
+ "decoder.embed_tokens.weight": "model-00001-of-00002.safetensors",
344
+ "decoder.final_layer_norm.weight": "model-00002-of-00002.safetensors",
345
+ "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
346
+ "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
347
+ "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
348
+ "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00002.safetensors",
349
+ "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
350
+ "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
351
+ "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
352
+ "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
353
+ "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
354
+ "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
355
+ "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
356
+ "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
357
+ "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
358
+ "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
359
+ "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
360
+ "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
361
+ "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
362
+ "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
363
+ "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
364
+ "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
365
+ "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
366
+ "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
367
+ "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
368
+ "encoder.block.10.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
369
+ "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
370
+ "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
371
+ "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
372
+ "encoder.block.10.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
373
+ "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
374
+ "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
375
+ "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
376
+ "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
377
+ "encoder.block.11.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
378
+ "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
379
+ "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
380
+ "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
381
+ "encoder.block.11.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
382
+ "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
383
+ "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
384
+ "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
385
+ "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
386
+ "encoder.block.12.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
387
+ "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
388
+ "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
389
+ "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
390
+ "encoder.block.12.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
391
+ "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
392
+ "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
393
+ "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
394
+ "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
395
+ "encoder.block.13.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
396
+ "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
397
+ "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
398
+ "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
399
+ "encoder.block.13.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
400
+ "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
401
+ "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
402
+ "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
403
+ "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
404
+ "encoder.block.14.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
405
+ "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
406
+ "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
407
+ "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
408
+ "encoder.block.14.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
409
+ "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
410
+ "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
411
+ "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
412
+ "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
413
+ "encoder.block.15.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
414
+ "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
415
+ "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
416
+ "encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
417
+ "encoder.block.15.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
418
+ "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
419
+ "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
420
+ "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
421
+ "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
422
+ "encoder.block.16.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
423
+ "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
424
+ "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
425
+ "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
426
+ "encoder.block.16.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
427
+ "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
428
+ "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
429
+ "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
430
+ "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
431
+ "encoder.block.17.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
432
+ "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
433
+ "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
434
+ "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
435
+ "encoder.block.17.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
436
+ "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
437
+ "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
438
+ "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
439
+ "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
440
+ "encoder.block.18.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
441
+ "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
442
+ "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
443
+ "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
444
+ "encoder.block.18.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
445
+ "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
446
+ "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
447
+ "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
448
+ "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
449
+ "encoder.block.19.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
450
+ "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
451
+ "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
452
+ "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
453
+ "encoder.block.19.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
454
+ "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
455
+ "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
456
+ "encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
457
+ "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
458
+ "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
459
+ "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
460
+ "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
461
+ "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
462
+ "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
463
+ "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
464
+ "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
465
+ "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
466
+ "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
467
+ "encoder.block.20.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
468
+ "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
469
+ "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
470
+ "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
471
+ "encoder.block.20.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
472
+ "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
473
+ "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
474
+ "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
475
+ "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
476
+ "encoder.block.21.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
477
+ "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
478
+ "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
479
+ "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
480
+ "encoder.block.21.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
481
+ "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
482
+ "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
483
+ "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
484
+ "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
485
+ "encoder.block.22.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
486
+ "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
487
+ "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
488
+ "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
489
+ "encoder.block.22.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
490
+ "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
491
+ "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
492
+ "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
493
+ "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
494
+ "encoder.block.23.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
495
+ "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
496
+ "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
497
+ "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
498
+ "encoder.block.23.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
499
+ "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
500
+ "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
501
+ "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
502
+ "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
503
+ "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
504
+ "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
505
+ "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
506
+ "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
507
+ "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
508
+ "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
509
+ "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
510
+ "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
511
+ "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
512
+ "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
513
+ "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
514
+ "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
515
+ "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
516
+ "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
517
+ "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
518
+ "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
519
+ "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
520
+ "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
521
+ "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
522
+ "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
523
+ "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
524
+ "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
525
+ "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
526
+ "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
527
+ "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
528
+ "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
529
+ "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
530
+ "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
531
+ "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
532
+ "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
533
+ "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
534
+ "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
535
+ "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
536
+ "encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
537
+ "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
538
+ "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
539
+ "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
540
+ "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
541
+ "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
542
+ "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
543
+ "encoder.block.7.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
544
+ "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
545
+ "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
546
+ "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
547
+ "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
548
+ "encoder.block.8.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
549
+ "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
550
+ "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
551
+ "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
552
+ "encoder.block.8.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
553
+ "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00001-of-00002.safetensors",
554
+ "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00001-of-00002.safetensors",
555
+ "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00001-of-00002.safetensors",
556
+ "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00001-of-00002.safetensors",
557
+ "encoder.block.9.layer.0.layer_norm.weight": "model-00001-of-00002.safetensors",
558
+ "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00002.safetensors",
559
+ "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00002.safetensors",
560
+ "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00002.safetensors",
561
+ "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00002.safetensors",
562
+ "encoder.embed_tokens.weight": "model-00001-of-00002.safetensors",
563
+ "encoder.final_layer_norm.weight": "model-00001-of-00002.safetensors",
564
+ "lm_head.weight": "model-00002-of-00002.safetensors",
565
+ "shared.weight": "model-00001-of-00002.safetensors"
566
+ }
567
+ }
PixelFlow-T2I/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
PixelFlow-T2I/tokenizer/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
PixelFlow-T2I/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
PixelFlow-T2I/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "extra_ids": 100,
106
+ "model_max_length": 512,
107
+ "name_or_path": "google/t5-v1_1-small",
108
+ "pad_token": "<pad>",
109
+ "sp_model_kwargs": {},
110
+ "special_tokens_map_file": "/home/arthur_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-small/snapshots/fb7e6cba609f7bab11c614294bc04f82f613c7b1/special_tokens_map.json",
111
+ "tokenizer_class": "T5Tokenizer",
112
+ "unk_token": "<unk>"
113
+ }
PixelFlow-T2I/transformer/__pycache__/modeling_pixelflow.cpython-312.pyc ADDED
Binary file (24.1 kB). View file
 
PixelFlow-T2I/transformer/__pycache__/transformer_pixelflow.cpython-312.pyc ADDED
Binary file (3.83 kB). View file
 
PixelFlow-T2I/transformer/config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "PixelFlowTransformer2DModel",
3
+ "_diffusers_version": "0.36.0",
4
+ "attention_bias": true,
5
+ "attention_head_dim": 72,
6
+ "cross_attention_dim": 2048,
7
+ "depth": 28,
8
+ "dropout": 0.0,
9
+ "in_channels": 3,
10
+ "init_weights": false,
11
+ "num_attention_heads": 16,
12
+ "num_classes": 0,
13
+ "out_channels": 3,
14
+ "patch_size": 4,
15
+ "sample_size": 1024
16
+ }
PixelFlow-T2I/transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a62a11eef9c84f80ff482e996311666546c032270a7ce024684c455cf800251c
3
+ size 3528583392
PixelFlow-T2I/transformer/modeling_pixelflow.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import warnings
7
+
8
+ from diffusers.models.embeddings import LabelEmbedding, TimestepEmbedding, Timesteps
9
+
10
+ try:
11
+ from flash_attn import flash_attn_varlen_func
12
+ except ImportError:
13
+ warnings.warn("`flash-attn` is not installed. Training mode may not work properly.", UserWarning)
14
+ flash_attn_varlen_func = None
15
+
16
+
17
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """Apply rotary position embeddings to ``x``.

    The last dimension of ``x`` is treated as interleaved pairs
    ``(x0, x1), (x2, x3), ...``; each pair is rotated using the matching
    cosine/sine entries.

    Args:
        x: 4-D tensor (``flatten(3)`` below relies on exactly four dimensions).
            Call sites in this file pass ``(batch, heads, seq, head_dim)``.
        freqs_cis: Tensor whose last axis stacks ``(cos, sin)``. After the last
            axis is split, two leading singleton axes are added, so the
            remaining axes must broadcast against the trailing axes of ``x``
            (presumably ``(seq, head_dim)`` -- confirm against the pipeline
            that builds it).

    Returns:
        A single tensor with the same shape and dtype as ``x``.
    """
    cos, sin = freqs_cis.unbind(-1)
    # Add broadcast axes for the two leading dimensions of ``x``.
    cos = cos[None, None]
    sin = sin[None, None]
    cos, sin = cos.to(x.device), sin.to(x.device)

    # Build the 90-degree rotated companion: (a, b) -> (-b, a) for every pair.
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)

    # The arithmetic runs in float32 and is cast back to the input dtype.
    out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

    return out
31
+
32
+
33
class PatchEmbed(nn.Module):
    """Project image patches to embedding vectors with a strided convolution.

    In eval mode the input is an image batch and the convolution is applied
    directly. In training mode the input is expected to already be flattened
    patches, and the convolution kernel is applied as a plain matrix multiply.
    """

    def __init__(self, patch_size, in_channels, embed_dim, bias=True):
        super().__init__()
        # Kernel size and stride are both ``patch_size``: non-overlapping patches.
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=bias,
        )

    def forward_unfold(self, x):
        """Embed pre-flattened patches by reusing the conv kernel as a matrix."""
        kernel = self.proj.weight
        # (embed_dim, C, p, p) -> (embed_dim, C * p * p)
        flat_kernel = kernel.view(kernel.size(0), -1)
        tokens = x.matmul(flat_kernel.t())

        bias = self.proj.bias
        if bias is not None:
            tokens += bias.to(tokens.dtype)
        return tokens

    def forward(self, x):
        """Return patch tokens; the code path depends on ``self.training``."""
        if self.training:
            return self.forward_unfold(x)
        # (B, embed_dim, H', W') -> (B, H' * W', embed_dim)
        return self.proj(x).flatten(2).transpose(1, 2)
50
+
51
+
52
class AdaLayerNorm(nn.Module):
    """Layer norm whose shift/scale come from a conditioning embedding.

    A single linear layer maps the conditioning vector to six chunks:
    shift/scale/gate for the attention branch and shift/scale/gate for the MLP
    branch. Only the attention shift/scale are applied here; the other four
    chunks are returned for the caller to use.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, timestep, seqlen_list=None):
        """Normalise ``x`` and modulate it with the conditioning ``timestep``.

        Args:
            x: Hidden states to normalise.
            timestep: Conditioning embedding, one row per sample.
            seqlen_list: Optional per-sample token counts. When given, each
                sample's modulation row is repeated that many times and the
                results are concatenated (packed layout); otherwise a
                broadcast axis is inserted at dim 1.

        Returns:
            ``(modulated_x, gate_msa, shift_mlp, scale_mlp, gate_mlp)`` where
            ``modulated_x`` has the dtype of ``x`` and the rest are float32.
        """
        orig_dtype = x.dtype
        modulation = self.linear(self.silu(timestep))

        if seqlen_list is None:
            modulation = modulation.unsqueeze(1)
        else:
            per_token = []
            for sample_emb, n_tokens in zip(modulation, seqlen_list):
                per_token.append(sample_emb[None].expand(n_tokens, -1))
            modulation = torch.cat(per_token)

        chunks = modulation.float().chunk(6, dim=-1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks

        modulated = self.norm(x).float() * (1 + scale_msa) + shift_msa
        return modulated.to(orig_dtype), gate_msa, shift_mlp, scale_mlp, gate_mlp
72
+
73
+
74
class FeedForward(nn.Module):
    """Two-layer MLP with a tanh-approximated GELU in between.

    Args:
        dim: Input feature size.
        dim_out: Output feature size; defaults to ``dim``.
        mult: Hidden-size multiplier used when ``inner_dim`` is not given.
        inner_dim: Explicit hidden size; overrides ``mult`` when set.
        bias: Whether both linear layers carry a bias term.
    """

    def __init__(self, dim, dim_out=None, mult=4, inner_dim=None, bias=True):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim
        self.fc1 = nn.Linear(dim, inner_dim, bias=bias)
        self.fc2 = nn.Linear(inner_dim, dim_out, bias=bias)

    def forward(self, hidden_states):
        """Return ``fc2(gelu_tanh(fc1(hidden_states)))``."""
        expanded = self.fc1(hidden_states)
        activated = F.gelu(expanded, approximate="tanh")
        return self.fc2(activated)
87
+
88
+
89
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned scale."""

    def __init__(self, dim: int, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` in float32 and return it in the input dtype."""
        x_fp32 = x.float()
        mean_square = x_fp32.pow(2).mean(-1, keepdim=True)
        normed = x_fp32 * torch.rsqrt(mean_square + self.eps)
        return (self.weight * normed).to(x.dtype)
98
+
99
+
100
class Attention(nn.Module):
    """Multi-head attention with RMS-normalised queries/keys and rotary embeddings.

    Two execution paths are supported by :meth:`forward`:

    * padded batches (``max_seqlen_q is None``), evaluated with
      ``F.scaled_dot_product_attention``; only allowed outside training;
    * packed variable-length sequences, evaluated with
      ``flash_attn_varlen_func`` from the optional ``flash-attn`` package.

    Args:
        q_dim: Feature size of the query input.
        kv_dim: Feature size of the key/value input; defaults to ``q_dim``.
        heads: Number of attention heads.
        head_dim: Feature size per head.
        dropout: Attention dropout probability (applied on the padded path
            while training; the packed path does not pass it on).
        bias: Whether the four projection layers carry a bias term.
    """

    def __init__(self, q_dim, kv_dim=None, heads=8, head_dim=64, dropout=0.0, bias=False):
        super().__init__()
        self.q_dim = q_dim
        self.kv_dim = kv_dim if kv_dim is not None else q_dim
        self.inner_dim = head_dim * heads
        self.dropout = dropout
        self.head_dim = head_dim
        self.num_heads = heads

        self.q_proj = nn.Linear(self.q_dim, self.inner_dim, bias=bias)
        self.k_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.v_proj = nn.Linear(self.kv_dim, self.inner_dim, bias=bias)
        self.o_proj = nn.Linear(self.inner_dim, self.q_dim, bias=bias)
        # Normalisation is applied over the full projected width, before the
        # projections are split into heads.
        self.q_norm = RMSNorm(self.inner_dim)
        self.k_norm = RMSNorm(self.inner_dim)

    def prepare_attention_mask(self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3):
        """Pad and replicate an attention mask so it matches the head layout.

        Args:
            attention_mask: Mask whose last axis indexes key positions, or
                ``None``.
            target_length: Required size of the last axis (the key length).
            batch_size: Batch size, used to decide whether the mask still has
                to be repeated per head when ``out_dim == 3``.
            out_dim: ``3`` repeats the mask along dim 0 (one copy per head);
                ``4`` inserts a head axis at dim 1 and repeats along it.

        Returns:
            The adjusted mask, or ``None`` when no mask was given.
        """
        head_size = self.num_heads
        if attention_mask is None:
            return attention_mask

        current_length: int = attention_mask.shape[-1]
        # Pad only by the number of missing key positions. Padding by the full
        # ``target_length`` would make the mask longer than the key sequence.
        remaining_length = target_length - current_length
        if remaining_length > 0:
            attention_mask = F.pad(attention_mask, (0, remaining_length), value=0.0)

        if out_dim == 3:
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

        return attention_mask

    def forward(
        self,
        inputs_q,
        inputs_kv,
        attention_mask=None,
        cross_attention=False,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        max_seqlen_q=None,
        max_seqlen_k=None,
    ):
        """Compute attention output for ``inputs_q`` attending to ``inputs_kv``.

        Args:
            inputs_q: Query input.
            inputs_kv: Key/value input; ``None`` means self-attention on
                ``inputs_q``.
            attention_mask: Optional mask, used on the padded path only.
            cross_attention: When true, rotary embeddings are applied to the
                queries only and the keys are left untouched.
            rope_pos_embed: Rotary table passed to ``apply_rotary_emb``.
            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k: Forwarded
                to ``flash_attn_varlen_func`` on the packed path.
                ``max_seqlen_q is None`` selects the padded path.

        Returns:
            The attention output after the output projection.

        Raises:
            AssertionError: If the padded path is used while training.
            ImportError: If the packed path is requested but ``flash-attn``
                is not installed.
        """
        inputs_kv = inputs_q if inputs_kv is None else inputs_kv

        query_states = self.q_proj(inputs_q)
        key_states = self.k_proj(inputs_kv)
        value_states = self.v_proj(inputs_kv)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        if max_seqlen_q is None:
            assert not self.training, "PixelFlow needs sequence packing for training"

            bsz, q_len, _ = inputs_q.shape
            _, kv_len, _ = inputs_kv.shape

            # (B, L, H * D) -> (B, H, L, D)
            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
            key_states = key_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)

            query_states = apply_rotary_emb(query_states, rope_pos_embed)
            if not cross_attention:
                key_states = apply_rotary_emb(key_states, rope_pos_embed)

            if attention_mask is not None:
                attention_mask = self.prepare_attention_mask(attention_mask, kv_len, bsz)
                # (B * H, ..., L_k) -> (B, H, ..., L_k) as expected by SDPA.
                attention_mask = attention_mask.view(bsz, self.num_heads, -1, attention_mask.shape[-1])

            attn_output = F.scaled_dot_product_attention(
                query_states,
                key_states,
                value_states,
                attn_mask=attention_mask,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=False,
            )

            attn_output = attn_output.transpose(1, 2).contiguous()
            attn_output = attn_output.view(bsz, q_len, self.inner_dim)
            attn_output = self.o_proj(attn_output)
            return attn_output

        # Packed path. The module-level import falls back to ``None`` when
        # flash-attn is missing; fail with a clear message instead of
        # "'NoneType' object is not callable".
        if flash_attn_varlen_func is None:
            raise ImportError(
                "`flash-attn` is required for packed variable-length attention "
                "(max_seqlen_q was provided) but it is not installed."
            )

        # All leading axes are flattened into one token axis: (T, H, D).
        query_states = query_states.view(-1, self.num_heads, self.head_dim)
        key_states = key_states.view(-1, self.num_heads, self.head_dim)
        value_states = value_states.view(-1, self.num_heads, self.head_dim)

        # apply_rotary_emb expects 4-D input, so go (T, H, D) -> (1, H, T, D)
        # and back again afterwards.
        query_states = apply_rotary_emb(query_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)
        if not cross_attention:
            key_states = apply_rotary_emb(key_states.permute(1, 0, 2)[None], rope_pos_embed)[0].permute(1, 0, 2)

        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
        )

        attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)
        return attn_output
209
+
210
+
211
class TransformerBlock(nn.Module):
    """One PixelFlow layer: gated self-attention, optional cross-attention, gated MLP.

    The conditioning embedding drives ``norm1`` (an ``AdaLayerNorm``), which
    returns the modulated input for self-attention plus the gate/shift/scale
    values reused for the MLP branch. Cross-attention is only created when
    ``cross_attention_dim`` is given.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=False,
    ):
        super().__init__()
        # Settings shared by the self- and cross-attention modules.
        shared_attn_kwargs = dict(
            heads=num_attention_heads,
            head_dim=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
        )

        self.norm1 = AdaLayerNorm(dim)
        self.attn1 = Attention(q_dim=dim, kv_dim=None, **shared_attn_kwargs)

        if cross_attention_dim is None:
            self.attn2 = None
        else:
            self.norm2 = RMSNorm(dim, eps=1e-6)
            self.attn2 = Attention(q_dim=dim, kv_dim=cross_attention_dim, **shared_attn_kwargs)

        self.norm3 = RMSNorm(dim, eps=1e-6)
        self.mlp = FeedForward(dim)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        rope_pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Run the block and return the updated hidden states.

        Args:
            hidden_states: Token features entering the block.
            encoder_hidden_states: Key/value source for cross-attention.
            encoder_attention_mask: Mask passed to cross-attention.
            timestep: Conditioning embedding consumed by ``norm1``.
            rope_pos_embed: Rotary table handed to both attention modules.
            cu_seqlens_q, cu_seqlens_k: Cumulative sequence lengths forwarded
                to the attention modules (packed layout).
            seqlen_list_q, seqlen_list_k: Per-sample token counts; their
                maxima are passed on as ``max_seqlen_q`` / ``max_seqlen_k``.
                ``None`` selects the padded layout.
        """
        modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
            hidden_states, timestep, seqlen_list_q
        )

        longest_q = None if seqlen_list_q is None else max(seqlen_list_q)

        # Self-attention: keys use the query sequence lengths as well.
        self_attn = self.attn1(
            inputs_q=modulated,
            inputs_kv=None,
            attention_mask=None,
            cross_attention=False,
            rope_pos_embed=rope_pos_embed,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_q,
            max_seqlen_q=longest_q,
            max_seqlen_k=longest_q,
        )
        self_attn = (gate_msa * self_attn.float()).to(self_attn.dtype)
        hidden_states = self_attn + hidden_states

        if self.attn2 is not None:
            longest_k = None if seqlen_list_k is None else max(seqlen_list_k)
            cross_attn = self.attn2(
                inputs_q=self.norm2(hidden_states),
                inputs_kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                cross_attention=True,
                rope_pos_embed=rope_pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=longest_q,
                max_seqlen_k=longest_k,
            )
            # The cross-attention residual is not gated.
            hidden_states = hidden_states + cross_attn

        # MLP branch: RMS norm, then the shift/scale produced by norm1.
        mlp_input = self.norm3(hidden_states)
        mlp_input = (mlp_input.float() * (1 + scale_mlp) + shift_mlp).to(mlp_input.dtype)
        mlp_out = self.mlp(mlp_input)
        mlp_out = (gate_mlp * mlp_out.float()).to(mlp_out.dtype)
        hidden_states = mlp_out + hidden_states

        return hidden_states
299
+
300
+
301
class PixelFlowModel(torch.nn.Module):
    """Pixel-space transformer backbone.

    Images are split into patches, embedded, processed by ``depth``
    ``TransformerBlock`` layers conditioned on timestep, latent size and
    (optionally) a class label, and projected back to pixels. Text or other
    encoder states enter through the blocks' cross-attention when
    ``cross_attention_dim`` is set.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        num_attention_heads,
        attention_head_dim,
        depth,
        patch_size,
        dropout=0.0,
        cross_attention_dim=None,
        attention_bias=True,
        num_classes=0,
        init_weights=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.attention_head_dim = attention_head_dim
        self.num_classes = num_classes
        self.out_channels = out_channels

        embed_dim = num_attention_heads * attention_head_dim
        self.patch_embed = PatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)

        # One sinusoidal projection is shared by the timestep and latent-size
        # inputs; each has its own embedding MLP.
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        self.latent_size_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embed_dim)
        if self.num_classes > 0:
            self.class_embedder = LabelEmbedding(num_classes, embed_dim, dropout_prob=0.1)

        blocks = []
        for _ in range(depth):
            blocks.append(
                TransformerBlock(
                    dim=embed_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    attention_bias=attention_bias,
                )
            )
        self.transformer_blocks = nn.ModuleList(blocks)

        self.norm_out = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out_1 = nn.Linear(embed_dim, 2 * embed_dim)
        self.proj_out_2 = nn.Linear(embed_dim, patch_size * patch_size * out_channels)

        if init_weights:
            self.initialize_from_scratch()

    def initialize_from_scratch(self):
        """Initialise all weights for training from scratch.

        Linear layers get Xavier-uniform weights and zero bias; embedders get
        normal(std=0.02) weights; the AdaLayerNorm modulation layers and both
        output projections are zeroed.
        """

        def _xavier_linear(module):
            if not isinstance(module, nn.Linear):
                return
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

        self.apply(_xavier_linear)

        # Treat the conv kernel like a linear layer over flattened patches.
        conv_weight = self.patch_embed.proj.weight.data
        nn.init.xavier_uniform_(conv_weight.view([conv_weight.shape[0], -1]))
        nn.init.constant_(self.patch_embed.proj.bias, 0)

        for embedder in (self.timestep_embedder, self.latent_size_embedder):
            for layer in (embedder.linear_1, embedder.linear_2):
                nn.init.normal_(layer.weight, std=0.02)

        if self.num_classes > 0:
            nn.init.normal_(self.class_embedder.embedding_table.weight, std=0.02)

        for block in self.transformer_blocks:
            nn.init.constant_(block.norm1.linear.weight, 0)
            nn.init.constant_(block.norm1.linear.bias, 0)

        for out_layer in (self.proj_out_1, self.proj_out_2):
            nn.init.constant_(out_layer.weight, 0)
            nn.init.constant_(out_layer.bias, 0)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        class_labels=None,
        timestep=None,
        latent_size=None,
        encoder_attention_mask=None,
        pos_embed=None,
        cu_seqlens_q=None,
        cu_seqlens_k=None,
        seqlen_list_q=None,
        seqlen_list_k=None,
    ):
        """Predict the output for ``hidden_states``.

        Args:
            hidden_states: Model input; its last two axes are read as height
                and width and it is cast to float32 before patch embedding.
            encoder_hidden_states: Optional cross-attention context.
            class_labels: Class ids, used only when ``num_classes > 0``.
            timestep: Timestep values fed to the sinusoidal projection.
            latent_size: Size values embedded the same way as ``timestep``
                and added to the conditioning.
            encoder_attention_mask: Optional mask. A 2-D mask is treated as
                1 = keep / 0 = drop and converted to an additive bias.
            pos_embed: Rotary table forwarded to every block.
            cu_seqlens_q, cu_seqlens_k, seqlen_list_q, seqlen_list_k: Packed
                sequence metadata forwarded to the blocks.

        Returns:
            In training mode, per-token predictions reshaped to
            ``(tokens, out_channels * patch_size * patch_size)``; otherwise an
            image tensor ``(N, out_channels, H, W)`` rebuilt from the patches.
        """
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            keep = encoder_attention_mask.to(hidden_states.dtype)
            # 0 where attended, -10000 where masked; add a broadcast axis.
            encoder_attention_mask = ((1 - keep) * -10000.0).unsqueeze(1)

        orig_height, orig_width = hidden_states.shape[-2], hidden_states.shape[-1]
        hidden_states = self.patch_embed(hidden_states.to(torch.float32))

        timestep_feats = self.time_proj(timestep)
        conditioning = self.timestep_embedder(timestep_feats.to(dtype=hidden_states.dtype))

        if self.num_classes > 0:
            conditioning += self.class_embedder(class_labels)

        size_feats = self.time_proj(latent_size)
        conditioning += self.latent_size_embedder(size_feats.to(dtype=hidden_states.dtype))

        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                timestep=conditioning,
                rope_pos_embed=pos_embed,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                seqlen_list_q=seqlen_list_q,
                seqlen_list_k=seqlen_list_k,
            )

        # Final adaptive layer norm driven by the same conditioning vector.
        shift, scale = self.proj_out_1(F.silu(conditioning)).float().chunk(2, dim=1)
        if seqlen_list_q is None:
            shift = shift.unsqueeze(1)
            scale = scale.unsqueeze(1)
        else:
            # Packed layout: repeat each sample's row once per token.
            shift_rows = []
            for sample_shift, n_tokens in zip(shift, seqlen_list_q):
                shift_rows.append(sample_shift[None].expand(n_tokens, -1))
            shift = torch.cat(shift_rows)

            scale_rows = []
            for sample_scale, n_tokens in zip(scale, seqlen_list_q):
                scale_rows.append(sample_scale[None].expand(n_tokens, -1))
            scale = torch.cat(scale_rows)

        hidden_states = (self.norm_out(hidden_states).float() * (1 + scale) + shift).to(hidden_states.dtype)
        hidden_states = self.proj_out_2(hidden_states)

        if self.training:
            # (tokens, p * p * C) -> (tokens, p, p, C) -> (tokens, C * p * p)
            per_patch = hidden_states.reshape(
                hidden_states.shape[0], self.patch_size, self.patch_size, self.out_channels
            )
            return per_patch.permute(0, 3, 1, 2).flatten(1)

        # Inference: stitch the patch grid back into an image.
        height = orig_height // self.patch_size
        width = orig_width // self.patch_size
        hidden_states = hidden_states.reshape(
            shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
        )
        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
        output = hidden_states.reshape(
            shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
        )

        return output
PixelFlow-T2I/transformer/transformer_pixelflow.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple, Union
3
+
4
+ import torch
5
+
6
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
7
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from diffusers.utils import BaseOutput
10
+
11
+ from modeling_pixelflow import PixelFlowModel
12
+
13
+
14
@dataclass
class PixelFlowTransformerOutput(BaseOutput):
    """Output container holding the transformer prediction in ``sample``."""

    # Prediction tensor produced by the transformer.
    sample: torch.FloatTensor
17
+
18
+
19
class PixelFlowTransformer2DModel(ModelMixin, ConfigMixin):
    """Diffusers wrapper around ``PixelFlowModel`` for pixel-space flow generation.

    The constructor arguments are registered in the diffusers config and
    forwarded to the wrapped ``PixelFlowModel``. ``num_classes`` enables
    class-label conditioning and ``cross_attention_dim`` enables
    cross-attention on encoder hidden states. ``sample_size`` is only recorded
    in the config; it is not passed to the wrapped model.
    """

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        num_attention_heads: int = 16,
        attention_head_dim: int = 72,
        depth: int = 28,
        patch_size: int = 4,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = True,
        num_classes: int = 1000,
        sample_size: int = 256,
        init_weights: bool = True,
    ):
        super().__init__()
        backbone_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            depth=depth,
            patch_size=patch_size,
            dropout=dropout,
            cross_attention_dim=cross_attention_dim,
            attention_bias=attention_bias,
            num_classes=num_classes,
            init_weights=init_weights,
        )
        self.model = PixelFlowModel(**backbone_kwargs)

    @property
    def patch_size(self) -> int:
        """Patch size of the wrapped model."""
        return self.model.patch_size

    @property
    def attention_head_dim(self) -> int:
        """Per-head feature size of the wrapped model."""
        return self.model.attention_head_dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: Optional[torch.Tensor] = None,
        class_labels: Optional[torch.Tensor] = None,
        latent_size: Optional[torch.Tensor] = None,
        pos_embed: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[PixelFlowTransformerOutput, Transformer2DModelOutput, Tuple[torch.Tensor, ...]]:
        """Run the wrapped model and package its prediction.

        All tensor arguments are forwarded unchanged to ``PixelFlowModel``.
        The packed-sequence arguments of the wrapped model are not exposed
        here, so they keep their ``None`` defaults.

        Returns:
            ``Transformer2DModelOutput(sample=...)`` when ``return_dict`` is
            true, otherwise a one-element tuple with the prediction.
        """
        prediction = self.model(
            hidden_states=hidden_states,
            timestep=timestep,
            latent_size=latent_size,
            class_labels=class_labels,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            pos_embed=pos_embed,
        )

        if return_dict:
            return Transformer2DModelOutput(sample=prediction)
        return (prediction,)
README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: diffusers
4
+ pipeline_tag: text-to-image
5
+ tags:
6
+ - diffusers
7
+ - pixelflow
8
+ - image-generation
9
+ - class-conditional
10
+ - flow-matching
11
+ widget:
12
+ - output:
13
+ url: PixelFlow-256/demo.png
14
+ language:
15
+ - en
16
+ ---
17
+
18
+ # BiliSakura/PixelFlow-diffusers
19
+
20
+ Self-contained PixelFlow checkpoints for Hugging Face diffusers. Each subfolder ships its own `pipeline.py`, component modules, and weights.
21
+
22
+ ## Available checkpoints
23
+
24
+ | Subfolder | Task | Resolution | Params |
25
+ | --- | --- | ---: | ---: |
26
+ | [`PixelFlow-256/`](PixelFlow-256/) | class-to-image | 256×256 | 677M |
27
+ | [`PixelFlow-T2I/`](PixelFlow-T2I/) | text-to-image | 1024×1024 | 882M |
28
+
29
+ ## ImageNet class labels
30
+
31
+ For class-conditional [`PixelFlow-256/`](PixelFlow-256/), ImageNet-1k labels live in shared [`labels/`](labels/) at the repo root:
32
+
33
+ | File | Direction | Value format |
34
+ | --- | --- | --- |
35
+ | `labels/id2label_en.json` | id → English | comma-separated synonyms, e.g. `"207": "golden retriever"` |
36
+ | `labels/id2label_cn.json` | id → Chinese | comma-separated synonyms, e.g. `"207": "金毛猎犬"` |
37
+
38
+ After `PixelFlowPipeline.from_pretrained(...)`, the pipeline exposes:
39
+
40
+ - `pipe.id2label` / `pipe.id2label_cn` — inspect id → label correspondence
41
+ - `pipe.labels` / `pipe.labels_cn` — reverse maps (synonym → id)
42
+ - `pipe.get_label_ids("golden retriever")` or `pipe.get_label_ids("金毛猎犬", lang="cn")`
43
+ - `pipe(class_labels="golden retriever", ...)` — string labels resolved automatically
44
+
45
+ ## Demo
46
+
47
+ ![PixelFlow-256 demo](PixelFlow-256/demo.png)
48
+
49
+ ## Load from a local clone
50
+
51
+ ```python
52
+ import sys
53
+ from pathlib import Path
54
+
55
+ repo = Path("BiliSakura/PixelFlow-diffusers").resolve()
56
+ variant = "PixelFlow-256"
57
+
58
+ sys.path.insert(0, str(repo / variant))
59
+ from pipeline import PixelFlowPipeline
60
+
61
+ pipe = PixelFlowPipeline.from_pretrained(str(repo / variant))
62
+ pipe.to("cuda")
63
+
64
+ images = pipe(
65
+ class_labels=207,
66
+ num_inference_steps=[10, 10, 10, 10],
67
+ guidance_scale=4.0,
68
+ ).images
69
+
70
+ # Human-readable ImageNet labels (English or Chinese)
71
+ print(pipe.id2label[207]) # "golden retriever"
72
+ print(pipe.id2label_cn[207]) # "金毛猎犬"
73
+ pipe.get_label_ids("golden retriever") # [207]
74
+ pipe.get_label_ids("金毛猎犬", lang="cn") # [207]
75
+ images = pipe(class_labels="golden retriever", num_inference_steps=[10, 10, 10, 10]).images
76
+ ```
77
+
78
+ ### Text-to-image (`PixelFlow-T2I`)
79
+
80
+ Uses [`google/flan-t5-xl`](https://huggingface.co/google/flan-t5-xl) as the text encoder (loaded from Hugging Face at runtime, not bundled in the repo).
81
+
82
+ ```python
83
+ variant = "PixelFlow-T2I"
84
+ sys.path.insert(0, str(repo / variant))
85
+ from pipeline import PixelFlowPipeline
86
+
87
+ pipe = PixelFlowPipeline.from_pretrained(str(repo / variant))
88
+ pipe.to("cuda")
89
+
90
+ images = pipe(
91
+ prompt="A golden retriever playing in a sunny garden",
92
+ num_inference_steps=[10, 10, 10, 10],
93
+ guidance_scale=4.0,
94
+ ).images
95
+ ```
96
+
97
+ ## Conversion
98
+
99
+ ```bash
100
+ python scripts/convert_pixelflow_to_diffusers.py \
101
+ --checkpoint models/raw/PixelFlow/c2i/model.pt \
102
+ --config models/raw/PixelFlow/c2i/config.yaml \
103
+ --output models/BiliSakura/PixelFlow-diffusers/PixelFlow-256
104
+
105
+ python scripts/convert_pixelflow_to_diffusers.py \
106
+ --checkpoint models/raw/PixelFlow/t2i/model.pt \
107
+ --config models/raw/PixelFlow/t2i/config.yaml \
108
+ --output models/BiliSakura/PixelFlow-diffusers/PixelFlow-T2I \
109
+ --skip-text-encoder
110
+ ```
labels/__pycache__/imagenet_labels.cpython-312.pyc ADDED
Binary file (3.24 kB). View file
 
labels/id2label_cn.json ADDED
@@ -0,0 +1,1002 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "丁鲷",
3
+ "1": "金鱼",
4
+ "2": "大白鲨",
5
+ "3": "虎鲨",
6
+ "4": "锤头鲨",
7
+ "5": "电鳐",
8
+ "6": "黄貂鱼",
9
+ "7": "公鸡",
10
+ "8": "母鸡",
11
+ "9": "鸵鸟",
12
+ "10": "燕雀",
13
+ "11": "金翅雀",
14
+ "12": "家朱雀",
15
+ "13": "灯芯草雀",
16
+ "14": "靛蓝雀,靛蓝鸟",
17
+ "15": "蓝鹀",
18
+ "16": "夜莺",
19
+ "17": "松鸦",
20
+ "18": "喜鹊",
21
+ "19": "山雀",
22
+ "20": "河鸟",
23
+ "21": "鸢(猛禽)",
24
+ "22": "秃头鹰",
25
+ "23": "秃鹫",
26
+ "24": "大灰猫头鹰",
27
+ "25": "欧洲火蝾螈",
28
+ "26": "普通蝾螈",
29
+ "27": "水蜥",
30
+ "28": "斑点蝾螈",
31
+ "29": "蝾螈,泥狗",
32
+ "30": "牛蛙",
33
+ "31": "树蛙",
34
+ "32": "尾蛙,铃蟾蜍,肋蟾蜍,尾蟾蜍",
35
+ "33": "红海龟",
36
+ "34": "皮革龟",
37
+ "35": "泥龟",
38
+ "36": "淡水龟",
39
+ "37": "箱龟",
40
+ "38": "带状壁虎",
41
+ "39": "普通鬣蜥",
42
+ "40": "美国变色龙",
43
+ "41": "鞭尾蜥蜴",
44
+ "42": "飞龙科蜥蜴",
45
+ "43": "褶边蜥蜴",
46
+ "44": "鳄鱼蜥蜴",
47
+ "45": "毒蜥",
48
+ "46": "绿蜥蜴",
49
+ "47": "非洲变色龙",
50
+ "48": "科莫多蜥蜴",
51
+ "49": "非洲鳄,尼罗河鳄鱼",
52
+ "50": "美国鳄鱼,鳄鱼",
53
+ "51": "三角龙",
54
+ "52": "雷蛇,蠕虫蛇",
55
+ "53": "环蛇,环颈蛇",
56
+ "54": "希腊蛇",
57
+ "55": "绿蛇,草蛇",
58
+ "56": "国王蛇",
59
+ "57": "袜带蛇,草蛇",
60
+ "58": "水蛇",
61
+ "59": "藤蛇",
62
+ "60": "夜蛇",
63
+ "61": "大蟒蛇",
64
+ "62": "岩石蟒蛇,岩蛇,蟒蛇",
65
+ "63": "印度眼镜蛇",
66
+ "64": "绿曼巴",
67
+ "65": "海蛇",
68
+ "66": "角腹蛇",
69
+ "67": "菱纹响尾蛇",
70
+ "68": "角响尾蛇",
71
+ "69": "三叶虫",
72
+ "70": "盲蜘蛛",
73
+ "71": "蝎子",
74
+ "72": "黑金花园蜘蛛",
75
+ "73": "谷仓蜘蛛",
76
+ "74": "花园蜘蛛",
77
+ "75": "黑寡妇蜘蛛",
78
+ "76": "狼蛛",
79
+ "77": "狼蜘蛛,狩猎蜘蛛",
80
+ "78": "壁虱",
81
+ "79": "蜈蚣",
82
+ "80": "黑松鸡",
83
+ "81": "松鸡,雷鸟",
84
+ "82": "披肩鸡,披肩榛鸡",
85
+ "83": "草原鸡,草原松鸡",
86
+ "84": "孔雀",
87
+ "85": "鹌鹑",
88
+ "86": "鹧鸪",
89
+ "87": "非洲灰鹦鹉",
90
+ "88": "金刚鹦鹉",
91
+ "89": "硫冠鹦鹉",
92
+ "90": "短尾鹦鹉",
93
+ "91": "褐翅鸦鹃",
94
+ "92": "蜜蜂",
95
+ "93": "犀鸟",
96
+ "94": "蜂鸟",
97
+ "95": "鹟䴕",
98
+ "96": "犀鸟",
99
+ "97": "野鸭",
100
+ "98": "红胸秋沙鸭",
101
+ "99": "鹅",
102
+ "100": "黑天鹅",
103
+ "101": "大象",
104
+ "102": "针鼹鼠",
105
+ "103": "鸭嘴兽",
106
+ "104": "沙袋鼠",
107
+ "105": "考拉,考拉熊",
108
+ "106": "袋熊",
109
+ "107": "水母",
110
+ "108": "海葵",
111
+ "109": "脑珊瑚",
112
+ "110": "扁形虫,扁虫",
113
+ "111": "线虫,蛔虫",
114
+ "112": "海螺",
115
+ "113": "蜗牛",
116
+ "114": "鼻涕虫",
117
+ "115": "海参",
118
+ "116": "石鳖",
119
+ "117": "鹦鹉螺",
120
+ "118": "珍宝蟹",
121
+ "119": "石蟹",
122
+ "120": "招潮蟹",
123
+ "121": "帝王蟹,阿拉斯加蟹,阿拉斯加帝王蟹",
124
+ "122": "美国龙虾,缅因州龙虾",
125
+ "123": "大螯虾",
126
+ "124": "小龙虾",
127
+ "125": "寄居蟹",
128
+ "126": "等足目动物(明虾和螃蟹近亲)",
129
+ "127": "白鹳",
130
+ "128": "黑鹳",
131
+ "129": "鹭",
132
+ "130": "火烈鸟",
133
+ "131": "小蓝鹭",
134
+ "132": "美国鹭,大白鹭",
135
+ "133": "麻鸦",
136
+ "134": "鹤",
137
+ "135": "秧鹤",
138
+ "136": "欧洲水鸡,紫水鸡",
139
+ "137": "沼泽泥母鸡,水母鸡",
140
+ "138": "鸨",
141
+ "139": "红翻石鹬",
142
+ "140": "红背鹬,黑腹滨鹬",
143
+ "141": "红脚鹬",
144
+ "142": "半蹼鹬",
145
+ "143": "蛎鹬",
146
+ "144": "鹈鹕",
147
+ "145": "国王企鹅",
148
+ "146": "信天翁,大海鸟",
149
+ "147": "灰鲸",
150
+ "148": "杀人鲸,逆戟鲸,虎鲸",
151
+ "149": "海牛",
152
+ "150": "海狮",
153
+ "151": "奇瓦瓦",
154
+ "152": "日本猎犬",
155
+ "153": "马尔济斯犬",
156
+ "154": "狮子狗",
157
+ "155": "西施犬",
158
+ "156": "布莱尼姆猎犬",
159
+ "157": "巴比狗",
160
+ "158": "玩具犬",
161
+ "159": "罗得西亚长背猎狗",
162
+ "160": "阿富汗猎犬",
163
+ "161": "猎犬",
164
+ "162": "比格犬,猎兔犬",
165
+ "163": "侦探犬",
166
+ "164": "蓝色快狗",
167
+ "165": "黑褐猎浣熊犬",
168
+ "166": "沃克猎犬",
169
+ "167": "英国猎狐犬",
170
+ "168": "美洲赤狗",
171
+ "169": "俄罗斯猎狼犬",
172
+ "170": "爱尔兰猎狼犬",
173
+ "171": "意大利灰狗",
174
+ "172": "惠比特犬",
175
+ "173": "依比沙猎犬",
176
+ "174": "挪威猎犬",
177
+ "175": "奥达猎犬,水獭猎犬",
178
+ "176": "沙克犬,瞪羚猎犬",
179
+ "177": "苏格兰猎鹿犬,猎鹿犬",
180
+ "178": "威玛猎犬",
181
+ "179": "斯塔福德郡牛头梗,斯塔福德郡斗牛梗",
182
+ "180": "美国斯塔福德郡梗,美国比特斗牛梗,斗牛梗",
183
+ "181": "贝德灵顿梗",
184
+ "182": "边境梗",
185
+ "183": "凯丽蓝梗",
186
+ "184": "爱尔兰梗",
187
+ "185": "诺福克梗",
188
+ "186": "诺维奇梗",
189
+ "187": "约克郡梗",
190
+ "188": "刚毛猎狐梗",
191
+ "189": "莱克兰梗",
192
+ "190": "锡利哈姆梗",
193
+ "191": "艾尔谷犬",
194
+ "192": "凯恩梗",
195
+ "193": "澳大利亚梗",
196
+ "194": "丹迪丁蒙梗",
197
+ "195": "波士顿梗",
198
+ "196": "迷你雪纳瑞犬",
199
+ "197": "巨型雪纳瑞犬",
200
+ "198": "标准雪纳瑞犬",
201
+ "199": "苏格兰梗",
202
+ "200": "西藏梗,菊花狗",
203
+ "201": "丝毛梗",
204
+ "202": "软毛麦色梗",
205
+ "203": "西高地白梗",
206
+ "204": "拉萨阿普索犬",
207
+ "205": "平毛寻回犬",
208
+ "206": "卷毛寻回犬",
209
+ "207": "金毛猎犬",
210
+ "208": "拉布拉多猎犬",
211
+ "209": "乞沙比克猎犬",
212
+ "210": "德国短毛猎犬",
213
+ "211": "维兹拉犬",
214
+ "212": "英国谍犬",
215
+ "213": "爱尔兰雪达犬,红色猎犬",
216
+ "214": "戈登雪达犬",
217
+ "215": "布列塔尼犬猎犬",
218
+ "216": "黄毛,黄毛猎犬",
219
+ "217": "英国史宾格犬",
220
+ "218": "威尔士史宾格犬",
221
+ "219": "可卡犬,英国可卡犬",
222
+ "220": "萨塞克斯猎犬",
223
+ "221": "爱尔兰水猎犬",
224
+ "222": "哥威斯犬",
225
+ "223": "舒柏奇犬",
226
+ "224": "比利时牧羊犬",
227
+ "225": "马里努阿犬",
228
+ "226": "伯瑞犬",
229
+ "227": "凯尔皮犬",
230
+ "228": "匈牙利牧羊犬",
231
+ "229": "老英国牧羊犬",
232
+ "230": "喜乐蒂牧羊犬",
233
+ "231": "牧羊犬",
234
+ "232": "边境牧羊犬",
235
+ "233": "法兰德斯牧牛狗",
236
+ "234": "罗特韦尔犬",
237
+ "235": "德国牧羊犬,德国警犬,阿尔萨斯",
238
+ "236": "多伯曼犬,杜宾犬",
239
+ "237": "迷你杜宾犬",
240
+ "238": "大瑞士山地犬",
241
+ "239": "伯恩山犬",
242
+ "240": "Appenzeller狗",
243
+ "241": "EntleBucher狗",
244
+ "242": "拳师狗",
245
+ "243": "斗牛獒",
246
+ "244": "藏獒",
247
+ "245": "法国斗牛犬",
248
+ "246": "大丹犬",
249
+ "247": "圣伯纳德狗",
250
+ "248": "爱斯基摩犬,哈士奇",
251
+ "249": "雪橇犬,阿拉斯加爱斯基摩狗",
252
+ "250": "哈士奇",
253
+ "251": "达尔马提亚,教练车狗",
254
+ "252": "狮毛狗",
255
+ "253": "巴辛吉狗",
256
+ "254": "哈巴狗,狮子狗",
257
+ "255": "莱昂贝格狗",
258
+ "256": "纽芬兰岛狗",
259
+ "257": "大白熊犬",
260
+ "258": "萨摩耶犬",
261
+ "259": "博美犬",
262
+ "260": "松狮,松狮",
263
+ "261": "荷兰卷尾狮毛狗",
264
+ "262": "布鲁塞尔格林芬犬",
265
+ "263": "彭布洛克威尔士科基犬",
266
+ "264": "威尔士柯基犬",
267
+ "265": "玩具贵宾犬",
268
+ "266": "迷你贵宾犬",
269
+ "267": "标准贵宾犬",
270
+ "268": "墨西哥无毛犬",
271
+ "269": "灰狼",
272
+ "270": "白狼,北极狼",
273
+ "271": "红狼,鬃狼,Canis rufus",
274
+ "272": "狼,草原狼,刷狼,郊狼",
275
+ "273": "澳洲野狗,澳大利亚野犬",
276
+ "274": "豺",
277
+ "275": "非洲猎犬,土狼犬",
278
+ "276": "鬣狗",
279
+ "277": "红狐狸",
280
+ "278": "沙狐",
281
+ "279": "北极狐狸,白狐狸",
282
+ "280": "灰狐狸",
283
+ "281": "虎斑猫",
284
+ "282": "山猫,虎猫",
285
+ "283": "波斯猫",
286
+ "284": "暹罗猫",
287
+ "285": "埃及猫",
288
+ "286": "美洲狮,美洲豹",
289
+ "287": "猞猁,山猫",
290
+ "288": "豹子",
291
+ "289": "雪豹",
292
+ "290": "美洲虎",
293
+ "291": "狮子",
294
+ "292": "老虎",
295
+ "293": "猎豹",
296
+ "294": "棕熊",
297
+ "295": "美洲黑熊",
298
+ "296": "冰熊,北极熊",
299
+ "297": "懒熊",
300
+ "298": "猫鼬",
301
+ "299": "猫鼬,海猫",
302
+ "300": "虎甲虫",
303
+ "301": "瓢虫",
304
+ "302": "土鳖虫",
305
+ "303": "天牛",
306
+ "304": "龟甲虫",
307
+ "305": "粪甲虫",
308
+ "306": "犀牛甲虫",
309
+ "307": "象甲",
310
+ "308": "苍蝇",
311
+ "309": "蜜蜂",
312
+ "310": "蚂蚁",
313
+ "311": "蚱蜢",
314
+ "312": "蟋蟀",
315
+ "313": "竹节虫",
316
+ "314": "蟑螂",
317
+ "315": "螳螂",
318
+ "316": "蝉",
319
+ "317": "叶蝉",
320
+ "318": "草蜻蛉",
321
+ "319": "蜻蜓",
322
+ "320": "豆娘,蜻蛉",
323
+ "321": "优红蛱蝶",
324
+ "322": "小环蝴蝶",
325
+ "323": "君主蝴蝶,大斑蝶",
326
+ "324": "菜粉蝶",
327
+ "325": "白蝴蝶",
328
+ "326": "灰蝶",
329
+ "327": "海星",
330
+ "328": "海胆",
331
+ "329": "海参,海黄瓜",
332
+ "330": "野兔",
333
+ "331": "兔",
334
+ "332": "安哥拉兔",
335
+ "333": "仓鼠",
336
+ "334": "刺猬,豪猪",
337
+ "335": "黑松鼠",
338
+ "336": "土拨鼠",
339
+ "337": "海狸",
340
+ "338": "豚鼠,豚鼠",
341
+ "339": "栗色马",
342
+ "340": "斑马",
343
+ "341": "猪",
344
+ "342": "野猪",
345
+ "343": "疣猪",
346
+ "344": "河马",
347
+ "345": "牛",
348
+ "346": "水牛,亚洲水牛",
349
+ "347": "野牛",
350
+ "348": "公羊",
351
+ "349": "大角羊,洛矶山大角羊",
352
+ "350": "山羊",
353
+ "351": "狷羚",
354
+ "352": "黑斑羚",
355
+ "353": "瞪羚",
356
+ "354": "阿拉伯单峰骆驼,骆驼",
357
+ "355": "羊驼",
358
+ "356": "黄鼠狼",
359
+ "357": "水貂",
360
+ "358": "臭猫",
361
+ "359": "黑足鼬",
362
+ "360": "水獭",
363
+ "361": "臭鼬,木猫",
364
+ "362": "獾",
365
+ "363": "犰狳",
366
+ "364": "树懒",
367
+ "365": "猩猩,婆罗洲猩猩",
368
+ "366": "大猩猩",
369
+ "367": "黑猩猩",
370
+ "368": "长臂猿",
371
+ "369": "合趾猿长臂猿,合趾猿",
372
+ "370": "长尾猴",
373
+ "371": "赤猴",
374
+ "372": "狒狒",
375
+ "373": "恒河猴,猕猴",
376
+ "374": "白头叶猴",
377
+ "375": "疣猴",
378
+ "376": "长鼻猴",
379
+ "377": "狨(美洲产小型长尾猴)",
380
+ "378": "卷尾猴",
381
+ "379": "吼猴",
382
+ "380": "伶猴",
383
+ "381": "蜘蛛猴",
384
+ "382": "松鼠猴",
385
+ "383": "马达加斯加环尾狐猴,鼠狐猴",
386
+ "384": "大狐猴,马达加斯加大狐猴",
387
+ "385": "印度大象,亚洲象",
388
+ "386": "非洲象,非洲象",
389
+ "387": "小熊猫",
390
+ "388": "大熊猫",
391
+ "389": "杖鱼",
392
+ "390": "鳗鱼",
393
+ "391": "银鲑,银鲑鱼",
394
+ "392": "三色刺蝶鱼",
395
+ "393": "海葵鱼",
396
+ "394": "鲟鱼",
397
+ "395": "雀鳝",
398
+ "396": "狮子鱼",
399
+ "397": "河豚",
400
+ "398": "算盘",
401
+ "399": "长袍",
402
+ "400": "学位袍",
403
+ "401": "手风琴",
404
+ "402": "原声吉他",
405
+ "403": "航空母舰",
406
+ "404": "客机",
407
+ "405": "飞艇",
408
+ "406": "祭坛",
409
+ "407": "救护车",
410
+ "408": "水陆两用车",
411
+ "409": "模拟时钟",
412
+ "410": "蜂房",
413
+ "411": "围裙",
414
+ "412": "垃圾桶",
415
+ "413": "攻击步枪,枪",
416
+ "414": "背包",
417
+ "415": "面包店,面包铺",
418
+ "416": "平衡木",
419
+ "417": "热气球",
420
+ "418": "圆珠笔",
421
+ "419": "创可贴",
422
+ "420": "班卓琴",
423
+ "421": "栏杆,楼梯扶手",
424
+ "422": "杠铃",
425
+ "423": "理发师的椅子",
426
+ "424": "理发店",
427
+ "425": "牲口棚",
428
+ "426": "晴雨表",
429
+ "427": "圆筒",
430
+ "428": "园地小车,手推车",
431
+ "429": "棒球",
432
+ "430": "篮球",
433
+ "431": "婴儿床",
434
+ "432": "巴松管,低音管",
435
+ "433": "游泳帽",
436
+ "434": "沐浴毛巾",
437
+ "435": "浴缸,澡盆",
438
+ "436": "沙滩车,旅行车",
439
+ "437": "灯塔",
440
+ "438": "高脚杯",
441
+ "439": "熊皮高帽",
442
+ "440": "啤酒瓶",
443
+ "441": "啤酒杯",
444
+ "442": "钟塔",
445
+ "443": "(小儿用的)围嘴",
446
+ "444": "串联自行车",
447
+ "445": "比基尼",
448
+ "446": "装订册",
449
+ "447": "双筒望远镜",
450
+ "448": "鸟舍",
451
+ "449": "船库",
452
+ "450": "雪橇",
453
+ "451": "饰扣式领带",
454
+ "452": "阔边女帽",
455
+ "453": "书橱",
456
+ "454": "书店,书摊",
457
+ "455": "瓶盖",
458
+ "456": "弓箭",
459
+ "457": "蝴蝶结领结",
460
+ "458": "铜制牌位",
461
+ "459": "奶罩",
462
+ "460": "防波堤,海堤",
463
+ "461": "铠甲",
464
+ "462": "扫帚",
465
+ "463": "桶",
466
+ "464": "扣环",
467
+ "465": "防弹背心",
468
+ "466": "动车,子弹头列车",
469
+ "467": "肉铺,肉菜市场",
470
+ "468": "出租车",
471
+ "469": "大锅",
472
+ "470": "蜡烛",
473
+ "471": "大炮",
474
+ "472": "独木舟",
475
+ "473": "开瓶器,开罐器",
476
+ "474": "开衫",
477
+ "475": "车镜",
478
+ "476": "旋转木马",
479
+ "477": "木匠的工具包,工具包",
480
+ "478": "纸箱",
481
+ "479": "车轮",
482
+ "480": "取款机,自动取款机",
483
+ "481": "盒式录音带",
484
+ "482": "卡带播放器",
485
+ "483": "城堡",
486
+ "484": "双体船",
487
+ "485": "CD播放器",
488
+ "486": "大提琴",
489
+ "487": "移动电话,手机",
490
+ "488": "铁链",
491
+ "489": "围栏",
492
+ "490": "链甲",
493
+ "491": "电锯,油锯",
494
+ "492": "箱子",
495
+ "493": "衣柜,洗脸台",
496
+ "494": "编钟,钟,锣",
497
+ "495": "中国橱柜",
498
+ "496": "圣诞袜",
499
+ "497": "教堂,教堂建筑",
500
+ "498": "电影院,剧场",
501
+ "499": "切肉刀,菜刀",
502
+ "500": "悬崖屋",
503
+ "501": "斗篷",
504
+ "502": "木屐,木鞋",
505
+ "503": "鸡尾酒调酒器",
506
+ "504": "咖啡杯",
507
+ "505": "咖啡壶",
508
+ "506": "螺旋结构(楼梯)",
509
+ "507": "组合锁",
510
+ "508": "电脑键盘,键盘",
511
+ "509": "糖果,糖果店",
512
+ "510": "集装箱船",
513
+ "511": "敞篷车",
514
+ "512": "开瓶器,瓶螺杆",
515
+ "513": "短号,喇叭",
516
+ "514": "牛仔靴",
517
+ "515": "牛仔帽",
518
+ "516": "摇篮",
519
+ "517": "起重机",
520
+ "518": "头盔",
521
+ "519": "板条箱",
522
+ "520": "小儿床",
523
+ "521": "砂锅",
524
+ "522": "槌球",
525
+ "523": "拐杖",
526
+ "524": "胸甲",
527
+ "525": "大坝,堤防",
528
+ "526": "书桌",
529
+ "527": "台式电脑",
530
+ "528": "有线电话",
531
+ "529": "尿布湿",
532
+ "530": "数字时钟",
533
+ "531": "数字手表",
534
+ "532": "餐桌板",
535
+ "533": "抹布",
536
+ "534": "洗碗机,洗碟机",
537
+ "535": "盘式制动器",
538
+ "536": "码头,船坞,码头设施",
539
+ "537": "狗拉雪橇",
540
+ "538": "圆顶",
541
+ "539": "门垫,垫子",
542
+ "540": "钻井平台,海上钻井",
543
+ "541": "鼓,乐器,鼓膜",
544
+ "542": "鼓槌",
545
+ "543": "哑铃",
546
+ "544": "荷兰烤箱",
547
+ "545": "电风扇,鼓风机",
548
+ "546": "电吉他",
549
+ "547": "电力机车",
550
+ "548": "电视,电视柜",
551
+ "549": "信封",
552
+ "550": "浓缩咖啡机",
553
+ "551": "扑面粉",
554
+ "552": "女用长围巾",
555
+ "553": "文件,文件柜,档案柜",
556
+ "554": "消防船",
557
+ "555": "消防车",
558
+ "556": "火炉栏",
559
+ "557": "旗杆",
560
+ "558": "长笛",
561
+ "559": "折叠椅",
562
+ "560": "橄榄球头盔",
563
+ "561": "叉车",
564
+ "562": "喷泉",
565
+ "563": "钢笔",
566
+ "564": "有四根帷柱的床",
567
+ "565": "运货车厢",
568
+ "566": "圆号,喇叭",
569
+ "567": "煎锅",
570
+ "568": "裘皮大衣",
571
+ "569": "垃圾车",
572
+ "570": "防毒面具,呼吸器",
573
+ "571": "汽油泵",
574
+ "572": "高脚杯",
575
+ "573": "卡丁车",
576
+ "574": "高尔夫球",
577
+ "575": "高尔夫球车",
578
+ "576": "狭长小船",
579
+ "577": "锣",
580
+ "578": "礼服",
581
+ "579": "钢琴",
582
+ "580": "温室,苗圃",
583
+ "581": "散热器格栅",
584
+ "582": "杂货店,食品市场",
585
+ "583": "断头台",
586
+ "584": "小发夹",
587
+ "585": "头发喷雾",
588
+ "586": "半履带装甲车",
589
+ "587": "锤子",
590
+ "588": "大篮子",
591
+ "589": "手摇鼓风机,吹风机",
592
+ "590": "手提电脑",
593
+ "591": "手帕",
594
+ "592": "硬盘",
595
+ "593": "口琴,口风琴",
596
+ "594": "竖琴",
597
+ "595": "收割机",
598
+ "596": "斧头",
599
+ "597": "手枪皮套",
600
+ "598": "家庭影院",
601
+ "599": "蜂窝",
602
+ "600": "钩爪",
603
+ "601": "衬裙",
604
+ "602": "单杠",
605
+ "603": "马车",
606
+ "604": "沙漏",
607
+ "605": "手机,iPad",
608
+ "606": "熨斗",
609
+ "607": "南瓜灯笼",
610
+ "608": "牛仔裤,蓝色牛仔裤",
611
+ "609": "吉普车",
612
+ "610": "运动衫,T恤",
613
+ "611": "拼图",
614
+ "612": "人力车",
615
+ "613": "操纵杆",
616
+ "614": "和服",
617
+ "615": "护膝",
618
+ "616": "蝴蝶结",
619
+ "617": "大褂,实验室外套",
620
+ "618": "长柄勺",
621
+ "619": "灯罩",
622
+ "620": "笔记本电脑",
623
+ "621": "割草机",
624
+ "622": "镜头盖",
625
+ "623": "开信刀,裁纸刀",
626
+ "624": "图书馆",
627
+ "625": "救生艇",
628
+ "626": "点火器,打火机",
629
+ "627": "豪华轿车",
630
+ "628": "远洋班轮",
631
+ "629": "唇膏,口红",
632
+ "630": "平底便鞋",
633
+ "631": "洗剂",
634
+ "632": "扬声器",
635
+ "633": "放大镜",
636
+ "634": "锯木厂",
637
+ "635": "磁罗盘",
638
+ "636": "邮袋",
639
+ "637": "信箱",
640
+ "638": "女游泳衣",
641
+ "639": "有肩带浴衣",
642
+ "640": "窨井盖",
643
+ "641": "沙球(一种打击乐器)",
644
+ "642": "马林巴木琴",
645
+ "643": "面膜",
646
+ "644": "火柴",
647
+ "645": "花柱",
648
+ "646": "迷宫",
649
+ "647": "量杯",
650
+ "648": "药箱",
651
+ "649": "巨石,巨石结构",
652
+ "650": "麦克风",
653
+ "651": "微波炉",
654
+ "652": "军装",
655
+ "653": "奶桶",
656
+ "654": "迷你巴士",
657
+ "655": "迷你裙",
658
+ "656": "面包车",
659
+ "657": "导弹",
660
+ "658": "连指手套",
661
+ "659": "搅拌钵",
662
+ "660": "活动房屋(由汽车拖拉的)",
663
+ "661": "T型发动机小汽车",
664
+ "662": "调制解调器",
665
+ "663": "修道院",
666
+ "664": "显示器",
667
+ "665": "电瓶车",
668
+ "666": "砂浆",
669
+ "667": "学士",
670
+ "668": "清真寺",
671
+ "669": "蚊帐",
672
+ "670": "摩托车",
673
+ "671": "山地自行车",
674
+ "672": "登山帐",
675
+ "673": "鼠标,电脑鼠标",
676
+ "674": "捕鼠器",
677
+ "675": "搬家车",
678
+ "676": "口套",
679
+ "677": "钉子",
680
+ "678": "颈托",
681
+ "679": "项链",
682
+ "680": "乳头(瓶)",
683
+ "681": "笔记本,笔记本电脑",
684
+ "682": "方尖碑",
685
+ "683": "双簧管",
686
+ "684": "陶笛,卵形笛",
687
+ "685": "里程表",
688
+ "686": "滤油器",
689
+ "687": "风琴,管风琴",
690
+ "688": "示波器",
691
+ "689": "罩裙",
692
+ "690": "牛车",
693
+ "691": "氧气面罩",
694
+ "692": "包装",
695
+ "693": "船桨",
696
+ "694": "明轮,桨轮",
697
+ "695": "挂锁,扣锁",
698
+ "696": "画笔",
699
+ "697": "睡衣",
700
+ "698": "宫殿",
701
+ "699": "排箫,鸣管",
702
+ "700": "纸巾",
703
+ "701": "降落伞",
704
+ "702": "双杠",
705
+ "703": "公园长椅",
706
+ "704": "停车收费表,停车计时器",
707
+ "705": "客车,教练车",
708
+ "706": "露台,阳台",
709
+ "707": "付费电话",
710
+ "708": "基座,基脚",
711
+ "709": "铅笔盒",
712
+ "710": "卷笔刀",
713
+ "711": "香水(瓶)",
714
+ "712": "培养皿",
715
+ "713": "复印机",
716
+ "714": "拨弦片,拨子",
717
+ "715": "尖顶头盔",
718
+ "716": "栅栏,栅栏",
719
+ "717": "皮卡,皮卡车",
720
+ "718": "桥墩",
721
+ "719": "存钱罐",
722
+ "720": "药瓶",
723
+ "721": "枕头",
724
+ "722": "乒乓球",
725
+ "723": "风车",
726
+ "724": "海盗船",
727
+ "725": "水罐",
728
+ "726": "木工刨",
729
+ "727": "天文馆",
730
+ "728": "塑料袋",
731
+ "729": "板架",
732
+ "730": "犁型铲雪机",
733
+ "731": "手压皮碗泵",
734
+ "732": "宝丽来相机",
735
+ "733": "电线杆",
736
+ "734": "警车,巡逻车",
737
+ "735": "雨披",
738
+ "736": "台球桌",
739
+ "737": "充气饮料瓶",
740
+ "738": "花盆",
741
+ "739": "陶工旋盘",
742
+ "740": "电钻",
743
+ "741": "祈祷垫,地毯",
744
+ "742": "打印机",
745
+ "743": "监狱",
746
+ "744": "炮弹,导弹",
747
+ "745": "投影仪",
748
+ "746": "冰球",
749
+ "747": "沙包,吊球",
750
+ "748": "钱包",
751
+ "749": "羽管笔",
752
+ "750": "被子",
753
+ "751": "赛车",
754
+ "752": "球拍",
755
+ "753": "散热器",
756
+ "754": "收音机",
757
+ "755": "射电望远镜,无线电反射器",
758
+ "756": "雨桶",
759
+ "757": "休闲车,房车",
760
+ "758": "卷轴,卷筒",
761
+ "759": "反射式照相机",
762
+ "760": "冰箱,冰柜",
763
+ "761": "遥控器",
764
+ "762": "餐厅,饮食店,食堂",
765
+ "763": "左轮手枪",
766
+ "764": "步枪",
767
+ "765": "摇椅",
768
+ "766": "电转烤肉架",
769
+ "767": "橡皮",
770
+ "768": "橄榄球",
771
+ "769": "直尺",
772
+ "770": "跑步鞋",
773
+ "771": "保险柜",
774
+ "772": "安全别针",
775
+ "773": "盐瓶(调味用)",
776
+ "774": "凉鞋",
777
+ "775": "纱笼,围裙",
778
+ "776": "萨克斯管",
779
+ "777": "剑鞘",
780
+ "778": "秤,称重机",
781
+ "779": "校车",
782
+ "780": "帆船",
783
+ "781": "记分牌",
784
+ "782": "屏幕",
785
+ "783": "螺丝",
786
+ "784": "螺丝刀",
787
+ "785": "安全带",
788
+ "786": "缝纫机",
789
+ "787": "盾牌,盾牌",
790
+ "788": "皮鞋店,鞋店",
791
+ "789": "障子",
792
+ "790": "购物篮",
793
+ "791": "购物车",
794
+ "792": "铁锹",
795
+ "793": "浴帽",
796
+ "794": "浴帘",
797
+ "795": "滑雪板",
798
+ "796": "滑雪面罩",
799
+ "797": "睡袋",
800
+ "798": "滑尺",
801
+ "799": "滑动门",
802
+ "800": "角子老虎机",
803
+ "801": "潜水通气管",
804
+ "802": "雪橇",
805
+ "803": "扫雪机,扫雪机",
806
+ "804": "皂液器",
807
+ "805": "足球",
808
+ "806": "袜子",
809
+ "807": "碟式太阳能,太阳能集热器,太阳能炉",
810
+ "808": "宽边帽",
811
+ "809": "汤碗",
812
+ "810": "空格键",
813
+ "811": "空间加热器",
814
+ "812": "航天飞机",
815
+ "813": "铲(搅拌或涂敷用的)",
816
+ "814": "快艇",
817
+ "815": "蜘蛛网",
818
+ "816": "纺锤,纱锭",
819
+ "817": "跑车",
820
+ "818": "聚光灯",
821
+ "819": "舞台",
822
+ "820": "蒸汽机车",
823
+ "821": "钢拱桥",
824
+ "822": "钢滚筒",
825
+ "823": "听诊器",
826
+ "824": "女用披肩",
827
+ "825": "石头墙",
828
+ "826": "秒表",
829
+ "827": "火炉",
830
+ "828": "过滤器",
831
+ "829": "有轨电车,电车",
832
+ "830": "担架",
833
+ "831": "沙发床",
834
+ "832": "佛塔",
835
+ "833": "潜艇,潜水艇",
836
+ "834": "套装,衣服",
837
+ "835": "日晷",
838
+ "836": "太阳镜",
839
+ "837": "太阳镜,墨镜",
840
+ "838": "防晒霜,防晒剂",
841
+ "839": "悬索桥",
842
+ "840": "拖把",
843
+ "841": "运动衫",
844
+ "842": "游泳裤",
845
+ "843": "秋千",
846
+ "844": "开关,电器开关",
847
+ "845": "注射器",
848
+ "846": "台灯",
849
+ "847": "坦克,装甲战车,装甲战斗车辆",
850
+ "848": "磁带播放器",
851
+ "849": "茶壶",
852
+ "850": "泰迪,泰迪熊",
853
+ "851": "电视",
854
+ "852": "网球",
855
+ "853": "茅草,茅草屋顶",
856
+ "854": "幕布,剧院的帷幕",
857
+ "855": "顶针",
858
+ "856": "脱粒机",
859
+ "857": "宝座",
860
+ "858": "瓦屋顶",
861
+ "859": "烤面包机",
862
+ "860": "烟草店,烟草",
863
+ "861": "马桶",
864
+ "862": "火炬",
865
+ "863": "图腾柱",
866
+ "864": "拖车,牵引车,清障车",
867
+ "865": "玩具店",
868
+ "866": "拖拉机",
869
+ "867": "拖车,铰接式卡车",
870
+ "868": "托盘",
871
+ "869": "风衣",
872
+ "870": "三轮车",
873
+ "871": "三体船",
874
+ "872": "三脚架",
875
+ "873": "凯旋门",
876
+ "874": "无轨电车",
877
+ "875": "长号",
878
+ "876": "浴盆,浴缸",
879
+ "877": "旋转式栅门",
880
+ "878": "打字机键盘",
881
+ "879": "伞",
882
+ "880": "独轮车",
883
+ "881": "直立式钢琴",
884
+ "882": "真空吸尘器",
885
+ "883": "花瓶",
886
+ "884": "拱顶",
887
+ "885": "天鹅绒",
888
+ "886": "自动售货机",
889
+ "887": "祭服",
890
+ "888": "高架桥",
891
+ "889": "小提琴,小提琴",
892
+ "890": "排球",
893
+ "891": "松饼机",
894
+ "892": "挂钟",
895
+ "893": "钱包,皮夹",
896
+ "894": "衣柜,壁橱",
897
+ "895": "军用飞机",
898
+ "896": "洗脸盆,洗手盆",
899
+ "897": "洗衣机,自动洗衣机",
900
+ "898": "水瓶",
901
+ "899": "水壶",
902
+ "900": "水塔",
903
+ "901": "威士忌壶",
904
+ "902": "哨子",
905
+ "903": "假发",
906
+ "904": "纱窗",
907
+ "905": "百叶窗",
908
+ "906": "温莎领带",
909
+ "907": "葡萄酒瓶",
910
+ "908": "飞机翅膀,飞机",
911
+ "909": "炒菜锅",
912
+ "910": "木制的勺子",
913
+ "911": "毛织品,羊绒",
914
+ "912": "栅栏,围栏",
915
+ "913": "沉船",
916
+ "914": "双桅船",
917
+ "915": "蒙古包",
918
+ "916": "网站,互联网网站",
919
+ "917": "漫画",
920
+ "918": "纵横字谜",
921
+ "919": "路标",
922
+ "920": "交通信号灯",
923
+ "921": "防尘罩,书皮",
924
+ "922": "菜单",
925
+ "923": "盘子",
926
+ "924": "鳄梨酱",
927
+ "925": "清汤",
928
+ "926": "罐焖土豆烧肉",
929
+ "927": "蛋糕",
930
+ "928": "冰淇淋",
931
+ "929": "雪糕,冰棍,冰棒",
932
+ "930": "法式面包",
933
+ "931": "百吉饼",
934
+ "932": "椒盐脆饼",
935
+ "933": "芝士汉堡",
936
+ "934": "热狗",
937
+ "935": "土豆泥",
938
+ "936": "结球甘蓝",
939
+ "937": "西兰花",
940
+ "938": "菜花",
941
+ "939": "绿皮密生西葫芦",
942
+ "940": "西葫芦",
943
+ "941": "小青南瓜",
944
+ "942": "南瓜",
945
+ "943": "黄瓜",
946
+ "944": "朝鲜蓟",
947
+ "945": "甜椒",
948
+ "946": "刺棘蓟",
949
+ "947": "蘑菇",
950
+ "948": "绿苹果",
951
+ "949": "草莓",
952
+ "950": "橘子",
953
+ "951": "柠檬",
954
+ "952": "无花果",
955
+ "953": "菠萝",
956
+ "954": "香蕉",
957
+ "955": "菠萝蜜",
958
+ "956": "蛋奶冻苹果",
959
+ "957": "石榴",
960
+ "958": "干草",
961
+ "959": "烤面条加干酪沙司",
962
+ "960": "巧克力酱,巧克力糖浆",
963
+ "961": "面团",
964
+ "962": "瑞士肉包,肉饼",
965
+ "963": "披萨,披萨饼",
966
+ "964": "馅饼",
967
+ "965": "卷饼",
968
+ "966": "红葡萄酒",
969
+ "967": "意大利浓咖啡",
970
+ "968": "杯子",
971
+ "969": "蛋酒",
972
+ "970": "高山",
973
+ "971": "泡泡",
974
+ "972": "悬崖",
975
+ "973": "珊瑚礁",
976
+ "974": "间歇泉",
977
+ "975": "湖边,湖岸",
978
+ "976": "海角",
979
+ "977": "沙洲,沙坝",
980
+ "978": "海滨,海岸",
981
+ "979": "峡谷",
982
+ "980": "火山",
983
+ "981": "棒球,棒球运动员",
984
+ "982": "新郎",
985
+ "983": "潜水员",
986
+ "984": "油菜",
987
+ "985": "雏菊",
988
+ "986": "杓兰",
989
+ "987": "玉米",
990
+ "988": "橡子",
991
+ "989": "玫瑰果",
992
+ "990": "七叶树果实",
993
+ "991": "珊瑚菌",
994
+ "992": "木耳",
995
+ "993": "鹿花菌",
996
+ "994": "鬼笔菌",
997
+ "995": "地星(菌类)",
998
+ "996": "多叶奇果菌",
999
+ "997": "牛肝菌",
1000
+ "998": "玉米穗",
1001
+ "999": "卫生纸"
1002
+ }
labels/id2label_en.json ADDED
@@ -0,0 +1,1002 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "tench, Tinca tinca",
3
+ "1": "goldfish, Carassius auratus",
4
+ "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
5
+ "3": "tiger shark, Galeocerdo cuvieri",
6
+ "4": "hammerhead, hammerhead shark",
7
+ "5": "electric ray, crampfish, numbfish, torpedo",
8
+ "6": "stingray",
9
+ "7": "cock",
10
+ "8": "hen",
11
+ "9": "ostrich, Struthio camelus",
12
+ "10": "brambling, Fringilla montifringilla",
13
+ "11": "goldfinch, Carduelis carduelis",
14
+ "12": "house finch, linnet, Carpodacus mexicanus",
15
+ "13": "junco, snowbird",
16
+ "14": "indigo bunting, indigo finch, indigo bird, Passerina cyanea",
17
+ "15": "robin, American robin, Turdus migratorius",
18
+ "16": "bulbul",
19
+ "17": "jay",
20
+ "18": "magpie",
21
+ "19": "chickadee",
22
+ "20": "water ouzel, dipper",
23
+ "21": "kite",
24
+ "22": "bald eagle, American eagle, Haliaeetus leucocephalus",
25
+ "23": "vulture",
26
+ "24": "great grey owl, great gray owl, Strix nebulosa",
27
+ "25": "European fire salamander, Salamandra salamandra",
28
+ "26": "common newt, Triturus vulgaris",
29
+ "27": "eft",
30
+ "28": "spotted salamander, Ambystoma maculatum",
31
+ "29": "axolotl, mud puppy, Ambystoma mexicanum",
32
+ "30": "bullfrog, Rana catesbeiana",
33
+ "31": "tree frog, tree-frog",
34
+ "32": "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui",
35
+ "33": "loggerhead, loggerhead turtle, Caretta caretta",
36
+ "34": "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea",
37
+ "35": "mud turtle",
38
+ "36": "terrapin",
39
+ "37": "box turtle, box tortoise",
40
+ "38": "banded gecko",
41
+ "39": "common iguana, iguana, Iguana iguana",
42
+ "40": "American chameleon, anole, Anolis carolinensis",
43
+ "41": "whiptail, whiptail lizard",
44
+ "42": "agama",
45
+ "43": "frilled lizard, Chlamydosaurus kingi",
46
+ "44": "alligator lizard",
47
+ "45": "Gila monster, Heloderma suspectum",
48
+ "46": "green lizard, Lacerta viridis",
49
+ "47": "African chameleon, Chamaeleo chamaeleon",
50
+ "48": "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis",
51
+ "49": "African crocodile, Nile crocodile, Crocodylus niloticus",
52
+ "50": "American alligator, Alligator mississipiensis",
53
+ "51": "triceratops",
54
+ "52": "thunder snake, worm snake, Carphophis amoenus",
55
+ "53": "ringneck snake, ring-necked snake, ring snake",
56
+ "54": "hognose snake, puff adder, sand viper",
57
+ "55": "green snake, grass snake",
58
+ "56": "king snake, kingsnake",
59
+ "57": "garter snake, grass snake",
60
+ "58": "water snake",
61
+ "59": "vine snake",
62
+ "60": "night snake, Hypsiglena torquata",
63
+ "61": "boa constrictor, Constrictor constrictor",
64
+ "62": "rock python, rock snake, Python sebae",
65
+ "63": "Indian cobra, Naja naja",
66
+ "64": "green mamba",
67
+ "65": "sea snake",
68
+ "66": "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus",
69
+ "67": "diamondback, diamondback rattlesnake, Crotalus adamanteus",
70
+ "68": "sidewinder, horned rattlesnake, Crotalus cerastes",
71
+ "69": "trilobite",
72
+ "70": "harvestman, daddy longlegs, Phalangium opilio",
73
+ "71": "scorpion",
74
+ "72": "black and gold garden spider, Argiope aurantia",
75
+ "73": "barn spider, Araneus cavaticus",
76
+ "74": "garden spider, Aranea diademata",
77
+ "75": "black widow, Latrodectus mactans",
78
+ "76": "tarantula",
79
+ "77": "wolf spider, hunting spider",
80
+ "78": "tick",
81
+ "79": "centipede",
82
+ "80": "black grouse",
83
+ "81": "ptarmigan",
84
+ "82": "ruffed grouse, partridge, Bonasa umbellus",
85
+ "83": "prairie chicken, prairie grouse, prairie fowl",
86
+ "84": "peacock",
87
+ "85": "quail",
88
+ "86": "partridge",
89
+ "87": "African grey, African gray, Psittacus erithacus",
90
+ "88": "macaw",
91
+ "89": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
92
+ "90": "lorikeet",
93
+ "91": "coucal",
94
+ "92": "bee eater",
95
+ "93": "hornbill",
96
+ "94": "hummingbird",
97
+ "95": "jacamar",
98
+ "96": "toucan",
99
+ "97": "drake",
100
+ "98": "red-breasted merganser, Mergus serrator",
101
+ "99": "goose",
102
+ "100": "black swan, Cygnus atratus",
103
+ "101": "tusker",
104
+ "102": "echidna, spiny anteater, anteater",
105
+ "103": "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus",
106
+ "104": "wallaby, brush kangaroo",
107
+ "105": "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus",
108
+ "106": "wombat",
109
+ "107": "jellyfish",
110
+ "108": "sea anemone, anemone",
111
+ "109": "brain coral",
112
+ "110": "flatworm, platyhelminth",
113
+ "111": "nematode, nematode worm, roundworm",
114
+ "112": "conch",
115
+ "113": "snail",
116
+ "114": "slug",
117
+ "115": "sea slug, nudibranch",
118
+ "116": "chiton, coat-of-mail shell, sea cradle, polyplacophore",
119
+ "117": "chambered nautilus, pearly nautilus, nautilus",
120
+ "118": "Dungeness crab, Cancer magister",
121
+ "119": "rock crab, Cancer irroratus",
122
+ "120": "fiddler crab",
123
+ "121": "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica",
124
+ "122": "American lobster, Northern lobster, Maine lobster, Homarus americanus",
125
+ "123": "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish",
126
+ "124": "crayfish, crawfish, crawdad, crawdaddy",
127
+ "125": "hermit crab",
128
+ "126": "isopod",
129
+ "127": "white stork, Ciconia ciconia",
130
+ "128": "black stork, Ciconia nigra",
131
+ "129": "spoonbill",
132
+ "130": "flamingo",
133
+ "131": "little blue heron, Egretta caerulea",
134
+ "132": "American egret, great white heron, Egretta albus",
135
+ "133": "bittern",
136
+ "134": "crane",
137
+ "135": "limpkin, Aramus pictus",
138
+ "136": "European gallinule, Porphyrio porphyrio",
139
+ "137": "American coot, marsh hen, mud hen, water hen, Fulica americana",
140
+ "138": "bustard",
141
+ "139": "ruddy turnstone, Arenaria interpres",
142
+ "140": "red-backed sandpiper, dunlin, Erolia alpina",
143
+ "141": "redshank, Tringa totanus",
144
+ "142": "dowitcher",
145
+ "143": "oystercatcher, oyster catcher",
146
+ "144": "pelican",
147
+ "145": "king penguin, Aptenodytes patagonica",
148
+ "146": "albatross, mollymawk",
149
+ "147": "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus",
150
+ "148": "killer whale, killer, orca, grampus, sea wolf, Orcinus orca",
151
+ "149": "dugong, Dugong dugon",
152
+ "150": "sea lion",
153
+ "151": "Chihuahua",
154
+ "152": "Japanese spaniel",
155
+ "153": "Maltese dog, Maltese terrier, Maltese",
156
+ "154": "Pekinese, Pekingese, Peke",
157
+ "155": "Shih-Tzu",
158
+ "156": "Blenheim spaniel",
159
+ "157": "papillon",
160
+ "158": "toy terrier",
161
+ "159": "Rhodesian ridgeback",
162
+ "160": "Afghan hound, Afghan",
163
+ "161": "basset, basset hound",
164
+ "162": "beagle",
165
+ "163": "bloodhound, sleuthhound",
166
+ "164": "bluetick",
167
+ "165": "black-and-tan coonhound",
168
+ "166": "Walker hound, Walker foxhound",
169
+ "167": "English foxhound",
170
+ "168": "redbone",
171
+ "169": "borzoi, Russian wolfhound",
172
+ "170": "Irish wolfhound",
173
+ "171": "Italian greyhound",
174
+ "172": "whippet",
175
+ "173": "Ibizan hound, Ibizan Podenco",
176
+ "174": "Norwegian elkhound, elkhound",
177
+ "175": "otterhound, otter hound",
178
+ "176": "Saluki, gazelle hound",
179
+ "177": "Scottish deerhound, deerhound",
180
+ "178": "Weimaraner",
181
+ "179": "Staffordshire bullterrier, Staffordshire bull terrier",
182
+ "180": "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier",
183
+ "181": "Bedlington terrier",
184
+ "182": "Border terrier",
185
+ "183": "Kerry blue terrier",
186
+ "184": "Irish terrier",
187
+ "185": "Norfolk terrier",
188
+ "186": "Norwich terrier",
189
+ "187": "Yorkshire terrier",
190
+ "188": "wire-haired fox terrier",
191
+ "189": "Lakeland terrier",
192
+ "190": "Sealyham terrier, Sealyham",
193
+ "191": "Airedale, Airedale terrier",
194
+ "192": "cairn, cairn terrier",
195
+ "193": "Australian terrier",
196
+ "194": "Dandie Dinmont, Dandie Dinmont terrier",
197
+ "195": "Boston bull, Boston terrier",
198
+ "196": "miniature schnauzer",
199
+ "197": "giant schnauzer",
200
+ "198": "standard schnauzer",
201
+ "199": "Scotch terrier, Scottish terrier, Scottie",
202
+ "200": "Tibetan terrier, chrysanthemum dog",
203
+ "201": "silky terrier, Sydney silky",
204
+ "202": "soft-coated wheaten terrier",
205
+ "203": "West Highland white terrier",
206
+ "204": "Lhasa, Lhasa apso",
207
+ "205": "flat-coated retriever",
208
+ "206": "curly-coated retriever",
209
+ "207": "golden retriever",
210
+ "208": "Labrador retriever",
211
+ "209": "Chesapeake Bay retriever",
212
+ "210": "German short-haired pointer",
213
+ "211": "vizsla, Hungarian pointer",
214
+ "212": "English setter",
215
+ "213": "Irish setter, red setter",
216
+ "214": "Gordon setter",
217
+ "215": "Brittany spaniel",
218
+ "216": "clumber, clumber spaniel",
219
+ "217": "English springer, English springer spaniel",
220
+ "218": "Welsh springer spaniel",
221
+ "219": "cocker spaniel, English cocker spaniel, cocker",
222
+ "220": "Sussex spaniel",
223
+ "221": "Irish water spaniel",
224
+ "222": "kuvasz",
225
+ "223": "schipperke",
226
+ "224": "groenendael",
227
+ "225": "malinois",
228
+ "226": "briard",
229
+ "227": "kelpie",
230
+ "228": "komondor",
231
+ "229": "Old English sheepdog, bobtail",
232
+ "230": "Shetland sheepdog, Shetland sheep dog, Shetland",
233
+ "231": "collie",
234
+ "232": "Border collie",
235
+ "233": "Bouvier des Flandres, Bouviers des Flandres",
236
+ "234": "Rottweiler",
237
+ "235": "German shepherd, German shepherd dog, German police dog, alsatian",
238
+ "236": "Doberman, Doberman pinscher",
239
+ "237": "miniature pinscher",
240
+ "238": "Greater Swiss Mountain dog",
241
+ "239": "Bernese mountain dog",
242
+ "240": "Appenzeller",
243
+ "241": "EntleBucher",
244
+ "242": "boxer",
245
+ "243": "bull mastiff",
246
+ "244": "Tibetan mastiff",
247
+ "245": "French bulldog",
248
+ "246": "Great Dane",
249
+ "247": "Saint Bernard, St Bernard",
250
+ "248": "Eskimo dog, husky",
251
+ "249": "malamute, malemute, Alaskan malamute",
252
+ "250": "Siberian husky",
253
+ "251": "dalmatian, coach dog, carriage dog",
254
+ "252": "affenpinscher, monkey pinscher, monkey dog",
255
+ "253": "basenji",
256
+ "254": "pug, pug-dog",
257
+ "255": "Leonberg",
258
+ "256": "Newfoundland, Newfoundland dog",
259
+ "257": "Great Pyrenees",
260
+ "258": "Samoyed, Samoyede",
261
+ "259": "Pomeranian",
262
+ "260": "chow, chow chow",
263
+ "261": "keeshond",
264
+ "262": "Brabancon griffon",
265
+ "263": "Pembroke, Pembroke Welsh corgi",
266
+ "264": "Cardigan, Cardigan Welsh corgi",
267
+ "265": "toy poodle",
268
+ "266": "miniature poodle",
269
+ "267": "standard poodle",
270
+ "268": "Mexican hairless",
271
+ "269": "timber wolf, grey wolf, gray wolf, Canis lupus",
272
+ "270": "white wolf, Arctic wolf, Canis lupus tundrarum",
273
+ "271": "red wolf, maned wolf, Canis rufus, Canis niger",
274
+ "272": "coyote, prairie wolf, brush wolf, Canis latrans",
275
+ "273": "dingo, warrigal, warragal, Canis dingo",
276
+ "274": "dhole, Cuon alpinus",
277
+ "275": "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus",
278
+ "276": "hyena, hyaena",
279
+ "277": "red fox, Vulpes vulpes",
280
+ "278": "kit fox, Vulpes macrotis",
281
+ "279": "Arctic fox, white fox, Alopex lagopus",
282
+ "280": "grey fox, gray fox, Urocyon cinereoargenteus",
283
+ "281": "tabby, tabby cat",
284
+ "282": "tiger cat",
285
+ "283": "Persian cat",
286
+ "284": "Siamese cat, Siamese",
287
+ "285": "Egyptian cat",
288
+ "286": "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor",
289
+ "287": "lynx, catamount",
290
+ "288": "leopard, Panthera pardus",
291
+ "289": "snow leopard, ounce, Panthera uncia",
292
+ "290": "jaguar, panther, Panthera onca, Felis onca",
293
+ "291": "lion, king of beasts, Panthera leo",
294
+ "292": "tiger, Panthera tigris",
295
+ "293": "cheetah, chetah, Acinonyx jubatus",
296
+ "294": "brown bear, bruin, Ursus arctos",
297
+ "295": "American black bear, black bear, Ursus americanus, Euarctos americanus",
298
+ "296": "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus",
299
+ "297": "sloth bear, Melursus ursinus, Ursus ursinus",
300
+ "298": "mongoose",
301
+ "299": "meerkat, mierkat",
302
+ "300": "tiger beetle",
303
+ "301": "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle",
304
+ "302": "ground beetle, carabid beetle",
305
+ "303": "long-horned beetle, longicorn, longicorn beetle",
306
+ "304": "leaf beetle, chrysomelid",
307
+ "305": "dung beetle",
308
+ "306": "rhinoceros beetle",
309
+ "307": "weevil",
310
+ "308": "fly",
311
+ "309": "bee",
312
+ "310": "ant, emmet, pismire",
313
+ "311": "grasshopper, hopper",
314
+ "312": "cricket",
315
+ "313": "walking stick, walkingstick, stick insect",
316
+ "314": "cockroach, roach",
317
+ "315": "mantis, mantid",
318
+ "316": "cicada, cicala",
319
+ "317": "leafhopper",
320
+ "318": "lacewing, lacewing fly",
321
+ "319": "dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
322
+ "320": "damselfly",
323
+ "321": "admiral",
324
+ "322": "ringlet, ringlet butterfly",
325
+ "323": "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus",
326
+ "324": "cabbage butterfly",
327
+ "325": "sulphur butterfly, sulfur butterfly",
328
+ "326": "lycaenid, lycaenid butterfly",
329
+ "327": "starfish, sea star",
330
+ "328": "sea urchin",
331
+ "329": "sea cucumber, holothurian",
332
+ "330": "wood rabbit, cottontail, cottontail rabbit",
333
+ "331": "hare",
334
+ "332": "Angora, Angora rabbit",
335
+ "333": "hamster",
336
+ "334": "porcupine, hedgehog",
337
+ "335": "fox squirrel, eastern fox squirrel, Sciurus niger",
338
+ "336": "marmot",
339
+ "337": "beaver",
340
+ "338": "guinea pig, Cavia cobaya",
341
+ "339": "sorrel",
342
+ "340": "zebra",
343
+ "341": "hog, pig, grunter, squealer, Sus scrofa",
344
+ "342": "wild boar, boar, Sus scrofa",
345
+ "343": "warthog",
346
+ "344": "hippopotamus, hippo, river horse, Hippopotamus amphibius",
347
+ "345": "ox",
348
+ "346": "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis",
349
+ "347": "bison",
350
+ "348": "ram, tup",
351
+ "349": "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis",
352
+ "350": "ibex, Capra ibex",
353
+ "351": "hartebeest",
354
+ "352": "impala, Aepyceros melampus",
355
+ "353": "gazelle",
356
+ "354": "Arabian camel, dromedary, Camelus dromedarius",
357
+ "355": "llama",
358
+ "356": "weasel",
359
+ "357": "mink",
360
+ "358": "polecat, fitch, foulmart, foumart, Mustela putorius",
361
+ "359": "black-footed ferret, ferret, Mustela nigripes",
362
+ "360": "otter",
363
+ "361": "skunk, polecat, wood pussy",
364
+ "362": "badger",
365
+ "363": "armadillo",
366
+ "364": "three-toed sloth, ai, Bradypus tridactylus",
367
+ "365": "orangutan, orang, orangutang, Pongo pygmaeus",
368
+ "366": "gorilla, Gorilla gorilla",
369
+ "367": "chimpanzee, chimp, Pan troglodytes",
370
+ "368": "gibbon, Hylobates lar",
371
+ "369": "siamang, Hylobates syndactylus, Symphalangus syndactylus",
372
+ "370": "guenon, guenon monkey",
373
+ "371": "patas, hussar monkey, Erythrocebus patas",
374
+ "372": "baboon",
375
+ "373": "macaque",
376
+ "374": "langur",
377
+ "375": "colobus, colobus monkey",
378
+ "376": "proboscis monkey, Nasalis larvatus",
379
+ "377": "marmoset",
380
+ "378": "capuchin, ringtail, Cebus capucinus",
381
+ "379": "howler monkey, howler",
382
+ "380": "titi, titi monkey",
383
+ "381": "spider monkey, Ateles geoffroyi",
384
+ "382": "squirrel monkey, Saimiri sciureus",
385
+ "383": "Madagascar cat, ring-tailed lemur, Lemur catta",
386
+ "384": "indri, indris, Indri indri, Indri brevicaudatus",
387
+ "385": "Indian elephant, Elephas maximus",
388
+ "386": "African elephant, Loxodonta africana",
389
+ "387": "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens",
390
+ "388": "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca",
391
+ "389": "barracouta, snoek",
392
+ "390": "eel",
393
+ "391": "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch",
394
+ "392": "rock beauty, Holocanthus tricolor",
395
+ "393": "anemone fish",
396
+ "394": "sturgeon",
397
+ "395": "gar, garfish, garpike, billfish, Lepisosteus osseus",
398
+ "396": "lionfish",
399
+ "397": "puffer, pufferfish, blowfish, globefish",
400
+ "398": "abacus",
401
+ "399": "abaya",
402
+ "400": "academic gown, academic robe, judge robe",
403
+ "401": "accordion, piano accordion, squeeze box",
404
+ "402": "acoustic guitar",
405
+ "403": "aircraft carrier, carrier, flattop, attack aircraft carrier",
406
+ "404": "airliner",
407
+ "405": "airship, dirigible",
408
+ "406": "altar",
409
+ "407": "ambulance",
410
+ "408": "amphibian, amphibious vehicle",
411
+ "409": "analog clock",
412
+ "410": "apiary, bee house",
413
+ "411": "apron",
414
+ "412": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
415
+ "413": "assault rifle, assault gun",
416
+ "414": "backpack, back pack, knapsack, packsack, rucksack, haversack",
417
+ "415": "bakery, bakeshop, bakehouse",
418
+ "416": "balance beam, beam",
419
+ "417": "balloon",
420
+ "418": "ballpoint, ballpoint pen, ballpen, Biro",
421
+ "419": "Band Aid",
422
+ "420": "banjo",
423
+ "421": "bannister, banister, balustrade, balusters, handrail",
424
+ "422": "barbell",
425
+ "423": "barber chair",
426
+ "424": "barbershop",
427
+ "425": "barn",
428
+ "426": "barometer",
429
+ "427": "barrel, cask",
430
+ "428": "barrow, garden cart, lawn cart, wheelbarrow",
431
+ "429": "baseball",
432
+ "430": "basketball",
433
+ "431": "bassinet",
434
+ "432": "bassoon",
435
+ "433": "bathing cap, swimming cap",
436
+ "434": "bath towel",
437
+ "435": "bathtub, bathing tub, bath, tub",
438
+ "436": "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon",
439
+ "437": "beacon, lighthouse, beacon light, pharos",
440
+ "438": "beaker",
441
+ "439": "bearskin, busby, shako",
442
+ "440": "beer bottle",
443
+ "441": "beer glass",
444
+ "442": "bell cote, bell cot",
445
+ "443": "bib",
446
+ "444": "bicycle-built-for-two, tandem bicycle, tandem",
447
+ "445": "bikini, two-piece",
448
+ "446": "binder, ring-binder",
449
+ "447": "binoculars, field glasses, opera glasses",
450
+ "448": "birdhouse",
451
+ "449": "boathouse",
452
+ "450": "bobsled, bobsleigh, bob",
453
+ "451": "bolo tie, bolo, bola tie, bola",
454
+ "452": "bonnet, poke bonnet",
455
+ "453": "bookcase",
456
+ "454": "bookshop, bookstore, bookstall",
457
+ "455": "bottlecap",
458
+ "456": "bow",
459
+ "457": "bow tie, bow-tie, bowtie",
460
+ "458": "brass, memorial tablet, plaque",
461
+ "459": "brassiere, bra, bandeau",
462
+ "460": "breakwater, groin, groyne, mole, bulwark, seawall, jetty",
463
+ "461": "breastplate, aegis, egis",
464
+ "462": "broom",
465
+ "463": "bucket, pail",
466
+ "464": "buckle",
467
+ "465": "bulletproof vest",
468
+ "466": "bullet train, bullet",
469
+ "467": "butcher shop, meat market",
470
+ "468": "cab, hack, taxi, taxicab",
471
+ "469": "caldron, cauldron",
472
+ "470": "candle, taper, wax light",
473
+ "471": "cannon",
474
+ "472": "canoe",
475
+ "473": "can opener, tin opener",
476
+ "474": "cardigan",
477
+ "475": "car mirror",
478
+ "476": "carousel, carrousel, merry-go-round, roundabout, whirligig",
479
+ "477": "carpenters kit, tool kit",
480
+ "478": "carton",
481
+ "479": "car wheel",
482
+ "480": "cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM",
483
+ "481": "cassette",
484
+ "482": "cassette player",
485
+ "483": "castle",
486
+ "484": "catamaran",
487
+ "485": "CD player",
488
+ "486": "cello, violoncello",
489
+ "487": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
490
+ "488": "chain",
491
+ "489": "chainlink fence",
492
+ "490": "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour",
493
+ "491": "chain saw, chainsaw",
494
+ "492": "chest",
495
+ "493": "chiffonier, commode",
496
+ "494": "chime, bell, gong",
497
+ "495": "china cabinet, china closet",
498
+ "496": "Christmas stocking",
499
+ "497": "church, church building",
500
+ "498": "cinema, movie theater, movie theatre, movie house, picture palace",
501
+ "499": "cleaver, meat cleaver, chopper",
502
+ "500": "cliff dwelling",
503
+ "501": "cloak",
504
+ "502": "clog, geta, patten, sabot",
505
+ "503": "cocktail shaker",
506
+ "504": "coffee mug",
507
+ "505": "coffeepot",
508
+ "506": "coil, spiral, volute, whorl, helix",
509
+ "507": "combination lock",
510
+ "508": "computer keyboard, keypad",
511
+ "509": "confectionery, confectionary, candy store",
512
+ "510": "container ship, containership, container vessel",
513
+ "511": "convertible",
514
+ "512": "corkscrew, bottle screw",
515
+ "513": "cornet, horn, trumpet, trump",
516
+ "514": "cowboy boot",
517
+ "515": "cowboy hat, ten-gallon hat",
518
+ "516": "cradle",
519
+ "517": "crane",
520
+ "518": "crash helmet",
521
+ "519": "crate",
522
+ "520": "crib, cot",
523
+ "521": "Crock Pot",
524
+ "522": "croquet ball",
525
+ "523": "crutch",
526
+ "524": "cuirass",
527
+ "525": "dam, dike, dyke",
528
+ "526": "desk",
529
+ "527": "desktop computer",
530
+ "528": "dial telephone, dial phone",
531
+ "529": "diaper, nappy, napkin",
532
+ "530": "digital clock",
533
+ "531": "digital watch",
534
+ "532": "dining table, board",
535
+ "533": "dishrag, dishcloth",
536
+ "534": "dishwasher, dish washer, dishwashing machine",
537
+ "535": "disk brake, disc brake",
538
+ "536": "dock, dockage, docking facility",
539
+ "537": "dogsled, dog sled, dog sleigh",
540
+ "538": "dome",
541
+ "539": "doormat, welcome mat",
542
+ "540": "drilling platform, offshore rig",
543
+ "541": "drum, membranophone, tympan",
544
+ "542": "drumstick",
545
+ "543": "dumbbell",
546
+ "544": "Dutch oven",
547
+ "545": "electric fan, blower",
548
+ "546": "electric guitar",
549
+ "547": "electric locomotive",
550
+ "548": "entertainment center",
551
+ "549": "envelope",
552
+ "550": "espresso maker",
553
+ "551": "face powder",
554
+ "552": "feather boa, boa",
555
+ "553": "file, file cabinet, filing cabinet",
556
+ "554": "fireboat",
557
+ "555": "fire engine, fire truck",
558
+ "556": "fire screen, fireguard",
559
+ "557": "flagpole, flagstaff",
560
+ "558": "flute, transverse flute",
561
+ "559": "folding chair",
562
+ "560": "football helmet",
563
+ "561": "forklift",
564
+ "562": "fountain",
565
+ "563": "fountain pen",
566
+ "564": "four-poster",
567
+ "565": "freight car",
568
+ "566": "French horn, horn",
569
+ "567": "frying pan, frypan, skillet",
570
+ "568": "fur coat",
571
+ "569": "garbage truck, dustcart",
572
+ "570": "gasmask, respirator, gas helmet",
573
+ "571": "gas pump, gasoline pump, petrol pump, island dispenser",
574
+ "572": "goblet",
575
+ "573": "go-kart",
576
+ "574": "golf ball",
577
+ "575": "golfcart, golf cart",
578
+ "576": "gondola",
579
+ "577": "gong, tam-tam",
580
+ "578": "gown",
581
+ "579": "grand piano, grand",
582
+ "580": "greenhouse, nursery, glasshouse",
583
+ "581": "grille, radiator grille",
584
+ "582": "grocery store, grocery, food market, market",
585
+ "583": "guillotine",
586
+ "584": "hair slide",
587
+ "585": "hair spray",
588
+ "586": "half track",
589
+ "587": "hammer",
590
+ "588": "hamper",
591
+ "589": "hand blower, blow dryer, blow drier, hair dryer, hair drier",
592
+ "590": "hand-held computer, hand-held microcomputer",
593
+ "591": "handkerchief, hankie, hanky, hankey",
594
+ "592": "hard disc, hard disk, fixed disk",
595
+ "593": "harmonica, mouth organ, harp, mouth harp",
596
+ "594": "harp",
597
+ "595": "harvester, reaper",
598
+ "596": "hatchet",
599
+ "597": "holster",
600
+ "598": "home theater, home theatre",
601
+ "599": "honeycomb",
602
+ "600": "hook, claw",
603
+ "601": "hoopskirt, crinoline",
604
+ "602": "horizontal bar, high bar",
605
+ "603": "horse cart, horse-cart",
606
+ "604": "hourglass",
607
+ "605": "iPod",
608
+ "606": "iron, smoothing iron",
609
+ "607": "jack-o-lantern",
610
+ "608": "jean, blue jean, denim",
611
+ "609": "jeep, landrover",
612
+ "610": "jersey, T-shirt, tee shirt",
613
+ "611": "jigsaw puzzle",
614
+ "612": "jinrikisha, ricksha, rickshaw",
615
+ "613": "joystick",
616
+ "614": "kimono",
617
+ "615": "knee pad",
618
+ "616": "knot",
619
+ "617": "lab coat, laboratory coat",
620
+ "618": "ladle",
621
+ "619": "lampshade, lamp shade",
622
+ "620": "laptop, laptop computer",
623
+ "621": "lawn mower, mower",
624
+ "622": "lens cap, lens cover",
625
+ "623": "letter opener, paper knife, paperknife",
626
+ "624": "library",
627
+ "625": "lifeboat",
628
+ "626": "lighter, light, igniter, ignitor",
629
+ "627": "limousine, limo",
630
+ "628": "liner, ocean liner",
631
+ "629": "lipstick, lip rouge",
632
+ "630": "Loafer",
633
+ "631": "lotion",
634
+ "632": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
635
+ "633": "loupe, jewelers loupe",
636
+ "634": "lumbermill, sawmill",
637
+ "635": "magnetic compass",
638
+ "636": "mailbag, postbag",
639
+ "637": "mailbox, letter box",
640
+ "638": "maillot",
641
+ "639": "maillot, tank suit",
642
+ "640": "manhole cover",
643
+ "641": "maraca",
644
+ "642": "marimba, xylophone",
645
+ "643": "mask",
646
+ "644": "matchstick",
647
+ "645": "maypole",
648
+ "646": "maze, labyrinth",
649
+ "647": "measuring cup",
650
+ "648": "medicine chest, medicine cabinet",
651
+ "649": "megalith, megalithic structure",
652
+ "650": "microphone, mike",
653
+ "651": "microwave, microwave oven",
654
+ "652": "military uniform",
655
+ "653": "milk can",
656
+ "654": "minibus",
657
+ "655": "miniskirt, mini",
658
+ "656": "minivan",
659
+ "657": "missile",
660
+ "658": "mitten",
661
+ "659": "mixing bowl",
662
+ "660": "mobile home, manufactured home",
663
+ "661": "Model T",
664
+ "662": "modem",
665
+ "663": "monastery",
666
+ "664": "monitor",
667
+ "665": "moped",
668
+ "666": "mortar",
669
+ "667": "mortarboard",
670
+ "668": "mosque",
671
+ "669": "mosquito net",
672
+ "670": "motor scooter, scooter",
673
+ "671": "mountain bike, all-terrain bike, off-roader",
674
+ "672": "mountain tent",
675
+ "673": "mouse, computer mouse",
676
+ "674": "mousetrap",
677
+ "675": "moving van",
678
+ "676": "muzzle",
679
+ "677": "nail",
680
+ "678": "neck brace",
681
+ "679": "necklace",
682
+ "680": "nipple",
683
+ "681": "notebook, notebook computer",
684
+ "682": "obelisk",
685
+ "683": "oboe, hautboy, hautbois",
686
+ "684": "ocarina, sweet potato",
687
+ "685": "odometer, hodometer, mileometer, milometer",
688
+ "686": "oil filter",
689
+ "687": "organ, pipe organ",
690
+ "688": "oscilloscope, scope, cathode-ray oscilloscope, CRO",
691
+ "689": "overskirt",
692
+ "690": "oxcart",
693
+ "691": "oxygen mask",
694
+ "692": "packet",
695
+ "693": "paddle, boat paddle",
696
+ "694": "paddlewheel, paddle wheel",
697
+ "695": "padlock",
698
+ "696": "paintbrush",
699
+ "697": "pajama, pyjama, pjs, jammies",
700
+ "698": "palace",
701
+ "699": "panpipe, pandean pipe, syrinx",
702
+ "700": "paper towel",
703
+ "701": "parachute, chute",
704
+ "702": "parallel bars, bars",
705
+ "703": "park bench",
706
+ "704": "parking meter",
707
+ "705": "passenger car, coach, carriage",
708
+ "706": "patio, terrace",
709
+ "707": "pay-phone, pay-station",
710
+ "708": "pedestal, plinth, footstall",
711
+ "709": "pencil box, pencil case",
712
+ "710": "pencil sharpener",
713
+ "711": "perfume, essence",
714
+ "712": "Petri dish",
715
+ "713": "photocopier",
716
+ "714": "pick, plectrum, plectron",
717
+ "715": "pickelhaube",
718
+ "716": "picket fence, paling",
719
+ "717": "pickup, pickup truck",
720
+ "718": "pier",
721
+ "719": "piggy bank, penny bank",
722
+ "720": "pill bottle",
723
+ "721": "pillow",
724
+ "722": "ping-pong ball",
725
+ "723": "pinwheel",
726
+ "724": "pirate, pirate ship",
727
+ "725": "pitcher, ewer",
728
+ "726": "plane, carpenters plane, woodworking plane",
729
+ "727": "planetarium",
730
+ "728": "plastic bag",
731
+ "729": "plate rack",
732
+ "730": "plow, plough",
733
+ "731": "plunger, plumbers helper",
734
+ "732": "Polaroid camera, Polaroid Land camera",
735
+ "733": "pole",
736
+ "734": "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria",
737
+ "735": "poncho",
738
+ "736": "pool table, billiard table, snooker table",
739
+ "737": "pop bottle, soda bottle",
740
+ "738": "pot, flowerpot",
741
+ "739": "potters wheel",
742
+ "740": "power drill",
743
+ "741": "prayer rug, prayer mat",
744
+ "742": "printer",
745
+ "743": "prison, prison house",
746
+ "744": "projectile, missile",
747
+ "745": "projector",
748
+ "746": "puck, hockey puck",
749
+ "747": "punching bag, punch bag, punching ball, punchball",
750
+ "748": "purse",
751
+ "749": "quill, quill pen",
752
+ "750": "quilt, comforter, comfort, puff",
753
+ "751": "racer, race car, racing car",
754
+ "752": "racket, racquet",
755
+ "753": "radiator",
756
+ "754": "radio, wireless",
757
+ "755": "radio telescope, radio reflector",
758
+ "756": "rain barrel",
759
+ "757": "recreational vehicle, RV, R.V.",
760
+ "758": "reel",
761
+ "759": "reflex camera",
762
+ "760": "refrigerator, icebox",
763
+ "761": "remote control, remote",
764
+ "762": "restaurant, eating house, eating place, eatery",
765
+ "763": "revolver, six-gun, six-shooter",
766
+ "764": "rifle",
767
+ "765": "rocking chair, rocker",
768
+ "766": "rotisserie",
769
+ "767": "rubber eraser, rubber, pencil eraser",
770
+ "768": "rugby ball",
771
+ "769": "rule, ruler",
772
+ "770": "running shoe",
773
+ "771": "safe",
774
+ "772": "safety pin",
775
+ "773": "saltshaker, salt shaker",
776
+ "774": "sandal",
777
+ "775": "sarong",
778
+ "776": "sax, saxophone",
779
+ "777": "scabbard",
780
+ "778": "scale, weighing machine",
781
+ "779": "school bus",
782
+ "780": "schooner",
783
+ "781": "scoreboard",
784
+ "782": "screen, CRT screen",
785
+ "783": "screw",
786
+ "784": "screwdriver",
787
+ "785": "seat belt, seatbelt",
788
+ "786": "sewing machine",
789
+ "787": "shield, buckler",
790
+ "788": "shoe shop, shoe-shop, shoe store",
791
+ "789": "shoji",
792
+ "790": "shopping basket",
793
+ "791": "shopping cart",
794
+ "792": "shovel",
795
+ "793": "shower cap",
796
+ "794": "shower curtain",
797
+ "795": "ski",
798
+ "796": "ski mask",
799
+ "797": "sleeping bag",
800
+ "798": "slide rule, slipstick",
801
+ "799": "sliding door",
802
+ "800": "slot, one-armed bandit",
803
+ "801": "snorkel",
804
+ "802": "snowmobile",
805
+ "803": "snowplow, snowplough",
806
+ "804": "soap dispenser",
807
+ "805": "soccer ball",
808
+ "806": "sock",
809
+ "807": "solar dish, solar collector, solar furnace",
810
+ "808": "sombrero",
811
+ "809": "soup bowl",
812
+ "810": "space bar",
813
+ "811": "space heater",
814
+ "812": "space shuttle",
815
+ "813": "spatula",
816
+ "814": "speedboat",
817
+ "815": "spider web, spiders web",
818
+ "816": "spindle",
819
+ "817": "sports car, sport car",
820
+ "818": "spotlight, spot",
821
+ "819": "stage",
822
+ "820": "steam locomotive",
823
+ "821": "steel arch bridge",
824
+ "822": "steel drum",
825
+ "823": "stethoscope",
826
+ "824": "stole",
827
+ "825": "stone wall",
828
+ "826": "stopwatch, stop watch",
829
+ "827": "stove",
830
+ "828": "strainer",
831
+ "829": "streetcar, tram, tramcar, trolley, trolley car",
832
+ "830": "stretcher",
833
+ "831": "studio couch, day bed",
834
+ "832": "stupa, tope",
835
+ "833": "submarine, pigboat, sub, U-boat",
836
+ "834": "suit, suit of clothes",
837
+ "835": "sundial",
838
+ "836": "sunglass",
839
+ "837": "sunglasses, dark glasses, shades",
840
+ "838": "sunscreen, sunblock, sun blocker",
841
+ "839": "suspension bridge",
842
+ "840": "swab, swob, mop",
843
+ "841": "sweatshirt",
844
+ "842": "swimming trunks, bathing trunks",
845
+ "843": "swing",
846
+ "844": "switch, electric switch, electrical switch",
847
+ "845": "syringe",
848
+ "846": "table lamp",
849
+ "847": "tank, army tank, armored combat vehicle, armoured combat vehicle",
850
+ "848": "tape player",
851
+ "849": "teapot",
852
+ "850": "teddy, teddy bear",
853
+ "851": "television, television system",
854
+ "852": "tennis ball",
855
+ "853": "thatch, thatched roof",
856
+ "854": "theater curtain, theatre curtain",
857
+ "855": "thimble",
858
+ "856": "thresher, thrasher, threshing machine",
859
+ "857": "throne",
860
+ "858": "tile roof",
861
+ "859": "toaster",
862
+ "860": "tobacco shop, tobacconist shop, tobacconist",
863
+ "861": "toilet seat",
864
+ "862": "torch",
865
+ "863": "totem pole",
866
+ "864": "tow truck, tow car, wrecker",
867
+ "865": "toyshop",
868
+ "866": "tractor",
869
+ "867": "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi",
870
+ "868": "tray",
871
+ "869": "trench coat",
872
+ "870": "tricycle, trike, velocipede",
873
+ "871": "trimaran",
874
+ "872": "tripod",
875
+ "873": "triumphal arch",
876
+ "874": "trolleybus, trolley coach, trackless trolley",
877
+ "875": "trombone",
878
+ "876": "tub, vat",
879
+ "877": "turnstile",
880
+ "878": "typewriter keyboard",
881
+ "879": "umbrella",
882
+ "880": "unicycle, monocycle",
883
+ "881": "upright, upright piano",
884
+ "882": "vacuum, vacuum cleaner",
885
+ "883": "vase",
886
+ "884": "vault",
887
+ "885": "velvet",
888
+ "886": "vending machine",
889
+ "887": "vestment",
890
+ "888": "viaduct",
891
+ "889": "violin, fiddle",
892
+ "890": "volleyball",
893
+ "891": "waffle iron",
894
+ "892": "wall clock",
895
+ "893": "wallet, billfold, notecase, pocketbook",
896
+ "894": "wardrobe, closet, press",
897
+ "895": "warplane, military plane",
898
+ "896": "washbasin, handbasin, washbowl, lavabo, wash-hand basin",
899
+ "897": "washer, automatic washer, washing machine",
900
+ "898": "water bottle",
901
+ "899": "water jug",
902
+ "900": "water tower",
903
+ "901": "whiskey jug",
904
+ "902": "whistle",
905
+ "903": "wig",
906
+ "904": "window screen",
907
+ "905": "window shade",
908
+ "906": "Windsor tie",
909
+ "907": "wine bottle",
910
+ "908": "wing",
911
+ "909": "wok",
912
+ "910": "wooden spoon",
913
+ "911": "wool, woolen, woollen",
914
+ "912": "worm fence, snake fence, snake-rail fence, Virginia fence",
915
+ "913": "wreck",
916
+ "914": "yawl",
917
+ "915": "yurt",
918
+ "916": "web site, website, internet site, site",
919
+ "917": "comic book",
920
+ "918": "crossword puzzle, crossword",
921
+ "919": "street sign",
922
+ "920": "traffic light, traffic signal, stoplight",
923
+ "921": "book jacket, dust cover, dust jacket, dust wrapper",
924
+ "922": "menu",
925
+ "923": "plate",
926
+ "924": "guacamole",
927
+ "925": "consomme",
928
+ "926": "hot pot, hotpot",
929
+ "927": "trifle",
930
+ "928": "ice cream, icecream",
931
+ "929": "ice lolly, lolly, lollipop, popsicle",
932
+ "930": "French loaf",
933
+ "931": "bagel, beigel",
934
+ "932": "pretzel",
935
+ "933": "cheeseburger",
936
+ "934": "hotdog, hot dog, red hot",
937
+ "935": "mashed potato",
938
+ "936": "head cabbage",
939
+ "937": "broccoli",
940
+ "938": "cauliflower",
941
+ "939": "zucchini, courgette",
942
+ "940": "spaghetti squash",
943
+ "941": "acorn squash",
944
+ "942": "butternut squash",
945
+ "943": "cucumber, cuke",
946
+ "944": "artichoke, globe artichoke",
947
+ "945": "bell pepper",
948
+ "946": "cardoon",
949
+ "947": "mushroom",
950
+ "948": "Granny Smith",
951
+ "949": "strawberry",
952
+ "950": "orange",
953
+ "951": "lemon",
954
+ "952": "fig",
955
+ "953": "pineapple, ananas",
956
+ "954": "banana",
957
+ "955": "jackfruit, jak, jack",
958
+ "956": "custard apple",
959
+ "957": "pomegranate",
960
+ "958": "hay",
961
+ "959": "carbonara",
962
+ "960": "chocolate sauce, chocolate syrup",
963
+ "961": "dough",
964
+ "962": "meat loaf, meatloaf",
965
+ "963": "pizza, pizza pie",
966
+ "964": "potpie",
967
+ "965": "burrito",
968
+ "966": "red wine",
969
+ "967": "espresso",
970
+ "968": "cup",
971
+ "969": "eggnog",
972
+ "970": "alp",
973
+ "971": "bubble",
974
+ "972": "cliff, drop, drop-off",
975
+ "973": "coral reef",
976
+ "974": "geyser",
977
+ "975": "lakeside, lakeshore",
978
+ "976": "promontory, headland, head, foreland",
979
+ "977": "sandbar, sand bar",
980
+ "978": "seashore, coast, seacoast, sea-coast",
981
+ "979": "valley, vale",
982
+ "980": "volcano",
983
+ "981": "ballplayer, baseball player",
984
+ "982": "groom, bridegroom",
985
+ "983": "scuba diver",
986
+ "984": "rapeseed",
987
+ "985": "daisy",
988
+ "986": "yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
989
+ "987": "corn",
990
+ "988": "acorn",
991
+ "989": "hip, rose hip, rosehip",
992
+ "990": "buckeye, horse chestnut, conker",
993
+ "991": "coral fungus",
994
+ "992": "agaric",
995
+ "993": "gyromitra",
996
+ "994": "stinkhorn, carrion fungus",
997
+ "995": "earthstar",
998
+ "996": "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",
999
+ "997": "bolete",
1000
+ "998": "ear, spike, capitulum",
1001
+ "999": "toilet tissue, toilet paper, bathroom tissue"
1002
+ }
labels/imagenet_labels.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ImageNet-1k class labels for ADM class-conditional generation.
2
+
3
+ Labels are stored as Hugging Face-style ``id2label`` JSON maps (string keys ``"0"``–``"999"``).
4
+ Each value is a comma-separated list of synonyms for that class id.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Literal
12
+
13
+ Language = Literal["en", "cn"]
14
+
15
+ _LABELS_DIR = Path(__file__).resolve().parent
16
+
17
+
18
def load_id2label(
    labels_dir: Path | str | None = None,
    lang: Language = "en",
) -> dict[int, str]:
    """Load ``id2label`` from ``id2label_en.json`` or ``id2label_cn.json``.

    Args:
        labels_dir: Directory that holds the label JSON files. ``None`` selects
            the directory containing this module.
        lang: Which label file to read: ``"en"`` or ``"cn"``.

    Returns:
        A mapping from integer class id to the label string stored in the file
        (the JSON keys are converted with ``int``; values are returned as-is).

    Raises:
        ValueError: If ``lang`` is not one of the supported languages.
        FileNotFoundError: If the selected label file does not exist.
    """
    filenames = {"en": "id2label_en.json", "cn": "id2label_cn.json"}
    # Reject unknown languages explicitly; previously anything other than
    # "en" silently fell through to the Chinese label file.
    if lang not in filenames:
        supported = ", ".join(repr(name) for name in sorted(filenames))
        raise ValueError(f"Unsupported lang {lang!r}; expected one of: {supported}")

    root = Path(labels_dir) if labels_dir is not None else _LABELS_DIR
    path = root / filenames[lang]
    if not path.exists():
        raise FileNotFoundError(f"ImageNet label file not found: {path}")

    raw = json.loads(path.read_text(encoding="utf-8"))
    # JSON object keys are always strings; callers work with integer class ids.
    return {int(key): value for key, value in raw.items()}
31
+
32
+
33
def build_label2id(id2label: dict[int, str]) -> dict[str, int]:
    """Invert an ``id2label`` dict into a synonym -> class id lookup (DiT-style).

    Each value is split on commas; every piece is stripped of surrounding
    whitespace and empty pieces are dropped. If the same synonym occurs under
    more than one class id, the id encountered last while iterating
    ``id2label`` is the one kept. The returned dict is ordered by synonym.
    """
    synonym_to_id: dict[str, int] = {
        name: int(class_id)
        for class_id, names in id2label.items()
        for name in map(str.strip, names.split(","))
        if name
    }
    # Rebuild in sorted key order so iteration over the result is stable.
    return {name: synonym_to_id[name] for name in sorted(synonym_to_id)}
42
+
43
+
44
def resolve_label_ids(
    labels: str | list[str],
    label2id: dict[str, int],
    *,
    lang: Language = "en",
) -> list[int]:
    """Map one or more label strings to ImageNet class ids.

    Args:
        labels: A single label, or a collection of labels. Any iterable of
            strings is accepted; it is traversed exactly once, so one-shot
            iterators are resolved correctly.
        label2id: Lookup table from label string to class id.
        lang: Only used to annotate the error message.

    Returns:
        The class ids for ``labels``, in the same order (duplicates preserved).

    Raises:
        ValueError: If any label is not a key of ``label2id``. The message lists
            every unknown label and a short preview of valid ones.
    """
    if isinstance(labels, str):
        labels = [labels]

    # Single pass: collect hits and misses together so that the input is not
    # iterated a second time (a generator would already be exhausted).
    resolved: list[int] = []
    missing: list[str] = []
    for label in labels:
        if label in label2id:
            resolved.append(label2id[label])
        else:
            missing.append(label)

    if missing:
        preview = ", ".join(list(label2id.keys())[:8])
        raise ValueError(
            f"Unknown label(s) for lang={lang!r}: {missing}. "
            f"Example valid labels: {preview}, ..."
        )
    return resolved