root
commited on
Commit
·
036458a
1
Parent(s):
33ec12f
Fix LFS and upload model
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- __init__.py +18 -0
- assets/{banner.png → input_0_0.png} +2 -2
- assets/{logo.png → input_1_0.png} +2 -2
- assets/{robot.png → input_1_1.png} +2 -2
- assets/{framework.png → input_2_0.png} +2 -2
- assets/{gsb.png → input_2_1.png} +2 -2
- assets/input_2_2.png +3 -0
- assets/pg_imgs/image1.png +0 -3
- assets/pg_imgs/image2.png +0 -3
- assets/pg_imgs/image3.png +0 -3
- assets/pg_imgs/image4.png +0 -3
- assets/pg_imgs/image5.png +0 -3
- assets/pg_imgs/image6.png +0 -3
- assets/pg_imgs/image7.png +0 -3
- assets/pg_imgs/image8.png +0 -3
- assets/ssae_side_by_side_comparison.png +0 -3
- assets/ssae_side_by_side_heatmap.png +0 -3
- assets/user.png +0 -3
- autoencoder_kl_3d.py +1081 -0
- cache_utils.py +226 -0
- config.json +283 -0
- configuration_hunyuan_image_3.py +310 -0
- generation_config.json +21 -0
- hunyuan_image_3_pipeline.py +913 -0
- image_processor.py +465 -0
- model-0001-of-0032.safetensors +3 -0
- model-0002-of-0032.safetensors +3 -0
- model-0003-of-0032.safetensors +3 -0
- model-0004-of-0032.safetensors +3 -0
- model-0005-of-0032.safetensors +3 -0
- model-0006-of-0032.safetensors +3 -0
- model-0007-of-0032.safetensors +3 -0
- model-0008-of-0032.safetensors +3 -0
- model-0009-of-0032.safetensors +3 -0
- model-0010-of-0032.safetensors +3 -0
- model-0011-of-0032.safetensors +3 -0
- model-0012-of-0032.safetensors +3 -0
- model-0013-of-0032.safetensors +3 -0
- model-0014-of-0032.safetensors +3 -0
- model-0015-of-0032.safetensors +3 -0
- model-0016-of-0032.safetensors +3 -0
- model-0017-of-0032.safetensors +3 -0
- model-0018-of-0032.safetensors +3 -0
- model-0019-of-0032.safetensors +3 -0
- model-0020-of-0032.safetensors +3 -0
- model-0021-of-0032.safetensors +3 -0
- model-0022-of-0032.safetensors +3 -0
- model-0023-of-0032.safetensors +3 -0
- model-0024-of-0032.safetensors +3 -0
.gitattributes
CHANGED
|
@@ -37,3 +37,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 37 |
assets/banner_all.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
assets/**/*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 37 |
assets/banner_all.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
assets/**/*.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TYPE_CHECKING
|
| 2 |
+
|
| 3 |
+
from utils import _LazyModule
|
| 4 |
+
from utils.import_utils import define_import_structure
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from .configuration_hunyuan_image_3 import *
|
| 9 |
+
from .modeling_hunyuan_image_3 import *
|
| 10 |
+
from .autoencoder_kl_3d import *
|
| 11 |
+
from .image_processor import *
|
| 12 |
+
from .siglip2 import *
|
| 13 |
+
from .tokenization_hunyuan_image_3 import *
|
| 14 |
+
else:
|
| 15 |
+
import sys
|
| 16 |
+
|
| 17 |
+
_file = globals()["__file__"]
|
| 18 |
+
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
assets/{banner.png → input_0_0.png}
RENAMED
|
File without changes
|
assets/{logo.png → input_1_0.png}
RENAMED
|
File without changes
|
assets/{robot.png → input_1_1.png}
RENAMED
|
File without changes
|
assets/{framework.png → input_2_0.png}
RENAMED
|
File without changes
|
assets/{gsb.png → input_2_1.png}
RENAMED
|
File without changes
|
assets/input_2_2.png
ADDED
|
Git LFS Details
|
assets/pg_imgs/image1.png
DELETED
Git LFS Details
|
assets/pg_imgs/image2.png
DELETED
Git LFS Details
|
assets/pg_imgs/image3.png
DELETED
Git LFS Details
|
assets/pg_imgs/image4.png
DELETED
Git LFS Details
|
assets/pg_imgs/image5.png
DELETED
Git LFS Details
|
assets/pg_imgs/image6.png
DELETED
Git LFS Details
|
assets/pg_imgs/image7.png
DELETED
Git LFS Details
|
assets/pg_imgs/image8.png
DELETED
Git LFS Details
|
assets/ssae_side_by_side_comparison.png
DELETED
Git LFS Details
|
assets/ssae_side_by_side_heatmap.png
DELETED
Git LFS Details
|
assets/user.png
DELETED
Git LFS Details
|
autoencoder_kl_3d.py
ADDED
|
@@ -0,0 +1,1081 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reference code
|
| 3 |
+
[FLUX] https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/autoencoder.py
|
| 4 |
+
[DCAE] https://github.com/mit-han-lab/efficientvit/blob/master/efficientvit/models/efficientvit/dc_ae.py
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Tuple, Optional
|
| 9 |
+
import math
|
| 10 |
+
import random
|
| 11 |
+
import numpy as np
|
| 12 |
+
from einops import rearrange
|
| 13 |
+
import torch
|
| 14 |
+
from torch import Tensor, nn
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
import torch.distributed as dist
|
| 17 |
+
import torch.multiprocessing as mp
|
| 18 |
+
|
| 19 |
+
from safetensors import safe_open
|
| 20 |
+
import os
|
| 21 |
+
from collections import OrderedDict
|
| 22 |
+
from collections.abc import Iterable
|
| 23 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 24 |
+
from diffusers.models.modeling_outputs import AutoencoderKLOutput
|
| 25 |
+
from diffusers.models.modeling_utils import ModelMixin
|
| 26 |
+
from diffusers.utils.torch_utils import randn_tensor
|
| 27 |
+
from diffusers.utils import BaseOutput
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class DiagonalGaussianDistribution(object):
|
| 32 |
+
def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
|
| 33 |
+
if parameters.ndim == 3:
|
| 34 |
+
dim = 2 # (B, L, C)
|
| 35 |
+
elif parameters.ndim == 5 or parameters.ndim == 4:
|
| 36 |
+
dim = 1 # (B, C, T, H ,W) / (B, C, H, W)
|
| 37 |
+
else:
|
| 38 |
+
raise NotImplementedError
|
| 39 |
+
self.parameters = parameters
|
| 40 |
+
self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
|
| 41 |
+
self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
|
| 42 |
+
self.deterministic = deterministic
|
| 43 |
+
self.std = torch.exp(0.5 * self.logvar)
|
| 44 |
+
self.var = torch.exp(self.logvar)
|
| 45 |
+
if self.deterministic:
|
| 46 |
+
self.var = self.std = torch.zeros_like(
|
| 47 |
+
self.mean, device=self.parameters.device, dtype=self.parameters.dtype
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
|
| 51 |
+
# make sure sample is on the same device as the parameters and has same dtype
|
| 52 |
+
sample = randn_tensor(
|
| 53 |
+
self.mean.shape,
|
| 54 |
+
generator=generator,
|
| 55 |
+
device=self.parameters.device,
|
| 56 |
+
dtype=self.parameters.dtype,
|
| 57 |
+
)
|
| 58 |
+
x = self.mean + self.std * sample
|
| 59 |
+
return x
|
| 60 |
+
|
| 61 |
+
def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
|
| 62 |
+
if self.deterministic:
|
| 63 |
+
return torch.Tensor([0.0])
|
| 64 |
+
else:
|
| 65 |
+
reduce_dim = list(range(1, self.mean.ndim))
|
| 66 |
+
if other is None:
|
| 67 |
+
return 0.5 * torch.sum(
|
| 68 |
+
torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
|
| 69 |
+
dim=reduce_dim,
|
| 70 |
+
)
|
| 71 |
+
else:
|
| 72 |
+
return 0.5 * torch.sum(
|
| 73 |
+
torch.pow(self.mean - other.mean, 2) / other.var +
|
| 74 |
+
self.var / other.var -
|
| 75 |
+
1.0 -
|
| 76 |
+
self.logvar +
|
| 77 |
+
other.logvar,
|
| 78 |
+
dim=reduce_dim,
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
|
| 82 |
+
if self.deterministic:
|
| 83 |
+
return torch.Tensor([0.0])
|
| 84 |
+
logtwopi = np.log(2.0 * np.pi)
|
| 85 |
+
return 0.5 * torch.sum(
|
| 86 |
+
logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
|
| 87 |
+
dim=dims,
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
def mode(self) -> torch.Tensor:
|
| 91 |
+
return self.mean
|
| 92 |
+
|
| 93 |
+
@dataclass
|
| 94 |
+
class DecoderOutput(BaseOutput):
|
| 95 |
+
sample: torch.FloatTensor
|
| 96 |
+
posterior: Optional[DiagonalGaussianDistribution] = None
|
| 97 |
+
|
| 98 |
+
def swish(x: Tensor) -> Tensor:
|
| 99 |
+
return x * torch.sigmoid(x)
|
| 100 |
+
|
| 101 |
+
def forward_with_checkpointing(module, *inputs, use_checkpointing=False):
|
| 102 |
+
def create_custom_forward(module):
|
| 103 |
+
def custom_forward(*inputs):
|
| 104 |
+
return module(*inputs)
|
| 105 |
+
return custom_forward
|
| 106 |
+
|
| 107 |
+
if use_checkpointing:
|
| 108 |
+
return torch.utils.checkpoint.checkpoint(create_custom_forward(module), *inputs, use_reentrant=False)
|
| 109 |
+
else:
|
| 110 |
+
return module(*inputs)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class Conv3d(nn.Conv3d):
|
| 114 |
+
"""Perform Conv3d on patches with numerical differences from nn.Conv3d within 1e-5. Only symmetric padding is supported."""
|
| 115 |
+
|
| 116 |
+
def forward(self, input):
|
| 117 |
+
B, C, T, H, W = input.shape
|
| 118 |
+
memory_count = (C * T * H * W) * 2 / 1024**3
|
| 119 |
+
if memory_count > 2:
|
| 120 |
+
n_split = math.ceil(memory_count / 2)
|
| 121 |
+
assert n_split >= 2
|
| 122 |
+
chunks = torch.chunk(input, chunks=n_split, dim=-3)
|
| 123 |
+
padded_chunks = []
|
| 124 |
+
for i in range(len(chunks)):
|
| 125 |
+
if self.padding[0] > 0:
|
| 126 |
+
padded_chunk = F.pad(
|
| 127 |
+
chunks[i],
|
| 128 |
+
(0, 0, 0, 0, self.padding[0], self.padding[0]),
|
| 129 |
+
mode="constant" if self.padding_mode == "zeros" else self.padding_mode,
|
| 130 |
+
value=0,
|
| 131 |
+
)
|
| 132 |
+
if i > 0:
|
| 133 |
+
padded_chunk[:, :, :self.padding[0]] = chunks[i - 1][:, :, -self.padding[0]:]
|
| 134 |
+
if i < len(chunks) - 1:
|
| 135 |
+
padded_chunk[:, :, -self.padding[0]:] = chunks[i + 1][:, :, :self.padding[0]]
|
| 136 |
+
else:
|
| 137 |
+
padded_chunk = chunks[i]
|
| 138 |
+
padded_chunks.append(padded_chunk)
|
| 139 |
+
padding_bak = self.padding
|
| 140 |
+
self.padding = (0, self.padding[1], self.padding[2])
|
| 141 |
+
outputs = []
|
| 142 |
+
for i in range(len(padded_chunks)):
|
| 143 |
+
outputs.append(super().forward(padded_chunks[i]))
|
| 144 |
+
self.padding = padding_bak
|
| 145 |
+
return torch.cat(outputs, dim=-3)
|
| 146 |
+
else:
|
| 147 |
+
return super().forward(input)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class AttnBlock(nn.Module):
|
| 151 |
+
def __init__(self, in_channels: int):
|
| 152 |
+
super().__init__()
|
| 153 |
+
self.in_channels = in_channels
|
| 154 |
+
|
| 155 |
+
self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
| 156 |
+
|
| 157 |
+
self.q = Conv3d(in_channels, in_channels, kernel_size=1)
|
| 158 |
+
self.k = Conv3d(in_channels, in_channels, kernel_size=1)
|
| 159 |
+
self.v = Conv3d(in_channels, in_channels, kernel_size=1)
|
| 160 |
+
self.proj_out = Conv3d(in_channels, in_channels, kernel_size=1)
|
| 161 |
+
|
| 162 |
+
def attention(self, h_: Tensor) -> Tensor:
|
| 163 |
+
h_ = self.norm(h_)
|
| 164 |
+
q = self.q(h_)
|
| 165 |
+
k = self.k(h_)
|
| 166 |
+
v = self.v(h_)
|
| 167 |
+
|
| 168 |
+
b, c, f, h, w = q.shape
|
| 169 |
+
q = rearrange(q, "b c f h w -> b 1 (f h w) c").contiguous()
|
| 170 |
+
k = rearrange(k, "b c f h w -> b 1 (f h w) c").contiguous()
|
| 171 |
+
v = rearrange(v, "b c f h w -> b 1 (f h w) c").contiguous()
|
| 172 |
+
h_ = nn.functional.scaled_dot_product_attention(q, k, v)
|
| 173 |
+
|
| 174 |
+
return rearrange(h_, "b 1 (f h w) c -> b c f h w", f=f, h=h, w=w, c=c, b=b)
|
| 175 |
+
|
| 176 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 177 |
+
return x + self.proj_out(self.attention(x))
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
class ResnetBlock(nn.Module):
|
| 181 |
+
def __init__(self, in_channels: int, out_channels: int):
|
| 182 |
+
super().__init__()
|
| 183 |
+
self.in_channels = in_channels
|
| 184 |
+
out_channels = in_channels if out_channels is None else out_channels
|
| 185 |
+
self.out_channels = out_channels
|
| 186 |
+
|
| 187 |
+
self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
| 188 |
+
self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 189 |
+
self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
|
| 190 |
+
self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 191 |
+
if self.in_channels != self.out_channels:
|
| 192 |
+
self.nin_shortcut = Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
|
| 193 |
+
|
| 194 |
+
def forward(self, x):
|
| 195 |
+
h = x
|
| 196 |
+
h = self.norm1(h)
|
| 197 |
+
h = swish(h)
|
| 198 |
+
h = self.conv1(h)
|
| 199 |
+
|
| 200 |
+
h = self.norm2(h)
|
| 201 |
+
h = swish(h)
|
| 202 |
+
h = self.conv2(h)
|
| 203 |
+
|
| 204 |
+
if self.in_channels != self.out_channels:
|
| 205 |
+
x = self.nin_shortcut(x)
|
| 206 |
+
return x + h
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class Downsample(nn.Module):
|
| 210 |
+
def __init__(self, in_channels: int, add_temporal_downsample: bool = True):
|
| 211 |
+
super().__init__()
|
| 212 |
+
self.add_temporal_downsample = add_temporal_downsample
|
| 213 |
+
stride = (2, 2, 2) if add_temporal_downsample else (1, 2, 2) # THW
|
| 214 |
+
# no asymmetric padding in torch conv, must do it ourselves
|
| 215 |
+
self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=stride, padding=0)
|
| 216 |
+
|
| 217 |
+
def forward(self, x: Tensor):
|
| 218 |
+
spatial_pad = (0, 1, 0, 1, 0, 0) # WHT
|
| 219 |
+
x = nn.functional.pad(x, spatial_pad, mode="constant", value=0)
|
| 220 |
+
|
| 221 |
+
temporal_pad = (0, 0, 0, 0, 0, 1) if self.add_temporal_downsample else (0, 0, 0, 0, 1, 1)
|
| 222 |
+
x = nn.functional.pad(x, temporal_pad, mode="replicate")
|
| 223 |
+
|
| 224 |
+
x = self.conv(x)
|
| 225 |
+
return x
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
class DownsampleDCAE(nn.Module):
|
| 229 |
+
def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
|
| 230 |
+
super().__init__()
|
| 231 |
+
factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2
|
| 232 |
+
assert out_channels % factor == 0
|
| 233 |
+
self.conv = Conv3d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1)
|
| 234 |
+
|
| 235 |
+
self.add_temporal_downsample = add_temporal_downsample
|
| 236 |
+
self.group_size = factor * in_channels // out_channels
|
| 237 |
+
|
| 238 |
+
def forward(self, x: Tensor):
|
| 239 |
+
r1 = 2 if self.add_temporal_downsample else 1
|
| 240 |
+
h = self.conv(x)
|
| 241 |
+
h = rearrange(h, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2)
|
| 242 |
+
shortcut = rearrange(x, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2)
|
| 243 |
+
|
| 244 |
+
B, C, T, H, W = shortcut.shape
|
| 245 |
+
shortcut = shortcut.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2)
|
| 246 |
+
return h + shortcut
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
class Upsample(nn.Module):
|
| 250 |
+
def __init__(self, in_channels: int, add_temporal_upsample: bool = True):
|
| 251 |
+
super().__init__()
|
| 252 |
+
self.add_temporal_upsample = add_temporal_upsample
|
| 253 |
+
self.scale_factor = (2, 2, 2) if add_temporal_upsample else (1, 2, 2) # THW
|
| 254 |
+
self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
|
| 255 |
+
|
| 256 |
+
def forward(self, x: Tensor):
|
| 257 |
+
x = nn.functional.interpolate(x, scale_factor=self.scale_factor, mode="nearest")
|
| 258 |
+
x = self.conv(x)
|
| 259 |
+
return x
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
class UpsampleDCAE(nn.Module):
|
| 263 |
+
def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True):
|
| 264 |
+
super().__init__()
|
| 265 |
+
factor = 2 * 2 * 2 if add_temporal_upsample else 1 * 2 * 2
|
| 266 |
+
self.conv = Conv3d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1)
|
| 267 |
+
|
| 268 |
+
self.add_temporal_upsample = add_temporal_upsample
|
| 269 |
+
self.repeats = factor * out_channels // in_channels
|
| 270 |
+
|
| 271 |
+
def forward(self, x: Tensor):
|
| 272 |
+
r1 = 2 if self.add_temporal_upsample else 1
|
| 273 |
+
h = self.conv(x)
|
| 274 |
+
h = rearrange(h, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
|
| 275 |
+
shortcut = x.repeat_interleave(repeats=self.repeats, dim=1)
|
| 276 |
+
shortcut = rearrange(shortcut, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
|
| 277 |
+
return h + shortcut
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
class Encoder(nn.Module):
|
| 281 |
+
def __init__(
|
| 282 |
+
self,
|
| 283 |
+
in_channels: int,
|
| 284 |
+
z_channels: int,
|
| 285 |
+
block_out_channels: Tuple[int, ...],
|
| 286 |
+
num_res_blocks: int,
|
| 287 |
+
ffactor_spatial: int,
|
| 288 |
+
ffactor_temporal: int,
|
| 289 |
+
downsample_match_channel: bool = True,
|
| 290 |
+
):
|
| 291 |
+
super().__init__()
|
| 292 |
+
assert block_out_channels[-1] % (2 * z_channels) == 0
|
| 293 |
+
|
| 294 |
+
self.z_channels = z_channels
|
| 295 |
+
self.block_out_channels = block_out_channels
|
| 296 |
+
self.num_res_blocks = num_res_blocks
|
| 297 |
+
|
| 298 |
+
# downsampling
|
| 299 |
+
self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
|
| 300 |
+
|
| 301 |
+
self.down = nn.ModuleList()
|
| 302 |
+
block_in = block_out_channels[0]
|
| 303 |
+
for i_level, ch in enumerate(block_out_channels):
|
| 304 |
+
block = nn.ModuleList()
|
| 305 |
+
block_out = ch
|
| 306 |
+
for _ in range(self.num_res_blocks):
|
| 307 |
+
block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
|
| 308 |
+
block_in = block_out
|
| 309 |
+
down = nn.Module()
|
| 310 |
+
down.block = block
|
| 311 |
+
|
| 312 |
+
add_spatial_downsample = bool(i_level < np.log2(ffactor_spatial))
|
| 313 |
+
add_temporal_downsample = add_spatial_downsample and bool(i_level >= np.log2(ffactor_spatial // ffactor_temporal))
|
| 314 |
+
if add_spatial_downsample or add_temporal_downsample:
|
| 315 |
+
assert i_level < len(block_out_channels) - 1
|
| 316 |
+
block_out = block_out_channels[i_level + 1] if downsample_match_channel else block_in
|
| 317 |
+
down.downsample = DownsampleDCAE(block_in, block_out, add_temporal_downsample)
|
| 318 |
+
block_in = block_out
|
| 319 |
+
self.down.append(down)
|
| 320 |
+
|
| 321 |
+
# middle
|
| 322 |
+
self.mid = nn.Module()
|
| 323 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 324 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
| 325 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 326 |
+
|
| 327 |
+
# end
|
| 328 |
+
self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
|
| 329 |
+
self.conv_out = Conv3d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
|
| 330 |
+
|
| 331 |
+
self.gradient_checkpointing = False
|
| 332 |
+
|
| 333 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 334 |
+
with torch.no_grad():
|
| 335 |
+
use_checkpointing = bool(self.training and self.gradient_checkpointing)
|
| 336 |
+
|
| 337 |
+
# downsampling
|
| 338 |
+
h = self.conv_in(x)
|
| 339 |
+
for i_level in range(len(self.block_out_channels)):
|
| 340 |
+
for i_block in range(self.num_res_blocks):
|
| 341 |
+
h = forward_with_checkpointing(self.down[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
|
| 342 |
+
if hasattr(self.down[i_level], "downsample"):
|
| 343 |
+
h = forward_with_checkpointing(self.down[i_level].downsample, h, use_checkpointing=use_checkpointing)
|
| 344 |
+
|
| 345 |
+
# middle
|
| 346 |
+
h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing)
|
| 347 |
+
h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing)
|
| 348 |
+
h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing)
|
| 349 |
+
|
| 350 |
+
# end
|
| 351 |
+
group_size = self.block_out_channels[-1] // (2 * self.z_channels)
|
| 352 |
+
shortcut = rearrange(h, "b (c r) f h w -> b c r f h w", r=group_size).mean(dim=2)
|
| 353 |
+
h = self.norm_out(h)
|
| 354 |
+
h = swish(h)
|
| 355 |
+
h = self.conv_out(h)
|
| 356 |
+
h += shortcut
|
| 357 |
+
return h
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
class Decoder(nn.Module):
|
| 361 |
+
def __init__(
|
| 362 |
+
self,
|
| 363 |
+
z_channels: int,
|
| 364 |
+
out_channels: int,
|
| 365 |
+
block_out_channels: Tuple[int, ...],
|
| 366 |
+
num_res_blocks: int,
|
| 367 |
+
ffactor_spatial: int,
|
| 368 |
+
ffactor_temporal: int,
|
| 369 |
+
upsample_match_channel: bool = True,
|
| 370 |
+
):
|
| 371 |
+
super().__init__()
|
| 372 |
+
assert block_out_channels[0] % z_channels == 0
|
| 373 |
+
|
| 374 |
+
self.z_channels = z_channels
|
| 375 |
+
self.block_out_channels = block_out_channels
|
| 376 |
+
self.num_res_blocks = num_res_blocks
|
| 377 |
+
|
| 378 |
+
# z to block_in
|
| 379 |
+
block_in = block_out_channels[0]
|
| 380 |
+
self.conv_in = Conv3d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
|
| 381 |
+
|
| 382 |
+
# middle
|
| 383 |
+
self.mid = nn.Module()
|
| 384 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 385 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
| 386 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 387 |
+
|
| 388 |
+
# upsampling
|
| 389 |
+
self.up = nn.ModuleList()
|
| 390 |
+
for i_level, ch in enumerate(block_out_channels):
|
| 391 |
+
block = nn.ModuleList()
|
| 392 |
+
block_out = ch
|
| 393 |
+
for _ in range(self.num_res_blocks + 1):
|
| 394 |
+
block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
|
| 395 |
+
block_in = block_out
|
| 396 |
+
up = nn.Module()
|
| 397 |
+
up.block = block
|
| 398 |
+
|
| 399 |
+
add_spatial_upsample = bool(i_level < np.log2(ffactor_spatial))
|
| 400 |
+
add_temporal_upsample = bool(i_level < np.log2(ffactor_temporal))
|
| 401 |
+
if add_spatial_upsample or add_temporal_upsample:
|
| 402 |
+
assert i_level < len(block_out_channels) - 1
|
| 403 |
+
block_out = block_out_channels[i_level + 1] if upsample_match_channel else block_in
|
| 404 |
+
up.upsample = UpsampleDCAE(block_in, block_out, add_temporal_upsample)
|
| 405 |
+
block_in = block_out
|
| 406 |
+
self.up.append(up)
|
| 407 |
+
|
| 408 |
+
# end
|
| 409 |
+
self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
|
| 410 |
+
self.conv_out = Conv3d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
|
| 411 |
+
|
| 412 |
+
self.gradient_checkpointing = False
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
def forward(self, z: Tensor) -> Tensor:
|
| 416 |
+
with torch.no_grad():
|
| 417 |
+
use_checkpointing = bool(self.training and self.gradient_checkpointing)
|
| 418 |
+
# z to block_in
|
| 419 |
+
repeats = self.block_out_channels[0] // (self.z_channels)
|
| 420 |
+
h = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
|
| 421 |
+
# middle
|
| 422 |
+
h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing)
|
| 423 |
+
h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing)
|
| 424 |
+
h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing)
|
| 425 |
+
# upsampling
|
| 426 |
+
for i_level in range(len(self.block_out_channels)):
|
| 427 |
+
for i_block in range(self.num_res_blocks + 1):
|
| 428 |
+
h = forward_with_checkpointing(self.up[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
|
| 429 |
+
if hasattr(self.up[i_level], "upsample"):
|
| 430 |
+
h = forward_with_checkpointing(self.up[i_level].upsample, h, use_checkpointing=use_checkpointing)
|
| 431 |
+
# end
|
| 432 |
+
h = self.norm_out(h)
|
| 433 |
+
h = swish(h)
|
| 434 |
+
h = self.conv_out(h)
|
| 435 |
+
return h
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
class AutoencoderKLConv3D(ModelMixin, ConfigMixin):
|
| 439 |
+
_supports_gradient_checkpointing = True
|
| 440 |
+
|
| 441 |
+
@register_to_config
|
| 442 |
+
def __init__(
|
| 443 |
+
self,
|
| 444 |
+
in_channels: int,
|
| 445 |
+
out_channels: int,
|
| 446 |
+
latent_channels: int,
|
| 447 |
+
block_out_channels: Tuple[int, ...],
|
| 448 |
+
layers_per_block: int,
|
| 449 |
+
ffactor_spatial: int,
|
| 450 |
+
ffactor_temporal: int,
|
| 451 |
+
sample_size: int,
|
| 452 |
+
sample_tsize: int,
|
| 453 |
+
scaling_factor: float = None,
|
| 454 |
+
shift_factor: Optional[float] = None,
|
| 455 |
+
downsample_match_channel: bool = True,
|
| 456 |
+
upsample_match_channel: bool = True,
|
| 457 |
+
only_encoder: bool = False,
|
| 458 |
+
only_decoder: bool = False,
|
| 459 |
+
):
|
| 460 |
+
super().__init__()
|
| 461 |
+
self.ffactor_spatial = ffactor_spatial
|
| 462 |
+
self.ffactor_temporal = ffactor_temporal
|
| 463 |
+
self.scaling_factor = scaling_factor
|
| 464 |
+
self.shift_factor = shift_factor
|
| 465 |
+
|
| 466 |
+
if not only_decoder:
|
| 467 |
+
self.encoder = Encoder(
|
| 468 |
+
in_channels=in_channels,
|
| 469 |
+
z_channels=latent_channels,
|
| 470 |
+
block_out_channels=block_out_channels,
|
| 471 |
+
num_res_blocks=layers_per_block,
|
| 472 |
+
ffactor_spatial=ffactor_spatial,
|
| 473 |
+
ffactor_temporal=ffactor_temporal,
|
| 474 |
+
downsample_match_channel=downsample_match_channel,
|
| 475 |
+
)
|
| 476 |
+
if not only_encoder:
|
| 477 |
+
self.decoder = Decoder(
|
| 478 |
+
z_channels=latent_channels,
|
| 479 |
+
out_channels=out_channels,
|
| 480 |
+
block_out_channels=list(reversed(block_out_channels)),
|
| 481 |
+
num_res_blocks=layers_per_block,
|
| 482 |
+
ffactor_spatial=ffactor_spatial,
|
| 483 |
+
ffactor_temporal=ffactor_temporal,
|
| 484 |
+
upsample_match_channel=upsample_match_channel,
|
| 485 |
+
)
|
| 486 |
+
|
| 487 |
+
self.use_slicing = False
|
| 488 |
+
self.slicing_bsz = 1
|
| 489 |
+
self.use_spatial_tiling = False
|
| 490 |
+
self.use_temporal_tiling = False
|
| 491 |
+
self.use_tiling_during_training = False
|
| 492 |
+
|
| 493 |
+
# only relevant if vae tiling is enabled
|
| 494 |
+
self.tile_sample_min_size = sample_size
|
| 495 |
+
self.tile_latent_min_size = sample_size // ffactor_spatial
|
| 496 |
+
self.tile_sample_min_tsize = sample_tsize
|
| 497 |
+
self.tile_latent_min_tsize = sample_tsize // ffactor_temporal
|
| 498 |
+
self.tile_overlap_factor = 0.125
|
| 499 |
+
|
| 500 |
+
self.use_compile = False
|
| 501 |
+
|
| 502 |
+
self.empty_cache = torch.empty(0, device="cuda")
|
| 503 |
+
|
| 504 |
+
def _set_gradient_checkpointing(self, module, value=False):
    """Enable/disable gradient checkpointing on a child Encoder or Decoder.

    Called by the framework for each submodule; only Encoder/Decoder
    instances carry the ``gradient_checkpointing`` flag.
    """
    if isinstance(module, (Encoder, Decoder)):
        module.gradient_checkpointing = value
|
| 507 |
+
|
| 508 |
+
def enable_tiling_during_training(self, use_tiling: bool = True):
    """Switch tiled processing during training on (default) or off."""
    self.use_tiling_during_training = use_tiling

def disable_tiling_during_training(self):
    """Convenience wrapper: turn training-time tiling off."""
    self.enable_tiling_during_training(False)

def enable_temporal_tiling(self, use_tiling: bool = True):
    """Switch temporal tiling on (default) or off."""
    self.use_temporal_tiling = use_tiling

def disable_temporal_tiling(self):
    """Convenience wrapper: turn temporal tiling off."""
    self.enable_temporal_tiling(False)

def enable_spatial_tiling(self, use_tiling: bool = True):
    """Switch spatial tiling on (default) or off."""
    self.use_spatial_tiling = use_tiling

def disable_spatial_tiling(self):
    """Convenience wrapper: turn spatial tiling off."""
    self.enable_spatial_tiling(False)

def enable_tiling(self, use_tiling: bool = True):
    """Alias for spatial tiling control (temporal tiling is left untouched)."""
    self.enable_spatial_tiling(use_tiling)

def disable_tiling(self):
    """Alias for disabling spatial tiling."""
    self.disable_spatial_tiling()

def enable_slicing(self):
    """Enable batch slicing for encode/decode."""
    self.use_slicing = True

def disable_slicing(self):
    """Disable batch slicing."""
    self.use_slicing = False
|
| 537 |
+
|
| 538 |
+
def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
    """Cross-fade ``b``'s left edge with ``a``'s right edge along the width axis.

    Mutates ``b`` in place over the overlap region and returns it.
    """
    overlap = min(a.shape[-1], b.shape[-1], blend_extent)
    for col in range(overlap):
        ratio = col / overlap
        b[:, :, :, :, col] = a[:, :, :, :, col - overlap] * (1 - ratio) + b[:, :, :, :, col] * ratio
    return b
|
| 543 |
+
|
| 544 |
+
def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
    """Cross-fade ``b``'s top edge with ``a``'s bottom edge along the height axis.

    Mutates ``b`` in place over the overlap region and returns it.
    """
    overlap = min(a.shape[-2], b.shape[-2], blend_extent)
    for row in range(overlap):
        ratio = row / overlap
        b[:, :, :, row, :] = a[:, :, :, row - overlap, :] * (1 - ratio) + b[:, :, :, row, :] * ratio
    return b
|
| 549 |
+
|
| 550 |
+
def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
    """Cross-fade ``b``'s leading frames with ``a``'s trailing frames (time axis).

    Mutates ``b`` in place over the overlap region and returns it.
    """
    overlap = min(a.shape[-3], b.shape[-3], blend_extent)
    for frame in range(overlap):
        ratio = frame / overlap
        b[:, :, frame, :, :] = a[:, :, frame - overlap, :, :] * (1 - ratio) + b[:, :, frame, :, :] * ratio
    return b
|
| 555 |
+
|
| 556 |
+
def spatial_tiled_encode(self, x: torch.Tensor):
    """Encode a large input tile-by-tile over H/W and blend tile borders.

    Tiles overlap by ``tile_overlap_factor`` of the tile size; after encoding,
    neighbouring tiles are cross-faded (blend_v/blend_h) and cropped so the
    concatenated latent is seamless.

    Args:
        x: input of shape (B, C, T, H, W).

    Returns:
        Latent moments tensor assembled from all tiles.
    """
    B, C, T, H, W = x.shape
    overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))  # 256 * (1 - 0.25) = 192
    blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)  # 8 * 0.25 = 2
    row_limit = self.tile_latent_min_size - blend_extent  # 8 - 2 = 6

    # Pass 1: encode every (possibly smaller, at the borders) tile.
    rows = []
    for i in range(0, H, overlap_size):
        row = []
        for j in range(0, W, overlap_size):
            tile = x[:, :, :, i: i + self.tile_sample_min_size, j: j + self.tile_sample_min_size]
            tile = self.encoder(tile)
            row.append(tile)
        rows.append(row)
    # Pass 2: blend each tile with its top and left neighbours, then crop the
    # overlap away before concatenating.
    result_rows = []
    for i, row in enumerate(rows):
        result_row = []
        for j, tile in enumerate(row):
            if i > 0:
                tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
            if j > 0:
                tile = self.blend_h(row[j - 1], tile, blend_extent)
            result_row.append(tile[:, :, :, :row_limit, :row_limit])
        result_rows.append(torch.cat(result_row, dim=-1))
    moments = torch.cat(result_rows, dim=-2)
    return moments
|
| 582 |
+
|
| 583 |
+
def temporal_tiled_encode(self, x: torch.Tensor):
    """Encode a long clip chunk-by-chunk along time and blend chunk borders.

    Each temporal chunk may additionally be spatially tiled when it exceeds
    the spatial tile size.

    Args:
        x: input of shape (B, C, T, H, W).

    Returns:
        Latent moments tensor assembled from all temporal chunks.
    """
    B, C, T, H, W = x.shape
    overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))  # 64 * (1 - 0.25) = 48
    blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)  # 8 * 0.25 = 2
    t_limit = self.tile_latent_min_tsize - blend_extent  # 8 - 2 = 6

    # Pass 1: encode each temporal chunk (spatially tiled if still too large).
    row = []
    for i in range(0, T, overlap_size):
        tile = x[:, :, i: i + self.tile_sample_min_tsize, :, :]
        if self.use_spatial_tiling and (tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
            tile = self.spatial_tiled_encode(tile)
        else:
            tile = self.encoder(tile)
        row.append(tile)
    # Pass 2: cross-fade with the previous chunk over the temporal overlap,
    # then crop the overlap away before concatenating.
    result_row = []
    for i, tile in enumerate(row):
        if i > 0:
            tile = self.blend_t(row[i - 1], tile, blend_extent)
        result_row.append(tile[:, :, :t_limit, :, :])
    moments = torch.cat(result_row, dim=-3)
    return moments
|
| 604 |
+
|
| 605 |
+
def spatial_tiled_decode(self, z: torch.Tensor):
    """Decode latents tile-by-tile over H/W, blending tile borders.

    When a torch.distributed process group with world_size > 1 is
    initialized, tiles are decoded round-robin across ranks and gathered;
    the assembled result is only valid on rank 0 (other ranks return an
    empty placeholder tensor). Otherwise tiles are decoded serially.

    Args:
        z: latent tensor of shape (B, C, T, H, W).

    Returns:
        Decoded sample tensor (rank 0 / single device), or an empty tensor
        on non-zero ranks.
    """
    B, C, T, H, W = z.shape
    overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))  # 24 * (1 - 0.125) = 21
    blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)  # 384 * 0.125 = 48
    row_limit = self.tile_sample_min_size - blend_extent  # 384 - 48 = 336

    # Distributed / multi-GPU path: no padding on the input -> each rank pads
    # its decoded output on the right/bottom -> GPU all_gather -> rank 0
    # reassembles, blends, and crops.
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        # Count the tile grid.
        num_rows = math.ceil(H / overlap_size)
        num_cols = math.ceil(W / overlap_size)
        total_tiles = num_rows * num_cols
        tiles_per_rank = math.ceil(total_tiles / world_size)

        print(f"==={torch.distributed.get_rank()}, {total_tiles=}, {tiles_per_rank=}, {world_size=}")

        # This rank's tile indices (round-robin): rank, rank + world_size, ...
        my_linear_indices = list(range(rank, total_tiles, world_size))
        if my_linear_indices == []:
            # Idle ranks still decode tile 0 so the all_gather collective
            # sees the same number of participants everywhere.
            my_linear_indices = [0]
        print(f"==={torch.distributed.get_rank()}, {my_linear_indices=}")
        decoded_tiles = []  # decoded tiles
        decoded_metas = []  # (ri, rj, pad_w, pad_h)
        H_out_std = self.tile_sample_min_size
        W_out_std = self.tile_sample_min_size
        for lin_idx in my_linear_indices:
            ri = lin_idx // num_cols
            rj = lin_idx % num_cols
            i = ri * overlap_size
            j = rj * overlap_size
            tile = z[
                :,
                :,
                :,
                i : i + self.tile_latent_min_size,
                j : j + self.tile_latent_min_size,
            ]
            dec = self.decoder(tile)
            # Pad boundary tiles' outputs (right/bottom) up to the standard size
            # so all gathered tensors share one shape.
            pad_h = max(0, H_out_std - dec.shape[-2])
            pad_w = max(0, W_out_std - dec.shape[-1])
            if pad_h > 0 or pad_w > 0:
                dec = F.pad(dec, (0, pad_w, 0, pad_h, 0, 0), "constant", 0)
            decoded_tiles.append(dec)
            decoded_metas.append(torch.tensor([ri, rj, pad_w, pad_h], device=z.device, dtype=torch.int64))

        # Ranks may own different tile counts; pad the lists to equal length
        # with placeholder tiles marked (ri, rj) == (-1, -1).
        T_out = decoded_tiles[0].shape[2] if len(decoded_tiles) > 0 else (T-1)*self.ffactor_temporal+1
        while len(decoded_tiles) < tiles_per_rank:
            decoded_tiles.append(torch.zeros([1, 3, T_out, self.tile_sample_min_size, self.tile_sample_min_size], device=z.device, dtype=dec.dtype))
            decoded_metas.append(torch.tensor([-1, -1, self.tile_sample_min_size, self.tile_sample_min_size], device=z.device, dtype=torch.int64))

        # Gather every rank's tiles and metadata on the GPU.
        decoded_tiles = torch.stack(decoded_tiles, dim=0)
        decoded_metas = torch.stack(decoded_metas, dim=0)

        tiles_gather_list = [torch.empty_like(decoded_tiles) for _ in range(world_size)]
        metas_gather_list = [torch.empty_like(decoded_metas) for _ in range(world_size)]

        dist.all_gather(tiles_gather_list, decoded_tiles)
        dist.all_gather(metas_gather_list, decoded_metas)

        if rank != 0:
            # Non-zero ranks return an empty placeholder; the result is only
            # valid on rank 0.
            return torch.empty(0, device=z.device)

        # Rank 0: rebuild the tile grid from (ri, rj) metadata; skip the
        # placeholder entries where (ri, rj) == (-1, -1).
        rows = [[None for _ in range(num_cols)] for _ in range(num_rows)]
        for r in range(world_size):
            gathered_tiles_r = tiles_gather_list[r]  # [tiles_per_rank, B, C, T, H, W]
            gathered_metas_r = metas_gather_list[r]  # [tiles_per_rank, 4], entries: (ri, rj, pad_w, pad_h)
            for k in range(gathered_tiles_r.shape[0]):
                ri = int(gathered_metas_r[k][0])
                rj = int(gathered_metas_r[k][1])
                if ri < 0 or rj < 0:
                    continue
                if ri < num_rows and rj < num_cols:
                    # Strip the right/bottom padding added before gathering.
                    pad_w = int(gathered_metas_r[k][2])
                    pad_h = int(gathered_metas_r[k][3])
                    h_end = None if pad_h == 0 else -pad_h
                    w_end = None if pad_w == 0 else -pad_w
                    rows[ri][rj] = gathered_tiles_r[k][:, :, :, :h_end, :w_end]

        # Blend each tile with its top/left neighbours and crop the overlap.
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                if tile is None:
                    continue
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=-1))

        dec = torch.cat(result_rows, dim=-2)
        return dec

    # Single device: original serial logic.
    rows = []
    for i in range(0, H, overlap_size):
        row = []
        for j in range(0, W, overlap_size):
            tile = z[
                :,
                :,
                :,
                i : i + self.tile_latent_min_size,
                j : j + self.tile_latent_min_size,
            ]
            decoded = self.decoder(tile)
            row.append(decoded)
        rows.append(row)

    result_rows = []
    for i, row in enumerate(rows):
        result_row = []
        for j, tile in enumerate(row):
            if i > 0:
                tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
            if j > 0:
                tile = self.blend_h(row[j - 1], tile, blend_extent)
            result_row.append(tile[:, :, :, :row_limit, :row_limit])
        result_rows.append(torch.cat(result_row, dim=-1))
    dec = torch.cat(result_rows, dim=-2)
    return dec
|
| 736 |
+
|
| 737 |
+
def temporal_tiled_decode(self, z: torch.Tensor):
    """Decode latents chunk-by-chunk along time, blending chunk borders.

    Each temporal chunk falls through to spatial tiling when it exceeds the
    spatial latent tile size.

    Args:
        z: latent tensor of shape (B, C, T, H, W).

    Returns:
        Decoded sample tensor assembled from all temporal chunks.
    """
    B, C, T, H, W = z.shape
    overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))  # 8 * (1 - 0.25) = 6
    blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)  # 64 * 0.25 = 16
    t_limit = self.tile_sample_min_tsize - blend_extent  # 64 - 16 = 48
    assert 0 < overlap_size < self.tile_latent_min_tsize

    # Pass 1: decode each temporal chunk.
    row = []
    for i in range(0, T, overlap_size):
        tile = z[:, :, i: i + self.tile_latent_min_tsize, :, :]
        if self.use_spatial_tiling and (tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
            decoded = self.spatial_tiled_decode(tile)
        else:
            decoded = self.decoder(tile)
        row.append(decoded)

    # Pass 2: cross-fade with the previous chunk over the temporal overlap,
    # then crop the overlap away before concatenating.
    result_row = []
    for i, tile in enumerate(row):
        if i > 0:
            tile = self.blend_t(row[i - 1], tile, blend_extent)
        result_row.append(tile[:, :, :t_limit, :, :])
    dec = torch.cat(result_row, dim=-3)
    return dec
|
| 760 |
+
|
| 761 |
+
def encode(self, x: Tensor, return_dict: bool = True):
    """Encode pixel input into latent distribution parameters.

    Args:
        x: input of shape (B, C, H, W) or (B, C, T, H, W); a 4D input gets
            a singleton time axis inserted, and a single frame is replicated
            to fill one temporal window of ``ffactor_temporal`` frames.
        return_dict: return an ``AutoencoderKLOutput`` instead of a tuple.

    Returns:
        ``AutoencoderKLOutput`` wrapping a ``DiagonalGaussianDistribution``
        built from the encoder's moments, or a 1-tuple when ``return_dict``
        is False.
    """

    def _encode(x):
        # Prefer temporal tiling, then spatial tiling, when the input exceeds
        # the configured tile sizes; otherwise run the encoder directly.
        if self.use_temporal_tiling and x.shape[-3] > self.tile_sample_min_tsize:
            return self.temporal_tiled_encode(x)
        if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
            return self.spatial_tiled_encode(x)

        if self.use_compile:
            # Wrap the encoder call so torch.compile traces it on demand.
            @torch.compile
            def encoder(x):
                return self.encoder(x)
            return encoder(x)
        return self.encoder(x)

    if len(x.shape) != 5:  # (B, C, T, H, W)
        x = x[:, :, None]
    assert len(x.shape) == 5  # (B, C, T, H, W)
    if x.shape[2] == 1:
        # Single frame: replicate along time to fill one temporal window.
        x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
    else:
        # NOTE(review): this rejects T == ffactor_temporal while requiring T
        # to be a multiple of it — confirm T must be a strict multiple > 1.
        assert x.shape[2] != self.ffactor_temporal and x.shape[2] % self.ffactor_temporal == 0

    if self.use_slicing and x.shape[0] > 1:
        if self.slicing_bsz == 1:
            encoded_slices = [_encode(x_slice) for x_slice in x.split(1)]
        else:
            # Split the batch into chunks of slicing_bsz (last one may be smaller).
            sections = [self.slicing_bsz] * (x.shape[0] // self.slicing_bsz)
            if x.shape[0] % self.slicing_bsz != 0:
                sections.append(x.shape[0] % self.slicing_bsz)
            encoded_slices = [_encode(x_slice) for x_slice in x.split(sections)]
        h = torch.cat(encoded_slices)
    else:
        h = _encode(x)
    posterior = DiagonalGaussianDistribution(h)

    if not return_dict:
        return (posterior,)

    return AutoencoderKLOutput(latent_dist=posterior)
|
| 801 |
+
|
| 802 |
+
def decode(self, z: Tensor, return_dict: bool = True, generator=None):
    """Decode latents back to pixel space.

    Args:
        z: latent tensor of shape (B, C, T, H, W).
        return_dict: return a ``DecoderOutput`` instead of a tuple.
        generator: unused; kept for interface compatibility.

    Returns:
        ``DecoderOutput`` (or 1-tuple) on rank 0 / single device; on other
        ranks of an initialized process group, the cached empty placeholder.
    """

    def _decode(z):
        # Prefer temporal tiling, then spatial tiling, when latents exceed
        # the configured tile sizes; otherwise run the decoder directly.
        if self.use_temporal_tiling and z.shape[-3] > self.tile_latent_min_tsize:
            return self.temporal_tiled_decode(z)
        if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
            return self.spatial_tiled_decode(z)
        return self.decoder(z)

    if self.use_slicing and z.shape[0] > 1:
        decoded_slices = [_decode(z_slice) for z_slice in z.split(1)]
        decoded = torch.cat(decoded_slices)
    else:
        decoded = _decode(z)
    if torch.distributed.is_initialized():
        if torch.distributed.get_rank() != 0:
            # Distributed decode only materializes the result on rank 0;
            # other ranks return the preallocated empty placeholder.
            return self.empty_cache

    if z.shape[-3] == 1:
        # Single-latent-frame input: keep only the last reconstructed frame.
        decoded = decoded[:, :, -1:]
    if not return_dict:
        return (decoded,)

    return DecoderOutput(sample=decoded)
|
| 826 |
+
|
| 827 |
+
def decode_dist(self, z: Tensor, return_dict: bool = True, generator=None):
    """Decode on GPU with spatial tiling temporarily forced on.

    ``return_dict`` and ``generator`` are accepted for interface
    compatibility but forwarded behavior follows ``self.decode``'s default.
    """
    latents = z.cuda()
    self.use_spatial_tiling = True
    result = self.decode(latents)
    self.use_spatial_tiling = False
    return result
|
| 833 |
+
|
| 834 |
+
def forward(
    self,
    sample: torch.Tensor,
    sample_posterior: bool = False,
    return_posterior: bool = True,
    return_dict: bool = True
):
    """Full autoencoder pass: encode, (optionally) sample, decode.

    Args:
        sample: input tensor forwarded to ``self.encode``.
        sample_posterior: draw from the posterior instead of taking its mode.
        return_posterior: accepted for interface compatibility; the posterior
            is always returned alongside the reconstruction.
        return_dict: wrap the result in ``DecoderOutput`` instead of a tuple.
    """
    posterior = self.encode(sample).latent_dist
    if sample_posterior:
        z = posterior.sample()
    else:
        z = posterior.mode()
    dec = self.decode(z).sample
    if return_dict:
        return DecoderOutput(sample=dec, posterior=posterior)
    return (dec, posterior)
|
| 845 |
+
|
| 846 |
+
def random_reset_tiling(self, x: torch.Tensor):
    """Randomly reconfigure spatial/temporal tiling for this input.

    Single-frame inputs disable tiling entirely. Otherwise each axis
    independently picks "no tiling" or a small multiple of the minimum
    valid tile size, and the tile bookkeeping attributes are updated.
    """
    if x.shape[-3] == 1:
        self.disable_spatial_tiling()
        self.disable_temporal_tiling()
        return

    # Tiling puts many constraints on input shape and sample size; arbitrary
    # combinations are likely invalid, so fixed multiples of the minimum
    # compatible size are used here.
    base_spatial = int(1 / self.tile_overlap_factor) * self.ffactor_spatial
    base_temporal = int(1 / self.tile_overlap_factor) * self.ffactor_temporal

    spatial_pick = random.choice([None, 1 * base_spatial, 2 * base_spatial, 3 * base_spatial])
    if spatial_pick is None:
        self.disable_spatial_tiling()
    else:
        self.tile_sample_min_size = spatial_pick
        self.tile_latent_min_size = spatial_pick // self.ffactor_spatial
        self.enable_spatial_tiling()

    temporal_pick = random.choice([None, 1 * base_temporal, 2 * base_temporal, 3 * base_temporal])
    if temporal_pick is None:
        self.disable_temporal_tiling()
    else:
        self.tile_sample_min_tsize = temporal_pick
        self.tile_latent_min_tsize = temporal_pick // self.ffactor_temporal
        self.enable_temporal_tiling()
|
| 870 |
+
|
| 871 |
+
def load_sharded_safetensors(model_dir):
    """Manually load sharded safetensors files and merge them.

    Args:
        model_dir: directory containing the shard files.

    Returns:
        A single dict mapping every tensor name to its CPU tensor.
    """
    # Collect all shard files.
    shard_files = []
    for file in os.listdir(model_dir):
        if file.endswith(".safetensors"):
            shard_files.append(file)

    # Sort by shard index (expects names like "model-0001-of-0032.safetensors").
    shard_files.sort(key=lambda x: int(x.split("-")[1]))

    print(f"找到 {len(shard_files)} 个分片文件")

    # Merge all weights across shards.
    merged_state_dict = dict()

    for shard_file in shard_files:
        shard_path = os.path.join(model_dir, shard_file)
        print(f"加载分片: {shard_file}")

        # Load the current shard with safetensors onto CPU.
        with safe_open(shard_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                tensor = f.get_tensor(key)
                merged_state_dict[key] = tensor

    print(f"合并完成,总键数量: {len(merged_state_dict)}")
    return merged_state_dict
|
| 907 |
+
|
| 908 |
+
def load_weights(model, weights: dict[str, torch.Tensor]) -> set[str]:
    """Copy matching tensors from ``weights`` into ``model`` in place.

    Keys are matched after stripping any ``"vae."`` substring; keys with no
    counterpart in the model's state dict are skipped silently.

    Args:
        model: target module whose parameters/buffers receive the tensors.
        weights: mapping from (possibly ``"vae."``-prefixed) names to tensors.
            (The annotation was corrected to a mapping: ``.items()`` is used.)

    Returns:
        The set of state-dict keys that were actually updated.

    Raises:
        ValueError: on a shape mismatch or a non-tensor value.
    """
    # Materialize the state dict once; calling model.state_dict() inside the
    # loop would rebuild a fresh OrderedDict for every key.
    state_dict = model.state_dict()

    def _copy_into(name: str, weight) -> None:
        """Validate one entry and copy it into the model tensor in place."""
        if name not in state_dict:
            raise ValueError(f"Unexpected weight {name}")

        model_tensor = state_dict[name]
        if model_tensor.shape != weight.shape:
            raise ValueError(
                f"Shape mismatch for weight {name}: "
                f"model tensor shape {model_tensor.shape} vs. "
                f"loaded tensor shape {weight.shape}"
            )
        if isinstance(weight, torch.Tensor):
            model_tensor.data.copy_(weight.data)
        else:
            raise ValueError(
                f"Unsupported tensor type in load_weights "
                f"for {name}: {type(weight)}"
            )

    loaded_params = set()
    for name, load_tensor in weights.items():
        # Checkpoint keys may carry a "vae." prefix; this removes the
        # substring anywhere in the name (matches the original behavior).
        name = name.replace('vae.', '')
        if name in state_dict:
            _copy_into(name, load_tensor)
            loaded_params.add(name)

    return loaded_params
|
| 941 |
+
|
| 942 |
+
def _worker(path, config,
            rank=None, world_size=None, port=None, req_queue=None, rsp_queue=None):
    """
    Each rank's worker process:
    - idle: block on req_queue.get() (CPU blocking, no GPU work)
    - on request: all ranks run the distributed decode together
    - only rank 0 puts the result on rsp_queue

    Args:
        path: directory of sharded safetensors weights.
        config: config dict for AutoencoderKLConv3D.from_config.
        rank / world_size / port: process-group bootstrap parameters.
        req_queue / rsp_queue: multiprocessing queues for requests/responses.
    """
    # _tame_cpu_threads_and_comm()
    # Basic distributed environment variables for NCCL rendezvous.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = str(port)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(rank)

    # Device binding must happen before any CUDA operation.
    visible = torch.cuda.device_count()
    assert visible >= world_size, f"可见卡数 {visible} < world_size {world_size}"
    local_rank = int(os.environ["LOCAL_RANK"])

    print(f"[worker {rank}] bind to cuda:{local_rank} (visible={visible})", flush=True)
    if not torch.distributed.is_initialized():
        dist.init_process_group("nccl")
    torch.cuda.set_device(local_rank)
    #from .. import load_vae

    #vae = load_vae(vae_type, vae_precision, device, logger, args, weights_only, only_encoder, only_decoder, sample_size, skip_create_dist=True)
    #vae = vae.cuda()
    # Build the VAE and load the sharded weights on every rank.
    vae = AutoencoderKLConv3D.from_config(config)
    merged_state_dict = load_sharded_safetensors(path)
    loaded_params = load_weights(vae, merged_state_dict)
    vae = vae.cuda()
    vae.eval()  # disable Dropout / BatchNorm training behavior
    for param in vae.parameters():
        param.requires_grad = False  # inference only

    # Serve decode requests until the stop sentinel arrives.
    while True:
        req = req_queue.get()  # blocking
        if req == "__STOP__":
            break
        out = vae.decode_dist(req, return_dict=False)
        if rank == 0:
            rsp_queue.put(out)
|
| 986 |
+
|
| 987 |
+
#try:
|
| 988 |
+
# while True:
|
| 989 |
+
# # blocking on CPU queue
|
| 990 |
+
# req = req_queue.get() # blocking
|
| 991 |
+
# if req == "__STOP__":
|
| 992 |
+
# break
|
| 993 |
+
# out = vae.decode_dist(req, return_dict=False)
|
| 994 |
+
# if rank == 0:
|
| 995 |
+
# rsp_queue.put(out)
|
| 996 |
+
#finally:
|
| 997 |
+
# # destroy process group before exit
|
| 998 |
+
# try:
|
| 999 |
+
# dist.destroy_process_group()
|
| 1000 |
+
# except Exception:
|
| 1001 |
+
# pass
|
| 1002 |
+
|
| 1003 |
+
#def _find_free_port():
|
| 1004 |
+
# import socket
|
| 1005 |
+
# with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
| 1006 |
+
# s.bind(("127.0.0.1", 0))
|
| 1007 |
+
# return s.getsockname()[1]
|
| 1008 |
+
|
| 1009 |
+
# 避免端口冲突的常见做法
|
| 1010 |
+
def _find_free_port(start_port=8100, max_attempts=900):
|
| 1011 |
+
import socket
|
| 1012 |
+
"""获取一个可用的端口"""
|
| 1013 |
+
for port in range(start_port, start_port + max_attempts):
|
| 1014 |
+
try:
|
| 1015 |
+
with socket.socket() as s:
|
| 1016 |
+
s.bind(('localhost', port))
|
| 1017 |
+
return s.getsockname()[1] # 返回实际绑定的端口
|
| 1018 |
+
except OSError:
|
| 1019 |
+
continue
|
| 1020 |
+
raise RuntimeError("找不到可用端口")
|
| 1021 |
+
|
| 1022 |
+
class AutoencoderKLConv3D_Dist(AutoencoderKLConv3D):
    """AutoencoderKLConv3D variant that decodes through a pool of GPU workers.

    ``create_dist`` spawns one worker process per rank; ``decode`` broadcasts
    the request to every worker's queue and returns rank 0's result.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        latent_channels: int,
        block_out_channels: Tuple[int, ...],
        layers_per_block: int,
        ffactor_spatial: int,
        ffactor_temporal: int,
        sample_size: int,
        sample_tsize: int,
        scaling_factor: float = None,
        shift_factor: Optional[float] = None,
        downsample_match_channel: bool = True,
        upsample_match_channel: bool = True,
        only_encoder: bool = False,
        only_decoder: bool = False,
    ):
        # Same construction as the base class; the distributed machinery is
        # only set up later via create_dist().
        super().__init__(in_channels, out_channels, latent_channels, block_out_channels, layers_per_block, ffactor_spatial, ffactor_temporal, sample_size, sample_tsize, scaling_factor, shift_factor, downsample_match_channel, upsample_match_channel, only_encoder, only_decoder)

    def create_dist(self, path, config,
                    ):
        """Spawn the worker processes and their request/response queues.

        Args:
            path: directory of sharded safetensors weights.
            config: config forwarded to each worker's model construction.
        """
        self.world_size = 8  # NOTE(review): hard-coded 8 workers — confirm this matches the deployment GPU count
        self.port = _find_free_port()
        ctx = mp.get_context("spawn")
        # One request queue per rank (pure CPU), plus one shared response queue.
        self.req_queues = [ctx.Queue() for _ in range(self.world_size)]
        self.rsp_queue = ctx.Queue()

        self.procs = []
        for rank in range(self.world_size):
            p = ctx.Process(
                target=_worker,
                args=(
                    path, config,
                    rank, self.world_size, self.port,
                    self.req_queues[rank], self.rsp_queue,
                ),
                daemon=True,
            )
            p.start()
            self.procs.append(p)

    def decode(self, z: Tensor, return_dict: bool = True, generator=None):
        """
        Synchronous inference: put the same request on all ranks' queues and
        return rank 0's result.
        """
        # Fail fast if any worker died.
        for p in self.procs:
            if not p.is_alive():
                raise RuntimeError("One of the processes is not alive")

        # Broadcast the request to every rank's queue.
        for q in self.req_queues:
            q.put(z)

        # Block until rank 0 produces the result.
        return self.rsp_queue.get(timeout=None)
|
cache_utils.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
from typing import Tuple
|
| 5 |
+
|
| 6 |
+
def cache_init(cache_interval, max_order, num_steps=None,
               enable_first_enhance=False, first_enhance_steps=3,
               enable_tailing_enhance=False, tailing_enhance_steps=1,
               low_freqs_order=0, high_freqs_order=2):
    """Build the bookkeeping dict used by the Taylor feature cache.

    Args:
        cache_interval: how many steps between full recomputations.
        max_order: highest Taylor derivative order kept.
        num_steps: total number of denoising steps (optional).
        enable_first_enhance / first_enhance_steps: fully compute the first
            few steps to enhance contour information.
        enable_tailing_enhance / tailing_enhance_steps: fully compute the
            final step(s) to enhance details.
        low_freqs_order / high_freqs_order: Taylor orders applied to the
            low- and high-frequency components respectively.

    Returns:
        A dict with counters, configuration, and force-control flags
        (the force-control entries exist for training-aware caching and
        are unused here).
    """
    return {
        'counter': 0,
        'current_step': 0,
        'cache_interval': cache_interval,
        'max_order': max_order,
        'num_steps': num_steps,
        # first enhance: fully compute the first steps (contours)
        'enable_first_enhance': enable_first_enhance,
        'first_enhance_steps': first_enhance_steps,
        # tailing enhance: fully compute the last steps (details)
        'enable_tailing_enhance': enable_tailing_enhance,
        'tailing_enhance_steps': tailing_enhance_steps,
        # frequency-split Taylor orders
        'low_freqs_order': low_freqs_order,
        'high_freqs_order': high_freqs_order,
        # training-aware cache switches; unused in this codebase
        'enable_force_control': False,
        'force_compute': False,
    }
|
| 35 |
+
|
| 36 |
+
class TaylorCacheContainer(nn.Module):
    """Finite-difference Taylor cache for a single feature tensor.

    Buffers ``derivative_i`` hold the committed derivative estimates up to
    ``max_order``; ``temp_derivative_i`` stage the next update until
    :meth:`move_temp_to_derivative` commits them. Cached features can then
    be extrapolated with :meth:`taylor_formula`.
    """

    def __init__(self, max_order):
        super().__init__()
        self.max_order = max_order
        # Register each order's buffer individually (non-persistent, starts empty).
        for order in range(max_order + 1):
            self.register_buffer(f"derivative_{order}", None, persistent=False)
            self.register_buffer(f"temp_derivative_{order}", None, persistent=False)

    def get_derivative(self, order):
        """Committed derivative of the given order (may be None)."""
        return getattr(self, f"derivative_{order}")

    def set_derivative(self, order, tensor):
        """Overwrite the committed derivative of the given order."""
        setattr(self, f"derivative_{order}", tensor)

    def set_temp_derivative(self, order, tensor):
        """Stage a derivative of the given order."""
        setattr(self, f"temp_derivative_{order}", tensor)

    def get_temp_derivative(self, order):
        """Staged derivative of the given order (may be None)."""
        return getattr(self, f"temp_derivative_{order}")

    def clear_temp_derivative(self):
        """Drop every staged derivative."""
        for order in range(self.max_order + 1):
            setattr(self, f"temp_derivative_{order}", None)

    def move_temp_to_derivative(self):
        """Commit staged derivatives, stopping at the first missing order."""
        for order in range(self.max_order + 1):
            staged = self.get_temp_derivative(order)
            if staged is None:
                break
            setattr(self, f"derivative_{order}", staged)
        self.clear_temp_derivative()

    def get_all_derivatives(self):
        """All committed derivatives, including None placeholders."""
        return [self.get_derivative(order) for order in range(self.max_order + 1)]

    def get_all_filled_derivatives(self):
        """Committed derivatives that are not None."""
        return [d for d in self.get_all_derivatives() if d is not None]

    def taylor_formula(self, distance):
        """Evaluate the Taylor expansion at the given step distance."""
        total = 0
        for order in range(len(self.get_all_filled_derivatives())):
            total += (1 / math.factorial(order)) * self.get_derivative(order) * (distance ** order)
        return total

    def derivatives_computation(self, x, distance):
        """Update the derivative estimates from a fresh full computation.

        Args:
            x: tensor, the new zeroth-order value.
            distance: int, steps since the last full computation.
        """
        self.set_temp_derivative(0, x)
        for order in range(self.max_order):
            previous = self.get_derivative(order)
            if previous is None:
                break
            delta = (self.get_temp_derivative(order) - previous) / distance
            self.set_temp_derivative(order + 1, delta)
        self.move_temp_to_derivative()

    def clear_derivatives(self):
        """Reset all committed and staged derivatives."""
        for order in range(self.max_order + 1):
            setattr(self, f"derivative_{order}", None)
            setattr(self, f"temp_derivative_{order}", None)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@torch.compile
|
| 101 |
+
def decomposition_FFT(x: torch.Tensor, cutoff_ratio: float = 0.1) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 102 |
+
"""
|
| 103 |
+
Fast Fourier Transform frequency domain decomposition
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
x: Input tensor [B, H*W, D]
|
| 107 |
+
cutoff_ratio: Cutoff frequency ratio (0~0.5)
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
Tuple of (low_freq, high_freq) tensors with same dtype as input
|
| 111 |
+
"""
|
| 112 |
+
orig_dtype = x.dtype
|
| 113 |
+
device = x.device
|
| 114 |
+
|
| 115 |
+
x_fp32 = x.to(torch.float32) # Convert to fp32 for FFT compatibility
|
| 116 |
+
|
| 117 |
+
B, HW, D = x_fp32.shape
|
| 118 |
+
freq = torch.fft.fft(x_fp32, dim=1) # FFT on spatial dimension
|
| 119 |
+
|
| 120 |
+
freqs = torch.fft.fftfreq(HW, d=1.0, device=device)
|
| 121 |
+
cutoff = cutoff_ratio * freqs.abs().max()
|
| 122 |
+
|
| 123 |
+
# Create frequency masks
|
| 124 |
+
low_mask = freqs.abs() <= cutoff
|
| 125 |
+
high_mask = ~low_mask
|
| 126 |
+
|
| 127 |
+
low_mask = low_mask[None, :, None] # Broadcast to (B, HW, D)
|
| 128 |
+
high_mask = high_mask[None, :, None]
|
| 129 |
+
|
| 130 |
+
low_freq_complex = freq * low_mask
|
| 131 |
+
high_freq_complex = freq * high_mask
|
| 132 |
+
|
| 133 |
+
# IFFT and take real part
|
| 134 |
+
low_fp32 = torch.fft.ifft(low_freq_complex, dim=1).real
|
| 135 |
+
high_fp32 = torch.fft.ifft(high_freq_complex, dim=1).real
|
| 136 |
+
|
| 137 |
+
low = low_fp32.to(device=device, dtype=orig_dtype)
|
| 138 |
+
high = high_fp32.to(device=device, dtype=orig_dtype)
|
| 139 |
+
|
| 140 |
+
return low, high
|
| 141 |
+
|
| 142 |
+
@torch.compile
|
| 143 |
+
def reconstruction(low_freq: torch.Tensor, high_freq: torch.Tensor) -> torch.Tensor:
|
| 144 |
+
return low_freq + high_freq
|
| 145 |
+
|
| 146 |
+
class CacheWithFreqsContainer(nn.Module):
    """Finite-difference derivative cache split into low/high frequency bands.

    Stores derivatives of order 0..max_order per band ("low_freqs" /
    "high_freqs") as non-persistent buffers named
    ``derivative_{order}_{band}`` and ``temp_derivative_{order}_{band}``.
    New values are first staged in the ``temp_`` slots and then committed via
    :meth:`move_temp_to_derivative`.
    """

    def __init__(self, max_order):
        super().__init__()
        # Highest derivative order this container can hold.
        self.max_order = max_order
        # Register the buffers one by one (non-persistent: cache state is
        # transient and must not be written into checkpoints).
        for i in range(max_order + 1):
            self.register_buffer(f"derivative_{i}_low_freqs", None, persistent=False)
            self.register_buffer(f"derivative_{i}_high_freqs", None, persistent=False)
            self.register_buffer(f"temp_derivative_{i}_low_freqs", None, persistent=False)
            self.register_buffer(f"temp_derivative_{i}_high_freqs", None, persistent=False)

    def get_derivative(self, order, freqs):
        # `freqs` is the band name: "low_freqs" or "high_freqs".
        return getattr(self, f"derivative_{order}_{freqs}")

    def set_derivative(self, order, freqs, tensor):
        # Overwrite the committed derivative of the given order/band.
        setattr(self, f"derivative_{order}_{freqs}", tensor)

    def set_temp_derivative(self, order, freqs, tensor):
        # Stage a derivative; it becomes visible after move_temp_to_derivative().
        setattr(self, f"temp_derivative_{order}_{freqs}", tensor)

    def get_temp_derivative(self, order, freqs):
        return getattr(self, f"temp_derivative_{order}_{freqs}")

    def move_temp_to_derivative(self):
        """Commit staged temp derivatives into the main slots, then clear temps."""
        for i in range(self.max_order + 1):
            if self.get_temp_derivative(i, "low_freqs") is not None:
                setattr(self, f"derivative_{i}_low_freqs", self.get_temp_derivative(i, "low_freqs"))
            # NOTE(review): the else/break below binds only to the high_freqs
            # check, so a filled low_freqs temp with an empty high_freqs temp
            # at the same order stops the loop — confirm this asymmetry is
            # intended (cf. the single-band container's if/else/break).
            if self.get_temp_derivative(i, "high_freqs") is not None:
                setattr(self, f"derivative_{i}_high_freqs", self.get_temp_derivative(i, "high_freqs"))
            else:
                break
        self.clear_temp_derivative()

    def get_all_filled_derivatives(self, freqs):
        # Returns committed derivatives of the given band, lowest order first.
        return [self.get_derivative(i, freqs) for i in range(self.max_order + 1) if self.get_derivative(i, freqs) is not None]

    def taylor_formula(self, distance):
        """Extrapolate both bands by a truncated Taylor series at `distance`,
        then recombine them into a single prediction."""
        low_freqs_output = 0
        high_freqs_output = 0
        for i in range(len(self.get_all_filled_derivatives("low_freqs"))):
            low_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "low_freqs") * (distance ** i)
        for i in range(len(self.get_all_filled_derivatives("high_freqs"))):
            high_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "high_freqs") * (distance ** i)
        return reconstruction(low_freqs_output, high_freqs_output)

    def hermite_formula(self, distance):
        # NOTE(review): currently byte-identical to taylor_formula — presumably
        # a placeholder for a true Hermite interpolation; confirm.
        low_freqs_output = 0
        high_freqs_output = 0
        for i in range(len(self.get_all_filled_derivatives("low_freqs"))):
            low_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "low_freqs") * (distance ** i)
        for i in range(len(self.get_all_filled_derivatives("high_freqs"))):
            high_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "high_freqs") * (distance ** i)
        return reconstruction(low_freqs_output, high_freqs_output)

    def derivatives_computation(self, x, distance, low_freqs_order, high_freqs_order):
        '''
        Stage new finite-difference derivatives from a fresh sample and commit them.

        x: tensor, the new x_0
        distance: int, the distance between the current step and the last full computation step
        low_freqs_order / high_freqs_order: highest derivative order to update per band.
        '''
        # Order-0 "derivative" is the decomposed sample itself.
        x_low, x_high = decomposition_FFT(x, cutoff_ratio=0.1)
        self.set_temp_derivative(0, "low_freqs", x_low)
        self.set_temp_derivative(0, "high_freqs", x_high)
        # Each higher order is the forward difference of the order below,
        # computable only once the previous call has filled that order.
        for i in range(low_freqs_order):
            if self.get_derivative(i, "low_freqs") is not None:
                self.set_temp_derivative(i+1, "low_freqs", (self.get_temp_derivative(i, "low_freqs") - self.get_derivative(i, "low_freqs")) / distance)
        for i in range(high_freqs_order):
            if self.get_derivative(i, "high_freqs") is not None:
                self.set_temp_derivative(i+1, "high_freqs", (self.get_temp_derivative(i, "high_freqs") - self.get_derivative(i, "high_freqs")) / distance)
        self.move_temp_to_derivative()

    def clear_temp_derivative(self):
        # Drop only the staged values; committed derivatives stay intact.
        for i in range(self.max_order + 1):
            setattr(self, f"temp_derivative_{i}_low_freqs", None)
            setattr(self, f"temp_derivative_{i}_high_freqs", None)

    def clear_derivatives(self):
        # Full reset: both committed and staged slots for both bands.
        for i in range(self.max_order + 1):
            setattr(self, f"derivative_{i}_low_freqs", None)
            setattr(self, f"derivative_{i}_high_freqs", None)
            setattr(self, f"temp_derivative_{i}_low_freqs", None)
            setattr(self, f"temp_derivative_{i}_high_freqs", None)
|
config.json
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_classification_head": false,
|
| 3 |
+
"anyres_pooling_size": 2,
|
| 4 |
+
"anyres_vit_max_image_size": null,
|
| 5 |
+
"anyres_vit_two_views": false,
|
| 6 |
+
"architectures": [
|
| 7 |
+
"HunyuanImage3ForCausalMM"
|
| 8 |
+
],
|
| 9 |
+
"auto_map": {
|
| 10 |
+
"AutoConfig": "configuration_hunyuan_image_3.HunyuanImage3Config",
|
| 11 |
+
"AutoModel": "modeling_hunyuan_image_3.HunyuanImage3Model",
|
| 12 |
+
"AutoModelForCausalLM": "modeling_hunyuan_image_3.HunyuanImage3ForCausalMM"
|
| 13 |
+
},
|
| 14 |
+
"attention_bias": false,
|
| 15 |
+
"attention_dropout": 0.0,
|
| 16 |
+
"attention_head_dim": 128,
|
| 17 |
+
"bos_token_id": 127958,
|
| 18 |
+
"cla_share_factor": 2,
|
| 19 |
+
"class_num": 0,
|
| 20 |
+
"dense_list": [
|
| 21 |
+
4096,
|
| 22 |
+
0
|
| 23 |
+
],
|
| 24 |
+
"eod_token_id": 3,
|
| 25 |
+
"eos_token_id": 127957,
|
| 26 |
+
"group_limited_greedy": false,
|
| 27 |
+
"hidden_act": "silu",
|
| 28 |
+
"hidden_size": 4096,
|
| 29 |
+
"im_end_id": 128001,
|
| 30 |
+
"im_newline_id": 11,
|
| 31 |
+
"im_start_id": 128000,
|
| 32 |
+
"image_token_id": 128006,
|
| 33 |
+
"initializer_range": 0.02,
|
| 34 |
+
"intermediate_size": 3072,
|
| 35 |
+
"kv_lora_rank": null,
|
| 36 |
+
"mask_init_id": 12,
|
| 37 |
+
"max_position_embeddings": 22800,
|
| 38 |
+
"mlp_bias": false,
|
| 39 |
+
"model_type": "hunyuan_image_3_moe",
|
| 40 |
+
"moe_drop_tokens": false,
|
| 41 |
+
"moe_intermediate_size": [
|
| 42 |
+
3072,
|
| 43 |
+
3072,
|
| 44 |
+
3072,
|
| 45 |
+
3072,
|
| 46 |
+
3072,
|
| 47 |
+
3072,
|
| 48 |
+
3072,
|
| 49 |
+
3072,
|
| 50 |
+
3072,
|
| 51 |
+
3072,
|
| 52 |
+
3072,
|
| 53 |
+
3072,
|
| 54 |
+
3072,
|
| 55 |
+
3072,
|
| 56 |
+
3072,
|
| 57 |
+
3072,
|
| 58 |
+
3072,
|
| 59 |
+
3072,
|
| 60 |
+
3072,
|
| 61 |
+
3072,
|
| 62 |
+
3072,
|
| 63 |
+
3072,
|
| 64 |
+
3072,
|
| 65 |
+
3072,
|
| 66 |
+
3072,
|
| 67 |
+
3072,
|
| 68 |
+
3072,
|
| 69 |
+
3072,
|
| 70 |
+
3072,
|
| 71 |
+
3072,
|
| 72 |
+
3072,
|
| 73 |
+
3072
|
| 74 |
+
],
|
| 75 |
+
"moe_layer_num_skipped": 0,
|
| 76 |
+
"moe_random_routing_dropped_token": false,
|
| 77 |
+
"moe_topk": [
|
| 78 |
+
8,
|
| 79 |
+
8,
|
| 80 |
+
8,
|
| 81 |
+
8,
|
| 82 |
+
8,
|
| 83 |
+
8,
|
| 84 |
+
8,
|
| 85 |
+
8,
|
| 86 |
+
8,
|
| 87 |
+
8,
|
| 88 |
+
8,
|
| 89 |
+
8,
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
8,
|
| 95 |
+
8,
|
| 96 |
+
8,
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
8,
|
| 100 |
+
8,
|
| 101 |
+
8,
|
| 102 |
+
8,
|
| 103 |
+
8,
|
| 104 |
+
8,
|
| 105 |
+
8,
|
| 106 |
+
8,
|
| 107 |
+
8,
|
| 108 |
+
8,
|
| 109 |
+
8
|
| 110 |
+
],
|
| 111 |
+
"n_group": false,
|
| 112 |
+
"norm_topk_prob": true,
|
| 113 |
+
"norm_type": "rms",
|
| 114 |
+
"num_attention_heads": 32,
|
| 115 |
+
"num_experts": 64,
|
| 116 |
+
"num_hidden_layers": 32,
|
| 117 |
+
"num_key_value_heads": 8,
|
| 118 |
+
"num_media_embeds": 257,
|
| 119 |
+
"num_shared_expert": [
|
| 120 |
+
1,
|
| 121 |
+
1,
|
| 122 |
+
1,
|
| 123 |
+
1,
|
| 124 |
+
1,
|
| 125 |
+
1,
|
| 126 |
+
1,
|
| 127 |
+
1,
|
| 128 |
+
1,
|
| 129 |
+
1,
|
| 130 |
+
1,
|
| 131 |
+
1,
|
| 132 |
+
1,
|
| 133 |
+
1,
|
| 134 |
+
1,
|
| 135 |
+
1,
|
| 136 |
+
1,
|
| 137 |
+
1,
|
| 138 |
+
1,
|
| 139 |
+
1,
|
| 140 |
+
1,
|
| 141 |
+
1,
|
| 142 |
+
1,
|
| 143 |
+
1,
|
| 144 |
+
1,
|
| 145 |
+
1,
|
| 146 |
+
1,
|
| 147 |
+
1,
|
| 148 |
+
1,
|
| 149 |
+
1,
|
| 150 |
+
1,
|
| 151 |
+
1
|
| 152 |
+
],
|
| 153 |
+
"pad_id": 128009,
|
| 154 |
+
"pad_token_id": 128009,
|
| 155 |
+
"pool_type": "last",
|
| 156 |
+
"position_embedding_xdrope": false,
|
| 157 |
+
"pretraining_tp": 1,
|
| 158 |
+
"q_lora_rank": null,
|
| 159 |
+
"qk_nope_head_dim": null,
|
| 160 |
+
"qk_rope_head_dim": null,
|
| 161 |
+
"rms_norm_eps": 1e-05,
|
| 162 |
+
"rope_scaling": {
|
| 163 |
+
"alpha": 1.0,
|
| 164 |
+
"beta_fast": 32,
|
| 165 |
+
"beta_slow": 1,
|
| 166 |
+
"factor": 1.0,
|
| 167 |
+
"mscale": 1.0,
|
| 168 |
+
"mscale_all_dim": 1.0,
|
| 169 |
+
"type": "custom"
|
| 170 |
+
},
|
| 171 |
+
"rope_theta": 10000.0,
|
| 172 |
+
"routed_scaling_factor": false,
|
| 173 |
+
"skip_cls_token": false,
|
| 174 |
+
"text_end_id": 7,
|
| 175 |
+
"text_start_id": 6,
|
| 176 |
+
"tie_word_embeddings": false,
|
| 177 |
+
"topk_group": false,
|
| 178 |
+
"torch_dtype": "bfloat16",
|
| 179 |
+
"transformers_version": "4.50.0",
|
| 180 |
+
"use_cache": true,
|
| 181 |
+
"use_cla": false,
|
| 182 |
+
"use_mixed_mlp_moe": true,
|
| 183 |
+
"use_mla": false,
|
| 184 |
+
"use_qk_norm": true,
|
| 185 |
+
"use_rotary_pos_emb": true,
|
| 186 |
+
"v_head_dim": null,
|
| 187 |
+
"video_end_id": 10,
|
| 188 |
+
"video_start_id": 9,
|
| 189 |
+
"vit_add_patchemb_bias": false,
|
| 190 |
+
"vit_input_resolution": 224,
|
| 191 |
+
"vit_mapping_type": "resampler",
|
| 192 |
+
"vit_norm_type": "fused",
|
| 193 |
+
"vit_patch": 1,
|
| 194 |
+
"vit_path": null,
|
| 195 |
+
"vit_remove_prenorm": false,
|
| 196 |
+
"vit_token": 64,
|
| 197 |
+
"vit_type": "siglip2-so400m-patch16-naflex",
|
| 198 |
+
"vit_used_rms_norm": false,
|
| 199 |
+
"vocab_size": 133120,
|
| 200 |
+
"xdrope_section": null,
|
| 201 |
+
"head_dim": 128,
|
| 202 |
+
"rope_type": "2d",
|
| 203 |
+
"vae_downsample_factor": [
|
| 204 |
+
16,
|
| 205 |
+
16
|
| 206 |
+
],
|
| 207 |
+
"vit_downsample_factor": [
|
| 208 |
+
16,
|
| 209 |
+
16
|
| 210 |
+
],
|
| 211 |
+
"cond_token_attn_type": "joint_full",
|
| 212 |
+
"cond_image_type": "vae_vit",
|
| 213 |
+
"vae_type": "hunyuan-image-vae-v1",
|
| 214 |
+
"vae_dtype": "float32",
|
| 215 |
+
"vae_autocast_dtype": "float16",
|
| 216 |
+
"vae": {
|
| 217 |
+
"_class_name": "AutoencoderKLConv3D",
|
| 218 |
+
"block_out_channels": [
|
| 219 |
+
128,
|
| 220 |
+
256,
|
| 221 |
+
512,
|
| 222 |
+
1024,
|
| 223 |
+
1024
|
| 224 |
+
],
|
| 225 |
+
"in_channels": 3,
|
| 226 |
+
"out_channels": 3,
|
| 227 |
+
"latent_channels": 32,
|
| 228 |
+
"layers_per_block": 2,
|
| 229 |
+
"ffactor_spatial": 16,
|
| 230 |
+
"ffactor_temporal": 4,
|
| 231 |
+
"sample_size": 384,
|
| 232 |
+
"sample_tsize": 96,
|
| 233 |
+
"downsample_match_channel": true,
|
| 234 |
+
"upsample_match_channel": true,
|
| 235 |
+
"scaling_factor": 0.562679178327931
|
| 236 |
+
},
|
| 237 |
+
"vit": {
|
| 238 |
+
"_attn_implementation": "sdpa",
|
| 239 |
+
"attention_dropout": 0.0,
|
| 240 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 241 |
+
"hidden_size": 1152,
|
| 242 |
+
"intermediate_size": 4304,
|
| 243 |
+
"layer_norm_eps": 1e-06,
|
| 244 |
+
"num_attention_heads": 16,
|
| 245 |
+
"num_channels": 3,
|
| 246 |
+
"num_hidden_layers": 27,
|
| 247 |
+
"num_patches": 256,
|
| 248 |
+
"patch_size": 16,
|
| 249 |
+
"torch_dtype": "float32",
|
| 250 |
+
"output_attentions": false,
|
| 251 |
+
"output_hidden_states": false,
|
| 252 |
+
"use_return_dict": true
|
| 253 |
+
},
|
| 254 |
+
"vit_processor": {
|
| 255 |
+
"do_convert_rgb": null,
|
| 256 |
+
"do_normalize": true,
|
| 257 |
+
"do_rescale": true,
|
| 258 |
+
"do_resize": true,
|
| 259 |
+
"image_mean": [
|
| 260 |
+
0.5,
|
| 261 |
+
0.5,
|
| 262 |
+
0.5
|
| 263 |
+
],
|
| 264 |
+
"image_processor_type": "Siglip2ImageProcessorFast",
|
| 265 |
+
"image_std": [
|
| 266 |
+
0.5,
|
| 267 |
+
0.5,
|
| 268 |
+
0.5
|
| 269 |
+
],
|
| 270 |
+
"max_num_patches": 1024,
|
| 271 |
+
"patch_size": 16,
|
| 272 |
+
"processor_class": "Siglip2Processor",
|
| 273 |
+
"resample": 2,
|
| 274 |
+
"rescale_factor": 0.00392156862745098
|
| 275 |
+
},
|
| 276 |
+
"vit_aligner": {
|
| 277 |
+
"projector_type": "mlp_gelu",
|
| 278 |
+
"input_dim": 1152,
|
| 279 |
+
"n_embed": 4096,
|
| 280 |
+
"depth": 2,
|
| 281 |
+
"torch_dtype": "float32"
|
| 282 |
+
}
|
| 283 |
+
}
|
configuration_hunyuan_image_3.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
| 2 |
+
# you may not use this file except in compliance with the License.
|
| 3 |
+
# You may obtain a copy of the License at
|
| 4 |
+
#
|
| 5 |
+
# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
|
| 6 |
+
#
|
| 7 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 8 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 9 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 10 |
+
# See the License for the specific language governing permissions and
|
| 11 |
+
# limitations under the License.
|
| 12 |
+
# ==============================================================================
|
| 13 |
+
|
| 14 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 15 |
+
from transformers.utils import logging
|
| 16 |
+
from typing import List, Union, Optional
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
logger = logging.get_logger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class HunyuanImage3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HunyuanImage3Model`]. It is used to instantiate
    an Hunyuan model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Hunyuan-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Hunyuan Image 3 model. Defines the number of different tokens that can be
            represented by the `inputs_ids` passed when calling [`HunyuanImage3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations or shared MLP representations.
        moe_intermediate_size (`int` or `List`, *optional*, defaults to 11008):
            Dimension of the MLP representations in MoE. Use a list if you want a different size per layer.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether query and key in attention use norm
        use_cla (`bool`, *optional*, defaults to `False`):
            Whether to use CLA in attention
        cla_share_factor (`int`, *optional*, defaults to 1):
            The share factor of CLA
        num_experts (`int` or `List`, *optional*, defaults to 1):
            The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
        num_shared_expert (`int` or `List`, *optional*, defaults to 1):
            The number of shared experts for moe. If it is a list, it will be used as the number of shared experts
            for each layer.
        moe_topk (`int` or `List`, *optional*, defaults to 1):
            The topk value for moe. If it is a list, it will be used as the topk value for each layer.
        capacity_factor (Not used) (`float` or `List`, *optional*, defaults to 1.0):
            The capacity factor for moe. If it is a list, it will be used as the capacity factor for each layer.
        moe_layer_num_skipped (`int`, *optional*, defaults to 0):
            First moe_layer_num_skipped layers do not use MoE.
    """

    # NOTE(review): this repo's config.json declares model_type
    # "hunyuan_image_3_moe"; confirm the "Hunyuan" default here is intended.
    model_type = "Hunyuan"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size: int = 290943,
        hidden_size: int = 4096,
        intermediate_size: int = 11008,
        moe_intermediate_size: Union[int, List] = None,
        num_hidden_layers: int = 32,
        num_attention_heads: int = 32,
        num_key_value_heads: Optional[int] = None,
        attention_head_dim: Optional[int] = None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        eod_token_id=3,
        im_start_id=4,
        im_end_id=5,
        text_start_id=6,
        text_end_id=7,
        image_token_id=8,
        video_start_id=9,
        video_end_id=10,
        im_newline_id=11,
        mask_init_id=12,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        mlp_bias=False,
        attention_dropout=0.0,
        use_qk_norm=False,
        use_rotary_pos_emb=True,
        use_cla=False,
        cla_share_factor=1,
        norm_type="hf_rms",
        num_experts: Union[int, List] = 1,
        use_mixed_mlp_moe=False,
        num_shared_expert: Union[int, List] = 1,
        moe_topk: Union[int, List] = 1,
        capacity_factor: float = 1.0,  # annotation fixed: default is a float
        moe_drop_tokens=False,
        moe_random_routing_dropped_token=False,
        use_mla=False,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        moe_layer_num_skipped=0,
        norm_topk_prob=True,
        routed_scaling_factor=1.0,
        group_limited_greedy=False,
        n_group=None,
        topk_group=None,
        add_classification_head=False,
        class_num=0,
        pool_type="last",
        pad_id=-1,
        # Added for HunyuanImage-3 (multimodal generation) on top of the base LLM config
        moe_impl="eager",
        vae_downsample_factor=(16, 16),  # (h, w)
        img_proj_type="unet",
        patch_size=1,
        patch_embed_hidden_dim=1024,
        image_base_size=1024,
        rope_type="2d",
        cond_token_attn_type="full",
        cond_image_type="vae_vit",
        vae_type=None,
        vae_dtype="float32",
        vae_autocast_dtype="float16",
        vae=None,
        vit_type=None,
        vit=None,
        vit_processor=None,
        vit_aligner=None,
        cfg_distilled=False,
        use_meanflow=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # MoE routing configuration
        self.moe_impl = moe_impl
        self.num_experts = num_experts
        self.use_mixed_mlp_moe = use_mixed_mlp_moe
        self.num_shared_expert = num_shared_expert
        self.moe_topk = moe_topk
        self.capacity_factor = capacity_factor
        self.moe_drop_tokens = moe_drop_tokens
        self.moe_random_routing_dropped_token = moe_random_routing_dropped_token

        # Per-head dimension defaults to an even split of the hidden size.
        if attention_head_dim is not None:
            self.attention_head_dim = attention_head_dim
        else:
            self.attention_head_dim = self.hidden_size // num_attention_heads

        # for backward compatibility: absent KV-head count means MHA
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.attention_dropout = attention_dropout
        self.use_qk_norm = use_qk_norm
        self.use_rotary_pos_emb = use_rotary_pos_emb
        self.use_cla = use_cla
        self.cla_share_factor = cla_share_factor
        self.norm_type = norm_type
        # MLA args
        self.use_mla = use_mla
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.v_head_dim = v_head_dim

        # DeepSeek related args
        self.moe_layer_num_skipped = moe_layer_num_skipped
        self.norm_topk_prob = norm_topk_prob
        self.routed_scaling_factor = routed_scaling_factor
        self.group_limited_greedy = group_limited_greedy
        self.n_group = n_group
        self.topk_group = topk_group
        self.add_classification_head = add_classification_head
        self.class_num = class_num
        self.pool_type = pool_type
        self.pad_id = pad_id

        # NOTE(review): class_num defaults to 0 (not None), so this branch
        # always runs; presumably it was meant to be guarded by
        # add_classification_head — confirm before relying on dense_list.
        if self.class_num is not None:
            self.dense_list = [self.hidden_size, self.class_num]

        # Conditioning image configs
        self.cond_token_attn_type = cond_token_attn_type
        self.cond_image_type = cond_image_type

        # ViT args (sub-configs are stored as-is; parsing happens elsewhere)
        self.vit_type = vit_type
        self.vit = vit
        self.vit_processor = vit_processor
        self.vit_aligner = vit_aligner

        # Image Gen args
        self.vae_type = vae_type
        self.vae_dtype = vae_dtype
        self.vae_autocast_dtype = vae_autocast_dtype
        self.vae = vae
        self.vae_downsample_factor = vae_downsample_factor
        self.img_proj_type = img_proj_type
        self.patch_size = patch_size
        self.patch_embed_hidden_dim = patch_embed_hidden_dim
        self.image_base_size = image_base_size
        self.rope_type = rope_type

        # token id
        self.eod_token_id = eod_token_id
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.text_start_id = text_start_id
        self.text_end_id = text_end_id
        self.image_token_id = image_token_id
        self.video_start_id = video_start_id
        self.video_end_id = video_end_id
        self.im_newline_id = im_newline_id
        self.mask_init_id = mask_init_id

        # flag of cfg distilled model
        self.cfg_distilled = cfg_distilled
        # flag of meanflow distilled model
        self.use_meanflow = use_meanflow
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
__all__ = ["HunyuanImage3Config"]
|
generation_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"disable_compile": true,
|
| 3 |
+
"eos_token_id": [
|
| 4 |
+
127957
|
| 5 |
+
],
|
| 6 |
+
"pad_token_id": 128009,
|
| 7 |
+
"do_sample": true,
|
| 8 |
+
"top_k": 1024,
|
| 9 |
+
"top_p": 0.95,
|
| 10 |
+
"temperature": 0.6,
|
| 11 |
+
"max_length": 22800,
|
| 12 |
+
"sequence_template": "instruct",
|
| 13 |
+
"diff_infer_steps": 50,
|
| 14 |
+
"diff_guidance_scale": 2.5,
|
| 15 |
+
"flow_shift": 3.0,
|
| 16 |
+
"use_system_prompt": "en_unified",
|
| 17 |
+
"drop_think": false,
|
| 18 |
+
"bot_task": "think_recaption",
|
| 19 |
+
"max_new_tokens": 2048,
|
| 20 |
+
"transformers_version": "4.50.0"
|
| 21 |
+
}
|
hunyuan_image_3_pipeline.py
ADDED
|
@@ -0,0 +1,913 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
| 2 |
+
# you may not use this file except in compliance with the License.
|
| 3 |
+
# You may obtain a copy of the License at
|
| 4 |
+
#
|
| 5 |
+
# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
|
| 6 |
+
#
|
| 7 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 8 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 9 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 10 |
+
# See the License for the specific language governing permissions and
|
| 11 |
+
# limitations under the License.
|
| 12 |
+
# ==============================================================================
|
| 13 |
+
#
|
| 14 |
+
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
| 15 |
+
#
|
| 16 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 17 |
+
# you may not use this file except in compliance with the License.
|
| 18 |
+
# You may obtain a copy of the License at
|
| 19 |
+
#
|
| 20 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 21 |
+
#
|
| 22 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 23 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 24 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 25 |
+
# See the License for the specific language governing permissions and
|
| 26 |
+
# limitations under the License.
|
| 27 |
+
# ==============================================================================================
|
| 28 |
+
|
| 29 |
+
import inspect
|
| 30 |
+
import math
|
| 31 |
+
from dataclasses import dataclass
|
| 32 |
+
from typing import Any, Callable, Dict, List
|
| 33 |
+
from typing import Optional, Tuple, Union
|
| 34 |
+
|
| 35 |
+
import numpy as np
|
| 36 |
+
import torch
|
| 37 |
+
from PIL import Image
|
| 38 |
+
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
|
| 39 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 40 |
+
from diffusers.image_processor import VaeImageProcessor
|
| 41 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
| 42 |
+
from diffusers.schedulers.scheduling_utils import SchedulerMixin
|
| 43 |
+
from diffusers.utils import BaseOutput, logging
|
| 44 |
+
from diffusers.utils.torch_utils import randn_tensor
|
| 45 |
+
from .cache_utils import cache_init
|
| 46 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Configure `scheduler` and return its timestep schedule.

    Delegates to `scheduler.set_timesteps`, supporting either a custom `timesteps`
    list, a custom `sigmas` list, or a plain step count. Extra `kwargs` are
    forwarded to `set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`, *optional*):
            Number of diffusion steps; mutually exclusive with `timesteps`/`sigmas`.
        device (`str` or `torch.device`, *optional*):
            Device to move the timesteps to. If `None`, they are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timestep schedule; requires scheduler support.
        sigmas (`List[float]`, *optional*):
            Custom sigma schedule; requires scheduler support.

    Returns:
        `Tuple[torch.Tensor, int]`: the timestep schedule and the number of
        inference steps.

    Raises:
        ValueError: if both `timesteps` and `sigmas` are given, or the scheduler
            does not accept the requested custom schedule.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    def _accepts(param_name: str) -> bool:
        # Whether the scheduler's `set_timesteps` signature takes `param_name`.
        return param_name in inspect.signature(scheduler.set_timesteps).parameters

    if timesteps is not None:
        if not _accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        schedule = scheduler.timesteps
        return schedule, len(schedule)

    if sigmas is not None:
        if not _accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        schedule = scheduler.timesteps
        return schedule, len(schedule)

    # Default path: let the scheduler derive the schedule from the step count.
    scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
    return scheduler.timesteps, num_inference_steps
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescale the guided noise prediction to fix over-exposure from classifier-free
    guidance, per Section 3.4 of [Common Diffusion Noise Schedules and Sample
    Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg (`torch.Tensor`):
            The CFG-combined noise prediction.
        noise_pred_text (`torch.Tensor`):
            The text-conditional noise prediction.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Interpolation weight between the rescaled and the unmodified prediction.

    Returns:
        `torch.Tensor`: the (partially) rescaled noise prediction.
    """
    # Per-sample std over all non-batch dimensions.
    reduce_dims = list(range(1, noise_pred_text.ndim))
    std_text = noise_pred_text.std(dim=reduce_dims, keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # Match the guided prediction's std to the text prediction's std (fixes overexposure),
    # then blend back toward the original to avoid "plain looking" images.
    rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
@dataclass
class HunyuanImage3Text2ImagePipelineOutput(BaseOutput):
    """
    Output class for the HunyuanImage-3 text-to-image pipeline.

    Args:
        samples (`List[Any]` or `np.ndarray`):
            The generated samples, either as a list of per-sample objects or a
            batched NumPy array.
    """

    samples: Union[List[Any], np.ndarray]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
@dataclass
class FlowMatchDiscreteSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
    """

    # Accessible as `.prev_sample` or via diffusers' `BaseOutput` tuple/dict access.
    prev_sample: torch.FloatTensor
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
    """
    Flow-matching discrete scheduler with Euler and higher-order (Heun/midpoint/Kutta) solvers.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        shift (`float`, defaults to 1.0):
            The shift value for the SD3-style timestep schedule (applied when != 1 and flux shift is off).
        reverse (`bool`, defaults to `True`):
            Whether to reverse the timestep schedule.
        solver (`str`, defaults to `"euler"`):
            ODE solver used by `step`: one of `"euler"`, `"heun-2"`, `"midpoint-2"`, `"kutta-4"`.
        use_flux_shift (`bool`, defaults to `False`):
            If `True`, apply a Flux-style, token-count-dependent time shift instead of the SD3 shift.
        flux_base_shift (`float`, defaults to 0.5):
            Flux shift interpolation endpoint at the reference minimum token count.
        flux_max_shift (`float`, defaults to 1.15):
            Flux shift interpolation endpoint at the reference maximum token count.
        n_tokens (`int`, *optional*):
            Number of tokens in the input sequence; recorded in the config.
    """

    _compatibles = []
    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        shift: float = 1.0,
        reverse: bool = True,
        solver: str = "euler",
        use_flux_shift: bool = False,
        flux_base_shift: float = 0.5,
        flux_max_shift: float = 1.15,
        n_tokens: Optional[int] = None,
    ):
        # Continuous noise levels from 1 down to 0, with num_train_timesteps + 1 boundaries.
        sigmas = torch.linspace(1, 0, num_train_timesteps + 1)

        if not reverse:
            sigmas = sigmas.flip(0)

        self.sigmas = sigmas
        # the value fed to model
        self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
        # Same schedule including the terminal boundary (used by `get_timestep_r`).
        self.timesteps_full = (sigmas * num_train_timesteps).to(dtype=torch.float32)

        self._step_index = None
        self._begin_index = None

        self.supported_solver = [
            "euler",
            "heun-2", "midpoint-2",
            "kutta-4",
        ]
        if solver not in self.supported_solver:
            raise ValueError(f"Solver {solver} not supported. Supported solvers: {self.supported_solver}")

        # empty dt and derivative (for heun)
        # These hold intermediate sub-step state for the multi-stage solvers.
        self.derivative_1 = None
        self.derivative_2 = None
        self.derivative_3 = None
        self.dt = None

    @property
    def step_index(self):
        """
        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

    @property
    def begin_index(self):
        """
        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
        """
        return self._begin_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
    def set_begin_index(self, begin_index: int = 0):
        """
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

        Args:
            begin_index (`int`):
                The begin index for the scheduler.
        """
        self._begin_index = begin_index

    def _sigma_to_t(self, sigma):
        # Map a continuous sigma in [0, 1] to the model's timestep scale.
        return sigma * self.config.num_train_timesteps

    @property
    def state_in_first_order(self):
        # No stored first derivative yet: the next `step` call is the first sub-step.
        return self.derivative_1 is None

    @property
    def state_in_second_order(self):
        return self.derivative_2 is None

    @property
    def state_in_third_order(self):
        return self.derivative_3 is None

    def get_timestep_r(self, timestep: Union[float, torch.FloatTensor]):
        # Timestep at the *end* of the current interval; uses the full schedule,
        # which includes the terminal boundary, so `step_index + 1` is always valid.
        if self.step_index is None:
            self._init_step_index(timestep)
        return self.timesteps_full[self.step_index + 1]

    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None,
                      n_tokens: int = None):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
            n_tokens (`int`, *optional*):
                Number of tokens in the input sequence. Required when `use_flux_shift` is enabled.
        """
        self.num_inference_steps = num_inference_steps

        sigmas = torch.linspace(1, 0, num_inference_steps + 1)

        # Apply timestep shift
        if self.config.use_flux_shift:
            assert isinstance(n_tokens, int), "n_tokens should be provided for flux shift"
            mu = self.get_lin_function(y1=self.config.flux_base_shift, y2=self.config.flux_max_shift)(n_tokens)
            sigmas = self.flux_time_shift(mu, 1.0, sigmas)
        elif self.config.shift != 1.:
            sigmas = self.sd3_time_shift(sigmas)

        if not self.config.reverse:
            sigmas = 1 - sigmas

        self.sigmas = sigmas
        self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)
        self.timesteps_full = (sigmas * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)

        # empty dt and derivative (for kutta)
        self.derivative_1 = None
        self.derivative_2 = None
        self.derivative_3 = None
        self.dt = None

        # Reset step index
        self._step_index = None

    def index_for_timestep(self, timestep, schedule_timesteps=None):
        # Locate `timestep` within the schedule and return its index.
        if schedule_timesteps is None:
            schedule_timesteps = self.timesteps

        indices = (schedule_timesteps == timestep).nonzero()

        # The sigma index that is taken for the **very** first `step`
        # is always the second index (or the last index if there is only 1)
        # This way we can ensure we don't accidentally skip a sigma in
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        pos = 1 if len(indices) > 1 else 0

        return indices[pos].item()

    def _init_step_index(self, timestep):
        # Lazily initialize `_step_index` from the timestep, or from `begin_index`
        # when the pipeline has set one explicitly.
        if self.begin_index is None:
            if isinstance(timestep, torch.Tensor):
                timestep = timestep.to(self.timesteps.device)
            self._step_index = self.index_for_timestep(timestep)
        else:
            self._step_index = self._begin_index

    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
        # Flow matching needs no input scaling; kept for scheduler API compatibility.
        return sample

    @staticmethod
    def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15):
        # Linear interpolation of the shift parameter over token count (Flux-style).
        m = (y2 - y1) / (x2 - x1)
        b = y1 - m * x1
        return lambda x: m * x + b

    @staticmethod
    def flux_time_shift(mu: float, sigma: float, t: torch.Tensor):
        # Flux-style time warp; note t must be in (0, 1] — t == 0 yields 1/t == inf.
        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

    def sd3_time_shift(self, t: torch.Tensor):
        # SD3-style time warp controlled by `config.shift`.
        return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)

    def step(
        self,
        model_output: torch.FloatTensor,
        timestep: Union[float, torch.FloatTensor],
        sample: torch.FloatTensor,
        pred_uncond: torch.FloatTensor = None,
        generator: Optional[torch.Generator] = None,
        n_tokens: Optional[int] = None,
        return_dict: bool = True,
    ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise).

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timestep (`float`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            pred_uncond (`torch.FloatTensor`, *optional*):
                Unconditional prediction; upcast alongside the other inputs but otherwise unused here.
            generator (`torch.Generator`, *optional*):
                A random number generator.
            n_tokens (`int`, *optional*):
                Number of tokens in the input sequence.
            return_dict (`bool`):
                Whether or not to return a [`FlowMatchDiscreteSchedulerOutput`] or tuple.

        Returns:
            [`FlowMatchDiscreteSchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`FlowMatchDiscreteSchedulerOutput`] is
                returned, otherwise a tuple is returned where the first element is the sample tensor.
        """

        if (
            isinstance(timestep, int)
            or isinstance(timestep, torch.IntTensor)
            or isinstance(timestep, torch.LongTensor)
        ):
            raise ValueError(
                (
                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
                    " one of the `scheduler.timesteps` as a timestep."
                ),
            )

        if self.step_index is None:
            self._init_step_index(timestep)

        # Upcast to avoid precision issues when computing prev_sample
        sample = sample.to(torch.float32)
        model_output = model_output.to(torch.float32)
        pred_uncond = pred_uncond.to(torch.float32) if pred_uncond is not None else None

        # dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
        sigma = self.sigmas[self.step_index]
        sigma_next = self.sigmas[self.step_index + 1]

        # Multi-stage solvers consume several `step` calls per schedule interval;
        # `last_inner_step` marks when the interval is complete.
        last_inner_step = True
        if self.config.solver == "euler":
            derivative, dt, sample, last_inner_step = self.first_order_method(model_output, sigma, sigma_next, sample)
        elif self.config.solver in ["heun-2", "midpoint-2"]:
            derivative, dt, sample, last_inner_step = self.second_order_method(model_output, sigma, sigma_next, sample)
        elif self.config.solver == "kutta-4":
            derivative, dt, sample, last_inner_step = self.fourth_order_method(model_output, sigma, sigma_next, sample)
        else:
            raise ValueError(f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}")

        prev_sample = sample + derivative * dt

        # Cast sample back to model compatible dtype
        # prev_sample = prev_sample.to(model_output.dtype)

        # upon completion increase step index by one
        if last_inner_step:
            self._step_index += 1

        if not return_dict:
            return (prev_sample,)

        return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)

    def first_order_method(self, model_output, sigma, sigma_next, sample):
        # Plain Euler step: one model evaluation per interval.
        derivative = model_output
        dt = sigma_next - sigma
        return derivative, dt, sample, True

    def second_order_method(self, model_output, sigma, sigma_next, sample):
        # Two-stage solvers (Heun / midpoint): first call stores state, second combines.
        if self.state_in_first_order:
            # store for 2nd order step
            self.derivative_1 = model_output
            self.dt = sigma_next - sigma
            self.sample = sample

            derivative = model_output
            if self.config.solver == 'heun-2':
                dt = self.dt
            elif self.config.solver == 'midpoint-2':
                dt = self.dt / 2
            else:
                raise NotImplementedError(f"Solver {self.config.solver} not supported.")
            last_inner_step = False

        else:
            if self.config.solver == 'heun-2':
                # Heun: average of the derivatives at both interval endpoints.
                derivative = 0.5 * (self.derivative_1 + model_output)
            elif self.config.solver == 'midpoint-2':
                # Midpoint: use the derivative evaluated at the interval midpoint.
                derivative = model_output
            else:
                raise NotImplementedError(f"Solver {self.config.solver} not supported.")

            # 3. take prev timestep & sample
            dt = self.dt
            sample = self.sample
            last_inner_step = True

            # free dt and derivative
            # Note, this puts the scheduler in "first order mode"
            self.derivative_1 = None
            self.dt = None
            self.sample = None

        return derivative, dt, sample, last_inner_step

    def fourth_order_method(self, model_output, sigma, sigma_next, sample):
        # Classic RK4: four model evaluations per interval, combined 1/6-1/3-1/3-1/6.
        if self.state_in_first_order:
            self.derivative_1 = model_output
            self.dt = sigma_next - sigma
            self.sample = sample
            derivative = model_output
            dt = self.dt / 2
            last_inner_step = False

        elif self.state_in_second_order:
            self.derivative_2 = model_output
            derivative = model_output
            dt = self.dt / 2
            last_inner_step = False

        elif self.state_in_third_order:
            self.derivative_3 = model_output
            derivative = model_output
            dt = self.dt
            last_inner_step = False

        else:
            derivative = (1/6 * self.derivative_1 + 1/3 * self.derivative_2 + 1/3 * self.derivative_3 +
                          1/6 * model_output)

            # 3. take prev timestep & sample
            dt = self.dt
            sample = self.sample
            last_inner_step = True

            # free dt and derivative
            # Note, this puts the scheduler in "first order mode"
            self.derivative_1 = None
            self.derivative_2 = None
            self.derivative_3 = None
            self.dt = None
            self.sample = None

        return derivative, dt, sample, last_inner_step

    def __len__(self):
        return self.config.num_train_timesteps
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
class ClassifierFreeGuidance:
    """
    Classifier-free guidance (CFG) combiner.

    Produces `base + guidance_scale * (pred_cond - pred_uncond)`, where `base`
    is `pred_cond` when `use_original_formulation` is True and `pred_uncond`
    otherwise (the common "scaled" CFG formulation).

    Args:
        use_original_formulation (`bool`, defaults to `False`):
            Whether to anchor the update at `pred_cond` (True) or `pred_uncond` (False).
        start (`float`, defaults to 0.0):
            Intended guidance start fraction. Stored for callers to consult;
            this class itself does not gate on it.
        stop (`float`, defaults to 1.0):
            Intended guidance stop fraction. Stored for callers to consult;
            this class itself does not gate on it.
    """

    def __init__(
        self,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__()
        self.use_original_formulation = use_original_formulation
        # Fix: `start`/`stop` were previously accepted but silently discarded,
        # so callers configuring a guidance window had no way to read it back.
        self.start = start
        self.stop = stop

    def __call__(
        self,
        pred_cond: torch.Tensor,
        pred_uncond: Optional[torch.Tensor],
        guidance_scale: float,
        step: int,
    ) -> torch.Tensor:
        """
        Apply classifier-free guidance for one denoising step.

        Args:
            pred_cond: Conditional model prediction.
            pred_uncond: Unconditional prediction, or `None` when guidance is
                unavailable (e.g. CFG-distilled models).
            guidance_scale: Guidance strength.
            step: Current step index (unused; kept for interface compatibility).

        Returns:
            The guided prediction tensor.
        """
        # Fix: the annotation allows `pred_uncond=None`, but the subtraction below
        # would raise a TypeError. Return the conditional prediction unchanged,
        # which is the standard "guidance disabled" behavior.
        if pred_uncond is None:
            return pred_cond

        shift = pred_cond - pred_uncond
        pred = pred_cond if self.use_original_formulation else pred_uncond
        pred = pred + guidance_scale * shift

        return pred
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
class HunyuanImage3Text2ImagePipeline(DiffusionPipeline):
|
| 533 |
+
r"""
|
| 534 |
+
Pipeline for condition-to-sample generation using Stable Diffusion.
|
| 535 |
+
|
| 536 |
+
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
| 537 |
+
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
| 538 |
+
|
| 539 |
+
Args:
|
| 540 |
+
model ([`ModelMixin`]):
|
| 541 |
+
A model to denoise the diffused latents.
|
| 542 |
+
scheduler ([`SchedulerMixin`]):
|
| 543 |
+
A scheduler to be used in combination with `diffusion_model` to denoise the diffused latents. Can be one of
|
| 544 |
+
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
| 545 |
+
"""
|
| 546 |
+
|
| 547 |
+
model_cpu_offload_seq = ""
|
| 548 |
+
_optional_components = []
|
| 549 |
+
_exclude_from_cpu_offload = []
|
| 550 |
+
_callback_tensor_inputs = ["latents"]
|
| 551 |
+
|
| 552 |
+
def __init__(
    self,
    model,
    scheduler: SchedulerMixin,
    vae,
    progress_bar_config: Dict[str, Any] = None,
):
    """
    Initialize the pipeline and register its components.

    Args:
        model: Denoising model; must expose `config.vae_downsample_factor`.
        scheduler (`SchedulerMixin`): Scheduler used to denoise the latents.
        vae: Autoencoder used to decode the latents.
        progress_bar_config (`Dict[str, Any]`, *optional*):
            Extra keyword arguments merged into the pipeline's progress-bar settings.
    """
    super().__init__()

    # ==========================================================================================
    # Merge caller-provided progress-bar options without clobbering any config a
    # subclass may already have set.
    if progress_bar_config is None:
        progress_bar_config = {}
    if not hasattr(self, '_progress_bar_config'):
        self._progress_bar_config = {}
    self._progress_bar_config.update(progress_bar_config)
    # ==========================================================================================

    self.register_modules(
        model=model,
        scheduler=scheduler,
        vae=vae,
    )

    # should be a tuple or a list corresponding to the size of latents (batch_size, channel, *size)
    # if None, will be treated as a tuple of 1
    self.latent_scale_factor = self.model.config.vae_downsample_factor
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.latent_scale_factor)

    # Must start with APG_mode_
    # NOTE(review): instantiated with defaults, so any `start`/`stop` guidance
    # window must be configured elsewhere — confirm against callers.
    self.cfg_operator = ClassifierFreeGuidance()
|
| 582 |
+
|
| 583 |
+
@staticmethod
|
| 584 |
+
def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
|
| 585 |
+
"""
|
| 586 |
+
Denormalize an image array to [0,1].
|
| 587 |
+
"""
|
| 588 |
+
return (images / 2 + 0.5).clamp(0, 1)
|
| 589 |
+
|
| 590 |
+
@staticmethod
|
| 591 |
+
def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
|
| 592 |
+
"""
|
| 593 |
+
Convert a PyTorch tensor to a NumPy image.
|
| 594 |
+
"""
|
| 595 |
+
images = images.cpu().permute(0, 2, 3, 1).float().numpy()
|
| 596 |
+
return images
|
| 597 |
+
|
| 598 |
+
@staticmethod
|
| 599 |
+
def numpy_to_pil(images: np.ndarray):
|
| 600 |
+
"""
|
| 601 |
+
Convert a numpy image or a batch of images to a PIL image.
|
| 602 |
+
"""
|
| 603 |
+
if images.ndim == 3:
|
| 604 |
+
images = images[None, ...]
|
| 605 |
+
images = (images * 255).round().astype("uint8")
|
| 606 |
+
if images.shape[-1] == 1:
|
| 607 |
+
# special case for grayscale (single channel) images
|
| 608 |
+
pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
|
| 609 |
+
else:
|
| 610 |
+
pil_images = [Image.fromarray(image) for image in images]
|
| 611 |
+
|
| 612 |
+
return pil_images
|
| 613 |
+
|
| 614 |
+
def prepare_extra_func_kwargs(self, func, kwargs):
|
| 615 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
| 616 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
| 617 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
| 618 |
+
# and should be between [0, 1]
|
| 619 |
+
extra_kwargs = {}
|
| 620 |
+
|
| 621 |
+
for k, v in kwargs.items():
|
| 622 |
+
accepts = k in set(inspect.signature(func).parameters.keys())
|
| 623 |
+
if accepts:
|
| 624 |
+
extra_kwargs[k] = v
|
| 625 |
+
return extra_kwargs
|
| 626 |
+
|
| 627 |
+
def prepare_latents(self, batch_size, latent_channel, image_size, dtype, device, generator, latents=None):
|
| 628 |
+
if self.latent_scale_factor is None:
|
| 629 |
+
latent_scale_factor = (1,) * len(image_size)
|
| 630 |
+
elif isinstance(self.latent_scale_factor, int):
|
| 631 |
+
latent_scale_factor = (self.latent_scale_factor,) * len(image_size)
|
| 632 |
+
elif isinstance(self.latent_scale_factor, tuple) or isinstance(self.latent_scale_factor, list):
|
| 633 |
+
assert len(self.latent_scale_factor) == len(image_size), \
|
| 634 |
+
"len(latent_scale_factor) shoudl be the same as len(image_size)"
|
| 635 |
+
latent_scale_factor = self.latent_scale_factor
|
| 636 |
+
else:
|
| 637 |
+
raise ValueError(
|
| 638 |
+
f"latent_scale_factor should be either None, int, tuple of int, or list of int, "
|
| 639 |
+
f"but got {self.latent_scale_factor}"
|
| 640 |
+
)
|
| 641 |
+
|
| 642 |
+
latents_shape = (
|
| 643 |
+
batch_size,
|
| 644 |
+
latent_channel,
|
| 645 |
+
*[int(s) // f for s, f in zip(image_size, latent_scale_factor)],
|
| 646 |
+
)
|
| 647 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
| 648 |
+
raise ValueError(
|
| 649 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
| 650 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
| 651 |
+
)
|
| 652 |
+
|
| 653 |
+
if latents is None:
|
| 654 |
+
latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype)
|
| 655 |
+
else:
|
| 656 |
+
latents = latents.to(device)
|
| 657 |
+
|
| 658 |
+
# Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
|
| 659 |
+
if hasattr(self.scheduler, "init_noise_sigma"):
|
| 660 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
| 661 |
+
latents = latents * self.scheduler.init_noise_sigma
|
| 662 |
+
|
| 663 |
+
return latents
|
| 664 |
+
|
| 665 |
+
    @property
    def guidance_scale(self):
        """CFG weight recorded by the last `__call__` invocation."""
        return self._guidance_scale
|
| 668 |
+
|
| 669 |
+
    @property
    def guidance_rescale(self):
        """Guidance rescale factor recorded by the last `__call__` invocation."""
        return self._guidance_rescale
|
| 672 |
+
|
| 673 |
+
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        """Whether CFG is active: true only when the guidance scale exceeds 1."""
        return self._guidance_scale > 1.0
|
| 679 |
+
|
| 680 |
+
    @property
    def num_timesteps(self):
        """Number of scheduler timesteps used by the last `__call__` invocation."""
        return self._num_timesteps
|
| 683 |
+
|
| 684 |
+
    def set_scheduler(self, new_scheduler):
        """Swap the pipeline's scheduler by re-registering the `scheduler` module."""
        self.register_modules(scheduler=new_scheduler)
|
| 686 |
+
|
| 687 |
+
    @torch.no_grad()
    def __call__(
        self,
        batch_size: int,
        image_size: List[int],
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        sigmas: Optional[List[float]] = None,
        guidance_scale: float = 7.5,
        meanflow: bool = False,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        guidance_rescale: float = 0.0,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],  # NOTE(review): mutable default; never mutated here, only rebound, so harmless in practice
        model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            batch_size (`int`):
                Number of samples to generate in one pass.
            image_size (`Tuple[int]` or `List[int]`):
                The size (height, width) of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps`
                argument in their `set_timesteps` method. Must be in descending order.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate samples closely linked to the
                `condition` at the expense of lower sample quality. Guidance is enabled when `guidance_scale > 1`.
            meanflow (`bool`, *optional*, defaults to `False`):
                When set, an auxiliary timestep `r` is fetched from the scheduler each step and forwarded to the
                model as `timesteps_r`.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents; if not provided, latents are sampled with `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated sample.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~DiffusionPipelineOutput`] instead of a plain tuple.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://arxiv.org/pdf/2305.08891.pdf); fixes overexposure with zero terminal SNR.
            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
                Called at the end of each denoising step as `callback_on_step_end(self, step, timestep,
                callback_kwargs)`; `callback_kwargs` contains the tensors named by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                Tensor names passed to `callback_on_step_end`; restricted to `._callback_tensor_inputs`.
            model_kwargs (`Dict[str, Any]`):
                Extra model inputs. Must contain `input_ids`; it is popped and fed to the backbone.

        Returns:
            [`~DiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, a pipeline output is returned, otherwise a tuple whose first element
                is the list of generated samples.
        """

        # NOTE(review): popped but never used below — presumably kept for backward compat; confirm.
        callback_steps = kwargs.pop("callback_steps", None)
        pbar_steps = kwargs.pop("pbar_steps", None)

        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        self._guidance_scale = guidance_scale
        self._guidance_rescale = guidance_rescale

        # With a CFG-distilled model, guidance is baked in: no cond/uncond batch doubling.
        if not kwargs.get('cfg_distilled', False):
            cfg_factor = 1 + self.do_classifier_free_guidance
        else:
            cfg_factor = 1
        # Define call parameters
        device = self._execution_device

        # Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas,
        )

        # Prepare latent variables
        latents = self.prepare_latents(
            batch_size=batch_size,
            latent_channel=self.model.config.vae["latent_channels"],
            image_size=image_size,
            dtype=torch.bfloat16,
            device=device,
            generator=generator,
            latents=latents,
        )

        # Prepare extra step kwargs (only passes `generator` if the scheduler's step accepts it).
        _scheduler_step_extra_kwargs = self.prepare_extra_func_kwargs(
            self.scheduler.step, {"generator": generator}
        )

        # Prepare model kwargs
        input_ids = model_kwargs.pop("input_ids")
        attention_mask = self.model._prepare_attention_mask_for_generation(  # noqa
            input_ids, self.model.generation_config, model_kwargs=model_kwargs,
        )
        model_kwargs["attention_mask"] = attention_mask.to(latents.device)

        # Sampling loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)

        # Taylor cache: optional feature caching across steps to skip redundant model work.
        cache_dic = None
        if self.model.use_taylor_cache:
            cache_dic = cache_init(
                cache_interval=self.model.taylor_cache_interval,
                max_order=self.model.taylor_cache_order,
                num_steps=len(timesteps),
                enable_first_enhance=self.model.taylor_cache_enable_first_enhance,
                first_enhance_steps=self.model.taylor_cache_first_enhance_steps,
                enable_tailing_enhance=self.model.taylor_cache_enable_tailing_enhance,
                tailing_enhance_steps=self.model.taylor_cache_tailing_enhance_steps,
                low_freqs_order=self.model.taylor_cache_low_freqs_order,
                high_freqs_order=self.model.taylor_cache_high_freqs_order)
            print(f"***use_taylor_cache: {self.model.use_taylor_cache}, cache_dic: {cache_dic}")

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * cfg_factor)
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Mean-flow models take an extra timestep `r`; otherwise pass None.
                if meanflow:
                    r = self.scheduler.get_timestep_r(t)
                    r_expand = r.repeat(latent_model_input.shape[0])
                else:
                    r_expand = None
                model_kwargs["timesteps_r"] = r_expand

                t_expand = t.repeat(latent_model_input.shape[0])

                if self.model.use_taylor_cache:
                    cache_dic['current_step'] = i
                    model_kwargs['cache_dic'] = cache_dic
                # Distilled CFG: guidance strength is fed to the model as a scalar input.
                if kwargs.get('cfg_distilled', False):
                    model_kwargs["guidance"] = torch.tensor(
                        [1000.0*self._guidance_scale], device=self.device, dtype=torch.bfloat16
                    )
                model_inputs = self.model.prepare_inputs_for_generation(
                    input_ids,
                    images=latent_model_input,
                    timesteps=t_expand,
                    **model_kwargs,
                )
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
                    model_output = self.model(**model_inputs, first_step=(i == 0))
                    pred = model_output["diffusion_prediction"]
                # Scheduler math runs in fp32 regardless of model precision.
                pred = pred.to(dtype=torch.float32)
                # perform guidance
                if self.do_classifier_free_guidance:
                    if not kwargs.get('cfg_distilled', False):
                        pred_cond, pred_uncond = pred.chunk(2)
                        pred = self.cfg_operator(pred_cond, pred_uncond, self.guidance_scale, step=i)

                # NOTE(review): `pred_cond` is only bound on the non-distilled CFG path above;
                # with cfg_distilled + guidance_rescale > 0 this would raise NameError — confirm intent.
                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    pred = rescale_noise_cfg(pred, pred_cond, guidance_rescale=self.guidance_rescale)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(pred, t, latents, **_scheduler_step_extra_kwargs, return_dict=False)[0]

                if i != len(timesteps) - 1:
                    model_kwargs = self.model._update_model_kwargs_for_generation(  # noqa
                        model_output,
                        model_kwargs,
                    )
                    # After the first step the prompt is cached in model_kwargs; drop input_ids.
                    input_ids = None
                    # if input_ids.shape[1] != model_kwargs["position_ids"].shape[1]:
                    #     input_ids = torch.gather(input_ids, 1, index=model_kwargs["position_ids"])

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

        # Undo the VAE's latent normalization before decoding.
        if hasattr(self.vae.config, 'scaling_factor') and self.vae.config.scaling_factor:
            latents = latents / self.vae.config.scaling_factor
        if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
            latents = latents + self.vae.config.shift_factor

        # Temporal VAEs expect a [B, C, T, H, W] input; add a singleton time axis.
        if hasattr(self.vae, "ffactor_temporal"):
            latents = latents.unsqueeze(2)

        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
            image = self.vae.decode(latents, return_dict=False, generator=generator)[0]

        # b c t h w
        if hasattr(self.vae, "ffactor_temporal"):
            assert image.shape[2] == 1, "image should have shape [B, C, T, H, W] and T should be 1"
            image = image.squeeze(2)

        do_denormalize = [True] * image.shape[0]
        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        if not return_dict:
            return (image,)

        return HunyuanImage3Text2ImagePipelineOutput(samples=image)
|
image_processor.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
| 2 |
+
# you may not use this file except in compliance with the License.
|
| 3 |
+
# You may obtain a copy of the License at
|
| 4 |
+
#
|
| 5 |
+
# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
|
| 6 |
+
#
|
| 7 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 8 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 9 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 10 |
+
# See the License for the specific language governing permissions and
|
| 11 |
+
# limitations under the License.
|
| 12 |
+
# ==============================================================================
|
| 13 |
+
|
| 14 |
+
from dataclasses import dataclass, field, asdict
|
| 15 |
+
from typing import Tuple, Optional, Callable, Union, Any
|
| 16 |
+
import random
|
| 17 |
+
import math
|
| 18 |
+
|
| 19 |
+
import torch
|
| 20 |
+
from PIL import Image
|
| 21 |
+
from torchvision import transforms
|
| 22 |
+
from transformers.image_processing_utils import BaseImageProcessor
|
| 23 |
+
from transformers.image_utils import load_image
|
| 24 |
+
from transformers.models.siglip2.image_processing_siglip2_fast import Siglip2ImageProcessorFast
|
| 25 |
+
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
|
| 26 |
+
|
| 27 |
+
from .tokenization_hunyuan_image_3 import ImageInfo, ImageTensor, CondImage, Resolution, ResolutionGroup
|
| 28 |
+
|
| 29 |
+
# A conditioning image may be supplied as an in-memory PIL image or a path/URL string.
InputImage = Union[Image.Image, str]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class SliceVocabLogitsProcessor(LogitsProcessor):
    """
    [`LogitsProcessor`] that performs vocab slicing, i.e. restricting probabilities within some range. This
    processor is often used in multimodal discrete LLMs to ensure that we only sample within one modality.

    Args:
        vocab_start (`int`, *optional*): start of slice; `None` means from index 0.
        vocab_end (`int`, *optional*): end of slice; `None` means to the end of the vocab.
            When both start and end are `None`, this processor does nothing.
        other_slices (`list`, via kwargs): extra `(start, end)` index pairs whose score
            columns are concatenated after the main slice.
    """

    def __init__(self, vocab_start: Optional[int] = None, vocab_end: Optional[int] = None, **kwargs):
        if vocab_start is not None and vocab_end is not None:
            assert vocab_start < vocab_end, f"Ensure vocab_start {vocab_start} < vocab_end {vocab_end}"
        self.vocab_start = vocab_start
        self.vocab_end = vocab_end
        self.other_slices = kwargs.get("other_slices", [])

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Keep the main slice, then append the columns of any extra slices.
        scores_processed = scores[:, self.vocab_start: self.vocab_end]
        for other_slice in self.other_slices:
            scores_processed = torch.cat([scores_processed, scores[:, other_slice[0]: other_slice[1]]], dim=-1)
        return scores_processed

    def __repr__(self):
        # Fix: the original hard-coded the wrong name "SliceVocabLogitsWarper".
        return (f"{type(self).__name__}(vocab_start={self.vocab_start}, "
                f"vocab_end={self.vocab_end}, other_slices={self.other_slices})")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def resize_and_crop(image: Image.Image, target_size: Tuple[int, int], resample=Image.Resampling.LANCZOS, crop_type='center', crop_coords=None) -> Image.Image:
    """Resize `image` to exactly `target_size` (width, height).

    `crop_type='resize'` stretches without preserving aspect ratio; every other
    mode resizes so one side matches the target while preserving aspect ratio,
    then crops the overflow ('center', 'random', or 'fixed' via `crop_coords`
    given as (left, top)).

    Raises:
        ValueError: for an unknown `crop_type`.
    """
    tw, th = target_size
    w, h = image.size

    # Height/width ratios of target and source; decides which side to match.
    tr = th / tw
    r = h / w

    if crop_type == "resize":
        # Direct stretch; the final crop below is a no-op over the full frame.
        # NOTE(review): the shared resize at the bottom re-runs on this branch
        # (same size, so pixel-identical) — looks redundant; confirm intent.
        resize_width = tw
        resize_height = th
        crop_top = 0
        crop_left = 0
        image = image.resize((resize_width, resize_height), resample=resample)
    else:
        # maintain the aspect ratio
        if r < tr:
            # Source is wider than the target: match height, crop width.
            resize_height = th
            resize_width = int(round(th / h * w))
        else:
            resize_width = tw
            resize_height = int(round(tw / w * h))

        if crop_type == 'center':
            crop_top = int(round((resize_height - th) / 2.0))
            crop_left = int(round((resize_width - tw) / 2.0))
        elif crop_type == 'random':
            crop_top = random.randint(0, resize_height - th)
            crop_left = random.randint(0, resize_width - tw)
        elif crop_type == 'fixed':
            assert crop_coords is not None, 'crop_coords should be provided when crop_type is fixed.'
            crop_left, crop_top = crop_coords
        else:
            raise ValueError(f'crop_type must be center, random or fixed, but got {crop_type}')

    image = image.resize((resize_width, resize_height), resample=resample)
    image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))

    return image
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@dataclass
class ResolutionGroupConfig:
    """Construction parameters forwarded to `ResolutionGroup` (see `to_dict`)."""
    # Base (square) image size the resolution buckets are derived from.
    base_size: Optional[int] = None
    # Step between candidate resolutions; None lets ResolutionGroup pick its default.
    step: Optional[int] = None
    # Resolutions are rounded to multiples of this value.
    align: int = 16

    def to_dict(self):
        """Return the config as a plain dict, suitable for `ResolutionGroup(**...)`."""
        return asdict(self)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
@dataclass
class VAEInfo:
    """Geometry of the VAE image encoder: how many pixels map to one latent token."""
    encoder_type: str
    # Spatial downsampling of the VAE itself; -1 means "must be provided".
    down_h_factor: int = -1
    down_w_factor: int = -1
    # Extra patchification applied on top of the VAE downsampling.
    patch_size: int = 1
    # Effective pixels-per-token factors; derived in __post_init__.
    h_factor: int = -1
    w_factor: int = -1
    image_type: Optional[str] = None

    def __post_init__(self):
        # Total factor = VAE downsampling x patch size.
        self.h_factor = self.down_h_factor * self.patch_size
        self.w_factor = self.down_w_factor * self.patch_size
        if self.image_type is None:
            self.image_type = "vae"
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@dataclass
class ViTInfo:
    """Geometry and processor of the ViT conditioning encoder."""
    encoder_type: str
    # Pixels per ViT token along each axis; -1 means "must be provided".
    h_factor: int = -1
    w_factor: int = -1
    max_token_length: int = 0  # pad to max_token_length
    # Image preprocessor that turns PIL images into encoder inputs.
    processor: Callable = field(default_factory=BaseImageProcessor)
    image_type: Optional[str] = None

    def __post_init__(self):
        # Default image_type to the encoder family, e.g. "siglip2-..." -> "siglip2".
        if self.image_type is None:
            self.image_type = self.encoder_type.split("-")[0]
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class HunyuanImage3ImageProcessor(object):
|
| 143 |
+
    def __init__(self, config):
        """Build resolution buckets, VAE/ViT geometry info, and the ViT preprocessor from `config`.

        Raises:
            ValueError: if `config.vit_type` is not a supported SigLIP2 variant.
        """
        self.config = config

        self.reso_group_config = ResolutionGroupConfig(base_size=config.image_base_size)
        # Standard landscape/portrait resolutions are added on top of the generated buckets.
        self.vae_reso_group = ResolutionGroup(
            **self.reso_group_config.to_dict(),
            extra_resolutions=[
                Resolution("1024x768"),
                Resolution("1280x720"),
                Resolution("768x1024"),
                Resolution("720x1280"),
            ]
        )
        self.img_ratio_slice_logits_processor = None
        self.pil_image_to_tensor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),  # transform to [-1, 1]
        ])
        # NOTE(review): vae_downsample_factor[0] is used for BOTH h and w; if the config
        # stores (h, w) factors, the w factor should presumably be index [1] — confirm.
        self.vae_info = VAEInfo(
            encoder_type=config.vae_type,
            down_h_factor=config.vae_downsample_factor[0], down_w_factor=config.vae_downsample_factor[0],
            patch_size=config.patch_size,
        )

        if config.vit_type == "siglip2-so400m-patch16-naflex":
            self.vit_processor = Siglip2ImageProcessorFast.from_dict(config.vit_processor)
        else:
            raise ValueError(f"Unsupported vit_type: {config.vit_type}")
        self.vit_info = ViTInfo(
            encoder_type=config.vit_type,
            h_factor=self.vit_processor.patch_size,
            w_factor=self.vit_processor.patch_size,
            max_token_length=self.vit_processor.max_num_patches,
            processor=self.vit_processor,
        )
        self.cond_token_attn_type = config.cond_token_attn_type
        self.cond_image_type = config.cond_image_type
|
| 180 |
+
|
| 181 |
+
    def build_gen_image_info(self, image_size, add_guidance_token=False, add_timestep_r_token=False) -> ImageInfo:
        """Parse a requested output size and return the `ImageInfo` of the image to generate.

        Args:
            image_size: One of 'HxW', 'W:H', an '<img_ratio_i>' token (index into the
                resolution group), or a (height, width) pair of ints.
            add_guidance_token: Forwarded to `ImageInfo`.
            add_timestep_r_token: Forwarded to `ImageInfo`.

        Raises:
            ValueError: for unparseable string formats or unsupported types.
        """
        # parse image size (HxW, H:W, or <img_ratio_i>)
        if isinstance(image_size, str):
            if image_size.startswith("<img_ratio_"):
                ratio_index = int(image_size.split("_")[-1].rstrip(">"))
                reso = self.vae_reso_group[ratio_index]
                image_size = reso.height, reso.width
            elif 'x' in image_size:
                image_size = [int(s) for s in image_size.split('x')]
            elif ':' in image_size:
                image_size = [int(s) for s in image_size.split(':')]
                assert len(image_size) == 2, f"`image_size` should be in the format of 'W:H', got {image_size}."
                # Note that ratio is width:height
                image_size = [image_size[1], image_size[0]]
            else:
                raise ValueError(
                    f"`image_size` should be in the format of 'HxW', 'W:H' or <img_ratio_i>, got {image_size}.")
            assert len(image_size) == 2, f"`image_size` should be in the format of 'HxW', got {image_size}."
        elif isinstance(image_size, (list, tuple)):
            assert len(image_size) == 2 and all(isinstance(s, int) for s in image_size), \
                f"`image_size` should be a tuple of two integers or a string in the format of 'HxW', got {image_size}."
        else:
            raise ValueError(f"`image_size` should be a tuple of two integers or a string in the format of 'WxH', "
                             f"got {image_size}.")
        # Snap the requested size to the nearest supported resolution bucket (args are (w, h)).
        image_width, image_height = self.vae_reso_group.get_target_size(image_size[1], image_size[0])
        token_height = image_height // self.vae_info.h_factor
        token_width = image_width // self.vae_info.w_factor
        base_size, ratio_idx = self.vae_reso_group.get_base_size_and_ratio_index(image_size[1], image_size[0])
        image_info = ImageInfo(
            image_type="gen_image", image_width=image_width, image_height=image_height,
            token_width=token_width, token_height=token_height, base_size=base_size, ratio_index=ratio_idx,
            add_guidance_token=add_guidance_token, add_timestep_r_token=add_timestep_r_token,
        )
        return image_info
|
| 215 |
+
|
| 216 |
+
    def as_image_tensor(self, image, image_type, **kwargs) -> ImageTensor:
        """Wrap a (PIL or tensor) image as an `ImageTensor` annotated with `ImageInfo`.

        Args:
            image: A PIL image (converted to a [-1, 1] tensor) or an already-built tensor.
            image_type: One of "vae", "siglip2", or "anyres"; selects how token geometry
                and section type are derived.
            **kwargs: Must contain `origin_size` (w, h); "siglip2" additionally requires
                `spatial_shapes` and `pixel_attention_mask`; "anyres" requires
                `resized_image_width`/`resized_image_height`.

        Raises:
            ValueError: for an unknown `image_type`.
        """
        if isinstance(image, Image.Image):
            tensor = self.pil_image_to_tensor(image)
        else:
            tensor = image

        origin_size = kwargs["origin_size"]
        ori_image_width = origin_size[0]
        ori_image_height = origin_size[1]

        if image_type == "vae":
            assert tensor.ndim == 3 or tensor.ndim == 4
            h, w = tensor.shape[-2], tensor.shape[-1]
            # VAE tokens require pixel dims divisible by the effective downsampling factors.
            assert (h % self.vae_info.h_factor == 0 and w % self.vae_info.w_factor == 0), \
                (f"Image size should be divisible by ({self.vae_info.h_factor}, {self.vae_info.w_factor}), "
                 f"but got ({h} x {w}).")
            tk_height = h // self.vae_info.h_factor
            tk_width = w // self.vae_info.w_factor
            base_size, ratio_idx = self.vae_reso_group.get_base_size_and_ratio_index(w, h)
            tensor.i = ImageInfo(
                image_type=image_type,
                image_width=w, image_height=h, token_width=tk_width, token_height=tk_height,
                base_size=base_size, ratio_index=ratio_idx,
                ori_image_width=ori_image_width,
                ori_image_height=ori_image_height,
            )
            tensor.section_type = "cond_vae_image"
        elif image_type == "siglip2":
            # NaFlex-style ViT input: geometry comes from the processor's outputs.
            spatial_shapes = kwargs["spatial_shapes"]  # 2 (h, w)
            pixel_attention_mask = kwargs["pixel_attention_mask"]  # seq_len
            tensor.i = ImageInfo(
                image_type=image_type,
                image_width=spatial_shapes[1].item() * self.vit_info.w_factor,
                image_height=spatial_shapes[0].item() * self.vit_info.h_factor,
                token_width=spatial_shapes[1].item(),
                token_height=spatial_shapes[0].item(),
                image_token_length=self.vit_info.max_token_length,
                ori_image_width=ori_image_width,
                ori_image_height=ori_image_height,
            )
            tensor.section_type = "cond_vit_image"
            tensor.vision_encoder_kwargs = {
                "spatial_shapes": spatial_shapes,
                "pixel_attention_mask": pixel_attention_mask,
            }
        elif image_type == "anyres":
            token_width = kwargs["resized_image_width"] // self.vit_info.w_factor
            token_height = kwargs["resized_image_height"] // self.vit_info.h_factor
            tensor.i = ImageInfo(
                image_type=image_type,
                image_width=kwargs["resized_image_width"],
                image_height=kwargs["resized_image_height"],
                token_width=token_width,
                token_height=token_height,
                # +1 per row and +2 overall — presumably row separators plus begin/end
                # tokens; confirm against the tokenizer.
                image_token_length=token_height * (token_width + 1) + 2,
            )
            tensor.section_type = "cond_vit_image"
        else:
            raise ValueError(f"Unknown image type: {image_type}")
        return tensor
|
| 276 |
+
|
| 277 |
+
def vae_process_image(self, image, target_size, random_crop: bool | str = False) -> ImageTensor:
|
| 278 |
+
origin_size = image.size
|
| 279 |
+
crop_type = random_crop if isinstance(random_crop, str) else ("random" if random_crop else "center")
|
| 280 |
+
resized_image = resize_and_crop(image, target_size, crop_type=crop_type)
|
| 281 |
+
return self.as_image_tensor(resized_image, image_type=self.vae_info.image_type, origin_size=origin_size)
|
| 282 |
+
|
| 283 |
+
def vit_process_image(self, image) -> ImageTensor:
|
| 284 |
+
origin_size = image.size
|
| 285 |
+
inputs = self.vit_info.processor(image)
|
| 286 |
+
image = inputs["pixel_values"].squeeze(0) # (seq_len, dim)
|
| 287 |
+
|
| 288 |
+
remain_keys = set(inputs.keys()) - {"pixel_values"}
|
| 289 |
+
remain_kwargs = {}
|
| 290 |
+
for key in remain_keys:
|
| 291 |
+
if isinstance(inputs[key], torch.Tensor):
|
| 292 |
+
remain_kwargs[key] = inputs[key].squeeze(0)
|
| 293 |
+
else:
|
| 294 |
+
remain_kwargs[key] = inputs[key]
|
| 295 |
+
|
| 296 |
+
return self.as_image_tensor(image, image_type=self.vit_info.image_type, origin_size=origin_size, **remain_kwargs)
|
| 297 |
+
|
| 298 |
+
def get_image_with_size(
|
| 299 |
+
self,
|
| 300 |
+
src: InputImage,
|
| 301 |
+
random_crop: bool | str = False,
|
| 302 |
+
return_type: str = "vae",
|
| 303 |
+
) -> tuple[ImageTensor | CondImage, bool]:
|
| 304 |
+
""" For various image generation tasks, dynamic image sizes """
|
| 305 |
+
image = load_image(src)
|
| 306 |
+
image_flag = "normal"
|
| 307 |
+
img_success = image_flag != "gray"
|
| 308 |
+
origin_size = image.size # (w_ori, h_ori)
|
| 309 |
+
|
| 310 |
+
if "vae" in return_type:
|
| 311 |
+
target_size = self.vae_reso_group.get_target_size(*origin_size)
|
| 312 |
+
vae_image_tensor = self.vae_process_image(image, target_size, random_crop=random_crop)
|
| 313 |
+
else:
|
| 314 |
+
vae_image_tensor = None
|
| 315 |
+
|
| 316 |
+
if "vit" in return_type:
|
| 317 |
+
vit_image_tensor = self.vit_process_image(image)
|
| 318 |
+
else:
|
| 319 |
+
vit_image_tensor = None
|
| 320 |
+
|
| 321 |
+
if return_type == "vae":
|
| 322 |
+
image_tensor = vae_image_tensor
|
| 323 |
+
elif return_type == "vit":
|
| 324 |
+
image_tensor = vit_image_tensor
|
| 325 |
+
elif return_type == "vae_vit":
|
| 326 |
+
image_tensor = CondImage(image_type=return_type, vae_image=vae_image_tensor, vit_image=vit_image_tensor)
|
| 327 |
+
else:
|
| 328 |
+
raise ValueError(f"Unknown return_type: {return_type}")
|
| 329 |
+
|
| 330 |
+
return image_tensor, img_success
|
| 331 |
+
|
| 332 |
+
def build_cond_images(
|
| 333 |
+
self,
|
| 334 |
+
image_list: Optional[list[InputImage]] = None,
|
| 335 |
+
message_list: Optional[list[dict[str, Any]]] = None,
|
| 336 |
+
infer_align_image_size: bool = False,
|
| 337 |
+
) -> Optional[list[CondImage]]:
|
| 338 |
+
if image_list is not None and message_list is not None:
|
| 339 |
+
raise ValueError("`image_list` and `message_list` cannot be provided at the same time.")
|
| 340 |
+
if message_list is not None:
|
| 341 |
+
image_list = []
|
| 342 |
+
for message in message_list:
|
| 343 |
+
visuals = [
|
| 344 |
+
content
|
| 345 |
+
for content in message["content"]
|
| 346 |
+
if isinstance(content, dict) and content["type"] in ["image"]
|
| 347 |
+
]
|
| 348 |
+
image_list.extend([
|
| 349 |
+
vision_info[key]
|
| 350 |
+
for vision_info in visuals
|
| 351 |
+
for key in ["image", "url", "path", "base64"]
|
| 352 |
+
if key in vision_info and vision_info["type"] == "image"
|
| 353 |
+
])
|
| 354 |
+
|
| 355 |
+
if infer_align_image_size:
|
| 356 |
+
random_crop = "resize"
|
| 357 |
+
else:
|
| 358 |
+
random_crop = "center"
|
| 359 |
+
|
| 360 |
+
return [
|
| 361 |
+
self.get_image_with_size(src, return_type=self.cond_image_type, random_crop=random_crop)[0]
|
| 362 |
+
for src in image_list
|
| 363 |
+
]
|
| 364 |
+
|
| 365 |
+
def prepare_full_attn_slices(self, output, batch_idx=None, with_gen=True):
|
| 366 |
+
""" Determine full attention image slices according to strategies. """
|
| 367 |
+
if self.cond_image_type == "vae":
|
| 368 |
+
cond_choices = dict(
|
| 369 |
+
causal=[],
|
| 370 |
+
full=output.vae_image_slices[batch_idx] if batch_idx is not None else output.vae_image_slices
|
| 371 |
+
)
|
| 372 |
+
|
| 373 |
+
elif self.cond_image_type == "vit":
|
| 374 |
+
cond_choices = dict(
|
| 375 |
+
causal=[],
|
| 376 |
+
full=output.vit_image_slices[batch_idx] if batch_idx is not None else output.vit_image_slices
|
| 377 |
+
)
|
| 378 |
+
|
| 379 |
+
elif self.cond_image_type == "vae_vit":
|
| 380 |
+
cond_choices = {
|
| 381 |
+
"causal": [],
|
| 382 |
+
"full": (
|
| 383 |
+
output.vae_image_slices[batch_idx] + output.vit_image_slices[batch_idx]
|
| 384 |
+
if batch_idx is not None
|
| 385 |
+
else output.vae_image_slices + output.vit_image_slices
|
| 386 |
+
),
|
| 387 |
+
"joint_full": (
|
| 388 |
+
output.joint_image_slices[batch_idx]
|
| 389 |
+
if batch_idx is not None
|
| 390 |
+
else output.joint_image_slices
|
| 391 |
+
),
|
| 392 |
+
"full_causal": (
|
| 393 |
+
output.vae_image_slices[batch_idx]
|
| 394 |
+
if batch_idx is not None
|
| 395 |
+
else output.vae_image_slices
|
| 396 |
+
),
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
else:
|
| 400 |
+
raise ValueError(f"Unknown cond_image_type: {self.cond_image_type}")
|
| 401 |
+
slices = cond_choices[self.cond_token_attn_type]
|
| 402 |
+
|
| 403 |
+
if with_gen:
|
| 404 |
+
gen_image_slices = (
|
| 405 |
+
output.gen_image_slices[batch_idx]
|
| 406 |
+
if batch_idx is not None
|
| 407 |
+
else output.gen_image_slices
|
| 408 |
+
)
|
| 409 |
+
slices = slices + gen_image_slices
|
| 410 |
+
return slices
|
| 411 |
+
|
| 412 |
+
def build_img_ratio_slice_logits_processor(self, tokenizer):
|
| 413 |
+
if self.img_ratio_slice_logits_processor is None:
|
| 414 |
+
self.img_ratio_slice_logits_processor = LogitsProcessorList()
|
| 415 |
+
self.img_ratio_slice_logits_processor.append(
|
| 416 |
+
SliceVocabLogitsProcessor(
|
| 417 |
+
vocab_start=tokenizer.start_ratio_token_id,
|
| 418 |
+
vocab_end=tokenizer.end_ratio_token_id + 1,
|
| 419 |
+
other_slices=getattr(tokenizer, "ratio_token_other_slices", []),
|
| 420 |
+
)
|
| 421 |
+
)
|
| 422 |
+
|
| 423 |
+
def postprocess_outputs(self, outputs: list[Image.Image], batch_cond_images, infer_align_image_size: bool = False):
|
| 424 |
+
if infer_align_image_size:
|
| 425 |
+
target_area = self.vae_reso_group.base_size ** 2
|
| 426 |
+
|
| 427 |
+
for batch_index, (output_image, cond_images) in enumerate(zip(outputs, batch_cond_images)):
|
| 428 |
+
output_image_ratio_index = self.vae_reso_group.get_base_size_and_ratio_index(width=output_image.width, height=output_image.height)[1]
|
| 429 |
+
cond_images_ratio_index_list = []
|
| 430 |
+
cond_images_ori_width_list = []
|
| 431 |
+
cond_images_ori_height_list = []
|
| 432 |
+
for cond_image in cond_images:
|
| 433 |
+
if isinstance(cond_image, ImageTensor):
|
| 434 |
+
cond_images_ratio_index_list.append(cond_image.i.ratio_index)
|
| 435 |
+
cond_images_ori_width_list.append(cond_image.i.ori_image_width)
|
| 436 |
+
cond_images_ori_height_list.append(cond_image.i.ori_image_height)
|
| 437 |
+
else: # CondImage
|
| 438 |
+
cond_images_ratio_index_list.append(cond_image.vae_image.i.ratio_index)
|
| 439 |
+
cond_images_ori_width_list.append(cond_image.vae_image.i.ori_image_width)
|
| 440 |
+
cond_images_ori_height_list.append(cond_image.vae_image.i.ori_image_height)
|
| 441 |
+
|
| 442 |
+
if len(cond_images) == 0:
|
| 443 |
+
continue
|
| 444 |
+
elif len(cond_images) == 1:
|
| 445 |
+
if output_image_ratio_index == cond_images_ratio_index_list[0]:
|
| 446 |
+
if abs(cond_images_ori_height_list[0] / cond_images_ori_width_list[0] - self.vae_reso_group[output_image_ratio_index].ratio) >= 0.01:
|
| 447 |
+
scale = math.sqrt(target_area / (cond_images_ori_width_list[0] * cond_images_ori_height_list[0]))
|
| 448 |
+
new_w = round(cond_images_ori_width_list[0] * scale)
|
| 449 |
+
new_h = round(cond_images_ori_height_list[0] * scale)
|
| 450 |
+
outputs[batch_index] = output_image.resize((new_w, new_h), resample=Image.Resampling.LANCZOS)
|
| 451 |
+
else:
|
| 452 |
+
for cond_image_ratio_index, cond_image_ori_width, cond_image_ori_height in zip(cond_images_ratio_index_list, cond_images_ori_width_list, cond_images_ori_height_list):
|
| 453 |
+
if output_image_ratio_index == cond_image_ratio_index:
|
| 454 |
+
if abs(cond_image_ori_height / cond_image_ori_width - self.vae_reso_group[output_image_ratio_index].ratio) >= 0.01:
|
| 455 |
+
scale = math.sqrt(target_area / (cond_image_ori_width * cond_image_ori_height))
|
| 456 |
+
new_w = round(cond_image_ori_width * scale)
|
| 457 |
+
new_h = round(cond_image_ori_height * scale)
|
| 458 |
+
outputs[batch_index] = output_image.resize((new_w, new_h), resample=Image.Resampling.LANCZOS)
|
| 459 |
+
break
|
| 460 |
+
|
| 461 |
+
return outputs
|
| 462 |
+
|
| 463 |
+
__all__ = [
|
| 464 |
+
"HunyuanImage3ImageProcessor"
|
| 465 |
+
]
|
model-0001-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2db6ab327b5a5a9ff2be48bc41fae98d7de01b0a29f1a5ecc88b079637bce016
|
| 3 |
+
size 5363066616
|
model-0002-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09538e24c7437751d2384dde73cf4e913dce1e67bfdf87b0b1933963dc117a41
|
| 3 |
+
size 5318937248
|
model-0003-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c268190bd3c0d57b05cd5e859d5dcce1b30df2ede2486396179b28b2517cf820
|
| 3 |
+
size 5344627472
|
model-0004-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:315220e3fcc1a02673670e63c1eb8d2a73970e17e5b787156902fd7f7258220d
|
| 3 |
+
size 5327343192
|
model-0005-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ad5d55a79c80186537367d8bfcee7de722f8b34c820391b656f14a5fed1b085
|
| 3 |
+
size 5344103080
|
model-0006-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:834abe96ce34acbbb77a72990058c6e324207db8dcd878a01752792dd1fb38b4
|
| 3 |
+
size 5318937248
|
model-0007-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3825bbee6d58000f357ea24b312040eb16a466d14bed5b44b89eeda07344a4fa
|
| 3 |
+
size 5344103088
|
model-0008-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec4108cb77f70a545335ba2108d7fb4bcb6f6831dd3080e5a6b330433e7de69f
|
| 3 |
+
size 5318937256
|
model-0009-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aee7924c4d27ba7fe722b2581d2a87775715085ad38016042b6c1b8c998afb7b
|
| 3 |
+
size 5344103088
|
model-0010-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:559b75c2da0baf8c0389a889990925d140dcee1f8a57ecd0b3cc6db4bb7013be
|
| 3 |
+
size 5318937304
|
model-0011-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09882d830666ae891b41023d4e7af0cc7083d731d198295411b2ebfbabedbc7c
|
| 3 |
+
size 5344103232
|
model-0012-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c304453394b2b30dc281b9f4e155b6709b682a8602994093240afefc49548bd7
|
| 3 |
+
size 5318937400
|
model-0013-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48d01ca1697b035d9dd0f4482ac17d740876ec82bdc511931f128978bdbdd5c3
|
| 3 |
+
size 5344103232
|
model-0014-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c472e079b15d85e0cd81ee57f34d7815834b0b43b5a8fbccc561285e6cafacb1
|
| 3 |
+
size 5318937400
|
model-0015-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:886837fb5ba9b75b6bf754c98563af1955ad9b27a8130e5480dd608638f03576
|
| 3 |
+
size 5344103232
|
model-0016-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa722844f3e88ff87973a385cc46dd4e5be2a820b1dd9be6957c152b62dd25b2
|
| 3 |
+
size 5318937400
|
model-0017-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a237fe2fad03479bd29f635e8ef7e4a1c9e354e29c2295ecea10fd43602f3e05
|
| 3 |
+
size 5344103224
|
model-0018-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:889f2bb13328f2085593dfe06328ce4be24d2379e294ef37869827a9f59b59a6
|
| 3 |
+
size 5327859080
|
model-0019-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb4dd0053a798204b952a28b103d8abecd35a40ad1702ae6a39f71aedbeef627
|
| 3 |
+
size 5344111888
|
model-0020-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7baf9d180d2e5ef99e35864440190d700f6378c33a8dbc1adc629b2a2ec263ca
|
| 3 |
+
size 5318937392
|
model-0021-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c419d5445acff5eb09a436e0a6236f7b833a4aac6b3f2316c044abe770ed58e
|
| 3 |
+
size 5344103232
|
model-0022-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20d6d65cf9208fb649daafb0e1511b526c0b2eeac0463ead6fa151e6bd8e3207
|
| 3 |
+
size 5318937400
|
model-0023-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c5b70acb118d07ab02683c3467e7636e8197cb3b8969220eb447c7dd97470bd
|
| 3 |
+
size 5344103232
|
model-0024-of-0032.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65d5830e1162a4bed3f4e8af24d8fcf22fbf9ceb8d0260e8a51ca14cf748d64f
|
| 3 |
+
size 5318937400
|