diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..d3c41fa5cd549f87632161645de4ce360a620314 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +utils/__pycache__/import_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +utils/__pycache__/import_utils.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0e0ee69e6f12cf79a359f653177e9a37cdbb0cf9 --- /dev/null +++ b/__init__.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + +from utils import _LazyModule +from utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_hunyuan_image_3 import * + from .modeling_hunyuan_image_3 import * + from .autoencoder_kl_3d import * + from .image_processor import * + from .siglip2 import * + from .tokenization_hunyuan_image_3 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/autoencoder_kl_3d.py b/autoencoder_kl_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..1ecf3153ae063b3992619ccd47b6944ac3ef73a7 --- /dev/null +++ b/autoencoder_kl_3d.py @@ -0,0 +1,1081 @@ +""" +Reference code +[FLUX] https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/autoencoder.py +[DCAE] https://github.com/mit-han-lab/efficientvit/blob/master/efficientvit/models/efficientvit/dc_ae.py +""" +import os +from dataclasses import dataclass +from typing import Tuple, Optional +import math +import random +import numpy as np +from einops import rearrange +import torch +from torch import Tensor, nn +import torch.nn.functional as F +import torch.distributed as dist +import torch.multiprocessing as mp + +from safetensors import safe_open +import os +from collections import OrderedDict +from collections.abc import Iterable +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.modeling_outputs import AutoencoderKLOutput +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils.torch_utils import randn_tensor +from diffusers.utils import BaseOutput + + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters: torch.Tensor, deterministic: bool = False): + if parameters.ndim == 3: + dim = 2 # (B, L, C) + elif parameters.ndim == 5 or parameters.ndim == 4: + dim = 1 # (B, C, T, H ,W) / (B, C, H, W) + else: + raise NotImplementedError + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean, device=self.parameters.device, dtype=self.parameters.dtype + ) + + def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor: + # make sure sample is on the same device as the parameters and has same dtype + sample = randn_tensor( + self.mean.shape, + generator=generator, + device=self.parameters.device, + dtype=self.parameters.dtype, + ) + x = self.mean + 
self.std * sample + return x + + def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + else: + reduce_dim = list(range(1, self.mean.ndim)) + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=reduce_dim, + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - + 1.0 - + self.logvar + + other.logvar, + dim=reduce_dim, + ) + + def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self) -> torch.Tensor: + return self.mean + +@dataclass +class DecoderOutput(BaseOutput): + sample: torch.FloatTensor + posterior: Optional[DiagonalGaussianDistribution] = None + +def swish(x: Tensor) -> Tensor: + return x * torch.sigmoid(x) + +def forward_with_checkpointing(module, *inputs, use_checkpointing=False): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + return custom_forward + + if use_checkpointing: + return torch.utils.checkpoint.checkpoint(create_custom_forward(module), *inputs, use_reentrant=False) + else: + return module(*inputs) + + +class Conv3d(nn.Conv3d): + """Perform Conv3d on patches with numerical differences from nn.Conv3d within 1e-5. Only symmetric padding is supported.""" + + def forward(self, input): + B, C, T, H, W = input.shape + memory_count = (C * T * H * W) * 2 / 1024**3 + if memory_count > 2: + n_split = math.ceil(memory_count / 2) + assert n_split >= 2 + chunks = torch.chunk(input, chunks=n_split, dim=-3) + padded_chunks = [] + for i in range(len(chunks)): + if self.padding[0] > 0: + padded_chunk = F.pad( + chunks[i], + (0, 0, 0, 0, self.padding[0], self.padding[0]), + mode="constant" if self.padding_mode == "zeros" else self.padding_mode, + value=0, + ) + if i > 0: + padded_chunk[:, :, :self.padding[0]] = chunks[i - 1][:, :, -self.padding[0]:] + if i < len(chunks) - 1: + padded_chunk[:, :, -self.padding[0]:] = chunks[i + 1][:, :, :self.padding[0]] + else: + padded_chunk = chunks[i] + padded_chunks.append(padded_chunk) + padding_bak = self.padding + self.padding = (0, self.padding[1], self.padding[2]) + outputs = [] + for i in range(len(padded_chunks)): + outputs.append(super().forward(padded_chunks[i])) + self.padding = padding_bak + return torch.cat(outputs, dim=-3) + else: + return super().forward(input) + + +class AttnBlock(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + + self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + self.q = Conv3d(in_channels, in_channels, kernel_size=1) + self.k = Conv3d(in_channels, in_channels, kernel_size=1) + self.v = Conv3d(in_channels, in_channels, kernel_size=1) + self.proj_out = Conv3d(in_channels, in_channels, kernel_size=1) + + def attention(self, h_: Tensor) -> Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, f, h, w = q.shape + q = rearrange(q, "b c f h w -> b 1 (f h w) c").contiguous() + k = rearrange(k, "b c f h w -> b 1 (f h w) c").contiguous() + v = rearrange(v, "b c f h w -> b 1 (f h w) c").contiguous() + h_ = nn.functional.scaled_dot_product_attention(q, k, v) + + return rearrange(h_, "b 1 (f h w) c -> b c f h w", 
f=f, h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels: int, out_channels: int): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + + self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) + self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + self.nin_shortcut = Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h = x + h = self.norm1(h) + h = swish(h) + h = self.conv1(h) + + h = self.norm2(h) + h = swish(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + x = self.nin_shortcut(x) + return x + h + + +class Downsample(nn.Module): + def __init__(self, in_channels: int, add_temporal_downsample: bool = True): + super().__init__() + self.add_temporal_downsample = add_temporal_downsample + stride = (2, 2, 2) if add_temporal_downsample else (1, 2, 2) # THW + # no asymmetric padding in torch conv, must do it ourselves + self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=stride, padding=0) + + def forward(self, x: Tensor): + spatial_pad = (0, 1, 0, 1, 0, 0) # WHT + x = nn.functional.pad(x, spatial_pad, mode="constant", value=0) + + temporal_pad = (0, 0, 0, 0, 0, 1) if self.add_temporal_downsample else (0, 0, 0, 0, 1, 1) + x = nn.functional.pad(x, temporal_pad, mode="replicate") + + x = self.conv(x) + return x + + +class DownsampleDCAE(nn.Module): + def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True): + super().__init__() + factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2 + assert out_channels % factor == 0 + self.conv = Conv3d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1) + + self.add_temporal_downsample = add_temporal_downsample + self.group_size = factor * in_channels // out_channels + + def forward(self, x: Tensor): + r1 = 2 if self.add_temporal_downsample else 1 + h = self.conv(x) + h = rearrange(h, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2) + shortcut = rearrange(x, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2) + + B, C, T, H, W = shortcut.shape + shortcut = shortcut.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2) + return h + shortcut + + +class Upsample(nn.Module): + def __init__(self, in_channels: int, add_temporal_upsample: bool = True): + super().__init__() + self.add_temporal_upsample = add_temporal_upsample + self.scale_factor = (2, 2, 2) if add_temporal_upsample else (1, 2, 2) # THW + self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x: Tensor): + x = nn.functional.interpolate(x, scale_factor=self.scale_factor, mode="nearest") + x = self.conv(x) + return x + + +class UpsampleDCAE(nn.Module): + def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True): + super().__init__() + factor = 2 * 2 * 2 if add_temporal_upsample else 1 * 2 * 2 + self.conv = Conv3d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1) + + self.add_temporal_upsample = 
add_temporal_upsample + self.repeats = factor * out_channels // in_channels + + def forward(self, x: Tensor): + r1 = 2 if self.add_temporal_upsample else 1 + h = self.conv(x) + h = rearrange(h, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2) + shortcut = x.repeat_interleave(repeats=self.repeats, dim=1) + shortcut = rearrange(shortcut, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2) + return h + shortcut + + +class Encoder(nn.Module): + def __init__( + self, + in_channels: int, + z_channels: int, + block_out_channels: Tuple[int, ...], + num_res_blocks: int, + ffactor_spatial: int, + ffactor_temporal: int, + downsample_match_channel: bool = True, + ): + super().__init__() + assert block_out_channels[-1] % (2 * z_channels) == 0 + + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + + # downsampling + self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) + + self.down = nn.ModuleList() + block_in = block_out_channels[0] + for i_level, ch in enumerate(block_out_channels): + block = nn.ModuleList() + block_out = ch + for _ in range(self.num_res_blocks): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block_in = block_out + down = nn.Module() + down.block = block + + add_spatial_downsample = bool(i_level < np.log2(ffactor_spatial)) + add_temporal_downsample = add_spatial_downsample and bool(i_level >= np.log2(ffactor_spatial // ffactor_temporal)) + if add_spatial_downsample or add_temporal_downsample: + assert i_level < len(block_out_channels) - 1 + block_out = block_out_channels[i_level + 1] if downsample_match_channel else block_in + down.downsample = DownsampleDCAE(block_in, block_out, add_temporal_downsample) + block_in = block_out + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.attn_1 = AttnBlock(block_in) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + + # end + self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = Conv3d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1) + + self.gradient_checkpointing = False + + def forward(self, x: Tensor) -> Tensor: + with torch.no_grad(): + use_checkpointing = bool(self.training and self.gradient_checkpointing) + + # downsampling + h = self.conv_in(x) + for i_level in range(len(self.block_out_channels)): + for i_block in range(self.num_res_blocks): + h = forward_with_checkpointing(self.down[i_level].block[i_block], h, use_checkpointing=use_checkpointing) + if hasattr(self.down[i_level], "downsample"): + h = forward_with_checkpointing(self.down[i_level].downsample, h, use_checkpointing=use_checkpointing) + + # middle + h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing) + h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing) + h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing) + + # end + group_size = self.block_out_channels[-1] // (2 * self.z_channels) + shortcut = rearrange(h, "b (c r) f h w -> b c r f h w", r=group_size).mean(dim=2) + h = self.norm_out(h) + h = swish(h) + h = self.conv_out(h) + h += shortcut + return h + + +class Decoder(nn.Module): + def __init__( + self, + z_channels: int, + out_channels: int, + block_out_channels: Tuple[int, ...], + num_res_blocks: 
int, + ffactor_spatial: int, + ffactor_temporal: int, + upsample_match_channel: bool = True, + ): + super().__init__() + assert block_out_channels[0] % z_channels == 0 + + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + + # z to block_in + block_in = block_out_channels[0] + self.conv_in = Conv3d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.attn_1 = AttnBlock(block_in) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + + # upsampling + self.up = nn.ModuleList() + for i_level, ch in enumerate(block_out_channels): + block = nn.ModuleList() + block_out = ch + for _ in range(self.num_res_blocks + 1): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block_in = block_out + up = nn.Module() + up.block = block + + add_spatial_upsample = bool(i_level < np.log2(ffactor_spatial)) + add_temporal_upsample = bool(i_level < np.log2(ffactor_temporal)) + if add_spatial_upsample or add_temporal_upsample: + assert i_level < len(block_out_channels) - 1 + block_out = block_out_channels[i_level + 1] if upsample_match_channel else block_in + up.upsample = UpsampleDCAE(block_in, block_out, add_temporal_upsample) + block_in = block_out + self.up.append(up) + + # end + self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = Conv3d(block_in, out_channels, kernel_size=3, stride=1, padding=1) + + self.gradient_checkpointing = False + + + def forward(self, z: Tensor) -> Tensor: + with torch.no_grad(): + use_checkpointing = bool(self.training and self.gradient_checkpointing) + # z to block_in + repeats = self.block_out_channels[0] // (self.z_channels) + h = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1) + # middle + h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing) + h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing) + h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing) + # upsampling + for i_level in range(len(self.block_out_channels)): + for i_block in range(self.num_res_blocks + 1): + h = forward_with_checkpointing(self.up[i_level].block[i_block], h, use_checkpointing=use_checkpointing) + if hasattr(self.up[i_level], "upsample"): + h = forward_with_checkpointing(self.up[i_level].upsample, h, use_checkpointing=use_checkpointing) + # end + h = self.norm_out(h) + h = swish(h) + h = self.conv_out(h) + return h + + +class AutoencoderKLConv3D(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int, + out_channels: int, + latent_channels: int, + block_out_channels: Tuple[int, ...], + layers_per_block: int, + ffactor_spatial: int, + ffactor_temporal: int, + sample_size: int, + sample_tsize: int, + scaling_factor: float = None, + shift_factor: Optional[float] = None, + downsample_match_channel: bool = True, + upsample_match_channel: bool = True, + only_encoder: bool = False, + only_decoder: bool = False, + ): + super().__init__() + self.ffactor_spatial = ffactor_spatial + self.ffactor_temporal = ffactor_temporal + self.scaling_factor = scaling_factor + self.shift_factor = shift_factor + + if not only_decoder: + self.encoder = Encoder( + in_channels=in_channels, + z_channels=latent_channels, + 
block_out_channels=block_out_channels, + num_res_blocks=layers_per_block, + ffactor_spatial=ffactor_spatial, + ffactor_temporal=ffactor_temporal, + downsample_match_channel=downsample_match_channel, + ) + if not only_encoder: + self.decoder = Decoder( + z_channels=latent_channels, + out_channels=out_channels, + block_out_channels=list(reversed(block_out_channels)), + num_res_blocks=layers_per_block, + ffactor_spatial=ffactor_spatial, + ffactor_temporal=ffactor_temporal, + upsample_match_channel=upsample_match_channel, + ) + + self.use_slicing = False + self.slicing_bsz = 1 + self.use_spatial_tiling = False + self.use_temporal_tiling = False + self.use_tiling_during_training = False + + # only relevant if vae tiling is enabled + self.tile_sample_min_size = sample_size + self.tile_latent_min_size = sample_size // ffactor_spatial + self.tile_sample_min_tsize = sample_tsize + self.tile_latent_min_tsize = sample_tsize // ffactor_temporal + self.tile_overlap_factor = 0.125 + + self.use_compile = False + + self.empty_cache = torch.empty(0, device="cuda") + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Encoder, Decoder)): + module.gradient_checkpointing = value + + def enable_tiling_during_training(self, use_tiling: bool = True): + self.use_tiling_during_training = use_tiling + + def disable_tiling_during_training(self): + self.enable_tiling_during_training(False) + + def enable_temporal_tiling(self, use_tiling: bool = True): + self.use_temporal_tiling = use_tiling + + def disable_temporal_tiling(self): + self.enable_temporal_tiling(False) + + def enable_spatial_tiling(self, use_tiling: bool = True): + self.use_spatial_tiling = use_tiling + + def disable_spatial_tiling(self): + self.enable_spatial_tiling(False) + + def enable_tiling(self, use_tiling: bool = True): + self.enable_spatial_tiling(use_tiling) + + def disable_tiling(self): + self.disable_spatial_tiling() + + def enable_slicing(self): + self.use_slicing = True + + def disable_slicing(self): + self.use_slicing = False + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int): + blend_extent = min(a.shape[-1], b.shape[-1], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent) + return b + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int): + blend_extent = min(a.shape[-2], b.shape[-2], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent) + return b + + def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int): + blend_extent = min(a.shape[-3], b.shape[-3], blend_extent) + for x in range(blend_extent): + b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent) + return b + + def spatial_tiled_encode(self, x: torch.Tensor): + B, C, T, H, W = x.shape + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) # 256 * (1 - 0.25) = 192 + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) # 8 * 0.25 = 2 + row_limit = self.tile_latent_min_size - blend_extent # 8 - 2 = 6 + + rows = [] + for i in range(0, H, overlap_size): + row = [] + for j in range(0, W, overlap_size): + tile = x[:, :, :, i: i + self.tile_sample_min_size, j: j + self.tile_sample_min_size] + tile = self.encoder(tile) + row.append(tile) + rows.append(row) + result_rows = 
[]
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+        moments = torch.cat(result_rows, dim=-2)
+        return moments
+
+    def temporal_tiled_encode(self, x: torch.Tensor):
+        B, C, T, H, W = x.shape
+        overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))  # 64 * (1 - 0.25) = 48
+        blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)  # 8 * 0.25 = 2
+        t_limit = self.tile_latent_min_tsize - blend_extent  # 8 - 2 = 6
+
+        row = []
+        for i in range(0, T, overlap_size):
+            tile = x[:, :, i: i + self.tile_sample_min_tsize, :, :]
+            if self.use_spatial_tiling and (tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
+                tile = self.spatial_tiled_encode(tile)
+            else:
+                tile = self.encoder(tile)
+            row.append(tile)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)
+            result_row.append(tile[:, :, :t_limit, :, :])
+        moments = torch.cat(result_row, dim=-3)
+        return moments
+
+    def spatial_tiled_decode(self, z: torch.Tensor):
+        B, C, T, H, W = z.shape
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))  # 24 * (1 - 0.125) = 21
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)  # 384 * 0.125 = 48
+        row_limit = self.tile_sample_min_size - blend_extent  # 384 - 48 = 336
+
+        # Distributed / multi-GPU: no padding on the input -> each rank pads its decoded output on the right/bottom -> GPU all_gather -> rank 0 reassembles/blends/crops
+        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+
+            # count the tiles
+            num_rows = math.ceil(H / overlap_size)
+            num_cols = math.ceil(W / overlap_size)
+            total_tiles = num_rows * num_cols
+            tiles_per_rank = math.ceil(total_tiles / world_size)
+
+            print(f"==={torch.distributed.get_rank()}, {total_tiles=}, {tiles_per_rank=}, {world_size=}")
+
+            # tile indices owned by this rank (round-robin assignment): rank, rank+world_size, ...
+            my_linear_indices = list(range(rank, total_tiles, world_size))
+            if my_linear_indices == []:
+                my_linear_indices = [0]
+            print(f"==={torch.distributed.get_rank()}, {my_linear_indices=}")
+            decoded_tiles = []  # tiles
+            decoded_metas = []  # (ri, rj, pad_w, pad_h)
+            H_out_std = self.tile_sample_min_size
+            W_out_std = self.tile_sample_min_size
+            for lin_idx in my_linear_indices:
+                ri = lin_idx // num_cols
+                rj = lin_idx % num_cols
+                i = ri * overlap_size
+                j = rj * overlap_size
+                tile = z[
+                    :,
+                    :,
+                    :,
+                    i : i + self.tile_latent_min_size,
+                    j : j + self.tile_latent_min_size,
+                ]
+                dec = self.decoder(tile)
+                # pad the output of boundary tiles on the right/bottom up to the standard size
+                pad_h = max(0, H_out_std - dec.shape[-2])
+                pad_w = max(0, W_out_std - dec.shape[-1])
+                if pad_h > 0 or pad_w > 0:
+                    dec = F.pad(dec, (0, pad_w, 0, pad_h, 0, 0), "constant", 0)
+                decoded_tiles.append(dec)
+                decoded_metas.append(torch.tensor([ri, rj, pad_w, pad_h], device=z.device, dtype=torch.int64))
+
+            # ranks may hold different numbers of tiles, so pad the lists to the same length
+            T_out = decoded_tiles[0].shape[2] if len(decoded_tiles) > 0 else (T-1)*self.ffactor_temporal+1
+            while len(decoded_tiles) < tiles_per_rank:
+                decoded_tiles.append(torch.zeros([1, 3, T_out, self.tile_sample_min_size, self.tile_sample_min_size], device=z.device, dtype=dec.dtype))
+                decoded_metas.append(torch.tensor([-1, -1, self.tile_sample_min_size, self.tile_sample_min_size], device=z.device, dtype=torch.int64))
+
+            # all_gather on GPU
+            decoded_tiles = torch.stack(decoded_tiles, dim=0)
+            decoded_metas = torch.stack(decoded_metas, dim=0)
+
+            tiles_gather_list = [torch.empty_like(decoded_tiles) for _ in range(world_size)]
+            metas_gather_list = [torch.empty_like(decoded_metas) for _ in range(world_size)]
+
+            dist.all_gather(tiles_gather_list, decoded_tiles)
+            dist.all_gather(metas_gather_list, decoded_metas)
+
+            if rank != 0:
+                # non-zero ranks return an empty placeholder; the result is only valid on rank 0
+                return torch.empty(0, device=z.device)
+
+            # rank 0: rebuild the tile grid from the (ri, rj) metadata; skip placeholder entries with (ri, rj) == (-1, -1)
+            rows = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+            for r in range(world_size):
+                gathered_tiles_r = tiles_gather_list[r]  # [tiles_per_rank, B, C, T, H, W]
+                gathered_metas_r = metas_gather_list[r]  # [tiles_per_rank, 4], entries: (ri, rj, pad_w, pad_h)
+                for k in range(gathered_tiles_r.shape[0]):
+                    ri = int(gathered_metas_r[k][0])
+                    rj = int(gathered_metas_r[k][1])
+                    if ri < 0 or rj < 0:
+                        continue
+                    if ri < num_rows and rj < num_cols:
+                        # remove the padding
+                        pad_w = int(gathered_metas_r[k][2])
+                        pad_h = int(gathered_metas_r[k][3])
+                        h_end = None if pad_h == 0 else -pad_h
+                        w_end = None if pad_w == 0 else -pad_w
+                        rows[ri][rj] = gathered_tiles_r[k][:, :, :, :h_end, :w_end]
+
+            result_rows = []
+            for i, row in enumerate(rows):
+                result_row = []
+                for j, tile in enumerate(row):
+                    if tile is None:
+                        continue
+                    if i > 0:
+                        tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                    if j > 0:
+                        tile = self.blend_h(row[j - 1], tile, blend_extent)
+                    result_row.append(tile[:, :, :, :row_limit, :row_limit])
+                result_rows.append(torch.cat(result_row, dim=-1))
+
+            dec = torch.cat(result_rows, dim=-2)
+            return dec
+
+        # single GPU: original sequential logic
+        rows = []
+        for i in range(0, H, overlap_size):
+            row = []
+            for j in range(0, W, overlap_size):
+                tile = z[
+                    :,
+                    :,
+                    :,
+                    i : i + self.tile_latent_min_size,
+                    j : j + self.tile_latent_min_size,
+                ]
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+        dec = torch.cat(result_rows, dim=-2)
+        return dec
+
+    def temporal_tiled_decode(self, z: torch.Tensor):
+        B, C, T, H, W = z.shape
+        overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))  # 8 * (1 - 0.25) = 6
+        blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)  # 64 * 0.25 = 16
+        t_limit = self.tile_sample_min_tsize - blend_extent  # 64 - 16 = 48
+        assert 0 < overlap_size < self.tile_latent_min_tsize
+
+        row = []
+        for i in range(0, T, overlap_size):
+            tile = z[:, :, i: i + self.tile_latent_min_tsize, :, :]
+            if self.use_spatial_tiling and (tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
+                decoded = self.spatial_tiled_decode(tile)
+            else:
+                decoded = self.decoder(tile)
+            row.append(decoded)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)
+            result_row.append(tile[:, :, :t_limit, :, :])
+        dec = torch.cat(result_row, dim=-3)
+        return dec
+
+    def encode(self, x: Tensor, return_dict: bool = True):
+
+        def _encode(x):
+            if self.use_temporal_tiling and x.shape[-3] > self.tile_sample_min_tsize:
+
return self.temporal_tiled_encode(x) + if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): + return self.spatial_tiled_encode(x) + + if self.use_compile: + @torch.compile + def encoder(x): + return self.encoder(x) + return encoder(x) + return self.encoder(x) + + if len(x.shape) != 5: # (B, C, T, H, W) + x = x[:, :, None] + assert len(x.shape) == 5 # (B, C, T, H, W) + if x.shape[2] == 1: + x = x.expand(-1, -1, self.ffactor_temporal, -1, -1) + else: + assert x.shape[2] != self.ffactor_temporal and x.shape[2] % self.ffactor_temporal == 0 + + if self.use_slicing and x.shape[0] > 1: + if self.slicing_bsz == 1: + encoded_slices = [_encode(x_slice) for x_slice in x.split(1)] + else: + sections = [self.slicing_bsz] * (x.shape[0] // self.slicing_bsz) + if x.shape[0] % self.slicing_bsz != 0: + sections.append(x.shape[0] % self.slicing_bsz) + encoded_slices = [_encode(x_slice) for x_slice in x.split(sections)] + h = torch.cat(encoded_slices) + else: + h = _encode(x) + posterior = DiagonalGaussianDistribution(h) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def decode(self, z: Tensor, return_dict: bool = True, generator=None): + + def _decode(z): + if self.use_temporal_tiling and z.shape[-3] > self.tile_latent_min_tsize: + return self.temporal_tiled_decode(z) + if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): + return self.spatial_tiled_decode(z) + return self.decoder(z) + + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [_decode(z_slice) for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = _decode(z) + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() != 0: + return self.empty_cache + + if z.shape[-3] == 1: + decoded = decoded[:, :, -1:] + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def decode_dist(self, z: Tensor, return_dict: bool = True, generator=None): + z = z.cuda() + self.use_spatial_tiling = True + decoded = self.decode(z) + self.use_spatial_tiling = False + return decoded + + def forward( + self, + sample: torch.Tensor, + sample_posterior: bool = False, + return_posterior: bool = True, + return_dict: bool = True + ): + posterior = self.encode(sample).latent_dist + z = posterior.sample() if sample_posterior else posterior.mode() + dec = self.decode(z).sample + return DecoderOutput(sample=dec, posterior=posterior) if return_dict else (dec, posterior) + + def random_reset_tiling(self, x: torch.Tensor): + if x.shape[-3] == 1: + self.disable_spatial_tiling() + self.disable_temporal_tiling() + return + + # tiling在input_shape和sample_size上限制很多,任意的input_shape和sample_size很可能不满足条件,因此这里使用固定值 + min_sample_size = int(1 / self.tile_overlap_factor) * self.ffactor_spatial + min_sample_tsize = int(1 / self.tile_overlap_factor) * self.ffactor_temporal + sample_size = random.choice([None, 1 * min_sample_size, 2 * min_sample_size, 3 * min_sample_size]) + if sample_size is None: + self.disable_spatial_tiling() + else: + self.tile_sample_min_size = sample_size + self.tile_latent_min_size = sample_size // self.ffactor_spatial + self.enable_spatial_tiling() + + sample_tsize = random.choice([None, 1 * min_sample_tsize, 2 * min_sample_tsize, 3 * min_sample_tsize]) + if sample_tsize is None: + self.disable_temporal_tiling() + else: + self.tile_sample_min_tsize = sample_tsize + self.tile_latent_min_tsize = sample_tsize // 
self.ffactor_temporal
+            self.enable_temporal_tiling()
+
+def load_sharded_safetensors(model_dir):
+    """
+    Manually load sharded safetensors files.
+
+    Args:
+        model_dir: directory containing the shard files
+
+    Returns:
+        the merged, complete state dict
+    """
+    # collect all shard files and sort them by shard index
+    shard_files = []
+    for file in os.listdir(model_dir):
+        if file.endswith(".safetensors"):
+            shard_files.append(file)
+
+    # sort by shard index
+    shard_files.sort(key=lambda x: int(x.split("-")[1]))
+
+    print(f"Found {len(shard_files)} shard files")
+
+    # merge all weights
+    merged_state_dict = dict()
+
+    for shard_file in shard_files:
+        shard_path = os.path.join(model_dir, shard_file)
+        print(f"Loading shard: {shard_file}")
+
+        # load the current shard with safetensors
+        with safe_open(shard_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                tensor = f.get_tensor(key)
+                merged_state_dict[key] = tensor
+
+    print(f"Merge finished, total number of keys: {len(merged_state_dict)}")
+    return merged_state_dict
+
+def load_weights(model, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+    def update_state_dict(state_dict: dict[str, torch.Tensor], name, weight):
+        if name not in state_dict:
+            raise ValueError(f"Unexpected weight {name}")
+
+        model_tensor = state_dict[name]
+        if model_tensor.shape != weight.shape:
+            raise ValueError(
+                f"Shape mismatch for weight {name}: "
+                f"model tensor shape {model_tensor.shape} vs. "
+                f"loaded tensor shape {weight.shape}"
+            )
+        if isinstance(weight, torch.Tensor):
+            model_tensor.data.copy_(weight.data)
+        else:
+            raise ValueError(
+                f"Unsupported tensor type in load_weights "
+                f"for {name}: {type(weight)}"
+            )
+
+    loaded_params = set()
+    for name, load_tensor in weights.items():
+        updated = True
+        name = name.replace('vae.', '')
+        if name in model.state_dict():
+            update_state_dict(model.state_dict(), name, load_tensor)
+        else:
+            updated = False
+
+        if updated:
+            loaded_params.add(name)
+
+    return loaded_params
+
+def _worker(path, config,
+            rank=None, world_size=None, port=None, req_queue=None, rsp_queue=None):
+    """
+    each rank's worker:
+    - idle: block on req_queue.get() (CPU blocking, no GPU)
+    - receive request: run runner.predict(), all ranks forward
+    - only rank0 puts the result to rsp_queue
+    """
+    # _tame_cpu_threads_and_comm()
+    # basic env
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = str(port)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["RANK"] = str(rank)
+    os.environ["LOCAL_RANK"] = str(rank)
+
+    # device binding should happen before any CUDA operations
+    visible = torch.cuda.device_count()
+    assert visible >= world_size, f"number of visible GPUs {visible} < world_size {world_size}"
+    local_rank = int(os.environ["LOCAL_RANK"])
+
+    print(f"[worker {rank}] bind to cuda:{local_rank} (visible={visible})", flush=True)
+    if not torch.distributed.is_initialized():
+        dist.init_process_group("nccl")
+    torch.cuda.set_device(local_rank)
+    #from .. import load_vae
+
+    #vae = load_vae(vae_type, vae_precision, device, logger, args, weights_only, only_encoder, only_decoder, sample_size, skip_create_dist=True)
+    #vae = vae.cuda()
+    vae = AutoencoderKLConv3D.from_config(config)
+    merged_state_dict = load_sharded_safetensors(path)
+    loaded_params = load_weights(vae, merged_state_dict)
+    vae = vae.cuda()
+    vae.eval()  # disable Dropout / BatchNorm training behavior
+    for param in vae.parameters():
+        param.requires_grad = False  #
+
+    while True:
+        req = req_queue.get()  # blocking
+        if req == "__STOP__":
+            break
+        out = vae.decode_dist(req, return_dict=False)
+        if rank == 0:
+            rsp_queue.put(out)
+
+    #try:
+    #    while True:
+    #        # blocking on CPU queue
+    #        req = req_queue.get()  # blocking
+    #        if req == "__STOP__":
+    #            break
+    #        out = vae.decode_dist(req, return_dict=False)
+    #        if rank == 0:
+    #            rsp_queue.put(out)
+    #finally:
+    #    # destroy process group before exit
+    #    try:
+    #        dist.destroy_process_group()
+    #    except Exception:
+    #        pass
+
+#def _find_free_port():
+#    import socket
+#    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+#        s.bind(("127.0.0.1", 0))
+#        return s.getsockname()[1]
+
+# common way to avoid port conflicts
+def _find_free_port(start_port=8100, max_attempts=900):
+    import socket
+    """Find an available port."""
+    for port in range(start_port, start_port + max_attempts):
+        try:
+            with socket.socket() as s:
+                s.bind(('localhost', port))
+                return s.getsockname()[1]  # return the port actually bound
+        except OSError:
+            continue
+    raise RuntimeError("No available port found")
+
+class AutoencoderKLConv3D_Dist(AutoencoderKLConv3D):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        latent_channels: int,
+        block_out_channels: Tuple[int, ...],
+        layers_per_block: int,
+        ffactor_spatial: int,
+        ffactor_temporal: int,
+        sample_size: int,
+        sample_tsize: int,
+        scaling_factor: float = None,
+        shift_factor: Optional[float] = None,
+        downsample_match_channel: bool = True,
+        upsample_match_channel: bool = True,
+        only_encoder: bool = False,
+        only_decoder: bool = False,
+    ):
+        super().__init__(in_channels, out_channels, latent_channels, block_out_channels, layers_per_block, ffactor_spatial, ffactor_temporal, sample_size, sample_tsize, scaling_factor, shift_factor, downsample_match_channel, upsample_match_channel, only_encoder, only_decoder)
+
+    def create_dist(self, path, config,
+                    ):
+        self.world_size = 8
+        self.port = _find_free_port()
+        ctx = mp.get_context("spawn")
+        # one request queue per rank (pure CPU), plus one shared response queue
+        self.req_queues = [ctx.Queue() for _ in range(self.world_size)]
+        self.rsp_queue = ctx.Queue()
+
+        self.procs = []
+        for rank in range(self.world_size):
+            p = ctx.Process(
+                target=_worker,
+                args=(
+                    path, config,
+                    rank, self.world_size, self.port,
+                    self.req_queues[rank], self.rsp_queue,
+                ),
+                daemon=True,
+            )
+            p.start()
+            self.procs.append(p)
+
+    def decode(self, z: Tensor, return_dict: bool = True, generator=None):
+        """
+        synchronous inference: put the same request to all ranks' queues.
+        return rank0's result.
+ """ + # check alive + for p in self.procs: + if not p.is_alive(): + raise RuntimeError("One of the processes is not alive") + + # put to each rank's queue + for q in self.req_queues: + q.put(z) + + # wait for rank0's result + return self.rsp_queue.get(timeout=None) diff --git a/cache_utils.py b/cache_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf49f226c70394df60218323eb9bc5958a2ce54 --- /dev/null +++ b/cache_utils.py @@ -0,0 +1,226 @@ +import torch +import torch.nn as nn +import math +from typing import Tuple + +def cache_init(cache_interval, max_order, num_steps=None, + enable_first_enhance=False, first_enhance_steps=3, + enable_tailing_enhance=False, tailing_enhance_steps=1, + low_freqs_order=0, high_freqs_order=2): + cache_dic = {} + cache_dic['counter']= 0 + cache_dic['current_step'] = 0 + cache_dic['cache_interval']= cache_interval + cache_dic['max_order'] = max_order + cache_dic['num_steps'] = num_steps + + # enhance related utils + + # first enhance: fully compute first some steps, enhancing contour infos + cache_dic['enable_first_enhance'] = enable_first_enhance + cache_dic['first_enhance_steps'] = first_enhance_steps + + # tailing enhance: fully compute the last 1 steps, enhancing details + cache_dic['enable_tailing_enhance'] = enable_tailing_enhance + cache_dic['tailing_enhance_steps'] = tailing_enhance_steps + + # freqs related utils + cache_dic['low_freqs_order'] = low_freqs_order + cache_dic['high_freqs_order'] = high_freqs_order + + # features for training-aware cache, here we don't use these + cache_dic['enable_force_control']= False + cache_dic['force_compute']=False + return cache_dic + +class TaylorCacheContainer(nn.Module): + def __init__(self, max_order): + super().__init__() + self.max_order = max_order + # 逐个注册buffer + for i in range(max_order + 1): + self.register_buffer(f"derivative_{i}", None, persistent=False) + self.register_buffer(f"temp_derivative_{i}", None, persistent=False) + + def get_derivative(self, order): + return getattr(self, f"derivative_{order}") + + def set_derivative(self, order, tensor): + setattr(self, f"derivative_{order}", tensor) + + def set_temp_derivative(self, order, tensor): + setattr(self, f"temp_derivative_{order}", tensor) + + def get_temp_derivative(self, order): + return getattr(self, f"temp_derivative_{order}") + + def clear_temp_derivative(self): + for i in range(self.max_order + 1): + setattr(self, f"temp_derivative_{i}", None) + + def move_temp_to_derivative(self): + for i in range(self.max_order + 1): + if self.get_temp_derivative(i) is not None: + setattr(self, f"derivative_{i}", self.get_temp_derivative(i)) + else: + break + self.clear_temp_derivative() + + def get_all_derivatives(self): + return [getattr(self, f"derivative_{i}") for i in range(self.max_order + 1)] + + def get_all_filled_derivatives(self): + return [self.get_derivative(i) for i in range(self.max_order + 1) if self.get_derivative(i) is not None] + + def taylor_formula(self, distance): + output = 0 + for i in range(len(self.get_all_filled_derivatives())): + output += (1 / math.factorial(i)) * self.get_derivative(i) * (distance ** i) + return output + + def derivatives_computation(self, x, distance): + ''' + x: tensor, the new x_0 + distance: int, the distance between the current step and the last full computation step + ''' + self.set_temp_derivative(0, x) + for i in range(self.max_order): + if self.get_derivative(i) is not None: + self.set_temp_derivative(i+1, (self.get_temp_derivative(i) - self.get_derivative(i)) / 
distance) + else: + break + self.move_temp_to_derivative() + + def clear_derivatives(self): + for i in range(self.max_order + 1): + setattr(self, f"derivative_{i}", None) + setattr(self, f"temp_derivative_{i}", None) + + +@torch.compile +def decomposition_FFT(x: torch.Tensor, cutoff_ratio: float = 0.1) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Fast Fourier Transform frequency domain decomposition + + Args: + x: Input tensor [B, H*W, D] + cutoff_ratio: Cutoff frequency ratio (0~0.5) + + Returns: + Tuple of (low_freq, high_freq) tensors with same dtype as input + """ + orig_dtype = x.dtype + device = x.device + + x_fp32 = x.to(torch.float32) # Convert to fp32 for FFT compatibility + + B, HW, D = x_fp32.shape + freq = torch.fft.fft(x_fp32, dim=1) # FFT on spatial dimension + + freqs = torch.fft.fftfreq(HW, d=1.0, device=device) + cutoff = cutoff_ratio * freqs.abs().max() + + # Create frequency masks + low_mask = freqs.abs() <= cutoff + high_mask = ~low_mask + + low_mask = low_mask[None, :, None] # Broadcast to (B, HW, D) + high_mask = high_mask[None, :, None] + + low_freq_complex = freq * low_mask + high_freq_complex = freq * high_mask + + # IFFT and take real part + low_fp32 = torch.fft.ifft(low_freq_complex, dim=1).real + high_fp32 = torch.fft.ifft(high_freq_complex, dim=1).real + + low = low_fp32.to(device=device, dtype=orig_dtype) + high = high_fp32.to(device=device, dtype=orig_dtype) + + return low, high + +@torch.compile +def reconstruction(low_freq: torch.Tensor, high_freq: torch.Tensor) -> torch.Tensor: + return low_freq + high_freq + +class CacheWithFreqsContainer(nn.Module): + def __init__(self, max_order): + super().__init__() + self.max_order = max_order + # 逐个注册buffer + for i in range(max_order + 1): + self.register_buffer(f"derivative_{i}_low_freqs", None, persistent=False) + self.register_buffer(f"derivative_{i}_high_freqs", None, persistent=False) + self.register_buffer(f"temp_derivative_{i}_low_freqs", None, persistent=False) + self.register_buffer(f"temp_derivative_{i}_high_freqs", None, persistent=False) + + def get_derivative(self, order, freqs): + return getattr(self, f"derivative_{order}_{freqs}") + + def set_derivative(self, order, freqs, tensor): + setattr(self, f"derivative_{order}_{freqs}", tensor) + + def set_temp_derivative(self, order, freqs, tensor): + setattr(self, f"temp_derivative_{order}_{freqs}", tensor) + + def get_temp_derivative(self, order, freqs): + return getattr(self, f"temp_derivative_{order}_{freqs}") + + def move_temp_to_derivative(self): + for i in range(self.max_order + 1): + if self.get_temp_derivative(i, "low_freqs") is not None: + setattr(self, f"derivative_{i}_low_freqs", self.get_temp_derivative(i, "low_freqs")) + if self.get_temp_derivative(i, "high_freqs") is not None: + setattr(self, f"derivative_{i}_high_freqs", self.get_temp_derivative(i, "high_freqs")) + else: + break + self.clear_temp_derivative() + + def get_all_filled_derivatives(self, freqs): + return [self.get_derivative(i, freqs) for i in range(self.max_order + 1) if self.get_derivative(i, freqs) is not None] + + def taylor_formula(self, distance): + low_freqs_output = 0 + high_freqs_output = 0 + for i in range(len(self.get_all_filled_derivatives("low_freqs"))): + low_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "low_freqs") * (distance ** i) + for i in range(len(self.get_all_filled_derivatives("high_freqs"))): + high_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "high_freqs") * (distance ** i) + return reconstruction(low_freqs_output, 
high_freqs_output) + + def hermite_formula(self, distance): + low_freqs_output = 0 + high_freqs_output = 0 + for i in range(len(self.get_all_filled_derivatives("low_freqs"))): + low_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "low_freqs") * (distance ** i) + for i in range(len(self.get_all_filled_derivatives("high_freqs"))): + high_freqs_output += (1 / math.factorial(i)) * self.get_derivative(i, "high_freqs") * (distance ** i) + return reconstruction(low_freqs_output, high_freqs_output) + + def derivatives_computation(self, x, distance, low_freqs_order, high_freqs_order): + ''' + x: tensor, the new x_0 + distance: int, the distance between the current step and the last full computation step + ''' + x_low, x_high = decomposition_FFT(x, cutoff_ratio=0.1) + self.set_temp_derivative(0, "low_freqs", x_low) + self.set_temp_derivative(0, "high_freqs", x_high) + for i in range(low_freqs_order): + if self.get_derivative(i, "low_freqs") is not None: + self.set_temp_derivative(i+1, "low_freqs", (self.get_temp_derivative(i, "low_freqs") - self.get_derivative(i, "low_freqs")) / distance) + for i in range(high_freqs_order): + if self.get_derivative(i, "high_freqs") is not None: + self.set_temp_derivative(i+1, "high_freqs", (self.get_temp_derivative(i, "high_freqs") - self.get_derivative(i, "high_freqs")) / distance) + self.move_temp_to_derivative() + + def clear_temp_derivative(self): + for i in range(self.max_order + 1): + setattr(self, f"temp_derivative_{i}_low_freqs", None) + setattr(self, f"temp_derivative_{i}_high_freqs", None) + + def clear_derivatives(self): + for i in range(self.max_order + 1): + setattr(self, f"derivative_{i}_low_freqs", None) + setattr(self, f"derivative_{i}_high_freqs", None) + setattr(self, f"temp_derivative_{i}_low_freqs", None) + setattr(self, f"temp_derivative_{i}_high_freqs", None) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..4b4364671ec8ed6551e634a28c388f912e058d6e --- /dev/null +++ b/config.json @@ -0,0 +1,285 @@ +{ + "cfg_distilled": true, + "use_meanflow": true, + "add_classification_head": false, + "anyres_pooling_size": 2, + "anyres_vit_max_image_size": null, + "anyres_vit_two_views": false, + "architectures": [ + "HunyuanImage3ForCausalMM" + ], + "auto_map": { + "AutoConfig": "configuration_hunyuan_image_3.HunyuanImage3Config", + "AutoModel": "modeling_hunyuan_image_3.HunyuanImage3Model", + "AutoModelForCausalLM": "modeling_hunyuan_image_3.HunyuanImage3ForCausalMM" + }, + "attention_bias": false, + "attention_dropout": 0.0, + "attention_head_dim": 128, + "bos_token_id": 127958, + "cla_share_factor": 2, + "class_num": 0, + "dense_list": [ + 4096, + 0 + ], + "eod_token_id": 3, + "eos_token_id": 127957, + "group_limited_greedy": false, + "hidden_act": "silu", + "hidden_size": 4096, + "im_end_id": 128001, + "im_newline_id": 11, + "im_start_id": 128000, + "image_token_id": 128006, + "initializer_range": 0.02, + "intermediate_size": 3072, + "kv_lora_rank": null, + "mask_init_id": 12, + "max_position_embeddings": 22800, + "mlp_bias": false, + "model_type": "hunyuan_image_3_moe", + "moe_drop_tokens": false, + "moe_intermediate_size": [ + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072, + 3072 + ], + "moe_layer_num_skipped": 0, + "moe_random_routing_dropped_token": 
false, + "moe_topk": [ + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8 + ], + "n_group": false, + "norm_topk_prob": true, + "norm_type": "rms", + "num_attention_heads": 32, + "num_experts": 64, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "num_media_embeds": 257, + "num_shared_expert": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ], + "pad_id": 128009, + "pad_token_id": 128009, + "pool_type": "last", + "position_embedding_xdrope": false, + "pretraining_tp": 1, + "q_lora_rank": null, + "qk_nope_head_dim": null, + "qk_rope_head_dim": null, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "alpha": 1.0, + "beta_fast": 32, + "beta_slow": 1, + "factor": 1.0, + "mscale": 1.0, + "mscale_all_dim": 1.0, + "type": "custom" + }, + "rope_theta": 10000.0, + "routed_scaling_factor": false, + "skip_cls_token": false, + "text_end_id": 7, + "text_start_id": 6, + "tie_word_embeddings": false, + "topk_group": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.50.0", + "use_cache": true, + "use_cla": false, + "use_mixed_mlp_moe": true, + "use_mla": false, + "use_qk_norm": true, + "use_rotary_pos_emb": true, + "v_head_dim": null, + "video_end_id": 10, + "video_start_id": 9, + "vit_add_patchemb_bias": false, + "vit_input_resolution": 224, + "vit_mapping_type": "resampler", + "vit_norm_type": "fused", + "vit_patch": 1, + "vit_path": null, + "vit_remove_prenorm": false, + "vit_token": 64, + "vit_type": "siglip2-so400m-patch16-naflex", + "vit_used_rms_norm": false, + "vocab_size": 133120, + "xdrope_section": null, + "head_dim": 128, + "rope_type": "2d", + "vae_downsample_factor": [ + 16, + 16 + ], + "vit_downsample_factor": [ + 16, + 16 + ], + "cond_token_attn_type": "joint_full", + "cond_image_type": "vae_vit", + "vae_type": "hunyuan-image-vae-v1", + "vae_dtype": "float32", + "vae_autocast_dtype": "float16", + "vae": { + "_class_name": "AutoencoderKLConv3D", + "block_out_channels": [ + 128, + 256, + 512, + 1024, + 1024 + ], + "in_channels": 3, + "out_channels": 3, + "latent_channels": 32, + "layers_per_block": 2, + "ffactor_spatial": 16, + "ffactor_temporal": 4, + "sample_size": 384, + "sample_tsize": 96, + "downsample_match_channel": true, + "upsample_match_channel": true, + "scaling_factor": 0.562679178327931 + }, + "vit": { + "_attn_implementation": "sdpa", + "attention_dropout": 0.0, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "num_patches": 256, + "patch_size": 16, + "torch_dtype": "float32", + "output_attentions": false, + "output_hidden_states": false, + "use_return_dict": true + }, + "vit_processor": { + "do_convert_rgb": null, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "Siglip2ImageProcessorFast", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "max_num_patches": 1024, + "patch_size": 16, + "processor_class": "Siglip2Processor", + "resample": 2, + "rescale_factor": 0.00392156862745098 + }, + "vit_aligner": { + "projector_type": "mlp_gelu", + "input_dim": 1152, + "n_embed": 4096, + "depth": 2, + "torch_dtype": "float32" + } +} diff --git a/configuration_hunyuan_image_3.py b/configuration_hunyuan_image_3.py new file mode 
100644 index 0000000000000000000000000000000000000000..e960da3925c4f5ed106855c7a4273c823e138e1c --- /dev/null +++ b/configuration_hunyuan_image_3.py @@ -0,0 +1,310 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging +from typing import List, Union, Optional + + +logger = logging.get_logger(__name__) + + +class HunyuanImage3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`HunyuanImage3Model`]. It is used to instantiate + an Hunyuan model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Hunyuan-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Hunyuan Image 3 model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`HunyuanImage3Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations or shared MLP representations. + moe_intermediate_size (`int` or `List`, *optional*, defaults to 11008): + Dimension of the MLP representations in MoE. Use a list if you want a different size per layer. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + use_qk_norm (`bool`, *optional*, defaults to `False`): + Whether query and key in attention use norm + use_cla (`bool`, *optional*, defaults to `False`): + Whether to use CLA in attention + cla_share_factor (`int`, *optional*, defaults to 1): + The share factor of CLA + num_experts (`int` or `List`, *optional*, defaults to 1): + The number of experts for moe. If it is a list, it will be used as the number of experts for each layer. + num_shared_expert (`int` or `List`, *optional*, defaults to 1): + The number of shared experts for moe. If it is a list, it will be used as the number of shared experts + for each layer. + moe_topk (`int` or `List`, *optional*, defaults to 1): + The topk value for moe. If it is a list, it will be used as the topk value for each layer. + capacity_factor (Not used) (`float` or `List`, *optional*, defaults to 1.0): + The capacity factor for moe. If it is a list, it will be used as the capacity factor for each layer. + moe_layer_num_skipped (`int`, *optional*, defaults to 0): + First moe_layer_num_skipped layers do not use MoE. 
+ """ + + model_type = "Hunyuan" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size: int = 290943, + hidden_size: int = 4096, + intermediate_size: int = 11008, + moe_intermediate_size: Union[int, List] = None, + num_hidden_layers: int = 32, + num_attention_heads: int = 32, + num_key_value_heads: Optional[int] = None, + attention_head_dim: Optional[int] = None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + im_start_id=4, + im_end_id=5, + text_start_id=6, + text_end_id=7, + image_token_id=8, + video_start_id=9, + video_end_id=10, + im_newline_id=11, + mask_init_id=12, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + mlp_bias=False, + attention_dropout=0.0, + use_qk_norm=False, + use_rotary_pos_emb=True, + use_cla=False, + cla_share_factor=1, + norm_type="hf_rms", + num_experts: Union[int, List] = 1, + use_mixed_mlp_moe=False, + num_shared_expert: Union[int, List] = 1, + moe_topk: Union[int, List] = 1, + capacity_factor: int = 1.0, + moe_drop_tokens=False, + moe_random_routing_dropped_token=False, + use_mla=False, + kv_lora_rank=512, + q_lora_rank=1536, + qk_rope_head_dim=64, + v_head_dim=128, + qk_nope_head_dim=128, + moe_layer_num_skipped=0, + norm_topk_prob=True, + routed_scaling_factor=1.0, + group_limited_greedy=False, + n_group=None, + topk_group=None, + add_classification_head=False, + class_num=0, + pool_type="last", + pad_id=-1, + # Added + moe_impl="eager", + vae_downsample_factor=(16, 16), # (h, w) + img_proj_type="unet", + patch_size=1, + patch_embed_hidden_dim=1024, + image_base_size=1024, + rope_type="2d", + cond_token_attn_type="full", + cond_image_type="vae_vit", + vae_type=None, + vae_dtype="float32", + vae_autocast_dtype="float16", + vae=None, + vit_type=None, + vit=None, + vit_processor=None, + vit_aligner=None, + cfg_distilled=False, + use_meanflow=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.moe_impl = moe_impl + self.num_experts = num_experts + self.use_mixed_mlp_moe = use_mixed_mlp_moe + self.num_shared_expert = num_shared_expert + self.moe_topk = moe_topk + self.capacity_factor = capacity_factor + self.moe_drop_tokens = moe_drop_tokens + self.moe_random_routing_dropped_token = moe_random_routing_dropped_token + + if attention_head_dim is not None: + self.attention_head_dim = attention_head_dim + else: + self.attention_head_dim = self.hidden_size // num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.mlp_bias = mlp_bias + self.attention_dropout = attention_dropout + self.use_qk_norm = use_qk_norm + self.use_rotary_pos_emb = use_rotary_pos_emb + self.use_cla = use_cla + self.cla_share_factor = cla_share_factor + 
self.norm_type = norm_type + # MLA args + self.use_mla = use_mla + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.v_head_dim = v_head_dim + + # DeepSeek related args + self.moe_layer_num_skipped = moe_layer_num_skipped + self.norm_topk_prob = norm_topk_prob + self.routed_scaling_factor = routed_scaling_factor + self.group_limited_greedy = group_limited_greedy + self.n_group = n_group + self.topk_group = topk_group + self.add_classification_head = add_classification_head + self.class_num = class_num + self.pool_type = pool_type + self.pad_id = pad_id + + if self.class_num is not None: + self.dense_list = [self.hidden_size, self.class_num] + + # Conditioning image configs + self.cond_token_attn_type = cond_token_attn_type + self.cond_image_type = cond_image_type + + # ViT args + self.vit_type = vit_type + self.vit = vit + self.vit_processor = vit_processor + self.vit_aligner = vit_aligner + + # Image Gen args + self.vae_type = vae_type + self.vae_dtype = vae_dtype + self.vae_autocast_dtype = vae_autocast_dtype + self.vae = vae + self.vae_downsample_factor = vae_downsample_factor + self.img_proj_type = img_proj_type + self.patch_size = patch_size + self.patch_embed_hidden_dim = patch_embed_hidden_dim + self.image_base_size = image_base_size + self.rope_type = rope_type + + # token id + self.eod_token_id = eod_token_id + self.im_start_id = im_start_id + self.im_end_id = im_end_id + self.text_start_id = text_start_id + self.text_end_id = text_end_id + self.image_token_id = image_token_id + self.video_start_id = video_start_id + self.video_end_id = video_end_id + self.im_newline_id = im_newline_id + self.mask_init_id = mask_init_id + + # flag of cfg distilled model + self.cfg_distilled = cfg_distilled + # flag of meanflow distilled model + self.use_meanflow = use_meanflow + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["HunyuanImage3Config"] diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ae57595139e4f7a332abc506760004d6a6d52359 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,21 @@ +{ + "disable_compile": true, + "eos_token_id": [ + 127957 + ], + "pad_token_id": 128009, + "do_sample": true, + "top_k": 1024, + "top_p": 0.95, + "temperature": 0.6, + "max_length": 22800, + "sequence_template": "instruct", + "diff_infer_steps": 8, + "diff_guidance_scale": 2.5, + "flow_shift": 3.0, + "use_system_prompt": "en_unified", + "drop_think": false, + "bot_task": "think_recaption", + "max_new_tokens": 2048, + "transformers_version": "4.50.0" +} diff --git a/hunyuan_image_3_pipeline.py b/hunyuan_image_3_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..fbb2ad332d6229056a2d1d775d7e578db3b75d9e --- /dev/null +++ b/hunyuan_image_3_pipeline.py @@ -0,0 +1,913 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
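Before the pipeline code below, a quick sketch of how the configuration defaults above resolve. This snippet is not part of the repository; it only assumes `configuration_hunyuan_image_3.py` can be imported on its own (it depends only on `transformers`), and it exercises the two fallbacks documented in `__init__`: `attention_head_dim` derived from `hidden_size // num_attention_heads`, and `num_key_value_heads` defaulting to `num_attention_heads`.

```python
# Illustrative sketch only, not shipped with the repo.
from configuration_hunyuan_image_3 import HunyuanImage3Config

config = HunyuanImage3Config(
    hidden_size=4096,
    num_attention_heads=32,          # num_key_value_heads left unset -> plain MHA
    moe_topk=8,                      # an int or a per-layer list are both accepted
    vae_downsample_factor=(16, 16),  # (h, w), matches the documented default
)

assert config.attention_head_dim == 4096 // 32   # derived when not given explicitly
assert config.num_key_value_heads == 32          # falls back to num_attention_heads
print(config.moe_impl, config.rope_type)         # "eager" "2d" by default
```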
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================================== + +import inspect +import math +from dataclasses import dataclass +from typing import Any, Callable, Dict, List +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from PIL import Image +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils import BaseOutput, logging +from diffusers.utils.torch_utils import randn_tensor +from .cache_utils import cache_init +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. 
Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + r""" + Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on + Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). + + Args: + noise_cfg (`torch.Tensor`): + The predicted noise tensor for the guided diffusion process. + noise_pred_text (`torch.Tensor`): + The predicted noise tensor for the text-guided diffusion process. + guidance_rescale (`float`, *optional*, defaults to 0.0): + A rescale factor applied to the noise predictions. + Returns: + noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor. + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +@dataclass +class HunyuanImage3Text2ImagePipelineOutput(BaseOutput): + samples: Union[List[Any], np.ndarray] + + +@dataclass +class FlowMatchDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. + reverse (`bool`, defaults to `True`): + Whether to reverse the timestep schedule. 
+ """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + reverse: bool = True, + solver: str = "euler", + use_flux_shift: bool = False, + flux_base_shift: float = 0.5, + flux_max_shift: float = 1.15, + n_tokens: Optional[int] = None, + ): + sigmas = torch.linspace(1, 0, num_train_timesteps + 1) + + if not reverse: + sigmas = sigmas.flip(0) + + self.sigmas = sigmas + # the value fed to model + self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32) + self.timesteps_full = (sigmas * num_train_timesteps).to(dtype=torch.float32) + + self._step_index = None + self._begin_index = None + + self.supported_solver = [ + "euler", + "heun-2", "midpoint-2", + "kutta-4", + ] + if solver not in self.supported_solver: + raise ValueError(f"Solver {solver} not supported. Supported solvers: {self.supported_solver}") + + # empty dt and derivative (for heun) + self.derivative_1 = None + self.derivative_2 = None + self.derivative_3 = None + self.dt = None + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + @property + def state_in_first_order(self): + return self.derivative_1 is None + + @property + def state_in_second_order(self): + return self.derivative_2 is None + + @property + def state_in_third_order(self): + return self.derivative_3 is None + + def get_timestep_r(self, timestep: Union[float, torch.FloatTensor]): + if self.step_index is None: + self._init_step_index(timestep) + return self.timesteps_full[self.step_index + 1] + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, + n_tokens: int = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + n_tokens (`int`, *optional*): + Number of tokens in the input sequence. 
+ """ + self.num_inference_steps = num_inference_steps + + sigmas = torch.linspace(1, 0, num_inference_steps + 1) + + # Apply timestep shift + if self.config.use_flux_shift: + assert isinstance(n_tokens, int), "n_tokens should be provided for flux shift" + mu = self.get_lin_function(y1=self.config.flux_base_shift, y2=self.config.flux_max_shift)(n_tokens) + sigmas = self.flux_time_shift(mu, 1.0, sigmas) + elif self.config.shift != 1.: + sigmas = self.sd3_time_shift(sigmas) + + if not self.config.reverse: + sigmas = 1 - sigmas + + self.sigmas = sigmas + self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(dtype=torch.float32, device=device) + self.timesteps_full = (sigmas * self.config.num_train_timesteps).to(dtype=torch.float32, device=device) + + # empty dt and derivative (for kutta) + self.derivative_1 = None + self.derivative_2 = None + self.derivative_3 = None + self.dt = None + + # Reset step index + self._step_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + return sample + + @staticmethod + def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15): + m = (y2 - y1) / (x2 - x1) + b = y1 - m * x1 + return lambda x: m * x + b + + @staticmethod + def flux_time_shift(mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def sd3_time_shift(self, t: torch.Tensor): + return (self.config.shift * t) / (1 + (self.config.shift - 1) * t) + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + pred_uncond: torch.FloatTensor = None, + generator: Optional[torch.Generator] = None, + n_tokens: Optional[int] = None, + return_dict: bool = True, + ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + n_tokens (`int`, *optional*): + Number of tokens in the input sequence. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. 
+ + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + model_output = model_output.to(torch.float32) + pred_uncond = pred_uncond.to(torch.float32) if pred_uncond is not None else None + + # dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index] + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + + last_inner_step = True + if self.config.solver == "euler": + derivative, dt, sample, last_inner_step = self.first_order_method(model_output, sigma, sigma_next, sample) + elif self.config.solver in ["heun-2", "midpoint-2"]: + derivative, dt, sample, last_inner_step = self.second_order_method(model_output, sigma, sigma_next, sample) + elif self.config.solver == "kutta-4": + derivative, dt, sample, last_inner_step = self.fourth_order_method(model_output, sigma, sigma_next, sample) + else: + raise ValueError(f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}") + + prev_sample = sample + derivative * dt + + # Cast sample back to model compatible dtype + # prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + if last_inner_step: + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample) + + def first_order_method(self, model_output, sigma, sigma_next, sample): + derivative = model_output + dt = sigma_next - sigma + return derivative, dt, sample, True + + def second_order_method(self, model_output, sigma, sigma_next, sample): + if self.state_in_first_order: + # store for 2nd order step + self.derivative_1 = model_output + self.dt = sigma_next - sigma + self.sample = sample + + derivative = model_output + if self.config.solver == 'heun-2': + dt = self.dt + elif self.config.solver == 'midpoint-2': + dt = self.dt / 2 + else: + raise NotImplementedError(f"Solver {self.config.solver} not supported.") + last_inner_step = False + + else: + if self.config.solver == 'heun-2': + derivative = 0.5 * (self.derivative_1 + model_output) + elif self.config.solver == 'midpoint-2': + derivative = model_output + else: + raise NotImplementedError(f"Solver {self.config.solver} not supported.") + + # 3. 
take prev timestep & sample + dt = self.dt + sample = self.sample + last_inner_step = True + + # free dt and derivative + # Note, this puts the scheduler in "first order mode" + self.derivative_1 = None + self.dt = None + self.sample = None + + return derivative, dt, sample, last_inner_step + + def fourth_order_method(self, model_output, sigma, sigma_next, sample): + if self.state_in_first_order: + self.derivative_1 = model_output + self.dt = sigma_next - sigma + self.sample = sample + derivative = model_output + dt = self.dt / 2 + last_inner_step = False + + elif self.state_in_second_order: + self.derivative_2 = model_output + derivative = model_output + dt = self.dt / 2 + last_inner_step = False + + elif self.state_in_third_order: + self.derivative_3 = model_output + derivative = model_output + dt = self.dt + last_inner_step = False + + else: + derivative = (1/6 * self.derivative_1 + 1/3 * self.derivative_2 + 1/3 * self.derivative_3 + + 1/6 * model_output) + + # 3. take prev timestep & sample + dt = self.dt + sample = self.sample + last_inner_step = True + + # free dt and derivative + # Note, this puts the scheduler in "first order mode" + self.derivative_1 = None + self.derivative_2 = None + self.derivative_3 = None + self.dt = None + self.sample = None + + return derivative, dt, sample, last_inner_step + + def __len__(self): + return self.config.num_train_timesteps + + +class ClassifierFreeGuidance: + def __init__( + self, + use_original_formulation: bool = False, + start: float = 0.0, + stop: float = 1.0, + ): + super().__init__() + self.use_original_formulation = use_original_formulation + + def __call__( + self, + pred_cond: torch.Tensor, + pred_uncond: Optional[torch.Tensor], + guidance_scale: float, + step: int, + ) -> torch.Tensor: + + shift = pred_cond - pred_uncond + pred = pred_cond if self.use_original_formulation else pred_uncond + pred = pred + guidance_scale * shift + + return pred + + +class HunyuanImage3Text2ImagePipeline(DiffusionPipeline): + r""" + Pipeline for condition-to-sample generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + model ([`ModelMixin`]): + A model to denoise the diffused latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `diffusion_model` to denoise the diffused latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
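The `ClassifierFreeGuidance` helper above, combined with the earlier `rescale_noise_cfg`, reduces to a few lines per denoising step. A self-contained sketch of that combination (tensor shapes are illustrative; the pipeline wires these calls slightly differently):

```python
import torch

def apply_cfg(pred_cond, pred_uncond, guidance_scale, guidance_rescale=0.0):
    # use_original_formulation=False path: pred = uncond + s * (cond - uncond)
    pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond)
    if guidance_rescale > 0.0:
        # match the std of the conditional branch (arXiv:2305.08891, Section 3.4)
        std_text = pred_cond.std(dim=list(range(1, pred_cond.ndim)), keepdim=True)
        std_cfg = pred.std(dim=list(range(1, pred.ndim)), keepdim=True)
        rescaled = pred * (std_text / std_cfg)
        pred = guidance_rescale * rescaled + (1 - guidance_rescale) * pred
    return pred

cond, uncond = torch.randn(1, 16, 64, 64), torch.randn(1, 16, 64, 64)
print(apply_cfg(cond, uncond, guidance_scale=2.5, guidance_rescale=0.3).shape)
```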
+ """ + + model_cpu_offload_seq = "" + _optional_components = [] + _exclude_from_cpu_offload = [] + _callback_tensor_inputs = ["latents"] + + def __init__( + self, + model, + scheduler: SchedulerMixin, + vae, + progress_bar_config: Dict[str, Any] = None, + ): + super().__init__() + + # ========================================================================================== + if progress_bar_config is None: + progress_bar_config = {} + if not hasattr(self, '_progress_bar_config'): + self._progress_bar_config = {} + self._progress_bar_config.update(progress_bar_config) + # ========================================================================================== + + self.register_modules( + model=model, + scheduler=scheduler, + vae=vae, + ) + + # should be a tuple or a list corresponding to the size of latents (batch_size, channel, *size) + # if None, will be treated as a tuple of 1 + self.latent_scale_factor = self.model.config.vae_downsample_factor + self.image_processor = VaeImageProcessor(vae_scale_factor=self.latent_scale_factor) + + # Must start with APG_mode_ + self.cfg_operator = ClassifierFreeGuidance() + + @staticmethod + def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + """ + Denormalize an image array to [0,1]. + """ + return (images / 2 + 0.5).clamp(0, 1) + + @staticmethod + def pt_to_numpy(images: torch.Tensor) -> np.ndarray: + """ + Convert a PyTorch tensor to a NumPy image. + """ + images = images.cpu().permute(0, 2, 3, 1).float().numpy() + return images + + @staticmethod + def numpy_to_pil(images: np.ndarray): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + def prepare_extra_func_kwargs(self, func, kwargs): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + extra_kwargs = {} + + for k, v in kwargs.items(): + accepts = k in set(inspect.signature(func).parameters.keys()) + if accepts: + extra_kwargs[k] = v + return extra_kwargs + + def prepare_latents(self, batch_size, latent_channel, image_size, dtype, device, generator, latents=None): + if self.latent_scale_factor is None: + latent_scale_factor = (1,) * len(image_size) + elif isinstance(self.latent_scale_factor, int): + latent_scale_factor = (self.latent_scale_factor,) * len(image_size) + elif isinstance(self.latent_scale_factor, tuple) or isinstance(self.latent_scale_factor, list): + assert len(self.latent_scale_factor) == len(image_size), \ + "len(latent_scale_factor) shoudl be the same as len(image_size)" + latent_scale_factor = self.latent_scale_factor + else: + raise ValueError( + f"latent_scale_factor should be either None, int, tuple of int, or list of int, " + f"but got {self.latent_scale_factor}" + ) + + latents_shape = ( + batch_size, + latent_channel, + *[int(s) // f for s, f in zip(image_size, latent_scale_factor)], + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler + if hasattr(self.scheduler, "init_noise_sigma"): + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1.0 + + @property + def num_timesteps(self): + return self._num_timesteps + + def set_scheduler(self, new_scheduler): + self.register_modules(scheduler=new_scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int, + image_size: List[int], + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: float = 7.5, + meanflow: bool = False, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + guidance_rescale: float = 0.0, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + model_kwargs: Dict[str, Any] = None, + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The text to guide image generation. + image_size (`Tuple[int]` or `List[int]`): + The size (height, width) of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate samples closely linked to the + `condition` at the expense of lower sample quality. Guidance scale is enabled when `guidance_scale > 1`. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for sample + generation. Can be used to tweak the same generation with different conditions. If not provided, + a latents tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~DiffusionPipelineOutput`] instead of a + plain tuple. + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~DiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~DiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated samples. 
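One detail worth spelling out alongside this docstring is the latent geometry that `prepare_latents` builds for `__call__`: each pixel dimension is divided by the corresponding `vae_downsample_factor` entry. A runnable sketch of that shape arithmetic, where the 32 latent channels are an illustrative stand-in for `config.vae["latent_channels"]`:

```python
import torch
from diffusers.utils.torch_utils import randn_tensor

batch_size, latent_channels = 1, 32          # latent_channels: illustrative placeholder
image_size = (1024, 768)                     # (H, W) in pixels
factors = (16, 16)                           # vae_downsample_factor default, (h, w)

latents_shape = (batch_size, latent_channels,
                 *[s // f for s, f in zip(image_size, factors)])
latents = randn_tensor(latents_shape, generator=torch.Generator().manual_seed(0),
                       dtype=torch.bfloat16)
print(latents.shape)  # torch.Size([1, 32, 64, 48])
```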
+ """ + + callback_steps = kwargs.pop("callback_steps", None) + pbar_steps = kwargs.pop("pbar_steps", None) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + + + if not kwargs.get('cfg_distilled', False): + cfg_factor = 1 + self.do_classifier_free_guidance + else: + cfg_factor = 1 + # Define call parameters + device = self._execution_device + + # Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas, + ) + + # Prepare latent variables + latents = self.prepare_latents( + batch_size=batch_size, + latent_channel=self.model.config.vae["latent_channels"], + image_size=image_size, + dtype=torch.bfloat16, + device=device, + generator=generator, + latents=latents, + ) + + # Prepare extra step kwargs. + _scheduler_step_extra_kwargs = self.prepare_extra_func_kwargs( + self.scheduler.step, {"generator": generator} + ) + + # Prepare model kwargs + input_ids = model_kwargs.pop("input_ids") + attention_mask = self.model._prepare_attention_mask_for_generation( # noqa + input_ids, self.model.generation_config, model_kwargs=model_kwargs, + ) + model_kwargs["attention_mask"] = attention_mask.to(latents.device) + + # Sampling loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + + # Taylor cache + cache_dic = None + if self.model.use_taylor_cache: + cache_dic = cache_init(cache_interval=self.model.taylor_cache_interval, max_order=self.model.taylor_cache_order, num_steps=len(timesteps), + enable_first_enhance=self.model.taylor_cache_enable_first_enhance, first_enhance_steps=self.model.taylor_cache_first_enhance_steps, + enable_tailing_enhance=self.model.taylor_cache_enable_tailing_enhance, + tailing_enhance_steps=self.model.taylor_cache_tailing_enhance_steps, + low_freqs_order=self.model.taylor_cache_low_freqs_order, + high_freqs_order=self.model.taylor_cache_high_freqs_order) + print(f"***use_taylor_cache: {self.model.use_taylor_cache}, cache_dic: {cache_dic}") + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * cfg_factor) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if meanflow: + r = self.scheduler.get_timestep_r(t) + r_expand = r.repeat(latent_model_input.shape[0]) + else: + r_expand = None + model_kwargs["timesteps_r"] = r_expand + + t_expand = t.repeat(latent_model_input.shape[0]) + + if self.model.use_taylor_cache: + cache_dic['current_step'] = i + model_kwargs['cache_dic'] = cache_dic + if kwargs.get('cfg_distilled', False): + model_kwargs["guidance"] = torch.tensor( + [1000.0*self._guidance_scale], device=self.device, dtype=torch.bfloat16 + ) + model_inputs = self.model.prepare_inputs_for_generation( + input_ids, + images=latent_model_input, + timesteps=t_expand, + **model_kwargs, + ) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + model_output = self.model(**model_inputs, first_step=(i == 0)) + pred = model_output["diffusion_prediction"] + pred = pred.to(dtype=torch.float32) + # perform guidance + if self.do_classifier_free_guidance: + if not kwargs.get('cfg_distilled', False): + pred_cond, pred_uncond = pred.chunk(2) + 
pred = self.cfg_operator(pred_cond, pred_uncond, self.guidance_scale, step=i) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + pred = rescale_noise_cfg(pred, pred_cond, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(pred, t, latents, **_scheduler_step_extra_kwargs, return_dict=False)[0] + + if i != len(timesteps) - 1: + model_kwargs = self.model._update_model_kwargs_for_generation( # noqa + model_output, + model_kwargs, + ) + input_ids = None + # if input_ids.shape[1] != model_kwargs["position_ids"].shape[1]: + # input_ids = torch.gather(input_ids, 1, index=model_kwargs["position_ids"]) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if hasattr(self.vae.config, 'scaling_factor') and self.vae.config.scaling_factor: + latents = latents / self.vae.config.scaling_factor + if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor: + latents = latents + self.vae.config.shift_factor + + if hasattr(self.vae, "ffactor_temporal"): + latents = latents.unsqueeze(2) + + with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True): + image = self.vae.decode(latents, return_dict=False, generator=generator)[0] + + # b c t h w + if hasattr(self.vae, "ffactor_temporal"): + assert image.shape[2] == 1, "image should have shape [B, C, T, H, W] and T should be 1" + image = image.squeeze(2) + + do_denormalize = [True] * image.shape[0] + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + if not return_dict: + return (image,) + + return HunyuanImage3Text2ImagePipelineOutput(samples=image) diff --git a/image_processor.py b/image_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..8740eefd7e0dad2f7765be00a2f39c67e9214dc0 --- /dev/null +++ b/image_processor.py @@ -0,0 +1,465 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
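After the denoising loop, the pipeline undoes the VAE's `scaling_factor`/`shift_factor`, decodes, and converts to PIL via `denormalize`, `pt_to_numpy`, and `numpy_to_pil`. A compressed, runnable sketch of that last stage, using a random tensor in place of the decoder output:

```python
import torch
from PIL import Image

def denormalize(images: torch.Tensor) -> torch.Tensor:
    # [-1, 1] -> [0, 1], as in HunyuanImage3Text2ImagePipeline.denormalize
    return (images / 2 + 0.5).clamp(0, 1)

def to_pil(images: torch.Tensor) -> list:
    # (B, C, H, W) float tensor -> PIL images, mirroring pt_to_numpy + numpy_to_pil
    arr = images.cpu().permute(0, 2, 3, 1).float().numpy()
    arr = (arr * 255).round().astype("uint8")
    return [Image.fromarray(a) for a in arr]

fake_decode = torch.rand(1, 3, 256, 256) * 2 - 1   # stand-in for vae.decode output
print(to_pil(denormalize(fake_decode))[0].size)    # (256, 256)
```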
+# ============================================================================== + +from dataclasses import dataclass, field, asdict +from typing import Tuple, Optional, Callable, Union, Any +import random +import math + +import torch +from PIL import Image +from torchvision import transforms +from transformers.image_processing_utils import BaseImageProcessor +from transformers.image_utils import load_image +from transformers.models.siglip2.image_processing_siglip2_fast import Siglip2ImageProcessorFast +from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList + +from .tokenization_hunyuan_image_3 import ImageInfo, ImageTensor, CondImage, Resolution, ResolutionGroup + +InputImage = Union[Image.Image, str] + + +class SliceVocabLogitsProcessor(LogitsProcessor): + """ + [`LogitsProcessor`] that performs vocab slicing, i.e. restricting probabilities with in some range. This processor + is often used in multimodal discrete LLMs, which ensure that we only sample within one modality + + Args: + vocab_start (`int`): start of slice, default None meaning from 0 + vocab_end (`int`): end of slice, default None meaning to the end of list + when start and end are all None, this processor does noting + + """ + + def __init__(self, vocab_start: int = None, vocab_end: int = None, **kwargs): + if vocab_start is not None and vocab_end is not None: + assert vocab_start < vocab_end, f"Ensure vocab_start {vocab_start} < vocab_end {vocab_end}" + self.vocab_start = vocab_start + self.vocab_end = vocab_end + self.other_slices = kwargs.get("other_slices", []) + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + scores_processed = scores[:, self.vocab_start: self.vocab_end] + for other_slice in self.other_slices: + scores_processed = torch.cat([scores_processed, scores[:, other_slice[0]: other_slice[1]]], dim=-1) + return scores_processed + + def __repr__(self): + return f"SliceVocabLogitsWarper(vocab_start={self.vocab_start}, vocab_end={self.vocab_end}, other_slices={self.other_slices})" + + +def resize_and_crop(image: Image.Image, target_size: Tuple[int, int], resample=Image.Resampling.LANCZOS, crop_type='center', crop_coords=None) -> Image.Image: + tw, th = target_size + w, h = image.size + + tr = th / tw + r = h / w + + if crop_type == "resize": + resize_width = tw + resize_height = th + crop_top = 0 + crop_left = 0 + image = image.resize((resize_width, resize_height), resample=resample) + else: + # maintain the aspect ratio + if r < tr: + resize_height = th + resize_width = int(round(th / h * w)) + else: + resize_width = tw + resize_height = int(round(tw / w * h)) + + if crop_type == 'center': + crop_top = int(round((resize_height - th) / 2.0)) + crop_left = int(round((resize_width - tw) / 2.0)) + elif crop_type == 'random': + crop_top = random.randint(0, resize_height - th) + crop_left = random.randint(0, resize_width - tw) + elif crop_type == 'fixed': + assert crop_coords is not None, 'crop_coords should be provided when crop_type is fixed.' 
+ crop_left, crop_top = crop_coords + else: + raise ValueError(f'crop_type must be center, random or fixed, but got {crop_type}') + + image = image.resize((resize_width, resize_height), resample=resample) + image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th)) + + return image + + +@dataclass +class ResolutionGroupConfig: + base_size: int = None + step: Optional[int] = None + align: int = 16 + + def to_dict(self): + return asdict(self) + + +@dataclass +class VAEInfo: + encoder_type: str + down_h_factor: int = -1 + down_w_factor: int = -1 + patch_size: int = 1 + h_factor: int = -1 + w_factor: int = -1 + image_type: str = None + + def __post_init__(self): + self.h_factor = self.down_h_factor * self.patch_size + self.w_factor = self.down_w_factor * self.patch_size + if self.image_type is None: + self.image_type = "vae" + + +@dataclass +class ViTInfo: + encoder_type: str + h_factor: int = -1 + w_factor: int = -1 + max_token_length: int = 0 # pad to max_token_length + processor: Callable = field(default_factory=BaseImageProcessor) + image_type: str = None + + def __post_init__(self): + if self.image_type is None: + self.image_type = self.encoder_type.split("-")[0] + + +class HunyuanImage3ImageProcessor(object): + def __init__(self, config): + self.config = config + + self.reso_group_config = ResolutionGroupConfig(base_size=config.image_base_size) + self.vae_reso_group = ResolutionGroup( + **self.reso_group_config.to_dict(), + extra_resolutions=[ + Resolution("1024x768"), + Resolution("1280x720"), + Resolution("768x1024"), + Resolution("720x1280"), + ] + ) + self.img_ratio_slice_logits_processor = None + self.pil_image_to_tensor = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), # transform to [-1, 1] + ]) + self.vae_info = VAEInfo( + encoder_type=config.vae_type, + down_h_factor=config.vae_downsample_factor[0], down_w_factor=config.vae_downsample_factor[1], + patch_size=config.patch_size, + ) + + if config.vit_type == "siglip2-so400m-patch16-naflex": + self.vit_processor = Siglip2ImageProcessorFast.from_dict(config.vit_processor) + else: + raise ValueError(f"Unsupported vit_type: {config.vit_type}") + self.vit_info = ViTInfo( + encoder_type=config.vit_type, + h_factor=self.vit_processor.patch_size, + w_factor=self.vit_processor.patch_size, + max_token_length=self.vit_processor.max_num_patches, + processor=self.vit_processor, + ) + self.cond_token_attn_type = config.cond_token_attn_type + self.cond_image_type = config.cond_image_type + + def build_gen_image_info(self, image_size, add_guidance_token=False, add_timestep_r_token=False) -> ImageInfo: + # parse image size (HxW, W:H, or an '<img_ratio_*>' token) + if isinstance(image_size, str): + if image_size.startswith("<img_ratio_"): + # ratio token format assumed here, e.g. "<img_ratio_3>" -> index 3 into vae_reso_group + ratio_index = int(image_size.strip("<img_ratio_>")) + reso = self.vae_reso_group[ratio_index] + image_size = reso.height, reso.width + elif 'x' in image_size: + image_size = [int(s) for s in image_size.split('x')] + elif ':' in image_size: + image_size = [int(s) for s in image_size.split(':')] + assert len(image_size) == 2, f"`image_size` should be in the format of 'W:H', got {image_size}." + # Note that ratio is width:height + image_size = [image_size[1], image_size[0]] + else: + raise ValueError( + f"`image_size` should be in the format of 'HxW', 'W:H' or an '<img_ratio_*>' token, got {image_size}.") + assert len(image_size) == 2, f"`image_size` should be in the format of 'HxW', got {image_size}."
+ elif isinstance(image_size, (list, tuple)): + assert len(image_size) == 2 and all(isinstance(s, int) for s in image_size), \ + f"`image_size` should be a tuple of two integers or a string in the format of 'HxW', got {image_size}." + else: + raise ValueError(f"`image_size` should be a tuple of two integers or a string in the format of 'WxH', " + f"got {image_size}.") + image_width, image_height = self.vae_reso_group.get_target_size(image_size[1], image_size[0]) + token_height = image_height // self.vae_info.h_factor + token_width = image_width // self.vae_info.w_factor + base_size, ratio_idx = self.vae_reso_group.get_base_size_and_ratio_index(image_size[1], image_size[0]) + image_info = ImageInfo( + image_type="gen_image", image_width=image_width, image_height=image_height, + token_width=token_width, token_height=token_height, base_size=base_size, ratio_index=ratio_idx, + add_guidance_token=add_guidance_token, add_timestep_r_token=add_timestep_r_token, + ) + return image_info + + def as_image_tensor(self, image, image_type, **kwargs) -> ImageTensor: + if isinstance(image, Image.Image): + tensor = self.pil_image_to_tensor(image) + else: + tensor = image + + origin_size = kwargs["origin_size"] + ori_image_width = origin_size[0] + ori_image_height = origin_size[1] + + if image_type == "vae": + assert tensor.ndim == 3 or tensor.ndim == 4 + h, w = tensor.shape[-2], tensor.shape[-1] + assert (h % self.vae_info.h_factor == 0 and w % self.vae_info.w_factor == 0), \ + (f"Image size should be divisible by ({self.vae_info.h_factor}, {self.vae_info.w_factor}), " + f"but got ({h} x {w}).") + tk_height = h // self.vae_info.h_factor + tk_width = w // self.vae_info.w_factor + base_size, ratio_idx = self.vae_reso_group.get_base_size_and_ratio_index(w, h) + tensor.i = ImageInfo( + image_type=image_type, + image_width=w, image_height=h, token_width=tk_width, token_height=tk_height, + base_size=base_size, ratio_index=ratio_idx, + ori_image_width=ori_image_width, + ori_image_height=ori_image_height, + ) + tensor.section_type = "cond_vae_image" + elif image_type == "siglip2": + spatial_shapes = kwargs["spatial_shapes"] # 2 (h, w) + pixel_attention_mask = kwargs["pixel_attention_mask"] # seq_len + tensor.i = ImageInfo( + image_type=image_type, + image_width=spatial_shapes[1].item() * self.vit_info.w_factor, + image_height=spatial_shapes[0].item() * self.vit_info.h_factor, + token_width=spatial_shapes[1].item(), + token_height=spatial_shapes[0].item(), + image_token_length=self.vit_info.max_token_length, + ori_image_width=ori_image_width, + ori_image_height=ori_image_height, + ) + tensor.section_type = "cond_vit_image" + tensor.vision_encoder_kwargs = { + "spatial_shapes": spatial_shapes, + "pixel_attention_mask": pixel_attention_mask, + } + elif image_type == "anyres": + token_width = kwargs["resized_image_width"] // self.vit_info.w_factor + token_height = kwargs["resized_image_height"] // self.vit_info.h_factor + tensor.i = ImageInfo( + image_type=image_type, + image_width=kwargs["resized_image_width"], + image_height=kwargs["resized_image_height"], + token_width=token_width, + token_height=token_height, + image_token_length=token_height * (token_width + 1) + 2, + ) + tensor.section_type = "cond_vit_image" + else: + raise ValueError(f"Unknown image type: {image_type}") + return tensor + + def vae_process_image(self, image, target_size, random_crop: bool | str = False) -> ImageTensor: + origin_size = image.size + crop_type = random_crop if isinstance(random_crop, str) else ("random" if random_crop else 
"center") + resized_image = resize_and_crop(image, target_size, crop_type=crop_type) + return self.as_image_tensor(resized_image, image_type=self.vae_info.image_type, origin_size=origin_size) + + def vit_process_image(self, image) -> ImageTensor: + origin_size = image.size + inputs = self.vit_info.processor(image) + image = inputs["pixel_values"].squeeze(0) # (seq_len, dim) + + remain_keys = set(inputs.keys()) - {"pixel_values"} + remain_kwargs = {} + for key in remain_keys: + if isinstance(inputs[key], torch.Tensor): + remain_kwargs[key] = inputs[key].squeeze(0) + else: + remain_kwargs[key] = inputs[key] + + return self.as_image_tensor(image, image_type=self.vit_info.image_type, origin_size=origin_size, **remain_kwargs) + + def get_image_with_size( + self, + src: InputImage, + random_crop: bool | str = False, + return_type: str = "vae", + ) -> tuple[ImageTensor | CondImage, bool]: + """ For various image generation tasks, dynamic image sizes """ + image = load_image(src) + image_flag = "normal" + img_success = image_flag != "gray" + origin_size = image.size # (w_ori, h_ori) + + if "vae" in return_type: + target_size = self.vae_reso_group.get_target_size(*origin_size) + vae_image_tensor = self.vae_process_image(image, target_size, random_crop=random_crop) + else: + vae_image_tensor = None + + if "vit" in return_type: + vit_image_tensor = self.vit_process_image(image) + else: + vit_image_tensor = None + + if return_type == "vae": + image_tensor = vae_image_tensor + elif return_type == "vit": + image_tensor = vit_image_tensor + elif return_type == "vae_vit": + image_tensor = CondImage(image_type=return_type, vae_image=vae_image_tensor, vit_image=vit_image_tensor) + else: + raise ValueError(f"Unknown return_type: {return_type}") + + return image_tensor, img_success + + def build_cond_images( + self, + image_list: Optional[list[InputImage]] = None, + message_list: Optional[list[dict[str, Any]]] = None, + infer_align_image_size: bool = False, + ) -> Optional[list[CondImage]]: + if image_list is not None and message_list is not None: + raise ValueError("`image_list` and `message_list` cannot be provided at the same time.") + if message_list is not None: + image_list = [] + for message in message_list: + visuals = [ + content + for content in message["content"] + if isinstance(content, dict) and content["type"] in ["image"] + ] + image_list.extend([ + vision_info[key] + for vision_info in visuals + for key in ["image", "url", "path", "base64"] + if key in vision_info and vision_info["type"] == "image" + ]) + + if infer_align_image_size: + random_crop = "resize" + else: + random_crop = "center" + + return [ + self.get_image_with_size(src, return_type=self.cond_image_type, random_crop=random_crop)[0] + for src in image_list + ] + + def prepare_full_attn_slices(self, output, batch_idx=None, with_gen=True): + """ Determine full attention image slices according to strategies. 
""" + if self.cond_image_type == "vae": + cond_choices = dict( + causal=[], + full=output.vae_image_slices[batch_idx] if batch_idx is not None else output.vae_image_slices + ) + + elif self.cond_image_type == "vit": + cond_choices = dict( + causal=[], + full=output.vit_image_slices[batch_idx] if batch_idx is not None else output.vit_image_slices + ) + + elif self.cond_image_type == "vae_vit": + cond_choices = { + "causal": [], + "full": ( + output.vae_image_slices[batch_idx] + output.vit_image_slices[batch_idx] + if batch_idx is not None + else output.vae_image_slices + output.vit_image_slices + ), + "joint_full": ( + output.joint_image_slices[batch_idx] + if batch_idx is not None + else output.joint_image_slices + ), + "full_causal": ( + output.vae_image_slices[batch_idx] + if batch_idx is not None + else output.vae_image_slices + ), + } + + else: + raise ValueError(f"Unknown cond_image_type: {self.cond_image_type}") + slices = cond_choices[self.cond_token_attn_type] + + if with_gen: + gen_image_slices = ( + output.gen_image_slices[batch_idx] + if batch_idx is not None + else output.gen_image_slices + ) + slices = slices + gen_image_slices + return slices + + def build_img_ratio_slice_logits_processor(self, tokenizer): + if self.img_ratio_slice_logits_processor is None: + self.img_ratio_slice_logits_processor = LogitsProcessorList() + self.img_ratio_slice_logits_processor.append( + SliceVocabLogitsProcessor( + vocab_start=tokenizer.start_ratio_token_id, + vocab_end=tokenizer.end_ratio_token_id + 1, + other_slices=getattr(tokenizer, "ratio_token_other_slices", []), + ) + ) + + def postprocess_outputs(self, outputs: list[Image.Image], batch_cond_images, infer_align_image_size: bool = False): + if infer_align_image_size: + target_area = self.vae_reso_group.base_size ** 2 + + for batch_index, (output_image, cond_images) in enumerate(zip(outputs, batch_cond_images)): + output_image_ratio_index = self.vae_reso_group.get_base_size_and_ratio_index(width=output_image.width, height=output_image.height)[1] + cond_images_ratio_index_list = [] + cond_images_ori_width_list = [] + cond_images_ori_height_list = [] + for cond_image in cond_images: + if isinstance(cond_image, ImageTensor): + cond_images_ratio_index_list.append(cond_image.i.ratio_index) + cond_images_ori_width_list.append(cond_image.i.ori_image_width) + cond_images_ori_height_list.append(cond_image.i.ori_image_height) + else: # CondImage + cond_images_ratio_index_list.append(cond_image.vae_image.i.ratio_index) + cond_images_ori_width_list.append(cond_image.vae_image.i.ori_image_width) + cond_images_ori_height_list.append(cond_image.vae_image.i.ori_image_height) + + if len(cond_images) == 0: + continue + elif len(cond_images) == 1: + if output_image_ratio_index == cond_images_ratio_index_list[0]: + if abs(cond_images_ori_height_list[0] / cond_images_ori_width_list[0] - self.vae_reso_group[output_image_ratio_index].ratio) >= 0.01: + scale = math.sqrt(target_area / (cond_images_ori_width_list[0] * cond_images_ori_height_list[0])) + new_w = round(cond_images_ori_width_list[0] * scale) + new_h = round(cond_images_ori_height_list[0] * scale) + outputs[batch_index] = output_image.resize((new_w, new_h), resample=Image.Resampling.LANCZOS) + else: + for cond_image_ratio_index, cond_image_ori_width, cond_image_ori_height in zip(cond_images_ratio_index_list, cond_images_ori_width_list, cond_images_ori_height_list): + if output_image_ratio_index == cond_image_ratio_index: + if abs(cond_image_ori_height / cond_image_ori_width - 
self.vae_reso_group[output_image_ratio_index].ratio) >= 0.01: + scale = math.sqrt(target_area / (cond_image_ori_width * cond_image_ori_height)) + new_w = round(cond_image_ori_width * scale) + new_h = round(cond_image_ori_height * scale) + outputs[batch_index] = output_image.resize((new_w, new_h), resample=Image.Resampling.LANCZOS) + break + + return outputs + +__all__ = [ + "HunyuanImage3ImageProcessor" +] diff --git a/model-0001-of-0032.safetensors b/model-0001-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c24e6617f78553d3c8248be5de85f1dcec2c64e6 --- /dev/null +++ b/model-0001-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f636cb156e0d38d0739bd8b879da5bcce568f6ff3121314bf54a744d70a3303e +size 5348403192 diff --git a/model-0002-of-0032.safetensors b/model-0002-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8dc0cae0755152410b52ba304d54d368971dcbc --- /dev/null +++ b/model-0002-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d596c456ffb1c0369b296815dbf5aecae344658399f9dbd44b1d0a383cdb43f +size 5344103080 diff --git a/model-0003-of-0032.safetensors b/model-0003-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b609c65321393da660af67a7c15affd880bcb186 --- /dev/null +++ b/model-0003-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873864acf1cb6412ea2379cdf9a3517a64f64fbd584945ac225a5a5155a6b145 +size 5318937248 diff --git a/model-0004-of-0032.safetensors b/model-0004-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fd2520a7f7c0f5182ca1642c94894a43e227c422 --- /dev/null +++ b/model-0004-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e1315b5aa43a77285047e916f67add4d8c63f08d894c723595edb8c7e3b3f09 +size 5353033424 diff --git a/model-0005-of-0032.safetensors b/model-0005-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f91d162b3bc6b825b4a30c271767562f347cd761 --- /dev/null +++ b/model-0005-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e8a4e3175a906e0eb9b1e6b0ac45c9d26f4aad7a2f37f7d35c562f2a0154345 +size 5318937248 diff --git a/model-0006-of-0032.safetensors b/model-0006-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1777f6991b5cdf9345f772337e018cf946d3d26a --- /dev/null +++ b/model-0006-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ee1387a20f18cecced4fcd1dd19e6a5fe47473259f9901dc52d60a7b06a0cf +size 5344103080 diff --git a/model-0007-of-0032.safetensors b/model-0007-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4fb9c2728ae89be70b1a25f645bfd82e19e0c5a --- /dev/null +++ b/model-0007-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735b9cc3f26729f3ac058d40dbab4dd6c4fc80fcef87814ecf979dde5b4809a7 +size 5318937256 diff --git a/model-0008-of-0032.safetensors b/model-0008-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..932518f2c2268509af3e233c26be796cbc62af41 --- /dev/null +++ b/model-0008-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ddb09d7f113bfd2b9b77d604251a27c1f5252fe161ef16138ddd3092c4a7b3 +size 5344103088 diff 
--git a/model-0009-of-0032.safetensors b/model-0009-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3684580e0fde1ac4b69572be162ae292892bf788 --- /dev/null +++ b/model-0009-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a670adad4ffc1c084887a144f40d83f2b737fb39d51b99e0a7ac719c0cd0e58 +size 5318937256 diff --git a/model-0010-of-0032.safetensors b/model-0010-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eecf6d835e2f60fbd045419fc5563c3aac1aadf7 --- /dev/null +++ b/model-0010-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bc6d5bd8e956687d7f1ecdc2dcf61731828f16758c6a0e61c6904a8e874f1ef +size 5344103136 diff --git a/model-0011-of-0032.safetensors b/model-0011-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..39595fa64a217c5a7bd5c2199eebd9da79e27b75 --- /dev/null +++ b/model-0011-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc082641d9f77d84c8b29a67cd4bdade9c754485741a4785fc550ecf17042a8 +size 5318937400 diff --git a/model-0012-of-0032.safetensors b/model-0012-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3216707895af54a268e147ecb8b4f9645abf4fa6 --- /dev/null +++ b/model-0012-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493b7acccb94a8808c3851ab0fe585af0c39df1c1a36fd66305d6aea030f4292 +size 5344103232 diff --git a/model-0013-of-0032.safetensors b/model-0013-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5a3e8d14b7ab00ae1d60c7dcf886cd9d28f99ba --- /dev/null +++ b/model-0013-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0ed8a1eadb8a750d238d490db84fd7546d31649991666b867811e5e2d59853 +size 5318937400 diff --git a/model-0014-of-0032.safetensors b/model-0014-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de963059669ef55fcbf01776cd5dd99fc50735c6 --- /dev/null +++ b/model-0014-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6023be614b51bbc4a987930cf1a40d5beb192b6cb33b916601813dcca8e0d98 +size 5344103232 diff --git a/model-0015-of-0032.safetensors b/model-0015-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8034993821f23f28c8336e78ffabdc1eaa0dc467 --- /dev/null +++ b/model-0015-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf87b76628ed4c85a0d8d6259a224dfea5464eb2d59a7adea00a02e733574f72 +size 5318937400 diff --git a/model-0016-of-0032.safetensors b/model-0016-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ae24b4da3d2cc384210cbfc05de147c9a8dcadb --- /dev/null +++ b/model-0016-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f83f99e3ba5038cb05e05a6419ff5bbc72f341f453d3be87cf975fdb798e8c +size 5344103232 diff --git a/model-0017-of-0032.safetensors b/model-0017-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85f67199db75097b3b9e43a329a5cf780070b730 --- /dev/null +++ b/model-0017-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791f22de90a61a7eab1bd8d00827920fac4b3591ff5f8c7b5ba3addcb23e564b +size 5318937392 diff --git 
a/model-0018-of-0032.safetensors b/model-0018-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a376868abb59591d323cc72eccdc75d5e4105461 --- /dev/null +++ b/model-0018-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6239f3e75845c6fc21ce4b865b4d2bdcaece44363c549da1ca2209d3d49ef4c0 +size 5344636312 diff --git a/model-0019-of-0032.safetensors b/model-0019-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94544d37ab700db52fdfd3fb45e22242de490db8 --- /dev/null +++ b/model-0019-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eeffa236a37d693e6d2bc111d97a53f844a3d6f4adf99576841a2df8bc425d2 +size 5327334656 diff --git a/model-0020-of-0032.safetensors b/model-0020-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..069405fd41fe6cb59303f34b315bc60af4a33a4b --- /dev/null +++ b/model-0020-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03a578aed706c5a11abeb8698a1eeb7aa1b05efac904087c0373c2218cc25a70 +size 5344103224 diff --git a/model-0021-of-0032.safetensors b/model-0021-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bff930ae7a115a9d9515f9dd41cd1f6c87c6cdec --- /dev/null +++ b/model-0021-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:443c86dd32419ef1b1891d876f0e5260dbf8a8758519bb70a41047394efdc53f +size 5318937400 diff --git a/model-0022-of-0032.safetensors b/model-0022-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..809894a70825e08eb8ef1cf3b90612cd828059f0 --- /dev/null +++ b/model-0022-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99fef1fe82cdde878b6b7e022644571aba9710535be9fbbc9860f71eeb0d3efc +size 5344103232 diff --git a/model-0023-of-0032.safetensors b/model-0023-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93fb147334d9f1ccb423bbe25419e7769fe66912 --- /dev/null +++ b/model-0023-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c68d803db0691f317ef0a91a0937c84926be1dd9f5d7b622b4910b58ae9324 +size 5318937400 diff --git a/model-0024-of-0032.safetensors b/model-0024-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a98f5e356a6e58d82a55f1f6dc6d3ff327ec4333 --- /dev/null +++ b/model-0024-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6323ef5306a06cae635a72aea64e2d34677cfd69beff18ffc90b248b2098257 +size 5344103232 diff --git a/model-0025-of-0032.safetensors b/model-0025-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95ba19ba12c9110b4966ead9cba5d6260c0515d9 --- /dev/null +++ b/model-0025-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10239c412419c8cce8cc62bd6196775a037bc5c607b13018e55b0b13f1ecdef +size 5318937400 diff --git a/model-0026-of-0032.safetensors b/model-0026-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..294f6421f4a0adecfa3ddeb28d3a090da3f9cff4 --- /dev/null +++ b/model-0026-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a48c3477b95f036ed5564e59e244400abd167da53d3b48af59870cb38d4c57 +size 5344103232 diff --git 
a/model-0027-of-0032.safetensors b/model-0027-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2274bdecba9afdae1eae1da7a385ba957ebf95f8 --- /dev/null +++ b/model-0027-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c33d36df1de8cac089fb38e56674d8824f20dae0609c18b0184a2b590b9881 +size 5318937400 diff --git a/model-0028-of-0032.safetensors b/model-0028-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..67434000d63596e182890f70617d96ea7278e913 --- /dev/null +++ b/model-0028-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403c781276c72030dbe4b378cfb65bcbce7c40fb145bc42ba1295fe24c0ca59f +size 5344103232 diff --git a/model-0029-of-0032.safetensors b/model-0029-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a5a377fa5d269ce296c321edf70ad635a2a9540 --- /dev/null +++ b/model-0029-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4a8f2224fd91b6a2c1ba83c88e751d4a1cced6996c9c1e25bf92da64788c52 +size 5318937400 diff --git a/model-0030-of-0032.safetensors b/model-0030-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0908b42307f8ed21d35d76db52debe82c7caa29a --- /dev/null +++ b/model-0030-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2223dd71e2a2826259094bf91c22cc5b70cdea5f26890fe332a7e9362cba3508 +size 5344103232 diff --git a/model-0031-of-0032.safetensors b/model-0031-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6b517c98f9b57a2a38400a06afba15461fbb098 --- /dev/null +++ b/model-0031-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459d8bbc4c0f52b2ebfeb79d74886a271216157c689748d1e6bbd97a8393a38b +size 5302504652 diff --git a/model-0032-of-0032.safetensors b/model-0032-of-0032.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..128c1188a18119fb1705f28b4696cb47c7e67ae3 --- /dev/null +++ b/model-0032-of-0032.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b72ec507661c7a146b3174fd0baaa2689a5c7e85a8736aa9778b1d146a4ec6 +size 3316299512 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..e8412533cb8149b036157d80e918231c669af0c9 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1 @@ +{"metadata": {"total_size": 168611084748}, "weight_map": {"final_layer.model.0.emb_layers.1.bias": "model-0001-of-0032.safetensors", "final_layer.model.0.emb_layers.1.weight": "model-0001-of-0032.safetensors", "final_layer.model.0.in_layers.0.bias": "model-0001-of-0032.safetensors", "final_layer.model.0.in_layers.0.weight": "model-0001-of-0032.safetensors", "final_layer.model.0.in_layers.2.bias": "model-0001-of-0032.safetensors", "final_layer.model.0.in_layers.2.weight": "model-0001-of-0032.safetensors", "final_layer.model.0.out_layers.0.bias": "model-0001-of-0032.safetensors", "final_layer.model.0.out_layers.0.weight": "model-0001-of-0032.safetensors", "final_layer.model.0.out_layers.3.bias": "model-0001-of-0032.safetensors", "final_layer.model.0.out_layers.3.weight": "model-0001-of-0032.safetensors", "final_layer.model.0.skip_connection.bias": "model-0001-of-0032.safetensors", "final_layer.model.0.skip_connection.weight": 
"model-0001-of-0032.safetensors", "final_layer.model.1.0.bias": "model-0001-of-0032.safetensors", "final_layer.model.1.0.weight": "model-0001-of-0032.safetensors", "final_layer.model.1.2.bias": "model-0001-of-0032.safetensors", "final_layer.model.1.2.weight": "model-0001-of-0032.safetensors", "guidance_emb.mlp.0.bias": "model-0001-of-0032.safetensors", "guidance_emb.mlp.0.weight": "model-0001-of-0032.safetensors", "guidance_emb.mlp.2.bias": "model-0001-of-0032.safetensors", "guidance_emb.mlp.2.weight": "model-0001-of-0032.safetensors", "lm_head.weight": "model-0001-of-0032.safetensors", "model.layers.0.input_layernorm.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.0.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.0.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.1.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.1.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.2.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.2.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.3.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.3.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.4.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.4.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.5.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.5.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.6.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.6.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.7.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.7.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.8.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.8.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.9.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.9.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.10.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.10.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.11.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.11.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.12.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.12.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.13.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.13.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.14.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.14.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.15.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.15.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.16.down_proj.weight": "model-0001-of-0032.safetensors", 
"model.layers.0.mlp.experts.16.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.17.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.17.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.18.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.18.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.19.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.19.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.20.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.20.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.21.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.21.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.22.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.22.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.23.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.23.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.24.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.24.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.25.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.25.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.26.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.26.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.27.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.27.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.28.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.28.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.29.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.29.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.30.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.30.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.31.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.31.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.32.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.32.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.33.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.33.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.34.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.34.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.35.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.35.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.36.down_proj.weight": "model-0001-of-0032.safetensors", 
"model.layers.0.mlp.experts.36.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.37.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.37.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.38.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.38.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.39.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.39.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.40.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.40.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.41.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.41.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.42.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.42.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.43.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.43.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.44.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.44.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.45.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.45.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.46.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.46.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.47.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.47.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.48.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.48.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.49.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.49.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.50.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.50.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.51.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.51.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.52.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.52.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.53.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.53.gate_and_up_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.54.down_proj.weight": "model-0001-of-0032.safetensors", "model.layers.0.mlp.experts.54.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.55.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.55.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.56.down_proj.weight": "model-0002-of-0032.safetensors", 
"model.layers.0.mlp.experts.56.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.57.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.57.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.58.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.58.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.59.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.59.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.60.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.60.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.61.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.61.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.62.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.62.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.63.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.experts.63.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.gate.wg.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.shared_mlp.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.mlp.shared_mlp.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-0002-of-0032.safetensors", "model.layers.0.self_attn.key_layernorm.weight": "model-0002-of-0032.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.self_attn.qkv_proj.weight": "model-0002-of-0032.safetensors", "model.layers.0.self_attn.query_layernorm.weight": "model-0002-of-0032.safetensors", "model.layers.1.input_layernorm.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.0.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.0.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.1.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.1.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.2.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.2.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.3.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.3.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.4.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.4.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.5.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.5.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.6.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.6.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.7.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.7.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.8.down_proj.weight": "model-0002-of-0032.safetensors", 
"model.layers.1.mlp.experts.8.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.9.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.9.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.10.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.10.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.11.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.11.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.12.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.12.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.13.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.13.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.14.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.14.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.15.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.15.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.16.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.16.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.17.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.17.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.18.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.18.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.19.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.19.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.20.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.20.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.21.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.21.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.22.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.22.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.23.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.23.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.24.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.24.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.25.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.25.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.26.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.26.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.27.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.27.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.28.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.28.gate_and_up_proj.weight": 
"model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.29.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.29.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.30.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.30.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.31.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.31.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.32.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.32.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.33.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.33.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.34.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.34.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.35.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.35.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.36.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.36.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.37.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.37.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.38.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.38.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.39.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.39.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.40.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.40.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.41.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.41.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.42.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.42.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.43.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.43.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.44.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.44.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.45.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.45.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.46.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.46.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.47.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.47.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.48.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.48.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", 
"model.layers.1.mlp.experts.49.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.49.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.50.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.50.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.51.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.51.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.52.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.52.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.53.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.53.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.54.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.54.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.55.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.55.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.56.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.56.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.57.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.57.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.58.down_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.58.gate_and_up_proj.weight": "model-0002-of-0032.safetensors", "model.layers.1.mlp.experts.59.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.59.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.60.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.60.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.61.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.61.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.62.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.62.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.63.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.experts.63.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.gate.wg.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.shared_mlp.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.mlp.shared_mlp.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-0003-of-0032.safetensors", "model.layers.1.self_attn.key_layernorm.weight": "model-0003-of-0032.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.self_attn.qkv_proj.weight": "model-0003-of-0032.safetensors", "model.layers.1.self_attn.query_layernorm.weight": "model-0003-of-0032.safetensors", "model.layers.2.input_layernorm.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.0.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.0.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", 
"model.layers.2.mlp.experts.1.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.1.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.2.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.2.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.3.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.3.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.4.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.4.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.5.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.5.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.6.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.6.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.7.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.7.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.8.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.8.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.9.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.9.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.10.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.10.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.11.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.11.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.12.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.12.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.13.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.13.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.14.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.14.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.15.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.15.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.16.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.16.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.17.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.17.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.18.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.18.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.19.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.19.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.20.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.20.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.21.down_proj.weight": 
"model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.21.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.22.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.22.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.23.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.23.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.24.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.24.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.25.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.25.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.26.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.26.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.27.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.27.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.28.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.28.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.29.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.29.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.30.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.30.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.31.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.31.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.32.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.32.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.33.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.33.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.34.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.34.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.35.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.35.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.36.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.36.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.37.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.37.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.38.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.38.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.39.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.39.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.40.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.40.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.41.down_proj.weight": "model-0003-of-0032.safetensors", 
"model.layers.2.mlp.experts.41.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.42.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.42.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.43.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.43.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.44.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.44.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.45.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.45.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.46.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.46.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.47.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.47.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.48.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.48.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.49.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.49.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.50.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.50.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.51.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.51.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.52.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.52.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.53.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.53.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.54.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.54.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.55.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.55.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.56.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.56.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.57.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.57.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.58.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.58.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.59.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.59.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.60.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.60.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.61.down_proj.weight": "model-0003-of-0032.safetensors", 
"model.layers.2.mlp.experts.61.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.62.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.62.gate_and_up_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.63.down_proj.weight": "model-0003-of-0032.safetensors", "model.layers.2.mlp.experts.63.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.2.mlp.gate.wg.weight": "model-0004-of-0032.safetensors", "model.layers.2.mlp.shared_mlp.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.2.mlp.shared_mlp.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.2.self_attn.key_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-0004-of-0032.safetensors", "model.layers.2.self_attn.qkv_proj.weight": "model-0004-of-0032.safetensors", "model.layers.2.self_attn.query_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.3.input_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.0.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.0.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.1.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.1.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.2.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.2.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.3.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.3.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.4.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.4.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.5.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.5.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.6.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.6.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.7.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.7.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.8.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.8.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.9.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.9.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.10.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.10.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.11.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.11.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.12.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.12.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.13.down_proj.weight": "model-0004-of-0032.safetensors", 
"model.layers.3.mlp.experts.13.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.14.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.14.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.15.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.15.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.16.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.16.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.17.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.17.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.18.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.18.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.19.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.19.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.20.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.20.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.21.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.21.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.22.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.22.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.23.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.23.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.24.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.24.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.25.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.25.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.26.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.26.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.27.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.27.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.28.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.28.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.29.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.29.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.30.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.30.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.31.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.31.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.32.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.32.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.33.down_proj.weight": "model-0004-of-0032.safetensors", 
"model.layers.3.mlp.experts.33.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.34.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.34.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.35.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.35.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.36.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.36.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.37.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.37.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.38.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.38.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.39.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.39.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.40.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.40.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.41.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.41.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.42.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.42.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.43.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.43.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.44.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.44.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.45.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.45.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.46.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.46.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.47.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.47.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.48.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.48.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.49.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.49.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.50.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.50.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.51.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.51.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.52.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.52.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.53.down_proj.weight": "model-0004-of-0032.safetensors", 
"model.layers.3.mlp.experts.53.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.54.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.54.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.55.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.55.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.56.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.56.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.57.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.57.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.58.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.58.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.59.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.59.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.60.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.60.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.61.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.61.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.62.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.62.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.63.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.experts.63.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.gate.wg.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.shared_mlp.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.mlp.shared_mlp.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.3.self_attn.key_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.self_attn.qkv_proj.weight": "model-0004-of-0032.safetensors", "model.layers.3.self_attn.query_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.4.input_layernorm.weight": "model-0004-of-0032.safetensors", "model.layers.4.mlp.experts.0.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.4.mlp.experts.0.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.4.mlp.experts.1.down_proj.weight": "model-0004-of-0032.safetensors", "model.layers.4.mlp.experts.1.gate_and_up_proj.weight": "model-0004-of-0032.safetensors", "model.layers.4.mlp.experts.2.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.2.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.3.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.3.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.4.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.4.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.5.down_proj.weight": "model-0005-of-0032.safetensors", 
"model.layers.4.mlp.experts.5.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.6.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.6.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.7.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.7.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.8.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.8.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.9.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.9.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.10.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.10.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.11.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.11.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.12.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.12.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.13.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.13.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.14.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.14.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.15.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.15.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.16.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.16.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.17.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.17.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.18.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.18.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.19.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.19.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.20.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.20.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.21.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.21.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.22.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.22.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.23.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.23.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.24.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.24.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.25.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.25.gate_and_up_proj.weight": 
"model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.26.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.26.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.27.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.27.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.28.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.28.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.29.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.29.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.30.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.30.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.31.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.31.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.32.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.32.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.33.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.33.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.34.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.34.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.35.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.35.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.36.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.36.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.37.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.37.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.38.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.38.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.39.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.39.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.40.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.40.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.41.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.41.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.42.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.42.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.43.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.43.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.44.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.44.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.45.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.45.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", 
"model.layers.4.mlp.experts.46.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.46.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.47.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.47.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.48.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.48.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.49.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.49.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.50.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.50.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.51.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.51.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.52.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.52.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.53.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.53.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.54.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.54.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.55.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.55.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.56.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.56.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.57.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.57.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.58.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.58.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.59.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.59.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.60.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.60.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.61.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.61.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.62.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.62.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.63.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.experts.63.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.gate.wg.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.shared_mlp.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.mlp.shared_mlp.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-0005-of-0032.safetensors", "model.layers.4.self_attn.key_layernorm.weight": 
"model-0005-of-0032.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.self_attn.qkv_proj.weight": "model-0005-of-0032.safetensors", "model.layers.4.self_attn.query_layernorm.weight": "model-0005-of-0032.safetensors", "model.layers.5.input_layernorm.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.0.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.0.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.1.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.1.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.2.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.2.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.3.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.3.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.4.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.4.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.5.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.5.gate_and_up_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.6.down_proj.weight": "model-0005-of-0032.safetensors", "model.layers.5.mlp.experts.6.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.7.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.7.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.8.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.8.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.9.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.9.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.10.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.10.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.11.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.11.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.12.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.12.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.13.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.13.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.14.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.14.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.15.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.15.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.16.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.16.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.17.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.17.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.18.down_proj.weight": 
"model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.18.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.19.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.19.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.20.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.20.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.21.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.21.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.22.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.22.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.23.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.23.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.24.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.24.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.25.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.25.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.26.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.26.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.27.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.27.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.28.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.28.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.29.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.29.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.30.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.30.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.31.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.31.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.32.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.32.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.33.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.33.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.34.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.34.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.35.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.35.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.36.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.36.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.37.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.37.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.38.down_proj.weight": "model-0006-of-0032.safetensors", 
"model.layers.5.mlp.experts.38.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.39.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.39.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.40.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.40.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.41.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.41.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.42.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.42.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.43.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.43.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.44.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.44.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.45.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.45.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.46.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.46.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.47.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.47.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.48.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.48.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.49.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.49.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.50.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.50.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.51.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.51.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.52.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.52.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.53.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.53.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.54.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.54.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.55.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.55.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.56.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.56.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.57.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.57.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.58.down_proj.weight": "model-0006-of-0032.safetensors", 
"model.layers.5.mlp.experts.58.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.59.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.59.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.60.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.60.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.61.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.61.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.62.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.62.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.63.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.experts.63.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.gate.wg.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.shared_mlp.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.mlp.shared_mlp.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-0006-of-0032.safetensors", "model.layers.5.self_attn.key_layernorm.weight": "model-0006-of-0032.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.self_attn.qkv_proj.weight": "model-0006-of-0032.safetensors", "model.layers.5.self_attn.query_layernorm.weight": "model-0006-of-0032.safetensors", "model.layers.6.input_layernorm.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.0.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.0.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.1.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.1.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.2.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.2.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.3.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.3.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.4.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.4.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.5.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.5.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.6.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.6.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.7.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.7.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.8.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.8.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.9.down_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.9.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.10.down_proj.weight": "model-0006-of-0032.safetensors", 
"model.layers.6.mlp.experts.10.gate_and_up_proj.weight": "model-0006-of-0032.safetensors", "model.layers.6.mlp.experts.11.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.11.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.12.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.12.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.13.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.13.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.14.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.14.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.15.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.15.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.16.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.16.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.17.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.17.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.18.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.18.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.19.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.19.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.20.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.20.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.21.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.21.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.22.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.22.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.23.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.23.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.24.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.24.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.25.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.25.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.26.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.26.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.27.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.27.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.28.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.28.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.29.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.29.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.30.down_proj.weight": "model-0007-of-0032.safetensors", 
"model.layers.6.mlp.experts.30.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.31.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.31.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.32.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.32.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.33.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.33.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.34.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.34.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.35.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.35.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.36.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.36.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.37.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.37.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.38.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.38.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.39.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.39.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.40.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.40.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.41.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.41.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.42.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.42.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.43.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.43.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.44.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.44.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.45.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.45.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.46.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.46.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.47.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.47.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.48.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.48.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.49.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.49.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.50.down_proj.weight": "model-0007-of-0032.safetensors", 
"model.layers.6.mlp.experts.50.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.51.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.51.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.52.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.52.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.53.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.53.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.54.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.54.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.55.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.55.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.56.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.56.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.57.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.57.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.58.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.58.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.59.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.59.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.60.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.60.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.61.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.61.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.62.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.62.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.63.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.experts.63.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.gate.wg.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.shared_mlp.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.mlp.shared_mlp.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-0007-of-0032.safetensors", "model.layers.6.self_attn.key_layernorm.weight": "model-0007-of-0032.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.self_attn.qkv_proj.weight": "model-0007-of-0032.safetensors", "model.layers.6.self_attn.query_layernorm.weight": "model-0007-of-0032.safetensors", "model.layers.7.input_layernorm.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.0.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.0.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.1.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.1.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.2.down_proj.weight": "model-0007-of-0032.safetensors", 
"model.layers.7.mlp.experts.2.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.3.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.3.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.4.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.4.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.5.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.5.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.6.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.6.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.7.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.7.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.8.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.8.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.9.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.9.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.10.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.10.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.11.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.11.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.12.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.12.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.13.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.13.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.14.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.14.gate_and_up_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.15.down_proj.weight": "model-0007-of-0032.safetensors", "model.layers.7.mlp.experts.15.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.16.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.16.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.17.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.17.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.18.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.18.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.19.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.19.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.20.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.20.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.21.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.21.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.22.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.22.gate_and_up_proj.weight": 
"model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.23.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.23.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.24.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.24.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.25.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.25.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.26.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.26.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.27.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.27.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.28.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.28.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.29.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.29.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.30.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.30.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.31.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.31.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.32.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.32.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.33.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.33.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.34.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.34.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.35.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.35.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.36.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.36.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.37.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.37.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.38.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.38.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.39.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.39.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.40.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.40.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.41.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.41.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.42.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.42.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", 
"model.layers.7.mlp.experts.43.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.43.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.44.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.44.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.45.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.45.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.46.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.46.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.47.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.47.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.48.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.48.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.49.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.49.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.50.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.50.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.51.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.51.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.52.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.52.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.53.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.53.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.54.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.54.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.55.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.55.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.56.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.56.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.57.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.57.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.58.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.58.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.59.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.59.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.60.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.60.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.61.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.61.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.62.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.62.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.63.down_proj.weight": 
"model-0008-of-0032.safetensors", "model.layers.7.mlp.experts.63.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.gate.wg.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.shared_mlp.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.mlp.shared_mlp.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-0008-of-0032.safetensors", "model.layers.7.self_attn.key_layernorm.weight": "model-0008-of-0032.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.self_attn.qkv_proj.weight": "model-0008-of-0032.safetensors", "model.layers.7.self_attn.query_layernorm.weight": "model-0008-of-0032.safetensors", "model.layers.8.input_layernorm.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.0.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.0.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.1.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.1.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.2.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.2.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.3.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.3.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.4.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.4.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.5.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.5.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.6.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.6.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.7.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.7.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.8.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.8.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.9.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.9.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.10.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.10.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.11.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.11.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.12.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.12.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.13.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.13.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.14.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.14.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.15.down_proj.weight": "model-0008-of-0032.safetensors", 
"model.layers.8.mlp.experts.15.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.16.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.16.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.17.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.17.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.18.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.18.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.19.down_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.19.gate_and_up_proj.weight": "model-0008-of-0032.safetensors", "model.layers.8.mlp.experts.20.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.20.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.21.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.21.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.22.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.22.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.23.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.23.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.24.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.24.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.25.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.25.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.26.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.26.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.27.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.27.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.28.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.28.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.29.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.29.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.30.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.30.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.31.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.31.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.32.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.32.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.33.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.33.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.34.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.34.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.35.down_proj.weight": "model-0009-of-0032.safetensors", 
"model.layers.8.mlp.experts.35.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.36.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.36.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.37.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.37.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.38.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.38.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.39.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.39.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.40.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.40.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.41.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.41.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.42.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.42.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.43.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.43.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.44.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.44.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.45.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.45.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.46.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.46.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.47.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.47.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.48.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.48.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.49.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.49.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.50.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.50.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.51.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.51.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.52.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.52.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.53.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.53.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.54.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.54.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.55.down_proj.weight": "model-0009-of-0032.safetensors", 
"model.layers.8.mlp.experts.55.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.56.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.56.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.57.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.57.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.58.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.58.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.59.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.59.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.60.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.60.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.61.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.61.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.62.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.62.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.63.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.experts.63.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.gate.wg.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.shared_mlp.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.mlp.shared_mlp.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-0009-of-0032.safetensors", "model.layers.8.self_attn.key_layernorm.weight": "model-0009-of-0032.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.self_attn.qkv_proj.weight": "model-0009-of-0032.safetensors", "model.layers.8.self_attn.query_layernorm.weight": "model-0009-of-0032.safetensors", "model.layers.9.input_layernorm.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.0.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.0.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.1.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.1.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.2.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.2.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.3.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.3.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.4.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.4.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.5.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.5.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.6.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.6.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.7.down_proj.weight": "model-0009-of-0032.safetensors", 
"model.layers.9.mlp.experts.7.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.8.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.8.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.9.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.9.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.10.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.10.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.11.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.11.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.12.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.12.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.13.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.13.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.14.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.14.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.15.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.15.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.16.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.16.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.17.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.17.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.18.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.18.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.19.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.19.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.20.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.20.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.21.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.21.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.22.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.22.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.23.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.23.gate_and_up_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.24.down_proj.weight": "model-0009-of-0032.safetensors", "model.layers.9.mlp.experts.24.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.25.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.25.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.26.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.26.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.27.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.27.gate_and_up_proj.weight": 
"model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.28.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.28.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.29.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.29.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.30.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.30.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.31.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.31.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.32.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.32.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.33.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.33.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.34.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.34.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.35.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.35.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.36.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.36.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.37.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.37.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.38.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.38.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.39.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.39.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.40.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.40.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.41.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.41.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.42.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.42.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.43.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.43.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.44.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.44.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.45.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.45.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.46.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.46.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.47.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.47.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", 
"model.layers.9.mlp.experts.48.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.48.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.49.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.49.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.50.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.50.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.51.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.51.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.52.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.52.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.53.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.53.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.54.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.54.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.55.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.55.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.56.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.56.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.57.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.57.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.58.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.58.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.59.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.59.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.60.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.60.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.61.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.61.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.62.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.62.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.63.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.experts.63.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.gate.wg.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.shared_mlp.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.mlp.shared_mlp.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-0010-of-0032.safetensors", "model.layers.9.self_attn.key_layernorm.weight": "model-0010-of-0032.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.self_attn.qkv_proj.weight": "model-0010-of-0032.safetensors", "model.layers.9.self_attn.query_layernorm.weight": "model-0010-of-0032.safetensors", "model.layers.10.input_layernorm.weight": "model-0010-of-0032.safetensors", 
"model.layers.10.mlp.experts.0.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.0.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.1.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.1.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.2.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.2.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.3.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.3.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.4.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.4.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.5.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.5.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.6.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.6.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.7.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.7.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.8.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.8.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.9.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.9.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.10.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.10.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.11.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.11.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.12.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.12.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.13.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.13.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.14.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.14.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.15.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.15.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.16.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.16.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.17.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.17.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.18.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.18.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.19.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.19.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", 
"model.layers.10.mlp.experts.20.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.20.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.21.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.21.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.22.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.22.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.23.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.23.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.24.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.24.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.25.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.25.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.26.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.26.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.27.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.27.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.28.down_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.28.gate_and_up_proj.weight": "model-0010-of-0032.safetensors", "model.layers.10.mlp.experts.29.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.29.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.30.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.30.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.31.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.31.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.32.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.32.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.33.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.33.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.34.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.34.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.35.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.35.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.36.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.36.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.37.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.37.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.38.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.38.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.39.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.39.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", 
"model.layers.10.mlp.experts.40.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.40.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.41.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.41.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.42.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.42.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.43.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.43.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.44.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.44.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.45.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.45.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.46.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.46.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.47.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.47.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.48.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.48.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.49.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.49.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.50.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.50.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.51.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.51.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.52.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.52.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.53.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.53.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.54.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.54.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.55.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.55.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.56.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.56.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.57.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.57.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.58.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.58.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.59.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.59.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", 
"model.layers.10.mlp.experts.60.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.60.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.61.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.61.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.62.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.62.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.63.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.experts.63.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.gate.wg.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.shared_mlp.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.mlp.shared_mlp.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-0011-of-0032.safetensors", "model.layers.10.self_attn.key_layernorm.weight": "model-0011-of-0032.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.self_attn.qkv_proj.weight": "model-0011-of-0032.safetensors", "model.layers.10.self_attn.query_layernorm.weight": "model-0011-of-0032.safetensors", "model.layers.11.input_layernorm.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.0.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.0.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.1.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.1.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.2.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.2.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.3.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.3.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.4.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.4.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.5.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.5.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.6.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.6.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.7.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.7.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.8.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.8.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.9.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.9.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.10.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.10.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.11.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.11.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", 
"model.layers.11.mlp.experts.12.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.12.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.13.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.13.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.14.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.14.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.15.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.15.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.16.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.16.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.17.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.17.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.18.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.18.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.19.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.19.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.20.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.20.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.21.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.21.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.22.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.22.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.23.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.23.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.24.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.24.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.25.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.25.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.26.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.26.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.27.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.27.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.28.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.28.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.29.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.29.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.30.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.30.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.31.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.31.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", 
"model.layers.11.mlp.experts.32.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.32.gate_and_up_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.33.down_proj.weight": "model-0011-of-0032.safetensors", "model.layers.11.mlp.experts.33.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.34.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.34.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.35.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.35.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.36.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.36.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.37.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.37.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.38.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.38.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.39.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.39.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.40.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.40.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.41.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.41.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.42.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.42.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.43.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.43.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.44.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.44.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.45.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.45.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.46.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.46.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.47.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.47.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.48.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.48.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.49.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.49.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.50.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.50.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.51.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.51.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", 
"model.layers.11.mlp.experts.52.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.52.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.53.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.53.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.54.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.54.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.55.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.55.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.56.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.56.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.57.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.57.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.58.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.58.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.59.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.59.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.60.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.60.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.61.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.61.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.62.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.62.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.63.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.experts.63.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.gate.wg.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.shared_mlp.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.mlp.shared_mlp.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-0012-of-0032.safetensors", "model.layers.11.self_attn.key_layernorm.weight": "model-0012-of-0032.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.self_attn.qkv_proj.weight": "model-0012-of-0032.safetensors", "model.layers.11.self_attn.query_layernorm.weight": "model-0012-of-0032.safetensors", "model.layers.12.input_layernorm.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.0.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.0.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.1.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.1.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.2.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.2.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.3.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.3.gate_and_up_proj.weight": 
"model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.4.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.4.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.5.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.5.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.6.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.6.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.7.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.7.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.8.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.8.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.9.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.9.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.10.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.10.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.11.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.11.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.12.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.12.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.13.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.13.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.14.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.14.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.15.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.15.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.16.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.16.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.17.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.17.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.18.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.18.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.19.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.19.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.20.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.20.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.21.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.21.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.22.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.22.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.23.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.23.gate_and_up_proj.weight": 
"model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.24.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.24.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.25.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.25.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.26.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.26.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.27.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.27.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.28.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.28.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.29.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.29.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.30.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.30.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.31.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.31.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.32.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.32.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.33.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.33.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.34.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.34.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.35.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.35.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.36.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.36.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.37.down_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.37.gate_and_up_proj.weight": "model-0012-of-0032.safetensors", "model.layers.12.mlp.experts.38.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.38.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.39.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.39.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.40.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.40.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.41.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.41.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.42.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.42.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.43.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.43.gate_and_up_proj.weight": 
"model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.44.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.44.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.45.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.45.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.46.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.46.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.47.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.47.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.48.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.48.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.49.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.49.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.50.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.50.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.51.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.51.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.52.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.52.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.53.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.53.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.54.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.54.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.55.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.55.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.56.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.56.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.57.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.57.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.58.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.58.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.59.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.59.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.60.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.60.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.61.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.61.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.62.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.62.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.63.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.experts.63.gate_and_up_proj.weight": 
"model-0013-of-0032.safetensors", "model.layers.12.mlp.gate.wg.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.shared_mlp.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.mlp.shared_mlp.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-0013-of-0032.safetensors", "model.layers.12.self_attn.key_layernorm.weight": "model-0013-of-0032.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.self_attn.qkv_proj.weight": "model-0013-of-0032.safetensors", "model.layers.12.self_attn.query_layernorm.weight": "model-0013-of-0032.safetensors", "model.layers.13.input_layernorm.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.0.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.0.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.1.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.1.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.2.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.2.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.3.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.3.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.4.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.4.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.5.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.5.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.6.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.6.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.7.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.7.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.8.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.8.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.9.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.9.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.10.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.10.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.11.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.11.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.12.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.12.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.13.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.13.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.14.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.14.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.15.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.15.gate_and_up_proj.weight": 
"model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.16.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.16.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.17.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.17.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.18.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.18.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.19.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.19.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.20.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.20.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.21.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.21.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.22.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.22.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.23.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.23.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.24.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.24.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.25.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.25.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.26.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.26.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.27.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.27.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.28.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.28.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.29.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.29.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.30.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.30.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.31.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.31.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.32.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.32.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.33.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.33.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.34.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.34.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.35.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.35.gate_and_up_proj.weight": 
"model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.36.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.36.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.37.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.37.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.38.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.38.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.39.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.39.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.40.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.40.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.41.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.41.gate_and_up_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.42.down_proj.weight": "model-0013-of-0032.safetensors", "model.layers.13.mlp.experts.42.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.43.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.43.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.44.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.44.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.45.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.45.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.46.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.46.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.47.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.47.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.48.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.48.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.49.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.49.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.50.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.50.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.51.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.51.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.52.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.52.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.53.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.53.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.54.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.54.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.55.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.55.gate_and_up_proj.weight": 
"model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.56.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.56.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.57.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.57.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.58.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.58.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.59.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.59.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.60.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.60.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.61.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.61.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.62.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.62.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.63.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.experts.63.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.gate.wg.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.shared_mlp.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.mlp.shared_mlp.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-0014-of-0032.safetensors", "model.layers.13.self_attn.key_layernorm.weight": "model-0014-of-0032.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.self_attn.qkv_proj.weight": "model-0014-of-0032.safetensors", "model.layers.13.self_attn.query_layernorm.weight": "model-0014-of-0032.safetensors", "model.layers.14.input_layernorm.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.0.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.0.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.1.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.1.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.2.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.2.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.3.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.3.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.4.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.4.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.5.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.5.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.6.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.6.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.7.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.7.gate_and_up_proj.weight": 
"model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.8.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.8.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.9.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.9.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.10.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.10.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.11.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.11.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.12.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.12.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.13.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.13.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.14.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.14.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.15.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.15.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.16.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.16.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.17.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.17.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.18.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.18.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.19.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.19.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.20.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.20.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.21.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.21.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.22.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.22.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.23.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.23.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.24.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.24.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.25.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.25.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.26.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.26.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.27.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.27.gate_and_up_proj.weight": 
"model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.28.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.28.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.29.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.29.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.30.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.30.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.31.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.31.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.32.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.32.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.33.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.33.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.34.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.34.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.35.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.35.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.36.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.36.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.37.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.37.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.38.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.38.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.39.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.39.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.40.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.40.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.41.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.41.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.42.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.42.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.43.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.43.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.44.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.44.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.45.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.45.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.46.down_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.46.gate_and_up_proj.weight": "model-0014-of-0032.safetensors", "model.layers.14.mlp.experts.47.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.47.gate_and_up_proj.weight": 
"model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.48.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.48.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.49.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.49.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.50.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.50.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.51.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.51.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.52.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.52.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.53.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.53.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.54.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.54.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.55.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.55.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.56.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.56.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.57.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.57.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.58.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.58.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.59.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.59.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.60.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.60.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.61.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.61.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.62.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.62.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.63.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.experts.63.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.gate.wg.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.shared_mlp.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.mlp.shared_mlp.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-0015-of-0032.safetensors", "model.layers.14.self_attn.key_layernorm.weight": "model-0015-of-0032.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.self_attn.qkv_proj.weight": "model-0015-of-0032.safetensors", "model.layers.14.self_attn.query_layernorm.weight": "model-0015-of-0032.safetensors", 
"model.layers.15.input_layernorm.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.0.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.0.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.1.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.1.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.2.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.2.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.3.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.3.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.4.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.4.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.5.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.5.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.6.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.6.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.7.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.7.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.8.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.8.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.9.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.9.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.10.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.10.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.11.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.11.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.12.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.12.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.13.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.13.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.14.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.14.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.15.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.15.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.16.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.16.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.17.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.17.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.18.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.18.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.19.down_proj.weight": "model-0015-of-0032.safetensors", 
"model.layers.15.mlp.experts.19.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.20.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.20.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.21.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.21.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.22.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.22.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.23.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.23.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.24.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.24.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.25.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.25.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.26.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.26.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.27.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.27.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.28.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.28.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.29.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.29.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.30.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.30.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.31.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.31.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.32.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.32.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.33.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.33.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.34.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.34.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.35.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.35.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.36.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.36.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.37.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.37.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.38.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.38.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.39.down_proj.weight": "model-0015-of-0032.safetensors", 
"model.layers.15.mlp.experts.39.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.40.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.40.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.41.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.41.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.42.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.42.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.43.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.43.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.44.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.44.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.45.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.45.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.46.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.46.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.47.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.47.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.48.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.48.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.49.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.49.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.50.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.50.gate_and_up_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.51.down_proj.weight": "model-0015-of-0032.safetensors", "model.layers.15.mlp.experts.51.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.52.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.52.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.53.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.53.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.54.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.54.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.55.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.55.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.56.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.56.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.57.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.57.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.58.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.58.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.59.down_proj.weight": "model-0016-of-0032.safetensors", 
"model.layers.15.mlp.experts.59.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.60.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.60.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.61.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.61.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.62.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.62.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.63.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.experts.63.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.gate.wg.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.shared_mlp.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.mlp.shared_mlp.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-0016-of-0032.safetensors", "model.layers.15.self_attn.key_layernorm.weight": "model-0016-of-0032.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.self_attn.qkv_proj.weight": "model-0016-of-0032.safetensors", "model.layers.15.self_attn.query_layernorm.weight": "model-0016-of-0032.safetensors", "model.layers.16.input_layernorm.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.0.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.0.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.1.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.1.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.2.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.2.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.3.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.3.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.4.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.4.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.5.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.5.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.6.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.6.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.7.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.7.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.8.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.8.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.9.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.9.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.10.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.10.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.11.down_proj.weight": "model-0016-of-0032.safetensors", 
"model.layers.16.mlp.experts.11.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.12.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.12.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.13.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.13.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.14.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.14.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.15.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.15.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.16.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.16.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.17.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.17.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.18.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.18.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.19.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.19.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.20.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.20.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.21.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.21.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.22.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.22.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.23.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.23.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.24.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.24.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.25.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.25.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.26.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.26.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.27.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.27.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.28.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.28.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.29.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.29.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.30.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.30.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.31.down_proj.weight": "model-0016-of-0032.safetensors", 
"model.layers.16.mlp.experts.31.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.32.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.32.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.33.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.33.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.34.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.34.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.35.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.35.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.36.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.36.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.37.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.37.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.38.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.38.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.39.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.39.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.40.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.40.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.41.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.41.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.42.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.42.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.43.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.43.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.44.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.44.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.45.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.45.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.46.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.46.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.47.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.47.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.48.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.48.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.49.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.49.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.50.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.50.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.51.down_proj.weight": "model-0016-of-0032.safetensors", 
"model.layers.16.mlp.experts.51.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.52.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.52.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.53.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.53.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.54.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.54.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.55.down_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.55.gate_and_up_proj.weight": "model-0016-of-0032.safetensors", "model.layers.16.mlp.experts.56.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.56.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.57.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.57.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.58.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.58.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.59.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.59.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.60.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.60.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.61.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.61.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.62.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.62.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.63.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.experts.63.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.gate.wg.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.shared_mlp.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.mlp.shared_mlp.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-0017-of-0032.safetensors", "model.layers.16.self_attn.key_layernorm.weight": "model-0017-of-0032.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.self_attn.qkv_proj.weight": "model-0017-of-0032.safetensors", "model.layers.16.self_attn.query_layernorm.weight": "model-0017-of-0032.safetensors", "model.layers.17.input_layernorm.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.0.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.0.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.1.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.1.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.2.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.2.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.3.down_proj.weight": 
"model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.3.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.4.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.4.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.5.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.5.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.6.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.6.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.7.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.7.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.8.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.8.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.9.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.9.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.10.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.10.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.11.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.11.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.12.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.12.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.13.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.13.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.14.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.14.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.15.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.15.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.16.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.16.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.17.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.17.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.18.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.18.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.19.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.19.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.20.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.20.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.21.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.21.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.22.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.22.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.23.down_proj.weight": 
"model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.23.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.24.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.24.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.25.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.25.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.26.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.26.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.27.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.27.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.28.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.28.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.29.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.29.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.30.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.30.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.31.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.31.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.32.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.32.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.33.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.33.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.34.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.34.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.35.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.35.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.36.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.36.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.37.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.37.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.38.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.38.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.39.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.39.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.40.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.40.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.41.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.41.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.42.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.42.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.43.down_proj.weight": 
"model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.43.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.44.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.44.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.45.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.45.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.46.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.46.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.47.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.47.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.48.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.48.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.49.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.49.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.50.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.50.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.51.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.51.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.52.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.52.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.53.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.53.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.54.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.54.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.55.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.55.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.56.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.56.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.57.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.57.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.58.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.58.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.59.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.59.gate_and_up_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.60.down_proj.weight": "model-0017-of-0032.safetensors", "model.layers.17.mlp.experts.60.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.experts.61.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.experts.61.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.experts.62.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.experts.62.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.experts.63.down_proj.weight": 
"model-0018-of-0032.safetensors", "model.layers.17.mlp.experts.63.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.gate.wg.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.shared_mlp.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.mlp.shared_mlp.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-0018-of-0032.safetensors", "model.layers.17.self_attn.key_layernorm.weight": "model-0018-of-0032.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.self_attn.qkv_proj.weight": "model-0018-of-0032.safetensors", "model.layers.17.self_attn.query_layernorm.weight": "model-0018-of-0032.safetensors", "model.layers.18.input_layernorm.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.0.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.0.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.1.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.1.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.2.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.2.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.3.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.3.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.4.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.4.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.5.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.5.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.6.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.6.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.7.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.7.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.8.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.8.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.9.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.9.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.10.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.10.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.11.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.11.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.12.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.12.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.13.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.13.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.14.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.14.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.15.down_proj.weight": 
"model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.15.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.16.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.16.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.17.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.17.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.18.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.18.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.19.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.19.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.20.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.20.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.21.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.21.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.22.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.22.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.23.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.23.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.24.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.24.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.25.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.25.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.26.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.26.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.27.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.27.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.28.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.28.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.29.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.29.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.30.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.30.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.31.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.31.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.32.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.32.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.33.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.33.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.34.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.34.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.35.down_proj.weight": 
"model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.35.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.36.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.36.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.37.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.37.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.38.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.38.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.39.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.39.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.40.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.40.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.41.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.41.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.42.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.42.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.43.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.43.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.44.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.44.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.45.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.45.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.46.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.46.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.47.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.47.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.48.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.48.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.49.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.49.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.50.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.50.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.51.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.51.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.52.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.52.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.53.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.53.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.54.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.54.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.55.down_proj.weight": 
"model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.55.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.56.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.56.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.57.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.57.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.58.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.58.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.59.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.59.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.60.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.60.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.61.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.61.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.62.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.62.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.63.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.experts.63.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.gate.wg.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.shared_mlp.down_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.mlp.shared_mlp.gate_and_up_proj.weight": "model-0018-of-0032.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-0018-of-0032.safetensors", "model.layers.18.self_attn.key_layernorm.weight": "model-0018-of-0032.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-0019-of-0032.safetensors", "model.layers.18.self_attn.qkv_proj.weight": "model-0019-of-0032.safetensors", "model.layers.18.self_attn.query_layernorm.weight": "model-0019-of-0032.safetensors", "model.layers.19.input_layernorm.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.0.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.0.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.1.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.1.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.2.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.2.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.3.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.3.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.4.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.4.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.5.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.5.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.6.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.6.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.7.down_proj.weight": 
"model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.7.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.8.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.8.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.9.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.9.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.10.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.10.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.11.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.11.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.12.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.12.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.13.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.13.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.14.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.14.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.15.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.15.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.16.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.16.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.17.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.17.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.18.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.18.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.19.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.19.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.20.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.20.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.21.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.21.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.22.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.22.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.23.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.23.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.24.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.24.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.25.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.25.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.26.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.26.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.27.down_proj.weight": 
"model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.27.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.28.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.28.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.29.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.29.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.30.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.30.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.31.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.31.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.32.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.32.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.33.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.33.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.34.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.34.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.35.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.35.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.36.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.36.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.37.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.37.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.38.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.38.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.39.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.39.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.40.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.40.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.41.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.41.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.42.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.42.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.43.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.43.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.44.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.44.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.45.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.45.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.46.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.46.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.47.down_proj.weight": 
"model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.47.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.48.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.48.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.49.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.49.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.50.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.50.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.51.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.51.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.52.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.52.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.53.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.53.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.54.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.54.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.55.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.55.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.56.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.56.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.57.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.57.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.58.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.58.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.59.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.59.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.60.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.60.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.61.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.61.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.62.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.62.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.63.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.experts.63.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.gate.wg.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.shared_mlp.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.mlp.shared_mlp.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-0019-of-0032.safetensors", "model.layers.19.self_attn.key_layernorm.weight": "model-0019-of-0032.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-0019-of-0032.safetensors", "model.layers.19.self_attn.qkv_proj.weight": "model-0019-of-0032.safetensors", 
"model.layers.19.self_attn.query_layernorm.weight": "model-0019-of-0032.safetensors", "model.layers.20.input_layernorm.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.0.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.0.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.1.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.1.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.2.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.2.gate_and_up_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.3.down_proj.weight": "model-0019-of-0032.safetensors", "model.layers.20.mlp.experts.3.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.4.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.4.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.5.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.5.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.6.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.6.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.7.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.7.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.8.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.8.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.9.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.9.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.10.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.10.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.11.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.11.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.12.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.12.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.13.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.13.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.14.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.14.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.15.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.15.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.16.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.16.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.17.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.17.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.18.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.18.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", 
"model.layers.20.mlp.experts.19.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.19.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.20.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.20.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.21.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.21.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.22.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.22.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.23.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.23.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.24.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.24.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.25.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.25.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.26.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.26.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.27.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.27.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.28.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.28.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.29.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.29.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.30.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.30.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.31.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.31.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.32.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.32.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.33.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.33.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.34.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.34.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.35.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.35.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.36.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.36.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.37.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.37.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.38.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.38.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", 
"model.layers.20.mlp.experts.39.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.39.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.40.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.40.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.41.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.41.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.42.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.42.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.43.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.43.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.44.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.44.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.45.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.45.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.46.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.46.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.47.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.47.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.48.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.48.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.49.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.49.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.50.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.50.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.51.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.51.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.52.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.52.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.53.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.53.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.54.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.54.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.55.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.55.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.56.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.56.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.57.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.57.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.58.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.58.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", 
"model.layers.20.mlp.experts.59.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.59.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.60.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.60.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.61.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.61.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.62.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.62.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.63.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.experts.63.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.gate.wg.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.shared_mlp.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.mlp.shared_mlp.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-0020-of-0032.safetensors", "model.layers.20.self_attn.key_layernorm.weight": "model-0020-of-0032.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.self_attn.qkv_proj.weight": "model-0020-of-0032.safetensors", "model.layers.20.self_attn.query_layernorm.weight": "model-0020-of-0032.safetensors", "model.layers.21.input_layernorm.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.0.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.0.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.1.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.1.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.2.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.2.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.3.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.3.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.4.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.4.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.5.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.5.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.6.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.6.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.7.down_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.7.gate_and_up_proj.weight": "model-0020-of-0032.safetensors", "model.layers.21.mlp.experts.8.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.8.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.9.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.9.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.10.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.10.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", 
"model.layers.21.mlp.experts.11.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.11.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.12.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.12.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.13.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.13.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.14.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.14.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.15.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.15.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.16.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.16.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.17.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.17.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.18.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.18.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.19.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.19.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.20.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.20.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.21.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.21.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.22.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.22.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.23.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.23.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.24.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.24.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.25.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.25.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.26.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.26.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.27.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.27.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.28.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.28.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.29.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.29.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.30.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.30.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", 
"model.layers.21.mlp.experts.31.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.31.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.32.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.32.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.33.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.33.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.34.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.34.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.35.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.35.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.36.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.36.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.37.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.37.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.38.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.38.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.39.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.39.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.40.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.40.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.41.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.41.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.42.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.42.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.43.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.43.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.44.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.44.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.45.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.45.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.46.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.46.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.47.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.47.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.48.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.48.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.49.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.49.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.50.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.50.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", 
"model.layers.21.mlp.experts.51.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.51.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.52.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.52.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.53.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.53.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.54.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.54.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.55.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.55.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.56.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.56.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.57.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.57.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.58.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.58.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.59.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.59.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.60.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.60.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.61.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.61.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.62.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.62.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.63.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.experts.63.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.gate.wg.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.shared_mlp.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.mlp.shared_mlp.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-0021-of-0032.safetensors", "model.layers.21.self_attn.key_layernorm.weight": "model-0021-of-0032.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.self_attn.qkv_proj.weight": "model-0021-of-0032.safetensors", "model.layers.21.self_attn.query_layernorm.weight": "model-0021-of-0032.safetensors", "model.layers.22.input_layernorm.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.0.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.0.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.1.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.1.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.2.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.2.gate_and_up_proj.weight": 
"model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.3.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.3.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.4.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.4.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.5.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.5.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.6.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.6.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.7.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.7.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.8.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.8.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.9.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.9.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.10.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.10.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.11.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.11.gate_and_up_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.12.down_proj.weight": "model-0021-of-0032.safetensors", "model.layers.22.mlp.experts.12.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.13.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.13.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.14.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.14.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.15.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.15.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.16.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.16.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.17.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.17.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.18.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.18.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.19.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.19.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.20.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.20.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.21.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.21.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.22.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.22.gate_and_up_proj.weight": 
"model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.23.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.23.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.24.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.24.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.25.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.25.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.26.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.26.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.27.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.27.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.28.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.28.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.29.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.29.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.30.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.30.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.31.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.31.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.32.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.32.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.33.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.33.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.34.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.34.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.35.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.35.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.36.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.36.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.37.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.37.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.38.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.38.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.39.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.39.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.40.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.40.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.41.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.41.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.42.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.42.gate_and_up_proj.weight": 
"model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.43.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.43.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.44.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.44.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.45.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.45.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.46.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.46.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.47.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.47.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.48.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.48.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.49.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.49.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.50.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.50.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.51.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.51.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.52.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.52.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.53.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.53.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.54.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.54.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.55.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.55.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.56.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.56.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.57.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.57.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.58.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.58.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.59.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.59.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.60.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.60.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.61.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.61.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.62.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.62.gate_and_up_proj.weight": 
"model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.63.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.experts.63.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.gate.wg.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.shared_mlp.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.mlp.shared_mlp.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-0022-of-0032.safetensors", "model.layers.22.self_attn.key_layernorm.weight": "model-0022-of-0032.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.self_attn.qkv_proj.weight": "model-0022-of-0032.safetensors", "model.layers.22.self_attn.query_layernorm.weight": "model-0022-of-0032.safetensors", "model.layers.23.input_layernorm.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.0.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.0.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.1.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.1.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.2.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.2.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.3.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.3.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.4.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.4.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.5.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.5.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.6.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.6.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.7.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.7.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.8.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.8.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.9.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.9.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.10.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.10.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.11.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.11.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.12.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.12.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.13.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.13.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.14.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.14.gate_and_up_proj.weight": 
"model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.15.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.15.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.16.down_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.16.gate_and_up_proj.weight": "model-0022-of-0032.safetensors", "model.layers.23.mlp.experts.17.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.17.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.18.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.18.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.19.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.19.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.20.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.20.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.21.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.21.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.22.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.22.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.23.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.23.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.24.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.24.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.25.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.25.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.26.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.26.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.27.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.27.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.28.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.28.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.29.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.29.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.30.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.30.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.31.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.31.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.32.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.32.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.33.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.33.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.34.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.34.gate_and_up_proj.weight": 
"model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.35.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.35.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.36.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.36.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.37.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.37.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.38.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.38.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.39.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.39.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.40.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.40.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.41.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.41.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.42.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.42.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.43.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.43.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.44.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.44.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.45.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.45.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.46.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.46.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.47.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.47.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.48.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.48.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.49.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.49.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.50.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.50.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.51.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.51.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.52.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.52.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.53.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.53.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.54.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.54.gate_and_up_proj.weight": 
"model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.55.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.55.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.56.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.56.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.57.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.57.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.58.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.58.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.59.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.59.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.60.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.60.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.61.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.61.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.62.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.62.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.63.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.experts.63.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.gate.wg.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.shared_mlp.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.mlp.shared_mlp.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-0023-of-0032.safetensors", "model.layers.23.self_attn.key_layernorm.weight": "model-0023-of-0032.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.self_attn.qkv_proj.weight": "model-0023-of-0032.safetensors", "model.layers.23.self_attn.query_layernorm.weight": "model-0023-of-0032.safetensors", "model.layers.24.input_layernorm.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.0.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.0.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.1.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.1.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.2.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.2.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.3.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.3.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.4.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.4.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.5.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.5.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.6.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.6.gate_and_up_proj.weight": 
"model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.7.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.7.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.8.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.8.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.9.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.9.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.10.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.10.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.11.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.11.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.12.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.12.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.13.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.13.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.14.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.14.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.15.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.15.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.16.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.16.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.17.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.17.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.18.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.18.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.19.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.19.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.20.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.20.gate_and_up_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.21.down_proj.weight": "model-0023-of-0032.safetensors", "model.layers.24.mlp.experts.21.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.22.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.22.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.23.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.23.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.24.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.24.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.25.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.25.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.26.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.26.gate_and_up_proj.weight": 
"model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.27.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.27.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.28.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.28.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.29.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.29.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.30.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.30.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.31.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.31.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.32.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.32.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.33.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.33.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.34.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.34.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.35.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.35.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.36.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.36.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.37.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.37.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.38.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.38.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.39.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.39.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.40.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.40.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.41.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.41.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.42.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.42.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.43.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.43.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.44.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.44.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.45.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.45.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.46.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.46.gate_and_up_proj.weight": 
"model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.47.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.47.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.48.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.48.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.49.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.49.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.50.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.50.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.51.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.51.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.52.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.52.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.53.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.53.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.54.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.54.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.55.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.55.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.56.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.56.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.57.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.57.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.58.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.58.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.59.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.59.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.60.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.60.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.61.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.61.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.62.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.62.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.63.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.experts.63.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.gate.wg.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.shared_mlp.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.mlp.shared_mlp.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-0024-of-0032.safetensors", "model.layers.24.self_attn.key_layernorm.weight": "model-0024-of-0032.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-0024-of-0032.safetensors", 
"model.layers.24.self_attn.qkv_proj.weight": "model-0024-of-0032.safetensors", "model.layers.24.self_attn.query_layernorm.weight": "model-0024-of-0032.safetensors", "model.layers.25.input_layernorm.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.0.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.0.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.1.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.1.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.2.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.2.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.3.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.3.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.4.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.4.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.5.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.5.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.6.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.6.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.7.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.7.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.8.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.8.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.9.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.9.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.10.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.10.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.11.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.11.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.12.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.12.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.13.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.13.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.14.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.14.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.15.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.15.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.16.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.16.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.17.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.17.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.18.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.18.gate_and_up_proj.weight": 
"model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.19.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.19.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.20.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.20.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.21.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.21.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.22.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.22.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.23.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.23.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.24.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.24.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.25.down_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.25.gate_and_up_proj.weight": "model-0024-of-0032.safetensors", "model.layers.25.mlp.experts.26.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.26.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.27.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.27.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.28.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.28.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.29.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.29.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.30.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.30.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.31.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.31.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.32.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.32.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.33.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.33.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.34.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.34.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.35.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.35.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.36.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.36.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.37.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.37.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.38.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.38.gate_and_up_proj.weight": 
"model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.39.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.39.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.40.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.40.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.41.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.41.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.42.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.42.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.43.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.43.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.44.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.44.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.45.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.45.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.46.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.46.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.47.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.47.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.48.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.48.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.49.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.49.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.50.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.50.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.51.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.51.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.52.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.52.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.53.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.53.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.54.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.54.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.55.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.55.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.56.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.56.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.57.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.57.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.58.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.58.gate_and_up_proj.weight": 
"model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.59.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.59.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.60.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.60.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.61.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.61.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.62.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.62.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.63.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.experts.63.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.gate.wg.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.shared_mlp.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.mlp.shared_mlp.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-0025-of-0032.safetensors", "model.layers.25.self_attn.key_layernorm.weight": "model-0025-of-0032.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.self_attn.qkv_proj.weight": "model-0025-of-0032.safetensors", "model.layers.25.self_attn.query_layernorm.weight": "model-0025-of-0032.safetensors", "model.layers.26.input_layernorm.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.0.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.0.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.1.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.1.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.2.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.2.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.3.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.3.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.4.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.4.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.5.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.5.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.6.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.6.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.7.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.7.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.8.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.8.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.9.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.9.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.10.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.10.gate_and_up_proj.weight": 
"model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.11.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.11.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.12.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.12.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.13.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.13.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.14.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.14.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.15.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.15.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.16.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.16.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.17.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.17.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.18.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.18.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.19.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.19.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.20.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.20.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.21.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.21.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.22.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.22.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.23.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.23.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.24.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.24.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.25.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.25.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.26.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.26.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.27.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.27.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.28.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.28.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.29.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.29.gate_and_up_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.30.down_proj.weight": "model-0025-of-0032.safetensors", "model.layers.26.mlp.experts.30.gate_and_up_proj.weight": 
"model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.31.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.31.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.32.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.32.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.33.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.33.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.34.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.34.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.35.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.35.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.36.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.36.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.37.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.37.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.38.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.38.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.39.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.39.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.40.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.40.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.41.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.41.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.42.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.42.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.43.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.43.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.44.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.44.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.45.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.45.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.46.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.46.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.47.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.47.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.48.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.48.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.49.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.49.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.50.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.50.gate_and_up_proj.weight": 
"model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.51.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.51.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.52.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.52.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.53.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.53.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.54.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.54.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.55.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.55.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.56.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.56.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.57.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.57.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.58.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.58.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.59.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.59.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.60.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.60.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.61.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.61.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.62.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.62.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.63.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.experts.63.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.gate.wg.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.shared_mlp.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.mlp.shared_mlp.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-0026-of-0032.safetensors", "model.layers.26.self_attn.key_layernorm.weight": "model-0026-of-0032.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.self_attn.qkv_proj.weight": "model-0026-of-0032.safetensors", "model.layers.26.self_attn.query_layernorm.weight": "model-0026-of-0032.safetensors", "model.layers.27.input_layernorm.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.0.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.0.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.1.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.1.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.2.down_proj.weight": "model-0026-of-0032.safetensors", 
"model.layers.27.mlp.experts.2.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.3.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.3.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.4.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.4.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.5.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.5.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.6.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.6.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.7.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.7.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.8.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.8.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.9.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.9.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.10.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.10.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.11.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.11.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.12.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.12.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.13.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.13.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.14.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.14.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.15.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.15.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.16.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.16.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.17.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.17.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.18.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.18.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.19.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.19.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.20.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.20.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.21.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.21.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.22.down_proj.weight": "model-0026-of-0032.safetensors", 
"model.layers.27.mlp.experts.22.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.23.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.23.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.24.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.24.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.25.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.25.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.26.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.26.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.27.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.27.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.28.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.28.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.29.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.29.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.30.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.30.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.31.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.31.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.32.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.32.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.33.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.33.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.34.down_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.34.gate_and_up_proj.weight": "model-0026-of-0032.safetensors", "model.layers.27.mlp.experts.35.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.35.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.36.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.36.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.37.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.37.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.38.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.38.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.39.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.39.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.40.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.40.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.41.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.41.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.42.down_proj.weight": "model-0027-of-0032.safetensors", 
"model.layers.27.mlp.experts.42.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.43.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.43.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.44.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.44.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.45.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.45.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.46.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.46.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.47.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.47.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.48.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.48.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.49.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.49.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.50.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.50.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.51.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.51.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.52.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.52.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.53.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.53.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.54.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.54.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.55.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.55.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.56.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.56.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.57.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.57.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.58.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.58.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.59.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.59.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.60.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.60.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.61.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.61.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.62.down_proj.weight": "model-0027-of-0032.safetensors", 
"model.layers.27.mlp.experts.62.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.63.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.experts.63.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.gate.wg.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.shared_mlp.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.mlp.shared_mlp.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-0027-of-0032.safetensors", "model.layers.27.self_attn.key_layernorm.weight": "model-0027-of-0032.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.self_attn.qkv_proj.weight": "model-0027-of-0032.safetensors", "model.layers.27.self_attn.query_layernorm.weight": "model-0027-of-0032.safetensors", "model.layers.28.input_layernorm.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.0.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.0.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.1.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.1.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.2.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.2.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.3.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.3.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.4.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.4.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.5.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.5.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.6.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.6.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.7.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.7.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.8.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.8.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.9.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.9.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.10.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.10.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.11.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.11.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.12.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.12.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.13.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.13.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.14.down_proj.weight": "model-0027-of-0032.safetensors", 
"model.layers.28.mlp.experts.14.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.15.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.15.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.16.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.16.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.17.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.17.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.18.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.18.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.19.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.19.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.20.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.20.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.21.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.21.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.22.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.22.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.23.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.23.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.24.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.24.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.25.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.25.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.26.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.26.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.27.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.27.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.28.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.28.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.29.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.29.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.30.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.30.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.31.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.31.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.32.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.32.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.33.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.33.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.34.down_proj.weight": "model-0027-of-0032.safetensors", 
"model.layers.28.mlp.experts.34.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.35.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.35.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.36.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.36.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.37.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.37.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.38.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.38.gate_and_up_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.39.down_proj.weight": "model-0027-of-0032.safetensors", "model.layers.28.mlp.experts.39.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.40.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.40.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.41.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.41.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.42.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.42.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.43.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.43.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.44.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.44.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.45.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.45.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.46.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.46.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.47.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.47.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.48.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.48.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.49.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.49.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.50.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.50.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.51.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.51.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.52.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.52.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.53.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.53.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.54.down_proj.weight": "model-0028-of-0032.safetensors", 
"model.layers.28.mlp.experts.54.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.55.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.55.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.56.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.56.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.57.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.57.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.58.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.58.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.59.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.59.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.60.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.60.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.61.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.61.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.62.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.62.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.63.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.experts.63.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.gate.wg.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.shared_mlp.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.mlp.shared_mlp.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-0028-of-0032.safetensors", "model.layers.28.self_attn.key_layernorm.weight": "model-0028-of-0032.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.self_attn.qkv_proj.weight": "model-0028-of-0032.safetensors", "model.layers.28.self_attn.query_layernorm.weight": "model-0028-of-0032.safetensors", "model.layers.29.input_layernorm.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.0.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.0.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.1.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.1.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.2.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.2.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.3.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.3.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.4.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.4.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.5.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.5.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.6.down_proj.weight": "model-0028-of-0032.safetensors", 
"model.layers.29.mlp.experts.6.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.7.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.7.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.8.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.8.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.9.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.9.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.10.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.10.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.11.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.11.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.12.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.12.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.13.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.13.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.14.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.14.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.15.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.15.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.16.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.16.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.17.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.17.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.18.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.18.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.19.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.19.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.20.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.20.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.21.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.21.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.22.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.22.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.23.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.23.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.24.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.24.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.25.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.25.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.26.down_proj.weight": "model-0028-of-0032.safetensors", 
"model.layers.29.mlp.experts.26.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.27.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.27.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.28.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.28.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.29.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.29.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.30.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.30.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.31.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.31.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.32.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.32.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.33.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.33.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.34.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.34.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.35.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.35.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.36.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.36.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.37.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.37.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.38.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.38.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.39.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.39.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.40.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.40.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.41.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.41.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.42.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.42.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.43.down_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.43.gate_and_up_proj.weight": "model-0028-of-0032.safetensors", "model.layers.29.mlp.experts.44.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.44.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.45.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.45.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.46.down_proj.weight": "model-0029-of-0032.safetensors", 
"model.layers.29.mlp.experts.46.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.47.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.47.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.48.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.48.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.49.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.49.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.50.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.50.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.51.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.51.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.52.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.52.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.53.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.53.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.54.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.54.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.55.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.55.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.56.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.56.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.57.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.57.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.58.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.58.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.59.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.59.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.60.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.60.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.61.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.61.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.62.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.62.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.63.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.experts.63.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.gate.wg.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.shared_mlp.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.mlp.shared_mlp.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-0029-of-0032.safetensors", "model.layers.29.self_attn.key_layernorm.weight": "model-0029-of-0032.safetensors", 
"model.layers.29.self_attn.o_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.self_attn.qkv_proj.weight": "model-0029-of-0032.safetensors", "model.layers.29.self_attn.query_layernorm.weight": "model-0029-of-0032.safetensors", "model.layers.30.input_layernorm.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.0.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.0.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.1.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.1.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.2.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.2.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.3.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.3.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.4.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.4.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.5.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.5.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.6.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.6.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.7.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.7.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.8.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.8.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.9.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.9.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.10.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.10.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.11.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.11.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.12.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.12.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.13.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.13.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.14.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.14.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.15.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.15.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.16.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.16.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.17.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.17.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.18.down_proj.weight": 
"model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.18.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.19.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.19.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.20.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.20.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.21.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.21.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.22.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.22.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.23.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.23.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.24.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.24.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.25.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.25.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.26.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.26.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.27.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.27.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.28.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.28.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.29.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.29.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.30.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.30.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.31.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.31.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.32.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.32.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.33.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.33.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.34.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.34.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.35.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.35.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.36.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.36.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.37.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.37.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.38.down_proj.weight": 
"model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.38.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.39.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.39.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.40.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.40.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.41.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.41.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.42.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.42.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.43.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.43.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.44.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.44.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.45.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.45.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.46.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.46.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.47.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.47.gate_and_up_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.48.down_proj.weight": "model-0029-of-0032.safetensors", "model.layers.30.mlp.experts.48.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.49.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.49.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.50.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.50.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.51.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.51.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.52.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.52.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.53.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.53.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.54.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.54.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.55.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.55.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.56.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.56.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.57.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.57.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.58.down_proj.weight": 
"model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.58.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.59.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.59.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.60.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.60.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.61.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.61.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.62.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.62.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.63.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.experts.63.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.gate.wg.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.shared_mlp.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.mlp.shared_mlp.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-0030-of-0032.safetensors", "model.layers.30.self_attn.key_layernorm.weight": "model-0030-of-0032.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.self_attn.qkv_proj.weight": "model-0030-of-0032.safetensors", "model.layers.30.self_attn.query_layernorm.weight": "model-0030-of-0032.safetensors", "model.layers.31.input_layernorm.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.0.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.0.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.1.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.1.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.2.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.2.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.3.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.3.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.4.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.4.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.5.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.5.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.6.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.6.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.7.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.7.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.8.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.8.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.9.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.9.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.10.down_proj.weight": 
"model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.10.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.11.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.11.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.12.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.12.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.13.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.13.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.14.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.14.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.15.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.15.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.16.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.16.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.17.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.17.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.18.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.18.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.19.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.19.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.20.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.20.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.21.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.21.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.22.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.22.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.23.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.23.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.24.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.24.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.25.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.25.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.26.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.26.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.27.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.27.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.28.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.28.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.29.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.29.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.30.down_proj.weight": 
"model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.30.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.31.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.31.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.32.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.32.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.33.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.33.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.34.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.34.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.35.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.35.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.36.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.36.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.37.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.37.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.38.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.38.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.39.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.39.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.40.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.40.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.41.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.41.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.42.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.42.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.43.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.43.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.44.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.44.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.45.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.45.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.46.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.46.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.47.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.47.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.48.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.48.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.49.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.49.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.50.down_proj.weight": 
"model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.50.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.51.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.51.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.52.down_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.52.gate_and_up_proj.weight": "model-0030-of-0032.safetensors", "model.layers.31.mlp.experts.53.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.53.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.54.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.54.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.55.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.55.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.56.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.56.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.57.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.57.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.58.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.58.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.59.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.59.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.60.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.60.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.61.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.61.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.62.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.62.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.63.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.experts.63.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.gate.wg.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.shared_mlp.down_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.mlp.shared_mlp.gate_and_up_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-0031-of-0032.safetensors", "model.layers.31.self_attn.key_layernorm.weight": "model-0031-of-0032.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.self_attn.qkv_proj.weight": "model-0031-of-0032.safetensors", "model.layers.31.self_attn.query_layernorm.weight": "model-0031-of-0032.safetensors", "model.ln_f.weight": "model-0031-of-0032.safetensors", "model.wte.weight": "model-0031-of-0032.safetensors", "patch_embed.model.0.bias": "model-0031-of-0032.safetensors", "patch_embed.model.0.weight": "model-0031-of-0032.safetensors", "patch_embed.model.1.emb_layers.1.bias": "model-0031-of-0032.safetensors", "patch_embed.model.1.emb_layers.1.weight": "model-0031-of-0032.safetensors", "patch_embed.model.1.in_layers.0.bias": "model-0031-of-0032.safetensors", 
"patch_embed.model.1.in_layers.0.weight": "model-0031-of-0032.safetensors", "patch_embed.model.1.in_layers.2.bias": "model-0031-of-0032.safetensors", "patch_embed.model.1.in_layers.2.weight": "model-0031-of-0032.safetensors", "patch_embed.model.1.out_layers.0.bias": "model-0031-of-0032.safetensors", "patch_embed.model.1.out_layers.0.weight": "model-0031-of-0032.safetensors", "patch_embed.model.1.out_layers.3.bias": "model-0031-of-0032.safetensors", "patch_embed.model.1.out_layers.3.weight": "model-0031-of-0032.safetensors", "patch_embed.model.1.skip_connection.bias": "model-0031-of-0032.safetensors", "patch_embed.model.1.skip_connection.weight": "model-0031-of-0032.safetensors", "time_embed.mlp.0.bias": "model-0031-of-0032.safetensors", "time_embed.mlp.0.weight": "model-0031-of-0032.safetensors", "time_embed.mlp.2.bias": "model-0031-of-0032.safetensors", "time_embed.mlp.2.weight": "model-0031-of-0032.safetensors", "time_embed_2.mlp.0.bias": "model-0031-of-0032.safetensors", "time_embed_2.mlp.0.weight": "model-0031-of-0032.safetensors", "time_embed_2.mlp.2.bias": "model-0031-of-0032.safetensors", "time_embed_2.mlp.2.weight": "model-0031-of-0032.safetensors", "timestep_emb.mlp.0.bias": "model-0031-of-0032.safetensors", "timestep_emb.mlp.0.weight": "model-0031-of-0032.safetensors", "timestep_emb.mlp.2.bias": "model-0031-of-0032.safetensors", "timestep_emb.mlp.2.weight": "model-0031-of-0032.safetensors", "timestep_r_emb.mlp.0.bias": "model-0031-of-0032.safetensors", "timestep_r_emb.mlp.0.weight": "model-0031-of-0032.safetensors", "timestep_r_emb.mlp.2.bias": "model-0031-of-0032.safetensors", "timestep_r_emb.mlp.2.weight": "model-0031-of-0032.safetensors", "vae.decoder.conv_in.bias": "model-0031-of-0032.safetensors", "vae.decoder.conv_in.weight": "model-0031-of-0032.safetensors", "vae.decoder.conv_out.bias": "model-0031-of-0032.safetensors", "vae.decoder.conv_out.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.k.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.k.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.norm.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.norm.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.proj_out.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.proj_out.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.q.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.q.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.v.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.attn_1.v.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.norm1.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_1.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.norm1.bias": 
"model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.mid.block_2.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.norm_out.bias": "model-0031-of-0032.safetensors", "vae.decoder.norm_out.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.norm1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.0.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.norm1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.1.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.norm1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.block.2.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.0.upsample.conv.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.0.upsample.conv.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.norm1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.0.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.conv1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.conv2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.norm1.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.norm1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.norm2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.1.norm2.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.2.conv1.bias": 
"model-0031-of-0032.safetensors", "vae.decoder.up.1.block.2.conv1.weight": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.2.conv2.bias": "model-0031-of-0032.safetensors", "vae.decoder.up.1.block.2.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.1.block.2.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.1.block.2.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.1.block.2.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.1.block.2.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.1.upsample.conv.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.1.upsample.conv.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.block.2.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.2.upsample.conv.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.2.upsample.conv.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.norm1.bias": 
"model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.block.2.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.3.upsample.conv.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.3.upsample.conv.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.conv1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.conv1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.conv2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.conv2.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.norm1.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.norm1.weight": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.norm2.bias": "model-0032-of-0032.safetensors", "vae.decoder.up.4.block.2.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.conv_in.bias": "model-0032-of-0032.safetensors", "vae.encoder.conv_in.weight": "model-0032-of-0032.safetensors", "vae.encoder.conv_out.bias": "model-0032-of-0032.safetensors", "vae.encoder.conv_out.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.norm2.bias": 
"model-0032-of-0032.safetensors", "vae.encoder.down.0.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.0.downsample.conv.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.0.downsample.conv.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.1.downsample.conv.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.1.downsample.conv.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.2.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.2.downsample.conv.bias": "model-0032-of-0032.safetensors", 
"vae.encoder.down.2.downsample.conv.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.3.downsample.conv.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.3.downsample.conv.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.0.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.down.4.block.1.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.k.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.k.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.norm.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.norm.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.proj_out.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.proj_out.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.q.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.q.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.v.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.attn_1.v.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.conv2.bias": 
"model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_1.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.conv1.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.conv1.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.conv2.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.conv2.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.norm1.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.norm1.weight": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.norm2.bias": "model-0032-of-0032.safetensors", "vae.encoder.mid.block_2.norm2.weight": "model-0032-of-0032.safetensors", "vae.encoder.norm_out.bias": "model-0032-of-0032.safetensors", "vae.encoder.norm_out.weight": "model-0032-of-0032.safetensors", "vision_aligner.layers.0.bias": "model-0032-of-0032.safetensors", "vision_aligner.layers.0.weight": "model-0032-of-0032.safetensors", "vision_aligner.layers.2.bias": "model-0032-of-0032.safetensors", "vision_aligner.layers.2.weight": "model-0032-of-0032.safetensors", "vision_model.embeddings.patch_embedding.bias": "model-0032-of-0032.safetensors", "vision_model.embeddings.patch_embedding.weight": "model-0032-of-0032.safetensors", "vision_model.embeddings.position_embedding.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.mlp.fc1.weight": "model-0032-of-0032.safetensors", 
"vision_model.encoder.layers.1.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.v_proj.bias": 
"model-0032-of-0032.safetensors", "vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.mlp.fc2.weight": 
"model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", 
"vision_model.encoder.layers.9.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", 
"vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.layer_norm1.bias": "model-0032-of-0032.safetensors", 
"vision_model.encoder.layers.14.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.k_proj.weight": 
"model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.layer_norm1.weight": 
"model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", 
"vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.layer_norm1.weight": "model-0032-of-0032.safetensors", 
"vision_model.encoder.layers.24.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.layer_norm1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.layer_norm1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.layer_norm2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.layer_norm2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.out_proj.bias": 
"model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-0032-of-0032.safetensors", "vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-0032-of-0032.safetensors", "vision_model.head.attention.in_proj_bias": "model-0032-of-0032.safetensors", "vision_model.head.attention.in_proj_weight": "model-0032-of-0032.safetensors", "vision_model.head.attention.out_proj.bias": "model-0032-of-0032.safetensors", "vision_model.head.attention.out_proj.weight": "model-0032-of-0032.safetensors", "vision_model.head.layernorm.bias": "model-0032-of-0032.safetensors", "vision_model.head.layernorm.weight": "model-0032-of-0032.safetensors", "vision_model.head.mlp.fc1.bias": "model-0032-of-0032.safetensors", "vision_model.head.mlp.fc1.weight": "model-0032-of-0032.safetensors", "vision_model.head.mlp.fc2.bias": "model-0032-of-0032.safetensors", "vision_model.head.mlp.fc2.weight": "model-0032-of-0032.safetensors", "vision_model.head.probe": "model-0032-of-0032.safetensors", "vision_model.post_layernorm.bias": "model-0032-of-0032.safetensors", "vision_model.post_layernorm.weight": "model-0032-of-0032.safetensors"}} \ No newline at end of file diff --git a/modeling_hunyuan_image_3.py b/modeling_hunyuan_image_3.py new file mode 100644 index 0000000000000000000000000000000000000000..b8c61e80f465fdf9990b4f167f1e36269f5c4426 --- /dev/null +++ b/modeling_hunyuan_image_3.py @@ -0,0 +1,3403 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import math +import random +import re +import time +import warnings +from dataclasses import dataclass +from typing import List, Union, Optional, Dict, Any, Tuple, Callable, TYPE_CHECKING +from datetime import datetime + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from PIL import Image +from einops import rearrange +from torch import Tensor +from torch import nn +from torch.cuda import nvtx + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, StaticCache +from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.streamers import TextStreamer +from transformers.generation.utils import GenerationMixin, GenerationConfig, ALL_CACHE_NAMES +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + logging, +) + +try: + import flashinfer +except Exception as e: + flashinfer = None + +#from .autoencoder_kl_3d import AutoencoderKLConv3D +from .autoencoder_kl_3d import AutoencoderKLConv3D_Dist, AutoencoderKLConv3D +from .configuration_hunyuan_image_3 import HunyuanImage3Config +from .hunyuan_image_3_pipeline import HunyuanImage3Text2ImagePipeline, FlowMatchDiscreteScheduler +from .image_processor import HunyuanImage3ImageProcessor +from .siglip2 import Siglip2VisionTransformer, LightProjector +from .tokenization_hunyuan_image_3 import HunyuanImage3TokenizerFast, ImageInfo, ImageTensor, CondImage +from .system_prompt import get_system_prompt + +from .cache_utils import TaylorCacheContainer, CacheWithFreqsContainer + +if TYPE_CHECKING: + from transformers.generation.streamers import BaseStreamer + +logger = logging.get_logger(__name__) + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func + +# Type aliases +BatchRaggedImages = Union[torch.Tensor, List[Union[torch.Tensor, List[torch.Tensor]]]] +BatchRaggedTensor = Union[torch.Tensor, List[torch.Tensor]] +InputImage = Optional[Union[Image.Image, str, bytes]] + + +def get_device(tensor: BatchRaggedImages): + if isinstance(tensor, torch.Tensor): + return tensor.device + elif isinstance(tensor, list): + return get_device(tensor[0]) + else: + raise ValueError(f"Unsupported type for get_device: {type(tensor)}") + + +_CONFIG_FOR_DOC = "HunyuanImage3Config" + +Hunyuan_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`HunyuanImage3Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
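+
+    Example (for illustration only; the checkpoint path is a placeholder, and loading this remote-code
+    repository through `AutoModelForCausalLM` with `trust_remote_code=True` is an assumed entry point
+    rather than something defined in this file):
+
+        from transformers import AutoModelForCausalLM
+        model = AutoModelForCausalLM.from_pretrained(
+            "path/to/HunyuanImage-3.0",  # hypothetical local checkpoint directory
+            trust_remote_code=True,
+            torch_dtype="auto",
+        )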
+""" + +# ======================================================= +# Helper Functions +# ======================================================= + +def default(val, d): + return val if val is not None else d + + +def to_device(data, device): + if device is None: + return data + if isinstance(data, torch.Tensor): + return data.to(device) + elif isinstance(data, list): + return [to_device(x, device) for x in data] + else: + return data + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def real_batched_index_select(t, dim, idx): + """ index_select for batched index and batched t """ + assert t.ndim >= 2 and idx.ndim >= 2, f"{t.ndim=} {idx.ndim=}" + assert len(t) == len(idx), f"{len(t)=} != {len(idx)=}" + return torch.stack([torch.index_select(t[i], dim - 1, idx[i]) for i in range(len(t))]) + + +# ======================================================= +# Module Functions +# ======================================================= + +def timestep_embedding(t, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + Args: + t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional. + dim (int): the dimension of the output. + max_period (int): controls the minimum frequency of the embeddings. + + Returns: + embedding (torch.Tensor): An (N, D) Tensor of positional embeddings. + + .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py + """ + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half + ).to(device=t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1 + ) + return embedding + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def normalization(channels, **kwargs): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: a nn.Module for normalization. 
+ """ + return nn.GroupNorm(32, channels, **kwargs) + + +def topkgating( + logits: Tensor, + topk: int, + group_limited_greedy: bool = False, + n_group: int = None, + topk_group: int = None, + norm_topk_prob: bool = True, + routed_scaling_factor: float = 1.0, + capacity_factor: float = 1.0, + drop_tokens: bool = False, +): + logits = logits.float() + gates = F.softmax(logits, dim=1) + + if group_limited_greedy: + group_shape = list(gates.shape[:-1]) + [n_group, gates.shape[-1] // n_group] + group_scores = ( + gates.reshape(group_shape).max(dim=-1).values + ) # [n, n_group] + group_idx = torch.topk( + group_scores, topk_group, dim=-1, sorted=False + )[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand( + group_shape + ) + .reshape(list(gates.shape)) + ) # [n, e] + gates = gates.masked_fill(~score_mask.bool(), 0.0) + + num_experts = int(gates.shape[1]) + # Top-k router probability and corresponding expert indices for each token. + # Shape: [tokens_per_group, num_selected_experts]. + expert_gate, expert_index = torch.topk(gates, topk) + expert_mask = F.one_hot(expert_index, num_experts) + # For a given token, determine if it was routed to a given expert. + # Shape: [tokens_per_group, num_experts] + expert_mask_aux = expert_mask.max(dim=-2)[0] + tokens_per_group_and_expert = torch.mean(expert_mask_aux.float(), dim=-2) + router_prob_per_group_and_expert = torch.mean(gates.float(), dim=-2) + l_aux = num_experts ** 2 * torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) + + if drop_tokens: + expert_capacity = int(max(topk, topk * gates.shape[0] // gates.shape[1]) * capacity_factor) + else: + expert_index_flat = expert_index.flatten() + tokens_per_expert = torch.bincount(expert_index_flat, minlength=num_experts) + expert_capacity = torch.max(tokens_per_expert).item() + + if norm_topk_prob and topk > 1: + gates_s = torch.clamp( + torch.matmul(expert_mask.float(), gates.unsqueeze(-1)).sum(dim=1), min=torch.finfo(gates.dtype).eps + ) + router_probs = gates / gates_s + else: + router_probs = gates * routed_scaling_factor + # Make num_selected_experts the leading axis to ensure that top-1 choices + # have priority over top-2 choices, which have priority over top-3 choices, + # etc. + expert_index = torch.transpose(expert_index, 0, 1) + # Shape: [num_selected_experts * tokens_per_group] + expert_index = expert_index.reshape(-1) + + # Create mask out of indices. + # Shape: [tokens_per_group * num_selected_experts, num_experts]. + expert_mask = F.one_hot(expert_index, num_experts).to(torch.int32) + exp_counts = torch.sum(expert_mask, dim=0).detach() + + # Experts have a fixed capacity that we cannot exceed. A token's priority + # within the expert's buffer is given by the masked, cumulative capacity of + # its target expert. + # Shape: [tokens_per_group * num_selected_experts, num_experts]. + token_priority = torch.cumsum(expert_mask, dim=0) * expert_mask - 1 + # Shape: [num_selected_experts, tokens_per_group, num_experts]. + token_priority = token_priority.reshape((topk, -1, num_experts)) + # Shape: [tokens_per_group, num_selected_experts, num_experts]. + token_priority = torch.transpose(token_priority, 0, 1) + # For each token, across all selected experts, select the only non-negative + # (unmasked) priority. Now, for group G routing to expert E, token T has + # non-negative priority (i.e. 
token_priority[G,T,E] >= 0) if and only if E + # is its targeted expert. + # Shape: [tokens_per_group, num_experts]. + token_priority = torch.max(token_priority, dim=1)[0] + + # Token T can only be routed to expert E if its priority is positive and + # less than the expert capacity. One-hot matrix will ignore indices outside + # the range [0, expert_capacity). + # Shape: [tokens_per_group, num_experts, expert_capacity]. + valid_mask = torch.logical_and(token_priority >= 0, token_priority < expert_capacity) + token_priority = torch.masked_fill(token_priority, ~valid_mask, 0) + dispatch_mask = F.one_hot(token_priority, expert_capacity).to(torch.bool) + valid_mask = valid_mask.unsqueeze(-1).expand(-1, -1, expert_capacity) + dispatch_mask = torch.masked_fill(dispatch_mask, ~valid_mask, 0) + + # The combine array will be used for combining expert outputs, scaled by the + # router probabilities. Shape: [num_groups, tokens_per_group, num_experts, + # expert_capacity]. + combine_weights = torch.einsum("...te,...tec->...tec", router_probs, dispatch_mask) + exp_counts_capacity = torch.sum(dispatch_mask) + exp_capacity_rate = exp_counts_capacity / (logits.shape[0] * topk) + + return [l_aux, exp_capacity_rate], combine_weights, dispatch_mask, exp_counts + + +# ======================================================= +# Multi-Dimensional RoPE +# ======================================================= + +def _to_tuple(x, dim=2): + if isinstance(x, int): + return (x,) * dim + elif len(x) == dim: + return x + else: + raise ValueError(f"Expected length {dim} or int, but got {x}") + + +def get_meshgrid_nd(start, *args, dim=2, device="cpu"): + """ + Get n-D meshgrid with start, stop and num. + + Args: + start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, + step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num + should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in + n-tuples. + *args: See above. + dim (int): Dimension of the meshgrid. Defaults to 2. + + Returns: + grid (np.ndarray): [dim, ...] 
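+
+    Example (illustrative; a 2 x 3 grid starting at the origin)::
+
+        grid = get_meshgrid_nd((2, 3))   # start=(0, 0), stop=(2, 3), num=(2, 3)
+        # grid.shape == (2, 2, 3); grid[0] holds the row (y) indices, grid[1] the column (x) indices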
+ """ + if len(args) == 0: + # start is grid_size + num = _to_tuple(start, dim=dim) + start = (0,) * dim + stop = num + elif len(args) == 1: + # start is start, args[0] is stop, step is 1 + start = _to_tuple(start, dim=dim) + stop = _to_tuple(args[0], dim=dim) + num = [stop[i] - start[i] for i in range(dim)] + # assert num are all integers + num_int = [int(x) for x in num] + assert (torch.tensor(num) == torch.tensor(num_int)).all(), f"num should be int, but got {num}" + num = num_int + elif len(args) == 2: + # start is start, args[0] is stop, args[1] is num + start = _to_tuple(start, dim=dim) # Left-Top eg: 12,0 + stop = _to_tuple(args[0], dim=dim) # Right-Bottom eg: 20,32 + num = _to_tuple(args[1], dim=dim) # Target Size eg: 32,124 + else: + raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}") + + # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False) + axis_grid = [] + for i in range(dim): + a, b, n = start[i], stop[i], num[i] + g = torch.linspace(a, b, n + 1, dtype=torch.float32, device=device)[:n] + axis_grid.append(g) + grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [H, W] + grid = torch.stack(grid, dim=0) # [dim, H, W] + + return grid + + +def build_2d_rope( + seq_len: int, n_elem: int, image_infos: Optional[List[Tuple[slice, Tuple[int, int]]]] = None, + device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0, + return_all_pos: bool = False, +): + """ + Reference: https://kexue.fm/archives/10352 + + Start from 1, we have + beta_y = L + (wh - h)/2 + beta_x = L + (wh - w)/2 + + Returns + ------- + cos: torch.Tensor with shape of [seq_len, n_elem] + sin: torch.Tensor with shape of [seq_len, n_elem] + """ + assert n_elem % 4 == 0, f"n_elem must be divisible by 4, but got {n_elem}." + + # theta + if base_rescale_factor != 1.0: + base *= base_rescale_factor ** (n_elem / (n_elem - 2)) + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) + theta = theta.reshape(1, n_elem // 4, 2) # [1, half_d, 2] + + # position indices + if image_infos is None: + image_infos = [] + + image_infos_list = [image_infos] + sample_seq_lens = [seq_len] + + # Prepare position indices for each sample + x_sections = [] + y_sections = [] + for sample_id, sample_image_infos in enumerate(image_infos_list): + last_pos = 0 + for sec_slice, (h, w) in sample_image_infos: + L = sec_slice.start # start from 0, so image_slice.start is just L + # previous text + if last_pos < L: + y_sections.append(torch.arange(last_pos, L, device=device)) + x_sections.append(torch.arange(last_pos, L, device=device)) + elif h is None: + # Interleave data has overlapped positions for tokens. + y_sections.append(torch.arange(sec_slice.start, sec_slice.stop, device=device)) + x_sections.append(torch.arange(sec_slice.start, sec_slice.stop, device=device)) + continue + else: + # Interleave data has overlapped positions for noised image and the successive clean image, + # leading to last_pos (= last text end L + noise w * h) > L (last text end L). 
+ pass + # current image + beta_y = L + (w * h - h) / 2 + beta_x = L + (w * h - w) / 2 + grid = get_meshgrid_nd((beta_y, beta_x), (beta_y + h, beta_x + w), device=device) # [2, h, w] + grid = grid.reshape(2, -1) # (y, x) + y_sections.append(grid[0]) + x_sections.append(grid[1]) + # step + last_pos = L + w * h + # final text + y_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id], device=device)) + x_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id], device=device)) + + x_pos = torch.cat(x_sections).long() + y_pos = torch.cat(y_sections).long() + # If there are overlap positions, we need to remove them. + x_pos = x_pos[:seq_len] + y_pos = y_pos[:seq_len] + all_pos = torch.stack((y_pos, x_pos), dim=1).unsqueeze(1).to(device) # [seq_len, 1, 2] + + # calc rope + idx_theta = (all_pos * theta).reshape(all_pos.shape[0], n_elem // 2).repeat(1, 2) + + cos = torch.cos(idx_theta) + sin = torch.sin(idx_theta) + + if return_all_pos: + return cos, sin, all_pos + + return cos, sin + + +def build_batch_2d_rope( + seq_len: int, n_elem: int, image_infos: Optional[List[List[Tuple[slice, Tuple[int, int]]]]] = None, + device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0, + return_all_pos: bool = False, +): + cos_list, sin_list, all_pos_list = [], [], [] + if image_infos is None: + image_infos = [None] + for i, image_info in enumerate(image_infos): + res = build_2d_rope( + seq_len, n_elem, image_infos=image_info, device=device, + base=base, base_rescale_factor=base_rescale_factor, + return_all_pos=return_all_pos, + ) + if return_all_pos: + cos, sin, all_pos = res + else: + cos, sin = res + all_pos = None + cos_list.append(cos) + sin_list.append(sin) + all_pos_list.append(all_pos) + + stacked_cos = torch.stack(cos_list, dim=0) + stacked_sin = torch.stack(sin_list, dim=0) + + if return_all_pos: + return stacked_cos, stacked_sin, all_pos_list + + return stacked_cos, stacked_sin + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass shifted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
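+
+    Example (an illustrative sketch; q, k, cos, sin are placeholder tensors):
+        # q, k: [batch_size, heads, seq_len, head_dim]; cos, sin: [batch_size, seq_len, head_dim]
+        # With unsqueeze_dim=1, cos/sin are broadcast over the heads dimension.
+        q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)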
+ """ + if position_ids is not None: + cos = cos[position_ids] + sin = sin[position_ids] + + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# ======================================================= +# Modules for Image Generation +# ======================================================= + +class TimestepEmbedder(nn.Module): + """ + Embeds scalar timesteps into vector representations. + """ + def __init__(self, + hidden_size, + act_layer=nn.GELU, + frequency_embedding_size=256, + max_period=10000, + out_size=None, + dtype=None, + device=None + ): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + self.frequency_embedding_size = frequency_embedding_size + self.max_period = max_period + if out_size is None: + out_size = hidden_size + + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs), + act_layer(), + nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs), + ) + nn.init.normal_(self.mlp[0].weight, std=0.02) + nn.init.normal_(self.mlp[2].weight, std=0.02) + + def forward(self, t): + t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1, **factory_kwargs) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" + ) + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, self.channels, self.out_channels, 3, stride=stride, padding=1, **factory_kwargs + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(nn.Module): + """ + A residual block that can optionally change the number of channels. 
+ + :param in_channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + in_channels, + emb_channels, + out_channels=None, + dropout=0.0, + use_conv=False, + dims=2, + up=False, + down=False, + device=None, + dtype=None, + ): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + self.in_channels = in_channels + self.dropout = dropout + self.out_channels = out_channels or self.in_channels + self.use_conv = use_conv + + self.in_layers = nn.Sequential( + normalization(self.in_channels, **factory_kwargs), + nn.SiLU(), + conv_nd(dims, self.in_channels, self.out_channels, 3, padding=1, **factory_kwargs), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(self.in_channels, False, dims, **factory_kwargs) + self.x_upd = Upsample(self.in_channels, False, dims, **factory_kwargs) + elif down: + self.h_upd = Downsample(self.in_channels, False, dims, **factory_kwargs) + self.x_upd = Downsample(self.in_channels, False, dims, **factory_kwargs) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear(emb_channels, 2 * self.out_channels, **factory_kwargs) + ) + + self.out_layers = nn.Sequential( + normalization(self.out_channels, **factory_kwargs), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1, **factory_kwargs) + ), + ) + + if self.out_channels == self.in_channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, self.in_channels, self.out_channels, 3, padding=1, **factory_kwargs + ) + else: + self.skip_connection = conv_nd(dims, self.in_channels, self.out_channels, 1, **factory_kwargs) + + def forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + + emb_out = self.emb_layers(emb) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + + # Adaptive Group Normalization + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = torch.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1. 
+ scale) + shift + h = out_rest(h) + + return self.skip_connection(x) + h + + +class UNetDown(nn.Module): + """ + patch_size: one of [1, 2 ,4 ,8] + in_channels: vae latent dim + hidden_channels: hidden dim for reducing parameters + out_channels: transformer model dim + """ + def __init__(self, patch_size, in_channels, emb_channels, hidden_channels, out_channels, + dropout=0.0, device=None, dtype=None): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + + self.patch_size = patch_size + assert self.patch_size in [1, 2, 4, 8] + + self.model = nn.ModuleList( + [conv_nd( + 2, + in_channels=in_channels, + out_channels=hidden_channels, + kernel_size=3, + padding=1, + **factory_kwargs + )] + ) + + if self.patch_size == 1: + self.model.append(ResBlock( + in_channels=hidden_channels, + emb_channels=emb_channels, + out_channels=out_channels, + dropout=dropout, + **factory_kwargs + )) + else: + for i in range(self.patch_size // 2): + self.model.append(ResBlock( + in_channels=hidden_channels, + emb_channels=emb_channels, + out_channels=hidden_channels if (i + 1) * 2 != self.patch_size else out_channels, + dropout=dropout, + down=True, + **factory_kwargs + )) + + def forward(self, x, t): + assert x.shape[2] % self.patch_size == 0 and x.shape[3] % self.patch_size == 0 + for module in self.model: + if isinstance(module, ResBlock): + x = module(x, t) + else: + x = module(x) + _, _, token_h, token_w = x.shape + x = rearrange(x, 'b c h w -> b (h w) c') + return x, token_h, token_w + + +class UNetUp(nn.Module): + """ + patch_size: one of [1, 2 ,4 ,8] + in_channels: transformer model dim + hidden_channels: hidden dim for reducing parameters + out_channels: vae latent dim + """ + def __init__(self, patch_size, in_channels, emb_channels, hidden_channels, out_channels, + dropout=0.0, device=None, dtype=None, out_norm=False): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + + self.patch_size = patch_size + assert self.patch_size in [1, 2, 4, 8] + + self.model = nn.ModuleList() + + if self.patch_size == 1: + self.model.append(ResBlock( + in_channels=in_channels, + emb_channels=emb_channels, + out_channels=hidden_channels, + dropout=dropout, + **factory_kwargs + )) + else: + for i in range(self.patch_size // 2): + self.model.append(ResBlock( + in_channels=in_channels if i == 0 else hidden_channels, + emb_channels=emb_channels, + out_channels=hidden_channels, + dropout=dropout, + up=True, + **factory_kwargs + )) + + if out_norm: + self.model.append(nn.Sequential( + normalization(hidden_channels, **factory_kwargs), + nn.SiLU(), + conv_nd( + 2, + in_channels=hidden_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + **factory_kwargs + ), + )) + else: + self.model.append(conv_nd( + 2, + in_channels=hidden_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + **factory_kwargs + )) + + # batch_size, seq_len, model_dim + def forward(self, x, t, token_h, token_w): + x = rearrange(x, 'b (h w) c -> b c h w', h=token_h, w=token_w) + for module in self.model: + if isinstance(module, ResBlock): + x = module(x, t) + else: + x = module(x) + return x + + +# ======================================================= +# Modules for Transformer Backbone +# ======================================================= + +@dataclass +class CausalMMOutputWithPast(CausalLMOutputWithPast): + diffusion_prediction: Optional[torch.Tensor] = None + + +class HunyuanStaticCache(StaticCache): + """ + A custom static cache for multi-modal models that supports dynamic 
extension of the cache + and inplace updates of the cache. + + This cache supports batch cache_position updates. + """ + def __init__(self, *args, **kwargs): + self.dynamic = kwargs.pop("dynamic", False) + super().__init__(*args, **kwargs) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + It is VERY important to index using a tensor, otherwise you introduce a copy to the device. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input + to know how where to write in the cache. + + Return: + A tuple containing the updated key and value states. + """ + cache_position = cache_kwargs.get("cache_position") + if self.layers[layer_idx].keys is None: + self.layers[layer_idx].lazy_initialization(key_states) + k_out = self.layers[layer_idx].keys + v_out = self.layers[layer_idx].values + + if cache_position is None: + k_out.copy_(key_states) + v_out.copy_(value_states) + else: + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place + # operation, that avoids copies and uses less memory. + if cache_position.dim() == 1: + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + + if self.dynamic: + end = cache_position[-1].item() + 1 + k_out = k_out[:, :, :end] + v_out = v_out[:, :, :end] + else: + assert cache_position.dim() == 2, f"multiple batch dims not yet {cache_position.shape=}" + batch_size, idx_size = cache_position.shape + assert batch_size == k_out.size(0) + assert batch_size == v_out.size(0) + assert batch_size == key_states.size(0) + assert batch_size == value_states.size(0) + for i in range(batch_size): + unbatched_dim = 1 + k_out[i].index_copy_(unbatched_dim, cache_position[i], key_states[i]) + v_out[i].index_copy_(unbatched_dim, cache_position[i], value_states[i]) + + if self.dynamic: + assert len(cache_position) == 1 + end = cache_position[0, -1].item() + 1 + k_out = k_out[:, :, :end] + v_out = v_out[:, :, :end] + + return k_out, v_out + + +class CachedRoPE(object): + """ A 2D RoPE is determined by rope_image_info and seq_len. """ + + def __init__(self, config): + self.config = config + self.cos_cache = None + self.sin_cache = None + self.seq_len = None + self.rope_image_info = None + + def __call__(self, seq_len, device, rope_image_info=None, position_ids=None): + """ Get cached RoPE for given seq_len and rope_image_info. + If cache miss, compute and cache it. + + Args: + seq_len (int): The sequence length. + device (torch.device): The device to store the RoPE. + rope_image_info (list): The rope image info. list of lists of (slice, (height, width)) tuples. + position_ids (torch.Tensor): The input positions. + + Returns: + The RoPE cos and sin tensors. 
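+
+        Example (illustrative; mirrors how the model invokes this cache internally):
+            rope = CachedRoPE(config)
+            cos, sin = rope(seq_len, device, rope_image_info=rope_image_info,
+                            position_ids=position_ids)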
+ """ + if (self.seq_len != seq_len) or (rope_image_info is not None and self.rope_image_info != rope_image_info): + # Cache miss, compute RoPE + if self.config.rope_type in ["2d", "default"]: + self.cos_cache, self.sin_cache = build_batch_2d_rope( + image_infos=rope_image_info, + seq_len=seq_len, + n_elem=self.config.attention_head_dim, + device=device, + base=self.config.rope_theta, + ) + else: + raise NotImplementedError(f"rope_type `{self.config.rope_type}` not supported") + else: + # hit cache + pass + + if position_ids is None: + # Typically for training + cos, sin = self.cos_cache, self.sin_cache + else: + # Typically for inference + assert position_ids.dim() == 2, f"{position_ids.shape=}" + head_size = self.cos_cache.size(-1) + cos = torch.gather(self.cos_cache, dim=1, index=position_ids.unsqueeze(-1).expand(-1, -1, head_size)) + sin = torch.gather(self.sin_cache, dim=1, index=position_ids.unsqueeze(-1).expand(-1, -1, head_size)) + + return cos, sin + + +class HunyuanRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6, cast_weight_fp32=False): + """ + HunyuanRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + self.cast_weight_fp32 = cast_weight_fp32 + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + if self.cast_weight_fp32: + return (self.weight.float() * hidden_states).to(input_dtype) + else: + return self.weight * hidden_states.to(input_dtype) + + +class HunyuanMLP(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx=None, is_shared_mlp=False, is_moe=False): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.hidden_act = config.hidden_act + + self.intermediate_size = config.intermediate_size + if is_shared_mlp or is_moe: + # 如果是 moe 的话,优先用 moe_intermediate_size + if config.moe_intermediate_size is not None: + self.intermediate_size = config.moe_intermediate_size \ + if isinstance(config.moe_intermediate_size, int) else config.moe_intermediate_size[layer_idx] + + if is_shared_mlp: + num_shared_expert = config.num_shared_expert \ + if isinstance(config.num_shared_expert, int) else config.num_shared_expert[layer_idx] + self.intermediate_size *= num_shared_expert + + self.act_fn = ACT2FN[config.hidden_act] + if self.hidden_act == "silu": + self.intermediate_size *= 2 # SwiGLU + self.gate_and_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size // 2, self.hidden_size, bias=config.mlp_bias) + elif self.hidden_act == "gelu": + self.gate_and_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + else: + assert False, "other hidden_act are not supported" + + def forward(self, x): + if self.hidden_act == "silu": + gate_and_up_proj = self.gate_and_up_proj(x) + x1, x2 = gate_and_up_proj.chunk(2, dim=2) + down_proj = self.down_proj(x1 * self.act_fn(x2)) + return down_proj + elif self.hidden_act == "gelu": + intermediate = self.gate_and_up_proj(x) + intermediate = self.act_fn(intermediate) + output = self.down_proj(intermediate) + return output + else: + assert False, "other hidden_act 
are not supported" + + +class HunyuanTopKGate(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.moe_topk = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx] + self.drop_tokens = config.moe_drop_tokens + self.min_capacity = 8 + self.random_routing_dropped_token = config.moe_random_routing_dropped_token + num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32) + + # DeepSeek gating args + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + self.group_limited_greedy = config.group_limited_greedy + + def forward(self, hidden_states, topk_impl='default'): + bsz, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_size) + if self.wg.weight.dtype == torch.float32: + hidden_states = hidden_states.float() + logits = self.wg(hidden_states) + if topk_impl == 'default': + gate_output = topkgating(logits, self.moe_topk, group_limited_greedy=self.group_limited_greedy, + n_group=self.n_group, topk_group=self.topk_group, + norm_topk_prob=self.norm_topk_prob, + routed_scaling_factor=self.routed_scaling_factor, + capacity_factor=self.config.capacity_factor, + drop_tokens=self.drop_tokens) + elif topk_impl == 'easy': + gate_output = self.easy_topk(logits, self.moe_topk) + else: + raise ValueError(f"Unsupported topk_impl: {topk_impl}") + + return gate_output + + @staticmethod + def easy_topk(logits, moe_topk): + gates = F.softmax(logits, dim=1) + topk_weight_1, expert_index = torch.topk(gates, moe_topk) + weight_sums = topk_weight_1.sum(dim=1, keepdim=True) + weight_sums = torch.clamp(weight_sums, min=1e-8) + topk_weight = topk_weight_1 / weight_sums + + return topk_weight, expert_index + + +class HunyuanMoE(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.moe_topk = config.moe_topk + self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + if config.use_mixed_mlp_moe: + self.shared_mlp = HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=True) + self.gate = HunyuanTopKGate(config, layer_idx=layer_idx) + self.experts = nn.ModuleList( + [HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False, is_moe=True) for _ in range(self.num_experts)] + ) + + self._moe_impl = config.moe_impl + # For FlashInfer + self.moe_weight = None + self.moe_weight_2 = None + self._weights_initialized = False + + @property + def moe_impl(self): + return self._moe_impl + + @moe_impl.setter + def moe_impl(self, value): + self._moe_impl = value + if self._moe_impl == "flashinfer": + assert flashinfer is not None, "When using fused_moe, flashinfer must be installed." 
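+
+    # Usage sketch (illustrative): the routing backend can be switched after construction, e.g.
+    #     moe_module.moe_impl = "flashinfer"   # requires flashinfer to be installed
+    # in which case forward() dispatches tokens through flashinfer's fused
+    # cutlass_fused_moe kernel instead of the per-expert loop below.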
+ + def forward(self, hidden_states): + torch.cuda.set_device(hidden_states.device.index) + bsz, seq_len, hidden_size = hidden_states.shape + + if self.config.use_mixed_mlp_moe: + hidden_states_mlp = self.shared_mlp(hidden_states) + + reshaped_input = hidden_states.reshape(-1, hidden_size) # [bsz*seq_len, hidden_size] + + with nvtx.range("MoE"): + if self._moe_impl == "flashinfer": + # Get expert weights + if not self._weights_initialized: + self._initialize_weights_on_device(hidden_states.device) + topk_weight, topk_index = self.gate(hidden_states, topk_impl='easy') + + combined_output = torch.zeros_like(reshaped_input) + _ = flashinfer.fused_moe.cutlass_fused_moe( # noqa + reshaped_input.contiguous(), + topk_index.to(torch.int).contiguous(), + topk_weight.to(torch.float).contiguous(), + self.moe_weight, + self.moe_weight_2, + torch.bfloat16, + output=combined_output, + quant_scales=None, + ) + else: + # Original implementation - fallback for compatibility + l_moe, combine_weights, dispatch_mask, exp_counts = self.gate(hidden_states, topk_impl='default') + dispatched_input = torch.einsum("sec,sm->ecm", dispatch_mask.type_as(hidden_states), reshaped_input) + chunks = dispatched_input.chunk(self.num_experts, dim=0) + expert_outputs = [] + for chunk, expert in zip(chunks, self.experts): + expert_outputs.append(expert(chunk)) + + expert_output = torch.cat(expert_outputs, dim=0) + combined_output = torch.einsum("sec,ecm->sm", combine_weights.type_as(hidden_states), expert_output) + + combined_output = combined_output.reshape(bsz, seq_len, hidden_size) + + if self.config.use_mixed_mlp_moe: + output = hidden_states_mlp + combined_output # noqa + else: + output = combined_output + + return output + + def _initialize_weights_on_device(self, device): + expert_weights_gate_up = [] + expert_weights_down = [] + + for expert in self.experts: + expert.to(device) + expert_weights_gate_up.append(expert.gate_and_up_proj.weight.to(device)) + expert_weights_down.append(expert.down_proj.weight.to(device)) + + self.moe_weight = torch.stack(expert_weights_gate_up).contiguous() + self.moe_weight_2 = torch.stack(expert_weights_down).contiguous() + # empty the expert weights + for expert in self.experts: + expert.gate_and_up_proj.weight.data = torch.empty(0, device=device) + if expert.gate_and_up_proj.bias is not None: + expert.gate_and_up_proj.bias.data = torch.empty(0, device=device) + expert.down_proj.weight.data = torch.empty(0, device=device) + if expert.down_proj.bias is not None: + expert.down_proj.bias.data = torch.empty(0, device=device) + + self._weights_initialized = True + + +class HunyuanImage3SDPAAttention(nn.Module): + """PyTorch SDPA attention implementation using torch.nn.functional.scaled_dot_product_attention""" + + def __init__(self, config: HunyuanImage3Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_type = 'self' + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + # self.head_dim = self.hidden_size // self.num_heads + self.head_dim: int = config.attention_head_dim + self.num_key_value_heads = config.num_key_value_heads if config.num_key_value_heads else self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.use_qk_norm = config.use_qk_norm + self.use_rotary_pos_emb = 
config.use_rotary_pos_emb + self.hidden_size_q = self.head_dim * self.num_heads + self.hidden_size_kv = self.head_dim * self.num_key_value_heads + + # define layers + self.qkv_proj = nn.Linear( + self.hidden_size, + self.hidden_size_q + 2 * self.hidden_size_kv, + bias=config.attention_bias + ) + self.o_proj = nn.Linear(self.hidden_size_q, self.hidden_size, bias=config.attention_bias) + + if self.use_qk_norm: + self.query_layernorm = HunyuanRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.key_layernorm = HunyuanRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + if self.use_rotary_pos_emb: + self._init_rope() + + def _init_rope(self): + scaling_type = self.config.rope_scaling["type"] + if scaling_type == "custom": + # Using custom rotary embedding + self.rotary_emb = None + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.reshape(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: Optional[bool] = False, + custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: + if output_attentions: + raise NotImplementedError( + 'HunyuanImage3Model is using HunyuanImage3SDPAAttention,' + 'but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`.' + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.qkv_proj(hidden_states) + qkv_states = qkv_states.reshape(bsz, q_len, self.num_key_value_heads, self.num_key_value_groups + 2, + self.head_dim) + query_states, key_states, value_states = torch.split(qkv_states, [self.num_key_value_groups, 1, 1], dim=3) + + query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if self.use_rotary_pos_emb: + cos, sin = custom_pos_emb + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if self.use_qk_norm: + query_states = self.query_layernorm(query_states) + key_states = self.key_layernorm(key_states) + + query_states = query_states.to(value_states.dtype) + key_states = key_states.to(value_states.dtype) + + if past_key_value is not None: + cache_kwargs = {"cache_position": position_ids} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + query_states = query_states.to(key_states.dtype) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with + # custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
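+        # Making query/key/value contiguous below works around that issue, at the cost of a possible copy.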
+ if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=0.0 + ) + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +Hunyuan_ATTENTION_CLASSES = { + "eager": HunyuanImage3SDPAAttention, + "sdpa": HunyuanImage3SDPAAttention, +} + + +class HunyuanImage3DecoderLayer(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.layer_idx = layer_idx + + attn_impl = config._attn_implementation # noqa + if attn_impl in Hunyuan_ATTENTION_CLASSES: + self.self_attn = Hunyuan_ATTENTION_CLASSES[attn_impl](config=config, layer_idx=layer_idx) + else: + raise ValueError(f"Unsupported attention implementation: {attn_impl}") + + if ((isinstance(config.num_experts, int) and config.num_experts > 1) or ( + isinstance(config.num_experts, list) and max( + config.num_experts) > 1)) and layer_idx >= config.moe_layer_num_skipped: + self.mlp = HunyuanMoE(config, layer_idx=layer_idx) + else: + self.mlp = HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False, is_moe=False) + if config.norm_type == 'hf_rms' or config.norm_type == 'rms': + self.input_layernorm = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + elif config.norm_type == 'fused' or config.norm_type == 'torch_nn': + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + assert False, "other norm_type are not supported" + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor | Any]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + position_ids (`torch.LongTensor`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + custom_pos_emb (`Tuple[torch.FloatTensor]`, *optional*): custom position embedding for rotary + position embedding + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use " + "`attention_mask` instead.`" + ) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + custom_pos_emb=custom_pos_emb, + **kwargs, + ) + hidden_states = residual + hidden_states + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +@add_start_docstrings( + "The bare Hunyuan Image 3 Model outputting raw hidden-states without any specific head on top.", + Hunyuan_START_DOCSTRING, +) +class HunyuanImage3PreTrainedModel(PreTrainedModel): + config_class = HunyuanImage3Config + base_model_prefix = "" + supports_gradient_checkpointing = True + _no_split_modules = ["HunyuanImage3DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +Hunyuan_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Hunyuan Model outputting raw hidden-states without any specific head on top.", + Hunyuan_START_DOCSTRING, +) +class HunyuanImage3Model(HunyuanImage3PreTrainedModel): + def __init__(self, config: HunyuanImage3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.add_classification_head = config.add_classification_head + self.wte = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [HunyuanImage3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + if not config.add_classification_head: + self.ln_f = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + self.shared_tensor = None + + @add_start_docstrings_to_model_forward(Hunyuan_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None, + mode: str = "gen_text", + first_step: Optional[bool] = None, + post_token_len: int = None, + num_image_tokens: int = None, + gen_timestep_scatter_index: Optional[torch.Tensor] = None, + num_special_tokens: int = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for layer_idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + custom_pos_emb=custom_pos_emb, + mode=mode, + first_step=first_step, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if not self.add_classification_head: + # Do ln_f outside of the model for compatibility with image generation. 
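+            # ln_f is still created in __init__; callers are expected to apply it themselves when needed.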
+ pass + # hidden_states = self.ln_f(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class HunyuanImage3ForCausalMM(HunyuanImage3PreTrainedModel, GenerationMixin): + def __init__(self, config: HunyuanImage3Config, skip_load_module:set[str]={}, use_dist_vae=False, wgt_path=""): + """HunyuanImage3ForCausalMM + + Args: + config (HunyuanImage3Config): model config to initialize the model + skip_load_module (set[str], optional): + modules to skip loading, used for vllm inference. Defaults to {}. + + Raises: + ValueError: if config is invalid + """ + super().__init__(config) + self.config = config + self._tokenizer: Optional[HunyuanImage3TokenizerFast] = None + + #self.generation_config = GenerationConfig.from_model_config(config) + + # Initialize image preprocessor (for conditional images) + self.image_processor = HunyuanImage3ImageProcessor(config) + + if 'all' in skip_load_module: + skip_load_module = { + 'vae', + 'vit', + 'timestep_emb', + 'patch_embed', + 'time_embed', + 'final_layer', + 'time_embed_2', + 'transformers', + } + if 'vae' not in skip_load_module: + # vae and gen_image pipeline + if not use_dist_vae: + self.vae = AutoencoderKLConv3D.from_config(config.vae) + self.vae_dtype = getattr(torch, config.vae_dtype) + self.vae_autocast_dtype = getattr(torch, config.vae_autocast_dtype) + self.vae = self.vae.eval() + for param in self.vae.parameters(): + param.requires_grad = False # + else: + self.vae = AutoencoderKLConv3D_Dist.from_config(config.vae) + self.vae_dtype = getattr(torch, config.vae_dtype) + self.vae_autocast_dtype = getattr(torch, config.vae_autocast_dtype) + self.vae.create_dist(wgt_path, config.vae) + self._pipeline = None + + if 'vit' not in skip_load_module: + # vit + self.vision_model = Siglip2VisionTransformer(config.vit) + self.vision_aligner = LightProjector(config.vit_aligner) + + if 'timestep_emb' not in skip_load_module: + # image generation related + self.timestep_emb = TimestepEmbedder(hidden_size=config.hidden_size) + + if self.config.cfg_distilled: + self.guidance_emb = TimestepEmbedder(hidden_size=config.hidden_size) + if self.config.use_meanflow: + self.timestep_r_emb = TimestepEmbedder(hidden_size=config.hidden_size) + + if config.img_proj_type == "unet": + if 'patch_embed' not in skip_load_module: + self.patch_embed = UNetDown( + patch_size=config.patch_size, + emb_channels=config.hidden_size, + in_channels=config.vae["latent_channels"], + hidden_channels=config.patch_embed_hidden_dim, + out_channels=config.hidden_size, + ) + if 'time_embed' not in skip_load_module: + self.time_embed = TimestepEmbedder(hidden_size=config.hidden_size) + + if 'final_layer' not in skip_load_module: + self.final_layer = UNetUp( + patch_size=config.patch_size, + emb_channels=config.hidden_size, + in_channels=config.hidden_size, + hidden_channels=config.patch_embed_hidden_dim, + out_channels=config.vae["latent_channels"], + out_norm=True, + ) + if 'time_embed_2' not in skip_load_module: + self.time_embed_2 = TimestepEmbedder(hidden_size=config.hidden_size) + else: + raise ValueError(f"Unknown img_proj_type 
{config.img_proj_type}") + + if 'transformers' not in skip_load_module: + # transformer backbone + self.model = HunyuanImage3Model(config) + # linear head + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.pad_id = config.pad_id + self.vocab_size = config.vocab_size + + # Taylor Cache + self.use_taylor_cache = False + + self.num_image_tokens = None + self.num_special_tokens = None + # Initialize cached rope, supporting automatic cache update + self.cached_rope = CachedRoPE(config) + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def from_config(cls, config: HunyuanImage3Config, skip_load_module:set[str]={}): + return cls(config, skip_load_module=skip_load_module) + + @property + def tokenizer(self): + if self._tokenizer is None: + raise ValueError("Attribute `tokenizer` has not been initialized yet. Please set it first.") + return self._tokenizer + + def load_tokenizer(self, tokenizer): + self._tokenizer = HunyuanImage3TokenizerFast.from_pretrained(tokenizer) + + @property + def pipeline(self): + if self._pipeline is None: + self.scheduler = FlowMatchDiscreteScheduler( + shift=self.generation_config.flow_shift, reverse=True, solver="euler", + ) + self._pipeline = HunyuanImage3Text2ImagePipeline( + model=self, scheduler=self.scheduler, vae=self.vae, + ) + return self._pipeline + + def instantiate_vae_image_tokens( + self, + hidden_states: torch.Tensor, + timesteps: BatchRaggedTensor, + images: BatchRaggedImages, + image_mask: torch.Tensor, + guidance: torch.Tensor = None, + timesteps_r: torch.Tensor = None, + ): + """ + Instantiate the VAE image embeddings into the input embedding sequence. + + Args: + hidden_states: input sequence, (batch_size, seq_len, n_embd) + images: BatchRaggedImages + images can be a 4-D tensor, or a list of 4-D tensors, or a list of lists of 3-D tensors. 
+ timesteps: BatchRaggedTensor + ts can be a 1-D tensor, or a list of 1-D tensors + image_mask: (batch_size, seq_len) + """ + if hidden_states is None: + # Only for inference in non-first step image generation + t_emb = self.time_embed(timesteps) + image_emb = self.patch_embed(images, t_emb)[0] + timestep_emb = self.timestep_emb(timesteps).reshape(images.size(0), -1, self.config.hidden_size) + cat_list = [timestep_emb, image_emb] + + if guidance is not None: + guidance_src = self.guidance_emb(guidance.reshape(-1)) # (bsz * n, n_embd) + guidance_emb = guidance_src.reshape(images.size(0), -1, self.config.hidden_size) + if timesteps_r is not None: + timesteps_r_src = self.timestep_r_emb(timesteps_r.reshape(-1)) # (bsz * n, n_embd) + timesteps_r_emb = timesteps_r_src.reshape(images.size(0), -1, self.config.hidden_size) + + if guidance is not None and timesteps_r is not None: + cat_list = [timestep_emb, guidance_emb, timesteps_r_emb, image_emb] + elif guidance is not None: + cat_list = [timestep_emb, guidance_emb, image_emb] + elif timesteps_r is not None: + cat_list = [timestep_emb, timesteps_r_emb, image_emb] + hidden_states = torch.cat(cat_list, dim=1) + return hidden_states + + bsz, seqlen, n_embd = hidden_states.shape + assert isinstance(images, (torch.Tensor, list)), f"images should be BatchRaggedImages, got {type(images)}" + + if isinstance(images, torch.Tensor): + assert images.ndim == 4, f"images should be a 4-D tensor, got {images.ndim}-D tensor" + assert isinstance(timesteps, torch.Tensor), f"timesteps should be 1-D tensor, got {type(timesteps)}" + + bsz, seqlen, n_embd = hidden_states.shape + index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1) # (bsz, seqlen) + t_emb = self.time_embed(timesteps) # (bsz, n_embd) + image_seq, token_h, token_w = self.patch_embed(images, t_emb) # (bsz, num_patches, n_embd) + image_scatter_index = index.masked_select(image_mask.bool()).reshape(bsz, -1) # (bsz, num_patches) + hidden_states.scatter_( + dim=1, + index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_seq, + ) + + else: # list + index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1) # (bsz, seqlen) + for i, (image_i, t_i) in enumerate(zip(images, timesteps)): + t_i_emb = self.time_embed(t_i) # (n_i, n_embd) + + if isinstance(image_i, torch.Tensor): + image_i_seq, _, _ = self.patch_embed(image_i, t_i_emb) # (n_i, num_patches, n_embd) + + elif isinstance(image_i, list): + image_i_seq_list = [] + for j in range(len(image_i)): + image_ij = image_i[j].unsqueeze(0) + assert image_ij.ndim == 4, \ + f"image_ij should have size of (1, C, H, W), got {list(image_ij.size())}" + image_i_seq_j = self.patch_embed(image_ij, t_i_emb[j:j + 1])[0] # (1, num_patches, n_embd) + image_i_seq_list.append(image_i_seq_j) + image_i_seq = torch.cat(image_i_seq_list, dim=1) # (1, Σj num_patches_j, n_embd) + + else: + raise TypeError(f"image_i should be a torch.Tensor or a list, got {type(image_i)}") + + image_i_index = index[i:i + 1].masked_select(image_mask[i:i + 1].bool()).reshape(1, -1) # (1, img_seqlen) + hidden_states[i:i + 1].scatter_( + dim=1, + index=image_i_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_i_seq.reshape(1, -1, n_embd), # (1, img_seqlen, n_embd) + ) + + return hidden_states + + def _forward_vision_encoder(self, images, **image_kwargs): + image_embeds = self.vision_model(images, **image_kwargs).last_hidden_state + image_embeds = self.vision_aligner(image_embeds) + + return image_embeds + + def 
instantiate_vit_image_tokens( + self, + hidden_states: torch.Tensor, + images: torch.Tensor | list[torch.Tensor], + image_masks: torch.Tensor, + image_kwargs: dict[str, torch.Tensor], + ): + """ + Encode images using vision encoder(vit), and then instantiate the image embeddings into + the input embedding sequence. + + Args: + hidden_states (torch.Tensor): input sequence, (bsz, seqlen, n_embd) + images (torch.Tensor | list[torch.Tensor]): images can be a 3-D or 4-D tensor, or a list of tensors. + image_masks (torch.Tensor): mask for the images, (bsz, seqlen) + image_kwargs (dict[str, torch.Tensor]): additional keyword arguments for the image encoder + + Returns: + Instantiated input sequence + """ + bsz, seqlen, n_embd = hidden_states.shape + index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1) + + if isinstance(images, torch.Tensor): + assert images.ndim in [3, 4, 5], f"images should be a 3-D, 4-D, or 5-D tensor, got {images.ndim}-D tensor." + if images.ndim in [4, 5]: + bsz, n = images.shape[:2] + images = images.view(bsz * n, *images.shape[2:]) + image_kwargs = image_kwargs if image_kwargs is not None else {} + for k, v in image_kwargs.items(): + image_kwargs[k] = v.reshape(bsz * n, *v.shape[2:]) + else: + n = 1 + image_embeds = self._forward_vision_encoder(images, **image_kwargs) + image_seqlen = image_embeds.size(1) + + image_scatter_index = index.masked_select(image_masks.bool()).reshape(bsz, -1) + hidden_states.scatter_( + dim=1, + index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_embeds.reshape(bsz, n * image_seqlen, n_embd), + ) + + elif isinstance(images, list): + for i, (image, image_mask) in enumerate(zip(images, image_masks)): + cur_kwargs = {k: v[i] for k, v in image_kwargs.items()} if image_kwargs is not None else {} + image_embed = self._forward_vision_encoder(image, **cur_kwargs) + n, image_seqlen, n_embd = image_embed.shape + image_embed = image_embed.reshape(n * image_seqlen, n_embd) + + image_scatter_index = index[i:i+1].masked_select(image_mask.bool()).reshape(1, -1) + hidden_states[i:i+1].scatter_( + dim=1, + index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_embed.reshape(1, -1, n_embd), + ) + else: + raise ValueError(f"und_images should be Tensor or List, but got {type(images)}") + + return hidden_states + + def instantiate_continuous_tokens( + self, + hidden_states: torch.Tensor, + timesteps: Optional[BatchRaggedTensor] = None, + timesteps_index: Optional[BatchRaggedTensor] = None, + ): + bsz, seqlen, n_embd = hidden_states.shape + + if isinstance(timesteps, list): + for i, timestep in enumerate(timesteps): + timestep_src = self.timestep_emb(timestep) # (n, n_embd) + hidden_states[i:i+1].scatter_( + dim=1, + index=timesteps_index[i].unsqueeze(0).unsqueeze(-1).repeat(1, 1, n_embd), + src=timestep_src.reshape(1, -1, n_embd), + ) + else: + timesteps_src = self.timestep_emb(timesteps.reshape(-1)) # (bsz * n, n_embd) + hidden_states.scatter_( + dim=1, + index=timesteps_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=timesteps_src.reshape(bsz, -1, n_embd), + ) + + return hidden_states + + def instantiate_guidance_tokens( + self, + hidden_states: torch.Tensor, + guidance: Optional[BatchRaggedTensor] = None, + guidance_index: Optional[BatchRaggedTensor] = None, + ): + bsz, seqlen, n_embd = hidden_states.shape + + guidance_src = self.guidance_emb(guidance.reshape(-1)) # (bsz * n, n_embd) + hidden_states.scatter_( + dim=1, + index=guidance_index.unsqueeze(-1).repeat(1, 1, n_embd), + 
src=guidance_src.reshape(bsz, -1, n_embd), + ) + + return hidden_states + + + def instantiate_timestep_r_tokens( + self, + hidden_states: torch.Tensor, + timesteps_r: Optional[BatchRaggedTensor] = None, + timesteps_r_index: Optional[BatchRaggedTensor] = None, + ): + bsz, seqlen, n_embd = hidden_states.shape + + if isinstance(timesteps_r, list): + for i, timestep_r in enumerate(timesteps_r): + timestep_r_src = self.timestep_r_emb(timestep_r) # (n, n_embd) + hidden_states[i:i+1].scatter_( + dim=1, + index=timesteps_r_index[i].unsqueeze(0).unsqueeze(-1).repeat(1, 1, n_embd), + src=timestep_r_src.reshape(1, -1, n_embd), + ) + else: + timesteps_r_src = self.timestep_r_emb(timesteps_r.reshape(-1)) # (bsz * n, n_embd) + hidden_states.scatter_( + dim=1, + index=timesteps_r_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=timesteps_r_src.reshape(bsz, -1, n_embd), + ) + + return hidden_states + + def get_image_tokens_hw(self, images: BatchRaggedImages): + assert isinstance(images, (torch.Tensor, list)), f"images should be BatchRaggedImages, got {type(images)}" + if isinstance(images, torch.Tensor): + token_h = images.shape[-2] // self.config.patch_size + token_w = images.shape[-1] // self.config.patch_size + else: + token_h, token_w = [], [] + for image_i in images: + assert isinstance(image_i, (torch.Tensor, list)), \ + f"image_i should be a tensor or a list of tensors, got {type(image_i)}" + if isinstance(image_i, torch.Tensor): + token_h.append(image_i.shape[-2] // self.config.patch_size) + token_w.append(image_i.shape[-1] // self.config.patch_size) + else: + token_h.append([]) + token_w.append([]) + for j in range(len(image_i)): + token_h[-1].append(image_i[j].shape[-2] // self.config.patch_size) + token_w[-1].append(image_i[j].shape[-1] // self.config.patch_size) + return token_h, token_w + + def ragged_final_layer(self, hidden_states, image_mask, timesteps, token_h, token_w, first_step=None): + n_embd = hidden_states.size(-1) + if isinstance(timesteps, torch.Tensor): + # Only one target image. + t_emb = self.time_embed_2(timesteps) + if first_step is False: + # only for gen_image non-first-step inference + image_output = hidden_states[:, self.num_special_tokens:, :] + else: # first_step is True or None + image_output = hidden_states.masked_select( + image_mask.unsqueeze(-1).bool()).reshape(-1, token_h * token_w, n_embd) + pred = self.final_layer(image_output, t_emb, token_h, token_w) + else: + # Multiple target images(interleave data). + # In this case, each line of the image_mask may contain different number of Trues, leading + # the `reshape(batch_size, ...)` is not possible. 
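+            # Illustrative: if the image_mask rows contain, say, 256 and 1024 True entries,
+            # `sections` becomes [256, 1024]; the flat masked selection is split back into one
+            # variable-length chunk per sample before final_layer is applied to each chunk.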
+ sections = image_mask.sum(1).tolist() + image_output = hidden_states.masked_select( + image_mask.unsqueeze(-1).bool()).reshape(-1, n_embd).split(sections) + pred = [] + for image_output_i, t_i, token_h_i, token_w_i in zip(image_output, timesteps, token_h, token_w): + t_emb_i = self.time_embed_2(t_i) + if isinstance(token_h_i, int): + image_output_i = image_output_i.reshape(-1, token_h_i * token_w_i, n_embd) + pred_i = self.final_layer(image_output_i, t_emb_i, token_h_i, token_w_i) + pred.append(pred_i) + else: + subsections = [token_h_ij * token_w_ij for token_h_ij, token_w_ij in zip(token_h_i, token_w_i)] + image_output_i = image_output_i.split(subsections) + pred_i = [] + for j, image_output_ij in enumerate(image_output_i): + pred_ij = self.final_layer(image_output_ij[None], t_emb_i[j:j+1], token_h_i[j], token_w_i[j]) + pred_i.append(pred_ij) + pred.append(pred_i) + return pred + + @staticmethod + def _check_inputs(cond, target, check_list): + if cond: + for name, item in check_list: + assert item is not None, f"`{name}` should be provided when `{target}`." + + @add_start_docstrings_to_model_forward(Hunyuan_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, # bsz x seqlen + attention_mask: Optional[torch.Tensor] = None, # bsz x 1 x seqlen x seqlen + rope_image_info: Optional[list[list[tuple[slice, tuple[int, int]]]]] = None, + return_dict: bool = True, + # for gen images + images: Optional[BatchRaggedImages] = None, # bsz x c x h x w, or bsz x (n_i x (c x h_ij x w_ij)) + image_mask: Optional[torch.Tensor] = None, # bsz x seqlen + timesteps: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (n_i) + timesteps_index: Optional[BatchRaggedTensor] = None, # bsz x k, or bsz x (k_i) + timesteps_r: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (n_i) + timesteps_r_index: Optional[BatchRaggedTensor] = None, # bsz x k, or bsz x (k_i) + guidance: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (n_i) + guidance_index: Optional[BatchRaggedTensor] = None, # bsz x k, or bsz x (k_i) + # for cond images + cond_vae_images: Optional[BatchRaggedImages] = None, # bsz x c x h x w, or bsz x (m_i x (c x h_ij x w_ij)) + cond_vae_image_mask: Optional[torch.Tensor] = None, # bsz x seqlen + cond_timesteps: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (m_i) + cond_timesteps_index: Optional[BatchRaggedTensor] = None, + cond_vit_images: Optional[BatchRaggedImages] = None, + cond_vit_image_mask: Optional[torch.Tensor] = None, + cond_vit_image_kwargs: Optional[dict[str, Any]] = None, + # only for inference + position_ids: Optional[torch.Tensor] = None, # bsz x seq_len-1, used for KVCache + past_key_values: Optional[HunyuanStaticCache] = None, + mode: Optional[str] = None, + first_step: Optional[bool] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_dic = None, + gen_timestep_scatter_index: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalMMOutputWithPast]: + + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Sanity Check of Inputs + self._check_inputs(mode == "gen_image", "in `gen_image` mode", [ + ("images", images), ("timesteps", timesteps), + ]) + self._check_inputs(mode == "gen_image" and first_step, "in `gen_image` mode at the first step", [ + ("image_mask", image_mask), ("timesteps_index", timesteps_index), + ]) + self._check_inputs(cond_vae_images is not None, "`cond_vae_images` is provided", [ + ("cond_timesteps", 
cond_timesteps), ("cond_vae_image_mask", cond_vae_image_mask), + ("cond_timesteps_index", cond_timesteps_index), + ]) + self._check_inputs(cond_vit_images is not None, "`cond_vit_images` is provided", [ + ("cond_vit_image_mask", cond_vit_image_mask), + ]) + if input_ids is None and images is None: + raise ValueError("Either input_ids or images should be provided.") + if input_ids is not None: + device = input_ids.device + else: + device = get_device(images) + if self.training: + seqlen = input_ids.size(1) + else: + # For inference, we always set seqlen to maximum length to simplify the rope cache handling + seqlen = self.config.max_position_embeddings + assert self.config.max_position_embeddings >= seqlen, ( + f"Cannot forward sequence of length {seqlen}, " + f"max position embeddings is only {self.config.max_position_embeddings}, " + f"try set --max-position-embeddings to a larger value." + ) + + # Calculate multimodal 2d rope + cos, sin = self.cached_rope( + seqlen, device, rope_image_info=rope_image_info, position_ids=position_ids, + ) + # === Map token ids to embeddings === + if input_ids is not None: + hidden_states = self.model.wte(input_ids) # (bsz, seqlen, n_embd) + else: + hidden_states = None # only for non-first step inference of the image generation + + # === Input layers === + if images is not None: + if self.config.cfg_distilled and input_ids is None: + hidden_states = self.instantiate_vae_image_tokens(hidden_states, timesteps, images, image_mask, guidance, timesteps_r) + else: + hidden_states = self.instantiate_vae_image_tokens(hidden_states, timesteps, images, image_mask) + + if cond_vae_images is not None: + hidden_states = self.instantiate_vae_image_tokens(hidden_states, cond_timesteps, cond_vae_images, + cond_vae_image_mask) + + if cond_vit_images is not None: + hidden_states = self.instantiate_vit_image_tokens(hidden_states, cond_vit_images, cond_vit_image_mask, + cond_vit_image_kwargs) + if timesteps_index is not None: + hidden_states = self.instantiate_continuous_tokens(hidden_states, timesteps, timesteps_index) + + # guidance token + if guidance_index is not None: + hidden_states = self.instantiate_guidance_tokens(hidden_states, guidance, guidance_index) + + # timestep r token + if timesteps_r_index is not None: + hidden_states = self.instantiate_timestep_r_tokens(hidden_states, timesteps_r, timesteps_r_index) + + if cond_timesteps_index is not None: + hidden_states = self.instantiate_continuous_tokens(hidden_states, cond_timesteps, cond_timesteps_index) + if mode == "gen_text": + first_step = True + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + if not self.use_taylor_cache: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=hidden_states, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + custom_pos_emb=(cos, sin), + mode=mode, + first_step=first_step, + post_token_len = self.post_token_len, + num_image_tokens = self.num_image_tokens, + gen_timestep_scatter_index = gen_timestep_scatter_index, + num_special_tokens = self.num_special_tokens, + ) + hidden_states = outputs[0] + else: + if not hasattr(self.model, "taylor_cache"): + self.model.taylor_cache = CacheWithFreqsContainer(cache_dic['max_order']) + if not hasattr(self.model, "counter"): + self.model.counter = 0 + + full_computation = (cache_dic['current_step'] == 0) \ + or 
(self.model.counter == cache_dic['cache_interval'] -1) \ + or (cache_dic['enable_first_enhance'] and cache_dic['current_step'] < cache_dic['first_enhance_steps']) \ + or (cache_dic['enable_tailing_enhance'] and cache_dic['current_step'] >= cache_dic['num_steps'] - cache_dic['tailing_enhance_steps']) + if not hasattr(self.model, "last_full_computation_step"): + self.model.last_full_computation_step = 0 + if full_computation: + self.model.counter = 0 + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=hidden_states, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + custom_pos_emb=(cos, sin), + mode=mode, + first_step=first_step, + post_token_len = self.post_token_len, + num_image_tokens = self.num_image_tokens, + gen_timestep_scatter_index = gen_timestep_scatter_index, + num_special_tokens = self.num_special_tokens, + ) + hidden_states = outputs[0] + + if cache_dic['enable_first_enhance'] and (cache_dic['current_step'] < (cache_dic['first_enhance_steps']-1)): + pass + else: + self.model.taylor_cache.derivatives_computation(hidden_states, distance = cache_dic['current_step'] - self.model.last_full_computation_step, low_freqs_order=cache_dic['low_freqs_order'], high_freqs_order=cache_dic['high_freqs_order']) + + self.model.last_full_computation_step = cache_dic['current_step'] + self.model.taylor_cache.last_past_key_values = outputs.past_key_values + else: + self.model.counter += 1 + hidden_states = self.model.taylor_cache.taylor_formula(distance = self.model.counter) + outputs = BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=self.model.taylor_cache.last_past_key_values, + hidden_states=None, + attentions=None, + ) + if cache_dic['current_step'] == cache_dic['num_steps'] - 1: + self.model.taylor_cache.clear_derivatives() + + + # === Output layers === + # -- image tokens + if images is not None: + token_h, token_w = self.get_image_tokens_hw(images) + hidden_states = hidden_states.to(device=get_device(images)) + diff_pred = self.ragged_final_layer( + hidden_states, image_mask, timesteps, token_h, token_w, first_step) + else: + diff_pred = None + # -- text tokens + if input_ids is None or mode == "gen_image": + logits = None + else: + hidden_states = self.model.ln_f(hidden_states) + logits = self.lm_head(hidden_states) # (bsz, seqlen, vocab_size) + # -- for inference + if not return_dict: + return (logits.float(),) + outputs[1:] + (diff_pred,) + return CausalMMOutputWithPast( + logits=logits.float() if logits is not None else None, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + diffusion_prediction=diff_pred, + ) + + @staticmethod + def check_inputs(prompt=None, image=None, message_list=None): + if prompt is None and message_list is None: + raise ValueError("Either `prompt` or `message_list` should be provided.") + if prompt is not None and message_list is not None: + raise ValueError("`prompt` and `message_list` cannot be provided at the same time.") + if message_list is not None: + if not isinstance(message_list, list): + raise ValueError(f"`message_list` should be a list of messages, but got {type(message_list)}.") + assert len(message_list) > 0, "`message_list` should be a non-empty list." 
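+            # Messages follow the OpenAI chat style handled by `prepare_message_list`: each entry is a
+            # role/content dict, or a list of such dicts for multi-part content.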
+ for message in message_list: + assert isinstance(message, list) or isinstance(message, dict), \ + f"Each message should be a list of dicts or a dict, but got {type(message)}." + if image is not None: + error_msg = \ + "`image` should be a PIL Image, a string path, a base64 string, bytes, or a list of them, but got {}." + if isinstance(image, list): + for im in image: + assert isinstance(im, (Image.Image, str, bytes)), error_msg.format(type(im)) + else: + assert isinstance(image, (Image.Image, str, bytes)), error_msg.format(type(image)) + + @staticmethod + def _validate_and_batchify_text(text, name, check_batch_size=None): + if text is None: + return text + assert isinstance(text, str) or isinstance(text, list), \ + f"Input `{name}` should be a string or a list of strings, but got {type(text)}." + if isinstance(text, str): + text = [text] + #assert len(text) > 0 and all(isinstance(p, str) and len(p) > 0 for p in text), \ + # f"Input `{name}` should be a non-empty list of non-empty strings, got {text}." + if check_batch_size is not None: + assert len(text) == check_batch_size, \ + f"Input `{name}` should have the same batch size as other inputs({check_batch_size}), got {len(text)}." + return text + + @staticmethod + def _validate_and_batchify_image(image, name, check_batch_size=None): + if image is None: + return image + assert isinstance(image, (InputImage, list)), \ + f"Input `{name}` should be a image or a list of images, but got {type(image)}." + if not isinstance(image, list): + image = [image] + batch_image_list = [image] if not isinstance(image[0], list) else image + for image_list in batch_image_list: + assert all(isinstance(im, InputImage) for im in image_list), \ + (f"Each item in `{name}` should be a PIL Image, a string path, a base64 string, or bytes, " + f"got {[type(im) for im in image_list]}.") + if check_batch_size is not None: + assert len(batch_image_list) == check_batch_size, \ + f"Input `{name}` should have the same batch size as other inputs({check_batch_size})" + return batch_image_list + + @staticmethod + def prepare_seed(seed, batch_size): + if isinstance(seed, torch.Tensor): + seed = seed.tolist() + if seed is None: + seeds = [random.randint(0, 10_000_000) for _ in range(batch_size)] + elif isinstance(seed, int): + seeds = [seed for _ in range(batch_size)] + elif isinstance(seed, (list, tuple)): + if len(seed) == batch_size: + seeds = [int(seed[i]) for i in range(batch_size)] + else: + raise ValueError(f"Length of seed must be equal to the batch_size({batch_size}), got {seed}.") + else: + raise ValueError(f"Seed must be an integer, a list of integers, or None, got {seed}.") + return seeds + + def build_batch_rope_image_info(self, output, sections): + # Rope 1D. No need to build rope_image_info + if self.config.rope_type == "default": + return None + + # Rope 2D + assert self.config.rope_type == "2d", \ + f"Rope type {self.config.rope_type} not supported by method 'build_batch_rope_image_info'." + rope_image_info = [] + for image_slices, sections_i in zip(output.all_image_slices, sections): + rope_2d_image_slices = [] + rope_2d_image_shapes = [] + image_idx = 0 + + for section in sections_i: + if section['type'] in ["gen_image", "cond_vae_image", "cond_vit_image"]: + assert image_idx < len(image_slices), \ + f"Image index {image_idx} out of range for image slices with length {len(image_slices)}." 
+ rope_2d_image_slices.append(image_slices[image_idx]) + rope_2d_image_shapes.append((section['token_height'], section['token_width'])) + image_idx += 1 + + elif section['type'] == "cond_joint_image": + assert image_idx + 1 < len(image_slices), \ + f"Image index {image_idx + 1} out of range for image slices with length {len(image_slices)}." + assert len(section['token_height']) == len(section['token_width']), \ + (f"token_height and token_width should have the same length, " + f"but got {len(section['token_height'])} and {len(section['token_width'])}") + + if self.image_processor.cond_token_attn_type in ["full", "joint_full"]: + rope_2d_image_slices.extend([image_slices[image_idx], image_slices[image_idx + 1]]) + rope_2d_image_shapes.extend(list(zip(section['token_height'], section['token_width']))) + elif self.image_processor.cond_token_attn_type == "full_causal": + rope_2d_image_slices.append(image_slices[image_idx]) + rope_2d_image_shapes.append((section['token_height'][0], section['token_width'][0])) + elif self.image_processor.cond_token_attn_type == "causal": + pass + else: + raise NotImplementedError( + f"cond_token_attn_type {self.image_processor.cond_token_attn_type} not supported " + f"by method 'build_batch_rope_image_info'." + ) + image_idx += 2 + + rope_image_info.append(list(zip(rope_2d_image_slices, rope_2d_image_shapes))) + + return rope_image_info + + def vae_encode(self, image, cfg_factor=1, generator=None): + config = self.vae.config + + with torch.autocast( + device_type="cuda", dtype=self.vae_autocast_dtype, # noqa + enabled=self.vae_autocast_dtype != torch.float32 + ): + vae_encode_result = self.vae.encode(image) + if isinstance(vae_encode_result, torch.Tensor): + latents = vae_encode_result + else: + latents = vae_encode_result.latent_dist.sample(generator) + if hasattr(config, 'shift_factor') and config.shift_factor: + latents.sub_(config.shift_factor) + if hasattr(config, 'scaling_factor') and config.scaling_factor: + latents.mul_(config.scaling_factor) + + if hasattr(self.vae, "ffactor_temporal"): + assert latents.shape[2] == 1, "latents should have shape [B, C, T, H, W] and T should be 1" + latents = latents.squeeze(2) + + # Here we always use t=0 to declare it is a clean conditional image + t = torch.zeros((latents.shape[0],)) + + if cfg_factor > 1: + t = t.repeat(cfg_factor) + latents = latents.repeat(cfg_factor, 1, 1, 1) + + return t, latents + + def _encode_cond_image( + self, + batch_cond_images: list[list[Union[ImageTensor, CondImage]]], + cfg_factor: int = 1, + generator=None, + ): + if batch_cond_images is None or len(batch_cond_images[0]) == 0: + return None, None, None + + first_image = batch_cond_images[0][0] + + # 1. 
If vae_image presents + if first_image.section_type in ["cond_vae_image", "cond_joint_image"]: + # VAE encode one by one, as we assume cond images have different sizes + batch_cond_vae_images, batch_cond_t = [], [] + for cond_images in batch_cond_images: + cond_vae_image_list, cond_t_list = [], [] + for cond_image in cond_images: + vae_image = ( + cond_image.vae_image + if cond_image.section_type == "cond_joint_image" + else cond_image + ) + cond_t_, cond_vae_image_ = self.vae_encode( + vae_image[None].to(self.device), + generator=generator, + ) + cond_vae_image_list.append(cond_vae_image_.squeeze(0)) + cond_t_list.append(cond_t_) + batch_cond_vae_images.append(cond_vae_image_list) + batch_cond_t.append(cond_t_list) + + # If only one cond image for each sample and all have the same size, we can batch them together + # In this case, cond_vae_images is a 4-D tensor. + if all([len(items) == 1 for items in batch_cond_vae_images]) and all( + items[0].shape == batch_cond_vae_images[0][0].shape for items in batch_cond_vae_images): + cond_vae_images = torch.stack([items[0] for items in batch_cond_vae_images], dim=0) + cond_t = torch.cat([items[0] for items in batch_cond_t], dim=0) + if cfg_factor > 1: + cond_t = cond_t.repeat(cfg_factor) + cond_vae_images = cond_vae_images.repeat(cfg_factor, 1, 1, 1) + else: + # In this case, cond_vae_images is a list of 4-D tensors or a list of lists of 3-D tensors. + cond_t = [torch.cat(item, dim=0) for item in batch_cond_t] + cond_vae_images = [] + for items in batch_cond_vae_images: + if all(items[0].shape == item.shape for item in items): + cond_vae_images.append(torch.stack(items, dim=0)) + else: + cond_vae_images.append(items) + if cfg_factor > 1: + cond_t = cond_t * cfg_factor + cond_vae_images = cond_vae_images * cfg_factor + + else: + cond_vae_images = None + cond_t = None + + # 2. If vit_image presents + if first_image.section_type in ["cond_vit_image", "cond_joint_image"]: + cond_vit_images = [] + for cond_images in batch_cond_images: + cond_vit_image_list = [] + for cond_image in cond_images: + vit_image = ( + cond_image.vit_image + if cond_image.section_type == "cond_joint_image" + else cond_image + ) + cond_vit_image_list.append(vit_image) + # Here we force convert the tensor to dtype + cond_vit_images.append( + torch.stack(cond_vit_image_list, dim=0).to(dtype=self.dtype) + ) + + if cfg_factor > 1: + cond_vit_images = cond_vit_images * cfg_factor + + else: + cond_vit_images = None + + return cond_vae_images, cond_t, cond_vit_images + + @staticmethod + def _prepare_vit_image_kwargs(batch_cond_images, cfg_factor): + if batch_cond_images is None or len(batch_cond_images[0]) == 0: + return None + first_image = batch_cond_images[0][0] + if first_image.section_type == "cond_joint_image": + vit_image = first_image.vit_image + else: + vit_image = first_image + if not hasattr(vit_image, "vision_encoder_kwargs") or len(vit_image.vision_encoder_kwargs) == 0: + return None + + # Pack vit kwargs. Siglip2-so requires spatial_shapes and attention_mask for inference. 
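+        # For every sample we stack one row per conditional image: `spatial_shapes` holds each image's
+        # (height, width) in patch units and `attention_mask` the per-patch validity mask taken from
+        # `pixel_attention_mask`; both are forwarded to the Siglip2 vision encoder.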
+ cond_vit_image_kwargs = {"spatial_shapes": [], "attention_mask": []} + for cond_images in batch_cond_images: + cond_vit_image_kwargs["spatial_shapes"].append( + torch.stack([ + cond_image.vit_image.vision_encoder_kwargs["spatial_shapes"] + for cond_image in cond_images + ])) + cond_vit_image_kwargs["attention_mask"].append( + torch.stack([ + cond_image.vit_image.vision_encoder_kwargs["pixel_attention_mask"] + for cond_image in cond_images + ])) + if cfg_factor > 1: + cond_vit_image_kwargs["spatial_shapes"] = cond_vit_image_kwargs["spatial_shapes"] * cfg_factor + cond_vit_image_kwargs["attention_mask"] = cond_vit_image_kwargs["attention_mask"] * cfg_factor + return cond_vit_image_kwargs + + @torch.no_grad() + def prepare_message_list( + self, + message_list, + cond_images: list[CondImage] = None, + gen_image_info: ImageInfo = None, + ): + """ Convert a batch message list of OpenAI style to the internal format. """ + inner_message_list = [] + image_idx = 0 + for message in message_list: + content = message["content"] + if isinstance(content, str): + inner_message_list.append(dict(role=message["role"], type="text", content=content)) + elif isinstance(content, list): + for item in content: + if item["type"] == "text": + inner_message_list.append(dict(role=message["role"], type="text", content=item['text'])) + elif item["type"] == "image": + if all(key not in item for key in ["image", "url", "path", "base64"]): + continue + assert cond_images is not None and image_idx < len(cond_images), \ + f"Image index {image_idx} out of range for cond images with length {len(cond_images)}." + image = cond_images[image_idx] + inner_message_list.append(dict(role="assistant", type=image.section_type, content=image.i)) + image_idx += 1 + else: + raise NotImplementedError(f"Message content type {item['type']} not supported.") + else: + raise ValueError(f"Message content should be str or list, but got {type(content)}.") + + if gen_image_info is not None: + inner_message_list.append(dict(role="assistant", type="gen_image", content=gen_image_info)) + + return inner_message_list + + def preprocess_inputs( + self, + prompt: str | list[str] = None, + image: InputImage | list[InputImage] = None, + cot_text=None, + message_list=None, + cfg_factor=1, + bot_task='auto', + system_prompt=None, + max_new_tokens=None, + mode="gen_text", + image_size="auto", + infer_align_image_size=False, + device=None, + **kwargs, + ): + # 1. Sanity check + self.check_inputs(prompt, image, message_list) + + # 2. Format inputs + batch_message_list = message_list + batch_prompt = prompt + batch_cot_text = cot_text + batch_system_prompt = system_prompt + + # -- 2.1 message_list + batch_cond_images = kwargs.get('batch_cond_images', None) + if batch_message_list is not None: + if isinstance(batch_message_list[0], dict): + batch_message_list = [batch_message_list] + batch_size = len(batch_message_list) + + # Multiple cond images are allowed. 
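+            # Build the conditional images per sample from the message list unless the caller supplied a
+            # precomputed `batch_cond_images` (callers such as `generate_image` cache and reuse it across
+            # the think/ratio/generation stages).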
+ if batch_cond_images is None: + batch_cond_images = [ + self.image_processor.build_cond_images( + message_list=message_list_, + infer_align_image_size=infer_align_image_size, + ) + for message_list_ in batch_message_list + ] + if mode == "gen_image": + batch_gen_image_info = [ + self.image_processor.build_gen_image_info(image_size, add_guidance_token=self.config.cfg_distilled, add_timestep_r_token=self.config.use_meanflow) for _ in range(batch_size) + ] + else: + batch_gen_image_info = [None] * batch_size + # Convert OpenAI message list into inner message list + batch_message_list = [ + self.prepare_message_list(message_list_, cond_images, gen_image_info) + for message_list_, cond_images, gen_image_info in zip( + batch_message_list, batch_cond_images, batch_gen_image_info + ) + ] + + # -- 2.2 Prompt, image, cot text, system prompt + else: + batch_prompt = self._validate_and_batchify_text(batch_prompt, 'prompt') + batch_size = len(batch_prompt) + + batch_cot_text = self._validate_and_batchify_text(batch_cot_text, 'cot_text', batch_size) + batch_system_prompt = self._validate_and_batchify_text(batch_system_prompt, 'system_prompt', batch_size) + + batch_image_list = self._validate_and_batchify_image(image, 'image', batch_size) + if batch_cond_images is None: + batch_cond_images = [ + self.image_processor.build_cond_images( + image_list=image_list, + infer_align_image_size=infer_align_image_size + ) + for image_list in batch_image_list + ] if batch_image_list is not None else None + + if mode == "gen_image": + batch_gen_image_info = [ + self.image_processor.build_gen_image_info(image_size, add_guidance_token=self.config.cfg_distilled, add_timestep_r_token=self.config.use_meanflow) for _ in range(batch_size) + ] + else: + batch_gen_image_info = [None] * batch_size + + # Apply batched prompt or batched message_list to build input sequence with associated info. + # If `drop_think` enabled, always drop parts in the context. + drop_think = kwargs.get('drop_think', getattr(self.generation_config, 'drop_think', False)) + out = self._tokenizer.apply_chat_template( + batch_prompt=batch_prompt, + batch_message_list=batch_message_list, + mode=mode, + batch_gen_image_info=batch_gen_image_info, + batch_cond_images=batch_cond_images, + batch_system_prompt=batch_system_prompt, + batch_cot_text=batch_cot_text, + max_length=kwargs.get('max_length', self.generation_config.max_length), + bot_task=bot_task, + image_base_size=( + None if mode == "gen_text" and bot_task == "auto" else self.image_processor.vae_reso_group.base_size + ), + sequence_template=getattr(self.generation_config, 'sequence_template', 'pretrain'), + cfg_factor=cfg_factor, + drop_think=drop_think, + ) + out['batch_size'] = batch_size + out['batch_cond_images'] = batch_cond_images + out['batch_gen_image_info'] = batch_gen_image_info + + # 8. 
Define stop tokens by tasks + tkw = self._tokenizer + if bot_task == "auto": + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids, + ) + else: + if image_size == "auto": + extra_auto_stops = [tkw.ratio_token_id(i) for i in range(33)] + else: + extra_auto_stops = [tkw.boi_token_id] + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids + extra_auto_stops, + recaption=[tkw.end_of_recaption_token_id], + think=[tkw.end_of_think_token_id, tkw.end_of_recaption_token_id], + img_ratio=extra_auto_stops, + ) + out['stop_token_id'] = stop_token_id + + return out + + def prepare_model_inputs( + self, + prompt: str | list[str] = None, + image: InputImage | list[InputImage] = None, + mode="gen_text", + system_prompt=None, + cot_text=None, + image_size="auto", + message_list=None, + device=None, + max_new_tokens=None, + **kwargs, + ): + device = default(device, self.device) + + # 1. apply chat template + cfg_factor = {"gen_text": 1, "gen_image": 2} + if self.config.cfg_distilled: + cfg_factor["gen_image"] = 1 + + bot_task = kwargs.pop("bot_task", "auto") + + out = kwargs.pop('tokenizer_output', None) + if out is None: + out = self.preprocess_inputs( + prompt=prompt, + image=image, + mode=mode, + system_prompt=system_prompt, + cot_text=cot_text, + image_size=image_size, + message_list=message_list, + cfg_factor=cfg_factor[mode], + bot_task=bot_task, + **kwargs, + ) + output, sections = out['output'], out['sections'] + + batch_size = out['batch_size'] + batch_cond_images = out['batch_cond_images'] + batch_gen_image_info = out['batch_gen_image_info'] + stop_token_id = out['stop_token_id'] + #if batch_gen_image_info[0] is not None: + # print("batch_gen_image_info image_token_length:", batch_gen_image_info[0].image_token_length) + # -- 2.3 seed + seeds = self.prepare_seed(seed=kwargs.get('seed'), batch_size=batch_size) + generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds] + + # 4. Encode conditional images + cond_vae_images, cond_timesteps, cond_vit_images = self._encode_cond_image( + batch_cond_images, cfg_factor[mode], generator=generator, + ) + cond_vit_image_kwargs = self._prepare_vit_image_kwargs(batch_cond_images, cfg_factor[mode]) + + # 5. Build position embeddings + rope_image_info = self.build_batch_rope_image_info(output, sections) + + # 6. Build kv cache + if mode == "gen_image": + # Image generation will not extend sequence length, using token length as max_cache_len is enough. + max_cache_len = output.tokens.shape[1] + else: + max_cache_len = output.tokens.shape[1] + default(max_new_tokens, self.generation_config.max_length) + cache = HunyuanStaticCache( + config=self.config, + max_batch_size=batch_size * cfg_factor[mode], + max_cache_len=max_cache_len, + dtype=self.dtype, + dynamic=mode == "gen_text", + ) + + # 7. Build position ids + batch_position_ids = torch.arange( + 0, output.tokens.shape[1], dtype=torch.long, device=device)[None].expand( + batch_size * cfg_factor[mode], -1) # use expand to share indices to save memory + + # 8. 
Define stop tokens by tasks + tkw = self._tokenizer + if mode == "gen_image": + eos_token_id = None # don't need to define eos_token_id for image generation + else: + if bot_task == "auto": + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids, + ) + else: + if image_size == "auto": + extra_auto_stops = tkw.get_all_ratio_token_ids() + else: + extra_auto_stops = [tkw.boi_token_id] + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids + extra_auto_stops, + recaption=[tkw.end_of_recaption_token_id], + think=[tkw.end_of_think_token_id, tkw.end_of_recaption_token_id], + img_ratio=extra_auto_stops, + ) + eos_token_id = stop_token_id[bot_task] + + # 9. Build model input kwargs + model_input_kwargs = dict( + input_ids=output.tokens.to(device), + position_ids=batch_position_ids, + past_key_values=cache, + mode=mode, + rope_image_info=rope_image_info, + image_mask=to_device(output.gen_image_mask, device), + timesteps_index=to_device(output.gen_timestep_scatter_index, device), + guidance_index=to_device(output.guidance_scatter_index, device), + timesteps_r_index=to_device(output.gen_timestep_r_scatter_index, device), + cond_vae_images=to_device(cond_vae_images, device), + cond_vae_image_mask=to_device(output.vae_image_mask, device), + cond_timesteps=to_device(cond_timesteps, device), + cond_timesteps_index=to_device(output.cond_timestep_scatter_index, device), + cond_vit_images=to_device(cond_vit_images, device), + cond_vit_image_mask=to_device(output.vit_image_mask, device), + cond_vit_image_kwargs=to_device(cond_vit_image_kwargs, device), + # for inner usage + tokenizer_output=output, + batch_gen_image_info=batch_gen_image_info, + generator=generator, + batch_cond_images=batch_cond_images, + # generation config + eos_token_id=eos_token_id, + max_new_tokens=max_new_tokens, + gen_timestep_scatter_index=to_device(output.gen_timestep_scatter_index, device), + ) + + return model_input_kwargs + + def _prepare_attention_mask_for_generation( + self, + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: dict[str, Any], + ) -> Optional[torch.Tensor]: + # create `4d` bool attention mask (b, 1, seqlen, seqlen) using this implementation to bypass the 2d requirement + # in the `transformers.generation_utils.GenerationMixin.generate`. + # This implementation can handle sequences with text and image modalities, where text tokens use causal + # attention and image tokens use full attention. 
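+        # Sketch of the construction below: start from a lower-triangular (causal) mask, then set the square
+        # block covering each full-attention image span to True so its tokens attend to each other
+        # bidirectionally. Hypothetical example: for a sequence [t0, t1, i0, i1, t2] with one image span at
+        # positions 2:4, mask[2:4, 2:4] becomes fully True while the text positions keep the causal pattern.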
+ bsz, seq_len = inputs_tensor.shape + tokenizer_output = model_kwargs["tokenizer_output"] + batch_full_attn_slices = [ + self.image_processor.prepare_full_attn_slices(tokenizer_output, i) + for i in range(bsz) + ] + #if len(batch_full_attn_slices[0]) == 0: + # return None + + attention_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=inputs_tensor.device).tril( + diagonal=0).repeat(bsz, 1, 1) + for i in range(bsz): + for j, image_slice in enumerate(batch_full_attn_slices[i]): + attention_mask[i, image_slice, image_slice] = True + attention_mask = attention_mask.unsqueeze(1) + return attention_mask + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, + tokenizer_output=None, batch_gen_image_info=None, batch_cond_images=None, + infer_align_image_size=False, generator=None, **kwargs + ): + position_ids = kwargs.get("position_ids") + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + assert position_ids is not None, "position_ids must be provided in kwargs." + if input_ids is not None and input_ids.shape[1] != position_ids.shape[1]: # in decode steps + input_ids = torch.gather(input_ids, dim=1, index=position_ids) + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + # "use_cache": kwargs.get("use_cache"), + "rope_image_info": kwargs["rope_image_info"], + "mode": kwargs["mode"], + "images": kwargs.get("images"), + "image_mask": kwargs.get("image_mask"), + "timesteps": kwargs.get("timesteps"), + "timesteps_index": kwargs.get("timesteps_index"), + "timesteps_r": kwargs.get("timesteps_r"), + "timesteps_r_index": kwargs.get("timesteps_r_index"), + "guidance": kwargs.get("guidance"), + "guidance_index": kwargs.get("guidance_index"), + "cond_vae_images": kwargs.get("cond_vae_images"), + "cond_vae_image_mask": kwargs.get("cond_vae_image_mask"), + "cond_timesteps": kwargs.get("cond_timesteps"), + "cond_timesteps_index": kwargs.get("cond_timesteps_index"), + "cond_vit_images": kwargs.get("cond_vit_images"), + "cond_vit_image_mask": kwargs.get("cond_vit_image_mask"), + "cond_vit_image_kwargs": kwargs.get("cond_vit_image_kwargs"), + "cache_dic": kwargs.get("cache_dic"), + "gen_timestep_scatter_index": kwargs.get("gen_timestep_scatter_index"), + } + ) + + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: dict[str, Any], + is_encoder_decoder: bool = False, + num_new_tokens: int = 1, + ) -> dict[str, Any]: + """ This function is run after each step of model forward. It updates model kwargs for next forward step. 
+ """ + mode = model_kwargs["mode"] + + updated_model_kwargs = { + "mode": mode, + "rope_image_info": model_kwargs["rope_image_info"], + } + + # update past_key_values keeping its naming used in model code + for possible_cache_name in ALL_CACHE_NAMES: + if possible_cache_name in outputs: + # TODO (joao): remove output/input mismatch when these old models (xlnet, reformer) are deprecated + if possible_cache_name in ("past_buckets_states", "mems"): + cache_name = "past_key_values" + else: + cache_name = possible_cache_name + updated_model_kwargs[cache_name] = getattr(outputs, possible_cache_name) + break + + if "tokenizer_output" in model_kwargs: + # After prefill step + if mode == "gen_text": + # When enable batching, we use right padding, which requires a real_pos to index the valid + # end position of the sequence. If tokenizer_output in model_kwargs, it means we are in the + # prefill step of generation. + real_pos = to_device(model_kwargs["tokenizer_output"].real_pos, self.device) + updated_model_kwargs["position_ids"] = real_pos + else: + # inputs_pos + image_mask = model_kwargs["image_mask"] + bsz, seq_len = image_mask.shape + index = torch.arange(seq_len, device=image_mask.device).unsqueeze(0).repeat(bsz, 1) + position_ids = index.masked_select(image_mask.bool()).reshape(bsz, -1) + timestep_position_ids = \ + index[torch.arange(bsz), model_kwargs["timesteps_index"][:, -1]].unsqueeze(-1) + pos_cat_list = [timestep_position_ids, ] + if self.config.cfg_distilled: + guidance_position_ids = index[torch.arange(bsz), model_kwargs["guidance_index"][:, -1]].unsqueeze(-1) + pos_cat_list.append(guidance_position_ids) + if self.config.use_meanflow: + timestep_r_position_ids = index[torch.arange(bsz), model_kwargs["timesteps_r_index"][:, -1]].unsqueeze(-1) + pos_cat_list.append(timestep_r_position_ids) + pos_cat_list.append(position_ids) + updated_model_kwargs["position_ids"] = torch.cat(pos_cat_list, dim=1) + + # attention mask + mask_list = [] + for attention_mask_i, position_ids_i in zip( + model_kwargs["attention_mask"], updated_model_kwargs["position_ids"]): + mask_list.append(torch.index_select(attention_mask_i, dim=1, index=position_ids_i.reshape(-1))) + attention_mask = torch.stack(mask_list, dim=0) + updated_model_kwargs["attention_mask"] = attention_mask + updated_model_kwargs["gen_timestep_scatter_index"] = model_kwargs["gen_timestep_scatter_index"] + else: + # After decode steps + if mode == "gen_text": + # Now we are in the decode steps. 
+ updated_model_kwargs["position_ids"] = model_kwargs["position_ids"] + 1 + # Remove attention mask to use full attention of 1 x seqlen in decode steps + else: + updated_model_kwargs["position_ids"] = model_kwargs["position_ids"] + updated_model_kwargs["attention_mask"] = model_kwargs["attention_mask"] + updated_model_kwargs["gen_timestep_scatter_index"] = model_kwargs["gen_timestep_scatter_index"] + return updated_model_kwargs + + class _StageTransitionLogitsProcessor(LogitsProcessor): + def __init__(self, stage_transitions: list[tuple[int, list[int]]], batch_size: int): + self.transition_map = {stop_id: list(append_ids) for stop_id, append_ids in stage_transitions} + self.pending_tokens = [[] for _ in range(batch_size)] + self.completed = [set() for _ in range(batch_size)] + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + batch_size = input_ids.shape[0] + last_tokens = input_ids[:, -1] + device = scores.device + min_score = torch.finfo(scores.dtype).min + + for i in range(batch_size): + last_token = last_tokens[i].item() + + # Consume pending tokens if the last token matches the head. + if self.pending_tokens[i] and last_token == self.pending_tokens[i][0]: + self.pending_tokens[i].pop(0) + + # If pending tokens remain, force the next token. + if self.pending_tokens[i]: + scores[i].fill_(min_score) + scores[i, self.pending_tokens[i][0]] = 0 + continue + + # Trigger stage transition if needed. + if last_token in self.transition_map and last_token not in self.completed[i]: + self.completed[i].add(last_token) + next_tokens = self.transition_map[last_token] + if next_tokens: + self.pending_tokens[i] = list(next_tokens) + scores[i].fill_(min_score) + scores[i, self.pending_tokens[i][0]] = 0 + + scores[i] = scores[i].to(device) + + return scores + + class _ConditionalSliceVocabLogitsProcessor(LogitsProcessor): + def __init__( + self, + trigger_token_ids: list[int], + vocab_start: int, + vocab_end: int, + other_slices: Optional[list[tuple[int, int]]] = None, + force_greedy: bool = False, + ): + self.trigger_token_ids = set(trigger_token_ids) + self.vocab_start = vocab_start + self.vocab_end = vocab_end + self.other_slices = other_slices or [] + self.force_greedy = force_greedy + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + last_tokens = input_ids[:, -1] + min_score = torch.finfo(scores.dtype).min + for i in range(scores.size(0)): + if last_tokens[i].item() not in self.trigger_token_ids: + continue + original_scores = scores[i].clone() + scores[i].fill_(min_score) + scores[i, self.vocab_start:self.vocab_end] = original_scores[self.vocab_start:self.vocab_end] + for start, end in self.other_slices: + scores[i, start:end] = original_scores[start:end] + if self.force_greedy: + max_token_id = scores[i].argmax().item() + scores[i].fill_(min_score) + scores[i, max_token_id] = 0 + return scores + + def _get_ratio_index_from_token(self, ratio_token_id: int, tokenizer) -> int: + if hasattr(tokenizer, "get_all_ratio_token_ids"): + ratio_token_ids = tokenizer.get_all_ratio_token_ids() + try: + ratio_index = ratio_token_ids.index(ratio_token_id) + except ValueError as exc: + raise ValueError(f"Unknown ratio token id {ratio_token_id}") from exc + else: + ratio_index = ratio_token_id - tokenizer.ratio_token_id(0) + if ratio_index < 0 or ratio_index >= len(self.image_processor.vae_reso_group): + raise ValueError(f"ratio_index {ratio_index} out of range for vae_reso_group") + return ratio_index + + @torch.no_grad() + def generate( + 
self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + use_model_defaults: Optional[bool] = None, + generator: Optional[list[torch.Generator]] = None, + decode_text: bool = False, + verbose: int = 0, + stage_transitions: Optional[list[tuple[int, list[int]]]] = None, + final_stop_tokens: Optional[list[int]] = None, + **kwargs, + ): + gen_config = default(generation_config, self.generation_config) + mode = kwargs.get("mode", "gen_text") + output = kwargs["tokenizer_output"] + indices = torch.where(output.tokens[0] == self._tokenizer.encode("")[0])[0] + if indices.shape[0] > 0: + last_idx = indices[-1] + self.post_token_len = int(output.tokens[0].shape[0] - 1 - last_idx) + else: + self.post_token_len = None + # Log info + if verbose >= 1: + context = self._tokenizer.decode(output.tokens[0], skip_special_tokens=False) + # Replace ... with []{number} + img_token = self._tokenizer.get_img_token() + context = re.sub(f"({img_token})+", lambda m: f"[{img_token}]{{{len(m.group(0)) // 5}}}", context) + info_list = [ + ("token shape", output.tokens.shape), + ("context[0]", context), + ] + if mode == "gen_image": + if generator is not None: + info_list.extend([ + ("seed", [g.initial_seed() for g in generator]), + ]) + info_list.extend([ + ("image_size", + [f"{info.image_height}x{info.image_width}" for info in kwargs["batch_gen_image_info"]]), + ("infer_steps", gen_config.diff_infer_steps), + ("guidance_scale", gen_config.diff_guidance_scale), + ("flow_shift", gen_config.flow_shift), + ]) + else: + info_list.extend([ + ("do_sample", kwargs.get("do_sample", gen_config.do_sample)), + ("max_new_tokens", kwargs.get("max_new_tokens", gen_config.max_new_tokens)), + ("top_k", kwargs.get("top_k", gen_config.top_k)), + ("top_p", kwargs.get("top_p", gen_config.top_p)), + ("temperature", kwargs.get("temperature", gen_config.temperature)), + ("repetition_penalty", kwargs.get("repetition_penalty", gen_config.repetition_penalty)), + ]) + max_key_len = max(len(k) for k, _ in info_list) + info_str = "=" * 50 + \ + "\nModel input info:\n" + \ + "\n".join([f" {k.rjust(max_key_len)}: {v}" for k, v in info_list]) + \ + "\n--------------------------------------------------" + print(info_str, flush=True) + start_time = time.time() + + if mode == "gen_text": + if verbose >= 2 and streamer is None: + streamer = TextStreamer(self._tokenizer, skip_prompt=True, skip_special_tokens=False) # noqa + + with torch.autocast(device_type="cuda", dtype=self.dtype, enabled=self.dtype != torch.float32): + if stage_transitions is not None: + if final_stop_tokens is None: + raise ValueError("`final_stop_tokens` must be provided when `stage_transitions` is set.") + if logits_processor is None: + logits_processor = LogitsProcessorList() + elif not isinstance(logits_processor, LogitsProcessorList): + logits_processor = LogitsProcessorList(logits_processor) + input_ids = kwargs.get("input_ids") + if input_ids is None: + raise ValueError("`input_ids` must be provided for multi-stage generation.") + logits_processor.append( + 
self._StageTransitionLogitsProcessor(stage_transitions, input_ids.shape[0]) + ) + kwargs["eos_token_id"] = final_stop_tokens + + samples = super().generate( + inputs=inputs, + generation_config=gen_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + synced_gpus=synced_gpus, + assistant_model=assistant_model, + streamer=streamer, + negative_prompt_ids=negative_prompt_ids, + negative_prompt_attention_mask=negative_prompt_attention_mask, + use_model_defaults=use_model_defaults, + **kwargs, + ) + if decode_text: + samples = self.decode_text(samples, input_length=kwargs["input_ids"].shape[1]) + + elif mode == "gen_image": + batch_gen_image_info: list[ImageInfo] = kwargs.get("batch_gen_image_info") + if batch_gen_image_info is None: + raise ValueError("`batch_gen_image_info` should be provided when `mode` is `gen_image`.") + self.num_image_tokens = (batch_gen_image_info[0].image_token_length) + # + (1 if batch_gen_image_info[0].add_timestep_token else 0) + # + (1 if batch_gen_image_info[0].add_guidance_token else 0) ) + self.num_special_tokens = ((1 if batch_gen_image_info[0].add_timestep_token else 0) + + (1 if batch_gen_image_info[0].add_guidance_token else 0) + + (1 if batch_gen_image_info[0].add_timestep_r_token else 0) ) + results = self.pipeline( + batch_size=len(batch_gen_image_info), + image_size=[batch_gen_image_info[0].image_height, batch_gen_image_info[0].image_width], + num_inference_steps=gen_config.diff_infer_steps, + guidance_scale=gen_config.diff_guidance_scale, + generator=generator, + meanflow=self.config.use_meanflow, + model_kwargs=kwargs, + cfg_distilled = self.config.cfg_distilled, + ) + samples = results[0] + + else: + raise ValueError(f"Unknown mode {mode}, only `gen_text` and `gen_image` are supported.") + + if verbose >= 1: + end_time = time.time() + print(f"Generation completed in {end_time - start_time:.2f} seconds.", flush=True) # noqa + + return samples + + def decode_text(self, output: torch.Tensor, input_length: int = None): + if output.ndim == 2: + assert output.size(0) == 1, "Batch decoding is not supported yet." 
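+            # Decode each row independently, dropping the prompt prefix when `input_length` is given.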
+ return [self.decode_text(output_i, input_length) for output_i in output] + elif output.ndim == 1: + if input_length is not None: + output = output[input_length:] + text = self._tokenizer.decode(output) + return text + else: + raise ValueError(f"output should be 1D or 2D tensor, but got {output.ndim}D tensor.") + + @torch.no_grad() + def generate_image( + self, + prompt=None, + image=None, + message_list=None, + seed=None, + image_size="auto", + use_system_prompt=None, + system_prompt=None, + bot_task=None, + infer_align_image_size=False, + use_taylor_cache=False, + taylor_cache_interval=None, + taylor_cache_order=None, + taylor_cache_enable_first_enhance=None, + taylor_cache_first_enhance_steps=None, + taylor_cache_enable_tailing_enhance=None, + taylor_cache_tailing_enhance_steps=None, + taylor_cache_low_freqs_order=None, + taylor_cache_high_freqs_order=None, + **kwargs, + ): + max_new_tokens = kwargs.pop("max_new_tokens", 2048) + cot_text = kwargs.pop("cot_text", None) + + use_system_prompt = default(use_system_prompt, self.generation_config.use_system_prompt) + bot_task = default(bot_task, self.generation_config.bot_task) + system_prompt = get_system_prompt(use_system_prompt, bot_task, system_prompt) + system_prompt = system_prompt.strip() + + self.taylor_cache_interval = taylor_cache_interval + self.taylor_cache_order = taylor_cache_order + self.taylor_cache_enable_first_enhance = taylor_cache_enable_first_enhance + self.taylor_cache_first_enhance_steps = taylor_cache_first_enhance_steps + self.taylor_cache_enable_tailing_enhance = taylor_cache_enable_tailing_enhance + self.taylor_cache_tailing_enhance_steps = taylor_cache_tailing_enhance_steps + self.taylor_cache_low_freqs_order = taylor_cache_low_freqs_order + self.taylor_cache_high_freqs_order = taylor_cache_high_freqs_order + self.use_taylor_cache = False + + batch_cond_images_cache = None + tkw = self._tokenizer + need_ratio = image_size == "auto" or bot_task == "img_ratio" + if bot_task in ["think", "recaption", "think_recaption"]: + first_bot_task = bot_task.split("_")[0] + stage_transitions = [] + + if first_bot_task == "think" and "recaption" in bot_task: + stage_transitions.append( + (tkw.end_of_think_token_id, [tkw.convert_tokens_to_ids(tkw.recaption_token)]) + ) + + if need_ratio: + answer_prefix_tokens = [] + if getattr(self.generation_config, "sequence_template", "pretrain") == "instruct": + answer_prefix_tokens = [tkw.convert_tokens_to_ids(tkw.answer_token)] + image_base_size = self.image_processor.vae_reso_group.base_size + if "recaption" in bot_task: + transition_id = tkw.end_of_recaption_token_id + else: + transition_id = tkw.end_of_think_token_id + stage_transitions.append( + (transition_id, answer_prefix_tokens + [tkw.boi_token_id, tkw.size_token_id(image_base_size)]) + ) + final_stop_tokens = list(range(tkw.start_ratio_token_id, tkw.end_ratio_token_id + 1)) + for start, end in getattr(tkw, "ratio_token_other_slices", []): + final_stop_tokens.extend(range(start, end)) + else: + if "recaption" in bot_task: + final_stop_tokens = [tkw.end_of_recaption_token_id] + else: + final_stop_tokens = [tkw.end_of_think_token_id, tkw.end_of_recaption_token_id] + + model_inputs = self.prepare_model_inputs( + prompt=prompt, image=image, message_list=message_list, system_prompt=system_prompt, + max_new_tokens=max_new_tokens, mode="gen_text", bot_task=first_bot_task, + batch_cond_images=batch_cond_images_cache, infer_align_image_size=infer_align_image_size, + ) + batch_cond_images_cache = model_inputs['batch_cond_images'] + 
logits_processor = None + if need_ratio: + image_base_size = self.image_processor.vae_reso_group.base_size + logits_processor = LogitsProcessorList([ + self._ConditionalSliceVocabLogitsProcessor( + trigger_token_ids=[tkw.size_token_id(image_base_size)], + vocab_start=tkw.start_ratio_token_id, + vocab_end=tkw.end_ratio_token_id + 1, + other_slices=getattr(tkw, "ratio_token_other_slices", []), + force_greedy=True, + ) + ]) + + input_length = model_inputs["input_ids"].shape[1] + if stage_transitions: + outputs = self.generate( + **model_inputs, + decode_text=False, + stage_transitions=stage_transitions, + final_stop_tokens=final_stop_tokens, + logits_processor=logits_processor, + **kwargs, + ) + else: + outputs = self.generate(**model_inputs, decode_text=False, logits_processor=logits_processor, **kwargs) + + generated_tokens = outputs[:, input_length:] + if "recaption" in bot_task: + end_token_id = tkw.end_of_recaption_token_id + else: + end_token_id = tkw.end_of_think_token_id + end_positions = (generated_tokens[0] == end_token_id).nonzero(as_tuple=False) + if end_positions.numel() > 0: + end_pos = end_positions[0].item() + cot_tokens = generated_tokens[0, :end_pos + 1] + else: + cot_tokens = generated_tokens[0] + cot_text_gen = self._tokenizer.decode(cot_tokens) + + if first_bot_task == "think": + cot_text = [tkw.think_token + cot_text_gen] + else: + cot_text = [tkw.recaption_token + cot_text_gen] + + if self.generation_config.drop_think and tkw.think_token in cot_text[0]: + if tkw.recaption_token in cot_text[0]: + recaption_part = cot_text[0].split(tkw.recaption_token)[1] + if tkw.end_of_recaption_token in recaption_part: + recaption_part = recaption_part.split(tkw.end_of_recaption_token)[0] + cot_text = [tkw.recaption_token + recaption_part + tkw.end_of_recaption_token] + + if system_prompt: + system_prompt = get_system_prompt("en_recaption", bot_task) + + if need_ratio: + ratio_token_id = outputs[0, -1].item() # get the original ratio index from the generated tokens + ratio_index = self._get_ratio_index_from_token(ratio_token_id, tkw) + reso = self.image_processor.vae_reso_group[ratio_index] + image_size = reso.height, reso.width + + elif need_ratio: + self.image_processor.build_img_ratio_slice_logits_processor(self.tokenizer) + model_inputs = self.prepare_model_inputs( + prompt=prompt, image=image, cot_text=cot_text, message_list=message_list, max_new_tokens=1, + system_prompt=system_prompt, seed=seed, mode="gen_text", bot_task="img_ratio", + batch_cond_images=batch_cond_images_cache, infer_align_image_size=infer_align_image_size, + ) + batch_cond_images_cache = model_inputs['batch_cond_images'] + outputs = self.generate(**model_inputs, do_sample=False, logits_processor=self.image_processor.img_ratio_slice_logits_processor, **kwargs) + ratio_index = outputs[0, -1].item() + reso = self.image_processor.vae_reso_group[ratio_index] + image_size = reso.height, reso.width + + # Generate image + self.use_taylor_cache = use_taylor_cache + model_inputs = self.prepare_model_inputs( + prompt=prompt, image=image, cot_text=cot_text, message_list=message_list, system_prompt=system_prompt, + seed=seed, image_size=image_size, mode="gen_image", batch_cond_images=batch_cond_images_cache, + infer_align_image_size=infer_align_image_size, + ) + batch_cond_images_cache = model_inputs['batch_cond_images'] + outputs = self.generate(**model_inputs, **kwargs) + self.image_processor.postprocess_outputs( + outputs, + batch_cond_images=batch_cond_images_cache, + infer_align_image_size=infer_align_image_size, + ) + 
return cot_text, outputs + + +__all__ = [ + "HunyuanImage3ForCausalMM", + "HunyuanImage3Model", + "HunyuanImage3PreTrainedModel", + "TimestepEmbedder", + "UNetDown", + "UNetUp" + "CachedRoPE", + "apply_rotary_pos_emb", + "build_batch_2d_rope", +] + diff --git a/siglip2.py b/siglip2.py new file mode 100644 index 0000000000000000000000000000000000000000..0f76746501fde6542dfed15dc853833412d2af58 --- /dev/null +++ b/siglip2.py @@ -0,0 +1,570 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from typing import Optional, Tuple, Union +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask + + +class Config(object): + def __init__(self, config): + if config is not None: + for key, value in config.items(): + setattr(self, key, value) + + def __getitem__(self, key): + return getattr(self, key, None) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + +class Siglip2VisionEmbeddings(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Linear( + in_features=config.num_channels * self.patch_size * self.patch_size, + out_features=self.embed_dim, + ) + + self.num_patches = config.num_patches + self.position_embedding_size = int(self.num_patches**0.5) + self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim) + + @staticmethod + def resize_positional_embeddings( + positional_embeddings: torch.Tensor, + spatial_shapes: torch.LongTensor, + max_length: int, + ) -> torch.Tensor: + """ + Resize positional embeddings to image-specific size and pad to a fixed size. 
+ + Args: + positional_embeddings (`torch.Tensor`): + Position embeddings of shape (height, width, embed_dim) + spatial_shapes (`torch.LongTensor`): + Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to + max_length (`int`): + Maximum length of the positional embeddings to pad resized positional embeddings to + + Returns: + `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim) + """ + batch_size = spatial_shapes.shape[0] + embed_dim = positional_embeddings.shape[-1] + source_dtype = positional_embeddings.dtype + + resulted_positional_embeddings = torch.empty( + (batch_size, max_length, embed_dim), + device=positional_embeddings.device, + dtype=source_dtype, + ) + + # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation + positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0) + + # Upcast to float32 on CPU because antialias is not supported for bfloat16/float16 on CPU + if positional_embeddings.device.type == "cpu": + positional_embeddings = positional_embeddings.to(torch.float32) + + for i in range(batch_size): + # (1, dim, height, width) -> (1, dim, target_height, target_width) + height, width = spatial_shapes[i] + resized_embeddings = F.interpolate( + positional_embeddings, + size=(height, width), + mode="bilinear", + align_corners=False, + antialias=True, + ) + + # (1, dim, target_height, target_width) -> (target_height * target_width, dim) + resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1) + + # Cast to original dtype + resized_embeddings = resized_embeddings.to(source_dtype) + + resulted_positional_embeddings[i, : height * width] = resized_embeddings + resulted_positional_embeddings[i, height * width :] = resized_embeddings[0] + + return resulted_positional_embeddings + + def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor: + """ + Args: + pixel_values (`torch.FloatTensor`): + Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size) + spatial_shapes (`List[Tuple[int, int]]`): + Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to + """ + + # Apply patch embeddings to already patchified pixel values + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) + + # Get positional resized and padded positional embeddings + positional_embeddings = self.position_embedding.weight.reshape( + self.position_embedding_size, self.position_embedding_size, -1 + ) + resized_positional_embeddings = self.resize_positional_embeddings( + positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1] + ) + + # Add positional embeddings to patch embeddings + embeddings = patch_embeds + resized_positional_embeddings + return embeddings + + +class Siglip2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, " + f"but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + +class Siglip2SdpaAttention(Siglip2Attention): + """ + Siglip2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Siglip2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt + to SDPA API. + """ + + is_causal = False + + # Adapted from Siglip2Attention.forward and transformers.models.llama.modeling_llama.LlamaSdpaAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` + # once this is implemented. + warnings.warn( + "Siglip2Model is using Siglip2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` " + "does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. 
' + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with + # custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an + # inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph options. + # An inline conditional prevents dynamic shapes from compiling. + is_causal = True if self.is_causal and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + +class Siglip2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Siglip2EncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Siglip2Attention(config=config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = Siglip2MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very + large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Siglip2Encoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Siglip2EncoderLayer`]. + + Args: + config: Siglip2Config + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = True + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for layer_index, encoder_layer in enumerate(self.layers): # len(self.layers): 27 + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class Siglip2MultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, config): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = Siglip2MLP(config) + self.num_heads = config.num_attention_heads + + def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + if attention_mask is not None: + target_len, source_len = probe.shape[1], hidden_state.shape[1] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len) + attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1) + attention_mask = attention_mask.reshape(-1, target_len, source_len) + + hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +class Siglip2VisionTransformer(nn.Module): + def __init__(self, config): + super().__init__() + config = Config(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = Siglip2VisionEmbeddings(config) + self.encoder = Siglip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head + if self.use_head: + self.head = Siglip2MultiheadAttentionPoolingHead(config) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + def forward( + self, + pixel_values: torch.FloatTensor, + attention_mask: torch.Tensor, + spatial_shapes: torch.LongTensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values, spatial_shapes) + + if attention_mask is not None and not self._use_flash_attention_2: + # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + else: + encoder_attention_mask = attention_mask + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None + if not return_dict: + return (last_hidden_state, pooler_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooler_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class LightProjector(nn.Module): + def __init__(self, config): + config = Config(config) + super().__init__() + + if config.projector_type == "linear": + modules = nn.Linear(config.input_dim, config.n_embed) + + elif config.projector_type == "mlp_gelu": + modules = [nn.Linear(config.input_dim, config.n_embed)] + for _ in range(1, config.depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.n_embed, config.n_embed)) + modules = nn.Sequential(*modules) + + else: + raise ValueError(f"Unknown projector type: {config.projector_type}") + + self.layers = modules + + def forward(self, x): + return self.layers(x) + + +__all__ = [ + "Siglip2VisionTransformer", + "LightProjector", +] diff --git a/system_prompt.py b/system_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..29385305760367b0594f4e01799fbc2b4ac3c6f3 --- /dev/null +++ b/system_prompt.py @@ -0,0 +1,206 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +t2i_system_prompt_en_vanilla = """ +You are an advanced AI text-to-image generation system. Given a detailed text prompt, your task is to create a high-quality, visually compelling image that accurately represents the described scene, characters, or objects. Pay careful attention to style, color, lighting, perspective, and any specific instructions provided. +""" + +# 775 +t2i_system_prompt_en_recaption = """ +You are a world-class image generation prompt expert. Your task is to rewrite a user's simple description into a **structured, objective, and detail-rich** professional-level prompt. + +The final output must be wrapped in `` tags. 
+ +### **Universal Core Principles** + +When rewriting the prompt (inside the `` tags), you must adhere to the following principles: + +1. **Absolute Objectivity**: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad". Convey aesthetic qualities through specific descriptions of color, light, shadow, and composition. +2. **Physical and Logical Consistency**: All scene elements (e.g., gravity, light, shadows, reflections, spatial relationships, object proportions) must strictly adhere to real-world physics and common sense. For example, tennis players must be on opposite sides of the net; objects cannot float without a cause. +3. **Structured Description**: Strictly follow a logical order: from general to specific, background to foreground, and primary to secondary elements. Use directional terms like "foreground," "mid-ground," "background," and "left side of the frame" to clearly define the spatial layout. +4. **Use Present Tense**: Describe the scene from an observer's perspective using the present tense, such as "A man stands..." or "Light shines on..." +5. **Use Rich and Specific Descriptive Language**: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects, subjects, and text. Vague expressions are strictly prohibited. + +If the user specifies a style (e.g., oil painting, anime, UI design, text rendering), strictly adhere to that style. Otherwise, first infer a suitable style from the user's input. If there is no clear stylistic preference, default to an **ultra-realistic photographic style**. Then, generate the detailed rewritten prompt according to the **Style-Specific Creation Guide** below: + +### **Style-Specific Creation Guide** + +Based on the determined artistic style, apply the corresponding professional knowledge. + +**1. Photography and Realism Style** +* Utilize professional photography terms (e.g., lighting, lens, composition) and meticulously detail material textures, physical attributes of subjects, and environmental details. + +**2. Illustration and Painting Style** +* Clearly specify the artistic school (e.g., Japanese Cel Shading, Impasto Oil Painting) and focus on describing its unique medium characteristics, such as line quality, brushstroke texture, or paint properties. + +**3. Graphic/UI/APP Design Style** +* Objectively describe the final product, clearly defining the layout, elements, and color palette. All text on the interface must be enclosed in double quotes `""` to specify its exact content (e.g., "Login"). Vague descriptions are strictly forbidden. + +**4. Typographic Art** +* The text must be described as a complete physical object. The description must begin with the text itself. Use a straightforward front-on or top-down perspective to ensure the entire text is visible without cropping. + +### **Final Output Requirements** + +1. **Output the Final Prompt Only**: Do not show any thought process, Markdown formatting, or line breaks. +2. **Adhere to the Input**: You must retain the core concepts, attributes, and any specified text from the user's input. +3. **Style Reinforcement**: Mention the core style 3-5 times within the prompt and conclude with a style declaration sentence. +4. **Avoid Self-Reference**: Describe the image content directly. Remove redundant phrases like "This image shows..." or "The scene depicts..." +5. **The final output must be wrapped in `xxxx` tags.** + +The user will now provide an input prompt. You will provide the expanded prompt. 
+""" + +# 890 +t2i_system_prompt_en_think_recaption = """ +You will act as a top-tier Text-to-Image AI. Your core task is to deeply analyze the user's text input and transform it into a detailed, artistic, and fully user-intent-compliant image. + +Your workflow is divided into two phases: + +1. Thinking Phase (): In the tag, you need to conduct a structured thinking process, progressively breaking down and enriching the constituent elements of the image. This process must include, but is not limited to, the following dimensions: + +Subject: Clearly define the core character(s) or object(s) in the scene, including their appearance, posture, expression, and emotion. +Composition: Set the camera angle and layout, such as close-up, long shot, bird's-eye view, golden ratio composition, etc. +Environment/Background: Describe the scene where the subject is located, including the location, time of day, weather, and other elements in the background. +Lighting: Define the type, direction, and quality of the light source, such as soft afternoon sunlight, cool tones of neon lights, dramatic Rembrandt lighting, etc., to create a specific atmosphere. +Color Palette: Set the main color tone and color scheme of the image, such as vibrant and saturated, low-saturation Morandi colors, black and white, etc. +Quality/Style: Determine the artistic style and technical details of the image. This includes user-specified styles (e.g., anime, oil painting) or the default realistic style, as well as camera parameters (e.g., focal length, aperture, depth of field). +Details: Add minute elements that enhance the realism and narrative quality of the image, such as a character's accessories, the texture of a surface, dust particles in the air, etc. + + +2. Recaption Phase (): In the tag, merge all the key details from the thinking process into a coherent, precise, and visually evocative final description. This description is the direct instruction for generating the image, so it must be clear, unambiguous, and organized in a way that is most suitable for an image generation engine to understand. + +Absolutely Objective: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad." Convey aesthetic sense through concrete descriptions of colors, light, shadow, and composition. + +Physical and Logical Consistency: All scene elements (e.g., gravity, light and shadow, reflections, spatial relationships, object proportions) must strictly adhere to the physical laws of the real world and common sense. For example, in a tennis match, players must be on opposite sides of the net; objects cannot float without reason. + +Structured Description: Strictly follow a logical order: from whole to part, background to foreground, and primary to secondary. Use directional words like "foreground," "mid-ground," "background," "left side of the frame" to clearly define the spatial layout. + +Use Present Tense: Describe from an observer's perspective using the present tense, such as "a man stands," "light shines on..." +Use Rich and Specific Descriptive Language: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects/characters/text. Absolutely avoid any vague expressions. + + +Output Format: +Thinking processRefined image descriptionGenerate Image + + +You must strictly adhere to the following rules: + +1. Faithful to Intent, Reasonable Expansion: You can creatively add details to the user's description to enhance the image's realism and artistic quality. 
However, all additions must be highly consistent with the user's core intent and never introduce irrelevant or conflicting elements. +2. Style Handling: When the user does not specify a style, you must default to an "Ultra-realistic, Photorealistic" style. If the user explicitly specifies a style (e.g., anime, watercolor, oil painting, cyberpunk, etc.), both your thinking process and final description must strictly follow and reflect that specified style. +3. Text Rendering: If specific text needs to appear in the image (such as words on a sign, a book title), you must enclose this text in English double quotes (""). Descriptive text must not use double quotes. +4. Design-related Images: You need to specify all text and graphical elements that appear in the image and clearly describe their design details, including font, color, size, position, arrangement, visual effects, etc. +""" + +t2i_system_prompts = { + "en_vanilla": [t2i_system_prompt_en_vanilla], + "en_recaption": [t2i_system_prompt_en_recaption], + "en_think_recaption": [t2i_system_prompt_en_think_recaption] +} + + +unified_system_prompt_en = """You are an advanced multimodal model whose core mission is to analyze user intent and generate high-quality text and images. + +#### Four Core Capabilities +1. **Text-to-Text (T2T):** Generate coherent text responses from text prompts. +2. **Text-to-Image (T2I):** Generate high-quality images from text prompts. +3. **Text & Image to Text (TI2T):** Generate accurate text responses based on a combination of images and text. +4. **Text & Image to Image (TI2I):** Generate modified images based on a reference image and editing instructions. + +--- +### Image Generation Protocol (for T2I & TI2I) +You will operate in one of two modes, determined by the user's starting tag: +#### ** Mode (Prompt Rewriting)**: +* **Trigger:** Input begins with ``. +* **Task:** Immediately rewrite the user's text into a structured, objective, and detail-rich professional-grade prompt. +* **Output:** Output only the rewritten prompt within `` tags: `Rewritten professional-grade prompt` + +#### ** Mode (Think + Rewrite)**: +* **Trigger:** Input begins with ``. +* **Task:** First, conduct a structured analysis of the request within `` tags. Then, output the professional prompt, rewritten based on the analysis, within `` tags. +* **Output:** Strictly adhere to the format: `Analysis processRewritten prompt` + +--- +### Execution Standards and Guidelines +#### **`` Phase: Analysis Guidelines** +**For T2I (New Image Generation):** +Deconstruct the user's request into the following core visual components: +* **Subject:** Key features of the main character/object, including appearance, pose, expression, and emotion. +* **Composition:** Camera angle, lens type, and layout. +* **Environment/Background:** The setting, time of day, weather, and background elements. +* **Lighting:** Technical details such as light source type, direction, and quality. +* **Color Palette:** The dominant hues and overall color scheme. +* **Style/Quality:** The artistic style, clarity, depth of field, and other technical details. +* **Text:** Identify any text to be rendered in the image, including its content, style, and position. +* **Details:** Small elements that add narrative depth and realism. + +**For TI2I (Image Editing):** +Adopt a task-diagnostic approach: +1. **Diagnose Task:** Identify the edit type and analyze key requirements. +2. 
**Prioritize Analysis:** + * **Adding:** Analyze the new element's position and appearance, ensuring seamless integration with the original image's lighting, shadows, and style. + * **Removing:** Identify the target for removal and determine how to logically fill the resulting space using surrounding textures and lighting. + * **Modifying:** Analyze what to change and what it should become, while emphasizing which elements must remain unchanged. + * **Style Transfer:** Deconstruct the target style into specific features (e.g., brushstrokes, color palette) and apply them to the original image. + * **Text Editing:** Ensure correct content and format. Consider the text's visual style (e.g., font, color, material) and how it adapts to the surface's perspective, curvature, and lighting. + * **Reference Editing:** Extract specific visual elements (e.g., appearance, posture, composition, lines, depth) from the reference image to generate an image that aligns with the text description while also incorporating the referenced content. + * **Inferential Editing:** Identify vague requests (e.g., "make it more professional") and translate them into concrete visual descriptions. + +#### `` Phase: Professional-Grade Prompt Generation Rules +**General Rewriting Principles (for T2I & TI2I):** +1. **Structure & Logic:** Start with a global description. Use positional words (e.g., "foreground", "background") to define the layout. +2. **Absolute Objectivity:** Avoid subjective terms. Convey aesthetics through precise descriptions of color, light, shadow, and materials. +3. **Physical & Logical Consistency:** Ensure all descriptions adhere to the laws of physics and common sense. +4. **Fidelity to User Intent:** Preserve the user's core concepts, subjects, and attributes. Text to be rendered in the image **must be enclosed in double quotes ("")**. +5. **Camera & Resolution:** Translate camera parameters into descriptions of visual effects. Convert resolution information into natural language. + +**T2I-Specific Guidelines:** +* **Style Adherence & Inference:** Strictly follow the specified style. If none is given, infer the most appropriate style and detail it using professional terminology. +* **Style Detailing:** + * **Photography/Realism:** Use professional photography terms to describe lighting, lens effects, and material textures. + * **Painting/Illustration:** Specify the art movement or medium's characteristics. + * **UI/Design:** Objectively describe the final product. Define layout, elements, and typography. Text content must be specific and unambiguous. + +**TI2I-Specific Guidelines:** +* **Preserve Unchanged Elements:** Emphasize elements that **remain unchanged**. Unless explicitly instructed, never alter a character's identity/appearance, the core background, camera angle, or overall style. +* **Clear Editing Instructions:** + * **Replacement:** Use the logic "**replace B with A**," and provide a detailed description of A. + * **Addition:** Clearly state what to add, where, and what it looks like. +* **Unambiguous Referencing:** Avoid vague references (e.g., "that person"). Use specific descriptions of appearance. 
+""" + + +def get_system_prompt(sys_type, bot_task, system_prompt=None): + if sys_type == 'None': + return None + elif sys_type == "en_unified": + return unified_system_prompt_en + elif sys_type in ['en_vanilla', 'en_recaption', 'en_think_recaption']: + return t2i_system_prompts[sys_type][0] + elif sys_type == "dynamic": + if bot_task == "think": + return t2i_system_prompts["en_think_recaption"][0] + elif bot_task == "recaption": + return t2i_system_prompts["en_recaption"][0] + elif bot_task == "image": + return t2i_system_prompts["en_vanilla"][0].strip("\n") + else: + return system_prompt + elif sys_type == 'custom': + return system_prompt + else: + raise NotImplementedError(f"Unsupported system prompt type: {sys_type}") + + +__all__ = [ + "get_system_prompt" +] diff --git a/tokenization_hunyuan_image_3.py b/tokenization_hunyuan_image_3.py new file mode 100644 index 0000000000000000000000000000000000000000..71d2ebd22bd01bb8cc0d1af75a18252f1a8d45aa --- /dev/null +++ b/tokenization_hunyuan_image_3.py @@ -0,0 +1,1773 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import random +from collections import defaultdict +from copy import deepcopy +from dataclasses import dataclass +from enum import IntEnum, auto +from typing import List, Tuple, Dict +from typing import Optional, Union, Any + +import numpy as np +import torch +import torch.nn.functional as F +from diffusers.utils import BaseOutput + +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +def default(value, default_value): + return value if value is not None else default_value + + +def ensure_list(value): + if value is None: + return [] + if isinstance(value, (list, tuple)): + return list(value) + return [value] + + +class Resolution(object): + def __init__(self, size, *args): + if isinstance(size, str): + if 'x' in size: + size = size.split('x') + size = (int(size[0]), int(size[1])) + else: + size = int(size) + if len(args) > 0: + size = (size, args[0]) + if isinstance(size, int): + size = (size, size) + + self.h = self.height = size[0] + self.w = self.width = size[1] + self.r = self.ratio = self.height / self.width + + def __getitem__(self, idx): + if idx == 0: + return self.h + elif idx == 1: + return self.w + else: + raise IndexError(f'Index {idx} out of range') + + def __str__(self): + return f'{self.h}x{self.w}' + + +class ResolutionGroup(object): + def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None): + self.align = align + self.base_size = base_size + assert base_size % align == 0, f'base_size {base_size} is not divisible by align {align}' + if base_size is not None and not isinstance(base_size, int): + raise ValueError(f'base_size must be None or int, but got {type(base_size)}') + if step is None: + step = base_size // 16 + if step is not None and step > base_size // 2: + raise ValueError(f'step must be smaller than base_size // 2, but got {step} > 
{base_size // 2}') + + self.step = step + self.data = self._calc_by_step() + + if extra_resolutions is not None: + for extra_resolution in extra_resolutions: + height, width = extra_resolution.height, extra_resolution.width + ratio = height / width + flag = True + for resolution in self.data: + if resolution.ratio == ratio: + flag = False + break + if flag: + self.data.append(extra_resolution) + + self.ratio = np.array([x.ratio for x in self.data]) + self.attr = ['' for _ in range(len(self.data))] + self.prefix_space = 0 + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + def __repr__(self): + prefix = self.prefix_space * ' ' + prefix_close = (self.prefix_space - 4) * ' ' + res_str = f'ResolutionGroup(base_size={self.base_size}, step={self.step}, data=' + attr_maxlen = max([len(x) for x in self.attr] + [5]) + res_str += \ + f'\n{prefix}ID: height width ratio {" " * max(0, attr_maxlen - 4)}count h/16 w/16 tokens\n{prefix}' + res_str += \ + ('\n' + prefix).join([f'{i:2d}: ({x.h:4d}, {x.w:4d}) {self.ratio[i]:.4f} {self.attr[i]:>{attr_maxlen}s} ' + f'({x.h // 16:3d}, {x.w // 16:3d}) {x.h // 16 * x.w // 16:6d}' + for i, x in enumerate(self.data)]) + res_str += f'\n{prefix_close})' + return res_str + + def _calc_by_step(self): + assert self.align <= self.step, f'align {self.align} must be smaller than step {self.step}' + + min_height = self.base_size // 2 + min_width = self.base_size // 2 + max_height = self.base_size * 2 + max_width = self.base_size * 2 + + resolutions = [Resolution(self.base_size, self.base_size)] + + cur_height, cur_width = self.base_size, self.base_size + while True: + if cur_height >= max_height and cur_width <= min_width: + break + + cur_height = min(cur_height + self.step, max_height) + cur_width = max(cur_width - self.step, min_width) + resolutions.append(Resolution(cur_height // self.align * self.align, cur_width // self.align * self.align)) + + cur_height, cur_width = self.base_size, self.base_size + while True: + if cur_height <= min_height and cur_width >= max_width: + break + + cur_height = max(cur_height - self.step, min_height) + cur_width = min(cur_width + self.step, max_width) + resolutions.append(Resolution(cur_height // self.align * self.align, cur_width // self.align * self.align)) + + resolutions = sorted(resolutions, key=lambda x: x.ratio) + + return resolutions + + def get_target_size(self, width, height): + ratio = height / width + idx = np.argmin(np.abs(self.ratio - ratio)) + reso = self.data[idx] + return reso.w, reso.h + + def get_base_size_and_ratio_index(self, width, height): + ratio = height / width + idx = np.argmin(np.abs(self.ratio - ratio)) + return self.base_size, idx + + +class ImageInfo: + """ Class to store image information for processing and generation. 
""" + + def __init__( + self, + image_type: str = None, + image_tensor: torch.Tensor = None, + image_width: int = None, + image_height: int = None, + token_width: int = None, + token_height: int = None, + image_token_length: int = None, + base_size: int = None, + ratio_index: int = None, + ori_image_width: int = None, + ori_image_height: int = None, + **kwargs, + ): + self.image_type = image_type + self.image_tensor = image_tensor + self.ori_image_width = ori_image_width + self.image_width = image_width + self.w = image_width + self.ori_image_height = ori_image_height + self.image_height = image_height + self.h = image_height + self.token_width = token_width + self.tk_w = token_width + self.token_height = token_height + self.tk_h = token_height + self.image_token_length = default( + image_token_length, + token_width * token_height if token_width is not None and token_height is not None else None + ) + self.base_size = base_size + self.ratio_index = ratio_index + + self.add_timestep_token = kwargs.get("add_timestep_token", True) + self.add_guidance_token = kwargs.get("add_guidance_token", False) + self.use_front_boi_token = kwargs.get("use_front_boi_token", True) + self.add_image_shape_token = kwargs.get("add_image_shape_token", True) + self.add_timestep_r_token = kwargs.get("add_timestep_r_token", False) + + def __getitem__(self, key: str) -> Any: + """Allow dictionary-like access to attributes.""" + if hasattr(self, key): + return getattr(self, key) + raise KeyError(f"Key '{key}' not found in ImageInfo") + + def __setitem__(self, key: str, value: Any) -> None: + """Allow dictionary-like assignment to attributes.""" + if hasattr(self, key): + setattr(self, key, value) + else: + raise KeyError(f"Key '{key}' not found in ImageInfo") + + def __contains__(self, key: str) -> bool: + """Check if the key exists in the ImageInfo object.""" + return hasattr(self, key) + + def __repr__(self): + return (f"ImageInfo(image_type={self.image_type}, image_tensor={self.image_tensor}, " + f"ori_image_width={self.ori_image_width}, ori_image_height={self.ori_image_height}, " + f"image_width={self.image_width}, image_height={self.image_height}, " + f"token_width={self.token_width}, token_height={self.token_height}, " + f"image_token_length={self.image_token_length}, " + f"base_size={self.base_size}, ratio_index={self.ratio_index}") + + @property + def meta_info(self): + # Used for image sections of tkwrapper.encode_general() + if self.image_type in ["vae", "gen_image"]: + return dict( + token_length=self.image_token_length, + add_timestep_token=self.add_timestep_token, + add_guidance_token=self.add_guidance_token, + add_timestep_r_token=self.add_timestep_r_token, + use_front_boi_token=self.use_front_boi_token, + add_image_shape_token=self.add_image_shape_token, + base_size=self.base_size, + ratio_idx=self.ratio_index, + # for rope 2d + token_height=self.token_height, + token_width=self.token_width, + # for bc + image_height=self.image_height, + image_width=self.image_width, + ori_image_width=self.ori_image_width, + ori_image_height=self.ori_image_height, + ) + elif self.image_type in ["vit", "siglip2"]: + return dict( + token_length=self.image_token_length, + use_front_boi_token=self.use_front_boi_token, + add_image_shape_token=self.add_image_shape_token, + # for rope 2d + token_height=self.token_height, + token_width=self.token_width, + # for bc + image_height=self.image_height, + image_width=self.image_width, + ori_image_width=self.ori_image_width, + ori_image_height=self.ori_image_height, + ) + else: + 
raise ValueError(f"Unknown image type '{self.image_type}'") + + @property + def num_special_tokens(self): + if self.args is None: + raise ValueError("meta_info requires `args` attribute to be set.") + if self.image_type in ["vae", "src_image", "gen_image"]: + count = ( + 2 + # + or + + (1 if self.add_timestep_token else 0) + + (1 if self.add_guidance_token else 0) + + (1 if self.add_timestep_r_token else 0) + + (2 if self.add_image_shape_token else 0) + ) + else: + raise ValueError(f"Unknown image_type: {self.image_type}") + return count + + def copy(self, copy_image_tensor=True): + if copy_image_tensor and self.image_tensor is None: + raise ValueError("image_tensor is None, cannot copy") + return ImageInfo( + image_type=self.image_type, + image_tensor=self.image_tensor.clone() if copy_image_tensor else None, + image_width=self.image_width, + image_height=self.image_height, + ori_image_width=self.ori_image_width, + ori_image_height=self.ori_image_height, + token_width=self.token_width, + token_height=self.token_height, + image_token_length=self.image_token_length, + base_size=self.base_size, + ratio_index=self.ratio_index, + ) + + def zeros_(self): + self.image_tensor = torch.zeros_like(self.image_tensor) + + +class ImageTensor(torch.Tensor): + # This class is just for type hinting purposes. Attribute `i` should be defined + # as an instance attribute of the torch.Tensor instance, like: tensor.i = ImageInfo(...) + i: ImageInfo + vision_encoder_kwargs: dict + + +class JointImageInfo(object): + def __init__(self, vae_image_info: ImageInfo, vision_image_info: ImageInfo, vision_encoder_kwargs: dict = None): + self.vae_image_info = vae_image_info + self.vision_image_info = vision_image_info + self.vision_encoder_kwargs = vision_encoder_kwargs + + # Define key attributes to align with ImageInfo for uniformity + self.image_type = "joint_image" + self.image_token_length = vae_image_info.image_token_length + vision_image_info.image_token_length + + self.add_timestep_token = vae_image_info.add_timestep_token + self.use_front_boi_token = vae_image_info.use_front_boi_token + self.add_image_shape_token = vae_image_info.add_image_shape_token + + def __repr__(self): + return f"JointImageInfo(vae_image={self.vae_image_info}, vision_image={self.vision_image_info})" + + @property + def meta_info(self): + # Used for image sections of tkwrapper.encode_general() + return dict( + token_length=[self.vae_image_info.image_token_length, self.vision_image_info.image_token_length], + add_timestep_token=self.add_timestep_token, + use_front_boi_token=self.use_front_boi_token, + add_image_shape_token=self.add_image_shape_token, + base_size=self.vae_image_info.base_size, + ratio_idx=self.vae_image_info.ratio_index, + # for rope 2d + token_height=[self.vae_image_info.token_height, self.vision_image_info.token_height], + token_width=[self.vae_image_info.token_width, self.vision_image_info.token_width], + # for bc + image_height=[self.vae_image_info.image_height, self.vision_image_info.image_height], + image_width=[self.vae_image_info.image_width, self.vision_image_info.image_width], + ) + + @property + def num_special_tokens(self): + return ( + 2 + # + + (1 if self.add_timestep_token else 0) + + (2 if self.add_image_shape_token else 0) + + 1 # + ) + + def copy(self, copy_image_tensor=True): + if copy_image_tensor and ( + self.vae_image_info.image_tensor is None or self.vision_image_info.image_tensor is None): + raise ValueError("image_tensor is None, cannot copy") + return JointImageInfo( + 
self.vae_image_info.copy(copy_image_tensor), + self.vision_image_info.copy(copy_image_tensor), + self.vision_encoder_kwargs, + ) + + def zeros_(self): + self.vae_image_info.zeros_() + self.vision_image_info.zeros_() + + +class CondImage(object): + def __init__(self, image_type: str, vae_image: ImageTensor, vit_image: ImageTensor): + self.image_type = image_type + self.vae_image = vae_image + self.vit_image = vit_image + + if image_type == "vae": + self.i = vae_image.i + self.section_type = "cond_vae_image" + + elif image_type == "vit": + self.i = vit_image.i + self.section_type = "cond_vit_image" + + elif image_type == "vae_vit": + self.i = JointImageInfo(vae_image.i, vit_image.i) + self.section_type = "cond_joint_image" + + else: + raise ValueError(f"Unknown image_type: {image_type}") + + +class TokenizerEncodeOutput(BaseOutput): + tokens: torch.Tensor = None + text_slices: Optional[list[slice]] = None + vae_image_slices: Optional[list[slice]] = None + gen_image_slices: Optional[list[slice]] = None + vit_image_slices: Optional[list[slice]] = None + joint_image_slices: Optional[list[slice]] = None + all_image_slices: Optional[list[slice]] = None + text_mask: Optional[torch.Tensor] = None + vae_image_mask: Optional[torch.Tensor] = None + gen_image_mask: Optional[torch.Tensor] = None + vit_image_mask: Optional[torch.Tensor] = None + real_pos: Optional[torch.Tensor] = None + guidance_scatter_index: Optional[torch.Tensor] = None + cond_timestep_scatter_index: Optional[torch.Tensor] = None + gen_timestep_scatter_index: Optional[torch.Tensor] = None + gen_timestep_r_scatter_index: Optional[torch.Tensor] = None + + +class SeparatorStyle(IntEnum): + ADD_COLON_SPACE_SINGLE = auto() + NONE = auto() + + +@dataclass +class Conversation(object): + name: str + system_template: str = "{system_message}" + system_message: str = "" + roles: Tuple[str, str] = ("User", "Assistant") + messages: List[List[str]] = () + sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SPACE_SINGLE + sep: str = "\n" + sep2: str = None + sep_sp: str = None + stop_token_ids: list[int] = None + + def get_prompt(self, return_type="str", add_system=True): + system_prompt = self.system_template.format(system_message=self.system_message) + prompt_list = [] + + if self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + seps = [self.sep, self.sep2] + if add_system: + prompt_list.append(("System", system_prompt + self.sep_sp if system_prompt else "")) + for i, (role, message) in enumerate(self.messages): + if message: + prompt_list.append((role, f"{role}: {message}{seps[i % 2]}")) + else: + prompt_list.append((role, f"{role}: ")) + + elif self.sep_style == SeparatorStyle.NONE: + seps = [self.sep, self.sep2] + if add_system: + prompt_list.append(("System", system_prompt + self.sep_sp if system_prompt else "")) + for i, (role, message) in enumerate(self.messages): + if message: + prompt_list.append((role, f"{role}{message}{seps[i % 2]}")) + else: + prompt_list.append((role, f"{role}")) + else: + raise NotImplementedError(f"Unsupported sep_style: {self.sep_style}") + + if return_type == "str": + prompt = "".join([msg for _, msg in prompt_list]) + else: + prompt = prompt_list + + return prompt + + def get_role_prefix(self, role): + if self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + return f"{role}: " + elif self.sep_style == SeparatorStyle.NONE: + return f"{role}" + else: + raise NotImplementedError(f"Unsupported sep_style: {self.sep_style}") + + def set_system_message(self, system_message: str): + """Set the system 
message.""" + self.system_message = system_message + + def add_message(self, role: str, message: str): + """Append a new message.""" + self.messages.append([role, message]) + + def copy(self): + return deepcopy(self) + + def empty(self, name=None): + """Return an empty conversation with the same template.""" + return Conversation( + name=name or self.name, + system_template=self.system_template, + system_message="", + roles=self.roles, + messages=[], + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + sep_sp=self.sep_sp, + stop_token_ids=self.stop_token_ids, + ) + + +# A global registry for all conversation templates +conv_templates: Dict[str, Conversation] = {} + + +def register_conv_template(template: Conversation, override: bool = False): + """Register a new conversation template.""" + if not override: + assert ( + template.name not in conv_templates + ), f"{template.name} has been registered." + + conv_templates[template.name] = template + + +register_conv_template( + Conversation( + name="hunyuan-image-3", + system_template="{system_message}", + system_message="", + roles=("User", "Assistant"), + messages=[], + sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE, + sep="\n\n", + sep2="<|endoftext|>", + sep_sp="\n\n", + stop_token_ids=[127957], + ) +) + + +def get_conversation_template(name: str) -> Conversation: + """Get a conversation template.""" + return conv_templates[name].copy() + + +class HunyuanImage3TokenizerFast(PreTrainedTokenizerFast): + """ + Tokenizer for Hunyuan Multimodal models, utilizing a fast tokenizer backend. + This tokenizer extends the PreTrainedTokenizerFast from Hugging Face Transformers + for multimodal tasks. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # A convenience mapping for special tokens + special_tokens = self.special_tokens_map.get('additional_special_tokens', []) + if len(special_tokens) > 0: + special_token_ids = self.convert_tokens_to_ids(special_tokens) + self._sp_dict = dict(zip(special_tokens, special_token_ids)) + else: + self._sp_dict = dict() + + # Assign commonly used special tokens to attributes for easy access. + self.setup_special_tokens() + + # Define decorator section + self.conversation_template = kwargs.get("conversation_template", "hunyuan-image-3") + self.conversation = get_conversation_template(self.conversation_template) + self.sequence_template = kwargs.get("sequence_template", "instruct") + self.decorator_section = DecoratorSections( + self, + conv=self.conversation, + sequence_template=self.sequence_template, + ) + + def setup_special_tokens(self): + # Define names for commonly used special tokens + predefined_name_mapping = dict( + boi="", + eoi="", + boa="", + eoa="", + bov="", + eov="", + img="", + audio="