Flourish committed on
Commit 44cb7bc · verified · 1 Parent(s): f2c6f36

Upload 14 files

ovis_image/__init__.py ADDED
@@ -0,0 +1,44 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
from ovis_image.model.args import OvisImageModelArgs
from ovis_image.model.autoencoder import AutoEncoderParams
from ovis_image.model.model import OvisImageModel

__all__ = [
    "OvisImageModelArgs",
    "OvisImageModel",
    "ovis_image_configs",
]


ovis_image_configs = {
    "ovis-image-7b": OvisImageModelArgs(
        in_channels=64,
        out_channels=64,
        context_in_dim=2048,
        hidden_size=3072,
        mlp_ratio=4.0,
        num_heads=24,
        depth=6,
        double_block_type="DoubleStreamBlock",
        depth_single_blocks=27,
        axes_dim=(16, 56, 56),
        theta=10_000,
        qkv_bias=True,
        activation="swiglu",
        autoencoder_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=(1, 2, 4, 4),
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
}
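A minimal usage sketch for the registry above (an illustration, not part of the committed files): look up the released hyperparameters by name and build the DiT backbone. Note this constructs the full ~7B-parameter network with random weights, so in float32 it needs on the order of 30 GB of host memory; checkpoints are loaded separately.

from ovis_image import OvisImageModel, ovis_image_configs

args = ovis_image_configs["ovis-image-7b"]
model = OvisImageModel(args)                        # random weights, architecture only
print(sum(p.numel() for p in model.parameters()))   # roughly 7e9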
ovis_image/dataset/image_util.py ADDED
@@ -0,0 +1,125 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
import math
import torch
import torchvision
from torchvision import transforms
from einops import rearrange, repeat


def ceil_to(x, factor=16):
    return math.ceil(float(x) / factor) * factor


def build_img_ids(
    latent_height,
    latent_width,
    latent_crop_height=None,
    latent_crop_width=None,
    time=0,
):
    if latent_crop_height is None:
        latent_crop_height = latent_height
    if latent_crop_width is None:
        latent_crop_width = latent_width
    img_ids = torch.zeros(latent_height, latent_width, 3)
    img_ids[..., 1] = img_ids[..., 1] + torch.arange(latent_height)[:, None]
    img_ids[..., 2] = img_ids[..., 2] + torch.arange(latent_width)[None, :]
    # center-crop the id grid
    crop_h = (latent_height - latent_crop_height) // 2
    crop_w = (latent_width - latent_crop_width) // 2
    img_ids = img_ids[crop_h:crop_h + latent_crop_height, crop_w:crop_w + latent_crop_width]
    img_ids[..., 0] = time
    h, w, c = img_ids.shape
    img_ids = img_ids.reshape(h * w, c)
    return img_ids


def process_pil_img_to_tensor(
    pil_img,
    output_size: int | None = 256,
    output_width: int | None = None,
    output_height: int | None = None,
    with_position_ids: bool = False,
    position_ids_time: int = 0,
):
    width, height = pil_img.size
    if output_width is None or output_height is None:
        output_width = output_size
        output_height = output_size
    assert output_height % 16 == 0
    assert output_width % 16 == 0
    resize_ratio = max(
        float(output_width) / width,
        float(output_height) / height,
    )
    resize_size = (
        ceil_to(resize_ratio * height, 16),
        ceil_to(resize_ratio * width, 16),
    )
    pil_resize_img = torchvision.transforms.functional.resize(
        pil_img, resize_size, interpolation=transforms.InterpolationMode.BICUBIC
    )
    pil_crop_img = torchvision.transforms.functional.center_crop(
        pil_resize_img, (output_height, output_width)
    )
    image_tensor = torchvision.transforms.functional.to_tensor(pil_crop_img)
    image_tensor = torchvision.transforms.functional.normalize(
        image_tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
    )
    if with_position_ids:
        img_ids = build_img_ids(
            latent_height=resize_size[0] // 16,
            latent_width=resize_size[1] // 16,
            latent_crop_height=output_height // 16,
            latent_crop_width=output_width // 16,
            time=position_ids_time,
        )
    else:
        img_ids = None
    return pil_crop_img, image_tensor, img_ids


def pack_latent_to_token(
    latent,
):
    token = rearrange(
        latent,
        "b c (h ph) (w pw) -> b (h w) (c ph pw)",
        ph=2,
        pw=2,
    )
    return token


def unpack_token_to_latent(
    token,
    image_height: int | None = None,
    latent_height: int | None = None,
    image_width: int | None = None,
    latent_width: int | None = None,
):
    if image_height is not None:
        h = math.ceil(image_height / 16)
    elif latent_height is not None:
        h = latent_height // 2
    else:
        raise ValueError("both image_height and latent_height are None")
    if image_width is not None:
        w = math.ceil(image_width / 16)
    elif latent_width is not None:
        w = latent_width // 2
    else:
        raise ValueError("both image_width and latent_width are None")
    return rearrange(
        token,
        "b (h w) (c ph pw) -> b c (h ph) (w pw)",
        h=h,
        w=w,
        ph=2,
        pw=2,
    )
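pack_latent_to_token groups each 2x2 patch of the VAE latent into one token, and unpack_token_to_latent is its exact inverse. A small round-trip sketch (illustrative only):

import torch
from ovis_image.dataset.image_util import pack_latent_to_token, unpack_token_to_latent

latent = torch.randn(1, 16, 32, 32)        # latent of a 256x256 image (the VAE downsamples by 8)
token = pack_latent_to_token(latent)       # -> (1, 16*16, 16*2*2) = (1, 256, 64)
restored = unpack_token_to_latent(token, latent_height=32, latent_width=32)
assert torch.equal(latent, restored)       # 2x2 packing is lossless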
ovis_image/model/args.py ADDED
@@ -0,0 +1,29 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

from dataclasses import dataclass, field

from ovis_image.model.autoencoder import AutoEncoderParams


@dataclass
class OvisImageModelArgs:
    in_channels: int = 64
    out_channels: int = 64
    context_in_dim: int = 512
    hidden_size: int = 3072
    mlp_ratio: float = 4.0
    num_heads: int = 24
    depth: int = 19
    double_block_type: str = "DoubleStreamBlock"
    depth_single_blocks: int = 38
    axes_dim: tuple = (16, 56, 56)
    theta: int = 10_000
    qkv_bias: bool = True
    activation: str = "gelu_tanh"
    """activation: gelu_tanh or swiglu"""
    norm: str = "layernorm"
    """norm: layernorm or rmsnorm"""
    autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)
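The defaults above are internally consistent with the rotary embedding setup: OvisImageModel.__init__ (in model.py below) requires sum(axes_dim) to equal the per-head dimension. A quick check of that arithmetic:

from ovis_image.model.args import OvisImageModelArgs

args = OvisImageModelArgs()
# 16 + 56 + 56 == 128 == 3072 // 24
assert sum(args.axes_dim) == args.hidden_size // args.num_heads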
ovis_image/model/autoencoder.py ADDED
@@ -0,0 +1,402 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

import os
from dataclasses import dataclass

import torch
from einops import rearrange
from safetensors.torch import load_file as load_sft
from torch import nn, Tensor


@dataclass
class AutoEncoderParams:
    resolution: int = 256
    in_channels: int = 3
    ch: int = 128
    out_ch: int = 3
    ch_mult: tuple[int, ...] = (1, 2, 4, 4)
    num_res_blocks: int = 2
    z_channels: int = 16
    scale_factor: float = 0.3611
    shift_factor: float = 0.1159
    use_quant_conv: bool = False
    use_post_quant_conv: bool = False


def swish(x: Tensor) -> Tensor:
    return x * torch.sigmoid(x)


class AttnBlock(nn.Module):
    def __init__(self, in_channels: int):
        super().__init__()
        self.in_channels = in_channels

        self.norm = nn.GroupNorm(
            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
        )

        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)

    def attention(self, h_: Tensor) -> Tensor:
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        b, c, h, w = q.shape
        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)

    def forward(self, x: Tensor) -> Tensor:
        return x + self.proj_out(self.attention(x))


class ResnetBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels

        self.norm1 = nn.GroupNorm(
            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
        )
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        self.norm2 = nn.GroupNorm(
            num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
        )
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if self.in_channels != self.out_channels:
            self.nin_shortcut = nn.Conv2d(
                in_channels, out_channels, kernel_size=1, stride=1, padding=0
            )

    def forward(self, x):
        h = x
        h = self.norm1(h)
        h = swish(h)
        h = self.conv1(h)

        h = self.norm2(h)
        h = swish(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            x = self.nin_shortcut(x)

        return x + h


class Downsample(nn.Module):
    def __init__(self, in_channels: int):
        super().__init__()
        # no asymmetric padding in torch conv, must do it ourselves
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=2, padding=0
        )

    def forward(self, x: Tensor):
        pad = (0, 1, 0, 1)
        x = nn.functional.pad(x, pad, mode="constant", value=0)
        x = self.conv(x)
        return x


class Upsample(nn.Module):
    def __init__(self, in_channels: int):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, x: Tensor):
        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        x = self.conv(x)
        return x


class Encoder(nn.Module):
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        z_channels: int,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        # downsampling
        self.conv_in = nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )

        curr_res = resolution
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        block_in = self.ch
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # end
        self.norm_out = nn.GroupNorm(
            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
        )
        self.conv_out = nn.Conv2d(
            block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, x: Tensor) -> Tensor:
        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1])
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(
        self,
        ch: int,
        out_ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        in_channels: int,
        resolution: int,
        z_channels: int,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.ffactor = 2 ** (self.num_resolutions - 1)

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = nn.GroupNorm(
            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
        )
        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, z: Tensor) -> Tensor:
        # get dtype for proper tracing
        upscale_dtype = next(self.up.parameters()).dtype

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # cast to proper dtype
        h = h.to(upscale_dtype)
        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class DiagonalGaussian(nn.Module):
    def __init__(self, sample: bool = True, chunk_dim: int = 1):
        super().__init__()
        self.sample = sample
        self.chunk_dim = chunk_dim

    def forward(self, z: Tensor) -> Tensor:
        mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
        if self.sample:
            std = torch.exp(0.5 * logvar)
            return mean + std * torch.randn_like(mean)
        else:
            return mean


class AutoEncoder(nn.Module):
    def __init__(self, params: AutoEncoderParams):
        super().__init__()
        self.params = params
        self.encoder = Encoder(
            resolution=params.resolution,
            in_channels=params.in_channels,
            ch=params.ch,
            ch_mult=params.ch_mult,
            num_res_blocks=params.num_res_blocks,
            z_channels=params.z_channels,
        )
        self.decoder = Decoder(
            resolution=params.resolution,
            in_channels=params.in_channels,
            ch=params.ch,
            out_ch=params.out_ch,
            ch_mult=params.ch_mult,
            num_res_blocks=params.num_res_blocks,
            z_channels=params.z_channels,
        )
        self.reg = DiagonalGaussian()

        self.scale_factor = params.scale_factor
        self.shift_factor = params.shift_factor

        self.quant_conv = nn.Conv2d(2 * params.z_channels, 2 * params.z_channels, 1) if params.use_quant_conv else None
        self.post_quant_conv = nn.Conv2d(params.z_channels, params.z_channels, 1) if params.use_post_quant_conv else None

    def encode(self, x: Tensor) -> Tensor:
        x = self.encoder(x)
        if self.quant_conv is not None:
            x = self.quant_conv(x)
        z = self.reg(x)
        z = self.scale_factor * (z - self.shift_factor)
        return z

    def decode(self, z: Tensor) -> Tensor:
        z = z / self.scale_factor + self.shift_factor
        if self.post_quant_conv is not None:
            z = self.post_quant_conv(z)
        return self.decoder(z)

    def forward(self, x: Tensor) -> Tensor:
        return self.decode(self.encode(x))


def load_ae(
    ckpt_path: str,
    autoencoder_params: AutoEncoderParams,
    device: str | torch.device = "cuda",
    dtype=torch.bfloat16,
    random_init=False,
) -> AutoEncoder:
    """
    Load the autoencoder from the given checkpoint.
    Args:
        ckpt_path (str): Path to the autoencoder checkpoint (safetensors).
        autoencoder_params (AutoEncoderParams): Architecture hyperparameters.
        device (str or torch.device): The device to load the autoencoder to.
        dtype: Target parameter dtype.
        random_init (bool): Skip weight loading and return a randomly initialized model.
    Returns:
        AutoEncoder: The loaded autoencoder.
    """
    with torch.device(device):
        ae = AutoEncoder(autoencoder_params)

    if random_init:
        print("Random Init VAE")
        return ae.to(dtype=dtype)

    if not os.path.exists(ckpt_path):
        raise ValueError(
            f"Autoencoder path {ckpt_path} does not exist. Please download it first."
        )

    print(f"Loading {ckpt_path}")
    sd = load_sft(ckpt_path, device=str(device))
    missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
    if len(missing) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    if len(unexpected) > 0:
        print(
            f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
        )
    return ae.to(dtype=dtype)
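A shape sketch for the autoencoder above, using random weights (illustrative; real use goes through load_ae with a checkpoint). With ch_mult = (1, 2, 4, 4) the encoder downsamples three times, so a 256x256 RGB image maps to a 16-channel 32x32 latent:

import torch
from ovis_image.model.autoencoder import AutoEncoder, AutoEncoderParams

ae = AutoEncoder(AutoEncoderParams())
img = torch.randn(1, 3, 256, 256)
z = ae.encode(img)              # DiagonalGaussian halves the 2 * z_channels encoder output
print(z.shape)                  # torch.Size([1, 16, 32, 32])
print(ae.decode(z).shape)       # torch.Size([1, 3, 256, 256])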
ovis_image/model/hf_embedder.py ADDED
@@ -0,0 +1,50 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

import torch
from torch import nn, Tensor

from ovis_image.model.ovis.modeling_ovis2_5 import Ovis2_5, Ovis2_5_Config


class OvisEmbedder(nn.Module):
    def __init__(
        self,
        model_path: str,
        random_init=False,
        **hf_kwargs,
    ):
        super().__init__()
        if random_init:
            # Initialize Ovis model with random weights, for test purposes only
            config = Ovis2_5_Config.from_pretrained(model_path)
            config.name_or_path = model_path
            self.hf_module = Ovis2_5._from_config(config, **hf_kwargs)
        else:
            self.hf_module = Ovis2_5.from_pretrained(
                model_path, **hf_kwargs
            )
        self.pad_token_id = self.hf_module.text_tokenizer.pad_token_id
        # embeddings before this index (the fixed prompt prefix) are dropped in forward
        self.user_prompt_begin_id = 28
        # keep only the underlying Qwen3 text backbone
        self.hf_module = self.hf_module.llm.model
        self.hf_module = self.hf_module.eval().requires_grad_(False)

    def forward(self, batch_tokens: Tensor, attention_mask=None) -> Tensor:
        if attention_mask is None:
            attention_mask = torch.ne(
                batch_tokens, self.pad_token_id
            ).to(device=batch_tokens.device)
        outputs = self.hf_module(
            input_ids=batch_tokens,
            attention_mask=attention_mask,
        )
        txt_semantic_embed = outputs.last_hidden_state
        txt_semantic_embed = txt_semantic_embed * attention_mask[..., None]
        txt_semantic_embed = txt_semantic_embed[:, self.user_prompt_begin_id:, :]
        return txt_semantic_embed
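A usage sketch for the embedder (hypothetical: "path/to/Ovis2.5" stands in for a local Ovis2.5 checkpoint directory, which is required even with random_init=True because the config and tokenizer are read from it):

import torch
from ovis_image.model.hf_embedder import OvisEmbedder

embedder = OvisEmbedder("path/to/Ovis2.5", random_init=True, torch_dtype=torch.bfloat16)
tokens = torch.randint(0, 1000, (1, 64))    # dummy prompt token ids
txt = embedder(tokens)                       # (1, 64 - 28, hidden): the prefix tokens are dropped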
ovis_image/model/layers.py ADDED
@@ -0,0 +1,407 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

import math
from dataclasses import dataclass

import torch
from einops import rearrange
from torch import nn, Tensor

from ovis_image.model.ops import attention, rope


class EmbedND(nn.Module):
    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.axes_dim = axes_dim

    @torch.no_grad()
    def forward(self, ids: Tensor) -> Tensor:
        n_axes = ids.shape[-1]
        emb = torch.cat(
            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
            dim=-3,
        )
        # bs x 1 x 512 x 64 x 2 x 2
        return emb.unsqueeze(1)


def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Create sinusoidal timestep embeddings.
    :param t: a 1-D Tensor of N indices, one per batch element.
        These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an (N, D) Tensor of positional embeddings.
    """
    t = time_factor * t
    half = dim // 2
    with torch.device(t.device):
        freqs = torch.exp(
            -math.log(max_period)
            * torch.arange(start=0, end=half, dtype=torch.float32)
            / half
        )

    args = t[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    if torch.is_floating_point(t):
        embedding = embedding.to(t)
    return embedding


class MLPEmbedder(nn.Module):
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def init_weights(self, init_std: float = 0.02):
        nn.init.normal_(self.in_layer.weight, std=init_std)
        nn.init.constant_(self.in_layer.bias, 0)
        nn.init.normal_(self.out_layer.weight, std=init_std)
        nn.init.constant_(self.out_layer.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        return self.out_layer(self.silu(self.in_layer(x)))


class QKNorm(torch.nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.query_norm = nn.RMSNorm(dim)
        self.key_norm = nn.RMSNorm(dim)

    def init_weights(self):
        self.query_norm.reset_parameters()
        self.key_norm.reset_parameters()

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        q = self.query_norm(q)
        k = self.key_norm(k)
        return q.to(v), k.to(v)


class SelfAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def init_weights(self):
        for layer in (self.qkv, self.proj):
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)
        self.norm.init_weights()

    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
        qkv = self.qkv(x)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        x = attention(q, k, v, pe=pe)
        x = self.proj(x)
        return x


class YakMLP(nn.Module):
    # Use SwiGLU
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
        self.act_fn = nn.SiLU()

    def init_weights(self):
        for layer in (self.gate_proj, self.up_proj, self.down_proj):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x: Tensor) -> Tensor:
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


def build_mlp(hidden_size, intermediate_size, activation="gelu_tanh"):
    if activation == "gelu_tanh":
        mlp = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(intermediate_size, hidden_size, bias=True),
        )
    else:
        mlp = YakMLP(hidden_size, intermediate_size)
    return mlp


def init_mlp(mlp, activation="gelu_tanh"):
    if activation == "gelu_tanh":
        for layer in (mlp[0], mlp[2]):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0)
    else:
        mlp.init_weights()


@dataclass
class ModulationOut:
    shift: Tensor
    scale: Tensor
    gate: Tensor


class Modulation(nn.Module):
    def __init__(self, dim: int, multiples: int = 1):
        super().__init__()
        assert multiples in [1, 2, 3]
        self.multiples = multiples
        self.multiplier = 3 * multiples
        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
        self.act = nn.SiLU()

    def init_weights(self):
        nn.init.constant_(self.lin.weight, 0)
        nn.init.constant_(self.lin.bias, 0)

    def forward(self, vec: Tensor):
        out = self.lin(self.act(vec))[:, None, :].chunk(
            self.multiplier, dim=-1
        )
        if self.multiples == 1:
            return ModulationOut(*out[:3])
        elif self.multiples == 2:
            return (
                ModulationOut(*out[:3]),
                ModulationOut(*out[3:]),
            )
        elif self.multiples == 3:
            return (
                ModulationOut(*out[:3]),
                ModulationOut(*out[3:6]),
                ModulationOut(*out[6:]),
            )


class DoubleStreamBlock(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float,
        qkv_bias: bool = False,
        activation: str = "gelu_tanh",
        norm_layer: nn.Module = nn.LayerNorm,
    ):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.activation = activation
        self.img_mod = Modulation(hidden_size, multiples=2)
        self.img_norm1 = norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(
            dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
        )

        self.img_norm2 = norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, activation)

        self.txt_mod = Modulation(hidden_size, multiples=2)
        self.txt_norm1 = norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(
            dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
        )

        self.txt_norm2 = norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, activation)

    def init_weights(self):
        # initialize all the nn.Linear submodules
        init_mlp(self.img_mlp, self.activation)
        init_mlp(self.txt_mlp, self.activation)

        # initialize Modulation layers, SelfAttention layers
        for layer in (self.img_attn, self.img_mod, self.txt_attn, self.txt_mod):
            layer.init_weights()

        # Reset parameters for Normalization layers
        for norm in (self.txt_norm1, self.txt_norm2, self.img_norm1, self.img_norm2):
            norm.reset_parameters()

    def forward(
        self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
    ) -> tuple[Tensor, Tensor]:
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = rearrange(
            img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
        )
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = rearrange(
            txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
        )
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        # run actual attention
        q = torch.cat((txt_q, img_q), dim=2)
        k = torch.cat((txt_k, img_k), dim=2)
        v = torch.cat((txt_v, img_v), dim=2)

        attn = attention(q, k, v, pe=pe)
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

        # calculate the img blocks
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp(
            (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
        )

        # calculate the txt blocks
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp(
            (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
        )
        return img, txt


class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_scale: float | None = None,
        activation: str = "gelu_tanh",
        norm_layer: nn.Module = nn.LayerNorm,
    ):
        super().__init__()
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.activation = activation

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        if activation == "gelu_tanh":
            # qkv and mlp_in
            self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, bias=qkv_bias)
        else:
            # qkv and mlp_in and mlp_gate
            self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim * 2, bias=qkv_bias)
        # proj and mlp_out
        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)

        if activation == "gelu_tanh":
            self.mlp_act = nn.GELU(approximate="tanh")
        else:
            self.mlp_act = nn.SiLU()
        self.modulation = Modulation(hidden_size, multiples=1)

    def init_weights(self):
        for layer in (self.linear1, self.linear2):
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)
        self.norm.init_weights()
        self.pre_norm.reset_parameters()
        self.modulation.init_weights()

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        mod = self.modulation(vec)
        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
        if self.activation == "gelu_tanh":
            qkv, mlp = torch.split(
                self.linear1(x_mod),
                [3 * self.hidden_size, self.mlp_hidden_dim],
                dim=-1,
            )
        else:
            qkv, mlp, mlp_gate = torch.split(
                self.linear1(x_mod),
                [3 * self.hidden_size, self.mlp_hidden_dim, self.mlp_hidden_dim],
                dim=-1,
            )

        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        # compute attention
        attn = attention(q, k, v, pe=pe)

        if self.activation == "gelu_tanh":
            # compute activation in mlp stream, cat again and run second linear layer
            x = x + mod.gate * self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        else:
            x = x + mod.gate * self.linear2(
                torch.cat((attn, self.mlp_act(mlp_gate) * mlp), 2)
            )
        return x


class LastLayer(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        patch_size: int,
        out_channels: int,
        norm_layer: nn.Module = nn.LayerNorm,
    ):
        super().__init__()
        self.norm_final = norm_layer(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(
            hidden_size, patch_size * patch_size * out_channels, bias=True
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def init_weights(self):
        nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.linear.weight, 0)
        nn.init.constant_(self.linear.bias, 0)
        self.norm_final.reset_parameters()

    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        x = self.linear(x)
        return x
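A small sketch of the adaLN machinery above (illustrative): a timestep is embedded, mapped to a conditioning vector, and Modulation turns it into the (shift, scale, gate) triples that the blocks apply around attention and the MLP.

import torch
from ovis_image.model.layers import MLPEmbedder, Modulation, timestep_embedding

vec = MLPEmbedder(256, 128)(timestep_embedding(torch.tensor([0.5]), 256))  # (1, 128)
mod1, mod2 = Modulation(128, multiples=2)(vec)   # two ModulationOut triples
x = torch.randn(1, 10, 128)
x_mod = (1 + mod1.scale) * x + mod1.shift        # the modulation pattern used in the blocks
print(x_mod.shape)                               # torch.Size([1, 10, 128])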
ovis_image/model/model.py ADDED
@@ -0,0 +1,121 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

import torch
from torch import nn, Tensor

from ovis_image.model.layers import (
    DoubleStreamBlock,
    EmbedND,
    LastLayer,
    MLPEmbedder,
    SingleStreamBlock,
    timestep_embedding,
)

from ovis_image.model.args import OvisImageModelArgs


class OvisImageModel(nn.Module):

    def __init__(self, model_args: OvisImageModelArgs):
        super().__init__()

        self.model_args = model_args

        self.in_channels = model_args.in_channels
        self.out_channels = model_args.out_channels
        if model_args.hidden_size % model_args.num_heads != 0:
            raise ValueError(
                f"Hidden size {model_args.hidden_size} must be divisible by num_heads {model_args.num_heads}"
            )
        pe_dim = model_args.hidden_size // model_args.num_heads
        if sum(model_args.axes_dim) != pe_dim:
            raise ValueError(
                f"Got {model_args.axes_dim} but expected positional dim {pe_dim}"
            )
        self.hidden_size = model_args.hidden_size
        self.num_heads = model_args.num_heads
        self.pe_embedder = EmbedND(
            dim=pe_dim, theta=model_args.theta, axes_dim=model_args.axes_dim
        )
        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        self.semantic_txt_norm = nn.RMSNorm(model_args.context_in_dim, eps=1e-6)
        self.semantic_txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size, bias=True)

        if model_args.norm == "layernorm":
            norm_layer = nn.LayerNorm
        else:
            norm_layer = nn.RMSNorm

        # only "DoubleStreamBlock" is implemented for model_args.double_block_type
        DoubleBlock = DoubleStreamBlock

        self.double_blocks = nn.ModuleList(
            [
                DoubleBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=model_args.mlp_ratio,
                    qkv_bias=model_args.qkv_bias,
                    activation=model_args.activation,
                    norm_layer=norm_layer,
                )
                for _ in range(model_args.depth)
            ]
        )

        self.single_blocks = nn.ModuleList(
            [
                SingleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=model_args.mlp_ratio,
                    qkv_bias=model_args.qkv_bias,
                    activation=model_args.activation,
                    norm_layer=norm_layer,
                )
                for _ in range(model_args.depth_single_blocks)
            ]
        )

        self.final_layer = LastLayer(
            self.hidden_size,
            1,
            self.out_channels,
            norm_layer=norm_layer,
        )

    def forward(
        self,
        img: Tensor,
        img_ids: Tensor,
        txt: Tensor,
        txt_ids: Tensor,
        timesteps: Tensor,
    ) -> Tensor:
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

        # running on sequences img
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256))
        txt = self.semantic_txt_norm(txt)
        txt = self.semantic_txt_in(txt)
        ids = torch.cat((txt_ids, img_ids), dim=1)
        pe = self.pe_embedder(ids)

        for block in self.double_blocks:
            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)

        img = torch.cat((txt, img), 1)
        for block in self.single_blocks:
            img = block(img, vec=vec, pe=pe)
        img = img[:, txt.shape[1]:, ...]

        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img
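A forward-pass shape sketch for the model above, using deliberately tiny toy hyperparameters (chosen only so the shapes check quickly; they are not a released configuration). It also assumes the attention helper resolves to a backend available on your device: on torch >= 2.7.0 it pins the cuDNN SDPA backend, which is CUDA-only.

import torch
from ovis_image.model.args import OvisImageModelArgs
from ovis_image.model.model import OvisImageModel

args = OvisImageModelArgs(
    context_in_dim=32, hidden_size=128, num_heads=2,
    depth=1, depth_single_blocks=1, axes_dim=(8, 28, 28),  # 8 + 28 + 28 == 128 // 2
)
model = OvisImageModel(args)
B, N, T = 1, 64, 8                                  # batch, image tokens, text tokens
img = torch.randn(B, N, args.in_channels)           # packed latent tokens
txt = torch.randn(B, T, args.context_in_dim)        # text embeddings
img_ids, txt_ids = torch.zeros(B, N, 3), torch.zeros(B, T, 3)
out = model(img, img_ids, txt, txt_ids, timesteps=torch.tensor([0.5]))
print(out.shape)                                    # torch.Size([1, 64, 64])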
ovis_image/model/ops.py ADDED
@@ -0,0 +1,102 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

import torch
from einops import rearrange
from torch import Tensor
from torch.nn.attention import sdpa_kernel, SDPBackend

try:
    from flash_attn_interface import flash_attn_func
    print("found flash_attn 3")
except ImportError:
    flash_attn_func = None


def check_attention_type(attn_implementation):
    if torch.__version__ >= "2.7.0":
        if attn_implementation != "sdpa":
            print("please set attn_implementation to sdpa for torch >= 2.7.0")
    elif flash_attn_func is not None:
        if attn_implementation != "flash_attention_3":
            print("please set attn_implementation to flash_attention_3 for H100")


def get_attention_type_by_system():
    if torch.__version__ >= "2.7.0":
        return "sdpa"
    elif flash_attn_func is not None:
        return "flash_attention_3"
    else:
        return "eager"


def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    if torch.__version__ >= "2.7.0":
        return attention_sdpa(q, k, v, pe)
    elif flash_attn_func is not None:
        return attention_fa3(q, k, v, pe)
    else:
        return attention_eager(q, k, v, pe)


def attention_eager(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    q, k = apply_rope(q, k, pe)
    # https://docs.pytorch.org/docs/2.6/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention
    x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    x = rearrange(x, "B H L D -> B L (H D)")
    return x


def attention_sdpa(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    q, k = apply_rope(q, k, pe)
    # on B200, use the torch 2.7.1 image and accelerate with the cuDNN SDPA backend
    with sdpa_kernel([SDPBackend.CUDNN_ATTENTION]):
        x = torch.nn.functional.scaled_dot_product_attention(
            q, k, v,
        )
    x = rearrange(x, "B H L D -> B L (H D)")
    return x


def attention_fa3(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    q, k = apply_rope(q, k, pe)
    # on H100, accelerate with flash-attn 3
    q = rearrange(q, "B H L D -> B L H D")
    k = rearrange(k, "B H L D -> B L H D")
    v = rearrange(v, "B H L D -> B L H D")
    x = flash_attn_func(q, k, v)[0]
    x = rearrange(x, "B L H D -> B L (H D)")
    return x


def get_attention_func(attn_implementation):
    if attn_implementation == "eager":
        return attention_eager
    elif attn_implementation == "sdpa":
        return attention_sdpa
    elif attn_implementation == "flash_attention_3":
        return attention_fa3
    else:
        return attention_eager


def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
    scale = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
    omega = 1.0 / (theta**scale)
    out = torch.einsum("...n,d->...nd", pos, omega)
    out = torch.stack(
        [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
    )
    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
    return out.float()


def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
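rope builds per-position 2x2 rotation matrices and apply_rope rotates query/key channel pairs with them, so vector norms are preserved. A quick numerical check (illustrative):

import torch
from ovis_image.model.ops import rope, apply_rope

pos = torch.arange(8)[None].float()            # (batch, seq) positions for one axis
pe = rope(pos, dim=16, theta=10_000)[:, None]  # (1, 1, 8, 8, 2, 2), head axis added as in EmbedND
q = torch.randn(1, 1, 8, 16)                   # (B, H, L, D)
k = torch.randn(1, 1, 8, 16)
q_rot, k_rot = apply_rope(q, k, pe)
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True: rotations preserve norm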
ovis_image/model/ovis/configuration_ovis2_5.py ADDED
@@ -0,0 +1,96 @@
from typing import Any, Optional, List, Union

from transformers import Qwen3Config
from transformers.configuration_utils import PretrainedConfig

__all__ = ["Siglip2NavitConfig", "Ovis2_5_Config"]


class Siglip2NavitConfig(PretrainedConfig):
    """This is the configuration class to store the configuration of the Siglip2Navit vision tower.

    Instantiating a configuration with the defaults will yield a similar configuration
    to that of [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).

    Args:
        hidden_size: Dimension of the hidden representations.
        intermediate_size: Dimension of the SwiGLU representations.
        num_hidden_layers: Number of hidden layers in the Transformer.
        num_attention_heads: Number of attention heads for each attention layer
            in the Transformer.
        num_channels: Number of input channels.
        image_size: Image size.
        patch_size: Patch size.
        layer_norm_eps: Epsilon value used for the normalization layers.
        attention_dropout: Dropout ratio for attention probabilities.
        kwargs: Keyword arguments for the [`PretrainedConfig`].
    """

    model_type: str = "siglip2_navit"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 4096,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,
        num_channels: int = 3,
        num_patches: int = -1,
        image_size: int = 512,
        patch_size: int = 16,
        hidden_act: str = "gelu_pytorch_tanh",
        layer_norm_eps: float = 1e-6,
        attention_dropout: float = 0.0,
        hidden_stride: int = 2,
        window_size: int = 112,
        fullatt_block_indexes: Optional[list] = None,
        temporal_patch_size: int = 1,
        preserve_original_pe: bool = True,
        use_rope: bool = True,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_size = patch_size
        self.image_size = image_size
        self.hidden_act = hidden_act
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_stride = hidden_stride
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.temporal_patch_size = temporal_patch_size
        self.preserve_original_pe = preserve_original_pe
        self.use_rope = use_rope


class Ovis2_5_Config(PretrainedConfig):
    model_type = "ovis2_5"
    sub_configs = dict(llm_config=Qwen3Config, vit_config=Siglip2NavitConfig)

    def __init__(
        self,
        llm_config: Optional[Union[Qwen3Config, dict]] = None,
        vit_config: Optional[Union[Siglip2NavitConfig, dict]] = None,
        visual_vocab_size=65536,
        hidden_size=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if isinstance(llm_config, dict):
            llm_config = Qwen3Config(**llm_config)
        self.llm_config = llm_config
        if isinstance(vit_config, dict):
            vit_config = Siglip2NavitConfig(**vit_config)
        self.vit_config = vit_config
        self.visual_vocab_size = visual_vocab_size
        self.hidden_size = hidden_size
        if kwargs.get('attn_implementation'):
            self.llm_config._attn_implementation = kwargs['attn_implementation']
            self.vit_config._attn_implementation = kwargs['attn_implementation']
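A small sketch of the config plumbing above (illustrative; it assumes a transformers version that ships Qwen3Config): dict sub-configs are coerced into typed config objects, and attn_implementation is propagated to both children.

from ovis_image.model.ovis.configuration_ovis2_5 import Ovis2_5_Config

cfg = Ovis2_5_Config(
    llm_config={"hidden_size": 1024, "num_hidden_layers": 2},
    vit_config={"hidden_size": 256},
    attn_implementation="sdpa",
)
print(type(cfg.llm_config).__name__)        # Qwen3Config
print(cfg.vit_config._attn_implementation)  # sdpa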
ovis_image/model/ovis/modeling_ovis2_5.py ADDED
@@ -0,0 +1,1000 @@
# Copyright (C) 2025 AIDC-AI
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
import math
from typing import Dict, List, Optional, Tuple, Union

import PIL.Image
import numpy as np
import torch
# from flash_attn import flash_attn_varlen_func
# from flash_attn.layers.rotary import apply_rotary_emb
from torch import Tensor, nn
from torch.nn import functional as F
from transformers import (
    AutoConfig,
    AutoImageProcessor,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from transformers.activations import ACT2FN
from transformers.generation.utils import GenerateOutput
from transformers.modeling_outputs import BaseModelOutputWithNoAttention
from transformers.modeling_utils import PreTrainedModel

from ovis_image.model.ovis.configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config
from ovis_image.model.ops import get_attention_type_by_system

IMAGE_PLACEHOLDER = "<image>"
IMAGE_PLACEHOLDER_ID = -200
VIDEO_PLACEHOLDER = "<video>"
VIDEO_PLACEHOLDER_ID = -201

VISUAL_ATOM_ID = -300
INDICATOR_IDS = [-301, -302, -303, -304]


# copied from qwen2.5-vl
class VisionRotaryEmbedding(nn.Module):
    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(seq, self.inv_freq)
        return freqs


class Siglip2VisionEmbeddings(nn.Module):
    def __init__(self, config: Siglip2NavitConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.patch_size = config.patch_size
        self.image_size = config.image_size
        self.num_patches = config.num_patches
        self.preserve_original_pe = config.preserve_original_pe
        self.hidden_stride = config.hidden_stride

        # siglip2 naflex
        if self.num_patches > 0:
            self.patch_embedding = nn.Linear(
                in_features=config.num_channels * self.patch_size * self.patch_size,
                out_features=self.embed_dim,
            )
            if self.preserve_original_pe:
                self.position_embedding_size = int(self.num_patches**0.5)
                self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
        else:
            self.patch_embedding = nn.Conv2d(
                in_channels=config.num_channels,
                out_channels=self.embed_dim,
                kernel_size=self.patch_size,
                stride=self.patch_size,
                padding="valid",
            )
            if self.preserve_original_pe:
                self.num_patches = (self.image_size // self.patch_size) ** 2
                self.position_embedding_size = self.image_size // self.patch_size
                self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)

    @staticmethod
    def resize_positional_embeddings(
        positional_embeddings: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        max_length: int,
    ) -> torch.Tensor:
        """
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        """
        batch_size = spatial_shapes.shape[0]
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        resulted_positional_embeddings = torch.empty(
            (batch_size, max_length, embed_dim),
            device=positional_embeddings.device,
            dtype=source_dtype,
        )

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Upcast to float32 on CPU because antialias is not supported for bfloat16/float16 on CPU
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        for i in range(batch_size):
            # (1, dim, height, width) -> (1, dim, target_height, target_width)
            height, width = spatial_shapes[i]
            resized_embeddings = F.interpolate(
                positional_embeddings,
                size=(height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )

            # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)

            # Cast to original dtype
            resized_embeddings = resized_embeddings.to(source_dtype)

            resulted_positional_embeddings[i, : height * width] = resized_embeddings
            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]

        return resulted_positional_embeddings

    def forward(self, pixel_values: torch.FloatTensor,
                grid_thws: Optional[torch.LongTensor] = None) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (num_patches, num_channels * temporal_patch_size * patch_size * patch_size)
            grid_thws (`torch.LongTensor`):
                grid shape (num_patches, 3)
        """
        # Apply patch embeddings to already patchified pixel values
        target_dtype = self.patch_embedding.weight.dtype
        if isinstance(self.patch_embedding, nn.Linear):
            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        elif isinstance(self.patch_embedding, nn.Conv2d):
            pixel_values = pixel_values.view(
                -1, self.config.num_channels * self.config.temporal_patch_size,
                self.patch_size, self.patch_size
            )
            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
            patch_embeds = patch_embeds.reshape(-1, self.embed_dim)

        if self.preserve_original_pe:
            assert grid_thws is not None
            pos_embed_new = torch.zeros_like(patch_embeds)
            ori_h = ori_w = self.position_embedding_size
            positional_embeddings = self.position_embedding.weight.reshape(
                self.position_embedding_size, self.position_embedding_size, -1
172
+ ).unsqueeze(0).permute(0,3,1,2)
173
+ # pos_embed = self.pos_embed.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2)
174
+ cnt = 0
175
+ for t, h, w in grid_thws:
176
+ thw = t * h * w
177
+ pe = F.interpolate(positional_embeddings, size=(h, w), mode='bicubic', align_corners=False)
178
+ pe = pe.permute(0, 2, 3, 1).reshape(1, h * w, -1)
179
+ pe = pe[0].repeat(t, 1)
180
+ pe = pe.reshape(t, h // self.hidden_stride, self.hidden_stride, w // self.hidden_stride,
181
+ self.hidden_stride, -1)
182
+ pe = pe.permute(0, 1, 3, 2, 4, 5).reshape(thw, -1)
183
+ pos_embed_new[cnt:cnt + thw] = pe
184
+ cnt += thw
185
+ patch_embeds = patch_embeds + pos_embed_new
186
+
187
+ return patch_embeds
188
+
189
+
190
+
191
+ def rotate_half(x):
192
+ x1 = x[..., : x.shape[-1] // 2]
193
+ x2 = x[..., x.shape[-1] // 2 :]
194
+ return torch.cat((-x2, x1), dim=-1)
195
+
196
+ def apply_rotary_pos_emb_flashatt(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
197
+ cos = cos.unsqueeze(unsqueeze_dim)
198
+ sin = sin.unsqueeze(unsqueeze_dim)
199
+ q_embed = (q * cos) + (rotate_half(q) * sin)
200
+ k_embed = (k * cos) + (rotate_half(k) * sin)
201
+ return q_embed, k_embed
202
+
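+ # apply_rotary_pos_emb_flashatt shape sketch: with q, k of shape
+ # (1, seq_len, num_heads, head_dim) and cos/sin of shape (seq_len, head_dim),
+ # unsqueeze_dim=1 broadcasts the rotation across the head axis, so every head shares
+ # the same per-position rotation.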
203
+
204
+ class Siglip2Attention(nn.Module):
205
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
206
+
207
+ def __init__(self, config):
208
+ super().__init__()
209
+ self.config = config
210
+ self.embed_dim = config.hidden_size
211
+ self.num_heads = config.num_attention_heads
212
+ self.head_dim = self.embed_dim // self.num_heads
213
+ if self.head_dim * self.num_heads != self.embed_dim:
214
+ raise ValueError(
215
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
216
+ f" {self.num_heads})."
217
+ )
218
+ self.scale = self.head_dim**-0.5
219
+ self.dropout = config.attention_dropout
220
+ self.is_causal = False
221
+
222
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
223
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
224
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
225
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
226
+
227
+ self.use_rope = config.use_rope
228
+
229
+ def forward(
230
+ self,
231
+ hidden_states: torch.Tensor,
232
+ cu_seqlens: torch.Tensor,
233
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
234
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
235
+ """Input shape: Batch x Time x Channel"""
236
+
237
+ seq_length, embed_dim = hidden_states.shape
238
+
239
+ queries = self.q_proj(hidden_states)
240
+ keys = self.k_proj(hidden_states)
241
+ values = self.v_proj(hidden_states)
242
+
243
+ queries = queries.view(seq_length, self.num_heads, self.head_dim)
244
+ keys = keys.view(seq_length, self.num_heads, self.head_dim)
245
+ values = values.view(seq_length, self.num_heads, self.head_dim)
246
+
247
+ if self.use_rope:
248
+ cos, sin = position_embeddings
249
+ queries, keys = apply_rotary_pos_emb_flashatt(queries.unsqueeze(0), keys.unsqueeze(0), cos, sin)
250
+ queries = queries.squeeze(0)
251
+ keys = keys.squeeze(0)
252
+
253
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
254
+ # attn_output = flash_attn_varlen_func(queries, keys, values, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
255
+ # seq_length, -1
256
+ # )
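+ # Fallback used here: since flash_attn_varlen_func is not imported, the packed (varlen)
+ # sequence is split at cu_seqlens, right-padded to the longest sample, and run through
+ # scaled_dot_product_attention with a boolean key-padding mask instead.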
257
+ batch_size = cu_seqlens.shape[0] - 1
258
+ q_list, k_list, v_list = [], [], []
259
+ for i in range(batch_size):
260
+ start, end = cu_seqlens[i].item(), cu_seqlens[i+1].item()
261
+ q_list.append(queries[start:end])
262
+ k_list.append(keys[start:end])
263
+ v_list.append(values[start:end])
264
+
265
+ def pad_to_max(t, max_len):
266
+ pad = (0, 0, 0, 0, 0, max_len - t.shape[0]) # [seqlen, num_heads, head_dim]
267
+ return torch.nn.functional.pad(t, pad)
268
+
269
+ q_batched = torch.stack([pad_to_max(x, max_seqlen) for x in q_list]) # (batch, seqlen, nhead, dim)
270
+ k_batched = torch.stack([pad_to_max(x, max_seqlen) for x in k_list])
271
+ v_batched = torch.stack([pad_to_max(x, max_seqlen) for x in v_list])
272
+
273
+ mask = torch.zeros((batch_size, max_seqlen), dtype=torch.bool, device=queries.device)
274
+ for i in range(batch_size):
275
+ mask[i, :q_list[i].shape[0]] = True # [batch, seqlen]
276
+
277
+ # (batch, nhead, seqlen, head_dim)
278
+ q_batched = q_batched.transpose(1, 2) # (batch, nhead, seqlen, head_dim)
279
+ k_batched = k_batched.transpose(1, 2)
280
+ v_batched = v_batched.transpose(1, 2)
281
+ # Parse the version; plain string comparison would misorder e.g. "2.10" vs "2.7".
+ if tuple(int(v) for v in torch.__version__.split("+")[0].split(".")[:2]) >= (2, 7):
282
+ from torch.nn.attention import sdpa_kernel, SDPBackend
283
+ with sdpa_kernel([SDPBackend.CUDNN_ATTENTION]):
284
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
285
+ q_batched, k_batched, v_batched,
286
+ attn_mask=mask.unsqueeze(1).unsqueeze(2)
287
+ ).permute(0, 2, 1, 3).reshape(seq_length, -1)
288
+ else:
289
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
290
+ q_batched, k_batched, v_batched,
291
+ attn_mask=mask.unsqueeze(1).unsqueeze(2)  # broadcast to (batch, 1, 1, seqlen)
292
+ ).permute(0, 2, 1, 3).reshape(seq_length, -1)
293
+
294
+ attn_output = self.out_proj(attn_output)
295
+ return attn_output
296
+
297
+ class Siglip2MLP(nn.Module):
298
+ def __init__(self, config):
299
+ super().__init__()
300
+ self.config = config
301
+ self.activation_fn = ACT2FN[config.hidden_act]
302
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
303
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
304
+
305
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
306
+ hidden_states = self.fc1(hidden_states)
307
+ hidden_states = self.activation_fn(hidden_states)
308
+ hidden_states = self.fc2(hidden_states)
309
+ return hidden_states
310
+
311
+
312
+ class Siglip2EncoderLayer(nn.Module):
313
+ def __init__(self, config: Siglip2NavitConfig):
314
+ super().__init__()
315
+ self.embed_dim = config.hidden_size
316
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
317
+ self.self_attn = Siglip2Attention(config)
318
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
319
+ self.mlp = Siglip2MLP(config)
320
+
321
+ def forward(
322
+ self,
323
+ hidden_states: torch.Tensor,
324
+ cu_seqlens: torch.Tensor,
325
+ position_embeddings: torch.Tensor
326
+ ) -> tuple[torch.FloatTensor]:
327
+ """
328
+ Args:
329
+ hidden_states (`torch.FloatTensor`):
330
+ Packed input to the layer of shape `(seq_len, embed_dim)`.
331
+ cu_seqlens (`torch.Tensor`):
332
+ Cumulative sequence lengths delimiting the samples (or windows) in the packed sequence.
333
+ position_embeddings (`torch.Tensor`):
334
+ Precomputed rotary cos/sin embeddings for the packed sequence.
336
+ """
337
+ residual = hidden_states
338
+
339
+ hidden_states = self.layer_norm1(hidden_states)
340
+ hidden_states = self.self_attn(
341
+ hidden_states=hidden_states,
342
+ cu_seqlens=cu_seqlens,
343
+ position_embeddings=position_embeddings
344
+ )
345
+ hidden_states = residual + hidden_states
346
+
347
+ residual = hidden_states
348
+ hidden_states = self.layer_norm2(hidden_states)
349
+ hidden_states = self.mlp(hidden_states)
350
+ hidden_states = residual + hidden_states
351
+
352
+ return hidden_states
353
+
354
+ class Siglip2Encoder(nn.Module):
355
+ """
356
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
357
+ [`Siglip2EncoderLayer`].
358
+
359
+ Args:
360
+ config: Siglip2NavitConfig
361
+ """
362
+
363
+ def __init__(self, config: Siglip2NavitConfig):
364
+ super().__init__()
365
+ self.config = config
366
+ self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
367
+ self.gradient_checkpointing = False
368
+
369
+ self.rotary_pos_emb = VisionRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
370
+ self.patch_size = config.patch_size
371
+ self.hidden_stride = config.hidden_stride
372
+ self.window_size = config.window_size
373
+ self.spatial_merge_unit = config.hidden_stride * config.hidden_stride
374
+ self.fullatt_block_indexes = None if config.fullatt_block_indexes is None else [int(i) for i in config.fullatt_block_indexes.split('|')]
375
+
376
+
377
+ # copied from qwen2.5_vl
378
+ def rot_pos_emb(self, grid_thw):
379
+ pos_ids = []
380
+ for t, h, w in grid_thw:
381
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
382
+ hpos_ids = hpos_ids.reshape(
383
+ h // self.hidden_stride,
384
+ self.hidden_stride,
385
+ w // self.hidden_stride,
386
+ self.hidden_stride,
387
+ )
388
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
389
+ hpos_ids = hpos_ids.flatten()
390
+
391
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
392
+ wpos_ids = wpos_ids.reshape(
393
+ h // self.hidden_stride,
394
+ self.hidden_stride,
395
+ w // self.hidden_stride,
396
+ self.hidden_stride,
397
+ )
398
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
399
+ wpos_ids = wpos_ids.flatten()
400
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
401
+ pos_ids = torch.cat(pos_ids, dim=0)
402
+ max_grid_size = grid_thw[:, 1:].max()
403
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
404
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
405
+ return rotary_pos_emb
406
+
407
+ def get_window_index(self, grid_thw):
408
+ window_index: list = []
409
+ cu_window_seqlens: list = [0]
410
+ window_index_id = 0
411
+ vit_merger_window_size = self.window_size // self.hidden_stride // self.patch_size # patch (after merge) number in each window
412
+
413
+ for grid_t, grid_h, grid_w in grid_thw:
414
+ llm_grid_h, llm_grid_w = (
415
+ grid_h // self.hidden_stride, # number of patch after merge
416
+ grid_w // self.hidden_stride,
417
+ )
418
+ index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
419
+ pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
420
+ pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
421
+ num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
422
+ num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
423
+ index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
424
+ index_padded = index_padded.reshape(
425
+ grid_t,
426
+ num_windows_h,
427
+ vit_merger_window_size,
428
+ num_windows_w,
429
+ vit_merger_window_size,
430
+ )
431
+ index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
432
+ grid_t,
433
+ num_windows_h * num_windows_w,
434
+ vit_merger_window_size,
435
+ vit_merger_window_size,
436
+ )
437
+ seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
438
+ index_padded = index_padded.reshape(-1)
439
+ index_new = index_padded[index_padded != -100]
440
+ window_index.append(index_new + window_index_id)
441
+ cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
442
+ cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
443
+ window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
444
+ window_index = torch.cat(window_index, dim=0)
445
+
446
+ return window_index, cu_window_seqlens
447
+
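+ # get_window_index numbers (illustrative, assuming window_size=112, hidden_stride=2,
+ # patch_size=14): vit_merger_window_size = 112 // 2 // 14 = 4, i.e. each window covers
+ # a 4x4 block of merged tokens (8x8 raw patches); edge windows padded with -100 end up smaller.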
448
+ # Ignore copy
449
+ def forward(
450
+ self,
451
+ inputs_embeds,
452
+ grid_thws: torch.Tensor,
453
+ output_hidden_states: bool = False,
454
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
455
+ r"""
456
+ Args:
457
+ inputs_embeds (`torch.FloatTensor` of shape `(seq_len, hidden_size)`):
458
+ Packed patch embeddings produced by `Siglip2VisionEmbeddings`.
459
+ grid_thws (`torch.Tensor` of shape `(num_images, 3)`):
460
+ Temporal, height and width patch-grid sizes for each input image.
461
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
462
+ Whether or not to return the hidden states of all layers. See `hidden_states` under
463
+ returned tensors for more detail.
476
+ """
477
+
478
+ rotary_pos_emb = self.rot_pos_emb(grid_thws)
479
+ window_index, cu_window_seqlens = self.get_window_index(grid_thws)
480
+ cu_window_seqlens = torch.tensor(
481
+ cu_window_seqlens,
482
+ device=inputs_embeds.device,
483
+ dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
484
+ )
485
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
486
+
487
+ seq_len, _ = inputs_embeds.size()
488
+ inputs_embeds = inputs_embeds.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
489
+ inputs_embeds = inputs_embeds[window_index, :, :]
490
+ inputs_embeds = inputs_embeds.reshape(seq_len, -1)
491
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
492
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
493
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
494
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
495
+ position_embeddings = (emb.cos(), emb.sin())
496
+
497
+ cu_seqlens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0]).cumsum(
498
+ dim=0,
499
+ # Select dtype based on the following factors:
500
+ # - FA2 requires that cu_seqlens_q must have dtype int32
501
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
502
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
503
+ dtype=grid_thws.dtype if torch.jit.is_tracing() else torch.int32,
504
+ )
505
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
506
+
507
+ reverse_indices = torch.argsort(window_index)
508
+ encoder_states = () if output_hidden_states else None
509
+
510
+ hidden_states = inputs_embeds
511
+ for index, block in enumerate(self.layers):
512
+ if self.fullatt_block_indexes is None or index in self.fullatt_block_indexes:
513
+ cu_seqlens_tmp = cu_seqlens
514
+ else:
515
+ cu_seqlens_tmp = cu_window_seqlens
516
+ if self.gradient_checkpointing and self.training:
517
+ hidden_states = self._gradient_checkpointing_func(block.__call__, hidden_states, cu_seqlens_tmp, position_embeddings)
518
+ else:
519
+ hidden_states = block(hidden_states, cu_seqlens_tmp, position_embeddings)
520
+ if output_hidden_states:
521
+ hidden_states_ = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
522
+ encoder_states += (hidden_states_[reverse_indices, :].reshape(seq_len, -1),)
523
+ # tokens = self.post_trunk_norm(tokens)
524
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
525
+ hidden_states = hidden_states[reverse_indices, :].reshape(seq_len, -1)
526
+
527
+ return hidden_states, encoder_states
528
+
529
+ class Siglip2VisionTransformer(nn.Module):
530
+ def __init__(self, config: Siglip2NavitConfig):
531
+ super().__init__()
532
+ self.config = config
533
+ embed_dim = config.hidden_size
534
+
535
+ self.embeddings = Siglip2VisionEmbeddings(config)
536
+ self.encoder = Siglip2Encoder(config)
537
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
538
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
539
+
540
+ def forward(
541
+ self,
542
+ pixel_values: torch.FloatTensor,
543
+ grid_thws: torch.LongTensor,
544
+ output_hidden_states: Optional[bool] = True,
545
+ return_dict: Optional[bool] = True,
546
+ ) -> Union[
547
+ Tuple[torch.Tensor],
548
+ Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
549
+ BaseModelOutputWithNoAttention,
550
+ ]:
551
+ r"""
552
+ spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
553
+ Tensor containing the spatial dimensions (height, width) of the input images.
554
+ """
555
+ # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
556
+ # output_hidden_states = (
557
+ # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
558
+ # )
559
+
560
+ hidden_states = self.embeddings(pixel_values, grid_thws)
561
+
562
+ last_hidden_state, hidden_states = self.encoder(hidden_states, grid_thws, output_hidden_states)
563
+ last_hidden_state = self.post_layernorm(last_hidden_state)
564
+
565
+ if not return_dict:
566
+ output = (last_hidden_state,)
567
+ output += (hidden_states,) if output_hidden_states else ()
568
+ return output
569
+
570
+ return BaseModelOutputWithNoAttention(
571
+ last_hidden_state=last_hidden_state,
572
+ hidden_states=hidden_states
573
+ )
574
+
575
+ class Siglip2PreTrainedModel(PreTrainedModel):
576
+ config_class = Siglip2NavitConfig
577
+ base_model_prefix = "siglip2_navit"
578
+ supports_gradient_checkpointing = True
579
+
580
+ _no_split_modules = [
581
+ "Siglip2VisionEmbeddings",
582
+ "Siglip2EncoderLayer",
583
+ ]
584
+ _supports_flash_attn_2 = True
585
+ _supports_sdpa = False
586
+ _supports_flex_attn = False
587
+ _supports_attention_backend = True
588
+
589
+
590
+ class Siglip2NavitModel(Siglip2PreTrainedModel):
591
+ config_class = Siglip2NavitConfig
592
+ main_input_name = "pixel_values"
593
+
594
+ def __init__(self, config: Siglip2NavitConfig):
595
+ super().__init__(config)
596
+
597
+ self.vision_model = Siglip2VisionTransformer(config)
598
+
599
+ def get_input_embeddings(self) -> nn.Module:
600
+ return self.vision_model.embeddings.patch_embedding
601
+
602
+ def forward(
603
+ self,
604
+ pixel_values: torch.FloatTensor,
605
+ grid_thws: torch.LongTensor,
606
+ output_hidden_states: Optional[bool] = None,
607
+ return_dict: Optional[bool] = None,
608
+ ) -> Union[
609
+ Tuple[torch.Tensor],
610
+ Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
611
+ BaseModelOutputWithNoAttention,
612
+ ]:
613
+
614
+ if output_hidden_states is None:
615
+ output_hidden_states = self.config.output_hidden_states
616
+ if return_dict is None:
617
+ return_dict = self.config.use_return_dict
618
+
619
+ return self.vision_model(
620
+ pixel_values=pixel_values,
621
+ grid_thws=grid_thws,
622
+ output_hidden_states=output_hidden_states,
623
+ return_dict=return_dict,
624
+ )
625
+
626
+ class VisualEmbedding(torch.nn.Embedding):
627
+ """
628
+ A visual embedding layer that can handle both discrete token IDs (long) and continuous
629
+ soft-token probabilities (float).
630
+ """
631
+
632
+ def forward(self, visual_tokens: Tensor) -> Tensor:
633
+ if visual_tokens.dtype in [torch.int8, torch.int16, torch.int32, torch.int64, torch.long]:
634
+ return super().forward(visual_tokens)
635
+ # Handle soft tokens (probabilities) by matrix multiplication with the embedding weight
636
+ return torch.matmul(visual_tokens, self.weight)
637
+
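+ # VisualEmbedding soft-token example: for probabilities p of shape (n, vocab_size),
+ # p @ weight is the probability-weighted mixture of embedding rows; a one-hot p
+ # reduces to a plain lookup.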
638
+
639
+ class VisualTokenizer(torch.nn.Module):
640
+ """
641
+ Tokenizes images or videos into a sequence of continuous visual tokens.
642
+ """
643
+
644
+ def __init__(self, vit, visual_vocab_size, image_processor_name_or_path, *args, **kwargs):
645
+ super().__init__(*args, **kwargs)
646
+ self.vit = vit
647
+ self.image_processor = AutoImageProcessor.from_pretrained(image_processor_name_or_path, do_center_crop=False)
648
+ head_dim = visual_vocab_size - len(INDICATOR_IDS)
649
+ self.head = torch.nn.Sequential(
650
+ torch.nn.Linear(self.vit.config.hidden_size * self.vit.config.hidden_stride ** 2, head_dim, bias=False),
651
+ torch.nn.LayerNorm(head_dim)
652
+ )
653
+
654
+ def _encode(self, pixel_values, grid_thws):
655
+ output = self.vit(pixel_values, grid_thws, output_hidden_states=True, return_dict=True)
656
+ features = output.hidden_states[-1]
657
+ seq_len, _ = features.shape
658
+ features = features.reshape(seq_len // (self.vit.config.hidden_stride ** 2), -1)
659
+ return features
660
+
661
+ # Adapted from qwen2_vl
662
+ @staticmethod
663
+ def smart_resize(
664
+ height: int, width: int, factor: int = 28, min_pixels: int = 448 * 448, max_pixels: int = 1344 * 1792
665
+ ):
666
+ """Rescales the image so that the following conditions are met:
667
+ 1. Both dimensions are divisible by 'factor'.
668
+ 2. The total number of pixels is within ['min_pixels', 'max_pixels'].
669
+ 3. The aspect ratio is maintained as closely as possible.
670
+ """
671
+ if height < factor or width < factor:
672
+ if height < width:
673
+ width = round(factor / height * width)
674
+ height = factor
675
+ else:
676
+ height = round(factor / width * height)
677
+ width = factor
678
+
679
+ elif max(height, width) / min(height, width) > 200:
680
+ if height > width:
681
+ height = 200 * width
682
+ else:
683
+ width = 200 * height
684
+
685
+ h_bar = round(height / factor) * factor
686
+ w_bar = round(width / factor) * factor
687
+ if h_bar * w_bar > max_pixels:
688
+ beta = math.sqrt((height * width) / max_pixels)
689
+ h_bar = math.floor(height / beta / factor) * factor
690
+ w_bar = math.floor(width / beta / factor) * factor
691
+ elif h_bar * w_bar < min_pixels:
692
+ beta = math.sqrt(min_pixels / (height * width))
693
+ h_bar = math.ceil(height * beta / factor) * factor
694
+ w_bar = math.ceil(width * beta / factor) * factor
695
+ return h_bar, w_bar
696
+
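+ # smart_resize worked example with the defaults: smart_resize(1000, 700, factor=28)
+ # rounds to (1008, 700); 1008 * 700 = 705,600 pixels lies inside [448*448, 1344*1792],
+ # so no further rescaling is needed.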
697
+ def preprocess(
698
+ self,
699
+ image: Optional[PIL.Image.Image] = None,
700
+ video: Optional[List[PIL.Image.Image]] = None,
701
+ min_pixels: Optional[int] = None,
702
+ max_pixels: Optional[int] = None
703
+ ):
704
+ patch_size = self.vit.config.patch_size
705
+ temporal_patch_size = self.vit.config.temporal_patch_size
706
+ hidden_stride = self.vit.config.hidden_stride
707
+ assert (image is None) ^ (video is None), "Invalid input: expect either image or video"
708
+ if image is not None:
709
+ images = [image]
710
+ else:
711
+ images = video
712
+ images = [image.convert("RGB") if image.mode != 'RGB' else image for image in images]
713
+ width, height = images[0].size
714
+ processed_images = []
715
+ for image in images:
716
+ resized_height, resized_width = self.smart_resize(
717
+ height,
718
+ width,
719
+ factor=patch_size * hidden_stride,
720
+ min_pixels=min_pixels,
721
+ max_pixels=max_pixels,
722
+ )
723
+ new_size = dict(height=resized_height, width=resized_width)
724
+ new_image = self.image_processor.preprocess(image, size=new_size, return_tensors="np")['pixel_values'][0]
725
+ processed_images.append(new_image)
726
+
727
+ patches = np.array(processed_images)
728
+ if patches.shape[0] % temporal_patch_size != 0:
729
+ repeats = np.repeat(patches[-1][np.newaxis], temporal_patch_size - 1, axis=0)
730
+ patches = np.concatenate([patches, repeats], axis=0)
731
+ channel = patches.shape[1]
732
+ grid_t = patches.shape[0] // temporal_patch_size
733
+ grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
734
+ grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
735
+
736
+ patches = patches.reshape(
737
+ grid_t, temporal_patch_size, channel,
738
+ grid_h // hidden_stride, hidden_stride, patch_size,
739
+ grid_w // hidden_stride, hidden_stride, patch_size,
740
+ )
741
+ patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
742
+ flatten_patches = patches.reshape(
743
+ grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
744
+ )
745
+ flatten_patches = torch.tensor(flatten_patches)
746
+
747
+ return flatten_patches, grid_thw
748
+
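+ # preprocess illustrative shapes (assuming patch_size=14, hidden_stride=2,
+ # temporal_patch_size=1): a 1008x700 image yields grid_thw = [[1, 72, 50]] and
+ # flatten_patches of shape (3600, 3 * 1 * 14 * 14) -- one flattened row per 14x14 patch.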
749
+ def forward(
750
+ self, pixel_values, grid_thws
751
+ ) -> torch.Tensor: # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize]
752
+ features = self._encode(pixel_values, grid_thws)
753
+ logits = self.head(features)
754
+ tokens = torch.softmax(logits, dim=-1, dtype=torch.float32).to(logits.dtype)
755
+
756
+ token_len, _ = tokens.shape
757
+ padding_tensor = torch.zeros(size=(token_len, len(INDICATOR_IDS)),
758
+ dtype=tokens.dtype,
759
+ device=tokens.device,
760
+ layout=tokens.layout,
761
+ requires_grad=False)
762
+ tokens = torch.cat((tokens, padding_tensor), dim=1)
763
+ return tokens
764
+
765
+
766
+ class OvisPreTrainedModel(PreTrainedModel):
767
+ config_class = Ovis2_5_Config
768
+ base_model_prefix = "ovis2_5"
769
+
770
+
771
+ class Ovis2_5(OvisPreTrainedModel):
772
+ _supports_flash_attn_2 = True
773
+ _supports_flash_attn_3 = True
774
+
775
+ def __init__(self, config: Ovis2_5_Config, *inputs, **kwargs):
776
+ super().__init__(config, *inputs, **kwargs)
777
+ attn_implementation = get_attention_type_by_system()
778
+ print(f"Use {attn_implementation} for LLM!")
779
+ self.llm = AutoModelForCausalLM.from_config(
780
+ self.config.llm_config,
781
+ attn_implementation=attn_implementation,
782
+ )
783
+ assert self.config.hidden_size == self.llm.config.hidden_size, "hidden size mismatch"
784
+ self.text_tokenizer = AutoTokenizer.from_pretrained(self.config.name_or_path)
785
+ self.visual_tokenizer = VisualTokenizer(vit=AutoModel.from_config(self.config.vit_config),
786
+ visual_vocab_size=self.config.visual_vocab_size,
787
+ image_processor_name_or_path=self.config.name_or_path)
788
+
789
+ self.vte = VisualEmbedding(self.config.visual_vocab_size, self.config.hidden_size,
790
+ device=self.visual_tokenizer.vit.device, dtype=self.visual_tokenizer.vit.dtype)
791
+ indicator_token_indices = torch.arange(
792
+ self.config.visual_vocab_size - len(INDICATOR_IDS),
793
+ self.config.visual_vocab_size,
794
+ dtype=torch.long
795
+ )
796
+ self.register_buffer("indicator_token_indices", indicator_token_indices, persistent=False)
797
+
798
+ def _merge_modules(modules_list: tuple):
799
+ merged_modules = []
800
+ for modules in modules_list:
801
+ merged_modules.extend(modules if modules else [])
802
+ return merged_modules
803
+
804
+ # Standard model configurations for parallelism and device placement
805
+ self._no_split_modules = _merge_modules(
806
+ (self.llm._no_split_modules, self.visual_tokenizer.vit._no_split_modules))
807
+ self._skip_keys_device_placement = self.llm._skip_keys_device_placement
808
+ self._keep_in_fp32_modules = _merge_modules(
809
+ (self.llm._keep_in_fp32_modules, self.visual_tokenizer.vit._keep_in_fp32_modules))
810
+ self.is_parallelizable = all((self.llm.is_parallelizable, self.visual_tokenizer.vit.is_parallelizable))
811
+ # self.supports_gradient_checkpointing = True
812
+ self.supports_gradient_checkpointing = False
813
+
814
+ def tie_weights(self):
815
+ self.llm.tie_weights()
816
+
817
+ def get_wte(self):
818
+ return self.llm.get_input_embeddings()
819
+
820
+ def forward(
821
+ self,
822
+ input_ids: torch.Tensor,
823
+ attention_mask: torch.Tensor,
824
+ pixel_values: Optional[torch.Tensor],
825
+ grid_thws: Optional[torch.Tensor],
826
+ labels: Optional[torch.Tensor] = None,
827
+ **kwargs
828
+ ):
829
+ inputs_embeds = self.merge_multimodal(
830
+ input_ids=input_ids,
831
+ pixel_values=pixel_values,
832
+ grid_thws=grid_thws,
833
+ )
834
+ return self.llm(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
835
+
836
+ def merge_multimodal(
837
+ self,
838
+ input_ids: torch.Tensor,
839
+ pixel_values: Optional[torch.Tensor],
840
+ grid_thws: Optional[torch.Tensor],
841
+ ):
842
+ placeholder_token_mask = torch.lt(input_ids, 0)
843
+ multimodal_embeds = self.get_wte()(torch.masked_fill(input_ids, placeholder_token_mask, 0))
844
+
845
+ if pixel_values is not None:
846
+ visual_indicator_embeds = self.vte(self.indicator_token_indices).to(
847
+ dtype=multimodal_embeds.dtype, device=multimodal_embeds.device
848
+ )
849
+ visual_tokens = self.visual_tokenizer(pixel_values, grid_thws)
850
+ visual_embeds = self.vte(visual_tokens).to(dtype=multimodal_embeds.dtype, device=multimodal_embeds.device)
851
+
852
+ for i, indicator_id in enumerate(INDICATOR_IDS):
853
+ multimodal_embeds[input_ids == indicator_id] = visual_indicator_embeds[i]
854
+ multimodal_embeds[input_ids == VISUAL_ATOM_ID] = visual_embeds
855
+
856
+ return multimodal_embeds
857
+
858
+ def _merge_inputs(
859
+ self, raw_input_ids, placeholder_id, grid_thws, indicator_begin_id, indicator_end_id
860
+ ):
861
+ input_ids = []
862
+ prev_index = 0
863
+ placeholder_indexes = [i for i, v in enumerate(raw_input_ids) if v == placeholder_id]
864
+ for placeholder_index, grid_thw in zip(placeholder_indexes, grid_thws):
865
+ input_ids.extend(raw_input_ids[prev_index:placeholder_index])
866
+ num_image_atoms = grid_thw.prod().item()
867
+ num_image_atoms //= self.visual_tokenizer.vit.config.hidden_stride ** 2
868
+ num_image_atoms //= self.visual_tokenizer.vit.config.temporal_patch_size
869
+ input_ids.extend([indicator_begin_id] + [VISUAL_ATOM_ID] * num_image_atoms + [indicator_end_id])
870
+ prev_index = placeholder_index + 1
871
+ input_ids.extend(raw_input_ids[prev_index:])
872
+ return input_ids
873
+
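+ # _merge_inputs example (assuming hidden_stride=2, temporal_patch_size=1):
+ # grid_thw = (1, 28, 28) gives 1*28*28 // 2**2 // 1 = 196 visual atoms, so one
+ # placeholder expands to [indicator_begin_id] + 196 * [VISUAL_ATOM_ID] + [indicator_end_id].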
874
+ def _tokenize_with_visual_placeholder(self, text):
875
+ placeholder = VIDEO_PLACEHOLDER if VIDEO_PLACEHOLDER in text else IMAGE_PLACEHOLDER
876
+ placeholder_id = VIDEO_PLACEHOLDER_ID if VIDEO_PLACEHOLDER in text else IMAGE_PLACEHOLDER_ID
877
+ chunks = [self.text_tokenizer(chunk, add_special_tokens=False).input_ids for chunk in text.split(placeholder)]
878
+ input_ids = chunks[0]
879
+ for chunk in chunks[1:]:
880
+ input_ids.append(placeholder_id)
881
+ input_ids.extend(chunk)
882
+ return input_ids
883
+
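+ # _tokenize_with_visual_placeholder example: "<image>\nDescribe it" becomes
+ # [IMAGE_PLACEHOLDER_ID, *ids("\nDescribe it")]; the negative placeholder id survives
+ # tokenization and is expanded by _merge_inputs.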
884
+ def preprocess_inputs(
885
+ self,
886
+ messages: List[Union[str, Dict]],
887
+ min_pixels=448 * 448,
888
+ max_pixels=1344 * 1792,
889
+ add_generation_prompt=True,
890
+ enable_thinking=False
891
+ ):
892
+ text = self.text_tokenizer.apply_chat_template(
893
+ messages,
894
+ tokenize=False,
895
+ add_generation_prompt=add_generation_prompt,
896
+ enable_thinking=enable_thinking
897
+ )
898
+ input_ids = self._tokenize_with_visual_placeholder(text)
899
+ images = []
900
+ videos = []
901
+ for message in messages:
902
+ content = message["content"]
903
+ if isinstance(content, list):
904
+ images.extend([item["image"] for item in content if item.get("image") is not None])
905
+ videos.extend([item["video"] for item in content if item.get("video") is not None])
906
+ if images and videos:
907
+ raise ValueError(
908
+ "Multiple visual input data types detected (both image and video provided). "
909
+ "This model supports only one type of visual input data at a time. "
910
+ "Please provide either image or video, but not both."
911
+ )
912
+
913
+ pixel_values, grid_thws = None, None
914
+ if images:
915
+ pixel_values, grid_thws = zip(
916
+ *(self.visual_tokenizer.preprocess(image=image, min_pixels=min_pixels, max_pixels=max_pixels)
917
+ for image in images)
918
+ )
919
+ input_ids = self._merge_inputs(
920
+ input_ids, IMAGE_PLACEHOLDER_ID, grid_thws, INDICATOR_IDS[0], INDICATOR_IDS[1]
921
+ )
922
+ pixel_values = torch.cat(pixel_values, dim=0)
923
+ grid_thws = torch.cat(grid_thws, dim=0)
924
+ elif videos:
925
+ assert len(videos) == 1, "only support single video"
926
+ pixel_values, grid_thws = self.visual_tokenizer.preprocess(
927
+ video=videos[0], min_pixels=min_pixels, max_pixels=max_pixels
928
+ )
929
+ input_ids = self._merge_inputs(
930
+ input_ids, VIDEO_PLACEHOLDER_ID, grid_thws, INDICATOR_IDS[2], INDICATOR_IDS[3]
931
+ )
932
+
933
+ input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
934
+
935
+ return input_ids, pixel_values, grid_thws
936
+
937
+ def generate(
938
+ self,
939
+ inputs: Optional[torch.Tensor] = None,
940
+ **kwargs,
941
+ ) -> Union[GenerateOutput, torch.LongTensor]:
942
+ attention_mask = torch.ne(inputs, self.text_tokenizer.pad_token_id).to(device=inputs.device)
943
+ inputs_embeds = self.merge_multimodal(
944
+ input_ids=inputs,
945
+ pixel_values=kwargs.pop('pixel_values', None),
946
+ grid_thws=kwargs.pop('grid_thws', None)
947
+ )
948
+ enable_thinking = kwargs.pop('enable_thinking', False)
949
+ enable_thinking_budget = kwargs.pop('enable_thinking_budget', False)
950
+ thinking_budget = kwargs.pop('thinking_budget', 1024)
951
+
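+ # Two-phase "thinking budget": generate up to thinking_budget tokens first; if the
+ # think phase has not closed, force-append an early-stopping </think> segment and
+ # continue decoding the answer with whatever max_new_tokens budget remains.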
952
+ if enable_thinking and enable_thinking_budget:
953
+ actual_max_new_tokens = kwargs['max_new_tokens']
954
+ kwargs['max_new_tokens'] = thinking_budget
955
+ generated_ids = self.llm.generate(inputs=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs)
956
+ output_ids = generated_ids
957
+ output_ids_list = generated_ids[0]
958
+
959
+ # check if the generation has already finished (151645 is <|im_end|>)
960
+ if 151645 not in output_ids_list:
961
+ # check if the thinking process has finished (151668 is </think>)
962
+ # and prepare the second model input
963
+ if 151668 not in output_ids_list:
964
+ early_stopping_text = "\n\nConsidering the limited time by the user, I have to give the solution based on the thinking directly now.\n</think>\n\n"
965
+ early_stopping_ids = self.text_tokenizer(early_stopping_text, return_tensors="pt", return_attention_mask=False).input_ids.to(inputs.device)
966
+ input_ids_appendent = torch.cat([output_ids, early_stopping_ids], dim=-1)
967
+ if 'streamer' in kwargs: kwargs['streamer'].put(early_stopping_ids)
968
+ else:
969
+ input_ids_appendent = output_ids
970
+
971
+
972
+ # second generation
973
+ new_inputs = torch.cat([inputs, input_ids_appendent], dim=-1)
974
+ attention_mask = torch.ne(new_inputs, self.text_tokenizer.pad_token_id).to(device=inputs.device)
975
+ inputs_embeds_appendent = self.merge_multimodal(
976
+ input_ids=input_ids_appendent,
977
+ pixel_values=None,
978
+ grid_thws=None
979
+ )
980
+ new_inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_appendent], dim=-2)
981
+
982
+ kwargs['max_new_tokens'] = inputs_embeds.size(-2) + actual_max_new_tokens - new_inputs_embeds.size(-2)
983
+ generated_ids2 = self.llm.generate(inputs=None, inputs_embeds=new_inputs_embeds, attention_mask=attention_mask, **kwargs)
984
+ if 'streamer' in kwargs: kwargs['streamer'].manual_end()
985
+ return torch.cat([input_ids_appendent, generated_ids2], dim=-1)
986
+
987
+ else:
988
+ if 'streamer' in kwargs: kwargs['streamer'].manual_end()
989
+ return generated_ids
990
+
991
+ else:
992
+ generated_ids = self.llm.generate(inputs=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs)
993
+ if 'streamer' in kwargs: kwargs['streamer'].manual_end()
994
+ return generated_ids
995
+
996
+
997
+ AutoConfig.register('siglip2_navit', Siglip2NavitConfig)
998
+ AutoModel.register(Siglip2NavitConfig, Siglip2NavitModel)
999
+ AutoConfig.register("ovis2_5", Ovis2_5_Config)
1000
+ AutoModelForCausalLM.register(Ovis2_5_Config, Ovis2_5)
ovis_image/model/tokenizer.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 AIDC-AI
2
+ # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
4
+ # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
5
+
6
+ from typing import List
7
+
8
+ import torch
9
+ from transformers import AutoTokenizer
10
+
11
+
12
+ class OvisTokenizer:
13
+ """
14
+ Tokenizes and encodes/decodes text using the Ovis tokenizer.
15
+
16
+ Args:
17
+ model_path (str): Path to the tokenizer on Hugging Face.
18
+
19
+ """
20
+
21
+ def __init__(
22
+ self,
23
+ model_path: str = "Ovis2.5-2B",
24
+ max_length: int = 256,
25
+ **hf_kwargs
26
+ ):
27
+ super().__init__()
28
+ self._tokenizer = AutoTokenizer.from_pretrained(model_path)
29
+ self.system_prompt = "Describe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background: "
30
+ self.user_prompt_begin_id = 28
31
+ self._max_length = max_length + self.user_prompt_begin_id
32
+
33
+ def encode(
34
+ self,
35
+ s: str,
36
+ system_prompt = ""
37
+ ) -> torch.Tensor:
38
+ """
39
+ Encode the prompt text into tokens.
40
+ """
41
+ if len(system_prompt) == 0:
42
+ system_prompt = self.system_prompt
43
+ messages = [{
44
+ "role": "user",
45
+ "content": system_prompt + s,
46
+ }]
47
+ text = self._tokenizer.apply_chat_template(
48
+ messages,
49
+ tokenize=False,
50
+ add_generation_prompt=True,
51
+ enable_thinking=False
52
+ )
53
+ tokens = self._tokenizer(
54
+ text,
55
+ padding="max_length",
56
+ truncation=True,
57
+ max_length=self._max_length,
58
+ return_tensors="pt",
59
+ add_special_tokens=False,
60
+ )
61
+ return tokens.input_ids, tokens.attention_mask
62
+
63
+ def decode(self, t: List[int]) -> str:
64
+ return self._tokenizer.decode(t, skip_special_tokens=False)
65
+
66
+
67
+ def build_ovis_tokenizer(tokenizer_path):
68
+ max_ovis_encoding_len = 256
69
+ ovis_tokenizer = OvisTokenizer(
70
+ tokenizer_path,
71
+ max_length=max_ovis_encoding_len,
72
+ )
73
+ return ovis_tokenizer
74
+
75
+
76
+ if __name__ == "__main__":
77
+ ovis_path = "/mnt/workspace/cv_multimodal/aigc/huggingface/Ovis2.5-2B"
78
+ text = "a cute cat"
79
+ ovis_tokenizer = OvisTokenizer(ovis_path, max_length=256)
80
+ ovis_token = ovis_tokenizer.encode(text)
81
+ input_ids, attention_mask = ovis_token
82
+ print(input_ids.shape, attention_mask.shape)
ovis_image/sampling.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 AIDC-AI
2
+ # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
4
+ # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
5
+ import math
6
+ import os
7
+ from typing import Callable, Optional
8
+ from PIL import ExifTags, Image
9
+ import torch
10
+ from torch import Tensor
11
+ from einops import rearrange, repeat
12
+
13
+ from ovis_image.dataset.image_util import build_img_ids
14
+ from ovis_image.model.autoencoder import AutoEncoder
15
+ from ovis_image.model.hf_embedder import OvisEmbedder
16
+ from ovis_image.model.model import OvisImageModel
17
+ from ovis_image.utils import (
18
+ generate_noise_latent,
19
+ pack_latents,
20
+ unpack_latents,
21
+ generate_txt_ids,
22
+ )
23
+
24
+
25
+
26
+ def time_shift(mu: float, sigma: float, t: Tensor):
27
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
28
+
29
+
30
+ def get_lin_function(
31
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
32
+ ) -> Callable[[float], float]:
33
+ m = (y2 - y1) / (x2 - x1)
34
+ b = y1 - m * x1
35
+ return lambda x: m * x + b
36
+
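+ # With the defaults this interpolates mu linearly from 0.5 at 256 image tokens to 1.15
+ # at 4096 tokens: slope m = (1.15 - 0.5) / (4096 - 256) ≈ 1.69e-4.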
37
+
38
+ def sample_timesteps(batch_size, image_seq_len=None, base_shift=None, max_shift=None):
39
+ if image_seq_len is None or base_shift is None or max_shift is None:
40
+ logit_mean = 0
41
+ else:
42
+ logit_mean = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
43
+ logit_std = 1.0
44
+ timesteps = torch.normal(
45
+ mean=logit_mean, std=logit_std, size=(batch_size,)
46
+ )
47
+ timesteps = torch.nn.functional.sigmoid(timesteps)
48
+ return timesteps
49
+
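+ # Training-time sampler: draws t from a logit-normal distribution by pushing
+ # N(logit_mean, 1) samples through a sigmoid; logit_mean is shifted with the image
+ # sequence length via get_lin_function when shift parameters are provided.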
50
+
51
+ def get_schedule(
52
+ num_steps: int,
53
+ image_seq_len: int,
54
+ base_shift: float = 0.5,
55
+ max_shift: float = 1.15,
56
+ shift: bool = True,
57
+ ) -> list[float]:
58
+ # extra step for zero
59
+ timesteps = torch.linspace(1, 0, num_steps + 1)
60
+
61
+ # shifting the schedule to favor high timesteps for higher signal images
62
+ if shift:
63
+ # estimate mu based on linear estimation between two points
64
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
65
+ timesteps = time_shift(mu, 1.0, timesteps)
66
+
67
+ return timesteps.tolist()
68
+
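+ # get_schedule illustration: get_schedule(4, 4096) uses mu = 1.15 and shifts the
+ # uniform grid [1.0, 0.75, 0.5, 0.25, 0.0] to roughly [1.0, 0.90, 0.76, 0.51, 0.0],
+ # spending more of the step budget at high noise levels.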
69
+
70
+ def generate_image(
71
+ device: torch.device,
72
+ dtype: torch.dtype,
73
+ model: OvisImageModel,
74
+ prompt: str,
75
+ autoencoder: AutoEncoder,
76
+ ovis_tokenizer,
77
+ ovis_encoder: OvisEmbedder,
78
+ img_height: int = 256,
79
+ img_width: int = 256,
80
+ denoising_steps: int = 50,
81
+ cfg_scale: float = 5.0,
82
+ seed: int = 42,
83
+ ) -> torch.Tensor:
84
+ """
85
+ Sample a single image from noise using the given prompt and return the decoded image tensor.
86
+ For randomized noise generation, the random seed should already be set at the beginning of training.
87
+ Since we always use the local random seed on this rank, we don't need to pass in the seed again.
88
+ """
89
+
90
+ # Round the resolution to a multiple of 16 so it is compatible with packing and latent-space conversion, matching training time.
91
+ img_height = 16 * (img_height // 16)
92
+ img_width = 16 * (img_width // 16)
93
+
94
+ enable_classifier_free_guidance = True
95
+
96
+ # Tokenize the prompt. Unsqueeze to add a batch dimension.
97
+ ovis_token_ids, ovis_token_mask = ovis_tokenizer.encode(prompt)
98
+ ovis_encodings = ovis_encoder(
99
+ ovis_token_ids.to(device=device), ovis_token_mask.to(device=device)
100
+ )
101
+
102
+ if enable_classifier_free_guidance:
103
+ empty_ovis_token_ids, empty_ovis_token_mask = ovis_tokenizer.encode("")
104
+ empty_ovis_encodings = ovis_encoder(
105
+ empty_ovis_token_ids.to(device=device), empty_ovis_token_mask.to(device=device)
106
+ )
107
+
108
+ latents = generate_noise_latent(
109
+ ovis_token_ids.shape[0],
110
+ img_height, img_width, device, dtype, seed=seed,
111
+ latent_channel=autoencoder.params.z_channels)
112
+
113
+ img = denoise(
114
+ device=device,
115
+ dtype=dtype,
116
+ model=model,
117
+ latents=latents,
118
+ denoising_steps=denoising_steps,
119
+ ovis_encodings=ovis_encodings,
120
+ enable_classifier_free_guidance=enable_classifier_free_guidance,
121
+ empty_ovis_encodings=(
122
+ empty_ovis_encodings if enable_classifier_free_guidance else None
123
+ ),
124
+ classifier_free_guidance_scale=cfg_scale,
125
+ )
126
+
127
+ img = autoencoder.decode(img)
128
+ return img
129
+
130
+
131
+ def denoise(
132
+ device: torch.device,
133
+ dtype: torch.dtype,
134
+ model: OvisImageModel,
135
+ latents: torch.Tensor,
136
+ denoising_steps: int,
137
+ ovis_encodings: torch.Tensor,
138
+ enable_classifier_free_guidance: bool = False,
139
+ empty_ovis_encodings: torch.Tensor | None = None,
140
+ classifier_free_guidance_scale: float | None = None,
141
+ ) -> torch.Tensor:
142
+ """
143
+ Sampling images from noise using a given prompt, by running inference with trained model.
144
+ Save the generated images to the given output path.
145
+ """
146
+ bsz = ovis_encodings.shape[0]
147
+ _, latent_channels, latent_height, latent_width = latents.shape
148
+
149
+ # create denoising schedule
150
+ timesteps = get_schedule(denoising_steps, latent_height * latent_width, shift=True)
151
+
152
+ # create positional encodings
153
+
154
+ latent_pos_enc = build_img_ids(
155
+ latent_height // 2, latent_width // 2,
156
+ ).to(latents)
157
+ latent_pos_enc = repeat(latent_pos_enc, 'l c -> bsz l c', bsz=bsz)
158
+ ovis_txt_ids = generate_txt_ids(ovis_encodings, time_id=0).to(latents)
159
+
160
+ if enable_classifier_free_guidance:
161
+ ovis_encodings = torch.cat([empty_ovis_encodings, ovis_encodings], dim=0)
162
+ latent_pos_enc = torch.cat([latent_pos_enc, latent_pos_enc], dim=0)
163
+ ovis_txt_ids = torch.cat([ovis_txt_ids, ovis_txt_ids], dim=0)
164
+
165
+ # convert img-like latents into sequences of patches
166
+ latents = pack_latents(latents)
167
+
168
+ # Euler integration over consecutive (t_curr, t_prev) pairs of the shifted schedule
169
+ for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
170
+ if enable_classifier_free_guidance:
171
+ img = torch.cat([latents, latents], dim=0)
172
+ t_vec = torch.full((bsz * 2,), t_curr, dtype=dtype, device=device)
173
+ else:
174
+ img = latents
175
+ t_vec = torch.full((bsz,), t_curr, dtype=dtype, device=device)
176
+ model_pred = model(
177
+ img=img,
178
+ img_ids=latent_pos_enc,
179
+ txt=ovis_encodings,
180
+ txt_ids=ovis_txt_ids,
181
+ timesteps=t_vec,
182
+ )
183
+ if enable_classifier_free_guidance:
184
+ pred_u, pred_c = model_pred.chunk(2)
185
+ pred = pred_u + classifier_free_guidance_scale * (pred_c - pred_u)
186
+ else:
187
+ pred = model_pred
188
+
189
+ latents = latents + (t_prev - t_curr) * pred
190
+
191
+ # convert sequences of patches into img-like latents
192
+ latents = unpack_latents(latents, latent_height, latent_width)
193
+
194
+ return latents
195
+
196
+
197
+
198
+ def save_image(
199
+ name: str,
200
+ output_dir: str,
201
+ x: torch.Tensor,
202
+ add_sampling_metadata: bool,
203
+ prompt: str,
204
+ verbose = True,
205
+ ):
206
+ if verbose:
207
+ print(f"Saving image to {output_dir}/{name}")
208
+ os.makedirs(output_dir, exist_ok=True)
209
+ output_name = os.path.join(output_dir, name)
210
+
211
+ # bring into PIL format and save
212
+ x = x.clamp(-1, 1)
213
+ x = rearrange(x[0], "c h w -> h w c")
214
+
215
+ img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
216
+
217
+ exif_data = Image.Exif()
218
+ exif_data[ExifTags.Base.Software] = "AI generated;txt2img"
219
+ exif_data[ExifTags.Base.Make] = "Ovis"
220
+ exif_data[ExifTags.Base.Model] = name
221
+ if add_sampling_metadata:
222
+ exif_data[ExifTags.Base.ImageDescription] = prompt
223
+ img.save(output_name, exif=exif_data, quality=95, subsampling=0)
224
+
ovis_image/test.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 AIDC-AI
2
+ # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
4
+ # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+ from safetensors.torch import load_file
10
+
11
+ from ovis_image.model.tokenizer import build_ovis_tokenizer
12
+ from ovis_image.model.autoencoder import load_ae
13
+ from ovis_image.model.hf_embedder import OvisEmbedder
14
+ from ovis_image.model.model import OvisImageModel
15
+ from ovis_image.sampling import generate_image, save_image
16
+ from ovis_image import ovis_image_configs
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser()
20
+ parser.add_argument('--model_path', type=str, required=True)
21
+ parser.add_argument('--ovis_path', type=str, default="")
22
+ parser.add_argument('--vae_path', type=str, default="")
23
+ parser.add_argument('--prompt', type=str, default="")
24
+ parser.add_argument('--image_size', type=int, default=1024)
25
+ parser.add_argument('--denoising_steps', type=int, default=50)
26
+ parser.add_argument('--cfg_scale', type=float, default=5.0)
27
+ args = parser.parse_args()
28
+ return args
29
+
30
+ def load_model_weight(model, model_path):
31
+ model_state_dict = load_file(model_path)
32
+ missing_keys, unexpected_keys = model.load_state_dict(model_state_dict)
33
+ print(f"Load Missing Keys {missing_keys}")
34
+ print(f"Load Unexpected Keys {unexpected_keys}")
35
+ return model
36
+
37
+
38
+ def main():
39
+ args = parse_args()
40
+ model_config = ovis_image_configs["ovis-image-7b"]
41
+ device = "cuda"
42
+ _dtype = torch.bfloat16
43
+ print(f"dtype: {_dtype}")
44
+ ovis_image = OvisImageModel(model_config)
45
+ ovis_image = load_model_weight(ovis_image, args.model_path)
46
+ ovis_image = ovis_image.to(device=device, dtype=_dtype)
47
+ ovis_image.eval()
48
+
49
+ ovis_tokenizer = build_ovis_tokenizer(args.ovis_path)
50
+ autoencoder = load_ae(
51
+ args.vae_path,
52
+ model_config.autoencoder_params,
53
+ device=device,
54
+ dtype=_dtype,
55
+ random_init=False,
56
+ )
57
+ autoencoder.eval()
58
+ ovis_encoder = OvisEmbedder(
59
+ model_path=args.ovis_path,
60
+ random_init=False,
61
+ low_cpu_mem_usage=True,
62
+ torch_dtype=torch.bfloat16,
63
+ ).to(device=device, dtype=_dtype)
64
+
65
+ with torch.no_grad():
66
+ image = generate_image(
67
+ device=device,
68
+ dtype=_dtype,
69
+ model=ovis_image,
70
+ prompt=args.prompt,
71
+ autoencoder=autoencoder,
72
+ ovis_tokenizer=ovis_tokenizer,
73
+ ovis_encoder=ovis_encoder,
74
+ img_height=args.image_size,
75
+ img_width=args.image_size,
76
+ denoising_steps=args.denoising_steps,
77
+ cfg_scale=args.cfg_scale,
78
+ seed=42,
79
+ )
80
+ image_name = f"ovis_image.png"
81
+ save_image(
82
+ name=image_name,
83
+ output_dir="outputs",
84
+ x=image,
85
+ add_sampling_metadata=True,
86
+ prompt=args.prompt,
87
+ verbose=False,
88
+ )
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()
ovis_image/utils.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 AIDC-AI
2
+ # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
4
+ # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
5
+
6
+ import torch
7
+ from torch import Tensor
8
+
9
+ def generate_txt_ids(encodings, time_id=0):
10
+ txt_ids = torch.zeros(encodings.shape[0], encodings.shape[1], 3)
11
+ txt_ids[..., 1] = txt_ids[..., 1] + torch.arange(encodings.shape[1])[None, :]
12
+ txt_ids[..., 2] = txt_ids[..., 2] + torch.arange(encodings.shape[1])[None, :]
13
+ txt_ids[..., 0] = time_id
14
+ return txt_ids
15
+
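+ # generate_txt_ids shapes: for encodings of shape (B, L, D) this returns (B, L, 3)
+ # position ids whose axes 1 and 2 both carry arange(L) and whose axis 0 is the
+ # constant time_id.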
16
+
17
+ def generate_noise_latent(
18
+ bsz: int,
19
+ height: int,
20
+ width: int,
21
+ device: str | torch.device,
22
+ dtype: torch.dtype,
23
+ seed: int | None = None,
24
+ latent_channel = None,
25
+ ) -> Tensor:
26
+ """Generate noise latents for the flow model. The random seed will be set at the begining of training.
27
+
28
+ Args:
29
+ bsz (int): batch_size.
30
+ height (int): The height of the image.
31
+ width (int): The width of the image.
32
+ device (str | torch.device): The device to use.
33
+ dtype (torch.dtype): The dtype to use.
34
+
35
+ Returns:
36
+ Tensor: The noise latents.
37
+ Shape: [num_samples, LATENT_CHANNELS, height // IMG_LATENT_SIZE_RATIO, width // IMG_LATENT_SIZE_RATIO]
38
+
39
+ """
40
+ LATENT_CHANNELS, IMAGE_LATENT_SIZE_RATIO = 16, 8
41
+ if latent_channel is not None:
42
+ LATENT_CHANNELS = latent_channel
43
+ return torch.randn(
44
+ bsz,
45
+ LATENT_CHANNELS,
46
+ height // IMAGE_LATENT_SIZE_RATIO,
47
+ width // IMAGE_LATENT_SIZE_RATIO,
48
+ dtype=dtype,
49
+ generator=torch.Generator().manual_seed(seed) if seed is not None else None,
50
+ ).to(device)
51
+
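+ # e.g. generate_noise_latent(1, 1024, 1024, "cuda", torch.bfloat16, seed=42) has shape
+ # [1, 16, 128, 128]; seeding a CPU Generator keeps the draw reproducible across devices.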
52
+
53
+ def pack_latents(x: Tensor) -> Tensor:
54
+ """
55
+ Rearrange latents from an image-like format into a sequence of patches.
56
+ Equivalent to `einops.rearrange("b c (h ph) (w pw) -> b (h w) (c ph pw)")`.
57
+
58
+ Args:
59
+ x (Tensor): The unpacked latents.
60
+ Shape: [bsz, ch, latent height, latent width]
61
+
62
+ Returns:
63
+ Tensor: The packed latents.
64
+ Shape: (bsz, (latent_height // ph) * (latent_width // pw), ch * ph * pw)
65
+ """
66
+ PATCH_HEIGHT, PATCH_WIDTH = 2, 2
67
+
68
+ b, c, latent_height, latent_width = x.shape
69
+ h = latent_height // PATCH_HEIGHT
70
+ w = latent_width // PATCH_WIDTH
71
+
72
+ # [b, c, h*ph, w*pw] -> [b, c, h, w, ph, pw]
73
+ x = x.unfold(2, PATCH_HEIGHT, PATCH_HEIGHT).unfold(3, PATCH_WIDTH, PATCH_WIDTH)
74
+
75
+ # [b, c, h, w, ph, pw] -> [b, h, w, c, ph, pw]
76
+ x = x.permute(0, 2, 3, 1, 4, 5)
77
+
78
+ # [b, h, w, c, ph, pw] -> [b, h*w, c*ph*pw]
79
+ return x.reshape(b, h * w, c * PATCH_HEIGHT * PATCH_WIDTH)
80
+
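+ # pack_latents shape example: [1, 16, 128, 128] gives [1, 64*64, 16*2*2] = [1, 4096, 64].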
81
+
82
+ def unpack_latents(x: Tensor, latent_height: int, latent_width: int) -> Tensor:
83
+ """
84
+ Rearrange latents from a sequence of patches into an image-like format.
85
+ Equivalent to `einops.rearrange("b (h w) (c ph pw) -> b c (h ph) (w pw)")`.
86
+
87
+ Args:
88
+ x (Tensor): The packed latents.
89
+ Shape: (bsz, (latent_height // ph) * (latent_width // pw), ch * ph * pw)
90
+ latent_height (int): The height of the unpacked latents.
91
+ latent_width (int): The width of the unpacked latents.
92
+
93
+ Returns:
94
+ Tensor: The unpacked latents.
95
+ Shape: [bsz, ch, latent height, latent width]
96
+ """
97
+ PATCH_HEIGHT, PATCH_WIDTH = 2, 2
98
+
99
+ b, _, c_ph_pw = x.shape
100
+ h = latent_height // PATCH_HEIGHT
101
+ w = latent_width // PATCH_WIDTH
102
+ c = c_ph_pw // (PATCH_HEIGHT * PATCH_WIDTH)
103
+
104
+ # [b, h*w, c*ph*pw] -> [b, h, w, c, ph, pw]
105
+ x = x.reshape(b, h, w, c, PATCH_HEIGHT, PATCH_WIDTH)
106
+
107
+ # [b, h, w, c, ph, pw] -> [b, c, h, ph, w, pw]
108
+ x = x.permute(0, 3, 1, 4, 2, 5)
109
+
110
+ # [b, c, h, ph, w, pw] -> [b, c, h*ph, w*pw]
111
+ return x.reshape(b, c, h * PATCH_HEIGHT, w * PATCH_WIDTH)
112
+
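+ # unpack_latents inverts pack_latents: unpack_latents(pack_latents(x), H, W) recovers x
+ # whenever H and W are multiples of the 2x2 patch size.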