kzopp commited on
Commit
4a2e18d
·
verified ·
1 Parent(s): f3df932

Upload 9 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ flan-t5-xl-encoder-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
37
+ infinity_2b_reg_Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
Infinity/infinity/models/basic.py ADDED
@@ -0,0 +1,793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Definitions of blocks of VAR transformer model.
3
+ """
4
+
5
+ import math
6
+ import os
7
+ from functools import partial
8
+ from typing import Optional, Tuple, Union
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import numpy as np
14
+ from timm.models.layers import DropPath, drop_path
15
+ from torch.utils.checkpoint import checkpoint
16
+
17
+ # Attention backend selection with fallback hierarchy:
18
+ # 1. SageAttention (optional, 2-5x faster than FlashAttention)
19
+ # 2. FlashAttention (optional, still faster than PyTorch)
20
+ # 3. PyTorch scaled_dot_product_attention (always available)
21
+
22
+ SAGE_ATTN_AVAILABLE = False
23
+ FLASH_ATTN_AVAILABLE = False
24
+ sageattn = None
25
+ sageattn_varlen = None
26
+ flash_attn_func = None
27
+ flash_attn_varlen_kvpacked_func = None
28
+
29
+ # Try to import SageAttention (optional, fastest option)
30
+ try:
31
+ from sageattention import sageattn, sageattn_varlen
32
+ SAGE_ATTN_AVAILABLE = True
33
+ print("[INFO] SageAttention detected - will use for 2-5x speedup over FlashAttention")
34
+ except ImportError:
35
+ pass
36
+
37
+ # Try to import FlashAttention (optional, fallback if SageAttention not available)
38
+ try:
39
+ from flash_attn import flash_attn_func # q, k, or v: BLHc, ret: BLHc
40
+ from flash_attn import flash_attn_varlen_kvpacked_func # qkv: N3Hc, ret: NHc
41
+ FLASH_ATTN_AVAILABLE = True
42
+ if not SAGE_ATTN_AVAILABLE:
43
+ print("[INFO] FlashAttention detected - will use for optimized attention")
44
+ except ImportError:
45
+ pass
46
+
47
+ # Print final status
48
+ if not SAGE_ATTN_AVAILABLE and not FLASH_ATTN_AVAILABLE:
49
+ print("[INFO] Using PyTorch scaled_dot_product_attention (no SageAttention or FlashAttention detected)")
50
+ print(" Install SageAttention for 2-5x speedup: pip install sageattention>=2.2.0 --no-build-isolation")
51
+
52
+ from torch.nn.functional import scaled_dot_product_attention as slow_attn # q, k, v: BHLc
53
+
54
+ # Import GGUF utilities for on-the-fly dequantization
55
+ try:
56
+ import sys
57
+ import os
58
+ # Add parent directory to path to find infinity_gguf_utils
59
+ current_dir = os.path.dirname(os.path.abspath(__file__))
60
+ parent_dirs = [
61
+ os.path.join(current_dir, '../../..'), # From Infinity/infinity/models to root
62
+ os.path.join(current_dir, '../../../..'), # One more level up if needed
63
+ ]
64
+ for parent_dir in parent_dirs:
65
+ if parent_dir not in sys.path:
66
+ sys.path.insert(0, parent_dir)
67
+ from infinity_gguf_utils import dequantize_gguf_tensor, GGUFParameter
68
+ GGUF_AVAILABLE = True
69
+ except ImportError:
70
+ GGUF_AVAILABLE = False
71
+ GGUFParameter = None
72
+
73
def get_weight_for_linear(linear_layer, target_dtype=None):
    """
    Fetch the weight tensor of a linear layer, ready for use with ``F.linear``.

    If the weight is a quantized GGUF parameter it is dequantized on the fly;
    otherwise it is cast to ``target_dtype`` when one is requested.

    Args:
        linear_layer: nn.Linear (or GGUF-backed linear) whose weight to fetch.
        target_dtype: optional dtype the returned tensor should have.

    Returns:
        A dense weight tensor usable directly in ``F.linear``.
    """
    w = linear_layer.weight
    # Quantized path: GGUF weights must be dequantized before any matmul.
    if GGUF_AVAILABLE and isinstance(w, GGUFParameter):
        return dequantize_gguf_tensor(w, target_dtype=target_dtype)
    # Plain (e.g. fp16/fp32) weights only need an optional dtype cast.
    if target_dtype is not None and w.dtype != target_dtype:
        return w.to(dtype=target_dtype)
    return w
92
+
93
+
94
+ # Import flash_attn's fused ops
95
+ try:
96
+ from flash_attn.ops.layer_norm import dropout_add_layer_norm
97
+ from flash_attn.ops.rms_norm import dropout_add_rms_norm
98
+ from flash_attn.ops.rms_norm import rms_norm as rms_norm_impl
99
+ from flash_attn.ops.fused_dense import fused_mlp_func
100
+ flash_fused_op_installed = True
101
+ except ImportError:
102
+ dropout_add_layer_norm = dropout_add_rms_norm = fused_mlp_func = None
103
+ flash_fused_op_installed = False
104
+
105
+ def rms_norm_impl(x, weight, epsilon):
106
+ return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True).add_(epsilon))) * weight
107
+
108
+
109
def precompute_rope2d_freqs_grid(dim, dynamic_resolution_h_w, rope2d_normalized_by_hw, pad_to_multiplier=1, max_height=2048 // 16, max_width=2048 // 16, base=10000.0, device=None, scaling_factor=1.0):
    """
    Precompute 2D RoPE cos/sin tables for every (aspect-ratio, resolution) scale schedule.

    The head dimension is split in half: one half encodes the y (height)
    position, the other the x (width) position.

    Args:
        dim: per-head channel count the rotation is applied over.
        dynamic_resolution_h_w: nested dict; dynamic_resolution_h_w[h_div_w][pn]['scales']
            is a list of (t, h, w) scale tuples.
        rope2d_normalized_by_hw: 0 = top-left crop of the grid, 1 = bilinear
            down-sampling, 2 = nearest-index ("star style") sampling.
        pad_to_multiplier: pad the concatenated sequence length up to a multiple of this.
        max_height, max_width: extent of the precomputed base grid.
        base: RoPE frequency base.
        device: device for the base grid tensors.
        scaling_factor: divides the position indices before the outer product.

    Returns:
        Dict mapping str(tuple(scale_schedule)) -> tensor of shape
        (2, 1, 1, 1, seq_len, dim//2), where index 0 holds cos and 1 holds sin.
    """
    half_dim = dim // 2  # half the channels for y, half for x
    # theta_i = 1 / base^(i / half_dim), i = 0, 2, ..., half_dim - 2
    inv_freq = 1.0 / (base ** (torch.arange(0, half_dim, 2, dtype=torch.int64).float().to(device) / half_dim))
    t_height = torch.arange(max_height, device=device, dtype=torch.int64).type_as(inv_freq) / scaling_factor
    t_width = torch.arange(max_width, device=device, dtype=torch.int64).type_as(inv_freq) / scaling_factor
    freqs_height = torch.outer(t_height, inv_freq)  # (max_height, half_dim/2): y * theta
    freqs_width = torch.outer(t_width, inv_freq)    # (max_width, half_dim/2): x * theta
    # Broadcast both axes over the full grid and concatenate channel-wise.
    freqs_grid_map = torch.concat([
        freqs_height[:, None, :].expand(-1, max_width, -1),
        freqs_width[None, :, :].expand(max_height, -1, -1),
    ], dim=-1)  # (max_height, max_width, half_dim)
    freqs_grid_map = torch.stack([torch.cos(freqs_grid_map), torch.sin(freqs_grid_map)], dim=0)
    # (2, max_height, max_width, half_dim): [cos, sin]

    rope2d_freqs_grid = {}
    for h_div_w in dynamic_resolution_h_w:
        scale_schedule = dynamic_resolution_h_w[h_div_w]['1M']['scales']
        _, ph, pw = scale_schedule[-1]
        max_edge_length = freqs_grid_map.shape[1]
        # Fit the final (ph, pw) aspect ratio into the precomputed square grid.
        if ph >= pw:
            uph, upw = max_edge_length, int(max_edge_length / ph * pw)
        else:
            uph, upw = int(max_edge_length / pw * ph), max_edge_length
        rope_cache_list = []
        for (_, ph, pw) in scale_schedule:
            ph_mul_pw = ph * pw
            if rope2d_normalized_by_hw == 1:  # bilinear down-sample of the cropped grid
                rope_cache = F.interpolate(freqs_grid_map[:, :uph, :upw, :].permute([0, 3, 1, 2]), size=(ph, pw), mode='bilinear', align_corners=True)
                rope_cache = rope_cache.permute([0, 2, 3, 1])  # (2, ph, pw, half_dim)
            elif rope2d_normalized_by_hw == 2:  # nearest-index ("star style") sampling
                _, uph, upw = scale_schedule[-1]
                indices = torch.stack([
                    (torch.arange(ph) * (uph / ph)).reshape(ph, 1).expand(ph, pw),
                    (torch.arange(pw) * (upw / pw)).reshape(1, pw).expand(ph, pw),
                ], dim=-1).round().int()  # (ph, pw, 2)
                indices = indices.reshape(-1, 2)  # (ph*pw, 2)
                rope_cache = freqs_grid_map[:, indices[:, 0], indices[:, 1], :].reshape(2, ph, pw, -1)
            elif rope2d_normalized_by_hw == 0:  # raw top-left crop
                rope_cache = freqs_grid_map[:, :ph, :pw, :]  # (2, ph, pw, half_dim)
            else:
                raise ValueError(f'Unknown rope2d_normalized_by_hw: {rope2d_normalized_by_hw}')
            rope_cache_list.append(rope_cache.reshape(2, ph_mul_pw, -1))
        cat_rope_cache = torch.cat(rope_cache_list, 1)  # (2, seq_len, half_dim)
        if cat_rope_cache.shape[1] % pad_to_multiplier:
            pad = torch.zeros(2, pad_to_multiplier - cat_rope_cache.shape[1] % pad_to_multiplier, half_dim)
            cat_rope_cache = torch.cat([cat_rope_cache, pad], dim=1)
        cat_rope_cache = cat_rope_cache[:, None, None, None]  # (2, 1, 1, 1, seq_len, half_dim)
        # Every resolution bucket with the same aspect ratio shares this cache.
        for pn in dynamic_resolution_h_w[h_div_w]:
            scale_schedule = dynamic_resolution_h_w[h_div_w][pn]['scales']
            tmp_scale_schedule = [(1, h, w) for _, h, w in scale_schedule]
            rope2d_freqs_grid[str(tuple(tmp_scale_schedule))] = cat_rope_cache
    return rope2d_freqs_grid
165
+
166
+
167
def apply_rotary_emb(q, k, scale_schedule, rope2d_freqs_grid, pad_to_multiplier, rope2d_normalized_by_hw, scale_ind):
    """
    Rotate q and k with the precomputed 2D RoPE cache for this scale schedule.

    Args:
        q, k: (batch, heads, seq_len, head_dim) tensors.
        scale_schedule: list of (t, h, w) tuples; str(tuple(...)) keys the cache.
        rope2d_freqs_grid: dict from precompute_rope2d_freqs_grid; mutated in
            place to move the cache onto q/k's device.
        pad_to_multiplier, rope2d_normalized_by_hw: unused here; kept for API parity.
        scale_ind: index of the current scale; positions before it are skipped.

    Returns:
        (q, k) with the rotary embedding applied, same shapes as the inputs.
    """
    qk = torch.stack((q, k), dim=0)  # (2, B, H, L, head_dim): rotate both in one shot
    device_type = qk.device.type
    device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
    with torch.autocast(device_type=device_type, enabled=False):  # RoPE math in full precision
        seq_len = qk.shape[3]
        start = 0
        if scale_ind >= 1:
            assert len(scale_schedule[0]) == 3
            # Token offset of the current scale = total tokens of all earlier scales.
            start = np.sum([item[0] * item[1] * item[2] for item in scale_schedule[:scale_ind]])
        key = str(tuple(scale_schedule))
        rope2d_freqs_grid[key] = rope2d_freqs_grid[key].to(qk.device)
        assert start + seq_len <= rope2d_freqs_grid[key].shape[4]
        rope_cache = rope2d_freqs_grid[key][:, :, :, :, start:start + seq_len]  # (2, 1, 1, 1, L, half_head_dim)
        qk = qk.reshape(*qk.shape[:-1], -1, 2)  # pair up (even, odd) channels
        # Complex rotation with (cos, sin); stack + reshape preserves the channel pairing.
        qk = torch.stack([
            rope_cache[0] * qk[..., 0] - rope_cache[1] * qk[..., 1],
            rope_cache[1] * qk[..., 0] + rope_cache[0] * qk[..., 1],
        ], dim=-1)
        qk = qk.reshape(*qk.shape[:-2], -1)  # (2, B, H, L, head_dim)
        q, k = qk.unbind(dim=0)
    return q, k
188
+
189
+
190
class FastRMSNorm(nn.Module):
    """RMS normalization computed in fp32, cast back to the input dtype.

    Delegates the actual math to the module-level ``rms_norm_impl`` (a fused
    flash-attn kernel when installed, plain PyTorch fallback otherwise).
    """

    def __init__(self, C, eps=1e-6, elementwise_affine=True):
        super().__init__()
        self.C = C
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        # With affine the gain is learnable; otherwise a constant-ones buffer
        # keeps the forward path identical in both modes.
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.ones(C))
        else:
            self.register_buffer('weight', torch.ones(C))

    def forward(self, x):
        # Normalize in fp32 for numerical stability, then restore the dtype.
        return rms_norm_impl(x.float(), self.weight, epsilon=self.eps).to(x.dtype)

    def extra_repr(self) -> str:
        return f'C={self.C}, eps={self.eps:g}, elementwise_affine={self.elementwise_affine}'
207
+
208
+
209
def get_dropout_layer(p):
    """Return an in-place Dropout(p), or a no-op Identity when p == 0."""
    if p > 0:
        return nn.Dropout(p, inplace=True)
    return nn.Identity()
211
+
212
+
213
class FFN(nn.Module):
    """Transformer MLP: Linear -> tanh-approximated GELU -> Linear, with an
    optional flash-attn fused-MLP kernel path."""

    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0., fused_mlp=False):
        super().__init__()
        # Only bind the fused kernel when requested; stays None otherwise.
        self.fused_mlp_func = fused_mlp_func if fused_mlp else None
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU(approximate='tanh')
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = get_dropout_layer(drop)
        self.heuristic = -1  # -1: let the fused kernel pick its own heuristic

    def forward(self, x):
        if self.fused_mlp_func is None:
            # Eager path.
            return self.drop(self.fc2(self.act(self.fc1(x))))
        # Fused path: one kernel covering fc1 + GELU + fc2.
        return self.drop(self.fused_mlp_func(
            x=x,
            weight1=self.fc1.weight,
            weight2=self.fc2.weight,
            bias1=self.fc1.bias,
            bias2=self.fc2.bias,
            activation='gelu_approx',
            save_pre_act=self.training,
            return_residual=False,
            checkpoint_lvl=0,
            heuristic=self.heuristic,
            process_group=None,
        ))

    def extra_repr(self) -> str:
        return f'fused_mlp={self.fused_mlp_func is not None}'
245
+
246
+
247
class FFNSwiGLU(nn.Module):
    """SwiGLU MLP: out = fc2(silu(fcg(x)) * fc1(x)).

    The hidden width is snapped to 2/3 of the requested value, rounded to a
    multiple of 256, to keep the parameter count comparable to a plain MLP.
    """

    def __init__(self, in_features, hidden_features, out_features=None, drop=0., fused_mlp=False):
        super().__init__()
        self.fused_mlp_func = None  # fused kernel not supported for SwiGLU
        hidden_features = round(2 * hidden_features / 3 / 256) * 256
        out_features = out_features or in_features
        self.fcg = nn.Linear(in_features, hidden_features, bias=False)  # gate branch
        self.fc1 = nn.Linear(in_features, hidden_features, bias=False)  # value branch
        self.fc2 = nn.Linear(hidden_features, out_features, bias=False)
        self.drop = get_dropout_layer(drop)

    def forward(self, x):
        gate = F.silu(self.fcg(x), inplace=True)
        return self.drop(self.fc2(gate.mul_(self.fc1(x))))

    def extra_repr(self) -> str:
        return f'fused_mlp={self.fused_mlp_func is not None}'
264
+
265
+
266
class SelfAttention(nn.Module):
    """Multi-head self-attention with optional cosine attention, 2D RoPE, a kv
    cache for inference, and a SageAttention -> FlashAttention -> PyTorch-SDPA
    backend fallback chain."""

    def __init__(
        self, embed_dim=768, num_heads=12,
        proj_drop=0., tau=1, cos_attn=False, customized_flash_attn=True, use_flex_attn=False,
        batch_size=2, pad_to_multiplier=1, rope2d_normalized_by_hw=0,
    ):
        """
        :param embed_dim: model's width
        :param num_heads: num heads of multi-head attention
        :param proj_drop: always 0 for testing
        :param tau: always 1
        :param cos_attn: when True, q and k are L2-normalized and scaled by the head-wise learnable self.scale_mul_1H11
        :param customized_flash_attn: use (B, L, H, c) layout and the optimized backends
        """
        super().__init__()
        assert embed_dim % num_heads == 0
        self.using_flash = customized_flash_attn

        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.tau = tau
        self.cos_attn = cos_attn
        if self.cos_attn:
            self.scale = 1
            # Flash layout keeps heads at dim 2 -> (1,1,H,1); SDPA layout -> (1,H,1,1).
            size = (1, 1, self.num_heads, 1) if self.using_flash else (1, self.num_heads, 1, 1)
            self.scale_mul_1H11 = nn.Parameter(torch.full(size=size, fill_value=4.0).log(), requires_grad=True)
            self.max_scale_mul = torch.log(torch.tensor(100)).item()
        else:
            self.scale = 1 / math.sqrt(self.head_dim) / self.tau

        self.mat_qkv = nn.Linear(embed_dim, embed_dim * 3, bias=False)
        # q and v get learnable biases; k's bias is a fixed zero buffer.
        self.q_bias = nn.Parameter(torch.zeros(embed_dim))
        self.v_bias = nn.Parameter(torch.zeros(embed_dim))
        self.register_buffer('zero_k_bias', torch.zeros(embed_dim))

        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = get_dropout_layer(proj_drop)

        # kv-cache state: only used during inference.
        self.caching = False
        self.cached_k = None
        self.cached_v = None

        self.batch_size = batch_size
        self.use_flex_attn = use_flex_attn
        self.pad_to_multiplier = pad_to_multiplier
        self.rope2d_normalized_by_hw = rope2d_normalized_by_hw

    def kv_caching(self, enable: bool):
        """Toggle the inference-only kv cache; always clears any cached k/v."""
        self.caching = enable
        self.cached_k = None
        self.cached_v = None

    # NOTE: attn_bias_or_two_vector is None during inference
    def forward(self, x, attn_bias_or_two_vector: Union[torch.Tensor, Tuple[torch.IntTensor, torch.IntTensor]], attn_fn=None, scale_schedule=None, rope2d_freqs_grid=None, scale_ind=0):
        """
        :param x: (B, L, C) input, fp32
        :param attn_bias_or_two_vector:
            if not using_flash: a block-wise lower-triangular additive mask
            (0 = visible, -inf = invisible);
            else: a tuple of two 1-dim int vectors (VAR_visible_kvlen, VAR_invisible_qlen).
            None during inference.
        :param attn_fn: optional flex-attention callable (non-flash path only)
        :param scale_schedule, rope2d_freqs_grid, scale_ind: 2D RoPE inputs; RoPE
            is applied only when rope2d_freqs_grid is not None.
        :return: (B, L, C)
        """
        B, L, C = x.shape

        # Fused qkv projection; weight may be GGUF-quantized, hence the helper.
        qkv = F.linear(
            input=x,
            weight=get_weight_for_linear(self.mat_qkv, target_dtype=x.dtype),
            bias=torch.cat((self.q_bias, self.zero_k_bias, self.v_bias)),
        ).view(B, L, 3, self.num_heads, self.head_dim)  # (B, L, 3, H, c)
        if self.using_flash:
            q, k, v = qkv.unbind(dim=2)  # each (B, L, H, c)
            L_dim = 1
        else:
            q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(dim=0)  # each (B, H, L, c)
            L_dim = 2

        if self.cos_attn:
            # L2-normalize q/k and apply the clamped, exponentiated head-wise gain.
            scale_mul = self.scale_mul_1H11.clamp_max(self.max_scale_mul).exp()
            q = F.normalize(q, dim=-1, eps=1e-12).mul(scale_mul).contiguous()
            k = F.normalize(k, dim=-1, eps=1e-12).contiguous()
            v = v.contiguous()
        else:
            # Contiguity keeps the attention kernels happy.
            q = q.contiguous()
            k = k.contiguous()
            v = v.contiguous()
        if rope2d_freqs_grid is not None:
            q, k = apply_rotary_emb(q, k, scale_schedule, rope2d_freqs_grid, self.pad_to_multiplier, self.rope2d_normalized_by_hw, scale_ind)
        if self.caching:  # inference-only kv cache: append along the L dim
            if self.cached_k is None:
                self.cached_k = k
                self.cached_v = v
            else:
                k = self.cached_k = torch.cat((self.cached_k, k), dim=L_dim)
                v = self.cached_v = torch.cat((self.cached_v, v), dim=L_dim)

        if self.using_flash:
            if SAGE_ATTN_AVAILABLE and attn_bias_or_two_vector is None:
                # Fastest backend, inference only (no mask support here).
                try:
                    # SageAttention HND layout wants (B, H, L, c); ours are (B, L, H, c).
                    q_sage = q.transpose(1, 2)
                    k_sage = k.transpose(1, 2)
                    v_sage = v.transpose(1, 2)
                    # SageAttention requires fp16/bf16 inputs.
                    target_dtype = torch.bfloat16 if v.dtype == torch.float32 else v.dtype
                    q_sage = q_sage.to(target_dtype)
                    k_sage = k_sage.to(target_dtype)
                    v_sage = v_sage.to(target_dtype)
                    oup = sageattn(q_sage, k_sage, v_sage, tensor_layout="HND", is_causal=False)
                    oup = oup.transpose(1, 2).reshape(B, L, C)  # back to (B, L, C)
                    if target_dtype != v.dtype:
                        oup = oup.to(v.dtype)
                except Exception as e:
                    print(f"[WARNING] SageAttention failed ({str(e)[:100]}), falling back to FlashAttention/PyTorch")
                    if FLASH_ATTN_AVAILABLE:
                        kw = dict() if attn_bias_or_two_vector is None else dict(VAR_visible_kvlen=attn_bias_or_two_vector[0], VAR_invisible_qlen=attn_bias_or_two_vector[1])
                        oup = flash_attn_func(q.to(v.dtype), k.to(v.dtype), v, dropout_p=0, softmax_scale=self.scale, **kw).view(B, L, C)
                    else:
                        oup = slow_attn(query=q.transpose(1, 2), key=k.transpose(1, 2), value=v.transpose(1, 2), scale=self.scale, dropout_p=0).transpose(1, 2).reshape(B, L, C)
            elif FLASH_ATTN_AVAILABLE:
                if attn_bias_or_two_vector is not None:  # training
                    kw = dict(VAR_visible_kvlen=attn_bias_or_two_vector[0], VAR_invisible_qlen=attn_bias_or_two_vector[1])
                else:  # autoregressive inference
                    kw = dict()
                oup = flash_attn_func(q.to(v.dtype), k.to(v.dtype), v, dropout_p=0, softmax_scale=self.scale, **kw).view(B, L, C)
            else:
                # PyTorch SDPA fallback: transpose to (B, H, L, c) and back.
                oup = slow_attn(query=q.transpose(1, 2), key=k.transpose(1, 2), value=v.transpose(1, 2), scale=self.scale, dropout_p=0).transpose(1, 2).reshape(B, L, C)
        else:
            # Non-flash layout: q, k, v already (B, H, L, c).
            if self.use_flex_attn and attn_fn is not None:
                oup = attn_fn(q, k, v, scale=self.scale).transpose(1, 2).reshape(B, L, C)
            else:
                oup = slow_attn(query=q, key=k, value=v, scale=self.scale, attn_mask=attn_bias_or_two_vector, dropout_p=0).transpose(1, 2).reshape(B, L, C)

        return self.proj_drop(self.proj(oup))

    def extra_repr(self) -> str:
        tail = ''
        return f'using_flash={self.using_flash}, tau={self.tau}, cos_attn={self.cos_attn}{tail}'
428
+
429
+
430
class CrossAttention(nn.Module):
    """Cross-attention from (image) queries to packed variable-length text
    keys/values, with SageAttention -> FlashAttention -> PyTorch-SDPA fallback."""

    def __init__(
        self, for_attn_pool=False, embed_dim=768, kv_dim=4096, num_heads=12,
        proj_drop=0., cos_attn=False, use_flash_attn=True,
    ):
        """
        :param for_attn_pool: only used in VAR.text_proj_for_sos (single learned query)
        :param embed_dim: Q's dim
        :param kv_dim: K's and V's dim
        :param num_heads: num heads of multi-head attention
        :param proj_drop: proj dropout rate
        :param cos_attn: requested cosine attention; forced off below
        """
        cos_attn = False  # TODO: never use cos attn in cross attention with T5 kv
        super().__init__()
        self.for_attn_pool = for_attn_pool
        self.embed_dim = embed_dim
        self.kv_dim = kv_dim
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads  # =64
        self.cos_attn = cos_attn
        self.use_flash_attn = use_flash_attn
        if self.cos_attn:
            self.scale = 1
            self.scale_mul_1H1 = nn.Parameter(torch.full(size=(1, self.num_heads, 1, 1), fill_value=4.0).log(), requires_grad=True)
            self.max_scale_mul = torch.log(torch.tensor(100)).item()
        else:
            self.scale = 1 / math.sqrt(self.head_dim)

        if for_attn_pool:
            # One learned query per head, repeated across the batch in forward().
            q = torch.empty(1, self.num_heads, self.head_dim)
            nn.init.trunc_normal_(q, mean=0, std=math.sqrt(1 / embed_dim / 3))
            self.mat_q = nn.Parameter(q)
        else:
            self.mat_q = nn.Linear(embed_dim, embed_dim, bias=True)
        self.mat_kv = nn.Linear(kv_dim, embed_dim * 2, bias=False)
        # v gets a learnable bias; k's bias is a fixed zero buffer.
        self.v_bias = nn.Parameter(torch.zeros(embed_dim))
        self.register_buffer('zero_k_bias', torch.zeros(embed_dim))

        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = get_dropout_layer(proj_drop)

    def forward(self, q, ca_kv):
        """
        :param q: (batch, seq_len, Q_dim) queries; unused when for_attn_pool
        :param ca_kv: tuple of
            - kv_compact: (sum(lens), KV_dim) packed keys/values
            - cu_seqlens_k: cumulative sum of lens (int vector)
            - max_seqlen_k: int, max(lens)
            NOTE: seq_len (num of Qs) can reach 10k; each len_i (num of KVs) must be <= 256
        :return: (batch, seq_len, Q_dim)
        """
        kv_compact, cu_seqlens_k, max_seqlen_k = ca_kv
        N = kv_compact.shape[0]

        # Fused kv projection; weight may be GGUF-quantized, hence the helper.
        kv_compact = F.linear(kv_compact, weight=get_weight_for_linear(self.mat_kv, target_dtype=kv_compact.dtype), bias=torch.cat((self.zero_k_bias, self.v_bias))).view(N, 2, self.num_heads, self.head_dim)  # NC -> N2Hc

        if not self.for_attn_pool:
            B, Lq = q.shape[:2]
            q_compact = self.mat_q(q).view(-1, self.num_heads, self.head_dim)
        else:
            B = cu_seqlens_k.shape[0] - 1
            Lq = 1
            # The learned query may itself be a quantized GGUF parameter.
            mat_q_data = self.mat_q
            if GGUF_AVAILABLE and isinstance(mat_q_data, GGUFParameter):
                mat_q_data = dequantize_gguf_tensor(mat_q_data, target_dtype=kv_compact.dtype)
            q_compact = mat_q_data.repeat(B, 1, 1).to(dtype=kv_compact.dtype)

        if self.cos_attn:  # always False (forced off in __init__)
            scale_mul = self.scale_mul_1H1.clamp_max(self.max_scale_mul).exp()
            k, v = kv_compact.unbind(dim=1)
            q_compact = F.normalize(q_compact, dim=-1).mul(scale_mul)
            k = F.normalize(k, dim=-1)
            kv_compact = torch.stack((k, v), dim=1)

        q_compact = q_compact.contiguous()
        kv_compact = kv_compact.contiguous()

        # Optimized backends with graceful per-call fallback.
        if self.use_flash_attn:
            cu_seqlens_q = torch.arange(0, Lq * (B + 1), Lq, dtype=torch.int32, device=q_compact.device)
            oup = None

            if SAGE_ATTN_AVAILABLE:
                try:
                    # SageAttention varlen wants separate k/v tensors.
                    k_compact, v_compact = kv_compact.unbind(dim=1)  # each (N, H, c)
                    # SageAttention requires fp16/bf16 inputs.
                    target_dtype = torch.bfloat16 if q_compact.dtype == torch.float32 else q_compact.dtype
                    q_sage = q_compact.to(target_dtype)
                    k_sage = k_compact.to(target_dtype)
                    v_sage = v_compact.to(target_dtype)
                    oup = sageattn_varlen(
                        q=q_sage,
                        k=k_sage,
                        v=v_sage,
                        cu_seqlens_q=cu_seqlens_q,
                        cu_seqlens_k=cu_seqlens_k,
                        max_seqlen_q=Lq,
                        max_seqlen_k=max_seqlen_k,
                        is_causal=False,
                        sm_scale=self.scale,
                        smooth_k=True
                    ).reshape(B, Lq, -1)
                    if target_dtype != q_compact.dtype:
                        oup = oup.float()
                except Exception as e:
                    print(f"[WARNING] SageAttention failed ({str(e)[:100]}), falling back to FlashAttention/PyTorch")
                    oup = None

            if oup is None and FLASH_ATTN_AVAILABLE:
                try:
                    if q_compact.dtype == torch.float32:
                        # Flash kernels need bf16; cast in and restore fp32 on the way out.
                        oup = flash_attn_varlen_kvpacked_func(q=q_compact.to(dtype=torch.bfloat16), kv=kv_compact.to(dtype=torch.bfloat16), cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=Lq, max_seqlen_k=max_seqlen_k, dropout_p=0, softmax_scale=self.scale).reshape(B, Lq, -1)
                        oup = oup.float()
                    else:
                        oup = flash_attn_varlen_kvpacked_func(q=q_compact, kv=kv_compact, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=Lq, max_seqlen_k=max_seqlen_k, dropout_p=0, softmax_scale=self.scale).reshape(B, Lq, -1)
                except Exception as e:
                    print(f"[WARNING] FlashAttention failed ({str(e)[:100]}), falling back to PyTorch attention")
                    oup = None

            if oup is None:
                # Both optimized backends failed: permanently switch to SDPA.
                self.use_flash_attn = False

        if not self.use_flash_attn:
            # Pure-PyTorch path: unpack the packed kv and re-batch it.
            k, v = kv_compact.unbind(dim=1)  # each (N, H, c)

            k_batched = []
            v_batched = []
            for i in range(B):
                start = cu_seqlens_k[i].item()
                end = cu_seqlens_k[i + 1].item()
                k_batched.append(k[start:end])  # (seq_len_i, H, c)
                v_batched.append(v[start:end])

            # Pad every sequence up to max_seqlen_k so they can be stacked.
            k_padded = torch.stack([
                F.pad(k_i, (0, 0, 0, 0, 0, max_seqlen_k - k_i.shape[0])) if k_i.shape[0] < max_seqlen_k else k_i
                for k_i in k_batched
            ])  # (B, max_seqlen_k, H, c)
            v_padded = torch.stack([
                F.pad(v_i, (0, 0, 0, 0, 0, max_seqlen_k - v_i.shape[0])) if v_i.shape[0] < max_seqlen_k else v_i
                for v_i in v_batched
            ])  # (B, max_seqlen_k, H, c)

            q_batched = q_compact.view(B, Lq, self.num_heads, self.head_dim)

            # (B, H, seq_len, c) layout for SDPA.
            q_attn = q_batched.transpose(1, 2)
            k_attn = k_padded.transpose(1, 2)
            v_attn = v_padded.transpose(1, 2)

            # Boolean mask with True marking padded kv positions to exclude.
            attn_mask = torch.zeros(B, 1, Lq, max_seqlen_k, dtype=torch.bool, device=q_compact.device)
            for i in range(B):
                seq_len = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()
                if seq_len < max_seqlen_k:
                    attn_mask[i, :, :, seq_len:] = True

            oup = slow_attn(
                query=q_attn,
                key=k_attn,
                value=v_attn,
                attn_mask=~attn_mask,  # SDPA bool mask: True = attend, so invert
                scale=self.scale,
                dropout_p=0.0
            )  # (B, H, Lq, c)

            oup = oup.transpose(1, 2).reshape(B, Lq, -1)

        return self.proj_drop(self.proj(oup))

    def extra_repr(self) -> str:
        return f'Cq={self.embed_dim}, Ckv={self.kv_dim}, cos_attn={self.cos_attn}'
623
+
624
+
625
+ class SelfAttnBlock(nn.Module):
626
def __init__(
    self, embed_dim, kv_dim, cross_attn_layer_scale, cond_dim, act: bool, shared_aln: bool, norm_layer: partial,
    num_heads, mlp_ratio=4., drop=0., drop_path=0., tau=1, cos_attn=False,
    swiglu=False, customized_flash_attn=False, fused_mlp=False, fused_norm_func=None, checkpointing_sa_only=False,
):
    """
    Pre-norm transformer block: AdaLN-modulated self-attention followed by an FFN.

    :param embed_dim: block width (C)
    :param kv_dim, cross_attn_layer_scale, checkpointing_sa_only: accepted for API
        compatibility; not used by this self-attention-only block's __init__
    :param cond_dim: conditioning dim (D) for the AdaLN parameters
    :param act: whether ada_lin applies SiLU before the linear (non-shared AdaLN only)
    :param shared_aln: use a shared learnable (1, 1, 6, C) AdaLN table instead of ada_lin
    :param norm_layer: partial constructing the (non-affine) norm layer; its 'eps'
        keyword is also reused for the fused norm path
    :param num_heads, tau, cos_attn, customized_flash_attn: forwarded to SelfAttention
    :param mlp_ratio: FFN hidden width = round(embed_dim * mlp_ratio / 256) * 256
    :param drop, drop_path: dropout and stochastic-depth rates
    :param swiglu: use FFNSwiGLU instead of FFN
    :param fused_mlp: forwarded to the FFN
    :param fused_norm_func: optional fused AdaLN-norm callable
    """
    super(SelfAttnBlock, self).__init__()
    self.C, self.D = embed_dim, cond_dim
    self.drop_path_rate = drop_path
    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
    # BUGFIX: the original passed `attn_fn = attn_fn` here, but `attn_fn` is not
    # defined in this scope and is not a SelfAttention.__init__ parameter (it is
    # a forward() argument), so constructing the block raised NameError.
    self.attn = SelfAttention(
        embed_dim=embed_dim, num_heads=num_heads, proj_drop=drop, tau=tau, cos_attn=cos_attn, customized_flash_attn=customized_flash_attn,
    )
    self.using_swiglu = swiglu
    self.ffn = (FFNSwiGLU if swiglu else FFN)(in_features=embed_dim, hidden_features=round(embed_dim * mlp_ratio / 256) * 256, drop=drop, fused_mlp=fused_mlp)

    self.ln_wo_grad = norm_layer(embed_dim, elementwise_affine=False)
    self.fused_norm_func = fused_norm_func
    # BUGFIX: forward() reads `self.fused_ada_norm`, which was never set; keep
    # both names bound to the same callable so either spelling works.
    self.fused_ada_norm = fused_norm_func
    self.norm_eps = norm_layer.keywords.get('eps', 1e-6)

    self.shared_aln = shared_aln
    if self.shared_aln:
        # Shared AdaLN: one learnable (1, 1, 6, C) table added to cond_BD.
        self.ada_gss = nn.Parameter(torch.randn(1, 1, 6, embed_dim) / embed_dim**0.5)
    else:
        lin = nn.Linear(cond_dim, 6*embed_dim)
        self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin) if act else nn.Sequential(lin)
651
+
652
+ # NOTE: attn_bias_or_two_vector is None during inference
653
+ def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector): # todo: minGPT and vqgan also uses pre-norm, just like this, while MaskGiT uses post-norm
654
+ with torch.cuda.amp.autocast(enabled=False):
655
+ if self.shared_aln: # always True; (1, 1, 6, C) + (B, 1, 6, C)
656
+ gamma1, gamma2, scale1, scale2, shift1, shift2 = (self.ada_gss + cond_BD).unbind(2) # 116C + B16C =unbind(2)=> 6 B1C
657
+ else:
658
+ gamma1, gamma2, scale1, scale2, shift1, shift2 = self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)
659
+
660
+ if self.fused_ada_norm is None:
661
+ x = x + self.drop_path(self.attn( self.ln_wo_grad(x.float()).mul(scale1.add(1)).add_(shift1), attn_bias_or_two_vector=attn_bias_or_two_vector ).mul_(gamma1))
662
+ x = x + self.drop_path(self.ffn( self.ln_wo_grad(x.float()).mul(scale2.add(1)).add_(shift2) ).mul(gamma2)) # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
663
+ else:
664
+ x = x + self.drop_path(self.attn(self.fused_ada_norm(C=self.C, eps=self.norm_eps, x=x, scale=scale1, shift=shift1), attn_bias_or_two_vector=attn_bias_or_two_vector).mul_(gamma1))
665
+ x = x + self.drop_path(self.ffn(self.fused_ada_norm(C=self.C, eps=self.norm_eps, x=x, scale=scale2, shift=shift2)).mul(gamma2)) # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
666
+ return x
667
+
668
+ def extra_repr(self) -> str:
669
+ return f'shared_aln={self.shared_aln}, fused_norm={self.fused_norm_func is not None}'
670
+
671
+
672
class CrossAttnBlock(nn.Module):
    """
    Text-to-image transformer block: self-attention -> cross-attention (to packed text
    tokens in ca_kv) -> FFN, each sub-layer modulated by adaptive layer norm (AdaLN).
    """
    def __init__(
        self,
        embed_dim, kv_dim, cross_attn_layer_scale, cond_dim, act: bool, shared_aln: bool, norm_layer: partial,
        num_heads, mlp_ratio=4., drop=0., drop_path=0., tau=1, cos_attn=False,
        swiglu=False, customized_flash_attn=False, fused_mlp=False, fused_norm_func=None, checkpointing_sa_only=False,
        use_flex_attn=False, batch_size=2, pad_to_multiplier=1, apply_rope2d=False, rope2d_normalized_by_hw=False,
    ):
        super(CrossAttnBlock, self).__init__()
        self.C, self.D = embed_dim, cond_dim
        self.drop_path_rate = drop_path
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.sa = SelfAttention(
            embed_dim=embed_dim, num_heads=num_heads, proj_drop=drop, tau=tau, cos_attn=cos_attn, customized_flash_attn=customized_flash_attn,
            use_flex_attn=use_flex_attn, batch_size=batch_size, pad_to_multiplier=pad_to_multiplier, rope2d_normalized_by_hw=rope2d_normalized_by_hw,
        )
        self.ca = CrossAttention(embed_dim=embed_dim, kv_dim=kv_dim, num_heads=num_heads, proj_drop=drop, cos_attn=cos_attn)
        self.using_swiglu = swiglu
        # FFN hidden width rounded to a multiple of 256 for kernel efficiency
        self.ffn = (FFNSwiGLU if swiglu else FFN)(in_features=embed_dim, hidden_features=round(embed_dim * mlp_ratio / 256) * 256, drop=drop, fused_mlp=fused_mlp)

        self.ln_wo_grad = norm_layer(embed_dim, elementwise_affine=False)
        self.fused_norm_func = fused_norm_func
        self.norm_eps = norm_layer.keywords.get('eps', 1e-6)
        self.ca_norm = norm_layer(embed_dim, elementwise_affine=True)

        self.shared_aln = shared_aln
        if self.shared_aln: # always True
            self.ada_gss = nn.Parameter(torch.randn(1, 1, 6, embed_dim) / embed_dim**0.5)
        else:
            lin = nn.Linear(cond_dim, 6*embed_dim)
            self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin) if act else nn.Sequential(lin)

        # optional LayerScale on the cross-attention branch; plain scalar 1 when disabled
        if cross_attn_layer_scale >= 0:
            self.ca_gamma = nn.Parameter(cross_attn_layer_scale * torch.ones(embed_dim), requires_grad=True)
        else:
            self.ca_gamma = 1

        self.checkpointing_sa_only = checkpointing_sa_only

    # NOTE: attn_bias_or_two_vector is None during inference
    def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn=None, scale_schedule=None, rope2d_freqs_grid=None, scale_ind=0): # todo: minGPT and vqgan also uses pre-norm, just like this, while MaskGiT uses post-norm
        with torch.cuda.amp.autocast(enabled=False): # disable half precision
            if self.shared_aln: # always True; (1, 1, 6, C) + (B, 1, 6, C)
                gamma1, gamma2, scale1, scale2, shift1, shift2 = (self.ada_gss + cond_BD).unbind(2) # 116C + B16C =unbind(2)=> 6 B1C
            else:
                gamma1, gamma2, scale1, scale2, shift1, shift2 = self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)

        if self.fused_norm_func is None:
            x_sa = self.ln_wo_grad(x.float()).mul(scale1.add(1)).add_(shift1)
            if self.checkpointing_sa_only and self.training:
                # NOTE(review): the checkpointed path does not forward scale_ind — confirm
                # whether training with checkpointing_sa_only needs it.
                x_sa = checkpoint(self.sa, x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, use_reentrant=False)
            else:
                # CONSISTENCY FIX: the original non-fused path dropped scale_ind while the
                # fused path passed it; forward it here too so both paths behave alike.
                x_sa = self.sa(x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, scale_ind=scale_ind)
            x = x + self.drop_path(x_sa.mul_(gamma1))
            x = x + self.ca(self.ca_norm(x), ca_kv).float().mul_(self.ca_gamma)
            x = x + self.drop_path(self.ffn( self.ln_wo_grad(x.float()).mul(scale2.add(1)).add_(shift2) ).mul(gamma2)) # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
        else:
            x_sa = self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale1, shift=shift1)
            if self.checkpointing_sa_only and self.training:
                x_sa = checkpoint(self.sa, x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, use_reentrant=False)
            else:
                x_sa = self.sa(x_sa, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, scale_ind=scale_ind)
            x = x + self.drop_path(x_sa.mul_(gamma1))
            x = x + self.ca(self.ca_norm(x), ca_kv).float().mul_(self.ca_gamma)
            x = x + self.drop_path(self.ffn(self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale2, shift=shift2)).mul(gamma2)) # this mul(gamma2) cannot be in-placed cuz we possibly use FusedMLP
        return x

    def extra_repr(self) -> str:
        return f'shared_aln={self.shared_aln}, fused_norm={self.fused_norm_func is not None}, ca_gamma={"<learnable>" if isinstance(self.ca_gamma, nn.Parameter) else self.ca_gamma}'
741
+
742
+
743
class AdaLNBeforeHead(nn.Module):
    """Adaptive layer norm applied to hidden states right before the output head.

    The condition vector is mapped to per-channel (scale, shift) pairs that modulate
    an affine-free norm of the input.
    """
    def __init__(self, C, D, act: bool, norm_layer: partial, fused_norm_func=None):
        """C: embed_dim, D: cond_dim."""
        super().__init__()
        self.C, self.D = C, D
        self.ln_wo_grad = norm_layer(C, elementwise_affine=False)
        self.fused_norm_func = fused_norm_func
        self.norm_eps = norm_layer.keywords.get('eps', 1e-6)
        projection = nn.Linear(D, 2*C)
        if act:
            self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), projection)
        else:
            self.ada_lin = nn.Sequential(projection)

    def forward(self, x_BLC: torch.Tensor, cond_BD: Optional[torch.Tensor]):
        # (B, D) -> (B, 1, 2, C) -> two tensors of shape (B, 1, C)
        scale, shift = self.ada_lin(cond_BD).view(-1, 1, 2, self.C).unbind(2)
        if self.fused_norm_func is not None:
            return self.fused_norm_func(C=self.C, eps=self.norm_eps, x=x_BLC, scale=scale, shift=shift)
        normed = self.ln_wo_grad(x_BLC)
        return normed.mul(scale.add(1)).add_(shift)
759
+
760
+
761
def main():
    """Smoke test for CrossAttention: batched queries vs. varlen-packed key/values."""
    dev = 'cpu' # 'cuda' if torch.cuda.is_available() else 'cpu'
    rng = torch.Generator(device=dev)
    rng.manual_seed(0)
    B, H, cq, ckv = 4, 8, 64, 96
    Cq = H*cq
    Ckv = H*ckv

    Li = [5, 4, 7, 6]  # per-sample kv sequence lengths
    Lq = 10
    L = max(Li)
    # dense-mask equivalent of the packed layout (kept for reference; unused below)
    attn_bias = torch.zeros(B, 1, Lq, L, device=dev)
    for i, x in enumerate(Li):
        attn_bias[i, 0, :, x:] = -torch.inf

    q = torch.randn(B, Lq, H, cq, generator=rng, device=dev)
    k = torch.randn(B, L, H, ckv, generator=rng, device=dev)
    v = torch.randn(B, L, H, ckv, generator=rng, device=dev)
    tq, tk, tv = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) # BHLc (reference layout; unused)

    seqlen_k = torch.tensor(Li, dtype=torch.int32, device=dev)
    # FIX: was `torch.torch.int32` — worked only because `torch.torch is torch`
    cu_seqlens_k = F.pad(torch.cumsum(seqlen_k, dim=0, dtype=torch.int32), (1, 0))
    kv = torch.stack([k, v], dim=2)
    # pack per-sample kv (dropping padding) into one contiguous tensor
    kv_compact = torch.cat([kv[i, :Li[i]] for i in range(B)], dim=0)

    ca = CrossAttention(for_attn_pool=False, embed_dim=Cq, kv_dim=Ckv, num_heads=H)
    CrossAttention.forward  # no-op reference, handy for IDE jump-to-definition
    ca(q, (kv_compact, cu_seqlens_k, max(Li))).mean().backward()


if __name__ == '__main__':
    main()
Infinity/infinity/models/infinity.py ADDED
@@ -0,0 +1,817 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Definition of Infinity transformer model.
3
+ """
4
+
5
+ import math
6
+ import random
7
+ import time
8
+ from contextlib import nullcontext
9
+ from functools import partial
10
+ from typing import List, Optional, Tuple, Union, Dict, Any
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from timm.models import register_model
16
+ from torch.utils.checkpoint import checkpoint
17
+ from PIL import Image
18
+ import numpy as np
19
+
20
+ import infinity.utils.dist as dist
21
+ from infinity.utils.dist import for_visualize
22
+ from infinity.models.basic import flash_attn_func, flash_fused_op_installed, AdaLNBeforeHead, CrossAttnBlock, SelfAttnBlock, CrossAttention, FastRMSNorm, precompute_rope2d_freqs_grid
23
+ from infinity.utils import misc
24
+ from infinity.models.flex_attn import FlexAttn
25
+ from infinity.utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
26
+
27
+ try:
28
+ from infinity.models.fused_op import fused_ada_layer_norm, fused_ada_rms_norm
29
+ except:
30
+ fused_ada_layer_norm, fused_ada_rms_norm = None, None
31
+
32
+
33
class MultiInpIdentity(nn.Module):
    """Identity module that tolerates (and ignores) extra positional/keyword arguments,
    so it can stand in for modules with richer forward signatures."""

    def forward(self, x, *args, **kwargs):
        # Only the first input is returned; everything else is discarded.
        return x
36
+
37
+
38
class TextAttentivePool(nn.Module):
    """Pools packed text-token features into one vector per sample via cross-attention
    (CrossAttention in attn-pool mode supplies the learned query)."""

    def __init__(self, Ct5: int, D: int):
        super().__init__()
        self.Ct5, self.D = Ct5, D
        # Wider models use a smaller head_dim so the head count (Ct5 // head_dim) scales.
        self.head_dim = 64 if D > 4096 else 128
        self.num_heads = Ct5 // self.head_dim
        self.ca = CrossAttention(for_attn_pool=True, embed_dim=self.D, kv_dim=Ct5, num_heads=self.num_heads)

    def forward(self, ca_kv):
        # ca_kv: packed kv tuple (kv_compact, cu_seqlens_k, max_seqlen_k); no external query.
        pooled = self.ca(None, ca_kv)
        return pooled.squeeze(1)
51
+
52
class SharedAdaLin(nn.Linear):
    """Linear producing the 6 shared AdaLN components (gamma1/2, scale1/2, shift1/2),
    reshaped to (B, 1, 6, C) with dtype-aware weight handling."""

    def forward(self, cond_BD):
        num_components = 6
        C = self.weight.shape[0] // num_components
        # Imported lazily to avoid an import cycle with basic.py.
        from infinity.models.basic import get_weight_for_linear
        weight = get_weight_for_linear(self, target_dtype=cond_BD.dtype)
        out = F.linear(cond_BD, weight, self.bias)
        return out.reshape(-1, 1, num_components, C) # B16C
59
+
60
+
61
class MultipleLayers(nn.Module):
    """Registers the contiguous slice ls[index : index + num_blocks_in_a_chunk] as one
    module, so a chunk of transformer blocks can be run (and optionally gradient
    checkpointed) as a unit."""

    def __init__(self, ls, num_blocks_in_a_chunk, index):
        super().__init__()
        self.module = nn.ModuleList()
        for offset in range(num_blocks_in_a_chunk):
            self.module.append(ls[index + offset])

    def forward(self, x, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn=None, scale_schedule=None, checkpointing_full_block=False, rope2d_freqs_grid=None):
        h = x
        for block in self.module:
            if checkpointing_full_block:
                # Trade compute for memory: recompute this block's activations in backward.
                h = torch.utils.checkpoint.checkpoint(block, h, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid, use_reentrant=False)
            else:
                h = block(h, cond_BD, ca_kv, attn_bias_or_two_vector, attn_fn, scale_schedule, rope2d_freqs_grid)
        return h
76
+
77
+ class Infinity(nn.Module):
78
    def __init__(
        self, vae_local,
        text_channels=0, text_maxlen=0, # text-cond generation
        selecting_idx=None, # class-cond generation
        embed_dim=1024, depth=16, num_heads=16, mlp_ratio=4., # model's architecture
        drop_rate=0., drop_path_rate=0., # drop out and drop path
        norm_eps=1e-6, rms_norm=False, # norm layer
        shared_aln=False, head_aln=True, # adaptive norm
        cond_drop_rate=0.1, # for classifier-free guidance
        rand_uncond=False,
        cross_attn_layer_scale=-1., nm0=False, tau=1, cos_attn=True, swiglu=False,
        raw_scale_schedule=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),
        head_depth=1,
        top_p=0.0, top_k=0.0,
        customized_flash_attn=False, fused_mlp=False, fused_norm=False,
        block_chunks=1,
        checkpointing=None,
        pad_to_multiplier=0,
        use_flex_attn=False,
        batch_size=2,
        add_lvl_embeding_only_first_block=1,
        use_bit_label=1,
        rope2d_each_sa_layer=0,
        rope2d_normalized_by_hw=0,
        pn=None,
        train_h_div_w_list=None,
        video_frames=1,
        always_training_scales=20,
        apply_spatial_patchify = 0,
        inference_mode=False,
    ):
        """
        Build the Infinity transformer: text/class conditioning, level + position
        embeddings, a stack of (Cross|Self)AttnBlock's (optionally chunked), and the
        AdaLN output head. `vae_local` supplies the token space (embed_dim, vocab,
        bit-label mask).
        """
        # set hyperparameters
        # NOTE(review): plain attributes are assigned before super().__init__() below;
        # nn.Module tolerates non-module attributes here, but parameters/submodules are
        # only created after super().__init__() is called.
        self.C = embed_dim
        self.inference_mode = inference_mode
        self.apply_spatial_patchify = apply_spatial_patchify
        if self.apply_spatial_patchify:
            # 2x2 spatial patchify packs 4 VAE channels into one token
            self.d_vae = vae_local.embed_dim * 4
        else:
            self.d_vae = vae_local.embed_dim
        self.use_bit_label = use_bit_label
        self.codebook_dim = self.d_vae
        # binary (bit) labels predict 2 logits per codebook dim; else full VQ vocab
        self.V = (self.codebook_dim * 2) if self.use_bit_label else vae_local.vocab_size
        self.bit_mask = vae_local.quantizer.lfq.mask if self.use_bit_label else None
        self.Ct5 = text_channels
        self.depth = depth
        self.num_heads = num_heads
        self.batch_size = batch_size
        self.mlp_ratio = mlp_ratio
        self.cond_drop_rate = cond_drop_rate
        self.norm_eps = norm_eps
        self.prog_si = -1
        self.pn = pn
        self.train_h_div_w_list = train_h_div_w_list if train_h_div_w_list else h_div_w_templates
        self.video_frames = video_frames
        self.always_training_scales = always_training_scales

        assert add_lvl_embeding_only_first_block in [0,1]
        self.add_lvl_embeding_only_first_block = add_lvl_embeding_only_first_block
        assert rope2d_each_sa_layer in [0,1]
        self.rope2d_each_sa_layer = rope2d_each_sa_layer
        self.rope2d_normalized_by_hw = rope2d_normalized_by_hw
        print(f'self.codebook_dim: {self.codebook_dim}, self.add_lvl_embeding_only_first_block: {self.add_lvl_embeding_only_first_block}, \
self.use_bit_label: {self.use_bit_label}, self.rope2d_each_sa_layer: {rope2d_each_sa_layer}, self.rope2d_normalized_by_hw: {self.rope2d_normalized_by_hw}')
        # head_up_method is hard-coded off; word_patch_size > 1 is currently unreachable
        head_up_method = ''
        word_patch_size = 1 if head_up_method in {'', 'no'} else 2
        if word_patch_size > 1:
            assert all(raw_pn % word_patch_size == 0 for raw_pn in raw_scale_schedule), f'raw_scale_schedule={raw_scale_schedule}, not compatible with word_patch_size={word_patch_size}'

        self.checkpointing = checkpointing
        self.pad_to_multiplier = max(1, pad_to_multiplier)

        # detect whether the custom FlashAttention kernel build is present by inspecting
        # flash_attn_func's argument names
        customized_kernel_installed = any('Infinity' in arg_name for arg_name in flash_attn_func.__code__.co_varnames)
        self.customized_flash_attn = customized_flash_attn and customized_kernel_installed
        if customized_flash_attn and not customized_kernel_installed:
            import inspect, warnings
            file_path = inspect.getsourcefile(flash_attn_func)
            line_number = inspect.getsourcelines(flash_attn_func)[1]
            info = (
                f'>>>>>> Customized FlashAttention2 is not installed or compiled, but specified in args by --flash=1. Set customized_flash_attn = False. <<<<<<\n'
                f'>>>>>> `flash_attn_func` is in [line {line_number}] [file {file_path}] <<<<<<\n'
                f'>>>>>> {flash_attn_func.__code__.co_varnames=} <<<<<<\n'
            )
            warnings.warn(info, ImportWarning)
            print(info, flush=True)

        self.raw_scale_schedule = raw_scale_schedule # 'raw' means before any patchifying
        self.first_l = 1
        # solve top-p top-k sampling hyperparameters
        self.top_p, self.top_k = max(min(top_p, 1), 0), (round(top_k * self.V) if 0 < top_k < 1 else round(top_k))
        if self.top_p < 1e-5: self.top_p = 0
        if self.top_k >= self.V or self.top_k <= 0: self.top_k = 0

        # sanity check: the fused op must be installed on all ranks or on none
        t = torch.zeros(dist.get_world_size(), device=dist.get_device())
        t[dist.get_rank()] = float(flash_fused_op_installed)
        dist.barrier()
        dist.allreduce(t)
        assert round(t.sum().item()) in {0, dist.get_world_size()}, f'flash_fused_op_installed: {t}'

        super().__init__()
        self.rng = torch.Generator(device=dist.get_device())
        self.maybe_record_function = nullcontext
        self.text_maxlen = text_maxlen
        self.t2i = text_channels != 0  # text-to-image mode iff text channels provided

        # [inp & position embedding]
        init_std = math.sqrt(1 / self.C / 3)
        self.norm0_cond = nn.Identity()
        if self.t2i:
            self.selecting_idx = None
            self.num_classes = 0
            self.D = self.C

            # fixed-seed unconditional text embedding for classifier-free guidance
            cfg_uncond = torch.empty(self.text_maxlen, self.Ct5)
            rng = torch.Generator(device='cpu')
            rng.manual_seed(0)
            torch.nn.init.trunc_normal_(cfg_uncond, std=1.2, generator=rng)
            cfg_uncond /= self.Ct5 ** 0.5
            if rand_uncond:
                # frozen buffer (not trained) when rand_uncond is set
                self.register_buffer('cfg_uncond', cfg_uncond)
            else:
                self.cfg_uncond = nn.Parameter(cfg_uncond)

            self.text_norm = FastRMSNorm(self.Ct5, elementwise_affine=True, eps=norm_eps)
            self.text_proj_for_sos = TextAttentivePool(self.Ct5, self.D)
            self.text_proj_for_ca = nn.Sequential(
                nn.Linear(self.Ct5, self.D),
                nn.GELU(approximate='tanh'),
                nn.Linear(self.D, self.D),
            )
        else: # class-label cond
            if selecting_idx is None:
                num_classes = 1000
                print(f'======= WARNING: selecting_idx not specified, set to 1/{num_classes} @ {dist.get_device()} =======')
                selecting_idx = torch.full((1, num_classes), fill_value=1/num_classes, dtype=torch.float32, device=dist.get_device())
            self.selecting_idx = selecting_idx
            self.num_classes = selecting_idx.shape[-1]
            self.D = self.C
            # +1 class slot for the unconditional (CFG) label
            self.class_emb = nn.Embedding(self.num_classes + 1, self.C)
            nn.init.trunc_normal_(self.class_emb.weight.data, mean=0, std=init_std)

        self.pos_start = nn.Parameter(torch.empty(1, self.first_l, self.C))
        nn.init.trunc_normal_(self.pos_start.data, mean=0, std=init_std)
        if self.rope2d_each_sa_layer:
            # precompute 2D RoPE frequency grid shared by all self-attention layers
            rope2d_freqs_grid = precompute_rope2d_freqs_grid(dim=self.C//self.num_heads, dynamic_resolution_h_w=dynamic_resolution_h_w, pad_to_multiplier=self.pad_to_multiplier, rope2d_normalized_by_hw=self.rope2d_normalized_by_hw)
            self.rope2d_freqs_grid = rope2d_freqs_grid
        else:
            raise ValueError(f'self.rope2d_each_sa_layer={self.rope2d_each_sa_layer} not implemented')
        # level (scale-index) embedding; 15 slots — assumes <= 15 scales per schedule
        self.lvl_embed = nn.Embedding(15, self.C)
        nn.init.trunc_normal_(self.lvl_embed.weight.data, mean=0, std=init_std)

        # [input layers] input norm && input embedding
        norm_layer = partial(FastRMSNorm if rms_norm else nn.LayerNorm, eps=norm_eps)
        self.norm0_ve = norm_layer(self.d_vae) if nm0 else nn.Identity()
        self.word_embed = nn.Linear(self.d_vae, self.C)

        # [shared adaptive layernorm mapping network]
        self.shared_ada_lin = nn.Sequential(nn.SiLU(inplace=False), SharedAdaLin(self.D, 6*self.C)) if shared_aln else nn.Identity()

        # fused norm
        if fused_norm:
            fused_norm_func = fused_ada_rms_norm if rms_norm else fused_ada_layer_norm
            if fused_norm_func is not None: # pre-compile
                B = 2
                x = torch.randn(B, 1, self.C).requires_grad_(True)
                scale = torch.randn(B, 1, self.C).mul_(0.01).requires_grad_(True)
                shift = torch.randn(B, 1, self.C).mul_(0.01).requires_grad_(True)
                # fused_norm_func(C=self.C, eps=self.norm_eps, x=x, scale=scale, shift=shift).mean().backward()
                del B, x, scale, shift
        else:
            fused_norm_func = None

        # [backbone and head]
        self.use_flex_attn = use_flex_attn
        self.attn_fn_compile_dict = {}
        self.batch_size = batch_size
        if self.use_flex_attn:
            self.attn_fn_compile_dict = self.compile_flex_attn()

        self.drop_path_rate = drop_path_rate
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # dpr means drop path rate (linearly increasing)
        # blocks are built into a plain list first; registration happens below
        self.unregistered_blocks = []
        for block_idx in range(depth):
            block = (CrossAttnBlock if self.t2i else SelfAttnBlock)(
                embed_dim=self.C, kv_dim=self.D, cross_attn_layer_scale=cross_attn_layer_scale, cond_dim=self.D, act=True, shared_aln=shared_aln, norm_layer=norm_layer,
                num_heads=num_heads, mlp_ratio=mlp_ratio, drop=drop_rate, drop_path=dpr[block_idx], tau=tau, cos_attn=cos_attn,
                swiglu=swiglu, customized_flash_attn=self.customized_flash_attn, fused_mlp=fused_mlp, fused_norm_func=fused_norm_func,
                checkpointing_sa_only=self.checkpointing == 'self-attn',
                use_flex_attn=use_flex_attn, batch_size=batch_size, pad_to_multiplier=pad_to_multiplier, rope2d_normalized_by_hw=rope2d_normalized_by_hw,
            )
            self.unregistered_blocks.append(block)

        # [head]
        V = self.V
        if head_aln:
            self.head_nm = AdaLNBeforeHead(self.C, self.D, act=True, norm_layer=norm_layer, fused_norm_func=fused_norm_func)
            self.head = nn.Linear(self.C, V) if head_depth == 1 else nn.Sequential(nn.Linear(self.C, self.C, bias=True), nn.GELU(approximate='tanh'), nn.Linear(self.C, V))
        else:
            self.head_nm = MultiInpIdentity()
            self.head = nn.Sequential(norm_layer(self.C), nn.Linear(self.C, V)) if head_depth == 1 else nn.Sequential(norm_layer(self.C), nn.Linear(self.C, self.C, bias=True), nn.GELU(approximate='tanh'), nn.Linear(self.C, V))

        # register blocks either flat (ModuleList) or grouped into chunks for
        # coarser-grained gradient checkpointing
        self.num_block_chunks = block_chunks or 1
        self.num_blocks_in_a_chunk = depth // block_chunks
        print(f"{self.num_blocks_in_a_chunk=}, {depth=}, {block_chunks=}")
        assert self.num_blocks_in_a_chunk * block_chunks == depth
        if self.num_block_chunks == 1:
            self.blocks = nn.ModuleList(self.unregistered_blocks)
        else:
            self.block_chunks = nn.ModuleList()
            for i in range(self.num_block_chunks):
                self.block_chunks.append(MultipleLayers(self.unregistered_blocks, self.num_blocks_in_a_chunk, i*self.num_blocks_in_a_chunk))
        print(
            f'\n[constructor] ==== customized_flash_attn={self.customized_flash_attn} (using_flash={sum((b.sa.using_flash if self.t2i else b.attn.using_flash) for b in self.unregistered_blocks)}/{self.depth}), fused_mlp={fused_mlp} (fused_mlp={sum(b.ffn.fused_mlp_func is not None for b in self.unregistered_blocks)}/{self.depth}) ==== \n'
            f'    [Infinity config ] embed_dim={embed_dim}, num_heads={num_heads}, depth={depth}, mlp_ratio={mlp_ratio}, swiglu={swiglu} num_blocks_in_a_chunk={self.num_blocks_in_a_chunk}\n'
            f'    [drop ratios] drop_rate={drop_rate}, drop_path_rate={drop_path_rate:g} ({torch.linspace(0, drop_path_rate, depth)})',
            end='\n\n', flush=True
        )
294
+
295
+
296
    def compile_flex_attn(self):
        """
        Pre-build one FlexAttn kernel per scale-schedule tuple and return the cache
        dict keyed by that tuple. Inference builds a kernel per prefix length (for the
        KV-cache mask); training builds only the full-length schedule.
        """
        attn_fn_compile_dict = {}
        for h_div_w in self.train_h_div_w_list:
            # snap the requested aspect ratio to the nearest canonical template
            h_div_w_template = h_div_w_templates[np.argmin(np.abs(float(h_div_w) - h_div_w_templates))]
            full_scale_schedule = dynamic_resolution_h_w[h_div_w_template][self.pn]['scales']
            if self.inference_mode:
                apply_flex_attn_scales = list(range(1, 1+len(full_scale_schedule)))
                mask_type = "infinity_infer_mask_with_kv_cache"
                auto_padding = True
            else:
                mask_type = 'var'
                auto_padding = False
                apply_flex_attn_scales = [min(self.always_training_scales, len(full_scale_schedule))]
            for scales_num in apply_flex_attn_scales:
                print(f'====== apply flex attn hdivw: {h_div_w} scales: {scales_num} ======')
                scale_schedule = full_scale_schedule[:scales_num]
                # clamp temporal extent to the video frame budget (t stays 1 for images)
                scale_schedule = [ (min(t, self.video_frames//4+1), h, w) for (t,h, w) in scale_schedule]
                patchs_nums_tuple = tuple(scale_schedule)
                SEQ_L = sum( pt * ph * pw for pt, ph, pw in patchs_nums_tuple)
                # round the total sequence length up to pad_to_multiplier
                aligned_L = SEQ_L+ (self.pad_to_multiplier - SEQ_L % self.pad_to_multiplier) if SEQ_L % self.pad_to_multiplier != 0 else SEQ_L
                attn_fn = FlexAttn(block_scales = patchs_nums_tuple,
                                    mask_type = mask_type,
                                    B = self.batch_size,
                                    H = self.num_heads,
                                    L = aligned_L,
                                    auto_padding=auto_padding)
                attn_fn_compile_dict[patchs_nums_tuple] = attn_fn

                if self.video_frames > 1: # append image attn_fn when self.video_frames > 1 (namely videos)
                    # same schedule with t forced to 1, so single images can be attended too
                    scale_schedule = [ (1, h, w) for (t,h, w) in scale_schedule]
                    patchs_nums_tuple = tuple(scale_schedule)
                    SEQ_L = sum( pt * ph * pw for pt, ph, pw in patchs_nums_tuple)
                    aligned_L = SEQ_L+ (self.pad_to_multiplier - SEQ_L % self.pad_to_multiplier) if SEQ_L % self.pad_to_multiplier != 0 else SEQ_L
                    # NOTE(review): unlike the kernel above, auto_padding is not forwarded
                    # here — confirm whether the image kernel should also auto-pad.
                    attn_fn = FlexAttn(block_scales = patchs_nums_tuple,
                                        mask_type = mask_type,
                                        B = self.batch_size,
                                        H = self.num_heads,
                                        L = aligned_L)
                    attn_fn_compile_dict[patchs_nums_tuple] = attn_fn
        return attn_fn_compile_dict
336
+
337
+ def _apply_module_with_dtype_handling(self, module, x):
338
+ """
339
+ Apply a module (Linear, Sequential, etc.) with F16 weight dtype handling.
340
+ """
341
+ from infinity.models.basic import get_weight_for_linear
342
+
343
+ if isinstance(module, nn.Linear):
344
+ # Handle Linear layer with dtype conversion
345
+ weight = get_weight_for_linear(module, target_dtype=x.dtype)
346
+ return F.linear(x, weight, module.bias)
347
+ elif isinstance(module, nn.Sequential):
348
+ # Recursively apply each layer in the sequential
349
+ for layer in module:
350
+ x = self._apply_module_with_dtype_handling(layer, x)
351
+ return x
352
+ else:
353
+ # For other modules (GELU, LayerNorm, etc.), apply directly
354
+ return module(x)
355
+
356
    def get_logits(self, h: torch.Tensor, cond_BD: Optional[torch.Tensor]):
        """
        Project hidden states to vocabulary logits through the AdaLN head-norm and head.

        :param h: hidden_state, shaped (B or batch_size, L or seq_len, C or hidden_dim)
        :param cond_BD: shaped (B or batch_size, D or cond_dim)
        :return: logits, shaped (B, L, V or vocabulary_size)
        """
        # head-norm runs in full precision (autocast disabled, inputs upcast to float)
        with torch.amp.autocast('cuda', enabled=False):
            x = self.head_nm(h.float(), cond_BD.float())
        # the head is applied with dtype-aware Linear handling (F16 weights cast to x's dtype)
        return self._apply_module_with_dtype_handling(self.head, x)
366
+
367
    def add_lvl_embeding(self, feature, scale_ind, scale_schedule, need_to_pad=0):
        """
        Add the level (scale-index) embedding, in place, to the first t*h*w tokens of
        `feature`; trailing `need_to_pad` tokens are left untouched. Returns `feature`.
        """
        bs, seq_len, c = feature.shape
        patch_t, patch_h, patch_w = scale_schedule[scale_ind]
        t_mul_h_mul_w = patch_t * patch_h * patch_w
        # everything beyond t*h*w must be exactly the declared padding
        assert t_mul_h_mul_w + need_to_pad == seq_len
        # same embedding row (index = scale_ind) broadcast over all tokens of this scale
        feature[:, :t_mul_h_mul_w] += self.lvl_embed(scale_ind*torch.ones((bs, t_mul_h_mul_w),dtype=torch.int).to(feature.device))
        return feature
374
+
375
    def add_lvl_embeding_for_x_BLC(self, x_BLC, scale_schedule, need_to_pad=0):
        """
        Split x_BLC (B, L, C) along the sequence dim according to scale_schedule, add each
        scale's level embedding, and concatenate back (the trailing padding tokens, if
        any, are carried over unchanged).
        """
        ptr = 0
        x_BLC_list = []
        for scale_ind, patch_t_h_w in enumerate(scale_schedule):
            scale_seq_len = np.array(patch_t_h_w).prod()
            x_BLC_this_scale = x_BLC[:,ptr:ptr+scale_seq_len] # shape: [bs, patch_h*patch_w, c]
            ptr += scale_seq_len
            x_BLC_this_scale = self.add_lvl_embeding(x_BLC_this_scale, scale_ind, scale_schedule)
            x_BLC_list.append(x_BLC_this_scale)
        # all scales plus padding must account for the full sequence length
        assert x_BLC.shape[1] == (ptr + need_to_pad), f'{x_BLC.shape[1]} != {ptr} + {need_to_pad}'
        # keep the padding tail as-is
        x_BLC_list.append(x_BLC[:,ptr:])
        x_BLC = torch.cat(x_BLC_list, dim=1)
        return x_BLC
388
+
389
+ def forward(self, label_B_or_BLT: Union[torch.LongTensor, Tuple[torch.FloatTensor, torch.IntTensor, int]], x_BLC_wo_prefix: torch.Tensor, scale_schedule: List[Tuple[int]],
390
+ cfg_infer=False,
391
+ **kwargs,
392
+ ) -> Union[torch.Tensor, List[torch.Tensor]]: # returns logits_BLV
393
+ """
394
+ label_B_or_BLT: label_B or (kv_compact, cu_seqlens_k, max_seqlen_k)
395
+ :return: logits BLV, V is vocab_size
396
+ """
397
+ if cfg_infer:
398
+ return self.autoregressive_infer_cfg(label_B_or_BLT=label_B_or_BLT, scale_schedule=scale_schedule, **kwargs)
399
+
400
+ x_BLC_wo_prefix = x_BLC_wo_prefix.float() # input should be float32
401
+ B = x_BLC_wo_prefix.shape[0]
402
+
403
+ # [1. get input sequence x_BLC]
404
+ with torch.amp.autocast('cuda', enabled=False):
405
+ kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT
406
+ # drop cond
407
+ total = 0
408
+ for le in lens:
409
+ if random.random() < self.cond_drop_rate:
410
+ kv_compact[total:total+le] = self.cfg_uncond[:le]
411
+ total += le
412
+ must_on_graph = self.cfg_uncond[0, 0] * 0
413
+ kv_compact = self.text_norm(kv_compact).contiguous()
414
+ sos = cond_BD = self.text_proj_for_sos((kv_compact, cu_seqlens_k, max_seqlen_k)).float().contiguous() # cond_BD should be float32
415
+ kv_compact = self.text_proj_for_ca(kv_compact).contiguous()
416
+ kv_compact[0, 0] += must_on_graph
417
+ ca_kv = kv_compact, cu_seqlens_k, max_seqlen_k
418
+
419
+ cond_BD_or_gss = self.shared_ada_lin(cond_BD).contiguous() # gss: gamma, scale, shift; cond_BD_or_gss should be float32
420
+
421
+ sos = sos.unsqueeze(1).expand(B, 1, -1) + self.pos_start.expand(B, 1, -1)
422
+ x_BLC = torch.cat((sos, self.word_embed(self.norm0_ve(x_BLC_wo_prefix))), dim=1)
423
+
424
+ # [1.1. pad the seqlen dim]
425
+ l_end = x_BLC.shape[1]
426
+ need_to_pad = (l_end + self.pad_to_multiplier - 1) // self.pad_to_multiplier * self.pad_to_multiplier - l_end # 0
427
+
428
+ if self.customized_flash_attn:
429
+ Infinity_visible_kvlen = self.Infinity_visible_kvlen[:l_end]
430
+ Infinity_invisible_qlen = self.Infinity_invisible_qlen[:l_end]
431
+ attn_bias_or_two_vector = (Infinity_visible_kvlen, Infinity_invisible_qlen)
432
+ # todo: solve need_to_pad here
433
+ elif self.use_flex_attn:
434
+ if need_to_pad:
435
+ x_BLC = F.pad(x_BLC, (0, 0, 0, need_to_pad))
436
+ assert x_BLC.shape[-1] % 128 == 0, 'x_BLC.shape[-1] % 128 != 0'
437
+ attn_bias_or_two_vector = None
438
+ else:
439
+ d: torch.Tensor = torch.cat([torch.full((pn[0]*pn[1]*pn[2],), i) for i, pn in enumerate(scale_schedule)]).view(1, l_end, 1)
440
+ dT = d.transpose(1, 2) # dT: 11L
441
+ attn_bias_for_masking = torch.where(d >= dT, 0., -torch.inf).reshape(1, 1, l_end, l_end)
442
+ attn_bias = attn_bias_for_masking[:, :, :l_end, :l_end].contiguous() # attn_bias: 11LL
443
+ if need_to_pad:
444
+ attn_bias = F.pad(attn_bias, (0, need_to_pad, 0, need_to_pad), value=-torch.inf)
445
+ attn_bias[0, 0, l_end:, 0] = 0
446
+ x_BLC = F.pad(x_BLC, (0, 0, 0, need_to_pad))
447
+ attn_bias_or_two_vector = attn_bias.type_as(x_BLC).to(x_BLC.device)
448
+
449
+ if self.use_flex_attn:
450
+ attn_fn = self.attn_fn_compile_dict[tuple(scale_schedule)]
451
+ else:
452
+ attn_fn = None
453
+
454
+ # [2. block loop]
455
+ SelfAttnBlock.forward, CrossAttnBlock.forward
456
+ checkpointing_full_block = self.checkpointing == 'full-block' and self.training
457
+ if self.num_block_chunks == 1:
458
+ for i, b in enumerate(self.blocks):
459
+ if self.add_lvl_embeding_only_first_block and i == 0:
460
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
461
+ if not self.add_lvl_embeding_only_first_block:
462
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
463
+ if checkpointing_full_block:
464
+ x_BLC = torch.utils.checkpoint.checkpoint(b, x_BLC, cond_BD_or_gss, ca_kv, attn_bias_or_two_vector, attn_fn, scale_schedule, self.rope2d_freqs_grid, use_reentrant=False)
465
+ else:
466
+ x_BLC = b(x=x_BLC, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_bias_or_two_vector, attn_fn=attn_fn, scale_schedule=scale_schedule, rope2d_freqs_grid=self.rope2d_freqs_grid)
467
+ else:
468
+ for i, chunk in enumerate(self.block_chunks): # this path
469
+ if self.add_lvl_embeding_only_first_block and i == 0:
470
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
471
+ if not self.add_lvl_embeding_only_first_block:
472
+ x_BLC = self.add_lvl_embeding_for_x_BLC(x_BLC, scale_schedule, need_to_pad)
473
+ x_BLC = chunk(x=x_BLC, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=attn_bias_or_two_vector, attn_fn=attn_fn, scale_schedule=scale_schedule, checkpointing_full_block=checkpointing_full_block, rope2d_freqs_grid=self.rope2d_freqs_grid)
474
+
475
+ # [3. unpad the seqlen dim, and then get logits]
476
+ return self.get_logits(x_BLC[:, :l_end], cond_BD) # return logits BLV, V is vocab_size
477
+
478
    @torch.no_grad()
    def autoregressive_infer_cfg(
        self,
        vae=None,
        scale_schedule=None,
        label_B_or_BLT=None,
        B=1, negative_label_B_or_BLT=None, force_gt_Bhw=None,
        g_seed=None, cfg_list=[], tau_list=[], cfg_sc=3, top_k=0, top_p=0.0,
        returns_vemb=0, ratio_Bl1=None, gumbel=0, norm_cfg=False,
        cfg_exp_k: float=0.0, cfg_insertion_layer=[-5],
        vae_type=0, softmax_merge_topk=-1, ret_img=False,
        trunk_scale=1000,
        gt_leak=0, gt_ls_Bl=None,
        inference_mode=False,
        save_img_path=None,
        sampling_per_bits=1,
    ): # returns List[idx_Bl]
        """Run multi-scale autoregressive inference with classifier-free guidance.

        One token map is sampled per entry of ``scale_schedule``; each sampled
        map is decoded to codes, accumulated, and fed back as the next stage's
        input. ``label_B_or_BLT`` is the packed text condition
        (kv_compact, lens, cu_seqlens_k, max_seqlen_k); when any cfg != 1 the
        batch is doubled with an unconditional (or explicit negative) copy.

        Returns (ret, idx_Bl_list, img_or_empty_list); ``img`` is a uint8 BGR
        batch only when ``ret_img`` is set.
        NOTE(review): several knobs (force_gt_Bhw, cfg_sc, ratio_Bl1, gumbel,
        norm_cfg, cfg_exp_k, softmax_merge_topk, save_img_path,
        sampling_per_bits) are accepted but unused in this body — presumably
        kept for interface compatibility; confirm against callers.
        """
        # Deterministic sampling only when a seed is given.
        if g_seed is None: rng = None
        else: self.rng.manual_seed(g_seed); rng = self.rng
        assert len(cfg_list) >= len(scale_schedule)
        assert len(tau_list) >= len(scale_schedule)

        # scale_schedule is used by infinity, vae_scale_schedule is used by vae if there exists a spatial patchify,
        # we need to convert scale_schedule to vae_scale_schedule by multiply 2 to h and w
        if self.apply_spatial_patchify:
            vae_scale_schedule = [(pt, 2*ph, 2*pw) for pt, ph, pw in scale_schedule]
        else:
            vae_scale_schedule = scale_schedule

        kv_compact, lens, cu_seqlens_k, max_seqlen_k = label_B_or_BLT
        if any(np.array(cfg_list) != 1):
            # CFG active: duplicate the batch — first half conditional, second half
            # unconditional (cfg_uncond) or the supplied negative prompt.
            bs = 2*B
            if not negative_label_B_or_BLT:
                kv_compact_un = kv_compact.clone()
                total = 0
                for le in lens:
                    kv_compact_un[total:total+le] = (self.cfg_uncond)[:le]
                    total += le
                kv_compact = torch.cat((kv_compact, kv_compact_un), dim=0)
                cu_seqlens_k = torch.cat((cu_seqlens_k, cu_seqlens_k[1:]+cu_seqlens_k[-1]), dim=0)
            else:
                kv_compact_un, lens_un, cu_seqlens_k_un, max_seqlen_k_un = negative_label_B_or_BLT
                kv_compact = torch.cat((kv_compact, kv_compact_un), dim=0)
                cu_seqlens_k = torch.cat((cu_seqlens_k, cu_seqlens_k_un[1:]+cu_seqlens_k[-1]), dim=0)
                max_seqlen_k = max(max_seqlen_k, max_seqlen_k_un)
        else:
            bs = B

        kv_compact = self.text_norm(kv_compact)
        sos = cond_BD = self.text_proj_for_sos((kv_compact, cu_seqlens_k, max_seqlen_k)) # sos shape: [2, 4096]
        kv_compact = self.text_proj_for_ca(kv_compact) # kv_compact shape: [304, 4096]
        ca_kv = kv_compact, cu_seqlens_k, max_seqlen_k
        last_stage = sos.unsqueeze(1).expand(bs, 1, -1) + self.pos_start.expand(bs, 1, -1)

        # shared_ada_lin is evaluated in full precision regardless of autocast.
        with torch.amp.autocast('cuda', enabled=False):
            cond_BD_or_gss = self.shared_ada_lin(cond_BD.float()).float().contiguous()
        accu_BChw, cur_L, ret = None, 0, [] # current length, list of reconstructed images
        idx_Bl_list, idx_Bld_list = [], []

        # Enable KV caching for the AR loop (two layouts: flat blocks vs chunked).
        if inference_mode:
            for b in self.unregistered_blocks: (b.sa if isinstance(b, CrossAttnBlock) else b.attn).kv_caching(True)
        else:
            assert self.num_block_chunks > 1
            for block_chunk_ in self.block_chunks:
                for module in block_chunk_.module.module:
                    (module.sa if isinstance(module, CrossAttnBlock) else module.attn).kv_caching(True)

        # Decode where CFG mixing happens: 0 -> on logits, 1 -> on probs,
        # negative -> after that (from-the-end) transformer layer.
        abs_cfg_insertion_layers = []
        add_cfg_on_logits, add_cfg_on_probs = False, False
        leng = len(self.unregistered_blocks)
        for item in cfg_insertion_layer:
            if item == 0: # add cfg on logits
                add_cfg_on_logits = True
            elif item == 1: # add cfg on probs
                add_cfg_on_probs = True # todo in the future, we may want to add cfg on logits and probs
            elif item < 0: # determine to add cfg at item-th layer's output
                assert leng+item > 0, f'cfg_insertion_layer: {item} is not valid since len(unregistered_blocks)={self.num_block_chunks}'
                abs_cfg_insertion_layers.append(leng+item)
            else:
                raise ValueError(f'cfg_insertion_layer: {item} is not valid')

        num_stages_minus_1 = len(scale_schedule)-1
        summed_codes = 0
        for si, pn in enumerate(scale_schedule): # si: i-th segment
            cfg = cfg_list[si]
            if si >= trunk_scale:
                break
            cur_L += np.array(pn).prod()

            need_to_pad = 0
            attn_fn = None
            if self.use_flex_attn:
                # need_to_pad = (self.pad_to_multiplier - cur_L % self.pad_to_multiplier) % self.pad_to_multiplier
                # if need_to_pad:
                #     last_stage = F.pad(last_stage, (0, 0, 0, need_to_pad))
                attn_fn = self.attn_fn_compile_dict.get(tuple(scale_schedule[:(si+1)]), None)

            # assert self.attn_bias_for_masking[:, :, last_L:cur_L, :cur_L].sum() == 0, f'AR with {(self.attn_bias_for_masking[:, :, last_L:cur_L, :cur_L] != 0).sum()} / {self.attn_bias_for_masking[:, :, last_L:cur_L, :cur_L].numel()} mask item'
            layer_idx = 0
            for block_idx, b in enumerate(self.block_chunks):
                # last_stage shape: [4, 1, 2048], cond_BD_or_gss.shape: [4, 1, 6, 2048], ca_kv[0].shape: [64, 2048], ca_kv[1].shape [5], ca_kv[2]: int
                if self.add_lvl_embeding_only_first_block and block_idx == 0:
                    last_stage = self.add_lvl_embeding(last_stage, si, scale_schedule, need_to_pad=need_to_pad)
                if not self.add_lvl_embeding_only_first_block:
                    last_stage = self.add_lvl_embeding(last_stage, si, scale_schedule, need_to_pad=need_to_pad)

                for m in b.module:
                    last_stage = m(x=last_stage, cond_BD=cond_BD_or_gss, ca_kv=ca_kv, attn_bias_or_two_vector=None, attn_fn=attn_fn, scale_schedule=scale_schedule, rope2d_freqs_grid=self.rope2d_freqs_grid, scale_ind=si)
                    if (cfg != 1) and (layer_idx in abs_cfg_insertion_layers):
                        # print(f'add cfg={cfg} on {layer_idx}-th layer output')
                        # Mix cond/uncond halves, then re-duplicate so later
                        # layers (with cached KV for both halves) stay batched.
                        last_stage = cfg * last_stage[:B] + (1-cfg) * last_stage[B:]
                        last_stage = torch.cat((last_stage, last_stage), 0)
                    layer_idx += 1

            if (cfg != 1) and add_cfg_on_logits:
                # print(f'add cfg on add_cfg_on_logits')
                logits_BlV = self.get_logits(last_stage, cond_BD).mul(1/tau_list[si])
                logits_BlV = cfg * logits_BlV[:B] + (1-cfg) * logits_BlV[B:]
            else:
                logits_BlV = self.get_logits(last_stage[:B], cond_BD[:B]).mul(1/tau_list[si])

            if self.use_bit_label:
                # Binary-label head: each position predicts d independent bits.
                tmp_bs, tmp_seq_len = logits_BlV.shape[:2]
                logits_BlV = logits_BlV.reshape(tmp_bs, -1, 2)
                idx_Bld = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV, rng=rng, top_k=top_k or self.top_k, top_p=top_p or self.top_p, num_samples=1)[:, :, 0]
                idx_Bld = idx_Bld.reshape(tmp_bs, tmp_seq_len, -1)
            else:
                idx_Bl = sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV, rng=rng, top_k=top_k or self.top_k, top_p=top_p or self.top_p, num_samples=1)[:, :, 0]
            if vae_type != 0:
                assert returns_vemb
                if si < gt_leak:
                    # Teacher forcing: use ground-truth labels for the first scales.
                    idx_Bld = gt_ls_Bl[si]
                else:
                    assert pn[0] == 1
                    idx_Bld = idx_Bld.reshape(B, pn[1], pn[2], -1) # shape: [B, h, w, d] or [B, h, w, 4d]
                    if self.apply_spatial_patchify: # unpatchify operation
                        idx_Bld = idx_Bld.permute(0,3,1,2) # [B, 4d, h, w]
                        idx_Bld = torch.nn.functional.pixel_shuffle(idx_Bld, 2) # [B, d, 2h, 2w]
                        idx_Bld = idx_Bld.permute(0,2,3,1) # [B, 2h, 2w, d]
                    idx_Bld = idx_Bld.unsqueeze(1) # [B, 1, h, w, d] or [B, 1, 2h, 2w, d]

                idx_Bld_list.append(idx_Bld)
                codes = vae.quantizer.lfq.indices_to_codes(idx_Bld, label_type='bit_label') # [B, d, 1, h, w] or [B, d, 1, 2h, 2w]
                if si != num_stages_minus_1:
                    # Accumulate codes at full resolution; next-stage input is the
                    # running sum resampled to the next scale.
                    summed_codes += F.interpolate(codes, size=vae_scale_schedule[-1], mode=vae.quantizer.z_interplote_up)
                    last_stage = F.interpolate(summed_codes, size=vae_scale_schedule[si+1], mode=vae.quantizer.z_interplote_up) # [B, d, 1, h, w] or [B, d, 1, 2h, 2w]
                    last_stage = last_stage.squeeze(-3) # [B, d, h, w] or [B, d, 2h, 2w]
                    if self.apply_spatial_patchify: # patchify operation
                        last_stage = torch.nn.functional.pixel_unshuffle(last_stage, 2) # [B, 4d, h, w]
                    last_stage = last_stage.reshape(*last_stage.shape[:2], -1) # [B, d, h*w] or [B, 4d, h*w]
                    last_stage = torch.permute(last_stage, [0,2,1]) # [B, h*w, d] or [B, h*w, 4d]
                else:
                    summed_codes += codes
            else:
                if si < gt_leak:
                    idx_Bl = gt_ls_Bl[si]
                h_BChw = self.quant_only_used_in_inference[0].embedding(idx_Bl).float() # BlC

                # h_BChw = h_BChw.float().transpose_(1, 2).reshape(B, self.d_vae, scale_schedule[si][0], scale_schedule[si][1])
                h_BChw = h_BChw.transpose_(1, 2).reshape(B, self.d_vae, scale_schedule[si][0], scale_schedule[si][1], scale_schedule[si][2])
                ret.append(h_BChw if returns_vemb != 0 else idx_Bl)
                idx_Bl_list.append(idx_Bl)
                if si != num_stages_minus_1:
                    accu_BChw, last_stage = self.quant_only_used_in_inference[0].one_step_fuse(si, num_stages_minus_1+1, accu_BChw, h_BChw, scale_schedule)

            if si != num_stages_minus_1:
                # Embed the next-stage input and re-duplicate for the CFG batch.
                last_stage = self.word_embed(self.norm0_ve(last_stage))
                last_stage = last_stage.repeat(bs//B, 1, 1)

        # Disable KV caching again so the module is reusable.
        if inference_mode:
            for b in self.unregistered_blocks: (b.sa if isinstance(b, CrossAttnBlock) else b.attn).kv_caching(False)
        else:
            assert self.num_block_chunks > 1
            for block_chunk_ in self.block_chunks:
                for module in block_chunk_.module.module:
                    (module.sa if isinstance(module, CrossAttnBlock) else module.attn).kv_caching(False)

        if not ret_img:
            return ret, idx_Bl_list, []

        if vae_type != 0:
            img = vae.decode(summed_codes.squeeze(-3))
        else:
            img = vae.viz_from_ms_h_BChw(ret, scale_schedule=scale_schedule, same_shape=True, last_one=True)

        # Map [-1, 1] -> uint8 [0, 255]; channel flip converts RGB -> BGR.
        img = (img + 1) / 2
        img = img.permute(0, 2, 3, 1).mul_(255).to(torch.uint8).flip(dims=(3,))
        return ret, idx_Bl_list, img
666
+
667
+ @for_visualize
668
+ def vis_key_params(self, ep):
669
+ return
670
+
671
+ def load_state_dict(self, state_dict: Dict[str, Any], strict=False, assign=False):
672
+ for k in state_dict:
673
+ if 'cfg_uncond' in k:
674
+ old, new = state_dict[k], self.cfg_uncond.data
675
+ min_tlen = min(old.shape[0], new.shape[0])
676
+ if min_tlen == old.shape[0]:
677
+ state_dict[k] = torch.cat((old.to(device=new.device, dtype=new.dtype), new[min_tlen:]))
678
+ else:
679
+ state_dict[k] = old[:min_tlen]
680
+
681
+ for buf_name in ('lvl_1L', 'attn_bias_for_masking', 'Infinity_visible_kvlen', 'Infinity_invisible_qlen'):
682
+ state_dict.pop(buf_name, None)
683
+ if hasattr(self, buf_name):
684
+ state_dict[buf_name] = getattr(self, buf_name)
685
+
686
+ return super().load_state_dict(state_dict=state_dict, strict=strict, assign=assign)
687
+
688
    def special_init(
        self,
        aln_init: float,       # multiplier for AdaLN scale/shift rows (and the head's AdaLN)
        aln_gamma_init: float,  # multiplier for AdaLN gamma rows
        scale_head: float,     # multiplier for the output head weights; skipped when negative
        scale_proj: int,       # 1 -> depth-uniform projection scaling; otherwise no proj rescale here
    ):
        """Rescale selected weights after default init (heads, projections, AdaLN)."""
        # init head's norm
        if isinstance(self.head_nm, AdaLNBeforeHead):
            self.head_nm.ada_lin[-1].weight.data.mul_(aln_init) # there's no gamma for head
            if hasattr(self.head_nm.ada_lin[-1], 'bias') and self.head_nm.ada_lin[-1].bias is not None:
                self.head_nm.ada_lin[-1].bias.data.zero_()

        # init head's proj
        if scale_head >= 0:
            if isinstance(self.head, nn.Linear):
                self.head.weight.data.mul_(scale_head)
                self.head.bias.data.zero_()
            elif isinstance(self.head, nn.Sequential):
                # Only the final linear of a sequential head is rescaled.
                self.head[-1].weight.data.mul_(scale_head)
                self.head[-1].bias.data.zero_()

        depth = len(self.unregistered_blocks)
        for block_idx, sab in enumerate(self.unregistered_blocks):
            sab: Union[SelfAttnBlock, CrossAttnBlock]
            # init proj
            # NOTE(review): `scale` is only applied on the scale_proj == 1 path;
            # the depth-dependent alternative (2*(1 + block_idx)) is computed but
            # currently unused — confirm whether other scale_proj modes are expected.
            scale = 1 / math.sqrt(2*depth if scale_proj == 1 else 2*(1 + block_idx))
            if scale_proj == 1:
                if self.t2i:
                    sab.sa.proj.weight.data.mul_(scale)
                    sab.ca.proj.weight.data.mul_(scale)
                else:
                    sab.attn.proj.weight.data.mul_(scale)
                sab.ffn.fc2.weight.data.mul_(scale)
            # if sab.using_swiglu:
            #     nn.init.ones_(sab.ffn.fcg.bias)
            #     nn.init.trunc_normal_(sab.ffn.fcg.weight, std=1e-5)

            # init ada_lin
            if hasattr(sab, 'ada_lin'):
                lin = sab.ada_lin[-1]
                # Rows [0, 2C) hold the two gammas; rows [2C, 6C) hold scale/shift.
                lin.weight.data[:2*self.C].mul_(aln_gamma_init) # init gamma
                lin.weight.data[2*self.C:].mul_(aln_init) # init scale and shift
                if hasattr(lin, 'bias') and lin.bias is not None:
                    lin.bias.data.zero_()
            elif hasattr(sab, 'ada_gss'):
                sab.ada_gss.data[:, :, :2, :].mul_(aln_gamma_init) # init gamma
                sab.ada_gss.data[:, :, 2:, :].mul_(aln_init) # init scale and shift
736
+
737
+ def extra_repr(self):
738
+ return f'drop_path_rate={self.drop_path_rate}'
739
+
740
+ def get_layer_id_and_scale_exp(self, para_name: str):
741
+ raise NotImplementedError
742
+
743
+
744
def sample_with_top_k_top_p_also_inplace_modifying_logits_(logits_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor: # return idx, shaped (B, l)
    """Sample token ids from logits with optional top-k / top-p (nucleus) filtering.

    Filtering is applied IN PLACE, so `logits_BlV` is modified by this call.
    A negative `num_samples` requests abs(num_samples) draws without
    replacement. Returns indices shaped (B, l, num_samples).
    """
    B, l, V = logits_BlV.shape
    if top_k > 0:
        # Everything strictly below the k-th best logit is masked out.
        k = min(top_k, V)
        kth_best = logits_BlV.topk(k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True)
        logits_BlV.masked_fill_(logits_BlV < kth_best, -torch.inf)
    if top_p > 0:
        # Drop the low-probability tail whose cumulative mass is <= 1 - top_p.
        sorted_logits, sorted_idx = logits_BlV.sort(dim=-1, descending=False)
        drop_sorted = sorted_logits.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p)
        drop_sorted[..., -1:] = False  # always keep the single most likely token
        drop = drop_sorted.scatter(sorted_idx.ndim - 1, sorted_idx, drop_sorted)
        logits_BlV.masked_fill_(drop, -torch.inf)
    # multinomial only accepts 2D input, so flatten (B, l) together.
    probs_2d = logits_BlV.softmax(dim=-1).view(-1, V)
    with_replacement = num_samples >= 0
    draws = abs(num_samples)
    return torch.multinomial(probs_2d, num_samples=draws, replacement=with_replacement, generator=rng).view(B, l, draws)
759
+
760
def sampling_with_top_k_top_p_also_inplace_modifying_probs_(probs_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor: # return idx, shaped (B, l)
    """Sample token ids from a probability tensor with top-k / top-p filtering.

    Mirror of sample_with_top_k_top_p_also_inplace_modifying_logits_ that takes
    probabilities instead of logits. Filtering zeroes entries IN PLACE; a
    negative `num_samples` requests abs(num_samples) draws without replacement.
    Returns indices shaped (B, l, num_samples).
    """
    B, l, V = probs_BlV.shape
    if top_k > 0:
        # Zero out everything strictly below the k-th largest probability.
        top_k = min(top_k, V)
        idx_to_remove = probs_BlV < probs_BlV.topk(top_k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True)
        probs_BlV.masked_fill_(idx_to_remove, 0)
    if top_p > 0:
        sorted_probs, sorted_idx = probs_BlV.sort(dim=-1, descending=False)
        # BUGFIX: the inputs are already probabilities, so the cumulative mass is
        # built directly from the (re-normalized) sorted probs. The previous code
        # applied softmax to probabilities, which flattens the distribution and
        # makes the nucleus cutoff far too permissive.
        cum_mass = (sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)).cumsum_(dim=-1)
        sorted_idx_to_remove = cum_mass <= (1 - top_p)
        sorted_idx_to_remove[..., -1:] = False  # always keep the most likely token
        probs_BlV.masked_fill_(sorted_idx_to_remove.scatter(sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove), 0)
    # Re-normalize after filtering; multinomial only accepts 2D input.
    probs_BlV = probs_BlV / probs_BlV.sum(-1, keepdims=True)
    replacement = num_samples >= 0
    num_samples = abs(num_samples)
    return torch.multinomial(probs_BlV.view(-1, V), num_samples=num_samples, replacement=replacement, generator=rng).view(B, l, num_samples)
776
+
777
+
778
def get_params_num(d, w, mlp):
    """Estimate the transformer's parameter count as a '<x.xx>B' string.

    d: number of layers; w: model width; mlp: MLP expansion ratio (the hidden
    width is rounded to a multiple of 256).
    """
    hidden = round(mlp * w / 256) * 256
    total = d * (w * w * 8 + w * hidden * 2)  # per-layer sa+ca attention + mlp
    total += w * w * 6   # shared adaln
    total += 4096 * w    # prediction head
    total += 32 * w      # word embedding

    t5_width = 4096
    total += t5_width * w * 4        # T5 attention pooling
    total += t5_width * w + w * w    # T5 mlp
    return f'{total/1e9:.2f}B'
789
+
790
+
791
# kwargs injected by timm's create_model() that Infinity's constructor does not accept;
# the factory functions below strip these before forwarding **kwargs.
TIMM_KEYS = {'img_size', 'pretrained', 'pretrained_cfg', 'pretrained_cfg_overlay', 'global_pool'}
792
+
793
@register_model
def infinity_2b(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, **kwargs):
    """Infinity-2B preset: 32 layers, width 2048, 16 heads."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
795
+
796
@register_model
def infinity_20b(depth=58, embed_dim=4608, num_heads=4608//128, drop_path_rate=0.25, **kwargs):
    """Infinity-20B preset: 58 layers, width 4608, 36 heads."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
798
+
799
+ # model configuration for scaling Infinity transformer
800
@register_model
def infinity_layer12(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, **kwargs):
    """Scaling-study preset: 12 layers, width 768."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
803
@register_model
def infinity_layer16(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, **kwargs):
    """Scaling-study preset: 16 layers, width 1152."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
806
@register_model
def infinity_layer24(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, **kwargs):
    """Scaling-study preset: 24 layers, width 1536."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
809
@register_model
def infinity_layer32(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, **kwargs):
    """Scaling-study preset: 32 layers, width 2080."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
812
@register_model
def infinity_layer40(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, **kwargs):
    """Scaling-study preset: 40 layers, width 2688."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
815
@register_model
def infinity_layer48(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, **kwargs):
    """Scaling-study preset: 48 layers, width 3360."""
    extra = {k: v for k, v in kwargs.items() if k not in TIMM_KEYS}
    return Infinity(depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=4, drop_path_rate=drop_path_rate, **extra)
Infinity/infinity_vae_d32_reg.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a37fa3ea1b2a1ebd23de61d91a5e68202825e5a67edaef4b7c55f5fd5b9cf26
3
+ size 1557324701
README.md CHANGED
@@ -1,3 +1,162 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Infinity-2B GGUF with SageAttention
2
+
3
+ Unofficial Q8_0 GGUF quantization of Infinity-2B with **SageAttention** support for even faster generation.
4
+
5
+ ## Features
6
+
7
+ ✨ **SageAttention Integration** - 2-5x faster than FlashAttention with automatic fallback
8
+ 🎨 **Gradio Web UI** - Easy-to-use interface for image generation
9
+ 💾 **Q8_0 Quantization** - ~75% memory reduction with minimal quality loss
10
+ 🚀 **Optimized Inference** - T5 encoder on CPU, efficient VRAM usage
11
+ 🔧 **GGUF Support** - On-the-fly dequantization with flexible deployment
12
+
13
+ ## Quick Start
14
+
15
+ ### Web UI (Recommended)
16
+
17
+ ```bash
18
+ python gradio_webui.py --autoload
19
+ ```
20
+
21
+ Then open `http://127.0.0.1:7860` in your browser.
22
+
23
+ ### Command Line
24
+
25
+ ```bash
26
+ python generate_image_2b_q8_gguf.py \
27
+ --prompt "an astronaut riding a horse on the moon" \
28
+ --output output.png
29
+ ```
30
+
31
+ ## Installation
32
+
33
+ ### 1. Basic Requirements
34
+
35
+ ```bash
36
+ pip install -r Infinity/requirements.txt
37
+ pip install gradio gguf
38
+ ```
39
+
40
+ ### 2. Install SageAttention (Optional, Recommended)
41
+
42
+ For faster generation:
43
+
44
+ ```bash
45
+ pip install sageattention>=2.2.0 --no-build-isolation
46
+ ```
47
+
48
+ **Requirements**: CUDA ≥12.0 (CUDA 12.8+ for Blackwell GPUs like RTX 50-series)
49
+
50
+ **Note**: SageAttention is optional. The code automatically falls back to:
51
+ 1. SageAttention (if installed) - 2-5x faster ✨
52
+ 2. FlashAttention (if available) - faster than PyTorch
53
+ 3. PyTorch SDPA (always works) - built-in fallback
54
+
55
+ ### 3. Download Models
56
+
57
+ You'll need:
58
+ - `infinity_2b_reg_Q8_0.gguf` - Infinity-2B model (~2.1 GB)
59
+ - `flan-t5-xl-encoder-Q8_0.gguf` - T5 text encoder (~1.0 GB)
60
+ - `Infinity/infinity_vae_d32_reg.pth` - VAE decoder (~0.5 GB)
61
+
62
+ ## Memory Requirements
63
+
64
+ | Component | VRAM Usage |
65
+ |-----------|-----------|
66
+ | Infinity-2B (Q8_0) | ~2.5 GB |
67
+ | VAE | ~0.5 GB |
68
+ | Working Memory | ~1-2 GB |
69
+ | **Total (1M res)** | **~4-5 GB** |
70
+
71
+ **T5 encoder runs on CPU** to save VRAM!
72
+
73
+ Recommended: **8GB+ VRAM** for comfortable 1M (1024×1024) generation
74
+
75
+ ## Web UI Features
76
+
77
+ The Gradio web interface provides:
78
+
79
+ - **Model Management**: Load models once, reuse for all generations
80
+ - **Full Parameter Control**: CFG scale, tau, resolution, aspect ratio, seed
81
+ - **Real-time Preview**: See your images as they generate
82
+ - **Progress Tracking**: Visual feedback during loading and generation
83
+ - **Clean Layout**: Model paths banner, settings on left, output on right
84
+
85
+ ### Web UI Options
86
+
87
+ ```bash
88
+ # Basic usage
89
+ python gradio_webui.py
90
+
91
+ # Auto-load models on startup (faster)
92
+ python gradio_webui.py --autoload
93
+
94
+ # Create public share link
95
+ python gradio_webui.py --share
96
+
97
+ # Custom port
98
+ python gradio_webui.py --server-port 8080
99
+
100
+ # Full options
101
+ python gradio_webui.py \
102
+ --autoload \
103
+ --server-port 7860 \
104
+ --infinity-gguf path/to/infinity.gguf \
105
+ --t5-gguf path/to/t5.gguf \
106
+ --vae-path path/to/vae.pth
107
+ ```
108
+
109
+ ## Command-Line Options
110
+
111
+ ```bash
112
+ python generate_image_2b_q8_gguf.py [OPTIONS]
113
+ ```
114
+
115
+ | Option | Description | Default |
116
+ |--------|-------------|---------|
117
+ | `--prompt TEXT` | Text prompt for image generation | "an astronaut..." |
118
+ | `--infinity-gguf PATH` | Path to Infinity GGUF file | infinity_2b_reg_Q8_0.gguf |
119
+ | `--t5-gguf PATH` | Path to T5 encoder GGUF | flan-t5-xl-encoder-Q8_0.gguf |
120
+ | `--vae-path PATH` | Path to VAE checkpoint | Infinity/infinity_vae_d32_reg.pth |
121
+ | `--output PATH` | Output image path | output.png |
122
+ | `--cfg-scale FLOAT` | CFG scale (1.0-10.0) | 3.0 |
123
+ | `--tau FLOAT` | Temperature (0.1-1.0) | 0.5 |
124
+ | `--seed INT` | Random seed for reproducibility | 42 |
125
+ | `--pn {0.06M,0.25M,1M}` | Resolution preset | 1M |
126
+ | `--aspect-ratio FLOAT` | Aspect ratio (height/width) | 1.0 |
127
+
128
+
129
+ ## Technical Details
130
+
131
+ ### Quantization
132
+
133
+ - **Q8_0 format**: 8-bit quantization with minimal quality loss
134
+ - **On-the-fly dequantization**: Using custom GGUFLinear layers
135
+ - **Memory savings**: ~75% reduction vs FP16
136
+ - **Quality**: Nearly identical to FP16
137
+
138
+ ### Architecture
139
+
140
+ - **Infinity-2B**: 2.0B parameters, embed_dim=2048, depth=32
141
+ - **T5-XL Encoder**: 2048-dim text embeddings
142
+ - **VAE**: d32 with dynamic resolution support
143
+
144
+ ### GGUF Support
145
+
146
+ The implementation includes:
147
+ - Import utilities for GGUF tensors
148
+ - Custom `GGUFLinear` layers for on-the-fly dequantization
149
+ - Patched attention mechanisms for compatibility
150
+ - F16 dtype handling for head layers
151
+
152
+ See [patch_infinity_for_gguf.sh](patch_infinity_for_gguf.sh) for implementation details.
153
+
154
+ ## Credits
155
+
156
+ - **Original Model**: [Infinity by FoundationVision](https://github.com/FoundationVision/Infinity)
157
+ - **SageAttention**: [thu-ml/SageAttention](https://github.com/thu-ml/SageAttention)
158
+ - **GGUF Format**: [ggerganov/ggml](https://github.com/ggerganov/ggml)
159
+
160
+ ## License
161
+
162
+ MIT
flan-t5-xl-encoder-Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d212c960e07faf2323e2136cb03e62578a8f6862f13709f480684da9f5d9a2e6
3
+ size 1563507296
generate_image_2b_q8_gguf.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate images using quantized Infinity-2B model (GGUF format)
4
+ Loads T5 text encoder from GGUF on CPU, Infinity model from GGUF on GPU
5
+ """
6
+
7
+ import os
8
+ import sys
9
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence HF tokenizers fork warning; must be set before tokenizers import

# Add Infinity to Python path (assumes Infinity repo is in same directory as this script)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
INFINITY_PATH = os.path.join(SCRIPT_DIR, 'Infinity')
if os.path.exists(INFINITY_PATH):
    # Prepend so the local (patched) Infinity checkout shadows any installed copy.
    sys.path.insert(0, INFINITY_PATH)
else:
    print(f"Warning: Infinity repo not found at {INFINITY_PATH}")
    print("Please clone the Infinity repo and run patch_infinity_for_gguf.sh")
20
+ import time
21
+ import argparse
22
+ import torch
23
+ import torch.nn.functional as F
24
+ import numpy as np
25
+ import cv2
26
+ from typing import List
27
+ import gguf
28
+
29
+ # Import existing utilities
30
+ from infinity_gguf_utils import (
31
+ load_gguf_state_dict,
32
+ load_gguf_state_dict_with_params,
33
+ _replace_with_gguf_linear,
34
+ GGUFParameter,
35
+ dequantize_gguf_tensor,
36
+ GGUFLinear
37
+ )
38
+
39
+ # Import Infinity model and utilities
40
+ from infinity.models.infinity import Infinity
41
+ from infinity.models.bsq_vae.vae import vae_model
42
+ from infinity.utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
43
+
44
+ # Import transformers for tokenizer
45
+ from transformers import AutoTokenizer
46
+
47
+
48
def load_t5_tokenizer_from_gguf(gguf_path):
    """
    Return a T5 tokenizer compatible with the GGUF-encoded T5 encoder.

    The T5 vocabulary is standard, so the stock "google/t5-v1_1-xxl"
    tokenizer is used instead of parsing tokenizer metadata out of the GGUF
    file; `gguf_path` is accepted for interface symmetry but currently unused.
    Falls back to AutoTokenizer (local cache) when the fast tokenizer cannot
    be loaded. max length is capped at 512 tokens either way.
    """
    print("[Loading T5 Tokenizer]")
    try:
        from transformers import T5TokenizerFast
        tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=True)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being swallowed by the fallback.
        print("Warning: Could not load T5 tokenizer from HuggingFace, trying local cache...")
        tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-xxl", legacy=True)
    tokenizer.model_max_length = 512
    return tokenizer
67
+
68
+
69
def load_t5_encoder_from_gguf(gguf_path, device='cpu'):
    """
    Load the T5 encoder from a GGUF file and keep it on `device` (CPU by default).

    Based on the ComfyUI-GGUF loader implementation. Tensor names are remapped
    from llama.cpp conventions to HuggingFace T5 names, quantized tensors are
    dequantized to float16, and the weights are loaded into a freshly
    constructed `transformers.T5EncoderModel` (config for flan-t5-xl, width
    2048). Returns the model in eval mode with gradients disabled.
    """
    print(f"[Loading T5 Encoder from GGUF: {gguf_path}]")
    print(f"[T5 will be kept on {device}]")

    # Apply NumPy 2.0 compatibility patch if needed
    # NOTE(review): assigning an attribute on np.ndarray (a C extension type)
    # normally raises TypeError — verify this patch actually works on NumPy 2.x.
    import numpy as np
    if not hasattr(np.ndarray, 'newbyteorder'):
        def newbyteorder(self, new_order):
            return self.view(self.dtype.newbyteorder(new_order))
        np.ndarray.newbyteorder = newbyteorder

    # Load GGUF state dict
    from gguf import GGUFReader
    reader = GGUFReader(gguf_path)

    # Map llama.cpp T5 keys to HuggingFace T5 keys
    T5_SD_MAP = {
        "enc.": "encoder.",
        ".blk.": ".block.",
        "token_embd": "shared",
        "output_norm": "final_layer_norm",
        "attn_q": "layer.0.SelfAttention.q",
        "attn_k": "layer.0.SelfAttention.k",
        "attn_v": "layer.0.SelfAttention.v",
        "attn_o": "layer.0.SelfAttention.o",
        "attn_norm": "layer.0.layer_norm",
        "attn_rel_b": "layer.0.SelfAttention.relative_attention_bias",
        "ffn_up": "layer.1.DenseReluDense.wi_1",
        "ffn_down": "layer.1.DenseReluDense.wo",
        "ffn_gate": "layer.1.DenseReluDense.wi_0",
        "ffn_norm": "layer.1.layer_norm",
    }

    # Load and convert tensors
    state_dict = {}
    print("Loading T5 tensors from GGUF...")
    for tensor in reader.tensors:
        tensor_name = tensor.name

        # Apply key mapping (plain substring replacement over each map entry)
        for old_key, new_key in T5_SD_MAP.items():
            tensor_name = tensor_name.replace(old_key, new_key)

        # Load tensor data
        torch_tensor = torch.from_numpy(np.array(tensor.data))

        # Determine shape (GGUF stores dimensions in reverse order)
        shape = torch.Size(tuple(int(v) for v in reversed(tensor.shape)))

        # Check if quantized (anything other than plain F32/F16 payloads)
        is_quantized = tensor.tensor_type not in {
            gguf.GGMLQuantizationType.F32,
            gguf.GGMLQuantizationType.F16
        }

        if is_quantized:
            # Dequantize to float16 for CPU inference
            # print(f"  Dequantizing {tensor_name} ({tensor.tensor_type})...")
            param = GGUFParameter(torch_tensor, quant_type=tensor.tensor_type)
            dequant_tensor = dequantize_gguf_tensor(param, target_dtype=torch.float16)
            state_dict[tensor_name] = dequant_tensor.to(device)
        else:
            # Already F32 or F16; reshape and downcast F32 -> F16 for memory
            torch_tensor = torch_tensor.view(*shape)
            if tensor.tensor_type == gguf.GGMLQuantizationType.F32:
                state_dict[tensor_name] = torch_tensor.to(torch.float16).to(device)
            else:
                state_dict[tensor_name] = torch_tensor.to(device)

    print(f"Loaded {len(state_dict)} tensors for T5 encoder")

    # Load T5 model architecture from transformers
    from transformers import T5EncoderModel, T5Config

    # Create T5 config - for T5-XL (2048 dims, not XXL which is 4096)
    # Try to load from local directory first, fall back to download if needed
    try:
        config = T5Config.from_pretrained("./flan-t5-xl-official")
        print("Loaded T5 config from local directory")
    except Exception as e:
        print(f"Could not load config from local directory: {e}")
        print("Falling back to download T5 config...")
        config = T5Config.from_pretrained("google/flan-t5-xl")
        print("Downloaded T5 config from HuggingFace")

    # Create model
    model = T5EncoderModel(config)

    # Load state dict (strict=False: GGUF may omit buffers the HF model defines)
    print("Loading state dict into T5 model...")
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    if missing:
        print(f"  Missing keys: {missing[:5]}..." if len(missing) > 5 else f"  Missing keys: {missing}")
    if unexpected:
        print(f"  Unexpected keys: {unexpected[:5]}..." if len(unexpected) > 5 else f"  Unexpected keys: {unexpected}")

    model.to(device)
    model.eval()
    model.requires_grad_(False)

    print(f"[T5 Encoder loaded successfully on {device}]")
    return model
175
+
176
+
177
def load_infinity_from_gguf(gguf_path, vae, device='cuda', model_type='infinity_2b',
                            text_channels=2048, pn='1M'):
    """
    Load the Infinity transformer from a GGUF file.

    Builds the Infinity-2B architecture, swaps its Linear layers for GGUF-aware
    ones, then splices the (possibly still-quantized) GGUF tensors directly into
    the module tree, bypassing ``load_state_dict``.

    Args:
        gguf_path: Path to the Infinity GGUF checkpoint.
        vae: Already-loaded VAE module (passed to the Infinity constructor).
        device: Target device for the model and weights.
        model_type: Only ``'infinity_2b'`` is supported; anything else raises
            ``ValueError``.
        text_channels: Width of the text-conditioning features (T5-XL = 2048).
        pn: Resolution preset name forwarded to the Infinity constructor.

    Returns:
        The Infinity model in eval mode with gradients disabled and a
        device-local ``rng`` generator attached.
    """
    print(f"[Loading Infinity-2B from GGUF: {gguf_path}]")

    # Model configuration for Infinity-2B (hard-coded architecture hyperparams).
    if model_type == 'infinity_2b':
        kwargs_model = dict(
            depth=32,
            embed_dim=2048,
            num_heads=2048//128,  # 16 heads of dim 128
            drop_path_rate=0.1,
            mlp_ratio=4,
            block_chunks=8
        )
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Create Infinity model under bf16 autocast / no_grad (inference only).
    text_maxlen = 512
    print("[Creating Infinity model architecture]")

    with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True), torch.no_grad():
        infinity_model = Infinity(
            vae_local=vae,
            text_channels=text_channels,
            text_maxlen=text_maxlen,
            shared_aln=True,
            raw_scale_schedule=None,
            checkpointing='full-block',
            customized_flash_attn=False,
            fused_norm=True,
            pad_to_multiplier=128,
            use_flex_attn=False,
            add_lvl_embeding_only_first_block=1,
            use_bit_label=1,
            rope2d_each_sa_layer=1,
            rope2d_normalized_by_hw=2,
            pn=pn,
            apply_spatial_patchify=0,
            inference_mode=True,
            train_h_div_w_list=[1.0],
            **kwargs_model,
        ).to(device=device)

    print(f"[Infinity model size: {sum(p.numel() for p in infinity_model.parameters())/1e9:.2f}B parameters]")

    # Convert the transformer blocks to bfloat16 for inference.
    for block in infinity_model.unregistered_blocks:
        block.bfloat16()

    infinity_model.eval()
    infinity_model.requires_grad_(False)

    # Load GGUF weights; quantized tensors arrive as GGUFParameter objects.
    print("[Loading Infinity weights from GGUF]")
    state_dict = load_gguf_state_dict_with_params(gguf_path, device=device)

    # Replace nn.Linear with GGUFLinear so quantized weights are dequantized
    # on-the-fly during the forward pass instead of upfront.
    print("[Replacing Linear layers with GGUFLinear layers]")
    infinity_model = _replace_with_gguf_linear(infinity_model, torch.bfloat16, state_dict, prefix="")

    # Load weights directly into the model (not using load_state_dict) because
    # GGUFParameter byte shapes would fail strict shape checks.
    # NOTE(review): assumes GGUF tensor names already match the module
    # attribute paths of this architecture — confirm against the converter.
    print("[Loading weights into model]")
    skipped_keys = []
    for key, tensor in state_dict.items():
        # Split "path.to.module.param" into module path and parameter name.
        parts = key.rsplit('.', 1)
        if len(parts) != 2:
            continue

        module_name, param_name = parts

        # Walk the attribute path to the owning module; None if any hop is missing.
        module = infinity_model
        for attr in module_name.split('.'):
            if hasattr(module, attr):
                module = getattr(module, attr)
            else:
                module = None
                break

        # Set the parameter on the resolved module.
        if module is not None and hasattr(module, param_name):
            existing_param = getattr(module, param_name)

            # For quantized tensors compare against the logical (dequantized)
            # shape, not the raw byte shape.
            tensor_shape = tensor.shape
            if hasattr(tensor, 'quant_shape'):
                tensor_shape = tensor.quant_shape

            # Skip (and record) any tensor whose shape does not match.
            if existing_param.shape != tensor_shape:
                print(f"[WARNING] Shape mismatch for {key}: expected {existing_param.shape}, got {tensor_shape}. Skipping.")
                skipped_keys.append(key)
                continue

            # Preserve GGUFParameter subclasses; wrap plain tensors.
            if isinstance(tensor, torch.nn.Parameter):
                setattr(module, param_name, tensor)
            else:
                setattr(module, param_name, torch.nn.Parameter(tensor, requires_grad=False))

    if skipped_keys:
        print(f"[INFO] Skipped {len(skipped_keys)} parameters due to shape mismatches")

    # Device-local generator used for seeded sampling.
    infinity_model.rng = torch.Generator(device=device)

    print("[Infinity model loaded successfully]")
    return infinity_model
289
+
290
+
291
def load_vae(vae_path, vae_type=32, device='cuda'):
    """Build the Infinity VAE from a checkpoint and move it to *device*.

    Args:
        vae_path: Path to the VAE checkpoint (.pth).
        vae_type: Codebook bit-depth; the codebook holds ``2 ** vae_type`` codes.
        device: Target device for the loaded module.

    Returns:
        The VAE module in test mode on *device*.
    """
    print(f"[Loading VAE from {vae_path}]")

    # Fixed d32 architecture: 16px patches, five-stage channel multipliers.
    model = vae_model(
        vae_path,
        "dynamic",            # schedule_mode
        vae_type,             # codebook_dim
        2 ** vae_type,        # codebook_size
        patch_size=16,
        encoder_ch_mult=[1, 2, 4, 4, 4],
        decoder_ch_mult=[1, 2, 4, 4, 4],
        test_mode=True,
    ).to(device)

    print("[VAE loaded successfully]")
    return model
317
+
318
+
319
def encode_prompt(text_tokenizer, text_encoder, prompt, device='cuda'):
    """Tokenize and T5-encode *prompt* into Infinity's compact conditioning tuple.

    Returns ``(kv_compact, lens, cu_seqlens_k, Ltext)``: the unpadded token
    features concatenated across captions (float32, on *device*), per-caption
    token counts, the int32 cumulative-length prefix (leading zero), and the
    longest caption length.
    """
    print(f"Encoding prompt: {prompt}")

    batch = text_tokenizer(
        text=[prompt],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Run the tokens wherever the encoder lives (CPU in this pipeline).
    enc_device = text_encoder.device
    input_ids = batch.input_ids.to(enc_device)
    mask = batch.attention_mask.to(enc_device)

    with torch.no_grad():
        text_features = text_encoder(
            input_ids=input_ids,
            attention_mask=mask
        )['last_hidden_state'].float()

    # Hand the results to the device running the Infinity model.
    text_features = text_features.to(device)
    mask = mask.to(device)

    lens: List[int] = mask.sum(dim=-1).tolist()
    cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0))
    Ltext = max(lens)

    # Drop padding: keep only each caption's real tokens, then concatenate.
    # float32 avoids dtype mismatches downstream.
    kv_compact = torch.cat(
        [feat[:n_tok] for n_tok, feat in zip(lens, text_features.unbind(0))],
        dim=0,
    ).to(torch.float32)

    return kv_compact, lens, cu_seqlens_k, Ltext
364
+
365
+
366
def generate_image(infinity_model, vae, text_tokenizer, text_encoder, prompt,
                   cfg_scale=3.0, tau=0.5, seed=None, scale_schedule=None,
                   vae_type=32, device='cuda'):
    """
    Run the full text-to-image pipeline for a single prompt.

    Encodes the prompt with T5, then performs Infinity's multi-scale
    autoregressive sampling under bf16 autocast.

    Args:
        infinity_model: Loaded Infinity transformer.
        vae: Loaded VAE used to decode tokens to pixels.
        text_tokenizer / text_encoder: T5 tokenizer and encoder.
        prompt: Text prompt.
        cfg_scale: Classifier-free guidance scale (applied at every stage).
        tau: Sampling temperature (applied at every stage).
        seed: Optional generation seed, forwarded as ``g_seed``.
        scale_schedule: List of (1, h, w) stages; must not be None.
        vae_type: Codebook bit-depth, forwarded to the sampler.
        device: Device on which conditioning tensors are placed.

    Returns:
        The first decoded image from the sampler's image list.
        NOTE(review): appears to be a HWC uint8 tensor in BGR order —
        callers pass it straight to cv2.imwrite; confirm in the VAE decoder.
    """
    print("[Starting image generation]")
    start_time = time.time()

    # Note: Deterministic mode is set early in main() if seed is provided
    if seed is not None:
        print(f"Using seed: {seed}")

    # Encode prompt into the compact conditioning tuple.
    text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt, device=device)

    # One guidance scale / temperature entry per resolution stage.
    cfg_list = [cfg_scale] * len(scale_schedule)
    tau_list = [tau] * len(scale_schedule)

    print(f"CFG scale: {cfg_scale}, Tau: {tau}")
    print(f"Scale schedule: {scale_schedule}")

    # Generate with bf16 autocast; no gradients needed at inference.
    with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True):
        with torch.no_grad():
            gen_start = time.time()

            _, _, img_list = infinity_model.autoregressive_infer_cfg(
                vae=vae,
                scale_schedule=scale_schedule,
                label_B_or_BLT=text_cond_tuple,
                g_seed=seed,
                B=1,                          # single-image batch
                negative_label_B_or_BLT=None,
                force_gt_Bhw=None,
                cfg_sc=cfg_scale,
                cfg_list=cfg_list,
                tau_list=tau_list,
                top_k=900,
                top_p=0.97,
                returns_vemb=1,
                ratio_Bl1=None,
                gumbel=0,
                norm_cfg=False,
                cfg_exp_k=0.0,
                cfg_insertion_layer=[0],      # Must be a list
                vae_type=vae_type,
                softmax_merge_topk=-1,
                ret_img=True,
                trunk_scale=1000,
                gt_leak=0,
                gt_ls_Bl=None,
                inference_mode=True,
                sampling_per_bits=1,
            )

            gen_time = time.time() - gen_start

    # B=1, so the single decoded image is the first list entry.
    img = img_list[0]

    total_time = time.time() - start_time
    print(f"[Generation complete! Total time: {total_time:.2f}s, Inference time: {gen_time:.2f}s]")

    return img
431
+
432
+
433
def main():
    """CLI entry point: load VAE, T5, and Infinity from GGUF checkpoints, then
    generate one image from ``--prompt`` and write it to ``--output``.

    Determinism setup (seeding, cuDNN flags, SDPA backend selection) happens
    before model loading so that weight-splicing and sampling are reproducible.
    """
    parser = argparse.ArgumentParser(description='Generate images with Infinity-2B GGUF')
    parser.add_argument('--prompt', type=str,
                        default='an astronaut riding a horse on the moon',
                        help='Text prompt for image generation')
    parser.add_argument('--infinity-gguf', type=str,
                        default='infinity_2b_reg_Q8_0.gguf',
                        help='Path to Infinity-2B GGUF file')
    parser.add_argument('--t5-gguf', type=str,
                        default='flan-t5-xl-encoder-Q8_0.gguf',
                        help='Path to T5 encoder GGUF file')
    parser.add_argument('--vae-path', type=str,
                        default='Infinity/infinity_vae_d32_reg.pth',
                        help='Path to VAE checkpoint')
    parser.add_argument('--output', type=str,
                        default='output.png',
                        help='Output image path')
    parser.add_argument('--cfg-scale', type=float, default=3.0,
                        help='Classifier-free guidance scale')
    parser.add_argument('--tau', type=float, default=0.5,
                        help='Temperature for self-attention')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed')
    parser.add_argument('--pn', type=str, default='1M',
                        choices=['0.06M', '0.25M', '1M'],
                        help='Resolution preset')
    parser.add_argument('--aspect-ratio', type=float, default=1.0,
                        help='Aspect ratio (height/width)')

    args = parser.parse_args()

    # Set deterministic mode early (before model loading) if seed is provided.
    if args.seed is not None:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

        # Enable deterministic mode for cuDNN.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # Try to enable full deterministic mode (warn-only: some kernels
        # have no deterministic implementation).
        try:
            torch.use_deterministic_algorithms(True, warn_only=True)
        except Exception as e:
            print(f"Warning: Could not enable full deterministic mode: {e}")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Set CUDA seed after the device is determined.
    if args.seed is not None and device == 'cuda':
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

        # BUG FIX: torch.backends.cuda.sdp_kernel(...) is a *context manager*;
        # the previous bare call returned the manager without entering it, so
        # the SDPA backend selection was never actually applied. Use the
        # persistent per-backend toggles instead, keeping only the
        # deterministic math backend enabled.
        try:
            torch.backends.cuda.enable_flash_sdp(False)
            torch.backends.cuda.enable_mem_efficient_sdp(False)
            torch.backends.cuda.enable_math_sdp(True)
            print(f"Deterministic mode enabled (seed={args.seed})")
        except Exception as e:
            print(f"Warning: Could not set SDPA backend: {e}")

    if device == 'cpu':
        print("WARNING: No GPU detected! This will be extremely slow.")

    # Pick the closest supported aspect ratio, then its per-preset schedule.
    h_div_w_template = h_div_w_templates[
        np.argmin(np.abs(h_div_w_templates - args.aspect_ratio))
    ]
    scale_schedule = dynamic_resolution_h_w[h_div_w_template][args.pn]['scales']
    scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]

    print("\n" + "="*70)
    print("Infinity-2B GGUF Image Generation")
    print("="*70)

    # Load models in dependency order: VAE first (Infinity needs it).
    print("\n[1/4] Loading VAE...")
    vae = load_vae(args.vae_path, vae_type=32, device=device)

    print("\n[2/4] Loading T5 Tokenizer...")
    text_tokenizer = load_t5_tokenizer_from_gguf(args.t5_gguf)

    print("\n[3/4] Loading T5 Encoder from GGUF (on CPU)...")
    # T5 stays on CPU to leave GPU memory for the Infinity transformer.
    text_encoder = load_t5_encoder_from_gguf(args.t5_gguf, device='cpu')

    print("\n[4/4] Loading Infinity-2B from GGUF...")
    infinity_model = load_infinity_from_gguf(
        args.infinity_gguf,
        vae=vae,
        device=device,
        model_type='infinity_2b',
        text_channels=2048,
        pn=args.pn
    )

    print("\n" + "="*70)
    print("All models loaded successfully!")
    print("="*70)

    # Generate image.
    print(f"\nGenerating image with prompt: '{args.prompt}'")
    generated_image = generate_image(
        infinity_model,
        vae,
        text_tokenizer,
        text_encoder,
        args.prompt,
        cfg_scale=args.cfg_scale,
        tau=args.tau,
        seed=args.seed,
        scale_schedule=scale_schedule,
        vae_type=32,
        device=device
    )

    # Save image. NOTE(review): cv2.imwrite expects BGR channel order —
    # assumed to match what the VAE decoder emits; confirm if colors look off.
    print(f"\nSaving image to {args.output}...")
    image_np = generated_image.cpu().numpy()
    cv2.imwrite(args.output, image_np)

    print(f"\n{'='*70}")
    print(f"✓ Image saved successfully to: {args.output}")
    print(f"{'='*70}\n")


if __name__ == '__main__':
    main()
gradio_webui.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio Web UI for Infinity-2B GGUF Image Generation
4
+ Provides an easy-to-use interface for generating images with the quantized model
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
+
11
+ # Add Infinity to Python path
12
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
13
+ INFINITY_PATH = os.path.join(SCRIPT_DIR, 'Infinity')
14
+ if os.path.exists(INFINITY_PATH):
15
+ sys.path.insert(0, INFINITY_PATH)
16
+
17
+ import time
18
+ import argparse
19
+ import torch
20
+ import numpy as np
21
+ import gradio as gr
22
+ from PIL import Image
23
+ from datetime import datetime
24
+
25
+ # Import the generation functions from our existing script
26
+ from generate_image_2b_q8_gguf import (
27
+ load_t5_tokenizer_from_gguf,
28
+ load_t5_encoder_from_gguf,
29
+ load_infinity_from_gguf,
30
+ load_vae,
31
+ generate_image
32
+ )
33
+
34
+ from infinity.utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
35
+
36
+
37
+ # Global model storage
38
class ModelCache:
    """Process-wide holder for lazily-loaded model handles shared by the
    Gradio callbacks. ``loaded`` flips to True once load_models() has
    populated every slot."""

    def __init__(self):
        # Nothing is loaded until load_models() fills these in.
        self.vae = None
        self.text_tokenizer = None
        self.text_encoder = None
        self.infinity_model = None
        self.loaded = False
        # Prefer the GPU when one is visible to torch.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'


# Single shared instance used by all UI callbacks.
model_cache = ModelCache()
49
+
50
+
51
def load_models(infinity_gguf_path, t5_gguf_path, vae_path, pn='1M', progress=gr.Progress()):
    """
    Load all models with progress tracking into the global ``model_cache``.

    Idempotent: returns immediately if models are already loaded. NOTE: a
    different ``pn`` passed on a later call is silently ignored in that case —
    restart the app to switch resolution presets.

    ``progress=gr.Progress()`` as a default is the Gradio convention for
    progress-bar injection, not an accidental shared mutable default.

    Returns:
        A human-readable status string shown in the UI.
    """
    global model_cache

    if model_cache.loaded:
        return "✓ Models already loaded!"

    progress(0, desc="Loading VAE...")
    model_cache.vae = load_vae(vae_path, vae_type=32, device=model_cache.device)

    progress(0.25, desc="Loading T5 Tokenizer...")
    model_cache.text_tokenizer = load_t5_tokenizer_from_gguf(t5_gguf_path)

    progress(0.5, desc="Loading T5 Encoder (on CPU)...")
    # T5 stays on CPU to leave GPU memory for the Infinity transformer.
    model_cache.text_encoder = load_t5_encoder_from_gguf(t5_gguf_path, device='cpu')

    progress(0.75, desc="Loading Infinity-2B from GGUF...")
    model_cache.infinity_model = load_infinity_from_gguf(
        infinity_gguf_path,
        vae=model_cache.vae,
        device=model_cache.device,
        model_type='infinity_2b',
        text_channels=2048,
        pn=pn
    )

    model_cache.loaded = True
    progress(1.0, desc="Complete!")

    return "✓ All models loaded successfully!"
83
+
84
+
85
def generate_image_gradio(
    prompt,
    cfg_scale,
    tau,
    seed,
    aspect_ratio,
    pn,
    use_random_seed,
    progress=gr.Progress()
):
    """
    Gradio callback: generate one image and return ``(PIL.Image | None, info)``.

    Seeds torch/numpy for reproducibility, resolves the scale schedule from the
    requested aspect ratio and preset, runs the shared ``generate_image``
    pipeline, and converts the result to a PIL image. On any failure returns
    ``(None, error_message_with_traceback)`` so the UI shows the error instead
    of crashing.
    """
    global model_cache

    if not model_cache.loaded:
        return None, "❌ Please load models first!"

    try:
        # Use random seed if requested (overrides the numeric field).
        if use_random_seed:
            seed = np.random.randint(0, 2**31 - 1)

        # Set seed for reproducibility.
        if seed is not None:
            torch.manual_seed(seed)
            np.random.seed(seed)
            if model_cache.device == 'cuda':
                torch.cuda.manual_seed(seed)
                torch.cuda.manual_seed_all(seed)

        # Snap the requested ratio to the nearest supported template, then
        # fetch that template's multi-scale schedule for the chosen preset.
        h_div_w_template = h_div_w_templates[
            np.argmin(np.abs(h_div_w_templates - aspect_ratio))
        ]
        scale_schedule = dynamic_resolution_h_w[h_div_w_template][pn]['scales']
        scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]

        progress(0.1, desc="Encoding prompt...")
        start_time = time.time()

        progress(0.3, desc="Generating image (this may take a while)...")

        # Generate image using the cached models.
        img_np = generate_image(
            model_cache.infinity_model,
            model_cache.vae,
            model_cache.text_tokenizer,
            model_cache.text_encoder,
            prompt,
            cfg_scale=cfg_scale,
            tau=tau,
            seed=seed,
            scale_schedule=scale_schedule,
            vae_type=32,
            device=model_cache.device
        )

        progress(0.9, desc="Converting to PIL Image...")

        # Convert to PIL Image (RGB).
        img_np = img_np.cpu().numpy()
        # The pipeline emits OpenCV-style BGR — flip the channel axis to RGB.
        # NOTE(review): assumes HWC uint8-compatible output; confirm in the VAE.
        img_rgb = img_np[:, :, ::-1]
        pil_image = Image.fromarray(img_rgb.astype(np.uint8))

        elapsed_time = time.time() - start_time

        # Get resolution (height, width) for the info panel.
        h, w = img_np.shape[:2]

        info = f"""✓ Generation complete!

**Time**: {elapsed_time:.2f}s
**Resolution**: {w}x{h}
**Seed**: {seed}
**CFG Scale**: {cfg_scale}
**Tau**: {tau}
**Aspect Ratio**: {aspect_ratio:.2f}
**PN**: {pn}"""

        progress(1.0, desc="Done!")

        return pil_image, info

    except Exception as e:
        import traceback
        error_msg = f"❌ Error during generation:\n{str(e)}\n\n{traceback.format_exc()}"
        return None, error_msg
174
+
175
+
176
def create_ui():
    """
    Build and return the Gradio Blocks UI.

    Layout: a top row of model-path inputs with a Load button, then a two-column
    body (settings on the left, image + run info on the right). Events are
    wired to ``load_models`` and ``generate_image_gradio``.
    """
    # Create Blocks without theme for compatibility with older Gradio versions.
    with gr.Blocks(title="Infinity-2B GGUF Generator") as demo:
        gr.Markdown("# 🎨 Infinity-2B GGUF Image Generator")

        # Model paths banner at the top.
        with gr.Row():
            infinity_gguf = gr.Textbox(
                label="Infinity-2B GGUF",
                value="infinity_2b_reg_Q8_0.gguf",
                scale=2
            )

            t5_gguf = gr.Textbox(
                label="T5 GGUF",
                value="flan-t5-xl-encoder-Q8_0.gguf",
                scale=2
            )

            vae_path = gr.Textbox(
                label="VAE Checkpoint",
                value="Infinity/infinity_vae_d32_reg.pth",
                scale=2
            )

            # Preset used at LOAD time; the per-generation dropdown below is
            # separate and only affects the scale schedule.
            pn_load = gr.Dropdown(
                label="Resolution Preset",
                choices=['0.06M', '0.25M', '1M'],
                value='1M',
                scale=1
            )

            load_btn = gr.Button("🚀 Load Models", variant="primary", scale=1)

        load_status = gr.Textbox(label="Status", interactive=False, show_label=False)

        # Main content area.
        with gr.Row():
            # Left column: Generation settings.
            with gr.Column(scale=1):
                gr.Markdown("### Generation Settings")

                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Describe the image you want to generate...",
                    value="an astronaut riding a horse on the moon",
                    lines=3
                )

                with gr.Row():
                    cfg_scale = gr.Slider(
                        minimum=1.0,
                        maximum=10.0,
                        value=3.0,
                        step=0.5,
                        label="CFG Scale",
                        info="Higher = stronger prompt adherence"
                    )

                    tau = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.5,
                        step=0.05,
                        label="Tau (Temperature)",
                        info="Lower = more deterministic"
                    )

                with gr.Row():
                    aspect_ratio = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Aspect Ratio (H/W)",
                        info="1.0 = square, >1.0 = portrait, <1.0 = landscape"
                    )

                    pn = gr.Dropdown(
                        label="Resolution Preset",
                        choices=['0.06M', '0.25M', '1M'],
                        value='1M',
                        info="Higher = better quality but slower"
                    )

                with gr.Row():
                    seed = gr.Number(
                        label="Seed",
                        value=42,
                        precision=0,
                        info="For reproducible results"
                    )

                    use_random_seed = gr.Checkbox(
                        label="Random Seed",
                        value=False,
                        info="Generate random seed each time"
                    )

                generate_btn = gr.Button("✨ Generate Image", variant="primary", size="lg")

            # Right column: Output.
            with gr.Column(scale=1):
                output_image = gr.Image(
                    label="Generated Image",
                    type="pil",
                    height=600
                )
                output_info = gr.Markdown("Generate an image to see details here.")

        # Wire up events.
        load_btn.click(
            fn=load_models,
            inputs=[infinity_gguf, t5_gguf, vae_path, pn_load],
            outputs=[load_status]
        )

        generate_btn.click(
            fn=generate_image_gradio,
            inputs=[prompt, cfg_scale, tau, seed, aspect_ratio, pn, use_random_seed],
            outputs=[output_image, output_info]
        )

    return demo
303
+
304
+
305
def main():
    """CLI entry point: parse server options, optionally preload models, and
    launch the Gradio app."""
    parser = argparse.ArgumentParser(description='Infinity-2B GGUF Gradio Web UI')
    parser.add_argument('--share', action='store_true', help='Create a public share link')
    parser.add_argument('--server-name', type=str, default='127.0.0.1', help='Server name')
    parser.add_argument('--server-port', type=int, default=7860, help='Server port')
    parser.add_argument('--autoload', action='store_true', help='Auto-load models on startup')
    parser.add_argument('--infinity-gguf', type=str, default='infinity_2b_reg_Q8_0.gguf')
    parser.add_argument('--t5-gguf', type=str, default='flan-t5-xl-encoder-Q8_0.gguf')
    parser.add_argument('--vae-path', type=str, default='Infinity/infinity_vae_d32_reg.pth')
    args = parser.parse_args()

    # Eagerly load models when requested; otherwise the user clicks "Load Models".
    if args.autoload:
        print("Auto-loading models...")
        load_models(args.infinity_gguf, args.t5_gguf, args.vae_path)

    demo = create_ui()

    banner = "=" * 70
    print("\n" + banner)
    print("Starting Infinity-2B GGUF Web UI")
    print(banner)
    print(f"Server: http://{args.server_name}:{args.server_port}")
    if args.share:
        print("Creating public share link...")
    print(banner + "\n")

    demo.launch(
        server_name=args.server_name,
        server_port=args.server_port,
        share=args.share,
        inbrowser=True,
    )


if __name__ == '__main__':
    main()
infinity_2b_reg_Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:747220c5030d342f0195f34eb5c21ebb75b2bb855df96a848544be29f00326bc
3
+ size 2374494496
infinity_gguf_utils.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GGUF utilities for Infinity model inference
4
+ Includes GGUFParameter, dequantization functions, and GGUFLinear layer
5
+ """
6
+
7
+ import numpy as np
8
+
9
+ # Monkey patch for NumPy 2.0 compatibility (must be done before importing gguf)
10
+ if not hasattr(np.ndarray, 'newbyteorder'):
11
+ def newbyteorder(self, new_order):
12
+ return self.view(self.dtype.newbyteorder(new_order))
13
+ np.ndarray.newbyteorder = newbyteorder
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import gguf
18
+ from typing import Optional
19
+
20
+
21
# Dequantization constants (match llama.cpp's K-quant block geometry)
QK_K = 256          # elements per K-quant super-block
K_SCALE_SIZE = 12   # bytes of packed 6-bit scales/mins per K-quant super-block
24
+
25
+
26
def to_uint32(x):
    """Assemble little-endian uint32 words from the first four byte columns.

    *x* is viewed as uint8 with shape (n, >=4); returns an int32 tensor of
    shape (n, 1) where each row is byte0 | byte1<<8 | byte2<<16 | byte3<<24.
    """
    b = x.view(torch.uint8).to(torch.int32)
    word = b[:, 0]
    for i in (1, 2, 3):
        word = word | (b[:, i] << (8 * i))
    return word.unsqueeze(1)
30
+
31
+
32
def split_block_dims(blocks, *args):
    """Split *blocks* along dim 1 into chunks of the given widths.

    The final chunk implicitly receives whatever columns remain after the
    explicit widths in *args* are consumed.
    """
    total = blocks.shape[1]
    sizes = [*args, total - sum(args)]
    return torch.split(blocks, sizes, dim=1)
37
+
38
+
39
def dequantize_blocks_Q8_0(blocks, block_size, type_size, dtype=None):
    """Dequantize Q8_0 blocks: a float16 scale followed by int8 quants.

    *blocks* is (n_blocks, type_size) raw bytes; the result is scale * quant
    per element, in *dtype*.
    """
    scale_bytes, quants = split_block_dims(blocks, 2)
    scale = scale_bytes.view(torch.float16).to(dtype)
    return scale * quants.view(torch.int8)
45
+
46
+
47
def dequantize_blocks_Q6_K(blocks, block_size, type_size, dtype=None):
    """Dequantize Q6_K super-blocks (6-bit quants, 16 int8 sub-scales, fp16 d).

    Layout per block: QK_K/2 bytes of low nibbles (ql), QK_K/4 bytes of high
    2-bit pairs (qh), QK_K/16 int8 sub-scales, then the fp16 super-scale d.
    """
    n_blocks = blocks.shape[0]
    ql, qh, scales, d = split_block_dims(blocks, QK_K // 2, QK_K // 4, QK_K // 16)

    scales = scales.view(torch.int8).to(dtype)
    d = d.view(torch.float16).to(dtype)
    # Effective per-group scale: super-scale times each int8 sub-scale.
    d = (d * scales).reshape((n_blocks, QK_K // 16, 1))

    # Unpack the two 4-bit halves of each ql byte (shift by 0 then 4).
    ql = ql.reshape((n_blocks, -1, 1, 64)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
    ql = (ql & 0x0F).reshape((n_blocks, -1, 32))
    # Unpack the four 2-bit fields of each qh byte (shifts 0/2/4/6).
    qh = qh.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 2, 4, 6], device=d.device, dtype=torch.uint8).reshape((1, 1, 4, 1))
    qh = (qh & 0x03).reshape((n_blocks, -1, 32))
    # Recombine to 6-bit values, then shift to signed range [-32, 31].
    q = (ql | (qh << 4)).to(torch.int8) - 32
    q = q.reshape((n_blocks, QK_K // 16, -1))

    return (d * q).reshape((n_blocks, QK_K))
64
+
65
+
66
def get_scale_min(scales):
    """Unpack the 12-byte K-quant scale block into 8 scales and 8 mins.

    *scales* is (n_blocks, 12) bytes. The first four values of each output come
    straight from the low 6 bits; the last four are reassembled from the third
    4-byte group plus the spilled high bits of the first two groups. Returns
    (scales, mins), each (n_blocks, 8) uint8.
    """
    nb = scales.shape[0]
    packed = scales.view(torch.uint8).reshape(nb, 3, 4)

    hi = packed[:, 0:1, :]   # scale bytes (low 6 bits) + spilled high bits
    lo = packed[:, 1:2, :]   # min bytes (low 6 bits) + spilled high bits
    mix = packed[:, 2:3, :]  # nibbles shared between the upper scales/mins

    sc = torch.cat((hi & 0x3F, (mix & 0x0F) | ((hi >> 2) & 0x30)), dim=-1)
    mn = torch.cat((lo & 0x3F, (mix >> 4) | ((lo >> 2) & 0x30)), dim=-1)

    return sc.reshape(nb, 8), mn.reshape(nb, 8)
78
+
79
+
80
def dequantize_blocks_Q5_K(blocks, block_size, type_size, dtype=None):
    """Dequantize Q5_K super-blocks (5-bit quants with packed scales/mins).

    Layout per block: fp16 d, fp16 dmin, K_SCALE_SIZE bytes of packed 6-bit
    scales/mins, QK_K/8 high-bit bytes (qh), then the 4-bit quants (qs).
    Each value is d*scale*q - dmin*min.
    """
    n_blocks = blocks.shape[0]
    d, dmin, scales, qh, qs = split_block_dims(blocks, 2, 2, K_SCALE_SIZE, QK_K // 8)

    d = d.view(torch.float16).to(dtype)
    dmin = dmin.view(torch.float16).to(dtype)

    # Unpack the 8 per-group scales and mins.
    sc, m = get_scale_min(scales)

    d = (d * sc).reshape((n_blocks, -1, 1))
    dm = (dmin * m).reshape((n_blocks, -1, 1))

    # Low 4 bits from qs (two nibbles per byte), 5th bit from qh (one bit of
    # eight per byte, shifts 0..7).
    ql = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
    qh = qh.reshape((n_blocks, -1, 1, 32)) >> torch.arange(0, 8, device=d.device, dtype=torch.uint8).reshape((1, 1, 8, 1))
    ql = (ql & 0x0F).reshape((n_blocks, -1, 32))
    qh = (qh & 0x01).reshape((n_blocks, -1, 32))
    q = ql | (qh << 4)

    return (d * q - dm).reshape((n_blocks, QK_K))
100
+
101
+
102
def dequantize_blocks_Q4_K(blocks, block_size, type_size, dtype=None):
    """Dequantize Q4_K super-blocks (4-bit quants with packed scales/mins).

    Layout per block: fp16 d, fp16 dmin, K_SCALE_SIZE bytes of packed 6-bit
    scales/mins, then the 4-bit quants. Each value is d*scale*q - dmin*min.
    """
    n_blocks = blocks.shape[0]
    d, dmin, scales, qs = split_block_dims(blocks, 2, 2, K_SCALE_SIZE)
    d = d.view(torch.float16).to(dtype)
    dmin = dmin.view(torch.float16).to(dtype)

    # Unpack the 8 per-group scales and mins.
    sc, m = get_scale_min(scales)

    d = (d * sc).reshape((n_blocks, -1, 1))
    dm = (dmin * m).reshape((n_blocks, -1, 1))

    # Two 4-bit quants per byte: low nibble first, then high nibble.
    qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
    qs = (qs & 0x0F).reshape((n_blocks, -1, 32))

    return (d * qs - dm).reshape((n_blocks, QK_K))
118
+
119
+
120
def dequantize_blocks_BF16(blocks, block_size, type_size, dtype=None):
    """Expand raw bfloat16 bytes to float32.

    bf16 is the upper half of an IEEE-754 float32, so widening each 16-bit
    pattern to int32 and shifting it into the high half reproduces the exact
    float32 value. The *dtype* argument is ignored (output is always float32),
    matching the other dequantizers' signature.
    """
    raw = blocks.view(torch.int16)
    widened = raw.to(torch.int32) << 16
    return widened.view(torch.float32)
123
+
124
+
125
+ # Mapping of quantization types to dequantization functions
126
+ GGML_QUANT_SIZES = gguf.GGML_QUANT_SIZES
127
+ DEQUANTIZE_FUNCTIONS = {
128
+ gguf.GGMLQuantizationType.BF16: dequantize_blocks_BF16,
129
+ gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0,
130
+ gguf.GGMLQuantizationType.Q6_K: dequantize_blocks_Q6_K,
131
+ gguf.GGMLQuantizationType.Q5_K: dequantize_blocks_Q5_K,
132
+ gguf.GGMLQuantizationType.Q4_K: dequantize_blocks_Q4_K,
133
+ }
134
+
135
+
136
+ def _quant_shape_from_byte_shape(shape, type_size, block_size):
137
+ """Calculate dequantized shape from quantized byte shape"""
138
+ return (*shape[:-1], shape[-1] // type_size * block_size)
139
+
140
+
141
def dequantize_gguf_tensor(tensor, target_dtype=None):
    """
    Dequantize a GGUF tensor to a regular torch tensor.

    Args:
        tensor: GGUFParameter (raw quantized bytes + quant metadata) or a
            regular tensor, which is passed through (optionally cast).
        target_dtype: Target dtype for the output (default: float32).

    Returns:
        A dense torch tensor with the logical (dequantized) shape.

    Raises:
        ValueError: If the quantization type has no entry in
            DEQUANTIZE_FUNCTIONS.
    """
    # Regular tensors carry no quant metadata — pass through (with cast).
    if not hasattr(tensor, "quant_type"):
        return tensor.to(target_dtype) if target_dtype else tensor

    quant_type = tensor.quant_type

    # F32/F16 payloads are already plain floats; a cast suffices.
    if quant_type in {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}:
        return tensor.to(target_dtype) if target_dtype else tensor

    # Look up the block-level dequantizer for this format.
    if quant_type not in DEQUANTIZE_FUNCTIONS:
        raise ValueError(f"Unsupported quantization type: {quant_type}")

    dequant_fn = DEQUANTIZE_FUNCTIONS[quant_type]
    block_size, type_size = GGML_QUANT_SIZES[quant_type]

    # Reinterpret the storage as raw bytes and derive the logical shape.
    tensor_bytes = tensor.view(torch.uint8)
    shape = _quant_shape_from_byte_shape(tensor_bytes.shape, type_size, block_size)

    # Flatten into (n_blocks, type_size) rows — one quantization block each.
    n_blocks = tensor_bytes.numel() // type_size
    blocks = tensor_bytes.reshape((n_blocks, type_size))

    # Dequantize block-wise, then restore the logical shape.
    dtype = target_dtype if target_dtype else torch.float32
    dequant = dequant_fn(blocks, block_size, type_size, dtype=dtype)
    dequant = dequant.reshape(shape)

    return dequant
182
+
183
+
184
class GGUFParameter(torch.nn.Parameter):
    """
    Custom Parameter class for GGUF quantized tensors.

    Stores the GGML quantization type alongside the raw quantized bytes and
    reports the logical (dequantized) shape via the ``shape`` property.
    """
    def __new__(cls, data, requires_grad=False, quant_type=None):
        """
        Args:
            data: Raw quantized bytes as a tensor (or None for an empty param).
            requires_grad: Gradient flag; quantized weights are inference-only.
            quant_type: gguf.GGMLQuantizationType of `data`, or None when the
                data is not quantized.
        """
        data = data if data is not None else torch.empty(0)
        # Store byte shape before creating the parameter.
        byte_shape = data.shape
        self = torch.Tensor._make_subclass(cls, data, requires_grad)
        self.quant_type = quant_type
        # Fix: only consult the quant-size table for actual quant types.
        # The original indexed GGML_QUANT_SIZES unconditionally, which raised
        # KeyError for the documented default quant_type=None.
        if quant_type is not None:
            block_size, type_size = GGML_QUANT_SIZES[quant_type]
            self.quant_shape = _quant_shape_from_byte_shape(byte_shape, type_size, block_size)
        return self

    @property
    def shape(self):
        """Return the dequantized (logical) shape instead of the byte shape."""
        if hasattr(self, 'quant_shape'):
            return self.quant_shape
        # Fallback: get shape from parent class without causing recursion.
        return object.__getattribute__(self, 'data').shape if hasattr(self, 'data') else torch.Size()
206
+
207
+
208
def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix=""):
    """
    Recursively replace nn.Linear layers with GGUFLinear layers for on-the-fly
    dequantization (based on the ComfyUI-WanVideoWrapper implementation).

    Args:
        model: Module whose children are rewritten in place.
        compute_dtype: Fallback dtype used when the input dtype is not a float type.
        state_dict: State dict whose GGUFParameter weights mark layers to convert.
        prefix: Dotted key prefix of `model` within `state_dict`.

    Returns:
        The (mutated) model, or None for leaf modules.
    """
    from types import MethodType  # hoisted: the original imported this inside the loop

    def _should_convert_to_gguf(state_dict, prefix):
        # Convert only when the corresponding weight is GGUF-quantized.
        weight_key = prefix + "weight"
        return weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter)

    has_children = list(model.children())
    if not has_children:
        return

    try:
        from accelerate import init_empty_weights
        use_accelerate = True
    except ImportError:
        use_accelerate = False

    for name, module in model.named_children():
        module_prefix = prefix + name + "."
        _replace_with_gguf_linear(module, compute_dtype, state_dict, module_prefix)

        if (
            isinstance(module, nn.Linear)
            and not isinstance(module, GGUFLinear)
            and _should_convert_to_gguf(state_dict, module_prefix)
        ):
            # Get correct dimensions from the GGUF parameter shape.
            weight_param = state_dict[module_prefix + "weight"]
            if hasattr(weight_param, 'quant_shape'):
                out_features, in_features = weight_param.quant_shape
            else:
                out_features, in_features = weight_param.shape

            # Detect custom Linear subclasses (e.g. SharedAdaLin) overriding forward.
            module_type = type(module)
            has_custom_forward = (
                module_type != nn.Linear and
                hasattr(module_type, 'forward') and
                module_type.forward is not nn.Linear.forward
            )

            if has_custom_forward:
                # Fix: bind module_type as a default argument. The original
                # closed over the loop variable, so (late binding) every
                # wrapper saw the *last* iteration's module type.
                def wrapped_forward(self, *args, _module_type=module_type, **kwargs):
                    input_tensor = args[0] if args else None
                    # Match the input's float dtype; otherwise fall back to compute_dtype.
                    if input_tensor is not None and hasattr(input_tensor, 'dtype'):
                        target_dtype = input_tensor.dtype if input_tensor.dtype in [torch.float16, torch.bfloat16, torch.float32] else compute_dtype
                    else:
                        target_dtype = compute_dtype

                    # Dequantize weights (and bias, when it is also quantized).
                    dequant_weight = dequantize_gguf_tensor(self.weight, target_dtype=target_dtype)
                    dequant_bias = None
                    if self.bias is not None:
                        if isinstance(self.bias, GGUFParameter):
                            dequant_bias = dequantize_gguf_tensor(self.bias, target_dtype=target_dtype)
                        else:
                            dequant_bias = self.bias

                    # Perform the linear operation.
                    import torch.nn.functional as F
                    linear_output = F.linear(input_tensor, dequant_weight, dequant_bias)

                    # Apply custom reshaping for SharedAdaLin (6 AdaLN chunks).
                    if _module_type.__name__ == 'SharedAdaLin':
                        C = dequant_weight.shape[0] // 6
                        return linear_output.reshape(-1, 1, 6, C)

                    return linear_output

                new_module = GGUFLinear(
                    in_features,
                    out_features,
                    module.bias is not None,
                    compute_dtype=compute_dtype,
                )
                new_module.forward = MethodType(wrapped_forward, new_module)
            else:
                # Standard GGUFLinear replacement; build on the meta device
                # when accelerate is available to avoid allocating throwaway
                # weight storage.
                if use_accelerate:
                    with init_empty_weights():
                        new_module = GGUFLinear(
                            in_features,
                            out_features,
                            module.bias is not None,
                            compute_dtype=compute_dtype,
                        )
                else:
                    new_module = GGUFLinear(
                        in_features,
                        out_features,
                        module.bias is not None,
                        compute_dtype=compute_dtype,
                    )

            model._modules[name] = new_module
            model._modules[name].source_cls = type(module)
            model._modules[name].requires_grad_(False)

    return model
312
+
313
+
314
class GGUFLinear(nn.Linear):
    """
    Linear layer whose weights stay GGUF-quantized in memory and are
    dequantized on the fly at every forward pass.

    Compatible with the Infinity model architecture.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
        compute_dtype=None,
    ):
        super().__init__(in_features, out_features, bias, device, dtype)
        # Fallback dtype used when the input is not a standard float type.
        self.compute_dtype = compute_dtype if compute_dtype else torch.float32

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        Forward pass with on-the-fly dequantization.

        Args:
            input: Input tensor.

        Returns:
            Output tensor after the linear transformation.
        """
        # Prefer the input's own float dtype; otherwise use compute_dtype.
        float_dtypes = [torch.float16, torch.bfloat16, torch.float32]
        target_dtype = input.dtype if input.dtype in float_dtypes else self.compute_dtype

        # GGUF stores linear weights as (out, in), so no transpose is needed.
        dequant_weight = dequantize_gguf_tensor(self.weight, target_dtype=target_dtype)
        dequant_bias = (
            dequantize_gguf_tensor(self.bias, target_dtype=target_dtype)
            if self.bias is not None
            else None
        )

        return torch.nn.functional.linear(input, dequant_weight, dequant_bias)
356
+
357
+
358
def load_gguf_state_dict_with_params(gguf_path, device='cuda'):
    """
    Load a GGUF file into a state dict, keeping quantized tensors as
    GGUFParameters for on-the-fly dequantization.

    For use with _replace_with_gguf_linear.

    Args:
        gguf_path: Path to the .gguf file.
        device: Device the loaded tensors are moved to (default 'cuda').

    Returns:
        Dict mapping tensor names to GGUFParameter (quantized) or
        nn.Parameter (F32/F16) entries.
    """
    from gguf import GGUFReader

    state_dict = {}
    unquantized_types = {
        gguf.GGMLQuantizationType.F32,
        gguf.GGMLQuantizationType.F16,
    }

    for tensor in GGUFReader(gguf_path).tensors:
        torch_tensor = torch.from_numpy(np.array(tensor.data)).to(device)

        if tensor.tensor_type not in unquantized_types:
            # Quantized payload: keep raw bytes wrapped in a GGUFParameter so
            # linear layers can dequantize lazily.
            state_dict[tensor.name] = GGUFParameter(torch_tensor, quant_type=tensor.tensor_type)
            continue

        # F32/F16: restore the row-major shape (GGUF records dims reversed)
        # and store as a regular parameter.
        shape = torch.Size(tuple(int(v) for v in reversed(tensor.shape)))
        torch_tensor = torch_tensor.view(*shape)
        if tensor.tensor_type == gguf.GGMLQuantizationType.F32:
            state_dict[tensor.name] = nn.Parameter(torch_tensor.float())
        else:
            state_dict[tensor.name] = nn.Parameter(torch_tensor.half())

    return state_dict
391
+
392
+
393
def load_gguf_state_dict(gguf_path):
    """
    Load a GGUF file and create a CPU state dict with GGUFParameters.

    Args:
        gguf_path: Path to GGUF file.

    Returns:
        state_dict: Dict mapping tensor names to GGUFParameters (quantized)
        or regular tensors (F32/F16).

    NOTE(review): unlike load_gguf_state_dict_with_params, F32/F16 tensors
    are NOT reshaped here — confirm callers expect the reader's native layout.
    """
    from gguf import GGUFReader

    reader = GGUFReader(gguf_path)
    state_dict = {}
    plain_types = {
        gguf.GGMLQuantizationType.F32,
        gguf.GGMLQuantizationType.F16,
    }

    for tensor in reader.tensors:
        cpu_tensor = torch.from_numpy(np.array(tensor.data)).to('cpu')
        if tensor.tensor_type in plain_types:
            # F32/F16 data is stored as a plain tensor.
            state_dict[tensor.name] = cpu_tensor
        else:
            # Quantized data keeps its metadata for later dequantization.
            state_dict[tensor.name] = GGUFParameter(cpu_tensor, quant_type=tensor.tensor_type)

    return state_dict
426
+
427
+
428
def replace_linear_with_gguf(model, state_dict, compute_dtype=torch.float32, prefix=""):
    """
    Recursively replace nn.Linear layers with GGUFLinear layers where the
    corresponding weight in state_dict is a GGUFParameter.

    Args:
        model: PyTorch model (mutated in place).
        state_dict: State dict with GGUFParameters.
        compute_dtype: Dtype to use for computation.
        prefix: Dotted prefix of `model` within `state_dict`; used internally
            by the recursion, leave as "" for the root model.

    Returns:
        Modified model with GGUFLinear layers.
    """
    for name, module in model.named_children():
        # Fix: build the full dotted state-dict key. The previous
        # get_module_prefix helper returned only the immediate child name,
        # so Linear layers nested deeper than one level never matched their
        # fully-qualified state-dict entries.
        full_name = f"{prefix}{name}"

        # Recursively process children with the extended prefix.
        replace_linear_with_gguf(module, state_dict, compute_dtype, prefix=f"{full_name}.")

        # Check if this is a Linear layer with quantized weights.
        if isinstance(module, nn.Linear):
            weight_key = f"{full_name}.weight"

            if weight_key in state_dict and isinstance(state_dict[weight_key], GGUFParameter):
                # Replace with GGUFLinear. Weights are expected to be loaded
                # afterwards (e.g. via load_state_dict) — this function only
                # swaps the module class. TODO confirm against caller.
                gguf_linear = GGUFLinear(
                    module.in_features,
                    module.out_features,
                    bias=module.bias is not None,
                    compute_dtype=compute_dtype,
                )
                setattr(model, name, gguf_linear)

    return model
466
+
467
+
468
def get_module_prefix(model, module_name):
    """Helper to get the full state-dict prefix for a named child module.

    Simplified placeholder: the prefix is assumed to equal the immediate
    child name. Adjust this if your model nests modules more deeply.
    """
    return module_name
472
+
473
+
474
if __name__ == "__main__":
    # Smoke check: confirm the module imports and report supported quant types.
    supported = list(DEQUANTIZE_FUNCTIONS.keys())
    print("GGUF utilities loaded successfully!")
    print(f"Supported quantization types: {supported}")