slida's picture
download
raw
248 kB
import warnings
import itertools
from typing import Any, Dict, List, Optional, Tuple, Union
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import PeftAdapterMixin
from diffusers.loaders.single_file_model import FromOriginalModelMixin
from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from diffusers.models.attention_processor import Attention
from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.embeddings import get_1d_rotary_pos_embed
from diffusers.models.activations import get_activation
from diffusers.models.embeddings import Timesteps
import importlib.util
import sys
#################MY####################
from dataclasses import dataclass
import numpy as np
#######################################
# The package importlib_metadata is in a different place, depending on the python version.
if sys.version_info < (3, 8):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
def _is_package_available(pkg_name: str):
pkg_exists = importlib.util.find_spec(pkg_name) is not None
pkg_version = "N/A"
if pkg_exists:
try:
pkg_version = importlib_metadata.version(pkg_name)
except (ImportError, importlib_metadata.PackageNotFoundError):
pkg_exists = False
return pkg_exists, pkg_version
# Probe the optional accelerator packages once, when this module is imported.
_triton_available, _triton_version = _is_package_available("triton")
_flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")


def is_triton_available():
    """Return the cached import-time probe result for the `triton` package."""
    return _triton_available


def is_flash_attn_available():
    """Return the cached import-time probe result for the `flash_attn` package."""
    return _flash_attn_available
if is_triton_available():
# from ...ops.triton.layer_norm import RMSNorm
import triton
import triton.language as tl
from typing import Callable
def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
    """Wrap an AMP decorator factory so it can be called the same way on old and new APIs.

    Args:
        dec: The callable to forward to (e.g. a `custom_fwd` / `custom_bwd`).
        cuda_amp_deprecated: When True, every call is forwarded with
            `device_type="cuda"` set in its keyword arguments (replacing any
            value the caller passed under that key).

    Returns:
        A callable that forwards all positional and keyword arguments to `dec`.
    """
    # Keyword arguments that are forced onto every forwarded call.
    forced_kwargs = {"device_type": "cuda"} if cuda_amp_deprecated else {}

    def decorator(*args, **kwargs):
        kwargs.update(forced_kwargs)
        return dec(*args, **kwargs)

    return decorator
# Pick the AMP decorators from whichever namespace this PyTorch build offers.
# When `torch.amp.custom_fwd` exists, the wrappers below are told to pass
# `device_type="cuda"` on every call; otherwise the `torch.cuda.amp` versions
# are used and called without that keyword.
deprecated = hasattr(torch.amp, "custom_fwd")  # type: ignore[attr-defined]
if deprecated:
    from torch.amp import custom_fwd, custom_bwd  # type: ignore[attr-defined]
else:
    from torch.cuda.amp import custom_fwd, custom_bwd
custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
def triton_autotune_configs():
    """Build the list of `triton.Config` candidates used for kernel autotuning.

    One config is produced per power-of-two warp count (1, 2, 4, ...) for
    which `num_warps * warp_size` stays within a 1024-thread block.
    """
    # Maximum threads per block is architecture-dependent in theory, but in reality all are 1024
    max_threads_per_block = 1024
    device_props = torch.cuda.get_device_properties(torch.cuda.current_device())
    # Fall back to a warp size of 32 when the device properties do not report one.
    warp_size = getattr(device_props, "warp_size", 32)

    configs = []
    num_warps = 1
    while num_warps * warp_size <= max_threads_per_block:
        configs.append(triton.Config({}, num_warps=num_warps))
        num_warps *= 2
    return configs
# Fused forward kernel for LayerNorm / RMSNorm. Each program instance handles
# the single row selected by `tl.program_id(0)`:
#   x = X [* rowscale] [-> dropout] [+ X1 (own rowscale/dropout)] [+ RESIDUAL]
#   optionally store x to RESIDUAL_OUT, normalize it, then write
#   Y = x_hat * W [+ B] and, if W1 is given, Y1 = x_hat * W1 [+ B1].
# HAS_X1 / HAS_W1 / HAS_B1 are derived from the pointer arguments by the
# heuristics below; the remaining constexpr flags are passed by the caller.
@triton.autotune(
    configs=triton_autotune_configs(),
    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
)
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    RESIDUAL,  # pointer to the residual
    X1,  # pointer to an optional second input that is added to X
    W1,  # pointer to an optional second set of weights
    B1,  # pointer to an optional second set of biases
    Y1,  # pointer to the output produced with W1 / B1
    RESIDUAL_OUT,  # pointer to where the pre-normalization sum is stored
    ROWSCALE,  # per-row multipliers (entries for X1 start at offset M)
    SEEDS,  # Dropout seeds for each row (entries for X1 start at offset M)
    DROPOUT_MASK,  # where keep-masks are stored (rows for X1 start at row M)
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_res_row,
    stride_res_out_row,
    stride_x1_row,
    stride_y1_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,  # Dropout probability
    zero_centered_weight,  # If true, add 1.0 to the weight
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_RESIDUAL: tl.constexpr,
    STORE_RESIDUAL_OUT: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_DROPOUT: tl.constexpr,
    STORE_DROPOUT_MASK: tl.constexpr,
    HAS_ROWSCALE: tl.constexpr,
    HAS_X1: tl.constexpr,
    HAS_W1: tl.constexpr,
    HAS_B1: tl.constexpr,
):
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    X += row * stride_x_row
    Y += row * stride_y_row
    if HAS_RESIDUAL:
        RESIDUAL += row * stride_res_row
    if STORE_RESIDUAL_OUT:
        RESIDUAL_OUT += row * stride_res_out_row
    if HAS_X1:
        X1 += row * stride_x1_row
    if HAS_W1:
        Y1 += row * stride_y1_row
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    # Columns at or beyond N are padding: they load as 0 and are never stored.
    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
    if HAS_ROWSCALE:
        rowscale = tl.load(ROWSCALE + row).to(tl.float32)
        x *= rowscale
    if HAS_DROPOUT:
        # Compute dropout mask
        # 7 rounds is good enough, and reduces register pressure
        keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
        # Kept elements are rescaled by 1 / (1 - dropout_p); dropped ones become 0.
        x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
        if STORE_DROPOUT_MASK:
            tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
    if HAS_X1:
        x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
        if HAS_ROWSCALE:
            # The second input reads its scale from the block starting at offset M.
            rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
            x1 *= rowscale
        if HAS_DROPOUT:
            # Compute dropout mask
            # 7 rounds is good enough, and reduces register pressure
            keep_mask = (
                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
            )
            x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
            if STORE_DROPOUT_MASK:
                tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
        x += x1
    if HAS_RESIDUAL:
        residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
        x += residual
    if STORE_RESIDUAL_OUT:
        # Saved value is the full pre-normalization sum computed above.
        tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        xbar = tl.where(cols < N, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        # RMS norm: no mean subtraction, and Mean is not written.
        xbar = tl.where(cols < N, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if zero_centered_weight:
        w += 1.0
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    # Write output
    tl.store(Y + cols, y, mask=mask)
    if HAS_W1:
        # Second output: the same normalized row with the second weight/bias.
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
        if zero_centered_weight:
            w1 += 1.0
        if HAS_B1:
            b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
        y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
        tl.store(Y1 + cols, y1, mask=mask)
def _layer_norm_fwd(
    x,
    weight,
    bias,
    eps,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    dropout_p=0.0,
    rowscale=None,
    out_dtype=None,
    residual_dtype=None,
    zero_centered_weight=False,
    is_rms_norm=False,
    return_dropout_mask=False,
    out=None,
    residual_out=None
):
    """Check inputs, allocate buffers and launch `_layer_norm_fwd_1pass_kernel`.

    `x` must be 2-D `(M, N)` with unit stride in the last dimension; the
    optional tensors are checked against that shape with assertions.

    Returns:
        `(out, y1, mean, rstd, residual_out_or_x, seeds, dropout_mask,
        dropout_mask1)`. `y1` is None without `weight1`, `mean` is None for
        RMS norm, `seeds` is None without dropout, and the fifth element is
        `x` itself when no separate residual output was needed.

    Raises:
        RuntimeError: if a row does not fit into a single fused block.
    """

    def _row_stride(tensor):
        # Row stride handed to the kernel; 0 stands in for an absent tensor.
        return 0 if tensor is None else tensor.stride(0)

    has_residual = residual is not None
    has_x1 = x1 is not None
    if has_residual:
        # An explicit residual always dictates the residual dtype.
        residual_dtype = residual.dtype

    M, N = x.shape
    assert x.stride(-1) == 1
    if has_residual:
        assert residual.stride(-1) == 1
        assert residual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if has_x1:
        assert x1.shape == x.shape
        assert rowscale is None
        assert x1.stride(-1) == 1
    if weight1 is not None:
        assert weight1.shape == (N,)
        assert weight1.stride(-1) == 1
    if bias1 is not None:
        assert bias1.shape == (N,)
        assert bias1.stride(-1) == 1
    if rowscale is not None:
        assert rowscale.is_contiguous()
        assert rowscale.shape == (M,)

    # Primary output: reuse the caller's buffer when one is supplied.
    if out is not None:
        assert out.shape == x.shape
    else:
        out = torch.empty_like(x, dtype=out_dtype if out_dtype is not None else x.dtype)
    assert out.stride(-1) == 1

    # Secondary output exists only when a second weight is given.
    y1 = None
    if weight1 is not None:
        y1 = torch.empty_like(out)
        assert y1.stride(-1) == 1

    has_dropout = dropout_p > 0.0
    # The pre-normalization sum differs from `x` (in value or dtype) in any
    # of these cases, so it needs its own buffer.
    needs_residual_out = (
        has_residual
        or (residual_dtype is not None and residual_dtype != x.dtype)
        or has_dropout
        or rowscale is not None
        or has_x1
    )
    if needs_residual_out:
        if residual_out is not None:
            assert residual_out.shape == x.shape
        else:
            residual_out = torch.empty(
                M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype
            )
        assert residual_out.stride(-1) == 1
    else:
        residual_out = None

    mean = None if is_rms_norm else torch.empty((M,), dtype=torch.float32, device=x.device)
    rstd = torch.empty((M,), dtype=torch.float32, device=x.device)

    # With a second input, seeds and masks carry a second block of M rows.
    rand_rows = 2 * M if has_x1 else M
    if has_dropout:
        seeds = torch.randint(2**32, (rand_rows,), device=x.device, dtype=torch.int64)
    else:
        seeds = None
    if return_dropout_mask and has_dropout:
        dropout_mask = torch.empty(rand_rows, N, device=x.device, dtype=torch.bool)
    else:
        dropout_mask = None

    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")

    # One program per row; argument order must match the kernel signature.
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[(M,)](
            x,
            out,
            weight,
            bias,
            residual,
            x1,
            weight1,
            bias1,
            y1,
            residual_out,
            rowscale,
            seeds,
            dropout_mask,
            mean,
            rstd,
            x.stride(0),
            out.stride(0),
            _row_stride(residual),
            _row_stride(residual_out),
            _row_stride(x1),
            _row_stride(y1),
            M,
            N,
            eps,
            dropout_p,
            zero_centered_weight,
            is_rms_norm,
            BLOCK_N,
            has_residual,
            residual_out is not None,
            bias is not None,
            has_dropout,
            dropout_mask is not None,
            rowscale is not None,
        )

    # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
    dropout_mask1 = None
    if dropout_mask is not None and has_x1:
        # First half belongs to x, second half to x1.
        dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
    return (
        out,
        y1,
        mean,
        rstd,
        x if residual_out is None else residual_out,
        seeds,
        dropout_mask,
        dropout_mask1,
    )
# Fused backward kernel for LayerNorm / RMSNorm. Each program instance walks
# `rows_per_program` consecutive rows starting at
# `tl.program_id(0) * rows_per_program`, writes the input gradient for each
# row, and accumulates weight/bias gradients that are stored once per program
# at row `program_id` of DW / DB (and DW1 / DB1). Summing those partial rows
# is left to the caller.
# HAS_ROWSCALE / HAS_DY1 / HAS_DX1 / HAS_B1 / RECOMPUTE_OUTPUT are derived
# from the pointer arguments by the heuristics below.
@triton.autotune(
    configs=triton_autotune_configs(),
    key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"],
)
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
@triton.jit
def _layer_norm_bwd_kernel(
    X,  # pointer to the input
    W,  # pointer to the weights
    B,  # pointer to the biases
    Y,  # pointer to the output to be recomputed
    DY,  # pointer to the output gradient
    DX,  # pointer to the input gradient
    DW,  # pointer to the partial sum of weights gradient
    DB,  # pointer to the partial sum of biases gradient
    DRESIDUAL,  # pointer to an incoming gradient that is added to dx
    W1,  # pointer to the second set of weights
    DY1,  # pointer to the gradient of the second output
    DX1,  # pointer to the gradient of the second input
    DW1,  # pointer to the partial sum of second-weights gradient
    DB1,  # pointer to the partial sum of second-biases gradient
    DRESIDUAL_IN,  # pointer to where dx is stored before dropout / rowscale
    ROWSCALE,  # per-row multipliers applied to dx
    SEEDS,  # dropout seeds per row (entries for the second input start at offset M)
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_dy_row,
    stride_dx_row,
    stride_dres_row,
    stride_dy1_row,
    stride_dx1_row,
    stride_dres_in_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,
    zero_centered_weight,
    rows_per_program,
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_DRESIDUAL: tl.constexpr,
    STORE_DRESIDUAL: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_DROPOUT: tl.constexpr,
    HAS_ROWSCALE: tl.constexpr,
    HAS_DY1: tl.constexpr,
    HAS_DX1: tl.constexpr,
    HAS_B1: tl.constexpr,
    RECOMPUTE_OUTPUT: tl.constexpr,
):
    # Map the program id to the elements of X, DX, and DY it should compute.
    row_block_id = tl.program_id(0)
    row_start = row_block_id * rows_per_program
    # Do not early exit if row_start >= M, because we need to write DW and DB
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    # Advance every row-indexed pointer to this program's first row.
    X += row_start * stride_x_row
    if HAS_DRESIDUAL:
        DRESIDUAL += row_start * stride_dres_row
    if STORE_DRESIDUAL:
        DRESIDUAL_IN += row_start * stride_dres_in_row
    DY += row_start * stride_dy_row
    DX += row_start * stride_dx_row
    if HAS_DY1:
        DY1 += row_start * stride_dy1_row
    if HAS_DX1:
        DX1 += row_start * stride_dx1_row
    if RECOMPUTE_OUTPUT:
        Y += row_start * stride_y_row
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if zero_centered_weight:
        w += 1.0
    # The bias is only needed to recompute the forward output.
    if RECOMPUTE_OUTPUT and HAS_BIAS:
        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_DY1:
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
        if zero_centered_weight:
            w1 += 1.0
    # Per-program accumulators for the parameter gradients.
    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_DY1:
        dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
        if HAS_B1:
            db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
    row_end = min((row_block_id + 1) * rows_per_program, M)
    for row in range(row_start, row_end):
        # Load data to SRAM
        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
        if HAS_DY1:
            dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
        if not IS_RMS_NORM:
            mean = tl.load(Mean + row)
        rstd = tl.load(Rstd + row)
        # Compute dx
        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
        # Zero the padding columns so they do not contribute to the sums below.
        xhat = tl.where(mask, xhat, 0.0)
        if RECOMPUTE_OUTPUT:
            y = xhat * w + b if HAS_BIAS else xhat * w
            tl.store(Y + cols, y, mask=mask)
        wdy = w * dy
        dw += dy * xhat
        if HAS_BIAS:
            db += dy
        if HAS_DY1:
            # The second output shares xhat, so its gradient is added in here.
            wdy += w1 * dy1
            dw1 += dy1 * xhat
            if HAS_B1:
                db1 += dy1
        if not IS_RMS_NORM:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            c2 = tl.sum(wdy, axis=0) / N
            dx = (wdy - (xhat * c1 + c2)) * rstd
        else:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            dx = (wdy - xhat * c1) * rstd
        if HAS_DRESIDUAL:
            dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
            dx += dres
        # Write dx
        if STORE_DRESIDUAL:
            # Stored before dropout / rowscale are applied to dx below.
            tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
        if HAS_DX1:
            if HAS_DROPOUT:
                # Regenerate the keep-mask from the seed stored at offset M + row.
                keep_mask = (
                    tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
                )
                dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
            else:
                dx1 = dx
            tl.store(DX1 + cols, dx1, mask=mask)
        if HAS_DROPOUT:
            keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
            dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
        if HAS_ROWSCALE:
            rowscale = tl.load(ROWSCALE + row).to(tl.float32)
            dx *= rowscale
        tl.store(DX + cols, dx, mask=mask)
        # Step all row-indexed pointers to the next row.
        X += stride_x_row
        if HAS_DRESIDUAL:
            DRESIDUAL += stride_dres_row
        if STORE_DRESIDUAL:
            DRESIDUAL_IN += stride_dres_in_row
        if RECOMPUTE_OUTPUT:
            Y += stride_y_row
        DY += stride_dy_row
        DX += stride_dx_row
        if HAS_DY1:
            DY1 += stride_dy1_row
        if HAS_DX1:
            DX1 += stride_dx1_row
    # One partial-sum row per program.
    tl.store(DW + row_block_id * N + cols, dw, mask=mask)
    if HAS_BIAS:
        tl.store(DB + row_block_id * N + cols, db, mask=mask)
    if HAS_DY1:
        tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
        if HAS_B1:
            tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)
def _layer_norm_bwd(
    dy,
    x,
    weight,
    bias,
    eps,
    mean,
    rstd,
    dresidual=None,
    dy1=None,
    weight1=None,
    bias1=None,
    seeds=None,
    dropout_p=0.0,
    rowscale=None,
    has_residual=False,
    has_x1=False,
    zero_centered_weight=False,
    is_rms_norm=False,
    x_dtype=None,
    recompute_output=False,
):
    """Check inputs, allocate buffers and launch `_layer_norm_bwd_kernel`.

    `x` and `dy` must be 2-D `(M, N)` with unit stride in the last dimension;
    optional tensors are checked against that shape with assertions.

    Returns:
        `(dx, dw, db, dresidual_in, dx1, dw1, db1)`, with the recomputed
        forward output `y` appended when `recompute_output` is True. Entries
        are None when the corresponding input was not supplied.

    Raises:
        RuntimeError: if a row does not fit into a single fused block.
    """
    M, N = x.shape
    assert x.stride(-1) == 1
    assert dy.stride(-1) == 1
    assert dy.shape == (M, N)
    if dresidual is not None:
        assert dresidual.stride(-1) == 1
        assert dresidual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if dy1 is not None:
        assert weight1 is not None
        assert dy1.shape == dy.shape
        assert dy1.stride(-1) == 1
    if weight1 is not None:
        assert weight1.shape == (N,)
        assert weight1.stride(-1) == 1
    if bias1 is not None:
        assert bias1.shape == (N,)
        assert bias1.stride(-1) == 1
    if seeds is not None:
        assert seeds.is_contiguous()
        assert seeds.shape == (M if not has_x1 else M * 2,)
    if rowscale is not None:
        assert rowscale.is_contiguous()
        assert rowscale.shape == (M,)
    # allocate output
    dx = (
        torch.empty_like(x)
        if x_dtype is None
        else torch.empty(M, N, dtype=x_dtype, device=x.device)
    )
    # A separate residual-gradient buffer is only needed when it cannot simply
    # alias dx (different dtype, or dx is further transformed in the kernel).
    dresidual_in = (
        torch.empty_like(x)
        if has_residual
        and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
        else None
    )
    dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
    y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
    if recompute_output:
        assert weight1 is None, "recompute_output is not supported with parallel LayerNorm"
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    # Increasing the multiple (e.g. 8) will allow more thread blocks to be launched and hide the
    # latency of the gmem reads/writes, but will increase the time of summing up dw / db.
    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
    # Each program writes one row of partial sums; they are reduced below.
    _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
    _db = (
        torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
        if bias is not None
        else None
    )
    _dw1 = torch.empty_like(_dw) if weight1 is not None else None
    # Shaped like _dw rather than _db: _db is None when only the second
    # branch has a bias, and empty_like(None) would raise.
    _db1 = torch.empty_like(_dw) if bias1 is not None else None
    rows_per_program = math.ceil(M / sm_count)
    grid = (sm_count,)
    # Argument order must match the kernel signature.
    with torch.cuda.device(x.device.index):
        _layer_norm_bwd_kernel[grid](
            x,
            weight,
            bias,
            y,
            dy,
            dx,
            _dw,
            _db,
            dresidual,
            weight1,
            dy1,
            dx1,
            _dw1,
            _db1,
            dresidual_in,
            rowscale,
            seeds,
            mean,
            rstd,
            x.stride(0),
            0 if not recompute_output else y.stride(0),
            dy.stride(0),
            dx.stride(0),
            dresidual.stride(0) if dresidual is not None else 0,
            dy1.stride(0) if dy1 is not None else 0,
            dx1.stride(0) if dx1 is not None else 0,
            dresidual_in.stride(0) if dresidual_in is not None else 0,
            M,
            N,
            eps,
            dropout_p,
            zero_centered_weight,
            rows_per_program,
            is_rms_norm,
            BLOCK_N,
            dresidual is not None,
            dresidual_in is not None,
            bias is not None,
            dropout_p > 0.0,
        )
    # Reduce the per-program partial sums and cast back to the parameter dtypes.
    dw = _dw.sum(0).to(weight.dtype)
    db = _db.sum(0).to(bias.dtype) if bias is not None else None
    dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
    db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
    # Don't need to compute dresidual_in separately in this case
    if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
        dresidual_in = dx
    if has_x1 and dropout_p == 0.0:
        dx1 = dx
    return (
        (dx, dw, db, dresidual_in, dx1, dw1, db1)
        if not recompute_output
        else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
    )
class LayerNormFn(torch.autograd.Function):
    """Autograd wrapper around the fused LayerNorm / RMSNorm forward and backward.

    The return value of `forward` depends on the flags: `y` alone, or a tuple
    `(y[, y1][, residual_out][, dropout_mask, dropout_mask1])` where `y1` is
    present when `weight1` is given, `residual_out` when `prenorm` is True and
    the two masks when `return_dropout_mask` is True.
    """

    @staticmethod
    def _pack_outputs(
        y, y1, residual_out, dropout_mask, dropout_mask1, has_weight1, prenorm, return_dropout_mask
    ):
        """Assemble the forward return value according to the configuration flags."""
        outputs = [y]
        if has_weight1:
            outputs.append(y1)
        if prenorm:
            outputs.append(residual_out)
        if return_dropout_mask:
            outputs.extend((dropout_mask, dropout_mask1))
        return outputs[0] if len(outputs) == 1 else tuple(outputs)

    @staticmethod
    def forward(
        ctx,
        x,
        weight,
        bias,
        residual=None,
        x1=None,
        weight1=None,
        bias1=None,
        eps=1e-6,
        dropout_p=0.0,
        rowscale=None,
        prenorm=False,
        residual_in_fp32=False,
        zero_centered_weight=False,
        is_rms_norm=False,
        return_dropout_mask=False,
        out=None,
        residual_out=None
    ):
        """Run the fused forward pass; inputs are flattened to 2-D over the last dim."""
        x_shape_og = x.shape
        # Check for zero sequence length
        if x.numel() == 0:
            ctx.zero_seq_length = True
            # Only record the metadata backward needs to build zero gradients.
            # ctx.save_for_backward(weight, bias, weight1, bias1)
            ctx.x_shape_og = x_shape_og
            ctx.weight_shape = weight.shape
            ctx.weight_dtype = weight.dtype
            ctx.weight_device = weight.device
            ctx.has_bias = bias is not None
            ctx.bias_shape = bias.shape if bias is not None else None
            ctx.bias_dtype = bias.dtype if bias is not None else None
            ctx.bias_device = bias.device if bias is not None else None
            ctx.has_weight1 = weight1 is not None
            ctx.weight1_shape = weight1.shape if weight1 is not None else None
            ctx.weight1_dtype = weight1.dtype if weight1 is not None else None
            ctx.weight1_device = weight1.device if weight1 is not None else None
            ctx.has_bias1 = bias1 is not None
            ctx.bias1_shape = bias1.shape if bias1 is not None else None
            ctx.bias1_dtype = bias1.dtype if bias1 is not None else None
            ctx.bias1_device = bias1.device if bias1 is not None else None
            ctx.has_residual = residual is not None
            ctx.has_x1 = x1 is not None
            ctx.dropout_p = dropout_p
            # Handle output tensors with correct dtype
            y = x  # Preserve input tensor properties
            # The second output is returned whenever weight1 is given (as in
            # the regular path), so allocate it on that condition, not on x1.
            y1 = torch.empty_like(x) if weight1 is not None else None
            # Only create residual_out if prenorm is True
            residual_out = torch.empty(x.shape,
                                       dtype=torch.float32 if residual_in_fp32 else x.dtype,
                                       device=x.device) if prenorm else None
            # Handle dropout masks
            dropout_mask = None
            dropout_mask1 = None
            if return_dropout_mask:
                dropout_mask = torch.empty_like(x, dtype=torch.uint8)
                if x1 is not None:
                    dropout_mask1 = torch.empty_like(x, dtype=torch.uint8)
            # Return based on configuration
            return LayerNormFn._pack_outputs(
                y, y1, residual_out, dropout_mask, dropout_mask1,
                weight1 is not None, prenorm, return_dropout_mask,
            )
        ctx.zero_seq_length = False
        # reshape input data into 2D tensor
        x = x.reshape(-1, x.shape[-1])
        if x.stride(-1) != 1:
            x = x.contiguous()
        if residual is not None:
            assert residual.shape == x_shape_og
            residual = residual.reshape(-1, residual.shape[-1])
            if residual.stride(-1) != 1:
                residual = residual.contiguous()
        if x1 is not None:
            assert x1.shape == x_shape_og
            assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
            x1 = x1.reshape(-1, x1.shape[-1])
            if x1.stride(-1) != 1:
                x1 = x1.contiguous()
        weight = weight.contiguous()
        if bias is not None:
            bias = bias.contiguous()
        if weight1 is not None:
            weight1 = weight1.contiguous()
        if bias1 is not None:
            bias1 = bias1.contiguous()
        if rowscale is not None:
            rowscale = rowscale.reshape(-1).contiguous()
        residual_dtype = (
            residual.dtype
            if residual is not None
            else (torch.float32 if residual_in_fp32 else None)
        )
        if out is not None:
            out = out.reshape(-1, out.shape[-1])
        if residual_out is not None:
            residual_out = residual_out.reshape(-1, residual_out.shape[-1])
        y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
            x,
            weight,
            bias,
            eps,
            residual,
            x1,
            weight1,
            bias1,
            dropout_p=dropout_p,
            rowscale=rowscale,
            residual_dtype=residual_dtype,
            zero_centered_weight=zero_centered_weight,
            is_rms_norm=is_rms_norm,
            return_dropout_mask=return_dropout_mask,
            out=out,
            residual_out=residual_out
        )
        # The tensor saved first is the pre-normalization sum (or x itself);
        # backward unpacks it under the name `x`.
        ctx.save_for_backward(
            residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
        )
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.dropout_p = dropout_p
        ctx.is_rms_norm = is_rms_norm
        ctx.has_residual = residual is not None
        ctx.has_x1 = x1 is not None
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype
        ctx.zero_centered_weight = zero_centered_weight
        # Restore the caller's original shape on everything that is returned.
        y = y.reshape(x_shape_og)
        y1 = y1.reshape(x_shape_og) if y1 is not None else None
        residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None
        dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
        dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
        return LayerNormFn._pack_outputs(
            y, y1, residual_out, dropout_mask, dropout_mask1,
            weight1 is not None, prenorm, return_dropout_mask,
        )

    @staticmethod
    def backward(ctx, dy, *args):
        """Return one gradient (or None) for each of the 17 `forward` inputs."""
        if ctx.zero_seq_length:
            # Nothing was computed in forward: hand back zero gradients.
            return (
                torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device),
                torch.zeros(ctx.weight_shape, dtype=ctx.weight_dtype, device=ctx.weight_device),
                torch.zeros(ctx.bias_shape, dtype=ctx.bias_dtype, device=ctx.bias_device) if ctx.has_bias else None,
                torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device) if ctx.has_residual else None,
                torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device) if ctx.has_x1 and ctx.dropout_p > 0.0 else None,
                torch.zeros(ctx.weight1_shape, dtype=ctx.weight1_dtype, device=ctx.weight1_device) if ctx.has_weight1 else None,
                torch.zeros(ctx.bias1_shape, dtype=ctx.bias1_dtype, device=ctx.bias1_device) if ctx.has_bias1 else None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
            )
        x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
        dy = dy.reshape(-1, dy.shape[-1])
        if dy.stride(-1) != 1:
            dy = dy.contiguous()
        assert dy.shape == x.shape
        # Extra incoming gradients arrive in the order forward returned its
        # outputs: first the one for y1 (if any), then the one for residual_out.
        if weight1 is not None:
            dy1, args = args[0], args[1:]
            dy1 = dy1.reshape(-1, dy1.shape[-1])
            if dy1.stride(-1) != 1:
                dy1 = dy1.contiguous()
            assert dy1.shape == x.shape
        else:
            dy1 = None
        if ctx.prenorm:
            dresidual = args[0]
            dresidual = dresidual.reshape(-1, dresidual.shape[-1])
            if dresidual.stride(-1) != 1:
                dresidual = dresidual.contiguous()
            assert dresidual.shape == x.shape
        else:
            dresidual = None
        dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
            dy,
            x,
            weight,
            bias,
            ctx.eps,
            mean,
            rstd,
            dresidual,
            dy1,
            weight1,
            bias1,
            seeds,
            ctx.dropout_p,
            rowscale,
            ctx.has_residual,
            ctx.has_x1,
            ctx.zero_centered_weight,
            ctx.is_rms_norm,
            x_dtype=ctx.x_dtype,
        )
        return (
            dx.reshape(ctx.x_shape_og),
            dw,
            db,
            dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
            dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
            dw1,
            db1,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
def rms_norm_fn(
    x,
    weight,
    bias,
    residual=None,
    x1=None,
    weight1=None,
    bias1=None,
    eps=1e-6,
    dropout_p=0.0,
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
    zero_centered_weight=False,
    return_dropout_mask=False,
    out=None,
    residual_out=None
):
    """Functional entry point for the fused RMS norm.

    Forwards every argument to `LayerNormFn.apply` with `is_rms_norm` fixed to
    True and returns whatever that call returns.
    """
    is_rms_norm = True
    # `apply` is called positionally, so the order must follow
    # `LayerNormFn.forward`'s parameter list.
    positional_args = (
        x,
        weight,
        bias,
        residual,
        x1,
        weight1,
        bias1,
        eps,
        dropout_p,
        rowscale,
        prenorm,
        residual_in_fp32,
        zero_centered_weight,
        is_rms_norm,
        return_dropout_mask,
        out,
        residual_out,
    )
    return LayerNormFn.apply(*positional_args)
class RMSNorm(torch.nn.Module):
    """RMS normalization module that delegates its forward pass to `rms_norm_fn`.

    Holds a learnable `weight` of size `hidden_size` and no bias (the `bias`
    parameter is registered as None).
    """

    def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, zero_centered_weight=False,
                 device=None, dtype=None):
        super().__init__()
        self.eps = eps
        # A Dropout module is kept only as a holder for the probability `p`.
        self.drop = torch.nn.Dropout(dropout_p) if dropout_p > 0.0 else None
        self.zero_centered_weight = zero_centered_weight
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
        self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialize `weight` to zeros when zero-centered, otherwise to ones."""
        if self.zero_centered_weight:
            fill_ = torch.nn.init.zeros_
        else:
            fill_ = torch.nn.init.ones_
        fill_(self.weight)

    def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
        """Apply the fused RMS norm; dropout is only active in training mode."""
        dropout_active = self.drop is not None and self.training
        active_dropout_p = self.drop.p if dropout_active else 0.0
        return rms_norm_fn(
            x,
            self.weight,
            self.bias,
            residual=residual,
            eps=self.eps,
            dropout_p=active_dropout_p,
            prenorm=prenorm,
            residual_in_fp32=residual_in_fp32,
            zero_centered_weight=self.zero_centered_weight,
        )
else:
from torch.nn import RMSNorm
warnings.warn("Cannot import triton, install triton to use fused RMSNorm for better performance")
def swiglu(x, y):
    """Return `silu(x) * y`, evaluating the SiLU in float32 and casting back to `x.dtype`."""
    gate = F.silu(x.float(), inplace=False)
    gate = gate.to(x.dtype)
    return gate * y
logger = logging.get_logger(__name__)
class TimestepEmbedding(nn.Module):
    """Two-layer MLP (`linear_1` -> activation -> `linear_2`) for timestep features.

    Args:
        in_channels: Size of the input feature vector.
        time_embed_dim: Output size of `linear_1`.
        act_fn: Name passed to `get_activation` for the hidden activation.
        out_dim: Output size of `linear_2`; defaults to `time_embed_dim`.
        post_act_fn: Optional activation name applied after `linear_2`.
        cond_proj_dim: When given, a bias-free projection from this size to
            `in_channels` is created and added to the input in `forward`.
        sample_proj_bias: Whether `linear_1` and `linear_2` have a bias.
    """

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        act_fn: str = "silu",
        out_dim: Optional[int] = None,
        post_act_fn: Optional[str] = None,
        cond_proj_dim=None,
        sample_proj_bias=True,
    ):
        super().__init__()
        self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
        if cond_proj_dim is not None:
            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
        else:
            self.cond_proj = None
        self.act = get_activation(act_fn)
        if out_dim is not None:
            time_embed_dim_out = out_dim
        else:
            time_embed_dim_out = time_embed_dim
        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
        if post_act_fn is None:
            self.post_act = None
        else:
            self.post_act = get_activation(post_act_fn)
        self.initialize_weights()

    def initialize_weights(self):
        """Draw both weight matrices from N(0, 0.02^2) and zero any biases."""
        for linear in (self.linear_1, self.linear_2):
            nn.init.normal_(linear.weight, std=0.02)
            # The bias is None when the layer was built with
            # sample_proj_bias=False; there is nothing to initialize then.
            if linear.bias is not None:
                nn.init.zeros_(linear.bias)

    def forward(self, sample, condition=None):
        """Embed `sample`, optionally adding the projected `condition` first."""
        if condition is not None:
            sample = sample + self.cond_proj(condition)
        sample = self.linear_1(sample)
        if self.act is not None:
            sample = self.act(sample)
        sample = self.linear_2(sample)
        if self.post_act is not None:
            sample = self.post_act(sample)
        return sample
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> torch.Tensor:
    """
    Apply rotary embeddings to a single query or key tensor `x`.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to rotate. The code flattens from dim 3, so a 4-D input is expected.
        freqs_cis (`Tuple[torch.Tensor]` or `torch.Tensor`):
            With `use_real=True`, a `(cos, sin)` pair; two leading singleton dims are added to each before they
            are multiplied with `x`. With `use_real=False`, a complex tensor; a singleton dim is inserted at
            position 2 before it is multiplied with the complex view of `x`.
        use_real (`bool`):
            Selects between the real `(cos, sin)` formulation and the complex formulation.
        use_real_unbind_dim (`int`):
            Only used when `use_real=True`. `-1` treats consecutive pairs in the last dim as (real, imag);
            `-2` treats the first and second half of the last dim as (real, imag).
    Returns:
        `torch.Tensor`: the rotated tensor, with the same dtype as `x`.
    Raises:
        ValueError: if `use_real=True` and `use_real_unbind_dim` is neither -1 nor -2.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)
        if use_real_unbind_dim == -1:
            # Used for flux, cogvideox, hunyuan-dit
            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        elif use_real_unbind_dim == -2:
            # Used for Stable Audio, OmniGen and CogView4
            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
        else:
            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
        # Do the arithmetic in float32, then cast back to the input dtype.
        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
        return out
    else:
        # used for lumina
        # x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], x.shape[-1] // 2, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
        return x_out.type_as(x)
class BOOGURotaryPosEmbed(nn.Module):
def __init__(self, theta: int,
axes_dim: Tuple[int, int, int],
axes_lens: Tuple[int, int, int] = (300, 512, 512),
patch_size: int = 2):
super().__init__()
self.theta = theta
self.axes_dim = axes_dim
self.axes_lens = axes_lens
self.patch_size = patch_size
@staticmethod
def get_freqs_cis(axes_dim: Tuple[int, int, int],
                  axes_lens: Tuple[int, int, int],
                  theta: int) -> List[torch.Tensor]:
    """Build one rotary frequency table per axis via `get_1d_rotary_pos_embed`.

    The i-th entry is computed from `axes_dim[i]` and `axes_lens[i]`.
    float32 is requested when the MPS backend is available, float64 otherwise.
    """
    freqs_dtype = torch.float64
    if torch.backends.mps.is_available():
        freqs_dtype = torch.float32
    return [
        get_1d_rotary_pos_embed(dim, length, theta=theta, freqs_dtype=freqs_dtype)
        for dim, length in zip(axes_dim, axes_lens)
    ]
def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
device = ids.device
if ids.device.type == "mps":
ids = ids.to("cpu")
result = []
for i in range(len(self.axes_dim)):
freqs = freqs_cis[i].to(ids.device)
index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64)
result.append(torch.gather(freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index))
return torch.cat(result, dim=-1).to(device)
def forward(
self,
freqs_cis,
attention_mask,
l_effective_ref_img_len,
l_effective_img_len,
ref_img_sizes,
img_sizes,
device
):
batch_size = len(attention_mask)
p = self.patch_size
encoder_seq_len = attention_mask.shape[1]
l_effective_cap_len = attention_mask.sum(dim=1).tolist()
seq_lengths = [cap_len + sum(ref_img_len) + img_len for cap_len, ref_img_len, img_len in zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len)]
max_seq_len = max(seq_lengths)
max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
max_img_len = max(l_effective_img_len)
# Create position IDs
position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)
for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
# add text position ids
position_ids[i, :cap_seq_len] = repeat(torch.arange(cap_seq_len, dtype=torch.int32, device=device), "l -> l 3")
pe_shift = cap_seq_len
pe_shift_len = cap_seq_len
if ref_img_sizes[i] is not None:
for ref_img_size, ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
H, W = ref_img_size
ref_H_tokens, ref_W_tokens = H // p, W // p
assert ref_H_tokens * ref_W_tokens == ref_img_len
# add image position ids
row_ids = repeat(torch.arange(ref_H_tokens, dtype=torch.int32, device=device), "h -> h w", w=ref_W_tokens).flatten()
col_ids = repeat(torch.arange(ref_W_tokens, dtype=torch.int32, device=device), "w -> h w", h=ref_H_tokens).flatten()
position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 0] = pe_shift
position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 1] = row_ids
position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 2] = col_ids
pe_shift += max(ref_H_tokens, ref_W_tokens)
pe_shift_len += ref_img_len
H, W = img_sizes[i]
H_tokens, W_tokens = H // p, W // p
assert H_tokens * W_tokens == l_effective_img_len[i]
row_ids = repeat(torch.arange(H_tokens, dtype=torch.int32, device=device), "h -> h w", w=W_tokens).flatten()
col_ids = repeat(torch.arange(W_tokens, dtype=torch.int32, device=device), "w -> h w", h=H_tokens).flatten()
assert pe_shift_len + l_effective_img_len[i] == seq_len
position_ids[i, pe_shift_len: seq_len, 0] = pe_shift
position_ids[i, pe_shift_len: seq_len, 1] = row_ids
position_ids[i, pe_shift_len: seq_len, 2] = col_ids
# Get combined rotary embeddings
freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)
# create separate rotary embeddings for captions and images
cap_freqs_cis = torch.zeros(
batch_size, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
)
ref_img_freqs_cis = torch.zeros(
batch_size, max_ref_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
)
img_freqs_cis = torch.zeros(
batch_size, max_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
)
for i, (cap_seq_len, ref_img_len, img_len, seq_len) in enumerate(zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len, seq_lengths)):
cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
ref_img_freqs_cis[i, :sum(ref_img_len)] = freqs_cis[i, cap_seq_len:cap_seq_len + sum(ref_img_len)]
img_freqs_cis[i, :img_len] = freqs_cis[i, cap_seq_len + sum(ref_img_len):cap_seq_len + sum(ref_img_len) + img_len]
return (
cap_freqs_cis,
ref_img_freqs_cis,
img_freqs_cis,
freqs_cis,
l_effective_cap_len,
seq_lengths,
)
###################################################################my double stream block#######################################################################
class BOOGUDoubleStreamRotaryPosEmbed(nn.Module):
    """Rotary position embedding lookup for the double-stream (text / image) layout.

    Position ids are built exactly as for a packed caption / reference-image /
    image sequence: caption tokens use their running index on all three axes,
    image tokens share one per-image offset on axis 0 and use patch row / column
    on axes 1 and 2. In addition to the per-segment slices, ``forward`` returns a
    combined "reference images followed by target image" slice and its lengths.

    Args:
        theta: Stored on the module; ``get_freqs_cis`` takes its own ``theta``.
        axes_dim: One rotary dimension per axis (three axes).
        axes_lens: Number of tabulated positions per axis.
        patch_size: Divisor turning an image ``(H, W)`` into a ``(H // p, W // p)`` token grid.
    """

    def __init__(self, theta: int,
                 axes_dim: Tuple[int, int, int],
                 axes_lens: Tuple[int, int, int] = (300, 512, 512),
                 patch_size: int = 2):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim
        self.axes_lens = axes_lens
        self.patch_size = patch_size

    @staticmethod
    def get_freqs_cis(axes_dim: Tuple[int, int, int],
                      axes_lens: Tuple[int, int, int],
                      theta: int) -> List[torch.Tensor]:
        """Return one frequency table per axis, built by ``get_1d_rotary_pos_embed``."""
        # float32 is selected whenever an MPS backend is available, float64 otherwise.
        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
        return [
            get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype)
            for d, e in zip(axes_dim, axes_lens)
        ]

    def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
        """Look up ``ids[..., axis]`` in ``freqs_cis[axis]`` and concatenate the axes.

        ``ids`` is indexed as ``[batch, seq, axis]``; the result is
        ``[batch, seq, sum of table widths]`` on the original device of ``ids``.
        """
        device = ids.device
        if ids.device.type == "mps":
            # The lookup is performed on CPU for MPS inputs and moved back afterwards.
            ids = ids.to("cpu")
        ids = ids.to(torch.int64)
        # Plain indexing yields the same values as gather over a per-sample copy of
        # the table, without materialising that copy.
        per_axis = [
            freqs_cis[axis].to(ids.device)[ids[:, :, axis]]
            for axis in range(len(self.axes_dim))
        ]
        return torch.cat(per_axis, dim=-1).to(device)

    @staticmethod
    def _grid_ids(h_tokens: int, w_tokens: int, device) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return flattened (row, column) ids of an ``h_tokens x w_tokens`` grid in row-major order."""
        rows = torch.arange(h_tokens, dtype=torch.int32, device=device).repeat_interleave(w_tokens)
        cols = torch.arange(w_tokens, dtype=torch.int32, device=device).repeat(h_tokens)
        return rows, cols

    def _build_position_ids(
        self,
        l_effective_cap_len,
        seq_lengths,
        l_effective_ref_img_len,
        l_effective_img_len,
        ref_img_sizes,
        img_sizes,
        device,
    ) -> torch.Tensor:
        """Create the zero-padded ``[batch, max_seq_len, 3]`` int32 position-id tensor."""
        p = self.patch_size
        position_ids = torch.zeros(len(seq_lengths), max(seq_lengths), 3, dtype=torch.int32, device=device)

        for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
            # Caption tokens: the same running index on all three axes.
            position_ids[i, :cap_seq_len] = torch.arange(
                cap_seq_len, dtype=torch.int32, device=device
            ).unsqueeze(-1)

            pe_shift = cap_seq_len  # axis-0 id shared by all tokens of the next image
            offset = cap_seq_len    # index of the next free slot in the packed sequence

            if ref_img_sizes[i] is not None:
                for (H, W), ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
                    ref_H_tokens, ref_W_tokens = H // p, W // p
                    assert ref_H_tokens * ref_W_tokens == ref_img_len
                    row_ids, col_ids = self._grid_ids(ref_H_tokens, ref_W_tokens, device)
                    block = position_ids[i, offset:offset + ref_img_len]  # view into position_ids
                    block[:, 0] = pe_shift
                    block[:, 1] = row_ids
                    block[:, 2] = col_ids
                    # Each image advances the axis-0 id by its longer grid side.
                    pe_shift += max(ref_H_tokens, ref_W_tokens)
                    offset += ref_img_len

            H, W = img_sizes[i]
            H_tokens, W_tokens = H // p, W // p
            assert H_tokens * W_tokens == l_effective_img_len[i]
            assert offset + l_effective_img_len[i] == seq_len
            row_ids, col_ids = self._grid_ids(H_tokens, W_tokens, device)
            position_ids[i, offset:seq_len, 0] = pe_shift
            position_ids[i, offset:seq_len, 1] = row_ids
            position_ids[i, offset:seq_len, 2] = col_ids

        return position_ids

    def forward(
        self,
        freqs_cis,
        attention_mask,
        l_effective_ref_img_len,
        l_effective_img_len,
        ref_img_sizes,
        img_sizes,
        device
    ):
        """Compute rotary embeddings for the packed sequence, its segments and the image stream.

        Args:
            freqs_cis: Per-axis frequency tables (see ``get_freqs_cis``).
            attention_mask: Caption mask indexed ``[batch, caption_len]``; its row sums
                are the effective caption lengths.
            l_effective_ref_img_len: Per sample, the token count of each reference image.
            l_effective_img_len: Per sample, the token count of the target image.
            ref_img_sizes: Per sample, ``None`` or a list of ``(H, W)`` reference sizes.
            img_sizes: Per sample, the ``(H, W)`` of the target image.
            device: Device on which the ids and outputs are created.

        Returns:
            ``(cap_freqs_cis, ref_img_freqs_cis, img_freqs_cis, freqs_cis,
            l_effective_cap_len, seq_lengths, combined_img_freqs_cis,
            combined_img_seq_lengths)``. The ``*_freqs_cis`` slices are zero-padded;
            the combined slice holds reference-image tokens followed by image tokens.
        """
        batch_size = len(attention_mask)
        encoder_seq_len = attention_mask.shape[1]
        # A floating-point mask makes .tolist() return floats, which are not valid
        # slice bounds; normalise the lengths to plain ints.
        l_effective_cap_len = [int(n) for n in attention_mask.sum(dim=1).tolist()]
        ref_totals = [sum(ref_img_len) for ref_img_len in l_effective_ref_img_len]
        seq_lengths = [
            cap_len + ref_total + img_len
            for cap_len, ref_total, img_len in zip(l_effective_cap_len, ref_totals, l_effective_img_len)
        ]
        max_ref_img_len = max(ref_totals)
        max_img_len = max(l_effective_img_len)

        position_ids = self._build_position_ids(
            l_effective_cap_len,
            seq_lengths,
            l_effective_ref_img_len,
            l_effective_img_len,
            ref_img_sizes,
            img_sizes,
            device,
        )

        # Combined rotary embeddings for the packed sequence.
        freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)
        feat_dim = freqs_cis.shape[-1]

        # Separate, zero-padded rotary embeddings for captions and images.
        cap_freqs_cis = torch.zeros(
            batch_size, encoder_seq_len, feat_dim, device=device, dtype=freqs_cis.dtype
        )
        ref_img_freqs_cis = torch.zeros(
            batch_size, max_ref_img_len, feat_dim, device=device, dtype=freqs_cis.dtype
        )
        img_freqs_cis = torch.zeros(
            batch_size, max_img_len, feat_dim, device=device, dtype=freqs_cis.dtype
        )

        # Image-stream lengths (reference images + target image) for each sample.
        combined_img_seq_lengths = [
            ref_total + img_len for ref_total, img_len in zip(ref_totals, l_effective_img_len)
        ]
        combined_img_freqs_cis = torch.zeros(
            batch_size, max(combined_img_seq_lengths), feat_dim, device=device, dtype=freqs_cis.dtype
        )

        for i, (cap_seq_len, ref_total, img_len) in enumerate(
            zip(l_effective_cap_len, ref_totals, l_effective_img_len)
        ):
            img_start = cap_seq_len + ref_total
            img_end = img_start + img_len
            cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
            ref_img_freqs_cis[i, :ref_total] = freqs_cis[i, cap_seq_len:img_start]
            img_freqs_cis[i, :img_len] = freqs_cis[i, img_start:img_end]
            # Reference-image tokens and image tokens are contiguous in the packed
            # sequence, so the combined slice is a single copy.
            combined_img_freqs_cis[i, :ref_total + img_len] = freqs_cis[i, cap_seq_len:img_end]

        return (
            cap_freqs_cis,
            ref_img_freqs_cis,
            img_freqs_cis,
            freqs_cis,
            l_effective_cap_len,
            seq_lengths,
            combined_img_freqs_cis,
            combined_img_seq_lengths,
        )
class BOOGUPromptTuningRotaryPosEmbed(nn.Module):
    """
    Rotary position embeddings (and a matching attention mask) for prompt-tuning tokens.

    The trainable prompt tokens are positioned like a short text sequence: token
    ``k`` gets position ``k`` for ``k`` in ``0 .. num_trainable_prompt_tokens - 1``.

    Args:
        theta: Base frequency forwarded to ``get_1d_rotary_pos_embed``.
        dim: Rotary dimension forwarded to ``get_1d_rotary_pos_embed``.
        num_trainable_prompt_tokens: Number of prompt tokens, i.e. the sequence length.
    """

    def __init__(self, theta: int, dim: int , num_trainable_prompt_tokens: int):
        super().__init__()
        self.theta = theta
        self.num_trainable_prompt_tokens = num_trainable_prompt_tokens
        self.dim = dim

    def forward(self, batch_size: int, device: torch.device, use_causal_mask: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Build the rotary embeddings and attention mask for one batch.

        Args:
            batch_size: Number of samples the outputs are expanded to.
            device: Device of the returned tensors.
            use_causal_mask: When True return a lower-triangular mask, otherwise an
                all-True per-token mask.

        Returns:
            ``(rotary_emb, attention_mask)``:
            - rotary_emb: ``[B, num_tokens, table_width]`` (the original author notes
              ``get_1d_rotary_pos_embed`` returns ``[seq_len, dim // 2]`` in complex form
              - confirm against the diffusers version in use).
            - attention_mask: ``[B, num_tokens, num_tokens]`` bool when causal,
              ``[B, num_tokens]`` bool otherwise.
        """
        num_tokens = self.num_trainable_prompt_tokens

        # float32 is selected whenever an MPS backend is available, float64 otherwise.
        table_dtype = torch.float64
        if torch.backends.mps.is_available():
            table_dtype = torch.float32

        table = get_1d_rotary_pos_embed(
            self.dim,
            num_tokens,
            theta=self.theta,
            freqs_dtype=table_dtype
        )

        # Sequential positions 0 .. num_tokens - 1, one per prompt token.
        positions = torch.arange(num_tokens, dtype=torch.int64, device=table.device)
        per_token = table[positions]

        # Broadcast over the batch, then move to the requested device.
        rotary_emb = per_token.unsqueeze(0).expand(batch_size, -1, -1).to(device)

        if not use_causal_mask:
            # Every prompt token is marked valid.
            attention_mask = torch.ones(
                batch_size, num_tokens,
                dtype=torch.bool, device=device
            )
            return rotary_emb, attention_mask

        # mask[i, j] is True iff i >= j: token i may attend to itself and earlier tokens.
        lower_triangle = torch.ones(
            num_tokens, num_tokens,
            dtype=torch.bool, device=device
        ).tril()
        attention_mask = lower_triangle.unsqueeze(0).expand(batch_size, -1, -1)
        return rotary_emb, attention_mask
##########################################################################################################################################
class LuminaRMSNormZero(nn.Module):
    """
    Adaptive RMS normalization that also emits the gates / scales of a transformer block.

    Parameters:
        embedding_dim (`int`): The size of each embedding vector.
        norm_eps (`float`): Epsilon passed to the RMS norm.
        norm_elementwise_affine (`bool`): Accepted for interface compatibility; it is
            not forwarded to the norm layer by this implementation.
    """

    def __init__(
        self,
        embedding_dim: int,
        norm_eps: float,
        norm_elementwise_affine: bool,
    ):
        super().__init__()
        # The conditioning vector is at most 1024 wide.
        cond_dim = min(embedding_dim, 1024)
        self.silu = nn.SiLU()
        self.linear = nn.Linear(cond_dim, 4 * embedding_dim, bias=True)
        self.norm = RMSNorm(embedding_dim, eps=norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        emb: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(modulated_x, gate_msa, scale_mlp, gate_mlp)`` for conditioning ``emb``."""
        modulation = self.linear(self.silu(emb))
        # Four equal chunks along dim 1: attention scale/gate, then MLP scale/gate.
        scale_msa, gate_msa, scale_mlp, gate_mlp = torch.chunk(modulation, 4, dim=1)
        normed = self.norm(x)
        return normed * (1 + scale_msa[:, None]), gate_msa, scale_mlp, gate_mlp
class LuminaLayerNormContinuous(nn.Module):
def __init__(
self,
embedding_dim: int,
conditioning_embedding_dim: int,
# NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
# because the output is immediately scaled and shifted by the projected conditioning embeddings.
# Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
# However, this is how it was implemented in the original code, and it's rather likely you should
# set `elementwise_affine` to False.
elementwise_affine=True,
eps=1e-5,
bias=True,
norm_type="layer_norm",
out_dim: Optional[int] = None,
):
super().__init__()
# AdaLN
self.silu = nn.SiLU()
self.linear_1 = nn.Linear(conditioning_embedding_dim, embedding_dim, bias=bias)
if norm_type == "layer_norm":
self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias)
elif norm_type == "rms_norm":
self.norm = RMSNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
else:
raise ValueError(f"unknown norm_type {norm_type}")
self.linear_2 = None
if out_dim is not None:
self.linear_2 = nn.Linear(embedding_dim, out_dim, bias=bias)
def forward(
self,
x: torch.Tensor,
conditioning_embedding: torch.Tensor,
) -> torch.Tensor:
# convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT)
emb = self.linear_1(self.silu(conditioning_embedding).to(x.dtype))
scale = emb
x = self.norm(x) * (1 + scale)[:, None, :]
if self.linear_2 is not None:
x = self.linear_2(x)
return x
class LuminaFeedForward(nn.Module):
    r"""
    Gated feed-forward layer: `linear_2(swiglu(linear_1(x), linear_3(x)))`.

    Parameters:
        dim (`int`):
            Width of the layer's input and output.
        inner_dim (`int`):
            Requested width of the hidden projection, before scaling and rounding.
        multiple_of (`int`, *optional*):
            The hidden width is rounded up to a multiple of this value.
        ffn_dim_multiplier (`float`, *optional*):
            Custom multiplier applied to `inner_dim` before rounding. Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        inner_dim: int,
        multiple_of: Optional[int] = 256,
        ffn_dim_multiplier: Optional[float] = None,
    ):
        super().__init__()
        self.swiglu = swiglu

        # Optional custom scaling of the hidden width.
        hidden = inner_dim if ffn_dim_multiplier is None else int(ffn_dim_multiplier * inner_dim)
        # Round up to the next multiple of `multiple_of`.
        n_blocks = (hidden + multiple_of - 1) // multiple_of
        hidden = multiple_of * n_blocks

        self.linear_1 = nn.Linear(dim, hidden, bias=False)
        self.linear_2 = nn.Linear(hidden, dim, bias=False)
        self.linear_3 = nn.Linear(dim, hidden, bias=False)

    def forward(self, x):
        """Apply the two input projections, combine them with `swiglu`, and project back."""
        first = self.linear_1(x)
        second = self.linear_3(x)
        return self.linear_2(self.swiglu(first, second))
class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
    """Embed the diffusion timestep and the caption / instruction features.

    Args:
        hidden_size: Output width of the caption projection; the timestep embedding
            width is ``min(hidden_size, 1024)``.
        text_feat_dim: Width of the incoming caption / instruction features.
        frequency_embedding_size: Number of channels of the sinusoidal timestep projection.
        norm_eps: Epsilon of the RMS norm applied to the caption features.
        timestep_scale: ``scale`` forwarded to the ``Timesteps`` projection.
    """

    def __init__(
        self,
        hidden_size: int = 4096,
        text_feat_dim: int = 2048,
        frequency_embedding_size: int = 256,
        norm_eps: float = 1e-5,
        timestep_scale: float = 1.0,
    ) -> None:
        super().__init__()
        self.time_proj = Timesteps(
            num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=timestep_scale
        )
        self.timestep_embedder = TimestepEmbedding(
            in_channels=frequency_embedding_size, time_embed_dim=min(hidden_size, 1024)
        )
        # The constructor no longer prints `text_feat_dim`: that was a debugging
        # leftover writing to stdout on every instantiation.
        self.caption_embedder = nn.Sequential(
            RMSNorm(text_feat_dim, eps=norm_eps),
            nn.Linear(text_feat_dim, hidden_size, bias=True),
        )
        self._initialize_weights()

    def _initialize_weights(self):
        """Truncated-normal init (std 0.02) for the caption projection weight, zeros for its bias."""
        nn.init.trunc_normal_(self.caption_embedder[1].weight, std=0.02)
        nn.init.zeros_(self.caption_embedder[1].bias)

    def forward(
        self, timestep: torch.Tensor, instruction_hidden_states: torch.Tensor, dtype: torch.dtype
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(time_embed, caption_embed)``.

        Args:
            timestep: Timesteps to embed.
            instruction_hidden_states: Caption / instruction features whose last
                dimension is ``text_feat_dim``.
            dtype: Dtype the sinusoidal projection is cast to before the timestep MLP.
        """
        timestep_proj = self.time_proj(timestep).to(dtype=dtype)
        time_embed = self.timestep_embedder(timestep_proj)
        caption_embed = self.caption_embedder(instruction_hidden_states)
        return time_embed, caption_embed
### default AttnProcessor
# class OmniGen2AttnProcessor:
# """
# Processor for implementing scaled dot-product attention.
# This processor is optimized for PyTorch 2.0 and implements:
# - Flash attention with variable length sequences
# - Rotary position embeddings (RoPE)
# - Query-Key normalization
# - Proportional attention scaling
# Args:
# None
# Raises:
# ImportError: If PyTorch version is less than 2.0
# """
# def __init__(self) -> None:
# """Initialize the attention processor."""
# if not hasattr(F, "scaled_dot_product_attention"):
# raise ImportError(
# "OmniGen2AttnProcessorFlash2Varlen requires PyTorch 2.0. "
# "Please upgrade PyTorch to version 2.0 or later."
# )
# def __call__(
# self,
# attn: Attention,
# hidden_states: torch.Tensor,
# encoder_hidden_states: torch.Tensor,
# attention_mask: Optional[torch.Tensor] = None,
# image_rotary_emb: Optional[torch.Tensor] = None,
# base_sequence_length: Optional[int] = None,
# ) -> torch.Tensor:
# """
# Process attention computation with flash attention.
# Args:
# attn: Attention module
# hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
# encoder_hidden_states: Encoder hidden states tensor
# attention_mask: Optional attention mask tensor
# image_rotary_emb: Optional rotary embeddings for image tokens
# base_sequence_length: Optional base sequence length for proportional attention
# Returns:
# torch.Tensor: Processed hidden states after attention computation
# """
# batch_size, sequence_length, _ = hidden_states.shape
# # Get Query-Key-Value Pair
# query = attn.to_q(hidden_states)
# key = attn.to_k(encoder_hidden_states)
# value = attn.to_v(encoder_hidden_states)
# query_dim = query.shape[-1]
# inner_dim = key.shape[-1]
# head_dim = query_dim // attn.heads
# dtype = query.dtype
# # Get key-value heads
# kv_heads = inner_dim // head_dim
# # Reshape tensors for attention computation
# query = query.view(batch_size, -1, attn.heads, head_dim)
# key = key.view(batch_size, -1, kv_heads, head_dim)
# value = value.view(batch_size, -1, kv_heads, head_dim)
# # Apply Query-Key normalization
# if attn.norm_q is not None:
# query = attn.norm_q(query)
# if attn.norm_k is not None:
# key = attn.norm_k(key)
# # Apply Rotary Position Embeddings
# if image_rotary_emb is not None:
# query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
# key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
# query, key = query.to(dtype), key.to(dtype)
# # Calculate attention scale
# if base_sequence_length is not None:
# softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
# else:
# softmax_scale = attn.scale
# # scaled_dot_product_attention expects attention_mask shape to be
# # (batch, heads, source_length, target_length)
# if attention_mask is not None:
# attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
# query = query.transpose(1, 2)
# key = key.transpose(1, 2)
# value = value.transpose(1, 2)
# # explicitly repeat key and value to match query length, otherwise using enable_gqa=True results in MATH backend of sdpa in our test of pytorch2.6
# key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
# value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
# hidden_states = F.scaled_dot_product_attention(
# query, key, value, attn_mask=attention_mask, scale=softmax_scale
# )
# hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
# hidden_states = hidden_states.type_as(query)
# # Apply output projection
# hidden_states = attn.to_out[0](hidden_states)
# hidden_states = attn.to_out[1](hidden_states)
# return hidden_states
#####################################################################my Attention Processor######################################################################################################
####################debug############################
from webdataset.utils import pytorch_worker_info
#####################################################
class BOOGUDoubleStreamSelfAttnProcessorFlash2Varlen(nn.Module):
"""
Double-stream self-attention processor with flash attention and variable length sequences.
This processor implements YAK-style double-stream attention where:
- Text and image features are processed separately to generate QKV
- QKV are concatenated and processed together for cross-modal attention
- Uses flash attention for efficient computation
- Supports both standard and causal attention masks
Args:
head_dim: Dimension of each attention head
num_attention_heads: Number of attention heads for queries
num_kv_heads: Number of key-value heads
qkv_bias: Whether to use bias in QKV linear layers
"""
def __init__(self, head_dim: int, num_attention_heads: int, num_kv_heads: int, qkv_bias: bool = False) -> None:
    """Create the per-stream Q/K/V and output projections.

    Raises:
        ImportError: If the `flash_attn` package is not available.
    """
    super().__init__()
    if not is_flash_attn_available():
        raise ImportError(
            "BOOGUDoubleStreamSelfAttnProcessorFlash2Varlen requires flash_attn. "
            "Please install flash_attn."
        )

    self.head_dim = head_dim
    self.num_attention_heads = num_attention_heads
    self.num_kv_heads = num_kv_heads

    # Queries span all attention heads; keys / values span only the KV heads.
    q_width = head_dim * num_attention_heads
    kv_width = head_dim * num_kv_heads

    def _projection(out_width: int) -> nn.Linear:
        # Every projection reads `q_width` features and shares the same bias setting.
        return nn.Linear(q_width, out_width, bias=qkv_bias)

    # Image stream.
    self.img_to_q = _projection(q_width)
    self.img_to_k = _projection(kv_width)
    self.img_to_v = _projection(kv_width)
    # Text stream.
    self.txt_to_q = _projection(q_width)
    self.txt_to_k = _projection(kv_width)
    self.txt_to_v = _projection(kv_width)
    # Separate output projections for each stream.
    self.txt_out = _projection(q_width)
    self.img_out = _projection(q_width)

    self.initialize_weights()
# ########################################debug###############################################
# rank, world_size, worker, num_workers = pytorch_worker_info(None)
# print(f"#######################init rank: {rank} : #self.img_to_q: {self.img_to_q.weight.sum(dim=-1)}################################")
# ############################################################################################
def initialize_weights(self) -> None:
    """
    Re-initialize every projection of the processor.

    Weights get Xavier-uniform initialization; biases, when the layers were
    created with biases, are set to zero.
    """
    # Order: image Q/K/V, text Q/K/V, then the two output projections.
    projections = (
        self.img_to_q,
        self.img_to_k,
        self.img_to_v,
        self.txt_to_q,
        self.txt_to_k,
        self.txt_to_v,
        self.txt_out,
        self.img_out,
    )

    for projection in projections:
        nn.init.xavier_uniform_(projection.weight)

    # All layers share one bias setting, so a single check decides for all of them.
    if self.img_to_q.bias is None:
        return
    for projection in projections:
        nn.init.zeros_(projection.bias)
def _upad_input(
    self,
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    num_heads: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
    """
    Remove padded positions from Q/K/V for variable-length flash attention.

    Args:
        query_layer: Query tensor; reshaped with `num_heads` heads when it has the
            same length as the keys.
        key_layer: Key tensor unpacked as `[batch, kv_seq_len, kv_heads, head_dim]`.
        value_layer: Value tensor, reshaped like `key_layer`.
        attention_mask: Mask whose non-zero entries mark the positions to keep
            (assumed `[batch, kv_seq_len]` - confirm against the caller).
        query_length: Length of the query sequence.
        num_heads: Number of query heads.

    Returns:
        `(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_q, max_seqlen_k))`.
    """

    def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
        """Return kept flat indices, cumulative sequence lengths and the longest length."""
        seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
        indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
        max_seqlen_in_batch = seqlens_in_batch.max().item()
        # Prepend a zero so that cu_seqlens[i]:cu_seqlens[i + 1] delimits sample i.
        cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
        return indices, cu_seqlens, max_seqlen_in_batch

    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    # Unpad key and value layers.
    key_layer = index_first_axis(
        key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
        indices_k,
    )
    value_layer = index_first_axis(
        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
        indices_k,
    )

    if query_length == kv_seq_len:
        # Self-attention case: queries share the key layout and indices.
        query_layer = index_first_axis(
            query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
            indices_k,
        )
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        # One query token per sample: nothing to unpad.
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # Queries cover only the last `query_length` positions of the mask.
        attention_mask = attention_mask[:, -query_length:]
        # Some flash-attn releases return a fifth element from `unpad_input`;
        # `*_` accepts both the 4- and 5-value forms.
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )
def _concat_text_image_features(
self,
img_hidden_states_list: List[torch.Tensor],
txt_hidden_states_list: List[torch.Tensor],
encoder_seq_lengths: List[int],
seq_lengths: List[int],
) -> List[torch.Tensor]:
"""
Concatenate text and image features following YAK's logic (text first, then image).
Args:
img_hidden_states_list: List of image tensors [img_query, img_key, img_value]
txt_hidden_states_list: List of text tensors [txt_query, txt_key, txt_value]
encoder_seq_lengths: Text sequence lengths for each sample [B]
seq_lengths: Total sequence lengths for each sample [B]
Returns:
List of concatenated tensors [query, key, value]
"""
assert len(img_hidden_states_list) == len(txt_hidden_states_list), \
f"Length mismatch: img_list={len(img_hidden_states_list)}, txt_list={len(txt_hidden_states_list)}"
batch_size = img_hidden_states_list[0].shape[0]
max_seq_len = max(seq_lengths)
concatenated_list = []
for img_tensor, txt_tensor in zip(img_hidden_states_list, txt_hidden_states_list):
# Ensure tensors are on the same device
device = img_tensor.device
if txt_tensor.device != device:
txt_tensor = txt_tensor.to(device)
# Create output tensor with proper shape [B, max_seq_len, feature_dim]
feature_dim = img_tensor.shape[-1]
concatenated = img_tensor.new_zeros(batch_size, max_seq_len, feature_dim)
# Concatenate text first, then image for each sample
for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
# Place text tokens first
concatenated[i, :encoder_seq_len] = txt_tensor[i, :encoder_seq_len]
# Place image tokens after text
concatenated[i, encoder_seq_len:seq_len] = img_tensor[i, :seq_len - encoder_seq_len]
concatenated_list.append(concatenated)
return concatenated_list
def _split_text_image_features(
self,
hidden_states_list: List[torch.Tensor],
encoder_seq_lengths: List[int],
seq_lengths: List[int],
) -> List[Tuple[torch.Tensor, torch.Tensor]]:
"""
Split concatenated features back to text and image features.
Inverse operation of _concat_text_image_features.
Args:
hidden_states_list: List of concatenated tensors (usually just one element)
encoder_seq_lengths: Text sequence lengths for each sample [B]
seq_lengths: Total sequence lengths for each sample [B]
Returns:
List of tuples, each containing (txt_hidden_states, img_hidden_states)
"""
result_list = []
for hidden_states in hidden_states_list:
batch_size = hidden_states.shape[0]
feature_dim = hidden_states.shape[-1]
# Get maximum lengths for text and image
max_txt_len = max(encoder_seq_lengths)
max_img_len = max(seq_len - encoder_seq_len for seq_len, encoder_seq_len in zip(seq_lengths, encoder_seq_lengths))
# Create output tensors [B, max_len, feature_dim]
txt_hidden_states = hidden_states.new_zeros(batch_size, max_txt_len, feature_dim)
img_hidden_states = hidden_states.new_zeros(batch_size, max_img_len, feature_dim)
# Split each sample back to text and image
for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
img_len = seq_len - encoder_seq_len
# Extract text portion
txt_hidden_states[i, :encoder_seq_len] = hidden_states[i, :encoder_seq_len]
# Extract image portion
img_hidden_states[i, :img_len] = hidden_states[i, encoder_seq_len:seq_len]
result_list.append((txt_hidden_states, img_hidden_states))
return result_list
def __call__(
    self,
    attn: Attention,
    img_hidden_states: torch.Tensor,
    txt_hidden_states: torch.Tensor,
    joint_attention_mask: Optional[torch.Tensor] = None,
    rotary_emb: Optional[torch.Tensor] = None,
    encoder_seq_lengths: Optional[List[int]] = None,  # [B] - Text sequence lengths for each sample
    seq_lengths: Optional[List[int]] = None,  # [B] - Total sequence lengths for each sample
    base_sequence_length: Optional[int] = None,
) -> torch.Tensor:
    """
    Process double-stream self-attention computation with flash attention.

    Text and image tokens get their own Q/K/V projections, are packed into one
    joint sequence (text first, then image), attended jointly with
    ``flash_attn_varlen_func``, split again for the per-stream output projections
    and finally re-packed and passed through ``attn.to_out``.

    Args:
        attn: Attention module (provides ``heads``, ``scale``, ``norm_q``, ``norm_k``, ``to_out``)
        img_hidden_states: Image hidden states tensor [B, L_img, D]
        txt_hidden_states: Text hidden states tensor [B, L_txt, D]
        joint_attention_mask: Combined attention mask; a 3-D mask is additionally
            probed to decide whether causal attention is requested
        rotary_emb: Rotary embeddings for the joint sequence
        encoder_seq_lengths: Text sequence lengths for each sample [B] (required)
        seq_lengths: Total sequence lengths for each sample [B] (required)
        base_sequence_length: Optional base sequence length for proportional attention
    Returns:
        torch.Tensor: Processed hidden states after attention computation
    Raises:
        TypeError: If ``encoder_seq_lengths`` or ``seq_lengths`` is not provided
    """
    # Both length lists are mandatory even though the signature defaults them to
    # None; fail here with a clear message instead of deep inside the packing helper.
    if encoder_seq_lengths is None or seq_lengths is None:
        raise TypeError(
            "Double-stream attention requires both `encoder_seq_lengths` and `seq_lengths`; got None."
        )

    batch_size = img_hidden_states.shape[0]

    # Ensure Q, K, V and output linear layers are on the same device as the inputs.
    # nn.Module.to() moves parameters in place, so the result need not be rebound.
    device = img_hidden_states.device
    for layer in (
        self.img_to_q, self.img_to_k, self.img_to_v,
        self.txt_to_q, self.txt_to_k, self.txt_to_v,
        self.txt_out, self.img_out,
    ):
        if layer.weight.device != device:
            layer.to(device)

    # Generate Q, K, V for image and text streams (NO head reshaping yet)
    img_query = self.img_to_q(img_hidden_states)  # [B, L_img, query_dim]
    img_key = self.img_to_k(img_hidden_states)  # [B, L_img, kv_dim]
    img_value = self.img_to_v(img_hidden_states)  # [B, L_img, kv_dim]
    txt_query = self.txt_to_q(txt_hidden_states)  # [B, L_txt, query_dim]
    txt_key = self.txt_to_k(txt_hidden_states)  # [B, L_txt, kv_dim]
    txt_value = self.txt_to_v(txt_hidden_states)  # [B, L_txt, kv_dim]

    # Pack each of Q/K/V into a joint sequence: text first, then image.
    query, key, value = self._concat_text_image_features(
        [img_query, img_key, img_value],
        [txt_query, txt_key, txt_value],
        encoder_seq_lengths,
        seq_lengths,
    )  # [B, max_seq_len, feature_dim] each

    sequence_length = max(seq_lengths)
    query_dim = query.shape[-1]
    inner_dim = key.shape[-1]
    head_dim = query_dim // attn.heads
    dtype = query.dtype
    # Number of key/value heads follows from the K projection width.
    kv_heads = inner_dim // head_dim

    # Reshape tensors for attention computation
    query = query.view(batch_size, -1, attn.heads, head_dim)
    key = key.view(batch_size, -1, kv_heads, head_dim)
    value = value.view(batch_size, -1, kv_heads, head_dim)

    # Apply Query-Key normalization
    if attn.norm_q is not None:
        query = attn.norm_q(query)
    if attn.norm_k is not None:
        key = attn.norm_k(key)

    # Apply Rotary Position Embeddings
    if rotary_emb is not None:
        query = apply_rotary_emb(query, rotary_emb, use_real=False)
        key = apply_rotary_emb(key, rotary_emb, use_real=False)
    # Cast back to the projection dtype in case the steps above changed it.
    query, key = query.to(dtype), key.to(dtype)

    # Calculate attention scale (proportional attention when a base length is given)
    if base_sequence_length is not None:
        softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
    else:
        softmax_scale = attn.scale

    # Detect if we have a causal mask: a 3-D mask whose first sample is exactly
    # lower-triangular ones. For efficiency only the first sample is checked.
    is_causal = False
    if joint_attention_mask is not None and joint_attention_mask.dim() == 3:
        mask_sample = joint_attention_mask[0]  # [seq_len, seq_len]
        is_causal = torch.allclose(mask_sample, torch.tril(torch.ones_like(mask_sample)))

    # Unpad input for flash attention
    (
        query_states,
        key_states,
        value_states,
        indices_q,
        cu_seq_lens,
        max_seq_lens,
    ) = self._upad_input(query, key, value, joint_attention_mask, sequence_length, attn.heads)
    cu_seqlens_q, cu_seqlens_k = cu_seq_lens
    max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

    # Handle different number of heads: repeat each K/V head to match the query heads.
    if kv_heads < attn.heads:
        key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
        value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)

    # Apply flash attention with causal parameter
    attn_output_unpad = flash_attn_varlen_func(
        query_states,
        key_states,
        value_states,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_in_batch_q,
        max_seqlen_k=max_seqlen_in_batch_k,
        dropout_p=0.0,
        causal=is_causal,  # Use detected causal setting
        softmax_scale=softmax_scale,
    )

    # Pad output back to [B, sequence_length, heads, head_dim] and merge the heads.
    hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
    hidden_states = hidden_states.flatten(-2)
    hidden_states = hidden_states.type_as(query)

    # Split hidden_states back to text and image, apply separate output projections, then merge
    txt_hidden_states, img_hidden_states = self._split_text_image_features(
        [hidden_states], encoder_seq_lengths, seq_lengths
    )[0]  # [B, max_txt_len, feature_dim], [B, max_img_len, feature_dim]
    txt_projected = self.txt_out(txt_hidden_states)  # [B, max_txt_len, feature_dim]
    img_projected = self.img_out(img_hidden_states)  # [B, max_img_len, feature_dim]

    # Merge back to joint representation
    hidden_states = self._concat_text_image_features(
        [img_projected], [txt_projected], encoder_seq_lengths, seq_lengths
    )[0]  # [B, max_seq_len, feature_dim]

    # Apply final output projection
    hidden_states = attn.to_out[0](hidden_states)
    hidden_states = attn.to_out[1](hidden_states)
    return hidden_states
class BOOGUDoubleStreamSelfAttnProcessor(nn.Module):
"""
Double-stream self-attention processor without flash attention.
This processor implements YAK-style double-stream attention where:
- Text and image features are processed separately to generate QKV
- QKV are concatenated and processed together for cross-modal attention
- Uses PyTorch's scaled_dot_product_attention for computation
- Supports both standard and causal attention masks
Args:
head_dim: Dimension of each attention head
num_attention_heads: Number of attention heads for queries
num_kv_heads: Number of key-value heads
qkv_bias: Whether to use bias in QKV linear layers
"""
def __init__(self, head_dim: int, num_attention_heads: int, num_kv_heads: int, qkv_bias: bool = False) -> None:
"""Initialize the double-stream attention processor."""
super().__init__()
if not hasattr(F, "scaled_dot_product_attention"):
raise ImportError(
"BOOGUDoubleStreamSelfAttnProcessor requires PyTorch 2.0. "
"Please upgrade PyTorch to version 2.0 or later."
)
# Calculate dimensions
self.head_dim = head_dim
self.num_attention_heads = num_attention_heads
self.num_kv_heads = num_kv_heads
query_dim = head_dim * num_attention_heads
kv_dim = head_dim * num_kv_heads
# Initialize separate Q, K, V linear layers for text and image
# Query uses num_attention_heads, Key/Value use num_kv_heads
self.img_to_q = nn.Linear(query_dim, query_dim, bias=qkv_bias)
self.img_to_k = nn.Linear(query_dim, kv_dim, bias=qkv_bias)
self.img_to_v = nn.Linear(query_dim, kv_dim, bias=qkv_bias)
self.txt_to_q = nn.Linear(query_dim, query_dim, bias=qkv_bias)
self.txt_to_k = nn.Linear(query_dim, kv_dim, bias=qkv_bias)
self.txt_to_v = nn.Linear(query_dim, kv_dim, bias=qkv_bias)
# Additional output projection layers for text and image streams
self.txt_out = nn.Linear(query_dim, query_dim, bias=qkv_bias)
self.img_out = nn.Linear(query_dim, query_dim, bias=qkv_bias)
# Initialize weights
self.initialize_weights()
def initialize_weights(self) -> None:
"""
Initialize the weights of the double-stream attention processor.
Uses Xavier uniform initialization for linear layers and zero initialization for biases.
"""
# Initialize image stream QKV projection layers
nn.init.xavier_uniform_(self.img_to_q.weight)
nn.init.xavier_uniform_(self.img_to_k.weight)
nn.init.xavier_uniform_(self.img_to_v.weight)
# Initialize text stream QKV projection layers
nn.init.xavier_uniform_(self.txt_to_q.weight)
nn.init.xavier_uniform_(self.txt_to_k.weight)
nn.init.xavier_uniform_(self.txt_to_v.weight)
# Initialize separate output projection layers
nn.init.xavier_uniform_(self.txt_out.weight)
nn.init.xavier_uniform_(self.img_out.weight)
# Initialize biases if they exist
if self.img_to_q.bias is not None:
nn.init.zeros_(self.img_to_q.bias)
nn.init.zeros_(self.img_to_k.bias)
nn.init.zeros_(self.img_to_v.bias)
nn.init.zeros_(self.txt_to_q.bias)
nn.init.zeros_(self.txt_to_k.bias)
nn.init.zeros_(self.txt_to_v.bias)
nn.init.zeros_(self.txt_out.bias)
nn.init.zeros_(self.img_out.bias)
def _concat_text_image_features(
self,
img_hidden_states_list: List[torch.Tensor],
txt_hidden_states_list: List[torch.Tensor],
encoder_seq_lengths: List[int],
seq_lengths: List[int],
) -> List[torch.Tensor]:
"""
Concatenate text and image features following YAK's logic (text first, then image).
Args:
img_hidden_states_list: List of image tensors [img_query, img_key, img_value]
txt_hidden_states_list: List of text tensors [txt_query, txt_key, txt_value]
encoder_seq_lengths: Text sequence lengths for each sample [B]
seq_lengths: Total sequence lengths for each sample [B]
Returns:
List of concatenated tensors [query, key, value]
"""
assert len(img_hidden_states_list) == len(txt_hidden_states_list), \
f"Length mismatch: img_list={len(img_hidden_states_list)}, txt_list={len(txt_hidden_states_list)}"
batch_size = img_hidden_states_list[0].shape[0]
max_seq_len = max(seq_lengths)
concatenated_list = []
for img_tensor, txt_tensor in zip(img_hidden_states_list, txt_hidden_states_list):
# Ensure tensors are on the same device
device = img_tensor.device
if txt_tensor.device != device:
txt_tensor = txt_tensor.to(device)
# Create output tensor with proper shape [B, max_seq_len, feature_dim]
feature_dim = img_tensor.shape[-1]
concatenated = img_tensor.new_zeros(batch_size, max_seq_len, feature_dim)
# Concatenate text first, then image for each sample
for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
# Place text tokens first
concatenated[i, :encoder_seq_len] = txt_tensor[i, :encoder_seq_len]
# Place image tokens after text
concatenated[i, encoder_seq_len:seq_len] = img_tensor[i, :seq_len - encoder_seq_len]
concatenated_list.append(concatenated)
return concatenated_list
def _split_text_image_features(
self,
hidden_states_list: List[torch.Tensor],
encoder_seq_lengths: List[int],
seq_lengths: List[int],
) -> List[Tuple[torch.Tensor, torch.Tensor]]:
"""
Split concatenated features back to text and image features.
Inverse operation of _concat_text_image_features.
Args:
hidden_states_list: List of concatenated tensors (usually just one element)
encoder_seq_lengths: Text sequence lengths for each sample [B]
seq_lengths: Total sequence lengths for each sample [B]
Returns:
List of tuples, each containing (txt_hidden_states, img_hidden_states)
"""
result_list = []
for hidden_states in hidden_states_list:
batch_size = hidden_states.shape[0]
feature_dim = hidden_states.shape[-1]
# Get maximum lengths for text and image
max_txt_len = max(encoder_seq_lengths)
max_img_len = max(seq_len - encoder_seq_len for seq_len, encoder_seq_len in zip(seq_lengths, encoder_seq_lengths))
# Create output tensors [B, max_len, feature_dim]
txt_hidden_states = hidden_states.new_zeros(batch_size, max_txt_len, feature_dim)
img_hidden_states = hidden_states.new_zeros(batch_size, max_img_len, feature_dim)
# Split each sample back to text and image
for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
img_len = seq_len - encoder_seq_len
# Extract text portion
txt_hidden_states[i, :encoder_seq_len] = hidden_states[i, :encoder_seq_len]
# Extract image portion
img_hidden_states[i, :img_len] = hidden_states[i, encoder_seq_len:seq_len]
result_list.append((txt_hidden_states, img_hidden_states))
return result_list
def __call__(
self,
attn: Attention,
img_hidden_states: torch.Tensor,
txt_hidden_states: torch.Tensor,
joint_attention_mask: Optional[torch.Tensor] = None,
rotary_emb: Optional[torch.Tensor] = None,
encoder_seq_lengths: List[int] = None, # [B] - Text sequence lengths for each sample
seq_lengths: List[int] = None, # [B] - Total sequence lengths for each sample
base_sequence_length: Optional[int] = None,
) -> torch.Tensor:
"""
Process double-stream self-attention computation with PyTorch's scaled_dot_product_attention.
Args:
attn: Attention module
img_hidden_states: Image hidden states tensor [B, L_img, D]
txt_hidden_states: Text hidden states tensor [B, L_txt, D]
joint_attention_mask: Combined attention mask [B, L_total]
rotary_emb: Rotary embeddings for the joint sequence
encoder_seq_lengths: Text sequence lengths for each sample [B]
seq_lengths: Total sequence lengths for each sample [B]
base_sequence_length: Optional base sequence length for proportional attention
Returns:
torch.Tensor: Processed hidden states after attention computation
"""
batch_size = img_hidden_states.shape[0]
L_txt = txt_hidden_states.shape[1]
L_img = img_hidden_states.shape[1]
# Ensure Q, K, V linear layers are on the same device as input tensors
device = img_hidden_states.device
for layer in [self.img_to_q, self.img_to_k, self.img_to_v, self.txt_to_q, self.txt_to_k, self.txt_to_v,
self.txt_out, self.img_out]:
if layer.weight.device != device:
layer = layer.to(device)
# Generate Q, K, V for image and text streams (NO head reshaping yet)
img_query = self.img_to_q(img_hidden_states) # [B, L_img, query_dim]
img_key = self.img_to_k(img_hidden_states) # [B, L_img, kv_dim]
img_value = self.img_to_v(img_hidden_states) # [B, L_img, kv_dim]
txt_query = self.txt_to_q(txt_hidden_states) # [B, L_txt, query_dim]
txt_key = self.txt_to_k(txt_hidden_states) # [B, L_txt, kv_dim]
txt_value = self.txt_to_v(txt_hidden_states) # [B, L_txt, kv_dim]
# Use helper function to concatenate QKV following YAK's logic (text first, then image)
img_list = [img_query, img_key, img_value] # [B, L_img, feature_dim] each
txt_list = [txt_query, txt_key, txt_value] # [B, L_txt, feature_dim] each
concatenated_list = self._concat_text_image_features(img_list, txt_list, encoder_seq_lengths, seq_lengths)
query, key, value = concatenated_list # [B, max_seq_len, feature_dim] each
# From here, follow exactly the same logic as BOOGUAttnProcessor
sequence_length = max(seq_lengths)
query_dim = query.shape[-1]
inner_dim = key.shape[-1]
head_dim = query_dim // attn.heads
dtype = query.dtype
# Get key-value heads
kv_heads = inner_dim // head_dim
# Reshape tensors for attention computation
query = query.view(batch_size, -1, attn.heads, head_dim)
key = key.view(batch_size, -1, kv_heads, head_dim)
value = value.view(batch_size, -1, kv_heads, head_dim)
# Apply Query-Key normalization
if attn.norm_q is not None:
query = attn.norm_q(query)
if attn.norm_k is not None:
key = attn.norm_k(key)
# Apply Rotary Position Embeddings
if rotary_emb is not None:
query = apply_rotary_emb(query, rotary_emb, use_real=False)
key = apply_rotary_emb(key, rotary_emb, use_real=False)
query, key = query.to(dtype), key.to(dtype)
# Calculate attention scale
if base_sequence_length is not None:
softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
else:
softmax_scale = attn.scale
# scaled_dot_product_attention expects attention_mask shape to be
# (batch, heads, source_length, target_length)
if joint_attention_mask is not None:
joint_attention_mask = joint_attention_mask.bool()
if joint_attention_mask.dim() == 2:
# Standard mask [B, seq_len] -> [B, 1, 1, seq_len]
joint_attention_mask = joint_attention_mask.view(batch_size, 1, 1, -1)
elif joint_attention_mask.dim() == 3:
# Causal mask [B, seq_len, seq_len] -> [B, 1, seq_len, seq_len]
joint_attention_mask = joint_attention_mask.unsqueeze(1)
else:
raise ValueError(f"Unsupported joint_attention_mask shape: {joint_attention_mask.shape}")
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
# explicitly repeat key and value to match query length, otherwise using enable_gqa=True results in MATH backend of sdpa in our test of pytorch2.6
key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
hidden_states = F.scaled_dot_product_attention(
query, key, value, attn_mask=joint_attention_mask, scale=softmax_scale
)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
hidden_states = hidden_states.type_as(query)
# Split hidden_states back to text and image, apply separate output projections, then merge
split_results = self._split_text_image_features([hidden_states], encoder_seq_lengths, seq_lengths)
txt_hidden_states, img_hidden_states = split_results[0] # [B, max_txt_len, feature_dim], [B, max_img_len, feature_dim]
# Apply separate output projections for text and image
txt_projected = self.txt_out(txt_hidden_states) # [B, max_txt_len, feature_dim]
img_projected = self.img_out(img_hidden_states) # [B, max_img_len, feature_dim]
# Merge back to joint representation
merged_list = self._concat_text_image_features([img_projected], [txt_projected], encoder_seq_lengths, seq_lengths)
hidden_states = merged_list[0] # [B, max_seq_len, feature_dim]
# Apply final output projection
hidden_states = attn.to_out[0](hidden_states)
hidden_states = attn.to_out[1](hidden_states)
return hidden_states
class BOOGUAttnProcessorFlash2Varlen:
    """
    Processor for implementing scaled dot-product attention with flash attention and variable length sequences.

    This processor implements:
    - Flash attention with variable length sequences
    - Rotary position embeddings (RoPE)
    - Query-Key normalization
    - Proportional attention scaling

    Args:
        None
    Raises:
        ImportError: If flash_attn is not available
    """

    # NOTE(review): this file contains a second, later `class` statement with the
    # same name; at import time the later statement is the one the name stays bound to.

    def __init__(self) -> None:
        """Initialize the attention processor."""
        if not is_flash_attn_available():
            raise ImportError(
                "BOOGUAttnProcessorFlash2Varlen requires flash_attn. "
                "Please install flash_attn."
            )

    def _upad_input(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        value_layer: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        query_length: int,
        num_heads: int,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
        """
        Unpad the input tensors for flash attention.

        Args:
            query_layer: Query tensor of shape (batch_size, seq_len, num_heads, head_dim)
            key_layer: Key tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
            value_layer: Value tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
            attention_mask: Attention mask tensor of shape (batch_size, seq_len) or
                (batch_size, seq_len, seq_len) for causal; ``None`` means every
                key position is valid
            query_length: Length of the query sequence
            num_heads: Number of attention heads
        Returns:
            Tuple containing:
            - Unpadded query tensor
            - Unpadded key tensor
            - Unpadded value tensor
            - Query indices
            - Tuple of cumulative sequence lengths for query and key
            - Tuple of maximum sequence lengths for query and key
        """

        def _get_unpad_data(mask_2d: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
            """Helper function to get unpadding data from a 2D attention mask [B, L]."""
            seqlens_in_batch = mask_2d.sum(dim=-1, dtype=torch.int32)
            indices = torch.nonzero(mask_2d.flatten(), as_tuple=False).flatten()
            max_seqlen_in_batch = seqlens_in_batch.max().item()
            # Leading zero turns the running sum into [0, len_0, len_0 + len_1, ...].
            cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
            return indices, cu_seqlens, max_seqlen_in_batch

        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Normalize the attention mask to a 2D padding mask [B, L].
        if attention_mask is None:
            # The caller's signature allows omitting the mask: treat all positions as valid.
            mask_2d = torch.ones(batch_size, kv_seq_len, dtype=torch.bool, device=key_layer.device)
        elif attention_mask.dim() == 3:
            # Causal 3D mask [B, L, L]: infer per-sample effective lengths from the
            # diagonal, then mark the first `length` positions of each sample as valid.
            mask_len = attention_mask.shape[1]
            diag_valid = torch.diagonal(attention_mask, dim1=-2, dim2=-1)
            lengths = diag_valid.sum(dim=-1, dtype=torch.int32)  # [B]
            positions = torch.arange(mask_len, device=attention_mask.device)
            mask_2d = positions.unsqueeze(0) < lengths.unsqueeze(1)  # [B, L] bool
        else:
            mask_2d = attention_mask  # already [B, L]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(mask_2d)

        # Unpad key and value layers (shared path for both standard and causal cases)
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )

        # Handle different query length cases
        if query_length == kv_seq_len:
            # Self-attention layout: queries share the key indices and lengths.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per sample.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # Use the last query_length positions of the 2D mask
            q_mask = mask_2d[:, -query_length:]
            # `*_` tolerates an `unpad_input` that returns extra trailing values
            # (presumably the case for newer flash_attn releases -- TODO confirm).
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, q_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        base_sequence_length: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Process attention computation with flash attention.

        Args:
            attn: Attention module (provides ``to_q``/``to_k``/``to_v``, ``heads``,
                ``scale``, ``norm_q``, ``norm_k``, ``to_out``)
            hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
            encoder_hidden_states: Encoder hidden states tensor (source of keys and values)
            attention_mask: Optional attention mask tensor, 2-D padding mask or 3-D causal mask
            image_rotary_emb: Optional rotary embeddings for image tokens
            base_sequence_length: Optional base sequence length for proportional attention
        Returns:
            torch.Tensor: Processed hidden states after attention computation
        """
        batch_size, sequence_length, _ = hidden_states.shape

        # Get Query-Key-Value Pair
        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query_dim = query.shape[-1]
        inner_dim = key.shape[-1]
        head_dim = query_dim // attn.heads
        dtype = query.dtype
        # Number of key/value heads follows from the K projection width.
        kv_heads = inner_dim // head_dim

        # Reshape tensors for attention computation
        query = query.view(batch_size, -1, attn.heads, head_dim)
        key = key.view(batch_size, -1, kv_heads, head_dim)
        value = value.view(batch_size, -1, kv_heads, head_dim)

        # Apply Query-Key normalization
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply Rotary Position Embeddings
        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
            key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
        # Cast back to the projection dtype in case the steps above changed it.
        query, key = query.to(dtype), key.to(dtype)

        # Calculate attention scale (proportional attention when a base length is given)
        if base_sequence_length is not None:
            softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
        else:
            softmax_scale = attn.scale

        # Detect if we have a causal mask: a 3-D mask whose first sample is exactly
        # lower-triangular ones. For efficiency only the first sample is checked.
        is_causal = False
        if attention_mask is not None and attention_mask.dim() == 3:
            mask_sample = attention_mask[0]  # [seq_len, seq_len]
            is_causal = torch.allclose(mask_sample, torch.tril(torch.ones_like(mask_sample)))

        # Unpad input for flash attention
        (
            query_states,
            key_states,
            value_states,
            indices_q,
            cu_seq_lens,
            max_seq_lens,
        ) = self._upad_input(query, key, value, attention_mask, sequence_length, attn.heads)
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        # Handle different number of heads: repeat each K/V head to match the query heads.
        if kv_heads < attn.heads:
            key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
            value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)

        # Apply flash attention with causal parameter
        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=0.0,
            causal=is_causal,  # Use detected causal setting
            softmax_scale=softmax_scale,
        )

        # Pad output back to [B, sequence_length, heads, head_dim] and merge the heads.
        hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
        hidden_states = hidden_states.flatten(-2)
        hidden_states = hidden_states.type_as(query)

        # Apply output projection
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)
        return hidden_states
class BOOGUAttnProcessorFlash2Varlen:
    """
    Processor for implementing scaled dot-product attention with flash attention and variable length sequences.

    This processor implements:
    - Flash attention with variable length sequences
    - Rotary position embeddings (RoPE)
    - Query-Key normalization
    - Proportional attention scaling

    Args:
        None
    Raises:
        ImportError: If flash_attn is not available
    """

    # NOTE(review): an earlier `class` statement with the same name exists in this
    # file; this later statement is the one the name stays bound to at import time.

    def __init__(self) -> None:
        """Initialize the attention processor."""
        if not is_flash_attn_available():
            raise ImportError(
                "BOOGUAttnProcessorFlash2Varlen requires flash_attn. "
                "Please install flash_attn."
            )

    def _upad_input(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        value_layer: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        query_length: int,
        num_heads: int,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
        """
        Unpad the input tensors for flash attention.

        Args:
            query_layer: Query tensor of shape (batch_size, seq_len, num_heads, head_dim)
            key_layer: Key tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
            value_layer: Value tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
            attention_mask: Attention mask tensor of shape (batch_size, seq_len) or
                (batch_size, seq_len, seq_len) for causal; ``None`` means every
                key position is valid
            query_length: Length of the query sequence
            num_heads: Number of attention heads
        Returns:
            Tuple containing:
            - Unpadded query tensor
            - Unpadded key tensor
            - Unpadded value tensor
            - Query indices
            - Tuple of cumulative sequence lengths for query and key
            - Tuple of maximum sequence lengths for query and key
        """

        def _get_unpad_data(mask_2d: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
            """Helper function to get unpadding data from a 2D attention mask [B, L]."""
            seqlens_in_batch = mask_2d.sum(dim=-1, dtype=torch.int32)
            indices = torch.nonzero(mask_2d.flatten(), as_tuple=False).flatten()
            max_seqlen_in_batch = seqlens_in_batch.max().item()
            # Leading zero turns the running sum into [0, len_0, len_0 + len_1, ...].
            cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
            return indices, cu_seqlens, max_seqlen_in_batch

        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Normalize the attention mask to a 2D padding mask [B, L].
        if attention_mask is None:
            # The caller's signature allows omitting the mask: treat all positions as valid.
            mask_2d = torch.ones(batch_size, kv_seq_len, dtype=torch.bool, device=key_layer.device)
        elif attention_mask.dim() == 3:
            # Causal 3D mask [B, L, L]: infer per-sample effective lengths from the
            # diagonal, then mark the first `length` positions of each sample as valid.
            mask_len = attention_mask.shape[1]
            diag_valid = torch.diagonal(attention_mask, dim1=-2, dim2=-1)
            lengths = diag_valid.sum(dim=-1, dtype=torch.int32)  # [B]
            positions = torch.arange(mask_len, device=attention_mask.device)
            mask_2d = positions.unsqueeze(0) < lengths.unsqueeze(1)  # [B, L] bool
        else:
            mask_2d = attention_mask  # already [B, L]

        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(mask_2d)

        # Unpad key and value layers (shared path for both standard and causal cases)
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )

        # Handle different query length cases
        if query_length == kv_seq_len:
            # Self-attention layout: queries share the key indices and lengths.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per sample.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # Use the last query_length positions of the 2D mask
            q_mask = mask_2d[:, -query_length:]
            # `*_` tolerates an `unpad_input` that returns extra trailing values
            # (presumably the case for newer flash_attn releases -- TODO confirm).
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, q_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        base_sequence_length: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Process attention computation with flash attention.

        Args:
            attn: Attention module (provides ``to_q``/``to_k``/``to_v``, ``heads``,
                ``scale``, ``norm_q``, ``norm_k``, ``to_out``)
            hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
            encoder_hidden_states: Encoder hidden states tensor (source of keys and values)
            attention_mask: Optional attention mask tensor, 2-D padding mask or 3-D causal mask
            image_rotary_emb: Optional rotary embeddings for image tokens
            base_sequence_length: Optional base sequence length for proportional attention
        Returns:
            torch.Tensor: Processed hidden states after attention computation
        """
        batch_size, sequence_length, _ = hidden_states.shape

        # Get Query-Key-Value Pair
        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query_dim = query.shape[-1]
        inner_dim = key.shape[-1]
        head_dim = query_dim // attn.heads
        dtype = query.dtype
        # Number of key/value heads follows from the K projection width.
        kv_heads = inner_dim // head_dim

        # Reshape tensors for attention computation
        query = query.view(batch_size, -1, attn.heads, head_dim)
        key = key.view(batch_size, -1, kv_heads, head_dim)
        value = value.view(batch_size, -1, kv_heads, head_dim)

        # Apply Query-Key normalization
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply Rotary Position Embeddings
        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
            key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
        # Cast back to the projection dtype in case the steps above changed it.
        query, key = query.to(dtype), key.to(dtype)

        # Calculate attention scale (proportional attention when a base length is given)
        if base_sequence_length is not None:
            softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
        else:
            softmax_scale = attn.scale

        # Detect if we have a causal mask: a 3-D mask whose first sample is exactly
        # lower-triangular ones. For efficiency only the first sample is checked.
        is_causal = False
        if attention_mask is not None and attention_mask.dim() == 3:
            mask_sample = attention_mask[0]  # [seq_len, seq_len]
            is_causal = torch.allclose(mask_sample, torch.tril(torch.ones_like(mask_sample)))

        # Unpad input for flash attention
        (
            query_states,
            key_states,
            value_states,
            indices_q,
            cu_seq_lens,
            max_seq_lens,
        ) = self._upad_input(query, key, value, attention_mask, sequence_length, attn.heads)
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        # Handle different number of heads: repeat each K/V head to match the query heads.
        if kv_heads < attn.heads:
            key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
            value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)

        # Apply flash attention with causal parameter
        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=0.0,
            causal=is_causal,  # Use detected causal setting
            softmax_scale=softmax_scale,
        )

        # Pad output back to [B, sequence_length, heads, head_dim] and merge the heads.
        hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
        hidden_states = hidden_states.flatten(-2)
        hidden_states = hidden_states.type_as(query)

        # Apply output projection
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)
        return hidden_states
class BOOGUAttnProcessor:
"""
Processor for implementing scaled dot-product attention with flash attention and variable length sequences.
This processor is optimized for PyTorch 2.0 and implements:
- Flash attention with variable length sequences
- Rotary position embeddings (RoPE)
- Query-Key normalization
- Proportional attention scaling
Args:
None
Raises:
ImportError: If PyTorch version is less than 2.0
"""
def __init__(self) -> None:
"""Initialize the attention processor."""
if not hasattr(F, "scaled_dot_product_attention"):
raise ImportError(
"BOOGUAttnProcessorFlash2Varlen requires PyTorch 2.0. "
"Please upgrade PyTorch to version 2.0 or later."
)
def __call__(
self,
attn: Attention,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
image_rotary_emb: Optional[torch.Tensor] = None,
base_sequence_length: Optional[int] = None,
) -> torch.Tensor:
"""
Process attention computation with flash attention.
Args:
attn: Attention module
hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
encoder_hidden_states: Encoder hidden states tensor
attention_mask: Optional attention mask tensor
image_rotary_emb: Optional rotary embeddings for image tokens
base_sequence_length: Optional base sequence length for proportional attention
Returns:
torch.Tensor: Processed hidden states after attention computation
"""
batch_size, sequence_length, _ = hidden_states.shape
# Get Query-Key-Value Pair
query = attn.to_q(hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
query_dim = query.shape[-1]
inner_dim = key.shape[-1]
head_dim = query_dim // attn.heads
dtype = query.dtype
# Get key-value heads
kv_heads = inner_dim // head_dim
# Reshape tensors for attention computation
query = query.view(batch_size, -1, attn.heads, head_dim)
key = key.view(batch_size, -1, kv_heads, head_dim)
value = value.view(batch_size, -1, kv_heads, head_dim)
# Apply Query-Key normalization
if attn.norm_q is not None:
query = attn.norm_q(query)
if attn.norm_k is not None:
key = attn.norm_k(key)
# Apply Rotary Position Embeddings
if image_rotary_emb is not None:
query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
query, key = query.to(dtype), key.to(dtype)
# Calculate attention scale
if base_sequence_length is not None:
softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
else:
softmax_scale = attn.scale
# sdpa expects attn_mask with shape (B, H, Q, K) as boolean (True keeps, False masks)
if attention_mask is not None:
attention_mask = attention_mask.bool()
if attention_mask.dim() == 2:
# Standard padding mask [B, L] -> [B, 1, 1, L]
attention_mask = attention_mask.view(batch_size, 1, 1, -1)
elif attention_mask.dim() == 3:
# Robust causal + padding mask construction
# Infer valid lengths from diagonal, then build lower-triangular mask within valid lengths
B, L, _ = attention_mask.shape
diag_valid = torch.diagonal(attention_mask, dim1=-2, dim2=-1)
lengths = diag_valid.sum(dim=-1) # [B]
arange_L = torch.arange(L, device=attention_mask.device)
# Padding masks for queries and keys: shape [B, L]
q_valid = arange_L.unsqueeze(0) < lengths.unsqueeze(1)
k_valid = q_valid # same lengths assumed
# Lower-triangular causal mask [L, L]
causal = torch.tril(torch.ones(L, L, dtype=torch.bool, device=attention_mask.device))
# Combine: [B, L, L]
combined = causal & q_valid.unsqueeze(-1) & k_valid.unsqueeze(-2)
attention_mask = combined.unsqueeze(1) # [B, 1, L, L]
else:
raise ValueError(f"Unsupported attention_mask shape: {attention_mask.shape}")
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
# print(f"######################attention_mask: {attention_mask}, shape: {attention_mask.shape}############################")
# explicitly repeat key and value to match query length, otherwise using enable_gqa=True results in MATH backend of sdpa in our test of pytorch2.6
key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
hidden_states = F.scaled_dot_product_attention(
query, key, value, attn_mask=attention_mask, scale=softmax_scale
)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
hidden_states = hidden_states.type_as(query)
# Apply output projection
hidden_states = attn.to_out[0](hidden_states)
hidden_states = attn.to_out[1](hidden_states)
return hidden_states
###########################################################################################################################################################################
### Default OmniGen2 transformer blocks (the definitions below are commented out)
# class OmniGen2TransformerBlock(nn.Module):
# """
# Transformer block for OmniGen2 model.
# This block implements a transformer layer with:
# - Multi-head attention with flash attention
# - Feed-forward network with SwiGLU activation
# - RMS normalization
# - Optional modulation for conditional generation
# Args:
# dim: Dimension of the input and output tensors
# num_attention_heads: Number of attention heads
# num_kv_heads: Number of key-value heads
# multiple_of: Multiple of which the hidden dimension should be
# ffn_dim_multiplier: Multiplier for the feed-forward network dimension
# norm_eps: Epsilon value for normalization layers
# modulation: Whether to use modulation for conditional generation
# use_fused_rms_norm: Whether to use fused RMS normalization
# use_fused_swiglu: Whether to use fused SwiGLU activation
# """
# def __init__(
# self,
# dim: int,
# num_attention_heads: int,
# num_kv_heads: int,
# multiple_of: int,
# ffn_dim_multiplier: float,
# norm_eps: float,
# modulation: bool = True,
# ) -> None:
# """Initialize the transformer block."""
# super().__init__()
# self.head_dim = dim // num_attention_heads
# self.modulation = modulation
# # Initialize attention layer
# self.attn = Attention(
# query_dim=dim,
# cross_attention_dim=None,
# dim_head=dim // num_attention_heads,
# qk_norm="rms_norm",
# heads=num_attention_heads,
# kv_heads=num_kv_heads,
# eps=1e-5,
# bias=False,
# out_bias=False,
# processor=OmniGen2AttnProcessor(),
# )
# # Initialize feed-forward network
# self.feed_forward = LuminaFeedForward(
# dim=dim,
# inner_dim=4 * dim,
# multiple_of=multiple_of,
# ffn_dim_multiplier=ffn_dim_multiplier,
# )
# # Initialize normalization layers
# if modulation:
# self.norm1 = LuminaRMSNormZero(
# embedding_dim=dim,
# norm_eps=norm_eps,
# norm_elementwise_affine=True,
# )
# else:
# self.norm1 = RMSNorm(dim, eps=norm_eps)
# self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
# self.norm2 = RMSNorm(dim, eps=norm_eps)
# self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
# self.initialize_weights()
# def initialize_weights(self) -> None:
# """
# Initialize the weights of the transformer block.
# Uses Xavier uniform initialization for linear layers and zero initialization for biases.
# """
# nn.init.xavier_uniform_(self.attn.to_q.weight)
# nn.init.xavier_uniform_(self.attn.to_k.weight)
# nn.init.xavier_uniform_(self.attn.to_v.weight)
# nn.init.xavier_uniform_(self.attn.to_out[0].weight)
# nn.init.xavier_uniform_(self.feed_forward.linear_1.weight)
# nn.init.xavier_uniform_(self.feed_forward.linear_2.weight)
# nn.init.xavier_uniform_(self.feed_forward.linear_3.weight)
# if self.modulation:
# nn.init.zeros_(self.norm1.linear.weight)
# nn.init.zeros_(self.norm1.linear.bias)
# def forward(
# self,
# hidden_states: torch.Tensor,
# attention_mask: torch.Tensor,
# image_rotary_emb: torch.Tensor,
# temb: Optional[torch.Tensor] = None,
# ) -> torch.Tensor:
# """
# Forward pass of the transformer block.
# Args:
# hidden_states: Input hidden states tensor
# attention_mask: Attention mask tensor
# image_rotary_emb: Rotary embeddings for image tokens
# temb: Optional timestep embedding tensor
# Returns:
# torch.Tensor: Output hidden states after transformer block processing
# """
# if self.modulation:
# if temb is None:
# raise ValueError("temb must be provided when modulation is enabled")
# norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
# attn_output = self.attn(
# hidden_states=norm_hidden_states,
# encoder_hidden_states=norm_hidden_states,
# attention_mask=attention_mask,
# image_rotary_emb=image_rotary_emb,
# )
# hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
# mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
# hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
# else:
# norm_hidden_states = self.norm1(hidden_states)
# attn_output = self.attn(
# hidden_states=norm_hidden_states,
# encoder_hidden_states=norm_hidden_states,
# attention_mask=attention_mask,
# image_rotary_emb=image_rotary_emb,
# )
# hidden_states = hidden_states + self.norm2(attn_output)
# mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
# hidden_states = hidden_states + self.ffn_norm2(mlp_output)
# return hidden_states
# class OmniGen2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
# """
# OmniGen2 Transformer 2D Model.
# A transformer-based diffusion model for image generation with:
# - Patch-based image processing
# - Rotary position embeddings
# - Multi-head attention
# - Conditional generation support
# Args:
# patch_size: Size of image patches
# in_channels: Number of input channels
# out_channels: Number of output channels (defaults to in_channels)
# hidden_size: Size of hidden layers
# num_layers: Number of transformer layers
# num_refiner_layers: Number of refiner layers
# num_attention_heads: Number of attention heads
# num_kv_heads: Number of key-value heads
# multiple_of: Multiple of which the hidden dimension should be
# ffn_dim_multiplier: Multiplier for feed-forward network dimension
# norm_eps: Epsilon value for normalization layers
# axes_dim_rope: Dimensions for rotary position embeddings
# axes_lens: Lengths for rotary position embeddings
# instruction_feat_dim: Dimension of text features
# timestep_scale: Scale factor for timestep embeddings
# use_fused_rms_norm: Whether to use fused RMS normalization
# use_fused_swiglu: Whether to use fused SwiGLU activation
# """
# _supports_gradient_checkpointing = True
# _no_split_modules = ["Omnigen2TransformerBlock"]
# _skip_layerwise_casting_patterns = ["x_embedder", "norm"]
# @register_to_config
# def __init__(
# self,
# patch_size: int = 2,
# in_channels: int = 16,
# out_channels: Optional[int] = None,
# hidden_size: int = 2304,
# num_layers: int = 26,
# num_refiner_layers: int = 2,
# num_attention_heads: int = 24,
# num_kv_heads: int = 8,
# multiple_of: int = 256,
# ffn_dim_multiplier: Optional[float] = None,
# norm_eps: float = 1e-5,
# axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
# axes_lens: Tuple[int, int, int] = (300, 512, 512),
# instruction_feat_dim: int = 1024,
# timestep_scale: float = 1.0,
# ) -> None:
# """Initialize the OmniGen2 transformer model."""
# super().__init__()
# # Validate configuration
# if (hidden_size // num_attention_heads) != sum(axes_dim_rope):
# raise ValueError(
# f"hidden_size // num_attention_heads ({hidden_size // num_attention_heads}) "
# f"must equal sum(axes_dim_rope) ({sum(axes_dim_rope)})"
# )
# self.out_channels = out_channels or in_channels
# # Initialize embeddings
# self.rope_embedder = OmniGen2RotaryPosEmbed(
# theta=10000,
# axes_dim=axes_dim_rope,
# axes_lens=axes_lens,
# patch_size=patch_size,
# )
# self.x_embedder = nn.Linear(
# in_features=patch_size * patch_size * in_channels,
# out_features=hidden_size,
# )
# self.ref_image_patch_embedder = nn.Linear(
# in_features=patch_size * patch_size * in_channels,
# out_features=hidden_size,
# )
# self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
# hidden_size=hidden_size,
# instruction_feat_dim=instruction_feat_dim,
# norm_eps=norm_eps,
# timestep_scale=timestep_scale,
# )
# # Initialize transformer blocks
# self.noise_refiner = nn.ModuleList([
# OmniGen2TransformerBlock(
# hidden_size,
# num_attention_heads,
# num_kv_heads,
# multiple_of,
# ffn_dim_multiplier,
# norm_eps,
# modulation=True,
# )
# for _ in range(num_refiner_layers)
# ])
# self.ref_image_refiner = nn.ModuleList([
# OmniGen2TransformerBlock(
# hidden_size,
# num_attention_heads,
# num_kv_heads,
# multiple_of,
# ffn_dim_multiplier,
# norm_eps,
# modulation=True,
# )
# for _ in range(num_refiner_layers)
# ])
# self.context_refiner = nn.ModuleList(
# [
# OmniGen2TransformerBlock(
# hidden_size,
# num_attention_heads,
# num_kv_heads,
# multiple_of,
# ffn_dim_multiplier,
# norm_eps,
# modulation=False,
# )
# for _ in range(num_refiner_layers)
# ]
# )
# # 3. Transformer blocks
# self.layers = nn.ModuleList(
# [
# OmniGen2TransformerBlock(
# hidden_size,
# num_attention_heads,
# num_kv_heads,
# multiple_of,
# ffn_dim_multiplier,
# norm_eps,
# modulation=True,
# )
# for _ in range(num_layers)
# ]
# )
# # 4. Output norm & projection
# self.norm_out = LuminaLayerNormContinuous(
# embedding_dim=hidden_size,
# conditioning_embedding_dim=min(hidden_size, 1024),
# elementwise_affine=False,
# eps=1e-6,
# bias=True,
# out_dim=patch_size * patch_size * self.out_channels,
# )
# # Add learnable embeddings to distinguish different images
# self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size)) # support max 5 ref images
# self.gradient_checkpointing = False
# self.initialize_weights()
# def initialize_weights(self) -> None:
# """
# Initialize the weights of the model.
# Uses Xavier uniform initialization for linear layers.
# """
# nn.init.xavier_uniform_(self.x_embedder.weight)
# nn.init.constant_(self.x_embedder.bias, 0.0)
# nn.init.xavier_uniform_(self.ref_image_patch_embedder.weight)
# nn.init.constant_(self.ref_image_patch_embedder.bias, 0.0)
# nn.init.zeros_(self.norm_out.linear_1.weight)
# nn.init.zeros_(self.norm_out.linear_1.bias)
# nn.init.zeros_(self.norm_out.linear_2.weight)
# nn.init.zeros_(self.norm_out.linear_2.bias)
# nn.init.normal_(self.image_index_embedding, std=0.02)
# def img_patch_embed_and_refine(
# self,
# hidden_states,
# ref_image_hidden_states,
# padded_img_mask,
# padded_ref_img_mask,
# noise_rotary_emb,
# ref_img_rotary_emb,
# l_effective_ref_img_len,
# l_effective_img_len,
# temb
# ):
# batch_size = len(hidden_states)
# max_combined_img_len = max([img_len + sum(ref_img_len) for img_len, ref_img_len in zip(l_effective_img_len, l_effective_ref_img_len)])
# hidden_states = self.x_embedder(hidden_states)
# ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)
# for i in range(batch_size):
# shift = 0
# for j, ref_img_len in enumerate(l_effective_ref_img_len[i]):
# ref_image_hidden_states[i, shift:shift + ref_img_len, :] = ref_image_hidden_states[i, shift:shift + ref_img_len, :] + self.image_index_embedding[j]
# shift += ref_img_len
# for layer in self.noise_refiner:
# hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)
# flat_l_effective_ref_img_len = list(itertools.chain(*l_effective_ref_img_len))
# num_ref_images = len(flat_l_effective_ref_img_len)
# max_ref_img_len = max(flat_l_effective_ref_img_len)
# batch_ref_img_mask = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, dtype=torch.bool)
# batch_ref_image_hidden_states = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, self.config.hidden_size)
# batch_ref_img_rotary_emb = hidden_states.new_zeros(num_ref_images, max_ref_img_len, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype)
# batch_temb = temb.new_zeros(num_ref_images, *temb.shape[1:], dtype=temb.dtype)
# # sequence of ref imgs to batch
# idx = 0
# for i in range(batch_size):
# shift = 0
# for ref_img_len in l_effective_ref_img_len[i]:
# batch_ref_img_mask[idx, :ref_img_len] = True
# batch_ref_image_hidden_states[idx, :ref_img_len] = ref_image_hidden_states[i, shift:shift + ref_img_len]
# batch_ref_img_rotary_emb[idx, :ref_img_len] = ref_img_rotary_emb[i, shift:shift + ref_img_len]
# batch_temb[idx] = temb[i]
# shift += ref_img_len
# idx += 1
# # refine ref imgs separately
# for layer in self.ref_image_refiner:
# batch_ref_image_hidden_states = layer(batch_ref_image_hidden_states, batch_ref_img_mask, batch_ref_img_rotary_emb, batch_temb)
# # batch of ref imgs to sequence
# idx = 0
# for i in range(batch_size):
# shift = 0
# for ref_img_len in l_effective_ref_img_len[i]:
# ref_image_hidden_states[i, shift:shift + ref_img_len] = batch_ref_image_hidden_states[idx, :ref_img_len]
# shift += ref_img_len
# idx += 1
# combined_img_hidden_states = hidden_states.new_zeros(batch_size, max_combined_img_len, self.config.hidden_size)
# for i, (ref_img_len, img_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
# combined_img_hidden_states[i, :sum(ref_img_len)] = ref_image_hidden_states[i, :sum(ref_img_len)]
# combined_img_hidden_states[i, sum(ref_img_len):sum(ref_img_len) + img_len] = hidden_states[i, :img_len]
# return combined_img_hidden_states
# def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
# batch_size = len(hidden_states)
# p = self.config.patch_size
# device = hidden_states[0].device
# img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
# l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]
# if ref_image_hidden_states is not None:
# ref_img_sizes = [[(img.size(1), img.size(2)) for img in imgs] if imgs is not None else None for imgs in ref_image_hidden_states]
# l_effective_ref_img_len = [[(ref_img_size[0] // p) * (ref_img_size[1] // p) for ref_img_size in _ref_img_sizes] if _ref_img_sizes is not None else [0] for _ref_img_sizes in ref_img_sizes]
# else:
# ref_img_sizes = [None for _ in range(batch_size)]
# l_effective_ref_img_len = [[0] for _ in range(batch_size)]
# max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
# max_img_len = max(l_effective_img_len)
# # ref image patch embeddings
# flat_ref_img_hidden_states = []
# for i in range(batch_size):
# if ref_img_sizes[i] is not None:
# imgs = []
# for ref_img in ref_image_hidden_states[i]:
# C, H, W = ref_img.size()
# ref_img = rearrange(ref_img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
# imgs.append(ref_img)
# img = torch.cat(imgs, dim=0)
# flat_ref_img_hidden_states.append(img)
# else:
# flat_ref_img_hidden_states.append(None)
# # image patch embeddings
# flat_hidden_states = []
# for i in range(batch_size):
# img = hidden_states[i]
# C, H, W = img.size()
# img = rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
# flat_hidden_states.append(img)
# padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
# padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
# for i in range(batch_size):
# if ref_img_sizes[i] is not None:
# padded_ref_img_hidden_states[i, :sum(l_effective_ref_img_len[i])] = flat_ref_img_hidden_states[i]
# padded_ref_img_mask[i, :sum(l_effective_ref_img_len[i])] = True
# padded_hidden_states = torch.zeros(batch_size, max_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
# padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
# for i in range(batch_size):
# padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
# padded_img_mask[i, :l_effective_img_len[i]] = True
# return (
# padded_hidden_states,
# padded_ref_img_hidden_states,
# padded_img_mask,
# padded_ref_img_mask,
# l_effective_ref_img_len,
# l_effective_img_len,
# ref_img_sizes,
# img_sizes,
# )
# def forward(
# self,
# hidden_states: Union[torch.Tensor, List[torch.Tensor]],
# timestep: torch.Tensor,
# text_hidden_states: torch.Tensor,
# freqs_cis: torch.Tensor,
# text_attention_mask: torch.Tensor,
# ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
# attention_kwargs: Optional[Dict[str, Any]] = None,
# return_dict: bool = False,
# ) -> Union[torch.Tensor, Transformer2DModelOutput]:
# if attention_kwargs is not None:
# attention_kwargs = attention_kwargs.copy()
# lora_scale = attention_kwargs.pop("scale", 1.0)
# else:
# lora_scale = 1.0
# if USE_PEFT_BACKEND:
# # weight the lora layers by setting `lora_scale` for each PEFT layer
# scale_lora_layers(self, lora_scale)
# else:
# if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
# logger.warning(
# "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
# )
# # 1. Condition, positional & patch embedding
# batch_size = len(hidden_states)
# is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)
# if is_hidden_states_tensor:
# assert hidden_states.ndim == 4
# hidden_states = [_hidden_states for _hidden_states in hidden_states]
# device = hidden_states[0].device
# temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
# (
# hidden_states,
# ref_image_hidden_states,
# img_mask,
# ref_img_mask,
# l_effective_ref_img_len,
# l_effective_img_len,
# ref_img_sizes,
# img_sizes,
# ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
# (
# context_rotary_emb,
# ref_img_rotary_emb,
# noise_rotary_emb,
# rotary_emb,
# encoder_seq_lengths,
# seq_lengths,
# ) = self.rope_embedder(
# freqs_cis,
# text_attention_mask,
# l_effective_ref_img_len,
# l_effective_img_len,
# ref_img_sizes,
# img_sizes,
# device,
# )
# # 2. Context refinement
# for layer in self.context_refiner:
# text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)
# combined_img_hidden_states = self.img_patch_embed_and_refine(
# hidden_states,
# ref_image_hidden_states,
# img_mask,
# ref_img_mask,
# noise_rotary_emb,
# ref_img_rotary_emb,
# l_effective_ref_img_len,
# l_effective_img_len,
# temb,
# )
# # 3. Joint Transformer blocks
# max_seq_len = max(seq_lengths)
# attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
# joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
# for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
# attention_mask[i, :seq_len] = True
# joint_hidden_states[i, :encoder_seq_len] = text_hidden_states[i, :encoder_seq_len]
# joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]
# hidden_states = joint_hidden_states
# for layer_idx, layer in enumerate(self.layers):
# if torch.is_grad_enabled() and self.gradient_checkpointing:
# hidden_states = self._gradient_checkpointing_func(
# layer, hidden_states, attention_mask, rotary_emb, temb
# )
# else:
# hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
# # 4. Output norm & projection
# hidden_states = self.norm_out(hidden_states, temb)
# p = self.config.patch_size
# output = []
# for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
# height, width = img_size
# output.append(rearrange(hidden_states[i][seq_len - img_len:seq_len], '(h w) (p1 p2 c) -> c (h p1) (w p2)', h=height // p, w=width // p, p1=p, p2=p))
# if is_hidden_states_tensor:
# output = torch.stack(output, dim=0)
# if USE_PEFT_BACKEND:
# # remove `lora_scale` from each PEFT layer
# unscale_lora_layers(self, lora_scale)
# if not return_dict:
# return output
# return Transformer2DModelOutput(sample=output)
#################################################################### BOOGU transformer blocks ###########################################################################################
class BOOGUTransformerBlock(nn.Module):
    """
    Transformer block for BOOGU model.

    This block implements a transformer layer with:
    - Multi-head self-attention (``BOOGUAttnProcessorFlash2Varlen`` when it can be
      constructed, otherwise the ``BOOGUAttnProcessor`` fallback)
    - Feed-forward network (``LuminaFeedForward``)
    - RMS normalization before and after both sub-layers
    - Optional modulation for conditional generation

    Args:
        dim: Dimension of the input and output tensors
        num_attention_heads: Number of attention heads
        num_kv_heads: Number of key-value heads
        multiple_of: Multiple of which the hidden dimension should be
        ffn_dim_multiplier: Multiplier for the feed-forward network dimension
        norm_eps: Epsilon value for normalization layers
        modulation: Whether to use modulation for conditional generation
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        num_kv_heads: int,
        multiple_of: int,
        ffn_dim_multiplier: float,
        norm_eps: float,
        modulation: bool = True,
    ) -> None:
        """Initialize the transformer block."""
        super().__init__()
        self.head_dim = dim // num_attention_heads
        self.modulation = modulation

        # Prefer the flash-attention processor; its constructor raises ImportError
        # when it cannot be used, in which case the SDPA processor is used instead.
        try:
            processor = BOOGUAttnProcessorFlash2Varlen()
        except ImportError:
            processor = BOOGUAttnProcessor()

        # Initialize attention layer
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=dim // num_attention_heads,
            qk_norm="rms_norm",
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=processor,
        )

        # Initialize feed-forward network
        self.feed_forward = LuminaFeedForward(
            dim=dim,
            inner_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
        )

        # Initialize normalization layers
        if modulation:
            self.norm1 = LuminaRMSNormZero(
                embedding_dim=dim,
                norm_eps=norm_eps,
                norm_elementwise_affine=True,
            )
        else:
            self.norm1 = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
        self.norm2 = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)

        self.initialize_weights()

    def initialize_weights(self) -> None:
        """
        Initialize the weights of the transformer block.

        Uses Xavier uniform initialization for the attention and feed-forward linear
        layers. When modulation is enabled, the weight and bias of ``norm1.linear``
        are zero-initialized.
        """
        nn.init.xavier_uniform_(self.attn.to_q.weight)
        nn.init.xavier_uniform_(self.attn.to_k.weight)
        nn.init.xavier_uniform_(self.attn.to_v.weight)
        nn.init.xavier_uniform_(self.attn.to_out[0].weight)
        nn.init.xavier_uniform_(self.feed_forward.linear_1.weight)
        nn.init.xavier_uniform_(self.feed_forward.linear_2.weight)
        nn.init.xavier_uniform_(self.feed_forward.linear_3.weight)
        if self.modulation:
            nn.init.zeros_(self.norm1.linear.weight)
            nn.init.zeros_(self.norm1.linear.bias)

    def _self_attention(
        self,
        norm_hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
    ) -> torch.Tensor:
        """Run self-attention: the same tensor feeds queries and keys/values."""
        return self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_hidden_states,
            attention_mask=attention_mask,
            image_rotary_emb=image_rotary_emb,
        )

    def _modulated_forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        temb: torch.Tensor,
    ) -> torch.Tensor:
        """Apply attention and feed-forward with ``temb``-derived gates and scale."""
        norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
        attn_output = self._self_attention(norm_hidden_states, attention_mask, image_rotary_emb)
        # Gated residual connection around attention
        hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
        # Gated residual connection around the feed-forward network
        return hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)

    def _unmodulated_forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
    ) -> torch.Tensor:
        """Apply attention and feed-forward with plain (ungated) residual connections."""
        norm_hidden_states = self.norm1(hidden_states)
        attn_output = self._self_attention(norm_hidden_states, attention_mask, image_rotary_emb)
        hidden_states = hidden_states + self.norm2(attn_output)
        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
        return hidden_states + self.ffn_norm2(mlp_output)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Forward pass of the transformer block.

        When the instance has a truthy ``enable_taylorseer`` attribute and modulation
        is enabled, the behavior depends on ``self.current['type']`` (``self.current``
        and ``self.cache_dic`` are not created by this class and must be attached
        beforehand):
        - ``'full'``: the block is computed normally and the result is passed to
          ``derivative_approximation``
        - ``'Taylor'``: the block is skipped and ``taylor_formula`` supplies the output
        - any other value: the input hidden states are returned unchanged

        Args:
            hidden_states: Input hidden states tensor
            attention_mask: Attention mask tensor
            image_rotary_emb: Rotary embeddings for image tokens
            temb: Optional timestep embedding tensor; required when modulation is enabled

        Returns:
            torch.Tensor: Output hidden states after transformer block processing

        Raises:
            ValueError: If modulation is enabled and ``temb`` is None
        """
        # Without modulation, TaylorSeer makes no difference to the computation.
        if not self.modulation:
            return self._unmodulated_forward(hidden_states, attention_mask, image_rotary_emb)

        if temb is None:
            raise ValueError("temb must be provided when modulation is enabled")

        if not getattr(self, 'enable_taylorseer', False):
            return self._modulated_forward(hidden_states, attention_mask, image_rotary_emb, temb)

        if self.current['type'] == 'full':
            # Full step: compute the block and update the cached derivatives.
            self.current['module'] = 'total'
            taylor_cache_init(cache_dic=self.cache_dic, current=self.current)
            hidden_states = self._modulated_forward(hidden_states, attention_mask, image_rotary_emb, temb)
            derivative_approximation(cache_dic=self.cache_dic, current=self.current, feature=hidden_states)
        elif self.current['type'] == 'Taylor':
            # Taylor step: replace the block output with the cached approximation.
            self.current['module'] = 'total'
            hidden_states = taylor_formula(cache_dic=self.cache_dic, current=self.current)
        return hidden_states
# class PromptTuningTransformerBlock(BOOGUTransformerBlock):
class PromptTuningTransformerBlock(nn.Module):
    """
    Self-attention + feed-forward transformer block without timestep modulation.

    Each sub-layer (attention, feed-forward) is wrapped in RMS normalization on
    both sides: its input is normalized before the sub-layer runs, and the
    sub-layer output is normalized again before being added to the residual
    stream.

    Args:
        dim: Width of the hidden states entering and leaving the block.
        num_attention_heads: Number of query heads; the per-head width is
            ``dim // num_attention_heads``.
        num_kv_heads: Number of key/value heads handed to ``Attention``.
        multiple_of: Forwarded unchanged to ``LuminaFeedForward``.
        ffn_dim_multiplier: Forwarded unchanged to ``LuminaFeedForward``.
        norm_eps: Epsilon of the four RMSNorm layers owned by this block
            (the attention module's own q/k norm uses a fixed ``1e-5``).
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        num_kv_heads: int,
        multiple_of: int,
        ffn_dim_multiplier: float,
        norm_eps: float,
    ) -> None:
        """Build the attention, feed-forward and normalization sub-modules."""
        super().__init__()
        self.head_dim = dim // num_attention_heads

        # Imported inside the constructor so the norm layers of this block are
        # always torch's own RMSNorm implementation.
        from torch.nn import RMSNorm

        # Prefer the flash-attention varlen processor; if constructing it raises
        # ImportError, fall back to the plain processor and announce the fallback.
        try:
            attn_processor = BOOGUAttnProcessorFlash2Varlen()
        except ImportError:
            print(f"###########################Use BOOGUAttnProcessor############################")
            attn_processor = BOOGUAttnProcessor()

        # NOTE: sub-modules are created in a fixed order (attention, feed-forward,
        # norms) so that parameter registration and random initialization stay stable.
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=self.head_dim,
            qk_norm="rms_norm",
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=attn_processor,
        )

        self.feed_forward = LuminaFeedForward(
            dim=dim,
            inner_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
        )

        # norm1 / ffn_norm1 normalize sub-layer inputs; norm2 / ffn_norm2
        # normalize sub-layer outputs before the residual addition.
        self.norm1 = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
        self.norm2 = RMSNorm(dim, eps=norm_eps)
        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)

        self.initialize_weights()

    def initialize_weights(self) -> None:
        """
        Re-initialize every projection weight with Xavier-uniform values.

        Covers the q/k/v/output projections of the attention module and the
        three linear layers of the feed-forward network. Only weights are
        touched; the attention module is built without biases.
        """
        projection_weights = (
            self.attn.to_q.weight,
            self.attn.to_k.weight,
            self.attn.to_v.weight,
            self.attn.to_out[0].weight,
            self.feed_forward.linear_1.weight,
            self.feed_forward.linear_2.weight,
            self.feed_forward.linear_3.weight,
        )
        for weight in projection_weights:
            nn.init.xavier_uniform_(weight)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        rotary_emb: torch.Tensor,
    ) -> torch.Tensor:
        """
        Run self-attention and the feed-forward network with residual connections.

        Args:
            hidden_states: Input hidden states.
            attention_mask: Mask forwarded to the attention module.
            rotary_emb: Rotary embeddings, forwarded to the attention module as
                ``image_rotary_emb``.

        Returns:
            torch.Tensor: Hidden states after both residual updates.
        """
        # Self-attention: queries, keys and values all come from the same
        # normalized tensor.
        attn_input = self.norm1(hidden_states)
        attn_result = self.attn(
            hidden_states=attn_input,
            encoder_hidden_states=attn_input,
            attention_mask=attention_mask,
            image_rotary_emb=rotary_emb,
        )
        hidden_states = hidden_states + self.norm2(attn_result)

        # Feed-forward sub-layer on the updated residual stream.
        ffn_result = self.feed_forward(self.ffn_norm1(hidden_states))
        return hidden_states + self.ffn_norm2(ffn_result)
@dataclass
class TeaCacheParams:
    """
    Mutable TeaCache state carried from one timestep to the next.

    An instance is stored as ``teacache_params`` on
    ``BOOGUSingleDoubleStreamTransformer2DModel``. See
    https://github.com/ali-vilab/TeaCache/ for the caching scheme itself.

    Args:
        previous_residual (Optional[torch.Tensor]):
            Difference between the output and the input of the transformer
            layers at the previous timestep.
        previous_modulated_inp (Optional[torch.Tensor]):
            Modulated input of the previous timestep, used to gauge how much
            the transformer layers' output is changing.
        accumulated_rel_l1_distance (float):
            Running sum of the relative L1 distance.
        is_first_or_last_step (bool):
            True when the current timestep is the first or the last one.
    """

    # Both tensors stay None until a value has been recorded.
    previous_residual: Optional[torch.Tensor] = None
    previous_modulated_inp: Optional[torch.Tensor] = None
    # The accumulator starts at zero.
    accumulated_rel_l1_distance: float = 0
    is_first_or_last_step: bool = False
#################################################Prompt Tuning#####################################################################
# class PromptEmbedding(nn.Module):
class PromptEmbedding(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    """
    Trainable prompt tokens refined by a short stack of transformer blocks.

    The module owns an ``nn.Embedding`` table of prompt tokens, a
    ``BOOGUPromptTuningRotaryPosEmbed`` that supplies rotary embeddings and the
    attention mask, and ``num_layers`` unmodulated ``BOOGUTransformerBlock``
    layers applied to the prompt tokens.

    Args:
        prompt_tuning_configs: Mapping (anything with ``.get``) from which the
            hyper-parameters below are read; missing keys use the shown default.

            - ``num_trainable_prompt_tokens`` (32)
            - ``hidden_size`` (2048)
            - ``num_attention_heads`` (32)
            - ``num_kv_heads`` (8)
            - ``multiple_of`` (256)
            - ``ffn_dim_multiplier`` (None)
            - ``norm_eps`` (1e-5)
            - ``num_layers`` (2)
            - ``theta`` (10000)
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["PromptTuningTransformerBlock", "BOOGUTransformerBlock"]
    _skip_layerwise_casting_patterns = ["prompt_token_embedding", "norm"]

    def __init__(self, prompt_tuning_configs):
        super().__init__()

        # Pick out the hyper-parameters this module cares about, together with
        # their fallbacks. Order matters only for the order of the config keys.
        config_defaults = (
            ("num_trainable_prompt_tokens", 32),
            ("hidden_size", 2048),
            ("num_attention_heads", 32),
            ("num_kv_heads", 8),
            ("multiple_of", 256),
            ("ffn_dim_multiplier", None),
            ("norm_eps", 1e-5),
            ("num_layers", 2),
            ("theta", 10000),
        )
        # Important: registering the resolved values puts them on `self.config`
        # (and therefore into the saved config.json).
        self.register_to_config(
            **{key: prompt_tuning_configs.get(key, fallback) for key, fallback in config_defaults}
        )
        self.prompt_tuning_configs = prompt_tuning_configs

        cfg = self.config
        prompt_emb_head_dim = cfg.hidden_size // cfg.num_attention_heads

        # One learnable vector per prompt token.
        self.prompt_token_embedding = nn.Embedding(
            num_embeddings=cfg.num_trainable_prompt_tokens,
            embedding_dim=cfg.hidden_size,
        )

        # Rotary position embedder for the prompt tokens (per-head width).
        self.prompt_rope_embedder = BOOGUPromptTuningRotaryPosEmbed(
            theta=cfg.theta,
            dim=prompt_emb_head_dim,
            num_trainable_prompt_tokens=cfg.num_trainable_prompt_tokens,
        )

        # Refinement layers: standard blocks with timestep modulation disabled.
        self.prompt_tuning_layers = nn.ModuleList(
            [
                BOOGUTransformerBlock(
                    dim=cfg.hidden_size,
                    num_attention_heads=cfg.num_attention_heads,
                    num_kv_heads=cfg.num_kv_heads,
                    multiple_of=cfg.multiple_of,
                    ffn_dim_multiplier=cfg.ffn_dim_multiplier,
                    norm_eps=cfg.norm_eps,
                    modulation=False,
                )
                for _ in range(cfg.num_layers)
            ]
        )

        self.gradient_checkpointing = False
        self.initialize_weights()

    def initialize_weights(self) -> None:
        """
        Draw the prompt token table from N(0, 0.02^2).

        The transformer layers are left untouched here: they already run their
        own initialization inside their constructors.
        """
        nn.init.normal_(self.prompt_token_embedding.weight, mean=0.0, std=0.02)

    def forward(self, idx=None, batch_size=1, device=None, use_causal_mask=True):
        """
        Produce the refined prompt-token hidden states.

        Args:
            idx: Optional indices into the prompt token table. When ``None``
                the whole table is used.
            batch_size: Number of copies the prompt tokens are expanded to.
            device: Passed through to the rotary embedder.
                NOTE(review): the default ``None`` is forwarded as-is; confirm
                the embedder copes with it.
            use_causal_mask: Passed through to the rotary embedder.

        Returns:
            torch.Tensor: Hidden states after all prompt tuning layers, with a
            leading dimension of ``batch_size``.
        """
        if idx is None:
            token_vectors = self.prompt_token_embedding.weight
        else:
            token_vectors = self.prompt_token_embedding(idx)

        # Add a batch axis and broadcast it (a view, no copy) to `batch_size`.
        hidden_states = token_vectors.unsqueeze(0).expand(batch_size, -1, -1)

        rotary_emb, attention_mask = self.prompt_rope_embedder(batch_size, device, use_causal_mask)

        # Checkpointing is only worthwhile when gradients are being recorded.
        use_checkpointing = torch.is_grad_enabled() and self.gradient_checkpointing
        for layer in self.prompt_tuning_layers:
            if use_checkpointing:
                hidden_states = self._gradient_checkpointing_func(
                    layer,
                    hidden_states,
                    attention_mask,
                    rotary_emb,
                )
            else:
                # No timestep conditioning is passed for prompt tuning.
                hidden_states = layer(hidden_states, attention_mask, rotary_emb)

        return hidden_states

    @classmethod
    def from_config(cls, config, **kwargs):
        """
        Build an instance from a config mapping (e.g. read from config.json).

        Args:
            config: Mapping handed to ``__init__`` as ``prompt_tuning_configs``.
            **kwargs: Only ``weight_dtype`` is consulted; when given, every
                parameter's data is cast to that dtype.

        Returns:
            PromptEmbedding: The newly constructed module.
        """
        instance = cls(prompt_tuning_configs=config)

        weight_dtype = kwargs.get("weight_dtype", None)
        if weight_dtype is None:
            return instance

        for param in instance.parameters():
            param.data = param.data.to(dtype=weight_dtype)
        return instance
############################################################################################################################
###################################################################my double stream block#######################################################################
class BOOGUTransformerDoubleStreamBlock(nn.Module):
    """
    Double-stream transformer block with separate image and text streams.

    Per forward pass the block performs:
    - one joint attention over the concatenated [text + image] sequence,
      computed by a dedicated double-stream attention processor, whose output
      is split back into a text part and an image part;
    - one image-only self-attention;
    - one feed-forward network per stream.

    With ``modulation=True`` the stream norms are ``LuminaRMSNormZero`` layers
    driven by the timestep embedding; otherwise they are plain RMSNorm layers.
    (The original author describes the data flow as following YAK's
    DoubleStreamXBlock.)

    Args:
        dim: Dimension of the input and output tensors.
        num_attention_heads: Number of attention heads.
        num_kv_heads: Number of key-value heads.
        multiple_of: Forwarded to ``LuminaFeedForward``.
        ffn_dim_multiplier: Forwarded to ``LuminaFeedForward``.
        norm_eps: Epsilon value for the normalization layers of this block.
        modulation: Whether to use timestep modulation.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        num_kv_heads: int,
        multiple_of: int,
        ffn_dim_multiplier: float,
        norm_eps: float,
        modulation: bool = True,
    ) -> None:
        """Initialize the double stream transformer block."""
        super().__init__()
        self.head_dim = dim // num_attention_heads
        self.num_attention_heads = num_attention_heads
        self.modulation = modulation
        self.hidden_size = dim

        # Processor for the image-only self-attention; fall back to the plain
        # variant when the flash-attention one cannot be constructed.
        try:
            processor = BOOGUAttnProcessorFlash2Varlen()
        except ImportError:
            processor = BOOGUAttnProcessor()

        # Processor for the joint [text + image] attention, same fallback rule.
        joint_processor_kwargs = dict(
            head_dim=self.head_dim,
            num_attention_heads=num_attention_heads,
            num_kv_heads=num_kv_heads,
            qkv_bias=False,
        )
        try:
            double_stream_processor = BOOGUDoubleStreamSelfAttnProcessorFlash2Varlen(**joint_processor_kwargs)
        except ImportError:
            double_stream_processor = BOOGUDoubleStreamSelfAttnProcessor(**joint_processor_kwargs)

        attn_kwargs = dict(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=self.head_dim,
            qk_norm="rms_norm",
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
        )
        ffn_kwargs = dict(
            dim=dim,
            inner_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
        )

        def make_stream_norm():
            # Modulated blocks use the timestep-conditioned norm, others a plain RMSNorm.
            if modulation:
                return LuminaRMSNormZero(
                    embedding_dim=dim,
                    norm_eps=norm_eps,
                    norm_elementwise_affine=True,
                )
            return RMSNorm(dim, eps=norm_eps)

        # NOTE: the sub-modules below are created in a fixed order. Do not
        # reorder them: it would change parameter registration order and the
        # sequence of random initializations.

        # === Image stream ===
        # Joint image-text attention (uses the double-stream processor).
        self.img_txt_attn = Attention(processor=double_stream_processor, **attn_kwargs)
        # Image self-attention.
        self.img_self_attn = Attention(processor=processor, **attn_kwargs)
        self.img_feed_forward = LuminaFeedForward(**ffn_kwargs)

        self.img_norm1 = make_stream_norm()  # input norm of the joint attention
        self.img_norm2 = make_stream_norm()  # input norm of the MLP
        self.img_norm3 = make_stream_norm()  # input norm of the self-attention

        self.img_ffn_norm1 = RMSNorm(dim, eps=norm_eps)
        self.img_attn_norm = RMSNorm(dim, eps=norm_eps)
        self.img_self_attn_norm = RMSNorm(dim, eps=norm_eps)
        self.img_ffn_norm2 = RMSNorm(dim, eps=norm_eps)

        # === Text stream (no text-only attention module) ===
        self.txt_feed_forward = LuminaFeedForward(**ffn_kwargs)

        self.txt_norm1 = make_stream_norm()  # input norm of the joint attention
        self.txt_norm2 = make_stream_norm()  # input norm of the MLP

        self.txt_ffn_norm1 = RMSNorm(dim, eps=norm_eps)
        self.txt_attn_norm = RMSNorm(dim, eps=norm_eps)
        self.txt_ffn_norm2 = RMSNorm(dim, eps=norm_eps)

        self.initialize_weights()

        # The q/k/v projections of `img_txt_attn` are never used because the
        # double-stream processor brings its own linear layers, so drop them
        # entirely. (Freezing them first would be pointless: they are deleted.)
        del self.img_txt_attn.to_k
        del self.img_txt_attn.to_v
        del self.img_txt_attn.to_q

    def initialize_weights(self) -> None:
        """
        Initialize the weights of the double stream transformer block.

        Linear projections get Xavier-uniform weights; when modulation is
        enabled, the linear layer inside every ``LuminaRMSNormZero`` is zeroed
        (weight and bias).
        """
        # The q/k/v projections of `img_txt_attn` are skipped on purpose: they
        # are unused and removed at the end of `__init__`.
        xavier_weights = (
            self.img_txt_attn.to_out[0].weight,
            self.img_self_attn.to_q.weight,
            self.img_self_attn.to_k.weight,
            self.img_self_attn.to_v.weight,
            self.img_self_attn.to_out[0].weight,
            self.img_feed_forward.linear_1.weight,
            self.img_feed_forward.linear_2.weight,
            self.img_feed_forward.linear_3.weight,
            self.txt_feed_forward.linear_1.weight,
            self.txt_feed_forward.linear_2.weight,
            self.txt_feed_forward.linear_3.weight,
        )
        for weight in xavier_weights:
            nn.init.xavier_uniform_(weight)

        if self.modulation:
            modulated_norms = (
                self.img_norm1,
                self.img_norm2,
                self.img_norm3,
                self.txt_norm1,
                self.txt_norm2,
            )
            for norm in modulated_norms:
                nn.init.zeros_(norm.linear.weight)
                nn.init.zeros_(norm.linear.bias)

    def _joint_attention(
        self,
        img_normed: torch.Tensor,
        txt_normed: torch.Tensor,
        img_template: torch.Tensor,
        txt_template: torch.Tensor,
        joint_attention_mask: torch.Tensor,
        rotary_emb: torch.Tensor,
        encoder_seq_lengths: List[int],
        seq_lengths: List[int],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the joint [text + image] attention and split its output per stream.

        ``img_template`` / ``txt_template`` are the un-normalized stream inputs;
        they only provide the shapes, dtype and device of the zero-filled
        output buffers. Returns ``(img_attn_out, txt_attn_out)``.
        """
        # The processor is called directly because the standard `Attention`
        # interface does not accept the double-stream arguments.
        joint_attn_out = self.img_txt_attn.processor(
            attn=self.img_txt_attn,
            img_hidden_states=img_normed,
            txt_hidden_states=txt_normed,
            joint_attention_mask=joint_attention_mask,
            rotary_emb=rotary_emb,
            encoder_seq_lengths=encoder_seq_lengths,
            seq_lengths=seq_lengths,
        )

        batch_size = img_template.shape[0]
        L_txt = txt_template.shape[1]
        L_img = img_template.shape[1]
        txt_attn_out = txt_template.new_zeros(batch_size, L_txt, self.hidden_size)
        img_attn_out = img_template.new_zeros(batch_size, L_img, self.hidden_size)

        # Per sample, the first `encoder_seq_len` positions of the joint output
        # are text and positions up to `seq_len` are image; anything beyond
        # stays zero in the per-stream buffers.
        for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
            txt_attn_out[i, :encoder_seq_len] = joint_attn_out[i, :encoder_seq_len]
            img_attn_out[i, :seq_len - encoder_seq_len] = joint_attn_out[i, encoder_seq_len:seq_len]

        return img_attn_out, txt_attn_out

    def forward(
        self,
        img_hidden_states: torch.Tensor,
        txt_hidden_states: torch.Tensor,
        img_attention_mask: torch.Tensor,
        joint_attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        rotary_emb: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        encoder_seq_lengths: Optional[List[int]] = None,
        seq_lengths: Optional[List[int]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass of the double stream transformer block.

        Args:
            img_hidden_states: Image token representations, indexed as
                [batch, image position, feature].
            txt_hidden_states: Text token representations, indexed as
                [batch, text position, feature].
            img_attention_mask: Mask passed to the image self-attention.
            joint_attention_mask: Mask passed to the joint [txt + img] attention.
            image_rotary_emb: Rotary embeddings passed to the image self-attention.
            rotary_emb: Rotary embeddings passed to the joint attention.
            temb: Timestep conditioning; required when modulation is enabled.
            encoder_seq_lengths: Per-sample text lengths. Required despite the
                ``None`` default (kept for signature compatibility).
            seq_lengths: Per-sample total (text + image) lengths. Required
                despite the ``None`` default.

        Returns:
            Tuple of (updated_img_hidden_states, updated_txt_hidden_states).

        Raises:
            ValueError: If modulation is enabled and ``temb`` is missing, or if
                either sequence-length argument is missing.
        """
        if self.modulation and temb is None:
            raise ValueError("temb must be provided when modulation is enabled")
        if encoder_seq_lengths is None or seq_lengths is None:
            # Fail early with a clear message instead of an opaque TypeError
            # when the lengths are zipped during the output split.
            raise ValueError("encoder_seq_lengths and seq_lengths must be provided")

        if self.modulation:
            # === Step 1: modulated norms, all computed on the block inputs ===
            img_norm1_out, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb)
            # Only the first two outputs of norm2 / norm3 are used: norm2's
            # second output serves as the MLP shift, norm3's as the
            # self-attention gate.
            img_norm2_out, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb)
            img_norm3_out, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb)

            txt_norm1_out, txt_gate_msa, txt_scale_mlp, txt_gate_mlp = self.txt_norm1(txt_hidden_states, temb)
            txt_norm2_out, txt_shift_mlp, _, _ = self.txt_norm2(txt_hidden_states, temb)

            # === Step 2: joint attention over [txt + img] ===
            img_attn_out, txt_attn_out = self._joint_attention(
                img_norm1_out,
                txt_norm1_out,
                img_hidden_states,
                txt_hidden_states,
                joint_attention_mask,
                rotary_emb,
                encoder_seq_lengths,
                seq_lengths,
            )

            # === Step 3: image self-attention ===
            img_self_attn_out = self.img_self_attn(
                hidden_states=img_norm3_out,
                encoder_hidden_states=img_norm3_out,
                attention_mask=img_attention_mask,
                image_rotary_emb=image_rotary_emb,
            )

            # === Step 4: gated residual updates ===
            # Image stream: joint attention + self-attention + MLP.
            img_hidden_states = img_hidden_states + img_gate_msa.unsqueeze(1).tanh() * self.img_attn_norm(img_attn_out)
            img_hidden_states = img_hidden_states + img_gate_self.unsqueeze(1).tanh() * self.img_self_attn_norm(img_self_attn_out)

            # The MLP input is built from `img_norm2_out`, i.e. from the block
            # input rather than from the attention-updated stream.
            img_mlp_input = (1 + img_scale_mlp.unsqueeze(1)) * img_norm2_out + img_shift_mlp.unsqueeze(1)
            img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(img_mlp_input))
            img_hidden_states = img_hidden_states + img_gate_mlp.unsqueeze(1).tanh() * self.img_ffn_norm2(img_mlp_out)

            # Text stream: joint attention + MLP (no text-only attention).
            txt_hidden_states = txt_hidden_states + txt_gate_msa.unsqueeze(1).tanh() * self.txt_attn_norm(txt_attn_out)

            txt_mlp_input = (1 + txt_scale_mlp.unsqueeze(1)) * txt_norm2_out + txt_shift_mlp.unsqueeze(1)
            txt_mlp_out = self.txt_feed_forward(self.txt_ffn_norm1(txt_mlp_input))
            txt_hidden_states = txt_hidden_states + txt_gate_mlp.unsqueeze(1).tanh() * self.txt_ffn_norm2(txt_mlp_out)
        else:
            # Non-modulated variant: same data flow without gates, and with
            # norm2 applied to the attention-updated stream.
            img_norm1_out = self.img_norm1(img_hidden_states)
            img_norm3_out = self.img_norm3(img_hidden_states)
            txt_norm1_out = self.txt_norm1(txt_hidden_states)

            img_attn_out, txt_attn_out = self._joint_attention(
                img_norm1_out,
                txt_norm1_out,
                img_hidden_states,
                txt_hidden_states,
                joint_attention_mask,
                rotary_emb,
                encoder_seq_lengths,
                seq_lengths,
            )

            img_self_attn_out = self.img_self_attn(
                hidden_states=img_norm3_out,
                encoder_hidden_states=img_norm3_out,
                attention_mask=img_attention_mask,
                image_rotary_emb=image_rotary_emb,
            )

            img_hidden_states = img_hidden_states + self.img_attn_norm(img_attn_out)
            img_hidden_states = img_hidden_states + self.img_self_attn_norm(img_self_attn_out)

            img_norm2_out = self.img_norm2(img_hidden_states)
            img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(img_norm2_out))
            img_hidden_states = img_hidden_states + self.img_ffn_norm2(img_mlp_out)

            txt_hidden_states = txt_hidden_states + self.txt_attn_norm(txt_attn_out)

            txt_norm2_out = self.txt_norm2(txt_hidden_states)
            txt_mlp_out = self.txt_feed_forward(self.txt_ffn_norm1(txt_norm2_out))
            txt_hidden_states = txt_hidden_states + self.txt_ffn_norm2(txt_mlp_out)

        return img_hidden_states, txt_hidden_states
# Alias: the single-stream layers of the mixed model are built from the regular
# BOOGUTransformerBlock; this name only makes the stream type explicit at call sites.
BOOGUTransformerSingleStreamBlock = BOOGUTransformerBlock
# PromptTuningTransformerBlock = OmniGen2TransformerBlock
class BOOGUSingleDoubleStreamTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
"""
BOOGU Mixed Single-Double Stream Transformer 2D Model.
A transformer-based diffusion model that combines double-stream and single-stream processing:
- Initial layers use double-stream processing (separate text and image streams)
- Later layers use single-stream processing (joint text+image processing)
- This follows YAK's architecture pattern but with BOOGU's components
Args:
patch_size: Size of image patches
in_channels: Number of input channels
out_channels: Number of output channels (defaults to in_channels)
hidden_size: Size of hidden layers
num_layers: Total number of transformer layers
num_double_stream_layers: Number of initial double-stream layers
num_refiner_layers: Number of refiner layers
num_attention_heads: Number of attention heads
num_kv_heads: Number of key-value heads
multiple_of: Multiple of which the hidden dimension should be
ffn_dim_multiplier: Multiplier for feed-forward network dimension
norm_eps: Epsilon value for normalization layers
axes_dim_rope: Dimensions for rotary position embeddings
axes_lens: Lengths for rotary position embeddings
text_feat_dim: Dimension of text features
timestep_scale: Scale factor for timestep embeddings
"""
_supports_gradient_checkpointing = True
_no_split_modules = ["BOOGUTransformerBlock", "BOOGUTransformerSingleStreamBlock", "BOOGUTransformerDoubleStreamBlock", "PromptEmbedding", "nn.Embedding", "PromptTuningTransformerBlock"]
_skip_layerwise_casting_patterns = ["x_embedder", "norm", "embedding"]
@register_to_config
def __init__(
    self,
    patch_size: int = 2,
    in_channels: int = 16,
    out_channels: Optional[int] = None,
    hidden_size: int = 2304,
    num_layers: int = 26,
    num_double_stream_layers: int = 2,
    num_refiner_layers: int = 2,
    num_attention_heads: int = 24,
    num_kv_heads: int = 8,
    multiple_of: int = 256,
    ffn_dim_multiplier: Optional[float] = None,
    norm_eps: float = 1e-5,
    axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
    axes_lens: Tuple[int, int, int] = (300, 512, 512),
    text_feature_configs: Dict[str, Any] = dict(text_feat_dim=1024, reduce_type="concat", num_text_feature_layers=1),
    prompt_tuning_configs: Dict[str, Any] = dict(use_prompt_tuning=False),
    timestep_scale: float = 1.0,
) -> None:
    """
    Build the mixed double-stream / single-stream transformer.

    Args:
        patch_size: Side length of an image patch.
        in_channels: Channels per input pixel/latent position.
        out_channels: Output channels; falls back to ``in_channels``.
        hidden_size: Width of all transformer layers.
        num_layers: Total number of main layers (double + single stream).
        num_double_stream_layers: How many of the main layers are
            double-stream blocks; the remainder are single-stream blocks.
        num_refiner_layers: Depth of each of the three refiner stacks.
        num_attention_heads: Number of attention heads.
        num_kv_heads: Number of key-value heads.
        multiple_of: Forwarded to every transformer block.
        ffn_dim_multiplier: Forwarded to every transformer block.
        norm_eps: Forwarded to every transformer block and to the
            timestep/caption embedder.
        axes_dim_rope: Per-axis rotary dimensions; their sum must equal
            ``hidden_size // num_attention_heads``.
        axes_lens: Per-axis lengths handed to the rotary embedder.
        text_feature_configs: Text feature settings; used to derive the text
            feature width via ``cal_preprocessed_text_feat_dim``.
        prompt_tuning_configs: Prompt tuning settings; only stored here.
        timestep_scale: Forwarded to the timestep/caption embedder.

    Raises:
        ValueError: If the head width does not match ``sum(axes_dim_rope)`` or
            if there are more double-stream layers than layers in total.
    """
    super().__init__()

    # NOTE: the two dict defaults in the signature are shared objects and are
    # also what `register_to_config` records; treat them as read-only.

    # --- configuration checks ---
    if (hidden_size // num_attention_heads) != sum(axes_dim_rope):
        raise ValueError(
            f"hidden_size // num_attention_heads ({hidden_size // num_attention_heads}) "
            f"must equal sum(axes_dim_rope) ({sum(axes_dim_rope)})"
        )
    if num_double_stream_layers > num_layers:
        raise ValueError(
            f"num_double_stream_layers ({num_double_stream_layers}) cannot be greater than "
            f"num_layers ({num_layers})"
        )

    self.out_channels = out_channels or in_channels
    self.num_double_stream_layers = num_double_stream_layers
    self.num_single_stream_layers = num_layers - num_double_stream_layers
    self.text_feature_configs = text_feature_configs
    self.prompt_tuning_configs = prompt_tuning_configs
    self.preprocessed_text_feat_dim = self.cal_preprocessed_text_feat_dim(text_feature_configs)

    patch_dim = patch_size * patch_size * in_channels
    # Positional arguments shared by every transformer block built below.
    block_args = (
        hidden_size,
        num_attention_heads,
        num_kv_heads,
        multiple_of,
        ffn_dim_multiplier,
        norm_eps,
    )

    def build_stack(block_cls, depth, modulation):
        # `depth` freshly constructed blocks of one kind, wrapped in a ModuleList.
        return nn.ModuleList(
            [block_cls(*block_args, modulation=modulation) for _ in range(depth)]
        )

    # NOTE: sub-modules are created in a fixed order; reordering would change
    # parameter registration order and the sequence of random initializations.

    # --- embeddings ---
    self.rope_embedder = BOOGUDoubleStreamRotaryPosEmbed(
        theta=10000,
        axes_dim=axes_dim_rope,
        axes_lens=axes_lens,
        patch_size=patch_size,
    )
    self.x_embedder = nn.Linear(in_features=patch_dim, out_features=hidden_size)
    self.ref_image_patch_embedder = nn.Linear(in_features=patch_dim, out_features=hidden_size)
    self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
        hidden_size=hidden_size,
        text_feat_dim=self.preprocessed_text_feat_dim,
        norm_eps=norm_eps,
        timestep_scale=timestep_scale,
    )

    # --- refiners: noise and reference image are modulated, context is not ---
    self.noise_refiner = build_stack(BOOGUTransformerBlock, num_refiner_layers, True)
    self.ref_image_refiner = build_stack(BOOGUTransformerBlock, num_refiner_layers, True)
    self.context_refiner = build_stack(BOOGUTransformerBlock, num_refiner_layers, False)

    # --- main trunk: double-stream layers first, then single-stream layers ---
    self.double_stream_layers = build_stack(
        BOOGUTransformerDoubleStreamBlock, num_double_stream_layers, True
    )
    self.single_stream_layers = build_stack(
        BOOGUTransformerSingleStreamBlock, self.num_single_stream_layers, True
    )

    # --- output norm and projection back to patch space ---
    self.norm_out = LuminaLayerNormContinuous(
        embedding_dim=hidden_size,
        conditioning_embedding_dim=min(hidden_size, 1024),
        elementwise_affine=False,
        eps=1e-6,
        bias=True,
        out_dim=patch_size * patch_size * self.out_channels,
    )

    # One learnable vector per reference-image slot (at most 5 reference images).
    self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size))

    self.gradient_checkpointing = False
    self.initialize_weights()

    # --- TeaCache state (disabled by default) ---
    self.enable_teacache = False
    self.teacache_rel_l1_thresh = 0.05
    self.teacache_params = TeaCacheParams()
    # Degree-4 polynomial, coefficients listed from highest to lowest power.
    coefficients = [-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487]
    self.rescale_func = np.poly1d(coefficients)
def initialize_weights(self) -> None:
    """
    Reset the patch embedders, the output head and the image-index embedding.

    Both patch-embedding linears receive Xavier-uniform weights and zero
    biases, both linears inside ``norm_out`` are set to all zeros, and
    ``image_index_embedding`` is re-drawn from a normal distribution with
    a standard deviation of 0.02.
    """
    # Patch embedders: Xavier-uniform weight, zero bias.
    for embedder in (self.x_embedder, self.ref_image_patch_embedder):
        nn.init.xavier_uniform_(embedder.weight)
        nn.init.constant_(embedder.bias, 0.0)

    # Output head: every weight and bias starts at zero.
    for head in (self.norm_out.linear_1, self.norm_out.linear_2):
        nn.init.zeros_(head.weight)
        nn.init.zeros_(head.bias)

    nn.init.normal_(self.image_index_embedding, std=0.02)
# Reuse the same helper methods from original BOOGUTransformer2DModel
def img_patch_embed_and_refine(
    self,
    hidden_states,
    ref_image_hidden_states,
    padded_img_mask,
    padded_ref_img_mask,
    noise_rotary_emb,
    ref_img_rotary_emb,
    l_effective_ref_img_len,
    l_effective_img_len,
    temb
):
    """
    Embed noise and reference-image patches, refine them, and pack them together.

    Steps performed:
      1. Project both patch tensors with their linear embedders.
      2. Add ``image_index_embedding[j]`` to the tokens of the j-th reference
         image of every sample.
      3. Run the noise tokens through ``noise_refiner``.
      4. Regroup the reference tokens so that every reference image is its own
         batch row, run ``ref_image_refiner`` on that batch, and write the
         refined tokens back to their original positions.
      5. For every sample, lay out ``[reference tokens, noise tokens]`` in a
         zero-padded tensor.

    Args:
        hidden_states: Padded noise-image patches, indexed ``[sample, token, feature]``.
        ref_image_hidden_states: Padded reference-image patches, same indexing.
        padded_img_mask: Mask handed to the noise refiner layers.
        padded_ref_img_mask: Accepted for interface compatibility; not read here.
        noise_rotary_emb: Rotary embedding handed to the noise refiner layers.
        ref_img_rotary_emb: Rotary embedding of the reference tokens, sliced per image.
        l_effective_ref_img_len: Per sample, the token count of each reference image.
        l_effective_img_len: Per sample, the token count of the noise image.
        temb: Conditioning embedding, one row per sample.

    Returns:
        Tensor of shape ``(batch, max combined length, config.hidden_size)``.
    """
    batch_size = len(hidden_states)
    width = self.config.hidden_size
    ref_totals = [sum(ref_lens) for ref_lens in l_effective_ref_img_len]
    longest_combined = max([n_img + n_ref for n_img, n_ref in zip(l_effective_img_len, ref_totals)])

    hidden_states = self.x_embedder(hidden_states)
    ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)

    # Tag the tokens of each reference image with its per-sample index embedding.
    for sample in range(batch_size):
        start = 0
        for img_idx, n_tokens in enumerate(l_effective_ref_img_len[sample]):
            stop = start + n_tokens
            ref_image_hidden_states[sample, start:stop, :] = (
                ref_image_hidden_states[sample, start:stop, :] + self.image_index_embedding[img_idx]
            )
            start = stop

    for block in self.noise_refiner:
        hidden_states = block(hidden_states, padded_img_mask, noise_rotary_emb, temb)

    per_image_lens = list(itertools.chain.from_iterable(l_effective_ref_img_len))
    n_images = len(per_image_lens)
    longest_ref = max(per_image_lens)

    ref_mask = ref_image_hidden_states.new_zeros(n_images, longest_ref, dtype=torch.bool)
    ref_tokens = ref_image_hidden_states.new_zeros(n_images, longest_ref, width)
    ref_rope = hidden_states.new_zeros(
        n_images, longest_ref, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype
    )
    ref_temb = temb.new_zeros(n_images, *temb.shape[1:], dtype=temb.dtype)

    # (sample, start offset, token count) of every reference image, in batch order.
    spans = []
    for sample in range(batch_size):
        start = 0
        for n_tokens in l_effective_ref_img_len[sample]:
            spans.append((sample, start, n_tokens))
            start += n_tokens

    # Sequence layout -> one batch row per reference image.
    for row, (sample, start, n_tokens) in enumerate(spans):
        ref_mask[row, :n_tokens] = True
        ref_tokens[row, :n_tokens] = ref_image_hidden_states[sample, start:start + n_tokens]
        ref_rope[row, :n_tokens] = ref_img_rotary_emb[sample, start:start + n_tokens]
        ref_temb[row] = temb[sample]

    # Each reference image is refined on its own.
    for block in self.ref_image_refiner:
        ref_tokens = block(ref_tokens, ref_mask, ref_rope, ref_temb)

    # One batch row per reference image -> back to the sequence layout.
    for row, (sample, start, n_tokens) in enumerate(spans):
        ref_image_hidden_states[sample, start:start + n_tokens] = ref_tokens[row, :n_tokens]

    combined = hidden_states.new_zeros(batch_size, longest_combined, width)
    for sample, (n_ref, n_img) in enumerate(zip(ref_totals, l_effective_img_len)):
        combined[sample, :n_ref] = ref_image_hidden_states[sample, :n_ref]
        combined[sample, n_ref:n_ref + n_img] = hidden_states[sample, :n_img]
    return combined
def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
    """
    Patchify every image and pad the resulting token sequences to a common length.

    Each ``(C, H, W)`` image is cut into ``patch_size x patch_size`` patches and
    flattened to ``(H // p * W // p, p * p * C)`` tokens. The reference images of
    a sample are concatenated along the token axis in the order given.

    A sample without reference images may be expressed either as ``None`` or as
    an empty list; both yield ``None`` in ``ref_img_sizes`` and ``[0]`` in
    ``l_effective_ref_img_len``. (An empty list previously crashed inside
    ``torch.cat``.)

    Args:
        hidden_states: Sequence of per-sample image tensors, each ``(C, H, W)``.
        ref_image_hidden_states: ``None``, or one entry per sample holding a
            list of ``(C, H, W)`` reference tensors (or ``None`` / ``[]``).

    Returns:
        Tuple of
        ``(padded_hidden_states, padded_ref_img_hidden_states, padded_img_mask,
        padded_ref_img_mask, l_effective_ref_img_len, l_effective_img_len,
        ref_img_sizes, img_sizes)``. Masks are boolean and ``True`` on real tokens.
    """
    batch_size = len(hidden_states)
    p = self.config.patch_size
    device = hidden_states[0].device

    def patchify(img):
        # Same layout as einops 'c (h p1) (w p2) -> (h w) (p1 p2 c)'.
        # The three-way unpack also rejects inputs that are not 3-D.
        channels, height, width = img.size()
        rows, cols = height // p, width // p
        return (
            img.reshape(channels, rows, p, cols, p)
            .permute(1, 3, 2, 4, 0)
            .reshape(rows * cols, p * p * channels)
        )

    img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
    l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]

    if ref_image_hidden_states is None:
        ref_image_hidden_states = [None] * batch_size

    # None and an empty list both mean "this sample has no reference images".
    ref_img_sizes = [
        [(img.size(1), img.size(2)) for img in imgs]
        if imgs is not None and len(imgs) > 0
        else None
        for imgs in ref_image_hidden_states
    ]
    l_effective_ref_img_len = [
        [(h // p) * (w // p) for (h, w) in sizes] if sizes is not None else [0]
        for sizes in ref_img_sizes
    ]

    max_ref_img_len = max([sum(ref_lens) for ref_lens in l_effective_ref_img_len])
    max_img_len = max(l_effective_img_len)

    # Reference image tokens: all references of a sample form one sequence.
    flat_ref_img_hidden_states = [
        torch.cat([patchify(ref_img) for ref_img in ref_image_hidden_states[i]], dim=0)
        if ref_img_sizes[i] is not None
        else None
        for i in range(batch_size)
    ]

    # Noise image tokens.
    flat_hidden_states = [patchify(img) for img in hidden_states]

    patch_dim = flat_hidden_states[0].shape[-1]
    dtype = flat_hidden_states[0].dtype

    padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, patch_dim, device=device, dtype=dtype)
    padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
    for i, ref_tokens in enumerate(flat_ref_img_hidden_states):
        if ref_tokens is not None:
            n_ref = sum(l_effective_ref_img_len[i])
            padded_ref_img_hidden_states[i, :n_ref] = ref_tokens
            padded_ref_img_mask[i, :n_ref] = True

    padded_hidden_states = torch.zeros(batch_size, max_img_len, patch_dim, device=device, dtype=dtype)
    padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
    for i, tokens in enumerate(flat_hidden_states):
        padded_hidden_states[i, :l_effective_img_len[i]] = tokens
        padded_img_mask[i, :l_effective_img_len[i]] = True

    return (
        padded_hidden_states,
        padded_ref_img_hidden_states,
        padded_img_mask,
        padded_ref_img_mask,
        l_effective_ref_img_len,
        l_effective_img_len,
        ref_img_sizes,
        img_sizes,
    )
def cal_preprocessed_text_feat_dim(self, text_feature_configs: Dict[str, Any]):
    """
    Compute the feature width of the text features after layer reduction.

    Reads ``num_text_feature_layers`` (default 1, clamped to at least 1),
    ``text_feat_dim`` (default 4096) and ``reduce_type`` (default ``"concat"``)
    from ``text_feature_configs``.

    Returns:
        ``num_text_feature_layers * text_feat_dim`` when ``reduce_type``
        contains ``"cat"`` (case-insensitive), ``text_feat_dim`` when it
        contains ``"mean"``.

    Raises:
        ValueError: If ``reduce_type`` matches neither mode.
    """
    layer_count = max(text_feature_configs.get("num_text_feature_layers", 1), 1)
    per_layer_dim = text_feature_configs.get("text_feat_dim", 4096)
    reduce_type = text_feature_configs.get("reduce_type", "concat")

    mode = reduce_type.lower()
    if "cat" in mode:
        return layer_count * per_layer_dim
    if "mean" in mode:
        return per_layer_dim
    raise ValueError(f"Invalid reduce_type: {reduce_type}")
def preprocess_text_hidden_states(self, raw_text_hidden_states, text_feature_configs: Dict[str, Any]):
    """
    Reduce per-layer text features to a single tensor and validate its width.

    A tensor input is passed through unchanged. A list or tuple must contain
    exactly ``num_text_feature_layers`` tensors (default 1, clamped to at least
    1) and is reduced according to ``reduce_type`` (default ``"concat"``):
    concatenation along the last dimension when it contains ``"cat"``,
    element-wise mean over the layers when it contains ``"mean"``.

    Args:
        raw_text_hidden_states: A tensor, or a list/tuple of per-layer tensors.
        text_feature_configs: Mapping providing ``num_text_feature_layers`` and
            ``reduce_type``.

    Returns:
        The reduced tensor, whose last dimension equals
        ``self.preprocessed_text_feat_dim``.

    Raises:
        ValueError: On an unsupported input type, a wrong number of layers, an
            unknown ``reduce_type``, or a feature width that differs from
            ``self.preprocessed_text_feat_dim``. Explicit raises are used
            instead of ``assert`` so the checks survive ``python -O``.
    """
    num_text_feature_layers = max(text_feature_configs.get("num_text_feature_layers", 1), 1)
    reduce_type = text_feature_configs.get("reduce_type", "concat")

    if isinstance(raw_text_hidden_states, torch.Tensor):
        text_hidden_states = raw_text_hidden_states
    elif isinstance(raw_text_hidden_states, (list, tuple)):
        if len(raw_text_hidden_states) != num_text_feature_layers:
            raise ValueError(
                f"Expected {num_text_feature_layers} text feature layers, "
                f"but got {len(raw_text_hidden_states)}"
            )
        mode = reduce_type.lower()
        if "cat" in mode:
            text_hidden_states = torch.cat(raw_text_hidden_states, dim=-1)
        elif "mean" in mode:
            text_hidden_states = torch.mean(torch.stack(raw_text_hidden_states), dim=0)
        else:
            raise ValueError(f"Invalid reduce_type: {reduce_type}")
    else:
        raise ValueError(f"Invalid type of raw_text_hidden_states, expected torch.Tensor or list, but got {type(raw_text_hidden_states)}")

    if text_hidden_states.shape[-1] != self.preprocessed_text_feat_dim:
        raise ValueError(
            f"Text feature dim mismatch: expected {self.preprocessed_text_feat_dim}, "
            f"but got {text_hidden_states.shape[-1]}"
        )
    return text_hidden_states
def forward(
self,
hidden_states: Union[torch.Tensor, List[torch.Tensor]],
timestep: torch.Tensor,
text_hidden_states: torch.Tensor,
freqs_cis: torch.Tensor,
text_attention_mask: torch.Tensor,
ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
attention_kwargs: Optional[Dict[str, Any]] = None,
return_dict: bool = False,
) -> Union[torch.Tensor, Transformer2DModelOutput]:
"""
Forward pass combining double-stream and single-stream processing.
Processing flow:
1. Text refinement and image embedding (same as original BOOGU)
2. Double-stream processing: separate text and image streams
3. Stream fusion: combine text and image streams into joint representation
4. Single-stream processing: joint text+image processing
5. Output projection
Args:
hidden_states: Input image tensors [List[Tensor]] or [B, C, H, W]
timestep: Timestep tensor [B]
text_hidden_states: Text features [B, L_txt, text_feat_dim]; a list/tuple of
per-layer tensors is also accepted and reduced by
`preprocess_text_hidden_states`
freqs_cis: Frequency components for rotary embeddings
text_attention_mask: Text attention mask [B, L_txt]
ref_image_hidden_states: Reference image tensors (optional)
attention_kwargs: Additional attention arguments; only the optional "scale"
entry (LoRA scale, default 1.0) is read here
return_dict: Whether to return dict format
Returns:
When `return_dict` is False: the generated images, stacked into one tensor if
`hidden_states` was passed as a tensor, otherwise a list with one tensor per sample.
When `return_dict` is True: a Transformer2DModelOutput wrapping the same value.
"""
# Reduce list/tuple text features to one tensor and validate the feature width.
text_hidden_states = self.preprocess_text_hidden_states(text_hidden_states, self.text_feature_configs)
# TaylorSeer is opt-in: the attribute may be absent, in which case it is treated as disabled.
enable_taylorseer = getattr(self, 'enable_taylorseer', False)
if enable_taylorseer:
cal_type(self.cache_dic, self.current)
# Work on a copy so that popping "scale" does not mutate the caller's dict.
if attention_kwargs is not None:
attention_kwargs = attention_kwargs.copy()
lora_scale = attention_kwargs.pop("scale", 1.0)
else:
lora_scale = 1.0
if USE_PEFT_BACKEND:
# weight the lora layers by setting `lora_scale` for each PEFT layer
scale_lora_layers(self, lora_scale)
else:
# NOTE(review): "scale" has already been popped from the copy above, so this
# lookup cannot find it and the warning below looks unreachable - confirm.
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
logger.warning(
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
)
# === 1. Initial processing (same as original BOOGU) ===
batch_size = len(hidden_states)
is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)
if is_hidden_states_tensor:
assert hidden_states.ndim == 4
# Split the batched tensor into a list with one tensor per sample.
hidden_states = [_hidden_states for _hidden_states in hidden_states]
device = hidden_states[0].device
# Timestep and text embedding
temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
# Flatten and pad sequences
# From here on `hidden_states` / `ref_image_hidden_states` are the padded patch-token tensors.
(
hidden_states,
ref_image_hidden_states,
img_mask,
ref_img_mask,
l_effective_ref_img_len,
l_effective_img_len,
ref_img_sizes,
img_sizes,
) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
# Generate rotary embeddings
(
context_rotary_emb,
ref_img_rotary_emb,
noise_rotary_emb,
rotary_emb,
encoder_seq_lengths,
seq_lengths,
combined_img_rotary_emb,
combined_img_seq_lengths,
) = self.rope_embedder(
freqs_cis,
text_attention_mask,
l_effective_ref_img_len,
l_effective_img_len,
ref_img_sizes,
img_sizes,
device,
)
# === 2. Context refinement (same as original BOOGU) ===
for layer in self.context_refiner:
text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)
# Embed and refine image patches
combined_img_hidden_states = self.img_patch_embed_and_refine(
hidden_states,
ref_image_hidden_states,
img_mask,
ref_img_mask,
noise_rotary_emb,
ref_img_rotary_emb,
l_effective_ref_img_len,
l_effective_img_len,
temb,
)
# === 3. DOUBLE-STREAM PROCESSING ===
# Initialize text and image streams
txt_hidden_states = text_hidden_states # [B, L_txt, D]
img_hidden_states = combined_img_hidden_states # [B, L_img, D] - contains ref_img + noise_img
# Prepare joint attention mask for combined sequence [txt + img] (including ref_img)
# The mask is True on the first `seq_len` positions of each sample and False on padding.
max_seq_len = max(seq_lengths)
joint_attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
for i, seq_len in enumerate(seq_lengths):
joint_attention_mask[i, :seq_len] = True
# Process through double-stream layers (if any)
if self.num_double_stream_layers > 0:
# Prepare image attention mask for combined image sequence [ref_img + noise_img]
max_img_len = max(combined_img_seq_lengths)
img_attention_mask = hidden_states.new_zeros(batch_size, max_img_len, dtype=torch.bool)
for i, img_seq_len in enumerate(combined_img_seq_lengths):
img_attention_mask[i, :img_seq_len] = True
# Process through double-stream layers
for layer_idx, layer in enumerate(self.double_stream_layers):
if enable_taylorseer:
# Share the TaylorSeer bookkeeping objects with the layer and record its index.
layer.current = self.current
layer.cache_dic = self.cache_dic
layer.enable_taylorseer = True
self.current['layer'] = layer_idx
if torch.is_grad_enabled() and self.gradient_checkpointing:
img_hidden_states, txt_hidden_states = self._gradient_checkpointing_func(
layer, img_hidden_states, txt_hidden_states, img_attention_mask, joint_attention_mask,
combined_img_rotary_emb, rotary_emb, temb, encoder_seq_lengths, seq_lengths
)
else:
# Double-stream forward: returns (img_states, txt_states)
img_hidden_states, txt_hidden_states = layer(
img_hidden_states, txt_hidden_states, img_attention_mask, joint_attention_mask,
combined_img_rotary_emb, rotary_emb, temb, encoder_seq_lengths, seq_lengths
)
# === 4. STREAM FUSION: Combine text and image streams ===
# Following BOOGU's joint processing approach
# img_hidden_states already contains the processed [ref_img_tokens, noise_img_tokens]
joint_hidden_states = hidden_states.new_zeros(batch_size, max(seq_lengths), self.config.hidden_size)
for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
# Place text tokens first, then processed image tokens
joint_hidden_states[i, :encoder_seq_len] = txt_hidden_states[i, :encoder_seq_len]
joint_hidden_states[i, encoder_seq_len:seq_len] = img_hidden_states[i, :seq_len - encoder_seq_len]
# === 5. SINGLE-STREAM PROCESSING ===
# Process the joint representation through single-stream layers
hidden_states = joint_hidden_states
# TeaCache optimization (optional)
# Decides via `should_calc` whether the single-stream layers run this step or whether the
# residual stored on `teacache_params` by an earlier step is reused instead.
if self.enable_teacache and len(self.single_stream_layers) > 0:
teacache_hidden_states = hidden_states.clone()
teacache_temb = temb.clone()
# Only the first output of the first layer's norm1 (the modulated input) is used as the change signal.
modulated_inp, _, _, _ = self.single_stream_layers[0].norm1(teacache_hidden_states, teacache_temb)
if self.teacache_params.is_first_or_last_step:
should_calc = True
self.teacache_params.accumulated_rel_l1_distance = 0
else:
# Accumulate the polynomial-rescaled relative L1 change of the modulated input
# with respect to the previously stored one.
self.teacache_params.accumulated_rel_l1_distance += self.rescale_func(
((modulated_inp - self.teacache_params.previous_modulated_inp).abs().mean() \
/ self.teacache_params.previous_modulated_inp.abs().mean()).cpu().item()
)
if self.teacache_params.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
should_calc = False
else:
should_calc = True
self.teacache_params.accumulated_rel_l1_distance = 0
self.teacache_params.previous_modulated_inp = modulated_inp
else:
should_calc = True
# Process through single-stream layers
if self.enable_teacache and not should_calc:
# Skip the layers: add the cached residual in place.
hidden_states += self.teacache_params.previous_residual
else:
if enable_taylorseer:
self.current['stream'] = 'single_stream_layers'
if self.enable_teacache:
# Keep the layer input so the residual can be computed after the layers ran.
ori_hidden_states = hidden_states.clone()
for layer_idx, layer in enumerate(self.single_stream_layers):
if enable_taylorseer:
layer.current = self.current
layer.cache_dic = self.cache_dic
layer.enable_taylorseer = True
# Layer indices continue after the double-stream layers.
self.current['layer'] = self.num_double_stream_layers + layer_idx
if torch.is_grad_enabled() and self.gradient_checkpointing:
hidden_states = self._gradient_checkpointing_func(
layer, hidden_states, joint_attention_mask, rotary_emb, temb
)
else:
# Single-stream forward: standard transformer block
hidden_states = layer(hidden_states, joint_attention_mask, rotary_emb, temb)
if self.enable_teacache:
self.teacache_params.previous_residual = hidden_states - ori_hidden_states
# === 6. Output projection (same as original BOOGU) ===
hidden_states = self.norm_out(hidden_states, temb)
# Reshape output back to image format
p = self.config.patch_size
output = []
for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
height, width = img_size
# Extract image portion from joint sequence (text tokens are at the beginning)
# The noise-image tokens are the last `img_len` entries of the valid `seq_len` tokens.
img_tokens = hidden_states[i][seq_len - img_len:seq_len] # [img_len, patch_dim]
# Reshape to image: (h w) (p1 p2 c) -> c (h p1) (w p2)
img_output = rearrange(
img_tokens,
'(h w) (p1 p2 c) -> c (h p1) (w p2)',
h=height // p, w=width // p, p1=p, p2=p
)
output.append(img_output)
# Mirror the input container: tensor in -> stacked tensor out, list in -> list out.
if is_hidden_states_tensor:
output = torch.stack(output, dim=0)
# Clean up LoRA scaling
# NOTE(review): not wrapped in try/finally - an exception raised between scaling and this
# point would leave the LoRA layers scaled; confirm whether that matters for callers.
if USE_PEFT_BACKEND:
unscale_lora_layers(self, lora_scale)
# Update TaylorSeer step counter
if enable_taylorseer:
self.current['step'] += 1
if not return_dict:
return output
return Transformer2DModelOutput(sample=output)
##########################################################################################################################################
class BOOGUTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
"""
BOOGU Transformer 2D Model.
A transformer-based diffusion model for image generation with:
- Patch-based image processing
- Rotary position embeddings
- Multi-head attention
- Conditional generation support
Args:
patch_size: Size of image patches
in_channels: Number of input channels
out_channels: Number of output channels (defaults to in_channels)
hidden_size: Size of hidden layers
num_layers: Number of transformer layers
num_refiner_layers: Number of refiner layers
num_attention_heads: Number of attention heads
num_kv_heads: Number of key-value heads
multiple_of: Multiple of which the hidden dimension should be
ffn_dim_multiplier: Multiplier for feed-forward network dimension
norm_eps: Epsilon value for normalization layers
axes_dim_rope: Dimensions for rotary position embeddings
axes_lens: Lengths for rotary position embeddings
text_feat_dim: Dimension of text features
timestep_scale: Scale factor for timestep embeddings
use_fused_rms_norm: Whether to use fused RMS normalization
use_fused_swiglu: Whether to use fused SwiGLU activation
"""
_supports_gradient_checkpointing = True
_no_split_modules = ["BOOGUTransformerBlock"]
_skip_layerwise_casting_patterns = ["x_embedder", "norm"]
@register_to_config
def __init__(
    self,
    patch_size: int = 2,
    in_channels: int = 16,
    out_channels: Optional[int] = None,
    hidden_size: int = 2304,
    num_layers: int = 26,
    num_refiner_layers: int = 2,
    num_attention_heads: int = 24,
    num_kv_heads: int = 8,
    multiple_of: int = 256,
    ffn_dim_multiplier: Optional[float] = None,
    norm_eps: float = 1e-5,
    axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
    axes_lens: Tuple[int, int, int] = (300, 512, 512),
    text_feat_dim: int = 1024,
    timestep_scale: float = 1.0
) -> None:
    """
    Build the embedders, refiner stacks, main transformer stack and output head.

    Raises:
        ValueError: If ``hidden_size // num_attention_heads`` differs from
            ``sum(axes_dim_rope)``.
    """
    super().__init__()

    # The per-head width must match the total rotary dimension.
    head_dim = hidden_size // num_attention_heads
    rope_dim = sum(axes_dim_rope)
    if head_dim != rope_dim:
        raise ValueError(
            f"hidden_size // num_attention_heads ({head_dim}) "
            f"must equal sum(axes_dim_rope) ({rope_dim})"
        )

    self.out_channels = out_channels or in_channels
    patch_in_features = patch_size * patch_size * in_channels

    def make_stack(depth: int, modulation: bool) -> nn.ModuleList:
        # Every stack of this model uses identically configured blocks; only
        # the depth and the modulation flag differ.
        return nn.ModuleList([
            BOOGUTransformerBlock(
                hidden_size,
                num_attention_heads,
                num_kv_heads,
                multiple_of,
                ffn_dim_multiplier,
                norm_eps,
                modulation=modulation
            )
            for _ in range(depth)
        ])

    # 1. Positional, patch and condition embedders
    self.rope_embedder = BOOGURotaryPosEmbed(
        theta=10000,
        axes_dim=axes_dim_rope,
        axes_lens=axes_lens,
        patch_size=patch_size,
    )
    self.x_embedder = nn.Linear(in_features=patch_in_features, out_features=hidden_size)
    self.ref_image_patch_embedder = nn.Linear(in_features=patch_in_features, out_features=hidden_size)
    self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
        hidden_size=hidden_size,
        text_feat_dim=text_feat_dim,
        norm_eps=norm_eps,
        timestep_scale=timestep_scale
    )

    # 2. Refiner stacks (the text refiner is the only one without modulation)
    self.noise_refiner = make_stack(num_refiner_layers, modulation=True)
    self.ref_image_refiner = make_stack(num_refiner_layers, modulation=True)
    self.context_refiner = make_stack(num_refiner_layers, modulation=False)

    # 3. Transformer blocks
    self.layers = make_stack(num_layers, modulation=True)

    # 4. Output norm & projection
    self.norm_out = LuminaLayerNormContinuous(
        embedding_dim=hidden_size,
        conditioning_embedding_dim=min(hidden_size, 1024),
        elementwise_affine=False,
        eps=1e-6,
        bias=True,
        out_dim=patch_size * patch_size * self.out_channels
    )

    # Learnable embeddings that tell reference images apart (at most 5 per sample).
    self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size))

    self.gradient_checkpointing = False
    self.initialize_weights()

    # TeaCache settings
    self.enable_teacache = False
    self.teacache_rel_l1_thresh = 0.05
    self.teacache_params = TeaCacheParams()
    # Polynomial applied to the relative L1 change before it is accumulated.
    self.rescale_func = np.poly1d([-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487])
def initialize_weights(self) -> None:
    """
    Initialize the embedders, the output head and the image-index embedding.

    ``x_embedder`` and ``ref_image_patch_embedder`` get Xavier-uniform weights
    with zero biases; ``norm_out.linear_1`` and ``norm_out.linear_2`` are
    zeroed completely; ``image_index_embedding`` is sampled from a normal
    distribution with std 0.02.
    """
    patch_embedders = (self.x_embedder, self.ref_image_patch_embedder)
    for linear in patch_embedders:
        nn.init.xavier_uniform_(linear.weight)
        nn.init.constant_(linear.bias, 0.0)

    # Zeroed output head (weights and biases alike).
    output_linears = (self.norm_out.linear_1, self.norm_out.linear_2)
    for linear in output_linears:
        nn.init.zeros_(linear.weight)
        nn.init.zeros_(linear.bias)

    nn.init.normal_(self.image_index_embedding, std=0.02)
def img_patch_embed_and_refine(
    self,
    hidden_states,
    ref_image_hidden_states,
    padded_img_mask,
    padded_ref_img_mask,
    noise_rotary_emb,
    ref_img_rotary_emb,
    l_effective_ref_img_len,
    l_effective_img_len,
    temb
):
    """
    Embed, refine and merge the noise-image and reference-image tokens.

    The noise patches are projected by ``x_embedder`` and refined by
    ``noise_refiner``. The reference patches are projected by
    ``ref_image_patch_embedder``, offset by ``image_index_embedding[j]`` for
    the j-th reference image of a sample, regrouped so that each reference
    image forms its own batch row, refined by ``ref_image_refiner`` and
    written back. Finally each sample's reference tokens followed by its noise
    tokens are copied into one zero-padded tensor.

    Args:
        hidden_states: Padded noise-image patches, indexed ``[sample, token, feature]``.
        ref_image_hidden_states: Padded reference-image patches, same indexing.
        padded_img_mask: Mask passed to the noise refiner layers.
        padded_ref_img_mask: Part of the interface; not read by this method.
        noise_rotary_emb: Rotary embedding passed to the noise refiner layers.
        ref_img_rotary_emb: Rotary embedding of the reference tokens, sliced per image.
        l_effective_ref_img_len: Per sample, the token count of each reference image.
        l_effective_img_len: Per sample, the token count of the noise image.
        temb: Conditioning embedding, one row per sample.

    Returns:
        Tensor of shape ``(batch, max combined length, config.hidden_size)``.
    """
    batch_size = len(hidden_states)
    width = self.config.hidden_size
    ref_totals = [sum(ref_lens) for ref_lens in l_effective_ref_img_len]
    longest_combined = max([n_img + n_ref for n_img, n_ref in zip(l_effective_img_len, ref_totals)])

    hidden_states = self.x_embedder(hidden_states)
    ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)

    # Mark which reference image each token belongs to.
    for sample in range(batch_size):
        start = 0
        for img_idx, n_tokens in enumerate(l_effective_ref_img_len[sample]):
            stop = start + n_tokens
            ref_image_hidden_states[sample, start:stop, :] = (
                ref_image_hidden_states[sample, start:stop, :] + self.image_index_embedding[img_idx]
            )
            start = stop

    for block in self.noise_refiner:
        hidden_states = block(hidden_states, padded_img_mask, noise_rotary_emb, temb)

    per_image_lens = list(itertools.chain.from_iterable(l_effective_ref_img_len))
    n_images = len(per_image_lens)
    longest_ref = max(per_image_lens)

    ref_mask = ref_image_hidden_states.new_zeros(n_images, longest_ref, dtype=torch.bool)
    ref_tokens = ref_image_hidden_states.new_zeros(n_images, longest_ref, width)
    ref_rope = hidden_states.new_zeros(
        n_images, longest_ref, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype
    )
    ref_temb = temb.new_zeros(n_images, *temb.shape[1:], dtype=temb.dtype)

    # Locate every reference image once: (sample, start offset, token count).
    spans = []
    for sample in range(batch_size):
        start = 0
        for n_tokens in l_effective_ref_img_len[sample]:
            spans.append((sample, start, n_tokens))
            start += n_tokens

    # sequence of ref imgs to batch
    for row, (sample, start, n_tokens) in enumerate(spans):
        ref_mask[row, :n_tokens] = True
        ref_tokens[row, :n_tokens] = ref_image_hidden_states[sample, start:start + n_tokens]
        ref_rope[row, :n_tokens] = ref_img_rotary_emb[sample, start:start + n_tokens]
        ref_temb[row] = temb[sample]

    # refine ref imgs separately
    for block in self.ref_image_refiner:
        ref_tokens = block(ref_tokens, ref_mask, ref_rope, ref_temb)

    # batch of ref imgs to sequence
    for row, (sample, start, n_tokens) in enumerate(spans):
        ref_image_hidden_states[sample, start:start + n_tokens] = ref_tokens[row, :n_tokens]

    combined = hidden_states.new_zeros(batch_size, longest_combined, width)
    for sample, (n_ref, n_img) in enumerate(zip(ref_totals, l_effective_img_len)):
        combined[sample, :n_ref] = ref_image_hidden_states[sample, :n_ref]
        combined[sample, n_ref:n_ref + n_img] = hidden_states[sample, :n_img]
    return combined
def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
    """
    Turn per-sample images into padded patch-token batches.

    Every ``(C, H, W)`` image is split into ``patch_size x patch_size`` patches
    and flattened to ``(H // p * W // p, p * p * C)`` tokens; all reference
    images of one sample are concatenated along the token axis. Both token sets
    are then zero-padded to the longest sequence in the batch, with boolean
    masks marking the real tokens.

    A sample with no reference images may be given as ``None`` or as an empty
    list. Both produce ``None`` in ``ref_img_sizes`` and ``[0]`` in
    ``l_effective_ref_img_len``. (An empty list used to crash inside
    ``torch.cat``.)

    Args:
        hidden_states: Sequence of per-sample image tensors, each ``(C, H, W)``.
        ref_image_hidden_states: ``None``, or one entry per sample holding a
            list of ``(C, H, W)`` reference tensors (or ``None`` / ``[]``).

    Returns:
        Tuple of
        ``(padded_hidden_states, padded_ref_img_hidden_states, padded_img_mask,
        padded_ref_img_mask, l_effective_ref_img_len, l_effective_img_len,
        ref_img_sizes, img_sizes)``.
    """
    batch_size = len(hidden_states)
    p = self.config.patch_size
    device = hidden_states[0].device

    def patchify(img):
        # Same layout as einops 'c (h p1) (w p2) -> (h w) (p1 p2 c)'.
        # The three-way unpack also rejects inputs that are not 3-D.
        channels, height, width = img.size()
        rows, cols = height // p, width // p
        return (
            img.reshape(channels, rows, p, cols, p)
            .permute(1, 3, 2, 4, 0)
            .reshape(rows * cols, p * p * channels)
        )

    img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
    l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]

    if ref_image_hidden_states is None:
        ref_image_hidden_states = [None] * batch_size

    # None and an empty list both mean "this sample has no reference images".
    ref_img_sizes = [
        [(img.size(1), img.size(2)) for img in imgs]
        if imgs is not None and len(imgs) > 0
        else None
        for imgs in ref_image_hidden_states
    ]
    l_effective_ref_img_len = [
        [(h // p) * (w // p) for (h, w) in sizes] if sizes is not None else [0]
        for sizes in ref_img_sizes
    ]

    max_ref_img_len = max([sum(ref_lens) for ref_lens in l_effective_ref_img_len])
    max_img_len = max(l_effective_img_len)

    # ref image patch embeddings
    flat_ref_img_hidden_states = [
        torch.cat([patchify(ref_img) for ref_img in ref_image_hidden_states[i]], dim=0)
        if ref_img_sizes[i] is not None
        else None
        for i in range(batch_size)
    ]

    # image patch embeddings
    flat_hidden_states = [patchify(img) for img in hidden_states]

    patch_dim = flat_hidden_states[0].shape[-1]
    dtype = flat_hidden_states[0].dtype

    padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, patch_dim, device=device, dtype=dtype)
    padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
    for i, ref_tokens in enumerate(flat_ref_img_hidden_states):
        if ref_tokens is not None:
            n_ref = sum(l_effective_ref_img_len[i])
            padded_ref_img_hidden_states[i, :n_ref] = ref_tokens
            padded_ref_img_mask[i, :n_ref] = True

    padded_hidden_states = torch.zeros(batch_size, max_img_len, patch_dim, device=device, dtype=dtype)
    padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
    for i, tokens in enumerate(flat_hidden_states):
        padded_hidden_states[i, :l_effective_img_len[i]] = tokens
        padded_img_mask[i, :l_effective_img_len[i]] = True

    return (
        padded_hidden_states,
        padded_ref_img_hidden_states,
        padded_img_mask,
        padded_ref_img_mask,
        l_effective_ref_img_len,
        l_effective_img_len,
        ref_img_sizes,
        img_sizes,
    )
def forward(
    self,
    hidden_states: Union[torch.Tensor, List[torch.Tensor]],  # output_images' feature
    timestep: torch.Tensor,
    text_hidden_states: torch.Tensor,  # text' feature
    freqs_cis: torch.Tensor,
    text_attention_mask: torch.Tensor,
    ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,  # input_images' feature
    attention_kwargs: Optional[Dict[str, Any]] = None,
    return_dict: bool = False,
) -> Union[torch.Tensor, Transformer2DModelOutput]:
    """Run the transformer on text, reference-image and output-image features.

    The method (1) embeds the timestep/text and flattens + pads the image
    latents into patch sequences, (2) refines the text and image tokens,
    (3) packs text and image tokens of every sample into one joint padded
    sequence and runs the joint transformer layers (optionally short-cut by
    TeaCache or instrumented for TaylorSeer), and (4) projects the image
    tokens back and un-patchifies them.

    Args:
        hidden_states: Output-image features, either a 4-D tensor (iterated
            along dim 0 into per-sample tensors) or a list of per-sample
            tensors.
        timestep: Timestep tensor forwarded to ``self.time_caption_embed``.
        text_hidden_states: Text features forwarded to
            ``self.time_caption_embed`` and then the context refiner.
        freqs_cis: Rotary frequencies forwarded to ``self.rope_embedder``.
        text_attention_mask: Mask over the text tokens.
        ref_image_hidden_states: Optional per-sample lists of input
            (reference) image features.
        attention_kwargs: Optional kwargs; a ``"scale"`` entry is removed
            from a copy of the dict and used as the LoRA scale.
        return_dict: When true, wrap the result in
            ``Transformer2DModelOutput``.

    Returns:
        The un-patchified per-sample outputs: a stacked tensor when
        ``hidden_states`` was passed as a tensor, otherwise a list of
        tensors. Wrapped in ``Transformer2DModelOutput(sample=...)`` when
        ``return_dict`` is true.
    """
    enable_taylorseer = getattr(self, 'enable_taylorseer', False)
    if enable_taylorseer:
        cal_type(self.cache_dic, self.current)

    # Record whether the caller supplied a scale *before* popping it below;
    # checking the dict after the pop could never find the key, which made
    # the non-PEFT warning unreachable.
    scale_was_passed = attention_kwargs is not None and attention_kwargs.get("scale", None) is not None
    if attention_kwargs is not None:
        attention_kwargs = attention_kwargs.copy()
        lora_scale = attention_kwargs.pop("scale", 1.0)
    else:
        lora_scale = 1.0

    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)
    elif scale_was_passed:
        logger.warning(
            "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
        )

    # 1. Condition, positional & patch embedding
    batch_size = len(hidden_states)
    is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)
    if is_hidden_states_tensor:
        assert hidden_states.ndim == 4
        # Split the batch tensor into a list of per-sample tensors so both
        # input forms share the same code path below.
        hidden_states = list(hidden_states)

    device = hidden_states[0].device
    temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)

    (
        hidden_states,
        ref_image_hidden_states,
        img_mask,
        ref_img_mask,
        l_effective_ref_img_len,
        l_effective_img_len,
        ref_img_sizes,
        img_sizes,
    ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)

    (
        context_rotary_emb,
        ref_img_rotary_emb,
        noise_rotary_emb,
        rotary_emb,
        encoder_seq_lengths,
        seq_lengths,
    ) = self.rope_embedder(
        freqs_cis,
        text_attention_mask,
        l_effective_ref_img_len,
        l_effective_img_len,
        ref_img_sizes,
        img_sizes,
        device,
    )

    # 2. Context refinement
    for layer in self.context_refiner:
        text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)

    combined_img_hidden_states = self.img_patch_embed_and_refine(
        hidden_states,
        ref_image_hidden_states,
        img_mask,
        ref_img_mask,
        noise_rotary_emb,
        ref_img_rotary_emb,
        l_effective_ref_img_len,
        l_effective_img_len,
        temb,
    )

    # 3. Joint Transformer blocks
    # Pack each sample as [text tokens | image tokens | zero padding].
    # (In an earlier debug run the longest text had 220 tokens and every
    # image had 256 tokens, giving max_seq_len = 476.)
    max_seq_len = max(seq_lengths)
    attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
    joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
    for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
        attention_mask[i, :seq_len] = True
        joint_hidden_states[i, :encoder_seq_len] = text_hidden_states[i, :encoder_seq_len]
        joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]
    hidden_states = joint_hidden_states

    def _run_layer(layer, states):
        # Single dispatch point for the (optionally checkpointed) layer call.
        if torch.is_grad_enabled() and self.gradient_checkpointing:
            return self._gradient_checkpointing_func(layer, states, attention_mask, rotary_emb, temb)
        return layer(states, attention_mask, rotary_emb, temb)

    if self.enable_teacache:
        # TeaCache: decide from the change of the first layer's modulated
        # input whether the layer stack must be recomputed or whether the
        # residual cached from the previous computed step can be reused.
        teacache_hidden_states = hidden_states.clone()
        teacache_temb = temb.clone()
        modulated_inp, _, _, _ = self.layers[0].norm1(teacache_hidden_states, teacache_temb)
        if self.teacache_params.is_first_or_last_step:
            should_calc = True
            self.teacache_params.accumulated_rel_l1_distance = 0
        else:
            previous_inp = self.teacache_params.previous_modulated_inp
            rel_l1 = ((modulated_inp - previous_inp).abs().mean() / previous_inp.abs().mean()).cpu().item()
            self.teacache_params.accumulated_rel_l1_distance += self.rescale_func(rel_l1)
            if self.teacache_params.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
                should_calc = False
            else:
                should_calc = True
                self.teacache_params.accumulated_rel_l1_distance = 0
        self.teacache_params.previous_modulated_inp = modulated_inp

        if not should_calc:
            # Skip the layers and replay the cached residual.
            hidden_states += self.teacache_params.previous_residual
        else:
            ori_hidden_states = hidden_states.clone()
            for layer in self.layers:
                hidden_states = _run_layer(layer, hidden_states)
            self.teacache_params.previous_residual = hidden_states - ori_hidden_states
    else:
        if enable_taylorseer:
            self.current['stream'] = 'layers_stream'
        for layer_idx, layer in enumerate(self.layers):
            if enable_taylorseer:
                # Hand the shared TaylorSeer state to the layer before calling it.
                layer.current = self.current
                layer.cache_dic = self.cache_dic
                layer.enable_taylorseer = True
                self.current['layer'] = layer_idx
            hidden_states = _run_layer(layer, hidden_states)

    # 4. Output norm & projection
    hidden_states = self.norm_out(hidden_states, temb)

    p = self.config.patch_size
    output = []
    for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
        height, width = img_size
        # The output-image tokens are the last `img_len` valid tokens of
        # the sample's joint sequence.
        img_tokens = hidden_states[i][seq_len - img_len:seq_len]
        output.append(
            rearrange(
                img_tokens,
                '(h w) (p1 p2 c) -> c (h p1) (w p2)',
                h=height // p,
                w=width // p,
                p1=p,
                p2=p,
            )
        )
    if is_hidden_states_tensor:
        output = torch.stack(output, dim=0)

    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)

    if enable_taylorseer:
        self.current['step'] += 1

    if not return_dict:
        return output
    return Transformer2DModelOutput(sample=output)
###############################################################################################################################################################################################

Xet Storage Details

Size:
248 kB
·
Xet hash:
121d9333cc8de129fb789151ca7bf252e11310aea65573c4ceb2aa0046a9d998

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.