Buckets:

slida
/

test-storage

Files

xet

slida/test-storage / .cache /huggingface /modules /diffusers_modules /local /transformer_boogu.py

slida

14 days ago

download

raw

248 kB

	import warnings
	import itertools
	from typing import Any, Dict, List, Optional, Tuple, Union
	import math

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from einops import rearrange, repeat

	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.loaders import PeftAdapterMixin
	from diffusers.loaders.single_file_model import FromOriginalModelMixin
	from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
	from diffusers.models.attention_processor import Attention
	from diffusers.models.modeling_outputs import Transformer2DModelOutput
	from diffusers.models.modeling_utils import ModelMixin
	from diffusers.models.embeddings import get_1d_rotary_pos_embed
	from diffusers.models.activations import get_activation
	from diffusers.models.embeddings import Timesteps

	import importlib.util
	import sys


	#################MY####################
	from dataclasses import dataclass
	import numpy as np
	#######################################



	# The package importlib_metadata is in a different place, depending on the python version.
	if sys.version_info < (3, 8):
	import importlib_metadata
	else:
	import importlib.metadata as importlib_metadata

	def _is_package_available(pkg_name: str):
	    """Report whether `pkg_name` is importable and has installed distribution metadata.

	    Returns:
	        A `(available, version)` pair. `available` is True only when an import spec is
	        found *and* the metadata lookup succeeds; otherwise the pair is `(False, "N/A")`.
	    """
	    unavailable = (False, "N/A")

	    # No import spec at all: nothing more to check.
	    if importlib.util.find_spec(pkg_name) is None:
	        return unavailable

	    # An importable module without distribution metadata is treated as unavailable.
	    try:
	        version = importlib_metadata.version(pkg_name)
	    except (ImportError, importlib_metadata.PackageNotFoundError):
	        return unavailable

	    return True, version

	# Availability is probed once, when this module is imported; the accessors below
	# only return the cached flags.
	_triton_available, _triton_version = _is_package_available("triton")
	_flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")

	def is_triton_available() -> bool:
	    """Return the cached flag computed by `_is_package_available("triton")`."""
	    return _triton_available

	def is_flash_attn_available() -> bool:
	    """Return the cached flag computed by `_is_package_available("flash_attn")`."""
	    return _flash_attn_available

	if is_triton_available():
	# from ...ops.triton.layer_norm import RMSNorm
	import triton
	import triton.language as tl


	from typing import Callable


	def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
	    """Wrap an AMP decorator factory so callers can use one call signature.

	    Args:
	        dec: The underlying decorator factory (e.g. `custom_fwd` / `custom_bwd`).
	        cuda_amp_deprecated: When True, `device_type="cuda"` is injected into the
	            keyword arguments before `dec` is called.

	    Returns:
	        A callable that forwards all positional and keyword arguments to `dec`.
	    """
	    def decorator(*args, **kwargs):
	        # kwargs must be a real dict here: it is mutated before being forwarded.
	        if cuda_amp_deprecated:
	            kwargs["device_type"] = "cuda"
	        return dec(*args, **kwargs)
	    return decorator


	# Pick whichever custom_fwd/custom_bwd this torch build provides. `deprecated` is True
	# when the torch.amp versions exist; in that case the wrapper built by
	# custom_amp_decorator injects device_type="cuda" on every call.
	if hasattr(torch.amp, "custom_fwd"): # type: ignore[attr-defined]
	    deprecated = True
	    from torch.amp import custom_fwd, custom_bwd # type: ignore[attr-defined]
	else:
	    deprecated = False
	    from torch.cuda.amp import custom_fwd, custom_bwd

	# Rebind the names so the rest of the module uses the wrapped versions.
	custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
	custom_bwd = custom_amp_decorator(custom_bwd, deprecated)


	def triton_autotune_configs():
	    """Build the list of `triton.Config` candidates used for autotuning.

	    One config is produced per power-of-two warp count whose total thread count
	    (warp count times warp size) stays within the per-block thread limit.
	    """
	    # Treated as a fixed limit of 1024 threads per block.
	    thread_limit = 1024
	    device_props = torch.cuda.get_device_properties(torch.cuda.current_device())
	    # Fall back to 32 when the device properties do not expose a warp size.
	    threads_per_warp = getattr(device_props, "warp_size", 32)

	    warp_counts = []
	    candidate = 1
	    while candidate * threads_per_warp <= thread_limit:
	        warp_counts.append(candidate)
	        candidate *= 2

	    return [triton.Config({}, num_warps=count) for count in warp_counts]

	@triton.autotune(
	    configs=triton_autotune_configs(),
	    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
	)
	# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
	# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
	@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
	@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
	@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
	@triton.jit
	def _layer_norm_fwd_1pass_kernel(
	    X, # pointer to the input
	    Y, # pointer to the output
	    W, # pointer to the weights
	    B, # pointer to the biases
	    RESIDUAL, # pointer to the residual
	    X1,
	    W1,
	    B1,
	    Y1,
	    RESIDUAL_OUT, # pointer to the residual
	    ROWSCALE,
	    SEEDS, # Dropout seeds for each row
	    DROPOUT_MASK,
	    Mean, # pointer to the mean
	    Rstd, # pointer to the 1/std
	    stride_x_row, # how much to increase the pointer when moving by 1 row
	    stride_y_row,
	    stride_res_row,
	    stride_res_out_row,
	    stride_x1_row,
	    stride_y1_row,
	    M, # number of rows in X
	    N, # number of columns in X
	    eps, # epsilon to avoid division by zero
	    dropout_p, # Dropout probability
	    zero_centered_weight, # If true, add 1.0 to the weight
	    IS_RMS_NORM: tl.constexpr,
	    BLOCK_N: tl.constexpr,
	    HAS_RESIDUAL: tl.constexpr,
	    STORE_RESIDUAL_OUT: tl.constexpr,
	    HAS_BIAS: tl.constexpr,
	    HAS_DROPOUT: tl.constexpr,
	    STORE_DROPOUT_MASK: tl.constexpr,
	    HAS_ROWSCALE: tl.constexpr,
	    HAS_X1: tl.constexpr,
	    HAS_W1: tl.constexpr,
	    HAS_B1: tl.constexpr,
	):
	    # Forward pass for one row per program instance. The row is optionally scaled and
	    # dropped out, X1 and RESIDUAL are added, the pre-normalization sum is optionally
	    # stored to RESIDUAL_OUT, and the normalized result is written to Y (and to Y1 with
	    # the second weight/bias set when W1 is given). HAS_X1/HAS_W1/HAS_B1 are derived by
	    # the heuristics above; the remaining flags are passed by the launcher.
	    # Map the program id to the row of X and Y it should compute.
	    row = tl.program_id(0)
	    X += row * stride_x_row
	    Y += row * stride_y_row
	    if HAS_RESIDUAL:
	        RESIDUAL += row * stride_res_row
	    if STORE_RESIDUAL_OUT:
	        RESIDUAL_OUT += row * stride_res_out_row
	    if HAS_X1:
	        X1 += row * stride_x1_row
	    if HAS_W1:
	        Y1 += row * stride_y1_row
	    # Compute mean and variance
	    cols = tl.arange(0, BLOCK_N)
	    # Columns at or beyond N are padding: loaded as 0 and masked out of every store.
	    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
	    if HAS_ROWSCALE:
	        rowscale = tl.load(ROWSCALE + row).to(tl.float32)
	        x *= rowscale
	    if HAS_DROPOUT:
	        # Compute dropout mask
	        # 7 rounds is good enough, and reduces register pressure
	        keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
	        x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
	        if STORE_DROPOUT_MASK:
	            tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
	    if HAS_X1:
	        x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32)
	        # The second input reads its rowscale, seed and mask at offset M + row.
	        if HAS_ROWSCALE:
	            rowscale = tl.load(ROWSCALE + M + row).to(tl.float32)
	            x1 *= rowscale
	        if HAS_DROPOUT:
	            # Compute dropout mask
	            # 7 rounds is good enough, and reduces register pressure
	            keep_mask = (
	                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
	            )
	            x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
	            if STORE_DROPOUT_MASK:
	                tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
	        x += x1
	    if HAS_RESIDUAL:
	        residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
	        x += residual
	    if STORE_RESIDUAL_OUT:
	        tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
	    if not IS_RMS_NORM:
	        mean = tl.sum(x, axis=0) / N
	        tl.store(Mean + row, mean)
	        xbar = tl.where(cols < N, x - mean, 0.0)
	        var = tl.sum(xbar * xbar, axis=0) / N
	    else:
	        # RMS norm: no mean subtraction, and Mean is never written.
	        xbar = tl.where(cols < N, x, 0.0)
	        var = tl.sum(xbar * xbar, axis=0) / N
	    rstd = 1 / tl.sqrt(var + eps)
	    tl.store(Rstd + row, rstd)
	    # Normalize and apply linear transformation
	    mask = cols < N
	    w = tl.load(W + cols, mask=mask).to(tl.float32)
	    if zero_centered_weight:
	        w += 1.0
	    if HAS_BIAS:
	        b = tl.load(B + cols, mask=mask).to(tl.float32)
	    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
	    y = x_hat * w + b if HAS_BIAS else x_hat * w
	    # Write output
	    tl.store(Y + cols, y, mask=mask)
	    if HAS_W1:
	        # Second output: same normalized values, different weight/bias.
	        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
	        if zero_centered_weight:
	            w1 += 1.0
	        if HAS_B1:
	            b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
	        y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
	        tl.store(Y1 + cols, y1, mask=mask)


	def _layer_norm_fwd(
	    x,
	    weight,
	    bias,
	    eps,
	    residual=None,
	    x1=None,
	    weight1=None,
	    bias1=None,
	    dropout_p=0.0,
	    rowscale=None,
	    out_dtype=None,
	    residual_dtype=None,
	    zero_centered_weight=False,
	    is_rms_norm=False,
	    return_dropout_mask=False,
	    out=None,
	    residual_out=None
	):
	    """Validate inputs, allocate outputs and launch `_layer_norm_fwd_1pass_kernel`.

	    Args:
	        x: 2D input of shape (M, N) whose last dimension has stride 1.
	        weight, bias: 1D tensors of shape (N,); `bias` may be None.
	        eps: Value added to the variance before the reciprocal square root.
	        residual: Optional (M, N) tensor added to the input before normalization.
	        x1: Optional second input with the same shape as `x` (not allowed with `rowscale`).
	        weight1, bias1: Optional second weight/bias set producing a second output.
	        dropout_p: Dropout probability; per-row seeds are drawn when it is > 0.
	        rowscale: Optional contiguous tensor of shape (M,).
	        out_dtype: dtype of the allocated output when `out` is not supplied.
	        residual_dtype: dtype of the residual output; overridden by `residual.dtype`.
	        zero_centered_weight, is_rms_norm: Forwarded to the kernel.
	        return_dropout_mask: Allocate and return dropout masks when dropout is active.
	        out, residual_out: Optional preallocated outputs with the same shape as `x`.

	    Returns:
	        `(out, y1, mean, rstd, residual_out_or_x, seeds, dropout_mask, dropout_mask1)`.
	        The fifth item is `x` itself when no separate residual output was produced.

	    Raises:
	        RuntimeError: If a row does not fit in one fused block (64KB per row).
	    """
	    if residual is not None:
	        residual_dtype = residual.dtype
	    M, N = x.shape
	    assert x.stride(-1) == 1
	    if residual is not None:
	        assert residual.stride(-1) == 1
	        assert residual.shape == (M, N)
	    assert weight.shape == (N,)
	    assert weight.stride(-1) == 1
	    if bias is not None:
	        assert bias.stride(-1) == 1
	        assert bias.shape == (N,)
	    if x1 is not None:
	        assert x1.shape == x.shape
	        assert rowscale is None
	        assert x1.stride(-1) == 1
	    if weight1 is not None:
	        assert weight1.shape == (N,)
	        assert weight1.stride(-1) == 1
	    if bias1 is not None:
	        assert bias1.shape == (N,)
	        assert bias1.stride(-1) == 1
	    if rowscale is not None:
	        assert rowscale.is_contiguous()
	        assert rowscale.shape == (M,)
	    # allocate output
	    if out is None:
	        out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
	    else:
	        assert out.shape == x.shape
	    assert out.stride(-1) == 1
	    if weight1 is not None:
	        y1 = torch.empty_like(out)
	        assert y1.stride(-1) == 1
	    else:
	        y1 = None
	    # A separate residual output is only needed when the pre-norm value differs from x
	    # (in content or in dtype).
	    if (
	        residual is not None
	        or (residual_dtype is not None and residual_dtype != x.dtype)
	        or dropout_p > 0.0
	        or rowscale is not None
	        or x1 is not None
	    ):
	        if residual_out is None:
	            residual_out = torch.empty(
	                M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype
	            )
	        else:
	            assert residual_out.shape == x.shape
	        assert residual_out.stride(-1) == 1
	    else:
	        residual_out = None
	    mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
	    rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
	    if dropout_p > 0.0:
	        # One seed per row, drawn from the full 32-bit range (the kernel casts each seed
	        # to uint32). With x1 present a second set of M seeds is stored after the first.
	        seeds = torch.randint(
	            2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64
	        )
	    else:
	        seeds = None
	    if return_dropout_mask and dropout_p > 0.0:
	        dropout_mask = torch.empty(M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool)
	    else:
	        dropout_mask = None
	    # Less than 64KB per feature: enqueue fused kernel
	    MAX_FUSED_SIZE = 65536 // x.element_size()
	    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
	    if N > BLOCK_N:
	        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
	    with torch.cuda.device(x.device.index):
	        # One program per row; arguments are positional and must follow the kernel
	        # signature order.
	        _layer_norm_fwd_1pass_kernel[(M,)](
	            x,
	            out,
	            weight,
	            bias,
	            residual,
	            x1,
	            weight1,
	            bias1,
	            y1,
	            residual_out,
	            rowscale,
	            seeds,
	            dropout_mask,
	            mean,
	            rstd,
	            x.stride(0),
	            out.stride(0),
	            residual.stride(0) if residual is not None else 0,
	            residual_out.stride(0) if residual_out is not None else 0,
	            x1.stride(0) if x1 is not None else 0,
	            y1.stride(0) if y1 is not None else 0,
	            M,
	            N,
	            eps,
	            dropout_p,
	            zero_centered_weight,
	            is_rms_norm,
	            BLOCK_N,
	            residual is not None,
	            residual_out is not None,
	            bias is not None,
	            dropout_p > 0.0,
	            dropout_mask is not None,
	            rowscale is not None,
	        )
	    # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
	    if dropout_mask is not None and x1 is not None:
	        # The mask buffer holds x's rows first and x1's rows second.
	        dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
	    else:
	        dropout_mask1 = None
	    return (
	        out,
	        y1,
	        mean,
	        rstd,
	        residual_out if residual_out is not None else x,
	        seeds,
	        dropout_mask,
	        dropout_mask1,
	    )

	@triton.autotune(
	    configs=triton_autotune_configs(),
	    key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"],
	)
	# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
	# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
	# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
	@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
	@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
	@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
	@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
	@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
	@triton.jit
	def _layer_norm_bwd_kernel(
	    X, # pointer to the input
	    W, # pointer to the weights
	    B, # pointer to the biases
	    Y, # pointer to the output to be recomputed
	    DY, # pointer to the output gradient
	    DX, # pointer to the input gradient
	    DW, # pointer to the partial sum of weights gradient
	    DB, # pointer to the partial sum of biases gradient
	    DRESIDUAL,
	    W1,
	    DY1,
	    DX1,
	    DW1,
	    DB1,
	    DRESIDUAL_IN,
	    ROWSCALE,
	    SEEDS,
	    Mean, # pointer to the mean
	    Rstd, # pointer to the 1/std
	    stride_x_row, # how much to increase the pointer when moving by 1 row
	    stride_y_row,
	    stride_dy_row,
	    stride_dx_row,
	    stride_dres_row,
	    stride_dy1_row,
	    stride_dx1_row,
	    stride_dres_in_row,
	    M, # number of rows in X
	    N, # number of columns in X
	    eps, # epsilon to avoid division by zero
	    dropout_p,
	    zero_centered_weight,
	    rows_per_program,
	    IS_RMS_NORM: tl.constexpr,
	    BLOCK_N: tl.constexpr,
	    HAS_DRESIDUAL: tl.constexpr,
	    STORE_DRESIDUAL: tl.constexpr,
	    HAS_BIAS: tl.constexpr,
	    HAS_DROPOUT: tl.constexpr,
	    HAS_ROWSCALE: tl.constexpr,
	    HAS_DY1: tl.constexpr,
	    HAS_DX1: tl.constexpr,
	    HAS_B1: tl.constexpr,
	    RECOMPUTE_OUTPUT: tl.constexpr,
	):
	    # Backward pass. Each program instance walks `rows_per_program` consecutive rows,
	    # writes dx (and optionally dx1 / dresidual_in / recomputed y) per row, and
	    # accumulates weight/bias gradients that are stored once, at row `row_block_id` of
	    # the partial-sum buffers DW/DB/DW1/DB1.
	    # Map the program id to the elements of X, DX, and DY it should compute.
	    row_block_id = tl.program_id(0)
	    row_start = row_block_id * rows_per_program
	    # Do not early exit if row_start >= M, because we need to write DW and DB
	    cols = tl.arange(0, BLOCK_N)
	    mask = cols < N
	    X += row_start * stride_x_row
	    if HAS_DRESIDUAL:
	        DRESIDUAL += row_start * stride_dres_row
	    if STORE_DRESIDUAL:
	        DRESIDUAL_IN += row_start * stride_dres_in_row
	    DY += row_start * stride_dy_row
	    DX += row_start * stride_dx_row
	    if HAS_DY1:
	        DY1 += row_start * stride_dy1_row
	    if HAS_DX1:
	        DX1 += row_start * stride_dx1_row
	    if RECOMPUTE_OUTPUT:
	        Y += row_start * stride_y_row
	    w = tl.load(W + cols, mask=mask).to(tl.float32)
	    if zero_centered_weight:
	        w += 1.0
	    if RECOMPUTE_OUTPUT and HAS_BIAS:
	        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
	    if HAS_DY1:
	        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
	        if zero_centered_weight:
	            w1 += 1.0
	    # Per-program accumulators for the parameter gradients.
	    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
	    if HAS_BIAS:
	        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
	    if HAS_DY1:
	        dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
	        if HAS_B1:
	            db1 = tl.zeros((BLOCK_N,), dtype=tl.float32)
	    row_end = min((row_block_id + 1) * rows_per_program, M)
	    for row in range(row_start, row_end):
	        # Load data to SRAM
	        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
	        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
	        if HAS_DY1:
	            dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32)
	        if not IS_RMS_NORM:
	            mean = tl.load(Mean + row)
	        rstd = tl.load(Rstd + row)
	        # Compute dx
	        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
	        xhat = tl.where(mask, xhat, 0.0)
	        if RECOMPUTE_OUTPUT:
	            y = xhat * w + b if HAS_BIAS else xhat * w
	            tl.store(Y + cols, y, mask=mask)
	        wdy = w * dy
	        dw += dy * xhat
	        if HAS_BIAS:
	            db += dy
	        if HAS_DY1:
	            wdy += w1 * dy1
	            dw1 += dy1 * xhat
	            if HAS_B1:
	                db1 += dy1
	        if not IS_RMS_NORM:
	            c1 = tl.sum(xhat * wdy, axis=0) / N
	            c2 = tl.sum(wdy, axis=0) / N
	            dx = (wdy - (xhat * c1 + c2)) * rstd
	        else:
	            c1 = tl.sum(xhat * wdy, axis=0) / N
	            dx = (wdy - xhat * c1) * rstd
	        if HAS_DRESIDUAL:
	            dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
	            dx += dres
	        # Write dx
	        if STORE_DRESIDUAL:
	            # Stored before dropout/rowscale are applied to dx below.
	            tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
	        if HAS_DX1:
	            if HAS_DROPOUT:
	                # Regenerate the second input's keep mask from its seed at offset M + row.
	                keep_mask = (
	                    tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
	                )
	                dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
	            else:
	                dx1 = dx
	            tl.store(DX1 + cols, dx1, mask=mask)
	        if HAS_DROPOUT:
	            keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
	            dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
	        if HAS_ROWSCALE:
	            rowscale = tl.load(ROWSCALE + row).to(tl.float32)
	            dx *= rowscale
	        tl.store(DX + cols, dx, mask=mask)

	        # Advance every row pointer to the next row.
	        X += stride_x_row
	        if HAS_DRESIDUAL:
	            DRESIDUAL += stride_dres_row
	        if STORE_DRESIDUAL:
	            DRESIDUAL_IN += stride_dres_in_row
	        if RECOMPUTE_OUTPUT:
	            Y += stride_y_row
	        DY += stride_dy_row
	        DX += stride_dx_row
	        if HAS_DY1:
	            DY1 += stride_dy1_row
	        if HAS_DX1:
	            DX1 += stride_dx1_row
	    tl.store(DW + row_block_id * N + cols, dw, mask=mask)
	    if HAS_BIAS:
	        tl.store(DB + row_block_id * N + cols, db, mask=mask)
	    if HAS_DY1:
	        tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask)
	        if HAS_B1:
	            tl.store(DB1 + row_block_id * N + cols, db1, mask=mask)


	def _layer_norm_bwd(
	    dy,
	    x,
	    weight,
	    bias,
	    eps,
	    mean,
	    rstd,
	    dresidual=None,
	    dy1=None,
	    weight1=None,
	    bias1=None,
	    seeds=None,
	    dropout_p=0.0,
	    rowscale=None,
	    has_residual=False,
	    has_x1=False,
	    zero_centered_weight=False,
	    is_rms_norm=False,
	    x_dtype=None,
	    recompute_output=False,
	):
	    """Validate inputs, allocate gradients and launch `_layer_norm_bwd_kernel`.

	    `x` and `dy` must be 2D (M, N) tensors with unit stride in the last dimension;
	    `mean`, `rstd` and `seeds` are the values produced by the forward pass.

	    Returns:
	        `(dx, dw, db, dresidual_in, dx1, dw1, db1)`, with the recomputed output `y`
	        appended as an eighth item when `recompute_output` is True.

	    Raises:
	        RuntimeError: If a row does not fit in one fused block (64KB per row).
	    """
	    M, N = x.shape
	    assert x.stride(-1) == 1
	    assert dy.stride(-1) == 1
	    assert dy.shape == (M, N)
	    if dresidual is not None:
	        assert dresidual.stride(-1) == 1
	        assert dresidual.shape == (M, N)
	    assert weight.shape == (N,)
	    assert weight.stride(-1) == 1
	    if bias is not None:
	        assert bias.stride(-1) == 1
	        assert bias.shape == (N,)
	    if dy1 is not None:
	        assert weight1 is not None
	        assert dy1.shape == dy.shape
	        assert dy1.stride(-1) == 1
	    if weight1 is not None:
	        assert weight1.shape == (N,)
	        assert weight1.stride(-1) == 1
	    if bias1 is not None:
	        assert bias1.shape == (N,)
	        assert bias1.stride(-1) == 1
	    if seeds is not None:
	        assert seeds.is_contiguous()
	        assert seeds.shape == (M if not has_x1 else M * 2,)
	    if rowscale is not None:
	        assert rowscale.is_contiguous()
	        assert rowscale.shape == (M,)
	    # allocate output
	    dx = (
	        torch.empty_like(x)
	        if x_dtype is None
	        else torch.empty(M, N, dtype=x_dtype, device=x.device)
	    )
	    # A separate residual gradient buffer is allocated only in the cases listed here;
	    # otherwise dx is reused for it after the kernel runs (see below).
	    dresidual_in = (
	        torch.empty_like(x)
	        if has_residual
	        and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1)
	        else None
	    )
	    dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
	    y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
	    if recompute_output:
	        assert weight1 is None, "recompute_output is not supported with parallel LayerNorm"

	    # Less than 64KB per feature: enqueue fused kernel
	    MAX_FUSED_SIZE = 65536 // x.element_size()
	    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
	    if N > BLOCK_N:
	        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
	    # Increasing the multiple (e.g. 8) will allow more thread blocks to be launched and hide the
	    # latency of the gmem reads/writes, but will increase the time of summing up dw / db.
	    # NOTE: despite its name, sm_count is the multiprocessor count times 8, i.e. the
	    # number of programs launched and the number of partial-sum rows.
	    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
	    _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
	    _db = (
	        torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
	        if bias is not None
	        else None
	    )
	    _dw1 = torch.empty_like(_dw) if weight1 is not None else None
	    _db1 = torch.empty_like(_db) if bias1 is not None else None
	    rows_per_program = math.ceil(M / sm_count)
	    grid = (sm_count,)
	    with torch.cuda.device(x.device.index):
	        # Arguments are positional and must follow the kernel signature order.
	        _layer_norm_bwd_kernel[grid](
	            x,
	            weight,
	            bias,
	            y,
	            dy,
	            dx,
	            _dw,
	            _db,
	            dresidual,
	            weight1,
	            dy1,
	            dx1,
	            _dw1,
	            _db1,
	            dresidual_in,
	            rowscale,
	            seeds,
	            mean,
	            rstd,
	            x.stride(0),
	            0 if not recompute_output else y.stride(0),
	            dy.stride(0),
	            dx.stride(0),
	            dresidual.stride(0) if dresidual is not None else 0,
	            dy1.stride(0) if dy1 is not None else 0,
	            dx1.stride(0) if dx1 is not None else 0,
	            dresidual_in.stride(0) if dresidual_in is not None else 0,
	            M,
	            N,
	            eps,
	            dropout_p,
	            zero_centered_weight,
	            rows_per_program,
	            is_rms_norm,
	            BLOCK_N,
	            dresidual is not None,
	            dresidual_in is not None,
	            bias is not None,
	            dropout_p > 0.0,
	        )
	    # Reduce the per-program partial sums into the final parameter gradients.
	    dw = _dw.sum(0).to(weight.dtype)
	    db = _db.sum(0).to(bias.dtype) if bias is not None else None
	    dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
	    db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
	    # Don't need to compute dresidual_in separately in this case
	    if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
	        dresidual_in = dx
	    if has_x1 and dropout_p == 0.0:
	        dx1 = dx
	    return (
	        (dx, dw, db, dresidual_in, dx1, dw1, db1)
	        if not recompute_output
	        else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
	    )

	class LayerNormFn(torch.autograd.Function):
	    """Autograd wrapper around `_layer_norm_fwd` / `_layer_norm_bwd`.

	    The shape of the forward return value depends on `prenorm`, `weight1` and
	    `return_dropout_mask`; see the return statements in `forward`.
	    """

	    @staticmethod
	    def forward(
	        ctx,
	        x,
	        weight,
	        bias,
	        residual=None,
	        x1=None,
	        weight1=None,
	        bias1=None,
	        eps=1e-6,
	        dropout_p=0.0,
	        rowscale=None,
	        prenorm=False,
	        residual_in_fp32=False,
	        zero_centered_weight=False,
	        is_rms_norm=False,
	        return_dropout_mask=False,
	        out=None,
	        residual_out=None
	    ):
	        """Flatten the inputs to 2D, run the fused forward kernel and reshape the results.

	        Returns `y`, optionally followed by `y1` (when `weight1` is given),
	        `residual_out` (when `prenorm`), and the two dropout masks (when
	        `return_dropout_mask`), in that order.
	        """
	        x_shape_og = x.shape
	        # Check for zero sequence length
	        if x.numel() == 0:
	            # Empty input: skip the kernel launch. Only the metadata needed to build
	            # zero gradients in backward is recorded on ctx.
	            ctx.zero_seq_length = True
	            # Only save minimal required tensors for backward
	            # ctx.save_for_backward(weight, bias, weight1, bias1)
	            ctx.x_shape_og = x_shape_og
	            ctx.weight_shape = weight.shape
	            ctx.weight_dtype = weight.dtype
	            ctx.weight_device = weight.device

	            ctx.has_bias = bias is not None
	            ctx.bias_shape = bias.shape if bias is not None else None
	            ctx.bias_dtype = bias.dtype if bias is not None else None
	            ctx.bias_device = bias.device if bias is not None else None

	            ctx.has_weight1 = weight1 is not None
	            ctx.weight1_shape = weight1.shape if weight1 is not None else None
	            ctx.weight1_dtype = weight1.dtype if weight1 is not None else None
	            ctx.weight1_device = weight1.device if weight1 is not None else None

	            ctx.has_bias1 = bias1 is not None
	            ctx.bias1_shape = bias1.shape if bias1 is not None else None
	            ctx.bias1_dtype = bias1.dtype if bias1 is not None else None
	            ctx.bias1_device = bias1.device if bias1 is not None else None

	            ctx.has_residual = residual is not None
	            ctx.has_x1 = x1 is not None
	            ctx.dropout_p = dropout_p

	            # Handle output tensors with correct dtype
	            y = x # Preserve input tensor properties
	            y1 = torch.empty_like(x) if x1 is not None else None

	            # Only create residual_out if prenorm is True
	            residual_out = torch.empty(x.shape,
	                                       dtype=torch.float32 if residual_in_fp32 else x.dtype,
	                                       device=x.device) if prenorm else None

	            # Handle dropout masks
	            dropout_mask = None
	            dropout_mask1 = None
	            if return_dropout_mask:
	                dropout_mask = torch.empty_like(x, dtype=torch.uint8)
	                if x1 is not None:
	                    dropout_mask1 = torch.empty_like(x, dtype=torch.uint8)

	            # Return based on configuration
	            if not return_dropout_mask:
	                if weight1 is None:
	                    return y if not prenorm else (y, residual_out)
	                else:
	                    return (y, y1) if not prenorm else (y, y1, residual_out)
	            else:
	                if weight1 is None:
	                    return ((y, dropout_mask, dropout_mask1) if not prenorm
	                            else (y, residual_out, dropout_mask, dropout_mask1))
	                else:
	                    return ((y, y1, dropout_mask, dropout_mask1) if not prenorm
	                            else (y, y1, residual_out, dropout_mask, dropout_mask1))

	        ctx.zero_seq_length = False
	        # reshape input data into 2D tensor
	        x = x.reshape(-1, x.shape[-1])
	        if x.stride(-1) != 1:
	            x = x.contiguous()
	        if residual is not None:
	            assert residual.shape == x_shape_og
	            residual = residual.reshape(-1, residual.shape[-1])
	            if residual.stride(-1) != 1:
	                residual = residual.contiguous()
	        if x1 is not None:
	            assert x1.shape == x_shape_og
	            assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
	            x1 = x1.reshape(-1, x1.shape[-1])
	            if x1.stride(-1) != 1:
	                x1 = x1.contiguous()
	        weight = weight.contiguous()
	        if bias is not None:
	            bias = bias.contiguous()
	        if weight1 is not None:
	            weight1 = weight1.contiguous()
	        if bias1 is not None:
	            bias1 = bias1.contiguous()
	        if rowscale is not None:
	            rowscale = rowscale.reshape(-1).contiguous()
	        residual_dtype = (
	            residual.dtype
	            if residual is not None
	            else (torch.float32 if residual_in_fp32 else None)
	        )
	        if out is not None:
	            out = out.reshape(-1, out.shape[-1])
	        if residual_out is not None:
	            residual_out = residual_out.reshape(-1, residual_out.shape[-1])
	        y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
	            x,
	            weight,
	            bias,
	            eps,
	            residual,
	            x1,
	            weight1,
	            bias1,
	            dropout_p=dropout_p,
	            rowscale=rowscale,
	            residual_dtype=residual_dtype,
	            zero_centered_weight=zero_centered_weight,
	            is_rms_norm=is_rms_norm,
	            return_dropout_mask=return_dropout_mask,
	            out=out,
	            residual_out=residual_out
	        )
	        # The first saved tensor is the pre-normalization value returned by
	        # _layer_norm_fwd (x itself when no separate residual output was produced);
	        # backward unpacks it under the name `x`.
	        ctx.save_for_backward(
	            residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
	        )
	        ctx.x_shape_og = x_shape_og
	        ctx.eps = eps
	        ctx.dropout_p = dropout_p
	        ctx.is_rms_norm = is_rms_norm
	        ctx.has_residual = residual is not None
	        ctx.has_x1 = x1 is not None
	        ctx.prenorm = prenorm
	        ctx.x_dtype = x.dtype
	        ctx.zero_centered_weight = zero_centered_weight
	        # Restore the caller's original shape on everything that is returned.
	        y = y.reshape(x_shape_og)
	        y1 = y1.reshape(x_shape_og) if y1 is not None else None
	        residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None
	        dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
	        dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
	        if not return_dropout_mask:
	            if weight1 is None:
	                return y if not prenorm else (y, residual_out)
	            else:
	                return (y, y1) if not prenorm else (y, y1, residual_out)
	        else:
	            if weight1 is None:
	                return (
	                    (y, dropout_mask, dropout_mask1)
	                    if not prenorm
	                    else (y, residual_out, dropout_mask, dropout_mask1)
	                )
	            else:
	                return (
	                    (y, y1, dropout_mask, dropout_mask1)
	                    if not prenorm
	                    else (y, y1, residual_out, dropout_mask, dropout_mask1)
	                )

	    @staticmethod
	    def backward(ctx, dy, *args):
	        """Return one gradient per `forward` input (17 values).

	        The first seven are for x, weight, bias, residual, x1, weight1 and bias1; the
	        remaining ten are always None.
	        """
	        if ctx.zero_seq_length:
	            # Empty forward input: return zero-filled gradients built from the metadata
	            # recorded on ctx.
	            return (
	                torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device),
	                torch.zeros(ctx.weight_shape, dtype=ctx.weight_dtype, device=ctx.weight_device),
	                torch.zeros(ctx.bias_shape, dtype=ctx.bias_dtype, device=ctx.bias_device) if ctx.has_bias else None,
	                torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device) if ctx.has_residual else None,
	                torch.zeros(ctx.x_shape_og, dtype=dy.dtype, device=dy.device) if ctx.has_x1 and ctx.dropout_p > 0.0 else None,
	                torch.zeros(ctx.weight1_shape, dtype=ctx.weight1_dtype, device=ctx.weight1_device) if ctx.has_weight1 else None,
	                torch.zeros(ctx.bias1_shape, dtype=ctx.bias1_dtype, device=ctx.bias1_device) if ctx.has_bias1 else None,
	                None,
	                None,
	                None,
	                None,
	                None,
	                None,
	                None,
	                None,
	                None,
	                None,
	            )

	        x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
	        dy = dy.reshape(-1, dy.shape[-1])
	        if dy.stride(-1) != 1:
	            dy = dy.contiguous()
	        assert dy.shape == x.shape
	        # Extra incoming gradients arrive in forward's output order: dy1 first (only
	        # when weight1 was given), then the residual gradient (only when prenorm).
	        if weight1 is not None:
	            dy1, args = args[0], args[1:]
	            dy1 = dy1.reshape(-1, dy1.shape[-1])
	            if dy1.stride(-1) != 1:
	                dy1 = dy1.contiguous()
	            assert dy1.shape == x.shape
	        else:
	            dy1 = None
	        if ctx.prenorm:
	            dresidual = args[0]
	            dresidual = dresidual.reshape(-1, dresidual.shape[-1])
	            if dresidual.stride(-1) != 1:
	                dresidual = dresidual.contiguous()
	            assert dresidual.shape == x.shape
	        else:
	            dresidual = None

	        dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
	            dy,
	            x,
	            weight,
	            bias,
	            ctx.eps,
	            mean,
	            rstd,
	            dresidual,
	            dy1,
	            weight1,
	            bias1,
	            seeds,
	            ctx.dropout_p,
	            rowscale,
	            ctx.has_residual,
	            ctx.has_x1,
	            ctx.zero_centered_weight,
	            ctx.is_rms_norm,
	            x_dtype=ctx.x_dtype,
	        )
	        return (
	            dx.reshape(ctx.x_shape_og),
	            dw,
	            db,
	            dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
	            dx1.reshape(ctx.x_shape_og) if dx1 is not None else None,
	            dw1,
	            db1,
	            None,
	            None,
	            None,
	            None,
	            None,
	            None,
	            None,
	            None,
	            None,
	            None,
	        )

	def rms_norm_fn(
	    x,
	    weight,
	    bias,
	    residual=None,
	    x1=None,
	    weight1=None,
	    bias1=None,
	    eps=1e-6,
	    dropout_p=0.0,
	    rowscale=None,
	    prenorm=False,
	    residual_in_fp32=False,
	    zero_centered_weight=False,
	    return_dropout_mask=False,
	    out=None,
	    residual_out=None
	):
	    """Run `LayerNormFn` in RMS-norm mode.

	    Every argument is forwarded unchanged; the only value added here is
	    `is_rms_norm=True`. The return value is whatever `LayerNormFn.apply` returns.
	    """
	    # Positional order follows LayerNormFn.forward's signature (after ctx).
	    forwarded = (
	        x, weight, bias,
	        residual, x1, weight1, bias1,
	        eps, dropout_p, rowscale,
	        prenorm, residual_in_fp32, zero_centered_weight,
	        True,  # is_rms_norm
	        return_dropout_mask, out, residual_out,
	    )
	    return LayerNormFn.apply(*forwarded)

	class RMSNorm(torch.nn.Module):
	    """RMS normalization module that delegates its forward pass to `rms_norm_fn`.

	    Holds a learnable `weight` of size `hidden_size` and registers `bias` as None.
	    With `zero_centered_weight=True` the weight is initialised to zeros, otherwise
	    to ones.
	    """

	    def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, zero_centered_weight=False,
	                 device=None, dtype=None):
	        super().__init__()
	        self.eps = eps
	        # A Dropout module is kept only as a carrier for `p`; see forward().
	        self.drop = torch.nn.Dropout(dropout_p) if dropout_p > 0.0 else None
	        self.zero_centered_weight = zero_centered_weight
	        self.weight = torch.nn.Parameter(
	            torch.empty(hidden_size, device=device, dtype=dtype)
	        )
	        self.register_parameter("bias", None)
	        self.reset_parameters()

	    def reset_parameters(self):
	        """Initialise `weight` to zeros (zero-centered mode) or ones."""
	        if self.zero_centered_weight:
	            initializer = torch.nn.init.zeros_
	        else:
	            initializer = torch.nn.init.ones_
	        initializer(self.weight)

	    def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
	        """Normalize `x` via `rms_norm_fn`; dropout is applied only in training mode."""
	        use_dropout = self.drop is not None and self.training
	        active_dropout_p = self.drop.p if use_dropout else 0.0
	        return rms_norm_fn(
	            x,
	            self.weight,
	            self.bias,
	            residual=residual,
	            eps=self.eps,
	            dropout_p=active_dropout_p,
	            prenorm=prenorm,
	            residual_in_fp32=residual_in_fp32,
	            zero_centered_weight=self.zero_centered_weight,
	        )
	else:
	from torch.nn import RMSNorm
	warnings.warn("Cannot import triton, install triton to use fused RMSNorm for better performance")

	def swiglu(x, y):
	    """Return `silu(x) * y`, with the SiLU evaluated in float32 and cast back to `x.dtype`."""
	    gate = F.silu(x.float(), inplace=False)
	    return gate.to(x.dtype) * y

	logger = logging.get_logger(__name__)


	class TimestepEmbedding(nn.Module):
	    """Two-layer MLP applied to a timestep embedding.

	    `forward` computes `linear_2(act(linear_1(sample)))`, optionally adding a projected
	    `condition` to the input first and applying `post_act` to the result.

	    Args:
	        in_channels: Input feature size of `linear_1`.
	        time_embed_dim: Output size of `linear_1` / input size of `linear_2`.
	        act_fn: Name passed to `get_activation` for the hidden activation.
	        out_dim: Output size of `linear_2`; defaults to `time_embed_dim` when None.
	        post_act_fn: Optional activation name applied after `linear_2`.
	        cond_proj_dim: When given, a bias-free projection from this size to
	            `in_channels` is created for the `condition` input.
	        sample_proj_bias: Whether the two linear layers have a bias term.
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        time_embed_dim: int,
	        act_fn: str = "silu",
	        out_dim: Optional[int] = None,
	        post_act_fn: Optional[str] = None,
	        cond_proj_dim=None,
	        sample_proj_bias=True,
	    ):
	        super().__init__()

	        self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)

	        if cond_proj_dim is not None:
	            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
	        else:
	            self.cond_proj = None

	        self.act = get_activation(act_fn)

	        if out_dim is not None:
	            time_embed_dim_out = out_dim
	        else:
	            time_embed_dim_out = time_embed_dim
	        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)

	        if post_act_fn is None:
	            self.post_act = None
	        else:
	            self.post_act = get_activation(post_act_fn)

	        self.initialize_weights()

	    def initialize_weights(self):
	        """Initialise both linear layers: normal(std=0.02) weights and zero biases."""
	        for layer in (self.linear_1, self.linear_2):
	            nn.init.normal_(layer.weight, std=0.02)
	            # bias is None when the layer was built with sample_proj_bias=False.
	            if layer.bias is not None:
	                nn.init.zeros_(layer.bias)

	    def forward(self, sample, condition=None):
	        """Embed `sample`; `condition`, when given, is projected and added first."""
	        if condition is not None:
	            sample = sample + self.cond_proj(condition)
	        sample = self.linear_1(sample)

	        if self.act is not None:
	            sample = self.act(sample)

	        sample = self.linear_2(sample)

	        if self.post_act is not None:
	            sample = self.post_act(sample)
	        return sample

	def apply_rotary_emb(
	    x: torch.Tensor,
	    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
	    use_real: bool = True,
	    use_real_unbind_dim: int = -1,
	) -> torch.Tensor:
	    """
	    Apply rotary position embeddings to a single query or key tensor.

	    Args:
	        x (`torch.Tensor`):
	            Query or key tensor to rotate. Indexed as a 4D tensor; the last dimension
	            must be even because it is split into (real, imaginary) pairs.
	        freqs_cis (`Tuple[torch.Tensor]` or `torch.Tensor`):
	            When `use_real` is True, a `(cos, sin)` pair; each gets two leading
	            singleton dimensions added before broadcasting against `x`. When
	            `use_real` is False, a complex tensor that is unsqueezed at dim 2 and
	            multiplied with the complex view of `x`.
	        use_real (`bool`):
	            Select the real-valued cos/sin formulation (True) or the complex
	            multiplication formulation (False).
	        use_real_unbind_dim (`int`):
	            Layout of the (real, imaginary) pairs in the last dimension of `x` when
	            `use_real` is True: `-1` for interleaved pairs, `-2` for two contiguous
	            halves.

	    Returns:
	        `torch.Tensor`: The rotated tensor, with the same dtype as `x`.

	    Raises:
	        ValueError: If `use_real` is True and `use_real_unbind_dim` is not -1 or -2.
	    """
	    if use_real:
	        cos, sin = freqs_cis # [S, D]
	        cos = cos[None, None]
	        sin = sin[None, None]
	        cos, sin = cos.to(x.device), sin.to(x.device)

	        if use_real_unbind_dim == -1:
	            # Used for flux, cogvideox, hunyuan-dit
	            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
	            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
	        elif use_real_unbind_dim == -2:
	            # Used for Stable Audio, OmniGen and CogView4
	            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
	            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
	        else:
	            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

	        # Rotate in float32, then cast back to the input dtype.
	        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

	        return out
	    else:
	        # used for lumina
	        # x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
	        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], x.shape[-1] // 2, 2))
	        freqs_cis = freqs_cis.unsqueeze(2)
	        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

	        return x_out.type_as(x)



	class BOOGURotaryPosEmbed(nn.Module):
	    """Multi-axis rotary position embeddings for packed caption / reference-image / image tokens.

	    Every token receives one integer position id per entry of ``axes_dim``.
	    Caption tokens use their running index on all axes; image tokens use a
	    per-image offset on axis 0 and their patch row / column on axes 1 and 2.
	    The ids are then used to look up rows of per-axis frequency tables.

	    Args:
	        theta: Stored on the module; ``get_freqs_cis`` takes its own ``theta``.
	        axes_dim: Rotary dimension of each axis. Its length sets how many
	            axes ``_get_freqs_cis`` reads.
	        axes_lens: Number of positions per axis (stored on the module).
	        patch_size: Patch side length; image sizes are integer-divided by it
	            to obtain the token grid.
	    """

	    def __init__(self, theta: int,
	                 axes_dim: Tuple[int, int, int],
	                 axes_lens: Tuple[int, int, int] = (300, 512, 512),
	                 patch_size: int = 2):
	        super().__init__()
	        self.theta = theta
	        self.axes_dim = axes_dim
	        self.axes_lens = axes_lens
	        self.patch_size = patch_size

	    @staticmethod
	    def get_freqs_cis(axes_dim: Tuple[int, int, int],
	                      axes_lens: Tuple[int, int, int],
	                      theta: int) -> List[torch.Tensor]:
	        """Return one ``get_1d_rotary_pos_embed`` table per (axis dim, axis length) pair."""
	        # float32 is selected whenever an MPS backend is available
	        # (presumably because MPS lacks float64 - confirm); float64 otherwise.
	        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
	        return [
	            get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype)
	            for d, e in zip(axes_dim, axes_lens)
	        ]

	    def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
	        """Look up the per-axis table rows selected by ``ids`` and concatenate them.

	        Args:
	            freqs_cis: Sequence of per-axis tables; table ``a`` is indexed along
	                its first dimension by ``ids[..., a]``.
	            ids: Integer position ids of shape ``[B, S, num_axes]``.

	        Returns:
	            Tensor of shape ``[B, S, sum(table.shape[-1])]`` on ``ids``' original device.
	        """
	        device = ids.device
	        if ids.device.type == "mps":
	            # The lookup is done on CPU for MPS inputs and moved back at the end.
	            ids = ids.to("cpu")

	        per_axis = []
	        for axis in range(len(self.axes_dim)):
	            table = freqs_cis[axis].to(ids.device)
	            # Direct row lookup: same result as gathering from a batch-replicated
	            # table, without materialising the replicated table or index.
	            per_axis.append(table[ids[:, :, axis].to(torch.int64)])
	        return torch.cat(per_axis, dim=-1).to(device)

	    @staticmethod
	    def _grid_ids(h_tokens: int, w_tokens: int, device) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Return flattened row-major (row_ids, col_ids) for an ``h_tokens x w_tokens`` grid."""
	        rows = torch.arange(h_tokens, dtype=torch.int32, device=device)
	        cols = torch.arange(w_tokens, dtype=torch.int32, device=device)
	        row_ids = rows.unsqueeze(1).expand(h_tokens, w_tokens).reshape(-1)
	        col_ids = cols.unsqueeze(0).expand(h_tokens, w_tokens).reshape(-1)
	        return row_ids, col_ids

	    def forward(
	        self,
	        freqs_cis,
	        attention_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	        device
	    ):
	        """Build position ids for every sample and return the matching rotary tables.

	        Args:
	            freqs_cis: Per-axis frequency tables (see ``_get_freqs_cis``).
	            attention_mask: Caption mask ``[B, L_cap]``; each row sum is that
	                sample's caption length.
	            l_effective_ref_img_len: Per sample, a list of token counts, one per
	                reference image.
	            l_effective_img_len: Per sample, the token count of the target image.
	            ref_img_sizes: Per sample, ``None`` or a list of ``(H, W)`` sizes.
	            img_sizes: Per sample, the ``(H, W)`` size of the target image.
	            device: Device on which ids and outputs are created.

	        Returns:
	            ``(cap_freqs_cis, ref_img_freqs_cis, img_freqs_cis, freqs_cis,
	            l_effective_cap_len, seq_lengths)``. The first three are zero-padded
	            per-segment slices of the joint ``freqs_cis``.
	        """
	        batch_size = len(attention_mask)
	        p = self.patch_size

	        encoder_seq_len = attention_mask.shape[1]
	        l_effective_cap_len = attention_mask.sum(dim=1).tolist()
	        # Total reference-image tokens per sample, computed once and reused below.
	        ref_totals = [sum(ref_lens) for ref_lens in l_effective_ref_img_len]

	        seq_lengths = [
	            cap_len + ref_total + img_len
	            for cap_len, ref_total, img_len in zip(l_effective_cap_len, ref_totals, l_effective_img_len)
	        ]

	        max_seq_len = max(seq_lengths)
	        max_ref_img_len = max(ref_totals)
	        max_img_len = max(l_effective_img_len)

	        # Position ids default to 0, which is what padded slots keep.
	        position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)

	        for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
	            # Caption tokens: the same running index on all three axes.
	            position_ids[i, :cap_seq_len] = torch.arange(
	                cap_seq_len, dtype=torch.int32, device=device
	            ).unsqueeze(-1)

	            pe_shift = cap_seq_len      # axis-0 id given to the next image
	            pe_shift_len = cap_seq_len  # sequence offset of the next image

	            if ref_img_sizes[i] is not None:
	                for (H, W), ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
	                    ref_H_tokens, ref_W_tokens = H // p, W // p
	                    assert ref_H_tokens * ref_W_tokens == ref_img_len, \
	                        "reference image size does not match its token count"

	                    row_ids, col_ids = self._grid_ids(ref_H_tokens, ref_W_tokens, device)
	                    span = slice(pe_shift_len, pe_shift_len + ref_img_len)
	                    position_ids[i, span, 0] = pe_shift
	                    position_ids[i, span, 1] = row_ids
	                    position_ids[i, span, 2] = col_ids

	                    # Advance axis 0 by the larger grid side of this image.
	                    pe_shift += max(ref_H_tokens, ref_W_tokens)
	                    pe_shift_len += ref_img_len

	            H, W = img_sizes[i]
	            H_tokens, W_tokens = H // p, W // p
	            assert H_tokens * W_tokens == l_effective_img_len[i], \
	                "image size does not match its token count"

	            row_ids, col_ids = self._grid_ids(H_tokens, W_tokens, device)

	            assert pe_shift_len + l_effective_img_len[i] == seq_len, \
	                "caption + reference + image tokens do not add up to seq_len"
	            position_ids[i, pe_shift_len:seq_len, 0] = pe_shift
	            position_ids[i, pe_shift_len:seq_len, 1] = row_ids
	            position_ids[i, pe_shift_len:seq_len, 2] = col_ids

	        # Joint rotary embeddings for the packed sequence.
	        freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)

	        # Zero-padded per-segment copies.
	        feat = freqs_cis.shape[-1]
	        cap_freqs_cis = torch.zeros(batch_size, encoder_seq_len, feat, device=device, dtype=freqs_cis.dtype)
	        ref_img_freqs_cis = torch.zeros(batch_size, max_ref_img_len, feat, device=device, dtype=freqs_cis.dtype)
	        img_freqs_cis = torch.zeros(batch_size, max_img_len, feat, device=device, dtype=freqs_cis.dtype)

	        for i, (cap_seq_len, ref_total, img_len) in enumerate(
	            zip(l_effective_cap_len, ref_totals, l_effective_img_len)
	        ):
	            img_start = cap_seq_len + ref_total
	            cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
	            ref_img_freqs_cis[i, :ref_total] = freqs_cis[i, cap_seq_len:img_start]
	            img_freqs_cis[i, :img_len] = freqs_cis[i, img_start:img_start + img_len]

	        return (
	            cap_freqs_cis,
	            ref_img_freqs_cis,
	            img_freqs_cis,
	            freqs_cis,
	            l_effective_cap_len,
	            seq_lengths,
	        )

	###################################################################my double stream block#######################################################################
	class BOOGUDoubleStreamRotaryPosEmbed(nn.Module):
	    """Multi-axis rotary position embeddings that also expose a combined image stream.

	    Position ids are assigned as follows: caption tokens use their running
	    index on all axes; image tokens use a per-image offset on axis 0 and their
	    patch row / column on axes 1 and 2. In addition to the caption,
	    reference-image and image slices, ``forward`` returns the rotary table of
	    the reference-image + image tokens laid out back to back.

	    Args:
	        theta: Stored on the module; ``get_freqs_cis`` takes its own ``theta``.
	        axes_dim: Rotary dimension of each axis. Its length sets how many
	            axes ``_get_freqs_cis`` reads.
	        axes_lens: Number of positions per axis (stored on the module).
	        patch_size: Patch side length; image sizes are integer-divided by it
	            to obtain the token grid.
	    """

	    def __init__(self, theta: int,
	                 axes_dim: Tuple[int, int, int],
	                 axes_lens: Tuple[int, int, int] = (300, 512, 512),
	                 patch_size: int = 2):
	        super().__init__()
	        self.theta = theta
	        self.axes_dim = axes_dim
	        self.axes_lens = axes_lens
	        self.patch_size = patch_size

	    @staticmethod
	    def get_freqs_cis(axes_dim: Tuple[int, int, int],
	                      axes_lens: Tuple[int, int, int],
	                      theta: int) -> List[torch.Tensor]:
	        """Return one ``get_1d_rotary_pos_embed`` table per (axis dim, axis length) pair."""
	        # float32 is selected whenever an MPS backend is available
	        # (presumably because MPS lacks float64 - confirm); float64 otherwise.
	        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
	        return [
	            get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype)
	            for d, e in zip(axes_dim, axes_lens)
	        ]

	    def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
	        """Look up the per-axis table rows selected by ``ids`` and concatenate them.

	        Args:
	            freqs_cis: Sequence of per-axis tables; table ``a`` is indexed along
	                its first dimension by ``ids[..., a]``.
	            ids: Integer position ids of shape ``[B, S, num_axes]``.

	        Returns:
	            Tensor of shape ``[B, S, sum(table.shape[-1])]`` on ``ids``' original device.
	        """
	        device = ids.device
	        if ids.device.type == "mps":
	            # The lookup is done on CPU for MPS inputs and moved back at the end.
	            ids = ids.to("cpu")

	        per_axis = []
	        for axis in range(len(self.axes_dim)):
	            table = freqs_cis[axis].to(ids.device)
	            # Direct row lookup: same result as gathering from a batch-replicated
	            # table, without materialising the replicated table or index.
	            per_axis.append(table[ids[:, :, axis].to(torch.int64)])
	        return torch.cat(per_axis, dim=-1).to(device)

	    @staticmethod
	    def _grid_ids(h_tokens: int, w_tokens: int, device) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Return flattened row-major (row_ids, col_ids) for an ``h_tokens x w_tokens`` grid."""
	        rows = torch.arange(h_tokens, dtype=torch.int32, device=device)
	        cols = torch.arange(w_tokens, dtype=torch.int32, device=device)
	        row_ids = rows.unsqueeze(1).expand(h_tokens, w_tokens).reshape(-1)
	        col_ids = cols.unsqueeze(0).expand(h_tokens, w_tokens).reshape(-1)
	        return row_ids, col_ids

	    def forward(
	        self,
	        freqs_cis,
	        attention_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	        device
	    ):
	        """Build position ids for every sample and return the matching rotary tables.

	        Args:
	            freqs_cis: Per-axis frequency tables (see ``_get_freqs_cis``).
	            attention_mask: Caption mask ``[B, L_cap]``; each row sum is that
	                sample's caption length.
	            l_effective_ref_img_len: Per sample, a list of token counts, one per
	                reference image.
	            l_effective_img_len: Per sample, the token count of the target image.
	            ref_img_sizes: Per sample, ``None`` or a list of ``(H, W)`` sizes.
	            img_sizes: Per sample, the ``(H, W)`` size of the target image.
	            device: Device on which ids and outputs are created.

	        Returns:
	            ``(cap_freqs_cis, ref_img_freqs_cis, img_freqs_cis, freqs_cis,
	            l_effective_cap_len, seq_lengths, combined_img_freqs_cis,
	            combined_img_seq_lengths)``. All ``*_freqs_cis`` except the joint
	            ``freqs_cis`` are zero-padded slices of it; the combined one holds
	            reference-image tokens followed by image tokens.
	        """
	        batch_size = len(attention_mask)
	        p = self.patch_size

	        encoder_seq_len = attention_mask.shape[1]
	        l_effective_cap_len = attention_mask.sum(dim=1).tolist()
	        # Total reference-image tokens per sample, computed once and reused below.
	        ref_totals = [sum(ref_lens) for ref_lens in l_effective_ref_img_len]

	        seq_lengths = [
	            cap_len + ref_total + img_len
	            for cap_len, ref_total, img_len in zip(l_effective_cap_len, ref_totals, l_effective_img_len)
	        ]

	        max_seq_len = max(seq_lengths)
	        max_ref_img_len = max(ref_totals)
	        max_img_len = max(l_effective_img_len)

	        # Position ids default to 0, which is what padded slots keep.
	        position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)

	        for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
	            # Caption tokens: the same running index on all three axes.
	            position_ids[i, :cap_seq_len] = torch.arange(
	                cap_seq_len, dtype=torch.int32, device=device
	            ).unsqueeze(-1)

	            pe_shift = cap_seq_len      # axis-0 id given to the next image
	            pe_shift_len = cap_seq_len  # sequence offset of the next image

	            if ref_img_sizes[i] is not None:
	                for (H, W), ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
	                    ref_H_tokens, ref_W_tokens = H // p, W // p
	                    assert ref_H_tokens * ref_W_tokens == ref_img_len, \
	                        "reference image size does not match its token count"

	                    row_ids, col_ids = self._grid_ids(ref_H_tokens, ref_W_tokens, device)
	                    span = slice(pe_shift_len, pe_shift_len + ref_img_len)
	                    position_ids[i, span, 0] = pe_shift
	                    position_ids[i, span, 1] = row_ids
	                    position_ids[i, span, 2] = col_ids

	                    # Advance axis 0 by the larger grid side of this image.
	                    pe_shift += max(ref_H_tokens, ref_W_tokens)
	                    pe_shift_len += ref_img_len

	            H, W = img_sizes[i]
	            H_tokens, W_tokens = H // p, W // p
	            assert H_tokens * W_tokens == l_effective_img_len[i], \
	                "image size does not match its token count"

	            row_ids, col_ids = self._grid_ids(H_tokens, W_tokens, device)

	            assert pe_shift_len + l_effective_img_len[i] == seq_len, \
	                "caption + reference + image tokens do not add up to seq_len"
	            position_ids[i, pe_shift_len:seq_len, 0] = pe_shift
	            position_ids[i, pe_shift_len:seq_len, 1] = row_ids
	            position_ids[i, pe_shift_len:seq_len, 2] = col_ids

	        # Joint rotary embeddings for the packed sequence.
	        freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)

	        # Combined image stream length per sample: reference tokens + image tokens.
	        combined_img_seq_lengths = [
	            ref_total + img_len for ref_total, img_len in zip(ref_totals, l_effective_img_len)
	        ]
	        max_combined_img_len = max(combined_img_seq_lengths)

	        # Zero-padded per-segment copies.
	        feat = freqs_cis.shape[-1]
	        cap_freqs_cis = torch.zeros(batch_size, encoder_seq_len, feat, device=device, dtype=freqs_cis.dtype)
	        ref_img_freqs_cis = torch.zeros(batch_size, max_ref_img_len, feat, device=device, dtype=freqs_cis.dtype)
	        img_freqs_cis = torch.zeros(batch_size, max_img_len, feat, device=device, dtype=freqs_cis.dtype)
	        combined_img_freqs_cis = torch.zeros(
	            batch_size, max_combined_img_len, feat, device=device, dtype=freqs_cis.dtype
	        )

	        for i, (cap_seq_len, ref_total, img_len) in enumerate(
	            zip(l_effective_cap_len, ref_totals, l_effective_img_len)
	        ):
	            img_start = cap_seq_len + ref_total
	            ref_slice = freqs_cis[i, cap_seq_len:img_start]
	            img_slice = freqs_cis[i, img_start:img_start + img_len]

	            cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
	            ref_img_freqs_cis[i, :ref_total] = ref_slice
	            img_freqs_cis[i, :img_len] = img_slice

	            # Combined stream keeps the packed order: reference tokens, then image tokens.
	            combined_img_freqs_cis[i, :ref_total] = ref_slice
	            combined_img_freqs_cis[i, ref_total:ref_total + img_len] = img_slice

	        return (
	            cap_freqs_cis,
	            ref_img_freqs_cis,
	            img_freqs_cis,
	            freqs_cis,
	            l_effective_cap_len,
	            seq_lengths,
	            combined_img_freqs_cis,
	            combined_img_seq_lengths,
	        )



	class BOOGUPromptTuningRotaryPosEmbed(nn.Module):
	    """
	    Rotary position embedding for prompt-tuning tokens.

	    The prompt tokens get plain sequential positions ``0 .. num_tokens - 1``
	    from a single 1D rotary table of fixed length ``num_trainable_prompt_tokens``.

	    Args:
	        theta: Base frequency passed to ``get_1d_rotary_pos_embed``.
	        dim: Rotary dimension passed to ``get_1d_rotary_pos_embed``.
	        num_trainable_prompt_tokens: Number of trainable prompt tokens, i.e. the
	            length of the generated table.
	    """

	    def __init__(self, theta: int, dim: int , num_trainable_prompt_tokens: int):
	        super().__init__()
	        self.theta = theta
	        self.num_trainable_prompt_tokens = num_trainable_prompt_tokens
	        self.dim = dim

	    def forward(self, batch_size: int, device: torch.device, use_causal_mask: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
	        """
	        Generate rotary position embeddings and an attention mask for the prompt tokens.

	        Args:
	            batch_size: Batch size the outputs are expanded to.
	            device: Target device for the returned tensors.
	            use_causal_mask: If True, return a lower-triangular mask; otherwise
	                an all-True per-token mask.

	        Returns:
	            Tuple of (rotary_embeddings, attention_mask)
	            - rotary_embeddings: [B, num_tokens, ...] - the table returned by
	              ``get_1d_rotary_pos_embed`` expanded over the batch (presumably
	              ``dim // 2`` complex features per token - confirm against diffusers).
	            - attention_mask: bool, [B, num_tokens] when ``use_causal_mask`` is
	              False, [B, num_tokens, num_tokens] when it is True.
	        """
	        num_tokens = self.num_trainable_prompt_tokens

	        # float32 is selected whenever an MPS backend is available; float64 otherwise.
	        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64

	        # The table is requested for exactly num_tokens positions, so it is used
	        # whole; no per-position gather is needed.
	        text_freqs_cis = get_1d_rotary_pos_embed(
	            self.dim,
	            num_tokens,
	            theta=self.theta,
	            freqs_dtype=freqs_dtype
	        )

	        # Broadcast over the batch and move to the target device.
	        rotary_emb = text_freqs_cis.unsqueeze(0).expand(batch_size, -1, -1).to(device)

	        if use_causal_mask:
	            # Lower-triangular mask: entry [i, j] is True iff j <= i.
	            causal_mask = torch.tril(torch.ones(
	                num_tokens, num_tokens,
	                dtype=torch.bool, device=device
	            ))
	            attention_mask = causal_mask.unsqueeze(0).expand(batch_size, -1, -1)
	        else:
	            # Non-causal: every prompt token is marked valid.
	            attention_mask = torch.ones(
	                batch_size, num_tokens,
	                dtype=torch.bool, device=device
	            )

	        return rotary_emb, attention_mask



	##########################################################################################################################################




	class LuminaRMSNormZero(nn.Module):
	    """
	    Adaptive RMS normalisation driven by a conditioning embedding.

	    A SiLU + linear projection of the conditioning vector is split into four
	    equal chunks: a scale for the normalised input plus three values that are
	    handed back to the caller untouched.

	    Parameters:
	        embedding_dim (`int`): Size of each embedding vector.
	        norm_eps (`float`): Epsilon forwarded to `RMSNorm`.
	        norm_elementwise_affine (`bool`): Accepted but not used by this layer.
	    """

	    def __init__(
	        self,
	        embedding_dim: int,
	        norm_eps: float,
	        norm_elementwise_affine: bool,
	    ):
	        super().__init__()
	        # The conditioning vector is expected to have at most 1024 features.
	        conditioning_dim = min(embedding_dim, 1024)
	        self.silu = nn.SiLU()
	        self.linear = nn.Linear(conditioning_dim, 4 * embedding_dim, bias=True)
	        self.norm = RMSNorm(embedding_dim, eps=norm_eps)

	    def forward(
	        self,
	        x: torch.Tensor,
	        emb: Optional[torch.Tensor] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	        """Return ``(norm(x) * (1 + scale_msa), gate_msa, scale_mlp, gate_mlp)``."""
	        modulation = self.linear(self.silu(emb))
	        scale_msa, gate_msa, scale_mlp, gate_mlp = torch.chunk(modulation, 4, dim=1)
	        # Insert a length-1 axis at dim 1 so the scale broadcasts against x.
	        modulated = self.norm(x) * (1 + scale_msa.unsqueeze(1))
	        return modulated, gate_msa, scale_mlp, gate_mlp


	class LuminaLayerNormContinuous(nn.Module):
	def __init__(
	self,
	embedding_dim: int,
	conditioning_embedding_dim: int,
	# NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
	# because the output is immediately scaled and shifted by the projected conditioning embeddings.
	# Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
	# However, this is how it was implemented in the original code, and it's rather likely you should
	# set `elementwise_affine` to False.
	elementwise_affine=True,
	eps=1e-5,
	bias=True,
	norm_type="layer_norm",
	out_dim: Optional[int] = None,
	):
	super().__init__()

	# AdaLN
	self.silu = nn.SiLU()
	self.linear_1 = nn.Linear(conditioning_embedding_dim, embedding_dim, bias=bias)

	if norm_type == "layer_norm":
	self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias)
	elif norm_type == "rms_norm":
	self.norm = RMSNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
	else:
	raise ValueError(f"unknown norm_type {norm_type}")

	self.linear_2 = None
	if out_dim is not None:
	self.linear_2 = nn.Linear(embedding_dim, out_dim, bias=bias)

	def forward(
	self,
	x: torch.Tensor,
	conditioning_embedding: torch.Tensor,
	) -> torch.Tensor:
	# convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT)
	emb = self.linear_1(self.silu(conditioning_embedding).to(x.dtype))
	scale = emb
	x = self.norm(x) * (1 + scale)[:, None, :]

	if self.linear_2 is not None:
	x = self.linear_2(x)

	return x


	class LuminaFeedForward(nn.Module):
	    r"""
	    Gated feed-forward layer: ``linear_2(swiglu(linear_1(x), linear_3(x)))``.

	    Parameters:
	        dim (`int`):
	            Input and output feature size of the layer.
	        inner_dim (`int`):
	            Requested hidden size, before the multiplier and rounding below.
	        multiple_of (`int`, optional):
	            The hidden size is rounded up to a multiple of this value.
	        ffn_dim_multiplier (float, optional):
	            If given, `inner_dim` is first scaled by this factor (and truncated
	            to an int). Defaults to None.
	    """

	    def __init__(
	        self,
	        dim: int,
	        inner_dim: int,
	        multiple_of: Optional[int] = 256,
	        ffn_dim_multiplier: Optional[float] = None,
	    ):
	        super().__init__()

	        self.swiglu = swiglu

	        # Optional custom scaling of the hidden size.
	        hidden_dim = inner_dim if ffn_dim_multiplier is None else int(ffn_dim_multiplier * inner_dim)
	        # Round up to the next multiple of `multiple_of`.
	        num_units = (hidden_dim + multiple_of - 1) // multiple_of
	        hidden_dim = multiple_of * num_units

	        self.linear_1 = nn.Linear(dim, hidden_dim, bias=False)
	        self.linear_2 = nn.Linear(hidden_dim, dim, bias=False)
	        self.linear_3 = nn.Linear(dim, hidden_dim, bias=False)

	    def forward(self, x):
	        """Apply both input projections, combine them with ``swiglu``, project back."""
	        first = self.linear_1(x)
	        second = self.linear_3(x)
	        return self.linear_2(self.swiglu(first, second))


	class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
	    """Embed the diffusion timestep and the caption / instruction features.

	    Args:
	        hidden_size: Output size of the caption embedding; the timestep
	            embedding size is ``min(hidden_size, 1024)``.
	        text_feat_dim: Feature size of the incoming instruction hidden states.
	        frequency_embedding_size: Number of channels of the sinusoidal
	            timestep projection.
	        norm_eps: Epsilon of the RMSNorm applied to the caption features.
	        timestep_scale: ``scale`` argument forwarded to ``Timesteps``.
	    """

	    def __init__(
	        self,
	        hidden_size: int = 4096,
	        text_feat_dim: int = 2048,
	        frequency_embedding_size: int = 256,
	        norm_eps: float = 1e-5,
	        timestep_scale: float = 1.0,
	    ) -> None:
	        super().__init__()

	        self.time_proj = Timesteps(
	            num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=timestep_scale
	        )

	        self.timestep_embedder = TimestepEmbedding(
	            in_channels=frequency_embedding_size, time_embed_dim=min(hidden_size, 1024)
	        )

	        # NOTE: the former unconditional debug print of `text_feat_dim` was
	        # removed; it wrote to stdout on every construction.
	        self.caption_embedder = nn.Sequential(
	            RMSNorm(text_feat_dim, eps=norm_eps),
	            nn.Linear(text_feat_dim, hidden_size, bias=True),
	        )

	        self._initialize_weights()

	    def _initialize_weights(self):
	        """Truncated-normal weights (std 0.02) and zero bias for the caption projection."""
	        nn.init.trunc_normal_(self.caption_embedder[1].weight, std=0.02)
	        nn.init.zeros_(self.caption_embedder[1].bias)

	    def forward(
	        self, timestep: torch.Tensor, instruction_hidden_states: torch.Tensor, dtype: torch.dtype
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Return ``(time_embed, caption_embed)``.

	        Args:
	            timestep: Timestep tensor fed to the sinusoidal projection.
	            instruction_hidden_states: Caption / instruction features.
	            dtype: Dtype the timestep projection is cast to before embedding.
	        """
	        timestep_proj = self.time_proj(timestep).to(dtype=dtype)
	        time_embed = self.timestep_embedder(timestep_proj)
	        caption_embed = self.caption_embedder(instruction_hidden_states)
	        return time_embed, caption_embed




	### default AttnProcessor
	# class OmniGen2AttnProcessor:
	# """
	# Processor for implementing scaled dot-product attention.

	# This processor is optimized for PyTorch 2.0 and implements:
	# - Flash attention with variable length sequences
	# - Rotary position embeddings (RoPE)
	# - Query-Key normalization
	# - Proportional attention scaling

	# Args:
	# None

	# Raises:
	# ImportError: If PyTorch version is less than 2.0
	# """

	# def __init__(self) -> None:
	# """Initialize the attention processor."""
	# if not hasattr(F, "scaled_dot_product_attention"):
	# raise ImportError(
	# "OmniGen2AttnProcessorFlash2Varlen requires PyTorch 2.0. "
	# "Please upgrade PyTorch to version 2.0 or later."
	# )

	# def __call__(
	# self,
	# attn: Attention,
	# hidden_states: torch.Tensor,
	# encoder_hidden_states: torch.Tensor,
	# attention_mask: Optional[torch.Tensor] = None,
	# image_rotary_emb: Optional[torch.Tensor] = None,
	# base_sequence_length: Optional[int] = None,
	# ) -> torch.Tensor:
	# """
	# Process attention computation with flash attention.

	# Args:
	# attn: Attention module
	# hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
	# encoder_hidden_states: Encoder hidden states tensor
	# attention_mask: Optional attention mask tensor
	# image_rotary_emb: Optional rotary embeddings for image tokens
	# base_sequence_length: Optional base sequence length for proportional attention

	# Returns:
	# torch.Tensor: Processed hidden states after attention computation
	# """
	# batch_size, sequence_length, _ = hidden_states.shape

	# # Get Query-Key-Value Pair
	# query = attn.to_q(hidden_states)
	# key = attn.to_k(encoder_hidden_states)
	# value = attn.to_v(encoder_hidden_states)

	# query_dim = query.shape[-1]
	# inner_dim = key.shape[-1]
	# head_dim = query_dim // attn.heads
	# dtype = query.dtype

	# # Get key-value heads
	# kv_heads = inner_dim // head_dim

	# # Reshape tensors for attention computation
	# query = query.view(batch_size, -1, attn.heads, head_dim)
	# key = key.view(batch_size, -1, kv_heads, head_dim)
	# value = value.view(batch_size, -1, kv_heads, head_dim)

	# # Apply Query-Key normalization
	# if attn.norm_q is not None:
	# query = attn.norm_q(query)
	# if attn.norm_k is not None:
	# key = attn.norm_k(key)

	# # Apply Rotary Position Embeddings
	# if image_rotary_emb is not None:
	# query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
	# key = apply_rotary_emb(key, image_rotary_emb, use_real=False)

	# query, key = query.to(dtype), key.to(dtype)

	# # Calculate attention scale
	# if base_sequence_length is not None:
	# softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
	# else:
	# softmax_scale = attn.scale

	# # scaled_dot_product_attention expects attention_mask shape to be
	# # (batch, heads, source_length, target_length)
	# if attention_mask is not None:
	# attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)

	# query = query.transpose(1, 2)
	# key = key.transpose(1, 2)
	# value = value.transpose(1, 2)

	# # explicitly repeat key and value to match query length, otherwise using enable_gqa=True results in MATH backend of sdpa in our test of pytorch2.6
	# key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
	# value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)

	# hidden_states = F.scaled_dot_product_attention(
	# query, key, value, attn_mask=attention_mask, scale=softmax_scale
	# )
	# hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
	# hidden_states = hidden_states.type_as(query)

	# # Apply output projection
	# hidden_states = attn.to_out[0](hidden_states)
	# hidden_states = attn.to_out[1](hidden_states)

	# return hidden_states


	#####################################################################my Attention Processor######################################################################################################


	####################debug############################
	from webdataset.utils import pytorch_worker_info
	#####################################################


	class BOOGUDoubleStreamSelfAttnProcessorFlash2Varlen(nn.Module):
	"""
	Double-stream self-attention processor with flash attention and variable length sequences.

	This processor implements YAK-style double-stream attention where:
	- Text and image features are processed separately to generate QKV
	- QKV are concatenated and processed together for cross-modal attention
	- Uses flash attention for efficient computation
	- Supports both standard and causal attention masks

	Args:
	head_dim: Dimension of each attention head
	num_attention_heads: Number of attention heads for queries
	num_kv_heads: Number of key-value heads
	qkv_bias: Whether to use bias in QKV linear layers
	"""

	def __init__(self, head_dim: int, num_attention_heads: int, num_kv_heads: int, qkv_bias: bool = False) -> None:
	    """Create the per-stream projection layers and initialise their weights.

	    Raises:
	        ImportError: If ``is_flash_attn_available()`` reports flash_attn missing.
	    """
	    super().__init__()
	    if not is_flash_attn_available():
	        raise ImportError(
	            "BOOGUDoubleStreamSelfAttnProcessorFlash2Varlen requires flash_attn. "
	            "Please install flash_attn."
	        )

	    self.head_dim = head_dim
	    self.num_attention_heads = num_attention_heads
	    self.num_kv_heads = num_kv_heads

	    query_dim = head_dim * num_attention_heads
	    kv_dim = head_dim * num_kv_heads

	    # (attribute name, output width). Queries and the two output projections
	    # use query_dim; keys and values use kv_dim. Every layer takes query_dim
	    # inputs. The order is kept stable so parameter registration and random
	    # initialisation happen in the same sequence as before.
	    projection_specs = (
	        ("img_to_q", query_dim),
	        ("img_to_k", kv_dim),
	        ("img_to_v", kv_dim),
	        ("txt_to_q", query_dim),
	        ("txt_to_k", kv_dim),
	        ("txt_to_v", kv_dim),
	        ("txt_out", query_dim),
	        ("img_out", query_dim),
	    )
	    for attr_name, out_features in projection_specs:
	        setattr(self, attr_name, nn.Linear(query_dim, out_features, bias=qkv_bias))

	    self.initialize_weights()
	# ########################################debug###############################################
	# rank, world_size, worker, num_workers = pytorch_worker_info(None)
	# print(f"#######################init rank: {rank} : #self.img_to_q: {self.img_to_q.weight.sum(dim=-1)}################################")
	# ############################################################################################


	def initialize_weights(self) -> None:
	    """
	    Re-initialise every projection layer of the processor.

	    Weights get Xavier-uniform initialisation. Biases are zeroed when the
	    layers were built with a bias (checked on ``img_to_q``).
	    """
	    projections = (
	        self.img_to_q,
	        self.img_to_k,
	        self.img_to_v,
	        self.txt_to_q,
	        self.txt_to_k,
	        self.txt_to_v,
	        self.txt_out,
	        self.img_out,
	    )

	    for proj in projections:
	        nn.init.xavier_uniform_(proj.weight)

	    # All layers share the same bias setting, so one check covers them.
	    if self.img_to_q.bias is None:
	        return
	    for proj in projections:
	        nn.init.zeros_(proj.bias)

	def _upad_input(
	    self,
	    query_layer: torch.Tensor,
	    key_layer: torch.Tensor,
	    value_layer: torch.Tensor,
	    attention_mask: torch.Tensor,
	    query_length: int,
	    num_heads: int,
	) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
	    """
	    Remove padded positions from query / key / value ahead of a varlen attention call.

	    Args:
	        query_layer: Query tensor; reshaped to ``[B * L, num_heads, head_dim]``
	            when ``query_length`` equals the key length.
	        key_layer: Key tensor of shape ``[B, L_kv, num_kv_heads, head_dim]``.
	        value_layer: Value tensor, reshaped the same way as ``key_layer``.
	        attention_mask: Mask whose non-zero entries mark positions to keep.
	        query_length: Query sequence length; selects one of three cases below.
	        num_heads: Number of query heads.

	    Returns:
	        ``(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k),
	        (max_seqlen_q, max_seqlen_k))`` with the padded positions dropped.
	    """
	    def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
	        """Return (flat indices of kept tokens, cumulative sequence lengths, longest sequence)."""
	        seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
	        indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
	        max_seqlen_in_batch = seqlens_in_batch.max().item()
	        # Prepend 0 so entry b is the start offset of sample b.
	        cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
	        return indices, cu_seqlens, max_seqlen_in_batch

	    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
	    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

	    # Keys and values: flatten batch and sequence, then keep only unmasked rows.
	    key_layer = index_first_axis(
	        key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
	        indices_k,
	    )
	    value_layer = index_first_axis(
	        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
	        indices_k,
	    )

	    if query_length == kv_seq_len:
	        # Queries share the key layout, so the key indices are reused.
	        query_layer = index_first_axis(
	            query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
	            indices_k,
	        )
	        cu_seqlens_q = cu_seqlens_k
	        max_seqlen_in_batch_q = max_seqlen_in_batch_k
	        indices_q = indices_k
	    elif query_length == 1:
	        # One query token per sample: every sequence has length 1.
	        max_seqlen_in_batch_q = 1
	        cu_seqlens_q = torch.arange(
	            batch_size + 1, dtype=torch.int32, device=query_layer.device
	        )
	        indices_q = cu_seqlens_q[:-1]
	        query_layer = query_layer.squeeze(1)
	    else:
	        # Queries cover only the last `query_length` positions of the mask.
	        attention_mask = attention_mask[:, -query_length:]
	        # Some flash_attn releases return a fifth value from unpad_input;
	        # the star target accepts both the 4- and the 5-tuple form.
	        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(
	            query_layer, attention_mask
	        )

	    return (
	        query_layer,
	        key_layer,
	        value_layer,
	        indices_q,
	        (cu_seqlens_q, cu_seqlens_k),
	        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
	    )

	def _concat_text_image_features(
	    self,
	    img_hidden_states_list: List[torch.Tensor],
	    txt_hidden_states_list: List[torch.Tensor],
	    encoder_seq_lengths: List[int],
	    seq_lengths: List[int],
	) -> List[torch.Tensor]:
	    """
	    Pack text and image features into one zero-padded sequence per sample (text first).

	    Args:
	        img_hidden_states_list: Image tensors, e.g. [img_query, img_key, img_value]
	        txt_hidden_states_list: Matching text tensors, e.g. [txt_query, txt_key, txt_value]
	        encoder_seq_lengths: Number of valid text tokens per sample [B]
	        seq_lengths: Text + image token count per sample [B]

	    Returns:
	        One packed tensor of shape [B, max(seq_lengths), feature_dim] per input pair
	    """
	    assert len(img_hidden_states_list) == len(txt_hidden_states_list), \
	        f"Length mismatch: img_list={len(img_hidden_states_list)}, txt_list={len(txt_hidden_states_list)}"

	    num_samples = img_hidden_states_list[0].shape[0]
	    padded_len = max(seq_lengths)
	    spans = list(zip(encoder_seq_lengths, seq_lengths))

	    packed = []
	    for img_feats, txt_feats in zip(img_hidden_states_list, txt_hidden_states_list):
	        # Bring the text stream onto the image stream's device if they differ.
	        if txt_feats.device != img_feats.device:
	            txt_feats = txt_feats.to(img_feats.device)

	        # Output inherits dtype/device from the image tensor; padding stays zero.
	        joint = img_feats.new_zeros(num_samples, padded_len, img_feats.shape[-1])

	        for row, (txt_len, total_len) in enumerate(spans):
	            joint[row, :txt_len] = txt_feats[row, :txt_len]
	            joint[row, txt_len:total_len] = img_feats[row, :total_len - txt_len]

	        packed.append(joint)

	    return packed


	def _split_text_image_features(
	    self,
	    hidden_states_list: List[torch.Tensor],
	    encoder_seq_lengths: List[int],
	    seq_lengths: List[int],
	) -> List[Tuple[torch.Tensor, torch.Tensor]]:
	    """
	    Undo `_concat_text_image_features`: cut each packed tensor into text and image parts.

	    Args:
	        hidden_states_list: Packed tensors [B, L, feature_dim] (usually a single one)
	        encoder_seq_lengths: Number of valid text tokens per sample [B]
	        seq_lengths: Text + image token count per sample [B]

	    Returns:
	        One (txt_hidden_states, img_hidden_states) tuple per input tensor, each
	        zero-padded to the longest text / image length in the batch
	    """
	    outputs = []

	    for joint in hidden_states_list:
	        num_samples = joint.shape[0]
	        width = joint.shape[-1]

	        # Padded lengths of the two output streams.
	        txt_capacity = max(encoder_seq_lengths)
	        img_capacity = max(total - txt for total, txt in zip(seq_lengths, encoder_seq_lengths))

	        txt_part = joint.new_zeros(num_samples, txt_capacity, width)
	        img_part = joint.new_zeros(num_samples, img_capacity, width)

	        for row, (txt_len, total_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	            # Text occupies [0, txt_len); image occupies [txt_len, total_len).
	            txt_part[row, :txt_len] = joint[row, :txt_len]
	            img_part[row, :total_len - txt_len] = joint[row, txt_len:total_len]

	        outputs.append((txt_part, img_part))

	    return outputs

	def __call__(
	    self,
	    attn: Attention,
	    img_hidden_states: torch.Tensor,
	    txt_hidden_states: torch.Tensor,
	    joint_attention_mask: Optional[torch.Tensor] = None,
	    rotary_emb: Optional[torch.Tensor] = None,
	    encoder_seq_lengths: Optional[List[int]] = None,  # [B] - Text sequence lengths for each sample
	    seq_lengths: Optional[List[int]] = None,  # [B] - Total sequence lengths for each sample
	    base_sequence_length: Optional[int] = None,
	) -> torch.Tensor:
	    """
	    Process double-stream self-attention computation with flash attention.

	    Image and text tokens are projected to Q/K/V by separate layers, packed per sample as
	    [text tokens, image tokens], attended jointly with ``flash_attn_varlen_func``, split again
	    for the separate ``txt_out`` / ``img_out`` projections, re-packed, and finally passed
	    through ``attn.to_out``.

	    Args:
	        attn: Attention module (provides heads, norm_q/norm_k, scale and to_out)
	        img_hidden_states: Image hidden states tensor [B, L_img, D]
	        txt_hidden_states: Text hidden states tensor [B, L_txt, D]
	        joint_attention_mask: Mask for the packed sequence. A 3-D mask whose first sample equals a
	            lower-triangular matrix of ones switches flash attention to causal mode; the mask is
	            otherwise forwarded to ``self._upad_input`` for unpadding
	        rotary_emb: Rotary embeddings for the joint sequence
	        encoder_seq_lengths: Text sequence lengths for each sample [B] (required)
	        seq_lengths: Total sequence lengths for each sample [B] (required)
	        base_sequence_length: Optional base sequence length for proportional attention

	    Returns:
	        torch.Tensor: Packed hidden states [B, max(seq_lengths), D'] after attention and output
	        projections

	    Raises:
	        ValueError: If ``encoder_seq_lengths`` or ``seq_lengths`` is not provided
	    """
	    # Both length lists are needed to pack/unpack the two streams; fail early with a clear
	    # message instead of an opaque TypeError from max(None) further down.
	    if encoder_seq_lengths is None or seq_lengths is None:
	        raise ValueError(
	            "`encoder_seq_lengths` and `seq_lengths` are required to pack the text and image streams."
	        )

	    batch_size = img_hidden_states.shape[0]

	    # Ensure the per-stream linear layers are on the same device as the input tensors.
	    # NOTE(review): these are presumably nn.Linear modules, for which .to() moves the
	    # parameters in place, so the return value does not need to be kept.
	    device = img_hidden_states.device
	    for layer in (
	        self.img_to_q, self.img_to_k, self.img_to_v,
	        self.txt_to_q, self.txt_to_k, self.txt_to_v,
	        self.txt_out, self.img_out,
	    ):
	        if layer.weight.device != device:
	            layer.to(device)

	    # Generate Q, K, V for image and text streams (NO head reshaping yet)
	    img_list = [
	        self.img_to_q(img_hidden_states),  # [B, L_img, query_dim]
	        self.img_to_k(img_hidden_states),  # [B, L_img, kv_dim]
	        self.img_to_v(img_hidden_states),  # [B, L_img, kv_dim]
	    ]
	    txt_list = [
	        self.txt_to_q(txt_hidden_states),  # [B, L_txt, query_dim]
	        self.txt_to_k(txt_hidden_states),  # [B, L_txt, kv_dim]
	        self.txt_to_v(txt_hidden_states),  # [B, L_txt, kv_dim]
	    ]

	    # Pack per sample as text first, then image -> [B, max_seq_len, feature_dim] each
	    query, key, value = self._concat_text_image_features(img_list, txt_list, encoder_seq_lengths, seq_lengths)

	    # From here, follow the same logic as BOOGUAttnProcessorFlash2Varlen
	    sequence_length = max(seq_lengths)

	    query_dim = query.shape[-1]
	    inner_dim = key.shape[-1]
	    head_dim = query_dim // attn.heads
	    dtype = query.dtype

	    # Key/value may use fewer heads than the query (grouped-query attention)
	    kv_heads = inner_dim // head_dim

	    # Reshape tensors for attention computation
	    query = query.view(batch_size, -1, attn.heads, head_dim)
	    key = key.view(batch_size, -1, kv_heads, head_dim)
	    value = value.view(batch_size, -1, kv_heads, head_dim)

	    # Apply Query-Key normalization
	    if attn.norm_q is not None:
	        query = attn.norm_q(query)
	    if attn.norm_k is not None:
	        key = attn.norm_k(key)

	    # Apply Rotary Position Embeddings
	    if rotary_emb is not None:
	        query = apply_rotary_emb(query, rotary_emb, use_real=False)
	        key = apply_rotary_emb(key, rotary_emb, use_real=False)

	    # Normalization / RoPE may have changed the dtype; restore the projection dtype
	    query, key = query.to(dtype), key.to(dtype)

	    # Proportional attention: scale by sqrt(log_base(sequence_length)) when a base length is given
	    if base_sequence_length is not None:
	        softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
	    else:
	        softmax_scale = attn.scale

	    # Detect if we have a causal mask
	    is_causal = False
	    if joint_attention_mask is not None and joint_attention_mask.dim() == 3:
	        # Check if it's a lower triangular causal mask
	        # For efficiency, we only check the first sample
	        mask_sample = joint_attention_mask[0]  # [seq_len, seq_len]
	        is_causal = torch.allclose(mask_sample, torch.tril(torch.ones_like(mask_sample)))

	    # Unpad input for flash attention
	    (
	        query_states,
	        key_states,
	        value_states,
	        indices_q,
	        cu_seq_lens,
	        max_seq_lens,
	    ) = self._upad_input(query, key, value, joint_attention_mask, sequence_length, attn.heads)

	    cu_seqlens_q, cu_seqlens_k = cu_seq_lens
	    max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

	    # Expand key/value heads so they match the number of query heads
	    if kv_heads < attn.heads:
	        key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
	        value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)

	    # Apply flash attention with causal parameter
	    attn_output_unpad = flash_attn_varlen_func(
	        query_states,
	        key_states,
	        value_states,
	        cu_seqlens_q=cu_seqlens_q,
	        cu_seqlens_k=cu_seqlens_k,
	        max_seqlen_q=max_seqlen_in_batch_q,
	        max_seqlen_k=max_seqlen_in_batch_k,
	        dropout_p=0.0,
	        causal=is_causal,  # Use detected causal setting
	        softmax_scale=softmax_scale,
	    )

	    # Pad output back to [B, sequence_length, heads, head_dim] and merge the head dimensions
	    hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
	    hidden_states = hidden_states.flatten(-2)
	    hidden_states = hidden_states.type_as(query)

	    # Split hidden_states back to text and image so each stream gets its own output projection
	    split_results = self._split_text_image_features([hidden_states], encoder_seq_lengths, seq_lengths)
	    txt_hidden_states, img_hidden_states = split_results[0]  # [B, max_txt_len, D'], [B, max_img_len, D']

	    txt_projected = self.txt_out(txt_hidden_states)  # [B, max_txt_len, D']
	    img_projected = self.img_out(img_hidden_states)  # [B, max_img_len, D']

	    # Merge back to joint representation
	    merged_list = self._concat_text_image_features([img_projected], [txt_projected], encoder_seq_lengths, seq_lengths)
	    hidden_states = merged_list[0]  # [B, max_seq_len, D']

	    # Apply final output projection
	    hidden_states = attn.to_out[0](hidden_states)
	    hidden_states = attn.to_out[1](hidden_states)

	    return hidden_states




	class BOOGUDoubleStreamSelfAttnProcessor(nn.Module):
	"""
	Double-stream self-attention processor without flash attention.

	This processor implements YAK-style double-stream attention where:
	- Text and image features are processed separately to generate QKV
	- QKV are concatenated and processed together for cross-modal attention
	- Uses PyTorch's scaled_dot_product_attention for computation
	- Supports both standard and causal attention masks

	Args:
	head_dim: Dimension of each attention head
	num_attention_heads: Number of attention heads for queries
	num_kv_heads: Number of key-value heads
	qkv_bias: Whether to use bias in QKV linear layers
	"""

	def __init__(self, head_dim: int, num_attention_heads: int, num_kv_heads: int, qkv_bias: bool = False) -> None:
	"""Initialize the double-stream attention processor."""
	super().__init__()
	if not hasattr(F, "scaled_dot_product_attention"):
	raise ImportError(
	"BOOGUDoubleStreamSelfAttnProcessor requires PyTorch 2.0. "
	"Please upgrade PyTorch to version 2.0 or later."
	)

	# Calculate dimensions
	self.head_dim = head_dim
	self.num_attention_heads = num_attention_heads
	self.num_kv_heads = num_kv_heads

	query_dim = head_dim * num_attention_heads
	kv_dim = head_dim * num_kv_heads

	# Initialize separate Q, K, V linear layers for text and image
	# Query uses num_attention_heads, Key/Value use num_kv_heads
	self.img_to_q = nn.Linear(query_dim, query_dim, bias=qkv_bias)
	self.img_to_k = nn.Linear(query_dim, kv_dim, bias=qkv_bias)
	self.img_to_v = nn.Linear(query_dim, kv_dim, bias=qkv_bias)

	self.txt_to_q = nn.Linear(query_dim, query_dim, bias=qkv_bias)
	self.txt_to_k = nn.Linear(query_dim, kv_dim, bias=qkv_bias)
	self.txt_to_v = nn.Linear(query_dim, kv_dim, bias=qkv_bias)

	# Additional output projection layers for text and image streams
	self.txt_out = nn.Linear(query_dim, query_dim, bias=qkv_bias)
	self.img_out = nn.Linear(query_dim, query_dim, bias=qkv_bias)

	# Initialize weights
	self.initialize_weights()

	def initialize_weights(self) -> None:
	"""
	Initialize the weights of the double-stream attention processor.

	Uses Xavier uniform initialization for linear layers and zero initialization for biases.
	"""
	# Initialize image stream QKV projection layers
	nn.init.xavier_uniform_(self.img_to_q.weight)
	nn.init.xavier_uniform_(self.img_to_k.weight)
	nn.init.xavier_uniform_(self.img_to_v.weight)

	# Initialize text stream QKV projection layers
	nn.init.xavier_uniform_(self.txt_to_q.weight)
	nn.init.xavier_uniform_(self.txt_to_k.weight)
	nn.init.xavier_uniform_(self.txt_to_v.weight)

	# Initialize separate output projection layers
	nn.init.xavier_uniform_(self.txt_out.weight)
	nn.init.xavier_uniform_(self.img_out.weight)

	# Initialize biases if they exist
	if self.img_to_q.bias is not None:
	nn.init.zeros_(self.img_to_q.bias)
	nn.init.zeros_(self.img_to_k.bias)
	nn.init.zeros_(self.img_to_v.bias)
	nn.init.zeros_(self.txt_to_q.bias)
	nn.init.zeros_(self.txt_to_k.bias)
	nn.init.zeros_(self.txt_to_v.bias)
	nn.init.zeros_(self.txt_out.bias)
	nn.init.zeros_(self.img_out.bias)

	def _concat_text_image_features(
	self,
	img_hidden_states_list: List[torch.Tensor],
	txt_hidden_states_list: List[torch.Tensor],
	encoder_seq_lengths: List[int],
	seq_lengths: List[int],
	) -> List[torch.Tensor]:
	"""
	Concatenate text and image features following YAK's logic (text first, then image).

	Args:
	img_hidden_states_list: List of image tensors [img_query, img_key, img_value]
	txt_hidden_states_list: List of text tensors [txt_query, txt_key, txt_value]
	encoder_seq_lengths: Text sequence lengths for each sample [B]
	seq_lengths: Total sequence lengths for each sample [B]

	Returns:
	List of concatenated tensors [query, key, value]
	"""
	assert len(img_hidden_states_list) == len(txt_hidden_states_list), \
	f"Length mismatch: img_list={len(img_hidden_states_list)}, txt_list={len(txt_hidden_states_list)}"

	batch_size = img_hidden_states_list[0].shape[0]
	max_seq_len = max(seq_lengths)

	concatenated_list = []

	for img_tensor, txt_tensor in zip(img_hidden_states_list, txt_hidden_states_list):
	# Ensure tensors are on the same device
	device = img_tensor.device
	if txt_tensor.device != device:
	txt_tensor = txt_tensor.to(device)

	# Create output tensor with proper shape [B, max_seq_len, feature_dim]
	feature_dim = img_tensor.shape[-1]
	concatenated = img_tensor.new_zeros(batch_size, max_seq_len, feature_dim)

	# Concatenate text first, then image for each sample
	for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	# Place text tokens first
	concatenated[i, :encoder_seq_len] = txt_tensor[i, :encoder_seq_len]
	# Place image tokens after text
	concatenated[i, encoder_seq_len:seq_len] = img_tensor[i, :seq_len - encoder_seq_len]

	concatenated_list.append(concatenated)

	return concatenated_list

	def _split_text_image_features(
	self,
	hidden_states_list: List[torch.Tensor],
	encoder_seq_lengths: List[int],
	seq_lengths: List[int],
	) -> List[Tuple[torch.Tensor, torch.Tensor]]:
	"""
	Split concatenated features back to text and image features.
	Inverse operation of _concat_text_image_features.

	Args:
	hidden_states_list: List of concatenated tensors (usually just one element)
	encoder_seq_lengths: Text sequence lengths for each sample [B]
	seq_lengths: Total sequence lengths for each sample [B]

	Returns:
	List of tuples, each containing (txt_hidden_states, img_hidden_states)
	"""
	result_list = []

	for hidden_states in hidden_states_list:
	batch_size = hidden_states.shape[0]
	feature_dim = hidden_states.shape[-1]

	# Get maximum lengths for text and image
	max_txt_len = max(encoder_seq_lengths)
	max_img_len = max(seq_len - encoder_seq_len for seq_len, encoder_seq_len in zip(seq_lengths, encoder_seq_lengths))

	# Create output tensors [B, max_len, feature_dim]
	txt_hidden_states = hidden_states.new_zeros(batch_size, max_txt_len, feature_dim)
	img_hidden_states = hidden_states.new_zeros(batch_size, max_img_len, feature_dim)

	# Split each sample back to text and image
	for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	img_len = seq_len - encoder_seq_len

	# Extract text portion
	txt_hidden_states[i, :encoder_seq_len] = hidden_states[i, :encoder_seq_len]
	# Extract image portion
	img_hidden_states[i, :img_len] = hidden_states[i, encoder_seq_len:seq_len]

	result_list.append((txt_hidden_states, img_hidden_states))

	return result_list

	def __call__(
	self,
	attn: Attention,
	img_hidden_states: torch.Tensor,
	txt_hidden_states: torch.Tensor,
	joint_attention_mask: Optional[torch.Tensor] = None,
	rotary_emb: Optional[torch.Tensor] = None,
	encoder_seq_lengths: List[int] = None, # [B] - Text sequence lengths for each sample
	seq_lengths: List[int] = None, # [B] - Total sequence lengths for each sample
	base_sequence_length: Optional[int] = None,
	) -> torch.Tensor:
	"""
	Process double-stream self-attention computation with PyTorch's scaled_dot_product_attention.

	Args:
	attn: Attention module
	img_hidden_states: Image hidden states tensor [B, L_img, D]
	txt_hidden_states: Text hidden states tensor [B, L_txt, D]
	joint_attention_mask: Combined attention mask [B, L_total]
	rotary_emb: Rotary embeddings for the joint sequence
	encoder_seq_lengths: Text sequence lengths for each sample [B]
	seq_lengths: Total sequence lengths for each sample [B]
	base_sequence_length: Optional base sequence length for proportional attention

	Returns:
	torch.Tensor: Processed hidden states after attention computation
	"""
	batch_size = img_hidden_states.shape[0]
	L_txt = txt_hidden_states.shape[1]
	L_img = img_hidden_states.shape[1]

	# Ensure Q, K, V linear layers are on the same device as input tensors
	device = img_hidden_states.device
	for layer in [self.img_to_q, self.img_to_k, self.img_to_v, self.txt_to_q, self.txt_to_k, self.txt_to_v,
	self.txt_out, self.img_out]:
	if layer.weight.device != device:
	layer = layer.to(device)

	# Generate Q, K, V for image and text streams (NO head reshaping yet)
	img_query = self.img_to_q(img_hidden_states) # [B, L_img, query_dim]
	img_key = self.img_to_k(img_hidden_states) # [B, L_img, kv_dim]
	img_value = self.img_to_v(img_hidden_states) # [B, L_img, kv_dim]

	txt_query = self.txt_to_q(txt_hidden_states) # [B, L_txt, query_dim]
	txt_key = self.txt_to_k(txt_hidden_states) # [B, L_txt, kv_dim]
	txt_value = self.txt_to_v(txt_hidden_states) # [B, L_txt, kv_dim]

	# Use helper function to concatenate QKV following YAK's logic (text first, then image)
	img_list = [img_query, img_key, img_value] # [B, L_img, feature_dim] each
	txt_list = [txt_query, txt_key, txt_value] # [B, L_txt, feature_dim] each
	concatenated_list = self._concat_text_image_features(img_list, txt_list, encoder_seq_lengths, seq_lengths)
	query, key, value = concatenated_list # [B, max_seq_len, feature_dim] each

	# From here, follow exactly the same logic as BOOGUAttnProcessor
	sequence_length = max(seq_lengths)

	query_dim = query.shape[-1]
	inner_dim = key.shape[-1]
	head_dim = query_dim // attn.heads
	dtype = query.dtype

	# Get key-value heads
	kv_heads = inner_dim // head_dim

	# Reshape tensors for attention computation
	query = query.view(batch_size, -1, attn.heads, head_dim)
	key = key.view(batch_size, -1, kv_heads, head_dim)
	value = value.view(batch_size, -1, kv_heads, head_dim)

	# Apply Query-Key normalization
	if attn.norm_q is not None:
	query = attn.norm_q(query)
	if attn.norm_k is not None:
	key = attn.norm_k(key)

	# Apply Rotary Position Embeddings
	if rotary_emb is not None:
	query = apply_rotary_emb(query, rotary_emb, use_real=False)
	key = apply_rotary_emb(key, rotary_emb, use_real=False)

	query, key = query.to(dtype), key.to(dtype)

	# Calculate attention scale
	if base_sequence_length is not None:
	softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
	else:
	softmax_scale = attn.scale

	# scaled_dot_product_attention expects attention_mask shape to be
	# (batch, heads, source_length, target_length)
	if joint_attention_mask is not None:
	joint_attention_mask = joint_attention_mask.bool()
	if joint_attention_mask.dim() == 2:
	# Standard mask [B, seq_len] -> [B, 1, 1, seq_len]
	joint_attention_mask = joint_attention_mask.view(batch_size, 1, 1, -1)
	elif joint_attention_mask.dim() == 3:
	# Causal mask [B, seq_len, seq_len] -> [B, 1, seq_len, seq_len]
	joint_attention_mask = joint_attention_mask.unsqueeze(1)
	else:
	raise ValueError(f"Unsupported joint_attention_mask shape: {joint_attention_mask.shape}")

	query = query.transpose(1, 2)
	key = key.transpose(1, 2)
	value = value.transpose(1, 2)

	# explicitly repeat key and value to match query length, otherwise using enable_gqa=True results in MATH backend of sdpa in our test of pytorch2.6
	key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
	value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)

	hidden_states = F.scaled_dot_product_attention(
	query, key, value, attn_mask=joint_attention_mask, scale=softmax_scale
	)
	hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
	hidden_states = hidden_states.type_as(query)

	# Split hidden_states back to text and image, apply separate output projections, then merge
	split_results = self._split_text_image_features([hidden_states], encoder_seq_lengths, seq_lengths)
	txt_hidden_states, img_hidden_states = split_results[0] # [B, max_txt_len, feature_dim], [B, max_img_len, feature_dim]

	# Apply separate output projections for text and image
	txt_projected = self.txt_out(txt_hidden_states) # [B, max_txt_len, feature_dim]
	img_projected = self.img_out(img_hidden_states) # [B, max_img_len, feature_dim]

	# Merge back to joint representation
	merged_list = self._concat_text_image_features([img_projected], [txt_projected], encoder_seq_lengths, seq_lengths)
	hidden_states = merged_list[0] # [B, max_seq_len, feature_dim]

	# Apply final output projection
	hidden_states = attn.to_out[0](hidden_states)
	hidden_states = attn.to_out[1](hidden_states)

	return hidden_states


	class BOOGUAttnProcessorFlash2Varlen:
	    """
	    Processor for implementing scaled dot-product attention with flash attention and variable length sequences.

	    This processor implements:
	    - Flash attention with variable length sequences
	    - Rotary position embeddings (RoPE)
	    - Query-Key normalization
	    - Proportional attention scaling

	    Note:
	        A second class with this same name is defined further down in this file; that later
	        definition replaces this binding at import time.

	    Args:
	        None
	    """

	    def __init__(self) -> None:
	        """Initialize the attention processor; fails if flash_attn is not installed."""
	        if not is_flash_attn_available():
	            raise ImportError(
	                "BOOGUAttnProcessorFlash2Varlen requires flash_attn. "
	                "Please install flash_attn."
	            )

	    def _upad_input(
	        self,
	        query_layer: torch.Tensor,
	        key_layer: torch.Tensor,
	        value_layer: torch.Tensor,
	        attention_mask: Optional[torch.Tensor],
	        query_length: int,
	        num_heads: int,
	    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
	        """
	        Unpad the input tensors for flash attention.

	        Args:
	            query_layer: Query tensor of shape (batch_size, seq_len, num_heads, head_dim)
	            key_layer: Key tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
	            value_layer: Value tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
	            attention_mask: Attention mask tensor of shape (batch_size, seq_len) or
	                (batch_size, seq_len, seq_len) for causal; ``None`` means every position is valid
	            query_length: Length of the query sequence
	            num_heads: Number of attention heads

	        Returns:
	            Tuple containing:
	                - Unpadded query tensor
	                - Unpadded key tensor
	                - Unpadded value tensor
	                - Query indices
	                - Tuple of cumulative sequence lengths for query and key
	                - Tuple of maximum sequence lengths for query and key
	        """
	        def _get_unpad_data(mask_2d: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
	            """Helper function to get unpadding data from a 2D attention mask [B, L]."""
	            seqlens_in_batch = mask_2d.sum(dim=-1, dtype=torch.int32)
	            indices = torch.nonzero(mask_2d.flatten(), as_tuple=False).flatten()
	            max_seqlen_in_batch = seqlens_in_batch.max().item()
	            # Prepend a 0 so cu_seqlens[i]:cu_seqlens[i + 1] delimits sample i
	            cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
	            return indices, cu_seqlens, max_seqlen_in_batch

	        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

	        # Normalize the mask to a 2D validity mask [B, L].
	        if attention_mask is None:
	            # No mask given: treat every key position as valid instead of failing on None.
	            mask_2d = torch.ones(batch_size, kv_seq_len, dtype=torch.bool, device=key_layer.device)
	        elif attention_mask.dim() == 3:
	            # 3D mask [B, L, L]: infer per-sample effective lengths from the diagonal and mark
	            # the first `length` positions of each sample as valid.
	            seq_len = attention_mask.shape[1]
	            diag_valid = torch.diagonal(attention_mask, dim1=-2, dim2=-1)
	            lengths = diag_valid.sum(dim=-1, dtype=torch.int32)  # [B]
	            positions = torch.arange(seq_len, device=attention_mask.device)
	            # Single broadcasted comparison; avoids a Python loop with per-sample .item() calls
	            mask_2d = positions.unsqueeze(0) < lengths.unsqueeze(1)
	        else:
	            mask_2d = attention_mask  # already [B, L]

	        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(mask_2d)

	        # Unpad key and value layers (shared path for both standard and causal cases)
	        key_layer = index_first_axis(
	            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
	            indices_k,
	        )
	        value_layer = index_first_axis(
	            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
	            indices_k,
	        )

	        # Handle different query length cases
	        if query_length == kv_seq_len:
	            # Queries cover the same positions as the keys: reuse the key indices
	            query_layer = index_first_axis(
	                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
	                indices_k,
	            )
	            cu_seqlens_q = cu_seqlens_k
	            max_seqlen_in_batch_q = max_seqlen_in_batch_k
	            indices_q = indices_k
	        elif query_length == 1:
	            # One query token per sample
	            max_seqlen_in_batch_q = 1
	            cu_seqlens_q = torch.arange(
	                batch_size + 1, dtype=torch.int32, device=query_layer.device
	            )
	            indices_q = cu_seqlens_q[:-1]
	            query_layer = query_layer.squeeze(1)
	        else:
	            # Use the last query_length positions of the 2D mask
	            q_mask = mask_2d[:, -query_length:]
	            # `*_` tolerates flash_attn releases whose unpad_input returns extra trailing values
	            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, q_mask)

	        return (
	            query_layer,
	            key_layer,
	            value_layer,
	            indices_q,
	            (cu_seqlens_q, cu_seqlens_k),
	            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
	        )

	    def __call__(
	        self,
	        attn: Attention,
	        hidden_states: torch.Tensor,
	        encoder_hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        image_rotary_emb: Optional[torch.Tensor] = None,
	        base_sequence_length: Optional[int] = None,
	    ) -> torch.Tensor:
	        """
	        Process attention computation with flash attention.

	        Args:
	            attn: Attention module
	            hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
	            encoder_hidden_states: Encoder hidden states tensor (source of keys and values)
	            attention_mask: Optional attention mask tensor. A 3-D mask whose first sample equals a
	                lower-triangular matrix of ones enables causal flash attention
	            image_rotary_emb: Optional rotary embeddings for image tokens
	            base_sequence_length: Optional base sequence length for proportional attention

	        Returns:
	            torch.Tensor: Processed hidden states after attention computation
	        """
	        batch_size, sequence_length, _ = hidden_states.shape

	        # Get Query-Key-Value Pair
	        query = attn.to_q(hidden_states)
	        key = attn.to_k(encoder_hidden_states)
	        value = attn.to_v(encoder_hidden_states)

	        query_dim = query.shape[-1]
	        inner_dim = key.shape[-1]
	        head_dim = query_dim // attn.heads
	        dtype = query.dtype

	        # Key/value may use fewer heads than the query (grouped-query attention)
	        kv_heads = inner_dim // head_dim

	        # Reshape tensors for attention computation
	        query = query.view(batch_size, -1, attn.heads, head_dim)
	        key = key.view(batch_size, -1, kv_heads, head_dim)
	        value = value.view(batch_size, -1, kv_heads, head_dim)

	        # Apply Query-Key normalization
	        if attn.norm_q is not None:
	            query = attn.norm_q(query)
	        if attn.norm_k is not None:
	            key = attn.norm_k(key)

	        # Apply Rotary Position Embeddings
	        if image_rotary_emb is not None:
	            query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
	            key = apply_rotary_emb(key, image_rotary_emb, use_real=False)

	        # Normalization / RoPE may have changed the dtype; restore the projection dtype
	        query, key = query.to(dtype), key.to(dtype)

	        # Proportional attention: scale by sqrt(log_base(sequence_length)) when a base length is given
	        if base_sequence_length is not None:
	            softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
	        else:
	            softmax_scale = attn.scale

	        # Detect if we have a causal mask
	        is_causal = False
	        if attention_mask is not None and attention_mask.dim() == 3:
	            # Check if it's a lower triangular causal mask
	            # For efficiency, we only check the first sample
	            mask_sample = attention_mask[0]  # [seq_len, seq_len]
	            is_causal = torch.allclose(mask_sample, torch.tril(torch.ones_like(mask_sample)))

	        # Unpad input for flash attention
	        (
	            query_states,
	            key_states,
	            value_states,
	            indices_q,
	            cu_seq_lens,
	            max_seq_lens,
	        ) = self._upad_input(query, key, value, attention_mask, sequence_length, attn.heads)

	        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
	        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

	        # Expand key/value heads so they match the number of query heads
	        if kv_heads < attn.heads:
	            key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
	            value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)

	        # Apply flash attention with causal parameter
	        attn_output_unpad = flash_attn_varlen_func(
	            query_states,
	            key_states,
	            value_states,
	            cu_seqlens_q=cu_seqlens_q,
	            cu_seqlens_k=cu_seqlens_k,
	            max_seqlen_q=max_seqlen_in_batch_q,
	            max_seqlen_k=max_seqlen_in_batch_k,
	            dropout_p=0.0,
	            causal=is_causal,  # Use detected causal setting
	            softmax_scale=softmax_scale,
	        )

	        # Pad output back to [B, sequence_length, heads, head_dim] and merge the head dimensions
	        hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
	        hidden_states = hidden_states.flatten(-2)
	        hidden_states = hidden_states.type_as(query)

	        # Apply output projection
	        hidden_states = attn.to_out[0](hidden_states)
	        hidden_states = attn.to_out[1](hidden_states)

	        return hidden_states




	class BOOGUAttnProcessorFlash2Varlen:
	    """
	    Processor for implementing scaled dot-product attention with flash attention and variable length sequences.

	    This processor implements:
	    - Flash attention with variable length sequences
	    - Rotary position embeddings (RoPE)
	    - Query-Key normalization
	    - Proportional attention scaling

	    Note:
	        An earlier class with this same name appears above in this file; this later definition
	        is the one that stays bound to the name after import.

	    Args:
	        None
	    """

	    def __init__(self) -> None:
	        """Initialize the attention processor; fails if flash_attn is not installed."""
	        if not is_flash_attn_available():
	            raise ImportError(
	                "BOOGUAttnProcessorFlash2Varlen requires flash_attn. "
	                "Please install flash_attn."
	            )

	    def _upad_input(
	        self,
	        query_layer: torch.Tensor,
	        key_layer: torch.Tensor,
	        value_layer: torch.Tensor,
	        attention_mask: Optional[torch.Tensor],
	        query_length: int,
	        num_heads: int,
	    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
	        """
	        Unpad the input tensors for flash attention.

	        Args:
	            query_layer: Query tensor of shape (batch_size, seq_len, num_heads, head_dim)
	            key_layer: Key tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
	            value_layer: Value tensor of shape (batch_size, seq_len, num_kv_heads, head_dim)
	            attention_mask: Attention mask tensor of shape (batch_size, seq_len) or
	                (batch_size, seq_len, seq_len) for causal; ``None`` means every position is valid
	            query_length: Length of the query sequence
	            num_heads: Number of attention heads

	        Returns:
	            Tuple containing:
	                - Unpadded query tensor
	                - Unpadded key tensor
	                - Unpadded value tensor
	                - Query indices
	                - Tuple of cumulative sequence lengths for query and key
	                - Tuple of maximum sequence lengths for query and key
	        """
	        def _get_unpad_data(mask_2d: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
	            """Helper function to get unpadding data from a 2D attention mask [B, L]."""
	            seqlens_in_batch = mask_2d.sum(dim=-1, dtype=torch.int32)
	            indices = torch.nonzero(mask_2d.flatten(), as_tuple=False).flatten()
	            max_seqlen_in_batch = seqlens_in_batch.max().item()
	            # Prepend a 0 so cu_seqlens[i]:cu_seqlens[i + 1] delimits sample i
	            cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
	            return indices, cu_seqlens, max_seqlen_in_batch

	        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

	        # Normalize the mask to a 2D validity mask [B, L].
	        if attention_mask is None:
	            # No mask given: treat every key position as valid instead of failing on None.
	            mask_2d = torch.ones(batch_size, kv_seq_len, dtype=torch.bool, device=key_layer.device)
	        elif attention_mask.dim() == 3:
	            # 3D mask [B, L, L]: infer per-sample effective lengths from the diagonal and mark
	            # the first `length` positions of each sample as valid.
	            seq_len = attention_mask.shape[1]
	            diag_valid = torch.diagonal(attention_mask, dim1=-2, dim2=-1)
	            lengths = diag_valid.sum(dim=-1, dtype=torch.int32)  # [B]
	            positions = torch.arange(seq_len, device=attention_mask.device)
	            # Single broadcasted comparison; avoids a Python loop with per-sample .item() calls
	            mask_2d = positions.unsqueeze(0) < lengths.unsqueeze(1)
	        else:
	            mask_2d = attention_mask  # already [B, L]

	        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(mask_2d)

	        # Unpad key and value layers (shared path for both standard and causal cases)
	        key_layer = index_first_axis(
	            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
	            indices_k,
	        )
	        value_layer = index_first_axis(
	            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
	            indices_k,
	        )

	        # Handle different query length cases
	        if query_length == kv_seq_len:
	            # Queries cover the same positions as the keys: reuse the key indices
	            query_layer = index_first_axis(
	                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
	                indices_k,
	            )
	            cu_seqlens_q = cu_seqlens_k
	            max_seqlen_in_batch_q = max_seqlen_in_batch_k
	            indices_q = indices_k
	        elif query_length == 1:
	            # One query token per sample
	            max_seqlen_in_batch_q = 1
	            cu_seqlens_q = torch.arange(
	                batch_size + 1, dtype=torch.int32, device=query_layer.device
	            )
	            indices_q = cu_seqlens_q[:-1]
	            query_layer = query_layer.squeeze(1)
	        else:
	            # Use the last query_length positions of the 2D mask
	            q_mask = mask_2d[:, -query_length:]
	            # `*_` tolerates flash_attn releases whose unpad_input returns extra trailing values
	            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, q_mask)

	        return (
	            query_layer,
	            key_layer,
	            value_layer,
	            indices_q,
	            (cu_seqlens_q, cu_seqlens_k),
	            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
	        )

	    def __call__(
	        self,
	        attn: Attention,
	        hidden_states: torch.Tensor,
	        encoder_hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        image_rotary_emb: Optional[torch.Tensor] = None,
	        base_sequence_length: Optional[int] = None,
	    ) -> torch.Tensor:
	        """
	        Process attention computation with flash attention.

	        Args:
	            attn: Attention module
	            hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
	            encoder_hidden_states: Encoder hidden states tensor (source of keys and values)
	            attention_mask: Optional attention mask tensor. A 3-D mask whose first sample equals a
	                lower-triangular matrix of ones enables causal flash attention
	            image_rotary_emb: Optional rotary embeddings for image tokens
	            base_sequence_length: Optional base sequence length for proportional attention

	        Returns:
	            torch.Tensor: Processed hidden states after attention computation
	        """
	        batch_size, sequence_length, _ = hidden_states.shape

	        # Get Query-Key-Value Pair
	        query = attn.to_q(hidden_states)
	        key = attn.to_k(encoder_hidden_states)
	        value = attn.to_v(encoder_hidden_states)

	        query_dim = query.shape[-1]
	        inner_dim = key.shape[-1]
	        head_dim = query_dim // attn.heads
	        dtype = query.dtype

	        # Key/value may use fewer heads than the query (grouped-query attention)
	        kv_heads = inner_dim // head_dim

	        # Reshape tensors for attention computation
	        query = query.view(batch_size, -1, attn.heads, head_dim)
	        key = key.view(batch_size, -1, kv_heads, head_dim)
	        value = value.view(batch_size, -1, kv_heads, head_dim)

	        # Apply Query-Key normalization
	        if attn.norm_q is not None:
	            query = attn.norm_q(query)
	        if attn.norm_k is not None:
	            key = attn.norm_k(key)

	        # Apply Rotary Position Embeddings
	        if image_rotary_emb is not None:
	            query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
	            key = apply_rotary_emb(key, image_rotary_emb, use_real=False)

	        # Normalization / RoPE may have changed the dtype; restore the projection dtype
	        query, key = query.to(dtype), key.to(dtype)

	        # Proportional attention: scale by sqrt(log_base(sequence_length)) when a base length is given
	        if base_sequence_length is not None:
	            softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
	        else:
	            softmax_scale = attn.scale

	        # Detect if we have a causal mask
	        is_causal = False
	        if attention_mask is not None and attention_mask.dim() == 3:
	            # Check if it's a lower triangular causal mask
	            # For efficiency, we only check the first sample
	            mask_sample = attention_mask[0]  # [seq_len, seq_len]
	            is_causal = torch.allclose(mask_sample, torch.tril(torch.ones_like(mask_sample)))

	        # Unpad input for flash attention
	        (
	            query_states,
	            key_states,
	            value_states,
	            indices_q,
	            cu_seq_lens,
	            max_seq_lens,
	        ) = self._upad_input(query, key, value, attention_mask, sequence_length, attn.heads)

	        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
	        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

	        # Expand key/value heads so they match the number of query heads
	        if kv_heads < attn.heads:
	            key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
	            value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)

	        # Apply flash attention with causal parameter
	        attn_output_unpad = flash_attn_varlen_func(
	            query_states,
	            key_states,
	            value_states,
	            cu_seqlens_q=cu_seqlens_q,
	            cu_seqlens_k=cu_seqlens_k,
	            max_seqlen_q=max_seqlen_in_batch_q,
	            max_seqlen_k=max_seqlen_in_batch_k,
	            dropout_p=0.0,
	            causal=is_causal,  # Use detected causal setting
	            softmax_scale=softmax_scale,
	        )

	        # Pad output back to [B, sequence_length, heads, head_dim] and merge the head dimensions
	        hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
	        hidden_states = hidden_states.flatten(-2)
	        hidden_states = hidden_states.type_as(query)

	        # Apply output projection
	        hidden_states = attn.to_out[0](hidden_states)
	        hidden_states = attn.to_out[1](hidden_states)

	        return hidden_states



	class BOOGUAttnProcessor:
	"""
	Processor for implementing scaled dot-product attention with flash attention and variable length sequences.

	This processor is optimized for PyTorch 2.0 and implements:
	- Flash attention with variable length sequences
	- Rotary position embeddings (RoPE)
	- Query-Key normalization
	- Proportional attention scaling

	Args:
	None

	Raises:
	ImportError: If PyTorch version is less than 2.0
	"""

	def __init__(self) -> None:
	"""Initialize the attention processor."""
	if not hasattr(F, "scaled_dot_product_attention"):
	raise ImportError(
	"BOOGUAttnProcessorFlash2Varlen requires PyTorch 2.0. "
	"Please upgrade PyTorch to version 2.0 or later."
	)

	def __call__(
	self,
	attn: Attention,
	hidden_states: torch.Tensor,
	encoder_hidden_states: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	image_rotary_emb: Optional[torch.Tensor] = None,
	base_sequence_length: Optional[int] = None,
	) -> torch.Tensor:
	"""
	Process attention computation with flash attention.

	Args:
	attn: Attention module
	hidden_states: Hidden states tensor of shape (batch_size, seq_len, hidden_dim)
	encoder_hidden_states: Encoder hidden states tensor
	attention_mask: Optional attention mask tensor
	image_rotary_emb: Optional rotary embeddings for image tokens
	base_sequence_length: Optional base sequence length for proportional attention

	Returns:
	torch.Tensor: Processed hidden states after attention computation
	"""
	batch_size, sequence_length, _ = hidden_states.shape

	# Get Query-Key-Value Pair
	query = attn.to_q(hidden_states)
	key = attn.to_k(encoder_hidden_states)
	value = attn.to_v(encoder_hidden_states)

	query_dim = query.shape[-1]
	inner_dim = key.shape[-1]
	head_dim = query_dim // attn.heads
	dtype = query.dtype

	# Get key-value heads
	kv_heads = inner_dim // head_dim

	# Reshape tensors for attention computation
	query = query.view(batch_size, -1, attn.heads, head_dim)
	key = key.view(batch_size, -1, kv_heads, head_dim)
	value = value.view(batch_size, -1, kv_heads, head_dim)

	# Apply Query-Key normalization
	if attn.norm_q is not None:
	query = attn.norm_q(query)
	if attn.norm_k is not None:
	key = attn.norm_k(key)

	# Apply Rotary Position Embeddings
	if image_rotary_emb is not None:
	query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
	key = apply_rotary_emb(key, image_rotary_emb, use_real=False)

	query, key = query.to(dtype), key.to(dtype)

	# Calculate attention scale
	if base_sequence_length is not None:
	softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
	else:
	softmax_scale = attn.scale

	# sdpa expects attn_mask with shape (B, H, Q, K) as boolean (True keeps, False masks)
	if attention_mask is not None:
	attention_mask = attention_mask.bool()
	if attention_mask.dim() == 2:
	# Standard padding mask [B, L] -> [B, 1, 1, L]
	attention_mask = attention_mask.view(batch_size, 1, 1, -1)
	elif attention_mask.dim() == 3:
	# Robust causal + padding mask construction
	# Infer valid lengths from diagonal, then build lower-triangular mask within valid lengths
	B, L, _ = attention_mask.shape
	diag_valid = torch.diagonal(attention_mask, dim1=-2, dim2=-1)
	lengths = diag_valid.sum(dim=-1) # [B]
	arange_L = torch.arange(L, device=attention_mask.device)
	# Padding masks for queries and keys: shape [B, L]
	q_valid = arange_L.unsqueeze(0) < lengths.unsqueeze(1)
	k_valid = q_valid # same lengths assumed
	# Lower-triangular causal mask [L, L]
	causal = torch.tril(torch.ones(L, L, dtype=torch.bool, device=attention_mask.device))
	# Combine: [B, L, L]
	combined = causal & q_valid.unsqueeze(-1) & k_valid.unsqueeze(-2)
	attention_mask = combined.unsqueeze(1) # [B, 1, L, L]
	else:
	raise ValueError(f"Unsupported attention_mask shape: {attention_mask.shape}")

	query = query.transpose(1, 2)
	key = key.transpose(1, 2)
	value = value.transpose(1, 2)

	# print(f"######################attention_mask: {attention_mask}, shape: {attention_mask.shape}############################")

	# explicitly repeat key and value to match query length, otherwise using enable_gqa=True results in MATH backend of sdpa in our test of pytorch2.6
	key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
	value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)

	hidden_states = F.scaled_dot_product_attention(
	query, key, value, attn_mask=attention_mask, scale=softmax_scale
	)
	hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
	hidden_states = hidden_states.type_as(query)

	# Apply output projection
	hidden_states = attn.to_out[0](hidden_states)
	hidden_states = attn.to_out[1](hidden_states)

	return hidden_states
	###########################################################################################################################################################################







	### Reference only: the default OmniGen2 transformer blocks, kept commented out below
	# class OmniGen2TransformerBlock(nn.Module):
	# """
	# Transformer block for OmniGen2 model.

	# This block implements a transformer layer with:
	# - Multi-head attention with flash attention
	# - Feed-forward network with SwiGLU activation
	# - RMS normalization
	# - Optional modulation for conditional generation

	# Args:
	# dim: Dimension of the input and output tensors
	# num_attention_heads: Number of attention heads
	# num_kv_heads: Number of key-value heads
	# multiple_of: Multiple of which the hidden dimension should be
	# ffn_dim_multiplier: Multiplier for the feed-forward network dimension
	# norm_eps: Epsilon value for normalization layers
	# modulation: Whether to use modulation for conditional generation
	# use_fused_rms_norm: Whether to use fused RMS normalization
	# use_fused_swiglu: Whether to use fused SwiGLU activation
	# """

	# def __init__(
	# self,
	# dim: int,
	# num_attention_heads: int,
	# num_kv_heads: int,
	# multiple_of: int,
	# ffn_dim_multiplier: float,
	# norm_eps: float,
	# modulation: bool = True,
	# ) -> None:
	# """Initialize the transformer block."""
	# super().__init__()
	# self.head_dim = dim // num_attention_heads
	# self.modulation = modulation

	# # Initialize attention layer
	# self.attn = Attention(
	# query_dim=dim,
	# cross_attention_dim=None,
	# dim_head=dim // num_attention_heads,
	# qk_norm="rms_norm",
	# heads=num_attention_heads,
	# kv_heads=num_kv_heads,
	# eps=1e-5,
	# bias=False,
	# out_bias=False,
	# processor=OmniGen2AttnProcessor(),
	# )

	# # Initialize feed-forward network
	# self.feed_forward = LuminaFeedForward(
	# dim=dim,
	# inner_dim=4 * dim,
	# multiple_of=multiple_of,
	# ffn_dim_multiplier=ffn_dim_multiplier,
	# )

	# # Initialize normalization layers
	# if modulation:
	# self.norm1 = LuminaRMSNormZero(
	# embedding_dim=dim,
	# norm_eps=norm_eps,
	# norm_elementwise_affine=True,
	# )
	# else:
	# self.norm1 = RMSNorm(dim, eps=norm_eps)

	# self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
	# self.norm2 = RMSNorm(dim, eps=norm_eps)
	# self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)

	# self.initialize_weights()

	# def initialize_weights(self) -> None:
	# """
	# Initialize the weights of the transformer block.

	# Uses Xavier uniform initialization for linear layers and zero initialization for biases.
	# """
	# nn.init.xavier_uniform_(self.attn.to_q.weight)
	# nn.init.xavier_uniform_(self.attn.to_k.weight)
	# nn.init.xavier_uniform_(self.attn.to_v.weight)
	# nn.init.xavier_uniform_(self.attn.to_out[0].weight)

	# nn.init.xavier_uniform_(self.feed_forward.linear_1.weight)
	# nn.init.xavier_uniform_(self.feed_forward.linear_2.weight)
	# nn.init.xavier_uniform_(self.feed_forward.linear_3.weight)

	# if self.modulation:
	# nn.init.zeros_(self.norm1.linear.weight)
	# nn.init.zeros_(self.norm1.linear.bias)

	# def forward(
	# self,
	# hidden_states: torch.Tensor,
	# attention_mask: torch.Tensor,
	# image_rotary_emb: torch.Tensor,
	# temb: Optional[torch.Tensor] = None,
	# ) -> torch.Tensor:
	# """
	# Forward pass of the transformer block.

	# Args:
	# hidden_states: Input hidden states tensor
	# attention_mask: Attention mask tensor
	# image_rotary_emb: Rotary embeddings for image tokens
	# temb: Optional timestep embedding tensor

	# Returns:
	# torch.Tensor: Output hidden states after transformer block processing
	# """
	# if self.modulation:
	# if temb is None:
	# raise ValueError("temb must be provided when modulation is enabled")

	# norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
	# attn_output = self.attn(
	# hidden_states=norm_hidden_states,
	# encoder_hidden_states=norm_hidden_states,
	# attention_mask=attention_mask,
	# image_rotary_emb=image_rotary_emb,
	# )
	# hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
	# mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
	# hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
	# else:
	# norm_hidden_states = self.norm1(hidden_states)
	# attn_output = self.attn(
	# hidden_states=norm_hidden_states,
	# encoder_hidden_states=norm_hidden_states,
	# attention_mask=attention_mask,
	# image_rotary_emb=image_rotary_emb,
	# )
	# hidden_states = hidden_states + self.norm2(attn_output)
	# mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
	# hidden_states = hidden_states + self.ffn_norm2(mlp_output)

	# return hidden_states


	# class OmniGen2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
	# """
	# OmniGen2 Transformer 2D Model.

	# A transformer-based diffusion model for image generation with:
	# - Patch-based image processing
	# - Rotary position embeddings
	# - Multi-head attention
	# - Conditional generation support

	# Args:
	# patch_size: Size of image patches
	# in_channels: Number of input channels
	# out_channels: Number of output channels (defaults to in_channels)
	# hidden_size: Size of hidden layers
	# num_layers: Number of transformer layers
	# num_refiner_layers: Number of refiner layers
	# num_attention_heads: Number of attention heads
	# num_kv_heads: Number of key-value heads
	# multiple_of: Multiple of which the hidden dimension should be
	# ffn_dim_multiplier: Multiplier for feed-forward network dimension
	# norm_eps: Epsilon value for normalization layers
	# axes_dim_rope: Dimensions for rotary position embeddings
	# axes_lens: Lengths for rotary position embeddings
	# instruction_feat_dim: Dimension of text features
	# timestep_scale: Scale factor for timestep embeddings
	# use_fused_rms_norm: Whether to use fused RMS normalization
	# use_fused_swiglu: Whether to use fused SwiGLU activation
	# """

	# _supports_gradient_checkpointing = True
	# _no_split_modules = ["Omnigen2TransformerBlock"]
	# _skip_layerwise_casting_patterns = ["x_embedder", "norm"]

	# @register_to_config
	# def __init__(
	# self,
	# patch_size: int = 2,
	# in_channels: int = 16,
	# out_channels: Optional[int] = None,
	# hidden_size: int = 2304,
	# num_layers: int = 26,
	# num_refiner_layers: int = 2,
	# num_attention_heads: int = 24,
	# num_kv_heads: int = 8,
	# multiple_of: int = 256,
	# ffn_dim_multiplier: Optional[float] = None,
	# norm_eps: float = 1e-5,
	# axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
	# axes_lens: Tuple[int, int, int] = (300, 512, 512),
	# instruction_feat_dim: int = 1024,
	# timestep_scale: float = 1.0,
	# ) -> None:
	# """Initialize the OmniGen2 transformer model."""
	# super().__init__()

	# # Validate configuration
	# if (hidden_size // num_attention_heads) != sum(axes_dim_rope):
	# raise ValueError(
	# f"hidden_size // num_attention_heads ({hidden_size // num_attention_heads}) "
	# f"must equal sum(axes_dim_rope) ({sum(axes_dim_rope)})"
	# )

	# self.out_channels = out_channels or in_channels

	# # Initialize embeddings
	# self.rope_embedder = OmniGen2RotaryPosEmbed(
	# theta=10000,
	# axes_dim=axes_dim_rope,
	# axes_lens=axes_lens,
	# patch_size=patch_size,
	# )

	# self.x_embedder = nn.Linear(
	# in_features=patch_size * patch_size * in_channels,
	# out_features=hidden_size,
	# )

	# self.ref_image_patch_embedder = nn.Linear(
	# in_features=patch_size * patch_size * in_channels,
	# out_features=hidden_size,
	# )

	# self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
	# hidden_size=hidden_size,
	# instruction_feat_dim=instruction_feat_dim,
	# norm_eps=norm_eps,
	# timestep_scale=timestep_scale,
	# )

	# # Initialize transformer blocks
	# self.noise_refiner = nn.ModuleList([
	# OmniGen2TransformerBlock(
	# hidden_size,
	# num_attention_heads,
	# num_kv_heads,
	# multiple_of,
	# ffn_dim_multiplier,
	# norm_eps,
	# modulation=True,
	# )
	# for _ in range(num_refiner_layers)
	# ])

	# self.ref_image_refiner = nn.ModuleList([
	# OmniGen2TransformerBlock(
	# hidden_size,
	# num_attention_heads,
	# num_kv_heads,
	# multiple_of,
	# ffn_dim_multiplier,
	# norm_eps,
	# modulation=True,
	# )
	# for _ in range(num_refiner_layers)
	# ])

	# self.context_refiner = nn.ModuleList(
	# [
	# OmniGen2TransformerBlock(
	# hidden_size,
	# num_attention_heads,
	# num_kv_heads,
	# multiple_of,
	# ffn_dim_multiplier,
	# norm_eps,
	# modulation=False,
	# )
	# for _ in range(num_refiner_layers)
	# ]
	# )

	# # 3. Transformer blocks
	# self.layers = nn.ModuleList(
	# [
	# OmniGen2TransformerBlock(
	# hidden_size,
	# num_attention_heads,
	# num_kv_heads,
	# multiple_of,
	# ffn_dim_multiplier,
	# norm_eps,
	# modulation=True,
	# )
	# for _ in range(num_layers)
	# ]
	# )

	# # 4. Output norm & projection
	# self.norm_out = LuminaLayerNormContinuous(
	# embedding_dim=hidden_size,
	# conditioning_embedding_dim=min(hidden_size, 1024),
	# elementwise_affine=False,
	# eps=1e-6,
	# bias=True,
	# out_dim=patch_size * patch_size * self.out_channels,
	# )

	# # Add learnable embeddings to distinguish different images
	# self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size)) # support max 5 ref images

	# self.gradient_checkpointing = False

	# self.initialize_weights()

	# def initialize_weights(self) -> None:
	# """
	# Initialize the weights of the model.

	# Uses Xavier uniform initialization for linear layers.
	# """
	# nn.init.xavier_uniform_(self.x_embedder.weight)
	# nn.init.constant_(self.x_embedder.bias, 0.0)

	# nn.init.xavier_uniform_(self.ref_image_patch_embedder.weight)
	# nn.init.constant_(self.ref_image_patch_embedder.bias, 0.0)

	# nn.init.zeros_(self.norm_out.linear_1.weight)
	# nn.init.zeros_(self.norm_out.linear_1.bias)
	# nn.init.zeros_(self.norm_out.linear_2.weight)
	# nn.init.zeros_(self.norm_out.linear_2.bias)

	# nn.init.normal_(self.image_index_embedding, std=0.02)

	# def img_patch_embed_and_refine(
	# self,
	# hidden_states,
	# ref_image_hidden_states,
	# padded_img_mask,
	# padded_ref_img_mask,
	# noise_rotary_emb,
	# ref_img_rotary_emb,
	# l_effective_ref_img_len,
	# l_effective_img_len,
	# temb
	# ):
	# batch_size = len(hidden_states)
	# max_combined_img_len = max([img_len + sum(ref_img_len) for img_len, ref_img_len in zip(l_effective_img_len, l_effective_ref_img_len)])

	# hidden_states = self.x_embedder(hidden_states)
	# ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)

	# for i in range(batch_size):
	# shift = 0
	# for j, ref_img_len in enumerate(l_effective_ref_img_len[i]):
	# ref_image_hidden_states[i, shift:shift + ref_img_len, :] = ref_image_hidden_states[i, shift:shift + ref_img_len, :] + self.image_index_embedding[j]
	# shift += ref_img_len

	# for layer in self.noise_refiner:
	# hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)

	# flat_l_effective_ref_img_len = list(itertools.chain(*l_effective_ref_img_len))
	# num_ref_images = len(flat_l_effective_ref_img_len)
	# max_ref_img_len = max(flat_l_effective_ref_img_len)

	# batch_ref_img_mask = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, dtype=torch.bool)
	# batch_ref_image_hidden_states = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, self.config.hidden_size)
	# batch_ref_img_rotary_emb = hidden_states.new_zeros(num_ref_images, max_ref_img_len, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype)
	# batch_temb = temb.new_zeros(num_ref_images, *temb.shape[1:], dtype=temb.dtype)

	# # sequence of ref imgs to batch
	# idx = 0
	# for i in range(batch_size):
	# shift = 0
	# for ref_img_len in l_effective_ref_img_len[i]:
	# batch_ref_img_mask[idx, :ref_img_len] = True
	# batch_ref_image_hidden_states[idx, :ref_img_len] = ref_image_hidden_states[i, shift:shift + ref_img_len]
	# batch_ref_img_rotary_emb[idx, :ref_img_len] = ref_img_rotary_emb[i, shift:shift + ref_img_len]
	# batch_temb[idx] = temb[i]
	# shift += ref_img_len
	# idx += 1

	# # refine ref imgs separately
	# for layer in self.ref_image_refiner:
	# batch_ref_image_hidden_states = layer(batch_ref_image_hidden_states, batch_ref_img_mask, batch_ref_img_rotary_emb, batch_temb)

	# # batch of ref imgs to sequence
	# idx = 0
	# for i in range(batch_size):
	# shift = 0
	# for ref_img_len in l_effective_ref_img_len[i]:
	# ref_image_hidden_states[i, shift:shift + ref_img_len] = batch_ref_image_hidden_states[idx, :ref_img_len]
	# shift += ref_img_len
	# idx += 1

	# combined_img_hidden_states = hidden_states.new_zeros(batch_size, max_combined_img_len, self.config.hidden_size)
	# for i, (ref_img_len, img_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
	# combined_img_hidden_states[i, :sum(ref_img_len)] = ref_image_hidden_states[i, :sum(ref_img_len)]
	# combined_img_hidden_states[i, sum(ref_img_len):sum(ref_img_len) + img_len] = hidden_states[i, :img_len]

	# return combined_img_hidden_states

	# def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
	# batch_size = len(hidden_states)
	# p = self.config.patch_size
	# device = hidden_states[0].device

	# img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
	# l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]

	# if ref_image_hidden_states is not None:
	# ref_img_sizes = [[(img.size(1), img.size(2)) for img in imgs] if imgs is not None else None for imgs in ref_image_hidden_states]
	# l_effective_ref_img_len = [[(ref_img_size[0] // p) * (ref_img_size[1] // p) for ref_img_size in _ref_img_sizes] if _ref_img_sizes is not None else [0] for _ref_img_sizes in ref_img_sizes]
	# else:
	# ref_img_sizes = [None for _ in range(batch_size)]
	# l_effective_ref_img_len = [[0] for _ in range(batch_size)]

	# max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
	# max_img_len = max(l_effective_img_len)

	# # ref image patch embeddings
	# flat_ref_img_hidden_states = []
	# for i in range(batch_size):
	# if ref_img_sizes[i] is not None:
	# imgs = []
	# for ref_img in ref_image_hidden_states[i]:
	# C, H, W = ref_img.size()
	# ref_img = rearrange(ref_img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
	# imgs.append(ref_img)

	# img = torch.cat(imgs, dim=0)
	# flat_ref_img_hidden_states.append(img)
	# else:
	# flat_ref_img_hidden_states.append(None)

	# # image patch embeddings
	# flat_hidden_states = []
	# for i in range(batch_size):
	# img = hidden_states[i]
	# C, H, W = img.size()

	# img = rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
	# flat_hidden_states.append(img)

	# padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
	# padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
	# for i in range(batch_size):
	# if ref_img_sizes[i] is not None:
	# padded_ref_img_hidden_states[i, :sum(l_effective_ref_img_len[i])] = flat_ref_img_hidden_states[i]
	# padded_ref_img_mask[i, :sum(l_effective_ref_img_len[i])] = True

	# padded_hidden_states = torch.zeros(batch_size, max_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
	# padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
	# for i in range(batch_size):
	# padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
	# padded_img_mask[i, :l_effective_img_len[i]] = True

	# return (
	# padded_hidden_states,
	# padded_ref_img_hidden_states,
	# padded_img_mask,
	# padded_ref_img_mask,
	# l_effective_ref_img_len,
	# l_effective_img_len,
	# ref_img_sizes,
	# img_sizes,
	# )

	# def forward(
	# self,
	# hidden_states: Union[torch.Tensor, List[torch.Tensor]],
	# timestep: torch.Tensor,
	# text_hidden_states: torch.Tensor,
	# freqs_cis: torch.Tensor,
	# text_attention_mask: torch.Tensor,
	# ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
	# attention_kwargs: Optional[Dict[str, Any]] = None,
	# return_dict: bool = False,
	# ) -> Union[torch.Tensor, Transformer2DModelOutput]:
	# if attention_kwargs is not None:
	# attention_kwargs = attention_kwargs.copy()
	# lora_scale = attention_kwargs.pop("scale", 1.0)
	# else:
	# lora_scale = 1.0

	# if USE_PEFT_BACKEND:
	# # weight the lora layers by setting `lora_scale` for each PEFT layer
	# scale_lora_layers(self, lora_scale)
	# else:
	# if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
	# logger.warning(
	# "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
	# )

	# # 1. Condition, positional & patch embedding
	# batch_size = len(hidden_states)
	# is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)

	# if is_hidden_states_tensor:
	# assert hidden_states.ndim == 4
	# hidden_states = [_hidden_states for _hidden_states in hidden_states]

	# device = hidden_states[0].device

	# temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)

	# (
	# hidden_states,
	# ref_image_hidden_states,
	# img_mask,
	# ref_img_mask,
	# l_effective_ref_img_len,
	# l_effective_img_len,
	# ref_img_sizes,
	# img_sizes,
	# ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)

	# (
	# context_rotary_emb,
	# ref_img_rotary_emb,
	# noise_rotary_emb,
	# rotary_emb,
	# encoder_seq_lengths,
	# seq_lengths,
	# ) = self.rope_embedder(
	# freqs_cis,
	# text_attention_mask,
	# l_effective_ref_img_len,
	# l_effective_img_len,
	# ref_img_sizes,
	# img_sizes,
	# device,
	# )

	# # 2. Context refinement
	# for layer in self.context_refiner:
	# text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)

	# combined_img_hidden_states = self.img_patch_embed_and_refine(
	# hidden_states,
	# ref_image_hidden_states,
	# img_mask,
	# ref_img_mask,
	# noise_rotary_emb,
	# ref_img_rotary_emb,
	# l_effective_ref_img_len,
	# l_effective_img_len,
	# temb,
	# )

	# # 3. Joint Transformer blocks
	# max_seq_len = max(seq_lengths)

	# attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
	# joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
	# for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	# attention_mask[i, :seq_len] = True
	# joint_hidden_states[i, :encoder_seq_len] = text_hidden_states[i, :encoder_seq_len]
	# joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]

	# hidden_states = joint_hidden_states

	# for layer_idx, layer in enumerate(self.layers):
	# if torch.is_grad_enabled() and self.gradient_checkpointing:
	# hidden_states = self._gradient_checkpointing_func(
	# layer, hidden_states, attention_mask, rotary_emb, temb
	# )
	# else:
	# hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)

	# # 4. Output norm & projection
	# hidden_states = self.norm_out(hidden_states, temb)

	# p = self.config.patch_size
	# output = []
	# for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
	# height, width = img_size
	# output.append(rearrange(hidden_states[i][seq_len - img_len:seq_len], '(h w) (p1 p2 c) -> c (h p1) (w p2)', h=height // p, w=width // p, p1=p, p2=p))
	# if is_hidden_states_tensor:
	# output = torch.stack(output, dim=0)

	# if USE_PEFT_BACKEND:
	# # remove `lora_scale` from each PEFT layer
	# unscale_lora_layers(self, lora_scale)

	# if not return_dict:
	# return output
	# return Transformer2DModelOutput(sample=output)



	####################################################################my Transformer Blocks###########################################################################################

	class BOOGUTransformerBlock(nn.Module):
	    """
	    Single transformer layer of the BOOGU model.

	    The layer combines:
	    - Self-attention through ``Attention`` with RMS query/key normalization
	    - A ``LuminaFeedForward`` feed-forward network
	    - RMS normalization before and after both sub-layers
	    - Optional timestep modulation with tanh-gated residual branches

	    Args:
	        dim: Dimension of the input and output tensors
	        num_attention_heads: Number of attention heads
	        num_kv_heads: Number of key-value heads
	        multiple_of: Forwarded to the feed-forward network as ``multiple_of``
	        ffn_dim_multiplier: Forwarded to the feed-forward network as ``ffn_dim_multiplier``
	        norm_eps: Epsilon value for normalization layers
	        modulation: Whether ``norm1`` is conditioned on the timestep embedding
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_attention_heads: int,
	        num_kv_heads: int,
	        multiple_of: int,
	        ffn_dim_multiplier: float,
	        norm_eps: float,
	        modulation: bool = True,
	    ) -> None:
	        """Build the attention, feed-forward and normalization sub-modules."""
	        super().__init__()
	        per_head_dim = dim // num_attention_heads
	        self.head_dim = per_head_dim
	        self.modulation = modulation

	        # Prefer the flash-attention varlen processor; use the SDPA processor
	        # when constructing it raises ImportError.
	        try:
	            attn_processor = BOOGUAttnProcessorFlash2Varlen()
	        except ImportError:
	            attn_processor = BOOGUAttnProcessor()

	        # Attention layer
	        self.attn = Attention(
	            query_dim=dim,
	            cross_attention_dim=None,
	            dim_head=per_head_dim,
	            qk_norm="rms_norm",
	            heads=num_attention_heads,
	            kv_heads=num_kv_heads,
	            eps=1e-5,
	            bias=False,
	            out_bias=False,
	            processor=attn_processor,
	        )

	        # Feed-forward network
	        self.feed_forward = LuminaFeedForward(
	            dim=dim,
	            inner_dim=4 * dim,
	            multiple_of=multiple_of,
	            ffn_dim_multiplier=ffn_dim_multiplier,
	        )

	        # Pre-attention norm: timestep-conditioned when modulation is on, plain RMSNorm otherwise
	        if modulation:
	            self.norm1 = LuminaRMSNormZero(
	                embedding_dim=dim,
	                norm_eps=norm_eps,
	                norm_elementwise_affine=True,
	            )
	        else:
	            self.norm1 = RMSNorm(dim, eps=norm_eps)

	        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
	        self.norm2 = RMSNorm(dim, eps=norm_eps)
	        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)

	        self.initialize_weights()

	    def initialize_weights(self) -> None:
	        """
	        Initialize the weights of the transformer block.

	        Attention and feed-forward projection weights receive Xavier uniform
	        initialization; when modulation is enabled the linear layer inside
	        ``norm1`` is zero-initialized (weight and bias).
	        """
	        xavier_layers = (
	            self.attn.to_q,
	            self.attn.to_k,
	            self.attn.to_v,
	            self.attn.to_out[0],
	            self.feed_forward.linear_1,
	            self.feed_forward.linear_2,
	            self.feed_forward.linear_3,
	        )
	        for layer in xavier_layers:
	            nn.init.xavier_uniform_(layer.weight)

	        if self.modulation:
	            modulation_linear = self.norm1.linear
	            nn.init.zeros_(modulation_linear.weight)
	            nn.init.zeros_(modulation_linear.bias)

	    def _self_attention(
	        self,
	        normed_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        image_rotary_emb: torch.Tensor,
	    ) -> torch.Tensor:
	        """Run self-attention: the same tensor is used for queries and keys/values."""
	        return self.attn(
	            hidden_states=normed_states,
	            encoder_hidden_states=normed_states,
	            attention_mask=attention_mask,
	            image_rotary_emb=image_rotary_emb,
	        )

	    def _modulated_update(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        image_rotary_emb: torch.Tensor,
	        temb: torch.Tensor,
	    ) -> torch.Tensor:
	        """Apply attention and feed-forward with timestep-conditioned gates and scale."""
	        normed_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
	        attn_output = self._self_attention(normed_states, attention_mask, image_rotary_emb)
	        # Gates and scale are unsqueezed at dim 1 so they broadcast against the sequence tensors
	        hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
	        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
	        return hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)

	    def _plain_update(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        image_rotary_emb: torch.Tensor,
	    ) -> torch.Tensor:
	        """Apply attention and feed-forward without any timestep conditioning."""
	        attn_output = self._self_attention(self.norm1(hidden_states), attention_mask, image_rotary_emb)
	        hidden_states = hidden_states + self.norm2(attn_output)
	        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
	        return hidden_states + self.ffn_norm2(mlp_output)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        image_rotary_emb: torch.Tensor,
	        temb: Optional[torch.Tensor] = None,
	    ) -> torch.Tensor:
	        """
	        Forward pass of the transformer block.

	        When the instance carries a truthy ``enable_taylorseer`` attribute and
	        modulation is enabled, ``self.current['type']`` selects between a full
	        computation (whose result is handed to ``derivative_approximation``) and
	        a result obtained from ``taylor_formula``.

	        Args:
	            hidden_states: Input hidden states tensor
	            attention_mask: Attention mask tensor
	            image_rotary_emb: Rotary embeddings for image tokens
	            temb: Timestep embedding tensor; required when modulation is enabled

	        Returns:
	            torch.Tensor: Output hidden states after transformer block processing

	        Raises:
	            ValueError: If modulation is enabled and ``temb`` is None
	        """
	        # The unmodulated path is the same with or without taylorseer
	        if not self.modulation:
	            return self._plain_update(hidden_states, attention_mask, image_rotary_emb)

	        if temb is None:
	            raise ValueError("temb must be provided when modulation is enabled")

	        if not getattr(self, 'enable_taylorseer', False):
	            return self._modulated_update(hidden_states, attention_mask, image_rotary_emb, temb)

	        step_type = self.current['type']
	        if step_type == 'full':
	            # Full step: compute the block and pass the result to the Taylor cache helpers
	            self.current['module'] = 'total'
	            taylor_cache_init(cache_dic=self.cache_dic, current=self.current)
	            hidden_states = self._modulated_update(hidden_states, attention_mask, image_rotary_emb, temb)
	            derivative_approximation(cache_dic=self.cache_dic, current=self.current, feature=hidden_states)
	        elif step_type == 'Taylor':
	            # Taylor step: the block is not evaluated; the output comes from taylor_formula
	            self.current['module'] = 'total'
	            hidden_states = taylor_formula(cache_dic=self.cache_dic, current=self.current)
	        # Any other step type leaves hidden_states untouched

	        return hidden_states







	# class PromptTuningTransformerBlock(BOOGUTransformerBlock):
	class PromptTuningTransformerBlock(nn.Module):
	    """
	    Residual transformer block without timestep modulation.

	    The block applies, in order:

	    - RMS norm -> self-attention (q/k use ``qk_norm="rms_norm"``) -> RMS norm,
	      added back to the input as a residual
	    - RMS norm -> ``LuminaFeedForward`` -> RMS norm, added back as a residual

	    The attention processor is ``BOOGUAttnProcessorFlash2Varlen`` when it can
	    be constructed; if its construction raises ``ImportError`` the block
	    falls back to ``BOOGUAttnProcessor``.

	    Args:
	        dim: Width of the input and output hidden states.
	        num_attention_heads: Number of attention heads (``heads``).
	        num_kv_heads: Number of key/value heads (``kv_heads``).
	        multiple_of: Forwarded to ``LuminaFeedForward``.
	        ffn_dim_multiplier: Forwarded to ``LuminaFeedForward``.
	        norm_eps: Epsilon of the four ``RMSNorm`` layers. The attention
	            layer itself always uses ``eps=1e-5``.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_attention_heads: int,
	        num_kv_heads: int,
	        multiple_of: int,
	        ffn_dim_multiplier: float,
	        norm_eps: float,
	    ) -> None:
	        """Build the attention, feed-forward and normalization sub-modules."""
	        super().__init__()

	        self.head_dim = dim // num_attention_heads

	        # Function-local import kept from the original implementation.
	        from torch.nn import RMSNorm

	        try:
	            processor = BOOGUAttnProcessorFlash2Varlen()
	        except ImportError:
	            # Report the fallback through the warnings machinery instead of
	            # printing a debug banner to stdout for every constructed block.
	            warnings.warn(
	                "BOOGUAttnProcessorFlash2Varlen could not be constructed "
	                "(ImportError); falling back to BOOGUAttnProcessor."
	            )
	            processor = BOOGUAttnProcessor()

	        # Self-attention layer (all projections are created without bias).
	        self.attn = Attention(
	            query_dim=dim,
	            cross_attention_dim=None,
	            dim_head=self.head_dim,
	            qk_norm="rms_norm",
	            heads=num_attention_heads,
	            kv_heads=num_kv_heads,
	            eps=1e-5,
	            bias=False,
	            out_bias=False,
	            processor=processor,
	        )

	        # Feed-forward network.
	        self.feed_forward = LuminaFeedForward(
	            dim=dim,
	            inner_dim=4 * dim,
	            multiple_of=multiple_of,
	            ffn_dim_multiplier=ffn_dim_multiplier,
	        )

	        # Normalization layers; creation order is kept from the original so
	        # the module registration order does not change.
	        self.norm1 = RMSNorm(dim, eps=norm_eps)
	        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
	        self.norm2 = RMSNorm(dim, eps=norm_eps)
	        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)

	        self.initialize_weights()

	    def initialize_weights(self) -> None:
	        """
	        Apply Xavier-uniform initialization to the projection weights.

	        Covers the attention q/k/v/out projections and the three
	        feed-forward linear layers. Only ``weight`` tensors are touched; the
	        attention projections are created with ``bias=False``.
	        """
	        nn.init.xavier_uniform_(self.attn.to_q.weight)
	        nn.init.xavier_uniform_(self.attn.to_k.weight)
	        nn.init.xavier_uniform_(self.attn.to_v.weight)
	        nn.init.xavier_uniform_(self.attn.to_out[0].weight)

	        nn.init.xavier_uniform_(self.feed_forward.linear_1.weight)
	        nn.init.xavier_uniform_(self.feed_forward.linear_2.weight)
	        nn.init.xavier_uniform_(self.feed_forward.linear_3.weight)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        rotary_emb: torch.Tensor,
	    ) -> torch.Tensor:
	        """
	        Run the attention and feed-forward sub-layers with residuals.

	        Args:
	            hidden_states: Input hidden states.
	            attention_mask: Mask handed to the attention layer unchanged.
	            rotary_emb: Rotary embeddings, handed to the attention layer as
	                ``image_rotary_emb``.

	        Returns:
	            torch.Tensor: Hidden states after both residual updates.
	        """
	        # Self-attention: the normalized input serves as both the query
	        # source and the key/value source.
	        norm_hidden_states = self.norm1(hidden_states)
	        attn_output = self.attn(
	            hidden_states=norm_hidden_states,
	            encoder_hidden_states=norm_hidden_states,
	            attention_mask=attention_mask,
	            image_rotary_emb=rotary_emb,
	        )
	        hidden_states = hidden_states + self.norm2(attn_output)

	        # Feed-forward with its own pre- and post-normalization.
	        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
	        hidden_states = hidden_states + self.ffn_norm2(mlp_output)

	        return hidden_states





	@dataclass
	class TeaCacheParams:
	    """
	    Mutable TeaCache state carried from one timestep to the next.

	    The original docstring names `OmniGen2Transformer2DModel` as the consumer.
	    NOTE(review): confirm which model class in this file actually uses it.
	    See https://github.com/ali-vilab/TeaCache/ for a more comprehensive understanding

	    Args:
	        previous_residual (Optional[torch.Tensor]):
	            The tensor difference between the output and the input of the transformer layers from the previous timestep.
	        previous_modulated_inp (Optional[torch.Tensor]):
	            The modulated input from the previous timestep used to indicate the change of the transformer layer's output.
	        accumulated_rel_l1_distance (float):
	            The accumulated relative L1 distance.
	        is_first_or_last_step (bool):
	            Whether the current timestep is the first or last step.
	    """
	    # Nothing is cached until a first timestep has been processed.
	    previous_residual: Optional[torch.Tensor] = None
	    previous_modulated_inp: Optional[torch.Tensor] = None
	    # Float literal so the default matches the declared type.
	    accumulated_rel_l1_distance: float = 0.0
	    is_first_or_last_step: bool = False



	#################################################Prompt Tuning#####################################################################
	# class PromptEmbedding(nn.Module):
	class PromptEmbedding(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
	    """
	    Learnable prompt tokens refined by a stack of un-modulated transformer blocks.

	    The module owns an ``nn.Embedding`` table of trainable prompt tokens, a
	    ``BOOGUPromptTuningRotaryPosEmbed`` that supplies rotary embeddings and an
	    attention mask, and ``num_layers`` ``BOOGUTransformerBlock`` layers built
	    with ``modulation=False``.
	    """

	    _supports_gradient_checkpointing = True
	    _no_split_modules = ["PromptTuningTransformerBlock", "BOOGUTransformerBlock"]
	    _skip_layerwise_casting_patterns = ["prompt_token_embedding", "norm"]

	    def __init__(self, prompt_tuning_configs):
	        """
	        Build the module from a mapping of hyper-parameters.

	        Args:
	            prompt_tuning_configs: Mapping read with ``.get``; any missing
	                key falls back to the default listed below.
	        """
	        super().__init__()

	        # (key, default) pairs for every hyper-parameter this module reads.
	        fallback_values = (
	            ("num_trainable_prompt_tokens", 32),
	            ("hidden_size", 2048),
	            ("num_attention_heads", 32),
	            ("num_kv_heads", 8),
	            ("multiple_of", 256),
	            ("ffn_dim_multiplier", None),
	            ("norm_eps", 1e-5),
	            ("num_layers", 2),
	            ("theta", 10000),
	        )
	        resolved = {
	            key: prompt_tuning_configs.get(key, fallback)
	            for key, fallback in fallback_values
	        }

	        # Register the resolved values on the config (per the original
	        # author's note, this is what ends up in config.json).
	        self.register_to_config(**resolved)
	        self.prompt_tuning_configs = prompt_tuning_configs

	        cfg = self.config
	        rope_head_dim = cfg.hidden_size // cfg.num_attention_heads

	        self.prompt_token_embedding = nn.Embedding(
	            num_embeddings=cfg.num_trainable_prompt_tokens,
	            embedding_dim=cfg.hidden_size,
	        )

	        # Rotary position embedder for the prompt tokens.
	        self.prompt_rope_embedder = BOOGUPromptTuningRotaryPosEmbed(
	            theta=cfg.theta,
	            dim=rope_head_dim,
	            num_trainable_prompt_tokens=cfg.num_trainable_prompt_tokens,
	        )

	        blocks = []
	        for _ in range(cfg.num_layers):
	            blocks.append(
	                BOOGUTransformerBlock(
	                    dim=cfg.hidden_size,
	                    num_attention_heads=cfg.num_attention_heads,
	                    num_kv_heads=cfg.num_kv_heads,
	                    multiple_of=cfg.multiple_of,
	                    ffn_dim_multiplier=cfg.ffn_dim_multiplier,
	                    norm_eps=cfg.norm_eps,
	                    modulation=False,
	                )
	            )
	        self.prompt_tuning_layers = nn.ModuleList(blocks)

	        self.gradient_checkpointing = False

	        self.initialize_weights()

	    def initialize_weights(self) -> None:
	        """
	        Draw the prompt token table from N(0, 0.02).

	        The transformer layers are not touched here; they run their own
	        initialization when constructed.
	        """
	        nn.init.normal_(self.prompt_token_embedding.weight, mean=0.0, std=0.02)

	    def forward(self, idx = None, batch_size=1, device=None, use_causal_mask=True):
	        """
	        Produce the processed prompt tokens for a batch.

	        Args:
	            idx: Optional indices looked up in the embedding table. When
	                ``None`` the whole weight table is used.
	            batch_size: Size the token tensor is expanded to along a new
	                leading dimension.
	            device: Forwarded to the rotary embedder.
	            use_causal_mask: Forwarded to the rotary embedder.

	        Returns:
	            The hidden states after the last prompt-tuning layer.
	        """
	        table = self.prompt_token_embedding
	        tokens = table.weight if idx is None else table(idx)

	        # Add a leading batch dimension and broadcast it to ``batch_size``.
	        hidden_states = tokens.unsqueeze(0).expand(batch_size, -1, -1)

	        rotary_emb, attention_mask = self.prompt_rope_embedder(batch_size, device, use_causal_mask)

	        for layer in self.prompt_tuning_layers:
	            checkpointed = torch.is_grad_enabled() and self.gradient_checkpointing
	            if not checkpointed:
	                # Plain forward; no timestep conditioning is passed.
	                hidden_states = layer(hidden_states, attention_mask, rotary_emb)
	                continue
	            # Trade recomputation for memory while gradients are tracked.
	            hidden_states = self._gradient_checkpointing_func(
	                layer,
	                hidden_states,
	                attention_mask,
	                rotary_emb,
	            )
	        return hidden_states

	    @classmethod
	    def from_config(cls, config, **kwargs):
	        """
	        Instantiate the module from a config mapping.

	        Args:
	            config: Mapping handed to ``__init__`` as ``prompt_tuning_configs``.
	            **kwargs: Only ``weight_dtype`` is read; when it is not ``None``
	                every parameter is cast to that dtype.

	        Returns:
	            The new instance.
	        """
	        model = cls(prompt_tuning_configs=config)

	        target_dtype = kwargs.get("weight_dtype", None)
	        if target_dtype is None:
	            return model

	        for param in model.parameters():
	            param.data = param.data.to(dtype=target_dtype)
	        return model
	############################################################################################################################




	###################################################################my double stream block#######################################################################
	class BOOGUTransformerDoubleStreamBlock(nn.Module):
	    """
	    Double-stream transformer block with separate image and text streams.

	    Each forward pass:

	    1. normalizes both streams (with timestep modulation when enabled),
	    2. runs one joint attention over text and image through the
	       double-stream processor and splits the result per stream,
	    3. runs a self-attention over the image stream only,
	    4. applies a feed-forward network to each stream.

	    According to the original author the data flow follows YAK's
	    DoubleStreamXBlock, while the modulation uses ``LuminaRMSNormZero``.

	    Args:
	        dim: Dimension of the input and output tensors
	        num_attention_heads: Number of attention heads
	        num_kv_heads: Number of key-value heads
	        multiple_of: Forwarded to both ``LuminaFeedForward`` networks
	        ffn_dim_multiplier: Forwarded to both ``LuminaFeedForward`` networks
	        norm_eps: Epsilon value for normalization layers
	        modulation: Whether the stream norms are ``LuminaRMSNormZero``
	            (conditioned on ``temb``) or plain ``RMSNorm``
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_attention_heads: int,
	        num_kv_heads: int,
	        multiple_of: int,
	        ffn_dim_multiplier: float,
	        norm_eps: float,
	        modulation: bool = True,
	    ) -> None:
	        """Create all sub-modules of both streams."""
	        super().__init__()
	        self.head_dim = dim // num_attention_heads
	        self.num_attention_heads = num_attention_heads
	        self.modulation = modulation
	        self.hidden_size = dim

	        # Prefer the flash-attention processors; fall back when constructing
	        # them raises ImportError.
	        try:
	            processor = BOOGUAttnProcessorFlash2Varlen()
	        except ImportError:
	            processor = BOOGUAttnProcessor()

	        joint_processor_kwargs = dict(
	            head_dim=self.head_dim,
	            num_attention_heads=num_attention_heads,
	            num_kv_heads=num_kv_heads,
	            qkv_bias=False,
	        )
	        try:
	            double_stream_processor = BOOGUDoubleStreamSelfAttnProcessorFlash2Varlen(**joint_processor_kwargs)
	        except ImportError:
	            double_stream_processor = BOOGUDoubleStreamSelfAttnProcessor(**joint_processor_kwargs)

	        def build_attention(attn_processor):
	            # Both attention layers share every setting except the processor.
	            return Attention(
	                query_dim=dim,
	                cross_attention_dim=None,
	                dim_head=dim // num_attention_heads,
	                qk_norm="rms_norm",
	                heads=num_attention_heads,
	                kv_heads=num_kv_heads,
	                eps=1e-5,
	                bias=False,
	                out_bias=False,
	                processor=attn_processor,
	            )

	        def build_feed_forward():
	            return LuminaFeedForward(
	                dim=dim,
	                inner_dim=4 * dim,
	                multiple_of=multiple_of,
	                ffn_dim_multiplier=ffn_dim_multiplier,
	            )

	        def build_stream_norm():
	            # Modulated blocks use the norm that also emits gate/scale
	            # values from temb; otherwise a plain RMSNorm.
	            if modulation:
	                return LuminaRMSNormZero(
	                    embedding_dim=dim,
	                    norm_eps=norm_eps,
	                    norm_elementwise_affine=True,
	                )
	            return RMSNorm(dim, eps=norm_eps)

	        def build_rms_norm():
	            return RMSNorm(dim, eps=norm_eps)

	        # NOTE: the creation order below matches the original exactly so
	        # that module registration order and random initialization are
	        # unchanged.

	        # --- image stream ---
	        self.img_txt_attn = build_attention(double_stream_processor)  # joint text+image attention
	        self.img_self_attn = build_attention(processor)  # image-only self-attention
	        self.img_feed_forward = build_feed_forward()

	        self.img_norm1 = build_stream_norm()  # feeds the joint attention
	        self.img_norm2 = build_stream_norm()  # feeds the MLP
	        self.img_norm3 = build_stream_norm()  # feeds the image self-attention

	        self.img_ffn_norm1 = build_rms_norm()
	        self.img_attn_norm = build_rms_norm()
	        self.img_self_attn_norm = build_rms_norm()
	        self.img_ffn_norm2 = build_rms_norm()

	        # --- text stream (no attention layer of its own) ---
	        self.txt_feed_forward = build_feed_forward()

	        self.txt_norm1 = build_stream_norm()  # feeds the joint attention
	        self.txt_norm2 = build_stream_norm()  # feeds the MLP

	        self.txt_ffn_norm1 = build_rms_norm()
	        self.txt_attn_norm = build_rms_norm()
	        self.txt_ffn_norm2 = build_rms_norm()

	        self.initialize_weights()

	        # The joint attention is computed by the double-stream processor
	        # (per the original comment it has its own linear layers), so the
	        # q/k/v projections of img_txt_attn are frozen and then removed.
	        for unused_proj in ("to_q", "to_k", "to_v"):
	            for weight in getattr(self.img_txt_attn, unused_proj).parameters():
	                weight.requires_grad = False
	        for unused_proj in ("to_k", "to_v", "to_q"):
	            delattr(self.img_txt_attn, unused_proj)

	    def initialize_weights(self) -> None:
	        """
	        Initialize projection weights and modulation parameters.

	        Xavier-uniform is applied to the output projection of the joint
	        attention, to all four projections of the image self-attention and
	        to the linear layers of both feed-forward networks. When modulation
	        is enabled, the ``linear`` layer of every stream norm is zeroed.
	        """
	        xavier_layers = (
	            self.img_txt_attn.to_out[0],
	            self.img_self_attn.to_q,
	            self.img_self_attn.to_k,
	            self.img_self_attn.to_v,
	            self.img_self_attn.to_out[0],
	            self.img_feed_forward.linear_1,
	            self.img_feed_forward.linear_2,
	            self.img_feed_forward.linear_3,
	            self.txt_feed_forward.linear_1,
	            self.txt_feed_forward.linear_2,
	            self.txt_feed_forward.linear_3,
	        )
	        for layer in xavier_layers:
	            nn.init.xavier_uniform_(layer.weight)

	        if not self.modulation:
	            return

	        modulated_norms = (
	            self.img_norm1,
	            self.img_norm2,
	            self.img_norm3,
	            self.txt_norm1,
	            self.txt_norm2,
	        )
	        for norm in modulated_norms:
	            nn.init.zeros_(norm.linear.weight)
	            nn.init.zeros_(norm.linear.bias)

	    def _cross_modal_attention(
	        self,
	        img_normed,
	        txt_normed,
	        img_hidden_states,
	        txt_hidden_states,
	        joint_attention_mask,
	        rotary_emb,
	        encoder_seq_lengths,
	        seq_lengths,
	    ):
	        """Run the joint attention and split its output into (image, text) parts."""
	        # The processor is called directly: per the original comment, the
	        # standard Attention interface does not accept these arguments.
	        joint_out = self.img_txt_attn.processor(
	            attn=self.img_txt_attn,
	            img_hidden_states=img_normed,
	            txt_hidden_states=txt_normed,
	            joint_attention_mask=joint_attention_mask,
	            rotary_emb=rotary_emb,
	            encoder_seq_lengths=encoder_seq_lengths,
	            seq_lengths=seq_lengths,
	        )

	        batch = img_hidden_states.shape[0]
	        txt_out = txt_hidden_states.new_zeros(batch, txt_hidden_states.shape[1], self.hidden_size)
	        img_out = img_hidden_states.new_zeros(batch, img_hidden_states.shape[1], self.hidden_size)

	        # For each sample the first txt_len joint positions are copied to
	        # the text output and positions txt_len..total_len to the image
	        # output; everything else stays zero.
	        for row, (txt_len, total_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	            txt_out[row, :txt_len] = joint_out[row, :txt_len]
	            img_out[row, :total_len - txt_len] = joint_out[row, txt_len:total_len]

	        return img_out, txt_out

	    def _image_self_attention(self, img_normed, img_attention_mask, image_rotary_emb):
	        """Self-attention restricted to the image stream."""
	        return self.img_self_attn(
	            hidden_states=img_normed,
	            encoder_hidden_states=img_normed,
	            attention_mask=img_attention_mask,
	            image_rotary_emb=image_rotary_emb,
	        )

	    def forward(
	        self,
	        img_hidden_states: torch.Tensor,
	        txt_hidden_states: torch.Tensor,
	        img_attention_mask: torch.Tensor,
	        joint_attention_mask: torch.Tensor,
	        image_rotary_emb: torch.Tensor,
	        rotary_emb: torch.Tensor,
	        temb: Optional[torch.Tensor] = None,
	        encoder_seq_lengths: List[int] = None,
	        seq_lengths: List[int] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """
	        Update the image and text streams.

	        Shapes below are those stated by the original author's annotations.

	        Args:
	            img_hidden_states: Image tokens [B, L_img, D] (ref_img + noise_img)
	            txt_hidden_states: Text tokens [B, L_txt, D]
	            img_attention_mask: Mask for the image tokens [B, L_img]
	            joint_attention_mask: Mask for the [txt + img] sequence [B, L_total]
	            image_rotary_emb: Rotary embeddings for the image tokens
	            rotary_emb: Rotary embeddings for the [txt + img] sequence
	            temb: Timestep embeddings; required when modulation is enabled
	            encoder_seq_lengths: Text length of every sample
	            seq_lengths: Total (text + image) length of every sample

	        Returns:
	            Tuple of (updated_img_hidden_states, updated_txt_hidden_states)

	        Raises:
	            ValueError: If modulation is enabled and ``temb`` is ``None``.
	        """
	        if self.modulation and temb is None:
	            raise ValueError("temb must be provided when modulation is enabled")

	        if self.modulation:
	            # Every modulated norm is applied to the *incoming* hidden states.
	            # NOTE(review): this means the MLP inputs (norm2) are taken before
	            # the attention residuals are added, whereas the unmodulated
	            # branch below applies norm2 after them. Left unchanged because
	            # trained weights may depend on it.
	            img_cross_in, img_cross_gate, img_mlp_scale, img_mlp_gate = self.img_norm1(img_hidden_states, temb)
	            img_mlp_base, img_mlp_shift, _, _ = self.img_norm2(img_hidden_states, temb)
	            img_self_in, img_self_gate, _, _ = self.img_norm3(img_hidden_states, temb)

	            txt_cross_in, txt_cross_gate, txt_mlp_scale, txt_mlp_gate = self.txt_norm1(txt_hidden_states, temb)
	            txt_mlp_base, txt_mlp_shift, _, _ = self.txt_norm2(txt_hidden_states, temb)

	            img_cross_out, txt_cross_out = self._cross_modal_attention(
	                img_cross_in,
	                txt_cross_in,
	                img_hidden_states,
	                txt_hidden_states,
	                joint_attention_mask,
	                rotary_emb,
	                encoder_seq_lengths,
	                seq_lengths,
	            )
	            img_self_out = self._image_self_attention(img_self_in, img_attention_mask, image_rotary_emb)

	            # Image stream: joint attention, self-attention, then MLP, each
	            # added as a tanh-gated residual.
	            img_hidden_states = img_hidden_states + img_cross_gate.unsqueeze(1).tanh() * self.img_attn_norm(img_cross_out)
	            img_hidden_states = img_hidden_states + img_self_gate.unsqueeze(1).tanh() * self.img_self_attn_norm(img_self_out)

	            img_ffn_in = (1 + img_mlp_scale.unsqueeze(1)) * img_mlp_base + img_mlp_shift.unsqueeze(1)
	            img_ffn_out = self.img_feed_forward(self.img_ffn_norm1(img_ffn_in))
	            img_hidden_states = img_hidden_states + img_mlp_gate.unsqueeze(1).tanh() * self.img_ffn_norm2(img_ffn_out)

	            # Text stream: joint attention, then MLP (no self-attention).
	            txt_hidden_states = txt_hidden_states + txt_cross_gate.unsqueeze(1).tanh() * self.txt_attn_norm(txt_cross_out)

	            txt_ffn_in = (1 + txt_mlp_scale.unsqueeze(1)) * txt_mlp_base + txt_mlp_shift.unsqueeze(1)
	            txt_ffn_out = self.txt_feed_forward(self.txt_ffn_norm1(txt_ffn_in))
	            txt_hidden_states = txt_hidden_states + txt_mlp_gate.unsqueeze(1).tanh() * self.txt_ffn_norm2(txt_ffn_out)

	            return img_hidden_states, txt_hidden_states

	        # Unmodulated path: plain RMSNorm inputs and ungated residuals.
	        img_cross_in = self.img_norm1(img_hidden_states)
	        img_self_in = self.img_norm3(img_hidden_states)
	        txt_cross_in = self.txt_norm1(txt_hidden_states)

	        img_cross_out, txt_cross_out = self._cross_modal_attention(
	            img_cross_in,
	            txt_cross_in,
	            img_hidden_states,
	            txt_hidden_states,
	            joint_attention_mask,
	            rotary_emb,
	            encoder_seq_lengths,
	            seq_lengths,
	        )
	        img_self_out = self._image_self_attention(img_self_in, img_attention_mask, image_rotary_emb)

	        img_hidden_states = img_hidden_states + self.img_attn_norm(img_cross_out)
	        img_hidden_states = img_hidden_states + self.img_self_attn_norm(img_self_out)
	        # Here norm2 sees the hidden states after both attention residuals.
	        img_ffn_in = self.img_norm2(img_hidden_states)
	        img_ffn_out = self.img_feed_forward(self.img_ffn_norm1(img_ffn_in))
	        img_hidden_states = img_hidden_states + self.img_ffn_norm2(img_ffn_out)

	        txt_hidden_states = txt_hidden_states + self.txt_attn_norm(txt_cross_out)
	        txt_ffn_in = self.txt_norm2(txt_hidden_states)
	        txt_ffn_out = self.txt_feed_forward(self.txt_ffn_norm1(txt_ffn_in))
	        txt_hidden_states = txt_hidden_states + self.txt_ffn_norm2(txt_ffn_out)

	        return img_hidden_states, txt_hidden_states


	# Alias: the single-stream layers reuse BOOGUTransformerBlock under a
	# name that mirrors BOOGUTransformerDoubleStreamBlock.
	BOOGUTransformerSingleStreamBlock = BOOGUTransformerBlock




	# PromptTuningTransformerBlock = OmniGen2TransformerBlock



	class BOOGUSingleDoubleStreamTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
	"""
	BOOGU Mixed Single-Double Stream Transformer 2D Model.

	A transformer-based diffusion model that combines double-stream and single-stream processing:
	- Initial layers use double-stream processing (separate text and image streams)
	- Later layers use single-stream processing (joint text+image processing)
	- This follows YAK's architecture pattern but with BOOGU's components

	Args:
	patch_size: Size of image patches
	in_channels: Number of input channels
	out_channels: Number of output channels (defaults to in_channels)
	hidden_size: Size of hidden layers
	num_layers: Total number of transformer layers
	num_double_stream_layers: Number of initial double-stream layers
	num_refiner_layers: Number of refiner layers
	num_attention_heads: Number of attention heads
	num_kv_heads: Number of key-value heads
	multiple_of: Multiple of which the hidden dimension should be
	ffn_dim_multiplier: Multiplier for feed-forward network dimension
	norm_eps: Epsilon value for normalization layers
	axes_dim_rope: Dimensions for rotary position embeddings
	axes_lens: Lengths for rotary position embeddings
	text_feat_dim: Dimension of text features
	timestep_scale: Scale factor for timestep embeddings
	"""

	_supports_gradient_checkpointing = True
	_no_split_modules = ["BOOGUTransformerBlock", "BOOGUTransformerSingleStreamBlock", "BOOGUTransformerDoubleStreamBlock", "PromptEmbedding", "nn.Embedding", "PromptTuningTransformerBlock"]
	_skip_layerwise_casting_patterns = ["x_embedder", "norm", "embedding"]

	@register_to_config
	def __init__(
	    self,
	    patch_size: int = 2,
	    in_channels: int = 16,
	    out_channels: Optional[int] = None,
	    hidden_size: int = 2304,
	    num_layers: int = 26,
	    num_double_stream_layers: int = 2,
	    num_refiner_layers: int = 2,
	    num_attention_heads: int = 24,
	    num_kv_heads: int = 8,
	    multiple_of: int = 256,
	    ffn_dim_multiplier: Optional[float] = None,
	    norm_eps: float = 1e-5,
	    axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
	    axes_lens: Tuple[int, int, int] = (300, 512, 512),
	    text_feature_configs: Dict[str, Any] = dict(text_feat_dim=1024, reduce_type="concat", num_text_feature_layers=1),
	    prompt_tuning_configs: Dict[str, Any] = dict(use_prompt_tuning=False),
	    timestep_scale: float = 1.0,
	) -> None:
	    """
	    Build the mixed double-stream / single-stream transformer.

	    Args:
	        patch_size: Side length of a square patch; a patch carries
	            ``patch_size * patch_size * in_channels`` input features.
	        in_channels: Channel count of the input images.
	        out_channels: Channel count of the output; falls back to ``in_channels``
	            when ``None`` (or any other falsy value).
	        hidden_size: Width of every embedder and transformer block.
	        num_layers: Total number of double-stream plus single-stream blocks.
	        num_double_stream_layers: How many of ``num_layers`` are double-stream
	            blocks; the remainder are single-stream blocks.
	        num_refiner_layers: Number of blocks in each of the noise, reference-image
	            and context refiners.
	        num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier:
	            Forwarded unchanged to every transformer block.
	        norm_eps: Forwarded to the transformer blocks and the timestep/caption embedder.
	        axes_dim_rope: Forwarded to the rotary embedder; its sum must equal
	            ``hidden_size // num_attention_heads``.
	        axes_lens: Forwarded to the rotary embedder.
	        text_feature_configs: Read through ``cal_preprocessed_text_feat_dim`` to
	            derive the text feature width given to the timestep/caption embedder.
	        prompt_tuning_configs: Stored on the instance; not otherwise used here.
	        timestep_scale: Forwarded to the timestep/caption embedder.

	    Raises:
	        ValueError: If the per-head width does not match ``sum(axes_dim_rope)`` or
	            ``num_double_stream_layers`` exceeds ``num_layers``.
	    """
	    super().__init__()

	    # --- configuration checks ---
	    head_dim = hidden_size // num_attention_heads
	    rope_dim = sum(axes_dim_rope)
	    if head_dim != rope_dim:
	        raise ValueError(
	            f"hidden_size // num_attention_heads ({head_dim}) "
	            f"must equal sum(axes_dim_rope) ({rope_dim})"
	        )
	    if num_double_stream_layers > num_layers:
	        raise ValueError(
	            f"num_double_stream_layers ({num_double_stream_layers}) cannot be greater than "
	            f"num_layers ({num_layers})"
	        )

	    def stack_blocks(block_cls, count, modulation):
	        # Every block list in this model shares the same positional arguments.
	        return nn.ModuleList([
	            block_cls(
	                hidden_size,
	                num_attention_heads,
	                num_kv_heads,
	                multiple_of,
	                ffn_dim_multiplier,
	                norm_eps,
	                modulation=modulation,
	            )
	            for _ in range(count)
	        ])

	    self.out_channels = out_channels or in_channels
	    self.num_double_stream_layers = num_double_stream_layers
	    self.num_single_stream_layers = num_layers - num_double_stream_layers
	    self.text_feature_configs = text_feature_configs
	    self.prompt_tuning_configs = prompt_tuning_configs
	    self.preprocessed_text_feat_dim = self.cal_preprocessed_text_feat_dim(text_feature_configs)

	    # --- embeddings ---
	    self.rope_embedder = BOOGUDoubleStreamRotaryPosEmbed(
	        theta=10000,
	        axes_dim=axes_dim_rope,
	        axes_lens=axes_lens,
	        patch_size=patch_size,
	    )

	    patch_feature_dim = patch_size * patch_size * in_channels
	    self.x_embedder = nn.Linear(in_features=patch_feature_dim, out_features=hidden_size)
	    self.ref_image_patch_embedder = nn.Linear(in_features=patch_feature_dim, out_features=hidden_size)

	    self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
	        hidden_size=hidden_size,
	        text_feat_dim=self.preprocessed_text_feat_dim,
	        norm_eps=norm_eps,
	        timestep_scale=timestep_scale,
	    )

	    # --- refiners: only the context refiner is built without modulation ---
	    self.noise_refiner = stack_blocks(BOOGUTransformerBlock, num_refiner_layers, True)
	    self.ref_image_refiner = stack_blocks(BOOGUTransformerBlock, num_refiner_layers, True)
	    self.context_refiner = stack_blocks(BOOGUTransformerBlock, num_refiner_layers, False)

	    # --- main trunk: double-stream blocks first, then single-stream blocks ---
	    self.double_stream_layers = stack_blocks(
	        BOOGUTransformerDoubleStreamBlock, num_double_stream_layers, True
	    )
	    self.single_stream_layers = stack_blocks(
	        BOOGUTransformerSingleStreamBlock, self.num_single_stream_layers, True
	    )

	    # --- output norm & projection back to patch features ---
	    self.norm_out = LuminaLayerNormContinuous(
	        embedding_dim=hidden_size,
	        conditioning_embedding_dim=min(hidden_size, 1024),
	        elementwise_affine=False,
	        eps=1e-6,
	        bias=True,
	        out_dim=patch_size * patch_size * self.out_channels,
	    )

	    # One learnable vector per reference-image position; 5 rows, so at most 5 reference images.
	    self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size))

	    self.gradient_checkpointing = False

	    self.initialize_weights()

	    # --- TeaCache state (disabled by default) ---
	    self.enable_teacache = False
	    self.teacache_rel_l1_thresh = 0.05
	    self.teacache_params = TeaCacheParams()

	    # Polynomial applied to the relative L1 change before it is accumulated in forward().
	    self.rescale_func = np.poly1d(
	        [-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487]
	    )

	def initialize_weights(self) -> None:
	    """
	    Reset the parameters that this class initializes explicitly.

	    Both patch embedders receive Xavier-uniform weights and zero biases, the two
	    linear layers of ``norm_out`` are zeroed, and ``image_index_embedding`` is
	    redrawn from a normal distribution with a standard deviation of 0.02.
	    """
	    # Patch embedders (noise image first, then reference image).
	    for embedder in (self.x_embedder, self.ref_image_patch_embedder):
	        nn.init.xavier_uniform_(embedder.weight)
	        nn.init.constant_(embedder.bias, 0.0)

	    # Both linear layers of the output norm start at zero.
	    for linear in (self.norm_out.linear_1, self.norm_out.linear_2):
	        nn.init.zeros_(linear.weight)
	        nn.init.zeros_(linear.bias)

	    nn.init.normal_(self.image_index_embedding, std=0.02)

	# Reuse the same helper methods from original BOOGUTransformer2DModel
	def img_patch_embed_and_refine(
	    self,
	    hidden_states,
	    ref_image_hidden_states,
	    padded_img_mask,
	    padded_ref_img_mask,
	    noise_rotary_emb,
	    ref_img_rotary_emb,
	    l_effective_ref_img_len,
	    l_effective_img_len,
	    temb
	):
	    """
	    Embed and refine noise and reference-image patches, then pack them per sample.

	    The padded noise patches go through ``x_embedder`` and ``noise_refiner``. The
	    padded reference patches go through ``ref_image_patch_embedder``, receive one
	    ``image_index_embedding`` row per reference image (indexed by the image's
	    position within its sample), and are refined by ``ref_image_refiner`` with
	    every reference image treated as its own batch row.

	    ``l_effective_ref_img_len`` holds, per sample, a list of token counts (one per
	    reference image); ``l_effective_img_len`` holds one token count per sample.
	    ``padded_ref_img_mask`` is accepted but not read.

	    Returns:
	        A zero-padded tensor of shape ``(batch, max_combined_len, hidden_size)``
	        whose rows contain the reference tokens followed by the noise tokens.
	    """
	    num_samples = len(hidden_states)
	    combined_capacity = max(
	        noise_len + sum(ref_lens)
	        for noise_len, ref_lens in zip(l_effective_img_len, l_effective_ref_img_len)
	    )

	    hidden_states = self.x_embedder(hidden_states)
	    ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)

	    # One entry per reference image:
	    # (sample index, position within the sample, start offset, token count).
	    ref_spans = []
	    for sample_idx in range(num_samples):
	        offset = 0
	        for image_pos, token_count in enumerate(l_effective_ref_img_len[sample_idx]):
	            ref_spans.append((sample_idx, image_pos, offset, token_count))
	            offset += token_count

	    # Tag every reference image with the embedding of its position in the sample.
	    for sample_idx, image_pos, offset, token_count in ref_spans:
	        window = slice(offset, offset + token_count)
	        ref_image_hidden_states[sample_idx, window, :] = (
	            ref_image_hidden_states[sample_idx, window, :] + self.image_index_embedding[image_pos]
	        )

	    for block in self.noise_refiner:
	        hidden_states = block(hidden_states, padded_img_mask, noise_rotary_emb, temb)

	    all_ref_lens = list(itertools.chain.from_iterable(l_effective_ref_img_len))
	    ref_image_count = len(all_ref_lens)
	    longest_ref = max(all_ref_lens)

	    per_image_mask = ref_image_hidden_states.new_zeros(ref_image_count, longest_ref, dtype=torch.bool)
	    per_image_states = ref_image_hidden_states.new_zeros(ref_image_count, longest_ref, self.config.hidden_size)
	    per_image_rotary = hidden_states.new_zeros(
	        ref_image_count, longest_ref, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype
	    )
	    per_image_temb = temb.new_zeros(ref_image_count, *temb.shape[1:], dtype=temb.dtype)

	    # Sequence layout -> one batch row per reference image.
	    for row, (sample_idx, _, offset, token_count) in enumerate(ref_spans):
	        window = slice(offset, offset + token_count)
	        per_image_mask[row, :token_count] = True
	        per_image_states[row, :token_count] = ref_image_hidden_states[sample_idx, window]
	        per_image_rotary[row, :token_count] = ref_img_rotary_emb[sample_idx, window]
	        per_image_temb[row] = temb[sample_idx]

	    # Each reference image is refined independently of the others.
	    for block in self.ref_image_refiner:
	        per_image_states = block(per_image_states, per_image_mask, per_image_rotary, per_image_temb)

	    # One batch row per reference image -> back to the sequence layout.
	    for row, (sample_idx, _, offset, token_count) in enumerate(ref_spans):
	        ref_image_hidden_states[sample_idx, offset:offset + token_count] = per_image_states[row, :token_count]

	    combined = hidden_states.new_zeros(num_samples, combined_capacity, self.config.hidden_size)
	    for sample_idx, (ref_lens, noise_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
	        ref_total = sum(ref_lens)
	        combined[sample_idx, :ref_total] = ref_image_hidden_states[sample_idx, :ref_total]
	        combined[sample_idx, ref_total:ref_total + noise_len] = hidden_states[sample_idx, :noise_len]

	    return combined

	def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
	    """
	    Patchify every image and pad the resulting token sequences to a common length.

	    Args:
	        hidden_states: Sequence of per-sample image tensors, each ``(C, H, W)``.
	        ref_image_hidden_states: ``None``, or a per-sample sequence whose entries
	            are either ``None`` / an empty sequence (no reference images for that
	            sample) or a sequence of ``(C, H, W)`` reference-image tensors.

	    Returns:
	        An 8-tuple:
	        ``padded_hidden_states`` -- ``(batch, max_img_len, p * p * C)`` noise tokens;
	        ``padded_ref_img_hidden_states`` -- ``(batch, max_ref_img_len, p * p * C)``
	        concatenated reference tokens;
	        ``padded_img_mask`` / ``padded_ref_img_mask`` -- boolean masks that are True
	        on real (non-padding) tokens;
	        ``l_effective_ref_img_len`` -- per sample, the token count of each reference
	        image (``[0]`` when the sample has none);
	        ``l_effective_img_len`` -- per sample, the token count of the noise image;
	        ``ref_img_sizes`` -- per sample, a list of ``(H, W)`` or ``None``;
	        ``img_sizes`` -- per sample ``(H, W)``.
	    """
	    batch_size = len(hidden_states)
	    p = self.config.patch_size
	    device = hidden_states[0].device

	    def patchify(img):
	        # (C, H, W) -> (num_patches, p * p * C)
	        return rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)

	    img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
	    l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]

	    if ref_image_hidden_states is not None:
	        # An empty per-sample list is treated like None: torch.cat() below rejects
	        # an empty list, so such a sample must not reach the concatenation.
	        ref_img_sizes = [
	            [(img.size(1), img.size(2)) for img in imgs]
	            if imgs is not None and len(imgs) > 0
	            else None
	            for imgs in ref_image_hidden_states
	        ]
	    else:
	        ref_img_sizes = [None for _ in range(batch_size)]

	    # Samples without reference images are recorded as a single zero-length entry.
	    l_effective_ref_img_len = [
	        [(h // p) * (w // p) for (h, w) in sizes] if sizes is not None else [0]
	        for sizes in ref_img_sizes
	    ]

	    max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
	    max_img_len = max(l_effective_img_len)

	    # Reference images of one sample are patchified and concatenated along the token axis.
	    flat_ref_img_hidden_states = [
	        torch.cat([patchify(ref_img) for ref_img in ref_image_hidden_states[i]], dim=0)
	        if ref_img_sizes[i] is not None
	        else None
	        for i in range(batch_size)
	    ]

	    flat_hidden_states = [patchify(hidden_states[i]) for i in range(batch_size)]

	    feature_dim = flat_hidden_states[0].shape[-1]
	    dtype = flat_hidden_states[0].dtype

	    padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, feature_dim, device=device, dtype=dtype)
	    padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
	    for i in range(batch_size):
	        if ref_img_sizes[i] is not None:
	            ref_total = sum(l_effective_ref_img_len[i])
	            padded_ref_img_hidden_states[i, :ref_total] = flat_ref_img_hidden_states[i]
	            padded_ref_img_mask[i, :ref_total] = True

	    padded_hidden_states = torch.zeros(batch_size, max_img_len, feature_dim, device=device, dtype=dtype)
	    padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
	    for i in range(batch_size):
	        padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
	        padded_img_mask[i, :l_effective_img_len[i]] = True

	    return (
	        padded_hidden_states,
	        padded_ref_img_hidden_states,
	        padded_img_mask,
	        padded_ref_img_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	    )

	def cal_preprocessed_text_feat_dim(self, text_feature_configs: Dict[str, Any]):
	    """
	    Return the feature width of the text states after layer reduction.

	    Reads ``num_text_feature_layers`` (default 1, clamped to at least 1),
	    ``text_feat_dim`` (default 4096) and ``reduce_type`` (default ``"concat"``)
	    from ``text_feature_configs``. A ``reduce_type`` containing ``"cat"`` yields
	    ``num_text_feature_layers * text_feat_dim``; one containing ``"mean"`` yields
	    ``text_feat_dim``. Matching is case-insensitive.

	    Raises:
	        ValueError: If ``reduce_type`` contains neither ``"cat"`` nor ``"mean"``.
	    """
	    layer_count = max(text_feature_configs.get("num_text_feature_layers", 1), 1)
	    per_layer_dim = text_feature_configs.get("text_feat_dim", 4096)
	    reduce_type = text_feature_configs.get("reduce_type", "concat")

	    mode = reduce_type.lower()
	    if "cat" in mode:
	        return layer_count * per_layer_dim
	    if "mean" in mode:
	        return per_layer_dim
	    raise ValueError(f"Invalid reduce_type: {reduce_type}")


	def preprocess_text_hidden_states(self, raw_text_hidden_states, text_feature_configs: Dict[str, Any]):
	    """
	    Reduce per-layer text features to a single tensor and validate its width.

	    Args:
	        raw_text_hidden_states: Either a tensor, which is returned as is, or a
	            list/tuple of per-layer tensors that is reduced according to
	            ``reduce_type``.
	        text_feature_configs: Mapping read for ``num_text_feature_layers``
	            (default 1, clamped to at least 1) and ``reduce_type`` (default
	            ``"concat"``). A ``reduce_type`` containing ``"cat"`` concatenates the
	            layers along the last dimension; one containing ``"mean"`` averages
	            them element-wise. Matching is case-insensitive.

	    Returns:
	        A tensor whose last dimension equals ``self.preprocessed_text_feat_dim``.

	    Raises:
	        ValueError: On an unsupported input type, a layer count that differs from
	            ``num_text_feature_layers``, an unknown ``reduce_type``, or a result
	            whose last dimension does not match ``self.preprocessed_text_feat_dim``.
	    """
	    num_text_feature_layers = max(text_feature_configs.get("num_text_feature_layers", 1), 1)
	    reduce_type = text_feature_configs.get("reduce_type", "concat")

	    if isinstance(raw_text_hidden_states, torch.Tensor):
	        text_hidden_states = raw_text_hidden_states
	    elif isinstance(raw_text_hidden_states, (list, tuple)):
	        # Explicit raise instead of assert: the check must survive `python -O`.
	        if len(raw_text_hidden_states) != num_text_feature_layers:
	            raise ValueError(
	                f"Expected {num_text_feature_layers} text feature layers, "
	                f"but got {len(raw_text_hidden_states)}"
	            )
	        mode = reduce_type.lower()
	        if "cat" in mode:
	            text_hidden_states = torch.cat(raw_text_hidden_states, dim=-1)
	        elif "mean" in mode:
	            text_hidden_states = torch.mean(torch.stack(raw_text_hidden_states), dim=0)
	        else:
	            raise ValueError(f"Invalid reduce_type: {reduce_type}")
	    else:
	        raise ValueError(
	            "Invalid type of raw_text_hidden_states, expected torch.Tensor, list or tuple, "
	            f"but got {type(raw_text_hidden_states)}"
	        )

	    if text_hidden_states.shape[-1] != self.preprocessed_text_feat_dim:
	        raise ValueError(
	            f"Text feature dim {text_hidden_states.shape[-1]} does not match the expected "
	            f"preprocessed_text_feat_dim {self.preprocessed_text_feat_dim}"
	        )

	    return text_hidden_states

	def forward(
	    self,
	    hidden_states: Union[torch.Tensor, List[torch.Tensor]],
	    timestep: torch.Tensor,
	    text_hidden_states: torch.Tensor,
	    freqs_cis: torch.Tensor,
	    text_attention_mask: torch.Tensor,
	    ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
	    attention_kwargs: Optional[Dict[str, Any]] = None,
	    return_dict: bool = False,
	) -> Union[torch.Tensor, Transformer2DModelOutput]:
	    """
	    Forward pass combining double-stream and single-stream processing.

	    Processing flow:
	        1. Text preprocessing, timestep/text embedding, patchify + pad, rotary embeddings
	        2. Context refinement and image patch embedding/refinement
	        3. Double-stream processing: separate text and image streams
	        4. Stream fusion: text tokens followed by image tokens in one sequence
	        5. Single-stream processing of the joint sequence (optionally skipped by TeaCache)
	        6. Output projection and un-patchify

	    Args:
	        hidden_states: Input images, either a 4-D tensor or a list of per-sample
	            ``(C, H, W)`` tensors.
	        timestep: Timestep tensor passed to the timestep/caption embedder.
	        text_hidden_states: Text features; a tensor or a list/tuple of per-layer
	            tensors (see ``preprocess_text_hidden_states``).
	        freqs_cis: Frequency components passed to the rotary embedder.
	        text_attention_mask: Text attention mask.
	        ref_image_hidden_states: Optional per-sample lists of reference images.
	        attention_kwargs: Optional dict; only its ``"scale"`` entry (LoRA scale,
	            default 1.0) is read. The caller's dict is not modified.
	        return_dict: Whether to wrap the result in ``Transformer2DModelOutput``.

	    Returns:
	        The per-sample output images: stacked into one tensor when ``hidden_states``
	        was a tensor, otherwise a list. Wrapped in ``Transformer2DModelOutput`` when
	        ``return_dict`` is True.
	    """
	    text_hidden_states = self.preprocess_text_hidden_states(text_hidden_states, self.text_feature_configs)

	    enable_taylorseer = getattr(self, 'enable_taylorseer', False)
	    if enable_taylorseer:
	        cal_type(self.cache_dic, self.current)

	    # Record whether a scale was supplied *before* popping it; checking after the
	    # pop can never find the key, which made the warning below unreachable.
	    scale_was_passed = False
	    if attention_kwargs is not None:
	        attention_kwargs = attention_kwargs.copy()
	        scale_was_passed = attention_kwargs.get("scale", None) is not None
	        lora_scale = attention_kwargs.pop("scale", 1.0)
	    else:
	        lora_scale = 1.0

	    if USE_PEFT_BACKEND:
	        # weight the lora layers by setting `lora_scale` for each PEFT layer
	        scale_lora_layers(self, lora_scale)
	    elif scale_was_passed:
	        logger.warning(
	            "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
	        )

	    # === 1. Initial processing ===
	    batch_size = len(hidden_states)
	    is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)

	    if is_hidden_states_tensor:
	        assert hidden_states.ndim == 4
	        # Split the batch into a list of per-sample tensors.
	        hidden_states = list(hidden_states)

	    device = hidden_states[0].device

	    # Timestep and text embedding
	    temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)

	    # Flatten and pad sequences
	    (
	        hidden_states,
	        ref_image_hidden_states,
	        img_mask,
	        ref_img_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	    ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)

	    # Generate rotary embeddings
	    (
	        context_rotary_emb,
	        ref_img_rotary_emb,
	        noise_rotary_emb,
	        rotary_emb,
	        encoder_seq_lengths,
	        seq_lengths,
	        combined_img_rotary_emb,
	        combined_img_seq_lengths,
	    ) = self.rope_embedder(
	        freqs_cis,
	        text_attention_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	        device,
	    )

	    # === 2. Context refinement ===
	    for layer in self.context_refiner:
	        text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)

	    # Embed and refine image patches
	    combined_img_hidden_states = self.img_patch_embed_and_refine(
	        hidden_states,
	        ref_image_hidden_states,
	        img_mask,
	        ref_img_mask,
	        noise_rotary_emb,
	        ref_img_rotary_emb,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        temb,
	    )

	    # === 3. DOUBLE-STREAM PROCESSING ===
	    # Initialize text and image streams
	    txt_hidden_states = text_hidden_states
	    img_hidden_states = combined_img_hidden_states  # reference tokens followed by noise tokens

	    # Joint attention mask over the combined [txt + img] sequence: True on real tokens
	    max_seq_len = max(seq_lengths)
	    joint_attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
	    for i, seq_len in enumerate(seq_lengths):
	        joint_attention_mask[i, :seq_len] = True

	    # Process through double-stream layers (if any)
	    if self.num_double_stream_layers > 0:
	        # Attention mask over the combined image sequence [ref_img + noise_img]
	        max_img_len = max(combined_img_seq_lengths)
	        img_attention_mask = hidden_states.new_zeros(batch_size, max_img_len, dtype=torch.bool)
	        for i, img_seq_len in enumerate(combined_img_seq_lengths):
	            img_attention_mask[i, :img_seq_len] = True

	        # NOTE(review): unlike the single-stream section below, no
	        # self.current['stream'] value is set here -- confirm this is intended.
	        for layer_idx, layer in enumerate(self.double_stream_layers):
	            if enable_taylorseer:
	                layer.current = self.current
	                layer.cache_dic = self.cache_dic
	                layer.enable_taylorseer = True
	                self.current['layer'] = layer_idx

	            if torch.is_grad_enabled() and self.gradient_checkpointing:
	                img_hidden_states, txt_hidden_states = self._gradient_checkpointing_func(
	                    layer, img_hidden_states, txt_hidden_states, img_attention_mask, joint_attention_mask,
	                    combined_img_rotary_emb, rotary_emb, temb, encoder_seq_lengths, seq_lengths
	                )
	            else:
	                # Double-stream forward: returns (img_states, txt_states)
	                img_hidden_states, txt_hidden_states = layer(
	                    img_hidden_states, txt_hidden_states, img_attention_mask, joint_attention_mask,
	                    combined_img_rotary_emb, rotary_emb, temb, encoder_seq_lengths, seq_lengths
	                )

	    # === 4. STREAM FUSION: Combine text and image streams ===
	    joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
	    for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	        # Place text tokens first, then processed image tokens
	        joint_hidden_states[i, :encoder_seq_len] = txt_hidden_states[i, :encoder_seq_len]
	        joint_hidden_states[i, encoder_seq_len:seq_len] = img_hidden_states[i, :seq_len - encoder_seq_len]

	    # === 5. SINGLE-STREAM PROCESSING ===
	    hidden_states = joint_hidden_states

	    # TeaCache (optional): decide whether the single-stream layers must be recomputed
	    if self.enable_teacache and len(self.single_stream_layers) > 0:
	        teacache_hidden_states = hidden_states.clone()
	        teacache_temb = temb.clone()
	        modulated_inp, _, _, _ = self.single_stream_layers[0].norm1(teacache_hidden_states, teacache_temb)
	        if self.teacache_params.is_first_or_last_step:
	            should_calc = True
	            self.teacache_params.accumulated_rel_l1_distance = 0
	        else:
	            # Accumulate the rescaled relative L1 change of the modulated input.
	            self.teacache_params.accumulated_rel_l1_distance += self.rescale_func(
	                ((modulated_inp - self.teacache_params.previous_modulated_inp).abs().mean()
	                 / self.teacache_params.previous_modulated_inp.abs().mean()).cpu().item()
	            )
	            if self.teacache_params.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
	                should_calc = False
	            else:
	                should_calc = True
	                self.teacache_params.accumulated_rel_l1_distance = 0
	        self.teacache_params.previous_modulated_inp = modulated_inp
	    else:
	        should_calc = True

	    if self.enable_teacache and not should_calc:
	        # Reuse the residual cached by the last full computation.
	        hidden_states += self.teacache_params.previous_residual
	    else:
	        if enable_taylorseer:
	            self.current['stream'] = 'single_stream_layers'

	        if self.enable_teacache:
	            ori_hidden_states = hidden_states.clone()

	        for layer_idx, layer in enumerate(self.single_stream_layers):
	            if enable_taylorseer:
	                layer.current = self.current
	                layer.cache_dic = self.cache_dic
	                layer.enable_taylorseer = True
	                # Layer indices continue after the double-stream layers.
	                self.current['layer'] = self.num_double_stream_layers + layer_idx

	            if torch.is_grad_enabled() and self.gradient_checkpointing:
	                hidden_states = self._gradient_checkpointing_func(
	                    layer, hidden_states, joint_attention_mask, rotary_emb, temb
	                )
	            else:
	                # Single-stream forward: standard transformer block
	                hidden_states = layer(hidden_states, joint_attention_mask, rotary_emb, temb)

	        if self.enable_teacache:
	            self.teacache_params.previous_residual = hidden_states - ori_hidden_states

	    # === 6. Output projection ===
	    hidden_states = self.norm_out(hidden_states, temb)

	    # Reshape output back to image format
	    p = self.config.patch_size
	    output = []
	    for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
	        height, width = img_size
	        # The noise-image tokens are the last `img_len` real tokens of the joint sequence.
	        img_tokens = hidden_states[i][seq_len - img_len:seq_len]
	        img_output = rearrange(
	            img_tokens,
	            '(h w) (p1 p2 c) -> c (h p1) (w p2)',
	            h=height // p, w=width // p, p1=p, p2=p
	        )
	        output.append(img_output)

	    if is_hidden_states_tensor:
	        output = torch.stack(output, dim=0)

	    # Clean up LoRA scaling
	    if USE_PEFT_BACKEND:
	        unscale_lora_layers(self, lora_scale)

	    # Update TaylorSeer step counter
	    if enable_taylorseer:
	        self.current['step'] += 1

	    if not return_dict:
	        return output
	    return Transformer2DModelOutput(sample=output)


	##########################################################################################################################################









	class BOOGUTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
	"""
	BOOGU Transformer 2D Model.

	A transformer-based diffusion model for image generation with:
	- Patch-based image processing
	- Rotary position embeddings
	- Multi-head attention
	- Conditional generation support

	Args:
	patch_size: Size of image patches
	in_channels: Number of input channels
	out_channels: Number of output channels (defaults to in_channels)
	hidden_size: Size of hidden layers
	num_layers: Number of transformer layers
	num_refiner_layers: Number of refiner layers
	num_attention_heads: Number of attention heads
	num_kv_heads: Number of key-value heads
	multiple_of: Multiple of which the hidden dimension should be
	ffn_dim_multiplier: Multiplier for feed-forward network dimension
	norm_eps: Epsilon value for normalization layers
	axes_dim_rope: Dimensions for rotary position embeddings
	axes_lens: Lengths for rotary position embeddings
	text_feat_dim: Dimension of text features
	timestep_scale: Scale factor for timestep embeddings
	use_fused_rms_norm: Whether to use fused RMS normalization
	use_fused_swiglu: Whether to use fused SwiGLU activation
	"""

	_supports_gradient_checkpointing = True
	_no_split_modules = ["BOOGUTransformerBlock"]
	_skip_layerwise_casting_patterns = ["x_embedder", "norm"]

	@register_to_config
	def __init__(
	    self,
	    patch_size: int = 2,
	    in_channels: int = 16,
	    out_channels: Optional[int] = None,
	    hidden_size: int = 2304,
	    num_layers: int = 26,
	    num_refiner_layers: int = 2,
	    num_attention_heads: int = 24,
	    num_kv_heads: int = 8,
	    multiple_of: int = 256,
	    ffn_dim_multiplier: Optional[float] = None,
	    norm_eps: float = 1e-5,
	    axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
	    axes_lens: Tuple[int, int, int] = (300, 512, 512),
	    text_feat_dim: int = 1024,
	    timestep_scale: float = 1.0
	) -> None:
	    """
	    Build the BOOGU transformer.

	    Args:
	        patch_size: Side length of a square patch; a patch carries
	            ``patch_size * patch_size * in_channels`` input features.
	        in_channels: Channel count of the input images.
	        out_channels: Channel count of the output; falls back to ``in_channels``
	            when ``None`` (or any other falsy value).
	        hidden_size: Width of every embedder and transformer block.
	        num_layers: Number of blocks in the main ``layers`` stack.
	        num_refiner_layers: Number of blocks in each of the noise, reference-image
	            and context refiners.
	        num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier:
	            Forwarded unchanged to every transformer block.
	        norm_eps: Forwarded to the transformer blocks and the timestep/caption embedder.
	        axes_dim_rope: Forwarded to the rotary embedder; its sum must equal
	            ``hidden_size // num_attention_heads``.
	        axes_lens: Forwarded to the rotary embedder.
	        text_feat_dim: Text feature width given to the timestep/caption embedder.
	        timestep_scale: Forwarded to the timestep/caption embedder.

	    Raises:
	        ValueError: If the per-head width does not match ``sum(axes_dim_rope)``.
	    """
	    super().__init__()

	    # --- configuration check ---
	    head_dim = hidden_size // num_attention_heads
	    rope_dim = sum(axes_dim_rope)
	    if head_dim != rope_dim:
	        raise ValueError(
	            f"hidden_size // num_attention_heads ({head_dim}) "
	            f"must equal sum(axes_dim_rope) ({rope_dim})"
	        )

	    def stack_blocks(count, modulation):
	        # Every block list in this model shares the same positional arguments.
	        return nn.ModuleList([
	            BOOGUTransformerBlock(
	                hidden_size,
	                num_attention_heads,
	                num_kv_heads,
	                multiple_of,
	                ffn_dim_multiplier,
	                norm_eps,
	                modulation=modulation,
	            )
	            for _ in range(count)
	        ])

	    self.out_channels = out_channels or in_channels

	    # --- embeddings ---
	    self.rope_embedder = BOOGURotaryPosEmbed(
	        theta=10000,
	        axes_dim=axes_dim_rope,
	        axes_lens=axes_lens,
	        patch_size=patch_size,
	    )

	    patch_feature_dim = patch_size * patch_size * in_channels
	    self.x_embedder = nn.Linear(in_features=patch_feature_dim, out_features=hidden_size)
	    self.ref_image_patch_embedder = nn.Linear(in_features=patch_feature_dim, out_features=hidden_size)

	    self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
	        hidden_size=hidden_size,
	        text_feat_dim=text_feat_dim,
	        norm_eps=norm_eps,
	        timestep_scale=timestep_scale,
	    )

	    # --- refiners: only the context refiner is built without modulation ---
	    self.noise_refiner = stack_blocks(num_refiner_layers, True)
	    self.ref_image_refiner = stack_blocks(num_refiner_layers, True)
	    self.context_refiner = stack_blocks(num_refiner_layers, False)

	    # --- main transformer stack ---
	    self.layers = stack_blocks(num_layers, True)

	    # --- output norm & projection back to patch features ---
	    self.norm_out = LuminaLayerNormContinuous(
	        embedding_dim=hidden_size,
	        conditioning_embedding_dim=min(hidden_size, 1024),
	        elementwise_affine=False,
	        eps=1e-6,
	        bias=True,
	        out_dim=patch_size * patch_size * self.out_channels,
	    )

	    # One learnable vector per reference-image position; 5 rows, so at most 5 reference images.
	    self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size))

	    self.gradient_checkpointing = False

	    self.initialize_weights()

	    # --- TeaCache state (disabled by default) ---
	    self.enable_teacache = False
	    self.teacache_rel_l1_thresh = 0.05
	    self.teacache_params = TeaCacheParams()

	    # Polynomial stored for rescaling TeaCache distances.
	    self.rescale_func = np.poly1d(
	        [-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487]
	    )

	def initialize_weights(self) -> None:
	    """
	    Reset the parameters that this class initializes explicitly.

	    Both patch embedders receive Xavier-uniform weights and zero biases, the two
	    linear layers of ``norm_out`` are zeroed, and ``image_index_embedding`` is
	    redrawn from a normal distribution with a standard deviation of 0.02.
	    """
	    # Patch embedders (noise image first, then reference image).
	    for embedder in (self.x_embedder, self.ref_image_patch_embedder):
	        nn.init.xavier_uniform_(embedder.weight)
	        nn.init.constant_(embedder.bias, 0.0)

	    # Both linear layers of the output norm start at zero.
	    for linear in (self.norm_out.linear_1, self.norm_out.linear_2):
	        nn.init.zeros_(linear.weight)
	        nn.init.zeros_(linear.bias)

	    nn.init.normal_(self.image_index_embedding, std=0.02)

	def img_patch_embed_and_refine(
	    self,
	    hidden_states,
	    ref_image_hidden_states,
	    padded_img_mask,
	    padded_ref_img_mask,
	    noise_rotary_emb,
	    ref_img_rotary_emb,
	    l_effective_ref_img_len,
	    l_effective_img_len,
	    temb
	):
	    """
	    Embed and refine noise and reference-image patches, then pack them per sample.

	    The padded noise patches go through ``x_embedder`` and ``noise_refiner``. The
	    padded reference patches go through ``ref_image_patch_embedder``, receive one
	    ``image_index_embedding`` row per reference image (indexed by the image's
	    position within its sample), and are refined by ``ref_image_refiner`` with
	    every reference image treated as its own batch row.

	    ``l_effective_ref_img_len`` holds, per sample, a list of token counts (one per
	    reference image); ``l_effective_img_len`` holds one token count per sample.
	    ``padded_ref_img_mask`` is accepted but not read.

	    Returns:
	        A zero-padded tensor of shape ``(batch, max_combined_len, hidden_size)``
	        whose rows contain the reference tokens followed by the noise tokens.
	    """
	    num_samples = len(hidden_states)
	    combined_capacity = max(
	        noise_len + sum(ref_lens)
	        for noise_len, ref_lens in zip(l_effective_img_len, l_effective_ref_img_len)
	    )

	    hidden_states = self.x_embedder(hidden_states)
	    ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)

	    # One entry per reference image:
	    # (sample index, position within the sample, start offset, token count).
	    ref_spans = []
	    for sample_idx in range(num_samples):
	        offset = 0
	        for image_pos, token_count in enumerate(l_effective_ref_img_len[sample_idx]):
	            ref_spans.append((sample_idx, image_pos, offset, token_count))
	            offset += token_count

	    # Tag every reference image with the embedding of its position in the sample.
	    for sample_idx, image_pos, offset, token_count in ref_spans:
	        window = slice(offset, offset + token_count)
	        ref_image_hidden_states[sample_idx, window, :] = (
	            ref_image_hidden_states[sample_idx, window, :] + self.image_index_embedding[image_pos]
	        )

	    for block in self.noise_refiner:
	        hidden_states = block(hidden_states, padded_img_mask, noise_rotary_emb, temb)

	    all_ref_lens = list(itertools.chain.from_iterable(l_effective_ref_img_len))
	    ref_image_count = len(all_ref_lens)
	    longest_ref = max(all_ref_lens)

	    per_image_mask = ref_image_hidden_states.new_zeros(ref_image_count, longest_ref, dtype=torch.bool)
	    per_image_states = ref_image_hidden_states.new_zeros(ref_image_count, longest_ref, self.config.hidden_size)
	    per_image_rotary = hidden_states.new_zeros(
	        ref_image_count, longest_ref, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype
	    )
	    per_image_temb = temb.new_zeros(ref_image_count, *temb.shape[1:], dtype=temb.dtype)

	    # Sequence layout -> one batch row per reference image.
	    for row, (sample_idx, _, offset, token_count) in enumerate(ref_spans):
	        window = slice(offset, offset + token_count)
	        per_image_mask[row, :token_count] = True
	        per_image_states[row, :token_count] = ref_image_hidden_states[sample_idx, window]
	        per_image_rotary[row, :token_count] = ref_img_rotary_emb[sample_idx, window]
	        per_image_temb[row] = temb[sample_idx]

	    # Each reference image is refined independently of the others.
	    for block in self.ref_image_refiner:
	        per_image_states = block(per_image_states, per_image_mask, per_image_rotary, per_image_temb)

	    # One batch row per reference image -> back to the sequence layout.
	    for row, (sample_idx, _, offset, token_count) in enumerate(ref_spans):
	        ref_image_hidden_states[sample_idx, offset:offset + token_count] = per_image_states[row, :token_count]

	    combined = hidden_states.new_zeros(num_samples, combined_capacity, self.config.hidden_size)
	    for sample_idx, (ref_lens, noise_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
	        ref_total = sum(ref_lens)
	        combined[sample_idx, :ref_total] = ref_image_hidden_states[sample_idx, :ref_total]
	        combined[sample_idx, ref_total:ref_total + noise_len] = hidden_states[sample_idx, :noise_len]

	    return combined

	def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
	    """
	    Patchify every image and pad the resulting token sequences to a common length.

	    Args:
	        hidden_states: Sequence of per-sample image tensors, each ``(C, H, W)``.
	        ref_image_hidden_states: ``None``, or a per-sample sequence whose entries
	            are either ``None`` / an empty sequence (no reference images for that
	            sample) or a sequence of ``(C, H, W)`` reference-image tensors.

	    Returns:
	        An 8-tuple:
	        ``padded_hidden_states`` -- ``(batch, max_img_len, p * p * C)`` noise tokens;
	        ``padded_ref_img_hidden_states`` -- ``(batch, max_ref_img_len, p * p * C)``
	        concatenated reference tokens;
	        ``padded_img_mask`` / ``padded_ref_img_mask`` -- boolean masks that are True
	        on real (non-padding) tokens;
	        ``l_effective_ref_img_len`` -- per sample, the token count of each reference
	        image (``[0]`` when the sample has none);
	        ``l_effective_img_len`` -- per sample, the token count of the noise image;
	        ``ref_img_sizes`` -- per sample, a list of ``(H, W)`` or ``None``;
	        ``img_sizes`` -- per sample ``(H, W)``.
	    """
	    batch_size = len(hidden_states)
	    p = self.config.patch_size
	    device = hidden_states[0].device

	    def patchify(img):
	        # (C, H, W) -> (num_patches, p * p * C)
	        return rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)

	    img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
	    l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]

	    if ref_image_hidden_states is not None:
	        # An empty per-sample list is treated like None: torch.cat() below rejects
	        # an empty list, so such a sample must not reach the concatenation.
	        ref_img_sizes = [
	            [(img.size(1), img.size(2)) for img in imgs]
	            if imgs is not None and len(imgs) > 0
	            else None
	            for imgs in ref_image_hidden_states
	        ]
	    else:
	        ref_img_sizes = [None for _ in range(batch_size)]

	    # Samples without reference images are recorded as a single zero-length entry.
	    l_effective_ref_img_len = [
	        [(h // p) * (w // p) for (h, w) in sizes] if sizes is not None else [0]
	        for sizes in ref_img_sizes
	    ]

	    max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
	    max_img_len = max(l_effective_img_len)

	    # Reference images of one sample are patchified and concatenated along the token axis.
	    flat_ref_img_hidden_states = [
	        torch.cat([patchify(ref_img) for ref_img in ref_image_hidden_states[i]], dim=0)
	        if ref_img_sizes[i] is not None
	        else None
	        for i in range(batch_size)
	    ]

	    flat_hidden_states = [patchify(hidden_states[i]) for i in range(batch_size)]

	    feature_dim = flat_hidden_states[0].shape[-1]
	    dtype = flat_hidden_states[0].dtype

	    padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, feature_dim, device=device, dtype=dtype)
	    padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
	    for i in range(batch_size):
	        if ref_img_sizes[i] is not None:
	            ref_total = sum(l_effective_ref_img_len[i])
	            padded_ref_img_hidden_states[i, :ref_total] = flat_ref_img_hidden_states[i]
	            padded_ref_img_mask[i, :ref_total] = True

	    padded_hidden_states = torch.zeros(batch_size, max_img_len, feature_dim, device=device, dtype=dtype)
	    padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
	    for i in range(batch_size):
	        padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
	        padded_img_mask[i, :l_effective_img_len[i]] = True

	    return (
	        padded_hidden_states,
	        padded_ref_img_hidden_states,
	        padded_img_mask,
	        padded_ref_img_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	    )

	def forward(
	    self,
	    hidden_states: Union[torch.Tensor, List[torch.Tensor]],  # output_images' feature
	    timestep: torch.Tensor,
	    text_hidden_states: torch.Tensor,  # text' feature
	    freqs_cis: torch.Tensor,
	    text_attention_mask: torch.Tensor,
	    ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,  # input_images' feature
	    attention_kwargs: Optional[Dict[str, Any]] = None,
	    return_dict: bool = False,
	) -> Union[torch.Tensor, Transformer2DModelOutput]:
	    """Run the joint text/image transformer and un-patchify the image tokens.

	    Pipeline (mirrors the numbered comments below):
	      1. Embed timestep/text, flatten + pad image inputs, build rotary embeddings.
	      2. Refine the text tokens, then patch-embed and refine the image tokens.
	      3. Pack ``[text tokens | image tokens]`` per sample into one padded
	         sequence and run ``self.layers`` over it (optionally short-circuited
	         by TeaCache, or instrumented for TaylorSeer).
	      4. Apply the output norm and rearrange the trailing image tokens of
	         each sample back to ``(c, H, W)``.

	    Args:
	        hidden_states: Features of the images being generated. Either a 4-D
	            tensor (split along dim 0 into a per-sample list) or a list of
	            per-sample tensors.
	        timestep: Passed to ``self.time_caption_embed``.
	        text_hidden_states: Text features; embedded by
	            ``self.time_caption_embed`` and refined by ``self.context_refiner``.
	        freqs_cis: Passed to ``self.rope_embedder``.
	        text_attention_mask: Mask for the text tokens; used by the rope
	            embedder and the context-refiner layers.
	        ref_image_hidden_states: Optional per-sample lists of reference
	            (input) image features.
	        attention_kwargs: Optional dict. Only the ``"scale"`` key is read
	            here (LoRA scale, default ``1.0``); the caller's dict is copied,
	            never mutated.
	        return_dict: When ``False`` return the raw output, otherwise wrap it
	            in ``Transformer2DModelOutput``.

	    Returns:
	        A list of per-sample tensors, stacked into one tensor when the input
	        ``hidden_states`` was a tensor; or a ``Transformer2DModelOutput``
	        holding that value in ``sample`` when ``return_dict`` is true.
	    """
	    enable_taylorseer = getattr(self, 'enable_taylorseer', False)
	    if enable_taylorseer:
	        cal_type(self.cache_dic, self.current)

	    # Remember whether the caller supplied a scale *before* popping it;
	    # otherwise the non-PEFT warning below could never trigger.
	    scale_was_passed = attention_kwargs is not None and attention_kwargs.get("scale", None) is not None

	    if attention_kwargs is not None:
	        attention_kwargs = attention_kwargs.copy()
	        lora_scale = attention_kwargs.pop("scale", 1.0)
	    else:
	        lora_scale = 1.0

	    if USE_PEFT_BACKEND:
	        # weight the lora layers by setting `lora_scale` for each PEFT layer
	        scale_lora_layers(self, lora_scale)
	    elif scale_was_passed:
	        logger.warning(
	            "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
	        )

	    # 1. Condition, positional & patch embedding
	    batch_size = len(hidden_states)
	    is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)

	    if is_hidden_states_tensor:
	        assert hidden_states.ndim == 4
	        # Work on a per-sample list from here on.
	        hidden_states = [_hidden_states for _hidden_states in hidden_states]

	    device = hidden_states[0].device

	    temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)

	    (
	        hidden_states,
	        ref_image_hidden_states,
	        img_mask,
	        ref_img_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	    ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)

	    (
	        context_rotary_emb,
	        ref_img_rotary_emb,
	        noise_rotary_emb,
	        rotary_emb,
	        encoder_seq_lengths,
	        seq_lengths,
	    ) = self.rope_embedder(
	        freqs_cis,
	        text_attention_mask,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        ref_img_sizes,
	        img_sizes,
	        device,
	    )

	    # 2. Context refinement
	    for layer in self.context_refiner:
	        text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)

	    combined_img_hidden_states = self.img_patch_embed_and_refine(
	        hidden_states,
	        ref_image_hidden_states,
	        img_mask,
	        ref_img_mask,
	        noise_rotary_emb,
	        ref_img_rotary_emb,
	        l_effective_ref_img_len,
	        l_effective_img_len,
	        temb,
	    )

	    # 3. Joint Transformer blocks
	    # Longest packed sequence in the batch, e.g. 220 text tokens + 256 image tokens = 476.
	    max_seq_len = max(seq_lengths)

	    attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
	    joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)

	    # Per sample: valid text tokens first, image tokens right after, zero padding at the end.
	    for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
	        attention_mask[i, :seq_len] = True
	        joint_hidden_states[i, :encoder_seq_len] = text_hidden_states[i, :encoder_seq_len]
	        joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]

	    hidden_states = joint_hidden_states

	    if self.enable_teacache:
	        # TeaCache: decide whether the transformer layers must actually run
	        # by tracking how much the first layer's modulated input has drifted.
	        teacache_hidden_states = hidden_states.clone()
	        teacache_temb = temb.clone()
	        modulated_inp, _, _, _ = self.layers[0].norm1(teacache_hidden_states, teacache_temb)
	        if self.teacache_params.is_first_or_last_step:
	            should_calc = True
	            self.teacache_params.accumulated_rel_l1_distance = 0
	        else:
	            rel_l1_distance = (
	                (modulated_inp - self.teacache_params.previous_modulated_inp).abs().mean()
	                / self.teacache_params.previous_modulated_inp.abs().mean()
	            ).cpu().item()
	            self.teacache_params.accumulated_rel_l1_distance += self.rescale_func(rel_l1_distance)
	            if self.teacache_params.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
	                should_calc = False
	            else:
	                should_calc = True
	                self.teacache_params.accumulated_rel_l1_distance = 0
	        self.teacache_params.previous_modulated_inp = modulated_inp

	    if self.enable_teacache:
	        if not should_calc:
	            # Skip the layers and reuse the residual cached by the last full pass.
	            hidden_states += self.teacache_params.previous_residual
	        else:
	            ori_hidden_states = hidden_states.clone()
	            for layer in self.layers:
	                if torch.is_grad_enabled() and self.gradient_checkpointing:
	                    hidden_states = self._gradient_checkpointing_func(
	                        layer, hidden_states, attention_mask, rotary_emb, temb
	                    )
	                else:
	                    hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
	            # Cache what the layers added so a later step can skip them.
	            self.teacache_params.previous_residual = hidden_states - ori_hidden_states
	    else:
	        if enable_taylorseer:
	            self.current['stream'] = 'layers_stream'

	        for layer_idx, layer in enumerate(self.layers):
	            if enable_taylorseer:
	                # Hand the shared TaylorSeer state to the layer before calling it.
	                layer.current = self.current
	                layer.cache_dic = self.cache_dic
	                layer.enable_taylorseer = True
	                self.current['layer'] = layer_idx

	            if torch.is_grad_enabled() and self.gradient_checkpointing:
	                hidden_states = self._gradient_checkpointing_func(
	                    layer, hidden_states, attention_mask, rotary_emb, temb
	                )
	            else:
	                hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)

	    # 4. Output norm & projection
	    hidden_states = self.norm_out(hidden_states, temb)

	    p = self.config.patch_size
	    output = []
	    for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
	        height, width = img_size
	        # The last `img_len` valid tokens of each sample are the generated image's patches.
	        output.append(
	            rearrange(
	                hidden_states[i][seq_len - img_len:seq_len],
	                '(h w) (p1 p2 c) -> c (h p1) (w p2)',
	                h=height // p,
	                w=width // p,
	                p1=p,
	                p2=p,
	            )
	        )
	    if is_hidden_states_tensor:
	        output = torch.stack(output, dim=0)

	    if USE_PEFT_BACKEND:
	        # remove `lora_scale` from each PEFT layer
	        unscale_lora_layers(self, lora_scale)

	    if enable_taylorseer:
	        self.current['step'] += 1

	    if not return_dict:
	        return output
	    return Transformer2DModelOutput(sample=output)

	###############################################################################################################################################################################################

Xet Storage Details

Size: 248 kB
Xet hash: 121d9333cc8de129fb789151ca7bf252e11310aea65573c4ceb2aa0046a9d998

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.