import os
import math
import datetime
from dataclasses import dataclass
from typing import Tuple, Optional, Literal

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
from safetensors.torch import load_model

from kernel import act_quant, weight_dequant, fp8_gemm, int64_bmm_broadcast, \
    complex_int64_mul_broadcast, einsum_bshd_hdc_bshc, einsum_bshc_btc_bsht, softmax_init_q21, softmax_q21, einsum_bsht_btc_bshc, einsum_bshc_hdc_bshd, \
    silu_init_q25, silu_q25, sigmoid_q25, softmax_init_q19, softmax_q19, silu_init_q23, silu_q23, sigmoid_q23, RMS_Norm_int64


# Number of distributed processes the model is sharded across; the parallel
# layers below only shard / all-reduce when this is greater than 1.
world_size = 1
# Index of the current process; ParallelEmbedding uses it to pick its vocabulary shard.
rank = 0
# Block edge length for block-wise quantization: passed to act_quant and used to
# size the per-block weight-scale grid in Linear.
block_size = 128
# GEMM path used by linear()/linear_int() for 1-byte weights: "bf16" dequantizes
# the weight and calls F.linear, any other value quantizes x and calls fp8_gemm.
gemm_impl: Literal["bf16", "fp8"] = "bf16"
# Attention implementation selector; not read in the part of the file reviewed here.
attn_impl: Literal["naive", "absorb"] = "absorb"

# NOTE(review): presumably enables zk-SNARK related data dumping — confirm against its readers.
snark = True
# NOTE(review): presumably the directory such data is written to — confirm.
zkDataDir = '../zkdata'

@dataclass
class ModelArgs:
    """
    Hyperparameters describing the model architecture and its runtime limits.

    Fields are grouped as: general sizes, mixture-of-experts (MoE) routing,
    multi-head latent attention (MLA) projection sizes, and YaRN rotary
    position-embedding extension.

    Attributes:
        max_batch_size: Maximum batch size.
        max_seq_len: Maximum sequence length.
        dtype: Data type for computations ("bf16" or "fp8").
        vocab_size: Vocabulary size.
        dim: Model (hidden) dimension.
        inter_dim: Intermediate dimension of the dense MLP layers.
        moe_inter_dim: Intermediate dimension of the MoE expert layers.
        n_layers: Number of transformer layers.
        n_dense_layers: Number of dense layers in the model.
        n_heads: Number of attention heads.
        n_routed_experts: Number of routed experts per MoE layer.
        n_shared_experts: Number of shared experts per MoE layer.
        n_activated_experts: Number of experts activated per token.
        n_expert_groups: Number of expert groups.
        n_limited_groups: Number of groups routing is limited to.
        score_func: Scoring function for MoE routing ("softmax" or "sigmoid").
        route_scale: Scaling factor applied to routing scores.
        q_lora_rank: Low-rank dimension of the query projection.
        kv_lora_rank: Low-rank dimension of the key/value projection.
        qk_nope_head_dim: Per-head query/key dimension without positional encoding.
        qk_rope_head_dim: Per-head query/key dimension with rotary encoding.
        v_head_dim: Per-head value dimension.
        original_seq_len: Sequence length before context extension.
        rope_theta: Base of the rotary positional encoding.
        rope_factor: Scaling factor for extended sequence lengths.
        beta_fast: Fast beta correction factor.
        beta_slow: Slow beta correction factor.
        mscale: Scaling factor for extended attention.
    """
    # --- general ---
    max_batch_size: int = 8
    max_seq_len: int = 16384  # 4 * 4096
    dtype: Literal["bf16", "fp8"] = "bf16"
    vocab_size: int = 102400
    dim: int = 2048
    inter_dim: int = 10944
    moe_inter_dim: int = 1408
    n_layers: int = 27
    n_dense_layers: int = 1
    n_heads: int = 16

    # --- mixture of experts ---
    n_routed_experts: int = 64
    n_shared_experts: int = 2
    n_activated_experts: int = 6
    n_expert_groups: int = 1
    n_limited_groups: int = 1
    score_func: Literal["softmax", "sigmoid"] = "softmax"
    route_scale: float = 1.0

    # --- multi-head latent attention ---
    q_lora_rank: int = 0
    kv_lora_rank: int = 512
    qk_nope_head_dim: int = 128
    qk_rope_head_dim: int = 64
    v_head_dim: int = 128

    # --- YaRN rotary extension ---
    original_seq_len: int = 4096
    rope_theta: float = 10000.0
    rope_factor: float = 40
    beta_fast: int = 32
    beta_slow: int = 1
    mscale: float = 1.0

def saveTensor(fileName, t):
    """
    Dump the raw element bytes of a tensor to a file.

    The tensor is detached, moved to the CPU if necessary and made contiguous,
    then its elements are written in C (row-major) order using the machine's
    native byte layout. No header, shape or dtype information is stored.

    Args:
        fileName: Path of the file to create or overwrite.
        t (torch.Tensor): Tensor to serialize.
    """
    t = t.detach()
    if t.device.type != "cpu":
        t = t.cpu()
    t = t.contiguous()
    try:
        # .numpy() -> bytes (C order)
        payload = t.numpy().tobytes(order="C")
    except TypeError:
        # NumPy has no equivalent for some dtypes (e.g. bfloat16); reinterpret
        # the same contiguous storage as bytes instead.
        payload = t.reshape(-1).view(torch.uint8).numpy().tobytes(order="C")
    # Open once, in binary mode; the payload is built first so a conversion
    # failure does not leave a truncated file behind.
    with open(fileName, "wb") as f:
        f.write(payload)

class ParallelEmbedding(nn.Module):
    """
    Embedding table whose vocabulary rows are split evenly across processes.

    Each process stores `vocab_size // world_size` rows in an int64 buffer
    named "weight" and is responsible for the token ids in
    `[vocab_start_idx, vocab_end_idx)`.

    Args:
        vocab_size (int): Total vocabulary size; must be divisible by `world_size`.
        dim (int): Embedding dimension.
    """
    def __init__(self, vocab_size: int, dim: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim
        assert vocab_size % world_size == 0, f"Vocabulary size must be divisible by world size (world_size={world_size})"
        shard_rows = vocab_size // world_size
        self.part_vocab_size = shard_rows
        self.vocab_start_idx = rank * shard_rows
        self.vocab_end_idx = self.vocab_start_idx + shard_rows
        # Integer embedding table, stored as a buffer rather than a trainable
        # parameter (the original note gives [129280, 7168] as an example shape).
        self.register_buffer("weight", torch.empty(shard_rows, dim, dtype=torch.int64))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Look up embeddings for the given token ids.

        Args:
            x (torch.Tensor): Tensor of token indices.

        Returns:
            torch.Tensor: Embedded representations. With more than one process,
            the result is all-reduced across processes.
        """
        if world_size <= 1:
            return F.embedding(x, self.weight)

        # Ids that belong to another process's shard.
        foreign = (x < self.vocab_start_idx) | (x >= self.vocab_end_idx)
        # Shift into local row numbers (this creates a new tensor, so the
        # caller's `x` is untouched) and point foreign ids at row 0 for now.
        local_ids = x - self.vocab_start_idx
        local_ids[foreign] = 0
        y = F.embedding(local_ids, self.weight)
        # Zero the placeholder rows so that only the owning process contributes
        # to the reduction.
        y[foreign] = 0
        dist.all_reduce(y)
        return y


def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Applies a linear transformation to the incoming data: y = xA^T + b.

    The implementation is chosen from the storage width of `weight` and the
    module-level `gemm_impl` setting.

    Args:
        x (torch.Tensor): The input tensor.
        weight (torch.Tensor): The weight tensor. A 1-byte-per-element weight is
            treated as quantized and must carry a `scale` attribute.
        bias (Optional[torch.Tensor]): The bias tensor to be added. Default is None.

    Returns:
        torch.Tensor: The result of the linear transformation.

    Notes:
        - If `weight.element_size() > 1`, the weight is used as-is with `F.linear`.
        - Otherwise, if `gemm_impl == "bf16"`, the weight is dequantized with
          `weight_dequant` and `F.linear` is applied.
        - Otherwise `x` is quantized with `act_quant` (block size `block_size`)
          and `fp8_gemm` computes the product; the bias is then added in place.
    """
    # Weights wider than one byte are not quantized.
    if weight.element_size() > 1:
        return F.linear(x, weight, bias)

    if gemm_impl == "bf16":
        return F.linear(x, weight_dequant(weight, weight.scale), bias)

    x, scale = act_quant(x, block_size)
    y = fp8_gemm(x, scale, weight, weight.scale)
    if bias is not None:
        y += bias
    return y

def linear_int(x: torch.Tensor, weight: torch.Tensor, x_rescale, weight_rescale, res_rescale, bias: Optional[torch.Tensor] = None) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Linear transformation that always returns a pair of tensors.

    Args:
        x (torch.Tensor): The input tensor.
        weight (torch.Tensor): The weight tensor. A 1-byte-per-element weight is
            treated as quantized and must carry a `scale` attribute.
        x_rescale: Forwarded to `int64_bmm_broadcast` (multi-byte weight path only).
        weight_rescale: Forwarded to `int64_bmm_broadcast` (multi-byte weight path only).
        res_rescale: Forwarded to `int64_bmm_broadcast` (multi-byte weight path only).
        bias (Optional[torch.Tensor]): Bias for the quantized-weight paths.
            It is NOT applied on the `int64_bmm_broadcast` path.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: For multi-byte weights, the two values
        returned by `int64_bmm_broadcast` (named `q` and `r` in this file —
        presumably quotient and remainder of the rescaled product; confirm
        against the kernel). For 1-byte weights, the floating-point result and
        a scalar int64 zero placeholder.
    """
    if weight.element_size() > 1:
        q, r = int64_bmm_broadcast(x, weight, x_rescale, weight_rescale, res_rescale)
        return (q, r)

    # Placeholder second element so every path returns a pair.
    no_remainder = torch.tensor(0, dtype=torch.int64)

    if gemm_impl == "bf16":
        weight = weight_dequant(weight, weight.scale)
        return (F.linear(x, weight, bias), no_remainder)

    print('linear act_quant', flush=True)
    x, scale = act_quant(x, block_size)
    y = fp8_gemm(x, scale, weight, weight.scale)
    if bias is not None:
        y += bias
    return (y, no_remainder)

class Linear_int(nn.Module):
    """
    Linear layer whose weight is stored as a (non-trainable) buffer and whose
    forward pass goes through `linear_int`.

    Args:
        layer_id: Identifier of the layer this module belongs to (stored only).
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        x_rescale: Forwarded to `linear_int` on every call.
        weight_rescale: Forwarded to `linear_int` on every call.
        res_rescale: Forwarded to `linear_int` on every call.
        dtype: Data type of the weight buffer (required; there is no default).
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    dtype = torch.int64

    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, res_rescale, dtype, bias: bool = False):
        super().__init__()
        self.layer_id = layer_id
        self.in_features = in_features
        self.out_features = out_features

        self.x_rescale = x_rescale
        self.weight_rescale = weight_rescale
        self.res_rescale = res_rescale

        # Registered as a buffer so it is saved/loaded with the module but is
        # not a trainable parameter.
        self.register_buffer("weight", torch.empty(out_features, in_features, dtype=dtype))

        if bias:
            self.bias = nn.Parameter(torch.empty(out_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Apply the layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The pair returned by `linear_int`.
        """
        q, r = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, self.res_rescale, self.bias)
        return q, r

class Linear_rescale_int(nn.Module):
    """
    Buffer-weight linear layer whose result rescale value is read from a
    "scale" buffer at call time instead of being fixed at construction.

    Args:
        layer_id: Identifier of the layer this module belongs to (stored only).
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        x_rescale: Forwarded to `linear_int` on every call.
        weight_rescale: Forwarded to `linear_int` on every call.
        dtype: Data type of the weight buffer.
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    dtype = torch.int64

    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, dtype, bias: bool = False):
        super().__init__()
        self.layer_id = layer_id
        self.in_features, self.out_features = in_features, out_features
        self.x_rescale, self.weight_rescale = x_rescale, weight_rescale

        # Buffers: the weight matrix, then a scalar int32 "scale" initialised
        # to 0 (presumably overwritten when a checkpoint is loaded).
        weight_storage = torch.empty(out_features, in_features, dtype=dtype)
        self.register_buffer("weight", weight_storage)
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))

        if not bias:
            self.register_parameter("bias", None)
        else:
            self.bias = nn.Parameter(torch.empty(out_features))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the layer and return only the first output of `linear_int`.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: First element of the pair returned by `linear_int`.
        """
        result_rescale = self.scale.item()
        out, _unused = linear_int(
            x,
            self.weight,
            self.x_rescale,
            self.weight_rescale,
            result_rescale,
            self.bias,
        )
        return out

class Linear(nn.Module):
    """
    Linear layer with a trainable weight, optional block-wise quantization
    scales and optional bias.

    Args:
        layer_id: Identifier of the layer this module belongs to (stored only).
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        bias (bool): Whether to include a bias term. Defaults to False.
        dtype (optional): Data type of the weight. Defaults to `Linear.dtype`
            (`torch.bfloat16`).
    """
    dtype = torch.bfloat16

    def __init__(self, layer_id, in_features: int, out_features: int, bias: bool = False, dtype = None):
        super().__init__()
        self.layer_id = layer_id
        self.in_features = in_features
        self.out_features = out_features

        weight_dtype = dtype or Linear.dtype
        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=weight_dtype))

        # element_size() is the number of bytes per element, e.g.
        #   float32 / float64 / int64 -> 4 / 8 / 8
        #   bfloat16                  -> 2
        #   float8_e4m3fn             -> 1
        # One-byte weights are quantized and need one float32 scale per
        # block_size x block_size tile (tile counts are rounded up).
        if self.weight.element_size() != 1:
            self.register_parameter("scale", None)
        else:
            tile_rows = (out_features + block_size - 1) // block_size
            tile_cols = (in_features + block_size - 1) // block_size
            tile_scales = nn.Parameter(torch.empty(tile_rows, tile_cols, dtype=torch.float32))
            # The scale is reachable both from the weight tensor (read by
            # `linear`) and as a parameter of this module.
            self.weight.scale = tile_scales
            self.scale = tile_scales

        self.register_parameter("bias", nn.Parameter(torch.empty(out_features)) if bias else None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the layer via `linear`.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor after linear computation.
        """
        return linear(x, self.weight, self.bias)


class ColumnParallelLinear(Linear):
    """
    Linear layer sharded along the output dimension: each process owns
    `out_features // world_size` output features.

    Args:
        layer_id: Identifier of the layer this module belongs to.
        in_features (int): Number of input features.
        out_features (int): Total number of output features across all
            processes; must be divisible by `world_size`.
        bias (bool): Whether to include a bias term. Defaults to False.
        dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, bias: bool = False, dtype = None):
        assert out_features % world_size == 0, f"Output features must be divisible by world size (world_size={world_size})"
        local_out = out_features // world_size
        self.part_out_features = local_out
        super().__init__(layer_id, in_features, local_out, bias, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute this process's slice of the output; no cross-process
        communication happens here.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor with column-parallel computation.
        """
        return linear(x, self.weight, self.bias)

class ColumnParallelLinear_int(Linear_int):
    """
    `Linear_int` sharded along the output dimension: each process owns
    `out_features // world_size` output features.

    Args:
        layer_id: Identifier of the layer this module belongs to.
        in_features (int): Number of input features.
        out_features (int): Total number of output features across all
            processes; must be divisible by `world_size`.
        x_rescale: Forwarded to `linear_int`.
        weight_rescale: Forwarded to `linear_int`.
        res_rescale: Forwarded to `linear_int`.
        dtype: Data type of the weight buffer.
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, res_rescale, dtype, bias: bool = False):
        assert out_features % world_size == 0, f"Output features must be divisible by world size (world_size={world_size})"
        local_out = out_features // world_size
        self.part_out_features = local_out
        super().__init__(layer_id, in_features, local_out, x_rescale, weight_rescale, res_rescale, dtype, bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return only the first element of the pair produced by `linear_int`."""
        out, _unused = linear_int(
            x, self.weight, self.x_rescale, self.weight_rescale, self.res_rescale, self.bias
        )
        return out
    
class ColumnParallelLinear_rescale_int(Linear_int):
    """
    Column-sharded `Linear_int` whose result rescale value is read from a
    "scale" buffer at call time.

    Args:
        layer_id: Identifier of the layer this module belongs to.
        in_features (int): Number of input features.
        out_features (int): Total number of output features across all
            processes; must be divisible by `world_size`.
        x_rescale: Forwarded to `linear_int`.
        weight_rescale: Forwarded to `linear_int`.
        dtype: Data type of the weight buffer.
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, dtype, bias: bool = False):
        assert out_features % world_size == 0, f"Output features must be divisible by world size (world_size={world_size})"
        local_out = out_features // world_size
        self.part_out_features = local_out
        # The parent's res_rescale is fixed to 1; forward() does not read it
        # and uses the "scale" buffer instead.
        super().__init__(layer_id, in_features, local_out, x_rescale, weight_rescale, 1, dtype, bias)
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return only the first element of the pair produced by `linear_int`."""
        result_rescale = self.scale.item()
        out, _unused = linear_int(
            x, self.weight, self.x_rescale, self.weight_rescale, result_rescale, self.bias
        )
        return out


class RowParallelLinear(Linear):
    """
    Linear layer sharded along the input dimension: each process owns
    `in_features // world_size` input features and the partial products are
    combined with an all-reduce.

    Args:
        layer_id: Identifier of the layer this module belongs to.
        in_features (int): Total number of input features across all
            processes; must be divisible by `world_size`.
        out_features (int): Number of output features.
        bias (bool): Whether to include a bias term. Defaults to False.
        dtype (optional): Data type for the layer. Defaults to `torch.bfloat16`.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, bias: bool = False, dtype = None):
        assert in_features % world_size == 0, f"Input features must be divisible by world size (world_size={world_size})"
        local_in = in_features // world_size
        self.part_in_features = local_in
        super().__init__(layer_id, local_in, out_features, bias, dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the partial product, combine it across processes, then add the bias.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor with row-parallel computation.
        """
        # Bias is withheld from `linear` so it is added exactly once, after
        # the partial results have been combined.
        out = linear(x, self.weight)
        if world_size > 1:
            dist.all_reduce(out)
        if self.bias is None:
            return out
        out += self.bias
        return out

class RowParallelLinear_rescale_int(Linear_int):
    """
    `Linear_int` sharded along the input dimension, with the result rescale
    value read from a "scale" buffer at call time.

    Args:
        layer_id: Identifier of the layer this module belongs to.
        in_features (int): Total number of input features across all
            processes; must be divisible by `world_size`.
        out_features (int): Number of output features.
        x_rescale: Forwarded to `linear_int`.
        weight_rescale: Forwarded to `linear_int`.
        res_rescale: Accepted for signature compatibility; it is replaced by
            the "scale" buffer right after construction.
        dtype: Data type of the weight buffer.
        bias (bool): Whether to include a bias term. Defaults to False.
    """
    def __init__(self, layer_id, in_features: int, out_features: int, x_rescale, weight_rescale, res_rescale, dtype, bias: bool = False):
        assert in_features % world_size == 0, f"Input features must be divisible by world size (world_size={world_size})"
        self.part_in_features = in_features // world_size
        super().__init__(layer_id, self.part_in_features, out_features, x_rescale, weight_rescale, res_rescale, dtype, bias)
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))
        # Marked "useless" by the original author: forward() reads self.scale directly.
        self.res_rescale = self.scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the partial product, combine it across processes, then add the bias.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Transformed tensor with row-parallel computation.
        """
        rescale = self.scale.item()
        # The bias is deliberately NOT forwarded to linear_int: it is added once
        # below, after the all-reduce. Forwarding it as well would apply it twice
        # on the quantized-weight paths (and sum it world_size times in the
        # reduction). The int64 path of linear_int ignores bias, so its result
        # is unchanged.
        y, _ = linear_int(x, self.weight, self.x_rescale, self.weight_rescale, rescale)
        if world_size > 1:
            dist.all_reduce(y)
        if self.bias is not None:
            y += self.bias
        return y


class RMSNorm(nn.Module):
    """
    Root Mean Square Layer Normalization (RMSNorm) with a learnable
    per-feature gain initialised to ones.

    Args:
        dim (int): Size of the last (normalized) dimension of the input.
        eps (float): Epsilon value for numerical stability. Defaults to 1e-6.
    """
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim, self.eps = dim, eps
        gain = torch.ones(dim)
        self.weight = nn.Parameter(gain)

    def forward(self, x: torch.Tensor):
        """
        Normalize `x` over its last dimension.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Normalized tensor with the same shape as input.
        """
        normalized_shape = (self.dim,)
        return F.rms_norm(x, normalized_shape, weight=self.weight, eps=self.eps)

class RMSNorm_int(nn.Module):
    """
    Integer RMS normalization backed by the `RMS_Norm_int64` kernel.

    Args:
        dim (int): Size of the normalized dimension.
        dtype: Data type of the "weight" buffer (initialised to ones).
        eps (float): Stored on the module; not passed to the kernel in
            forward(). Defaults to 1e-6.
    """
    def __init__(self, dim: int, dtype, eps: float = 1e-6):
        super().__init__()
        self.dim, self.eps = dim, eps
        self.register_buffer("weight", torch.ones(dim, dtype=dtype))

    def forward(self, x: torch.Tensor):
        """
        Normalize the first batch entry of `x`.

        Only `x[0]` is processed; the normalized result gets a leading
        dimension of size 1 added back.

        Returns:
            tuple: `(normalized, rms)` — the re-batched first output of
            `RMS_Norm_int64` and its second output unchanged.
        """
        # Fixed-point scales, per the original author's notes:
        #   x       : 2 ** 31
        #   weight  : 2 ** 15 (values in the range 2^7 .. 2^14)
        #   rms     : 2 ** 28
        #   result  : 2 ** 16, because the intermediate result is divided by
        #             (1 << 15): 44 + 15 - 28 - 15 = 16
        # NOTE(review): the 44 in that sum does not match the 2 ** 31 stated
        # for x — confirm the actual input scale against the kernel.
        first_sample = x[0]
        normed, rms = RMS_Norm_int64(first_sample, self.weight, 1, self.dim)
        return (normed[None, :], rms)


def precompute_freqs_cis(args: ModelArgs) -> torch.Tensor:
    """
    Precomputes the rotary-embedding rotation table as fixed-point integers.

    For every position and every rotary frequency the unit complex number
    exp(i * position * frequency) is computed, split into (real, imag), scaled
    by 2**42, rounded and cast to int64. When `max_seq_len` exceeds
    `original_seq_len`, the frequencies are first blended with their
    `rope_factor`-scaled versions using a linear ramp (YaRN-style correction).

    Side effect: prints the minimum and maximum absolute table values.

    Args:
        args (ModelArgs): Model arguments containing positional embedding parameters.

    Returns:
        torch.Tensor: int64 tensor of shape
        [max_seq_len, qk_rope_head_dim // 2, 2] holding (cos, sin) * 2**42.
    """
    rope_dim = args.qk_rope_head_dim
    max_len = args.max_seq_len
    fast_beta = args.beta_fast
    slow_beta = args.beta_slow
    theta = args.rope_theta
    scale_factor = args.rope_factor

    def find_correction_dim(num_rotations, dim, base, max_seq_len):
        """
        Return the (fractional) frequency index at which the embedding
        completes `num_rotations` rotations over `max_seq_len` positions.
        """
        return dim * math.log(max_seq_len / (num_rotations * 2 * math.pi)) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_seq_len):
        """
        Return the (low, high) index range between the two rotation counts,
        rounded outward and clamped to [0, dim - 1].
        """
        lower = math.floor(find_correction_dim(low_rot, dim, base, max_seq_len))
        upper = math.ceil(find_correction_dim(high_rot, dim, base, max_seq_len))
        return max(lower, 0), min(upper, dim-1)

    def linear_ramp_factor(lo, hi, dim):
        """
        Return a float32 tensor of shape (dim,) that rises linearly from 0 at
        index `lo` to 1 at index `hi`, clamped to [0, 1].
        """
        if lo == hi:
            # Avoid a zero-width ramp (division by zero).
            hi += 0.001
        ramp = (torch.arange(dim, dtype=torch.float32) - lo) / (hi - lo)
        return torch.clamp(ramp, 0, 1)

    # Base frequencies 1 / theta^(2k / dim) for k = 0 .. dim/2 - 1;
    # a 1-D float32 tensor of length dim / 2.
    exponents = torch.arange(0, rope_dim, 2, dtype=torch.float32) / rope_dim
    freqs = 1.0 / (theta ** exponents)

    if max_len > args.original_seq_len:
        low, high = find_correction_range(fast_beta, slow_beta, rope_dim, theta, args.original_seq_len)
        smooth = 1 - linear_ramp_factor(low, high, rope_dim // 2)
        # smooth == 1 keeps the original frequency, smooth == 0 uses the
        # frequency divided by the scaling factor.
        freqs = freqs / scale_factor * (1 - smooth) + freqs * smooth

    # Outer product of positions and frequencies -> angles, shape [max_len, dim/2].
    positions = torch.arange(max_len)
    angles = torch.outer(positions, freqs)

    # Unit-magnitude complex numbers from polar form, shape [max_len, dim/2].
    unit_complex = torch.polar(torch.ones_like(angles), angles)

    # Split into (real, imag) along a new trailing axis: [max_len, dim/2, 2].
    real_imag = torch.view_as_real(unit_complex)

    # Convert to fixed point with a scale of 2^42.
    freqs_cis = (real_imag * (2 ** 42)).round().to(torch.int64)

    magnitudes = freqs_cis.abs()
    min1 = magnitudes.min()
    max1 = magnitudes.max()
    print(f'freqs_cis min {min1}, max: {max1}', flush=True)

    return freqs_cis

# x (q_pe) has shape [batch, seqLen, 128, 64]
def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """
    Applies rotary positional embeddings to the input tensor.

    The last dimension of `x` is regrouped into (real, imag) pairs, multiplied
    by the matching entries of `freqs_cis` with the
    `complex_int64_mul_broadcast` kernel, and flattened back.

    Args:
        x (torch.Tensor): Input tensor; indexed as 4-D here (the original
            notes give [batch, seqLen, heads, rope_dim]).
        freqs_cis (torch.Tensor): Precomputed rotation table for the
            `x.size(1)` positions being processed.

    Returns:
        torch.Tensor: Tensor with rotary embeddings applied; dimensions from
        index 3 onward are flattened into one.
    """
    # IMPORTANT: memory must be contiguous before calling into the native
    # library. Pairs layout: [..., rope_dim / 2, 2].
    pairs = x.contiguous().view(*x.shape[:-1], -1, 2)

    # Reshape the table to [1, seqLen, 1, rope_dim / 2, 2] so it lines up with
    # `pairs` across the batch and head axes.
    seq_len, n_pairs = pairs.size(1), pairs.size(-2)
    rotations = freqs_cis.view(1, seq_len, 1, n_pairs, 2)

    rotated = complex_int64_mul_broadcast(pairs, rotations)
    return rotated.flatten(3)


def getBF16PrintStr(ele):
    """
    Format a single bfloat16 element as '(1+M/128)*2^E', prefixed with '-'
    when the sign bit is set.

    `M` is the 7-bit mantissa field and `E` the 8-bit exponent field minus the
    bias of 127. The value is always printed in normalized form, so zero,
    subnormals, infinities and NaN (exponent field 0 or 255) are not rendered
    as their true values.

    Args:
        ele (torch.Tensor): A single-element tensor with 16-bit elements
            (expected to be bfloat16).

    Returns:
        str: The formatted bit-field decomposition.
    """
    # Reinterpret the 16 raw bits as an integer. int16 plus a mask yields the
    # same unsigned value as a uint16 view without requiring torch.uint16.
    bits = int(ele.cpu().view(torch.int16).item()) & 0xFFFF
    exponent = (bits >> 7) & 0xFF
    mantissa = bits & 0x7F
    sign = '-' if bits & 0x8000 else ''
    return f'{sign}(1+{mantissa}/128)*2^{exponent - 127}'

class MLA(nn.Module):
    """
    Multi-Headed Attention Layer (MLA), fixed-point integer variant.

    The projections are built from the ``*_int`` linear / norm layers and the
    attention math in ``forward`` runs through the integer kernels imported from
    ``kernel``. When the module-level ``snark`` flag is set, intermediate tensors
    are dumped under ``zkDataDir`` through ``saveTensor``.

    Attributes:
        layer_id: Index of the transformer layer; used in dump paths.
        dim (int): Dimensionality of the input features.
        n_heads (int): Number of attention heads.
        n_local_heads (int): Number of local attention heads for distributed systems.
        q_lora_rank (int): Rank for low-rank query projection.
        kv_lora_rank (int): Rank for low-rank key/value projection.
        qk_nope_head_dim (int): Dimensionality of non-positional query/key projections.
        qk_rope_head_dim (int): Dimensionality of rotary-positional query/key projections.
        qk_head_dim (int): Total dimensionality of query/key projections.
        v_head_dim (int): Dimensionality of value projections.
        softmax_scale1 (int): Numerator of the rational softmax scaling factor.
        softmax_scale2 (int): Denominator of the rational softmax scaling factor.
    """
    def __init__(self, layer_id, args: ModelArgs):
        """
        Builds the projection layers and the attention caches.

        Args:
            layer_id: Index of this layer in the transformer.
            args (ModelArgs): Model arguments containing the attention hyperparameters.
        """
        super().__init__()

        # RowParallelLinear / ColumnParallelLinear split a Linear layer by rows / columns
        # into sub-layers, one per device. For a layer of shape [in_features, out_features]
        # the row-parallel shard is [in_features / world_size, out_features] and the
        # column-parallel shard is [in_features, out_features / world_size], where
        # world_size is the number of devices.

        self.layer_id = layer_id

        # 7168 in practice
        self.dim = args.dim
        # 128 in practice
        self.n_heads = args.n_heads
        # Number of heads handled by the current process
        self.n_local_heads = args.n_heads // world_size
        # Query down-projection rank; 0 means no compression, 1536 in practice
        self.q_lora_rank = args.q_lora_rank
        # Key/value down-projection rank; 512 in practice
        self.kv_lora_rank = args.kv_lora_rank
        # Per-head query/key width without positional information; 128 in practice
        self.qk_nope_head_dim = args.qk_nope_head_dim
        # Per-head query/key width carrying RoPE positional information; 64 in practice
        self.qk_rope_head_dim = args.qk_rope_head_dim

        # 192 in practice
        self.qk_head_dim = args.qk_nope_head_dim + args.qk_rope_head_dim
        # Per-head value width; 128 in practice
        self.v_head_dim = args.v_head_dim

        if self.q_lora_rank == 0:
            # NOTE(review): forward() never derives q_nope / q_pe from this projection,
            # so this uncompressed-query configuration is not usable as written.
            self.wq = ColumnParallelLinear(layer_id, self.dim, self.n_heads * self.qk_head_dim)
        else:
            # Query down-projection, shape [7168, 1536]
            self.wq_a = Linear_int(layer_id, self.dim, self.q_lora_rank, 1, 1, 30, torch.int32)
            self.q_norm = RMSNorm_int(self.q_lora_rank, torch.int32)
            # Query up-projection (column parallel), originally one [1536, 128 * 192] matrix;
            # here split into a non-positional part (wq_b1) and a rotary part (wq_b2).
            self.wq_b1 = ColumnParallelLinear_int(layer_id, self.q_lora_rank, self.n_heads * args.qk_nope_head_dim, 1, 1, 30, torch.int32)
            self.wq_b2 = ColumnParallelLinear_int(layer_id, self.q_lora_rank, self.n_heads * args.qk_rope_head_dim, 1, 1, 30, torch.int32)

        # Key/value down-projection, originally one [576, 7168] matrix
        # (kv_lora_rank=512 + qk_rope_head_dim=64); here split into the latent part
        # (wkv_a1) and the rotary key part (wkv_a2).
        self.wkv_a1 = Linear_int(layer_id, self.dim, self.kv_lora_rank, 1, 1, 29, torch.int32)
        self.wkv_a2 = Linear_int(layer_id, self.dim, self.qk_rope_head_dim, 1, 1, 29, torch.int32)
        self.kv_norm = RMSNorm_int(self.kv_lora_rank, torch.int32)
        # Key/value up-projection (column parallel), originally one [32768, 512] matrix
        # (n_heads=128, qk_nope_head_dim=128, v_head_dim=128); here split into the key
        # part (wkv_b_1) and the value part (wkv_b_2).
        self.wkv_b_1 = ColumnParallelLinear_rescale_int(layer_id, self.kv_lora_rank, self.n_heads * self.qk_nope_head_dim, 1, 1, torch.int32)
        self.wkv_b_2 = ColumnParallelLinear_rescale_int(layer_id, self.kv_lora_rank, self.n_heads * self.v_head_dim, 1, 1, torch.int32)

        # Output projection (row parallel), shape [7168, 16384]
        self.wo = RowParallelLinear_rescale_int(layer_id, self.n_heads * self.v_head_dim, self.dim, 1, 1, 1, torch.int32)

        # Rational softmax scale. The float reference was
        #   qk_head_dim ** -0.5 * mscale ** 2,  mscale = 0.1 * args.mscale * ln(rope_factor) + 1.0
        # With qk_head_dim=192, mscale=1.0 and rope_factor=40 that is ~0.1352, which
        # 94 / 695 appears to approximate.
        self.softmax_scale1 = 94
        self.softmax_scale2 = 695

        if attn_impl == "naive":
            self.register_buffer("k_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.n_local_heads, self.qk_head_dim), persistent=False)
            self.register_buffer("v_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.n_local_heads, self.v_head_dim), persistent=False)
        else:
            # Cache of the normalised down-projected key/value latents (batch size fixed to 1)
            self.register_buffer("kv_cache", torch.zeros(1, args.max_seq_len, self.kv_lora_rank, dtype=torch.int64), persistent=False)
            # Cache of the keys after RoPE has been applied (batch size fixed to 1)
            self.register_buffer("pe_cache", torch.zeros(1, args.max_seq_len, self.qk_rope_head_dim, dtype=torch.int64), persistent=False)

    # x has shape [1, seqLen, 7168] and a rescale of 2^21
    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
        """
        Forward pass for the Multi-Headed Attention Layer (MLA).

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, dim).
            start_pos (int): Starting position in the sequence for caching.
            freqs_cis (torch.Tensor): Precomputed complex exponential values for rotary embeddings.
            mask (Optional[torch.Tensor]): Mask tensor to exclude certain positions from attention.

        Returns:
            torch.Tensor: Output tensor with the same shape as the input.
        """

        # Read batch size and sequence length, and derive the end position of this chunk.
        bsz, seqlen, _ = x.size()
        end_pos = start_pos + seqlen

        # Query projection: with a compressed query (q_lora_rank != 0) the input goes through
        # the down-projection wq_a, the norm q_norm, and the split up-projections wq_b1 / wq_b2.
        if self.q_lora_rank == 0:
            # NOTE(review): q is computed but q_nope / q_pe are never derived from it, so the
            # reshape below raises NameError on this path.
            q = self.wq(x)
        else:
            # x (attn_normed) has scale 2^21, the wq_a weight has scale 2^30,
            # q_down has scale 2^21.
            q_down, q_down_rem = self.wq_a(x)

            if snark:
                dirStr = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
                os.makedirs(dirStr, exist_ok=True)
                saveTensor(f'{dirStr}/wq_a_x.bin', x.cpu())
                saveTensor(f'{dirStr}/wq_a_w.bin', self.wq_a.weight.view(torch.uint32).cpu())
                saveTensor(f'{dirStr}/wq_a_y.bin', q_down.cpu())
                saveTensor(f'{dirStr}/q_norm_r.bin', q_down_rem.cpu())

            # q_norm output has a rescale of 2^19
            (q_normed, rms) = self.q_norm(q_down)

            if snark:
                dirStr = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
                os.makedirs(dirStr, exist_ok=True)
                saveTensor(f'{dirStr}/q_norm_x.bin', q_down.cpu())
                saveTensor(f'{dirStr}/q_norm_weight.bin', self.q_norm.weight.view(torch.uint32).cpu())
                saveTensor(f'{dirStr}/q_norm_rms.bin', rms.cpu())
                saveTensor(f'{dirStr}/q_norm_y.bin', q_normed.cpu())

            # q_nope / q_pe have a rescale of 2^19
            q_nope = self.wq_b1(q_normed)
            q_pe = self.wq_b2(q_normed)

        # Split the flat projections into heads:
        # q_nope -> [batch, seqLen, 128, 128], q_pe -> [batch, seqLen, 128, 64].
        q_nope = q_nope.view(bsz, seqlen, self.n_local_heads, self.qk_nope_head_dim)
        q_pe = q_pe.view(bsz, seqlen, self.n_local_heads, self.qk_rope_head_dim)

        # q_nope is the query part without positional information; q_pe receives the RoPE
        # rotation below. freqs_cis has a rescale of 2^42; after the rotation q_pe is back
        # at a rescale of 2^19.
        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/q_pe_x.bin', q_pe.cpu())
            saveTensor(f'{zkDataDir}/freqs_cis.bin', freqs_cis.cpu())

        q_pe = apply_rotary_emb(q_pe, freqs_cis)

        if snark:
            # Dump the rotated q_pe as the output matching q_pe_x.bin (previously the
            # q_norm weight was written here by mistake).
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/q_pe_y.bin', q_pe.cpu())

        # Key/value latent kv and positional key k_pe: the input is multiplied by the
        # down-projections wkv_a1 (latent, kv_lora_rank wide) and wkv_a2 (rotary key,
        # qk_rope_head_dim wide); RoPE is then applied to the latter.

        # x has a rescale of 2^21; kv has shape [batch, seqLen, 512] and rescale 2^21
        kv, kv_rem = self.wkv_a1(x)

        if snark:
            dirStr = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
            os.makedirs(dirStr, exist_ok=True)
            saveTensor(f'{dirStr}/wkv_a1_x.bin', x.cpu())
            saveTensor(f'{dirStr}/wkv_a1_w.bin', self.wkv_a1.weight.view(torch.uint32).cpu())
            saveTensor(f'{dirStr}/wkv_a1_y.bin', kv.cpu())
            saveTensor(f'{dirStr}/wkv_a1_r.bin', kv_rem.cpu())

        k_pe, _ = self.wkv_a2(x)

        # unsqueeze(2) inserts a singleton head axis: [batch, seqLen, 1, dim].
        # kv and k_pe have a rescale of 2^21.
        k_pe = apply_rotary_emb(k_pe.unsqueeze(2), freqs_cis)

        if attn_impl == "naive":
            # NOTE(review): this branch still follows the float reference implementation;
            # it uses self.wkv_b and self.softmax_scale, neither of which is assigned in
            # this class, and treats the kv_norm result as a single tensor.
            q = torch.cat([q_nope, q_pe], dim=-1)
            kv = self.wkv_b(self.kv_norm(kv))
            kv = kv.view(bsz, seqlen, self.n_local_heads, self.qk_nope_head_dim + self.v_head_dim)
            k_nope, v = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
            k = torch.cat([k_nope, k_pe.expand(-1, -1, self.n_local_heads, -1)], dim=-1)
            self.k_cache[:bsz, start_pos:end_pos] = k
            self.v_cache[:bsz, start_pos:end_pos] = v
            scores = torch.einsum("bshd,bthd->bsht", q, self.k_cache[:bsz, :end_pos]) * self.softmax_scale
        else:
            # Attention scores between query and key:
            #   * q_nope (after absorbing the key up-projection) dotted with kv_cache;
            #   * q_pe dotted with pe_cache;
            #   * the sum is multiplied by the softmax scale.

            # q_nope: [batch, seqLen, 128, 128], wkv_b_1: [128, 128, 512].
            # q_nope rescale 2^19, wkv_b_1 rescale 2^32.
            # Integer equivalent of torch.einsum("bshd,hdc->bshc", q_nope, wkv_b_1);
            # afterwards q_nope is [batch, seqLen, 128, 512].
            wkv_b_1 = self.wkv_b_1.weight.view(self.n_local_heads, -1, self.kv_lora_rank)
            q_nope = einsum_bshd_hdc_bshc(q_nope.contiguous(), wkv_b_1.contiguous(), self.wkv_b_1.scale.item())

            # kv_normed has a rescale of 2^23
            (kv_normed, rms) = self.kv_norm(kv)

            # kv_cache has a rescale of 2^23 and shape [batch, seqLen, 512]
            self.kv_cache[:bsz, start_pos:end_pos] = kv_normed

            # pe_cache has a rescale of 2^21
            self.pe_cache[:bsz, start_pos:end_pos] = k_pe.squeeze(2)

            # q_nope rescale 2^19, kv_cache rescale 2^23.
            # q_nope: [batch, seqLen, 128, 512], kv_cache: (batch, args.max_seq_len, 512).
            # Integer equivalent of torch.einsum("bshc,btc->bsht", q_nope, kv_cache[:bsz, :end_pos]).
            kv_cache1 = self.kv_cache[:bsz, :end_pos]
            # score1 has a rescale of 2^19
            score1 = einsum_bshc_btc_bsht(q_nope.contiguous(), kv_cache1.contiguous(), 23)

            # Integer equivalent of torch.einsum("bshr,btr->bsht", q_pe, pe_cache[:bsz, :end_pos]).
            pe_cache1 = self.pe_cache[:bsz, :end_pos]
            # q_pe has a rescale of 2^19, score2 has a rescale of 2^19
            score2 = einsum_bshc_btc_bsht(q_pe.contiguous(), pe_cache1.contiguous(), 21)

            # scores = (score1 + score2) * softmax_scale, kept at a rescale of 2^19 by using
            # the rational factor softmax_scale1 / softmax_scale2 with floor division.
            scores = (score1 + score2) * self.softmax_scale1 // self.softmax_scale2

        # After unsqueeze(1) the mask is [seqLen, 1, seqLen]; scores is [batch, seqLen, heads, t].
        if mask is not None:
            scores += mask.unsqueeze(1)

        # Softmax over the last dimension, computed by the fixed-point kernel.
        # scores and scores_new have a rescale of 2^19 and shape [bsz, seqLen, headCount, seqLen].
        scores_new = torch.empty_like(scores, dtype=torch.int64, device='cuda')

        # NOTE(review): an earlier comment here warned that softmax_q19 may overwrite its
        # input; the input is dumped before the call for that reason.
        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_x.bin', scores.contiguous().cpu())

        softmax_q19(scores.contiguous(), scores_new)

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/scores_softmax_y.bin', scores_new.cpu())

        if attn_impl == "naive":
            # NOTE(review): uses `scores`, not the softmax output `scores_new`.
            x = torch.einsum("bsht,bthd->bshd", scores, self.v_cache[:bsz, :end_pos])
        else:

            kv_cache2 = self.kv_cache[:bsz, :end_pos]

            # Final output: attention weights times the kv cache, then times the value
            # up-projection wkv_b_2, then the output projection wo.
            # Integer equivalent of torch.einsum("bsht,btc->bshc", scores_new, kv_cache2).
            # scores_new rescale 2^19, kv_cache2 rescale 2^23, bshc rescale 2^19.
            # Example shapes: scores_new [1, 8, 128, 8], bshc [1, 8, 128, 512].
            bshc = einsum_bsht_btc_bshc(scores_new.contiguous(), kv_cache2.contiguous(), 23)

            # v_head_dim = 128, kv_lora_rank = 512, n_local_heads = 128
            wkv_b_2 = self.wkv_b_2.weight
            wkv_b_2 = wkv_b_2.view(self.n_local_heads, -1, self.kv_lora_rank)

            # Integer equivalent of torch.einsum("bshc,hdc->bshd", bshc, wkv_b_2).
            # bshc rescale 2^19, wkv_b_2 rescale given by self.wkv_b_2.scale, x rescale 2^19.
            # bshc: [1, seqLen, 128, 512], wkv_b_2: [128, 128, 512].
            x = einsum_bshc_hdc_bshd(bshc.contiguous(), wkv_b_2.contiguous(), self.wkv_b_2.scale.item())

        # The returned x has shape [1, seqLen, 7168]
        x = self.wo(x.flatten(2))

        return x

class MLP(nn.Module):
    """
    Gated feed-forward block: ``w2(silu(w1(x)) * w3(x))``.

    Attributes:
        w1 (nn.Module): Column-parallel projection feeding the SiLU gate.
        w2 (nn.Module): Row-parallel projection back to the model dimension.
        w3 (nn.Module): Column-parallel projection multiplied with the gate.
    """
    def __init__(self, layer_id, dim: int, inter_dim: int):
        """
        Creates the three projections of the feed-forward block.

        Args:
            layer_id: Layer index forwarded to the parallel linear layers.
            dim (int): Input and output dimensionality.
            inter_dim (int): Hidden layer dimensionality.
        """
        super().__init__()
        self.w1 = ColumnParallelLinear(layer_id, dim, inter_dim)
        self.w2 = RowParallelLinear(layer_id, inter_dim, dim)
        self.w3 = ColumnParallelLinear(layer_id, dim, inter_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies the gated feed-forward transformation.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after MLP computation.
        """
        gate = F.silu(self.w1(x))
        up = self.w3(x)
        hidden = gate * up
        return self.w2(hidden)


class MLP_int(nn.Module):
    """
    Fixed-point gated feed-forward block built from rescaling integer linears.

    Attributes:
        layer_id: Layer index; used in the dump paths.
        w1 (nn.Module): Column-parallel projection feeding the SiLU gate.
        w2 (nn.Module): Row-parallel projection back to the model dimension.
        w3 (nn.Module): Column-parallel projection multiplied with the gate.
    """
    def __init__(self, layer_id, dim: int, inter_dim: int):
        """
        Creates the three integer projections of the feed-forward block.

        Args:
            layer_id: Layer index forwarded to the linear layers.
            dim (int): Input and output dimensionality.
            inter_dim (int): Hidden layer dimensionality.
        """
        super().__init__()
        self.layer_id = layer_id
        self.w1 = ColumnParallelLinear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)
        self.w2 = RowParallelLinear_rescale_int(layer_id, inter_dim, dim, 1, 1, 1, torch.int32)
        self.w3 = ColumnParallelLinear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)

    # The input x has a rescale of 2^23 and shape [bsz, seqLen, 7168]
    def forward(self, start_pos: int, x: torch.Tensor) -> torch.Tensor:
        """
        Computes ``w2(silu(w1(x)) * w3(x))`` in fixed-point arithmetic.

        Args:
            start_pos (int): Sequence start position; only used to build dump paths.
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor of shape [bsz, seqLen, dim].
        """
        # gate_in: [bsz, seqLen, inter_dim], rescale 2^23
        gate_in = self.w1(x)

        # gate_out receives silu(gate_in): same shape, int64, rescale 2^23
        gate_out = torch.empty_like(gate_in, dtype=torch.int64, device='cuda')

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_x.bin', gate_in.contiguous().cpu())

        silu_q23(gate_in, gate_out)

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/mlp_silu_y.bin', gate_out.cpu())

        # up: [1, seqLen, inter_dim], rescale 2^23
        up = self.w3(x)

        # The product of two 2^23-scaled values is divided by 2^23 (floor division)
        # to return to a 2^23 scale before the output projection.
        hidden = gate_out * up // (1 << 23)
        return self.w2(hidden)


class Gate(nn.Module):
    """
    Gating mechanism for routing inputs in a mixture-of-experts (MoE) model,
    fixed-point integer variant.

    Attributes:
        layer_id (int): Layer index; used in the dump paths.
        dim (int): Dimensionality of input features.
        topk (int): Number of top experts activated for each input.
        n_groups (int): Number of groups for routing.
        topk_groups (int): Number of groups to route inputs to.
        score_func (str): Scoring function ('softmax' or 'sigmoid').
        route_scale (float): Scaling factor for routing weights (stored; forward
            applies the fixed ratio 5 / 2 instead).
        weight (torch.Tensor): int32 gate weight buffer.
        scale (torch.Tensor): int32 scalar buffer passed to ``linear_int`` as the rescale.
        bias (Optional[torch.Tensor]): int32 bias buffer, present only when dim == 7168.
    """
    def __init__(self, layer_id: int, args: ModelArgs):
        """
        Initializes the Gate module.

        Args:
            layer_id (int): Index of the transformer layer.
            args (ModelArgs): Model arguments containing gating parameters.
        """
        super().__init__()

        self.layer_id = layer_id

        self.dim = args.dim
        # n_activated_experts = 8
        self.topk = args.n_activated_experts
        # n_expert_groups = 8
        self.n_groups = args.n_expert_groups
        # n_limited_groups = 4
        self.topk_groups = args.n_limited_groups
        # score_func = 'sigmoid'
        self.score_func = args.score_func
        # route_scale = 2.5
        self.route_scale = args.route_scale
        # n_routed_experts = 256
        self.register_buffer("weight", torch.empty(args.n_routed_experts, args.dim, dtype=torch.int32))
        self.register_buffer("scale", torch.tensor(0, dtype=torch.int32))
        if self.dim == 7168:
            self.register_buffer("bias", torch.empty(args.n_routed_experts, dtype=torch.int32))
        else:
            self.bias = None

    # x has a rescale of 2^23
    def forward(self, start_pos: int, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass for the gating mechanism.

        Args:
            start_pos (int): Sequence start position; only used to build dump paths.
            x (torch.Tensor): Input tensor.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Routing weights and selected expert indices.
        """

        x = x.view(1, -1, self.dim)

        # self.weight: [256, 7168]; scores: [1, seqLen, 256]
        rescale = self.scale.item()

        # scores has a rescale of 2^23
        scores, scores_rem = linear_int(x, self.weight, 1, 1, rescale)

        # x: [seqLen, 7168]
        x = x.view(-1, self.dim)

        if self.score_func == "softmax":
            scores = scores.softmax(dim=-1, dtype=torch.float32)
        else:
            # Fixed-point replacement for scores.sigmoid(); the kernel writes into sig_out.
            sig_out = torch.empty_like(scores, dtype=torch.int64, device='cuda')

            if snark:
                saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_x.bin', scores.cpu())
                saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_r.bin', scores_rem.cpu())

            sigmoid_q23(scores, sig_out)

            if snark:
                saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/sigmoid_gate_y.bin', sig_out.cpu())

            # scores: [seqLen, 256]
            scores = sig_out.squeeze(0)

        # The bias has a rescale of 2^23. The un-biased scores are kept because the final
        # routing weights are gathered from them, not from the biased scores.
        original_scores = scores
        if self.bias is not None:
            # scores: [seqLen, 256]
            scores = scores + self.bias

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_original_scores.bin', original_scores.contiguous().cpu())
            # The bias buffer only exists when dim == 7168; skip the dump otherwise
            # instead of failing on None.
            if self.bias is not None:
                saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_bias.bin', self.bias.view(torch.uint32).cpu())

        # n_groups = 8
        if self.n_groups > 1:
            # scores: [seqLen, 8, 32]
            scores = scores.view(x.size(0), self.n_groups, -1)
            if self.bias is None:
                group_scores = scores.amax(dim=-1)
            else:
                # topk(2) returns the two largest values (and their indices) along the last
                # dim; [0] takes the values and sum(-1) adds them. The 256 scores are split
                # into 8 groups and each group is scored by the sum of its two largest
                # entries, giving group_scores of shape [seqLen, 8].
                group_scores = scores.topk(2, dim=-1)[0].sum(dim=-1)

            # topk_groups = 4: indices of the 4 best groups out of 8, e.g. [[0, 2, 4, 6], ...].
            # indices: [seqLen, 4]
            indices = group_scores.topk(self.topk_groups, dim=-1)[1]

            # mask: [seqLen, 8]. scatter_ writes False at the selected group indices, so the
            # kept groups are False and the dropped groups stay True,
            # e.g. [[False, True, False, True, False, True, False, True], ...].
            mask = scores.new_ones(x.size(0), self.n_groups, dtype=bool).scatter_(1, indices, False)
            # Every score in a dropped group (128 entries per row, half of the row) is
            # overwritten with a large negative sentinel, the integer stand-in for "-inf".
            # After flatten(1) scores is [seqLen, 256] again.
            scores = scores.masked_fill_(mask.unsqueeze(-1), -(1 << 42)).flatten(1)

        # Among the 128 surviving scores pick the indices of the top 8.
        # self.topk = 8, indices: [seqLen, 8]
        indices = torch.topk(scores, self.topk, dim=-1)[1]

        # gather picks the un-biased scores at the selected indices.
        # weights: [seqLen, 8]
        weights = original_scores.gather(1, indices)

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_indices.bin', indices.contiguous().cpu())
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/gate_weights.bin', weights.contiguous().cpu())

        if self.score_func == "sigmoid":
            # Fixed-point version of weights /= weights.sum(dim=-1, keepdim=True):
            # multiply by 2^23 first so the quotient stays at a 2^23 scale.
            sum1 = weights.sum(dim=-1, keepdim=True)
            weights = (weights * (2 ** 23)) // sum1

        # Fixed-point version of weights *= route_scale with route_scale = 2.5 = 5 / 2.
        # NOTE(review): the ratio is hard-coded; self.route_scale is not consulted.
        weights = weights * 5 // 2

        return weights, indices


class Expert_int(nn.Module):
    """
    Fixed-point expert layer for Mixture-of-Experts (MoE) models.

    Attributes:
        layer_id: Layer index; used in the dump paths.
        idx: Expert index; used in the dump paths.
        w1 (nn.Module): Projection feeding the SiLU gate.
        w2 (nn.Module): Projection back to the model dimension.
        w3 (nn.Module): Projection multiplied with the gate.
    """
    def __init__(self, layer_id, idx, dim: int, inter_dim: int):
        """
        Creates the three integer projections of the expert.

        Args:
            layer_id: Layer index forwarded to the linear layers.
            idx: Index of this expert within the layer.
            dim (int): Input and output dimensionality.
            inter_dim (int): Hidden layer dimensionality.
        """
        super().__init__()

        self.layer_id = layer_id
        self.idx = idx

        # In the float reference: w1 [2048, 7168], w2 [7168, 2048], w3 [2048, 7168].
        self.w1 = Linear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)
        self.w2 = Linear_rescale_int(layer_id, inter_dim, dim, 1, 1, torch.int32)
        self.w3 = Linear_rescale_int(layer_id, dim, inter_dim, 1, 1, torch.int32)

    # todo: add row id in the forward function
    def forward(self, start_pos: int, x: torch.Tensor) -> torch.Tensor:
        """
        Computes ``w2(silu(w1(x)) * w3(x))`` in fixed-point arithmetic.

        Args:
            start_pos (int): Sequence start position; only used to build dump paths.
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after expert computation.
        """
        # pre_act: [bsz, seqLen, inter_dim], rescale 2^23
        pre_act = self.w1(x)

        # activated receives silu(pre_act): same shape, int64, rescale 2^23
        activated = torch.empty_like(pre_act, dtype=torch.int64, device='cuda')

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_x.bin', pre_act.contiguous().cpu())

        silu_q23(pre_act, activated)

        if snark:
            saveTensor(f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}/expert_{self.idx}_silu_y.bin', activated.cpu())

        # up has a rescale of 2^23
        up = self.w3(x)

        # Shift the 2^46-scaled product back down to a 2^23 scale before the output
        # projection; the result has shape [bsz, seqLen, 7168].
        hidden = (activated * up) >> 23
        return self.w2(hidden)


class MoE(nn.Module):
    """
    Mixture-of-Experts (MoE) module, fixed-point integer variant.

    Routed experts are not kept resident: each expert selected by the gate is
    constructed and loaded from ``ckpt_path`` inside ``forward``.

    Attributes:
        layer_id: Layer index; used for dump paths and expert checkpoint paths.
        ckpt_path: Directory containing the per-layer expert checkpoints.
        dim (int): Dimensionality of input features.
        moe_inter_dim (int): Hidden dimensionality of each routed expert.
        n_routed_experts (int): Total number of experts in the model.
        n_local_experts (int): Number of experts handled locally in distributed systems.
        n_activated_experts (int): Number of experts activated for each input.
        experts_start_idx (int): First expert index owned by this rank.
        experts_end_idx (int): One past the last expert index owned by this rank.
        gate (nn.Module): Gating mechanism to route inputs to experts.
        shared_experts (nn.Module): Shared experts applied to all inputs.
    """
    def __init__(self, layer_id, args: ModelArgs, ckpt_path):
        """
        Initializes the MoE module.

        Args:
            layer_id: Index of the transformer layer.
            args (ModelArgs): Model arguments containing MoE parameters.
            ckpt_path: Directory from which expert weights are loaded on demand.
        """
        super().__init__()
        self.layer_id = layer_id
        self.ckpt_path = ckpt_path
        self.dim = args.dim
        self.moe_inter_dim = args.moe_inter_dim
        assert args.n_routed_experts % world_size == 0, f"Number of experts must be divisible by world size (world_size={world_size})"
        self.n_routed_experts = args.n_routed_experts
        self.n_local_experts = args.n_routed_experts // world_size
        self.n_activated_experts = args.n_activated_experts
        # This rank owns the half-open expert range [experts_start_idx, experts_end_idx).
        self.experts_start_idx = rank * self.n_local_experts
        self.experts_end_idx = self.experts_start_idx + self.n_local_experts
        self.gate = Gate(layer_id, args)

        # dim = 7168, n_shared_experts = 1, moe_inter_dim = 2048
        self.shared_experts = MLP_int(layer_id, args.dim, args.n_shared_experts * args.moe_inter_dim)

    # x has a rescale of 2^23 and shape [1, seqLen, 7168]
    def forward(self, start_pos: int, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the MoE module.

        Args:
            start_pos (int): Sequence start position; forwarded to sub-modules for dump paths.
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after expert routing and computation.
        """
        # Shared-expert output, rescale 2^23
        shared_out = self.shared_experts(start_pos, x)

        # Flatten tokens: [bsz, seqLen, 7168] -> [bsz * seqLen, 7168]
        orig_shape = x.size()
        x = x.view(-1, self.dim)

        # weights: [seqLen, 8], indices: [seqLen, 8]; weights has a rescale of 2^23
        weights, indices = self.gate(start_pos, x)

        # Accumulator for the routed experts: [seqLen, 7168]
        routed_out = torch.zeros_like(x)
        # bincount tallies how many times each of the 256 expert ids was selected.
        counts = torch.bincount(indices.flatten(), minlength=self.n_routed_experts).tolist()
        for i in range(self.experts_start_idx, self.experts_end_idx):
            if not counts[i]:
                continue

            # Build the expert on the GPU and load its weights from the checkpoint.
            with torch.device("cuda"):
                expert = Expert_int(self.layer_id, i, self.dim, self.moe_inter_dim)
            expertModelPath = os.path.join(self.ckpt_path, f"experts-{self.layer_id}/{i}.safetensors")
            load_model(expert, expertModelPath)

            # token_rows[k] is a token that selected expert i and slot[k] is the position
            # of expert i within that token's top-k list. Example with indices =
            #   [0, 1, 3, 2, 5, 4, 6, 9]
            #   [7, 8, 3, 12, 5, 11, 6, 1]
            #   [16, 10, 3, 2, 15, 4, 6, 9]
            #   [10, 21, 3, 2, 5, 4, 1, 9]
            # torch.where(indices == 1) returns ([0, 1, 3], [1, 7, 6]).
            token_rows, slot = torch.where(indices == i)

            expert_in = x[token_rows].unsqueeze(0)
            expert_out = expert(start_pos, expert_in).view(-1, self.dim)
            # Fixed-point version of y[idx] += expert(x[idx]) * weights[idx, top, None]:
            # the product is floor-divided by 2^23 to stay at a 2^23 scale.
            routed_out[token_rows] += expert_out * weights[token_rows, slot, None] // (1 << 23)

        if world_size > 1:
            dist.all_reduce(routed_out)
        return (routed_out + shared_out).view(orig_shape)

def getBF8PrintStr(ele):
    """
    Render one 8-bit float element as a "<exponent> <mantissa>" string.

    The element's storage byte is reinterpreted as uint8 and split into a sign
    bit (0x80), a 4-bit exponent field (bits 3..6) and a 3-bit mantissa field
    (bits 0..2). Both fields are reported as raw integers, not decoded values.

    A diagnostic line is printed when the exponent field is 15 and the mantissa
    field is 7 (labelled "Nan"), or when the exponent field is 0 (labelled
    "subnormal"; note that this also fires for a zero mantissa).

    Args:
        ele (torch.Tensor): Single-element tensor with a 1-byte element type
            (presumably float8_e4m3fn — TODO confirm against callers).

    Returns:
        str: "<exponent> <mantissa>", prefixed with "-" when the sign bit is set.
    """
    raw = int(ele.cpu().view(torch.uint8).item())
    exponent = (raw >> 3) & 0xF
    mantissa = raw & 0x7

    # The two warning conditions are mutually exclusive (exponent 0 vs 15).
    if exponent == 0:
        print(f'BF8 subnormal: {exponent} {mantissa} !!!', flush=True)
    elif exponent == 15 and mantissa == 7:
        print(f'BF8 Nan: {exponent} {mantissa} !!!', flush=True)

    sign = '-' if raw & 0x80 else ''
    return f'{sign}{exponent} {mantissa}'

class Block(nn.Module):
    """
    One transformer layer: pre-norm attention followed by a pre-norm feed-forward
    network, each added back onto the residual stream.

    Attributes:
        layer_id (int): Index of this layer in the transformer.
        ckpt_path: Checkpoint location, stored and forwarded to the MoE layer.
        attn (nn.Module): Attention layer (MLA).
        ffn (nn.Module): Feed-forward network (MLP_int for dense layers, MoE otherwise).
        attn_norm (nn.Module): RMSNorm_int applied before attention.
        ffn_norm (nn.Module): RMSNorm_int applied before the feed-forward network.
    """
    def __init__(self, layer_id: int, args: ModelArgs, ckpt_path):
        """
        Builds the attention, feed-forward and normalization sub-modules.

        Args:
            layer_id (int): Layer index in the transformer.
            args (ModelArgs): Model arguments containing block parameters.
            ckpt_path: Checkpoint location handed to the MoE layer.
        """
        super().__init__()
        self.layer_id = layer_id
        self.ckpt_path = ckpt_path
        self.attn = MLA(layer_id, args)
        # The first `n_dense_layers` layers use a dense MLP; the rest use MoE.
        if layer_id < args.n_dense_layers:
            self.ffn = MLP_int(layer_id, args.dim, args.inter_dim)
        else:
            self.ffn = MoE(layer_id, args, ckpt_path)
        self.attn_norm = RMSNorm_int(args.dim, torch.int32)
        self.ffn_norm = RMSNorm_int(args.dim, torch.int32)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]) -> torch.Tensor:
        """
        Runs the block on the residual stream `x`.

        Scale notes below are carried over from the original author's comments:
        `x` is at scale 2^31, the attention output at 2^19.

        Args:
            x (torch.Tensor): Input tensor (residual stream).
            start_pos (int): Starting position in the sequence.
            freqs_cis (torch.Tensor): Precomputed complex exponential values for rotary embeddings.
            mask (Optional[torch.Tensor]): Mask tensor to exclude certain positions from attention.

        Returns:
            torch.Tensor: Residual stream after attention and feed-forward updates.
        """
        # Debug output: range of input magnitudes.
        magnitude = x.abs()
        x_abs_min = magnitude.min().item()
        x_abs_max = magnitude.max().item()
        print(f'x abs min: {x_abs_min}, max: {x_abs_max}', flush=True)

        # Normalize the embedding before attention; the norm also returns its rms.
        attn_in, attn_rms = self.attn_norm(x)

        if snark:
            # Dump the norm's input, weight, output and rms for this position/layer.
            dump_dir = f'{zkDataDir}/pos_{start_pos}/layer_{self.layer_id}'
            os.makedirs(dump_dir, exist_ok=True)
            saveTensor(f'{dump_dir}/attn_norm_x.bin', x.cpu())
            saveTensor(f'{dump_dir}/attn_norm_weight.bin', self.attn_norm.weight.view(torch.uint32).cpu())
            saveTensor(f'{dump_dir}/attn_norm_y.bin', attn_in.cpu())
            saveTensor(f'{dump_dir}/attn_norm_rms.bin', attn_rms.cpu())

        attn_out = self.attn(attn_in, start_pos, freqs_cis, mask)

        # Lift the attention output from scale 2^19 to the residual's 2^31 (x 2^12).
        x = x + attn_out * (1 << 12)

        # The rms returned by the second norm is not used.
        ffn_in, _ = self.ffn_norm(x)

        ffn_out = self.ffn(start_pos, ffn_in)
        # Rescale by 2^8 before the residual add (presumably 2^23 -> 2^31 — TODO confirm).
        x = x + ffn_out * (1 << 8)

        return x

# The Transformer class fixes its own process (rank) during initialization and is built from
# fairly classic transformer components: an embedding layer (self.embed), a stack of decoding
# blocks (self.layers), a standard RMSNorm layer (self.norm), and a final output layer
# (self.head) that projects the hidden state onto the vocabulary distribution.
# With the initialization parameters mentioned earlier, the vocabulary size is 129280, the
# hidden dim is 7168, and 61 decode blocks are stacked; each Block contains an attention (attn)
# part and a feed-forward (ffn) part. The dimension changes are illustrated by example below.
class Transformer(nn.Module):
    """
    Transformer model exposed as a staged inference pipeline: `prep_inference`
    (embedding), `layer_inference` (one layer at a time) and `finish_inference`
    (final norm and output projection).

    Attributes:
        args (ModelArgs): Model arguments the instance was built from.
        max_seq_len (int): Maximum sequence length for the transformer.
        embed (nn.Module): Embedding layer for input tokens.
        layers (torch.nn.ModuleList): One entry per layer; created as empty
            `nn.Module()` placeholders here (presumably replaced with real
            blocks by the caller — TODO confirm).
        norm (nn.Module): Normalization applied after all blocks.
        head (nn.Module): Output projection layer mapping to vocabulary size.
        freqs_cis (torch.Tensor): Precomputed complex exponential values for rotary embeddings.
    """
    def __init__(self, args: ModelArgs):
        """
        Initializes the Transformer model.

        Also updates the module-level `world_size` / `rank` globals from
        torch.distributed and sets `Linear.dtype` from `args.dtype`.

        Args:
            args (ModelArgs): Model arguments containing transformer parameters.
        """
        global world_size, rank
        distributed = dist.is_initialized()
        world_size = dist.get_world_size() if distributed else 1
        rank = dist.get_rank() if distributed else 0
        if args.dtype == "fp8":
            Linear.dtype = torch.float8_e4m3fn
        else:
            Linear.dtype = torch.bfloat16
        super().__init__()
        self.args = args
        self.max_seq_len = args.max_seq_len
        self.embed = ParallelEmbedding(args.vocab_size, args.dim)
        # One empty placeholder module per layer.
        self.layers = torch.nn.ModuleList([nn.Module() for _ in range(args.n_layers)])

        self.norm = RMSNorm_int(args.dim, torch.int64)
        # Original author's note: the stored head weights are at scale 2^43 and used at 2^35;
        # the head input is at scale 2^15 and its output at 2^21.
        self.head = ColumnParallelLinear_int(-1, args.dim, args.vocab_size, 1, (1 << 8), 29, torch.int64)
        # Rotary position table. Registered as a non-persistent buffer: it moves with the
        # module across devices but is left out of state_dict(), since it can be recomputed.
        self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)

    @torch.inference_mode()
    def prep_inference(self, tokens: torch.Tensor, start_pos: int = 0):
        """
        Runs the softmax/SiLU kernel init routines and embeds the input tokens.

        Args:
            tokens (torch.Tensor): Token IDs; dimension 1 is taken as the sequence length.
            start_pos (int, optional): Starting position in the sequence. Defaults to 0.

        Returns:
            tuple: (embedded tokens, start_pos, sequence length). Per the original
            comment the embedding has shape (batch_size, seq_len, 7168).
        """
        for init_kernel in (softmax_init_q19, softmax_init_q21, silu_init_q23):
            init_kernel()

        n_tokens = tokens.size(1)
        embedded = self.embed(tokens)

        return (embedded, start_pos, n_tokens)

    @torch.inference_mode()
    def layer_inference(self, layer_id, h, start_pos, seqlen):
        """
        Runs a single layer on `h` and prints the magnitude range of its output.

        Args:
            layer_id: Index into `self.layers` of the layer to run.
            h (torch.Tensor): Hidden state entering the layer.
            start_pos (int): Starting position in the sequence.
            seqlen (int): Number of positions being processed.

        Returns:
            torch.Tensor: Hidden state after the layer (scale 2^31 per the original comment).
        """
        rotary = self.freqs_cis[start_pos:start_pos + seqlen]

        # For more than one position, build an int64 mask: strictly-upper-triangular
        # entries hold a large negative constant, the rest (triu_ with k=1) are zero.
        if seqlen > 1:
            causal_mask = torch.full((seqlen, seqlen), -(64 << 36), dtype=torch.int64, device="cuda").triu_(1)
        else:
            causal_mask = None

        h = self.layers[layer_id](h, start_pos, rotary, causal_mask)

        # Diagnostics: rescale by 2^-31, take magnitudes in bfloat16.
        magnitude = (h.to(torch.float32) * (2 ** -31)).to(torch.bfloat16).abs()
        largest = magnitude.max()
        # Overwrite near-zero entries with the max so they do not win the min below.
        magnitude[magnitude < (2 ** -125)] = largest
        smallest = magnitude.min()
        print(f'h_abs min: {getBF16PrintStr(smallest)}, max: {getBF16PrintStr(largest)}')

        return h

    @torch.inference_mode()
    def finish_inference(self, h):
        """
        Applies the final norm, keeps the last position, and projects to logits.

        Args:
            h (torch.Tensor): Hidden state after the last layer.

        Returns:
            torch.Tensor: Logits (scale 2^21 per the original comment); when
            `world_size > 1` the per-rank logits are gathered and concatenated
            along the last dimension.
        """
        # The norm returns a tuple; element 0 is the normalized tensor. Keep only the last token.
        last_token = self.norm(h)[0][:, -1]

        logits = self.head(last_token[None, :])
        if world_size <= 1:
            return logits

        gathered = [torch.empty_like(logits) for _ in range(world_size)]
        dist.all_gather(gathered, logits)
        return torch.cat(gathered, dim=-1)

    # # 这里开始推理了，torch.inference_mode 这句话 关闭梯度计算 并 禁止 autograd 构建计算图，同时比 torch.no_grad() 还高效，专门为推理场景优化
    # @torch.inference_mode()
    # def forward(self, tokens: torch.Tensor, start_pos: int = 0):
    #     """
    #     Forward pass for the Transformer model.

    #     Args:
    #         tokens (torch.Tensor): Input tensor of token IDs with shape (batch_size, seq_len).
    #         start_pos (int, optional): Starting position in the sequence for rotary(旋转的) embeddings. Defaults to 0.

    #     Returns:
    #         torch.Tensor: Logits tensor of shape (batch_size, vocab_size).
    #     """
    #     seqlen = tokens.size(1)
    #     # h 是经过embed之后的结果，embed将文本表达转化为词嵌入，h的形状为 (batch_size, seq_len, 7168)
    #     h = self.embed(tokens)
    #     freqs_cis = self.freqs_cis[start_pos:start_pos+seqlen]
    #     print('freqs_cis: ' + str(freqs_cis.tolist()))

    #     mask = None

    #     # triu = triangle up
    #     # 返回上三角矩阵
    #     # 参数 k=0 代表主对角线，k 为正数则从主对角线开始向上数第 k 条，k 为负数则从主对角线开始向下数第 k 条
    #     if seqlen > 1:
    #         mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device).triu_(1)

    #     for layer in self.layers:
    #         h = layer(h, start_pos, freqs_cis, mask)

    #     # 只取最后一个 token
    #     h = self.norm(h)[:, -1]
    #     logits = self.head(h)
    #     if world_size > 1:
    #         all_logits = [torch.empty_like(logits) for _ in range(world_size)]
    #         dist.all_gather(all_logits, logits)
    #         logits = torch.cat(all_logits, dim=-1)
    #     return logits


if __name__ == "__main__":
    # Smoke test for the staged inference API on random token IDs.
    torch.set_default_dtype(torch.bfloat16)
    torch.set_default_device("cuda")
    torch.manual_seed(0)
    args = ModelArgs()
    x = torch.randint(0, args.vocab_size, (2, 128))
    # Transformer.__init__ takes only `args`; the previous `Transformer(0, args)`
    # raised TypeError.
    model = Transformer(args)
    # Transformer.forward is commented out, so `model(x)` raised NotImplementedError.
    # Drive the staged pipeline instead.
    h, start_pos, seqlen = model.prep_inference(x)
    for layer_id, layer in enumerate(model.layers):
        # Transformer.__init__ installs bare nn.Module() placeholders, which are not
        # callable; only run entries that have been replaced with real Blocks.
        if isinstance(layer, Block):
            h = model.layer_inference(layer_id, h, start_pos, seqlen)
    print(model.finish_inference(h).size())