Upload model.py
model.py CHANGED
@@ -1,112 +1,35 @@
- """
  
- Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and
- https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model.
  """
  import math, random
  import numpy as np
  from typing import Any, List, Optional, Tuple
  
  import torch
  import torch.nn as nn
- from lightning_utilities.core.imports import RequirementCache
- from typing_extensions import Self
- from flash_attn import flash_attn_func
- # from lit_gpt.config import Config
- from xformers.ops import SwiGLU
- 
  import torch.nn.functional as F
- # from .fused_rotary_embedding import apply_rotary_emb_func
- RoPECache = Tuple[torch.Tensor, torch.Tensor]
- KVCache = Tuple[torch.Tensor, torch.Tensor]
- PretokenCache = torch.Tensor
- # Tuple[torch.Tensor, torch.Tensor]
- FlashAttention2Available = RequirementCache("flash-attn>=2.0.0.post1")
- from einops import rearrange
- from transformers import PreTrainedModel, Cache, DynamicCache
- 
- from huggingface_hub import PyTorchModelHubMixin
- from .model_config import YingLongConfig
- 
- # from torch.distributions import Normal, LowRankMultivariateNormal, kl_divergence, MultivariateNormal
  
- class quantitleLoss(torch.nn.Module):
-     def __init__(self,
-                  qSize = 99,
-                  patch_size = 16,
-                  *args, **kwargs) -> None:
- 
-         super().__init__()
-         self.qSize = qSize
-         self.patch_size = patch_size
- 
-         q = np.array([i+1 for i in range(self.qSize)])
-         q = q / (self.qSize + 1)
-         q = q.reshape((1,1,-1))
- 
-         q_variance = q*(1-q)
- 
-         self.register_buffer('q', torch.tensor(q))
-         self.register_buffer('q_variance', torch.tensor(q_variance))
  
- 
-         target = target.unsqueeze(-1)
-         input = input[:,:target.shape[1],:,:]
- 
-         posPart = input - target
-         negPart = -posPart
- 
-         raw_loss = torch.maximum(self.q * negPart, (1-self.q) * posPart)
  
- 
-         return torch.mean(raw_loss)
- 
  
- def haarMatrix_unnormalized(n):
-     # Allow only size n of power 2
-     n = 2**np.ceil(np.log2(n))
-     if n > 2:
-         h = haarMatrix(n / 2)
-     else:
-         return np.array([[1, 1], [1, -1]])
- 
-     # calculate upper haar part
-     h_n = np.kron(h, [1, 1])
-     # calculate lower haar part
-     # if normalized:
-     #     h_i = np.sqrt(n/2)*np.kron(np.eye(len(h)), [1, -1])
-     # else:
-     h_i = np.kron(np.eye(len(h)), [1, -1])
-     # combine parts
-     h = np.vstack((h_n, h_i))
-     return h
  
  
- def haarMatrix(n, normalized = 'ortho'):
-     h = haarMatrix_unnormalized(n)
-     scaler = np.diag(1/np.sqrt(np.diag(h@h.transpose())))
-     if normalized == 'ortho':
-         return scaler @ h
-     elif normalized == 'forward':
-         return scaler @ h / np.sqrt(n)
-     else:
-         return scaler @ h * np.sqrt(n)
-     # else:
-     #     scaler = 1
  
  
- 
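A quick sanity check on the helpers above (an illustrative sketch, not part of the uploaded diff): haarMatrix rounds its argument up to the next power of two, so the returned matrix only matches the patch length when patch_size itself is a power of two, and with normalized='ortho' its rows come out orthonormal.

import numpy as np

H16 = haarMatrix(16)                                    # 16 is a power of two -> 16 x 16
print(H16.shape, np.allclose(H16 @ H16.T, np.eye(16)))  # (16, 16) True
H6 = haarMatrix(6)                                      # 6 is rounded up to 8 -> 8 x 8
print(H6.shape)                                         # (8, 8)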
  class Tokenizer(torch.nn.Module):
      def __init__(self, config: YingLongConfig, *args,**kwargs) -> None:
          super().__init__()
@@ -119,6 +42,7 @@ class Tokenizer(torch.nn.Module):
  
          self.register_buffer('mask_token', torch.zeros(1000))
          if self.config.haar_trans:
              self.register_buffer('haar_transform',torch.Tensor(haarMatrix(self.config.patch_size,normalized = self.config.haar_trans_norm)))
  
  
@@ -167,7 +91,6 @@ class Tokenizer(torch.nn.Module):
  
          else:
  
- 
              factor = 1
              more_rows = future_token // self.patch_size + 1
              prev_more_rows = prev_token // self.patch_size + 1
@@ -187,7 +110,6 @@ class Tokenizer(torch.nn.Module):
              masks = [jj for jj in range(x_featured.shape[1])]
              masks = masks[-more_rows:]
  
-             # if not mean_replace:
              x_featured[:,-more_rows:] = self.mask0(self.mask_token[:len(masks)].unsqueeze(-1)).repeat(x_featured.shape[0],1,1)
              x_featured[:,:prev_more_rows] = self.mask0(self.mask_token[:prev_more_rows].unsqueeze(-1)).repeat(x_featured.shape[0],1,1)
  
@@ -199,26 +121,30 @@
  class model_tmp(PreTrainedModel):
      config_class = YingLongConfig
      base_model_prefix = "model"
-     # supports_gradient_checkpointing = True
-     # _no_split_modules = ["TimeMoeDecoderLayer"]
-     # _skip_keys_device_placement = "past_key_values"
-     _supports_flash_attn_2 = True
-     _supports_sdpa = False
-     _supports_cache_class = True
  
- 
  class GPT(model_tmp):
      def __init__(self, config: YingLongConfig, *args,**kwargs) -> None:
  
-         # config_class = YingLongConfig
-         # base_model_prefix = "model"
-         # # supports_gradient_checkpointing = True
-         # # _no_split_modules = ["TimeMoeDecoderLayer"]
-         # # _skip_keys_device_placement = "past_key_values"
-         # _supports_flash_attn_2 = True
-         # _supports_sdpa = False
-         # _supports_cache_class = True
          super().__init__(config)
  
          self.config = config
@@ -227,49 +153,28 @@ class GPT(model_tmp):
  
  
          if self.config._norm_class == "RMSNorm":
- 
              self.config.norm_class = RMSNorm
          elif self.config._norm_class == "FusedRMSNorm":
-             # from .model import FusedRMSNorm
              self.config.norm_class = FusedRMSNorm
          elif self.config._norm_class == 'BatchNorm':
-             # from .model import iBatchNorm
              self.config.norm_class = iBatchNorm
  
- 
  
          if self.config._mlp_class == "GptNeoxMLP":
-             # from .model import GptNeoxMLP
              self.config.mlp_class = GptNeoxMLP
          elif self.config._mlp_class == "LLaMAMLP":
-             # from .model import LLaMAMLP
              self.config.mlp_class = LLaMAMLP
  
- 
-             self.stat_tokens = 1
-         else:
-             self.stat_tokens = 0
- 
- 
  
          self.tokenizer = Tokenizer(config)
  
- 
-             # nn.Linear(config.n_embd, config.n_embd*4),
-             # nn.ReLU(),
-             # nn.Linear(config.n_embd*4, 99*self.patch_size),
-             # )
- 
          self.lm_head = nn.Linear(config.n_embd, 99*self.patch_size)
- 
-         # self.gate = nn.Linear(config.n_embd, 1)
  
- 
          self.quantitleLoss = quantitleLoss(99, patch_size = self.patch_size)
  
  
@@ -296,48 +201,16 @@ class GPT(model_tmp):
  
  
  
-         self.rope_cache
-         self.mask_cache: Optional[torch.Tensor] = None
-         self.kv_caches: List[KVCache] = []
  
  
-     def _init_weights(self, module: nn.Module) -> None:
-         """Meant to be used with `gpt.apply(gpt._init_weights)`."""
-         # GPT-NeoX https://arxiv.org/pdf/2204.06745.pdf
-         if isinstance(module, nn.Embedding):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
-             # RWKV: set it to 1e-4
-             # torch.nn.init.uniform_(module.weight, -1e-4, 1e-4)
-         elif isinstance(module, nn.Linear):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
-             if module.bias is not None:
-                 torch.nn.init.zeros_(module.bias)
-         # GPT-NeoX
-         for name, p in module.named_parameters():
-             if (name == "proj.weight" and isinstance(module, LLaMAMLP)) or (name == "w3.weight" and isinstance(module, SwiGLU) or (name == "proj.weight" and isinstance(module, BidirectedlSelfAttention))):  # if use xformer swiglu, fc2 layer will be renamed to w3
-                 nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / self.config.n_layer)
- 
- 
-     def reset_cache(self) -> None:
-         self.kv_caches.clear()
-         if self.mask_cache is not None and self.mask_cache.device.type == "xla":
-             # https://github.com/Lightning-AI/lit-gpt/pull/83#issuecomment-1558150179
-             self.rope_cache = None
-             self.mask_cache = None
  
      def forward(
          self, idx: torch.Tensor,
-         max_seq_length: Optional[int] = None,
-         input_pos: Optional[torch.Tensor] = None,
-         next_token: torch.Tensor = None,
          future_token: int = 0,
          prev_token: int = 0,
-         val: bool = False,
-         print_intermediate: bool = False,
-         cot_rounds: int = -1,
-         sequential: bool = False,
          *args,**kwargs,
- 
  
          if future_token > 0:
              more_rows = future_token // self.patch_size + 1
@@ -348,138 +221,53 @@ class GPT(model_tmp):
  
          B, T = idx.size()
  
- 
  
          block_size = self.config.block_size
- 
-         max_seq_length = block_size
- 
  
-         if use_kv_cache:  # not relevant otherwise
-             assert (
-                 max_seq_length >= T
-             ), f"Cannot forward sequence of length {T}, max seq length is only {max_seq_length}"
          assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}"
- 
-         self.mask_cache = self.build_mask_cache(idx)
          cos, sin = self.rope_cache
-         if use_kv_cache:
-             if self.stat_tokens:
-                 if len(input_pos) == 1:
-                     idx = idx[:,input_pos]
-                     input_pos = input_pos.add_(1)
-                 else:
-                     input_pos = torch.arange(0, input_pos[-1]+2, device=idx.device)
- 
-                 cos = cos.index_select(0, input_pos)
-                 sin = sin.index_select(0, input_pos)
-                 mask = self.mask_cache.index_select(2, input_pos)
-                 mask = mask[:, :, :, :max_seq_length]
- 
-             else:
-                 cos = cos.index_select(0, input_pos)
-                 sin = sin.index_select(0, input_pos)
-                 idx = idx[:,input_pos]
-         else:
-             cos = cos[:max(T,1024) + self.stat_tokens]
-             sin = sin[:max(T,1024) + self.stat_tokens]
-             mask = None
  
- 
  
-             pass
-         else:
-             x,x_raw,masks,mean,std,x_0 = self.tokenizer(idx,
-                                                         future_token = future_token,
-                                                         prev_token = prev_token,
-                                                         sequential = sequential,
-                                                         )
  
  
- 
          if self.unet:
              skips = []
  
- 
-         res_intermediate = []
-         target_intermediate = []
-         if not use_kv_cache:
- 
-             if cot_rounds < 0:
-                 cot_rounds = self.config.n_cot
- 
-             res_list = []
-             gate_list = []
-             for rep in range(cot_rounds):
-                 for block_idx in range(len( self.transformer.h)):
- 
  
-                     block = self.transformer.h[block_idx]
  
- 
-                     x = self.unet_projection[block_idx - len(self.transformer.h) //2](torch.cat((skips.pop(),x),dim = -1))
  
- 
-                     if self.unet and block_idx < len(self.transformer.h) //2:
-                         skips.append(x)
-                         x_delay = torch.cat((x[:,0,:].unsqueeze(1),x[:,:-1,:]),dim = 1)
-                         x = self.unet_merge[block_idx](torch.cat((x_delay,x),dim = -1))
-                     # if block_idx <len(self.transformer.h) //2:
-                     #     x_delay = torch.cat((x[:,0,:].unsqueeze(1),x[:,:-1,:]),dim = 1)
-                     #     x = self.unet_merge[block_idx](torch.cat((x_delay,x),dim = -1))
- 
  
- 
-                 # gate_list.append(self.gate(x).unsqueeze(-1))
-                 # gate_list.append(self.gate(x))
-                 # if print_intermediate:
-                 #     res_intermediate.append(res_list[-1])
-                 # if print_intermediate:
-                 #     res_tmp = self.lm_head(x[:,self.stat_tokens:])
-                 #     res_tmp = rearrange(res_tmp,'b c (l1 l2) -> b c l1 l2', l2 = 99)
-                 #     if self.config.haar_trans_inv:
- 
-                 #         res_tmp = torch.einsum('bcal,ad->bcdl',res_tmp,self.tokenizer.haar_transform)
-                 #         if self.config.haar_trans_norm == "backward":
-                 #             res_tmp = res_tmp / np.sqrt(res_tmp.shape[-2])
-                 #         elif self.config.haar_trans_norm == "forward":
-                 #             res_tmp = res_tmp * np.sqrt(res_tmp.shape[-2])
-                 #     res_tmp = res_tmp * (std.unsqueeze(-1) + 1e-4) + mean.unsqueeze(-1)
-                 #     res_intermediate.append(res_tmp[:,masks,:,:])
  
- 
  
-         else:
-             self.kv_caches = self.kv_caches or self.build_kv_caches(x, max_seq_length, cos.size(-1) * 2)
-             for block_idx in range(len( self.transformer.h)):
-                 block = self.transformer.h[block_idx]
-                 if self.unet and block_idx >= len(self.transformer.h) //2:
-                     x = F.silu(skips.pop()) * x
-                 x, self.kv_caches[block_idx] = block(x, (cos, sin), max_seq_length, mask, input_pos, self.kv_caches[block_idx])
-                 if self.unet and block_idx < len(self.transformer.h) //2:
-                     skips.append(x)
- 
  
  
          res = self.lm_head(x)
- 
-         # gate = F.softmax(gate,dim = -1)
-         # res = torch.cat(res_list,dim = -1) * gate
-         # res = res.sum(dim = -1)
  
  
          res = rearrange(res,'b c (l1 l2) -> b c l1 l2', l2 = 99)
@@ -487,7 +275,6 @@ class GPT(model_tmp):
  
  
          if self.config.haar_trans_inv:
-             # print('res',res.shape,self.tokenizer.haar_transform.shape)
              res = torch.einsum('bcal,ad->bcdl',res,self.tokenizer.haar_transform)
              if self.config.haar_trans_norm == "backward":
                  res = res / np.sqrt(res.shape[-2])
@@ -495,32 +282,33 @@ class GPT(model_tmp):
              res = res * np.sqrt(res.shape[-2])
  
  
          res = res * (std.unsqueeze(-1) + 1e-4) + mean.unsqueeze(-1)
  
  
- 
          if future_token == 0:
-             return res[:,masks,:,:], x_raw[:,masks,:]
          else:
-             return res[:,masks,:,:]
  
      def generate(self,*args,**kwargs):
- 
-         res, _ = self.forward(*args,**kwargs)
-         # logits_all,res_intermediate = model(idx = x_train, future_token = (pred_len//32 + 1)* 32, prev_token = 0,print_intermediate = False,cot_rounds = 1)
- 
          res = rearrange(res, 'b l c d -> b (l c) d')
          return res[:,:kwargs['future_token'],:]
  
  
      @classmethod
      def from_name(cls, name: str, **kwargs: Any) -> Self:
          return cls(Config.from_name(name, **kwargs))
  
-     def build_rope_cache(self, idx: torch.Tensor)
          return build_rope_cache(
-             seq_len=self.config.block_size
              n_elem=int(self.config.rotary_percentage * self.config.head_size),
              dtype=torch.bfloat16,
              device=idx.device,
@@ -528,27 +316,6 @@ class GPT(model_tmp):
              condense_ratio=self.config.condense_ratio,
          )
  
-     def build_mask_cache(self, idx: torch.Tensor) -> torch.Tensor:
-         ones = torch.ones((self.config.block_size+self.stat_tokens, self.config.block_size+self.stat_tokens), device=idx.device, dtype=torch.bool)
-         return torch.tril(ones).unsqueeze(0).unsqueeze(0)
- 
-     def build_kv_caches(self, idx: torch.Tensor, max_seq_length: int, rope_cache_length: int) -> List[KVCache]:
-         B = idx.size(0)
-         heads = 1 if self.config.n_query_groups == 1 else self.config.n_query_groups
- 
-         k_cache_shape = (
-             B,
-             max_seq_length,
-             heads,
-             rope_cache_length + self.config.head_size - int(self.config.rotary_percentage * self.config.head_size),
-         )
-         v_cache_shape = (B, max_seq_length, heads, self.config.head_size)
-         device = idx.device
-         return [
-             (torch.zeros(k_cache_shape, device=device), torch.zeros(v_cache_shape, device=device))
-             for _ in range(self.config.n_layer)
-         ]
- 
  
  class Block(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
@@ -562,15 +329,14 @@ class Block(nn.Module):
      def forward(
          self,
          x: torch.Tensor,
-         rope:
          max_seq_length: int,
          mask: Optional[torch.Tensor] = None,
          input_pos: Optional[torch.Tensor] = None,
- 
-     ) -> Tuple[torch.Tensor, Optional[KVCache]]:
  
          n_1 = self.norm_1(x)
-         h
          if self.config.parallel_residual:
              n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x)
              x = x + h + self.mlp(n_2)
@@ -583,29 +349,25 @@ class Block(nn.Module):
  
          x = x + h
          x = x + self.mlp(self.norm_2(x))
-         return x
  
  
  class BidirectedlSelfAttention(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
          super().__init__()
          shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
-         # key, query, value projections for all heads, but in a batch
          self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
-         # output projection
          self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
- 
          self.config = config
  
      def forward(
          self,
          x: torch.Tensor,
-         rope:
          max_seq_length: int,
          mask: Optional[torch.Tensor] = None,
          input_pos: Optional[torch.Tensor] = None,
- 
-     ) -> Tuple[torch.Tensor, Optional[KVCache]]:
  
  
          B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
@@ -616,52 +378,20 @@ class BidirectedlSelfAttention(nn.Module):
          q_per_kv = self.config.n_head // self.config.n_query_groups
          total_qkv = q_per_kv + 2  # each group has 1+ queries, 1 key, and 1 value
          qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size)  # (B, T, n_query_groups, total_qkv, hs)
- 
  
          # split batched computation into three
          q, k, v = qkv.split((q_per_kv, 1, 1), dim=-2)
  
-         # repeat k and v if necessary
-         # Peiyuan: we do not need to do this as flash attention 2 already support GQA
-         # if self.config.n_query_groups != 1:  # doing this would require a full kv cache with MQA (inefficient!)
-         #     # for MHA this is a no-op
-         #     k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
-         #     v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size)
- 
          q = q.reshape(B, T, -1, self.config.head_size)  # (B, T, nh_q, hs)
          k = k.reshape(B, T, -1, self.config.head_size)
          v = v.reshape(B, T, -1, self.config.head_size)
  
          cos, sin = rope
  
-         # apply rope in fp32 significanly stabalize training
-         # fused rope expect (batch_size, seqlen, nheads, headdim)
          q = apply_rotary_emb_func(q, cos, sin, False, True)
          k = apply_rotary_emb_func(k, cos, sin, False, True)
- 
-         # n_elem = int(self.config.rotary_percentage * self.config.head_size)
- 
-         # q_roped = apply_rope(q[..., :n_elem], cos.repeat(1,2), sin.repeat(1,2))
-         # k_roped = apply_rope(k[..., :n_elem], cos.repeat(1,2), sin.repeat(1,2))
-         # print( (q_roped - q).sum())
-         # q = torch.cat((q_roped, q[..., n_elem:]), dim=-1)
-         # k = torch.cat((k_roped, k[..., n_elem:]), dim=-1)
- 
-         if kv_cache is not None:
-             cache_k, cache_v = kv_cache
-             cache_k, cache_v = cache_k.to(dtype=k.dtype), cache_v.to(dtype=v.dtype)
-             # check if reached token limit
-             if input_pos[-1] >= max_seq_length:
-                 input_pos = torch.tensor(max_seq_length - 1, device=input_pos.device)
-                 # shift 1 position to the left
-                 cache_k = torch.roll(cache_k, -1, dims=1)
-                 cache_v = torch.roll(cache_v, -1, dims=1)
- 
-             k = cache_k.index_copy_(1, input_pos, k)
-             v = cache_v.index_copy_(1, input_pos, v)
-             kv_cache = k, v
  
- 
          y = self.scaled_dot_product_attention(q, k, v, mask=mask)
@@ -670,7 +400,9 @@ class BidirectedlSelfAttention(nn.Module):
          # output projection
          y = self.proj(y)
  
-         return y
  
      def scaled_dot_product_attention(
          self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
@@ -690,14 +422,83 @@ class BidirectedlSelfAttention(nn.Module):
          k = k.transpose(1, 2)
          v = v.transpose(1, 2)
          if q.size() != k.size():
- 
- 
          y = torch.nn.functional.scaled_dot_product_attention(
              q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=False
          )
          return y.transpose(1, 2)
  
  
  class GptNeoxMLP(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
          super().__init__()
@@ -713,21 +514,15 @@ class GptNeoxMLP(nn.Module):
  class LLaMAMLP(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
          super().__init__()
- 
-         # self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
-         # self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
          self.swiglu = SwiGLU(config.n_embd, config.intermediate_size, bias=False, _pack_weights=False)
      def forward(self, x: torch.Tensor) -> torch.Tensor:
-         # x_fc_1 = self.fc_1(x)
-         # x_fc_2 = self.fc_2(x)
-         # x = torch.nn.functional.silu(x_fc_1) * x_fc_2
-         # return self.proj(x)
          return self.swiglu(x)
  
  
  def build_rope_cache(
      seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000, condense_ratio: int = 1
- ) ->
  """Enhanced Transformer with Rotary Position Embedding.
  
  Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
@@ -745,7 +540,6 @@ def build_rope_cache(
  
      cos, sin = torch.cos(idx_theta), torch.sin(idx_theta)
  
-     # print(' print(seq_idx.shape,theta.shape,sin.shape,cos.shape,idx_theta.shape)',seq_idx.shape,theta.shape,sin.shape,cos.shape,idx_theta.shape)
      # added by peiyuan to ensure same data type with q, k, to use fused rotary embedding
      if dtype == torch.bfloat16:
          return cos.bfloat16(), sin.bfloat16()
@@ -766,6 +560,14 @@ def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor
  
  
  
  import torch
  # Copyright (c) 2022, Tri Dao.
  # Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py AND https://github.com/Dao-AILab/flash-attention/blob/7a983df74215e035e566e37125b0a71e3618f39d/flash_attn/ops/layer_norm.py#L16
@@ -1611,7 +1413,18 @@ class RMSNorm(torch.nn.Module):
  
  
  
- 
  
  
  # Copyright (c) 2023, Tri Dao.
+ """
  
+ Based on the tinyllama implementation: https://github.com/jzhang38/TinyLlama
  
  """
+ 
+ 
  import math, random
  import numpy as np
  from typing import Any, List, Optional, Tuple
+ from typing_extensions import Self
+ 
  
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  
  
+ from lightning_utilities.core.imports import RequirementCache
+ FlashAttention2Available = RequirementCache("flash-attn>=2.0.0.post1")
  
+ from flash_attn import flash_attn_func
+ from xformers.ops import SwiGLU
+ from einops import rearrange
  
  
+ from transformers import PreTrainedModel
+ from .model_config import YingLongConfig
  
  
  
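The new import block defines FlashAttention2Available but still imports flash_attn unconditionally. For reference, RequirementCache is normally used as a boolean gate around the optional dependency, roughly like this (illustrative sketch only, not what the uploaded file does):

from lightning_utilities.core.imports import RequirementCache

FlashAttention2Available = RequirementCache("flash-attn>=2.0.0.post1")
if bool(FlashAttention2Available):
    from flash_attn import flash_attn_func   # fused attention kernel is available
else:
    flash_attn_func = None                   # caller falls back to torch SDPA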
  class Tokenizer(torch.nn.Module):
      def __init__(self, config: YingLongConfig, *args,**kwargs) -> None:
          super().__init__()
  
  ... (lines 36-41 unchanged)
  
          self.register_buffer('mask_token', torch.zeros(1000))
          if self.config.haar_trans:
+ 
              self.register_buffer('haar_transform',torch.Tensor(haarMatrix(self.config.patch_size,normalized = self.config.haar_trans_norm)))
  
  
  ... (lines 49-90 unchanged)
  
          else:
  
              factor = 1
              more_rows = future_token // self.patch_size + 1
              prev_more_rows = prev_token // self.patch_size + 1
  
  ... (lines 97-109 unchanged)
  
              masks = [jj for jj in range(x_featured.shape[1])]
              masks = masks[-more_rows:]
  
              x_featured[:,-more_rows:] = self.mask0(self.mask_token[:len(masks)].unsqueeze(-1)).repeat(x_featured.shape[0],1,1)
              x_featured[:,:prev_more_rows] = self.mask0(self.mask_token[:prev_more_rows].unsqueeze(-1)).repeat(x_featured.shape[0],1,1)
  
  ... (lines 116-120 unchanged)
  class model_tmp(PreTrainedModel):
      config_class = YingLongConfig
      base_model_prefix = "model"
  
+ 
+     def _init_weights(self, module: nn.Module) -> None:
+         if isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
+         elif isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         for name, p in module.named_parameters():
+             if (name == "proj.weight" and isinstance(module, LLaMAMLP)) or (name == "w3.weight" and isinstance(module, SwiGLU) or (name == "proj.weight" and isinstance(module, BidirectedlSelfAttention))):
+                 nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / self.config.n_layer)
+ 
+ 
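The removed version of this initializer carried the docstring "Meant to be used with `gpt.apply(gpt._init_weights)`", so a hypothetical call site (not shown in this diff) would look like:

# inside GPT.__init__, after the submodules are built (hypothetical)
self.apply(self._init_weights)   # nn.Module.apply visits every child module once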
  class GPT(model_tmp):
      def __init__(self, config: YingLongConfig, *args,**kwargs) -> None:
  
  
          super().__init__(config)
  
          self.config = config
  
  ... (lines 151-152 unchanged)
  
          if self.config._norm_class == "RMSNorm":
+ 
              self.config.norm_class = RMSNorm
          elif self.config._norm_class == "FusedRMSNorm":
              self.config.norm_class = FusedRMSNorm
          elif self.config._norm_class == 'BatchNorm':
              self.config.norm_class = iBatchNorm
  
  
          if self.config._mlp_class == "GptNeoxMLP":
              self.config.mlp_class = GptNeoxMLP
          elif self.config._mlp_class == "LLaMAMLP":
              self.config.mlp_class = LLaMAMLP
  
+ 
+ 
  
          self.tokenizer = Tokenizer(config)
  
+ 
          self.lm_head = nn.Linear(config.n_embd, 99*self.patch_size)
+ 
  
          self.quantitleLoss = quantitleLoss(99,patch_size = self.patch_size)
  
  
  ... (lines 181-200 unchanged)
  
  
  
+         self.rope_cache = None
  
  
  
      def forward(
          self, idx: torch.Tensor,
          future_token: int = 0,
          prev_token: int = 0,
          *args,**kwargs,
+     ) -> torch.Tensor:
  
          if future_token > 0:
              more_rows = future_token // self.patch_size + 1
  
  ... (lines 217-220 unchanged)
  
          B, T = idx.size()
  
+ 
  
          block_size = self.config.block_size
+         max_seq_length = T
  
          assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}"
+ 
+ 
+         self.rope_cache = self.build_rope_cache(idx)
          cos, sin = self.rope_cache
  
+         cos = cos[:max(T,1024)]
+         sin = sin[:max(T,1024)]
  
+ 
  
+         x,x_raw,masks,mean,std,_ = self.tokenizer(idx, future_token =future_token,prev_token = prev_token)
  
  
          if self.unet:
              skips = []
  
  
+         for block_idx in range(len( self.transformer.h)):
  
+             block = self.transformer.h[block_idx]
  
+             if self.unet and block_idx >= len(self.transformer.h) //2:
+                 x = self.unet_projection[block_idx - len(self.transformer.h) //2](torch.cat((skips.pop(),x),dim = -1))
  
+             x = block(x, (cos, sin), max_seq_length)
  
+             if self.unet and block_idx < len(self.transformer.h) //2:
+                 skips.append(x)
+                 x_delay = torch.cat((x[:,0,:].unsqueeze(1),x[:,:-1,:]),dim = 1)
+                 x = self.unet_merge[block_idx](torch.cat((x_delay,x),dim = -1))
  
+ 
  
          res = self.lm_head(x)
+ 
  
          res = rearrange(res,'b c (l1 l2) -> b c l1 l2', l2 = 99)
  
  ... (line 274 unchanged)
  
          if self.config.haar_trans_inv:
              res = torch.einsum('bcal,ad->bcdl',res,self.tokenizer.haar_transform)
              if self.config.haar_trans_norm == "backward":
                  res = res / np.sqrt(res.shape[-2])
  ... (line 281 unchanged)
                  res = res * np.sqrt(res.shape[-2])
  
  
+ 
+ 
+ 
          res = res * (std.unsqueeze(-1) + 1e-4) + mean.unsqueeze(-1)
  
  
  
+ 
          if future_token == 0:
+             return res[:,masks,:,:], x_raw[:,masks,:]
          else:
+             return res[:,masks,:,:]
  
      def generate(self,*args,**kwargs):
+         res = self.forward(*args,**kwargs)
          res = rearrange(res, 'b l c d -> b (l c) d')
          return res[:,:kwargs['future_token'],:]
  
  
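A minimal usage sketch for the forecasting entry point above (hypothetical variable names; model is a loaded YingLong checkpoint and x_train a (batch, context) history tensor). The horizon is rounded up to a multiple of 32, following the usage comment that was removed from the old generate:

pred_len = 96
future_token = (pred_len // 32 + 1) * 32
forecast = model.generate(idx=x_train, future_token=future_token)
# forecast has shape (batch, future_token, 99): one value per step and quantile level
median = forecast[..., 49]   # middle of the 99 quantile levels (q = 0.50)
p90 = forecast[..., 89]      # q = 0.90 upper band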
+ 
      @classmethod
      def from_name(cls, name: str, **kwargs: Any) -> Self:
          return cls(Config.from_name(name, **kwargs))
  
+     def build_rope_cache(self, idx: torch.Tensor):
          return build_rope_cache(
+             seq_len=self.config.block_size,
              n_elem=int(self.config.rotary_percentage * self.config.head_size),
              dtype=torch.bfloat16,
              device=idx.device,
  ... (line 315 unchanged)
              condense_ratio=self.config.condense_ratio,
          )
  
  
  class Block(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
  
  ... (lines 322-328 unchanged)
  
      def forward(
          self,
          x: torch.Tensor,
+         rope: Optional[Tuple[torch.Tensor, torch.Tensor]],
          max_seq_length: int,
          mask: Optional[torch.Tensor] = None,
          input_pos: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
  
          n_1 = self.norm_1(x)
+         h = self.attn(n_1, rope, max_seq_length, mask, input_pos)
          if self.config.parallel_residual:
              n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x)
              x = x + h + self.mlp(n_2)
  
  ... (lines 343-348 unchanged)
  
          x = x + h
          x = x + self.mlp(self.norm_2(x))
+         return x
  
  
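For reference, the two residual layouts selected by config.parallel_residual in the forward above, written out explicitly (a paraphrase of the code, not new behaviour):

# parallel residual (GPT-NeoX style): attention and MLP branch off the same input
#     x = x + attn(norm_1(x)) + mlp(norm_2(x))
# sequential residual: the MLP sees the attention output
#     x = x + attn(norm_1(x))
#     x = x + mlp(norm_2(x))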
  class BidirectedlSelfAttention(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
          super().__init__()
          shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
          self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
          self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
          self.config = config
  
      def forward(
          self,
          x: torch.Tensor,
+         rope: Tuple[torch.Tensor, torch.Tensor],
          max_seq_length: int,
          mask: Optional[torch.Tensor] = None,
          input_pos: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
  
  
          B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
  
  ... (lines 374-377 unchanged)
  
          q_per_kv = self.config.n_head // self.config.n_query_groups
          total_qkv = q_per_kv + 2  # each group has 1+ queries, 1 key, and 1 value
          qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size)  # (B, T, n_query_groups, total_qkv, hs)
+ 
  
          # split batched computation into three
          q, k, v = qkv.split((q_per_kv, 1, 1), dim=-2)
  
          q = q.reshape(B, T, -1, self.config.head_size)  # (B, T, nh_q, hs)
          k = k.reshape(B, T, -1, self.config.head_size)
          v = v.reshape(B, T, -1, self.config.head_size)
  
          cos, sin = rope
  
          q = apply_rotary_emb_func(q, cos, sin, False, True)
          k = apply_rotary_emb_func(k, cos, sin, False, True)
  
  
          y = self.scaled_dot_product_attention(q, k, v, mask=mask)
  
  ... (lines 398-399 unchanged)
  
          # output projection
          y = self.proj(y)
  
+         return y
+ 
+ 
  
      def scaled_dot_product_attention(
          self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
  
  ... (lines 409-421 unchanged)
  
          k = k.transpose(1, 2)
          v = v.transpose(1, 2)
          if q.size() != k.size():
+             k = k.repeat_interleave(q.shape[1]//k.shape[1], dim=1)
+             v = v.repeat_interleave(q.shape[1]//v.shape[1], dim=1)
          y = torch.nn.functional.scaled_dot_product_attention(
              q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=False
          )
          return y.transpose(1, 2)
  
  
+ 
+ 
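The repeat_interleave branch above expands grouped key/value heads so that torch's scaled_dot_product_attention sees matching head counts. A shape-level sketch with hypothetical sizes:

import torch

B, T, hs = 2, 16, 64
n_head, n_query_groups = 8, 2                              # 4 query heads share each kv head
q = torch.randn(B, n_head, T, hs)                          # layout after transpose(1, 2)
k = torch.randn(B, n_query_groups, T, hs)
v = torch.randn(B, n_query_groups, T, hs)
k = k.repeat_interleave(q.shape[1] // k.shape[1], dim=1)   # -> (B, 8, T, hs)
v = v.repeat_interleave(q.shape[1] // v.shape[1], dim=1)   # -> (B, 8, T, hs)
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=False)
print(y.shape)                                             # torch.Size([2, 8, 16, 64])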
+ class quantitleLoss(torch.nn.Module):
+     def __init__(self,
+                  qSize = 99,
+                  patch_size = 16,
+                  *args,**kwargs):
+ 
+         super().__init__()
+         self.qSize = qSize
+         self.patch_size = patch_size
+ 
+         q = np.array([i+1 for i in range(self.qSize)])
+         q = q / (self.qSize + 1)
+         q = q.reshape((1,1,-1))
+ 
+         q_variance = q*(1-q)
+ 
+         self.register_buffer('q', torch.tensor(q))
+         self.register_buffer('q_variance', torch.tensor(q_variance))
+ 
+ 
+     def forward(self, input: torch.Tensor, target: torch.Tensor, rel_loss = False):
+ 
+         target = target.unsqueeze(-1)
+         input = input[:,:target.shape[1],:,:]
+ 
+         posPart = input - target
+         negPart = -posPart
+ 
+         raw_loss = torch.maximum(self.q * negPart, (1-self.q) * posPart)
+ 
+         target_absmean = torch.mean(target.abs(),dim = (1,2),keepdims = True)
+         raw_loss = raw_loss / torch.sqrt(self.q_variance) / (target_absmean + 1e-4)
+ 
+         return torch.mean(raw_loss)
+ 
+ 
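A small numerical illustration of the pinball term computed above (sketch only; the class additionally rescales by sqrt(q*(1-q)) and by the mean absolute target). With q = 0.9, under-forecasting is penalised nine times more heavily than over-forecasting:

import torch

q, y = torch.tensor(0.9), torch.tensor(1.0)
for z in (0.5, 1.5):                                   # under- and over-prediction
    loss = torch.maximum(q * (y - z), (1 - q) * (z - y))
    print(float(loss))                                 # 0.45 for z=0.5, 0.05 for z=1.5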
+ def haarMatrix_unnormalized(n):
+ 
+     n = 2**np.ceil(np.log2(n))
+     if n > 2:
+         h = haarMatrix(n / 2)
+     else:
+         return np.array([[1, 1], [1, -1]])
+     h_n = np.kron(h, [1, 1])
+     h_i = np.kron(np.eye(len(h)), [1, -1])
+     h = np.vstack((h_n, h_i))
+     return h
+ 
+ def haarMatrix(n, normalized = 'ortho'):
+     h = haarMatrix_unnormalized(n)
+     scaler = np.diag(1/np.sqrt(np.diag(h@h.transpose())))
+     if normalized == 'ortho':
+         return scaler @ h
+     elif normalized == 'forward':
+         return scaler @ h / np.sqrt(n)
+     else:
+         return scaler @ h * np.sqrt(n)
+ 
+ 
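A round-trip sketch of how the orthonormal Haar matrix is used on a patch (illustrative only; in the model the forward transform happens in the Tokenizer and the inverse is the einsum in GPT.forward):

import numpy as np

patch_size = 16                                   # default used throughout this file
H = haarMatrix(patch_size, normalized='ortho')
patch = np.sin(np.arange(patch_size) / 3.0)       # hypothetical input patch
coeffs = H @ patch                                # Haar coefficients
recon = H.T @ coeffs                              # orthonormal rows => transpose inverts
print(np.allclose(recon, patch))                  # True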
  class GptNeoxMLP(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
          super().__init__()
  
  ... (lines 505-513 unchanged)
  
  class LLaMAMLP(nn.Module):
      def __init__(self, config:YingLongConfig) -> None:
          super().__init__()
+ 
          self.swiglu = SwiGLU(config.n_embd, config.intermediate_size, bias=False, _pack_weights=False)
      def forward(self, x: torch.Tensor) -> torch.Tensor:
          return self.swiglu(x)
  
  
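The xformers SwiGLU used above fuses what the commented-out reference in the old file spelled out explicitly. An unfused equivalent, as a sketch (module and attribute names hypothetical):

import torch
import torch.nn as nn
import torch.nn.functional as F

class UnfusedSwiGLU(nn.Module):
    def __init__(self, n_embd: int, intermediate_size: int) -> None:
        super().__init__()
        self.fc_1 = nn.Linear(n_embd, intermediate_size, bias=False)
        self.fc_2 = nn.Linear(n_embd, intermediate_size, bias=False)
        self.proj = nn.Linear(intermediate_size, n_embd, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SwiGLU: silu-gated branch times linear branch, then projection back to n_embd
        return self.proj(F.silu(self.fc_1(x)) * self.fc_2(x))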
  def build_rope_cache(
      seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000, condense_ratio: int = 1
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
      """Enhanced Transformer with Rotary Position Embedding.
  
      Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
  
  ... (lines 529-539 unchanged)
  
      cos, sin = torch.cos(idx_theta), torch.sin(idx_theta)
  
      # added by peiyuan to ensure same data type with q, k, to use fused rotary embedding
      if dtype == torch.bfloat16:
          return cos.bfloat16(), sin.bfloat16()
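Only the tail of build_rope_cache is visible in this hunk; per the labml reference in its docstring it follows the standard rotary-embedding cache construction. A self-contained sketch, with the bfloat16 cast and fused-kernel layout details omitted:

import torch

def rope_cache_sketch(seq_len: int, n_elem: int, base: int = 10000, condense_ratio: int = 1):
    # inverse frequencies: theta_i = 1 / base**(2i / n_elem)
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2).float() / n_elem))
    seq_idx = torch.arange(seq_len).float() / condense_ratio
    idx_theta = torch.outer(seq_idx, theta)        # (seq_len, n_elem // 2)
    return torch.cos(idx_theta), torch.sin(idx_theta)

cos, sin = rope_cache_sketch(seq_len=32, n_elem=64)
print(cos.shape, sin.shape)                        # torch.Size([32, 32]) each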
  
  
  
+ 
+ 
+ ######################################
+ #layernorm
+ ######################################
+ 
+ 
  import torch
  # Copyright (c) 2022, Tri Dao.
  # Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py AND https://github.com/Dao-AILab/flash-attention/blob/7a983df74215e035e566e37125b0a71e3618f39d/flash_attn/ops/layer_norm.py#L16
  ... (lines 574-1412 unchanged)
  
  
  
+ 
+ 
+ ######################################
+ #rope_emb
+ ######################################
+ 
+ 
  
  
  # Copyright (c) 2023, Tri Dao.