Upload 2 files

- VLMEncoderLCT.pth +3 -0
- lctvlm.py +275 -0
VLMEncoderLCT.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a8391c37c910f14c646fa4de1c567dd1bb111d727fb9636c5ef8f59aa5afe05
size 9323183

lctvlm.py ADDED
@@ -0,0 +1,275 @@
# -*- coding: utf-8 -*-
"""LCTVLM.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ekzgx_mlQEjCXDZlg6qdKd951zUJDBcT
"""

import torch
from torch import nn
from typing import Optional


class LCMBlock(nn.Module):
    """
    LCM (Latent Connected Model) block: views attention as two perceptrons
    and increases it to N multiple magnitude values.
    """

    def __init__(self, d_model: int, drop_rate: float = 0.1):
        """
        Args:
            d_model : int
                dimension of the model

            drop_rate : float
                dropout rate
        """
        super().__init__()
        self.step1 = nn.Linear(d_model, d_model)
        self.step2 = nn.Linear(d_model, d_model)
        self.magnitude = nn.Linear(d_model, d_model)
        self.drop = nn.Dropout(drop_rate)
        self.gelu1 = nn.GELU(approximate='tanh')
        self.gelu2 = nn.GELU(approximate='tanh')
        self.tanh = nn.Tanh()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        normx = self.norm(x)
        step1 = self.gelu1(self.step1(normx))
        step2 = self.gelu2(self.step2(normx))
        laten = step1 + step2
        laten = self.magnitude(laten)
        laten = self.tanh(laten)
        laten = self.drop(laten)  # dropout on the latent update
        return x + laten
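
# A minimal shape sketch of LCMBlock (illustrative only):
#   >>> blk = LCMBlock(d_model=256)
#   >>> x = torch.randn(2, 10, 256)   # (batch, sequence, d_model)
#   >>> blk(x).shape                  # the residual update keeps the shape
#   torch.Size([2, 10, 256])
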
class PathEmbedding(nn.Module):
    """Patch embedding: a strided convolution cuts the image into
    non-overlapping patches and projects each one to embedding_dim."""

    def __init__(self, image_size, path_size, embedding_dim):
        super().__init__()
        self.projection = nn.Conv2d(in_channels=3, out_channels=embedding_dim,
                                    kernel_size=path_size, stride=path_size)
        self.n_path = (image_size // path_size) ** 2

    def forward(self, x):
        x = self.projection(x)  # (B, embedding_dim, H/p, W/p)
        x = x.flatten(2)        # (B, embedding_dim, n_path)
        x = x.transpose(1, 2)   # (B, n_path, embedding_dim)
        return x


class PositionalEncoding(nn.Module):
    """Learned positional embeddings plus a prepended [CLS] token."""

    def __init__(self, n_path, embedding_dim):
        super().__init__()
        self.position = nn.Parameter(torch.normal(mean=0.0, std=0.02, size=(1, n_path + 1, embedding_dim)))
        self.cls_token = nn.Parameter(torch.normal(mean=0.0, std=0.02, size=(1, 1, embedding_dim)))

    def forward(self, x):
        batch = x.shape[0]
        cls_token = self.cls_token.repeat(batch, 1, 1)
        x = torch.cat([cls_token, x], dim=1)
        return x + self.position
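
# Shape walkthrough (a sketch using the defaults PretrainedVIT passes below):
#   >>> pe = PathEmbedding(image_size=224, path_size=16, embedding_dim=256)
#   >>> pos = PositionalEncoding(pe.n_path, 256)    # n_path = (224 // 16)**2 = 196
#   >>> pos(pe(torch.randn(1, 3, 224, 224))).shape  # 196 patches + 1 [CLS]
#   torch.Size([1, 197, 256])
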
class VisionLCTBlock(nn.Module):
    """Vision encoder layer: pre-norm self-attention followed by an LCM block."""

    def __init__(self, d_model, drop_rate):
        super().__init__()
        self.Attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=4, dropout=drop_rate, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.lcmblock = LCMBlock(d_model, drop_rate)

    def forward(self, x):
        normx = self.norm(x)
        attention, _ = self.Attention(normx, normx, normx)
        x = x + attention
        x = self.lcmblock(x)
        return x


class LMLCTBlock(nn.Module):
    """Decoder-style variant: the same layer with a causal attention mask."""

    def __init__(self, d_model, drop_rate):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=4, dropout=drop_rate, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.lcmblock = LCMBlock(d_model, drop_rate)

    def forward(self, x):
        S = x.shape[1]
        # True above the diagonal = masked: each position attends only to itself and the past
        mask = torch.triu(torch.ones(S, S, device=x.device), diagonal=1).bool()
        normx = self.norm(x)
        attention, _ = self.attention(normx, normx, normx, attn_mask=mask)
        x = x + attention
        x = self.lcmblock(x)
        return x
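
# The causal mask LMLCTBlock builds, shown for S = 4 (True = disallowed):
#   >>> torch.triu(torch.ones(4, 4), diagonal=1).bool()
#   tensor([[False,  True,  True,  True],
#           [False, False,  True,  True],
#           [False, False, False,  True],
#           [False, False, False, False]])
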
class QFormersBlock(nn.Module):
    """One Q-Former layer: query self-attention, cross-attention into the
    vision features, then a feed-forward network, each with a pre-norm residual."""

    def __init__(self, d_model, drop_rate):
        super().__init__()
        self.FFN = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(drop_rate),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(drop_rate)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.attention = nn.MultiheadAttention(d_model, num_heads=4, dropout=drop_rate, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model, num_heads=4, dropout=drop_rate, batch_first=True)

    def forward(self, Query: torch.Tensor, vision_feats: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
        # self-attention over the learned queries
        q = Query
        qnorm = self.norm1(q)
        q2, _ = self.attention(qnorm, qnorm, qnorm, attn_mask=attn_mask)
        q = q + q2

        # cross-attention: normalize the residual stream, then attend to the vision features
        qnorm2 = self.norm2(q)
        q3, _ = self.cross_attn(qnorm2, vision_feats, vision_feats)
        q = q + q3

        # position-wise feed-forward
        qnorm3 = self.norm3(q)
        ffn = self.FFN(qnorm3)
        q = q + ffn
        return q
class QFormer(nn.Module):
    """Stack of QFormersBlock layers over a set of learned query embeddings,
    with an optional projection into the language model's embedding space."""

    def __init__(self, dim: int = 768,
                 num_query: int = 32,
                 depth: int = 6,
                 num_head: int = 4,
                 drop_rate: float = 0.1,
                 proj_to_lm_dim: Optional[int] = None):
        super().__init__()
        self.dim = dim
        self.num_query = num_query
        self.depth = depth
        self.num_head = num_head  # stored for reference; QFormersBlock uses 4 heads internally
        self.drop_rate = drop_rate
        self.proj_to_lm_dim = proj_to_lm_dim

        self.query_embed = nn.Parameter(torch.randn(1, num_query, dim))
        self.layers = nn.ModuleList([
            QFormersBlock(dim, drop_rate) for _ in range(depth)
        ])
        self.outnorm = nn.LayerNorm(dim)
        self.proj_to_lm: Optional[nn.Linear] = None
        if proj_to_lm_dim is not None:
            self.proj_to_lm = nn.Linear(dim, proj_to_lm_dim)

    def forward(self, vision_feats: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
        B = vision_feats.shape[0]
        queries = self.query_embed.expand(B, -1, -1).contiguous()

        for layer in self.layers:
            queries = layer(queries, vision_feats, attn_mask)

        queries = self.outnorm(queries)
        if self.proj_to_lm is not None:
            queries = self.proj_to_lm(queries)
        return queries
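
# Usage sketch (dimensions illustrative; proj_to_lm_dim=2048 is an assumed LM width):
#   >>> qf = QFormer(dim=768, num_query=32, depth=6, proj_to_lm_dim=2048)
#   >>> vision_feats = torch.randn(2, 197, 768)  # e.g. ViT tokens
#   >>> qf(vision_feats).shape                   # 32 query tokens per image, in LM space
#   torch.Size([2, 32, 2048])
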
def _check_tensor_ok(t: torch.Tensor, name="tensor"):
    """Fail fast with a readable message instead of propagating NaN/Inf silently."""
    if torch.isnan(t).any():
        raise RuntimeError(f"{name} contains NaN values")
    if torch.isinf(t).any():
        raise RuntimeError(f"{name} contains Inf values")


# ---------- PretrainedVIT forward (fixed) ----------
class PretrainedVIT(nn.Module):
    def __init__(self, image_size: int = 224, patch_size: int = 16,
                 embdding_dim: int = 256, n_block: int = 4):
        super().__init__()
        self.Pathembedding = PathEmbedding(image_size, patch_size, embdding_dim)
        # patch-grid dimensions, stored so forward() can reshape without sqrt(N) guesswork
        self.patch_H = image_size // patch_size
        self.patch_W = image_size // patch_size

        self.PositionalEncoding = PositionalEncoding(self.Pathembedding.n_path, embdding_dim)
        self.VisionACT = nn.ModuleList([VisionLCTBlock(embdding_dim, 0.15) for _ in range(n_block)])
        self.ffn = nn.Sequential(
            nn.Linear(embdding_dim, embdding_dim * 4),
            nn.GELU(approximate='tanh'),
            nn.Linear(embdding_dim * 4, embdding_dim)
        )

        # three 2x transposed convolutions: the patch grid is upsampled 8x overall
        self.upsampling = nn.Sequential(
            nn.ConvTranspose2d(embdding_dim, embdding_dim // 2, kernel_size=2, stride=2),
            nn.GELU(approximate='tanh'),
            nn.ConvTranspose2d(embdding_dim // 2, embdding_dim // 4, kernel_size=2, stride=2),
            nn.GELU(approximate='tanh'),
            nn.ConvTranspose2d(embdding_dim // 4, 3, kernel_size=2, stride=2)
        )

    def forward(self, x):
        """
        x: [B, 3, H_image, W_image]
        Safe steps:
          - call PathEmbedding -> tokens (B, N_tokens, C)
          - positional encoding may add a cls token (so token length = n_path or n_path + 1)
          - handle both cases robustly
          - reshape using the stored patch_H/patch_W (not sqrt(N))
        """
        # 1) patch embedding -> token sequence
        tokens = self.Pathembedding(x)  # expected shape (B, N or N+1, C)
        # 2) positional encoding + transformer blocks
        tokens = self.PositionalEncoding(tokens)
        for blk in self.VisionACT:
            tokens = blk(tokens)
        tokens = self.ffn(tokens)

        # 3) handle the cls token robustly; expected number of patch tokens:
        expected_n = self.patch_H * self.patch_W
        B, N_all, C = tokens.shape

        if N_all == expected_n + 1:
            # there is a cls token at index 0 (consistent with PositionalEncoding)
            token_patches = tokens[:, 1:, :]  # (B, expected_n, C)
        elif N_all == expected_n:
            token_patches = tokens  # already just patches
        else:
            # informative error instead of a silent crash
            raise RuntimeError(
                f"Unexpected token length: got N_all={N_all}, expected {expected_n} or {expected_n + 1}. "
                "Check PathEmbedding / PositionalEncoding outputs."
            )

        # 4) reshape to [B, C, patch_H, patch_W] using the stored grid dims
        # tokens are (B, N, C) where N = patch_H * patch_W
        token_patches = token_patches.contiguous()
        try:
            token_map = token_patches.view(B, self.patch_H, self.patch_W, C).permute(0, 3, 1, 2)
        except Exception as e:
            # more informative if the reshape fails
            raise RuntimeError(f"Reshape to grid failed: B={B}, N={token_patches.shape[1]}, C={C}, "
                               f"patch_H={self.patch_H}, patch_W={self.patch_W}. Error: {e}")

        # 5) safety checks to avoid silent CUDA asserts
        _check_tensor_ok(token_map, name="token_map before upsampling")

        # 6) decoder / upsampling (8x: with the defaults, a 224 input yields a 112x112 output)
        out = self.upsampling(token_map)

        # optional clamp / tanh mapping depending on your training scale (0..1 or -1..1)
        # out = out.clamp(0., 1.)  # only if your training uses 0..1 images

        _check_tensor_ok(out, name="output image (after upsampling)")

        return out
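
# Smoke-test sketch (note the 8x decoder: a 224 input yields a 112x112 output):
#   >>> vit = PretrainedVIT()  # defaults: image_size=224, patch_size=16, dim 256
#   >>> vit(torch.randn(2, 3, 224, 224)).shape
#   torch.Size([2, 3, 112, 112])
# Assumption: the bundled VLMEncoderLCT.pth holds a state_dict for this encoder;
# if so, it could be restored with
#   >>> # vit.load_state_dict(torch.load("VLMEncoderLCT.pth", map_location="cpu"))
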
def noicing_image(image: torch.Tensor, time_steps, b_start=1e-3, b_end=0.07, T=50):
    """Forward diffusion step: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps,
    using a linear beta schedule from b_start to b_end over T steps."""
    beta = torch.linspace(b_start, b_end, T, device=image.device)
    alpha = 1 - beta
    alpha_bar = torch.cumprod(alpha, dim=0)

    step1 = torch.sqrt(alpha_bar[time_steps]).view(-1, 1, 1, 1)
    step2 = torch.sqrt(1 - alpha_bar[time_steps]).view(-1, 1, 1, 1)
    image_noised = step1 * image + step2 * torch.randn_like(image)
    return image_noised
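
# Usage sketch: draw one random timestep per image and noise the batch
# (shapes illustrative):
#   >>> images = torch.randn(4, 3, 112, 112)
#   >>> t = torch.randint(0, 50, (4,))  # one timestep per image, in [0, T)
#   >>> noicing_image(images, t).shape
#   torch.Size([4, 3, 112, 112])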