Upload SLIP model, checkpoints, and source code

Browse files

Files changed (14) hide show

caption.safetensors +3 -0
config.json +31 -0
ecg.safetensors +3 -0
har.safetensors +3 -0
model.safetensors +3 -0
model_factory/SLIP.py +678 -0
model_factory/__init__.py +0 -0
model_factory/multimodal_gemma.py +192 -0
model_factory/ts_transformer.py +809 -0
modeling_slip.py +272 -0
sleep.safetensors +3 -0
tsqa.safetensors +3 -0
util/__init__.py +0 -0
util/pos_embed.py +246 -0

caption.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff90d912d788314a8d449b4a764c7ac52ca044c0702db303bfce094869d33623
+size 1386043740

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "model_type": "slip",
+  "architectures": [
+    "SLIP"
+  ],
+  "auto_map": {
+    "AutoModel": "modeling_slip.SLIPModel"
+  },
+  "llm_model_name": "google/gemma-3-270m",
+  "max_llm_len": 768,
+  "num_img_queries": 64,
+  "num_heads": 5,
+  "caption_loss_weight": 1.0,
+  "contrastive_loss_weight": 1.0,
+  "use_lora": false,
+  "unlocked_layers": 4,
+  "split_layer": 12,
+  "common_dim": 640,
+  "post_train": true,
+  "sensor_encoder": {
+    "embed_dim": 768,
+    "num_heads": 12,
+    "mlp_ratio": 4,
+    "depth": 12,
+    "dropout_rate": 0.1,
+    "learnable_pos_emb": false,
+    "max_position_embeddings": 4880,
+    "patch_size": null,
+    "channel_attn_type": "all_attn"
+  }
+}

ecg.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03d627bf5b7a4d0ce61803baa1726abe4dbed3bf6b9bf2c3f48d8f9eed060c37
+size 1499488484

har.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bed649ee52aaa13efd27c922a544181181ff27362147c743c9d59d5e39974c7d
+size 1386043740

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ceb7c446945dd61ddab80c82a0688e798e71f8840f1bba6b79c47dba0ae2ec5
+size 1386043740

model_factory/SLIP.py ADDED Viewed

	@@ -0,0 +1,678 @@

+# Reference: https://github.com/lucidrains/CoCa-pytorch/blob/main/coca_pytorch/coca_pytorch.py
+import math
+from sympy import shape
+from omegaconf import DictConfig
+import torch
+torch._dynamo.config.capture_scalar_outputs = True
+from torch import Tensor, einsum, nn
+import torch.nn.functional as F
+from torch.autograd import Function
+import torch.distributed as dist
+from einops import rearrange, repeat,reduce
+from model_factory.multimodal_gemma import Gemma3MultimodalModel
+import hydra
+# for generation
+from typing import Optional, List, Union
+import contextlib
+from transformers.generation.utils import GenerationMixin
+from model_factory.ts_transformer import AttentionPooling
+# helper functions
+def exists(val):
+    return val is not None
+def default(val, d):
+    return val if exists(val) else d
+def masked_mean(t, mask, dim = 1, eps = 1e-6):
+    '''
+    t: B, L, D
+    mask: B, L, 1
+    '''
+    t = t.masked_fill(~mask, 0.)
+    numer = t.sum(dim = dim)
+    denom = mask.sum(dim = dim).clamp(min = eps)
+    return numer / denom
+# helper metrics (alignment and uniformity of contrastive embeddings): https://arxiv.org/pdf/2005.10242
+def lalign(x, y, alpha=2):
+    # calculate the closness of the positive pairs.
+    return (x - y).norm(dim=1).pow(alpha).mean()
+def lunif(x, t=2):
+    # calculate the uniformity of one side.
+    sq = torch.pdist(x, p=2).pow(2)
+    return sq.mul(-t).exp().mean().log()
+# distributed
+def pad_dim_to(t, length, dim = 0):
+    pad_length = length - t.shape[dim]
+    zero_pairs = (-dim - 1) if dim < 0 else (t.ndim - dim - 1)
+    return F.pad(t, (*((0, 0) * zero_pairs), 0, pad_length))
+# https://huggingface.co/Qwen/Qwen3-Embedding-8B
+def last_token_pool(last_hidden_states: Tensor,
+                 attention_mask: Tensor) -> Tensor:
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return last_hidden_states[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = last_hidden_states.shape[0]
+        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+def all_gather_variable_batch(x):
+    """
+    All-gather variable sized tensors across DDP ranks.
+    x: [B_local, D]
+    Returns:
+        out: [sum(B_local across ranks), D]
+        sizes: python list of sizes per rank
+    """
+    world = dist.get_world_size()
+    rank = dist.get_rank()
+    device = x.device
+    # 1. Gather sizes
+    local_size = torch.tensor([x.shape[0]], device=device, dtype=torch.long)
+    all_sizes = [torch.zeros_like(local_size) for _ in range(world)]
+    dist.all_gather(all_sizes, local_size)
+    sizes = [int(s.item()) for s in all_sizes]
+    # 2. Pad local tensor to max size
+    max_size = max(sizes)
+    if local_size < max_size:
+        pad_len = max_size - local_size
+        padding = torch.zeros(pad_len, *x.shape[1:], device=device, dtype=x.dtype)
+        x_padded = torch.cat([x, padding], dim=0)
+    else:
+        x_padded = x
+    # 3. All-gather padded tensors
+    gathered = [torch.zeros_like(x_padded) for _ in range(world)]
+    dist.all_gather(gathered, x_padded)
+    # 4. Trim each rank's padded slice
+    trimmed = [g[:sizes[i]] for i, g in enumerate(gathered)]
+    # 5. Concatenate true global batch
+    out = torch.cat(trimmed, dim=0)
+    return out, sizes
+class AllGather(Function):
+    """Autograd function that all-gathers per-rank batches and returns this rank's gradient slice on backward."""
+    @staticmethod
+    def forward(ctx, x):
+        # Only valid with an initialized process group of more than one rank.
+        assert dist.is_initialized() and dist.get_world_size() > 1
+        x, batch_sizes = all_gather_variable_batch(x)
+        # Remember the per-rank batch sizes so backward can split the gradient.
+        ctx.batch_sizes = batch_sizes
+        return x
+    @staticmethod
+    def backward(ctx, grads):
+        batch_sizes, rank = ctx.batch_sizes, dist.get_rank()
+        # Split the gathered gradient per rank and keep only this rank's piece.
+        grads_by_rank = grads.split(batch_sizes, dim = 0)
+        return grads_by_rank[rank]
+# Functional entry point for AllGather.
+all_gather = AllGather.apply
+# to latents
+class EmbedToLatents(nn.Module):
+    def __init__(self, dim, dim_latents):
+        super().__init__()
+        self.to_latents = nn.Linear(dim, dim_latents, bias=False)
+    def forward(self, x):
+        latents = self.to_latents(x)
+        return F.normalize(latents, dim=-1)
+class SLIP(nn.Module,GenerationMixin):
+    _is_stateful = False
+    def __init__(
+        self,
+        tokenizer=None, #legacy argument.
+        **kwargs
+    ):
+        super().__init__()
+        self.tokenizer = tokenizer
+        enc_cfg = kwargs['sensor_encoder_cfg']
+        if isinstance(enc_cfg, (DictConfig, dict)):
+            self.sensor_encoder = hydra.utils.instantiate(enc_cfg)
+        else:
+            self.sensor_encoder = enc_cfg
+        ############################################################
+        dim = self.sensor_encoder.embed_dim # 384
+        text_encoder = kwargs['llm_model_name']
+        self.embed_dim = dim
+        self.use_lora = kwargs.get('use_lora', True)
+        self.post_train = kwargs.get('post_train', True)
+        ##########################################
+        ## Text encoder ####
+        self.caption_loss_weight = kwargs['caption_loss_weight']
+        self.max_llm_len = kwargs['max_llm_len']
+        self.multimodalModel = Gemma3MultimodalModel(text_encoder,self.post_train)
+        if self.caption_loss_weight <= 0:
+            self.multimodalModel._truncate_to_unimodal()
+        unlocked_layers = kwargs.get('unlocked_layers', 0)
+        if unlocked_layers < 12: # 12 is the split layer
+            self.multimodalModel._lock_text(
+                unlocked_layers=unlocked_layers,
+                freeze_layer_norm=kwargs.get('freeze_layer_norm', True)
+            )
+        lm_dim = self.multimodalModel.hidden_size #640
+        self.lm_dim = lm_dim
+        common_dim = lm_dim # harcoded for now
+        # self.multimodalModel.model.gradient_checkpointing_enable()
+        #########################################
+        num_img_queries = kwargs.get('num_img_queries', 0)
+        if num_img_queries>0:
+            self.img_queries = nn.Parameter(torch.randn(num_img_queries + 1, common_dim))
+            self.img_attn_pool = AttentionPooling(
+                dim=common_dim,
+                context_dim=dim,
+                num_heads=kwargs['num_heads']) # pre-norm+post_norm
+            dim = common_dim
+        # normalize.
+        self.img_to_latents = EmbedToLatents(dim, common_dim)
+        self.text_to_latents = EmbedToLatents(common_dim, common_dim)
+        # learnable temperature
+        self.temperature = nn.Parameter(torch.tensor(math.log(1/0.07)))
+        self.temperature_max = math.log(1/0.07)
+        if self.use_sig_loss:
+            # default implementation
+            self.temperature = nn.Parameter(torch.tensor(math.log(10)))
+            #self.temperature_max = math.log(10)
+            self.temperature_max = 999 # trivally large, so no upper bound.
+            self.logit_bias = nn.Parameter(torch.ones([]) * -10)
+        # multimodal decoder #############
+        pad_token_id = self.tokenizer.pad_token_id
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = self.tokenizer.eos_token_id
+        self.ce = nn.CrossEntropyLoss(ignore_index=pad_token_id)
+        self.contrastive_loss_weight = kwargs['contrastive_loss_weight']
+        ##################################
+        self._init_weights()
+        # whether in data parallel setting
+        self.is_distributed = dist.is_initialized() and dist.get_world_size() > 1
+        # for name, param in self.named_parameters():
+        #     if param.requires_grad:
+        #         print(f"TRAINABLE: {name}")
+    def _init_weights(self):
+        def _init(m):
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+        # apply only to modules we added
+        self.img_to_latents.apply(_init)
+        self.text_to_latents.apply(_init)
+        if hasattr(self, 'img_attn_pool'):
+            self.img_attn_pool.apply(_init)
+            nn.init.xavier_uniform_(self.img_queries)
+    def get_lora_parameters(self): # make training script happy
+        """
+        Gathers:
+        1. LoRA weights (A and B matrices) inside Gemma.
+        2. Full-parameter updated 'modules_to_save' (Embeddings/Head).
+        3. Full-parameter updated Cross-Attention blocks.
+        4. Bridge layers (img_to_latents, text_to_latents, etc.).
+        """
+        if not self.use_lora:
+            return []
+        trainable_params = []
+        # 1. Check the multimodal LLM (Gemma + LoRA + Cross-Attn)
+        for name, param in self.multimodalModel.named_parameters():
+            if param.requires_grad:
+                trainable_params.append(param)
+        # 2. Check the Bridge modules
+        bridge_modules = [self.img_to_latents, self.text_to_latents]
+        if hasattr(self, 'img_attn_pool'):
+            bridge_modules.append(self.img_attn_pool)
+        for module in bridge_modules:
+            for param in module.parameters():
+                if param.requires_grad:
+                    trainable_params.append(param)
+        # 3. Check the Queries and Sensor Encoder
+        if hasattr(self, 'img_queries') and self.img_queries.requires_grad:
+            trainable_params.append(self.img_queries)
+        # Optionally add sensor_encoder if you haven't locked it
+        for param in self.sensor_encoder.parameters():
+            if param.requires_grad:
+                trainable_params.append(param)
+        return trainable_params
+    def _pad_to_len(self, x, max_len):
+        # pad along dim 1 to max_len with zeros
+        if x.dim() == 3:
+            # [B, L, D]
+            pad_len = max_len - x.size(1)
+            if pad_len > 0:
+                pad = x.new_zeros(x.size(0), pad_len, x.size(2))
+                x = torch.cat([pad, x], dim=1)
+        elif x.dim() == 2:
+            # [B, L] case such as masks
+            pad_len = max_len - x.size(1)
+            if pad_len > 0:
+                pad = x.new_zeros(x.size(0), pad_len)
+                x = torch.cat([pad, x], dim=1)
+        return x
+    def _gather_features(self, img, txt, gather_with_grad=False):
+        """Return all features if DDP, else inputs. Same batch size per rank assumed."""
+        if not (dist.is_available() and dist.is_initialized()):
+            return img, txt
+        ### prepare for gathering ###
+        #
+        # get max length across ranks for padding.
+        img_len = torch.tensor([img.size(1)], device=img.device, dtype=torch.long)
+        txt_len = torch.tensor([txt.size(1)], device=txt.device, dtype=torch.long)
+        dist.all_reduce(img_len, op=dist.ReduceOp.MAX)
+        dist.all_reduce(txt_len, op=dist.ReduceOp.MAX)
+        max_img_len = int(img_len.item())
+        max_txt_len = int(txt_len.item())
+        img = self._pad_to_len(img, max_img_len)
+        txt = self._pad_to_len(txt, max_txt_len)
+        #################################
+        if gather_with_grad:
+            # keep grad across ranks
+            all_img = all_gather(img)
+            all_txt = all_gather(txt)
+        else:
+            # no grad path, saves memory
+            ws = dist.get_world_size()
+            outs_i = [torch.empty_like(img) for _ in range(ws)]
+            outs_t = [torch.empty_like(txt) for _ in range(ws)]
+            try:
+                dist.all_gather(outs_i, img.contiguous())
+                dist.all_gather(outs_t, txt.contiguous())
+            except Exception as e:
+                print("Error occurred while gathering features:", e)
+            outs_i[dist.get_rank()] = img
+            outs_t[dist.get_rank()] = txt
+            all_img = torch.cat(outs_i, dim=0)
+            all_txt = torch.cat(outs_t, dim=0)
+        return all_img, all_txt
+    def embed_text(self,
+                   input_ids,
+                   attention_mask,
+                   text_embed=None):
+        '''
+        Return per-token text hidden states.
+        If text_embed is given it is returned as-is; otherwise self.llm is run on the tokens
+        and its last_hidden_state is returned.
+        TODO: needs to be made causal to avoid representation leak.
+        input_ids: (BS, llm_seq_len) token_ids
+        attention_mask: (BS, llm_seq_len)
+        NOTE(review): self.llm is not assigned anywhere in the visible __init__ (the language
+        model is stored as self.multimodalModel), so the else-branch looks like it would
+        raise AttributeError; confirm the intended attribute.
+        '''
+        if text_embed is not None:
+            hidden_states = text_embed # (BS, max_seq_len, lm_dim)
+        else:
+            outputs = self.llm(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                return_dict=True,
+                output_hidden_states=False, # Set to False or remove
+                # use_cache=False             # Ensure cache is off for training/gradient ckpt
+            )
+            hidden_states = outputs.last_hidden_state
+        return hidden_states
+    def embed_sensor(self, sensors, sensor_attn_mask=None, time_index=None):
+        '''
+        sensors: (BS, num_channels, L)
+        '''
+        sensor_tokens, attn_mask = self.sensor_encoder(sensors, sensor_attn_mask, time_index=time_index)
+        # sensor_tokens: Bs,(nvar, num_p), img_dim
+        # attn_mask: BS, nvar, num_p
+        if hasattr(self, 'img_attn_pool'):
+            img_queries = repeat(self.img_queries, 'n d -> b n d', b=sensor_tokens.shape[0])
+            sensor_tokens = self.img_attn_pool(img_queries, sensor_tokens,attn_mask)
+        return sensor_tokens, attn_mask.bool()
+    # use an openCLIP implementation
+    def forward_loss(self,
+                     text_hidden,
+                     sensor_hidden,
+                     sensor_mask,
+                     gather_with_grad=False):
+        '''
+        text_embd: tuple of (text_cls, text_tokens)
+        sensor_embed: tuple of (sensor_cls, sensor_tokens)
+        sensor_mask: (BS, nvar, num_p)
+        '''
+        # global features
+        if hasattr(self, 'img_attn_pool'):
+            # use cls token
+            sensor_hidden = sensor_hidden[:, 0, :]
+        else:
+            sensor_hidden = masked_mean(sensor_hidden, rearrange(sensor_mask, 'b n p -> b (n p) 1'), dim=1)  # BS, img_dim
+        rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+        world = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
+        if world > 1:
+            all_img, all_txt = self._gather_features(sensor_hidden, text_hidden, gather_with_grad=gather_with_grad)
+        else:
+            all_img, all_txt = sensor_hidden, text_hidden
+        contrastive_loss = self.CLIP_loss(all_txt, all_img)*self.contrastive_loss_weight
+        # some supplementry losses
+        align_loss = lalign(all_txt, all_img)
+        unif_txt = lunif(all_txt)
+        unif_img = lunif(all_img)
+        outputs = {
+            "loss": contrastive_loss,
+            'contrastive_loss': contrastive_loss,
+            "align_loss": align_loss,
+            "unif_txt": unif_txt,
+            "unif_img": unif_img,
+        }
+        return outputs
+    def CLIP_loss(
+            self,
+            text_cls,
+            sensor_cls,):
+        temperature = (self.temperature.clamp(max=self.temperature_max)).exp()
+        logits_t2i = temperature * (text_cls @ sensor_cls.t())  # [B_global, B_global]
+        targets = torch.arange(logits_t2i.size(0), device=sensor_cls.device)
+        contrastive_loss = 0.5 * (
+            F.cross_entropy(logits_t2i, targets) +
+            F.cross_entropy(logits_t2i.t(), targets)
+        )
+        return contrastive_loss
+    def sig_loss(self, text_hidden, sensor_hidden, sensor_mask):
+        '''
+        SigLip Loss: Decoupling contrastive-loss with batch size
+        text_hidden: (BS, dim)
+        sensor_hidden: (BS, sensor_len, dim)
+        text_mask: (BS, text_len)
+        sensor_mask: (BS, sensor_len)
+        '''
+        if hasattr(self, 'img_attn_pool'):
+            # use cls token
+            sensor_hidden = sensor_hidden[:, 0, :]
+        else:
+            sensor_hidden = masked_mean(sensor_hidden, rearrange(sensor_mask, 'b n p -> b (n p) 1'), dim=1)  # BS, img_dim
+        logit_scale = self.temperature.clamp(max=self.temperature_max).exp()
+        loss = self._sig_loss(sensor_hidden, text_hidden, logit_scale, self.logit_bias)
+        return {'loss': loss, 'contrastive_loss': loss}
+    def forward(
+        self,
+        text,
+        sensors,
+        prompt=None, # legacy input
+        return_embeddings=False,
+    ):
+        sensor_hidden, sensor_mask = self.embed_sensor(sensors=sensors['input_ids'],
+                                                   sensor_attn_mask=sensors['attention_mask'], # this is pixel-level mask
+                                                   time_index=sensors['time_index'])
+        # sensor_hidden: (BS, num_sensor_token, dim)
+        self.multimodalModel.condition_image(sensor_hidden)
+        text_hidden, logits = self.multimodalModel(input_ids=text['input_ids'][:,:-1],
+                                                   attention_mask=text['attention_mask'][:,:-1], )
+        # text_sentence_embed: (BS, dim)
+        # logits: (BS, pred_len, vocab_size)
+        labels = text['input_ids'][:,1:] # bs, pred_len
+        #logits = rearrange(logits, 'b n c -> b c n') # bs, vocab_size, pred_len
+        text_hidden = self.text_to_latents(text_hidden)
+        sensor_hidden = self.img_to_latents(sensor_hidden)
+        if self.use_sig_loss:
+            loss_dict = self.sig_loss(text_hidden,
+                                      sensor_hidden,
+                                      sensor_mask)
+        else:
+            # This branch will need all-gather.
+            loss_dict = self.forward_loss(text_hidden,
+                                        sensor_hidden,
+                                        sensor_mask,)
+        if self.caption_loss_weight > 0:
+            loss_logits = logits.reshape(-1, logits.size(-1)) # Shape: [BS * Seq, Vocab]
+            loss_labels = labels.reshape(-1)                   # Shape: [BS * Seq]
+            caption_loss = self.ce(loss_logits, loss_labels) * self.caption_loss_weight
+            loss_dict['caption_loss'] = caption_loss
+            loss_dict['loss'] = loss_dict['contrastive_loss'] + caption_loss
+        return loss_dict
+    def _lock_sensor(self,):
+        # Freeze all sensor-related parameters (cross-attn blocks)
+        for name, param in self.sensor_encoder.named_parameters():
+            param.requires_grad = False
+    def sft_training(self,text,sensors,return_output=False):
+        sensor_hidden, _ = self.embed_sensor(sensors=sensors['input_ids'],
+                                                   sensor_attn_mask=sensors['attention_mask'],
+                                                   time_index=sensors['time_index'])
+        # sensor_hidden: (BS, num_sensor_token, dim)
+        self.multimodalModel.condition_image(sensor_hidden)
+        # debugging code.
+        # sample_text = text['input_ids'][0]
+        # sample_label = text['labels'][0]
+        # # make the -100 to be the pad token id for decoding
+        # sample_label = torch.where(sample_label==-100, self.tokenizer.pad_token_id, sample_label)
+        # print('sample text:', self.tokenizer.decode(sample_text))
+        # print('sample label:', self.tokenizer.decode(sample_label))
+        # exit()
+        outputs = self.multimodalModel.model(input_ids=text['input_ids'],
+                                            attention_mask=text['attention_mask'],
+                                            return_dict=True,)
+                                                #    labels=text['labels'], )
+        if return_output:
+            return outputs
+        logits = outputs.logits # (BS, pred_len, vocab_size)
+        labels = text['labels'] # (BS, pred_len)
+        # shift for causal lm
+        shift_logits = logits[:, :-1, :].contiguous()
+        shift_labels = labels[:, 1:].contiguous()
+        # flatten logits for efficiency
+        logss_logits = shift_logits.view(-1, shift_logits.size(-1)) # Shape: [BS * Seq, Vocab]
+        loss_labels = shift_labels.view(-1)                   # Shape: [BS * Seq]
+        # define a new loss for stf
+        ce = torch.nn.functional.cross_entropy(
+            logss_logits,
+            loss_labels,
+            reduction='none',
+            ignore_index=-100,
+        )
+        if 'loss_weights' in text:
+            loss_weights = text['loss_weights']
+            loss_weights = loss_weights[:,1:].contiguous()
+            loss_weights = loss_weights.view(-1)  # Shape: [BS * Seq]
+            # apply weights
+            weighted_ce = ce * loss_weights
+            loss = weighted_ce.sum() / loss_weights.sum()
+        else:
+            loss = ce.mean()
+        return {'loss': loss}
+    def generate(self,
+                 text,
+                 sensors,
+                 **generate_kwargs):
+        """
+        Generates text conditioned on image embeddings.
+        """
+        sensor_hidden, _ = self.embed_sensor(sensors=sensors['input_ids'],
+                                                   sensor_attn_mask=sensors['attention_mask'], # this is pixel-level mask
+                                                   time_index=sensors['time_index'])
+        self.multimodalModel.condition_image(sensor_hidden)
+        generated_text = self.multimodalModel.model.generate(
+            input_ids=text['input_ids'],
+            attention_mask=text['attention_mask'],
+            max_new_tokens=300,
+            do_sample=False,
+            num_beams=1,
+            early_stopping=False,
+        )
+        return generated_text
+    @ torch.no_grad()
+    def get_embedding(self,text,sensors):
+        sensor_hidden, sensor_mask = self.embed_sensor(sensors=sensors['input_ids'],
+                                                   sensor_attn_mask=sensors['attention_mask'], # this is pixel-level mask
+                                                    time_index=sensors['time_index'])
+        self.multimodalModel.condition_image(sensor_hidden)
+        text_hidden, _ = self.multimodalModel(input_ids=text['input_ids'][:,:-1],
+                                                   attention_mask=text['attention_mask'][:,:-1], )
+        text_hidden = self.text_to_latents(text_hidden)
+        sensor_hidden = self.img_to_latents(sensor_hidden)
+        if hasattr(self, 'img_attn_pool'):
+            # use cls token
+            sensor_hidden = sensor_hidden[:, 0, :]
+        else:
+            sensor_hidden = masked_mean(sensor_hidden, rearrange(sensor_mask, 'b n p -> b (n p) 1'), dim=1)  # BS, img_dim # (BS, dim)
+        return text_hidden, sensor_hidden
+    @ torch.no_grad()
+    def get_sensor_embedding(self,input_ids,mask,time_index):
+        sensor_hidden, sensor_mask = self.embed_sensor(sensors=input_ids,
+                                                    sensor_attn_mask=mask,
+                                                    time_index=time_index)
+        sensor_hidden = self.img_to_latents(sensor_hidden)
+        if hasattr(self, 'img_attn_pool'):
+            # use cls token
+            sensor_hidden = sensor_hidden[:, 0, :]
+        else:
+            sensor_hidden = masked_mean(sensor_hidden, rearrange(sensor_mask, 'b n p -> b (n p) 1'), dim=1)  # BS, img_dim
+        return sensor_hidden
+    @ torch.no_grad()
+    def get_text_embedding(self,text):
+        # Returns one latent vector per text sample, computed without gradient tracking.
+        # text: mapping with 'input_ids' and 'attention_mask'.
+        # NOTE(review): self.llm is not assigned anywhere in the visible __init__, and
+        # embed_text relies on it as well; confirm which attribute is intended.
+        text_mask = text['attention_mask']
+        text_hidden = self.embed_text(text['input_ids'],
+                                      attention_mask=text_mask,)
+        text_hidden = self.text_to_latents(text_hidden)
+        # Pool the per-token latents: masked mean if the LLM config asks for it,
+        # otherwise the last attended token.
+        if self.llm.config.pooler == 'mean':
+            text_hidden = masked_mean(text_hidden, rearrange(text_mask, 'b l -> b l 1').bool(), dim=1)  # BS, lm_dim
+        else:
+            text_hidden = last_token_pool(text_hidden, text_mask) # (BS, dim)
+        return text_hidden
+    def get_multimodal_feature(self, question, sensors):
+        sensor_hidden, sensor_mask = self.embed_sensor(sensors=sensors['input_ids'],
+                                                   sensor_attn_mask=sensors['attention_mask'], # this is pixel-level mask
+                                                    time_index=sensors['time_index'])
+        # sensor_hidden: (BS, num_sensor_token, dim)
+        self.multimodalModel.condition_image(sensor_hidden)
+        outputs = self.multimodalModel(input_ids=question['input_ids'],
+                                                   attention_mask=question['attention_mask'],
+                                                   return_embeddings=True)
+        # text_sentence_embed: (BS, dim)
+        # logits: (BS, pred_len, vocab_size)
+        multimodal_hidden = outputs.hidden_states[-1][:,-1,:] # (BS, dim)
+        return multimodal_hidden
+class Config(dict):
+    def __getattr__(self, key):
+        return self[key]

model_factory/__init__.py ADDED Viewed

File without changes

model_factory/multimodal_gemma.py ADDED Viewed

	@@ -0,0 +1,192 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
+from transformers import AutoConfig, AutoModelForCausalLM
+from model_factory.ts_transformer import CrossAttention
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+    def forward(self, x, *args, **kwargs):
+        return self.fn(x, *args, **kwargs) + x
+class Gemma3MultimodalLayer(nn.Module):
+    """Decoder-layer wrapper: runs the original layer, then a cross-attention block over stored context."""
+    def __init__(self, original_layer, cross_attn_block):
+        super().__init__()
+        self.original_layer = original_layer
+        self.cross_attn_block = cross_attn_block
+        # Cross-attention context; must be set through condition_vis_x before forward.
+        self.vis_x = None
+    def condition_vis_x(self, vis_x):
+        """Store the context tensor that the next forward pass will attend to."""
+        self.vis_x = vis_x
+    def __getattr__(self, name):
+        """Forward all unknown attributes to the original layer."""
+        # This is CRITICAL for 'attention_type' and other internal HF flags
+        # Try nn.Module's own lookup first, then fall back to the wrapped layer.
+        try:
+            return super().__getattr__(name)
+        except AttributeError:
+            return getattr(self.original_layer, name)
+    def forward(self, hidden_states, **kwargs):
+        """Return the wrapped layer's output tuple with element 0 replaced by the cross-attended states."""
+        # 1. Run the original unimodal Gemma Layer (Self-Attn + MLP)
+        # have to have self.vis_x
+        assert self.vis_x is not None, "vis_x must be set before forward pass."
+        outputs = self.original_layer(hidden_states, **kwargs) # gemma layer output
+        hidden_states = outputs[0]
+        # 2. Attend from the text hidden states to the stored context
+        hidden_states = self.cross_attn_block(hidden_states, context=self.vis_x)
+        return (hidden_states,) + outputs[1:] # make hf happy
+class Gemma3MultimodalModel(nn.Module):
+    def __init__(self,
+                 model_id="google/gemma-3-270m",
+                 post_train = True,
+                 split_layer=12):
+        super().__init__()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            trust_remote_code=True
+        )
+        if post_train:
+            # Load pre-trained weights
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                trust_remote_code=True
+            )
+        else:
+            # INITIALIZE FROM SCRATCH
+            print(f"Initializing {model_id} from SCRATCH (Random Weights)...")
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            self.model = AutoModelForCausalLM.from_config(
+                config,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                trust_remote_code=True
+            )
+        self.split_layer = split_layer
+        self.device = self.model.device
+        # Initialize and insert cross-attention
+        hidden_size = self.model.config.hidden_size # 640
+        num_heads = self.model.config.num_attention_heads
+        self.hidden_size = hidden_size
+        for i in range(split_layer, len(self.model.model.layers)):
+            # Create the specific cross-attn block for this layer
+            cross_attn = CrossAttention(
+                dim=hidden_size,
+                context_dim=hidden_size,
+                num_heads=num_heads,
+                dropout_rate=0.1
+            )
+            # Wrap the original layer
+            original_layer = self.model.model.layers[i]
+            self.model.model.layers[i] = Gemma3MultimodalLayer(
+                original_layer,
+                Residual(cross_attn)
+            )
+        self.to(torch.bfloat16)
+    def condition_image(self, image_embeds):
+        """Passes image embeddings (Bs, img_q, 640) to layers 12+"""
+        # Ensure we match the model's device and dtype
+        self.image_embeds = image_embeds.to(next(self.parameters()).device, dtype=torch.bfloat16)
+        for layer in self.model.model.layers:
+            if isinstance(layer, Gemma3MultimodalLayer):
+                layer.condition_vis_x(self.image_embeds)
+    def forward(self,
+                input_ids,
+                attention_mask=None,
+                return_embeddings=False,
+                **kwargs):
+        # HF Forward
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            **kwargs
+        )
+        # Extraction for contrastive learning
+        # Index split_layer gives the output of (split_layer - 1)
+        # e.g., index 12 = output of Layer 11
+        unimodal_hidden_states = outputs.hidden_states[self.split_layer]
+        text_sentence_embedding = unimodal_hidden_states[:, -1, :]
+        if return_embeddings:
+            return outputs
+        else:
+            return text_sentence_embedding, outputs.logits
+    def _lock_text(self,
+                   unlocked_layers: int = 0,
+                   freeze_layer_norm: bool = True):
+        """
+        Locks the unimodal encoder.
+        unlocked_layers: How many unimodal layers (counting back from split_layer) to keep trainable.
+        freeze_layer_norm: Whether to freeze Norm layers (RMSNorm/LayerNorm).
+        """
+        # 1. Ensure the Multimodal Decoder and Head are ALWAYS trainable
+        for param in self.model.parameters():
+            param.requires_grad = True
+        # 2. Identify Unimodal components
+        embeddings = self.model.model.embed_tokens
+        unimodal_layer_list = self.model.model.layers[:self.split_layer]
+        modules = [embeddings, *unimodal_layer_list]
+        if unlocked_layers > 0:
+            modules_to_freeze = modules[:-unlocked_layers]
+        else:
+            modules_to_freeze = modules
+        first_unlocked_layer_idx = (len(modules) - unlocked_layers) - 1
+        print(f"Locking {len(modules_to_freeze)} unimodal modules (Embeddings + Layers 0 to {first_unlocked_layer_idx - 1}).")
+        print(f"Unimodal layers {max(0, first_unlocked_layer_idx)} to {self.split_layer - 1} remain trainable.")
+        # 4. Perform Freezing
+        for module in modules_to_freeze:
+            for n, p in module.named_parameters():
+                is_norm = any(x in n.split(".") for x in ["norm", "LayerNorm", "input_layernorm", "post_attention_layernorm"])
+                if is_norm:
+                    p.requires_grad = not freeze_layer_norm
+                else:
+                    p.requires_grad = False
+    def _truncate_to_unimodal(self):
+        """
+        Deletes all layers from split_layer onwards, keeping only the
+        unimodal layers (0 to split_layer-1).
+        """
+        # 1. Physically remove the layers (indices split_layer to end)
+        # This deletes the Gemma3MultimodalLayer wrappers and their weights
+        self.model.model.layers = nn.ModuleList(self.model.model.layers[:self.split_layer])
+        # 2. Update the config so the model handles the new length correctly
+        # (This ensures the final layer-norm and LM-head use the correct hidden state)
+        self.model.config.num_hidden_layers = self.split_layer
+        # 3. Cleanup image references
+        if hasattr(self, 'image_embeds'):
+            del self.image_embeds
+        print(f"Multimodal layers deleted. Model truncated to {self.split_layer} layers.")

model_factory/ts_transformer.py ADDED Viewed

	@@ -0,0 +1,809 @@

+# Reference: https://huggingface.co/thuml/sundial-base-128m/blob/main/modeling_sundial.py
+import contextlib
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple, List, Union
+from util.pos_embed import RotaryEmbedding, apply_rotary_pos_emb,apply_rotary_pos_emb_2d, build_2d_position_ids
+from transformers.activations import ACT2FN
+from einops import rearrange,reduce
+class TsRoPEAttention(nn.Module):
+    def __init__(self, layer_idx: int, **cfg):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = cfg.get("embed_dim", 768)
+        self.num_heads = cfg.get("num_heads", 12)
+        self.head_dim = self.hidden_size // self.num_heads
+        self.attention_dropout = cfg.get("dropout_rate", 0.1)
+        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        # 2d RoPE
+        self.rotary_emb = RotaryEmbedding(
+            self.head_dim//2, max_position_embeddings=cfg.get("max_position_embeddings"))
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        '''
+        hidden_states: [bs, seq_len, hidden_size]
+        attention_mask: [bs, nvar, num_p]
+        '''
+        bsz, q_len, _ = hidden_states.size()
+        tmp_attn_mask = rearrange(attention_mask, 'b nvar p -> b (nvar p)')
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states) # Bs, L, hidden_size
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        tmp_attn_mask = tmp_attn_mask.unsqueeze(1).unsqueeze(2).expand(-1, 1, q_len, q_len).bool()  # bs, 1, L, L
+        pos_var, pos_patch = build_2d_position_ids(attention_mask,flatten=True)
+        q_h = query_states[..., : self.head_dim // 2]
+        q_w = query_states[..., self.head_dim // 2 :]
+        cos_h, sin_h = self.rotary_emb(q_h, seq_len=int(pos_var.max().item()) + 1)
+        cos_w, sin_w = self.rotary_emb(q_w, seq_len=int(pos_patch.max().item()) + 1)
+        query_states, key_states = apply_rotary_pos_emb_2d(
+            query_states, key_states,
+            cos_h, sin_h,
+            cos_w, sin_w,
+            pos_var, pos_patch
+        )
+        attn_output = F.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            tmp_attn_mask,
+            dropout_p=self.attention_dropout
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output
+# helper function
+def flatten_list(input_list: List[List[torch.Tensor]]) -> List[torch.Tensor]:
+    """
+    Flatten a nested list of lists into a single list.
+    Args:
+        input_list (List[List[Tensor]]): Nested list to flatten.
+    Returns:
+        List[Tensor]: Flattened list.
+    """
+    return [item for sublist in input_list for item in sublist]
+class MultiSizePatchEmbed(nn.Module):
+    def __init__(self, base_patch=32, **cfg):
+        super().__init__()
+        self.base_patch = base_patch
+        hidden_size = cfg['embed_dim']
+        intermediate_size = cfg['mlp_ratio'] * hidden_size # 3072
+        self.intermediate_size = intermediate_size
+        self.hidden_size = hidden_size
+        # [ts, time_idx, mask] concatenated together
+        self.shared_linear = nn.Linear(base_patch*3, intermediate_size) # putting mask on hidden.
+        self.shared_residual = nn.Linear(base_patch*3, hidden_size)
+        # MLP embedder ###
+        self.dropout = nn.Dropout(cfg['dropout_rate'])
+        self.act = ACT2FN['silu']
+        self.output_layer = nn.Linear(
+            intermediate_size, hidden_size)
+        self.initialize_weights()
+    def initialize_weights(self):
+        # initialize nn.Linear and nn.LayerNorm
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                # we use xavier_uniform following official JAX ViT:
+                torch.nn.init.xavier_uniform_(m.weight)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+        self.apply(_init_weights)
+    def resize_weight(self, patch_size: int):
+        """
+        Interpolate weights along the patch dimension to target patch size.
+        """
+        base_w = self.shared_linear.weight  # [out_dim, base_patch]
+        base_b = self.shared_linear.bias
+        res_w = self.shared_residual.weight
+        res_b = self.shared_residual.bias
+        # FlexiViT: interpolate kernel linearly along patch axis
+        # interpolate (base_patch, d) -> (patch_size,d)
+        new_w = F.interpolate(
+            base_w.unsqueeze(1), size=patch_size, mode="linear", align_corners=False
+        ).squeeze(1).to(base_w.dtype)
+        new_res_w = F.interpolate(
+            res_w.unsqueeze(1), size=patch_size, mode="linear", align_corners=False
+        ).squeeze(1).to(res_w.dtype)
+        return new_w, base_b,new_res_w,res_b
+    def forward(self, x_list, attention_mask, time_idx):
+        """
+        x_list: list of tensors of shape (num_patches, patch_size)
+        attention_mask: list of tensors.
+        Returns:
+            list of transformed tensors in the same order.
+        """
+        amp_dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else torch.float32
+        device = torch.device("cuda", torch.cuda.current_device()) if torch.cuda.is_available() else torch.device("cpu")
+        # group by patch size
+        sizes = torch.tensor([x.shape[-1] for x in x_list])
+        unique_sizes = sizes.unique(sorted=True)
+        N = x_list[0].shape[0] # number of patches
+        outputs = torch.empty(len(x_list), N, self.intermediate_size,
+                              device=device,dtype=amp_dtype)
+        res_outputs = torch.empty(len(x_list), N, self.hidden_size,
+                                  device=device,dtype=amp_dtype)
+        for psize in unique_sizes.tolist():
+            idxs = (sizes == psize).nonzero(as_tuple=True)[0]
+            xs = torch.stack([x_list[i] for i in idxs]) # B_g, num_p, ps
+            mask = torch.stack([attention_mask[i] for i in idxs]) # B_g, num_p, ps
+            ti = torch.stack([time_idx[i] for i in idxs])
+            xs = xs.to(device=device, non_blocking=True)
+            mask = mask.to(device=device, non_blocking=True)
+            ti = ti.to(device=device, non_blocking=True)
+            xs = torch.cat([xs,mask,ti],dim=-1) # B_g, num_p, ps*3
+            w, b, r_w, r_b = self.resize_weight(psize*3)
+            res_outputs[idxs] = F.linear(xs,r_w,r_b)
+            outputs[idxs] = F.linear(xs, w, b)
+        hid = self.act(outputs) # BS, num_p, intermediate_size
+        out = self.dropout(self.output_layer(hid)) # BS, num_p, hidden
+        out = out + res_outputs
+        return out
+class PatchEmbedding(nn.Module):
+    def __init__(self, **cfg):
+        super().__init__()
+        patch_size = cfg['patch_size']
+        self.patch_size = patch_size
+        self.dropout = nn.Dropout(cfg.get('dropout_rate', 0.1))
+        hidden_size = cfg['embed_dim']
+        self.hidden_layer = nn.Linear(
+            patch_size * 3, hidden_size)
+        self.act = ACT2FN['silu']
+        self.output_layer = nn.Linear(
+            hidden_size, hidden_size)
+        self.residual_layer = nn.Linear(
+            patch_size * 3, hidden_size)
+        self.patch_size = patch_size
+    def forward(self, x, mask, time_idx):
+        '''
+        x,mask,time_idx: bs, nvar,L
+        '''
+        x = rearrange(x, 'bs nvar (nump ps) -> (bs nvar) nump ps', ps=self.patch_size)
+        mask = rearrange(mask, 'bs nvar (nump ps) -> (bs nvar) nump ps', ps=self.patch_size)
+        time_idx = rearrange(time_idx, 'bs nvar (nump ps) -> (bs nvar) nump ps', ps=self.patch_size)
+        x = torch.cat([x, mask,time_idx], dim=-1)
+        hid = self.act(self.hidden_layer(x))
+        out = self.dropout(self.output_layer(hid))
+        res = self.residual_layer(x)
+        out = out + res
+        return out # bs*nvar, num_p, hidden_size
+class Attention(nn.Module):
+    def __init__(self, layer_idx: int, is_rope=True, **cfg):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.is_rope = is_rope
+        self.hidden_size = cfg.get("embed_dim", 768)
+        self.num_heads = cfg.get("num_heads", 12)
+        self.sensor_max_len = cfg.get("sensor_max_len", 2880)
+        self.head_dim = self.hidden_size // self.num_heads
+        self.attention_dropout = cfg.get("dropout_rate", 0.1)
+        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        if self.is_rope:
+            self.rotary_emb = RotaryEmbedding(
+                self.head_dim, max_position_embeddings=self.sensor_max_len)
+        else:
+            self.rotary_emb = None
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.Tensor] = None, # index of positions.
+            **kwargs,
+    ) -> torch.Tensor:
+        '''
+        hidden_states: [bs, seq_len, hidden_size]
+        attention_mask: [bs, 1, seq_len, seq_len]
+        position_ids: [bs, seq_len]
+        '''
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states) # Bs, L, hidden_size
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        if self.is_rope:
+            kv_seq_len = key_states.shape[-2]
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            query_states, key_states = apply_rotary_pos_emb(
+                query_states, key_states, cos, sin, position_ids)
+        attn_output = F.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout_p=self.attention_dropout
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output
+class CrossAttention(nn.Module):
+    def __init__(self,
+                 dim=768, # unifed embed space
+                 *,
+                 context_dim=384,
+                 num_heads=12,
+                 dropout_rate=0.1):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = int(dim // num_heads)
+        self.scale = self.head_dim ** -0.5
+        self.attn_dropout = dropout_rate
+        self.norm = nn.LayerNorm(dim)
+        self.context_norm = nn.LayerNorm(context_dim)
+        self.q_proj = nn.Linear(dim, dim, bias=True)
+        self.k_proj = nn.Linear(context_dim, dim, bias=True)
+        self.v_proj = nn.Linear(context_dim, dim, bias=True)
+        self.o_proj = nn.Linear(dim, dim, bias=False)
+    def forward(
+            self,
+            query,
+            context,
+            attention_mask: Optional[torch.Tensor] = None,
+            **kwargs,
+    ) -> torch.Tensor:
+        '''
+        hidden_states: [bs, seq_len, hidden_size]
+        attention_mask: [BS, 1, seq_len, context_len]
+        position_ids: [bs, seq_len]
+        '''
+        bsz, q_len, _ = query.size()
+        bsc, k_len, _ = context.size()
+        assert bsz == bsc, f"Batch size mismatch: {bsz} vs {bsc}"
+        # pre-norm
+        query = self.norm(query)
+        context = self.context_norm(context)
+        query_states = self.q_proj(query)
+        key_states = self.k_proj(context)
+        value_states = self.v_proj(context) # Bs, L, hidden_size
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, k_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, k_len, self.num_heads, self.head_dim).transpose(1, 2)
+        attn_output = F.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout_p=self.attn_dropout
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.dim)
+        attn_output = self.o_proj(attn_output) # bs, q_len, dim
+        return attn_output
+class MLP(nn.Module):
+    def __init__(self, hidden_size: int, intermediate_size: int, hidden_act: str):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.gate_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(
+            self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+class AllAttention(nn.Module):
+    def __init__(self, layer_idx, **cfg):
+        super().__init__()
+        self.self_attention = TsRoPEAttention(**cfg, layer_idx=layer_idx)
+        self.layer_norm = nn.LayerNorm(cfg.get('embed_dim'))
+        self.dropout = nn.Dropout(cfg.get('dropout_rate', 0.1))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ):
+        '''
+        ts self attention with residual
+        hidden_states: bs (nvar L) d
+        attention_mask: bs, nvar, L
+        '''
+        normed_hidden_states = self.layer_norm(hidden_states) # pre-norm
+        attention_output = self.self_attention(
+           normed_hidden_states,
+           attention_mask,
+        )
+        # residual
+        hidden_states = hidden_states + self.dropout(attention_output)
+        return hidden_states
+class TimeSelfAttention(nn.Module):
+    def __init__(self, layer_idx, **cfg):
+        super().__init__()
+        self.self_attention = Attention(layer_idx=layer_idx, is_rope=True, **cfg)
+        self.layer_norm = nn.LayerNorm(cfg.get('embed_dim', 768))
+        self.dropout = nn.Dropout(cfg.get('dropout_rate', 0.1))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_ids: torch.Tensor,
+    ):
+        '''
+        ts self attention with residual
+        hidden_states: bs*nvar, L, d
+        attention_mask: bs, nvar, L
+        '''
+        q_len = hidden_states.size(1)
+        attention_mask = rearrange(attention_mask, 'b nvar p -> (b nvar) p')  # bs*nvar, L
+        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2).expand(-1, 1, q_len, q_len)  # bs*nvar, 1, L, L
+        attention_mask = attention_mask.bool()  # convert to bool
+        normed_hidden_states = self.layer_norm(hidden_states) # pre-norm
+        attention_output = self.self_attention(
+           normed_hidden_states,
+           attention_mask,
+           position_ids
+        )
+        # residual
+        hidden_states = hidden_states + self.dropout(attention_output)
+        return hidden_states
+class GroupSelfAttention(nn.Module):
+    """Self-attention applied along the batch axis masked by the group attention mask"""
+    def __init__(self, layer_idx: int, **cfg):
+        super().__init__()
+        # we don't use RoPE here because there's no natural ordering along the batch axis
+        self.self_attention = Attention(layer_idx, is_rope=False, **cfg)
+        self.layer_norm = nn.LayerNorm(cfg.get('embed_dim', 768))
+        self.dropout = nn.Dropout(cfg.get('dropout_rate', 0.1))
+    def _construct_group_mask(self,
+                              group_ids: torch.Tensor,
+                              attention_mask: torch.Tensor) -> torch.Tensor:
+            # construct group_mask (batch, batch) from group ids
+            # a cell is True if both row and col had the same group id
+            group_mask = group_ids[:, None] == group_ids[None, :]
+            # group_mask: bs*nvar, bs*nvar
+            # attention_mask: bs*nvar, L
+            group_time_mask = torch.einsum("qb, bt -> qbt", group_mask, attention_mask).float() # bs*nvar, bs*nvar, L
+            group_time_mask = rearrange(group_time_mask, "q b t -> t 1 q b") # L,1, bs*nvar, bs*nvar
+            group_time_mask = group_time_mask.bool()  # convert to bool
+            return group_time_mask
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: torch.Tensor,
+            group_ids: torch.Tensor,
+    ):
+        '''
+        hidden_states: bs*nvar, L, d
+        attention_mask: bs, nvar, L
+        group_ids: bs*nvar
+        '''
+        # attention_mask = rearrange(attention_mask, 'b nvar l -> (b nvar) l')  # bs*nvar, L
+        # hidden_states = rearrange(hidden_states, 'bs l d -> l bs d',) # L, bs*nvar, d
+        # group_attn_mask = self._construct_group_mask(group_ids, attention_mask) #L,1, bs*nvar, bs*nvar
+        BS, nvar, _ = attention_mask.shape
+        hidden_states = rearrange(hidden_states, '(bs nvar) l d -> (bs l) nvar d', bs=BS, nvar=nvar)
+        attention_mask = rearrange(attention_mask, 'bs nvar l -> (bs l) nvar')  # (bs*L), nvar
+        group_attn_mask = attention_mask.unsqueeze(1).unsqueeze(2).expand(-1, 1, nvar, nvar).bool()  # (bs*L), 1, nvar, nvar
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.self_attention(
+            normed_hidden_states,
+            group_attn_mask,
+        )
+        hidden_states = hidden_states + self.dropout(attention_output)
+        # flip time and batch axes back to their original position
+        hidden_states = rearrange(hidden_states, '(bs l) nvar d -> (bs nvar) l d', bs=BS, nvar=nvar)
+        # hidden_states = rearrange(hidden_states, "time batch d -> batch time d") # Bs*nvar, L, d
+        return hidden_states
+class AttentionPooling(nn.Module):
+    def __init__(self,
+                 dim=768,
+                 mlp_ratio=4,
+                 context_dim=384,
+                 num_heads=12,
+                 dropout_rate=0.1):
+        super().__init__()
+        self.cross_attn = CrossAttention(dim=dim,
+                                         context_dim=context_dim,
+                                         num_heads=num_heads,
+                                         dropout_rate=dropout_rate)
+        self.ffn_norm = nn.LayerNorm(dim)
+        self.ffn_layer = MLP(
+            hidden_size=dim,
+            intermediate_size=dim * mlp_ratio,
+            hidden_act='silu',
+        )
+        self.post_norm = nn.LayerNorm(dim)
+    def forward(self, x, context, attn_mask=None):
+        # x: BS, num_query, dim
+        # context: BS, num_kv, context_dim
+        # attn_mask: BS, nvar, num_p,
+        b,n,_ = x.shape
+        kv_len = context.shape[1]
+        attn_mask = rearrange(attn_mask, 'b nvar p -> b (nvar p)')
+        attn_mask = attn_mask.view(b, 1, 1, kv_len).expand(b, 1, n, kv_len).bool()
+        x = self.cross_attn(x, context, attn_mask)
+        x = x + self.ffn_layer(self.ffn_norm(x))
+        x = self.post_norm(x)
+        return x
+class SensorEncoderLayer(nn.Module):
+    def __init__(self, layer_idx: int, **cfg):
+        super().__init__()
+        hidden_size = cfg['embed_dim']
+        intermediate_size = cfg['mlp_ratio'] * hidden_size
+        self.channel_attn_type = cfg.get('channel_attn_type', 'group_attn')
+        if self.channel_attn_type == 'group_attn':
+            self.ts_attn = TimeSelfAttention(layer_idx=layer_idx, **cfg) # pre-norm
+            self.group_attn = GroupSelfAttention(layer_idx=layer_idx, **cfg) # pre-norm
+        elif self.channel_attn_type == 'univariate':
+            self.ts_attn = TimeSelfAttention(layer_idx=layer_idx, **cfg)
+        else:
+            self.ts_attn = AllAttention(layer_idx=layer_idx, **cfg)
+        self.norm = nn.LayerNorm(hidden_size) # post-norm
+        self.ffn_layer = MLP(
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            hidden_act='silu',
+        )
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            group_ids: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, Optional[torch.FloatTensor], Optional[torch.FloatTensor]]:
+        if self.channel_attn_type == 'group_attn':
+            '''
+            Time self attention with residual
+            hidden_states: bs*nvar, L, d
+            attention_mask: bs, nvar, L
+            group_attention_mask: bs*nvar, bs*nvar
+            '''
+            hidden_states = self.ts_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids
+            ) # handled residual
+            hidden_states = self.group_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                group_ids=group_ids,
+            ) # handled residual
+            # Fully Connected
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            hidden_states = self.ffn_layer(hidden_states)
+            hidden_states = residual + hidden_states
+        elif self.channel_attn_type == 'univariate':
+            # hidden_states: bs*nvar, L, d
+            hidden_states = self.ts_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids
+            ) # handled residual
+            # Fully Connected
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            hidden_states = self.ffn_layer(hidden_states)
+            hidden_states = residual + hidden_states
+        else:
+            # hidden_states: bs (nvar L) d
+            hidden_states = self.ts_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+            ) # b (nvar l) d
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            hidden_states = self.ffn_layer(hidden_states)
+            hidden_states = residual + hidden_states
+        return hidden_states
+class SensorTransformerModel(nn.Module):
+    def __init__(self, **cfg):
+        super().__init__()
+        patch_size = cfg.get('patch_size', None)
+        self.patch_size = patch_size
+        if patch_size is not None:
+            # fixed patch size embedder
+            self.patch_embed = PatchEmbedding(**cfg)
+        else:
+            self.patch_embed = MultiSizePatchEmbed(**cfg)
+        self.blocks = nn.ModuleList(
+            [SensorEncoderLayer(layer_idx, **cfg)
+             for layer_idx in range(cfg['depth'])]
+        )
+        self.norm = torch.nn.LayerNorm(cfg['embed_dim'])
+        self.embed_dim = cfg['embed_dim']
+        self.channel_attn_type = cfg.get('channel_attn_type', 'group_attn') # group_attn, all_attn, univariate
+    def forward(
+            self,
+            input_ids,
+            attention_mask,
+            time_index,):
+        if self.patch_size is None:
+            '''
+            input_ids: list of list of tensor # BS, nvar, num_p, patch_size
+            attention_mask: same as input_ids
+            self.patch_embed will handle device.
+            '''
+            BS = len(input_ids)
+            flat_input_ids = flatten_list(input_ids)
+            flat_attention_mask = flatten_list(attention_mask)
+            flat_time_index = flatten_list(time_index)
+            # embed each variable separately
+            hidden_states = self.patch_embed(flat_input_ids,flat_attention_mask,flat_time_index)  # (bs*nvar, seq_len, embed_dim)
+            attention_mask = self._get_self_attn_mask(attention_mask).to(hidden_states.device)  # BS, nvar, num_p
+            position_ids = self._build_rope_position_ids(attention_mask)  # BS, nvar, num_p
+            position_ids = rearrange(position_ids, 'b nvar p -> (b nvar) p')  # BS*nvar, num_p
+        else:
+            '''
+            input_ids: tensor # BS, nvar, L
+            attention_mask: tensor # BS, nvar, L
+            time_index: tensor # BS, nvar, L
+            '''
+            BS, nvar, L = input_ids.shape
+            hidden_states = self.patch_embed(input_ids, attention_mask, time_index)  # (bs*nvar, seq_len, embed_dim)
+            # transform pixel-level attn mask (BS, nvar, L)to patch-level attn mask (BS, nvar, num_p), element would be 1 if all pixel is 1,if all pixel is 0, then is 0
+            attention_mask = reduce(
+                attention_mask,
+                'b v (p ps) -> b v p',
+                'max',
+                ps=self.patch_size
+            )
+            position_ids = self._build_rope_position_ids(attention_mask)  # BS, nvar, num_p
+            position_ids = rearrange(position_ids, 'b nvar p -> (b nvar) p')  # BS*nvar, num_p
+        if self.channel_attn_type == 'all_attn':
+            hidden_states = rearrange(hidden_states, '(b nvar) l d -> b (nvar l) d', b=BS)
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states,
+                attention_mask=attention_mask,
+                group_ids=None, # legacy argument
+                position_ids=position_ids,
+            ) # bs*nvar, seq, emb or bs (nvar l) d
+        if self.channel_attn_type == 'group_attn':
+            hidden_states = rearrange(hidden_states, '(b nvar) l d -> b (nvar l) d', b=BS)
+        hidden_states = self.norm(hidden_states) # (Bs*nvar), seq, emb
+        return hidden_states, attention_mask
+    def _build_rope_position_ids(self,attention_mask):
+        """
+        attention_mask: Tensor [BS, nvar, num_p]
+        returns: LongTensor [BS, nvar, num_p]
+        """
+        assert attention_mask.dim() == 3
+        BS, nvar, num_p = attention_mask.shape
+        mask = attention_mask.to(torch.long)
+        # position index increases inside each variable
+        pos = (mask.cumsum(dim=-1) - 1) * mask         # [BS, nvar, num_p]
+        return pos
+    def _get_self_attn_mask(self,attn_mask_list):
+        """
+        Collapse a nested list of attention masks from shape
+            [BS][nvar][num_p, patch_size]
+        into tensors of shape [BS, nvar, num_p].
+        Args:
+            attention_mask (list[list[Tensor]]):
+                Each tensor has shape [num_p, patch_size], and all have the same shape.
+        Returns:
+            torch.Tensor (BS, nvar, num_p)
+        """
+        collapsed_batch = []
+        for sample_masks in attn_mask_list:  # loop over batch
+            # collapse each [num_p, patch_size] → [num_p]
+            nvar_collapsed = [
+                (var_mask.sum(dim=-1) > 0).to(var_mask.dtype) for var_mask in sample_masks
+            ]
+            nvar_collapsed = torch.stack(nvar_collapsed, dim=0)  # [nvar, num_p]
+            collapsed_batch.append(nvar_collapsed)
+        collapsed_batch = torch.stack(collapsed_batch, dim=0)  # [BS, nvar, num_p]
+        return collapsed_batch
+    def _get_group_ids(self,attn_mask_list):
+        """
+        attn_mask_list: list of list of tensor
+            BS, nvar
+            each tensor is shape (num_p, patch_size)
+        Returns:
+            group_mask: (BS*nvar, BS*nvar) boolean tensor
+                True means same group
+                False means different group
+        """
+        BS = len(attn_mask_list)
+        nvar = len(attn_mask_list[0])
+        # build group ids
+        # each sample i repeats nvar times
+        group_ids = torch.arange(BS).repeat_interleave(nvar)  # (BS*nvar)
+        return group_ids
+if __name__ == "__main__":
+    from model_factory.coca import Config
+    cfg = Config(embed_dim=384,
+                 num_heads=6,
+                 mlp_ratio=4,
+                 depth=12,
+                 dropout_rate=0.1,)
+    sensor_model = SensorTransformerModel(**cfg)
+    dummy_input = [[torch.randn(14,40),torch.randn(14,40)],[torch.randn(14,40),torch.randn(14,30)]]
+    mask = [[torch.ones(14,40),torch.zeros(14,40)],[torch.zeros(14,40),torch.zeros(14,30)]]
+    time_idx = [[torch.ones(14,40),torch.ones(14,40)],[torch.ones(14,40),torch.ones(14,30)]]
+    out, attn_mask = sensor_model(dummy_input,mask,time_idx)
+    print(out.shape)  # expect (2*2, max_num_patches, embed
+    # python -m model_factory.ts_transformer

modeling_slip.py ADDED Viewed

	@@ -0,0 +1,272 @@

+"""
+SLIP Model - HuggingFace Hub Loading Interface
+Usage:
+    from modeling_slip import SLIPModel
+    model = SLIPModel.from_pretrained("LeoChen085/SLIP")
+    # Or load a task-specific checkpoint:
+    model = SLIPModel.from_pretrained("LeoChen085/SLIP", checkpoint="har.safetensors")
+"""
+import os
+import sys
+import json
+import math
+import torch
+import torch.nn as nn
+from pathlib import Path
+from typing import Optional
+# Ensure model_factory and util are importable from the HF cache directory
+_THIS_DIR = Path(__file__).resolve().parent
+if str(_THIS_DIR) not in sys.path:
+    sys.path.insert(0, str(_THIS_DIR))
+from model_factory.ts_transformer import (
+    SensorTransformerModel,
+    AttentionPooling,
+    CrossAttention,
+)
+from model_factory.multimodal_gemma import (
+    Gemma3MultimodalModel,
+    Residual,
+)
+# ── Lightweight helpers (from SLIP.py, no distributed deps) ──
+def masked_mean(t, mask, dim=1, eps=1e-6):
+    t = t.masked_fill(~mask, 0.)
+    numer = t.sum(dim=dim)
+    denom = mask.sum(dim=dim).clamp(min=eps)
+    return numer / denom
+class EmbedToLatents(nn.Module):
+    def __init__(self, dim, dim_latents):
+        super().__init__()
+        self.to_latents = nn.Linear(dim, dim_latents, bias=False)
+    def forward(self, x):
+        latents = self.to_latents(x)
+        return torch.nn.functional.normalize(latents, dim=-1)
+class SLIPModel(nn.Module):
+    """
+    SLIP model for inference. Loads from HuggingFace Hub without Hydra dependency.
+    Supports:
+        - get_embedding(text, sensors) -> (text_emb, sensor_emb)
+        - get_sensor_embedding(input_ids, mask, time_index) -> sensor_emb
+        - generate(text, sensors) -> generated_token_ids
+        - sft_training(text, sensors) -> loss_dict
+    """
+    def __init__(self, config: dict):
+        super().__init__()
+        # Build sensor encoder directly (no Hydra)
+        sensor_cfg = config["sensor_encoder"]
+        self.sensor_encoder = SensorTransformerModel(**sensor_cfg)
+        dim = self.sensor_encoder.embed_dim  # 768
+        self.embed_dim = dim
+        # Build multimodal LLM
+        llm_model_name = config.get("llm_model_name", "google/gemma-3-270m")
+        post_train = config.get("post_train", True)
+        split_layer = config.get("split_layer", 12)
+        self.multimodalModel = Gemma3MultimodalModel(
+            llm_model_name, post_train, split_layer
+        )
+        lm_dim = self.multimodalModel.hidden_size  # 640
+        self.lm_dim = lm_dim
+        common_dim = config.get("common_dim", lm_dim)
+        # Attention pooling
+        num_img_queries = config.get("num_img_queries", 0)
+        if num_img_queries > 0:
+            self.img_queries = nn.Parameter(
+                torch.randn(num_img_queries + 1, common_dim)
+            )
+            self.img_attn_pool = AttentionPooling(
+                dim=common_dim,
+                context_dim=dim,
+                num_heads=config.get("num_heads", 5),
+            )
+            dim = common_dim
+        # Bridge projections
+        self.img_to_latents = EmbedToLatents(dim, common_dim)
+        self.text_to_latents = EmbedToLatents(common_dim, common_dim)
+        # Temperature
+        self.temperature = nn.Parameter(torch.tensor(math.log(1 / 0.07)))
+        self.temperature_max = math.log(1 / 0.07)
+        # Store config
+        self.config_dict = config
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id_or_path: str,
+        checkpoint: str = "model.safetensors",
+        device: str = "cpu",
+        dtype: torch.dtype = torch.bfloat16,
+        **kwargs,
+    ) -> "SLIPModel":
+        """
+        Load SLIP from a HuggingFace repo or local directory.
+        Args:
+            repo_id_or_path: HuggingFace repo ID (e.g., "LeoChen085/SLIP")
+                             or local directory path.
+            checkpoint: Which checkpoint file to load.
+                        Default "model.safetensors" (base pretrained).
+                        Options: "har.safetensors", "sleep.safetensors",
+                                 "ecg.safetensors", "tsqa.safetensors",
+                                 "caption.safetensors"
+            device: Device to load model on.
+            dtype: Model dtype (default bfloat16).
+        """
+        local_path = Path(repo_id_or_path)
+        if local_path.is_dir():
+            # Load from local directory
+            config_path = local_path / "config.json"
+            weights_path = local_path / checkpoint
+        else:
+            # Download from HuggingFace Hub
+            from huggingface_hub import hf_hub_download
+            config_path = hf_hub_download(repo_id_or_path, "config.json")
+            weights_path = hf_hub_download(repo_id_or_path, checkpoint)
+            # Also download source files (needed for model classes)
+            for src_file in [
+                "model_factory/__init__.py",
+                "model_factory/SLIP.py",
+                "model_factory/multimodal_gemma.py",
+                "model_factory/ts_transformer.py",
+                "util/__init__.py",
+                "util/pos_embed.py",
+            ]:
+                try:
+                    hf_hub_download(repo_id_or_path, src_file)
+                except Exception:
+                    pass  # File may not exist separately
+        # Load config
+        with open(config_path) as f:
+            config = json.load(f)
+        # Build model
+        print(f"Building SLIP model...")
+        model = cls(config)
+        # Load weights
+        print(f"Loading weights from {checkpoint}...")
+        if str(weights_path).endswith(".safetensors"):
+            from safetensors.torch import load_file
+            state_dict = load_file(weights_path, device=device)
+        else:
+            state_dict = torch.load(weights_path, map_location=device, weights_only=False)
+            if isinstance(state_dict, dict):
+                if "model" in state_dict:
+                    state_dict = state_dict["model"]
+                elif "state_dict" in state_dict:
+                    state_dict = state_dict["state_dict"]
+            # Remove DDP module. prefix
+            state_dict = {
+                k.replace("module.", "", 1) if k.startswith("module.") else k: v
+                for k, v in state_dict.items()
+                if isinstance(v, torch.Tensor)
+            }
+        # Load state dict
+        missing, unexpected = model.load_state_dict(state_dict, strict=False)
+        if missing:
+            print(f"Missing keys ({len(missing)}): {missing[:5]}{'...' if len(missing) > 5 else ''}")
+        if unexpected:
+            print(f"Unexpected keys ({len(unexpected)}): {unexpected[:5]}{'...' if len(unexpected) > 5 else ''}")
+        model = model.to(dtype=dtype, device=device)
+        model.eval()
+        print("Model loaded successfully.")
+        return model
+    # ── Inference methods ─────────────────────────────────────
+    def embed_sensor(self, sensors, sensor_attn_mask=None, time_index=None):
+        from einops import repeat
+        sensor_tokens, attn_mask = self.sensor_encoder(
+            sensors, sensor_attn_mask, time_index=time_index
+        )
+        if hasattr(self, "img_attn_pool"):
+            img_queries = repeat(
+                self.img_queries, "n d -> b n d", b=sensor_tokens.shape[0]
+            )
+            sensor_tokens = self.img_attn_pool(img_queries, sensor_tokens, attn_mask)
+        return sensor_tokens, attn_mask.bool()
+    @torch.no_grad()
+    def get_embedding(self, text, sensors):
+        from einops import rearrange
+        sensor_hidden, sensor_mask = self.embed_sensor(
+            sensors=sensors["input_ids"],
+            sensor_attn_mask=sensors["attention_mask"],
+            time_index=sensors["time_index"],
+        )
+        self.multimodalModel.condition_image(sensor_hidden)
+        text_hidden, _ = self.multimodalModel(
+            input_ids=text["input_ids"][:, :-1],
+            attention_mask=text["attention_mask"][:, :-1],
+        )
+        text_hidden = self.text_to_latents(text_hidden)
+        sensor_hidden = self.img_to_latents(sensor_hidden)
+        if hasattr(self, "img_attn_pool"):
+            sensor_hidden = sensor_hidden[:, 0, :]
+        else:
+            sensor_hidden = masked_mean(
+                sensor_hidden,
+                rearrange(sensor_mask, "b n p -> b (n p) 1"),
+                dim=1,
+            )
+        return text_hidden, sensor_hidden
+    @torch.no_grad()
+    def get_sensor_embedding(self, input_ids, mask, time_index):
+        from einops import rearrange
+        sensor_hidden, sensor_mask = self.embed_sensor(
+            sensors=input_ids, sensor_attn_mask=mask, time_index=time_index
+        )
+        sensor_hidden = self.img_to_latents(sensor_hidden)
+        if hasattr(self, "img_attn_pool"):
+            sensor_hidden = sensor_hidden[:, 0, :]
+        else:
+            sensor_hidden = masked_mean(
+                sensor_hidden,
+                rearrange(sensor_mask, "b n p -> b (n p) 1"),
+                dim=1,
+            )
+        return sensor_hidden
+    @torch.no_grad()
+    def generate(self, text, sensors, **generate_kwargs):
+        sensor_hidden, _ = self.embed_sensor(
+            sensors=sensors["input_ids"],
+            sensor_attn_mask=sensors["attention_mask"],
+            time_index=sensors["time_index"],
+        )
+        self.multimodalModel.condition_image(sensor_hidden)
+        return self.multimodalModel.model.generate(
+            input_ids=text["input_ids"],
+            attention_mask=text["attention_mask"],
+            max_new_tokens=generate_kwargs.get("max_new_tokens", 300),
+            do_sample=generate_kwargs.get("do_sample", False),
+            num_beams=generate_kwargs.get("num_beams", 1),
+        )

sleep.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9685e181a9b4038d03744647621f864ff3a3e866520ec7038c061e8ce0e88b13
+size 1386043740

tsqa.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8f6b02d497fa409d65c18b776da0132046143e88514336cdd83255dbbf76833
+size 1386043740

util/__init__.py ADDED Viewed

File without changes

util/pos_embed.py ADDED Viewed

	@@ -0,0 +1,246 @@

+import numpy as np
+import torch
+def get_1d_sincos_pos_embed(embed_dim, length, cls_token=False):
+    """
+    Create 1D sine-cosine positional embeddings.
+    Args:
+        embed_dim (int): Dimension of the embedding (must be even)
+        length (int): Number of positions (sequence length)
+        cls_token (bool): Whether to include an extra zero vector for [CLS] token
+    Returns:
+        np.ndarray of shape (length, embed_dim) or (1+length, embed_dim) if cls_token=True
+    """
+    # position indices 0 ... length-1
+    pos = np.arange(length, dtype=np.float32)
+    # get embedding from grid
+    pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, pos)  # (L, D)
+    # optionally add CLS token embedding
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    # --------------------------------------------------------
+    # 2D sine-cosine position embedding
+    # References:
+    # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
+    # MoCo v3: https://github.com/facebookresearch/moco-v3
+    # --------------------------------------------------------
+    grid_h = np.arange(grid_size[0], dtype=np.float32)
+    grid_w = np.arange(grid_size[1], dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  #changed(H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  #changed (H*W, D/2)
+    emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out) # (M, D/2)
+    emb_cos = np.cos(out) # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+def interpolate_pos_embed(model, checkpoint_model, orig_size, new_size):
+    '''
+    Input: model: the class is definging for downstream
+           checkpoint_model: pre-train weight
+           orig_size = patch size in the ckpt
+           new_size = patch size in the current model
+    '''
+    if 'pos_embed' in checkpoint_model:
+        pos_embed_checkpoint = checkpoint_model['pos_embed'] # 1 x 560 x 768 (1 x num_patches x E)
+        embedding_size = pos_embed_checkpoint.shape[-1] # 768
+        # number of special tokens (e.g. in this case num_extra_tokens = 1 for the cls token)
+        num_patches = model.patch_embed.num_patches
+        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+        if orig_size != new_size:
+            print("Position interpolate from %dx%d to %dx%d" % (orig_size[0], orig_size[1], new_size[0], new_size[1]))
+            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+            # only the position tokens are interpolated
+            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] # old positions
+            pos_tokens = pos_tokens.reshape(-1, orig_size[0], orig_size[1], embedding_size).permute(0, 3, 1, 2)
+            pos_tokens = torch.nn.functional.interpolate(
+                pos_tokens, size=(new_size[0], new_size[1]), mode='bicubic', align_corners=False)
+            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+            checkpoint_model['pos_embed'] = new_pos_embed
+# RoPE: https://huggingface.co/thuml/sundial-base-128m/blob/main/modeling_sundial.py
+class RotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=10000, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim,
+                          2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device,
+                         dtype=torch.int64).type_as(self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer(
+            "cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer(
+            "sin_cached", emb.sin().to(dtype), persistent=False)
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(
+                seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+def rotate_half(x):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+# two dimensional version
+def apply_rotary_pos_emb_2d(q, k,
+                            cos_h, sin_h,
+                            cos_w, sin_w,
+                            pos_h, pos_w,
+                            unsqueeze_dim=1):
+    """
+    q, k: [B, heads, N, Dh]
+    cos_h, sin_h: caches from 1D rotary with dim = Dh // 2 for the first axis
+    cos_w, sin_w: caches from 1D rotary with dim = Dh // 2 for the second axis
+    pos_h, pos_w: [B, N] integer positions for each token along the two axes
+    returns q_out, k_out with same shape as q, k
+    """
+    Dh = q.shape[-1]
+    assert Dh % 4 == 0, "head dim must be divisible by 4 so each half is even for rotate_half"
+    # split channel dim into two halves
+    q_h, q_w = q.split(Dh // 2, dim=-1)
+    k_h, k_w = k.split(Dh // 2, dim=-1)
+    # apply 1D RoPE on each half with its own positions
+    pos_h = pos_h.long()
+    pos_w = pos_w.long()
+    q_h, k_h = apply_rotary_pos_emb(q_h, k_h, cos_h, sin_h, pos_h, unsqueeze_dim=unsqueeze_dim)
+    q_w, k_w = apply_rotary_pos_emb(q_w, k_w, cos_w, sin_w, pos_w, unsqueeze_dim=unsqueeze_dim)
+    # concat back
+    q_out = torch.cat([q_h, q_w], dim=-1)
+    k_out = torch.cat([k_h, k_w], dim=-1)
+    return q_out, k_out
+def build_2d_position_ids(attention_mask: torch.Tensor,
+                          flatten: bool = True):
+    """
+    attention_mask: Tensor [BS, nvar, num_p] with 1 for valid patches, 0 for padding.
+    Returns:
+        If flatten is True:
+            pos_var_flat: LongTensor [BS, nvar*num_p]
+            pos_patch_flat: LongTensor [BS, nvar*num_p]
+        Else:
+            pos_var:  LongTensor [BS, nvar, num_p]
+            pos_patch: LongTensor [BS, nvar, num_p]
+    """
+    assert attention_mask.dim() == 3, "attention_mask must be [BS, nvar, num_p]"
+    B, V, P = attention_mask.shape
+    mask = attention_mask.to(dtype=torch.long)
+    # per patch index within each variable, ignores padding
+    pos_patch = (mask.cumsum(dim=-1) - 1) * mask                      # [B, V, P]
+    # per variable index, ignores variables that are entirely padded
+    var_valid = mask.any(dim=-1).to(dtype=torch.long)                 # [B, V]
+    pos_var_base = (var_valid.cumsum(dim=1) - 1) * var_valid          # [B, V]
+    pos_var = pos_var_base.unsqueeze(-1).expand(B, V, P) * mask       # [B, V, P]
+    if flatten:
+        return pos_var.reshape(B, V * P).long(), pos_patch.reshape(B, V * P).long()
+    return pos_var.long(), pos_patch.long()
+def build_1d_position_ids(attention_mask: torch.Tensor):
+    """
+    Build 1D position ids for [BS, nvar, num_p],
+    output shape [BS * nvar, num_p].
+    Each (batch, variable) pair gets its own 1D position index sequence
+    along the patch axis, skipping padded positions.
+    Args:
+        attention_mask: Tensor [BS, nvar, num_p], 1 for valid, 0 for padding.
+    Returns:
+        pos_ids: LongTensor [BS * nvar, num_p]
+    """
+    assert attention_mask.dim() == 3, "attention_mask must be [BS, nvar, num_p]"
+    B, V, P = attention_mask.shape
+    mask = attention_mask.to(dtype=torch.long)
+    # Compute per-variable cumulative index
+    pos_ids = (mask.cumsum(dim=-1) - 1) * mask  # [B, V, P]
+    # Reshape to [BS * nvar, num_p]
+    pos_ids = pos_ids.view(B * V, P).long()
+    return pos_ids