feat: first commit

Files changed (17) hide show

__init__.py +7 -0
bert_config/config.json +23 -0
bert_config/tokenizer.json +0 -0
bert_config/tokenizer_config.json +1 -0
bert_config/vocab.txt +0 -0
config.json +37 -0
configuration_aurora.py +62 -0
flow_loss.py +254 -0
generation_config.json +4 -0
modality_connector.py +266 -0
model.safetensors +3 -0
modeling_aurora.py +636 -0
prototype_retriever.py +205 -0
ts_generation_mixin.py +114 -0
util_functions.py +154 -0
vit_config/config.json +21 -0
vit_config/preprocessor_config.json +15 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+'''
+* @author: EmpyreanMoon
+*
+* @create: 2025-07-17 19:20
+*
+* @description:
+'''

bert_config/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.6.0.dev0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

bert_config/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

bert_config/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": true, "model_max_length": 512}

bert_config/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_name_or_path": "aurora_base",
+  "architectures": [
+    "AuroraForPrediction"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_aurora.AuroraConfig",
+    "AutoModelForCausalLM": "modeling_aurora.AuroraForPrediction"
+  },
+  "dropout_rate": 0.2,
+  "hidden_act": "silu",
+  "hidden_size": 256,
+  "token_len": 48,
+  "intermediate_size": 512,
+  "max_position_embeddings": 10000,
+  "model_type": "aurora",
+  "num_attention_heads": 8,
+  "num_enc_layers": 1,
+  "num_dec_layers": 9,
+  "rope_theta": 10000,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
+  "num_sampling_steps": 50,
+  "flow_loss_depth": 3,
+  "diffusion_batch_mul": 4,
+  "threshold_ratio": [0.2, 0.3, 0.4, 0.5],
+  "mask_ratio": 0.5,
+  "norm_mode": "batch",
+  "num_prototypes": 1000,
+  "num_retriever_enc_layers": 1,
+  "num_retriever_dec_layers": 1,
+  "num_text_cross_layers": 1,
+  "num_vision_cross_layers": 1,
+  "num_text_connect_layers": 1,
+  "num_vision_connect_layers": 1,
+  "num_distill": 10
+}

configuration_aurora.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from transformers import PretrainedConfig
+class AuroraConfig(PretrainedConfig):
+    model_type = "aurora"
+    def __init__(
+            self,
+            token_len: int = 48,
+            hidden_size: int = 512,
+            intermediate_size: int = 1024,
+            num_enc_layers: int = 12,
+            num_dec_layers: int = 12,
+            num_attention_heads: int = 8,
+            hidden_act: str = "silu",
+            rope_theta: int = 10000,
+            dropout_rate: float = 0.2,
+            max_position_embeddings: int = 10000,
+            num_sampling_steps: int = 50,
+            flow_loss_depth: int = 3,
+            diffusion_batch_mul: int = 4,
+            threshold_ratio: list[float] = [0.2, 0.3, 0.4, 0.5],
+            mask_ratio: float = 0.5,
+            norm_mode: str = 'batch',
+            num_prototypes: int = 1024,
+            num_retriever_enc_layers: int = 1,
+            num_retriever_dec_layers: int = 1,
+            num_text_cross_layers: int = 1,
+            num_vision_cross_layers: int = 1,
+            num_text_connect_layers: int = 1,
+            num_vision_connect_layers: int = 1,
+            num_distill: int = 10,
+            **kwargs,
+    ):
+        self.token_len = token_len
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_enc_layers = num_enc_layers
+        self.num_dec_layers = num_dec_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.rope_theta = rope_theta
+        self.dropout_rate = dropout_rate
+        self.max_position_embeddings = max_position_embeddings
+        self.num_sampling_steps = num_sampling_steps
+        self.flow_loss_depth = flow_loss_depth
+        self.diffusion_batch_mul = diffusion_batch_mul
+        self.threshold_ratio = threshold_ratio
+        self.mask_ratio = mask_ratio
+        self.norm_mode = norm_mode
+        self.num_prototypes = num_prototypes
+        self.num_retriever_enc_layers = num_retriever_enc_layers
+        self.num_retriever_dec_layers = num_retriever_dec_layers
+        self.num_text_cross_layers = num_text_cross_layers
+        self.num_vision_cross_layers = num_vision_cross_layers
+        self.num_text_connect_layers = num_text_connect_layers
+        self.num_vision_connect_layers = num_vision_connect_layers
+        self.num_distill = num_distill
+        super().__init__(
+            **kwargs,
+        )

flow_loss.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .util_functions import resample
+class FlowLoss(nn.Module):
+    """Flow Loss"""
+    def __init__(self, target_channels, z_channels, depth, width, num_sampling_steps):
+        super(FlowLoss, self).__init__()
+        self.in_channels = target_channels
+        self.net = SimpleMLPAdaLN(
+            in_channels=target_channels,
+            model_channels=width,
+            out_channels=target_channels,
+            z_channels=z_channels,
+            num_res_blocks=depth
+        )
+        self.num_sampling_steps = num_sampling_steps
+    def forward(self, target, z, prototype=None, mask=None, eps=1e2):
+        noise = torch.randn_like(target)
+        t = torch.rand(target.shape[0], device=target.device)
+        if prototype is not None:
+            noised_target = t[:, None] * target + (1 - t[:, None]) * (prototype + noise)
+        else:
+            noised_target = t[:, None] * target + (1 - t[:, None]) * noise
+        predict_v = self.net(noised_target, t * 1000, z)
+        loss = ((predict_v - target) ** 2)
+        if mask is not None:
+            loss = (loss * mask).sum(dim=-1) / mask.sum(dim=-1)
+        value_mask = loss < eps
+        loss = loss[value_mask].sum() / value_mask.sum()
+        return loss.mean()
+    def sample(self, z, prototype=None, num_samples=1, inference_token_len=48):
+        z = z.repeat(num_samples, 1)
+        noise = torch.randn(z.shape[0], self.in_channels).to(z.device)
+        if prototype is not None:
+            prototype = prototype.repeat(num_samples, 1)
+            start_point = noise + prototype
+            x = noise + prototype
+        else:
+            start_point = noise
+            x = noise
+        dt = 1.0 / self.num_sampling_steps
+        for i in range(self.num_sampling_steps):
+            t = (torch.ones((x.shape[0])) * i /
+                 self.num_sampling_steps).to(x.device)
+            pred = self.net(x, t * 1000, z)
+            x = x + (pred - start_point) * dt
+        if not self.training:
+            old_weight = torch.eye(self.in_channels).to(x.device)
+            new_weight = resample(old_weight, inference_token_len).T
+            x = F.linear(x, new_weight)
+            x = x.reshape(num_samples, -1, inference_token_len).transpose(0, 1)
+            return x
+        x = x.reshape(num_samples, -1, self.in_channels).transpose(0, 1)
+        return x
+def modulate(x, shift, scale):
+    return x * (1 + scale) + shift
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                          These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0,
+                                                 end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class ResBlock(nn.Module):
+    """
+    A residual block that can optionally change the number of channels.
+    :param channels: the number of input channels.
+    """
+    def __init__(
+            self,
+            channels
+    ):
+        super().__init__()
+        self.channels = channels
+        self.in_ln = nn.LayerNorm(channels, eps=1e-6)
+        self.mlp = nn.Sequential(
+            nn.Linear(channels, channels, bias=True),
+            nn.SiLU(),
+            nn.Linear(channels, channels, bias=True),
+        )
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(channels, 3 * channels, bias=True)
+        )
+    def forward(self, x, y):
+        shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(
+            y).chunk(3, dim=-1)
+        h = modulate(self.in_ln(x), shift_mlp, scale_mlp)
+        h = self.mlp(h)
+        return x + gate_mlp * h
+class FinalLayer(nn.Module):
+    """
+    The final layer adopted from DiT.
+    """
+    def __init__(self, model_channels, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(
+            model_channels, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(model_channels, out_channels, bias=False)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(model_channels, 2 * model_channels, bias=True)
+        )
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
+        x = modulate(self.norm_final(x), shift, scale)
+        o = self.linear(x)
+        return o
+class SimpleMLPAdaLN(nn.Module):
+    """
+    The MLP for Diffusion Loss.
+    :param in_channels: channels in the input Tensor.
+    :param model_channels: base channel count for the model.
+    :param out_channels: channels in the output Tensor.
+    :param z_channels: channels in the condition.
+    :param num_res_blocks: number of residual blocks per downsample.
+    """
+    def __init__(
+            self,
+            in_channels,
+            model_channels,
+            out_channels,
+            z_channels,
+            num_res_blocks,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.time_embed = TimestepEmbedder(model_channels)
+        self.cond_embed = nn.Linear(z_channels, model_channels)
+        self.input_proj = nn.Linear(in_channels, model_channels)
+        res_blocks = []
+        for i in range(num_res_blocks):
+            res_blocks.append(ResBlock(
+                model_channels,
+            ))
+        self.res_blocks = nn.ModuleList(res_blocks)
+        self.final_layer = FinalLayer(model_channels, out_channels)
+        self.initialize_weights()
+    def initialize_weights(self):
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP
+        nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers
+        for block in self.res_blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+    def forward(self, x, t, c):
+        """
+        Apply the model to an input batch.
+        :param x: an [N x C] Tensor of inputs.
+        :param t: a 1-D batch of timesteps.
+        :param c: conditioning from AR transformer.
+        :return: an [N x C] Tensor of outputs.
+        """
+        x = self.input_proj(x)
+        t = self.time_embed(t)
+        c = self.cond_embed(c)
+        y = t + c
+        for block in self.res_blocks:
+            x = block(x, y)
+        return self.final_layer(x, y)

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.40.1"
+}

modality_connector.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import os
+import einops
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.transforms import Resize
+from transformers import ViTImageProcessor, ViTModel, BertModel, ViTConfig, BertConfig
+from .configuration_aurora import AuroraConfig
+class VisionEncoder(nn.Module):
+    """
+    Encodes images (real or pseudo-images built from time series) with a frozen
+    ViT and distils the patch features into ``num_distill`` learnable tokens
+    through a cross-attention decoder.
+    """
+    # Directory holding the ViT config, resolved relative to this file.
+    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'vit_config')
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.processor = UnifiedImageProcessor(config)
+        # The ViT is built from its JSON config only; no weights are loaded here.
+        self.model = ViTModel(ViTConfig.from_json_file(os.path.join(self.config_path, 'config.json')))
+        # Freeze the ViT backbone.
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.hidden_size = self.model.config.hidden_size
+        self.output_dim = config.hidden_size
+        self.num_distill = config.num_distill
+        # Maps ViT features to the Aurora model width.
+        self.projection = nn.Linear(self.hidden_size, self.output_dim)
+        # Learnable query tokens, shape [num_distill, output_dim].
+        self.target_vision_tokens = nn.Parameter(torch.randn(self.num_distill, self.output_dim))
+        # Cross-attention layer
+        self.cross_vision = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(
+                d_model=config.hidden_size,
+                nhead=config.num_attention_heads,
+                dim_feedforward=config.intermediate_size,
+                dropout=config.dropout_rate,
+                batch_first=True,
+            ),
+            norm=nn.LayerNorm(config.hidden_size),
+            num_layers=config.num_vision_cross_layers,
+        )
+    def extract_vit_features(self, image_tensor):
+        """
+        Extract image features using ViT
+        Args:
+            image_tensor: Preprocessed image tensor with shape [batch_size, 3, H, W]
+        Returns:
+            cls_feature: [CLS] token feature with shape [batch_size, hidden_size]
+            patch_features: Features of all patches with shape [batch_size, num_patches, hidden_size]
+        """
+        outputs = self.model(pixel_values=image_tensor)
+        last_hidden_state = outputs.last_hidden_state
+        cls_feature = last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
+        patch_features = last_hidden_state[:, 1:, :]  # [batch_size, num_patches, hidden_size]
+        return cls_feature, patch_features
+    def forward(self, x, type='pseudo'):
+        """
+        Args:
+            x: Input forwarded to ``UnifiedImageProcessor``
+            type: 'pseudo' for time-series input, any other value for real images
+        Returns:
+            Distilled vision tokens with shape [batch_size, num_distill, output_dim]
+        """
+        x = self.processor(x, type=type)
+        _, patch_features = self.extract_vit_features(x)
+        patch_features = self.projection(patch_features)
+        # Tile the learnable queries over the batch; they attend to the patch features.
+        target_vision_tokens = self.target_vision_tokens.unsqueeze(0).repeat(patch_features.shape[0], 1, 1)
+        output_tokens = self.cross_vision(target_vision_tokens, patch_features)
+        return output_tokens  # [batch_size, num_distill, output_dim]
+class UnifiedImageProcessor(nn.Module):
+    """
+    Produces ViT-ready image tensors either from real images (via
+    ``ViTImageProcessor``) or from 2-D time-series batches that are folded
+    into single-channel "pseudo-images" by their dominant period.
+    """
+    # Directory holding the ViT preprocessor config, resolved relative to this file.
+    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'vit_config')
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        # Load ViT preprocessor to get pretrained normalization parameters and target size
+        self.vit_processor = ViTImageProcessor.from_json_file(os.path.join(self.config_path, 'preprocessor_config.json'))
+        self.target_size = self.vit_processor.size["height"]  # e.g., 224 (default ViT input size)
+        # Define resizer for pseudo-images (matches real image target size)
+        self.pseudo_resizer = Resize((self.target_size, self.target_size))
+        # Fallback period when no usable period is found.
+        self.token_len = config.token_len
+    def process_real_image(self, images):
+        """Process real images: automatic resizing, cropping, and normalization"""
+        # Directly use ViTImageProcessor to ensure consistency with pretraining pipeline
+        inputs = self.vit_processor(images=images, return_tensors="pt")
+        return inputs["pixel_values"]  # Shape: [batch_size, 3, H, W]
+    def _period_search(self, x):
+        """
+        Estimate one dominant period for the whole batch from the FFT amplitudes.
+        Returns a one-element NumPy array holding ``x.shape[1] // top_frequency_index``.
+        """
+        xf = torch.fft.rfft(x, dim=-1)
+        # find period by amplitudes
+        frequency_list = abs(xf).mean(0)
+        # Ignore the zero-frequency (DC) component.
+        frequency_list[0] = 0
+        _, top_list = torch.topk(frequency_list, 1)
+        top_list = top_list.detach().cpu().numpy()
+        # NOTE(review): if index 0 is selected (e.g. constant input) this is an integer
+        # division by zero in NumPy -- presumably yields 0 with a warning; the caller
+        # then falls back to token_len. Confirm.
+        period = x.shape[1] // top_list
+        return period
+    def process_pseudo_image(self, x):
+        """
+        Fold a batch of time series into pseudo-images: left-pad to a multiple of
+        the period, reshape to [batch, 1, period, num_periods], resize to the ViT
+        input size and repeat to 3 channels. No value normalisation is applied here.
+        """
+        # Segmentation
+        input_length = x.shape[-1]
+        period = list(self._period_search(x))[0]
+        # Use the detected period only when it is strictly inside (0, input_length).
+        period = period if 0 < period < input_length else self.token_len
+        # Only reachable through the token_len fallback; keeps the period within the input.
+        if period > input_length:
+            period = input_length
+        padding_length = (period - (input_length %
+                                            period)) % period
+        # Zero-pad on the left so the length becomes a multiple of the period.
+        x_pad = F.pad(x, (padding_length, 0))
+        x_2d = einops.rearrange(x_pad, 'b (p f) -> b 1 f p', f=period)
+        # 3. Render & Alignment
+        x_resize = self.pseudo_resizer(x_2d)
+        image_input = einops.repeat(x_resize, 'b 1 h w -> b c h w', c=3)
+        return image_input
+    def forward(self, x, type='pseudo'):
+        """Dispatch on ``type``: 'pseudo' -> time-series path, anything else -> real-image path."""
+        if type == 'pseudo':
+            return self.process_pseudo_image(x)
+        else:
+            return self.process_real_image(x)
+class TextEncoder(nn.Module):
+    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bert_config')
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.model = BertModel(BertConfig.from_json_file(os.path.join(self.config_path, 'config.json')))
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.hidden_size = self.model.config.hidden_size
+        self.output_dim = config.hidden_size
+        self.num_distill = config.num_distill
+        self.max_length = 125
+        self.projection = nn.Linear(self.hidden_size, self.output_dim)
+        # Define learnable target tokens (shape: [num_distill_tokens, hidden_size])
+        self.target_text_tokens = nn.Parameter(torch.randn(self.num_distill, self.output_dim))
+        self.cross_text = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(
+                d_model=config.hidden_size,
+                nhead=config.num_attention_heads,
+                dim_feedforward=config.intermediate_size,
+                dropout=config.dropout_rate,
+                batch_first=True,
+            ),
+            norm=nn.LayerNorm(config.hidden_size),
+            num_layers=config.num_text_cross_layers,
+        )
+    def extract_bert_features(self, input_dict):
+        """Extract and clean BERT features with fixed output shape"""
+        outputs = self.model(**input_dict)
+        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
+        cls_feature = last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
+        token_features = last_hidden_state
+        # Create mask to exclude [CLS], [SEP], and padding tokens
+        attention_mask = input_dict["attention_mask"]  # [batch_size, seq_len]
+        batch_size, seq_len = attention_mask.shape
+        valid_mask = torch.ones_like(attention_mask)
+        valid_mask[:, 0] = 0  # Exclude [CLS]
+        for i in range(batch_size):
+            sep_pos = torch.where(attention_mask[i] == 1)[0][-1]
+            valid_mask[i, sep_pos] = 0  # Exclude [SEP]
+        # Apply mask and get valid tokens
+        valid_token_mask = valid_mask.unsqueeze(-1).expand(-1, -1, self.hidden_size)
+        clean_token_features = token_features * valid_token_mask
+        # Convert to fixed shape [batch_size, max_valid_tokens, hidden_size]
+        fixed_features = torch.zeros(batch_size, self.max_length, self.hidden_size,
+                                     device=clean_token_features.device)
+        valid_counts = []
+        for i in range(batch_size):
+            # Get valid tokens (excluding zeros)
+            valid_tokens = clean_token_features[i][clean_token_features[i].sum(dim=1) != 0]
+            valid_count = valid_tokens.shape[0]
+            valid_counts.append(valid_count)
+            # Truncate if longer than max_length, else pad with zeros
+            if valid_count > self.max_length:
+                fixed_features[i] = valid_tokens[:self.max_length]
+            else:
+                fixed_features[i, :valid_count] = valid_tokens
+        return cls_feature, token_features, fixed_features, valid_counts
+    def forward(self, texts):
+        """Return fixed-shape token features [batch_size, max_valid_tokens, hidden_size]"""
+        _, _, fixed_features, _ = self.extract_bert_features(texts)
+        fixed_features = self.projection(fixed_features)
+        target_text_tokens = self.target_text_tokens.unsqueeze(0).repeat(fixed_features.shape[0], 1, 1)
+        output_tokens = self.cross_text(target_text_tokens, fixed_features)
+        return output_tokens
+class ModalityConnector(nn.Module):
+    def __init__(self, config: AuroraConfig):
+        """
+        Args:
+            hidden_size: Feature dimension (must match text/vision feature dimensions)
+            num_distill_tokens: Unified token count (constant N)
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Define learnable target tokens (shape: [num_distill_tokens, hidden_size])
+        self.connect_text = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(
+                d_model=config.hidden_size,
+                nhead=config.num_attention_heads,
+                dim_feedforward=config.intermediate_size,
+                dropout=config.dropout_rate,
+                batch_first=True,
+            ),
+            norm=nn.LayerNorm(config.hidden_size),
+            num_layers=config.num_text_connect_layers,
+        )
+        self.connect_vision = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(
+                d_model=config.hidden_size,
+                nhead=config.num_attention_heads,
+                dim_feedforward=config.intermediate_size,
+                dropout=config.dropout_rate,
+                batch_first=True,
+            ),
+            norm=nn.LayerNorm(config.hidden_size),
+            num_layers=config.num_vision_connect_layers,
+        )
+    def forward(self, x, text_features, vision_features):
+        """
+        Distill text and vision tokens to the same count N
+        Args:
+            x: Time Series with shape [batch_size, n, hidden_size] (n is time series token count)
+            text_features: Text features with shape [batch_size, T, hidden_size] (T is text token count)
+            vision_features: Vision features with shape [batch_size, V, hidden_size] (V is vision token count)
+        Returns:
+            text_distilled: Distilled text tokens with shape [batch_size, N, hidden_size]
+            vision_distilled: Distilled vision tokens with shape [batch_size, N, hidden_size]
+        """
+        if text_features is not None:
+            from_text = self.connect_text(
+                x,
+                text_features
+            )
+        else:
+            from_text = None
+        from_vision = self.connect_vision(
+            x,
+            vision_features
+        )
+        return from_text, from_vision

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df2fb96852a59515a14552d5bddc35c03588b6a8bea69355984b3dd926a72b58
+size 843564328

modeling_aurora.py ADDED Viewed

	@@ -0,0 +1,636 @@

+import random
+from typing import Optional, Tuple, Union
+import math
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import MoeModelOutputWithPast, MoeCausalLMOutputWithPast
+from .configuration_aurora import AuroraConfig
+from .flow_loss import FlowLoss
+from .modality_connector import ModalityConnector, VisionEncoder, TextEncoder
+from .prototype_retriever import PrototypeRetriever
+from .ts_generation_mixin import TSGenerationMixin
+from .util_functions import resample, Transpose, causal_attention_mask, RoPE_decoder
+class AuroraPatchEmbedding(nn.Module):
+    """
+    Splits a series into non-overlapping patches and projects each patch to
+    ``hidden_size``. In training mode it additionally embeds frequency-masked
+    copies of the series; in eval mode it supports a different patch length by
+    resampling the projection weight.
+    """
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.proj_layer = nn.Linear(config.token_len, config.hidden_size, bias=False)
+        self.token_len = config.token_len
+        self.threshold_ratio = config.threshold_ratio
+        self.mask_ratio = config.mask_ratio
+    def _freq_masking(self, x):
+        """
+        Build one frequency-masked copy of ``x`` per entry of ``threshold_ratio``
+        and stack the copies along the batch axis (result: [(s * b), l]).
+        Indexing assumes ``x`` is 2-D [batch, length].
+        """
+        x_fft = torch.fft.rfft(x, dim=-1)
+        x_ifft_list = []
+        for ratio in self.threshold_ratio:
+            temp = x_fft.clone()
+            truncation = int(temp.shape[-1] * ratio)
+            # One random draw per ratio decides the mode for the whole batch:
+            # zero the bins below the cut-off, or zero the bins from the cut-off up.
+            if random.random() > self.mask_ratio:
+                temp[:, :truncation] = 0
+            else:
+                temp[:, truncation:] = 0
+            x_ifft = torch.fft.irfft(temp, dim=-1)
+            x_ifft_list.append(x_ifft)
+        x_ifft = torch.stack(x_ifft_list, dim=0)
+        return rearrange(x_ifft, 's b l -> (s b) l')
+    def _predict(self, x, inference_token_len=48):
+        """
+        Eval-mode embedding with patch length ``inference_token_len``.
+        Returns ``(embeddings, None)`` so the shape of the result matches ``forward``.
+        """
+        input_length = x.shape[-1]
+        padding_length = (inference_token_len - (input_length %
+                                                 inference_token_len)) % inference_token_len
+        # Zero-pad on the left so the length becomes a multiple of the patch length.
+        x = F.pad(x, (padding_length, 0))
+        x = x.unfold(dimension=-1, size=inference_token_len,
+                     step=inference_token_len)
+        # Adapt the projection weight to the requested patch length (see ``resample``);
+        # ``.data`` is used, so no gradient flows to the weight on this path.
+        resampled_weight = resample(old=self.proj_layer.weight.data, new_patch_len=inference_token_len)
+        output = F.linear(x, resampled_weight)
+        return output, None
+    def forward(self, x, inference_token_len=48):
+        """
+        :param x: series with time on the last dimension.
+        :param inference_token_len: patch length used in eval mode only.
+        :return: ``(output_origin, output_masked)`` in training mode,
+                 ``(output, None)`` in eval mode.
+        """
+        if not self.training:
+            return self._predict(x, inference_token_len)
+        input_length = x.shape[-1]
+        padding_length = (self.token_len - (input_length %
+                                            self.token_len)) % self.token_len
+        # Zero-pad on the left so the length becomes a multiple of token_len.
+        x = F.pad(x, (padding_length, 0))
+        x_masked = self._freq_masking(x)
+        x_origin = x.unfold(dimension=-1, size=self.token_len,
+                            step=self.token_len)
+        output_origin = self.proj_layer(x_origin)
+        x_masked = x_masked.unfold(dimension=-1, size=self.token_len,
+                                   step=self.token_len)
+        output_masked = self.proj_layer(x_masked)
+        return output_origin, output_masked
+class AuroraAttention(nn.Module):
+    """
+    Multi-head attention usable as self- or cross-attention, with optional
+    rotary position encoding on queries and keys and an optional additive bias.
+    """
+    def __init__(self, config: AuroraConfig, layer_idx: Optional[int] = None, rope: bool = False):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.attention_dropout = config.dropout_rate
+        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
+        # When True, RoPE_decoder is applied to queries and keys in forward.
+        self.rope = rope
+    def _scaled_dot_product_attention(self, Q, K, V, bias=None, attn_mask=None):
+        """
+        Scaled dot-product attention.
+        :param attn_mask: boolean mask (True = position is blocked) or an additive float mask.
+        :param bias: optional additive bias; tiled along dim 0 when the scores have a larger batch.
+        :return: ``(attn_output, attn_scores)`` where ``attn_scores`` are the
+                 masked/biased scores BEFORE softmax.
+        """
+        attn_scores = torch.matmul(Q, K.transpose(-2, -1))
+        attn_scores = attn_scores / math.sqrt(Q.size(-1))
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_scores = attn_scores.masked_fill(attn_mask, float('-inf'))
+            else:
+                attn_scores = attn_scores + attn_mask
+        if bias is not None:
+            if attn_scores.shape[0] > bias.shape[0]:
+                bias = bias.repeat(attn_scores.shape[0] // bias.shape[0], 1, 1, 1)
+            attn_scores += bias
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        # Dropout on the attention weights, training mode only.
+        if self.attention_dropout > 0.0 and self.training:
+            attn_weights = F.dropout(attn_weights, p=self.attention_dropout)
+        attn_output = torch.matmul(attn_weights, V)
+        return attn_output, attn_scores
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            key_embedding: torch.Tensor = None,
+            value_embedding: torch.Tensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            output_attentions: bool = False,
+            bias: torch.Tensor = None,
+            **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        :param hidden_states: query source, [batch, q_len, hidden_size].
+        :param key_embedding: key source; defaults to ``hidden_states`` (self-attention).
+        :param value_embedding: value source; defaults to ``hidden_states``.
+        :param attention_mask: forwarded to ``_scaled_dot_product_attention``.
+        :param output_attentions: when False the second return value is None.
+        :param bias: optional additive attention bias.
+        :return: ``(attn_output, attn_scores or None)``; ``attn_output`` is
+                 [batch, q_len, hidden_size].
+        """
+        bsz, q_len, _ = hidden_states.size()
+        if key_embedding is None:
+            key_embedding = hidden_states
+        if value_embedding is None:
+            value_embedding = hidden_states
+        _, k_len, _ = key_embedding.size()
+        _, v_len, _ = value_embedding.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(key_embedding)
+        value_states = self.v_proj(value_embedding)
+        # Split heads: [batch, len, hidden] -> [batch, heads, len, head_dim].
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, k_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, v_len, self.num_heads, self.head_dim).transpose(1, 2)
+        if self.rope:
+            query_states, key_states = RoPE_decoder(query_states, key_states)
+        attn_output, attn_scores = self._scaled_dot_product_attention(
+            Q=query_states, K=key_states, V=value_states, bias=bias,
+            attn_mask=attention_mask)
+        # Merge heads back: [batch, heads, q_len, head_dim] -> [batch, q_len, hidden].
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_scores = None
+        return attn_output, attn_scores
+class AuroraFFN(nn.Module):
+    def __init__(self, hidden_size: int, intermediate_size: int, hidden_act: str):
+        super().__init__()
+        self.ffn = nn.Sequential(nn.Linear(hidden_size, intermediate_size),
+                                 ACT2FN[hidden_act],
+                                 nn.Linear(intermediate_size, hidden_size))
+    def forward(self, hidden_state):
+        return self.ffn(hidden_state)
+class AuroraDecoderLayer(nn.Module):
+    def __init__(self, config: AuroraConfig, layer_idx: int):
+        super().__init__()
+        self.self_attn = AuroraAttention(config, layer_idx, rope=False)
+        self.cross_attn = AuroraAttention(config, layer_idx, rope=True)
+        self.ffn_layer = AuroraFFN(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act
+        )
+        if config.norm_mode == 'batch':
+            self.norm1 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.hidden_size), Transpose(1, 2))
+            self.norm2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.hidden_size), Transpose(1, 2))
+            self.norm3 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.hidden_size), Transpose(1, 2))
+        else:
+            self.norm1 = torch.nn.LayerNorm(config.hidden_size)
+            self.norm2 = torch.nn.LayerNorm(config.hidden_size)
+            self.norm3 = torch.nn.LayerNorm(config.hidden_size)
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            cross_states: torch.Tensor,
+            output_attentions: Optional[bool] = False,
+            **kwargs,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        residual = hidden_states
+        num_token = hidden_states.shape[1]
+        attention_mask = causal_attention_mask(num_token).to(hidden_states.device)
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        x_attn = residual + self.norm1(hidden_states)
+        x_cross, cross_attn_weights = self.cross_attn(hidden_states=x_attn, key_embedding=cross_states,
+                                                      value_embedding=cross_states)
+        x_cross = self.norm2(x_cross) + x_attn
+        # Fully Connected
+        output_states = self.ffn_layer(x_cross)
+        output_states = self.norm3(output_states) + x_cross
+        if not output_attentions:
+            self_attn_weights = None
+            cross_attn_weights = None
+        return output_states, self_attn_weights, cross_attn_weights
+class AuroraEncoderLayer(nn.Module):
+    def __init__(self, config: AuroraConfig, layer_idx: int):
+        super().__init__()
+        self.self_attn = AuroraAttention(config, layer_idx, rope=False)
+        self.ffn_layer = AuroraFFN(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act
+        )
+        if config.norm_mode == 'batch':
+            self.norm1 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.hidden_size), Transpose(1, 2))
+            self.norm2 = nn.Sequential(Transpose(1, 2), nn.BatchNorm1d(config.hidden_size), Transpose(1, 2))
+        else:
+            self.norm1 = torch.nn.LayerNorm(config.hidden_size)
+            self.norm2 = torch.nn.LayerNorm(config.hidden_size)
+        self.dropout_1 = nn.Dropout(config.dropout_rate)
+        self.dropout_2 = nn.Dropout(config.dropout_rate)
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            output_attentions: Optional[bool] = False,
+            bias: torch.Tensor = None,
+            **kwargs
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        residual = hidden_states
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            output_attentions=output_attentions,
+            bias=bias
+        )
+        x_attn = self.norm1(residual + self.dropout_1(hidden_states))
+        # Fully Connected
+        output_states = self.ffn_layer(x_attn)
+        output_states = self.norm2(self.dropout_2(output_states) + x_attn)
+        if not output_attentions:
+            self_attn_weights = None
+        return output_states, self_attn_weights
+class AuroraPredictHead(nn.Module):
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.output_proj = nn.Linear(config.hidden_size, config.token_len, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+    def _predict(self, hidden_states: torch.Tensor, inference_token_len=48):
+        resampled_weight = resample(old=self.output_proj.weight.data.T, new_patch_len=inference_token_len).T
+        output = F.linear(hidden_states, resampled_weight)
+        return output
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            inference_token_len: int = 48,
+            **kwargs
+    ) -> torch.FloatTensor:
+        if not self.training:
+            return self._predict(hidden_states, inference_token_len)
+        return self.output_proj(self.dropout(hidden_states))
+class AuroraPreTrainedModel(PreTrainedModel):
+    """Base class that ties ``AuroraConfig`` to ``PreTrainedModel`` through class-level flags."""
+    # Configuration class associated with this model family.
+    config_class = AuroraConfig
+    # Attribute name under which the backbone is stored on task models.
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # Layer classes that must not be split across devices.
+    _no_split_modules = ["AuroraEncoderLayer", "AuroraDecoderLayer"]
+    # NOTE(review): no flash-attention code path is visible in this part of the file - confirm this flag.
+    _supports_flash_attn_2 = True
+    _supports_sdpa = False
+    _supports_cache_class = False
+class AuroraModel(nn.Module):
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.embed_layer = AuroraPatchEmbedding(config)
+        self.enc_layers = nn.ModuleList(
+            [AuroraEncoderLayer(config, layer_idx)
+             for layer_idx in range(config.num_enc_layers)]
+        )
+        self.dec_layers = nn.ModuleList(
+            [AuroraDecoderLayer(config, layer_idx)
+             for layer_idx in range(config.num_dec_layers)]
+        )
+        self.mask_num = len(config.threshold_ratio)
+        self.gradient_checkpointing = False
+        self.VisionEncoder = VisionEncoder(config)
+        self.TextEncoder = TextEncoder(config)
+        self.ModalityConnector = ModalityConnector(config)
+        self.VisionGuider = AuroraAttention(config)
+        self.TextGuider = AuroraAttention(config)
+        self.W = nn.Parameter(torch.eye(config.num_distill))
+        self.fuse = nn.Linear(config.hidden_size, config.hidden_size)
+    def forward(
+            self,
+            input_ids: torch.FloatTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            text_input_ids: Optional[torch.FloatTensor] = None,
+            text_attention_mask: Optional[torch.FloatTensor] = None,
+            text_token_type_ids: Optional[torch.FloatTensor] = None,
+            vision_ids: Optional[torch.FloatTensor] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            predict_token_num: Optional[int] = None,
+            inference_token_len: Optional[int] = None,
+    ) -> Union[Tuple, MoeModelOutputWithPast]:
+        # input_ids is the input of time series, its shape is [batch_size, seq_len]
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        if inference_token_len is None:
+            inference_token_len = self.config.token_len
+        masked_embeds = None
+        if inputs_embeds is None:
+            inputs_embeds, masked_embeds = self.embed_layer(input_ids, inference_token_len)
+        if masked_embeds is None:
+            x_enc = inputs_embeds
+        else:
+            x_enc = torch.concat([inputs_embeds, masked_embeds], dim=0)
+        if vision_ids is not None:
+            vision_features = self.VisionEncoder(vision_ids, type='real')
+        else:
+            vision_features = self.VisionEncoder(input_ids, type='pseudo')
+        _, attn_vision = self.VisionGuider(
+            inputs_embeds,
+            vision_features,
+            vision_features,
+            output_attentions=True
+        )
+        if text_input_ids is not None:
+            text_features = self.TextEncoder({'input_ids': text_input_ids, 'attention_mask': text_attention_mask,
+                                              'token_type_ids': text_token_type_ids})
+            _, attn_text = self.TextGuider(
+                inputs_embeds,
+                text_features,
+                text_features,
+                output_attentions=True
+            )
+        else:
+            text_features = None
+            attn_text = None
+        if attn_text is not None:
+            guided_bias = torch.einsum("bhik,kl,bhjl->bhij", attn_vision, self.W, attn_text)
+        else:
+            guided_bias = None
+        # encoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        for encoder_layer in self.enc_layers:
+            if output_hidden_states:
+                all_hidden_states += (x_enc,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer.__call__,
+                    x_enc,
+                    output_attentions,
+                    guided_bias
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    x_enc,
+                    output_attentions=output_attentions,
+                    bias=guided_bias
+                )
+            x_enc = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        if x_enc.shape[0] > batch_size:
+            x_enc, x_rec = torch.split(x_enc, [batch_size, x_enc.shape[0] - batch_size], dim=0)
+            x_rec = rearrange(x_rec, '(s b) n d -> s b n d', s=self.mask_num)
+            x_rec = x_rec.mean(0)
+        else:
+            x_rec = None
+        decay_weights = 0.5 ** torch.arange(predict_token_num)
+        decay_weights = decay_weights.unsqueeze(0).unsqueeze(-1).to(x_enc.device)
+        from_text, from_vision = self.ModalityConnector(x_enc, text_features, vision_features)
+        if from_text is not None:
+            x_enc = x_enc + self.fuse(from_vision + from_text)
+        else:
+            x_enc = x_enc + self.fuse(from_vision)
+        last_token = x_enc[:, -1:, :]
+        x_dec = decay_weights * last_token.repeat(1, predict_token_num, 1)
+        # decoder layers
+        for decoder_layer in self.dec_layers:
+            if output_hidden_states:
+                all_hidden_states += (x_dec,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    x_dec,
+                    x_enc,
+                    output_attentions=output_attentions,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    x_dec,
+                    x_enc,
+                    output_attentions=output_attentions
+                )
+            x_dec = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (x_dec,)
+        if not return_dict:
+            return tuple(
+                v
+                for v in [x_dec, all_hidden_states, all_self_attns]
+                if v is not None
+            )
+        output_states = (x_rec, x_dec, from_text, from_vision)
+        return MoeModelOutputWithPast(
+            last_hidden_state=output_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class AuroraForPrediction(AuroraPreTrainedModel, TSGenerationMixin):
+    def __init__(self, config: AuroraConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = AuroraModel(config)
+        self.point_loss = torch.nn.MSELoss(reduction='none')
+        self.flow_match = FlowLoss(config.token_len, config.hidden_size, config.flow_loss_depth, config.hidden_size,
+                                   config.num_sampling_steps)
+        self.linear_head = AuroraPredictHead(config)
+        self.retriever = PrototypeRetriever(config)
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def forward(
+            self,
+            input_ids: torch.FloatTensor = None,
+            text_input_ids: torch.FloatTensor = None,
+            text_attention_mask: torch.FloatTensor = None,
+            text_token_type_ids: torch.FloatTensor = None,
+            vision_ids: torch.FloatTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.FloatTensor] = None,
+            loss_masks: Optional[torch.FloatTensor] = None,
+            mask_y: Optional[torch.FloatTensor] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            max_output_length: Optional[int] = None,
+            revin: Optional[bool] = True,
+            num_samples: Optional[int] = 1,
+            inference_token_len: Optional[int] = 48,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            if max_output_length is None:
+                max_output_length = labels.shape[1]
+            predict_token_num = math.ceil(max_output_length / self.config.token_len)
+        else:
+            predict_token_num = math.ceil(max_output_length / inference_token_len)
+        if revin:
+            means = input_ids.mean(1, keepdim=True).detach()
+            stdev = input_ids.std(dim=1, keepdim=True, unbiased=False).detach() + 1e-5
+            input_ids = (input_ids - means) / stdev
+        outputs = self.model(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            text_input_ids=text_input_ids,
+            text_attention_mask=text_attention_mask,
+            text_token_type_ids=text_token_type_ids,
+            vision_ids=vision_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            predict_token_num=predict_token_num,
+            inference_token_len=inference_token_len
+        )
+        hidden_states = outputs[0] if not return_dict else outputs.last_hidden_state
+        x_rec, x_dec, from_text, from_vision = hidden_states
+        if from_text is not None:
+            generated_prototypes = self.retriever(from_text + from_vision, predict_token_num)
+        else:
+            generated_prototypes = self.retriever(from_vision, predict_token_num)
+        loss = None
+        predictions = None
+        eps = 1e2
+        mask = None
+        if labels is not None:
+            if revin:
+                origin_labels = labels
+                labels = (labels - means) / stdev
+            origin_length = labels.shape[-1]
+            target_length = predict_token_num * self.config.token_len
+            if origin_length < target_length:
+                pad_length = target_length - origin_length
+                labels = F.pad(labels, (0, pad_length))
+                mask = torch.tensor([1] * origin_length + [0] * pad_length, device=labels.device)
+                mask = mask.unsqueeze(0)
+            reco = rearrange(self.linear_head(x_rec), 'b n p -> b (n p)')
+            fore = rearrange(self.linear_head(x_dec), 'b n p -> b (n p)')
+            if revin:
+                fore = fore * stdev + means
+            reco_loss = self.point_loss(reco[:, :input_ids.shape[-1]], input_ids)
+            fore_loss = self.point_loss(fore[:, :origin_length], origin_labels)
+            reco_loss = reco_loss[reco_loss < eps]
+            fore_loss = fore_loss[fore_loss < eps]
+            point_loss = reco_loss.mean() + fore_loss.mean()
+            shift_labels = labels.unfold(
+                dimension=-1, size=self.config.token_len, step=self.config.token_len)
+            bsz, L, _ = shift_labels.shape
+            shift_labels = shift_labels.reshape(
+                bsz * L, -1).repeat(self.config.diffusion_batch_mul, 1)
+            x_dec = x_dec.reshape(
+                bsz * L, -1).repeat(self.config.diffusion_batch_mul, 1)
+            protos = generated_prototypes.reshape(bsz * L, -1).repeat(self.config.diffusion_batch_mul, 1)
+            flow_loss = self.flow_match(target=shift_labels, z=x_dec.detach(), prototype=protos, eps=eps, mask=mask)
+            loss = point_loss + flow_loss
+        else:
+            predictions = self.flow_match.sample(z=rearrange(x_dec, 'b n d -> (b n) d'),
+                                                 prototype=rearrange(generated_prototypes, 'b n p -> (b n) p'),
+                                                 num_samples=num_samples,
+                                                 inference_token_len=inference_token_len)
+            predictions = rearrange(predictions, '(b n) s p -> b s (n p)', n=predict_token_num)[:, :,
+            :max_output_length]
+            if revin:
+                stdev = stdev.unsqueeze(1).repeat(1, num_samples, 1)
+                means = means.unsqueeze(1).repeat(1, num_samples, 1)
+                predictions = (predictions * stdev) + means
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            logits=predictions,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

prototype_retriever.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import numpy as np
+import torch
+import torch.nn as nn
+from .configuration_aurora import AuroraConfig
+from .util_functions import sinusoidal_position_embedding, causal_attention_mask
+class PrototypeRetriever(nn.Module):
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.num_prototypes = config.num_prototypes
+        self.token_len = config.token_len
+        # Define the learnable prototype parameter container.
+        # Initialize an empty Parameter first, to be filled in _initialize_prototypes.
+        self.prototypes = nn.Parameter(torch.empty(self.num_prototypes, self.token_len))
+        # Initialize prototypes using the new logic
+        self._initialize_prototypes()
+        self.retriever = Retriever(config)
+    def _initialize_prototypes(self, random_seed=42):
+        """
+        Initialize prototype parameters using diverse function generators.
+        Adapted from the generate_prototypes logic to fit the class structure.
+        """
+        # Set random seed for reproducibility
+        np.random.seed(random_seed)
+        length = self.token_len
+        # Create time series x, range from 0 to 10
+        x = np.linspace(0, 10, length)
+        prototypes_list = []
+        # --- Define internal generation functions ---
+        def generate_sin():
+            """Generate sine function features"""
+            freq = np.random.uniform(0.3, 2.0)
+            amp = np.random.uniform(0.5, 2.0)
+            phase = np.random.uniform(0, np.pi)
+            return amp * np.sin(freq * x + phase)
+        def generate_cos():
+            """Generate cosine function features"""
+            freq = np.random.uniform(0.3, 2.0)
+            amp = np.random.uniform(0.5, 2.0)
+            phase = np.random.uniform(0, np.pi)
+            return amp * np.cos(freq * x + phase)
+        def generate_log():
+            """Generate logarithmic function features (trend)"""
+            # Ensure x is positive, suitable for log function
+            x_log = x + np.random.uniform(0.5, 2.0)
+            slope = np.random.uniform(0.3, 1.5)
+            offset = np.random.uniform(-2.0, 2.0)
+            return slope * np.log(x_log) + offset
+        def generate_exponential():
+            """Generate exponential function features (trend)"""
+            # Can be positive or negative, allowing growth or decay
+            growth = np.random.uniform(-0.3, 0.3)
+            amp = np.random.uniform(0.5, 2.0)
+            return amp * np.exp(growth * x)
+        def generate_linear():
+            """Generate linear function features (trend)"""
+            slope = np.random.uniform(-1.0, 1.0)
+            intercept = np.random.uniform(-2.0, 2.0)
+            return slope * x + intercept
+        def generate_combination():
+            """Generate combined features from multiple functions"""
+            # Generate weights that sum to 1
+            weights = np.random.dirichlet(np.ones(3))
+            func1 = generate_sin()
+            func2 = generate_linear()
+            # Randomly select the third component
+            func3 = generate_exponential() if np.random.random() > 0.5 else generate_log()
+            return weights[0] * func1 + weights[1] * func2 + weights[2] * func3
+        # Function types and their probability distributions
+        functions = [
+            (generate_sin, 0.2),
+            (generate_cos, 0.2),
+            (generate_log, 0.15),
+            (generate_exponential, 0.15),
+            (generate_linear, 0.1),
+            (generate_combination, 0.2)
+        ]
+        # Extract functions and corresponding probabilities
+        funcs, probs = zip(*functions)
+        # --- Prototype generation loop ---
+        for _ in range(self.num_prototypes):
+            # Randomly select function type based on probability
+            func = np.random.choice(funcs, p=probs)
+            prototype = func()
+            # Add some noise
+            noise_level = np.random.uniform(0.05, 0.2)
+            noise = np.random.normal(0, noise_level, length)
+            prototype += noise
+            prototypes_list.append(prototype)
+        # Convert to Numpy array
+        prototypes_np = np.array(prototypes_list)
+        # --- Key step: Convert to Tensor and assign to Parameter ---
+        # 1. Convert to Tensor
+        # 2. Convert to float32 (numpy defaults to float64, PyTorch typically uses float32)
+        # 3. Use .data.copy_ to fill nn.Parameter, maintaining the gradient tracking mechanism
+        tensor_data = torch.from_numpy(prototypes_np).float()
+        self.prototypes.data.copy_(tensor_data)
+    def forward(self, x, output_token_len):
+        """
+        Args:
+            x: Input representation with shape [B, k, d]
+        Returns:
+            synthetic_protos: [B, F, p] (Normalized)
+        """
+        # Calculate distribution [B, F, M]
+        dist = self.retriever(x, output_token_len)
+        # Weighted combination of prototypes [B, F, p]
+        synthetic_protos = torch.matmul(dist, self.prototypes)
+        # Normalize
+        # Note: Since the new initialization logic generates values with larger ranges and noise,
+        # Instance Normalization here is crucial for output stability.
+        mean = synthetic_protos.mean(dim=-1, keepdim=True).detach()
+        std = synthetic_protos.std(dim=-1, keepdim=True).detach() + 1e-5
+        synthetic_protos = (synthetic_protos - mean) / std
+        return synthetic_protos
+class Retriever(nn.Module):
+    def __init__(self, config: AuroraConfig):
+        super().__init__()
+        self.input_emb = nn.Sequential(nn.LayerNorm(config.hidden_size),
+                                       nn.Linear(config.hidden_size, config.hidden_size))
+        self.encoder = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                d_model=config.hidden_size,
+                nhead=config.num_attention_heads,
+                dim_feedforward=config.intermediate_size,
+                dropout=config.dropout_rate,
+                batch_first=True,
+            ),
+            norm=nn.LayerNorm(config.hidden_size),
+            num_layers=config.num_retriever_enc_layers,
+        )
+        self.decoder = nn.TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                d_model=config.hidden_size,
+                nhead=config.num_attention_heads,
+                dim_feedforward=config.intermediate_size,
+                dropout=config.dropout_rate,
+                batch_first=True,
+            ),
+            norm=nn.LayerNorm(config.hidden_size),
+            num_layers=config.num_retriever_dec_layers,
+        )
+        self.head = nn.Sequential(
+            nn.Linear(config.hidden_size, config.intermediate_size),  # Combine context and position information
+            nn.LayerNorm(config.intermediate_size),
+            nn.SiLU(),
+            nn.Dropout(config.dropout_rate),
+            nn.Linear(config.intermediate_size, config.num_prototypes),  # Predict prototype distribution
+            nn.Softmax(dim=-1)
+        )
+        self.hidden_size = config.hidden_size
+    def forward(self, x, output_token_len):
+        x_encoded = self.input_emb(x)
+        enc_attn_mask = causal_attention_mask(x.shape[1]).to(x.device)
+        enc_output = self.encoder(x_encoded, mask=enc_attn_mask.squeeze(0).squeeze(0))  # Shape: [B, k, d]
+        enc_output = enc_output[:, -1:, :]
+        dec = enc_output.repeat(1, output_token_len, 1)
+        pos_embeds = sinusoidal_position_embedding(
+            batch_size=dec.shape[0], num_heads=1,
+            max_len=output_token_len, output_dim=self.hidden_size,
+            device=dec.device).squeeze(1)
+        embeds = dec + pos_embeds
+        dec_attn_mask = causal_attention_mask(output_token_len).to(x.device)
+        dec_output = self.decoder(embeds, mask=dec_attn_mask.squeeze(0).squeeze(0))
+        dist = self.head(dec_output)  # Shape: [B, F, M]
+        return dist

ts_generation_mixin.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+from typing import Any, Dict, List, Optional, Union, Callable
+import torch
+from transformers import BertTokenizer
+from transformers import GenerationMixin, LogitsProcessorList, StoppingCriteriaList
+from transformers.generation.utils import GenerationConfig, GenerateOutput
+from transformers.utils import ModelOutput
+class TSGenerationMixin(GenerationMixin):
+    tokenizer = BertTokenizer.from_pretrained(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bert_config'))
+    @torch.no_grad()
+    def generate(
+            self,
+            inputs: Optional[torch.Tensor] = None,
+            text_inputs=None,
+            text_input_ids: Optional[torch.Tensor] = None,
+            text_attention_mask: Optional[torch.Tensor] = None,
+            text_token_type_ids: Optional[torch.Tensor] = None,
+            vision_inputs: Optional[torch.Tensor] = None,
+            generation_config: Optional[GenerationConfig] = None,
+            logits_processor: Optional[LogitsProcessorList] = None,
+            stopping_criteria: Optional[StoppingCriteriaList] = None,
+            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+            synced_gpus: Optional[bool] = None,
+            assistant_model: Optional["PreTrainedModel"] = None,
+            streamer: Optional["BaseStreamer"] = None,
+            negative_prompt_ids: Optional[torch.Tensor] = None,
+            negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+            revin: Optional[bool] = True,
+            num_samples: Optional[int] = 1,
+            max_output_length: Optional[int] = 96,
+            inference_token_len: Optional[int] = None,
+            max_text_token_length: Optional[int] = 125,
+            **kwargs,
+    ) -> Union[GenerateOutput, torch.Tensor]:
+        if len(inputs.shape) != 2:
+            raise ValueError('Input shape must be: [batch_size, seq_len]')
+        if revin:
+            means = inputs.mean(dim=-1, keepdim=True)
+            stdev = inputs.std(dim=-1, keepdim=True, unbiased=False) + 1e-5
+            inputs = (inputs - means) / stdev
+        if text_inputs is not None:
+            tokenized_text = self._tokenize(text_inputs, max_length=max_text_token_length)
+            text_input_ids = tokenized_text['input_ids'].squeeze(0)
+            text_attention_mask = tokenized_text['attention_mask'].squeeze(0)
+            text_token_type_ids = tokenized_text.get('token_type_ids', torch.zeros_like(text_input_ids)).squeeze(0)
+        model_inputs = self.prepare_inputs_for_generation(
+            inputs,
+            text_input_ids=text_input_ids,
+            text_attention_mask=text_attention_mask,
+            text_token_type_ids=text_token_type_ids,
+            vision_inputs=vision_inputs,
+            generation_config=generation_config,
+            max_output_length=max_output_length,
+            inference_token_len=inference_token_len,
+            **kwargs
+        )
+        outputs = self(**model_inputs, return_dict=True, revin=False, num_samples=num_samples)
+        predictions = outputs.logits
+        if revin:
+            stdev = stdev.unsqueeze(1).repeat(1, num_samples, 1)
+            means = means.unsqueeze(1).repeat(1, num_samples, 1)
+            predictions = (predictions * stdev) + means
+        return predictions
+    def prepare_inputs_for_generation(
+            self,
+            inputs: torch.Tensor,
+            text_input_ids: Optional[torch.Tensor] = None,
+            text_attention_mask: Optional[torch.Tensor] = None,
+            text_token_type_ids: Optional[torch.Tensor] = None,
+            vision_inputs: Optional[torch.Tensor] = None,
+            generation_config: Optional[GenerationConfig] = None,
+            max_output_length: Optional[int] = None,
+            inference_token_len: Optional[int] = None,
+            **kwargs
+    ):
+        return {
+            "input_ids": inputs,
+            "text_input_ids": text_input_ids,
+            "text_attention_mask": text_attention_mask,
+            "text_token_type_ids": text_token_type_ids,
+            "vision_ids": vision_inputs,
+            "max_output_length": max_output_length,
+            "inference_token_len": inference_token_len,
+            **kwargs
+        }
+    def _tokenize(self, texts, max_length):
+        return self.tokenizer(
+            texts,
+            padding='max_length',
+            truncation=True,
+            max_length=max_length,
+            return_tensors="pt"
+        )
+    def _update_model_kwargs_for_generation(
+            self,
+            outputs: ModelOutput,
+            model_kwargs: Dict[str, Any],
+            horizon_length: int = 1,
+            is_encoder_decoder: bool = False,
+            standardize_cache_format: bool = False,
+    ) -> Dict[str, Any]:
+        return model_kwargs

util_functions.py ADDED Viewed

	@@ -0,0 +1,154 @@

+from typing import Tuple
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def resize(x_tensor, new_shape):
+    return F.interpolate(x_tensor.unsqueeze(0), size=new_shape, mode='linear').squeeze(0)
+def resample(old: torch.Tensor, new_patch_len: int):
+    assert old.dim() == 2, "the size of input tensor should be (d_model, patch_size)"
+    if old.size(1) == new_patch_len:
+        return old
+    old = old.T
+    old_shape = old.size(0)
+    factor = new_patch_len / old_shape
+    basis_vectors = torch.eye(old_shape, dtype=torch.get_default_dtype(), device=old.device)
+    resize_mat = resize(basis_vectors, new_patch_len).T
+    resize_mat_pinv = torch.linalg.pinv(resize_mat.T)
+    resampled_kernels = resize_mat_pinv @ old * math.sqrt(factor)
+    return resampled_kernels.T
+def RoPE(query: torch.Tensor, key: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply Rotary Position Embedding (RoPE) to the query and key tensors.
+    Args:
+        query (torch.Tensor): Query tensor with shape (bs, head, max_len, output_dim).
+        key (torch.Tensor): Key tensor with shape (bs, head, max_len, output_dim).
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Query and key tensors after applying RoPE.
+    """
+    # Get the shape information of the input tensors
+    batch_size, num_heads, max_len, output_dim = query.shape
+    # Generate sinusoidal position embeddings
+    pos_emb = sinusoidal_position_embedding(batch_size, num_heads, max_len, output_dim, query.device, factor=1)
+    # Extract cosine and sine position embeddings
+    cos_pos = pos_emb[..., 1::2].repeat_interleave(2, dim=-1)
+    sin_pos = pos_emb[..., ::2].repeat_interleave(2, dim=-1)
+    # Apply RoPE to the query tensor
+    query_rot = torch.stack([-query[..., 1::2], query[..., ::2]], dim=-1).reshape(query.shape)
+    query = query * cos_pos + query_rot * sin_pos
+    # Apply RoPE to the key tensor
+    key_rot = torch.stack([-key[..., 1::2], key[..., ::2]], dim=-1).reshape(key.shape)
+    key = key * cos_pos + key_rot * sin_pos
+    return query, key
+def RoPE_decoder(query: torch.Tensor, key: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply Rotary Position Embedding (RoPE) to the query and key tensors in the decoder.
+    Args:
+        query (torch.Tensor): Query tensor with shape (bs, head, q_max_len, output_dim).
+        key (torch.Tensor): Key tensor with shape (bs, head, k_max_len, output_dim).
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Query and key tensors after applying RoPE.
+    """
+    # Get the shape information of the input tensors
+    batch_size, num_heads, q_max_len, output_dim = query.shape
+    _, _, k_max_len, _ = key.shape
+    # Generate sinusoidal position embeddings
+    pos_emb = sinusoidal_position_embedding(batch_size, num_heads, k_max_len + q_max_len, output_dim, query.device,
+                                            factor=1)
+    # Extract cosine and sine position embeddings
+    cos_pos = pos_emb[..., 1::2].repeat_interleave(2, dim=-1)
+    sin_pos = pos_emb[..., ::2].repeat_interleave(2, dim=-1)
+    # Apply RoPE to the query tensor
+    query_rot = torch.stack([-query[..., 1::2], query[..., ::2]], dim=-1).reshape(query.shape)
+    query = query * cos_pos[:, :, -q_max_len:, :] + query_rot * sin_pos[:, :, -q_max_len:, :]
+    # Apply RoPE to the key tensor
+    key_rot = torch.stack([-key[..., 1::2], key[..., ::2]], dim=-1).reshape(key.shape)
+    key = key * cos_pos[:, :, :k_max_len, :] + key_rot * sin_pos[:, :, :k_max_len, :]
+    return query, key
+def sinusoidal_position_embedding(
+        batch_size: int,
+        num_heads: int,
+        max_len: int,
+        output_dim: int,
+        device: torch.device,
+        factor: float = 1.0
+) -> torch.Tensor:
+    """
+    Generate sinusoidal position embeddings.
+    Args:
+        batch_size (int): Batch size.
+        num_heads (int): Number of attention heads.
+        max_len (int): Maximum sequence length.
+        output_dim (int): Output dimension.
+        device (torch.device): Device type.
+        factor (float, optional): Scaling factor. Defaults to 1.0.
+    Returns:
+        torch.Tensor: Sinusoidal position embedding tensor with shape (bs, head, max_len, output_dim).
+    """
+    # Generate position indices
+    position = torch.arange(0, max_len * factor, 1 / factor, dtype=torch.float).unsqueeze(-1)
+    # Generate frequency indices
+    ids = torch.arange(0, output_dim // 2, dtype=torch.float)
+    theta = torch.pow(10000, -2 * ids / output_dim)
+    # Calculate position embeddings
+    embeddings = position * theta
+    embeddings = torch.stack([torch.sin(embeddings), torch.cos(embeddings)], dim=-1)
+    # Expand dimensions to match batch size and number of attention heads
+    embeddings = embeddings.repeat((batch_size, num_heads, *([1] * len(embeddings.shape))))
+    embeddings = torch.reshape(embeddings, (batch_size, num_heads, -1, output_dim))
+    embeddings = embeddings.to(device)
+    # If the factor is greater than 1, perform interpolation
+    if factor > 1.0:
+        interpolation_indices = torch.linspace(0, embeddings.shape[2] - 1, max_len).long()
+        embeddings = embeddings[:, :, interpolation_indices, :]
+    return embeddings
+def causal_attention_mask(seq_length):
+    mask = torch.triu(torch.ones(seq_length, seq_length) * float('-inf'), diagonal=1)
+    return mask.unsqueeze(0).unsqueeze(0)
+class Transpose(nn.Module):
+    def __init__(self, *dims, contiguous=False):
+        super().__init__()
+        self.dims, self.contiguous = dims, contiguous
+    def forward(self, x):
+        if self.contiguous:
+            return x.transpose(*self.dims).contiguous()
+        else:
+            return x.transpose(*self.dims)

vit_config/config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "_name_or_path": "google/vit-base-patch16-224-in21k",
+  "architectures": [
+    "ViTModel"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "image_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "model_type": "vit",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "patch_size": 16,
+  "qkv_bias": true,
+  "transformers_version": "4.13.0.dev0"
+}

vit_config/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "do_normalize": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "size": 224
+}