""" This function is adapted from [NeurIPS2023-One-Fits-All] by [tianzhou2011] Original source: [https://github.com/DAMO-DI-ML/NeurIPS2023-One-Fits-All] """ import argparse from typing import Dict import numpy as np import torchinfo import torch from torch import nn, optim from torch.utils.data import DataLoader from torch.nn.utils import weight_norm import tqdm import os, math from typing import Optional import torch.nn.functional as F from transformers.models.gpt2.modeling_gpt2 import GPT2Model from einops import rearrange from ..utils.torch_utility import EarlyStoppingTorch, PositionalEmbedding, TokenEmbedding, TemporalEmbedding, get_gpu, TimeFeatureEmbedding, DataEmbedding, adjust_learning_rate from ..utils.dataset import ReconstructDataset class DataEmbedding_wo_pos(nn.Module): def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): super(DataEmbedding_wo_pos, self).__init__() self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) self.position_embedding = PositionalEmbedding(d_model=d_model) self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( d_model=d_model, embed_type=embed_type, freq=freq) self.dropout = nn.Dropout(p=dropout) def forward(self, x, x_mark): if x_mark is None: x = self.value_embedding(x) else: x = self.value_embedding(x) + self.temporal_embedding(x_mark) return self.dropout(x) class PatchEmbedding(nn.Module): def __init__(self, d_model, patch_len, stride, dropout): super(PatchEmbedding, self).__init__() # Patching self.patch_len = patch_len self.stride = stride self.padding_patch_layer = nn.ReplicationPad1d((0, stride)) # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space self.value_embedding = TokenEmbedding(patch_len, d_model) # Positional embedding self.position_embedding = PositionalEmbedding(d_model) # Residual dropout self.dropout = nn.Dropout(dropout) def forward(self, x): # do patching n_vars = x.shape[1] x = self.padding_patch_layer(x) x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) # Input encoding x = self.value_embedding(x) + self.position_embedding(x) return self.dropout(x), n_vars class DataEmbedding_wo_time(nn.Module): def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): super(DataEmbedding_wo_time, self).__init__() self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) self.position_embedding = PositionalEmbedding(d_model=d_model) self.dropout = nn.Dropout(p=dropout) def forward(self, x): x = self.value_embedding(x) + self.position_embedding(x) return self.dropout(x) class Model(nn.Module): def __init__(self, pred_len=0, seq_len=100, patch_size=1, stride=1, d_model = 768, d_ff = 768, embed = "timeF", gpt_layers = 6, enc_in = 1, c_out = 1, freq = "h", dropout= 0.1, mlp = 0, model_path = "pre_train"): super(Model, self).__init__() self.pred_len = pred_len self.seq_len = seq_len self.patch_size = patch_size self.stride = stride self.seq_len = seq_len self.d_ff = d_ff self.d_model = d_model self.embed = embed self.gpt_layers = gpt_layers self.enc_in = enc_in self.c_out = c_out self.freq = freq self.dropout = dropout self.model_path = model_path self.mlp = mlp self.patch_num = (self.seq_len + self.pred_len - self.patch_size) // self.stride + 1 self.padding_patch_layer = nn.ReplicationPad1d((0, self.stride)) self.patch_num += 1 self.enc_embedding = DataEmbedding(self.enc_in * 
class Model(nn.Module):
    def __init__(self, pred_len=0, seq_len=100, patch_size=1, stride=1,
                 d_model=768, d_ff=768, embed="timeF", gpt_layers=6,
                 enc_in=1, c_out=1, freq="h", dropout=0.1, mlp=0,
                 model_path="pre_train"):
        super(Model, self).__init__()
        self.pred_len = pred_len
        self.seq_len = seq_len
        self.patch_size = patch_size
        self.stride = stride
        self.d_ff = d_ff
        self.d_model = d_model
        self.embed = embed
        self.gpt_layers = gpt_layers
        self.enc_in = enc_in
        self.c_out = c_out
        self.freq = freq
        self.dropout = dropout
        self.model_path = model_path
        self.mlp = mlp

        self.patch_num = (self.seq_len + self.pred_len - self.patch_size) // self.stride + 1
        self.padding_patch_layer = nn.ReplicationPad1d((0, self.stride))
        self.patch_num += 1
        self.enc_embedding = DataEmbedding(self.enc_in * self.patch_size, self.d_model,
                                           self.embed, self.freq, self.dropout)

        # Load a pretrained GPT-2 backbone and keep only the first `gpt_layers` blocks.
        self.gpt2 = GPT2Model.from_pretrained('gpt2', output_attentions=True, output_hidden_states=True)
        self.gpt2.h = self.gpt2.h[:self.gpt_layers]

        # Freeze the backbone except for layer norms and positional embeddings
        # (and, optionally, the MLP sub-layers when mlp == 1).
        for name, param in self.gpt2.named_parameters():
            if 'ln' in name or 'wpe' in name:
                param.requires_grad = True
            elif 'mlp' in name and self.mlp == 1:
                param.requires_grad = True
            else:
                param.requires_grad = False

        self.ln_proj = nn.LayerNorm(self.d_ff)  # defined but not applied in anomaly_detection
        self.out_layer = nn.Linear(self.d_ff, self.c_out, bias=True)

    def forward(self, x_enc):
        dec_out = self.anomaly_detection(x_enc)
        return dec_out  # [B, L, D]

    def anomaly_detection(self, x_enc):
        B, L, M = x_enc.shape

        # Segment-wise normalization (from the Non-stationary Transformer):
        # split the sequence into segments of length `seg_num` and standardize
        # each segment independently. L must be divisible by seg_num.
        seg_num = 25
        x_enc = rearrange(x_enc, 'b (n s) m -> b n s m', s=seg_num)
        means = x_enc.mean(2, keepdim=True).detach()
        x_enc = x_enc - means
        stdev = torch.sqrt(torch.var(x_enc, dim=2, keepdim=True, unbiased=False) + 1e-5)
        x_enc /= stdev
        x_enc = rearrange(x_enc, 'b n s m -> b (n s) m')

        # Zero-pad the feature dimension up to GPT-2's hidden size and feed the
        # values directly as input embeddings (the DataEmbedding path is bypassed here).
        enc_out = torch.nn.functional.pad(x_enc, (0, self.d_model - x_enc.shape[-1]))

        outputs = self.gpt2(inputs_embeds=enc_out).last_hidden_state
        outputs = outputs[:, :, :self.d_ff]
        dec_out = self.out_layer(outputs)

        # De-normalization: restore each segment's original scale and offset.
        dec_out = rearrange(dec_out, 'b (n s) m -> b n s m', s=seg_num)
        dec_out = dec_out * stdev[:, :, 0, :].unsqueeze(2).repeat(1, 1, seg_num, 1)
        dec_out = dec_out + means[:, :, 0, :].unsqueeze(2).repeat(1, 1, seg_num, 1)
        dec_out = rearrange(dec_out, 'b n s m -> b (n s) m')

        return dec_out
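
# A minimal sketch of the segment-wise normalization used in anomaly_detection
# (the helper name is hypothetical, not part of the adapted codebase):
# standardize length-25 segments independently, undo it, and check that the
# round trip recovers the input.
def _demo_segment_norm():
    b, l, m, s = 2, 100, 1, 25
    x = torch.randn(b, l, m) * 3.0 + 5.0
    xs = rearrange(x, 'b (n s) m -> b n s m', s=s)
    means = xs.mean(2, keepdim=True)
    stdev = torch.sqrt(torch.var(xs, dim=2, keepdim=True, unbiased=False) + 1e-5)
    normed = (xs - means) / stdev
    # Each segment is now (approximately) zero-mean with unit variance.
    assert normed.mean(2).abs().max() < 1e-5
    restored = rearrange(normed * stdev + means, 'b n s m -> b (n s) m')
    assert torch.allclose(restored, x, atol=1e-4)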
class OFA():
    def __init__(self, win_size=100, stride=1, enc_in=1, features='M',
                 batch_size=128, learning_rate=0.0001, epochs=10,
                 patience=3, lradj="type1", validation_size=0.2):
        super().__init__()
        self.win_size = win_size
        self.stride = stride
        self.enc_in = enc_in
        self.features = features
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.patience = patience
        self.lradj = lradj
        self.validation_size = validation_size
        self.decision_scores_ = None
        self.y_hats = None

        self.cuda = True
        self.device = get_gpu(self.cuda)

        self.model = Model(seq_len=self.win_size, enc_in=self.enc_in, c_out=self.enc_in).float().to(self.device)
        self.model_optim = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()
        self.early_stopping = EarlyStoppingTorch(None, patience=self.patience)
        self.input_shape = (self.batch_size, self.win_size, self.enc_in)

    def fit(self, data):
        tsTrain = data[:int((1 - self.validation_size) * len(data))]
        tsValid = data[int((1 - self.validation_size) * len(data)):]

        train_loader = DataLoader(
            dataset=ReconstructDataset(tsTrain, window_size=self.win_size, stride=self.stride),
            batch_size=self.batch_size,
            shuffle=True)
        valid_loader = DataLoader(
            dataset=ReconstructDataset(tsValid, window_size=self.win_size, stride=self.stride),
            batch_size=self.batch_size,
            shuffle=False)

        for epoch in range(1, self.epochs + 1):
            ## Training
            train_loss = 0
            self.model.train()
            loop = tqdm.tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
            for i, (batch_x, _) in loop:
                self.model_optim.zero_grad()

                batch_x = batch_x.float().to(self.device)
                outputs = self.model(batch_x)
                loss = self.criterion(outputs, batch_x)
                loss.backward()
                self.model_optim.step()

                train_loss += loss.cpu().item()
                loop.set_description(f'Training Epoch [{epoch}/{self.epochs}]')
                loop.set_postfix(loss=loss.item(), avg_loss=train_loss / (i + 1))

            ## Validation
            self.model.eval()
            total_loss = []
            loop = tqdm.tqdm(enumerate(valid_loader), total=len(valid_loader), leave=True)
            with torch.no_grad():
                for i, (batch_x, _) in loop:
                    batch_x = batch_x.float().to(self.device)
                    outputs = self.model(batch_x)

                    f_dim = -1 if self.features == 'MS' else 0
                    outputs = outputs[:, :, f_dim:]
                    pred = outputs.detach().cpu()
                    true = batch_x.detach().cpu()
                    loss = self.criterion(pred, true)
                    total_loss.append(loss.item())

                    loop.set_description(f'Valid Epoch [{epoch}/{self.epochs}]')
                    valid_loss = np.average(total_loss)
                    loop.set_postfix(loss=loss.item(), valid_loss=valid_loss)

            self.early_stopping(valid_loss, self.model)
            if self.early_stopping.early_stop:
                print("   Early stopping<<<")
                break

            adjust_learning_rate(self.model_optim, epoch + 1, self.lradj, self.learning_rate)

    def decision_function(self, data):
        test_loader = DataLoader(
            dataset=ReconstructDataset(data, window_size=self.win_size, stride=self.stride),
            batch_size=self.batch_size,
            shuffle=False)

        self.model.eval()
        attens_energy = []
        y_hats = []
        # Per-element MSE so each time step keeps its own reconstruction error.
        self.anomaly_criterion = nn.MSELoss(reduction='none')

        loop = tqdm.tqdm(enumerate(test_loader), total=len(test_loader), leave=True)
        with torch.no_grad():
            for i, (batch_x, _) in loop:
                batch_x = batch_x.float().to(self.device)
                outputs = self.model(batch_x)  # reconstruction

                # Average the error over the feature dim; keep the last time
                # step of each window as that window's anomaly score.
                score = torch.mean(self.anomaly_criterion(batch_x, outputs), dim=-1)
                y_hat = torch.squeeze(outputs, -1)

                score = score.detach().cpu().numpy()[:, -1]
                y_hat = y_hat.detach().cpu().numpy()[:, -1]

                attens_energy.append(score)
                y_hats.append(y_hat)
                loop.set_description('Testing Phase:')

        attens_energy = np.concatenate(attens_energy, axis=0).reshape(-1)
        scores = np.array(attens_energy)
        y_hats = np.concatenate(y_hats, axis=0).reshape(-1)
        self.y_hats = np.array(y_hats)
        assert scores.ndim == 1

        # Map per-window scores back onto the original timeline, averaging
        # where strided windows overlap.
        self.decision_scores_ = np.zeros(len(data))
        count = np.zeros(len(data))
        for i, score in enumerate(scores):
            start = i * self.stride
            end = start + self.win_size
            self.decision_scores_[start:end] += score
            count[start:end] += 1
        self.decision_scores_ = self.decision_scores_ / np.maximum(count, 1)

        return self.decision_scores_

    def param_statistic(self, save_file):
        model_stats = torchinfo.summary(self.model, self.input_shape, verbose=0)
        with open(save_file, 'w') as f:
            f.write(str(model_stats))
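
# A minimal end-to-end usage sketch (hypothetical, not part of the adapted
# codebase). It assumes ReconstructDataset accepts a univariate numpy array of
# shape (T, 1), matching how OFA.fit and OFA.decision_function pass data above;
# the synthetic series and parameter choices are illustrative only.
def _demo_ofa_usage():
    rng = np.random.default_rng(0)
    t = np.arange(2000)
    data = np.sin(0.04 * t) + 0.1 * rng.standard_normal(len(t))
    data[1500:1510] += 3.0  # inject a short anomaly
    data = data.reshape(-1, 1).astype(np.float32)

    detector = OFA(win_size=100, batch_size=64, epochs=1)
    detector.fit(data)
    scores = detector.decision_function(data)
    # Higher scores should concentrate around the injected anomaly.
    print('peak score index:', int(np.argmax(scores)))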