Upload 5 files

- ExtLWM_sub16.pth +3 -0
- ExtLWM_sub32.pth +3 -0
- ExtLWM_sub64.pth +3 -0
- lwm_model.py +299 -0
- lwm_train.py +259 -0
ExtLWM_sub16.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0abccf7087201c67845cc423b62b5bcf1e869a55ab94b7be82a4e52a20804c45
size 9856811
ExtLWM_sub32.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d7c0b2fb2be98455f0adaf73cec80d2b7f268e6bd27a7b00dedaa391580d5e2b
size 9807787
ExtLWM_sub64.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:441dc9114b40607c4fc92828f52ee2b94b6a9aeaa5013b61b6d4a8662d6156df
size 9832619
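The three .pth entries above are Git LFS pointer files: the repository stores only an object id and size, and `git lfs pull` (or a clone with LFS installed) fetches the roughly 9.8 MB weight files. A small illustrative sketch, not part of the upload, to confirm that the real weights rather than the pointers are present locally, using the oids and sizes recorded above:

    import hashlib
    import os

    # Expected (sha256, size) pairs taken from the LFS pointer files above
    expected = {
        "ExtLWM_sub16.pth": ("0abccf7087201c67845cc423b62b5bcf1e869a55ab94b7be82a4e52a20804c45", 9856811),
        "ExtLWM_sub32.pth": ("d7c0b2fb2be98455f0adaf73cec80d2b7f268e6bd27a7b00dedaa391580d5e2b", 9807787),
        "ExtLWM_sub64.pth": ("441dc9114b40607c4fc92828f52ee2b94b6a9aeaa5013b61b6d4a8662d6156df", 9832619),
    }
    for name, (sha, size) in expected.items():
        with open(name, "rb") as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        ok = os.path.getsize(name) == size and digest == sha
        print(f"{name}: {'ok' if ok else 'still an LFS pointer or corrupted'}")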
lwm_model.py
ADDED
@@ -0,0 +1,299 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 13 19:23:54 2024

This script defines the LWM model architecture.

@author: Sadjad Alikhani
"""
#%%
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset


def create_dataloader(grouped_data, batch_size, shuffle, generator=None):
    dataloaders = {}

    for seq_length, group in grouped_data.items():
        print(f"dataloader in progress ...\nkey: {seq_length}")

        ## Uncomment the following line if you run out of memory during pre-training
        # batch_size = batch_size // 8 if seq_length >= 5 else batch_size

        # Unpack samples for the current group
        input_ids, masked_tokens, masked_pos = zip(*group)

        # Convert to tensors
        input_ids_tensor = torch.tensor(input_ids, dtype=torch.float32)
        masked_tokens_tensor = torch.tensor(masked_tokens, dtype=torch.float32)
        masked_pos_tensor = torch.tensor(masked_pos, dtype=torch.long)

        # Create TensorDataset and DataLoader
        dataset = TensorDataset(input_ids_tensor, masked_tokens_tensor, masked_pos_tensor)
        dataloaders[seq_length] = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=True, generator=generator)

    return dataloaders


def lwm_tokenizer(manual_data, patch_rows, patch_cols, masking_percent=.40, mask=False, mask_pos=None, seed=42):
    patches = [patch_maker(np.array(manual_data), patch_rows, patch_cols)]
    patches = [patch for patch_list in patches for patch in patch_list]  # list(Batch)

    grouped_data = defaultdict(list)  # Group samples by sequence length
    grouped_data_2 = []

    # for user_idx in tqdm(range(len(patches)), desc="Processing items"):
    for user_idx in range(len(patches)):
        patch_size = patches[user_idx].shape[1]
        n_patches = patches[user_idx].shape[0]
        n_masks_half = int(masking_percent * n_patches)

        word2id = {
            '[CLS]': 0.2 * np.ones((patch_size)),
            '[MASK]': 0.1 * np.ones((patch_size))
        }

        sample = make_sample(
            user_idx, patches, word2id, n_patches, n_masks_half, mask_pos, mask=mask, seed=seed
        )

        if mask:
            seq_length = len(sample[0])
            grouped_data[seq_length].append(sample)
        else:
            grouped_data_2.append(sample)

    if mask:
        # Normalize keys to 0, 1, 2, ...
        normalized_grouped_data = {i: grouped_data[key] for i, key in enumerate(sorted(grouped_data.keys()))}
    else:
        normalized_grouped_data = torch.stack(grouped_data_2, dim=0)

    return normalized_grouped_data


def make_sample(user_idx, patch, word2id, n_patches, n_masks, mask_pos=None, mask=True, seed=None):
    if seed is not None:
        np.random.seed(seed)

    # Step 1: Retrieve tokens and prepend [CLS]
    tokens = patch[user_idx]
    input_ids = np.vstack((word2id['[CLS]'], tokens))

    # Step 2: Mask real and imaginary patches
    tokens_size = int(n_patches)  # int(n_patches / 2)
    if mask_pos is not None:
        masked_pos = mask_pos
    else:
        masked_pos = np.random.choice(range(1, tokens_size+1), size=n_masks, replace=False)

    masked_tokens = []
    for pos in masked_pos:
        original_masked_tokens = input_ids[pos].copy()
        masked_tokens.append(original_masked_tokens)
        if mask:
            input_ids[pos] = word2id['[MASK]']
            # rnd_num = np.random.rand()
            # if rnd_num < 0.1:
            #     input_ids[pos] = np.random.rand(32)  # Replace with random values
            # elif rnd_num < 0.9:
            #     input_ids[pos] = word2id['[MASK]']  # Replace with [MASK]

    if mask:
        return [input_ids, masked_tokens, masked_pos]
    else:
        return torch.tensor(input_ids)


def patch_maker(original_ch, patch_rows=1, patch_cols=16):
    n_samples, n_rows, n_cols = original_ch.shape

    # Step 2: Split into real and imaginary parts and interleave them
    flat_real = original_ch.real
    flat_imag = original_ch.imag

    # Interleave real and imaginary parts along the last axis
    interleaved = np.empty((n_samples, n_rows, n_cols * 2), dtype=np.float32)
    interleaved[:, :, 0::2] = flat_real
    interleaved[:, :, 1::2] = flat_imag

    # Step 3: Compute the number of patches along rows and columns
    n_patches_rows = int(np.ceil(n_rows / patch_rows))
    n_patches_cols = int(np.ceil(n_cols / patch_cols))

    # Step 4: Pad the matrix if necessary to make it divisible by patch size
    padded_rows = n_patches_rows * patch_rows - n_rows
    padded_cols = n_patches_cols * patch_cols - n_cols
    if padded_rows > 0 or padded_cols > 0:
        interleaved = np.pad(
            interleaved,
            ((0, 0), (0, padded_rows), (0, padded_cols * 2)),  # Double padding for interleaved axis
            mode='constant',
            constant_values=0,
        )

    # Step 5: Create patches by dividing into blocks
    n_samples, padded_rows, padded_cols = interleaved.shape
    padded_cols //= 2  # Adjust for interleaving (real and imaginary parts count as one)
    patches = []

    for i in range(0, padded_rows, patch_rows):
        for j in range(0, padded_cols, patch_cols):
            patch = interleaved[:, i:i + patch_rows, j * 2:(j + patch_cols) * 2]
            patches.append(patch.reshape(n_samples, -1))  # Flatten each patch

    # Step 6: Stack patches to form the final array
    patches = np.stack(patches, axis=1)  # Shape: (num_samples, n_patches, patch_rows * patch_cols * 2)

    # nor_patches = patches
    nor_patches = patches * 1e6
    return nor_patches

#%%
class LayerNormalization(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias


class Embedding(nn.Module):
    def __init__(self, element_length, d_model, max_len=513):
        super().__init__()
        self.element_length = element_length
        self.d_model = d_model
        self.proj = nn.Linear(element_length, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.norm = LayerNormalization(d_model)

    def forward(self, x):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
        pos_encodings = self.pos_embed(pos)
        tok_emb = self.proj(x.float())
        embedding = tok_emb + pos_encodings
        return self.norm(embedding)


class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self, Q, K, V):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.d_k)
        attn = F.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        return context, attn


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        self.d_k = d_model // n_heads
        self.d_v = d_model // n_heads
        self.n_heads = n_heads
        self.W_Q = nn.Linear(d_model, self.d_k * n_heads)
        self.W_K = nn.Linear(d_model, self.d_k * n_heads)
        self.W_V = nn.Linear(d_model, self.d_v * n_heads)
        self.linear = nn.Linear(n_heads * self.d_v, d_model)
        self.dropout = nn.Dropout(dropout)
        self.scaled_dot_attn = ScaledDotProductAttention(self.d_k)

    def forward(self, Q, K, V):
        residual, batch_size = Q, Q.size(0)
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1, 2)

        context, attn = self.scaled_dot_attn(q_s, k_s, v_s)
        output = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v)
        output = self.linear(output)
        return residual + self.dropout(output), attn


class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.fc2(self.dropout(F.relu(self.fc1(x))))


class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff, dropout)
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)

    def forward(self, enc_inputs):
        # Self-Attention with Add & Norm
        attn_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs)
        attn_outputs = self.norm1(enc_inputs + attn_outputs)  # Add & Norm

        # Feed-Forward with Add & Norm
        ff_outputs = self.pos_ffn(attn_outputs)
        enc_outputs = self.norm2(attn_outputs + ff_outputs)  # Add & Norm

        return enc_outputs, attn


class lwm(nn.Module):
    def __init__(self, element_length=32, d_model=128, n_layers=12, max_len=321, n_heads=8, dropout=0.1):
        super().__init__()
        self.embedding = Embedding(element_length, d_model, max_len)
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, n_heads, d_model*4, dropout) for _ in range(n_layers)]
        )
        self.linear = nn.Linear(d_model, d_model)
        self.norm = LayerNormalization(d_model)

        embed_weight = self.embedding.proj.weight
        _, n_dim = embed_weight.size()
        self.decoder = nn.Linear(d_model, n_dim, bias=False)
        self.decoder_bias = nn.Parameter(torch.zeros(n_dim))

    @classmethod
    def from_pretrained(cls, ckpt_name='model_weights.pth', device='cuda'):
        model = cls().to(device)
        model.load_state_dict(torch.load(ckpt_name, map_location=device))
        print(f"Model loaded successfully from {ckpt_name}")
        return model

    def forward(self, input_ids, masked_pos=None):
        # Step 1: Embedding
        output = self.embedding(input_ids)
        attention_maps = []

        # Step 2: Pass through Encoder Layers
        for layer in self.layers:
            output, attn = layer(output)
            attention_maps.append(attn)

        # If masked_pos is provided, perform masked token prediction
        if masked_pos is not None:
            masked_pos = masked_pos.long()[:, :, None].expand(-1, -1, output.size(-1))
            h_masked = torch.gather(output, 1, masked_pos)
            h_masked = self.norm(F.relu(self.linear(h_masked)))
            logits_lm = self.decoder(h_masked) + self.decoder_bias
            return logits_lm, output, attention_maps
        else:
            return output, attention_maps
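For reference, a minimal inference sketch (not part of the uploaded files) showing how the pieces above fit together: lwm_tokenizer with mask=False prepends a [CLS] patch and returns a (samples, n_patches + 1, 32) tensor for 1x16 patches, the sub-16 checkpoint is loaded the same way lwm_train.py does, and the encoder outputs are taken directly. The channel array below is a random placeholder.

    import numpy as np
    import torch
    from lwm_model import lwm, lwm_tokenizer

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Placeholder batch of complex channels: (samples, rows, columns)
    channels = np.random.randn(4, 32, 32) + 1j * np.random.randn(4, 32, 32)

    # mask=False -> a (4, 65, 32) tensor: 64 patches of length 32 plus one [CLS] patch
    input_ids = lwm_tokenizer(channels, patch_rows=1, patch_cols=16, mask=False).float().to(device)

    model = lwm().to(device)  # element_length=32 matches 1x16 patches (real+imag interleaved)
    state_dict = torch.load("./ExtLWM_sub16.pth", map_location=device)
    state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}  # as in lwm_train.py
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    with torch.no_grad():
        embeddings, attention_maps = model(input_ids)  # no masked_pos -> encoder outputs only
    cls_embedding = embeddings[:, 0]  # one 128-dim vector per channel sample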
lwm_train.py
ADDED
@@ -0,0 +1,259 @@
# -*- coding: utf-8 -*-
"""
The original LWM-1.1 implementation is available at:

https://huggingface.co/wi-lab/lwm-v1.1/tree/main

We extend our highest respect to wi-lab and thank them for their outstanding contributions to the original LWM.
"""

from tqdm import tqdm
import os
import csv
import torch
import torch.nn as nn
from lwm_model import lwm, lwm_tokenizer, create_dataloader
import numpy as np
from torch.optim import AdamW
from collections import defaultdict


def split_and_save_indices_same_seed(manual_data_list, used_ratio=1.0, train_ratio=0.50, val_ratio=0.50):
    all_indices = {}
    for i, data in enumerate(manual_data_list):
        total_num = data.shape[0]
        indices = np.arange(total_num)
        np.random.shuffle(indices)
        train_end = int(train_ratio * total_num)
        val_end = int((train_ratio + val_ratio) * total_num)
        used_end = int(train_end * used_ratio)
        train_idx = indices[:train_end]
        train_idx = train_idx[:used_end]
        val_idx = indices[train_end:val_end]
        all_idx_list = [train_idx, val_idx]
        np.savez(f"all_indices_{i}_{used_ratio}.npz", train_id=train_idx, val_id=val_idx)
        all_indices[f'array_{i}'] = all_idx_list
    return all_indices


def nmse_loss(y_pred, y_true):
    y_pred_flat = y_pred.view(y_pred.size(0), -1)
    y_true_flat = y_true.view(y_true.size(0), -1)
    mse = torch.sum((y_true_flat - y_pred_flat) ** 2, dim=-1)
    normalization = torch.sum(y_true_flat ** 2, dim=-1)
    return mse / normalization


def train_lwm(model, train_loaders, val_loaders, optimizer, save_model, epochs, device, save_dir="models", log_file="training_log.csv"):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Initialize CSV log
    if not os.path.exists(log_file):
        with open(log_file, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Epoch", "Train NMSE", "Validation NMSE", "Learning Rate", "Best Model"])

    train_nmse_losses = []
    val_nmse_losses = []
    best_val_nmse = float('inf')

    start_epoch = 0

    for epoch in range(start_epoch, epochs):
        model.train()
        train_nmse = 0.0
        train_samples = 0

        # Training loop across all buckets
        print(f"\nEpoch {epoch + 1}/{epochs} [Training]")
        for length, train_loader in train_loaders.items():
            print(f"Processing sequences of length {length}")
            with tqdm(train_loader, desc=f"Length {length} [Training]", unit="batch") as t:
                for batch in t:
                    optimizer.zero_grad()
                    input_ids, masked_tokens, masked_pos = [b.to(device) for b in batch]
                    logits_lm, _, _ = model(input_ids, masked_pos)
                    loss = torch.sum(nmse_loss(masked_tokens, logits_lm))
                    loss.backward()
                    optimizer.step()
                    train_nmse += loss.item()
                    train_samples += input_ids.shape[0]
                    t.set_postfix({"nmse": train_nmse / train_samples})

        # Average NMSE across training batches
        train_nmse /= max(train_samples, 1)
        train_nmse_losses.append(train_nmse)

        if epoch % 1 == 0:
            # Validation loop across all buckets
            model.eval()
            val_nmse_list = []
            val_nmse = 0.0
            val_samples = 0
            with torch.no_grad():
                print(f"\nEpoch {epoch + 1}/{epochs} [Validation]")
                for length, val_loader in val_loaders.items():
                    print(f"Processing sequences of length {length}")
                    with tqdm(val_loader, desc=f"Length {length} [Validation]", unit="batch") as t:
                        for batch in t:
                            input_ids, masked_tokens, masked_pos = [b.to(device) for b in batch]
                            logits_lm, _, _ = model(input_ids, masked_pos)
                            test = nmse_loss(masked_tokens, logits_lm)
                            loss = torch.sum(test)
                            val_nmse += loss.item()
                            val_samples += input_ids.shape[0]
                            val_nmse_list.append(test)
                            t.set_postfix({"nmse": val_nmse / val_samples})

            val_nmse /= max(val_samples, 1)
            val_nmse_losses.append(val_nmse)

            # Save model if validation NMSE improves
            is_best_model = False
            if val_nmse < best_val_nmse:
                best_val_nmse = val_nmse
                model_path = os.path.join(save_dir, f"lwm_epoch{epoch+1}_train{train_nmse:.4f}_val{val_nmse:.4f}.pth")
                if save_model:
                    torch.save(model.state_dict(), model_path)
                    print(f"Model saved: {model_path}")
                is_best_model = True

            # Log the results
            print(f" Train NMSE: {train_nmse:.4f}")
            print(f" Validation NMSE: {val_nmse:.4f}")

            # Append to CSV log
            with open(log_file, mode='a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([epoch + 1, train_nmse, val_nmse, optimizer.param_groups[0]['lr'], is_best_model])

    print("Training and validation complete.")
    return model


def generate_mask_pos(num, total_num, allow_point_num):
    total_point = total_num
    all_pos = np.arange(1, total_point + 1)
    init_pos, inter, n, L = 1, int(np.ceil(total_point / num)), num, int(allow_point_num / num)
    un_msk_pos = np.array([init_pos + l + i * inter for i in range(num) for l in range(L)])
    msk_pos = np.setdiff1d(all_pos, un_msk_pos)
    return msk_pos


def merge_dicts(dict_list):
    merged = defaultdict(list)
    for d in dict_list:
        for key, value in d.items():
            merged[key].extend(value)
    return dict(merged)


if __name__ == '__main__':

    # Manually adjust the following parameters
    SAVE_DIR = "model"
    LOG_FILE = "training.csv"
    MASK_PERCENT = 0.90
    save_model = False
    scenario_name = "Boston_28G"
    gpu_ids = [0]
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Set the LWM training hyperparameters
    EPOCHS = 20000  # 10000-100% 14000-80%
    BATCH_SIZE = 128
    D_MODEL = 128
    MAX_LEN = 513
    N_LAYERS = 12
    WEIGHT_DECAY = 0.05
    BETA1 = 0.9
    BETA2 = 0.999
    N_HEADS = 8
    DROPOUT = 0.1
    BASE_LR = 5e-5
    SEED = 0
    TEST = False

    torch.manual_seed(SEED)
    np.random.seed(SEED)
    train_generator = torch.Generator()
    train_generator.manual_seed(SEED)

    manual_data = [np.load(f"./dataset/{scenario_name}.npy")]
    indices_dict = split_and_save_indices_same_seed(manual_data, used_ratio=1.0)

    ranges = [data.shape[1] for data in manual_data]
    steps = [MASK_PERCENT]
    mask_pos_list = [[np.sort(np.random.choice(np.arange(1, range_max+1), size=int(range_max*step), replace=False)) for step in steps] for range_max in ranges]

    pre_train_dict = {}
    key_counter = 0
    for mask_idx in range(len(steps)):
        for data_idx in range(len(ranges)):
            pre_train_dict[key_counter] = lwm_tokenizer(
                manual_data=manual_data[data_idx][indices_dict[f'array_{data_idx}'][0]],
                patch_rows=1,
                patch_cols=16,
                mask=True,
                seed=None,
                masking_percent=MASK_PERCENT,
                mask_pos=mask_pos_list[data_idx][mask_idx]
            )
            key_counter += 1

    preprocessed_train_data = {}
    for i in range(len(pre_train_dict)):
        preprocessed_train_data[i] = pre_train_dict[i][0]

    train_loaders = create_dataloader(preprocessed_train_data, batch_size=BATCH_SIZE, shuffle=True, generator=train_generator)

    pre_val_dict = {}
    key_counter = 0
    for mask_idx in range(len(steps)):
        for data_idx in range(len(ranges)):
            pre_val_dict[key_counter] = lwm_tokenizer(
                manual_data=manual_data[data_idx][indices_dict[f'array_{data_idx}'][1]],
                patch_rows=1,
                patch_cols=16,
                mask=True,
                seed=None,
                masking_percent=MASK_PERCENT,
                mask_pos=mask_pos_list[data_idx][mask_idx]
            )
            key_counter += 1

    preprocessed_val_data = {}
    for i in range(len(pre_val_dict)):
        preprocessed_val_data[i] = pre_val_dict[i][0]

    val_loaders = create_dataloader(preprocessed_val_data, batch_size=BATCH_SIZE, shuffle=False)

    # Build the LWM model
    model = lwm(d_model=D_MODEL, dropout=DROPOUT).to(device)
    pretrained_lwm_dict = torch.load("./ExtLWM_sub16.pth", map_location=device)
    pretrained_lwm_dict = {k.replace("module.", ""): v for k, v in pretrained_lwm_dict.items()}
    model.load_state_dict(pretrained_lwm_dict, strict=False)
    model = nn.DataParallel(model, gpu_ids)

    optimizer = AdamW(
        model.parameters(),
        lr=BASE_LR,
        betas=(BETA1, BETA2),
        weight_decay=WEIGHT_DECAY
    )

    pretrained_model = train_lwm(
        model,
        train_loaders,
        val_loaders,
        optimizer,
        save_model,
        EPOCHS,
        device=device,
        save_dir=SAVE_DIR,
        log_file=LOG_FILE,
    )
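As written, the main block expects a complex NumPy array at ./dataset/Boston_28G.npy (the scenario_name above) with shape (n_samples, n_rows, n_cols) as consumed by patch_maker, plus ExtLWM_sub16.pth in the working directory; it appends per-epoch NMSE to training.csv and, when save_model is set to True, saves checkpoints under ./model/. A purely hypothetical smoke-test setup with made-up dimensions, not a substitute for the real scenario data:

    import os
    import numpy as np

    # Placeholder channels only; real experiments should use the actual Boston_28G array
    os.makedirs("dataset", exist_ok=True)
    dummy = (np.random.randn(256, 32, 32) + 1j * np.random.randn(256, 32, 32)).astype(np.complex64)
    np.save("./dataset/Boston_28G.npy", dummy)  # (n_samples, n_rows, n_cols)
    # Then run:  python lwm_train.py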