root committed on
Commit · 9a73cb0 · 1 Parent(s): 4f08905
uploaded training code and model weights
Browse files
- fuson_plm/training/README.md +60 -0
- fuson_plm/training/__init__.py +0 -0
- fuson_plm/training/__pycache__/__init__.cpython-310.pyc +0 -0
- fuson_plm/training/__pycache__/config.cpython-310.pyc +0 -0
- fuson_plm/training/__pycache__/model.cpython-310.pyc +0 -0
- fuson_plm/training/__pycache__/plot.cpython-310.pyc +0 -0
- fuson_plm/training/__pycache__/train.cpython-310.pyc +0 -0
- fuson_plm/training/__pycache__/utils.cpython-310.pyc +0 -0
- fuson_plm/training/config.py +38 -0
- fuson_plm/training/demo.py +46 -0
- fuson_plm/training/model.py +119 -0
- fuson_plm/training/test_esm2.py +122 -0
- fuson_plm/training/train.py +388 -0
- fuson_plm/training/utils.py +312 -0
fuson_plm/training/README.md
ADDED
@@ -0,0 +1,60 @@
## Training Script

This folder holds code for training the model (`train.py`), defining the model architecture (`model.py`), and defining utility functions, including masking rate schedulers and dataloaders (`utils.py`). There is also a script for running ESM-2 on the test data (`test_esm2.py`).

The weights and other necessary files for loading FusOn-pLM are stored in `checkpoints/best/ckpt`. Results on the test set are stored in `checkpoints/best/test_results.csv`.

### Usage

#### Configs

The `config.py` script holds configurations for **training** and **plotting**.

```python
# Model parameters
EPOCHS = 30
BATCH_SIZE = 8
MAX_LENGTH = 2000
LEARNING_RATE = 3e-4
N_UNFROZEN_LAYERS = 8
UNFREEZE_QUERY = True
UNFREEZE_KEY = True
UNFREEZE_VALUE = True

### Masking parameters - must use either a variable or a fixed masking rate
# variable masking rate (choice 1)
VAR_MASK_RATE = True # if True, the masking rate varies according to the scheduler settings below
MASK_LOW = 0.15
MASK_HIGH = 0.40
MASK_STEPS = 20
MASK_SCHEDULER = "cosine" # type of scheduler to use. options are: "cosine", "loglinear", "stepwise"
# fixed masking rate (choice 2)
MASK_PERCENTAGE = 0.15 # if VAR_MASK_RATE = False, code will use this fixed masking rate

# To continue training a model you already started, fill in the following parameters
FINETUNE_FROM_SCRATCH = True # Set to False if you want to finetune from a checkpoint
PATH_TO_STARTING_CKPT = '' # only set the path if FINETUNE_FROM_SCRATCH = False

# File paths - do not change unless you move the training data
TRAIN_PATH = '../data/splits/train_df.csv'
VAL_PATH = '../data/splits/val_df.csv'
TEST_PATH = '../data/splits/test_df.csv'

# WandB parameters
# Fill these in with your own WandB account info
WANDB_PROJECT = ''
WANDB_ENTITY = ''
WANDB_API_KEY = ''

# GPU parameters
CUDA_VISIBLE_DEVICES = "0"
```

#### Training

The `train.py` script trains a fusion-aware ESM model according to the settings specified in `config.py`.

To run, enter in the terminal:
```bash
python train.py
```
or, to run the (long) training process in the background:
```bash
nohup python train.py > train.out 2> train.err &
```
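For reference, below is a minimal sketch of loading the saved weights for per-residue embeddings. It assumes `checkpoints/best/ckpt` is a standard Hugging Face `save_pretrained` directory (the format produced by `save_model` in `model.py`) and mirrors the approach in `demo.py`; the example sequence is truncated and purely illustrative.

```python
# Minimal sketch: load FusOn-pLM weights for per-residue embeddings.
# Assumes checkpoints/best/ckpt is a standard save_pretrained directory.
import torch
from transformers import AutoModel, AutoTokenizer

ckpt_dir = "checkpoints/best/ckpt"
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModel.from_pretrained(ckpt_dir)
model.eval()

sequence = "MVSSDRPVSLEDEVSHSMKEMIGG"  # illustrative (truncated) fusion sequence
inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=2000)
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state.squeeze(0)  # (seq_len + 2, hidden_dim)
embeddings = hidden[1:-1].cpu().numpy()                    # drop BOS/EOS tokens
print(embeddings.shape)                                     # (len(sequence), hidden_dim)
```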
fuson_plm/training/__init__.py
ADDED
File without changes

fuson_plm/training/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (152 Bytes)

fuson_plm/training/__pycache__/config.cpython-310.pyc
ADDED
Binary file (898 Bytes)

fuson_plm/training/__pycache__/model.cpython-310.pyc
ADDED
Binary file (4.88 kB)

fuson_plm/training/__pycache__/plot.cpython-310.pyc
ADDED
Binary file (4.02 kB)

fuson_plm/training/__pycache__/train.cpython-310.pyc
ADDED
Binary file (12.2 kB)

fuson_plm/training/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (11.3 kB)
fuson_plm/training/config.py
ADDED
@@ -0,0 +1,38 @@
###### TRAINING
# Model parameters
EPOCHS = 30
BATCH_SIZE = 8
MAX_LENGTH = 2000
LEARNING_RATE = 3e-4
N_UNFROZEN_LAYERS = 8
UNFREEZE_QUERY = True
UNFREEZE_KEY = True
UNFREEZE_VALUE = True

### Masking parameters - must use either a variable or a fixed masking rate
# variable masking rate (choice 1)
VAR_MASK_RATE = True # if True, the masking rate varies according to the scheduler settings below
MASK_LOW = 0.15
MASK_HIGH = 0.40
MASK_STEPS = 20
MASK_SCHEDULER = "cosine" # type of scheduler to use. options are: "cosine", "loglinear", "stepwise"
# fixed masking rate (choice 2)
MASK_PERCENTAGE = 0.15 # if VAR_MASK_RATE = False, code will use this fixed masking rate

# To continue training a model you already started, fill in the following parameters
FINETUNE_FROM_SCRATCH = True # Set to False if you want to finetune from a checkpoint
PATH_TO_STARTING_CKPT = '' # only set the path if FINETUNE_FROM_SCRATCH = False

# File paths - do not change unless you move the training data
TRAIN_PATH = '../data/splits/train_df.csv'
VAL_PATH = '../data/splits/val_df.csv'
TEST_PATH = '../data/splits/test_df.csv'

# WandB parameters
# Fill these in with your own WandB account info
WANDB_PROJECT = ''
WANDB_ENTITY = ''
WANDB_API_KEY = ''

# GPU parameters
CUDA_VISIBLE_DEVICES = "0"
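As a quick illustration of how the variable masking-rate settings behave, the sketch below builds a scheduler from the values above using `get_mask_rate_scheduler` from `fuson_plm/training/utils.py` (included in this commit); the batch count of 100 is an arbitrary stand-in for `len(train_loader)`.

```python
# Sketch: the masking rate ramps from MASK_LOW to MASK_HIGH over one epoch's batches.
from fuson_plm.training.utils import get_mask_rate_scheduler

n_batches = 100  # stand-in for len(train_loader)
scheduler = get_mask_rate_scheduler(
    scheduler_type="cosine",   # MASK_SCHEDULER
    min_masking_rate=0.15,     # MASK_LOW
    max_masking_rate=0.40,     # MASK_HIGH
    total_batches=n_batches,
    total_steps=20,            # MASK_STEPS (only used by the "stepwise" scheduler)
)

rates = []
for _ in range(n_batches):
    scheduler.step()                       # called once per training batch in train.py
    rates.append(scheduler.get_masking_rate())
print(f"first batch: {rates[0]:.3f}, last batch: {rates[-1]:.3f}")  # ~0.15 -> ~0.40
```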
fuson_plm/training/demo.py
ADDED
@@ -0,0 +1,46 @@
from fuson_plm.training.model import FusOnpLM
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel
import logging
import torch
import os

os.environ['CUDA_VISIBLE_DEVICES'] = "1"

# Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model
model_name = 'checkpoints/old_splits_snp_2000_ft_11layers_Q_b8_lr5e-05_mask0.15-08-12-2024-12:42:48/checkpoint_epoch_1.pth'
model = AutoModel.from_pretrained(model_name)  # initialize model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()
model.to(device)

# Example fusion oncoprotein sequence: MLLT10:PICALM, associated with Acute Myeloid Leukemia (LAML)
# Amino acids 1-80 are derived from the head gene, MLLT10
# Amino acids 81-119 are derived from the tail gene, PICALM
sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"

# Tokenize the input sequence
inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=2000)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Get the embeddings
with torch.no_grad():
    outputs = model(**inputs)

# The embeddings are in the last_hidden_state tensor
embeddings = outputs.last_hidden_state
# Remove the extra batch dimension
embeddings = embeddings.squeeze(0)
# Remove the BOS and EOS tokens
embeddings = embeddings[1:-1, :]

# Convert embeddings to a numpy array (if needed)
embeddings = embeddings.cpu().numpy()

print("Sequence length: ", len(sequence))
print("Per-residue embeddings shape:", embeddings.shape)
fuson_plm/training/model.py
ADDED
@@ -0,0 +1,119 @@
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel
import torch
import os

class FusOnTokenizer:
    """
    FusOnTokenizer class: a wrapper around AutoTokenizer
    """
    def __init__(self, pretrained_path='facebook/esm2_t33_650M_UR50D'):
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)

    def __getattr__(self, name):
        """
        Delegate attribute access to the underlying tokenizer.
        This allows calls like .tokenize(), .train(), and .eval() to be forwarded to the tokenizer.
        """
        return getattr(self.tokenizer, name)

    def __call__(self, *args, **kwargs):
        """
        Make the FusOnTokenizer object callable, delegating to the tokenizer's __call__ method.
        """
        return self.tokenizer(*args, **kwargs)

    def save_tokenizer(self, save_directory):
        self.tokenizer.save_pretrained(save_directory)

    def load_tokenizer(self, load_directory):
        self.tokenizer = AutoTokenizer.from_pretrained(load_directory)

class FusOnpLM:
    """
    FusOn-pLM class: a wrapper around AutoModelForMaskedLM
    """
    def __init__(self, pretrained_path='facebook/esm2_t33_650M_UR50D', ckpt_path=None, mlm_head=False):
        if not(ckpt_path is None):
            self.load_model(ckpt_path, mlm_head)
        else:
            # Load the pre-trained model and tokenizer
            self.model = AutoModelForMaskedLM.from_pretrained(pretrained_path)
            self.tokenizer = FusOnTokenizer(pretrained_path)

        self.n_layers = self.count_encoder_layers()

    def __getattr__(self, name):
        """
        Delegate attribute access to the underlying model.
        This allows calls like .to(), .train(), and .eval() to be forwarded to the model.
        """
        return getattr(self.model, name)

    def __call__(self, *args, **kwargs):
        """
        Make the FusOnpLM object callable, delegating to the model's __call__ method.
        """
        return self.model(*args, **kwargs)

    def freeze_model(self):
        """
        Freezes all parameters in the model
        """
        for param in self.model.parameters():
            param.requires_grad = False

    def unfreeze_last_n_layers(self, n_unfrozen_layers, unfreeze_query=True, unfreeze_key=True, unfreeze_value=True):
        """
        Unfreezes specific parts of the final n layers in the model's encoder.

        Args:
            n_unfrozen_layers (int): Number of final layers to unfreeze.
            unfreeze_query (bool): Whether to unfreeze the query projections. Default is True.
            unfreeze_key (bool): Whether to unfreeze the key projections. Default is True.
            unfreeze_value (bool): Whether to unfreeze the value projections. Default is True.
        """
        for i, layer in enumerate(self.model.esm.encoder.layer):
            if (self.n_layers - i) <= n_unfrozen_layers:  # Only the last n layers
                if unfreeze_query:
                    self._unfreeze_parameters(layer.attention.self.query)
                if unfreeze_key:
                    self._unfreeze_parameters(layer.attention.self.key)
                if unfreeze_value:
                    self._unfreeze_parameters(layer.attention.self.value)

    def _unfreeze_parameters(self, module):
        """
        Helper method to unfreeze parameters in a given module.

        Args:
            module (nn.Module): The module whose parameters are to be unfrozen.
        """
        for param in module.parameters():
            param.requires_grad = True

    def count_encoder_layers(self):
        """
        Count the number of encoder layers in the model.
        """
        return len(self.model.esm.encoder.layer)

    def save_model(self, save_directory, optimizer=None):
        # Save the model and tokenizer
        self.model.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

        # If an optimizer is provided, save its state dict
        if optimizer is not None:
            optimizer_path = os.path.join(save_directory, "optimizer.pt")
            torch.save(optimizer.state_dict(), optimizer_path)

    def load_model(self, load_directory, mlm_head):
        # Load a checkpoint of the model either with or without an MLM head
        if mlm_head:
            self.model = AutoModelForMaskedLM.from_pretrained(load_directory)
        else:
            # Load the model without the MLM head
            self.model = AutoModel.from_pretrained(load_directory)
        # Load the tokenizer from the same directory
        self.tokenizer = AutoTokenizer.from_pretrained(load_directory)
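A short usage sketch for the wrapper above, mirroring what `prepare_model` in `train.py` does; the layer count and flags follow the defaults in `config.py`.

```python
# Sketch: load ESM-2-650M through the FusOnpLM wrapper, freeze everything,
# then unfreeze the query/key/value projections of the last 8 encoder layers.
from fuson_plm.training.model import FusOnpLM

model = FusOnpLM()   # wraps AutoModelForMaskedLM and a FusOnTokenizer
model.freeze_model()
model.unfreeze_last_n_layers(8, unfreeze_query=True, unfreeze_key=True, unfreeze_value=True)

n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{n_trainable} trainable parameters across {model.n_layers} encoder layers")
```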
fuson_plm/training/test_esm2.py
ADDED
@@ -0,0 +1,122 @@
### Run ESM2 on the validation and test set. Get val and test losses.
import os
import fuson_plm.training.config as config
# Set the WANDB_API_KEY environment variable
os.environ['WANDB_API_KEY'] = config.WANDB_API_KEY
os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES

import torch
import tqdm
import numpy as np
import pandas as pd
import logging
from transformers import AutoModelForMaskedLM, AutoTokenizer
from fuson_plm.utils.logging import log_update, open_logfile, print_configpy
from fuson_plm.benchmarking.caid.utils import DisorderDataset, get_dataloader, check_dataloaders
from fuson_plm.training.utils import batch_sample_mask_tokens_with_probabilities, get_dataloader, check_dataloaders
from fuson_plm.training.train import test

def load_esm2_maskedlm(esm_type, device=None):
    """
    Loads ESM-2 of a specified version (e.g. esm2_t33_650M_UR50D)
    """
    # Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
    logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")

    model = AutoModelForMaskedLM.from_pretrained(f"facebook/{esm_type}")
    tokenizer = AutoTokenizer.from_pretrained(f"facebook/{esm_type}")

    model.to(device)
    model.eval()  # disables dropout for deterministic results

    return model, tokenizer, device


def val(model, tokenizer, val_loader, mask_percentage=0.15, device='cuda', checkpoint_dir='./checkpoints'):
    """
    Same method as test in train.py, just for running the val set
    """
    model.to(device)
    model.eval()
    total_val_loss = 0
    total_weighted_val_loss = 0
    total_val_masked_tokens = 0

    with torch.no_grad():  # No gradients needed
        # Loop over val data with a progress bar
        with tqdm.tqdm(enumerate(val_loader), total=len(val_loader), desc='Val Batch', leave=True, position=0) as tbar:
            for batch_idx, (inputs, prob) in tbar:
                # Move tensors
                inputs = {k: v.to(device) for k, v in inputs.items()}
                prob = prob.to(device)

                # Mask based on probability vectors
                masked_inputs = batch_sample_mask_tokens_with_probabilities(inputs, prob, tokenizer, mask_percentage=mask_percentage)

                # Forward pass
                outputs = model(**masked_inputs)
                val_loss = outputs.loss

                # Number of masked tokens
                num_masked_tokens = (masked_inputs["input_ids"] == tokenizer.mask_token_id).sum().item()

                # Loss calculations
                total_val_loss += val_loss.item()
                total_weighted_val_loss += val_loss.item() * num_masked_tokens  # Multiply loss by number of masked tokens
                total_val_masked_tokens += num_masked_tokens

    # Compute and log avg. loss and perplexity
    n_val_batches = len(val_loader)
    avg_val_loss = total_val_loss / n_val_batches
    avg_weighted_val_loss = total_weighted_val_loss / total_val_masked_tokens
    val_perplexity = np.exp(avg_weighted_val_loss)

    log_update(f"\nval results:\nTotal batches = {n_val_batches}, Total masked tokens = {total_val_masked_tokens}, Total Loss = {total_val_loss:.4f}, Avg Batch Loss = {avg_val_loss:.4f}, Avg Masked Token-Weighted Loss = {avg_weighted_val_loss:.4f}, Perplexity = {val_perplexity:.4f}")

    # Save to dataframe for plotting
    val_stats_df = pd.DataFrame(data={
        "total_val_loss": [total_val_loss], "weighted_val_loss": [total_weighted_val_loss],
        "avg_val_loss": [avg_val_loss], "avg_weighted_val_loss": [avg_weighted_val_loss],
        "val_perplexity": [val_perplexity]
    })
    val_stats_df.to_csv(f"{checkpoint_dir}/val_results.csv", index=False)  # overwrite old file no matter what; should only be one val eval

def main():
    # Load the ESM-2 model
    model, tokenizer, device = load_esm2_maskedlm("esm2_t33_650M_UR50D")

    checkpoint_dir = f"checkpoints/esm2_t33_650M_UR50D_{config.PROBABILITY_TYPE}_mask{config.MASK_PERCENTAGE}"
    os.makedirs(checkpoint_dir, exist_ok=True)

    with open_logfile(f"{checkpoint_dir}/evaluate_val_test_esm.txt"):
        # Print configurations
        print_configpy(config)

        ##### Validation
        val_loader = get_dataloader(config.VAL_PATH, tokenizer,
                                    probability_type=config.PROBABILITY_TYPE,
                                    batch_size=config.BATCH_SIZE,
                                    max_length=config.MAX_LENGTH, shuffle=False)

        # Validation
        val(model, tokenizer, val_loader, config.MASK_PERCENTAGE, device=device, checkpoint_dir=checkpoint_dir)


        ##### Test
        # Create dataloader
        test_loader = get_dataloader(config.TEST_PATH,
                                     tokenizer,
                                     probability_type=config.PROBABILITY_TYPE,
                                     batch_size=config.BATCH_SIZE,
                                     max_length=config.MAX_LENGTH, shuffle=False)

        # Test the model
        test(model, tokenizer, test_loader, config.MASK_PERCENTAGE, device=device, checkpoint_dir=checkpoint_dir)

if __name__ == "__main__":
    main()
fuson_plm/training/train.py
ADDED
@@ -0,0 +1,388 @@
'''
This is a training script for finetuning ESM.
I am going to freeze the parameters in the head and unfreeze the last N layers in the model.
'''

import os
import fuson_plm.training.config as config

# Set the WANDB_API_KEY environment variable
os.environ['WANDB_API_KEY'] = config.WANDB_API_KEY
os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES

import torch
import numpy as np
import pandas as pd
import tqdm
from datetime import datetime
import wandb
import pytz
import sys

from transformers import AdamW

from fuson_plm.utils.logging import print_configpy, get_local_time, open_logfile, open_errfile, log_update
from fuson_plm.training.model import FusOnpLM
from fuson_plm.training.utils import batch_sample_mask_tokens_with_probabilities, get_dataloader, check_dataloaders, get_mask_rate_scheduler
from fuson_plm.training.plot import make_train_val_test_bd_plot

def prepare_model(model, n_unfrozen_layers, unfreeze_query=True, unfreeze_key=True, unfreeze_value=True):
    # Log the model's initial state
    n_layers = model.count_encoder_layers()
    total_params = sum(p.numel() for p in model.parameters())
    total_head_params = sum(p.numel() for p in model.lm_head.parameters())
    log_update(f'\nInitial state:\n\tTotal number of layers in the model: {n_layers}')
    log_update(f'\tTotal parameters in the AutoModelforMaskedLM model: {total_params}')
    log_update(f'\tTotal parameters in the MLM Head ONLY: {total_head_params}')

    # Freeze the model to start
    model.freeze_model()
    n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log_update(f'Froze all {model.n_layers} model layers')
    log_update(f'\tTrainable params: {n_trainable_params}')

    # Unfreeze the last n layers
    model.unfreeze_last_n_layers(n_unfrozen_layers, unfreeze_query=unfreeze_query, unfreeze_key=unfreeze_key, unfreeze_value=unfreeze_value)
    n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    trainable_params = '\n\t\t'.join([name for name, param in model.named_parameters() if param.requires_grad])
    num_trainable_params_lm_head = sum(p.numel() for p in model.lm_head.parameters() if p.requires_grad)
    num_trainable_params_esm = sum(p.numel() for p in model.esm.parameters() if p.requires_grad)
    log_update(f'Unfroze final {n_unfrozen_layers} layers')
    log_update(f'\tTrainable params: {n_trainable_params}\n\t\t{trainable_params}')
    log_update(f"\tTrainable parameters in the lm_head: {num_trainable_params_lm_head}")
    log_update(f"\tTrainable params in the ESM part: {num_trainable_params_esm}")

def train(model, tokenizer, optimizer, train_loader, val_loader, n_epochs=10, start_epoch=1, mask_percentage=0.15, mask_rate_scheduler=None, device='cuda', checkpoint_dir='./checkpoints'):
    """
    Train the model
    """
    # Loop over epochs
    log_update("\n")

    for epoch in range(start_epoch, start_epoch + n_epochs):
        if mask_rate_scheduler is not None:
            mask_rate_scheduler.reset()  # resetting because we want to ramp it up again every epoch

        model.train()
        total_train_loss = 0
        total_weighted_train_loss = 0
        total_train_masked_tokens = 0

        log_update(f"Epoch {epoch}")
        # Loop over train data with progress bar
        with tqdm.tqdm(enumerate(train_loader), total=len(train_loader), desc='Training Batch', leave=True, position=0) as pbar:
            for batch_idx, (inputs, prob) in pbar:
                # Take a step with the mask rate scheduler, if there is one.
                masking_rate = mask_percentage
                if mask_rate_scheduler is not None:
                    mask_rate_scheduler.step()
                    masking_rate = mask_rate_scheduler.get_masking_rate()
                    log_update(f"\tBatch index: {batch_idx}\tMasking rate: {masking_rate:.5f}")

                # Move tensors
                inputs = {k: v.to(device) for k, v in inputs.items()}
                prob = prob.to(device)

                # Mask based on probability vectors
                masked_inputs = batch_sample_mask_tokens_with_probabilities(inputs, prob, tokenizer, mask_percentage=masking_rate)

                # Forward pass and update
                optimizer.zero_grad()
                outputs = model(**masked_inputs)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                # Number of masked tokens
                num_masked_tokens = (masked_inputs["input_ids"] == tokenizer.mask_token_id).sum().item()

                # Loss calculations and wandb log
                total_train_loss += loss.item()
                total_weighted_train_loss += loss.item() * num_masked_tokens  # Multiply loss by number of masked tokens
                total_train_masked_tokens += num_masked_tokens
                wandb.log({"batch_loss": loss.item()})

        # Save a checkpoint at the end of each epoch
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}')
        model.save_model(checkpoint_path, optimizer=optimizer)
        log_update(f'\nSaved checkpoint to {checkpoint_path}')

        # Calculate and log average training loss on wandb
        n_train_batches = len(train_loader)
        avg_train_loss = total_train_loss / n_train_batches
        avg_weighted_train_loss = total_weighted_train_loss / total_train_masked_tokens
        train_perplexity = np.exp(avg_weighted_train_loss)
        wandb.log({"epoch": epoch,
                   "total_train_loss": total_train_loss, "weighted_train_loss": total_weighted_train_loss,
                   "avg_train_loss": avg_train_loss, "avg_weighted_train_loss": avg_weighted_train_loss,
                   "train_perplexity": train_perplexity})

        # Track curve stats for easy re-plotting of training curves later
        train_stats_df = pd.DataFrame(data={
            "epoch": [epoch],
            "total_train_loss": [total_train_loss], "weighted_train_loss": [total_weighted_train_loss],
            "avg_train_loss": [avg_train_loss], "avg_weighted_train_loss": [avg_weighted_train_loss],
            "train_perplexity": [train_perplexity]
        })
        if os.path.exists(f"{checkpoint_dir}/train_curve.csv"):  # add to existing file if necessary
            train_stats_df.to_csv(f"{checkpoint_dir}/train_curve.csv", index=False, header=False, mode='a')
        else:  # make new file if necessary
            train_stats_df.to_csv(f"{checkpoint_dir}/train_curve.csv", index=False)

        # Validation loop
        model.eval()
        total_val_loss = 0
        total_weighted_val_loss = 0
        total_val_masked_tokens = 0

        with torch.no_grad():  # No gradients needed
            # Loop over val data with progress bar
            with tqdm.tqdm(enumerate(val_loader), total=len(val_loader), desc='Validation Batch', leave=True, position=0) as vbar:
                for batch_idx, (inputs, prob) in vbar:
                    # Move tensors
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    prob = prob.to(device)

                    # Mask based on probability vectors
                    ## FIXED 15% masking for the validation set
                    masked_inputs = batch_sample_mask_tokens_with_probabilities(inputs, prob, tokenizer, mask_percentage=0.15)

                    # Forward pass
                    outputs = model(**masked_inputs)
                    val_loss = outputs.loss

                    # Number of masked tokens
                    num_masked_tokens = (masked_inputs["input_ids"] == tokenizer.mask_token_id).sum().item()

                    # Loss calculations
                    total_val_loss += val_loss.item()
                    total_weighted_val_loss += val_loss.item() * num_masked_tokens  # Multiply loss by number of masked tokens
                    total_val_masked_tokens += num_masked_tokens

        # Calculate and log avg. loss and perplexity (wandb and locally)
        n_val_batches = len(val_loader)
        avg_val_loss = total_val_loss / n_val_batches  # avg per batch
        avg_weighted_val_loss = total_weighted_val_loss / total_val_masked_tokens  # avg per masked token
        val_perplexity = np.exp(avg_weighted_val_loss)
        wandb.log({"epoch": epoch,
                   "total_val_loss": total_val_loss, "weighted_val_loss": total_weighted_val_loss,
                   "avg_val_loss": avg_val_loss, "avg_weighted_val_loss": avg_weighted_val_loss,
                   "val_perplexity": val_perplexity})

        # Track curve stats for easy re-plotting of training curves later
        val_stats_df = pd.DataFrame(data={
            "epoch": [epoch],
            "total_val_loss": [total_val_loss], "weighted_val_loss": [total_weighted_val_loss],
            "avg_val_loss": [avg_val_loss], "avg_weighted_val_loss": [avg_weighted_val_loss],
            "val_perplexity": [val_perplexity]
        })
        if os.path.exists(f"{checkpoint_dir}/val_curve.csv"):  # add to existing file if necessary
            val_stats_df.to_csv(f"{checkpoint_dir}/val_curve.csv", index=False, header=False, mode='a')
        else:  # make new file if necessary
            val_stats_df.to_csv(f"{checkpoint_dir}/val_curve.csv", index=False)

        log_update(f"Epoch: {epoch}")
        log_update(f"\tTrain set: Total batches = {n_train_batches}, Total masked tokens = {total_train_masked_tokens}, Total Loss = {total_train_loss:.4f}, Avg Batch Loss = {avg_train_loss:.4f}, Avg Masked Token-Weighted Loss = {avg_weighted_train_loss:.4f}, Perplexity = {train_perplexity:.4f}")
        log_update(f"\tValidation set: Total batches = {n_val_batches}, Total masked tokens = {total_val_masked_tokens}, Total Loss = {total_val_loss:.4f}, Avg Batch Loss = {avg_val_loss:.4f}, Avg Masked Token-Weighted Loss = {avg_weighted_val_loss:.4f}, Perplexity = {val_perplexity:.4f}")

def test(model, tokenizer, test_loader, mask_percentage=0.15, device='cuda', checkpoint_dir='./checkpoints'):
    """
    Evaluate the model on the test set with a fixed 15% masking rate.
    """
    model.to(device)
    model.eval()
    total_test_loss = 0
    total_weighted_test_loss = 0
    total_test_masked_tokens = 0

    with torch.no_grad():  # No gradients needed
        # Loop over test data with a progress bar
        with tqdm.tqdm(enumerate(test_loader), total=len(test_loader), desc='Test Batch', leave=True, position=0) as tbar:
            for batch_idx, (inputs, prob) in tbar:
                # Move tensors
                inputs = {k: v.to(device) for k, v in inputs.items()}
                prob = prob.to(device)

                # Mask based on probability vectors
                ### FIXED 15% masking for the testing set
                masked_inputs = batch_sample_mask_tokens_with_probabilities(inputs, prob, tokenizer, mask_percentage=0.15)

                # Forward pass
                outputs = model(**masked_inputs)
                test_loss = outputs.loss

                # Number of masked tokens
                num_masked_tokens = (masked_inputs["input_ids"] == tokenizer.mask_token_id).sum().item()

                # Loss calculations
                total_test_loss += test_loss.item()
                total_weighted_test_loss += test_loss.item() * num_masked_tokens  # Multiply loss by number of masked tokens
                total_test_masked_tokens += num_masked_tokens

    # Compute and log avg. loss and perplexity
    n_test_batches = len(test_loader)
    avg_test_loss = total_test_loss / n_test_batches
    avg_weighted_test_loss = total_weighted_test_loss / total_test_masked_tokens
    test_perplexity = np.exp(avg_weighted_test_loss)

    log_update(f"\nTest results:\nTotal batches = {n_test_batches}, Total masked tokens = {total_test_masked_tokens}, Total Loss = {total_test_loss:.4f}, Avg Batch Loss = {avg_test_loss:.4f}, Avg Masked Token-Weighted Loss = {avg_weighted_test_loss:.4f}, Perplexity = {test_perplexity:.4f}")

    # Save to dataframe for plotting
    test_stats_df = pd.DataFrame(data={
        "total_test_loss": [total_test_loss], "weighted_test_loss": [total_weighted_test_loss],
        "avg_test_loss": [avg_test_loss], "avg_weighted_test_loss": [avg_weighted_test_loss],
        "test_perplexity": [test_perplexity]
    })
    test_stats_df.to_csv(f"{checkpoint_dir}/test_results.csv", index=False)  # overwrite old file no matter what; should only be one test eval

def check_env_variables():
    log_update("\nChecking on environment variables...")
    log_update(f"\tWANDB_API_KEY: {os.environ.get('WANDB_API_KEY')}")
    log_update(f"\tCUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
    log_update(f"\ttorch.cuda.device_count(): {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        log_update(f"\t\tDevice {i}: {torch.cuda.get_device_name(i)}")

def initialize_model_and_optimizer(finetune_from_scratch, device, path_to_starting_ckpt=None, learning_rate=1e-4, n_unfrozen_layers=0, unfreeze_query=False, unfreeze_key=False, unfreeze_value=False):
    """
    Initializes the model, either from ESM-2-650M if finetuning from scratch, or from a prior checkpoint if not finetuning from scratch.
    Also prepares the model (freezes/unfreezes the proper layers) and initializes the optimizer.

    Args:
        finetune_from_scratch (bool): True if finetuning from scratch. False if finetuning from a previous ckpt
        path_to_starting_ckpt (str): path to starting ckpt for finetuning (optional)
    """
    if not(finetune_from_scratch) and not(os.path.exists(path_to_starting_ckpt)):
        raise Exception(f"Error: could not find {path_to_starting_ckpt}. When finetuning from a prior checkpoint, you must provide a valid path to that checkpoint.")

    # if finetuning from scratch, initialize from scratch
    if finetune_from_scratch:
        log_update(f"\nInitializing FusOn-pLM model to be finetuned from scratch")
        model = FusOnpLM()  # because of __getattr__, we can use FusOnpLM() to get the model. It also contains the tokenizer.
        model.to(device)
        prepare_model(model, n_unfrozen_layers,
                      unfreeze_query=unfreeze_query, unfreeze_key=unfreeze_key, unfreeze_value=unfreeze_value)

        # Set the optimizer here, change it if we are finetuning from an old checkpoint
        optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

        return model, optimizer

    # if not, initialize from starting ckpt
    else:
        log_update(f"\nInitializing FusOn-pLM model to be finetuned from previous checkpoint: {path_to_starting_ckpt}")
        model = FusOnpLM(ckpt_path=path_to_starting_ckpt, mlm_head=True)
        model.to(device)
        prepare_model(model, n_unfrozen_layers,
                      unfreeze_query=unfreeze_query, unfreeze_key=unfreeze_key, unfreeze_value=unfreeze_value)

        log_update(f"Loading optimizer state_dict from previous checkpoint")
        optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()))
        optimizer.load_state_dict(torch.load(os.path.join(path_to_starting_ckpt, "optimizer.pt"), map_location=device))

        return model, optimizer

def main():
    # Set probability type to uniform; only option
    config.PROBABILITY_TYPE = "uniform"

    # Set run name (WANDB_NAME)
    kqv_tag = f"{'Q' if config.UNFREEZE_QUERY else ''}" + f"{'K' if config.UNFREEZE_KEY else ''}" + f"{'V' if config.UNFREEZE_VALUE else ''}"
    timestamp = get_local_time()
    # make a mask tag _mask{config.MASK_PERCENTAGE}
    mask_tag = f"mask{config.MASK_PERCENTAGE}"
    if config.VAR_MASK_RATE:  # if variable masking rate, change the tag to reflect this
        mask_tag = f"maskvar_{config.MASK_SCHEDULER}_low{config.MASK_LOW}_high{config.MASK_HIGH}"

    # Define the train settings string and wandb name from this
    TRAIN_SETTINGS_STRING = f"{config.PROBABILITY_TYPE}_{config.MAX_LENGTH}_ft_{config.N_UNFROZEN_LAYERS}layers_{kqv_tag}_b{config.BATCH_SIZE}_lr{config.LEARNING_RATE}_{mask_tag}"
    WANDB_NAME = f'{TRAIN_SETTINGS_STRING}-{timestamp}'

    # Create directory for model checkpoints
    checkpoint_dir = f'checkpoints/{WANDB_NAME}'
    start_epoch = 1

    # Determine if we're adding to an old log file or opening a new one
    logmode = 'w'

    # If we're finetuning from a checkpoint, save to the same folder instead, and keep track of which epoch to start on
    # Also, load the optimizer from here
    if not(config.FINETUNE_FROM_SCRATCH):
        logmode = 'a'
        path_to_starting_ckpt = config.PATH_TO_STARTING_CKPT
        checkpoint_dir = path_to_starting_ckpt[0:path_to_starting_ckpt.rindex('/')]
        START_MODEL_TRAIN_SETTINGS_STRING = checkpoint_dir[checkpoint_dir.index('checkpoints/')+len('checkpoints/'):checkpoint_dir.index('-')]
        start_epoch = int(path_to_starting_ckpt.split('/checkpoint_epoch_')[1])+1

    os.makedirs(f'checkpoints', exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Open log file
    LOG_PATH = f'{checkpoint_dir}/training_log.txt'
    ERR_PATH = f'{checkpoint_dir}/training_errors.txt'
    with open_logfile(LOG_PATH, mode=logmode), open_errfile(ERR_PATH, mode=logmode):
        if not(config.FINETUNE_FROM_SCRATCH):
            log_update(f"\n{'-'*200}\nResuming finetuning from checkpoint {start_epoch-1} (first new checkpoint: {start_epoch})\n")
            log_update(f"Settings tag for original model (starting point for finetuning) = {START_MODEL_TRAIN_SETTINGS_STRING}\nSettings tag for new model based on configs = {TRAIN_SETTINGS_STRING}\nSame: {START_MODEL_TRAIN_SETTINGS_STRING==TRAIN_SETTINGS_STRING}\n")
            # ONLY proceed with training if we're using the same settings, otherwise we are not finetuning the model we think we are!
            assert START_MODEL_TRAIN_SETTINGS_STRING==TRAIN_SETTINGS_STRING

        # Print configurations
        print_configpy(config)

        # Verify that the environment variables are set correctly
        check_env_variables()

        # Check CUDA availability and set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log_update(f"\nUsing device: {device}")

        # Init wandb
        wandb.init(project=config.WANDB_PROJECT, entity=config.WANDB_ENTITY, name=WANDB_NAME, config={
            "batch_size": config.BATCH_SIZE,
            "epochs": config.EPOCHS,
            "learning_rate": config.LEARNING_RATE,
        })

        # Initialize model and prepare it (freeze/unfreeze proper layers). Initialize optimizer as well. Details depend on whether we are finetuning from scratch.
        model, optimizer = initialize_model_and_optimizer(config.FINETUNE_FROM_SCRATCH, device,
                                                          path_to_starting_ckpt=config.PATH_TO_STARTING_CKPT,
                                                          learning_rate=config.LEARNING_RATE,
                                                          n_unfrozen_layers=config.N_UNFROZEN_LAYERS,
                                                          unfreeze_query=config.UNFREEZE_QUERY,
                                                          unfreeze_key=config.UNFREEZE_KEY,
                                                          unfreeze_value=config.UNFREEZE_VALUE)

        # Initialize the tokenizer (independent of starting model for finetuning)
        tokenizer = model.tokenizer

        # Create DataLoader instances and perform sanity checks on them
        train_loader = get_dataloader(config.TRAIN_PATH, tokenizer, probability_type=config.PROBABILITY_TYPE, batch_size=config.BATCH_SIZE, max_length=config.MAX_LENGTH, shuffle=True)  ## FOR DEBUGGING ONLY, change shuffle to False. Otherwise, True!!
        val_loader = get_dataloader(config.VAL_PATH, tokenizer, probability_type=config.PROBABILITY_TYPE, batch_size=config.BATCH_SIZE, max_length=config.MAX_LENGTH, shuffle=False)
        test_loader = get_dataloader(config.TEST_PATH, tokenizer, probability_type=config.PROBABILITY_TYPE, batch_size=config.BATCH_SIZE, max_length=config.MAX_LENGTH, shuffle=False)

        # If we're continuing to finetune an old ckpt, store the old batch diversity plot before we overwrite it
        check_dataloaders(train_loader, val_loader, test_loader, max_length=config.MAX_LENGTH, checkpoint_dir=checkpoint_dir)

        # Set up a masking rate scheduler, if one is needed
        mask_rate_scheduler = None
        if config.VAR_MASK_RATE:
            mask_rate_scheduler = get_mask_rate_scheduler(scheduler_type=config.MASK_SCHEDULER,
                                                          min_masking_rate=config.MASK_LOW,
                                                          max_masking_rate=config.MASK_HIGH,
                                                          total_batches=len(train_loader),
                                                          total_steps=config.MASK_STEPS)

        # Train the model
        train(model, tokenizer, optimizer, train_loader, val_loader,
              n_epochs=config.EPOCHS,
              start_epoch=start_epoch,
              device=device,
              mask_rate_scheduler=mask_rate_scheduler,
              mask_percentage=config.MASK_PERCENTAGE,
              checkpoint_dir=checkpoint_dir)

        # Test the model
        test(model, tokenizer, test_loader, mask_percentage=0.15, device=device, checkpoint_dir=checkpoint_dir)

if __name__ == "__main__":
    main()
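For clarity, the perplexity reported by `train` and `test` above is the exponential of the masked-token-weighted average cross-entropy (each batch loss is weighted by its number of masked tokens before averaging). A tiny numeric sketch with made-up values:

```python
import numpy as np

# per-batch (loss, num_masked_tokens) pairs -- illustrative values only
batches = [(2.10, 180), (1.95, 210), (2.30, 165)]

total_weighted_loss = sum(loss * n for loss, n in batches)
total_masked_tokens = sum(n for _, n in batches)
avg_weighted_loss = total_weighted_loss / total_masked_tokens
perplexity = np.exp(avg_weighted_loss)  # same aggregation train.py applies per epoch
print(f"avg masked-token-weighted loss = {avg_weighted_loss:.4f}, perplexity = {perplexity:.4f}")
```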
fuson_plm/training/utils.py
ADDED
@@ -0,0 +1,312 @@
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import torch
|
| 4 |
+
import os
|
| 5 |
+
from torch.nn.functional import softmax
|
| 6 |
+
from fuson_plm.utils.logging import log_update
|
| 7 |
+
from torch.utils.data import Dataset, DataLoader
|
| 8 |
+
from transformers import AutoTokenizer
|
| 9 |
+
from abc import ABC, abstractmethod
|
| 10 |
+
|
| 11 |
+
#----------------------------------------------------------------------------------------------------------------------------------------------------
|
| 12 |
+
#### Masking Rate Scheduler base class and sub classes
|
| 13 |
+
# abstract base class
|
| 14 |
+
class MaskingRateScheduler(ABC):
|
| 15 |
+
def __init__(self, total_steps, min_masking_rate, max_masking_rate, last_step=-1):
|
| 16 |
+
self.total_steps = total_steps
|
| 17 |
+
self.min_masking_rate = min_masking_rate
|
| 18 |
+
self.max_masking_rate = max_masking_rate
|
| 19 |
+
self.current_step = last_step
|
| 20 |
+
|
| 21 |
+
def step(self):
|
| 22 |
+
self.current_step += 1
|
| 23 |
+
|
| 24 |
+
def reset(self):
|
| 25 |
+
"""Reset the scheduler to its initial state."""
|
| 26 |
+
self.current_step = -1
|
| 27 |
+
|
| 28 |
+
def get_masking_rate(self):
|
| 29 |
+
progress = self.current_step / self.total_steps
|
| 30 |
+
return self.compute_masking_rate(progress)
|
| 31 |
+
|
| 32 |
+
@abstractmethod
|
| 33 |
+
def compute_masking_rate(self, progress):
|
| 34 |
+
"""To be implemented by subclasses for specific increase functions."""
|
| 35 |
+
raise NotImplementedError("Subclasses must implement this method.")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class CosineIncreaseMaskingRateScheduler(MaskingRateScheduler):
|
| 39 |
+
def compute_masking_rate(self, progress):
|
| 40 |
+
# Use a cosine increase function
|
| 41 |
+
cosine_increase = 0.5 * (1 - np.cos(np.pi * progress))
|
| 42 |
+
return self.min_masking_rate + (self.max_masking_rate - self.min_masking_rate) * cosine_increase
|
| 43 |
+
|
| 44 |
+
class LogLinearIncreaseMaskingRateScheduler(MaskingRateScheduler):
|
| 45 |
+
def compute_masking_rate(self, progress):
|
| 46 |
+
# Avoid log(0) by clamping progress to a minimum of a small positive number
|
| 47 |
+
progress = max(progress, 1e-10)
|
| 48 |
+
log_linear_increase = np.log1p(progress) / np.log1p(1) # Normalizing to keep range in [0, 1]
|
| 49 |
+
return self.min_masking_rate + (self.max_masking_rate - self.min_masking_rate) * log_linear_increase
|
| 50 |
+
|
| 51 |
+
class StepwiseIncreaseMaskingRateScheduler(MaskingRateScheduler):
|
| 52 |
+
def __init__(self, total_batches, min_masking_rate, max_masking_rate, num_steps):
|
| 53 |
+
super().__init__(total_steps=total_batches, min_masking_rate=min_masking_rate, max_masking_rate=max_masking_rate)
|
| 54 |
+
self.num_steps = num_steps
|
| 55 |
+
self.batch_interval = total_batches // (num_steps) # Adjusting to ensure max rate is included
|
| 56 |
+
self.rate_increment = (max_masking_rate - min_masking_rate) / (num_steps - 1) # Include end rate in the steps
|
| 57 |
+
|
| 58 |
+
def compute_masking_rate(self, progress):
|
| 59 |
+
# Determine the current step based on the number of completed batches
|
| 60 |
+
current_step = int(self.current_step / self.batch_interval)
|
| 61 |
+
# Cap the step number to `num_steps - 1` to include the max rate at the final step
|
| 62 |
+
current_step = min(current_step, self.num_steps - 1)
|
| 63 |
+
# Calculate the masking rate for the current step
|
| 64 |
+
masking_rate = self.min_masking_rate + current_step * self.rate_increment
|
| 65 |
+
return masking_rate
|
| 66 |
+
|
| 67 |
+
def get_mask_rate_scheduler(scheduler_type="cosine",min_masking_rate=0.15,max_masking_rate=0.40,total_batches=100,total_steps=20):
|
| 68 |
+
"""
|
| 69 |
+
Initialize the mask rate scheduler and return it
|
| 70 |
+
"""
|
| 71 |
+
if scheduler_type=="cosine":
|
| 72 |
+
return CosineIncreaseMaskingRateScheduler(total_steps=total_batches,
|
| 73 |
+
min_masking_rate=min_masking_rate,
|
| 74 |
+
max_masking_rate=max_masking_rate)
|
| 75 |
+
elif scheduler_type=="loglinear":
|
| 76 |
+
return LogLinearIncreaseMaskingRateScheduler(total_steps=total_batches,
|
| 77 |
+
min_masking_rate=min_masking_rate,
|
| 78 |
+
max_masking_rate=max_masking_rate)
|
| 79 |
+
elif scheduler_type=="stepwise":
|
| 80 |
+
return StepwiseIncreaseMaskingRateScheduler(total_batches=total_batches,
|
| 81 |
+
num_steps=total_steps,
|
| 82 |
+
min_masking_rate=min_masking_rate,
|
| 83 |
+
max_masking_rate=max_masking_rate)
|
| 84 |
+
else:
|
| 85 |
+
raise Exception("Must specify valid scheduler_type: cosine, loglinear, stepwise")
|
| 86 |
+
|
| 87 |
+
# Adjusted Dataloader for the sequences and probability vectors
|
| 88 |
+
class ProteinDataset(Dataset):
|
| 89 |
+
def __init__(self, data_path, tokenizer, probability_type, max_length=512):
|
| 90 |
+
self.dataframe = pd.read_csv(data_path)
|
| 91 |
+
self.tokenizer = tokenizer
|
| 92 |
+
self.probability_type=probability_type
|
| 93 |
+
self.max_length = max_length
|
| 94 |
+
|
| 95 |
+
self.set_probabilities()
|
| 96 |
+
|
| 97 |
+
def __len__(self):
|
| 98 |
+
return len(self.dataframe)
|
| 99 |
+
|
| 100 |
+
def set_probabilities(self):
|
| 101 |
+
if self.probability_type=="snp":
|
| 102 |
+
self.dataframe = self.dataframe.rename(columns={'snp_probabilities':'probabilities'})
|
| 103 |
+
if self.probability_type=="uniform":
|
| 104 |
+
self.dataframe['probabilities'] = self.dataframe['sequence'].apply(len).apply(lambda x: ('1,'*x)[0:-1])
|
| 105 |
+
|
| 106 |
+
# make probabilities into numbers if they aren't already
|
| 107 |
+
if type(self.dataframe['probabilities'][0]) == str:
|
| 108 |
+
self.dataframe['probabilities'] = self.dataframe['probabilities'].apply(
|
| 109 |
+
lambda x: np.array([float(i) for i in x.split(',')])
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
def get_padded_probabilities(self, idx):
|
| 113 |
+
'''
|
| 114 |
+
Pads probabilities to max_length if they're too short; truncate them if they're too long
|
| 115 |
+
'''
|
| 116 |
+
no_mask_value = int(-1e9) # will be used to make sure CLS and PAD aren't masked
|
| 117 |
+
|
| 118 |
+
# add a no-mask slot for <CLS>
|
| 119 |
+
prob = np.concatenate((
|
| 120 |
+
np.array([no_mask_value]),
|
| 121 |
+
self.dataframe.iloc[idx]['probabilities']
|
| 122 |
+
)
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
# Pad with no_mask_value for everything after the probability vector ends
|
| 126 |
+
if len(prob) < self.max_length:
|
| 127 |
+
return np.pad(
|
| 128 |
+
prob,
|
| 129 |
+
(0, self.max_length - len(prob)),
|
| 130 |
+
'constant', constant_values=(0,no_mask_value))
|
| 131 |
+
|
| 132 |
+
# If it's too long, we need to truncate, but we also need to change the last token to an <EOS>.
|
| 133 |
+
prob = prob[0:self.max_length-1]
|
| 134 |
+
prob = np.concatenate((
|
| 135 |
+
prob,
|
| 136 |
+
np.array([no_mask_value]),
|
| 137 |
+
)
|
| 138 |
+
)
|
| 139 |
+
return prob
|
| 140 |
+
|
| 141 |
+
def __getitem__(self, idx):
|
| 142 |
+
sequence = self.dataframe.iloc[idx]['sequence']
|
| 143 |
+
probability = self.get_padded_probabilities(idx) # extract them
|
| 144 |
+
inputs = self.tokenizer(sequence, return_tensors="pt", padding='max_length', truncation=True, max_length=self.max_length) # does this have to be 512?
|
| 145 |
+
inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()} # Remove batch dimension
|
| 146 |
+
return inputs, probability
|
| 147 |
+
|
| 148 |
+
def get_dataloader(data_path, tokenizer, probability_type='snp', max_length=512, batch_size=8, shuffle=True):
|
| 149 |
+
"""
|
| 150 |
+
Creates a DataLoader for the dataset.
|
| 151 |
+
Args:
|
| 152 |
+
data_path (str): Path to the CSV file (train, val, or test).
|
| 153 |
+
batch_size (int): Batch size.
|
| 154 |
+
shuffle (bool): Whether to shuffle the data.
|
| 155 |
+
tokenizer (Tokenizer): tokenizer object for data tokenization
|
| 156 |
+
Returns:
|
| 157 |
+
DataLoader: DataLoader object.
|
| 158 |
+
"""
|
| 159 |
+
dataset = ProteinDataset(data_path, tokenizer, probability_type, max_length=max_length)
|
| 160 |
+
return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
|
| 161 |
+
|
| 162 |
+
def check_dataloaders(train_loader, val_loader, test_loader, max_length=512, checkpoint_dir=''):
    log_update(f'\nBuilt train, validation, and test dataloaders')
    log_update(f"\tNumber of sequences in the Training DataLoader: {len(train_loader.dataset)}")
    log_update(f"\tNumber of sequences in the Validation DataLoader: {len(val_loader.dataset)}")
    log_update(f"\tNumber of sequences in the Test DataLoader: {len(test_loader.dataset)}")
    dataloader_overlaps = check_dataloader_overlap(train_loader, val_loader, test_loader)
    if len(dataloader_overlaps) == 0: log_update("\tDataloaders are clean (no overlaps)")
    else: log_update(f"\tWARNING! sequence overlap found: {','.join(dataloader_overlaps)}")

    # write per-batch sequence length ranges to a text file for each loader
    if not os.path.exists(f'{checkpoint_dir}/batch_diversity'):
        os.mkdir(f'{checkpoint_dir}/batch_diversity')

    max_length_violators = []
    for name, dataloader in {'train': train_loader, 'val': val_loader, 'test': test_loader}.items():
        max_length_followed, length_ranges = check_max_length_and_length_diversity(dataloader, max_length)
        if not max_length_followed:
            max_length_violators.append(name)

        with open(f'{checkpoint_dir}/batch_diversity/{name}_batch_length_ranges.txt', 'w') as f:
            for tup in length_ranges:
                f.write(f'{tup[0]}\t{tup[1]}\n')

    if len(max_length_violators) == 0: log_update(f"\tDataloaders follow the max length limit set by user: {max_length}")
    else: log_update(f"\tWARNING! these loaders have sequences longer than max length={max_length}: {','.join(max_length_violators)}")

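# Small sketch (assumption: checkpoint_dir is the same directory passed to check_dataloaders above)
# showing how to read back one of the batch_diversity files it writes; each line holds
# "<min_length>\t<max_length>" for one batch. `_demo_read_length_ranges` is a hypothetical helper
# for inspection only.
def _demo_read_length_ranges(checkpoint_dir, split='train'):
    ranges = []
    with open(f'{checkpoint_dir}/batch_diversity/{split}_batch_length_ranges.txt') as f:
        for line in f:
            lo, hi = line.strip().split('\t')
            ranges.append((int(lo), int(hi)))
    return ranges  # e.g. [(34, 512), (51, 498), ...]
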
def check_dataloader_overlap(train_loader, val_loader, test_loader):
    """
    Check the data that's about to go into the model. Make sure there is no overlap between train, test, and val.

    Returns:
        list: empty if the splits are disjoint; otherwise one description string per overlapping pair.
    """
    train_protein_seqs = set(train_loader.dataset.dataframe['sequence'].unique())
    val_protein_seqs = set(val_loader.dataset.dataframe['sequence'].unique())
    test_protein_seqs = set(test_loader.dataset.dataframe['sequence'].unique())

    tr_va = len(train_protein_seqs.intersection(val_protein_seqs))
    tr_te = len(train_protein_seqs.intersection(test_protein_seqs))
    va_te = len(val_protein_seqs.intersection(test_protein_seqs))

    overlaps = []
    if tr_va == tr_te == va_te == 0:
        return overlaps  # data is clean
    else:
        if tr_va > 0: overlaps.append(f"Train-Val Overlap={tr_va}")
        if tr_te > 0: overlaps.append(f"Train-Test Overlap={tr_te}")
        if va_te > 0: overlaps.append(f"Val-Test Overlap={va_te}")
        return overlaps

def check_max_length_and_length_diversity(dataloader, max_length):
    """
    Check if all sequences in the DataLoader conform to the specified max_length,
    and return the sequence length ranges within each batch.

    Args:
        dataloader (DataLoader): The DataLoader object to check.
        max_length (int): The maximum allowed sequence length.

    Returns:
        bool: True if all sequences are within the max_length, False otherwise.
        list: A list of tuples representing the min and max sequence lengths in each batch.
    """
    length_ranges = []
    all_within_max_length = True

    for batch_idx, (inputs, _) in enumerate(dataloader):
        input_ids = inputs['input_ids']

        # Calculate the actual (non-padding) lengths of sequences in this batch
        actual_lengths = (input_ids != dataloader.dataset.tokenizer.pad_token_id).sum(dim=1)
        min_length = actual_lengths.min().item()
        max_length_in_batch = actual_lengths.max().item()

        # Check for max length violation
        if max_length_in_batch > max_length:
            all_within_max_length = False

        # Store the length range for this batch
        length_ranges.append((min_length, max_length_in_batch))

    return all_within_max_length, length_ranges

def check_max_length_in_dataloader(dataloader, max_length):
    """
    Check if all sequences in the DataLoader conform to the specified max_length.

    Args:
        dataloader (DataLoader): The DataLoader object to check.
        max_length (int): The maximum allowed sequence length.

    Returns:
        bool: True if all sequences are within the max_length, False otherwise.
    """
    for batch_idx, (inputs, _) in enumerate(dataloader):
        input_ids = inputs['input_ids']

        # Check if any sequence length exceeds max_length
        if input_ids.size(1) > max_length:
            return False

    return True

def batch_sample_mask_tokens_with_probabilities(inputs, probabilities, tokenizer: AutoTokenizer, mask_percentage=0.15):
    """
    Mask a batch of tokenized sequences for MLM training, drawing the positions to mask from
    per-position probability vectors instead of uniformly at random.

    Args:
        inputs (dict): batch from the DataLoader with 'input_ids' and 'attention_mask' (batch_size x max_length).
        probabilities (Tensor): per-position masking weights from ProteinDataset; special and padding tokens
            hold a large negative value, so softmax sends their masking probability to ~0.
        tokenizer (AutoTokenizer): tokenizer whose mask token replaces the selected positions.
        mask_percentage (float): fraction of real residues to mask in each sequence.

    Returns:
        dict: inputs with masked 'input_ids', an updated 'attention_mask', and MLM 'labels'
        (-100 everywhere except the masked positions, which hold the original token ids).
    """
    labels = inputs["input_ids"].detach().clone()
    labels[labels != tokenizer.mask_token_id] = -100  # Set labels for unmasked tokens to -100

    # Iterate over each sequence and its corresponding probabilities in the batch
    for idx in range(inputs["input_ids"].size(0)):  # Assuming the first dimension is batch size
        input_ids = inputs["input_ids"][idx]
        prob = probabilities[idx]

        cls_token_index = (input_ids == 0).nonzero(as_tuple=False)[0].item()  # 0 is <cls> in the ESM-2 vocabulary
        eos_token_index = (input_ids == 2).nonzero(as_tuple=False)[0].item()  # 2 is <eos> in the ESM-2 vocabulary
        seq_length = eos_token_index - (cls_token_index + 1)                  # number of real residues

        assert prob.shape[0] == input_ids.shape[0]

        # Normalize probabilities using softmax; the -1e9 entries for special/pad tokens become ~0
        prob = softmax(prob, dim=0).cpu().numpy()  # move to CPU for numpy
        assert abs(1 - sum(prob)) < 1e-6

        # Calculate the number of tokens to mask
        num_tokens_to_mask = int(mask_percentage * seq_length)

        # Choose indices to mask based on the probability distribution
        mask_indices = np.random.choice(input_ids.shape[0], size=num_tokens_to_mask, replace=False, p=prob)
        attention_mask_1_indices = np.arange(0, eos_token_index + 1, 1)

        # Mask the selected indices and set the corresponding labels
        labels[idx, mask_indices] = input_ids[mask_indices].detach().clone()
        input_ids[mask_indices] = tokenizer.mask_token_id

        # Attention is 1 from <cls> through <eos> and 0 over the padding that follows
        inputs["attention_mask"][idx] = torch.zeros_like(input_ids)
        inputs["attention_mask"][idx][attention_mask_1_indices] = 1

        # Update the input_ids in the inputs dictionary
        inputs["input_ids"][idx] = input_ids

    inputs["labels"] = labels
    return inputs
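
# Hedged sketch (not in the original file) of how a batch flows through the masking function during
# training: mask with the probability-weighted sampler above, then hand the batch to an ESM-2
# masked-LM head to get the cross-entropy loss over the masked positions only. The checkpoint name
# and the fixed 15% rate are illustrative assumptions; `_demo_masked_lm_step` is a hypothetical helper.
def _demo_masked_lm_step(batch, tokenizer, device="cpu"):
    from transformers import EsmForMaskedLM  # illustration only

    model = EsmForMaskedLM.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)

    inputs, probabilities = batch  # as yielded by get_dataloader
    inputs = batch_sample_mask_tokens_with_probabilities(inputs, probabilities, tokenizer, mask_percentage=0.15)

    outputs = model(input_ids=inputs["input_ids"].to(device),
                    attention_mask=inputs["attention_mask"].to(device),
                    labels=inputs["labels"].to(device))
    return outputs.loss  # MLM loss; positions labeled -100 are ignored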