JFLa committed on
Commit
f22cc6f
·
verified ·
1 Parent(s): a8f93e1

Upload 14 files

Browse files

Geneformer backbone by Theodoris et al.

geneformer/.DS_Store ADDED
Binary file (6.15 kB). View file
 
geneformer/Dataset_Create.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/test subsets of Genecorpus-30M and matching length files for Geneformer pretraining."""

import datetime
import os
import pickle
import random
import subprocess
import sys
import time

# NCCL / MPI environment configuration; set before the heavy third-party
# imports so it is in place before any distributed backend initializes.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["OMPI_MCA_opal_cuda_support"] = "true"
os.environ["CONDA_OVERRIDE_GLIBC"] = "2.56"

from typing import List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pytz
import seaborn as sns
import torch
import torch.nn.functional as F
from datasets import Dataset, load_from_disk
from torch import Tensor
# Fix: tqdm.notebook renders broken widget output outside Jupyter;
# tqdm.auto picks the correct frontend for notebooks and plain scripts alike.
from tqdm.auto import tqdm
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertModel,
    BertPreTrainedModel,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)
from transformers.activations import ACT2FN
from transformers.modeling_outputs import MaskedLMOutput
from transformers.models.bert.modeling_bert import (
    BertLMPredictionHead,
    BertOnlyMLMHead,
    BertPredictionHeadTransform,
)

from geneformer import GeneformerPretrainer
from geneformer.pretrainer import token_dictionary

# Randomly select `subset_size` sequences from Genecorpus to conduct the
# training; the last `test_size` of the shuffled subset form the test split.
genecorpus = load_from_disk("/ibex/user/chenj0i/Geneformer/Genecorpus-30M/genecorpus_30M_2048.dataset")

subset_size = 1_200_000
test_size = 200_000
train_size = subset_size - test_size

subset_sequences = genecorpus.shuffle(seed=42).select(tqdm(range(subset_size)))['input_ids']
subset_train_dataset = Dataset.from_dict({"input_ids": subset_sequences[:-test_size]})
subset_train_dataset.save_to_disk("/ibex/user/chenj0i/Geneformer/subset_1Mtrain_genecorpus.dataset")
subset_test_dataset = Dataset.from_dict({"input_ids": subset_sequences[-test_size:]})
subset_test_dataset.save_to_disk("/ibex/user/chenj0i/Geneformer/subset_200K_1Mtrain_genecorpus.dataset")


def _write_length_file(path, num_sequences, sequence_length=2048):
    """Pickle a list of `num_sequences` per-example lengths to `path`.

    NOTE(review): this assumes every example is exactly `sequence_length`
    tokens; if shorter cells exist in the subset, these lengths are only an
    upper bound — confirm against GeneformerPretrainer's expectations.
    """
    with open(path, "wb") as f:
        pickle.dump([sequence_length] * num_sequences, f)
    # Fix: the original reported `subset_size` (1,200,000) for both files,
    # although the train file holds 1,000,000 entries and the test file 200,000.
    print(f"List with {num_sequences} elements saved as {path}")


# Create length files for the training and test splits.
_write_length_file("sub_1Mtrain_genecorpus_30M_2048_lengths.pkl", train_size)
_write_length_file("sub_200K_1Mtrain_genecorpus_30M_2048_lengths.pkl", test_size)
geneformer/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from . import tokenizer
2
+ from . import pretrainer
3
+ from . import collator_for_classification
4
+ from . import in_silico_perturber
5
+ from . import in_silico_perturber_stats
6
+ from .tokenizer import TranscriptomeTokenizer
7
+ from .pretrainer import GeneformerPretrainer
8
+ from .collator_for_classification import DataCollatorForGeneClassification
9
+ from .collator_for_classification import DataCollatorForCellClassification
10
+ from .emb_extractor import EmbExtractor
11
+ from .in_silico_perturber import InSilicoPerturber
12
+ from .in_silico_perturber_stats import InSilicoPerturberStats
geneformer/collator_for_classification.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Geneformer collator for gene and cell classification.
3
+
4
+ Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
5
+ """
6
+ import numpy as np
7
+ import torch
8
+ import warnings
9
+ from enum import Enum
10
+ from typing import Dict, List, Optional, Union
11
+
12
+ from transformers import (
13
+ DataCollatorForTokenClassification,
14
+ SpecialTokensMixin,
15
+ BatchEncoding,
16
+ )
17
+ from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
18
+ from transformers.utils.generic import _is_tensorflow, _is_torch
19
+
20
+ from .pretrainer import token_dictionary
21
+
22
# Type alias: a single tokenized sequence, represented as a list of token ids.
EncodedInput = List[int]
logger = logging.get_logger(__name__)
VERY_LARGE_INTEGER = int(
    1e30
)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(
    1e20
)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# precollator functions
32
+
33
class ExplicitEnum(Enum):
    """Enum whose failed value lookups raise a descriptive ``ValueError``.

    Instead of the default terse enum error, the message lists the valid
    member values so callers can immediately see what to pass.
    """

    @classmethod
    def _missing_(cls, value):
        # Build the message in steps for readability; the format string and
        # resulting text are identical to the upstream HuggingFace wording.
        valid_values = str(list(cls._value2member_map_.keys()))
        message = "%r is not a valid %s, please select one of %s" % (
            value,
            cls.__name__,
            valid_values,
        )
        raise ValueError(message)
44
+
45
class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    ONLY_FIRST = "only_first"  # truncate only the first sequence of a pair
    ONLY_SECOND = "only_second"  # truncate only the second sequence of a pair
    LONGEST_FIRST = "longest_first"  # iteratively trim whichever sequence is currently longer
    DO_NOT_TRUNCATE = "do_not_truncate"  # leave sequences untouched
55
+
56
+
57
+
58
class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
    in an IDE.
    """

    LONGEST = "longest"  # pad to the longest sequence in the batch
    MAX_LENGTH = "max_length"  # pad to `max_length` (or the model max length)
    DO_NOT_PAD = "do_not_pad"  # no padding applied
67
+
68
+
69
+
70
class TensorType(ExplicitEnum):
    """
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    PYTORCH = "pt"  # torch.Tensor
    TENSORFLOW = "tf"  # tf.constant
    NUMPY = "np"  # np.ndarray
    JAX = "jax"  # jax arrays
80
+
81
+
82
class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
    """
    Minimal tokenizer-like object used by the classification collators below.

    Padding logic adapted from the HuggingFace ``PreTrainedTokenizerBase``,
    modified for single-cell transcriptomics data: ``pad``/``_pad`` take an
    extra ``class_type`` argument ("gene" or "cell"). Gene-level labels are
    padded with -100 alongside the inputs; cell-level labels are dropped here
    and re-attached downstream by ``DataCollatorForCellClassification``.
    """

    # Special tokens and their ids come from the project-level token_dictionary.
    # NOTE(review): if "<mask>"/"<pad>" are missing from the dictionary these
    # ids are None — confirm the dictionary always defines them.
    mask_token = "<mask>"
    mask_token_id = token_dictionary.get("<mask>")
    pad_token = "<pad>"
    pad_token_id = token_dictionary.get("<pad>")
    padding_side = "right"
    all_special_ids = [
        token_dictionary.get("<mask>"),
        token_dictionary.get("<pad>")
    ]
    model_input_names = ["input_ids"]

    def _get_padding_truncation_strategies(
        self, padding=True, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.
        """
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is False:
            if verbose:
                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, "
                        "please use `truncation=True` to explicitly truncate examples to max length. "
                        "Defaulting to 'longest_first' truncation strategy. "
                        "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                        "more precisely by providing a specific strategy to `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        if padding is False and old_pad_to_max_length:
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning,
                )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is False and old_truncation_strategy != "do_not_truncate":
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
                    "use `truncation=True` to truncate examples to a max length. You can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
                    "maximal input size of the model (e.g. 512 for Bert). "
                    " If you have pairs of inputs, you can give a specific truncation strategy selected among "
                    "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
                    "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
                    "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
                    FutureWarning,
                )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no truncation."
                            )
                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                f"Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        class_type,  # options: "gene" or "cell"
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
        ``self.pad_token_id`` and ``self.pad_token_type_id``)

        .. note::

            If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
            result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
            case of PyTorch tensors, you will lose the specific device of your tensors however.

        Args:
            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
                well as in a PyTorch Dataloader collate function.

                Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                see the note above for the return type.
            class_type:
                Either ``"gene"`` or ``"cell"``; controls label handling (see class docstring).
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
        """
        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has to be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method"
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if not required_input:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            # NOTE(review): inherited from upstream — this while loop can raise
            # IndexError if every sequence in the batch is empty; confirm that
            # case cannot occur with tokenized cell data.
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_tf_available() and _is_tensorflow(first_element):
                return_tensors = "tf" if return_tensors is None else return_tensors
            elif is_torch_available() and _is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    f"Should be one of a python, numpy, pytorch or tensorflow object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)

        # Convert padding_strategy in PaddingStrategy
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )

        required_input = encoded_inputs[self.model_input_names[0]]
        # Single (un-batched) input: pad it directly and return.
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                class_type=class_type,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        # Pad each example independently, then regroup per key.
        batch_outputs = {}
        for i in range(batch_size):
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
            outputs = self._pad(
                inputs,
                class_type=class_type,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)
        # Cell-level labels are not padded here; DataCollatorForCellClassification
        # re-attaches them from the raw features after padding.
        if class_type == "cell":
            del batch_outputs["label"]
        return BatchEncoding(batch_outputs, tensor_type=return_tensors)

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        class_type,  # options: "gene" or "cell"
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.LONGEST,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = True,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round max_length up to the next multiple of pad_to_multiple_of.
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
                # Gene-level labels are padded with -100 so loss functions ignore pad positions.
                if class_type == "gene":
                    encoded_inputs["labels"] = encoded_inputs["labels"] + [-100] * difference
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                        "token_type_ids"
                    ]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
                if class_type == "gene":
                    encoded_inputs["labels"] = [-100] * difference + encoded_inputs["labels"]
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
        elif return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        return encoded_inputs

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        assert already_has_special_tokens and token_ids_1 is None, (
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument."
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

        all_special_ids = self.all_special_ids  # cache the property

        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

        return special_tokens_mask

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.
        Args:
            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
        Returns:
            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        # Returns None both for a None token and for tokens absent from the
        # dictionary (dict.get default).
        if token is None:
            return None

        return token_dictionary.get(token)

    def __len__(self):
        # Vocabulary size as seen by models built on this tokenizer.
        return len(token_dictionary)
520
+
521
+
522
# collator functions

class DataCollatorForGeneClassification(DataCollatorForTokenClassification):
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.
    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
    """

    tokenizer = PrecollatorForGeneAndCellClassification()
    class_type = "gene"
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100

    def __init__(self, *args, **kwargs) -> None:
        # Forward the class-level defaults to the HF parent collator so the
        # custom precollator above is always used for padding.
        super().__init__(
            tokenizer=self.tokenizer,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            label_pad_token_id=self.label_pad_token_id,
            *args, **kwargs)

    def _prepare_batch(self, features):
        """Pad a list of feature dicts into one batch of PyTorch tensors."""
        # Fix: removed the unused `label_name`/`labels` locals that were
        # computed here and never used. Label handling happens inside
        # `self.tokenizer.pad` (gene labels, padded with -100) or in
        # DataCollatorForCellClassification (cell labels).
        batch = self.tokenizer.pad(
            features,
            class_type=self.class_type,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        return batch

    def __call__(self, features):
        batch = self._prepare_batch(features)

        # Fix: the batch values are already torch tensors (return_tensors="pt"),
        # so torch.tensor(v, ...) emitted a copy-construct UserWarning and
        # forced a copy; torch.as_tensor coerces dtype without the warning and
        # avoids the copy when the dtype already matches.
        batch = {k: torch.as_tensor(v, dtype=torch.int64) for k, v in batch.items()}
        return batch
583
+
584
+
585
class DataCollatorForCellClassification(DataCollatorForGeneClassification):
    """Collator for cell-level classification: pads inputs via the parent,
    then attaches one label per cell from the raw features."""

    class_type = "cell"

    def _prepare_batch(self, features):
        """Pad the batch, then rebuild ``batch["labels"]`` from the features.

        The label dtype is inferred from the first example: integer labels
        become ``torch.long``, anything else becomes ``torch.float``.
        """
        batch = super()._prepare_batch(features)

        sample = features[0]
        if "label" not in sample or sample["label"] is None:
            return batch

        # Probe the first label to choose the tensor dtype; unwrap a 0-dim
        # tensor so the isinstance(int) check sees a plain Python value.
        probe = sample["label"]
        if isinstance(probe, torch.Tensor):
            probe = probe.item()
        label_dtype = torch.long if isinstance(probe, int) else torch.float

        batch["labels"] = torch.tensor(
            [example["label"] for example in features], dtype=label_dtype
        )
        return batch
geneformer/emb_extractor.py ADDED
@@ -0,0 +1,806 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Geneformer embedding extractor.
3
+
4
+ **Description:**
5
+
6
+ | Extracts gene or cell embeddings.
7
+ | Plots cell embeddings as heatmaps or UMAPs.
8
+ | Generates cell state embedding dictionary for use with InSilicoPerturber.
9
+
10
+ """
11
+
12
+ # imports
13
+ import logging
14
+ import pickle
15
+ from collections import Counter
16
+ from pathlib import Path
17
+
18
+ import anndata
19
+ import matplotlib.pyplot as plt
20
+ import numpy as np
21
+ import pandas as pd
22
+ import scanpy as sc
23
+ import seaborn as sns
24
+ import torch
25
+ from tdigest import TDigest
26
+ from tqdm.auto import trange
27
+
28
+ from . import perturber_utils as pu
29
+ from .tokenizer import TOKEN_DICTIONARY_FILE
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
# extract embeddings
def get_embs(
    model,
    filtered_input_data,
    emb_mode,
    layer_to_quant,
    pad_token_id,
    forward_batch_size,
    summary_stat=None,
    silent=False,
):
    """Run the model over a tokenized dataset and collect embeddings.

    emb_mode "cell": mean-pools gene embeddings per cell.
    emb_mode "gene": keeps per-gene contextual embeddings.
    If summary_stat is "mean"/"median", embeddings are folded into TDigest
    sketches instead of being stacked, trading exactness for memory.
    Requires a CUDA device (inputs are moved with .to("cuda")).
    """
    model_input_size = pu.get_model_input_size(model)
    total_batch_length = len(filtered_input_data)

    if summary_stat is None:
        embs_list = []
    elif summary_stat is not None:
        # test embedding extraction for example cell and extract # emb dims
        example = filtered_input_data.select([i for i in range(1)])
        example.set_format(type="torch")
        emb_dims = test_emb(model, example["input_ids"], layer_to_quant)
        if emb_mode == "cell":
            # initiate tdigests for # of emb dims
            embs_tdigests = [TDigest() for _ in range(emb_dims)]
        if emb_mode == "gene":
            gene_set = list(
                {
                    element
                    for sublist in filtered_input_data["input_ids"]
                    for element in sublist
                }
            )
            # initiate dict with genes as keys and tdigests for # of emb dims as values
            embs_tdigests_dict = {
                k: [TDigest() for _ in range(emb_dims)] for k in gene_set
            }

    overall_max_len = 0

    for i in trange(0, total_batch_length, forward_batch_size, leave=(not silent)):
        max_range = min(i + forward_batch_size, total_batch_length)

        minibatch = filtered_input_data.select([i for i in range(i, max_range)])

        # pad the minibatch only to its own longest sequence
        max_len = int(max(minibatch["length"]))
        original_lens = torch.tensor(minibatch["length"], device="cuda")
        minibatch.set_format(type="torch")

        input_data_minibatch = minibatch["input_ids"]
        input_data_minibatch = pu.pad_tensor_list(
            input_data_minibatch, max_len, pad_token_id, model_input_size
        )

        with torch.no_grad():
            outputs = model(
                input_ids=input_data_minibatch.to("cuda"),
                attention_mask=pu.gen_attention_mask(minibatch),
            )

        # hidden_states[layer_to_quant]: (batch, seq_len, emb_dim)
        embs_i = outputs.hidden_states[layer_to_quant]

        if emb_mode == "cell":
            # mean over non-padding positions -> one embedding per cell
            mean_embs = pu.mean_nonpadding_embs(embs_i, original_lens)
            if summary_stat is None:
                embs_list.append(mean_embs)
            elif summary_stat is not None:
                # update tdigests with current batch for each emb dim
                accumulate_tdigests(embs_tdigests, mean_embs, emb_dims)
                del mean_embs
        elif emb_mode == "gene":
            if summary_stat is None:
                embs_list.append(embs_i)
            elif summary_stat is not None:
                for h in trange(len(minibatch)):
                    length_h = minibatch[h]["length"]
                    input_ids_h = minibatch[h]["input_ids"][0:length_h]

                    # double check dimensions before unsqueezing
                    embs_i_dim = embs_i.dim()
                    if embs_i_dim != 3:
                        logger.error(
                            f"Embedding tensor should have 3 dimensions, not {embs_i_dim}"
                        )
                        # NOTE(review): bare `raise` outside an except block
                        # raises RuntimeError, not a descriptive error —
                        # consider raising ValueError explicitly.
                        raise

                    embs_h = embs_i[h, :, :].unsqueeze(dim=1)
                    dict_h = dict(zip(input_ids_h, embs_h))
                    for k in dict_h.keys():
                        accumulate_tdigests(
                            embs_tdigests_dict[int(k)], dict_h[k], emb_dims
                        )

        overall_max_len = max(overall_max_len, max_len)
        # free GPU memory before the next minibatch
        del outputs
        del minibatch
        del input_data_minibatch
        del embs_i

        torch.cuda.empty_cache()

    if summary_stat is None:
        if emb_mode == "cell":
            embs_stack = torch.cat(embs_list, dim=0)
        elif emb_mode == "gene":
            # re-pad every per-batch tensor to the overall longest sequence
            embs_stack = pu.pad_tensor_list(
                embs_list,
                overall_max_len,
                pad_token_id,
                model_input_size,
                1,
                pu.pad_3d_tensor,
            )

    # calculate summary stat embs from approximated tdigests
    elif summary_stat is not None:
        if emb_mode == "cell":
            if summary_stat == "mean":
                summary_emb_list = tdigest_mean(embs_tdigests, emb_dims)
            elif summary_stat == "median":
                summary_emb_list = tdigest_median(embs_tdigests, emb_dims)
            embs_stack = torch.tensor(summary_emb_list)
        elif emb_mode == "gene":
            if summary_stat == "mean":
                [
                    update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims)
                    for gene in embs_tdigests_dict.keys()
                ]
            elif summary_stat == "median":
                [
                    update_tdigest_dict_median(embs_tdigests_dict, gene, emb_dims)
                    for gene in embs_tdigests_dict.keys()
                ]
            return embs_tdigests_dict

    return embs_stack
169
+
170
+
171
def accumulate_tdigests(embs_tdigests, mean_embs, emb_dims):
    """Serially fold a batch of embeddings into per-dimension TDigests.

    Parameters
    ----------
    embs_tdigests : list
        One TDigest per embedding dimension.
    mean_embs : torch.Tensor
        (batch, emb_dims) tensor of embeddings to accumulate.
    emb_dims : int
        Number of embedding dimensions to update.

    Returns
    -------
    list
        The (mutated) ``embs_tdigests`` list, so callers that rebind the
        return value (e.g. ``update_tdigest_dict``) keep a valid list
        instead of ``None``.
    """
    # note: tdigest batch update known to be slow so updating serially;
    # a plain loop replaces the original side-effect-only comprehension.
    for row in range(mean_embs.size(0)):
        for dim in range(emb_dims):
            embs_tdigests[dim].update(mean_embs[row, dim].item())
    return embs_tdigests
178
+
179
+
180
def update_tdigest_dict(embs_tdigests_dict, gene, gene_embs, emb_dims):
    """Fold one gene's contextual embeddings into its TDigest accumulators.

    Bug fix: the original rebound ``embs_tdigests_dict[gene]`` to the
    return value of ``accumulate_tdigests``, which was ``None`` — wiping
    out the digest list on the first update. The digests are mutated in
    place, so no reassignment is needed.
    """
    accumulate_tdigests(embs_tdigests_dict[gene], gene_embs, emb_dims)
184
+
185
+
186
def update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims):
    """Replace a gene's digest list with its per-dimension trimmed means."""
    summarized = tdigest_mean(embs_tdigests_dict[gene], emb_dims)
    embs_tdigests_dict[gene] = summarized
188
+
189
+
190
def update_tdigest_dict_median(embs_tdigests_dict, gene, emb_dims):
    """Replace a gene's digest list with its per-dimension medians."""
    summarized = tdigest_median(embs_tdigests_dict[gene], emb_dims)
    embs_tdigests_dict[gene] = summarized
192
+
193
+
194
def summarize_gene_embs(h, minibatch, embs_i, embs_tdigests_dict, emb_dims):
    """Accumulate the h-th cell's per-gene embeddings into the digest dict.

    Parameters
    ----------
    h : int
        Index of the cell within the minibatch.
    minibatch : dataset slice
        Provides ``length`` and ``input_ids`` for cell ``h``.
    embs_i : torch.Tensor
        (batch, seq_len, emb_dim) embeddings for the minibatch.
    embs_tdigests_dict : dict
        token id -> list of TDigests, updated in place.
    emb_dims : int
        Number of embedding dimensions.
    """
    length_h = minibatch[h]["length"]
    input_ids_h = minibatch[h]["input_ids"][0:length_h]
    # (seq_len, 1, emb_dim) so each gene's embedding keeps a batch axis
    embs_h = embs_i[h, :, :].unsqueeze(dim=1)
    gene_to_emb = dict(zip(input_ids_h, embs_h))
    # Plain loop instead of the original side-effect-only list comprehension,
    # which allocated a throwaway list of Nones.
    for token in gene_to_emb:
        update_tdigest_dict(embs_tdigests_dict, token, gene_to_emb[token], emb_dims)
203
+
204
+
205
def tdigest_mean(embs_tdigests, emb_dims):
    """Per-dimension trimmed mean (0th–100th percentile) of the digests."""
    means = []
    for dim in range(emb_dims):
        means.append(embs_tdigests[dim].trimmed_mean(0, 100))
    return means
207
+
208
+
209
def tdigest_median(embs_tdigests, emb_dims):
    """Per-dimension median (50th percentile) of the digests."""
    medians = []
    for dim in range(emb_dims):
        medians.append(embs_tdigests[dim].percentile(50))
    return medians
211
+
212
+
213
def test_emb(model, example, layer_to_quant):
    """Run one trial forward pass and return the embedding dimensionality.

    Moves ``example`` to CUDA; ``hidden_states[layer_to_quant]`` has shape
    (batch, seq_len, emb_dim) and the last axis size is returned.
    """
    with torch.no_grad():
        trial_outputs = model(input_ids=example.to("cuda"))

    return trial_outputs.hidden_states[layer_to_quant].size()[2]
219
+
220
+
221
def label_cell_embs(embs, downsampled_data, emb_labels):
    """Build a DataFrame of cell embeddings, appending any label columns.

    Embedding dimensions become the leading numeric columns; each entry of
    ``emb_labels`` is copied from ``downsampled_data`` as a trailing column.
    """
    embs_df = pd.DataFrame(embs.cpu().numpy())
    if emb_labels is None:
        return embs_df
    for label_name in emb_labels:
        embs_df[label_name] = downsampled_data[label_name]
    return embs_df
228
+
229
+
230
def label_gene_embs(embs, downsampled_data, token_gene_dict):
    """Average each gene's contextual embeddings across all cells.

    Returns a DataFrame with one row per gene (row index mapped through
    ``token_gene_dict``) and one column per embedding dimension.
    """
    all_tokens = {
        token for ids in downsampled_data["input_ids"] for token in ids
    }
    per_gene_embs = {token: [] for token in all_tokens}
    for cell_idx in range(embs.size()[0]):
        n_genes = downsampled_data[cell_idx]["length"]
        tokens = downsampled_data[cell_idx]["input_ids"][0:n_genes]
        # (seq_len, 1, emb_dim): keep a singleton batch axis per gene
        cell_embs = embs[cell_idx, :, :].unsqueeze(dim=1)
        # dict() dedups repeated tokens within a cell (last occurrence wins),
        # matching the original behavior.
        for token, emb in dict(zip(tokens, cell_embs)).items():
            per_gene_embs[token].append(emb)
    for token in per_gene_embs.keys():
        stacked = torch.stack(per_gene_embs[token])
        per_gene_embs[token] = (
            torch.squeeze(torch.mean(stacked, dim=0), dim=0).cpu().numpy()
        )
    embs_df = pd.DataFrame(per_gene_embs).T
    embs_df.index = [token_gene_dict[token] for token in embs_df.index]
    return embs_df
254
+
255
+
256
def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict):
    """Compute PCA -> neighbors -> UMAP on the embedding columns and plot.

    ``embs_df`` carries the embedding dimensions in its first ``emb_dims``
    columns and label columns after them; cells are colored by ``label``.
    """
    # Keep only the embedding columns and give them string axis labels,
    # as required by AnnData.
    emb_matrix = embs_df.iloc[:, :emb_dims]
    emb_matrix.index = pd.RangeIndex(0, emb_matrix.shape[0], name=None).astype(str)
    emb_matrix.columns = pd.RangeIndex(0, emb_matrix.shape[1], name=None).astype(
        str
    )
    adata = anndata.AnnData(
        X=emb_matrix,
        obs={"cell_id": list(emb_matrix.index), f"{label}": list(embs_df[label])},
        var={"embs": emb_matrix.columns},
    )
    sc.tl.pca(adata, svd_solver="arpack")
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    sns.set(rc={"figure.figsize": (10, 10)}, font_scale=2.3)
    sns.set_style("white")
    plot_kwargs = {"palette": "Set2", "size": 200}
    if kwargs_dict is not None:
        plot_kwargs.update(kwargs_dict)

    sc.pl.umap(adata, color=label, save=output_file, **plot_kwargs)
275
+
276
+
277
def gen_heatmap_class_colors(labels, df):
    """Assign one cubehelix color per distinct label, indexed like df rows."""
    distinct_labels = Counter(labels).keys()
    palette = sns.cubehelix_palette(
        len(distinct_labels),
        light=0.9,
        dark=0.1,
        hue=1,
        reverse=True,
        start=1,
        rot=-2,
    )
    # Look-up table keyed by the stringified label.
    lut = dict(zip(map(str, distinct_labels), palette))
    return pd.Series(labels, index=df.index).map(lut)
290
+
291
+
292
def gen_heatmap_class_dict(classes, label_colors_series):
    """Return {class: color}, keeping the first color seen per class."""
    pairs = pd.DataFrame({"classes": classes, "color": label_colors_series})
    unique_pairs = pairs.drop_duplicates(subset=["classes"])
    return dict(zip(unique_pairs["classes"], unique_pairs["color"]))
298
+
299
+
300
def make_colorbar(embs_df, label):
    """Build row-color annotations and a class->color dict for a heatmap.

    Returns
    -------
    (pandas.DataFrame, dict)
        A one-column DataFrame of per-row colors (column named ``label``)
        and a mapping of each class value to its color.
    """
    labels = list(embs_df[label])

    cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
    label_colors = pd.DataFrame(cell_type_colors, columns=[label])

    # Diagnostic: report any row whose color is malformed (wrong arity or NaN).
    for idx, row in label_colors.iterrows():
        color = row[0]
        if len(color) != 3 or any(np.isnan(color)):
            print(idx, color)

    # Fix: removed the original no-op expression ``label_colors.isna().sum()``
    # whose result was discarded.

    # create dictionary for colors and classes
    label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
    return label_colors, label_color_dict
316
+
317
+
318
def plot_heatmap(embs_df, emb_dims, label, output_file, kwargs_dict):
    """Cluster and plot the embedding columns as a heatmap colored by label."""
    sns.set_style("white")
    sns.set(font_scale=2)
    plt.figure(figsize=(15, 15), dpi=150)
    label_colors, label_color_dict = make_colorbar(embs_df, label)

    heatmap_kwargs = {
        "row_cluster": True,
        "col_cluster": True,
        "row_colors": label_colors,
        "standard_scale": 1,
        "linewidths": 0,
        "xticklabels": False,
        "yticklabels": False,
        "figsize": (15, 15),
        "center": 0,
        "cmap": "magma",
    }

    # Caller-supplied kwargs override the defaults above.
    if kwargs_dict is not None:
        heatmap_kwargs.update(kwargs_dict)
    grid = sns.clustermap(
        embs_df.iloc[:, 0:emb_dims].apply(pd.to_numeric), **heatmap_kwargs
    )

    plt.setp(grid.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")

    # Invisible zero-height bars register one legend entry per class.
    for class_name in list(label_color_dict.keys()):
        grid.ax_col_dendrogram.bar(
            0, 0, color=label_color_dict[class_name], label=class_name, linewidth=0
        )

    grid.ax_col_dendrogram.legend(
        title=f"{label}",
        loc="lower center",
        ncol=4,
        bbox_to_anchor=(0.5, 1),
        facecolor="white",
    )

    plt.savefig(output_file, bbox_inches="tight")
359
+
360
+
361
class EmbExtractor:
    """Extracts cell or gene embeddings from a (fine-tuned) Geneformer model."""

    # Allowed values (or allowed types) for each constructor argument;
    # enforced by validate_options().
    valid_option_dict = {
        "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
        "num_classes": {int},
        "emb_mode": {"cell", "gene"},
        "cell_emb_style": {"mean_pool"},
        "gene_emb_style": {"mean_pool"},
        "filter_data": {None, dict},
        "max_ncells": {None, int},
        "emb_layer": {-1, 0},
        "emb_label": {None, list},
        "labels_to_plot": {None, list},
        "forward_batch_size": {int},
        "nproc": {int},
        "summary_stat": {None, "mean", "median", "exact_mean", "exact_median"},
    }

    def __init__(
        self,
        model_type="Pretrained",
        num_classes=0,
        emb_mode="cell",
        cell_emb_style="mean_pool",
        gene_emb_style="mean_pool",
        filter_data=None,
        max_ncells=1000,
        emb_layer=-1,
        emb_label=None,
        labels_to_plot=None,
        forward_batch_size=100,
        nproc=4,
        summary_stat=None,
        token_dictionary_file=TOKEN_DICTIONARY_FILE,
    ):
        """
        Initialize embedding extractor.

        **Parameters:**

        model_type : {"Pretrained", "GeneClassifier", "CellClassifier"}
            | Whether model is the pretrained Geneformer or a fine-tuned gene or cell classifier.
        num_classes : int
            | If model is a gene or cell classifier, specify number of classes it was trained to classify.
            | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
        emb_mode : {"cell", "gene"}
            | Whether to output cell or gene embeddings.
        cell_emb_style : "mean_pool"
            | Method for summarizing cell embeddings.
            | Currently only option is mean pooling of gene embeddings for given cell.
        gene_emb_style : "mean_pool"
            | Method for summarizing gene embeddings.
            | Currently only option is mean pooling of contextual gene embeddings for given gene.
        filter_data : None, dict
            | Default is to extract embeddings from all input data.
            | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
        max_ncells : None, int
            | Maximum number of cells to extract embeddings from.
            | Default is 1000 cells randomly sampled from input data.
            | If None, will extract embeddings from all cells.
        emb_layer : {-1, 0}
            | Embedding layer to extract.
            | The last layer is most specifically weighted to optimize the given learning objective.
            | Generally, it is best to extract the 2nd to last layer to get a more general representation.
            | -1: 2nd to last layer
            | 0: last layer
        emb_label : None, list
            | List of column name(s) in .dataset to add as labels to embedding output.
        labels_to_plot : None, list
            | Cell labels to plot.
            | Shown as color bar in heatmap.
            | Shown as cell color in umap.
            | Plotting umap requires labels to plot.
        forward_batch_size : int
            | Batch size for forward pass.
        nproc : int
            | Number of CPU processes to use.
        summary_stat : {None, "mean", "median", "exact_mean", "exact_median"}
            | If exact_mean or exact_median, outputs only exact mean or median embedding of input data.
            | If mean or median, outputs only approximated mean or median embedding of input data.
            | Non-exact recommended if encountering memory constraints while generating goal embedding positions.
            | Non-exact is slower but more memory-efficient.
        token_dictionary_file : Path
            | Path to pickle file containing token dictionary (Ensembl ID:token).

        **Examples:**

        .. code-block :: python

            >>> from geneformer import EmbExtractor
            >>> embex = EmbExtractor(model_type="CellClassifier",
            ...                      num_classes=3,
            ...                      emb_mode="cell",
            ...                      filter_data={"cell_type":["cardiomyocyte"]},
            ...                      max_ncells=1000,
            ...                      max_ncells_to_plot=1000,
            ...                      emb_layer=-1,
            ...                      emb_label=["disease", "cell_type"],
            ...                      labels_to_plot=["disease", "cell_type"])

        """

        self.model_type = model_type
        self.num_classes = num_classes
        self.emb_mode = emb_mode
        self.cell_emb_style = cell_emb_style
        self.gene_emb_style = gene_emb_style
        self.filter_data = filter_data
        self.max_ncells = max_ncells
        self.emb_layer = emb_layer
        self.emb_label = emb_label
        self.labels_to_plot = labels_to_plot
        self.forward_batch_size = forward_batch_size
        self.nproc = nproc
        # "exact_*" stats are computed on the full tensor after extraction,
        # so they are stored separately and summary_stat is cleared.
        if (summary_stat is not None) and ("exact" in summary_stat):
            self.summary_stat = None
            self.exact_summary_stat = summary_stat
        else:
            self.summary_stat = summary_stat
            self.exact_summary_stat = None

        self.validate_options()

        # load token dictionary (Ensembl IDs:token)
        with open(token_dictionary_file, "rb") as f:
            self.gene_token_dict = pickle.load(f)

        self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
        self.pad_token_id = self.gene_token_dict.get("<pad>")

    def validate_options(self):
        # confirm arguments are within valid options and compatible with each other
        for attr_name, valid_options in self.valid_option_dict.items():
            attr_value = self.__dict__[attr_name]
            # lists/dicts are unhashable, so membership is only tested for
            # scalar values; containers fall through to the type check below.
            if not isinstance(attr_value, (list, dict)):
                if attr_value in valid_options:
                    continue
            valid_type = False
            for option in valid_options:
                if (option in [int, list, dict, bool]) and isinstance(
                    attr_value, option
                ):
                    valid_type = True
                    break
            if valid_type:
                continue
            logger.error(
                f"Invalid option for {attr_name}. "
                f"Valid options for {attr_name}: {valid_options}"
            )
            # NOTE(review): bare `raise` outside an except block raises
            # RuntimeError rather than a descriptive exception.
            raise

        if self.filter_data is not None:
            for key, value in self.filter_data.items():
                if not isinstance(value, list):
                    self.filter_data[key] = [value]
                    logger.warning(
                        "Values in filter_data dict must be lists. "
                        f"Changing {key} value to list ([{value}])."
                    )

    def extract_embs(
        self,
        model_directory,
        input_data_file,
        output_directory,
        output_prefix,
        output_torch_embs=False,
        cell_state=None,
    ):
        """
        Extract embeddings from input data and save as results in output_directory.

        **Parameters:**

        model_directory : Path
            | Path to directory containing model
        input_data_file : Path
            | Path to directory containing .dataset inputs
        output_directory : Path
            | Path to directory where embedding data will be saved as csv
        output_prefix : str
            | Prefix for output file
        output_torch_embs : bool
            | Whether or not to also output the embeddings as a tensor.
            | Note, if true, will output embeddings as both dataframe and tensor.
        cell_state : dict
            | Cell state key and value for state embedding extraction.

        **Examples:**

        .. code-block :: python

            >>> embs = embex.extract_embs("path/to/model",
            ...                           "path/to/input_data",
            ...                           "path/to/output_directory",
            ...                           "output_prefix")

        """

        filtered_input_data = pu.load_and_filter(
            self.filter_data, self.nproc, input_data_file
        )
        if cell_state is not None:
            filtered_input_data = pu.filter_by_dict(
                filtered_input_data, cell_state, self.nproc
            )
        downsampled_data = pu.downsample_and_sort(filtered_input_data, self.max_ncells)
        model = pu.load_model(self.model_type, self.num_classes, model_directory)
        # emb_layer is -1 (2nd to last) or 0 (last), offset from layer count
        layer_to_quant = pu.quant_layers(model) + self.emb_layer
        embs = get_embs(
            model,
            downsampled_data,
            self.emb_mode,
            layer_to_quant,
            self.pad_token_id,
            self.forward_batch_size,
            self.summary_stat,
        )

        if self.emb_mode == "cell":
            if self.summary_stat is None:
                embs_df = label_cell_embs(embs, downsampled_data, self.emb_label)
            elif self.summary_stat is not None:
                embs_df = pd.DataFrame(embs.cpu().numpy()).T
        elif self.emb_mode == "gene":
            if self.summary_stat is None:
                embs_df = label_gene_embs(embs, downsampled_data, self.token_gene_dict)
            elif self.summary_stat is not None:
                embs_df = pd.DataFrame(embs).T
                embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]

        # save embeddings to output_path
        if cell_state is None:
            output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
            embs_df.to_csv(output_path)

        # NOTE(review): the [0:255] slice caps the exact summary at the first
        # 255 rows — presumably it should cover all rows; confirm intent.
        if self.exact_summary_stat == "exact_mean":
            embs = embs.mean(dim=0)
            embs_df = pd.DataFrame(
                embs_df[0:255].mean(axis="rows"), columns=[self.exact_summary_stat]
            ).T
        elif self.exact_summary_stat == "exact_median":
            embs = torch.median(embs, dim=0)[0]
            embs_df = pd.DataFrame(
                embs_df[0:255].median(axis="rows"), columns=[self.exact_summary_stat]
            ).T

        if cell_state is not None:
            return embs
        else:
            if output_torch_embs:
                return embs_df, embs
            else:
                return embs_df

    def get_state_embs(
        self,
        cell_states_to_model,
        model_directory,
        input_data_file,
        output_directory,
        output_prefix,
        output_torch_embs=True,
    ):
        """
        Extract exact mean or exact median cell state embedding positions from input data and save as results in output_directory.

        **Parameters:**

        cell_states_to_model : None, dict
            | Cell states to model if testing perturbations that achieve goal state change.
            | Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
            | state_key: key specifying name of column in .dataset that defines the start/goal states
            | start_state: value in the state_key column that specifies the start state
            | goal_state: value in the state_key column that specifies the goal end state
            | alt_states: list of values in the state_key column that specify the alternate end states
            | For example:
            | {"state_key": "disease",
            |  "start_state": "dcm",
            |  "goal_state": "nf",
            |  "alt_states": ["hcm", "other1", "other2"]}
        model_directory : Path
            | Path to directory containing model
        input_data_file : Path
            | Path to directory containing .dataset inputs
        output_directory : Path
            | Path to directory where embedding data will be saved as csv
        output_prefix : str
            | Prefix for output file
        output_torch_embs : bool
            | Whether or not to also output the embeddings as a tensor.
            | Note, if true, will output embeddings as both dataframe and tensor.

        **Outputs**

        | Outputs state_embs_dict for use with in silico perturber.
        | Format is dictionary of embedding positions of each cell state to model shifts from/towards.
        | Keys specify each possible cell state to model.
        | Values are target embedding positions as torch.tensor.
        | For example:
        | {"nf": emb_nf,
        |  "hcm": emb_hcm,
        |  "dcm": emb_dcm,
        |  "other1": emb_other1,
        |  "other2": emb_other2}
        """

        pu.validate_cell_states_to_model(cell_states_to_model)
        valid_summary_stats = ["exact_mean", "exact_median"]
        if self.exact_summary_stat not in valid_summary_stats:
            logger.error(
                "For extracting state embs, summary_stat in EmbExtractor "
                f"must be set to option in {valid_summary_stats}"
            )
            # NOTE(review): bare `raise` — see validate_options.
            raise

        state_embs_dict = dict()
        state_key = cell_states_to_model["state_key"]
        # One embedding extraction per modeled state (start, goal, each alt).
        for k, v in cell_states_to_model.items():
            if k == "state_key":
                continue
            elif (k == "start_state") or (k == "goal_state"):
                state_embs_dict[v] = self.extract_embs(
                    model_directory,
                    input_data_file,
                    output_directory,
                    output_prefix,
                    output_torch_embs,
                    cell_state={state_key: v},
                )
            else:  # k == "alt_states"
                for alt_state in v:
                    state_embs_dict[alt_state] = self.extract_embs(
                        model_directory,
                        input_data_file,
                        output_directory,
                        output_prefix,
                        output_torch_embs,
                        cell_state={state_key: alt_state},
                    )

        output_path = (Path(output_directory) / output_prefix).with_suffix(".pkl")
        with open(output_path, "wb") as fp:
            pickle.dump(state_embs_dict, fp)

        return state_embs_dict

    def plot_embs(
        self,
        embs,
        plot_style,
        output_directory,
        output_prefix,
        max_ncells_to_plot=1000,
        kwargs_dict=None,
    ):
        """
        Plot embeddings, coloring by provided labels.

        **Parameters:**

        embs : pandas.core.frame.DataFrame
            | Pandas dataframe containing embeddings output from extract_embs
        plot_style : str
            | Style of plot: "heatmap" or "umap"
        output_directory : Path
            | Path to directory where plots will be saved as pdf
        output_prefix : str
            | Prefix for output file
        max_ncells_to_plot : None, int
            | Maximum number of cells to plot.
            | Default is 1000 cells randomly sampled from embeddings.
            | If None, will plot embeddings from all cells.
        kwargs_dict : dict
            | Dictionary of kwargs to pass to plotting function.

        **Examples:**

        .. code-block :: python

            >>> embex.plot_embs(embs=embs,
            ...                 plot_style="heatmap",
            ...                 output_directory="path/to/output_directory",
            ...                 output_prefix="output_prefix")

        """

        if plot_style not in ["heatmap", "umap"]:
            logger.error(
                "Invalid option for 'plot_style'. " "Valid options: {'heatmap','umap'}"
            )
            raise

        if (plot_style == "umap") and (self.labels_to_plot is None):
            logger.error("Plotting UMAP requires 'labels_to_plot'. ")
            raise

        # NOTE(review): this comparison raises TypeError when either
        # max_ncells_to_plot or self.max_ncells is None — guard for None
        # should come first; confirm intended behavior.
        if max_ncells_to_plot > self.max_ncells:
            max_ncells_to_plot = self.max_ncells
            logger.warning(
                "max_ncells_to_plot must be <= max_ncells. "
                f"Changing max_ncells_to_plot to {self.max_ncells}."
            )

        if (max_ncells_to_plot is not None) and (max_ncells_to_plot < self.max_ncells):
            embs = embs.sample(max_ncells_to_plot, axis=0)

        if self.emb_label is None:
            label_len = 0
        else:
            label_len = len(self.emb_label)

        # label columns sit after the embedding-dimension columns
        emb_dims = embs.shape[1] - label_len

        if self.emb_label is None:
            emb_labels = None
        else:
            emb_labels = embs.columns[emb_dims:]

        if plot_style == "umap":
            for label in self.labels_to_plot:
                if label not in emb_labels:
                    logger.warning(
                        f"Label {label} from labels_to_plot "
                        f"not present in provided embeddings dataframe."
                    )
                    continue
                output_prefix_label = "_" + output_prefix + f"_umap_{label}"
                output_file = (
                    Path(output_directory) / output_prefix_label
                ).with_suffix(".pdf")
                # NOTE(review): plot_umap is given output_prefix_label while
                # output_file is unused — scanpy's `save` argument is a
                # filename suffix, so confirm which was intended.
                plot_umap(embs, emb_dims, label, output_prefix_label, kwargs_dict)

        if plot_style == "heatmap":
            for label in self.labels_to_plot:
                if label not in emb_labels:
                    logger.warning(
                        f"Label {label} from labels_to_plot "
                        f"not present in provided embeddings dataframe."
                    )
                    continue
                output_prefix_label = output_prefix + f"_heatmap_{label}"
                output_file = (
                    Path(output_directory) / output_prefix_label
                ).with_suffix(".pdf")
                plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
geneformer/gene_median_dictionary.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3b589bb5ec75040d05fc44dd6bf0184cf87f3c362cf158d196a6ed3b7fe5f39
3
+ size 940965
geneformer/gene_name_id_dict.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e67962e79c0039a6c32d43c5c99f38e51964bbcfa32f736150ee1e285c438c
3
+ size 1117117
geneformer/in_silico_perturber.py ADDED
@@ -0,0 +1,915 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Geneformer in silico perturber.
3
+
4
+ **Usage:**
5
+
6
+ .. code-block :: python
7
+
8
+ >>> from geneformer import InSilicoPerturber
9
+ >>> isp = InSilicoPerturber(perturb_type="delete",
10
+ ... perturb_rank_shift=None,
11
+ ... genes_to_perturb="all",
12
+ ... model_type="CellClassifier",
13
+ ... num_classes=0,
14
+ ... emb_mode="cell",
15
+ ... filter_data={"cell_type":["cardiomyocyte"]},
16
+ ... cell_states_to_model={"state_key": "disease", "start_state": "dcm", "goal_state": "nf", "alt_states": ["hcm", "other1", "other2"]},
17
+ ... state_embs_dict ={"nf": emb_nf, "hcm": emb_hcm, "dcm": emb_dcm, "other1": emb_other1, "other2": emb_other2},
18
+ ... max_ncells=None,
19
+ ... emb_layer=0,
20
+ ... forward_batch_size=100,
21
+ ... nproc=16)
22
+ >>> isp.perturb_data("path/to/model",
23
+ ... "path/to/input_data",
24
+ ... "path/to/output_directory",
25
+ ... "output_prefix")
26
+
27
+ **Description:**
28
+
29
+ | Performs in silico perturbation (e.g. deletion or overexpression) of defined set of genes or all genes in sample of cells.
30
+ | Outputs impact of perturbation on cell or gene embeddings.
31
+ | Output files are analyzed with ``in_silico_perturber_stats``.
32
+
33
+ """
34
+
35
+ import logging
36
+
37
+ # imports
38
+ import os
39
+ import pickle
40
+ from collections import defaultdict
41
+ from typing import List
42
+
43
+ import seaborn as sns
44
+ import torch
45
+ from datasets import Dataset
46
+ from tqdm.auto import trange
47
+
48
+ from . import perturber_utils as pu
49
+ from .emb_extractor import get_embs
50
+ from .tokenizer import TOKEN_DICTIONARY_FILE
51
+
52
+ sns.set()
53
+
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+
58
class InSilicoPerturber:
    """Perform in silico perturbation (deletion/overexpression) of genes in
    tokenized single-cell data and quantify the resulting shift in cell
    and/or gene embeddings."""

    # Registry of allowed values (or allowed types) per constructor argument;
    # consumed by ``validate_options`` to reject invalid configurations.
    valid_option_dict = {
        "perturb_type": {"delete", "overexpress", "inhibit", "activate"},
        "perturb_rank_shift": {None, 1, 2, 3},
        "genes_to_perturb": {"all", list},
        "combos": {0, 1},
        "anchor_gene": {None, str},
        "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
        "num_classes": {int},
        "emb_mode": {"cell", "cell_and_gene"},
        "cell_emb_style": {"mean_pool"},
        "filter_data": {None, dict},
        "cell_states_to_model": {None, dict},
        "state_embs_dict": {None, dict},
        "max_ncells": {None, int},
        "cell_inds_to_perturb": {"all", dict},
        "emb_layer": {-1, 0},
        "forward_batch_size": {int},
        "nproc": {int},
    }
78
+
79
def __init__(
    self,
    perturb_type="delete",
    perturb_rank_shift=None,
    genes_to_perturb="all",
    combos=0,
    anchor_gene=None,
    model_type="Pretrained",
    num_classes=0,
    emb_mode="cell",
    cell_emb_style="mean_pool",
    filter_data=None,
    cell_states_to_model=None,
    state_embs_dict=None,
    max_ncells=None,
    cell_inds_to_perturb="all",
    emb_layer=-1,
    forward_batch_size=100,
    nproc=4,
    token_dictionary_file=TOKEN_DICTIONARY_FILE,
):
    """
    Initialize in silico perturber.

    **Parameters:**

    perturb_type : {"delete", "overexpress", "inhibit", "activate"}
        | Type of perturbation.
        | "delete": delete gene from rank value encoding
        | "overexpress": move gene to front of rank value encoding
        | *(TBA)* "inhibit": move gene to lower quartile of rank value encoding
        | *(TBA)* "activate": move gene to higher quartile of rank value encoding
    *(TBA)* perturb_rank_shift : None, {1,2,3}
        | Number of quartiles by which to shift rank of gene.
        | For example, if perturb_type="activate" and perturb_rank_shift=1:
        | genes in 4th quartile will move to middle of 3rd quartile.
        | genes in 3rd quartile will move to middle of 2nd quartile.
        | genes in 2nd quartile will move to middle of 1st quartile.
        | genes in 1st quartile will move to front of rank value encoding.
        | For example, if perturb_type="inhibit" and perturb_rank_shift=2:
        | genes in 1st quartile will move to middle of 3rd quartile.
        | genes in 2nd quartile will move to middle of 4th quartile.
        | genes in 3rd or 4th quartile will move to bottom of rank value encoding.
    genes_to_perturb : "all", list
        | Default is perturbing each gene detected in each cell in the dataset.
        | Otherwise, may provide a list of ENSEMBL IDs of genes to perturb.
        | If gene list is provided, then perturber will only test perturbing them all together
        | (rather than testing each possible combination of the provided genes).
    combos : {0,1}
        | Whether to perturb genes individually (0) or in pairs (1).
    anchor_gene : None, str
        | ENSEMBL ID of gene to use as anchor in combination perturbations.
        | For example, if combos=1 and anchor_gene="ENSG00000148400":
        | anchor gene will be perturbed in combination with each other gene.
    model_type : {"Pretrained", "GeneClassifier", "CellClassifier"}
        | Whether model is the pretrained Geneformer or a fine-tuned gene or cell classifier.
    num_classes : int
        | If model is a gene or cell classifier, specify number of classes it was trained to classify.
        | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
    emb_mode : {"cell", "cell_and_gene"}
        | Whether to output impact of perturbation on cell and/or gene embeddings.
        | Gene embedding shifts only available as compared to original cell, not comparing to goal state.
    cell_emb_style : "mean_pool"
        | Method for summarizing cell embeddings.
        | Currently only option is mean pooling of gene embeddings for given cell.
    filter_data : None, dict
        | Default is to use all input data for in silico perturbation study.
        | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
    cell_states_to_model : None, dict
        | Cell states to model if testing perturbations that achieve goal state change.
        | Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
        | state_key: key specifying name of column in .dataset that defines the start/goal states
        | start_state: value in the state_key column that specifies the start state
        | goal_state: value in the state_key column that specifies the goal end state
        | alt_states: list of values in the state_key column that specify the alternate end states
        | For example: {"state_key": "disease",
        |               "start_state": "dcm",
        |               "goal_state": "nf",
        |               "alt_states": ["hcm", "other1", "other2"]}
    state_embs_dict : None, dict
        | Embedding positions of each cell state to model shifts from/towards (e.g. mean or median).
        | Dictionary with keys specifying each possible cell state to model.
        | Values are target embedding positions as torch.tensor.
        | For example: {"nf": emb_nf,
        |               "hcm": emb_hcm,
        |               "dcm": emb_dcm,
        |               "other1": emb_other1,
        |               "other2": emb_other2}
    max_ncells : None, int
        | Maximum number of cells to test.
        | If None, will test all cells.
    cell_inds_to_perturb : "all", dict
        | Default is perturbing each cell in the dataset.
        | Otherwise, may provide a dict of indices of cells to perturb with keys start and end.
        | start: the first index to perturb.
        | end: the last index to perturb (exclusive).
        | Indices will be selected *after* the filter_data criteria and sorting.
        | Useful for splitting extremely large datasets across separate GPUs.
    emb_layer : {-1, 0}
        | Embedding layer to use for quantification.
        | 0: last layer (recommended for questions closely tied to model's training objective)
        | -1: 2nd to last layer (recommended for questions requiring more general representations)
    forward_batch_size : int
        | Batch size for forward pass.
    nproc : int
        | Number of CPU processes to use.
    token_dictionary_file : Path
        | Path to pickle file containing token dictionary (Ensembl ID:token).
    """

    self.perturb_type = perturb_type
    self.perturb_rank_shift = perturb_rank_shift
    self.genes_to_perturb = genes_to_perturb
    self.combos = combos
    self.anchor_gene = anchor_gene
    if self.genes_to_perturb == "all":
        self.perturb_group = False
    else:
        # a specific gene list implies group perturbation: all listed genes
        # are perturbed together, so anchor/combination modes do not apply
        self.perturb_group = True
        if (self.anchor_gene is not None) or (self.combos != 0):
            self.anchor_gene = None
            self.combos = 0
            logger.warning(
                "anchor_gene set to None and combos set to 0. "
                "If providing list of genes to perturb, "
                "list of genes_to_perturb will be perturbed together, "
                "without anchor gene or combinations."
            )
    self.model_type = model_type
    self.num_classes = num_classes
    self.emb_mode = emb_mode
    self.cell_emb_style = cell_emb_style
    self.filter_data = filter_data
    self.cell_states_to_model = cell_states_to_model
    self.state_embs_dict = state_embs_dict
    self.max_ncells = max_ncells
    self.cell_inds_to_perturb = cell_inds_to_perturb
    self.emb_layer = emb_layer
    self.forward_batch_size = forward_batch_size
    self.nproc = nproc

    self.validate_options()

    # load token dictionary (Ensembl IDs:token)
    with open(token_dictionary_file, "rb") as f:
        self.gene_token_dict = pickle.load(f)

    self.pad_token_id = self.gene_token_dict.get("<pad>")

    if self.anchor_gene is None:
        self.anchor_token = None
    else:
        try:
            self.anchor_token = [self.gene_token_dict[self.anchor_gene]]
        except KeyError:
            logger.error(f"Anchor gene {self.anchor_gene} not in token dictionary.")
            raise

    if self.genes_to_perturb == "all":
        self.tokens_to_perturb = "all"
    else:
        missing_genes = [
            gene
            for gene in self.genes_to_perturb
            if gene not in self.gene_token_dict.keys()
        ]
        if len(missing_genes) == len(self.genes_to_perturb):
            # previously a bare ``raise`` (no active exception), which would
            # itself error with an opaque RuntimeError
            msg = "None of the provided genes to perturb are in token dictionary."
            logger.error(msg)
            raise ValueError(msg)
        elif len(missing_genes) > 0:
            logger.warning(
                f"Genes to perturb {missing_genes} are not in token dictionary."
            )
        # skip missing genes entirely: ``dict.get`` previously inserted None
        # tokens, which would corrupt downstream length/overflow arithmetic
        self.tokens_to_perturb = [
            self.gene_token_dict[gene]
            for gene in self.genes_to_perturb
            if gene in self.gene_token_dict
        ]
257
+
258
def validate_options(self):
    """
    Validate the constructor arguments against ``valid_option_dict`` and
    enforce cross-argument compatibility, normalizing incompatible settings
    where a safe default exists (with a warning) and raising ValueError
    otherwise.
    """
    # first disallow options under development
    if self.perturb_type in ["inhibit", "activate"]:
        msg = (
            "In silico inhibition and activation currently under development. "
            "Current valid options for 'perturb_type': 'delete' or 'overexpress'"
        )
        logger.error(msg)
        raise ValueError(msg)
    # NOTE: check anchor_gene (set before this method runs), not anchor_token
    # (set after it) — the original self.anchor_token check raised
    # AttributeError whenever combos > 0
    if (self.combos > 0) and (self.anchor_gene is None):
        msg = (
            "Combination perturbation without anchor gene is currently under development. "
            "Currently, must provide anchor gene for combination perturbation."
        )
        logger.error(msg)
        raise ValueError(msg)

    # confirm arguments are within valid options and compatible with each other
    for attr_name, valid_options in self.valid_option_dict.items():
        attr_value = self.__dict__[attr_name]
        if type(attr_value) not in {list, dict}:
            if attr_value in valid_options:
                continue
        if attr_name in ["anchor_gene"]:
            # bug fix: check the *value's* type (original checked
            # type(attr_name), which is always str, so anchor_gene was
            # never actually validated)
            if type(attr_value) in {str}:
                continue
        valid_type = False
        for option in valid_options:
            if (option in [bool, int, list, dict]) and isinstance(
                attr_value, option
            ):
                valid_type = True
                break
        if valid_type:
            continue
        msg = (
            f"Invalid option for {attr_name}. "
            f"Valid options for {attr_name}: {valid_options}"
        )
        logger.error(msg)
        raise ValueError(msg)

    if self.perturb_type in ["delete", "overexpress"]:
        if self.perturb_rank_shift is not None:
            # quartile shifting does not apply to delete/overexpress
            if self.perturb_type == "delete":
                logger.warning(
                    "perturb_rank_shift set to None. "
                    "If perturb type is delete then gene is deleted entirely "
                    "rather than shifted by quartile"
                )
            elif self.perturb_type == "overexpress":
                logger.warning(
                    "perturb_rank_shift set to None. "
                    "If perturb type is overexpress then gene is moved to front "
                    "of rank value encoding rather than shifted by quartile"
                )
        self.perturb_rank_shift = None

    if (self.anchor_gene is not None) and (self.emb_mode == "cell_and_gene"):
        self.emb_mode = "cell"
        logger.warning(
            "emb_mode set to 'cell'. "
            "Currently, analysis with anchor gene "
            "only outputs effect on cell embeddings."
        )

    if self.cell_states_to_model is not None:
        pu.validate_cell_states_to_model(self.cell_states_to_model)

        if self.anchor_gene is not None:
            self.anchor_gene = None
            logger.warning(
                "anchor_gene set to None. "
                "Currently, anchor gene not available "
                "when modeling multiple cell states."
            )

        if self.state_embs_dict is None:
            msg = (
                "state_embs_dict must be provided for mode with cell_states_to_model. "
                "Format is dictionary with keys specifying each possible cell state to model. "
                "Values are target embedding positions as torch.tensor."
            )
            logger.error(msg)
            raise ValueError(msg)

        for state_emb in self.state_embs_dict.values():
            if not torch.is_tensor(state_emb):
                msg = "state_embs_dict must be dictionary with values being torch.tensor."
                logger.error(msg)
                raise ValueError(msg)

        # every modeled state must have a target embedding position
        keys_absent = []
        for k, v in self.cell_states_to_model.items():
            if (k == "start_state") or (k == "goal_state"):
                if v not in self.state_embs_dict.keys():
                    keys_absent.append(v)
            if k == "alt_states":
                for state in v:
                    if state not in self.state_embs_dict.keys():
                        keys_absent.append(state)
        if len(keys_absent) > 0:
            msg = (
                "Each start_state, goal_state, and alt_states in cell_states_to_model "
                "must be a key in state_embs_dict with the value being "
                "the state's embedding position as torch.tensor. "
                f"Missing keys: {keys_absent}"
            )
            logger.error(msg)
            raise ValueError(msg)

    if self.perturb_type in ["inhibit", "activate"]:
        if self.perturb_rank_shift is None:
            msg = (
                "If perturb_type is inhibit or activate then "
                "quartile to shift by must be specified."
            )
            logger.error(msg)
            raise ValueError(msg)

    if self.filter_data is not None:
        for key, value in self.filter_data.items():
            if not isinstance(value, list):
                # normalize scalar filter values to single-element lists
                self.filter_data[key] = [value]
                logger.warning(
                    "Values in filter_data dict must be lists. "
                    f"Changing {key} value to list ([{value}])."
                )

    if self.cell_inds_to_perturb != "all":
        if set(self.cell_inds_to_perturb.keys()) != {"start", "end"}:
            msg = "If cell_inds_to_perturb is a dictionary, keys must be 'start' and 'end'."
            logger.error(msg)
            raise ValueError(msg)
        if (
            self.cell_inds_to_perturb["start"] < 0
            or self.cell_inds_to_perturb["end"] < 0
        ):
            msg = "cell_inds_to_perturb must be positive."
            logger.error(msg)
            raise ValueError(msg)
394
+
395
def perturb_data(
    self, model_directory, input_data_file, output_directory, output_prefix
):
    """
    Perturb genes in input data and save results in output_directory.

    **Parameters:**

    model_directory : Path
        | Path to directory containing model
    input_data_file : Path
        | Path to directory containing .dataset inputs
    output_directory : Path
        | Path to directory where perturbation data will be saved as batched pickle files
    output_prefix : str
        | Prefix for output files
    """

    # prefix all output files with the perturbation type for traceability
    output_path_prefix = os.path.join(
        output_directory, f"in_silico_{self.perturb_type}_{output_prefix}"
    )

    # load the model and derive forward-pass parameters from it
    model = pu.load_model(self.model_type, self.num_classes, model_directory)
    self.max_len = pu.get_model_input_size(model)
    layer_to_quant = pu.quant_layers(model) + self.emb_layer

    # general filtering based on the filter_data argument,
    # followed by mode-dependent filtering
    filtered_input_data = pu.load_and_filter(
        self.filter_data, self.nproc, input_data_file
    )
    filtered_input_data = self.apply_additional_filters(filtered_input_data)

    # dispatch: grouped perturbation of a provided gene list,
    # or individual perturbation of every detected gene
    if self.perturb_group:
        self.isp_perturb_set(
            model, filtered_input_data, layer_to_quant, output_path_prefix
        )
    else:
        self.isp_perturb_all(
            model, filtered_input_data, layer_to_quant, output_path_prefix
        )
438
+
439
def apply_additional_filters(self, filtered_input_data):
    """
    Apply perturbation-mode-dependent filters to the input dataset and
    return the filtered (downsampled, sorted, optionally sliced) dataset.
    """
    if self.cell_states_to_model is not None:
        # keep only cells in the modeled start state (logs the result)
        filtered_input_data = pu.filter_data_by_start_state(
            filtered_input_data, self.cell_states_to_model, self.nproc
        )

    if (self.tokens_to_perturb != "all") and (self.perturb_type != "overexpress"):
        # deletion only makes sense for cells that contain the target tokens
        filtered_input_data = pu.filter_data_by_tokens_and_log(
            filtered_input_data,
            self.tokens_to_perturb,
            self.nproc,
            "genes_to_perturb",
        )

    if self.anchor_token is not None:
        # combination mode requires the anchor gene to be present
        filtered_input_data = pu.filter_data_by_tokens_and_log(
            filtered_input_data, self.anchor_token, self.nproc, "anchor_gene"
        )

    # downsample and sort largest to smallest to encounter memory constraints earlier
    filtered_input_data = pu.downsample_and_sort(
        filtered_input_data, self.max_ncells
    )

    # slice dataset if cell_inds_to_perturb is not "all" (multi-GPU splits)
    if self.cell_inds_to_perturb != "all":
        filtered_input_data = pu.slice_by_inds_to_perturb(
            filtered_input_data, self.cell_inds_to_perturb
        )

    return filtered_input_data
474
+
475
def isp_perturb_set(
    self,
    model,
    filtered_input_data: "Dataset",
    layer_to_quant: int,
    output_path_prefix: str,
):
    """
    Perturb the provided genes_to_perturb set together in every cell and
    write the resulting cosine-shift dictionaries to disk.

    model: loaded model used for the forward passes
    filtered_input_data: filtered .dataset of tokenized cells
    layer_to_quant: index of the hidden layer to quantify embeddings from
    output_path_prefix: path prefix for the output pickle files
    """

    def make_group_perturbation_batch(example):
        # locate each target token in this cell's rank value encoding,
        # record the indices, and apply the perturbation
        encoding = example["input_ids"]
        example["tokens_to_perturb"] = self.tokens_to_perturb
        found_indices = [
            encoding.index(token) if token in encoding else None
            for token in self.tokens_to_perturb
        ]
        found_indices = [idx for idx in found_indices if idx is not None]
        if found_indices:
            example["perturb_index"] = found_indices
        else:
            # -100 indicates tokens to overexpress are not present in rank value encoding
            example["perturb_index"] = [-100]
        if self.perturb_type == "delete":
            example = pu.delete_indices(example)
        elif self.perturb_type == "overexpress":
            example = pu.overexpress_tokens(example, self.max_len)
            example["n_overflow"] = pu.calc_n_overflow(
                self.max_len,
                example["length"],
                self.tokens_to_perturb,
                found_indices,
            )
        return example

    total_batch_length = len(filtered_input_data)
    if self.cell_states_to_model is None:
        cos_sims_dict = defaultdict(list)
    else:
        # one result dictionary per modeled state
        cos_sims_dict = {
            state: defaultdict(list)
            for state in pu.get_possible_states(self.cell_states_to_model)
        }

    perturbed_data = filtered_input_data.map(
        make_group_perturbation_batch, num_proc=self.nproc
    )
    if self.perturb_type == "overexpress":
        filtered_input_data = filtered_input_data.add_column(
            "n_overflow", perturbed_data["n_overflow"]
        )
        # Remove overflow genes from the original data so that embeddings stay
        # comparable: if the original cell holds genes 0:2047 and new gene 2048
        # is overexpressed, the perturbed cell becomes 2048+0:2046, so the
        # original is truncated to 0:2046 (otherwise we would be modeling the
        # effect of both deleting 2047 and adding 2048, rather than only
        # adding 2048).
        filtered_input_data = filtered_input_data.map(
            pu.truncate_by_n_overflow, num_proc=self.nproc
        )

    if self.emb_mode == "cell_and_gene":
        stored_gene_embs_dict = defaultdict(list)

    # iterate through minibatches of cells
    for batch_start in trange(0, total_batch_length, self.forward_batch_size):
        batch_end = min(batch_start + self.forward_batch_size, total_batch_length)
        batch_inds = list(range(batch_start, batch_end))

        minibatch = filtered_input_data.select(batch_inds)
        perturbation_batch = perturbed_data.select(batch_inds)

        if self.cell_emb_style == "mean_pool":
            full_original_emb = get_embs(
                model,
                minibatch,
                "gene",
                layer_to_quant,
                self.pad_token_id,
                self.forward_batch_size,
                summary_stat=None,
                silent=True,
            )
            indices_to_perturb = perturbation_batch["perturb_index"]
            # drop the perturbed positions so original/perturbed align gene-wise
            original_emb = pu.remove_perturbed_indices_set(
                full_original_emb,
                self.perturb_type,
                indices_to_perturb,
                self.tokens_to_perturb,
                minibatch["length"],
            )
            full_perturbation_emb = get_embs(
                model,
                perturbation_batch,
                "gene",
                layer_to_quant,
                self.pad_token_id,
                self.forward_batch_size,
                summary_stat=None,
                silent=True,
            )

            # overexpressed genes sit at the front of the encoding: strip them
            if self.perturb_type == "overexpress":
                perturbation_emb = full_perturbation_emb[
                    :, len(self.tokens_to_perturb) :, :
                ]
            elif self.perturb_type == "delete":
                perturbation_emb = full_perturbation_emb[
                    :, : max(perturbation_batch["length"]), :
                ]

            n_perturbation_genes = perturbation_emb.size()[1]

            # without goal states, the cell shift is the mean of the
            # per-gene cosine similarities
            if (
                self.cell_states_to_model is None
                or self.emb_mode == "cell_and_gene"
            ):
                gene_cos_sims = pu.quant_cos_sims(
                    perturbation_emb,
                    original_emb,
                    self.cell_states_to_model,
                    self.state_embs_dict,
                    emb_mode="gene",
                )

            # with goal states, quantify the shift of the pooled cell embedding
            if self.cell_states_to_model is not None:
                original_cell_emb = pu.mean_nonpadding_embs(
                    full_original_emb,
                    torch.tensor(minibatch["length"], device="cuda"),
                    dim=1,
                )
                perturbation_cell_emb = pu.mean_nonpadding_embs(
                    full_perturbation_emb,
                    torch.tensor(perturbation_batch["length"], device="cuda"),
                    dim=1,
                )
                cell_cos_sims = pu.quant_cos_sims(
                    perturbation_cell_emb,
                    original_cell_emb,
                    self.cell_states_to_model,
                    self.state_embs_dict,
                    emb_mode="cell",
                )

            # gene-level output needs gene names for the remaining positions
            if self.emb_mode == "cell_and_gene":
                gene_list = minibatch["input_ids"]
                # truncate to the positions actually kept in perturbation_emb
                gene_list = [
                    [g for g in genes if g not in self.tokens_to_perturb][
                        :n_perturbation_genes
                    ]
                    for genes in gene_list
                ]

                for cell_i, genes in enumerate(gene_list):
                    for gene_j, affected_gene in enumerate(genes):
                        if len(self.genes_to_perturb) > 1:
                            tokens_to_perturb = tuple(self.tokens_to_perturb)
                        else:
                            tokens_to_perturb = self.tokens_to_perturb[0]

                        # fill in the gene cosine similarities
                        try:
                            stored_gene_embs_dict[
                                (tokens_to_perturb, affected_gene)
                            ].append(gene_cos_sims[cell_i, gene_j].item())
                        except KeyError:
                            stored_gene_embs_dict[
                                (tokens_to_perturb, affected_gene)
                            ] = gene_cos_sims[cell_i, gene_j].item()
            else:
                gene_list = None

            if self.cell_states_to_model is None:
                # mean of gene cosine similarities over non-padding positions
                if self.perturb_type == "overexpress":
                    # overexpressed genes were removed before the cos sims,
                    # so shorten the nonpadding lengths accordingly
                    n_overexpressed = len(self.tokens_to_perturb)
                    nonpadding_lens = [
                        x - n_overexpressed for x in perturbation_batch["length"]
                    ]
                else:
                    nonpadding_lens = perturbation_batch["length"]
                cos_sims_data = pu.mean_nonpadding_embs(
                    gene_cos_sims, torch.tensor(nonpadding_lens, device="cuda")
                )
                cos_sims_dict = self.update_perturbation_dictionary(
                    cos_sims_dict,
                    cos_sims_data,
                    filtered_input_data,
                    indices_to_perturb,
                    gene_list,
                )
            else:
                cos_sims_data = cell_cos_sims
                for state in cos_sims_dict.keys():
                    cos_sims_dict[state] = self.update_perturbation_dictionary(
                        cos_sims_dict[state],
                        cos_sims_data[state],
                        filtered_input_data,
                        indices_to_perturb,
                        gene_list,
                    )

            # release per-batch tensors before the next iteration
            del minibatch
            del perturbation_batch
            del original_emb
            del perturbation_emb
            del cos_sims_data

            torch.cuda.empty_cache()

    pu.write_perturbation_dictionary(
        cos_sims_dict,
        f"{output_path_prefix}_cell_embs_dict_{self.tokens_to_perturb}",
    )

    if self.emb_mode == "cell_and_gene":
        pu.write_perturbation_dictionary(
            stored_gene_embs_dict,
            f"{output_path_prefix}_gene_embs_dict_{self.tokens_to_perturb}",
        )
703
+
704
def isp_perturb_all(
    self,
    model,
    filtered_input_data: "Dataset",
    layer_to_quant: int,
    output_path_prefix: str,
):
    """
    Perturb every detected gene individually (optionally in combination with
    the anchor gene) in each cell, checkpointing cosine-shift dictionaries to
    disk in batches of 1000 cells.

    model: loaded model used for the forward passes
    filtered_input_data: filtered .dataset of tokenized cells
    layer_to_quant: index of the hidden layer to quantify embeddings from
    output_path_prefix: path prefix for the output pickle files
    """
    pickle_batch = -1
    if self.cell_states_to_model is None:
        cos_sims_dict = defaultdict(list)
    else:
        # one result dictionary per modeled state
        cos_sims_dict = {
            state: defaultdict(list)
            for state in pu.get_possible_states(self.cell_states_to_model)
        }

    if self.emb_mode == "cell_and_gene":
        stored_gene_embs_dict = defaultdict(list)

    for i in trange(len(filtered_input_data)):
        example_cell = filtered_input_data.select([i])
        full_original_emb = get_embs(
            model,
            example_cell,
            "gene",
            layer_to_quant,
            self.pad_token_id,
            self.forward_batch_size,
            summary_stat=None,
            silent=True,
        )

        # gene_list maps cosine shifts back to genes; the anchor gene is
        # removed since it is perturbed in every combination, not measured
        gene_list = example_cell["input_ids"][0][:]
        if self.anchor_token is not None:
            for token in self.anchor_token:
                gene_list.remove(token)

        perturbation_batch, indices_to_perturb = pu.make_perturbation_batch(
            example_cell,
            self.perturb_type,
            self.tokens_to_perturb,
            self.anchor_token,
            self.combos,
            self.nproc,
        )

        full_perturbation_emb = get_embs(
            model,
            perturbation_batch,
            "gene",
            layer_to_quant,
            self.pad_token_id,
            self.forward_batch_size,
            summary_stat=None,
            silent=True,
        )

        num_inds_perturbed = 1 + self.combos
        # overexpressed gene(s) must be removed to quantify cosine shifts
        if self.perturb_type == "overexpress":
            perturbation_emb = full_perturbation_emb[:, num_inds_perturbed:, :]
            gene_list = gene_list[
                num_inds_perturbed:
            ]  # index 0 is not overexpressed

        elif self.perturb_type == "delete":
            perturbation_emb = full_perturbation_emb

        original_batch = pu.make_comparison_batch(
            full_original_emb, indices_to_perturb, perturb_group=False
        )

        if self.cell_states_to_model is None or self.emb_mode == "cell_and_gene":
            gene_cos_sims = pu.quant_cos_sims(
                perturbation_emb,
                original_batch,
                self.cell_states_to_model,
                self.state_embs_dict,
                emb_mode="gene",
            )
        if self.cell_states_to_model is not None:
            original_cell_emb = pu.compute_nonpadded_cell_embedding(
                full_original_emb, "mean_pool"
            )
            perturbation_cell_emb = pu.compute_nonpadded_cell_embedding(
                full_perturbation_emb, "mean_pool"
            )

            cell_cos_sims = pu.quant_cos_sims(
                perturbation_cell_emb,
                original_cell_emb,
                self.cell_states_to_model,
                self.state_embs_dict,
                emb_mode="cell",
            )

        if self.emb_mode == "cell_and_gene":
            # for each perturbation, the affected-gene list excludes the
            # perturbed gene itself (renamed comprehension index to avoid
            # shadowing the outer loop variable)
            perturbed_gene_dict = {
                gene: gene_list[:idx] + gene_list[idx + 1 :]
                for idx, gene in enumerate(gene_list)
            }

            for perturbation_i, perturbed_gene in enumerate(gene_list):
                for gene_j, affected_gene in enumerate(
                    perturbed_gene_dict[perturbed_gene]
                ):
                    try:
                        stored_gene_embs_dict[
                            (perturbed_gene, affected_gene)
                        ].append(gene_cos_sims[perturbation_i, gene_j].item())
                    except KeyError:
                        stored_gene_embs_dict[
                            (perturbed_gene, affected_gene)
                        ] = gene_cos_sims[perturbation_i, gene_j].item()

        if self.cell_states_to_model is None:
            cos_sims_data = torch.mean(gene_cos_sims, dim=1)
            cos_sims_dict = self.update_perturbation_dictionary(
                cos_sims_dict,
                cos_sims_data,
                filtered_input_data,
                indices_to_perturb,
                gene_list,
            )
        else:
            cos_sims_data = cell_cos_sims
            for state in cos_sims_dict.keys():
                cos_sims_dict[state] = self.update_perturbation_dictionary(
                    cos_sims_dict[state],
                    cos_sims_data[state],
                    filtered_input_data,
                    indices_to_perturb,
                    gene_list,
                )

        # checkpoint the dictionaries to disk every 100 cells
        if i % 100 == 0:
            pu.write_perturbation_dictionary(
                cos_sims_dict,
                f"{output_path_prefix}_dict_cell_embs_1Kbatch{pickle_batch}",
            )
            if self.emb_mode == "cell_and_gene":
                pu.write_perturbation_dictionary(
                    stored_gene_embs_dict,
                    f"{output_path_prefix}_dict_gene_embs_1Kbatch{pickle_batch}",
                )

        # start a fresh pickle batch and clear memory every 1000 cells
        if i % 1000 == 0:
            pickle_batch += 1
            if self.cell_states_to_model is None:
                cos_sims_dict = defaultdict(list)
            else:
                cos_sims_dict = {
                    state: defaultdict(list)
                    for state in pu.get_possible_states(self.cell_states_to_model)
                }

            if self.emb_mode == "cell_and_gene":
                stored_gene_embs_dict = defaultdict(list)

        torch.cuda.empty_cache()

    pu.write_perturbation_dictionary(
        cos_sims_dict, f"{output_path_prefix}_dict_cell_embs_1Kbatch{pickle_batch}"
    )

    if self.emb_mode == "cell_and_gene":
        pu.write_perturbation_dictionary(
            stored_gene_embs_dict,
            f"{output_path_prefix}_dict_gene_embs_1Kbatch{pickle_batch}",
        )
878
+
879
def update_perturbation_dictionary(
    self,
    cos_sims_dict: "defaultdict",
    cos_sims_data: "torch.Tensor",
    filtered_input_data: "Dataset",
    indices_to_perturb: "List[List[int]]",
    gene_list=None,
):
    """
    Append one batch of cosine-similarity results to cos_sims_dict and
    return the updated dictionary.

    In group mode (perturb_group), all results are keyed by the single
    tuple/token of perturbed genes; otherwise each result is keyed by the
    individual perturbed gene from gene_list. Dictionary keys are
    (perturbed gene(s), "cell_emb").

    Raises ValueError if gene_list length does not match the number of
    cosine-similarity rows.
    """
    # sanity check: exactly one cosine similarity per perturbed gene
    if gene_list is not None and cos_sims_data.shape[0] != len(gene_list):
        # previously logged a malformed message ("len(cos_sims_data.shape[0])")
        # and used a bare ``raise`` with no active exception
        msg = (
            f"cos_sims_data.shape[0] ({cos_sims_data.shape[0]}) does not match "
            f"len(gene_list) ({len(gene_list)})."
        )
        logger.error(msg)
        raise ValueError(msg)

    if self.perturb_group is True:
        # all genes perturbed together -> one dictionary key for the group
        if len(self.tokens_to_perturb) > 1:
            perturbed_genes = tuple(self.tokens_to_perturb)
        else:
            perturbed_genes = self.tokens_to_perturb[0]

        # cell embeddings arrive with shape (batch size, 1); flatten to a list
        cos_sims_data = torch.squeeze(cos_sims_data).tolist()

        # squeezing a single remaining cell yields a scalar; rewrap as a list
        if not isinstance(cos_sims_data, list):
            cos_sims_data = [cos_sims_data]

        cos_sims_dict[(perturbed_genes, "cell_emb")] += cos_sims_data

    else:
        # one perturbation per gene -> key each result by its perturbed gene
        for i, cos in enumerate(cos_sims_data.tolist()):
            cos_sims_dict[(gene_list[i], "cell_emb")].append(cos)

    return cos_sims_dict
geneformer/in_silico_perturber_stats.py ADDED
@@ -0,0 +1,1042 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Geneformer in silico perturber stats generator.
3
+
4
+ **Usage:**
5
+
6
+ .. code-block :: python
7
+
8
+ >>> from geneformer import InSilicoPerturberStats
9
+ >>> ispstats = InSilicoPerturberStats(mode="goal_state_shift",
10
+ ... cell_states_to_model={"state_key": "disease",
11
+ ... "start_state": "dcm",
12
+ ... "goal_state": "nf",
13
+ ... "alt_states": ["hcm", "other1", "other2"]})
14
+ >>> ispstats.get_stats("path/to/input_data",
15
+ ... None,
16
+ ... "path/to/output_directory",
17
+ ... "output_prefix")
18
+
19
+ **Description:**
20
+
21
+ | Aggregates data or calculates stats for in silico perturbations based on type of statistics specified in InSilicoPerturberStats.
22
+ | Input data is raw in silico perturbation results in the form of dictionaries outputted by ``in_silico_perturber``.
23
+
24
+ """
25
+
26
+
27
+ import logging
28
+ import os
29
+ import pickle
30
+ import random
31
+ from pathlib import Path
32
+
33
+ import numpy as np
34
+ import pandas as pd
35
+ import statsmodels.stats.multitest as smt
36
+ from scipy.stats import ranksums
37
+ from sklearn.mixture import GaussianMixture
38
+ from tqdm.auto import tqdm, trange
39
+
40
+ from .perturber_utils import flatten_list, validate_cell_states_to_model
41
+ from .tokenizer import TOKEN_DICTIONARY_FILE
42
+
43
+ GENE_NAME_ID_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
# invert dictionary keys/values
def invert_dict(dictionary):
    """Return a new dict with keys and values of *dictionary* swapped.

    When several keys share a value, the last one seen wins.
    """
    inverted = {}
    for key, value in dictionary.items():
        inverted[value] = key
    return inverted
51
+
52
+
53
def read_dict(cos_sims_dict, cell_or_gene_emb, anchor_token):
    """
    Filter one raw cosine-similarity dictionary down to the entries of interest.

    "cell" mode keeps non-empty entries whose key tuple contains "cell_emb";
    "gene" mode keeps all non-empty entries, optionally restricted to those
    whose first key element equals *anchor_token*. The result is wrapped in a
    single-element list. Any other mode returns None (unchanged behavior).
    """
    if cell_or_gene_emb == "cell":
        filtered = {
            key: value
            for key, value in cos_sims_dict.items()
            if value and "cell_emb" in key
        }
        return [filtered]
    if cell_or_gene_emb == "gene":
        filtered = {}
        for key, value in cos_sims_dict.items():
            if not value:
                continue
            if anchor_token is not None and key[0] != anchor_token:
                continue
            filtered[key] = value
        return [filtered]
67
+
68
+
69
# read raw dictionary files
def read_dictionaries(
    input_data_directory,
    cell_or_gene_emb,
    anchor_token,
    cell_states_to_model,
    pickle_suffix,
):
    """
    Load and merge all raw in silico perturbation pickles from a directory.

    **Parameters:**

    input_data_directory : Path
        | Directory containing pickled result dictionaries.
    cell_or_gene_emb : {"cell", "gene"}
        | Which embedding entries to keep (see read_dict).
    anchor_token : int, None
        | Optional anchor gene token filter for "gene" mode.
    cell_states_to_model : dict, None
        | If provided, raw pickles are keyed per state value and merged into
        | one dict per state; if None, each pickle yields one dict in a list.
    pickle_suffix : str
        | Only files ending with this suffix (e.g. "_raw.pickle") are read.

    **Returns:**

    list of dicts (cell_states_to_model is None) or dict of state -> dict.

    **Raises:**

    FileNotFoundError
        | If no file with the given suffix exists in the directory.
    """
    file_found = False
    file_path_list = []
    if cell_states_to_model is None:
        dict_list = []
    else:
        validate_cell_states_to_model(cell_states_to_model)
        # keep only real state entries ("state_key" is metadata, not a state)
        cell_states_to_model_valid = {
            state: value
            for state, value in cell_states_to_model.items()
            if state != "state_key"
            and cell_states_to_model[state] is not None
            and cell_states_to_model[state] != []
        }
        cell_states_list = []
        # flatten all state values into list
        for state in cell_states_to_model_valid:
            value = cell_states_to_model_valid[state]
            if isinstance(value, list):
                cell_states_list += value
            else:
                cell_states_list.append(value)
        state_dict = {state_value: dict() for state_value in cell_states_list}
    for file in os.listdir(input_data_directory):
        # process only files with given suffix (e.g. "_raw.pickle")
        if file.endswith(pickle_suffix):
            file_found = True
            file_path_list += [f"{input_data_directory}/{file}"]
    for file_path in tqdm(file_path_list):
        with open(file_path, "rb") as fp:
            cos_sims_dict = pickle.load(fp)
        if cell_states_to_model is None:
            dict_list += read_dict(cos_sims_dict, cell_or_gene_emb, anchor_token)
        else:
            # merge this pickle's per-state entries into the accumulators
            for state_value in cell_states_list:
                new_dict = read_dict(
                    cos_sims_dict[state_value], cell_or_gene_emb, anchor_token
                )[0]
                for key in new_dict:
                    try:
                        state_dict[state_value][key] += new_dict[key]
                    except KeyError:
                        state_dict[state_value][key] = new_dict[key]
    if not file_found:
        # bug fix: message was not an f-string, so "{pickle_suffix}" printed
        # literally; also raise a real exception instead of a bare `raise`
        # (which would surface as "RuntimeError: No active exception")
        msg = (
            "No raw data for processing found within provided directory. "
            f"Please ensure data files end with '{pickle_suffix}'."
        )
        logger.error(msg)
        raise FileNotFoundError(msg)
    if cell_states_to_model is None:
        return dict_list
    else:
        return state_dict
129
+
130
+
131
# get complete gene list
def get_gene_list(dict_list, mode):
    """
    Collect the sorted list of gene tokens present in raw results.

    mode "cell": keys look like (gene, "cell_emb") -> gene at position 0.
    mode "gene": keys look like (anchor, gene) -> gene at position 1; the
    aggregate "cell_emb" pseudo-entry is excluded.

    **Raises:**

    ValueError for an unknown mode; TypeError if dict_list is neither a
    list nor a dict.
    """
    if mode == "cell":
        position = 0
    elif mode == "gene":
        position = 1
    else:
        # bug fix: an unknown mode previously fell through and later raised
        # an opaque NameError on `position`
        raise ValueError(f"mode must be 'cell' or 'gene', not {mode!r}")
    gene_set = set()
    if isinstance(dict_list, list):
        for dict_i in dict_list:
            gene_set.update([k[position] for k, v in dict_i.items() if v])
    elif isinstance(dict_list, dict):
        for state, dict_i in dict_list.items():
            gene_set.update([k[position] for k, v in dict_i.items() if v])
    else:
        # bug fix: was a bare `raise` after logging (RuntimeError, message lost)
        msg = (
            "dict_list should be a list, or if modeling shift to goal states, a dict. "
            f"{type(dict_list)} is not the correct format."
        )
        logger.error(msg)
        raise TypeError(msg)
    gene_list = list(gene_set)
    if mode == "gene" and "cell_emb" in gene_set:
        # bug fix: unconditional remove() crashed when no cell_emb entry existed
        gene_list.remove("cell_emb")
    gene_list.sort()
    return gene_list
155
+
156
+
157
def token_tuple_to_ensembl_ids(token_tuple, gene_token_id_dict):
    """Map a token tuple (or single token) to Ensembl ID(s).

    Unknown tokens map to NaN; a non-iterable input is looked up directly.
    """
    try:
        ids = [gene_token_id_dict.get(token, np.nan) for token in token_tuple]
    except TypeError:
        # non-iterable input: treat it as a single token
        return gene_token_id_dict.get(token_tuple, np.nan)
    return tuple(ids)
162
+
163
+
164
def n_detections(token, dict_list, mode, anchor_token):
    """Count total recorded cosine shifts for *token* across all result dicts.

    "cell" mode looks up (token, "cell_emb"); "gene" mode looks up
    (anchor_token, token). Any other mode counts nothing.
    """
    total = 0
    for result_dict in dict_list:
        if mode == "cell":
            matches = result_dict.get((token, "cell_emb"), [])
        elif mode == "gene":
            matches = result_dict.get((anchor_token, token), [])
        else:
            matches = []
        total += len(matches)
    return total
172
+
173
+
174
def get_fdr(pvalues):
    """Return Benjamini-Hochberg (FDR) adjusted p-values as a list."""
    corrected = smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1]
    return list(corrected)
176
+
177
+
178
def get_impact_component(test_value, gaussian_mixture_model):
    """
    Assign *test_value* to the impact (1) or non-impact (0) side of a fitted
    two-component mixture model.

    Values above the second component's mean map to 0 and values below the
    first component's mean map to 1 without consulting the model; values in
    between use the model's prediction with the label flipped (the fitted
    component order is inverted relative to the convention used here).
    """
    lower_mean = gaussian_mixture_model.means_[0][0]
    upper_mean = gaussian_mixture_model.means_[1][0]
    if test_value > upper_mean:
        return 0
    if test_value < lower_mean:
        return 1
    predicted = gaussian_mixture_model.predict([[test_value]])[0]
    return 0 if predicted == 1 else 1
192
+
193
+
194
# aggregate data for single perturbation in multiple cells
def isp_aggregate_grouped_perturb(cos_sims_df, dict_list):
    """
    Gather every cosine shift recorded for the single perturbed gene (row 0 of
    *cos_sims_df*) across all result dictionaries into a one-column DataFrame.
    """
    perturbed_token = cos_sims_df["Gene"][0]
    shifts = []
    for result_dict in dict_list:
        shifts.extend(result_dict.get((perturbed_token, "cell_emb"), []))

    aggregated = pd.DataFrame(columns=["Cosine_shift"])
    aggregated["Cosine_shift"] = shifts
    return aggregated
205
+
206
+
207
def find(variable, x):
    """
    Return True if *x* is contained in *variable* (when iterable) or equal to
    it (when not).

    Bug fix: the original returned an implicit None (rather than False) when
    *variable* was iterable but did not contain *x*; this now always returns
    a bool, preserving truthiness for all existing callers.
    """
    try:
        return x in variable  # membership test for iterables
    except (ValueError, TypeError):
        return x == variable  # fall back to equality for non-iterables
213
+
214
+
215
def isp_aggregate_gene_shifts(
    cos_sims_df, dict_list, gene_token_id_dict, gene_id_name_dict
):
    """
    Aggregate per-(perturbed, affected) cosine shifts into a summary DataFrame.

    For each perturbed gene in cos_sims_df, collects every recorded shift whose
    key starts with that gene, then reports mean, stdev, and count per
    (perturbed, affected) pair. Rows where the affected entry is the aggregate
    "cell_emb" are sorted to the top, then by descending mean shift.
    """
    # accumulate raw shift lists keyed by the full (perturbed, affected) tuple
    cos_shift_data = dict()
    for i in trange(cos_sims_df.shape[0]):
        token = cos_sims_df["Gene"][i]
        for dict_i in dict_list:
            # find() matches token against k[0] whether k[0] is a single token
            # or a tuple of grouped perturbed tokens
            affected_pairs = [k for k, v in dict_i.items() if find(k[0], token)]
            for key in affected_pairs:
                if key in cos_shift_data.keys():
                    cos_shift_data[key] += dict_i.get(key, [])
                else:
                    cos_shift_data[key] = dict_i.get(key, [])

    # summarize each pair as [mean, stdev, n]
    cos_data_mean = {
        k: [np.mean(v), np.std(v), len(v)] for k, v in cos_shift_data.items()
    }
    cos_sims_full_df = pd.DataFrame()
    cos_sims_full_df["Perturbed"] = [k[0] for k, v in cos_data_mean.items()]
    # NOTE(review): the trailing [0] is a pandas *label* lookup, which assumes
    # the matching cos_sims_df row carries index label 0 (true for the
    # single-row grouped-perturbation case) — confirm for multi-row inputs
    cos_sims_full_df["Gene_name"] = [
        cos_sims_df[cos_sims_df["Gene"] == k[0]]["Gene_name"][0]
        for k, v in cos_data_mean.items()
    ]
    cos_sims_full_df["Ensembl_ID"] = [
        cos_sims_df[cos_sims_df["Gene"] == k[0]]["Ensembl_ID"][0]
        for k, v in cos_data_mean.items()
    ]

    # map each affected token back to gene name / Ensembl ID (NaN if unknown,
    # including the "cell_emb" pseudo-token)
    cos_sims_full_df["Affected"] = [k[1] for k, v in cos_data_mean.items()]
    cos_sims_full_df["Affected_gene_name"] = [
        gene_id_name_dict.get(gene_token_id_dict.get(token, np.nan), np.nan)
        for token in cos_sims_full_df["Affected"]
    ]
    cos_sims_full_df["Affected_Ensembl_ID"] = [
        gene_token_id_dict.get(token, np.nan) for token in cos_sims_full_df["Affected"]
    ]
    cos_sims_full_df["Cosine_shift_mean"] = [v[0] for k, v in cos_data_mean.items()]
    cos_sims_full_df["Cosine_shift_stdev"] = [v[1] for k, v in cos_data_mean.items()]
    cos_sims_full_df["N_Detections"] = [v[2] for k, v in cos_data_mean.items()]

    specific_val = "cell_emb"
    cos_sims_full_df["temp"] = list(cos_sims_full_df["Affected"] == specific_val)
    # reorder so cell embs are at the top and all are subordered by magnitude of cosine shift
    cos_sims_full_df = cos_sims_full_df.sort_values(
        by=(["temp", "Cosine_shift_mean"]), ascending=[False, False]
    ).drop("temp", axis=1)

    return cos_sims_full_df
263
+
264
+
265
# stats comparing cos sims shifts towards goal state of test perturbations vs random perturbations
def isp_stats_to_goal_state(
    cos_sims_df, result_dict, cell_states_to_model, genes_perturbed
):
    """
    Summarize shifts toward the goal (and alternate) cell states.

    For a single grouped perturbation (genes_perturbed != "all") this simply
    reports mean shifts per state. For genes_perturbed == "all" it also
    computes rank-sum p-values of each gene's shifts against the pooled
    "random" background of all genes' shifts, plus BH-FDR corrections.
    """
    # determine whether any alternate end states were modeled; [None] and an
    # empty list both mean "no alternates"
    if (
        ("alt_states" not in cell_states_to_model.keys())
        or (len(cell_states_to_model["alt_states"]) == 0)
        or (cell_states_to_model["alt_states"] == [None])
    ):
        alt_end_state_exists = False
    elif (len(cell_states_to_model["alt_states"]) > 0) and (
        cell_states_to_model["alt_states"] != [None]
    ):
        alt_end_state_exists = True

    # for single perturbation in multiple cells, there are no random perturbations to compare to
    if genes_perturbed != "all":
        cos_sims_full_df = pd.DataFrame()

        cos_shift_data_end = []
        token = cos_sims_df["Gene"][0]
        cos_shift_data_end += result_dict[cell_states_to_model["goal_state"]].get(
            (token, "cell_emb"), []
        )
        cos_sims_full_df["Shift_to_goal_end"] = [np.mean(cos_shift_data_end)]
        if alt_end_state_exists is True:
            for alt_state in cell_states_to_model["alt_states"]:
                cos_shift_data_alt_state = []
                cos_shift_data_alt_state += result_dict.get(alt_state).get(
                    (token, "cell_emb"), []
                )
                cos_sims_full_df[f"Shift_to_alt_end_{alt_state}"] = [
                    np.mean(cos_shift_data_alt_state)
                ]

        # sort by shift to desired state
        cos_sims_full_df = cos_sims_full_df.sort_values(
            by=["Shift_to_goal_end"], ascending=[False]
        )
        return cos_sims_full_df

    elif genes_perturbed == "all":
        # pool every gene's shifts as the "random" background distribution
        goal_end_random_megalist = []
        if alt_end_state_exists is True:
            alt_end_state_random_dict = {
                alt_state: [] for alt_state in cell_states_to_model["alt_states"]
            }
        for i in trange(cos_sims_df.shape[0]):
            token = cos_sims_df["Gene"][i]
            goal_end_random_megalist += result_dict[
                cell_states_to_model["goal_state"]
            ].get((token, "cell_emb"), [])
            if alt_end_state_exists is True:
                for alt_state in cell_states_to_model["alt_states"]:
                    alt_end_state_random_dict[alt_state] += result_dict[alt_state].get(
                        (token, "cell_emb"), []
                    )

        # downsample to improve speed of ranksums
        if len(goal_end_random_megalist) > 100_000:
            random.seed(42)  # fixed seed for reproducible downsampling
            goal_end_random_megalist = random.sample(
                goal_end_random_megalist, k=100_000
            )
        if alt_end_state_exists is True:
            for alt_state in cell_states_to_model["alt_states"]:
                if len(alt_end_state_random_dict[alt_state]) > 100_000:
                    random.seed(42)
                    alt_end_state_random_dict[alt_state] = random.sample(
                        alt_end_state_random_dict[alt_state], k=100_000
                    )

        # assemble output columns; alt-state columns are inserted before the
        # p-value columns so related columns stay adjacent
        names = [
            "Gene",
            "Gene_name",
            "Ensembl_ID",
            "Shift_to_goal_end",
            "Goal_end_vs_random_pval",
        ]
        if alt_end_state_exists is True:
            [
                names.append(f"Shift_to_alt_end_{alt_state}")
                for alt_state in cell_states_to_model["alt_states"]
            ]
            names.append(names.pop(names.index("Goal_end_vs_random_pval")))
            [
                names.append(f"Alt_end_vs_random_pval_{alt_state}")
                for alt_state in cell_states_to_model["alt_states"]
            ]
        cos_sims_full_df = pd.DataFrame(columns=names)

        n_detections_dict = dict()
        for i in trange(cos_sims_df.shape[0]):
            token = cos_sims_df["Gene"][i]
            name = cos_sims_df["Gene_name"][i]
            ensembl_id = cos_sims_df["Ensembl_ID"][i]
            goal_end_cos_sim_megalist = result_dict[
                cell_states_to_model["goal_state"]
            ].get((token, "cell_emb"), [])
            n_detections_dict[token] = len(goal_end_cos_sim_megalist)
            mean_goal_end = np.mean(goal_end_cos_sim_megalist)
            # Wilcoxon rank-sum: this gene's shifts vs the pooled background
            pval_goal_end = ranksums(
                goal_end_random_megalist, goal_end_cos_sim_megalist
            ).pvalue

            if alt_end_state_exists is True:
                alt_end_state_dict = {
                    alt_state: [] for alt_state in cell_states_to_model["alt_states"]
                }
                for alt_state in cell_states_to_model["alt_states"]:
                    alt_end_state_dict[alt_state] = result_dict[alt_state].get(
                        (token, "cell_emb"), []
                    )
                    alt_end_state_dict[f"{alt_state}_mean"] = np.mean(
                        alt_end_state_dict[alt_state]
                    )
                    alt_end_state_dict[f"{alt_state}_pval"] = ranksums(
                        alt_end_state_random_dict[alt_state],
                        alt_end_state_dict[alt_state],
                    ).pvalue

            results_dict = dict()
            results_dict["Gene"] = token
            results_dict["Gene_name"] = name
            results_dict["Ensembl_ID"] = ensembl_id
            results_dict["Shift_to_goal_end"] = mean_goal_end
            results_dict["Goal_end_vs_random_pval"] = pval_goal_end
            if alt_end_state_exists is True:
                for alt_state in cell_states_to_model["alt_states"]:
                    results_dict[f"Shift_to_alt_end_{alt_state}"] = alt_end_state_dict[
                        f"{alt_state}_mean"
                    ]
                    results_dict[
                        f"Alt_end_vs_random_pval_{alt_state}"
                    ] = alt_end_state_dict[f"{alt_state}_pval"]

            cos_sims_df_i = pd.DataFrame(results_dict, index=[i])
            cos_sims_full_df = pd.concat([cos_sims_full_df, cos_sims_df_i])

        # multiple-testing correction (Benjamini-Hochberg)
        cos_sims_full_df["Goal_end_FDR"] = get_fdr(
            list(cos_sims_full_df["Goal_end_vs_random_pval"])
        )
        if alt_end_state_exists is True:
            for alt_state in cell_states_to_model["alt_states"]:
                cos_sims_full_df[f"Alt_end_FDR_{alt_state}"] = get_fdr(
                    list(cos_sims_full_df[f"Alt_end_vs_random_pval_{alt_state}"])
                )

        # quantify number of detections of each gene
        cos_sims_full_df["N_Detections"] = [
            n_detections_dict[token] for token in cos_sims_full_df["Gene"]
        ]

        # sort by shift to desired state
        cos_sims_full_df["Sig"] = [
            1 if fdr < 0.05 else 0 for fdr in cos_sims_full_df["Goal_end_FDR"]
        ]
        cos_sims_full_df = cos_sims_full_df.sort_values(
            by=["Sig", "Shift_to_goal_end", "Goal_end_FDR"],
            ascending=[False, False, True],
        )

        return cos_sims_full_df
428
+
429
+
430
# stats comparing cos sim shifts of test perturbations vs null distribution
def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
    """
    Compare each gene's cosine shifts against a null distribution.

    For every gene in cos_sims_df, collects its shifts from dict_list and
    null_dict_list, reports the mean of each, their difference, a Wilcoxon
    rank-sum p-value (NaNs replaced with 1), BH-FDR correction, detection
    counts, and a 0/1 significance flag (FDR < 0.05), sorted by significance
    then effect size.
    """
    cos_sims_full_df = cos_sims_df.copy()

    # pre-allocate output columns with the correct dtypes
    cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
    cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
    cos_sims_full_df["Test_vs_null_avg_shift"] = np.zeros(
        cos_sims_df.shape[0], dtype=float
    )
    cos_sims_full_df["Test_vs_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
    cos_sims_full_df["Test_vs_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
    cos_sims_full_df["N_Detections_test"] = np.zeros(
        cos_sims_df.shape[0], dtype="uint32"
    )
    cos_sims_full_df["N_Detections_null"] = np.zeros(
        cos_sims_df.shape[0], dtype="uint32"
    )

    for i in trange(cos_sims_df.shape[0]):
        token = cos_sims_df["Gene"][i]
        test_shifts = []
        null_shifts = []

        for dict_i in dict_list:
            test_shifts += dict_i.get((token, "cell_emb"), [])

        for dict_i in null_dict_list:
            null_shifts += dict_i.get((token, "cell_emb"), [])

        cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
        cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
        cos_sims_full_df.loc[i, "Test_vs_null_avg_shift"] = np.mean(
            test_shifts
        ) - np.mean(null_shifts)
        cos_sims_full_df.loc[i, "Test_vs_null_pval"] = ranksums(
            test_shifts, null_shifts, nan_policy="omit"
        ).pvalue
        cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
        cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)

    # replace NaN p-values (e.g. from empty shift lists) with 1.
    # perf fix: this whole-column rewrite previously ran inside the loop on
    # every iteration (O(n^2) total work); doing it once after the loop
    # produces the identical final column.
    cos_sims_full_df.Test_vs_null_pval = np.where(
        np.isnan(cos_sims_full_df.Test_vs_null_pval),
        1,
        cos_sims_full_df.Test_vs_null_pval,
    )

    cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(
        cos_sims_full_df["Test_vs_null_pval"]
    )

    cos_sims_full_df["Sig"] = [
        1 if fdr < 0.05 else 0 for fdr in cos_sims_full_df["Test_vs_null_FDR"]
    ]
    cos_sims_full_df = cos_sims_full_df.sort_values(
        by=["Sig", "Test_vs_null_avg_shift", "Test_vs_null_FDR"],
        ascending=[False, False, True],
    )
    return cos_sims_full_df
488
+
489
+
490
# stats for identifying perturbations with largest effect within a given set of cells
# fits a mixture model to 2 components (impact vs. non-impact) and
# reports the most likely component for each test perturbation
# Note: because assumes given perturbation has a consistent effect in the cells tested,
# we recommend only using the mixture model strategy with uniform cell populations
def isp_stats_mixture_model(cos_sims_df, dict_list, combos, anchor_token):
    """
    Classify perturbations as impactful vs non-impactful via a 2-component GMM.

    First pass: compute each gene's mean shift and fit a GaussianMixture to
    those means. Second pass: assign each gene (and each of its individual
    shift values) to a component via get_impact_component. combos == 1 entries
    are (anchor, token, combo) shift triples; combos == 0 entries are plain
    shift values.
    """
    names = ["Gene", "Gene_name", "Ensembl_ID"]

    if combos == 0:
        names += ["Test_avg_shift"]
    elif combos == 1:
        names += [
            "Anchor_shift",
            "Test_token_shift",
            "Sum_of_indiv_shifts",
            "Combo_shift",
            "Combo_minus_sum_shift",
        ]

    names += ["Impact_component", "Impact_component_percent"]

    cos_sims_full_df = pd.DataFrame(columns=names)
    avg_values = []
    gene_names = []

    # first pass: collect the per-gene mean shifts used to fit the GMM
    for i in trange(cos_sims_df.shape[0]):
        token = cos_sims_df["Gene"][i]
        name = cos_sims_df["Gene_name"][i]
        ensembl_id = cos_sims_df["Ensembl_ID"][i]
        cos_shift_data = []

        for dict_i in dict_list:
            if (combos == 0) and (anchor_token is not None):
                cos_shift_data += dict_i.get((anchor_token, token), [])
            else:
                cos_shift_data += dict_i.get((token, "cell_emb"), [])

        # Extract values for current gene
        if combos == 0:
            test_values = cos_shift_data
        elif combos == 1:
            # combo entries are triples; position 2 is the combo shift
            test_values = []
            for tup in cos_shift_data:
                test_values.append(tup[2])

        if len(test_values) > 0:
            avg_value = np.mean(test_values)
            avg_values.append(avg_value)
            gene_names.append(name)

    # fit Gaussian mixture model to dataset of mean for each gene
    avg_values_to_fit = np.array(avg_values).reshape(-1, 1)
    gm = GaussianMixture(n_components=2, random_state=0).fit(avg_values_to_fit)

    # second pass: score every gene against the fitted model
    for i in trange(cos_sims_df.shape[0]):
        token = cos_sims_df["Gene"][i]
        name = cos_sims_df["Gene_name"][i]
        ensembl_id = cos_sims_df["Ensembl_ID"][i]
        cos_shift_data = []

        for dict_i in dict_list:
            if (combos == 0) and (anchor_token is not None):
                cos_shift_data += dict_i.get((anchor_token, token), [])
            else:
                cos_shift_data += dict_i.get((token, "cell_emb"), [])

        if combos == 0:
            mean_test = np.mean(cos_shift_data)
            impact_components = [
                get_impact_component(value, gm) for value in cos_shift_data
            ]
        elif combos == 1:
            # NOTE: `token` is intentionally reused as the tuple-unpacking name
            # inside these comprehensions, shadowing the gene token above
            anchor_cos_sim_megalist = [
                anchor for anchor, token, combo in cos_shift_data
            ]
            token_cos_sim_megalist = [token for anchor, token, combo in cos_shift_data]
            # combined shift if the two individual shifts were purely additive
            anchor_plus_token_cos_sim_megalist = [
                1 - ((1 - anchor) + (1 - token))
                for anchor, token, combo in cos_shift_data
            ]
            combo_anchor_token_cos_sim_megalist = [
                combo for anchor, token, combo in cos_shift_data
            ]
            # non-additive (synergy/antagonism) component of the combo shift
            combo_minus_sum_cos_sim_megalist = [
                combo - (1 - ((1 - anchor) + (1 - token)))
                for anchor, token, combo in cos_shift_data
            ]

            mean_anchor = np.mean(anchor_cos_sim_megalist)
            mean_token = np.mean(token_cos_sim_megalist)
            mean_sum = np.mean(anchor_plus_token_cos_sim_megalist)
            mean_test = np.mean(combo_anchor_token_cos_sim_megalist)
            mean_combo_minus_sum = np.mean(combo_minus_sum_cos_sim_megalist)

            impact_components = [
                get_impact_component(value, gm)
                for value in combo_anchor_token_cos_sim_megalist
            ]

        impact_component = get_impact_component(mean_test, gm)
        # percent of this gene's individual shifts in the impact component
        impact_component_percent = np.mean(impact_components) * 100

        data_i = [token, name, ensembl_id]
        if combos == 0:
            data_i += [mean_test]
        elif combos == 1:
            data_i += [
                mean_anchor,
                mean_token,
                mean_sum,
                mean_test,
                mean_combo_minus_sum,
            ]
        data_i += [impact_component, impact_component_percent]

        cos_sims_df_i = pd.DataFrame(dict(zip(names, data_i)), index=[i])
        cos_sims_full_df = pd.concat([cos_sims_full_df, cos_sims_df_i])

    # quantify number of detections of each gene
    cos_sims_full_df["N_Detections"] = [
        n_detections(i, dict_list, "gene", anchor_token)
        for i in cos_sims_full_df["Gene"]
    ]

    if combos == 0:
        cos_sims_full_df = cos_sims_full_df.sort_values(
            by=["Impact_component", "Test_avg_shift"], ascending=[False, True]
        )
    elif combos == 1:
        cos_sims_full_df = cos_sims_full_df.sort_values(
            by=["Impact_component", "Combo_minus_sum_shift"], ascending=[False, True]
        )
    return cos_sims_full_df
623
+
624
+
625
class InSilicoPerturberStats:
    """Aggregate or statistically analyze raw in silico perturbation results."""

    # Allowed values for each constructor argument, enforced by
    # validate_options(); a set member may be a literal value or a type
    # (str/int/list/dict), in which case isinstance is checked.
    valid_option_dict = {
        "mode": {
            "goal_state_shift",
            "vs_null",
            "mixture_model",
            "aggregate_data",
            "aggregate_gene_shifts",
        },
        "genes_perturbed": {"all", list},
        "combos": {0, 1},
        "anchor_gene": {None, str},
        "cell_states_to_model": {None, dict},
        "pickle_suffix": {None, str},
    }
640
+
641
    def __init__(
        self,
        mode="mixture_model",
        genes_perturbed="all",
        combos=0,
        anchor_gene=None,
        cell_states_to_model=None,
        pickle_suffix="_raw.pickle",
        token_dictionary_file=TOKEN_DICTIONARY_FILE,
        gene_name_id_dictionary_file=GENE_NAME_ID_DICTIONARY_FILE,
    ):
        """
        Initialize in silico perturber stats generator.

        **Parameters:**

        mode : {"goal_state_shift", "vs_null", "mixture_model", "aggregate_data", "aggregate_gene_shifts"}
            | Type of stats.
            | "goal_state_shift": perturbation vs. random for desired cell state shift
            | "vs_null": perturbation vs. null from provided null distribution dataset
            | "mixture_model": perturbation in impact vs. no impact component of mixture model (no goal direction)
            | "aggregate_data": aggregates cosine shifts for single perturbation in multiple cells
            | "aggregate_gene_shifts": aggregates cosine shifts of genes in response to perturbation(s)
        genes_perturbed : "all", list
            | Genes perturbed in isp experiment.
            | Default is assuming genes_to_perturb in isp experiment was "all" (each gene in each cell).
            | Otherwise, may provide a list of ENSEMBL IDs of genes perturbed as a group all together.
        combos : {0, 1}
            | Whether genes were perturbed individually (0) or in pairs (1).
            | (Triplets are not currently supported by this stats class.)
        anchor_gene : None, str
            | ENSEMBL ID of gene to use as anchor in combination perturbations or in testing effect on downstream genes.
            | For example, if combos=1 and anchor_gene="ENSG00000136574":
            | analyzes data for anchor gene perturbed in combination with each other gene.
            | However, if combos=0 and anchor_gene="ENSG00000136574":
            | analyzes data for the effect of anchor gene's perturbation on the embedding of each other gene.
        cell_states_to_model : None, dict
            | Cell states to model if testing perturbations that achieve goal state change.
            | Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
            | state_key: key specifying name of column in .dataset that defines the start/goal states
            | start_state: value in the state_key column that specifies the start state
            | goal_state: value in the state_key column that specifies the goal end state
            | alt_states: list of values in the state_key column that specify the alternate end states
            | For example: {"state_key": "disease",
            |               "start_state": "dcm",
            |               "goal_state": "nf",
            |               "alt_states": ["hcm", "other1", "other2"]}
        pickle_suffix : str
            | Filename suffix identifying raw result pickles to load (e.g. "_raw.pickle").
        token_dictionary_file : Path
            | Path to pickle file containing token dictionary (Ensembl ID:token).
        gene_name_id_dictionary_file : Path
            | Path to pickle file containing gene name to ID dictionary (gene name:Ensembl ID).
        """

        self.mode = mode
        self.genes_perturbed = genes_perturbed
        self.combos = combos
        self.anchor_gene = anchor_gene
        self.cell_states_to_model = cell_states_to_model
        self.pickle_suffix = pickle_suffix

        # validate before loading dictionaries so bad options fail fast
        self.validate_options()

        # load token dictionary (Ensembl IDs:token)
        with open(token_dictionary_file, "rb") as f:
            self.gene_token_dict = pickle.load(f)

        # load gene name dictionary (gene name:Ensembl ID)
        with open(gene_name_id_dictionary_file, "rb") as f:
            self.gene_name_id_dict = pickle.load(f)

        # resolve the anchor gene's token once, if an anchor was given
        if anchor_gene is None:
            self.anchor_token = None
        else:
            self.anchor_token = self.gene_token_dict[self.anchor_gene]
714
+
715
    def validate_options(self):
        """
        Validate constructor options against valid_option_dict and normalize
        cell_states_to_model, including converting the deprecated single-key
        tuple format to the named-key format.

        NOTE(review): the error branches below use a bare `raise` after
        logging, which surfaces as "RuntimeError: No active exception to
        re-raise" rather than a descriptive exception.
        """
        # generic option check: each attr must be a listed literal or an
        # instance of a listed type
        for attr_name, valid_options in self.valid_option_dict.items():
            attr_value = self.__dict__[attr_name]
            if type(attr_value) not in {list, dict}:
                if attr_name in {"anchor_gene"}:
                    continue
                elif attr_value in valid_options:
                    continue
            valid_type = False
            for option in valid_options:
                if (option in [str, int, list, dict]) and isinstance(
                    attr_value, option
                ):
                    valid_type = True
                    break
            if not valid_type:
                logger.error(
                    f"Invalid option for {attr_name}. "
                    f"Valid options for {attr_name}: {valid_options}"
                )
                raise

        if self.cell_states_to_model is not None:
            if len(self.cell_states_to_model.items()) == 1:
                # deprecated single-key format: {state_key: (start, goal, alts)}
                logger.warning(
                    "The single value dictionary for cell_states_to_model will be "
                    "replaced with a dictionary with named keys for start, goal, and alternate states. "
                    "Please specify state_key, start_state, goal_state, and alt_states "
                    "in the cell_states_to_model dictionary for future use. "
                    "For example, cell_states_to_model={"
                    "'state_key': 'disease', "
                    "'start_state': 'dcm', "
                    "'goal_state': 'nf', "
                    "'alt_states': ['hcm', 'other1', 'other2']}"
                )
                # accept only a well-formed (start, goal, alts) triple of lists
                # with unique values before converting
                for key, value in self.cell_states_to_model.items():
                    if (len(value) == 3) and isinstance(value, tuple):
                        if (
                            isinstance(value[0], list)
                            and isinstance(value[1], list)
                            and isinstance(value[2], list)
                        ):
                            if len(value[0]) == 1 and len(value[1]) == 1:
                                all_values = value[0] + value[1] + value[2]
                                if len(all_values) == len(set(all_values)):
                                    continue
                # reformat to the new named key format
                state_values = flatten_list(list(self.cell_states_to_model.values()))
                self.cell_states_to_model = {
                    "state_key": list(self.cell_states_to_model.keys())[0],
                    "start_state": state_values[0][0],
                    "goal_state": state_values[1][0],
                    "alt_states": state_values[2:][0],
                }
            elif set(self.cell_states_to_model.keys()) == {
                "state_key",
                "start_state",
                "goal_state",
                "alt_states",
            }:
                # named-key format: the three primary fields are mandatory
                if (
                    (self.cell_states_to_model["state_key"] is None)
                    or (self.cell_states_to_model["start_state"] is None)
                    or (self.cell_states_to_model["goal_state"] is None)
                ):
                    logger.error(
                        "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model."
                    )
                    raise

                if (
                    self.cell_states_to_model["start_state"]
                    == self.cell_states_to_model["goal_state"]
                ):
                    logger.error("All states must be unique.")
                    raise

                if self.cell_states_to_model["alt_states"] is not None:
                    if not isinstance(self.cell_states_to_model["alt_states"], list):
                        logger.error(
                            "self.cell_states_to_model['alt_states'] must be a list (even if it is one element)."
                        )
                        raise
                    if len(self.cell_states_to_model["alt_states"]) != len(
                        set(self.cell_states_to_model["alt_states"])
                    ):
                        logger.error("All states must be unique.")
                        raise

            else:
                logger.error(
                    "cell_states_to_model must only have the following four keys: "
                    "'state_key', 'start_state', 'goal_state', 'alt_states'."
                    "For example, cell_states_to_model={"
                    "'state_key': 'disease', "
                    "'start_state': 'dcm', "
                    "'goal_state': 'nf', "
                    "'alt_states': ['hcm', 'other1', 'other2']}"
                )
                raise

            # anchor genes are not supported when modeling cell states
            if self.anchor_gene is not None:
                self.anchor_gene = None
                logger.warning(
                    "anchor_gene set to None. "
                    "Currently, anchor gene not available "
                    "when modeling multiple cell states."
                )

        # combination perturbation stats require an anchor gene
        if self.combos > 0:
            if self.anchor_gene is None:
                logger.error(
                    "Currently, stats are only supported for combination "
                    "in silico perturbation run with anchor gene. Please add "
                    "anchor gene when using with combos > 0. "
                )
                raise

        # mode/genes_perturbed compatibility checks
        if (self.mode == "mixture_model") and (self.genes_perturbed != "all"):
            logger.error(
                "Mixture model mode requires multiple gene perturbations to fit model "
                "so is incompatible with a single grouped perturbation."
            )
            raise
        if (self.mode == "aggregate_data") and (self.genes_perturbed == "all"):
            logger.error(
                "Simple data aggregation mode is for single perturbation in multiple cells "
                "so is incompatible with a genes_perturbed being 'all'."
            )
            raise
845
+
846
def get_stats(
    self,
    input_data_directory,
    null_dist_data_directory,
    output_directory,
    output_prefix,
    null_dict_list=None,
):
    """
    Get stats for in silico perturbation data and save as results in output_directory.

    **Parameters:**

    input_data_directory : Path
        | Path to directory containing cos_sim dictionary inputs
    null_dist_data_directory : Path
        | Path to directory containing null distribution cos_sim dictionary inputs
    output_directory : Path
        | Path to directory where perturbation data will be saved as .csv
    output_prefix : str
        | Prefix for output .csv
    null_dict_list: list[dict]
        | List of loaded null distribution dictionary if more than one comparison vs. the null is to be performed

    **Outputs:**

    Definition of possible columns in .csv output file.

    | Of note, not all columns will be present in all output files.
    | Some columns are specific to particular perturbation modes.

    | "Gene": gene token
    | "Gene_name": gene name
    | "Ensembl_ID": gene Ensembl ID
    | "N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
    | "Sig": 1 if FDR<0.05, otherwise 0

    | "Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
    | "Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation
    | "Goal_end_vs_random_pval": pvalue of cosine shift from start state towards goal end state by Wilcoxon
    |     pvalue compares shift caused by perturbing given gene compared to random genes
    | "Alt_end_vs_random_pval": pvalue of cosine shift from start state towards alternate end state by Wilcoxon
    |     pvalue compares shift caused by perturbing given gene compared to random genes
    | "Goal_end_FDR": Benjamini-Hochberg correction of "Goal_end_vs_random_pval"
    | "Alt_end_FDR": Benjamini-Hochberg correction of "Alt_end_vs_random_pval"

    | "Test_avg_shift": cosine shift in response to given perturbation in cells from test distribution
    | "Null_avg_shift": cosine shift in response to given perturbation in cells from null distribution (e.g. random cells)
    | "Test_vs_null_avg_shift": difference in cosine shift in cells from test vs. null distribution
    |     (i.e. "Test_avg_shift" minus "Null_avg_shift")
    | "Test_vs_null_pval": pvalue of cosine shift in test vs. null distribution
    | "Test_vs_null_FDR": Benjamini-Hochberg correction of "Test_vs_null_pval"
    | "N_Detections_test": "N_Detections" in cells from test distribution
    | "N_Detections_null": "N_Detections" in cells from null distribution

    | "Anchor_shift": cosine shift in response to given perturbation of anchor gene
    | "Test_token_shift": cosine shift in response to given perturbation of test gene
    | "Sum_of_indiv_shifts": sum of cosine shifts in response to individually perturbing test and anchor genes
    | "Combo_shift": cosine shift in response to given perturbation of both anchor and test gene(s) in combination
    | "Combo_minus_sum_shift": difference of cosine shifts in response combo perturbation vs. sum of individual perturbations
    |     (i.e. "Combo_shift" minus "Sum_of_indiv_shifts")
    | "Impact_component": whether the given perturbation was modeled to be within the impact component by the mixture model
    |     1: within impact component; 0: not within impact component
    | "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component

    | In case of aggregating gene shifts:
    | "Perturbed": ID(s) of gene(s) being perturbed
    | "Affected": ID of affected gene or "cell_emb" indicating the impact on the cell embedding as a whole
    | "Cosine_shift_mean": mean of cosine shift of modeled perturbation on affected gene or cell
    | "Cosine_shift_stdev": standard deviation of cosine shift of modeled perturbation on affected gene or cell
    """

    if self.mode not in [
        "goal_state_shift",
        "vs_null",
        "mixture_model",
        "aggregate_data",
        "aggregate_gene_shifts",
    ]:
        logger.error(
            "Currently, only modes available are stats for goal_state_shift, "
            "vs_null (comparing to null distribution), "
            "mixture_model (fitting mixture model for perturbations with or without impact), "
            "and aggregating data for single perturbations or for gene embedding shifts."
        )
        # NOTE(review): bare `raise` with no active exception surfaces as
        # "RuntimeError: No active exception to re-raise" — an explicit
        # ValueError would be clearer. Kept as-is here.
        raise

    # build reverse lookups: token -> Ensembl ID and Ensembl ID -> gene name
    self.gene_token_id_dict = invert_dict(self.gene_token_dict)
    self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)

    # obtain total gene list
    if (self.combos == 0) and (self.anchor_token is not None):
        # cos sim data for effect of gene perturbation on the embedding of each other gene
        dict_list = read_dictionaries(
            input_data_directory,
            "gene",
            self.anchor_token,
            self.cell_states_to_model,
            self.pickle_suffix,
        )
        gene_list = get_gene_list(dict_list, "gene")
    elif (
        (self.combos == 0)
        and (self.anchor_token is None)
        and (self.mode == "aggregate_gene_shifts")
    ):
        # gene-shift aggregation: gene-level dictionaries keyed by cell
        dict_list = read_dictionaries(
            input_data_directory,
            "gene",
            self.anchor_token,
            self.cell_states_to_model,
            self.pickle_suffix,
        )
        gene_list = get_gene_list(dict_list, "cell")
    else:
        # cos sim data for effect of gene perturbation on the embedding of each cell
        dict_list = read_dictionaries(
            input_data_directory,
            "cell",
            self.anchor_token,
            self.cell_states_to_model,
            self.pickle_suffix,
        )
        gene_list = get_gene_list(dict_list, "cell")

    # initiate results dataframe
    # Ensembl_ID column: grouped perturbations use the full token tuple;
    # otherwise a tuple's second element (the non-anchor token) or the bare token is mapped
    cos_sims_df_initial = pd.DataFrame(
        {
            "Gene": gene_list,
            "Gene_name": [self.token_to_gene_name(item) for item in gene_list],
            "Ensembl_ID": [
                token_tuple_to_ensembl_ids(genes, self.gene_token_id_dict)
                if self.genes_perturbed != "all"
                else self.gene_token_id_dict[genes[1]]
                if isinstance(genes, tuple)
                else self.gene_token_id_dict[genes]
                for genes in gene_list
            ],
        },
        index=[i for i in range(len(gene_list))],
    )

    # dispatch to the stats routine matching the configured mode
    if self.mode == "goal_state_shift":
        cos_sims_df = isp_stats_to_goal_state(
            cos_sims_df_initial,
            dict_list,
            self.cell_states_to_model,
            self.genes_perturbed,
        )

    elif self.mode == "vs_null":
        if null_dict_list is None:
            # load null distribution from disk only when not supplied by the caller
            null_dict_list = read_dictionaries(
                null_dist_data_directory,
                "cell",
                self.anchor_token,
                self.cell_states_to_model,
                self.pickle_suffix,
            )
        cos_sims_df = isp_stats_vs_null(
            cos_sims_df_initial, dict_list, null_dict_list
        )

    elif self.mode == "mixture_model":
        cos_sims_df = isp_stats_mixture_model(
            cos_sims_df_initial, dict_list, self.combos, self.anchor_token
        )

    elif self.mode == "aggregate_data":
        cos_sims_df = isp_aggregate_grouped_perturb(cos_sims_df_initial, dict_list)

    elif self.mode == "aggregate_gene_shifts":
        cos_sims_df = isp_aggregate_gene_shifts(
            cos_sims_df_initial,
            dict_list,
            self.gene_token_id_dict,
            self.gene_id_name_dict,
        )

    # save perturbation stats to output_path
    output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
    cos_sims_df.to_csv(output_path)
1028
+
1029
def token_to_gene_name(self, item):
    """Map a token id (or tuple of token ids) to gene name(s).

    Unknown tokens map to np.nan at each lookup stage. Items that are
    neither integers nor tuples fall through and yield None.
    """

    def _name_for(token):
        # token -> Ensembl ID -> gene name, with np.nan at every miss
        ensembl_id = self.gene_token_id_dict.get(token, np.nan)
        return self.gene_id_name_dict.get(ensembl_id, np.nan)

    if np.issubdtype(type(item), np.integer):
        return _name_for(item)
    if isinstance(item, tuple):
        return tuple(_name_for(token) for token in item)
geneformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e33a757431643b3697de7ef6127950cdc49e06e58d4266b3a3ab191b683f14
3
+ size 41183536
geneformer/perturber_utils.py ADDED
@@ -0,0 +1,699 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools as it
2
+ import logging
3
+ import pickle
4
+ import re
5
+ from collections import defaultdict
6
+ from typing import List
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import seaborn as sns
11
+ import torch
12
+ from datasets import Dataset, load_from_disk
13
+ from transformers import (
14
+ BertForMaskedLM,
15
+ BertForSequenceClassification,
16
+ BertForTokenClassification,
17
+ )
18
+
19
+ sns.set()
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ # load data and filter by defined criteria
25
def load_and_filter(filter_data, nproc, input_data_file):
    """Load a Hugging Face dataset from disk and optionally filter it.

    Parameters
    ----------
    filter_data : dict | None
        Mapping of column name -> collection of allowed values; None skips filtering.
    nproc : int
        Number of processes for dataset filtering.
    input_data_file : str | Path
        Path to the dataset saved with ``save_to_disk``.
    """
    data = load_from_disk(input_data_file)
    if filter_data is not None:
        data = filter_by_dict(data, filter_data, nproc)
    return data


def filter_by_dict(data, filter_data, nproc):
    """Keep only examples whose value for each key is in the allowed collection.

    Raises
    ------
    ValueError
        If no cells remain after filtering. (Previously a bare ``raise`` with no
        active exception, which surfaced as an opaque RuntimeError.)
    """
    for key, value in filter_data.items():

        def filter_data_by_criteria(example):
            return example[key] in value

        data = data.filter(filter_data_by_criteria, num_proc=nproc)
    if len(data) == 0:
        err_msg = "No cells remain after filtering. Check filtering criteria."
        logger.error(err_msg)
        raise ValueError(err_msg)
    return data
43
+
44
+
45
def filter_data_by_tokens(filtered_input_data, tokens, nproc):
    """Keep only cells whose rank-value encoding contains every token in `tokens`."""

    def if_has_tokens(example):
        present = set(example["input_ids"]).intersection(tokens)
        return len(present) == len(tokens)

    return filtered_input_data.filter(if_has_tokens, num_proc=nproc)


def logging_filtered_data_len(filtered_input_data, filtered_tokens_categ):
    """Log how many cells remain after token filtering; error out if none do."""
    if len(filtered_input_data) == 0:
        logger.error(f"No cells in dataset contain {filtered_tokens_categ}.")
        raise
    else:
        logger.info(f"# cells with {filtered_tokens_categ}: {len(filtered_input_data)}")


def filter_data_by_tokens_and_log(
    filtered_input_data, tokens, nproc, filtered_tokens_categ
):
    """Filter cells to those containing all `tokens`, then log the resulting count."""
    # filter for cells with anchor gene
    retained = filter_data_by_tokens(filtered_input_data, tokens, nproc)
    # logging length of filtered data
    logging_filtered_data_len(retained, filtered_tokens_categ)
    return retained
70
+
71
+
72
def filter_data_by_start_state(filtered_input_data, cell_states_to_model, nproc):
    """Subset the dataset to cells whose state column equals the modeled start state.

    Raises
    ------
    ValueError
        If the start state does not occur in the dataset's state column.
        (Previously a bare ``raise``, which surfaced as an opaque RuntimeError.)
    """
    # confirm that start state is valid to prevent futile filtering
    state_key = cell_states_to_model["state_key"]
    state_values = filtered_input_data[state_key]
    start_state = cell_states_to_model["start_state"]
    if start_state not in state_values:
        err_msg = (
            f"Start state {start_state} is not present "
            f"in the dataset's {state_key} attribute."
        )
        logger.error(err_msg)
        raise ValueError(err_msg)

    # filter for start state cells
    def filter_for_origin(example):
        return example[state_key] in [start_state]

    return filtered_input_data.filter(filter_for_origin, num_proc=nproc)


def slice_by_inds_to_perturb(filtered_input_data, cell_inds_to_perturb):
    """Select the [start, end) slice of cells to perturb.

    Mutates ``cell_inds_to_perturb["end"]`` in place when it is clamped to the
    dataset length (original behavior preserved).

    Raises
    ------
    ValueError
        If ``start`` is beyond the end of the dataset.
    """
    if cell_inds_to_perturb["start"] >= len(filtered_input_data):
        err_msg = "cell_inds_to_perturb['start'] is larger than the filtered dataset."
        logger.error(err_msg)
        raise ValueError(err_msg)
    if cell_inds_to_perturb["end"] > len(filtered_input_data):
        logger.warning(
            "cell_inds_to_perturb['end'] is larger than the filtered dataset. \
            Setting to the end of the filtered dataset."
        )
        cell_inds_to_perturb["end"] = len(filtered_input_data)
    filtered_input_data = filtered_input_data.select(
        [i for i in range(cell_inds_to_perturb["start"], cell_inds_to_perturb["end"])]
    )
    return filtered_input_data
108
+
109
+
110
+ # load model to GPU
111
def load_model(model_type, num_classes, model_directory):
    """Load a Geneformer checkpoint in eval mode and move it to cuda:0.

    Parameters
    ----------
    model_type : str
        One of "Pretrained" (masked LM), "GeneClassifier" (token classification),
        or "CellClassifier" (sequence classification).
    num_classes : int
        Number of labels for the classifier heads (ignored for "Pretrained").
    model_directory : str | Path
        Directory or hub id passed to ``from_pretrained``.

    Raises
    ------
    ValueError
        For an unrecognized ``model_type`` (previously fell through to an
        UnboundLocalError at ``model.eval()``).
    """
    if model_type == "Pretrained":
        model = BertForMaskedLM.from_pretrained(
            model_directory, output_hidden_states=True, output_attentions=False
        )
    elif model_type == "GeneClassifier":
        model = BertForTokenClassification.from_pretrained(
            model_directory,
            num_labels=num_classes,
            output_hidden_states=True,
            output_attentions=False,
        )
    elif model_type == "CellClassifier":
        model = BertForSequenceClassification.from_pretrained(
            model_directory,
            num_labels=num_classes,
            output_hidden_states=True,
            output_attentions=False,
        )
    else:
        raise ValueError(
            f"Unknown model_type: {model_type!r}; expected 'Pretrained', "
            "'GeneClassifier', or 'CellClassifier'."
        )
    # put the model in eval mode for fwd pass
    model.eval()
    model = model.to("cuda:0")
    return model
134
+
135
+
136
def quant_layers(model):
    """Return the number of hidden layers, inferred from parameter names.

    Scans ``named_parameters()`` for names containing "layer" and takes the
    maximum layer index + 1.
    """
    layer_nums = []
    for name, parameter in model.named_parameters():
        if "layer" in name:
            layer_nums += [int(name.split("layer.")[1].split(".")[0])]
    return int(max(layer_nums)) + 1


def get_model_input_size(model):
    """Return the model's maximum input length from the position-embedding repr.

    Parses the first number out of ``str(model.bert.embeddings.position_embeddings)``
    (e.g. "Embedding(2048, 256)" -> 2048).
    """
    # raw string: "\(" in a plain literal is an invalid escape sequence
    # (SyntaxWarning in Python 3.12+, future SyntaxError)
    return int(re.split(r"\(|,", str(model.bert.embeddings.position_embeddings))[1])
146
+
147
+
148
def flatten_list(megalist):
    """Flatten one level of nesting from a list of lists."""
    return [element for inner in megalist for element in inner]


def measure_length(example):
    """Record the rank-value-encoding length on the example under 'length'."""
    example["length"] = len(example["input_ids"])
    return example


def downsample_and_sort(data, max_ncells):
    """Optionally subsample to at most max_ncells cells, then sort longest-first."""
    num_cells = len(data)
    # if max number of cells is defined, then shuffle and subsample to this max number
    if max_ncells is not None and num_cells > max_ncells:
        data = data.shuffle(seed=42)
        num_cells = max_ncells
    data_subset = data.select([i for i in range(num_cells)])
    # sort dataset with largest cell first to encounter any memory errors earlier
    return data_subset.sort("length", reverse=True)


def get_possible_states(cell_states_to_model):
    """Return [start_state, goal_state, *alt_states] for the modeled transition."""
    states = [cell_states_to_model[key] for key in ("start_state", "goal_state")]
    states += cell_states_to_model.get("alt_states", [])
    return states
176
+
177
+
178
def forward_pass_single_cell(model, example_cell, layer_to_quant):
    """Run one cell through the model; return its squeezed hidden states from layer_to_quant."""
    example_cell.set_format(type="torch")
    token_ids = example_cell["input_ids"]
    with torch.no_grad():
        model_output = model(input_ids=token_ids.to("cuda"))
    emb = torch.squeeze(model_output.hidden_states[layer_to_quant])
    del model_output
    return emb


def perturb_emb_by_index(emb, indices):
    """Return emb with the flat positions in `indices` removed (boolean-mask select)."""
    keep_mask = torch.ones(emb.numel(), dtype=torch.bool)
    keep_mask[indices] = False
    return emb[keep_mask]
192
+
193
+
194
def delete_indices(example):
    """Remove the tokens at example['perturb_index'] and refresh 'length'."""
    target_indices = example["perturb_index"]
    if any(isinstance(el, list) for el in target_indices):
        target_indices = flatten_list(target_indices)
    # delete from the back so earlier positions stay valid
    for idx in sorted(target_indices, reverse=True):
        del example["input_ids"][idx]

    example["length"] = len(example["input_ids"])
    return example


# for genes_to_perturb = "all" where only genes within cell are overexpressed
def overexpress_indices(example):
    """Move the tokens at perturb_index to the front of the rank-value encoding."""
    target_indices = example["perturb_index"]
    if any(isinstance(el, list) for el in target_indices):
        target_indices = flatten_list(target_indices)
    for idx in sorted(target_indices, reverse=True):
        example["input_ids"].insert(0, example["input_ids"].pop(idx))

    example["length"] = len(example["input_ids"])
    return example


# for genes_to_perturb = list of genes to overexpress that are not necessarily expressed in cell
def overexpress_tokens(example, max_len):
    """Prepend tokens_to_perturb (removing in-cell copies first) and truncate to max_len."""
    # -100 indicates tokens to overexpress are not present in rank value encoding
    if example["perturb_index"] != [-100]:
        example = delete_indices(example)
    for token in example["tokens_to_perturb"][::-1]:
        example["input_ids"].insert(0, token)

    # truncate to max input size, must also truncate original emb to be comparable
    if len(example["input_ids"]) > max_len:
        example["input_ids"] = example["input_ids"][0:max_len]

    example["length"] = len(example["input_ids"])
    return example
233
+
234
+
235
def calc_n_overflow(max_len, example_len, tokens_to_perturb, indices_to_perturb):
    """Tokens by which the encoding would exceed max_len after overexpression.

    Net growth is (#tokens added) - (#in-cell copies removed).
    """
    net_added = len(tokens_to_perturb) - len(indices_to_perturb)
    return example_len + net_added - max_len


def truncate_by_n_overflow(example):
    """Trim 'n_overflow' tokens from the end of input_ids and refresh 'length'."""
    keep_len = example["length"] - example["n_overflow"]
    example["input_ids"] = example["input_ids"][0:keep_len]
    example["length"] = len(example["input_ids"])
    return example
246
+
247
+
248
def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
    """Return emb with the positions in `indices_to_remove` dropped along `gene_dim`."""
    # indices_to_remove is list of indices to remove
    keep = [i for i in range(emb.size()[gene_dim]) if i not in indices_to_remove]
    slicer = tuple(
        keep if axis == gene_dim else slice(None) for axis in range(emb.dim())
    )
    return emb[slicer]


def remove_indices_from_emb_batch(emb_batch, list_of_indices_to_remove, gene_dim):
    """Drop per-cell perturbed positions, then pad back to a rectangular batch."""
    trimmed = [
        remove_indices_from_emb(emb_batch[i, :, :], idxes, gene_dim - 1)
        for i, idxes in enumerate(list_of_indices_to_remove)
    ]
    # add padding given genes are sometimes added that are or are not in original cell
    batch_max = max(emb.size()[gene_dim - 1] for emb in trimmed)
    padded = [pad_xd_tensor(emb, 0.000, batch_max, gene_dim - 1) for emb in trimmed]
    return torch.stack(padded)
272
+
273
+
274
+ # removes perturbed indices
275
+ # need to handle the various cases where a set of genes is overexpressed
276
def remove_perturbed_indices_set(
    emb,
    perturb_type: str,
    indices_to_perturb: List[List],
    tokens_to_perturb: List[List],
    original_lengths: List[int],
    input_ids=None,
):
    """Remove the perturbed gene positions from a batch of embeddings.

    Handles the overexpression cases where a gene may be absent from the
    original encoding (marked by [-100]): such entries are converted to [None]
    so no position is removed for that cell.

    Parameters
    ----------
    emb : torch.Tensor
        Batch of embeddings; genes along dim 1.
    perturb_type : str
        "overexpress" triggers the [-100] handling; other types pass indices through.
    indices_to_perturb, tokens_to_perturb, original_lengths, input_ids
        Per-cell perturbation bookkeeping (input_ids is unused here).
    """
    if perturb_type == "overexpress":
        num_perturbed = len(tokens_to_perturb)
        if num_perturbed == 1:
            indices_to_perturb_orig = [
                idx if idx != [-100] else [None] for idx in indices_to_perturb
            ]
            # BUGFIX: was `v is [None]`, an identity comparison against a fresh
            # list literal that is always False, so the early return never fired.
            if all(v == [None] for v in indices_to_perturb_orig):
                return emb
        else:
            indices_to_perturb_orig = []

            for idx_list in indices_to_perturb:
                indices_to_perturb_orig.append(
                    [idx if idx != [-100] else [None] for idx in idx_list]
                )

    else:
        indices_to_perturb_orig = indices_to_perturb

    emb = remove_indices_from_emb_batch(emb, indices_to_perturb_orig, gene_dim=1)

    return emb
306
+
307
+
308
def make_perturbation_batch(
    example_cell, perturb_type, tokens_to_perturb, anchor_token, combo_lvl, num_proc
) -> tuple[Dataset, List[int]]:
    """Build a dataset of perturbed copies of one cell, one row per perturbation.

    Returns the perturbation Dataset and the list of perturbed index groups
    (each group is a list of positions in the original rank-value encoding).
    """
    if combo_lvl == 0 and tokens_to_perturb == "all":
        # perturb every gene in the cell individually; overexpression skips
        # position 0 (the gene already at the top rank)
        if perturb_type in ["overexpress", "activate"]:
            range_start = 1
        elif perturb_type in ["delete", "inhibit"]:
            range_start = 0
        indices_to_perturb = [
            [i] for i in range(range_start, example_cell["length"][0])
        ]
    # elif combo_lvl > 0 and anchor_token is None:
    ## to implement
    elif combo_lvl > 0 and (anchor_token is not None):
        # pair the anchor gene with every other gene in the cell
        example_input_ids = example_cell["input_ids"][0]
        anchor_index = example_input_ids.index(anchor_token[0])
        indices_to_perturb = [
            sorted([anchor_index, i]) if i != anchor_index else None
            for i in range(example_cell["length"][0])
        ]
        indices_to_perturb = [item for item in indices_to_perturb if item is not None]
    else:
        # specific token list: keep only tokens actually present in this cell
        example_input_ids = example_cell["input_ids"][0]
        indices_to_perturb = [
            [example_input_ids.index(token)] if token in example_input_ids else None
            for token in tokens_to_perturb
        ]
        indices_to_perturb = [item for item in indices_to_perturb if item is not None]

    # create all permutations of combo_lvl of modifiers from tokens_to_perturb
    if combo_lvl > 0 and (anchor_token is None):
        if tokens_to_perturb != "all":
            if len(tokens_to_perturb) == combo_lvl + 1:
                indices_to_perturb = [
                    list(x) for x in it.combinations(indices_to_perturb, combo_lvl + 1)
                ]
        else:
            # combine the fixed perturbed set with each remaining gene in turn
            all_indices = [[i] for i in range(example_cell["length"][0])]
            all_indices = [
                index for index in all_indices if index not in indices_to_perturb
            ]
            indices_to_perturb = [
                [[j for i in indices_to_perturb for j in i], x] for x in all_indices
            ]

    length = len(indices_to_perturb)
    perturbation_dataset = Dataset.from_dict(
        {
            "input_ids": example_cell["input_ids"] * length,
            "perturb_index": indices_to_perturb,
        }
    )

    # multiprocessing overhead is not worth it for small batches
    if length < 400:
        num_proc_i = 1
    else:
        num_proc_i = num_proc

    if perturb_type == "delete":
        perturbation_dataset = perturbation_dataset.map(
            delete_indices, num_proc=num_proc_i
        )
    elif perturb_type == "overexpress":
        perturbation_dataset = perturbation_dataset.map(
            overexpress_indices, num_proc=num_proc_i
        )

    perturbation_dataset = perturbation_dataset.map(measure_length, num_proc=num_proc_i)

    return perturbation_dataset, indices_to_perturb
378
+
379
+
380
+ # perturbed cell emb removing the activated/overexpressed/inhibited gene emb
381
+ # so that only non-perturbed gene embeddings are compared to each other
382
+ # in original or perturbed context
383
def make_comparison_batch(original_emb_batch, indices_to_perturb, perturb_group):
    """Build original-embedding counterparts with the perturbed positions removed.

    Returns a stacked tensor where each row is the original embedding of a cell
    minus the gene positions that were perturbed, so perturbed and original
    contexts compare only non-perturbed genes.
    """
    all_embs_list = []

    # if making comparison batch for multiple perturbations in single cell
    if perturb_group is False:
        # squeeze if single cell
        if original_emb_batch.ndim == 3 and original_emb_batch.size()[0] == 1:
            original_emb_batch = torch.squeeze(original_emb_batch)
        original_emb_list = [original_emb_batch] * len(indices_to_perturb)
    # if making comparison batch for single perturbation in multiple cells
    elif perturb_group is True:
        original_emb_list = original_emb_batch

    for original_emb, indices in zip(original_emb_list, indices_to_perturb):
        # [-100]: perturbed gene absent from this cell, so nothing to remove
        if indices == [-100]:
            all_embs_list += [original_emb[:]]
            continue

        emb_list = []
        start = 0
        if any(isinstance(el, list) for el in indices):
            indices = flatten_list(indices)

        # removes indices that were perturbed from the original embedding
        for i in sorted(indices):
            emb_list += [original_emb[start:i]]
            start = i + 1

        emb_list += [original_emb[start:]]
        all_embs_list += [torch.cat(emb_list)]

    # cells may have lost different numbers of genes; pad to a common length
    len_set = set([emb.size()[0] for emb in all_embs_list])
    if len(len_set) > 1:
        max_len = max(len_set)
        # NOTE(review): pad value None relies on F.pad treating None as the
        # default fill — confirm against the installed torch version
        all_embs_list = [pad_2d_tensor(emb, None, max_len, 0) for emb in all_embs_list]
    return torch.stack(all_embs_list)
419
+
420
+
421
def pad_list(input_ids, pad_token_id, max_len):
    """Right-pad a token-id sequence with pad_token_id out to max_len (np.ndarray)."""
    pad_width = (0, max_len - len(input_ids))
    return np.pad(
        input_ids, pad_width, mode="constant", constant_values=pad_token_id
    )


def pad_xd_tensor(tensor, pad_token_id, max_len, dim):
    """Right-pad `tensor` along `dim` with pad_token_id so size(dim) == max_len."""
    amount = max_len - tensor.size()[dim]
    # F.pad's config lists (before, after) pairs starting from the LAST
    # dimension; leave everything 0 except the "after" slot of `dim`
    pad_config = [0] * (2 * tensor.dim())
    pad_config[-2 * dim - 1] = amount
    return torch.nn.functional.pad(
        tensor, pad=pad_config, mode="constant", value=pad_token_id
    )
441
+
442
+
443
def pad_tensor(tensor, pad_token_id, max_len):
    """Right-pad a 1D tensor with pad_token_id out to max_len elements."""
    n_missing = max_len - tensor.numel()
    return torch.nn.functional.pad(
        tensor, pad=(0, n_missing), mode="constant", value=pad_token_id
    )


def pad_2d_tensor(tensor, pad_token_id, max_len, dim):
    """Right-pad a 2D tensor along `dim` (0 = rows, 1 = cols) out to max_len."""
    n_missing = max_len - tensor.size()[dim]
    if dim == 0:
        pad = (0, 0, 0, n_missing)
    elif dim == 1:
        pad = (0, n_missing, 0, 0)
    return torch.nn.functional.pad(
        tensor, pad=pad, mode="constant", value=pad_token_id
    )


def pad_3d_tensor(tensor, pad_token_id, max_len, dim):
    """Right-pad a 3D tensor along dim 1 or 2; dim 0 (batch) is unsupported."""
    if dim == 0:
        raise Exception("dim 0 usually does not need to be padded.")
    if dim == 1:
        pad = (0, 0, 0, max_len - tensor.size()[dim])
    elif dim == 2:
        pad = (0, max_len - tensor.size()[dim], 0, 0)
    return torch.nn.functional.pad(
        tensor, pad=pad, mode="constant", value=pad_token_id
    )


def pad_or_truncate_encoding(encoding, pad_token_id, max_len):
    """Force `encoding` (tensor or list) to exactly max_len: slice if long, pad if short."""
    if isinstance(encoding, torch.Tensor):
        encoding_len = encoding.size()[0]
    elif isinstance(encoding, list):
        encoding_len = len(encoding)
    if encoding_len > max_len:
        return encoding[0:max_len]
    if encoding_len < max_len:
        if isinstance(encoding, torch.Tensor):
            return pad_tensor(encoding, pad_token_id, max_len)
        return pad_list(encoding, pad_token_id, max_len)
    return encoding


# pad list of tensors and convert to tensor
def pad_tensor_list(
    tensor_list,
    dynamic_or_constant,
    pad_token_id,
    model_input_size,
    dim=None,
    padding_func=None,
):
    """Pad every tensor in tensor_list to a common length, then stack (or cat for 3D).

    dynamic_or_constant: "dynamic" pads to the longest member; an int pads to
    that length; any other value falls back to model_input_size with a warning.
    """
    # determine maximum tensor length
    if dynamic_or_constant == "dynamic":
        max_len = max(tensor.squeeze().numel() for tensor in tensor_list)
    elif isinstance(dynamic_or_constant, int):
        max_len = dynamic_or_constant
    else:
        max_len = model_input_size
        logger.warning(
            "If padding style is constant, must provide integer value. "
            f"Setting padding to max input size {model_input_size}."
        )

    # pad all tensors to maximum length
    if dim is None:
        padded = [pad_tensor(tensor, pad_token_id, max_len) for tensor in tensor_list]
    else:
        padded = [
            padding_func(tensor, pad_token_id, max_len, dim) for tensor in tensor_list
        ]
    # 3D pieces are batch fragments: concatenate along the batch dim instead of stacking
    if padding_func != pad_3d_tensor:
        return torch.stack(padded)
    return torch.cat(padded, 0)
525
+
526
+
527
def gen_attention_mask(minibatch_encoding, max_len=None):
    """Build a 1/0 attention-mask tensor (on cuda) from per-example lengths."""
    original_lens = minibatch_encoding["length"]
    if max_len is None:
        max_len = max(original_lens)
    mask_rows = []
    for seq_len in original_lens:
        if seq_len <= max_len:
            mask_rows.append([1] * seq_len + [0] * (max_len - seq_len))
        else:
            mask_rows.append([1] * max_len)
    return torch.tensor(mask_rows, device="cuda")


# get cell embeddings excluding padding
def mean_nonpadding_embs(embs, original_lens, dim=1):
    """Mean of embeddings over non-padding positions along `dim` for 2D or 3D input."""
    # mask tensor marking valid (non-padding) positions per example
    valid = torch.arange(embs.size(dim), device=embs.device) < original_lens.unsqueeze(1)
    if embs.dim() == 3:
        # zero out padding, then divide the sum by the true length
        zeroed = embs.masked_fill(~valid.unsqueeze(2), 0.0)
        mean_embs = zeroed.sum(dim) / original_lens.view(-1, 1).float()

    elif embs.dim() == 2:
        zeroed = embs.masked_fill(~valid, 0.0)
        mean_embs = zeroed.sum(dim) / original_lens.float()
    return mean_embs


# get cell embeddings when there is no padding
def compute_nonpadded_cell_embedding(embs, cell_emb_style):
    """Mean-pool over the gene dimension (second-to-last axis)."""
    if cell_emb_style == "mean_pool":
        return torch.mean(embs, dim=embs.ndim - 2)
561
+
562
+
563
+ # quantify shifts for a set of genes
564
# quantify shifts for a set of genes
def quant_cos_sims(
    perturbation_emb,
    original_emb,
    cell_states_to_model,
    state_embs_dict,
    emb_mode="gene",
):
    """Cosine similarity of perturbed vs. original embeddings, or per-state shifts.

    With cell_states_to_model set, returns {state: shift tensor}; otherwise the
    raw cosine similarities (moved to cuda).
    """
    if emb_mode == "gene":
        cos = torch.nn.CosineSimilarity(dim=2)
    elif emb_mode == "cell":
        cos = torch.nn.CosineSimilarity(dim=1)

    if cell_states_to_model is None:
        return cos(perturbation_emb, original_emb).to("cuda")

    possible_states = get_possible_states(cell_states_to_model)
    cos_sims = {state: [] for state in possible_states}
    for state in possible_states:
        cos_sims[state] = cos_sim_shift(
            original_emb,
            perturbation_emb,
            state_embs_dict[state].to("cuda"),  # required to move to cuda here
            cos,
        )
    return cos_sims


# calculate cos sim shift of perturbation with respect to origin and alternative cell
def cos_sim_shift(original_emb, perturbed_emb, end_emb, cos):
    """Shift toward end_emb: cos(perturbed, end) - cos(original, end)."""
    return cos(perturbed_emb, end_emb) - cos(original_emb, end_emb)


def concatenate_cos_sims(cos_sims):
    """Concatenate a list of cos-sim tensors, or each value of a per-state dict."""
    if isinstance(cos_sims, list):
        return torch.cat(cos_sims)
    for state in cos_sims.keys():
        cos_sims[state] = torch.cat(cos_sims[state])
    return cos_sims
607
+
608
+
609
def write_perturbation_dictionary(cos_sims_dict: defaultdict, output_path_prefix: str):
    """Pickle the cos-sim dictionary to '<output_path_prefix>_raw.pickle'."""
    with open(f"{output_path_prefix}_raw.pickle", "wb") as handle:
        pickle.dump(cos_sims_dict, handle)


def tensor_list_to_pd(tensor_list):
    """Concatenate a list of tensors and return the result as a pandas DataFrame."""
    stacked = torch.cat(tensor_list).cpu().numpy()
    return pd.DataFrame(stacked)
618
+
619
+
620
def validate_cell_states_to_model(cell_states_to_model):
    """Validate (and, for the legacy single-key format, reformat) cell_states_to_model.

    Expected modern format:
    {"state_key": ..., "start_state": ..., "goal_state": ..., "alt_states": [...]}

    NOTE(review): the legacy-format branch rebinds the LOCAL name
    `cell_states_to_model` to the reformatted dict but does not return it, so
    the caller never sees the new format — confirm against call sites (the
    in-class variant assigns to self.cell_states_to_model instead).
    """
    if cell_states_to_model is not None:
        # legacy format: a single {state_key: (start, goal, alts)} entry
        if len(cell_states_to_model.items()) == 1:
            logger.warning(
                "The single value dictionary for cell_states_to_model will be "
                "replaced with a dictionary with named keys for start, goal, and alternate states. "
                "Please specify state_key, start_state, goal_state, and alt_states "
                "in the cell_states_to_model dictionary for future use. "
                "For example, cell_states_to_model={"
                "'state_key': 'disease', "
                "'start_state': 'dcm', "
                "'goal_state': 'nf', "
                "'alt_states': ['hcm', 'other1', 'other2']}"
            )
            for key, value in cell_states_to_model.items():
                # accept a 3-tuple of lists: ([start], [goal], [alt, ...])
                if (len(value) == 3) and isinstance(value, tuple):
                    if (
                        isinstance(value[0], list)
                        and isinstance(value[1], list)
                        and isinstance(value[2], list)
                    ):
                        if len(value[0]) == 1 and len(value[1]) == 1:
                            all_values = value[0] + value[1] + value[2]
                            # all states must be distinct for the format to be valid
                            if len(all_values) == len(set(all_values)):
                                continue
            # reformat to the new named key format
            state_values = flatten_list(list(cell_states_to_model.values()))

            cell_states_to_model = {
                "state_key": list(cell_states_to_model.keys())[0],
                "start_state": state_values[0][0],
                "goal_state": state_values[1][0],
                "alt_states": state_values[2:][0],
            }
        elif set(cell_states_to_model.keys()).issuperset(
            {"state_key", "start_state", "goal_state"}
        ):
            # modern named-key format: required keys must be non-None
            if (
                (cell_states_to_model["state_key"] is None)
                or (cell_states_to_model["start_state"] is None)
                or (cell_states_to_model["goal_state"] is None)
            ):
                logger.error(
                    "Please specify 'state_key', 'start_state', and 'goal_state' in cell_states_to_model."
                )
                # NOTE(review): bare `raise` with no active exception surfaces
                # as an opaque RuntimeError (same pattern below)
                raise

            if (
                cell_states_to_model["start_state"]
                == cell_states_to_model["goal_state"]
            ):
                logger.error("All states must be unique.")
                raise

            if "alt_states" in set(cell_states_to_model.keys()):
                if cell_states_to_model["alt_states"] is not None:
                    if not isinstance(cell_states_to_model["alt_states"], list):
                        logger.error(
                            "cell_states_to_model['alt_states'] must be a list (even if it is one element)."
                        )
                        raise
                    if len(cell_states_to_model["alt_states"]) != len(
                        set(cell_states_to_model["alt_states"])
                    ):
                        logger.error("All states must be unique.")
                        raise
                else:
                    # normalize a None alt_states to an empty list (in-place:
                    # this mutation IS visible to the caller)
                    cell_states_to_model["alt_states"] = []

        else:
            logger.error(
                "cell_states_to_model must only have the following four keys: "
                "'state_key', 'start_state', 'goal_state', 'alt_states'."
                "For example, cell_states_to_model={"
                "'state_key': 'disease', "
                "'start_state': 'dcm', "
                "'goal_state': 'nf', "
                "'alt_states': ['hcm', 'other1', 'other2']}"
            )
            raise
geneformer/pretrainer.py ADDED
@@ -0,0 +1,978 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Geneformer precollator and pretrainer.
3
+
4
+ Huggingface data collator and trainer modified to accommodate single-cell transcriptomics data.
5
+ """
6
+ import collections
7
+ import math
8
+ import pickle
9
+ import warnings
10
+ from enum import Enum
11
+ from typing import Dict, Iterator, List, Optional, Union
12
+
13
+ import numpy as np
14
+ import torch
15
+ from datasets import Dataset
16
+ from packaging import version
17
+ from torch.utils.data.distributed import DistributedSampler
18
+ from torch.utils.data.sampler import RandomSampler
19
+ from transformers import (
20
+ BatchEncoding,
21
+ DataCollatorForLanguageModeling,
22
+ SpecialTokensMixin,
23
+ Trainer,
24
+ )
25
+ from transformers.file_utils import is_datasets_available, is_sagemaker_dp_enabled
26
+ from transformers.trainer_pt_utils import (
27
+ DistributedLengthGroupedSampler,
28
+ DistributedSamplerWithLoop,
29
+ LengthGroupedSampler,
30
+ )
31
+ from transformers.training_args import ParallelMode
32
+ from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
33
+ from transformers.utils.generic import _is_tensorflow, _is_torch
34
+
35
+ from .tokenizer import TOKEN_DICTIONARY_FILE
36
+
37
+ logger = logging.get_logger(__name__)
38
+ EncodedInput = List[int]
39
+ VERY_LARGE_INTEGER = int(
40
+ 1e30
41
+ ) # This is used to set the max input length for a model with infinite size input
42
+ LARGE_INTEGER = int(
43
+ 1e20
44
+ ) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER
45
+
46
+ if is_sagemaker_dp_enabled():
47
+ import smdistributed.dataparallel.torch.distributed as dist
48
+ else:
49
+ import torch.distributed as dist
50
+
51
+ _is_torch_generator_available = False
52
+ if version.parse(torch.__version__) >= version.parse("1.6"):
53
+ _is_torch_generator_available = True
54
+
55
+ with open(TOKEN_DICTIONARY_FILE, "rb") as f:
56
+ token_dictionary = pickle.load(f)
57
+
58
+
59
class ExplicitEnum(Enum):
    """
    Enum whose failed value lookup reports the valid choices, instead of
    the terse default "is not a valid" message.
    """

    @classmethod
    def _missing_(cls, value):
        # Invoked by Enum's lookup machinery when `value` matches no member;
        # raising here replaces the default error with a listing of options.
        choices = list(cls._value2member_map_.keys())
        raise ValueError(
            f"{value!r} is not a valid {cls.__name__}, please select one of {choices}"
        )
69
+ )
70
+
71
+
72
class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.

    Consumed by ``GeneformerPreCollator._get_padding_truncation_strategies``.
    """

    ONLY_FIRST = "only_first"  # truncate only the first sequence of a pair
    ONLY_SECOND = "only_second"  # truncate only the second sequence of a pair
    LONGEST_FIRST = "longest_first"  # iteratively trim whichever sequence is longer
    DO_NOT_TRUNCATE = "do_not_truncate"  # leave inputs untouched
82
+
83
+
84
class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
    in an IDE.

    Consumed by ``GeneformerPreCollator.pad`` / ``_pad``.
    """

    LONGEST = "longest"  # pad each batch to its longest sequence
    MAX_LENGTH = "max_length"  # pad to an explicit max_length
    DO_NOT_PAD = "do_not_pad"  # no padding; sequences may differ in length
93
+
94
+
95
class TensorType(ExplicitEnum):
    """
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    PYTORCH = "pt"  # torch.Tensor
    TENSORFLOW = "tf"  # tf.constant
    NUMPY = "np"  # np.ndarray
    JAX = "jax"  # jax arrays
105
+
106
+
107
class GeneformerPreCollator(SpecialTokensMixin):
    # Minimal tokenizer-like object for single-cell rank-value encodings.
    # It implements just enough of the HF tokenizer interface (``pad``,
    # special-token ids, ``__len__``) for ``DataCollatorForLanguageModeling``
    # to mask and pad batches; most method bodies are copied from
    # ``transformers.tokenization_utils_base``.
    def __init__(self, *args, **kwargs) -> None:

        # Register "<mask>"/"<pad>" with SpecialTokensMixin so that
        # mask_token_id / pad_token_id resolve via convert_tokens_to_ids below.
        super().__init__(mask_token = "<mask>", pad_token = "<pad>")

        # token_dictionary: mapping used for token<->id lookups.
        # NOTE(review): required in practice — if the kwarg is omitted this is
        # None and later lookups fail; consider making it a positional arg.
        self.token_dictionary = kwargs.get("token_dictionary")
        # self.mask_token = "<mask>"
        # self.mask_token_id = self.token_dictionary.get("<mask>")
        # self.pad_token = "<pad>"
        # self.pad_token_id = self.token_dictionary.get("<pad>")
        self.padding_side = "right"
        # self.all_special_ids = [
        #     self.token_dictionary.get("<mask>"),
        #     self.token_dictionary.get("<pad>"),
        # ]
        self.model_input_names = ["input_ids"]

    def convert_ids_to_tokens(self,value):
        # NOTE(review): looks up `value` directly in token_dictionary; this
        # only inverts ids to tokens if the dictionary maps in that direction —
        # TODO confirm against how token_dictionary is built.
        return self.token_dictionary.get(value)

    def _get_padding_truncation_strategies(
        self,
        padding=False,
        truncation=False,
        max_length=None,
        pad_to_multiple_of=None,
        verbose=True,
        **kwargs,
    ):
        """
        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
        and pad_to_max_length) and behaviors.
        """
        # NOTE(review): relies on self.deprecation_warnings and
        # self.model_max_length, neither of which is set in __init__ —
        # presumably provided by SpecialTokensMixin; verify.
        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

        # Backward compatibility for previous behavior, maybe we should deprecate it:
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is False:
            if verbose:
                if not self.deprecation_warnings.get(
                    "Truncation-not-explicitly-activated", False
                ):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, "
                        "please use `truncation=True` to explicitly truncate examples to max length. "
                        "Defaulting to 'longest_first' truncation strategy. "
                        "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                        "more precisely by providing a specific strategy to `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy
        if padding is False and old_pad_to_max_length:
            if verbose:
                warnings.warn(
                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                    "maximal input size of the model (e.g. 512 for Bert).",
                    FutureWarning,
                )
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                padding_strategy = (
                    PaddingStrategy.LONGEST
                )  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is False and old_truncation_strategy != "do_not_truncate":
            if verbose:
                warnings.warn(
                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
                    "use `truncation=True` to truncate examples to a max length. You can give a specific "
                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
                    "maximal input size of the model (e.g. 512 for Bert). "
                    " If you have pairs of inputs, you can give a specific truncation strategy selected among "
                    "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
                    "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
                    "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
                    FutureWarning,
                )
            truncation_strategy = TruncationStrategy(old_truncation_strategy)
        elif truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get(
                            "Asking-to-pad-to-max_length", False
                        ):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get(
                            "Asking-to-truncate-to-max_length", False
                        ):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
                                "Default to no truncation."
                            )
                        self.deprecation_warnings[
                            "Asking-to-truncate-to-max_length"
                        ] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (
            not self.pad_token or self.pad_token_id < 0
        ):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                f"Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
        ``self.pad_token_id`` and ``self.pad_token_type_id``)

        .. note::

            If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
            result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
            case of PyTorch tensors, you will lose the specific device of your tensors however.

        Args:
            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
                well as in a PyTorch Dataloader collate function.

                Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                see the note above for the return type.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
        """
        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(
            encoded_inputs[0], (dict, BatchEncoding)
        ):
            encoded_inputs = {
                key: [example[key] for example in encoded_inputs]
                for key in encoded_inputs[0].keys()
            }

        # The model's main input name, usually `input_ids`, has be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method"
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if not required_input:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            # NOTE(review): if every element is empty, this while loop walks off
            # the end and raises IndexError (same as upstream HF code).
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_tf_available() and _is_tensorflow(first_element):
                return_tensors = "tf" if return_tensors is None else return_tensors
            elif is_torch_available() and _is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    f"Should be one of a python, numpy, pytorch or tensorflow object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)


        # Convert padding_strategy in PaddingStrategy
        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, verbose=verbose
        )

        required_input = encoded_inputs[self.model_input_names[0]]
        if required_input and not isinstance(required_input[0], (list, tuple)):
            # Single (un-batched) example: pad it directly.
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding_strategy == PaddingStrategy.LONGEST:
            # Resolve LONGEST into a concrete MAX_LENGTH for this batch.
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round max_length up to the next multiple, if requested.
        if (
            max_length is not None
            and pad_to_multiple_of is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = (
            padding_strategy != PaddingStrategy.DO_NOT_PAD
            and len(required_input) != max_length
        )

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [
                        0
                    ] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"]
                        + [self.pad_token_type_id] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = (
                        encoded_inputs["special_tokens_mask"] + [1] * difference
                    )
                encoded_inputs[self.model_input_names[0]] = (
                    required_input + [self.pad_token_id] * difference
                )
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(
                        required_input
                    )
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [
                        self.pad_token_type_id
                    ] * difference + encoded_inputs["token_type_ids"]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [
                        1
                    ] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [
                    self.pad_token_id
                ] * difference + required_input
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
        elif return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        return encoded_inputs

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # NOTE(review): assert-based validation is stripped under `python -O`;
        # an explicit raise would be safer.
        assert already_has_special_tokens and token_ids_1 is None, (
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument."
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

        all_special_ids = self.all_special_ids  # cache the property

        special_tokens_mask = [
            1 if token in all_special_ids else 0 for token in token_ids_0
        ]

        return special_tokens_mask

    def convert_tokens_to_ids(
        self, tokens: Union[str, List[str]]
    ) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.
        Args:
            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
        Returns:
            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        # Single-token lookup; returns None for unknown tokens (dict.get).
        if token is None:
            return None

        return self.token_dictionary.get(token)

    def __len__(self):
        # Vocabulary size as seen by the data collator / model config.
        return len(self.token_dictionary)
594
+ return len(self.token_dictionary)
595
+
596
+
597
class GeneformerPretrainer(Trainer):
    """
    Hugging Face ``Trainer`` subclass for Geneformer masked-language-model
    pretraining on single-cell transcriptomics data.

    Differences from the stock ``Trainer``:

    * If no ``data_collator`` kwarg is supplied, one is built from a
      ``GeneformerPreCollator`` (constructed with the required
      ``token_dictionary`` kwarg) wrapped in
      ``DataCollatorForLanguageModeling`` with 15% masking.
    * Per-example lengths are loaded from a pickled list given by the
      required ``example_lengths_file`` kwarg, so the length-grouped sampler
      does not have to scan the (very large) dataset.

    Extra kwargs (consumed here, not passed to ``Trainer``):
        token_dictionary: mapping used by the pre-collator for token ids.
        example_lengths_file: path to a pickled list of example lengths,
            pre-obtained with ``[dataset[i]["length"] for i in range(len(dataset))]``.

    Raises:
        Exception: if ``example_lengths_file`` is missing or falsy.
    """

    def __init__(self, *args, **kwargs):
        data_collator = kwargs.get("data_collator", None)
        # Consumed here; the base Trainer does not accept this kwarg.
        token_dictionary = kwargs.pop("token_dictionary")

        if data_collator is None:
            precollator = GeneformerPreCollator(token_dictionary=token_dictionary)

            # Standard MLM collator: mask 15% of input tokens.
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=precollator, mlm=True, mlm_probability=0.15
            )
            kwargs["data_collator"] = data_collator

        # Load previously saved length vector for dataset to speed up
        # LengthGroupedSampler.
        # Pop with a default of None so that a missing kwarg falls through to
        # the informative Exception below instead of raising a bare KeyError.
        example_lengths_file = kwargs.pop("example_lengths_file", None)
        if example_lengths_file:
            with open(example_lengths_file, "rb") as f:
                self.example_lengths = pickle.load(f)
        else:
            raise Exception(
                "example_lengths_file is required; e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/genecorpus_30M_2048_sorted_lengths.pkl"
            )
        super().__init__(*args, **kwargs)

    # modify LengthGroupedSampler to avoid dataset[length_column_name] hanging
    def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
        """Return the training sampler, substituting the preloaded
        ``self.example_lengths`` for the dataset's length column when
        length-grouped batching is enabled."""
        if not isinstance(self.train_dataset, collections.abc.Sized):
            return None

        generator = None
        if self.args.world_size <= 1 and _is_torch_generator_available:
            generator = torch.Generator()
            # Seed from torch's global RNG so shuffling follows torch seeding.
            generator.manual_seed(
                int(torch.empty((), dtype=torch.int64).random_().item())
            )

        # Build the sampler.
        if self.args.group_by_length:
            if is_datasets_available() and isinstance(self.train_dataset, Dataset):
                # Key modification: use the preloaded length vector rather than
                # querying dataset["length"], which can hang on huge datasets.
                lengths = self.example_lengths
            else:
                lengths = None
            model_input_name = (
                self.tokenizer.model_input_names[0]
                if self.tokenizer is not None
                else None
            )
            if self.args.world_size <= 1:
                return LengthGroupedSampler(
                    dataset=self.train_dataset,
                    batch_size=self.args.train_batch_size,
                    lengths=lengths,
                    model_input_name=model_input_name,
                    generator=generator,
                )
            else:
                return CustomDistributedLengthGroupedSampler(
                    dataset=self.train_dataset,
                    batch_size=self.args.train_batch_size,
                    num_replicas=self.args.world_size,
                    rank=self.args.process_index,
                    lengths=lengths,
                    model_input_name=model_input_name,
                    seed=self.args.seed,
                )

        else:
            if self.args.world_size <= 1:
                if _is_torch_generator_available:
                    return RandomSampler(self.train_dataset, generator=generator)
                return RandomSampler(self.train_dataset)
            elif (
                self.args.parallel_mode
                in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
                and not self.args.dataloader_drop_last
            ):
                # Use a loop for TPUs when drop_last is False to have all batches have the same size.
                return DistributedSamplerWithLoop(
                    self.train_dataset,
                    batch_size=self.args.per_device_train_batch_size,
                    num_replicas=self.args.world_size,
                    rank=self.args.process_index,
                    seed=self.args.seed,
                )
            else:
                return DistributedSampler(
                    self.train_dataset,
                    num_replicas=self.args.world_size,
                    rank=self.args.process_index,
                    seed=self.args.seed,
                )
693
+
694
+
695
class CustomDistributedLengthGroupedSampler(DistributedLengthGroupedSampler):
    r"""
    Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
    length while keeping a bit of randomness.

    Custom behavior: accepts a precomputed ``lengths`` list so it never has to
    iterate the dataset to measure examples.
    """
    # Copied and adapted from PyTorch DistributedSampler.
    def __init__(
        self,
        dataset: Dataset,
        batch_size: int,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        seed: int = 0,
        drop_last: bool = False,
        lengths: Optional[List[int]] = None,
        model_input_name: Optional[str] = None,
    ):
        # NOTE(review): deliberately does NOT call super().__init__(); all
        # state the parent would set is re-initialized here.
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0  # updated externally via set_epoch for per-epoch shuffling
        self.drop_last = drop_last
        # If the dataset length is evenly divisible by # of replicas, then there
        # is no need to drop any data, since the dataset will be split equally.
        if self.drop_last and len(self.dataset) % self.num_replicas != 0:
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each rank receives the same amount of data when
            # using this Sampler.
            self.num_samples = math.ceil(
                (len(self.dataset) - self.num_replicas) / self.num_replicas
            )
        else:
            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
        self.total_size = self.num_samples * self.num_replicas
        self.seed = seed
        self.model_input_name = (
            model_input_name if model_input_name is not None else "input_ids"
        )

        if lengths is None:
            # Fallback: infer lengths by scanning the dataset (slow for large data).
            print("Lengths is none - calculating lengths.")
            if (
                not (
                    isinstance(dataset[0], dict)
                    or isinstance(dataset[0], BatchEncoding)
                )
                or self.model_input_name not in dataset[0]
            ):
                raise ValueError(
                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                    f"'{self.model_input_name}' key."
                )
            lengths = [len(feature[self.model_input_name]) for feature in dataset]
        self.lengths = lengths

    def __iter__(self) -> Iterator:
        # Deterministically shuffle based on epoch and seed
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)

        # get_length_grouped_indices is defined at module level below.
        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)

        if not self.drop_last:
            # add extra samples to make it evenly divisible
            indices += indices[: (self.total_size - len(indices))]
        else:
            # remove tail of data to make it evenly divisible.
            indices = indices[: self.total_size]
        assert len(indices) == self.total_size

        # subsample
        # Each rank takes an interleaved slice so all ranks get num_samples items.
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)
779
+
780
+
781
def get_length_grouped_indices(
    lengths, batch_size, mega_batch_mult=None, generator=None
):
    """
    Return a list of indices so that each slice of :obj:`batch_size` consecutive indices correspond to elements of
    similar lengths. To do this, the indices are:

    - randomly permuted
    - grouped in mega-batches of size :obj:`mega_batch_mult * batch_size`
    - sorted by length in each mega-batch

    The result is the concatenation of all mega-batches, with the batch of :obj:`batch_size` containing the element of
    maximum length placed first, so that an OOM happens sooner rather than later.
    """
    if mega_batch_mult is None:
        # Default: up to 1000 batches per mega-batch, fewer for small datasets.
        mega_batch_mult = min(len(lengths) // (batch_size * 4), 1000)
        if mega_batch_mult == 0:
            # Just in case, for tiny datasets
            mega_batch_mult = 1

    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    permutation = torch.randperm(len(lengths), generator=generator)
    chunk = mega_batch_mult * batch_size
    mega_batches = []
    for start in range(0, len(lengths), chunk):
        block = permutation[start : start + chunk].tolist()
        # Sort each mega-batch by descending length (stable sort).
        block.sort(key=lambda idx: lengths[idx], reverse=True)
        mega_batches.append(block)

    # Move the globally longest element into the very first batch, so that an
    # OOM surfaces as early in training as possible. Each mega-batch is sorted
    # by descending length, so its longest element sits at position 0.
    leading_lengths = [lengths[block[0]] for block in mega_batches]
    longest_at = torch.argmax(torch.tensor(leading_lengths)).item()
    mega_batches[0][0], mega_batches[longest_at][0] = (
        mega_batches[longest_at][0],
        mega_batches[0][0],
    )

    return [idx for block in mega_batches for idx in block]
826
+
827
+
828
+ # from typing import Any, Tuple, Optional
829
+
830
+ # class CustomDataCollatorForMLM(DataCollatorForLanguageModeling):
831
+ # # def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
832
+
833
+ # # import torch
834
+
835
+ # # labels = inputs.clone()
836
+ # # # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
837
+ # # probability_matrix = torch.full(labels.shape, self.mlm_probability)
838
+ # # if special_tokens_mask is None:
839
+ # # special_tokens_mask = [
840
+ # # self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
841
+ # # ]
842
+ # # special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
843
+ # # else:
844
+ # # special_tokens_mask = special_tokens_mask.bool()
845
+ # # probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
846
+ # # masked_indices = torch.bernoulli(probability_matrix).bool()
847
+ # # labels[~masked_indices] = -100 # We only compute loss on masked tokens
848
+
849
+ # # # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
850
+ # # indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
851
+ # # inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
852
+
853
+ # # # 15% of the time, we replace masked input tokens with random word
854
+ # # indices_random = torch.bernoulli(torch.full(labels.shape, 0.75)).bool() & masked_indices & ~indices_replaced
855
+ # # random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
856
+ # # inputs[indices_random] = random_words[indices_random]
857
+
858
+ # # # The rest of the time (5% of the time) we keep the masked input tokens unchanged
859
+ # # return inputs, labels
860
+
861
+ # def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
862
+ # import torch
863
+
864
+ # labels = inputs.clone()
865
+ # probability_matrix = torch.full(labels.shape, self.mlm_probability)
866
+ # if special_tokens_mask is None:
867
+ # special_tokens_mask = [
868
+ # self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
869
+ # ]
870
+ # special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
871
+ # else:
872
+ # special_tokens_mask = special_tokens_mask.bool()
873
+
874
+ # probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
875
+ # masked_indices = torch.bernoulli(probability_matrix).bool()
876
+ # labels[~masked_indices] = -100 # We only compute loss on masked tokens
877
+
878
+ # # 100% of the time, replace masked input tokens with tokenizer.mask_token ([MASK])
879
+ # inputs[masked_indices] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
880
+
881
+ # return inputs, labels
882
+
883
+ # class CustomGeneformerPretrainer(Trainer):
884
+ # def __init__(self, *args, **kwargs):
885
+ # data_collator = kwargs.get("data_collator",None)
886
+ # token_dictionary = kwargs.pop("token_dictionary")
887
+
888
+ # if data_collator is None:
889
+ # precollator = GeneformerPreCollator(token_dictionary=token_dictionary)
890
+
891
+ # # # Data Collator Functions
892
+ # data_collator = CustomDataCollatorForMLM(
893
+ # tokenizer=precollator, mlm=True, mlm_probability=0.15
894
+ # )
895
+ # kwargs["data_collator"] = data_collator
896
+
897
+ # # load previously saved length vector for dataset to speed up LengthGroupedSampler
898
+ # # pre-obtained with [dataset[i]["length"] for i in range(len(dataset))]
899
+ # example_lengths_file = kwargs.pop("example_lengths_file")
900
+ # if example_lengths_file:
901
+ # with open(example_lengths_file, "rb") as f:
902
+ # self.example_lengths = pickle.load(f)
903
+ # else:
904
+ # raise Exception(
905
+ # "example_lengths_file is required; e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/genecorpus_30M_2048_sorted_lengths.pkl"
906
+ # )
907
+ # super().__init__(*args, **kwargs)
908
+ # # self.exp_logits_dir = exp_logits_dir
909
+ # # self.min_exp_logits = float('inf')
910
+ # # self.max_exp_logits = float('-inf')
911
+
912
+ # # modify LengthGroupedSampler to avoid dataset[length_column_name] hanging
913
+ # def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
914
+ # if not isinstance(self.train_dataset, collections.abc.Sized):
915
+ # return None
916
+
917
+ # generator = None
918
+ # if self.args.world_size <= 1 and _is_torch_generator_available:
919
+ # generator = torch.Generator()
920
+ # generator.manual_seed(
921
+ # int(torch.empty((), dtype=torch.int64).random_().item())
922
+ # )
923
+
924
+ # # Build the sampler.
925
+ # if self.args.group_by_length:
926
+ # if is_datasets_available() and isinstance(self.train_dataset, Dataset):
927
+ # lengths = self.example_lengths
928
+ # else:
929
+ # lengths = None
930
+ # model_input_name = (
931
+ # self.tokenizer.model_input_names[0]
932
+ # if self.tokenizer is not None
933
+ # else None
934
+ # )
935
+ # if self.args.world_size <= 1:
936
+ # return LengthGroupedSampler(
937
+ # dataset=self.train_dataset,
938
+ # batch_size=self.args.train_batch_size,
939
+ # lengths=lengths,
940
+ # model_input_name=model_input_name,
941
+ # generator=generator,
942
+ # )
943
+ # else:
944
+ # return CustomDistributedLengthGroupedSampler(
945
+ # dataset=self.train_dataset,
946
+ # batch_size=self.args.train_batch_size,
947
+ # num_replicas=self.args.world_size,
948
+ # rank=self.args.process_index,
949
+ # lengths=lengths,
950
+ # model_input_name=model_input_name,
951
+ # seed=self.args.seed,
952
+ # )
953
+
954
+ # else:
955
+ # if self.args.world_size <= 1:
956
+ # if _is_torch_generator_available:
957
+ # return RandomSampler(self.train_dataset, generator=generator)
958
+ # return RandomSampler(self.train_dataset)
959
+ # elif (
960
+ # self.args.parallel_mode
961
+ # in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
962
+ # and not self.args.dataloader_drop_last
963
+ # ):
964
+ # # Use a loop for TPUs when drop_last is False to have all batches have the same size.
965
+ # return DistributedSamplerWithLoop(
966
+ # self.train_dataset,
967
+ # batch_size=self.args.per_device_train_batch_size,
968
+ # num_replicas=self.args.world_size,
969
+ # rank=self.args.process_index,
970
+ # seed=self.args.seed,
971
+ # )
972
+ # else:
973
+ # return DistributedSampler(
974
+ # self.train_dataset,
975
+ # num_replicas=self.args.world_size,
976
+ # rank=self.args.process_index,
977
+ # seed=self.args.seed,
978
+ # )
geneformer/token_dictionary.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab9dc40973fa5224d77b793e2fd114cacf3d08423ed9c4c49caf0ba9c7f218f1
3
+ size 788424
geneformer/tokenizer.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Geneformer tokenizer.
3
+
4
+ **Input data:**
5
+
6
+ | *Required format:* raw counts scRNAseq data without feature selection as .loom or anndata file.
7
+ | *Required row (gene) attribute:* "ensembl_id"; Ensembl ID for each gene.
8
+ | *Required col (cell) attribute:* "n_counts"; total read counts in that cell.
9
+
10
+ | *Optional col (cell) attribute:* "filter_pass"; binary indicator of whether cell should be tokenized based on user-defined filtering criteria.
11
+ | *Optional col (cell) attributes:* any other cell metadata can be passed on to the tokenized dataset as a custom attribute dictionary as shown below.
12
+
13
+ **Usage:**
14
+
15
+ .. code-block :: python
16
+
17
+ >>> from geneformer import TranscriptomeTokenizer
18
+ >>> tk = TranscriptomeTokenizer({"cell_type": "cell_type", "organ_major": "organ"}, nproc=4)
19
+ >>> tk.tokenize_data("data_directory", "output_directory", "output_prefix")
20
+
21
+ **Description:**
22
+
23
+ | Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.
24
+
25
+ | The discussion below references the .loom file format, but the analogous labels are required for .h5ad files, except that they will be column instead of row attributes and vice versa due to the transposed format of the two file types.
26
+
27
+ | Genes should be labeled with Ensembl IDs (loom row attribute "ensembl_id"), which provide a unique identifier for conversion to tokens. Other forms of gene annotations (e.g. gene names) can be converted to Ensembl IDs via Ensembl Biomart. Cells should be labeled with the total read count in the cell (loom column attribute "n_counts") to be used for normalization.
28
+
29
+ | No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. For example, if the original .loom dataset has column attributes "cell_type" and "organ_major" and one would like to retain these attributes as labels in the tokenized dataset with the new names "cell_type" and "organ", respectively, the following custom attribute dictionary should be provided: {"cell_type": "cell_type", "organ_major": "organ"}.
30
+
31
+ | Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.
32
+
33
+ | If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.
34
+
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import logging
40
+ import pickle
41
+ import warnings
42
+ from pathlib import Path
43
+ from typing import Literal
44
+
45
+ import anndata as ad
46
+ import loompy as lp
47
+ import numpy as np
48
+ import scipy.sparse as sp
49
+ from datasets import Dataset
50
+
51
+ warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
52
+ logger = logging.getLogger(__name__)
53
+
54
+ GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
55
+ TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary.pkl"
56
+
57
+
58
def rank_genes(gene_vector, gene_tokens):
    """
    Return gene tokens reordered by descending median-scaled expression value.
    """
    # Negating the values makes argsort yield a descending-value ordering.
    descending_order = np.argsort(-gene_vector)
    return gene_tokens[descending_order]
65
+
66
+
67
def tokenize_cell(gene_vector, gene_tokens):
    """
    Convert a normalized gene expression vector to its tokenized rank value
    encoding: tokens of the detected (non-zero) genes, ordered by descending
    median-scaled expression.
    """
    # Drop undetected genes before ranking.
    detected = np.nonzero(gene_vector)[0]
    expressed_values = gene_vector[detected]
    expressed_tokens = gene_tokens[detected]
    # Inlined rank_genes: argsort on negated values gives a descending order.
    return expressed_tokens[np.argsort(-expressed_values)]
76
+
77
+
78
class TranscriptomeTokenizer:
    """
    Tokenize raw-count scRNA-seq data (.loom or .h5ad) into rank value encodings.

    Each cell is normalized by its total counts (scaled by ``target_sum``) and
    by per-gene non-zero median expression across Genecorpus-30M, then encoded
    as a sequence of gene tokens ordered by descending normalized expression
    and truncated to the 2,048-token model input size.
    """

    def __init__(
        self,
        custom_attr_name_dict=None,
        nproc=1,
        chunk_size=512,
        gene_median_file=GENE_MEDIAN_FILE,
        token_dictionary_file=TOKEN_DICTIONARY_FILE,
    ):
        """
        Initialize tokenizer.

        **Parameters:**

        custom_attr_name_dict : None, dict
            | Dictionary of custom attributes to be added to the dataset.
            | Keys are the names of the attributes in the loom file.
            | Values are the names of the attributes in the dataset.
        nproc : int
            | Number of processes to use for dataset mapping.
        chunk_size : int
            | Chunk size for anndata tokenizer.
        gene_median_file : Path
            | Path to pickle file containing dictionary of non-zero median
            | gene expression values across Genecorpus-30M.
        token_dictionary_file : Path
            | Path to pickle file containing token dictionary (Ensembl IDs:token).
        """
        # dictionary of custom attributes {input attr name: output dataset column name}
        self.custom_attr_name_dict = custom_attr_name_dict

        # number of processes for dataset mapping
        self.nproc = nproc

        # chunk size for anndata tokenizer
        self.chunk_size = chunk_size

        # load dictionary of gene normalization factors
        # (non-zero median value of expression across Genecorpus-30M)
        with open(gene_median_file, "rb") as f:
            self.gene_median_dict = pickle.load(f)

        # load token dictionary (Ensembl IDs:token)
        with open(token_dictionary_file, "rb") as f:
            self.gene_token_dict = pickle.load(f)

        # gene keys for full vocabulary
        self.gene_keys = list(self.gene_median_dict.keys())

        # protein-coding and miRNA gene list dictionary for selecting rows/cols for tokenization
        self.genelist_dict = dict(zip(self.gene_keys, [True] * len(self.gene_keys)))

    def tokenize_data(
        self,
        data_directory: Path | str,
        output_directory: Path | str,
        output_prefix: str,
        file_format: Literal["loom", "h5ad"] = "loom",
        use_generator: bool = False,
    ):
        """
        Tokenize .loom files in data_directory and save as tokenized .dataset in output_directory.

        **Parameters:**

        data_directory : Path
            | Path to directory containing loom files or anndata files
        output_directory : Path
            | Path to directory where tokenized data will be saved as .dataset
        output_prefix : str
            | Prefix for output .dataset
        file_format : str
            | Format of input files. Can be "loom" or "h5ad".
        use_generator : bool
            | Whether to use generator or dict for tokenization.
        """
        tokenized_cells, cell_metadata = self.tokenize_files(
            Path(data_directory), file_format
        )
        tokenized_dataset = self.create_dataset(
            tokenized_cells, cell_metadata, use_generator=use_generator
        )

        output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
        tokenized_dataset.save_to_disk(output_path)

    def tokenize_files(
        self, data_directory, file_format: Literal["loom", "h5ad"] = "loom"
    ):
        """
        Tokenize every .loom or .h5ad file in ``data_directory``.

        Returns (tokenized_cells, cell_metadata); ``cell_metadata`` is None
        when no custom attributes were requested.

        Raises FileNotFoundError when no matching files are found.
        """
        tokenized_cells = []
        if self.custom_attr_name_dict is not None:
            cell_attr = list(self.custom_attr_name_dict.keys())
            cell_metadata = {
                attr_key: [] for attr_key in self.custom_attr_name_dict.values()
            }
        else:
            cell_metadata = None

        # loops through directory to tokenize .loom or .h5ad files
        file_found = False
        tokenize_file_fn = (
            self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
        )
        for file_path in data_directory.glob(f"*.{file_format}"):
            file_found = True
            print(f"Tokenizing {file_path}")
            file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path)
            tokenized_cells += file_tokenized_cells
            if self.custom_attr_name_dict is not None:
                for k in cell_attr:
                    cell_metadata[self.custom_attr_name_dict[k]] += file_cell_metadata[
                        k
                    ]

        if not file_found:
            msg = f"No .{file_format} files found in directory {data_directory}."
            logger.error(msg)
            # BUGFIX: the original bare `raise` had no active exception and
            # crashed with "RuntimeError: No active exception to re-raise";
            # raise a meaningful error instead.
            raise FileNotFoundError(msg)
        return tokenized_cells, cell_metadata

    def tokenize_anndata(self, adata_file_path, target_sum=10_000):
        """
        Tokenize a single .h5ad file.

        Returns (tokenized_cells, file_cell_metadata); ``file_cell_metadata``
        is None when no custom attributes were requested.
        """
        # backed mode avoids loading the full matrix into memory
        adata = ad.read(adata_file_path, backed="r")

        # BUGFIX: bind file_cell_metadata up front so a file with zero cells
        # passing filters no longer hits UnboundLocalError at return.
        if self.custom_attr_name_dict is not None:
            file_cell_metadata = {
                attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
            }
        else:
            file_cell_metadata = None

        # locations of detected protein-coding / miRNA genes and their
        # median normalization factors
        coding_miRNA_loc = np.where(
            [self.genelist_dict.get(i, False) for i in adata.var["ensembl_id"]]
        )[0]
        norm_factor_vector = np.array(
            [
                self.gene_median_dict[i]
                for i in adata.var["ensembl_id"][coding_miRNA_loc]
            ]
        )
        coding_miRNA_ids = adata.var["ensembl_id"][coding_miRNA_loc]
        coding_miRNA_tokens = np.array(
            [self.gene_token_dict[i] for i in coding_miRNA_ids]
        )

        # cells passing user-defined filters (all cells if no "filter_pass" column)
        if "filter_pass" in adata.obs:
            filter_pass_loc = np.where([i == 1 for i in adata.obs["filter_pass"]])[0]
        else:
            print(
                f"{adata_file_path} has no column attribute 'filter_pass'; tokenizing all cells."
            )
            filter_pass_loc = np.arange(adata.shape[0])

        tokenized_cells = []

        for chunk_start in range(0, len(filter_pass_loc), self.chunk_size):
            idx = filter_pass_loc[chunk_start : chunk_start + self.chunk_size]

            # normalize by total counts per cell, scale by target_sum to
            # allocate bits to precision, and divide by per-gene median factors
            n_counts = adata[idx].obs["n_counts"].values[:, None]
            X_view = adata[idx, coding_miRNA_loc].X
            X_norm = X_view / n_counts * target_sum / norm_factor_vector
            X_norm = sp.csr_matrix(X_norm)

            # rank the nonzero genes of each cell by normalized expression
            # (a CSR row's .data/.indices hold exactly the nonzero entries)
            tokenized_cells += [
                rank_genes(X_norm[row].data, coding_miRNA_tokens[X_norm[row].indices])
                for row in range(X_norm.shape[0])
            ]

            # add custom attributes for this chunk to dict
            if file_cell_metadata is not None:
                for k in file_cell_metadata.keys():
                    file_cell_metadata[k] += adata[idx].obs[k].tolist()

        return tokenized_cells, file_cell_metadata

    def tokenize_loom(self, loom_file_path, target_sum=10_000):
        """
        Tokenize a single .loom file.

        Returns (tokenized_cells, file_cell_metadata); ``file_cell_metadata``
        is None when no custom attributes were requested.
        """
        # BUGFIX: bind file_cell_metadata up front so a file with zero cells
        # passing filters no longer hits UnboundLocalError at return.
        if self.custom_attr_name_dict is not None:
            file_cell_metadata = {
                attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
            }
        else:
            file_cell_metadata = None

        with lp.connect(str(loom_file_path)) as data:
            # define coordinates of detected protein-coding or miRNA genes and vector of their normalization factors
            coding_miRNA_loc = np.where(
                [self.genelist_dict.get(i, False) for i in data.ra["ensembl_id"]]
            )[0]
            norm_factor_vector = np.array(
                [
                    self.gene_median_dict[i]
                    for i in data.ra["ensembl_id"][coding_miRNA_loc]
                ]
            )
            coding_miRNA_ids = data.ra["ensembl_id"][coding_miRNA_loc]
            coding_miRNA_tokens = np.array(
                [self.gene_token_dict[i] for i in coding_miRNA_ids]
            )

            # define coordinates of cells passing filters for inclusion (e.g. QC)
            try:
                # loompy raises AttributeError for a missing column attribute
                filter_pass = data.ca["filter_pass"]
            except AttributeError:
                print(
                    f"{loom_file_path} has no column attribute 'filter_pass'; tokenizing all cells."
                )
                filter_pass_loc = np.arange(data.shape[1])
            else:
                filter_pass_loc = np.where([i == 1 for i in filter_pass])[0]

            # scan through .loom file in column batches and tokenize cells
            tokenized_cells = []
            for _ix, _selection, view in data.scan(
                items=filter_pass_loc, axis=1, batch_size=self.chunk_size
            ):
                # select subview with protein-coding and miRNA genes
                subview = view.view[coding_miRNA_loc, :]

                # normalize by total counts per cell, multiply by target_sum to
                # allocate bits to precision, and divide by per-gene factors
                subview_norm_array = (
                    subview[:, :]
                    / subview.ca.n_counts
                    * target_sum
                    / norm_factor_vector[:, None]
                )
                # tokenize subview gene vectors (one per cell/column)
                tokenized_cells += [
                    tokenize_cell(subview_norm_array[:, col], coding_miRNA_tokens)
                    for col in range(subview_norm_array.shape[1])
                ]

                # add custom attributes for subview to dict
                if file_cell_metadata is not None:
                    for k in file_cell_metadata.keys():
                        file_cell_metadata[k] += subview.ca[k].tolist()

        return tokenized_cells, file_cell_metadata

    def create_dataset(
        self,
        tokenized_cells,
        cell_metadata,
        use_generator=False,
        keep_uncropped_input_ids=False,
    ):
        """
        Assemble tokenized cells (plus optional metadata) into a Dataset,
        truncating each cell's input_ids to the 2,048-token model input size.

        With ``keep_uncropped_input_ids``, the original full-length encoding is
        retained in "input_ids_uncropped"/"length_uncropped" columns.
        """
        print("Creating dataset.")
        # create dict for dataset creation
        dataset_dict = {"input_ids": tokenized_cells}
        if self.custom_attr_name_dict is not None:
            dataset_dict.update(cell_metadata)

        # create dataset (the generator path avoids materializing a second
        # in-memory copy of the full dict)
        if use_generator:

            def dict_generator():
                for i in range(len(tokenized_cells)):
                    yield {k: dataset_dict[k][i] for k in dataset_dict.keys()}

            output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc)
        else:
            output_dataset = Dataset.from_dict(dataset_dict)

        def format_cell_features(example):
            # Store original uncropped input_ids in separate feature
            if keep_uncropped_input_ids:
                example["input_ids_uncropped"] = example["input_ids"]
                example["length_uncropped"] = len(example["input_ids"])

            # Truncate/Crop input_ids to size 2,048
            example["input_ids"] = example["input_ids"][:2048]
            example["length"] = len(example["input_ids"])

            return example

        return output_dataset.map(format_cell_features, num_proc=self.nproc)