JFLa committed · verified
Commit a8f93e1 · 1 Parent(s): f930e87

Upload 56 files


Downstream classification and zero-shot batch effect tasks

This view is limited to 50 of the 56 changed files because the commit contains too many changes; see the raw diff for the full set.

Files changed (50):
  1. Downstream_tasks/.DS_Store +0 -0
  2. Downstream_tasks/Classification/.DS_Store +0 -0
  3. Downstream_tasks/Classification/Cardio.py +1418 -0
  4. Downstream_tasks/Classification/Cardio_ML.ipynb +1404 -0
  5. Downstream_tasks/Classification/Gene_dosage.ipynb +0 -0
  6. Downstream_tasks/Classification/Gene_dosage_ML.ipynb +0 -0
  7. Downstream_tasks/Classification/Tissue_type.py +457 -0
  8. Downstream_tasks/Classification/Tissue_type_ML.ipynb +933 -0
  9. Downstream_tasks/Zero_shot_batch_effect/.DS_Store +0 -0
  10. Downstream_tasks/Zero_shot_batch_effect/.gitignore +419 -0
  11. Downstream_tasks/Zero_shot_batch_effect/CODE_OF_CONDUCT.md +9 -0
  12. Downstream_tasks/Zero_shot_batch_effect/LICENSE +21 -0
  13. Downstream_tasks/Zero_shot_batch_effect/README.md +162 -0
  14. Downstream_tasks/Zero_shot_batch_effect/SECURITY.md +41 -0
  15. Downstream_tasks/Zero_shot_batch_effect/SUPPORT.md +16 -0
  16. Downstream_tasks/Zero_shot_batch_effect/envs/conda_env.yml +21 -0
  17. Downstream_tasks/Zero_shot_batch_effect/envs/docker/base_image/Dockerfile +28 -0
  18. Downstream_tasks/Zero_shot_batch_effect/envs/docker/base_image/test.py +66 -0
  19. Downstream_tasks/Zero_shot_batch_effect/envs/docker/base_image/test_docker.sh +13 -0
  20. Downstream_tasks/Zero_shot_batch_effect/envs/docker/jupyter/Dockerfile +12 -0
  21. Downstream_tasks/Zero_shot_batch_effect/envs/installation.sh +85 -0
  22. Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_Geneformer.ipynb +0 -0
  23. Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_HVG_and_scVI.ipynb +0 -0
  24. Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_evaluation_aggregated.ipynb +1058 -0
  25. Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_raw_data.ipynb +328 -0
  26. Downstream_tasks/Zero_shot_batch_effect/requirements.txt +12 -0
  27. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__init__.py +0 -0
  28. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/__init__.cpython-310.pyc +0 -0
  29. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/__init__.cpython-311.pyc +0 -0
  30. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/cell_embeddings.cpython-310.pyc +0 -0
  31. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/cell_embeddings.cpython-311.pyc +0 -0
  32. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/data.cpython-310.pyc +0 -0
  33. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/data.cpython-311.pyc +0 -0
  34. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/geneformer_forward.cpython-310.pyc +0 -0
  35. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/geneformer_forward.cpython-311.pyc +0 -0
  36. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/model_output.cpython-310.pyc +0 -0
  37. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/model_output.cpython-311.pyc +0 -0
  38. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/scgpt_forward.cpython-310.pyc +0 -0
  39. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/utils.cpython-310.pyc +0 -0
  40. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/utils.cpython-311.pyc +0 -0
  41. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/cell_embeddings.py +417 -0
  42. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/data.py +330 -0
  43. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/geneformer_forward.py +365 -0
  44. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__init__.py +0 -0
  45. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/__init__.cpython-310.pyc +0 -0
  46. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/__init__.cpython-311.pyc +0 -0
  47. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/custom_logging.cpython-310.pyc +0 -0
  48. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/custom_logging.cpython-311.pyc +0 -0
  49. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/umap.cpython-310.pyc +0 -0
  50. Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/umap.cpython-311.pyc +0 -0
Downstream_tasks/.DS_Store ADDED
Binary file (6.15 kB)
 
Downstream_tasks/Classification/.DS_Store ADDED
Binary file (6.15 kB)
 
Downstream_tasks/Classification/Cardio.py ADDED
@@ -0,0 +1,1418 @@
+ import os
+ from tqdm.auto import tqdm, trange
+ GPU_NUMBER = [0]
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])
+ os.environ["NCCL_DEBUG"] = "INFO"
+
+ # imports (duplicates consolidated)
+ from collections import Counter
+ from pathlib import Path
+ from typing import List, Optional, Tuple, Union
+ import datetime
+ import pickle
+ import random
+ import subprocess
+ import sys
+ import time
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import pytz
+ import seaborn as sns; sns.set()
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor
+
+ from datasets import Dataset, load_from_disk
+ from datasets.utils.logging import disable_progress_bar, enable_progress_bar
+ from sklearn import preprocessing
+ from sklearn.metrics import (
+     ConfusionMatrixDisplay,
+     accuracy_score,
+     auc,
+     confusion_matrix,
+     f1_score,
+     roc_curve,
+ )
+ from transformers import (BertConfig, BertForMaskedLM, TrainingArguments, TrainerCallback,
+                           Trainer, BertModel, BertPreTrainedModel, BertForSequenceClassification,
+                           BertForTokenClassification, BitsAndBytesConfig)
+ from transformers.activations import ACT2FN
+ from transformers.modeling_outputs import MaskedLMOutput
+ from transformers.models.bert.modeling_bert import BertLMPredictionHead, BertOnlyMLMHead, BertPredictionHeadTransform
+ from peft import LoraConfig  # used in load_model's quantized path
+
+ # sys.path.append('../Geneformer')
+ from geneformer import (DataCollatorForCellClassification,
+                         DataCollatorForGeneClassification, GeneformerPretrainer)
+ from geneformer.pretrainer import token_dictionary
+
+ macro_f1_list = []
+ acc_list = []
+
+ iter_step = 2
+ class CustomBertForMaskedLM(BertPreTrainedModel):
+     _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+     _tied_weights_keys = ["decoder.weight", "bert.embeddings.word_embeddings.weight"]
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.bert = BertModel(config, add_pooling_layer=False)
+         self.transform = BertPredictionHeadTransform(config)
+
+         self.decoder = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         self.bias = torch.nn.Parameter(torch.zeros(config.vocab_size))
+
+         # Initialize weights
+         self.init_weights()
+
+         # Tie weights automatically
+         self.tie_weights()
+
+         # self.post_init()
+
+     def tie_weights(self):
+         """
+         Ties the weights between the input embeddings and the output decoder weights.
+         """
+         self.decoder.weight = self.bert.embeddings.word_embeddings.weight
+
+     def probability_convert(self, probs: Tensor, input_ids: Tensor, labels: Tensor) -> Tensor:
+         device = probs.device
+         batch_size, seq_length, vocab_size = probs.size()
+         _, input_seq_length = input_ids.size()
+         non_mask = labels == -100
+         non_mask_indices = non_mask.nonzero(as_tuple=True)
+         known_gene_indices = input_ids[non_mask]
+
+         # Generate the (1-p) matrix while assigning all known genes in the beginning
+         zeros = torch.zeros((batch_size, 1, vocab_size), device=device)
+         zeros[non_mask_indices[0], 0, known_gene_indices] = 1.0
+         probs_shifted = torch.cat((zeros, probs[:, :-1, :]), dim=1)
+         inv_probs_shifted = 1 - probs_shifted
+
+         # Cumulative product to get (1-p_1)*(1-p_2)*...*p_i
+         cumprod_inv_probs = torch.cumprod(inv_probs_shifted, dim=1)
+         modified_probs = probs * cumprod_inv_probs
+
+         # Since probabilities for already known genes are hard-assigned to 1,
+         # (1-p_1)*(1-p_2)*...*p_i can come out as exactly 0 for these genes,
+         # so clamp the normalizer to 1e-18 to avoid dividing by 0.
+         # (During the debugging stage, some issues occurred in the normalization step,
+         # since probabilities at each position do not necessarily need to sum to one.)
+         normalized_probs = modified_probs.sum(dim=-1, keepdim=True).clamp(min=1e-18)
+         modified_probs = modified_probs / normalized_probs  # Normalization after cumulative product
+
+         return modified_probs
+
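The cumulative-product step above implements a simple sequential rule: the adjusted probability at position i is the raw probability times the chance that the token did not fire at any earlier position. A minimal pure-Python sketch of that rule for a single token (the tensor version above batches this over the whole vocabulary; `sequential_probs` is an illustrative name, not part of the commit):

```python
def sequential_probs(p):
    """adjusted[i] = p[i] * prod_{j<i}(1 - p[j]):
    probability the token first fires at step i and at no earlier step."""
    out = []
    survive = 1.0  # probability that no earlier step fired
    for pi in p:
        out.append(pi * survive)
        survive *= 1.0 - pi
    return out

print(sequential_probs([0.5, 0.5, 0.5]))  # [0.5, 0.25, 0.125]
```

Note that the adjusted values no longer sum to one in general, which is why the tensor version renormalizes afterwards.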
+     def assign_known_gene_probs(self, probs: Tensor, input_ids: Tensor, labels: Tensor) -> Tensor:
+         device = probs.device
+         batch_size, seq_length, vocab_size = probs.size()
+         _, input_seq_length = input_ids.size()
+
+         # Truncate `labels` to match the length of `input_ids` along the sequence dimension
+         truncated_labels = labels[:, :input_seq_length]
+
+         non_mask = truncated_labels == -100
+         non_mask_indices = non_mask.nonzero(as_tuple=True)
+
+         ones = torch.ones((batch_size, seq_length, vocab_size), device=device)
+         zeros = torch.zeros((batch_size, seq_length, vocab_size), device=device)
+
+         known_gene_indices = input_ids[non_mask]
+
+         ones[non_mask_indices[0], non_mask_indices[1], :] = 0.0
+         zeros[non_mask_indices[0], non_mask_indices[1], known_gene_indices] = 1.0
+
+         # Overwrite already known genes' positions with one-hot distributions
+         modified_probs = probs * ones
+         modified_probs = modified_probs + zeros
+
+         # Normalize, clamping the denominator to avoid division by zero
+         modified_probs = modified_probs / modified_probs.sum(dim=-1, keepdim=True).clamp(min=1e-18)
+
+         return modified_probs
+
+     def forward(
+         self,
+         input_ids: Tensor | None = None,
+         attention_mask: Tensor | None = None,
+         token_type_ids: Tensor | None = None,
+         position_ids: Tensor | None = None,
+         head_mask: Tensor | None = None,
+         inputs_embeds: Tensor | None = None,
+         encoder_hidden_states: Tensor | None = None,
+         encoder_attention_mask: Tensor | None = None,
+         labels: Tensor | None = None,
+         output_attentions: bool | None = None,
+         output_hidden_states: bool | None = None,
+         return_dict: bool | None = None,
+     ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
+
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         hidden_transform = self.transform(hidden_states)
+         logits = self.decoder(hidden_transform) + self.bias
+
+         probs = F.softmax(logits, dim=-1)
+
+         # Probability manipulations to avoid repeats of already known genes
+         probs = self.assign_known_gene_probs(probs, input_ids, labels)
+         convert_probs = self.probability_convert(probs, input_ids, labels)
+         assigned_probs = self.assign_known_gene_probs(convert_probs, input_ids, labels)
+
+         masked_lm_loss = None
+         if labels is not None:
+             probs_flat = assigned_probs.view(-1, self.config.vocab_size)
+             labels_flat = labels.view(-1)
+             mask = (labels != -100).float().view(-1)
+
+             # Compute cross-entropy loss over masked positions only
+             masked_lm_loss = -torch.log(torch.clamp(probs_flat[torch.arange(len(labels_flat)), labels_flat], min=1e-18)) * mask
+             masked_lm_loss = masked_lm_loss.sum() / mask.sum()
+
+         if not return_dict:
+             output = (assigned_probs,) + outputs[2:]
+             return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+         return MaskedLMOutput(
+             loss=masked_lm_loss,
+             logits=assigned_probs,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+         input_shape = input_ids.shape
+         effective_batch_size = input_shape[0]
+
+         # add a dummy token
+         if self.config.pad_token_id is None:
+             raise ValueError("The PAD token should be defined for generation")
+
+         attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+         dummy_token = torch.full(
+             (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+         )
+         input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+         return {"input_ids": input_ids, "attention_mask": attention_mask}
+
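The masked loss in `forward` scores only positions whose label is not -100, the Hugging Face ignore-index convention for unmasked tokens. A self-contained sketch of that reduction with plain Python lists standing in for the flattened tensors (`masked_nll` is an illustrative name, not part of the commit):

```python
import math

IGNORE_INDEX = -100  # Hugging Face convention: positions excluded from the loss

def masked_nll(probs, labels):
    """Mean negative log-likelihood over positions whose label != IGNORE_INDEX.

    probs: one probability distribution per position (list of lists)
    labels: one target class index per position, IGNORE_INDEX if unscored
    """
    losses = [
        -math.log(max(p[y], 1e-18))  # clamp, as in the tensor version
        for p, y in zip(probs, labels)
        if y != IGNORE_INDEX
    ]
    return sum(losses) / len(losses)

# Only the first position is scored; the second is ignored.
print(round(masked_nll([[0.5, 0.5], [0.9, 0.1]], [0, -100]), 4))  # 0.6931
```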
+ def prepare_data(
+     input_data_file,
+     output_directory,
+     output_prefix,
+     split_id_dict=None,
+     test_size=None,
+     attr_to_split=None,
+     attr_to_balance=None,
+     max_trials=100,
+     pval_threshold=0.1,
+ ):
+     """
+     Prepare data for cell state or gene classification.
+
+     **Parameters**
+
+     input_data_file : Path
+         | Path to directory containing .dataset input
+     output_directory : Path
+         | Path to directory where prepared data will be saved
+     output_prefix : str
+         | Prefix for output file
+     split_id_dict : None, dict
+         | Dictionary of IDs for train and test splits
+         | Three-item dictionary with keys: attr_key, train, test
+         | attr_key: key specifying name of column in .dataset that contains the IDs for the data splits
+         | train: list of IDs in the attr_key column to include in the train split
+         | test: list of IDs in the attr_key column to include in the test split
+         | For example: {"attr_key": "individual",
+         |               "train": ["patient1", "patient2", "patient3", "patient4"],
+         |               "test": ["patient5", "patient6"]}
+     test_size : None, float
+         | Proportion of data to be saved separately and held out for the test set
+         | (e.g. 0.2 if intending to hold out 20%)
+         | If None, will inherit from split_sizes["test"] from Classifier
+         | The training set will be further split to train / validation in self.validate
+         | Note: only available for CellClassifiers
+     attr_to_split : None, str
+         | Key for attribute on which to split data while balancing potential confounders
+         | e.g. "patient_id" for splitting by patient while balancing other characteristics
+         | Note: only available for CellClassifiers
+     attr_to_balance : None, list
+         | List of attribute keys on which to balance data while splitting on attr_to_split
+         | e.g. ["age", "sex"] for balancing these characteristics while splitting by patient
+         | Note: only available for CellClassifiers
+     max_trials : None, int
+         | Maximum number of trials of random splitting to try to achieve balanced other attributes
+         | If no split is found without significant (p < 0.05) differences in other attributes, will select the best
+         | Note: only available for CellClassifiers
+     pval_threshold : None, float
+         | P-value threshold to use for attribute balancing across splits
+         | E.g. if set to 0.1, will accept a trial if p >= 0.1 for all attributes in attr_to_balance
+     """
+
+     if test_size is None:
+         test_size = oos_test_size
+
+     # prepare data and labels for classification
+     data = load_and_filter(filter_data, nproc, input_data_file)
+
+     if classifier == "cell":
+         if "label" in data.features:
+             logger.error(
+                 "Column name 'label' must be reserved for class IDs. Please rename column."
+             )
+             raise
+     elif classifier == "gene":
+         if "labels" in data.features:
+             logger.error(
+                 "Column name 'labels' must be reserved for class IDs. Please rename column."
+             )
+             raise
+
+     if (attr_to_split is not None) and (attr_to_balance is None):
+         logger.error(
+             "Splitting by attribute while balancing confounders requires both attr_to_split and attr_to_balance to be defined."
+         )
+         raise
+
+     if not isinstance(attr_to_balance, list):
+         attr_to_balance = [attr_to_balance]
+
+     if classifier == "cell":
+         # remove cell states representing < rare_threshold of cells
+         data = remove_rare(
+             data, rare_threshold, cell_state_dict["state_key"], nproc
+         )
+         # downsample max cells and max per class
+         data = downsample_and_shuffle(
+             data, max_ncells, None, cell_state_dict
+         )
+         # rename cell state column to "label"
+         data = rename_cols(data, cell_state_dict["state_key"])
+
+     # convert classes to numerical labels and save as id_class_dict
+     # of note, will label all genes in gene_class_dict
+     # if (cross-)validating, genes will be relabeled in column "labels" for each split
+     # at the time of training with Classifier.validate
+     data, id_class_dict = label_classes(
+         classifier, data, None, nproc
+     )
+
+     # save id_class_dict for future reference
+     id_class_output_path = (
+         Path(output_directory) / f"{output_prefix}_id_class_dict"
+     ).with_suffix(".pkl")
+     with open(id_class_output_path, "wb") as f:
+         pickle.dump(id_class_dict, f)
+
+     if split_id_dict is not None:
+         data_dict = dict()
+         data_dict["train"] = filter_by_dict(
+             data, {split_id_dict["attr_key"]: split_id_dict["train"]}, nproc
+         )
+         data_dict["test"] = filter_by_dict(
+             data, {split_id_dict["attr_key"]: split_id_dict["test"]}, nproc
+         )
+         train_data_output_path = (
+             Path(output_directory) / f"{output_prefix}_labeled_train"
+         ).with_suffix(".dataset")
+         test_data_output_path = (
+             Path(output_directory) / f"{output_prefix}_labeled_test"
+         ).with_suffix(".dataset")
+         data_dict["train"].save_to_disk(str(train_data_output_path))
+         data_dict["test"].save_to_disk(str(test_data_output_path))
+     elif (test_size is not None) and (classifier == "cell"):
+         if 1 > test_size > 0:
+             if attr_to_split is None:
+                 data_dict = data.train_test_split(
+                     test_size=test_size,
+                     stratify_by_column=None,
+                     seed=42,
+                 )
+                 train_data_output_path = (
+                     Path(output_directory) / f"{output_prefix}_labeled_train"
+                 ).with_suffix(".dataset")
+                 test_data_output_path = (
+                     Path(output_directory) / f"{output_prefix}_labeled_test"
+                 ).with_suffix(".dataset")
+                 data_dict["train"].save_to_disk(str(train_data_output_path))
+                 data_dict["test"].save_to_disk(str(test_data_output_path))
+             else:
+                 data_dict, balance_df = cu.balance_attr_splits(
+                     data,
+                     attr_to_split,
+                     attr_to_balance,
+                     test_size,
+                     max_trials,
+                     pval_threshold,
+                     cell_state_dict["state_key"],
+                     nproc,
+                 )
+                 balance_df.to_csv(
+                     f"{output_directory}/{output_prefix}_train_test_balance_df.csv"
+                 )
+                 train_data_output_path = (
+                     Path(output_directory) / f"{output_prefix}_labeled_train"
+                 ).with_suffix(".dataset")
+                 test_data_output_path = (
+                     Path(output_directory) / f"{output_prefix}_labeled_test"
+                 ).with_suffix(".dataset")
+                 data_dict["train"].save_to_disk(str(train_data_output_path))
+                 data_dict["test"].save_to_disk(str(test_data_output_path))
+         else:
+             data_output_path = (
+                 Path(output_directory) / f"{output_prefix}_labeled"
+             ).with_suffix(".dataset")
+             data.save_to_disk(str(data_output_path))
+             print(data_output_path)
+     else:
+         data_output_path = (
+             Path(output_directory) / f"{output_prefix}_labeled"
+         ).with_suffix(".dataset")
+         data.save_to_disk(str(data_output_path))
+
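`split_id_dict` pins the train/test split to explicit IDs, e.g. holding out whole patients so no individual appears in both splits. A toy sketch of that ID-based partition over plain dicts, independent of the `datasets` library (`split_by_ids` is an illustrative name, not part of the commit):

```python
def split_by_ids(records, split_id_dict):
    """Partition records into train/test by the IDs listed in split_id_dict."""
    key = split_id_dict["attr_key"]
    train = [r for r in records if r[key] in split_id_dict["train"]]
    test = [r for r in records if r[key] in split_id_dict["test"]]
    return train, test

cells = [{"individual": "patient1"}, {"individual": "patient5"}, {"individual": "patient2"}]
split = {"attr_key": "individual", "train": ["patient1", "patient2"], "test": ["patient5"]}
train, test = split_by_ids(cells, split)
print(len(train), len(test))  # 2 1
```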
+ def load_and_filter(filter_data, nproc, input_data_file):
+     data = load_from_disk(input_data_file)
+     if filter_data is not None:
+         data = filter_by_dict(data, filter_data, nproc)
+     return data
+
+ # get number of classes for classifier
+ def get_num_classes(id_class_dict):
+     return len(set(id_class_dict.values()))
+
+ def filter_by_dict(data, filter_data, nproc):
+     for key, value in filter_data.items():
+
+         def filter_data_by_criteria(example):
+             return example[key] in value
+
+         data = data.filter(filter_data_by_criteria, num_proc=nproc)
+     if len(data) == 0:
+         logger.error("No cells remain after filtering. Check filtering criteria.")
+         raise
+     return data
+
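`filter_by_dict` applies one membership filter per key, keeping only examples whose value for that key is in the allowed list. The same logic over plain dicts (`filter_records` is an illustrative name, not part of the commit):

```python
def filter_records(records, filter_dict):
    """Keep records whose value for every key is in that key's allowed list."""
    for key, allowed in filter_dict.items():
        records = [r for r in records if r[key] in allowed]
    return records

cells = [
    {"tissue": "heart", "sex": "F"},
    {"tissue": "liver", "sex": "F"},
    {"tissue": "heart", "sex": "M"},
]
print(filter_records(cells, {"tissue": ["heart"], "sex": ["F"]}))
# [{'tissue': 'heart', 'sex': 'F'}]
```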
+ def remove_rare(data, rare_threshold, label, nproc):
+     if rare_threshold > 0:
+         total_cells = len(data)
+         label_counter = Counter(data[label])
+         nonrare_label_dict = {
+             label: [k for k, v in label_counter.items() if (v / total_cells) > rare_threshold]
+         }
+         data = filter_by_dict(data, nonrare_label_dict, nproc)
+     return data
+
+ def downsample_and_shuffle(data, max_ncells, max_ncells_per_class, cell_state_dict):
+     data = data.shuffle(seed=42)
+     num_cells = len(data)
+     # if a max number of cells is defined, subsample to this max number
+     if max_ncells is not None:
+         if num_cells > max_ncells:
+             data = data.select([i for i in range(max_ncells)])
+     if max_ncells_per_class is not None:
+         class_labels = data[cell_state_dict["state_key"]]
+         random.seed(42)
+         subsample_indices = subsample_by_class(class_labels, max_ncells_per_class)
+         data = data.select(subsample_indices)
+     return data
+
+ def rename_cols(data, state_key):
+     data = data.rename_column(state_key, "label")
+     return data
+
+ def label_classes(classifier, data, gene_class_dict, nproc):
+     if classifier == "cell":
+         label_set = set(data["label"])
+     elif classifier == "gene":
+         # remove cells without any of the target genes
+         def if_contains_label(example):
+             a = pu.flatten_list(gene_class_dict.values())
+             b = example["input_ids"]
+             return not set(a).isdisjoint(b)
+
+         data = data.filter(if_contains_label, num_proc=nproc)
+         label_set = gene_class_dict.keys()
+
+         if len(data) == 0:
+             logger.error(
+                 "No cells remain after filtering for target genes. Check target gene list."
+             )
+             raise
+
+     class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
+     id_class_dict = {v: k for k, v in class_id_dict.items()}
+
+     def classes_to_ids(example):
+         if classifier == "cell":
+             example["label"] = class_id_dict[example["label"]]
+         elif classifier == "gene":
+             example["labels"] = label_gene_classes(
+                 example, class_id_dict, gene_class_dict
+             )
+         return example
+
+     data = data.map(classes_to_ids, num_proc=nproc)
+     return data, id_class_dict
+
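`label_classes` builds the class-to-id mapping by enumerating the label set and then inverting it. In miniature (set iteration order is arbitrary, so a sorted list is used here for a deterministic result; `make_label_maps` is an illustrative name, not part of the commit):

```python
def make_label_maps(labels):
    """Map each distinct label to an integer id, plus the inverse map."""
    label_set = sorted(set(labels))
    class_id_dict = {label: i for i, label in enumerate(label_set)}
    id_class_dict = {i: label for label, i in class_id_dict.items()}
    return class_id_dict, id_class_dict

class_id, id_class = make_label_maps(["dcm", "hcm", "nf", "hcm"])
print(class_id)      # {'dcm': 0, 'hcm': 1, 'nf': 2}
print(id_class[0])   # dcm
```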
+ def train_classifier(
+     model_directory,
+     num_classes,
+     train_data,
+     eval_data,
+     output_directory,
+     predict=False,
+     classifier="cell",
+     no_eval=False,
+     quantize=False,
+     freeze_layers=2,
+ ):
+     """
+     Fine-tune model for cell state or gene classification.
+
+     **Parameters**
+
+     model_directory : Path
+         | Path to directory containing model
+     num_classes : int
+         | Number of classes for classifier
+     train_data : Dataset
+         | Loaded training .dataset input
+         | For cell classifier, labels in column "label".
+         | For gene classifier, labels in column "labels".
+     eval_data : None, Dataset
+         | (Optional) Loaded evaluation .dataset input
+         | For cell classifier, labels in column "label".
+         | For gene classifier, labels in column "labels".
+     output_directory : Path
+         | Path to directory where fine-tuned model will be saved
+     predict : bool
+         | Whether or not to save eval predictions from trainer
+     """
+
+     ##### Validate and prepare data #####
+     train_data, eval_data = validate_and_clean_cols(
+         train_data, eval_data, classifier
+     )
+
+     if (no_eval is True) and (eval_data is not None):
+         logger.warning(
+             "no_eval set to True; model will be trained without evaluation."
+         )
+         eval_data = None
+
+     if (classifier == "gene") and (predict is True):
+         logger.warning(
+             "Predictions during training not currently available for gene classifiers; setting predict to False."
+         )
+         predict = False
+
+     # ensure not overwriting previously saved model
+     saved_model_test = os.path.join(output_directory, "pytorch_model.bin")
+     if os.path.isfile(saved_model_test) is True:
+         logger.error("Model already saved to this designated output directory.")
+         raise
+     # make output directory
+     # subprocess.call(f"mkdir {output_directory}", shell=True)
+     os.makedirs(output_directory, exist_ok=True)
+
+     ##### Load model and training args #####
+     model = load_model(
+         "CellClassifier",
+         num_classes,
+         model_directory,
+         "train",
+         quantize=quantize,
+     )
+     #############
+     pretrained_model = CustomBertForMaskedLM.from_pretrained(model_directory)
+     # Extract the word embeddings from the pretrained model
+     pretrained_word_embeddings = pretrained_model.bert.embeddings.word_embeddings.weight.clone()
+     model.bert.embeddings.word_embeddings.load_state_dict({"weight": pretrained_word_embeddings})
+     ############
+     def_training_args, def_freeze_layers = get_default_train_args(
+         model, classifier, train_data, output_directory
+     )
+
+     if training_args is not None:
+         def_training_args.update(training_args)
+     logging_steps = round(
+         len(train_data) / def_training_args["per_device_train_batch_size"] / 10
+     )
+     def_training_args["logging_steps"] = logging_steps
+     def_training_args["output_dir"] = output_directory
+     if eval_data is None:
+         def_training_args["evaluation_strategy"] = "no"
+         def_training_args["load_best_model_at_end"] = False
+     training_args_init = TrainingArguments(**def_training_args)
+
+     if freeze_layers is not None:
+         def_freeze_layers = freeze_layers
+
+     if def_freeze_layers > 0:
+         modules_to_freeze = model.bert.encoder.layer[:def_freeze_layers]
+         for module in modules_to_freeze:
+             for param in module.parameters():
+                 param.requires_grad = False
+
+     ##### Fine-tune the model #####
+     # define the data collator
+     if classifier == "cell":
+         data_collator = DataCollatorForCellClassification()
+     elif classifier == "gene":
+         data_collator = DataCollatorForGeneClassification()
+
+     # create the trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args_init,
+         data_collator=data_collator,
+         train_dataset=train_data,
+         eval_dataset=eval_data,
+         compute_metrics=compute_metrics,
+     )
+
+     # train the classifier
+     trainer.train()
+     trainer.save_model(output_directory)
+     if predict is True:
+         # make eval predictions and save predictions and metrics
+         predictions = trainer.predict(eval_data)
+         prediction_output_path = f"{output_directory}/predictions.pkl"
+         with open(prediction_output_path, "wb") as f:
+             pickle.dump(predictions, f)
+         trainer.save_metrics("eval", predictions.metrics)
+     return trainer
+
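`train_classifier` derives `logging_steps` so that roughly ten log entries are emitted per epoch: the steps per epoch (dataset size over per-device batch size) divided by ten, rounded. The arithmetic in isolation (`compute_logging_steps` is an illustrative name, not part of the commit):

```python
def compute_logging_steps(n_train, per_device_batch_size):
    """~10 log entries per epoch: (steps per epoch) / 10, rounded."""
    return round(n_train / per_device_batch_size / 10)

print(compute_logging_steps(12000, 12))  # 100
```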
+def validate_and_clean_cols(train_data, eval_data, classifier):
+    # validate that data has expected label column and remove others
+    if classifier == "cell":
+        label_col = "label"
+    elif classifier == "gene":
+        label_col = "labels"
+
+    cols_to_keep = [label_col] + ["input_ids", "length"]
+    if label_col not in train_data.column_names:
+        logger.error(f"train_data must contain column {label_col} with class labels.")
+        raise ValueError(f"train_data must contain column {label_col} with class labels.")
+    else:
+        train_data = remove_cols(train_data, cols_to_keep)
+
+    if eval_data is not None:
+        if label_col not in eval_data.column_names:
+            logger.error(
+                f"eval_data must contain column {label_col} with class labels."
+            )
+            raise ValueError(
+                f"eval_data must contain column {label_col} with class labels."
+            )
+        else:
+            eval_data = remove_cols(eval_data, cols_to_keep)
+    return train_data, eval_data
+
+def remove_cols(data, cols_to_keep):
+    other_cols = list(data.features.keys())
+    other_cols = [ele for ele in other_cols if ele not in cols_to_keep]
+    data = data.remove_columns(other_cols)
+    return data
+
+def load_model(model_type, num_classes, model_directory, mode, quantize=False):
+    if model_type == "MTLCellClassifier-Quantized":
+        model_type = "MTLCellClassifier"
+        quantize = True
+
+    output_hidden_states = (mode == "eval")
+
+    # Quantization logic
+    if quantize:
+        if model_type == "MTLCellClassifier":
+            quantize_config = BitsAndBytesConfig(load_in_8bit=True)
+            peft_config = None
+        else:
+            quantize_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
+            peft_config = LoraConfig(
+                lora_alpha=128,
+                lora_dropout=0.1,
+                r=64,
+                bias="none",
+                task_type="TokenClassification",
+            )
+    else:
+        quantize_config = None
+        peft_config = None
+
+    # Model class selection
+    model_classes = {
+        "Pretrained": BertForMaskedLM,
+        "GeneClassifier": BertForTokenClassification,
+        "CellClassifier": BertForSequenceClassification,
+        "MTLCellClassifier": BertForMaskedLM,
+    }
+
+    model_class = model_classes.get(model_type)
+    if not model_class:
+        raise ValueError(f"Unknown model type: {model_type}")
+
+    # Model loading
+    model_args = {
+        "pretrained_model_name_or_path": model_directory,
+        "output_hidden_states": output_hidden_states,
+        "output_attentions": False,
+    }
+
+    if model_type != "Pretrained":
+        model_args["num_labels"] = num_classes
+
+    if quantize_config:
+        model_args["quantization_config"] = quantize_config
+
+    # Load the model
+    model = model_class.from_pretrained(**model_args)
+
+    if mode == "eval":
+        model.eval()
+
+    # Handle device placement and PEFT
+    if not quantize:
+        # Only move non-quantized models
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = model.to(device)
+    elif peft_config:
+        # Apply PEFT for quantized models (except MTLCellClassifier)
+        model.enable_input_require_grads()
+        model = get_peft_model(model, peft_config)
+
+    return model
+
+def get_default_train_args(model, classifier, data, output_dir):
+    num_layers = quant_layers(model)
+    freeze_layers_get = 0
+    batch_size = 12
+    if classifier == "cell":
+        epochs = 10
+        evaluation_strategy = "epoch"
+        load_best_model_at_end = True
+    else:
+        epochs = 1
+        evaluation_strategy = "no"
+        load_best_model_at_end = False
+
+    if num_layers == 6:
+        default_training_args = {
+            "learning_rate": 5e-5,
+            "lr_scheduler_type": "linear",
+            "warmup_steps": 500,
+            "per_device_train_batch_size": batch_size,
+            "per_device_eval_batch_size": batch_size,
+        }
+    else:
+        default_training_args = {
+            "per_device_train_batch_size": batch_size,
+            "per_device_eval_batch_size": batch_size,
+        }
+
+    training_args = {
+        "num_train_epochs": epochs,
+        "do_train": True,
+        "do_eval": True,
+        "evaluation_strategy": evaluation_strategy,
+        "logging_steps": np.floor(len(data) / batch_size / 8),  # 8 evals per epoch
+        "save_strategy": "epoch",
+        "group_by_length": False,
+        "length_column_name": "length",
+        "disable_tqdm": False,
+        "weight_decay": 0.001,
+        "load_best_model_at_end": load_best_model_at_end,
+    }
+    training_args.update(default_training_args)
+
+    return training_args, freeze_layers_get
+
+def quant_layers(model):
+    layer_nums = []
+    for name, parameter in model.named_parameters():
+        if "layer" in name:
+            layer_nums += [int(name.split("layer.")[1].split(".")[0])]
+    return int(max(layer_nums)) + 1
+
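`quant_layers` only inspects parameter names, so its behavior can be checked with a stub object that mimics BERT-style names. The `StubModel` class below is a hypothetical stand-in for illustration, not part of the real script; the function body is restated so the sketch is self-contained.

```python
def quant_layers(model):
    # count transformer layers by parsing "layer.<idx>." out of parameter names
    layer_nums = []
    for name, parameter in model.named_parameters():
        if "layer" in name:
            layer_nums += [int(name.split("layer.")[1].split(".")[0])]
    return int(max(layer_nums)) + 1

class StubModel:
    # hypothetical stand-in exposing named_parameters() like an nn.Module
    def named_parameters(self):
        for i in range(6):
            yield f"bert.encoder.layer.{i}.attention.self.query.weight", None
        yield "bert.embeddings.word_embeddings.weight", None  # no "layer" substring

print(quant_layers(StubModel()))  # -> 6
```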
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = pred.predictions.argmax(-1)
+    # calculate accuracy and macro f1 using sklearn's function
+    acc = accuracy_score(labels, preds)
+    macro_f1 = f1_score(labels, preds, average="macro")
+    weighted_f1 = f1_score(labels, preds, average="weighted")
+    return {
+        "accuracy": acc,
+        "macro_f1": macro_f1,
+        "weighted_f1": weighted_f1,
+    }
+
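To make the headline metric concrete: macro F1 is the unweighted mean of per-class F1 scores, which is what `f1_score(..., average="macro")` computes. A minimal dependency-free sketch on a toy example (this re-derivation is illustrative, not part of the pipeline):

```python
def macro_f1(labels, preds):
    # macro F1: unweighted mean of per-class F1 scores
    classes = sorted(set(labels) | set(preds))
    f1s = []
    for c in classes:
        tp = sum(1 for l, p in zip(labels, preds) if l == c and p == c)
        fp = sum(1 for l, p in zip(labels, preds) if l != c and p == c)
        fn = sum(1 for l, p in zip(labels, preds) if l == c and p != c)
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1s.append(2 * precision * recall / (precision + recall) if precision + recall else 0.0)
    return sum(f1s) / len(f1s)

# class 0: F1 = 2/3; class 1: F1 = 0.8; macro = 11/15
print(round(macro_f1([0, 0, 1, 1], [0, 1, 1, 1]), 4))  # -> 0.7333
```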
+def evaluate_model(
+    model,
+    num_classes,
+    id_class_dict,
+    eval_data,
+    predict=False,
+    output_directory=None,
+    output_prefix=None,
+):
+    """
+    Evaluate the fine-tuned model.
+
+    **Parameters**
+
+    model : nn.Module
+        | Loaded fine-tuned model (e.g. trainer.model)
+    num_classes : int
+        | Number of classes for classifier
+    id_class_dict : dict
+        | Loaded _id_class_dict.pkl previously prepared by Classifier.prepare_data
+        | (dictionary of format: numerical IDs: class_labels)
+    eval_data : Dataset
+        | Loaded evaluation .dataset input
+    predict : bool
+        | Whether or not to save eval predictions
+    output_directory : Path
+        | Path to directory where eval data will be saved
+    output_prefix : str
+        | Prefix for output files
+    """
+
+    ##### Evaluate the model #####
+    labels = id_class_dict.keys()
+    y_pred, y_true, logits_list = classifier_predict(
+        model, classifier, eval_data, 100
+    )
+    conf_mat, macro_f1, acc, roc_metrics = get_metrics(
+        y_pred, y_true, logits_list, num_classes, labels
+    )
+    if predict is True:
+        pred_dict = {
+            "pred_ids": y_pred,
+            "label_ids": y_true,
+            "predictions": logits_list,
+        }
+        pred_dict_output_path = (
+            Path(output_directory) / f"{output_prefix}_pred_dict"
+        ).with_suffix(".pkl")
+        with open(pred_dict_output_path, "wb") as f:
+            pickle.dump(pred_dict, f)
+    return {
+        "conf_mat": conf_mat,
+        "macro_f1": macro_f1,
+        "acc": acc,
+        "roc_metrics": roc_metrics,
+    }
+
+def classifier_predict(model, classifier_type, evalset, forward_batch_size):
+    if classifier_type == "gene":
+        label_name = "labels"
+    elif classifier_type == "cell":
+        label_name = "label"
+
+    predict_logits = []
+    predict_labels = []
+    model.eval()
+
+    # ensure there are at least 2 examples in each batch to avoid incorrect tensor dims
+    evalset_len = len(evalset)
+    max_divisible = find_largest_div(evalset_len, forward_batch_size)
+    if len(evalset) - max_divisible == 1:
+        evalset_len = max_divisible
+
+    max_evalset_len = max(evalset.select([i for i in range(evalset_len)])["length"])
+
+    disable_progress_bar()  # disable progress bar for preprocess_classifier_batch mapping
+    for i in trange(0, evalset_len, forward_batch_size):
+        max_range = min(i + forward_batch_size, evalset_len)
+        batch_evalset = evalset.select([i for i in range(i, max_range)])
+        padded_batch = preprocess_classifier_batch(
+            batch_evalset, max_evalset_len, label_name
+        )
+        padded_batch.set_format(type="torch")
+
+        input_data_batch = padded_batch["input_ids"]
+        attn_msk_batch = padded_batch["attention_mask"]
+        label_batch = padded_batch[label_name]
+        with torch.no_grad():
+            outputs = model(
+                input_ids=input_data_batch.to("cuda"),
+                attention_mask=attn_msk_batch.to("cuda"),
+                labels=label_batch.to("cuda"),
+            )
+            predict_logits += [torch.squeeze(outputs.logits.to("cpu"))]
+            predict_labels += [torch.squeeze(label_batch.to("cpu"))]
+
+    enable_progress_bar()
+    logits_by_cell = torch.cat(predict_logits)
+    last_dim = len(logits_by_cell.shape) - 1
+    all_logits = logits_by_cell.reshape(-1, logits_by_cell.shape[last_dim])
+    labels_by_cell = torch.cat(predict_labels)
+    all_labels = torch.flatten(labels_by_cell)
+    logit_label_paired = [
+        item
+        for item in list(zip(all_logits.tolist(), all_labels.tolist()))
+        if item[1] != -100
+    ]
+    y_pred = [vote(item[0]) for item in logit_label_paired]
+    y_true = [item[1] for item in logit_label_paired]
+    logits_list = [item[0] for item in logit_label_paired]
+    return y_pred, y_true, logits_list
+
+def find_largest_div(N, K):
+    rem = N % K
+    if rem == 0:
+        return N
+    else:
+        return N - rem
+
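As a quick sanity check, `find_largest_div` rounds the eval-set length down to the nearest multiple of the batch size (restated here so the sketch runs standalone):

```python
def find_largest_div(N, K):
    # largest multiple of K that is <= N
    rem = N % K
    if rem == 0:
        return N
    else:
        return N - rem

print(find_largest_div(1001, 200))  # -> 1000
print(find_largest_div(1000, 200))  # -> 1000
```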
+def preprocess_classifier_batch(cell_batch, max_len, label_name):
+    if max_len is None:
+        max_len = max([len(i) for i in cell_batch["input_ids"]])
+
+    def pad_label_example(example):
+        example[label_name] = np.pad(
+            example[label_name],
+            (0, max_len - len(example["input_ids"])),
+            mode="constant",
+            constant_values=-100,
+        )
+        example["input_ids"] = np.pad(
+            example["input_ids"],
+            (0, max_len - len(example["input_ids"])),
+            mode="constant",
+            constant_values=gene_token_dict.get("<pad>"),
+        )
+        example["attention_mask"] = (
+            example["input_ids"] != gene_token_dict.get("<pad>")
+        ).astype(int)
+        return example
+
+    padded_batch = cell_batch.map(pad_label_example)
+    return padded_batch
+
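The padding convention above (pad token for inputs, -100 for labels so the loss ignores them, and a 0/1 attention mask marking real tokens) can be sketched without `datasets` or numpy. The pad id of 0 below is an illustrative assumption, not the real `<pad>` token id from `gene_token_dict`:

```python
def pad_example(input_ids, labels, max_len, pad_id=0):
    # pad inputs with the pad token, labels with -100 (ignored by the loss),
    # and build a 0/1 attention mask marking real tokens
    n_pad = max_len - len(input_ids)
    padded_ids = input_ids + [pad_id] * n_pad
    padded_labels = labels + [-100] * n_pad
    attention_mask = [1 if tok != pad_id else 0 for tok in padded_ids]
    return padded_ids, padded_labels, attention_mask

ids, labs, mask = pad_example([5, 9, 3], [1, 1, 1], max_len=5)
print(ids)   # -> [5, 9, 3, 0, 0]
print(labs)  # -> [1, 1, 1, -100, -100]
print(mask)  # -> [1, 1, 1, 0, 0]
```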
+def vote(logit_list):
+    m = max(logit_list)
+    indices = [i for i, x in enumerate(logit_list) if x == m]
+    if len(indices) > 1:
+        return "tie"
+    else:
+        return indices[0]
+
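`vote` is an argmax that flags exact ties instead of silently picking the first maximum (restated for a standalone check):

```python
def vote(logit_list):
    # argmax over logits; return "tie" if the maximum is not unique
    m = max(logit_list)
    indices = [i for i, x in enumerate(logit_list) if x == m]
    if len(indices) > 1:
        return "tie"
    else:
        return indices[0]

print(vote([0.1, 2.5, -0.3]))  # -> 1
print(vote([0.5, 0.5, -1.0]))  # -> tie
```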
+def py_softmax(vector):
+    e = np.exp(vector)
+    return e / e.sum()
+
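`py_softmax` exponentiates and normalizes, producing a probability vector used as the positive-class score for the ROC curve. A dependency-free equivalent makes the property easy to check; subtracting the max before exponentiating is a standard numerical-stability tweak added here, not present in the original:

```python
import math

def softmax(vector):
    # shift by the max for numerical stability, then normalize exponentials
    m = max(vector)
    e = [math.exp(v - m) for v in vector]
    s = sum(e)
    return [v / s for v in e]

probs = softmax([1.0, 2.0, 3.0])
print(round(sum(probs), 6))        # -> 1.0
print(probs.index(max(probs)))     # -> 2 (order of logits is preserved)
```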
+def get_metrics(y_pred, y_true, logits_list, num_classes, labels):
+    conf_mat = confusion_matrix(y_true, y_pred, labels=list(labels))
+    macro_f1 = f1_score(y_true, y_pred, average="macro")
+    acc = accuracy_score(y_true, y_pred)
+    roc_metrics = None  # roc metrics not reported for multiclass
+    if num_classes == 2:
+        y_score = [py_softmax(item)[1] for item in logits_list]
+        fpr, tpr, _ = roc_curve(y_true, y_score)
+        mean_fpr = np.linspace(0, 1, 100)
+        interp_tpr = np.interp(mean_fpr, fpr, tpr)
+        interp_tpr[0] = 0.0
+        tpr_wt = len(tpr)
+        roc_auc = auc(fpr, tpr)
+        roc_metrics = {
+            "fpr": fpr,
+            "tpr": tpr,
+            "interp_tpr": interp_tpr,
+            "auc": roc_auc,
+            "tpr_wt": tpr_wt,
+        }
+    return conf_mat, macro_f1, acc, roc_metrics
+
+def evaluate_saved_model(
+    model_directory,
+    id_class_dict_file,
+    test_data_file,
+    output_directory,
+    output_prefix,
+    predict=True,
+):
+    """
+    Evaluate a saved fine-tuned model on the held-out test set.
+
+    **Parameters**
+
+    model_directory : Path
+        | Path to directory containing model
+    id_class_dict_file : Path
+        | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
+        | (dictionary of format: numerical IDs: class_labels)
+    test_data_file : Path
+        | Path to directory containing test .dataset
+    output_directory : Path
+        | Path to directory where eval data will be saved
+    output_prefix : str
+        | Prefix for output files
+    predict : bool
+        | Whether or not to save eval predictions
+    """
+
+    # load numerical id to class dictionary (id:class)
+    with open(id_class_dict_file, "rb") as f:
+        id_class_dict = pickle.load(f)
+
+    # get number of classes for classifier
+    num_classes = get_num_classes(id_class_dict)
+
+    # load previously filtered and prepared data
+    test_data = load_and_filter(None, nproc, test_data_file)
+
+    # load previously fine-tuned model
+    model = load_model(
+        "CellClassifier",
+        num_classes,
+        model_directory,
+        "eval",
+        quantize=quantize,
+    )
+
+    # evaluate the model
+    result = evaluate_model(
+        model,
+        num_classes,
+        id_class_dict,
+        test_data,
+        predict=predict,
+        output_directory=output_directory,
+        output_prefix="CellClassifier",
+    )
+
+    all_conf_mat_df = pd.DataFrame(
+        result["conf_mat"],
+        columns=id_class_dict.values(),
+        index=id_class_dict.values(),
+    )
+    all_metrics = {
+        "conf_matrix": all_conf_mat_df,
+        "macro_f1": result["macro_f1"],
+        "acc": result["acc"],
+    }
+    all_roc_metrics = None  # roc metrics not reported for multiclass
+
+    if num_classes == 2:
+        mean_fpr = np.linspace(0, 1, 100)
+        mean_tpr = result["roc_metrics"]["interp_tpr"]
+        all_roc_auc = result["roc_metrics"]["auc"]
+        all_roc_metrics = {
+            "mean_tpr": mean_tpr,
+            "mean_fpr": mean_fpr,
+            "all_roc_auc": all_roc_auc,
+        }
+    all_metrics["all_roc_metrics"] = all_roc_metrics
+    test_metrics_output_path = (
+        Path(output_directory) / f"{output_prefix}_test_metrics_dict"
+    ).with_suffix(".pkl")
+    with open(test_metrics_output_path, "wb") as f:
+        pickle.dump(all_metrics, f)
+
+    return all_metrics
+
+def plot_conf_mat(
+    conf_mat_dict,
+    output_directory,
+    output_prefix,
+    custom_class_order=None,
+):
+    """
+    Plot confusion matrix results of evaluating the fine-tuned model.
+
+    **Parameters**
+
+    conf_mat_dict : dict
+        | Dictionary of model_name : confusion_matrix_DataFrame
+        | (all_metrics["conf_matrix"] from self.validate)
+    output_directory : Path
+        | Path to directory where plots will be saved
+    output_prefix : str
+        | Prefix for output file
+    custom_class_order : None, list
+        | List of classes in custom order for plots.
+        | Same order will be used for all models.
+    """
+
+    for model_name in conf_mat_dict.keys():
+        plot_confusion_matrix(
+            conf_mat_dict[model_name],
+            model_name,
+            output_directory,
+            output_prefix,
+            custom_class_order,
+        )
+
+def plot_confusion_matrix(
+    conf_mat_df, title, output_dir, output_prefix, custom_class_order
+):
+    fig = plt.figure()
+    fig.set_size_inches(10, 10)
+    sns.set(font_scale=1)
+    sns.set_style("whitegrid", {"axes.grid": False})
+    if custom_class_order is not None:
+        conf_mat_df = conf_mat_df.reindex(
+            index=custom_class_order, columns=custom_class_order
+        )
+    display_labels = generate_display_labels(conf_mat_df)
+    conf_mat = preprocessing.normalize(conf_mat_df.to_numpy(), norm="l1")
+    display = ConfusionMatrixDisplay(
+        confusion_matrix=conf_mat, display_labels=display_labels
+    )
+    display.plot(cmap="Blues", values_format=".2g")
+    plt.title(title)
+    plt.show()
+
+    output_file = (Path(output_dir) / f"{output_prefix}_conf_mat").with_suffix(".pdf")
+    display.figure_.savefig(output_file, bbox_inches="tight")
+
+def generate_display_labels(conf_mat_df):
+    display_labels = []
+    i = 0
+    for label in conf_mat_df.index:
+        display_labels += [f"{label}\nn={conf_mat_df.iloc[i, :].sum():.0f}"]
+        i = i + 1
+    return display_labels
+
+def plot_predictions(
+    predictions_file,
+    id_class_dict_file,
+    title,
+    output_directory,
+    output_prefix,
+    custom_class_order=None,
+    kwargs_dict=None,
+):
+    """
+    Plot prediction results of evaluating the fine-tuned model.
+
+    **Parameters**
+
+    predictions_file : path
+        | Path of model predictions output to plot
+        | (saved output from self.validate if predict_eval=True)
+        | (or saved output from self.evaluate_saved_model)
+    id_class_dict_file : Path
+        | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
+        | (dictionary of format: numerical IDs: class_labels)
+    title : str
+        | Title for legend containing class labels.
+    output_directory : Path
+        | Path to directory where plots will be saved
+    output_prefix : str
+        | Prefix for output file
+    custom_class_order : None, list
+        | List of classes in custom order for plots.
+        | Same order will be used for all models.
+    kwargs_dict : None, dict
+        | Dictionary of kwargs to pass to plotting function.
+    """
+    # load predictions
+    with open(predictions_file, "rb") as f:
+        predictions = pickle.load(f)
+
+    # load numerical id to class dictionary (id:class)
+    with open(id_class_dict_file, "rb") as f:
+        id_class_dict = pickle.load(f)
+
+    if isinstance(predictions, dict):
+        if all(
+            [
+                key in predictions.keys()
+                for key in ["pred_ids", "label_ids", "predictions"]
+            ]
+        ):
+            # format is output from self.evaluate_saved_model
+            predictions_logits = np.array(predictions["predictions"])
+            true_ids = predictions["label_ids"]
+    else:
+        # format is output from self.validate if predict_eval=True
+        predictions_logits = predictions.predictions
+        true_ids = predictions.label_ids
+
+    num_classes = len(id_class_dict.keys())
+    num_predict_classes = predictions_logits.shape[1]
+    assert num_classes == num_predict_classes
+    classes = id_class_dict.values()
+    true_labels = [id_class_dict[idx] for idx in true_ids]
+    predictions_df = pd.DataFrame(predictions_logits, columns=classes)
+    if custom_class_order is not None:
+        predictions_df = predictions_df.reindex(columns=custom_class_order)
+    predictions_df["true"] = true_labels
+    custom_dict = dict(zip(classes, [i for i in range(len(classes))]))
+    if custom_class_order is not None:
+        custom_dict = dict(
+            zip(custom_class_order, [i for i in range(len(custom_class_order))])
+        )
+    predictions_df = predictions_df.sort_values(
+        by=["true"], key=lambda x: x.map(custom_dict)
+    )
+
+    plot_predictions_eu(
+        predictions_df, title, output_directory, output_prefix, kwargs_dict
+    )
+
+def plot_predictions_eu(predictions_df, title, output_dir, output_prefix, kwargs_dict):
+    sns.set(font_scale=2)
+    plt.figure(figsize=(10, 10), dpi=150)
+    label_colors, label_color_dict = make_colorbar(predictions_df, "true")
+    predictions_df = predictions_df.drop(columns=["true"])
+    predict_colors_list = [label_color_dict[label] for label in predictions_df.columns]
+    predict_label_list = [label for label in predictions_df.columns]
+    predict_colors = pd.DataFrame(
+        pd.Series(predict_colors_list, index=predict_label_list), columns=["predicted"]
+    )
+
+    default_kwargs_dict = {
+        "row_cluster": False,
+        "col_cluster": False,
+        "row_colors": label_colors,
+        "col_colors": predict_colors,
+        "linewidths": 0,
+        "xticklabels": False,
+        "yticklabels": False,
+        "center": 0,
+        "cmap": "vlag",
+    }
+
+    if kwargs_dict is not None:
+        default_kwargs_dict.update(kwargs_dict)
+    g = sns.clustermap(predictions_df, **default_kwargs_dict)
+
+    plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")
+
+    for label_color in list(label_color_dict.keys()):
+        g.ax_col_dendrogram.bar(
+            0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0
+        )
+
+    g.ax_col_dendrogram.legend(
+        title=f"{title}",
+        loc="lower center",
+        ncol=4,
+        bbox_to_anchor=(0.5, 1),
+        facecolor="white",
+    )
+
+    output_file = (Path(output_dir) / f"{output_prefix}_pred").with_suffix(".pdf")
+    plt.savefig(output_file, bbox_inches="tight")
+
+def make_colorbar(embs_df, label):
+    labels = list(embs_df[label])
+
+    cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
+    label_colors = pd.DataFrame(cell_type_colors, columns=[label])
+
+    # create dictionary for colors and classes
+    label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
+    return label_colors, label_color_dict
+
+def gen_heatmap_class_colors(labels, df):
+    pal = sns.cubehelix_palette(
+        len(Counter(labels).keys()),
+        light=0.9,
+        dark=0.1,
+        hue=1,
+        reverse=True,
+        start=1,
+        rot=-2,
+    )
+    lut = dict(zip(map(str, Counter(labels).keys()), pal))
+    colors = pd.Series(labels, index=df.index).map(lut)
+    return colors
+
+def gen_heatmap_class_dict(classes, label_colors_series):
+    class_color_dict_df = pd.DataFrame(
+        {"classes": classes, "color": label_colors_series}
+    )
+    class_color_dict_df = class_color_dict_df.drop_duplicates(subset=["classes"])
+    return dict(zip(class_color_dict_df["classes"], class_color_dict_df["color"]))
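`gen_heatmap_class_dict` relies on `drop_duplicates` keeping the first row per class, so each class maps to the first color seen for it. A pure-python sketch of that keep-first behavior (illustrative class and color names, not from the dataset):

```python
def first_color_per_class(classes, colors):
    # keep the first color encountered for each class, mirroring
    # DataFrame.drop_duplicates(subset=["classes"]) which keeps the first row
    mapping = {}
    for cls, color in zip(classes, colors):
        mapping.setdefault(cls, color)
    return mapping

print(first_color_per_class(["dcm", "hcm", "dcm"], ["red", "blue", "green"]))
# -> {'dcm': 'red', 'hcm': 'blue'}
```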
+
+
+for i in range(iter_step):
+
+    model_directory = "model path"
+
+    corpus_dir = "Pretrain_data"
+    with open(corpus_dir + "/token_dictionary.pkl", "rb") as fp:
+        gene_token_dict = pickle.load(fp)
+    token_gene_dict = {v: k for k, v in gene_token_dict.items()}
+
+    filter_data_dict = {"cell_type": ["Cardiomyocyte1", "Cardiomyocyte2", "Cardiomyocyte3"]}
+    training_args = {
+        "num_train_epochs": 0.9,
+        "learning_rate": 0.000804,
+        "lr_scheduler_type": "polynomial",
+        "warmup_steps": 1812,
+        "weight_decay": 0.258828,
+        "per_device_train_batch_size": 12,
+        "seed": 73,
+    }
+
+    cell_state_dict = {"state_key": "disease", "states": "all"}
+    classifier = "cell"
+    filter_data = filter_data_dict
+    split_sizes = {"train": 0.8, "valid": 0.1, "test": 0.1}
+    train_size = split_sizes["train"]
+    valid_size = split_sizes["valid"]
+    oos_test_size = split_sizes["test"]
+    max_ncells = None
+    freeze_layers = 2
+    num_crossval_splits = 1
+    forward_batch_size = 200
+    nproc = 16
+    rare_threshold = 0
+    quantize = None
+
+    train_ids = ["1447", "1600", "1462", "1558", "1300", "1508", "1358", "1678", "1561", "1304", "1610", "1430", "1472", "1707", "1726", "1504", "1425", "1617", "1631", "1735", "1582", "1722", "1622", "1630", "1290", "1479", "1371", "1549", "1515"]
+    eval_ids = ["1422", "1510", "1539", "1606", "1702"]
+    test_ids = ["1437", "1516", "1602", "1685", "1718"]
+
+    train_test_id_split_dict = {"attr_key": "individual",
+                                "train": train_ids + eval_ids,
+                                "test": test_ids}
+    train_valid_id_split_dict = {"attr_key": "individual",
+                                 "train": train_ids,
+                                 "eval": eval_ids}
+
+    # define output directory path
+    current_date = datetime.datetime.now()
+    datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}{current_date.strftime('%X').replace(':','')}"
+    datestamp_min = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
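The timestamp above concatenates several pieces and relies on the locale-dependent `%X` format. The same `YYMMDDHHMMSS` stamp can be produced with one explicit `strftime` pattern; this is a sketch using `%H%M%S` in place of `%X` to avoid locale surprises, not a change to the script:

```python
import datetime

def stamp(d):
    # locale-independent equivalent of the pieced-together datestamp
    return d.strftime("%y%m%d%H%M%S")

d = datetime.datetime(2024, 3, 5, 14, 30, 9)
print(stamp(d))  # -> 240305143009
```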
+    output_directory = "output path"
+
+    if output_directory[-1:] != "/":  # add slash for dir if not present
+        output_directory = output_directory + "/"
+    output_dir = f"{output_directory}{datestamp}_geneformer_diseaseClassifier/"
+    output_prefix = "cm_classifier_test"
+    # os.makedirs is sufficient and portable; a shell `mkdir` call would be redundant
+    os.makedirs(output_dir, exist_ok=True)
+
+    prepare_data(
+        input_data_file="example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset",
+        output_directory=output_dir,
+        output_prefix=output_prefix,
+        split_id_dict=train_test_id_split_dict,
+    )
+
+    with open(f"{output_dir}/{output_prefix}_id_class_dict.pkl", "rb") as f:
+        id_class_dict = pickle.load(f)
+    class_id_dict = {v: k for k, v in id_class_dict.items()}
+
+    num_classes = get_num_classes(id_class_dict)
+
+    data = load_and_filter(None, nproc, f"{output_dir}/{output_prefix}_labeled_train.dataset")
+    data = data.shuffle(seed=42)
+
+    ##### (Cross-)validate the model #####
+    results = []
+    all_conf_mat = np.zeros((num_classes, num_classes))
+    iteration_num = 1
+    split_id_dict = train_valid_id_split_dict
+
+    for i in trange(num_crossval_splits):
+        print(
+            f"****** Validation split: {iteration_num}/{num_crossval_splits} ******\n"
+        )
+        ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
+        if num_crossval_splits == 1:
+            # single 1-eval_size:eval_size split
+            if split_id_dict is not None:
+                data_dict = dict()
+                data_dict["train"] = filter_by_dict(
+                    data,
+                    {split_id_dict["attr_key"]: split_id_dict["train"]},
+                    nproc,
+                )
+                data_dict["test"] = filter_by_dict(
+                    data,
+                    {split_id_dict["attr_key"]: split_id_dict["eval"]},
+                    nproc,
+                )
+                train_data = data_dict["train"]
+                eval_data = data_dict["test"]
+
+        trainer = train_classifier(
+            model_directory,
+            num_classes,
+            train_data,
+            eval_data,
+            ksplit_output_dir,
+        )
+
+        result = evaluate_model(
+            trainer.model,
+            num_classes,
+            id_class_dict,
+            eval_data,
+            True,
+            ksplit_output_dir,
+            output_prefix,
+        )
+        results += [result]
+        all_conf_mat = all_conf_mat + result["conf_mat"]
+        iteration_num = iteration_num + 1
+
+    all_conf_mat_df = pd.DataFrame(
+        all_conf_mat, columns=id_class_dict.values(), index=id_class_dict.values()
+    )
+    all_metrics = {
+        "conf_matrix": all_conf_mat_df,
+        "macro_f1": [result["macro_f1"] for result in results],
+        "acc": [result["acc"] for result in results],
+    }
+    all_roc_metrics = None  # roc metrics not reported for multiclass
+    if num_classes == 2:
+        mean_fpr = np.linspace(0, 1, 100)
+        all_tpr = [result["roc_metrics"]["interp_tpr"] for result in results]
+        all_roc_auc = [result["roc_metrics"]["auc"] for result in results]
+        all_tpr_wt = [result["roc_metrics"]["tpr_wt"] for result in results]
+        mean_tpr, roc_auc, roc_auc_sd = eu.get_cross_valid_roc_metrics(
+            all_tpr, all_roc_auc, all_tpr_wt
+        )
+        all_roc_metrics = {
+            "mean_tpr": mean_tpr,
+            "mean_fpr": mean_fpr,
+            "all_roc_auc": all_roc_auc,
+            "roc_auc": roc_auc,
+            "roc_auc_sd": roc_auc_sd,
+        }
+    all_metrics["all_roc_metrics"] = all_roc_metrics
+    save_eval_output = True
+    if save_eval_output is True:
+        eval_metrics_output_path = (
+            Path(output_dir) / "cm_classifier_test_eval_metrics_dict"
+        ).with_suffix(".pkl")
+        with open(eval_metrics_output_path, "wb") as f:
+            pickle.dump(all_metrics, f)
+
+    datestamp_min = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}_{current_date.strftime('%X').replace(':','')}"
+    all_metrics_test = evaluate_saved_model(
+        model_directory=f"{output_dir}/ksplit1/",
+        id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
+        test_data_file=f"{output_dir}/{output_prefix}_labeled_test.dataset",
+        output_directory=output_dir,
+        output_prefix=output_prefix,
+    )
+
+    macro_f1_list.append(all_metrics_test["macro_f1"])
+    acc_list.append(all_metrics_test["acc"])
+
+
+print("Macro F1: ", macro_f1_list)
+print("Accuracy: ", acc_list)
Downstream_tasks/Classification/Cardio_ML.ipynb ADDED
@@ -0,0 +1,1404 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import numpy as np\n",
11
+ "from tqdm.auto import tqdm, trange\n",
12
+ "GPU_NUMBER = [0]\n",
13
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \",\".join([str(s) for s in GPU_NUMBER])\n",
14
+ "os.environ[\"NCCL_DEBUG\"] = \"INFO\"\n",
15
+ "\n",
16
+ "# imports\n",
17
+ "from collections import Counter\n",
18
+ "import datetime\n",
19
+ "import pickle\n",
20
+ "import subprocess\n",
21
+ "import seaborn as sns; sns.set()\n",
22
+ "from datasets import load_from_disk\n",
23
+ "from sklearn.metrics import accuracy_score, f1_score\n",
24
+ "from transformers import BertForSequenceClassification, BertForMaskedLM, BertForTokenClassification\n",
25
+ "from transformers import Trainer\n",
26
+ "from transformers.training_args import TrainingArguments\n",
27
+ "import torch\n",
28
+ "import pandas as pd\n",
29
+ "from datasets.utils.logging import disable_progress_bar, enable_progress_bar\n",
30
+ "from sklearn import preprocessing\n",
31
+ "from sklearn.metrics import (\n",
32
+ " ConfusionMatrixDisplay,\n",
33
+ " accuracy_score,\n",
34
+ " auc,\n",
35
+ " confusion_matrix,\n",
36
+ " f1_score,\n",
37
+ " roc_curve,\n",
38
+ ")\n",
39
+ "from pathlib import Path\n",
40
+ "import matplotlib.pyplot as plt\n",
41
+ "\n",
42
+ "import sys\n",
43
+ "# sys.path.append('geneformer')\n",
44
+ "from geneformer import DataCollatorForCellClassification\n",
45
+ "\n",
46
+ "macro_f1_list = []\n",
47
+ "acc_list = []\n",
48
+ "\n",
49
+ "iter_step = 2\n",
50
+ "\n",
51
+ "def prepare_data(\n",
52
+ " input_data_file,\n",
53
+ " output_directory,\n",
54
+ " output_prefix,\n",
55
+ " split_id_dict=None,\n",
56
+ " test_size=None,\n",
57
+ " attr_to_split=None,\n",
58
+ " attr_to_balance=None,\n",
59
+ " max_trials=100,\n",
60
+ " pval_threshold=0.1,\n",
61
+ "):\n",
62
+ " \"\"\"\n",
63
+ " Prepare data for cell state or gene classification.\n",
64
+ "\n",
65
+ " **Parameters**\n",
66
+ "\n",
67
+ " input_data_file : Path\n",
68
+ " | Path to directory containing .dataset input\n",
69
+ " output_directory : Path\n",
70
+ " | Path to directory where prepared data will be saved\n",
71
+ " output_prefix : str\n",
72
+ " | Prefix for output file\n",
73
+ " split_id_dict : None, dict\n",
74
+ " | Dictionary of IDs for train and test splits\n",
75
+ " | Three-item dictionary with keys: attr_key, train, test\n",
76
+ " | attr_key: key specifying name of column in .dataset that contains the IDs for the data splits\n",
77
+ " | train: list of IDs in the attr_key column to include in the train split\n",
78
+ " | test: list of IDs in the attr_key column to include in the test split\n",
79
+ " | For example: {\"attr_key\": \"individual\",\n",
80
+ " | \"train\": [\"patient1\", \"patient2\", \"patient3\", \"patient4\"],\n",
81
+ " | \"test\": [\"patient5\", \"patient6\"]}\n",
82
+ " test_size : None, float\n",
83
+ " | Proportion of data to be saved separately and held out for test set\n",
84
+ " | (e.g. 0.2 if intending to hold out 20%)\n",
85
+ " | If None, will inherit from split_sizes[\"test\"] from Classifier\n",
86
+ " | The training set will be further split to train / validation in self.validate\n",
87
+ " | Note: only available for CellClassifiers\n",
88
+ " attr_to_split : None, str\n",
89
+ " | Key for attribute on which to split data while balancing potential confounders\n",
90
+ " | e.g. \"patient_id\" for splitting by patient while balancing other characteristics\n",
91
+ " | Note: only available for CellClassifiers\n",
92
+ " attr_to_balance : None, list\n",
93
+ " | List of attribute keys on which to balance data while splitting on attr_to_split\n",
94
+ " | e.g. [\"age\", \"sex\"] for balancing these characteristics while splitting by patient\n",
95
+ " | Note: only available for CellClassifiers\n",
96
+ " max_trials : None, int\n",
97
+ " | Maximum number of trials of random splitting to try to achieve balanced other attributes\n",
98
+ " | If no split is found without significant (p<0.05) differences in other attributes, will select best\n",
99
+ " | Note: only available for CellClassifiers\n",
100
+ " pval_threshold : None, float\n",
101
+ " | P-value threshold to use for attribute balancing across splits\n",
102
+ " | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance\n",
103
+ " \"\"\"\n",
104
+ "\n",
105
+ " if test_size is None:\n",
106
+ " test_size = oos_test_size\n",
107
+ "\n",
108
+ " # prepare data and labels for classification\n",
109
+ " data = load_and_filter(filter_data, nproc, input_data_file)\n",
110
+ "\n",
111
+ " if classifier == \"cell\":\n",
112
+ " if \"label\" in data.features:\n",
113
+ " logger.error(\n",
114
+ " \"Column name 'label' must be reserved for class IDs. Please rename column.\"\n",
115
+ " )\n",
116
+ " raise\n",
117
+ " elif classifier == \"gene\":\n",
118
+ " if \"labels\" in data.features:\n",
119
+ " logger.error(\n",
120
+ " \"Column name 'labels' must be reserved for class IDs. Please rename column.\"\n",
121
+ " )\n",
122
+ " raise\n",
123
+ "\n",
124
+ " if (attr_to_split is not None) and (attr_to_balance is None):\n",
125
+ " logger.error(\n",
126
+ " \"Splitting by attribute while balancing confounders requires both attr_to_split and attr_to_balance to be defined.\"\n",
127
+ " )\n",
128
+ " raise\n",
129
+ "\n",
130
+ " if not isinstance(attr_to_balance, list):\n",
131
+ " attr_to_balance = [attr_to_balance]\n",
132
+ "\n",
133
+ " if classifier == \"cell\":\n",
134
+ " # remove cell states representing < rare_threshold of cells\n",
135
+ " data = remove_rare(\n",
136
+ " data, rare_threshold, cell_state_dict[\"state_key\"], nproc\n",
137
+ " )\n",
138
+ " # downsample max cells and max per class\n",
139
+ " data = downsample_and_shuffle(\n",
140
+ " data, max_ncells, None, cell_state_dict\n",
141
+ " )\n",
142
+ " # rename cell state column to \"label\"\n",
143
+ " data = rename_cols(data, cell_state_dict[\"state_key\"])\n",
144
+ "\n",
145
+ " # convert classes to numerical labels and save as id_class_dict\n",
146
+ " # of note, will label all genes in gene_class_dict\n",
147
+ " # if (cross-)validating, genes will be relabeled in column \"labels\" for each split\n",
148
+ " # at the time of training with Classifier.validate\n",
149
+ " data, id_class_dict = label_classes(\n",
150
+ " classifier, data, None, nproc\n",
151
+ " )\n",
152
+ "\n",
153
+ " # save id_class_dict for future reference\n",
154
+ " id_class_output_path = (\n",
155
+ " Path(output_directory) / f\"{output_prefix}_id_class_dict\"\n",
156
+ " ).with_suffix(\".pkl\")\n",
157
+ " with open(id_class_output_path, \"wb\") as f:\n",
158
+ " pickle.dump(id_class_dict, f)\n",
159
+ "\n",
160
+ " if split_id_dict is not None:\n",
161
+ " data_dict = dict()\n",
162
+ " data_dict[\"train\"] = filter_by_dict(\n",
163
+ " data, {split_id_dict[\"attr_key\"]: split_id_dict[\"train\"]}, nproc\n",
164
+ " )\n",
165
+ " data_dict[\"test\"] = filter_by_dict(\n",
166
+ " data, {split_id_dict[\"attr_key\"]: split_id_dict[\"test\"]}, nproc\n",
167
+ " )\n",
168
+ " train_data_output_path = (\n",
169
+ " Path(output_directory) / f\"{output_prefix}_labeled_train\"\n",
170
+ " ).with_suffix(\".dataset\")\n",
171
+ " test_data_output_path = (\n",
172
+ " Path(output_directory) / f\"{output_prefix}_labeled_test\"\n",
173
+ " ).with_suffix(\".dataset\")\n",
174
+ " data_dict[\"train\"].save_to_disk(str(train_data_output_path))\n",
175
+ " data_dict[\"test\"].save_to_disk(str(test_data_output_path))\n",
176
+ " elif (test_size is not None) and (classifier == \"cell\"):\n",
177
+ " if 1 > test_size > 0:\n",
178
+ " if attr_to_split is None:\n",
179
+ " data_dict = data.train_test_split(\n",
180
+ " test_size=test_size,\n",
181
+ " stratify_by_column=None,\n",
182
+ " seed=42,\n",
183
+ " )\n",
184
+ " train_data_output_path = (\n",
185
+ " Path(output_directory) / f\"{output_prefix}_labeled_train\"\n",
186
+ " ).with_suffix(\".dataset\")\n",
187
+ " test_data_output_path = (\n",
188
+ " Path(output_directory) / f\"{output_prefix}_labeled_test\"\n",
189
+ " ).with_suffix(\".dataset\")\n",
190
+ " data_dict[\"train\"].save_to_disk(str(train_data_output_path))\n",
191
+ " data_dict[\"test\"].save_to_disk(str(test_data_output_path))\n",
192
+ " else:\n",
193
+ " data_dict, balance_df = cu.balance_attr_splits(\n",
194
+ " data,\n",
195
+ " attr_to_split,\n",
196
+ " attr_to_balance,\n",
197
+ " test_size,\n",
198
+ " max_trials,\n",
199
+ " pval_threshold,\n",
200
+ " cell_state_dict[\"state_key\"],\n",
201
+ " nproc,\n",
202
+ " )\n",
203
+ " balance_df.to_csv(\n",
204
+ " f\"{output_directory}/{output_prefix}_train_test_balance_df.csv\"\n",
205
+ " )\n",
206
+ " train_data_output_path = (\n",
207
+ " Path(output_directory) / f\"{output_prefix}_labeled_train\"\n",
208
+ " ).with_suffix(\".dataset\")\n",
209
+ " test_data_output_path = (\n",
210
+ " Path(output_directory) / f\"{output_prefix}_labeled_test\"\n",
211
+ " ).with_suffix(\".dataset\")\n",
212
+ " data_dict[\"train\"].save_to_disk(str(train_data_output_path))\n",
213
+ " data_dict[\"test\"].save_to_disk(str(test_data_output_path))\n",
214
+ " else:\n",
215
+ " data_output_path = (\n",
216
+ " Path(output_directory) / f\"{output_prefix}_labeled\"\n",
217
+ " ).with_suffix(\".dataset\")\n",
218
+ " data.save_to_disk(str(data_output_path))\n",
219
+ " print(data_output_path)\n",
220
+ " else:\n",
221
+ " data_output_path = (\n",
222
+ " Path(output_directory) / f\"{output_prefix}_labeled\"\n",
223
+ " ).with_suffix(\".dataset\")\n",
224
+ " data.save_to_disk(str(data_output_path))\n",
225
+ "\n",
226
+ "def load_and_filter(filter_data, nproc, input_data_file):\n",
227
+ " data = load_from_disk(input_data_file)\n",
228
+ " if filter_data is not None:\n",
229
+ " data = filter_by_dict(data, filter_data, nproc)\n",
230
+ " return data\n",
231
+ "# get number of classes for classifier\n",
232
+ "def get_num_classes(id_class_dict):\n",
233
+ " return len(set(id_class_dict.values()))\n",
234
+ "\n",
235
+ "def filter_by_dict(data, filter_data, nproc):\n",
236
+ " for key, value in filter_data.items():\n",
237
+ "\n",
238
+ " def filter_data_by_criteria(example):\n",
239
+ " return example[key] in value\n",
240
+ "\n",
241
+ " data = data.filter(filter_data_by_criteria, num_proc=nproc)\n",
242
+ " if len(data) == 0:\n",
243
+ " logger.error(\"No cells remain after filtering. Check filtering criteria.\")\n",
244
+ " raise\n",
245
+ " return data\n",
246
+ "def remove_rare(data, rare_threshold, label, nproc):\n",
247
+ " if rare_threshold > 0:\n",
248
+ " total_cells = len(data)\n",
249
+ " label_counter = Counter(data[label])\n",
250
+ " nonrare_label_dict = {\n",
251
+ " label: [k for k, v in label_counter.items() if (v / total_cells) > rare_threshold]\n",
252
+ " }\n",
253
+ " data = filter_by_dict(data, nonrare_label_dict, nproc)\n",
254
+ " return data\n",
255
+ "def downsample_and_shuffle(data, max_ncells, max_ncells_per_class, cell_state_dict):\n",
256
+ " data = data.shuffle(seed=42)\n",
257
+ " num_cells = len(data)\n",
258
+ " # if max number of cells is defined, then subsample to this max number\n",
259
+ " if max_ncells is not None:\n",
260
+ " if num_cells > max_ncells:\n",
261
+ " data = data.select([i for i in range(max_ncells)])\n",
262
+ " if max_ncells_per_class is not None:\n",
263
+ " class_labels = data[cell_state_dict[\"state_key\"]]\n",
264
+ " random.seed(42)\n",
265
+ " subsample_indices = subsample_by_class(class_labels, max_ncells_per_class)\n",
266
+ " data = data.select(subsample_indices)\n",
267
+ " return data\n",
268
+ "def rename_cols(data, state_key):\n",
269
+ " data = data.rename_column(state_key, \"label\")\n",
270
+ " return data\n",
271
+ "def label_classes(classifier, data, gene_class_dict, nproc):\n",
272
+ " if classifier == \"cell\":\n",
273
+ " label_set = set(data[\"label\"])\n",
274
+ " elif classifier == \"gene\":\n",
275
+ " # remove cells without any of the target genes\n",
276
+ " def if_contains_label(example):\n",
277
+ " a = pu.flatten_list(gene_class_dict.values())\n",
278
+ " b = example[\"input_ids\"]\n",
279
+ " return not set(a).isdisjoint(b)\n",
280
+ "\n",
281
+ " data = data.filter(if_contains_label, num_proc=nproc)\n",
282
+ " label_set = gene_class_dict.keys()\n",
283
+ "\n",
284
+ " if len(data) == 0:\n",
285
+ " logger.error(\n",
286
+ " \"No cells remain after filtering for target genes. Check target gene list.\"\n",
287
+ " )\n",
288
+ " raise\n",
289
+ "\n",
290
+ " class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))\n",
291
+ " id_class_dict = {v: k for k, v in class_id_dict.items()}\n",
292
+ "\n",
293
+ " def classes_to_ids(example):\n",
294
+ " if classifier == \"cell\":\n",
295
+ " example[\"label\"] = class_id_dict[example[\"label\"]]\n",
296
+ " elif classifier == \"gene\":\n",
297
+ " example[\"labels\"] = label_gene_classes(\n",
298
+ " example, class_id_dict, gene_class_dict\n",
299
+ " )\n",
300
+ " return example\n",
301
+ "\n",
302
+ " data = data.map(classes_to_ids, num_proc=nproc)\n",
303
+ " return data, id_class_dict\n",
304
+ "\n",
305
+ "def train_classifier(\n",
306
+ " model_directory,\n",
307
+ " num_classes,\n",
308
+ " train_data,\n",
309
+ " eval_data,\n",
310
+ " output_directory,\n",
311
+ " predict=False,\n",
312
+ " classifier='cell',\n",
313
+ " no_eval=False,\n",
314
+ " quantize = False,\n",
315
+ " freeze_layers=2,\n",
+ " training_args=None,\n",
316
+ " ):\n",
317
+ " \"\"\"\n",
318
+ " Fine-tune model for cell state or gene classification.\n",
319
+ "\n",
320
+ " **Parameters**\n",
321
+ "\n",
322
+ " model_directory : Path\n",
323
+ " | Path to directory containing model\n",
324
+ " num_classes : int\n",
325
+ " | Number of classes for classifier\n",
326
+ " train_data : Dataset\n",
327
+ " | Loaded training .dataset input\n",
328
+ " | For cell classifier, labels in column \"label\".\n",
329
+ " | For gene classifier, labels in column \"labels\".\n",
330
+ " eval_data : None, Dataset\n",
331
+ " | (Optional) Loaded evaluation .dataset input\n",
332
+ " | For cell classifier, labels in column \"label\".\n",
333
+ " | For gene classifier, labels in column \"labels\".\n",
334
+ " output_directory : Path\n",
335
+ " | Path to directory where fine-tuned model will be saved\n",
336
+ " predict : bool\n",
337
+ " | Whether or not to save eval predictions from trainer\n",
338
+ " \"\"\"\n",
339
+ "\n",
340
+ " ##### Validate and prepare data #####\n",
341
+ " train_data, eval_data = validate_and_clean_cols(\n",
342
+ " train_data, eval_data, classifier\n",
343
+ " )\n",
344
+ " \n",
345
+ " if (no_eval is True) and (eval_data is not None):\n",
346
+ " logger.warning(\n",
347
+ " \"no_eval set to True; model will be trained without evaluation.\"\n",
348
+ " )\n",
349
+ " eval_data = None\n",
350
+ "\n",
351
+ " if (classifier == \"gene\") and (predict is True):\n",
352
+ " logger.warning(\n",
353
+ " \"Predictions during training not currently available for gene classifiers; setting predict to False.\"\n",
354
+ " )\n",
355
+ " predict = False\n",
356
+ "\n",
357
+ " # ensure not overwriting previously saved model\n",
358
+ " saved_model_test = os.path.join(output_directory, \"pytorch_model.bin\")\n",
359
+ " if os.path.isfile(saved_model_test) is True:\n",
360
+ " logger.error(\"Model already saved to this designated output directory.\")\n",
361
+ " raise\n",
362
+ " # make output directory\n",
363
+ " # subprocess.call(f\"mkdir {output_directory}\", shell=True)\n",
364
+ " os.makedirs(output_directory, exist_ok=True)\n",
365
+ "\n",
366
+ " ##### Load model and training args #####\n",
367
+ " model = load_model(\n",
368
+ " \"CellClassifier\",\n",
369
+ " num_classes,\n",
370
+ " model_directory,\n",
371
+ " \"train\",\n",
372
+ " quantize=quantize,\n",
373
+ " )\n",
374
+ " def_training_args, def_freeze_layers = get_default_train_args(\n",
375
+ " model, classifier, train_data, output_directory\n",
376
+ " )\n",
377
+ "\n",
378
+ " if training_args is not None:\n",
379
+ " def_training_args.update(training_args)\n",
380
+ " logging_steps = round(\n",
381
+ " len(train_data) / def_training_args[\"per_device_train_batch_size\"] / 10\n",
382
+ " )\n",
383
+ " def_training_args[\"logging_steps\"] = logging_steps\n",
384
+ " def_training_args[\"output_dir\"] = output_directory\n",
385
+ " if eval_data is None:\n",
386
+ " def_training_args[\"evaluation_strategy\"] = \"no\"\n",
387
+ " def_training_args[\"load_best_model_at_end\"] = False\n",
388
+ " training_args_init = TrainingArguments(**def_training_args)\n",
389
+ "\n",
390
+ " if freeze_layers is not None:\n",
391
+ " def_freeze_layers = freeze_layers\n",
392
+ "\n",
393
+ " if def_freeze_layers > 0:\n",
394
+ " modules_to_freeze = model.bert.encoder.layer[:def_freeze_layers]\n",
395
+ " for module in modules_to_freeze:\n",
396
+ " for param in module.parameters():\n",
397
+ " param.requires_grad = False\n",
398
+ "\n",
399
+ " ##### Fine-tune the model #####\n",
400
+ " # define the data collator\n",
401
+ " if classifier == \"cell\":\n",
402
+ " data_collator = DataCollatorForCellClassification()\n",
403
+ " elif classifier == \"gene\":\n",
404
+ " data_collator = DataCollatorForGeneClassification()\n",
405
+ "\n",
406
+ " # create the trainer\n",
407
+ " trainer = Trainer(\n",
408
+ " model=model,\n",
409
+ " args=training_args_init,\n",
410
+ " data_collator=data_collator,\n",
411
+ " train_dataset=train_data,\n",
412
+ " eval_dataset=eval_data,\n",
413
+ " compute_metrics=compute_metrics,\n",
414
+ " )\n",
415
+ "\n",
416
+ " # train the classifier\n",
417
+ " trainer.train()\n",
418
+ " trainer.save_model(output_directory)\n",
419
+ " if predict is True:\n",
420
+ " # make eval predictions and save predictions and metrics\n",
421
+ " predictions = trainer.predict(eval_data)\n",
422
+ " prediction_output_path = f\"{output_directory}/predictions.pkl\"\n",
423
+ " with open(prediction_output_path, \"wb\") as f:\n",
424
+ " pickle.dump(predictions, f)\n",
425
+ " trainer.save_metrics(\"eval\", predictions.metrics)\n",
426
+ " return trainer\n",
427
+ " \n",
428
+ "def validate_and_clean_cols(train_data, eval_data, classifier):\n",
429
+ " # validate that data has expected label column and remove others\n",
430
+ " if classifier == \"cell\":\n",
431
+ " label_col = \"label\"\n",
432
+ " elif classifier == \"gene\":\n",
433
+ " label_col = \"labels\"\n",
434
+ "\n",
435
+ " cols_to_keep = [label_col] + [\"input_ids\", \"length\"]\n",
436
+ " if label_col not in train_data.column_names:\n",
437
+ " logger.error(f\"train_data must contain column {label_col} with class labels.\")\n",
438
+ " raise\n",
439
+ " else:\n",
440
+ " train_data = remove_cols(train_data, cols_to_keep)\n",
441
+ "\n",
442
+ " if eval_data is not None:\n",
443
+ " if label_col not in eval_data.column_names:\n",
444
+ " logger.error(\n",
445
+ " f\"eval_data must contain column {label_col} with class labels.\"\n",
446
+ " )\n",
447
+ " raise\n",
448
+ " else:\n",
449
+ " eval_data = remove_cols(eval_data, cols_to_keep)\n",
450
+ " return train_data, eval_data\n",
451
+ " \n",
452
+ "def remove_cols(data, cols_to_keep):\n",
453
+ " other_cols = list(data.features.keys())\n",
454
+ " other_cols = [ele for ele in other_cols if ele not in cols_to_keep]\n",
455
+ " data = data.remove_columns(other_cols)\n",
456
+ " return data\n",
457
+ "\n",
458
+ "def load_model(model_type, num_classes, model_directory, mode, quantize=False):\n",
459
+ " if model_type == \"MTLCellClassifier-Quantized\":\n",
460
+ " model_type = \"MTLCellClassifier\"\n",
461
+ " quantize = True\n",
462
+ "\n",
463
+ " output_hidden_states = (mode == \"eval\")\n",
464
+ "\n",
465
+ " # Quantization logic\n",
466
+ " if quantize:\n",
+ " # these imports are only needed for quantized / LoRA models\n",
+ " from transformers import BitsAndBytesConfig\n",
+ " from peft import LoraConfig, get_peft_model\n",
467
+ " if model_type == \"MTLCellClassifier\":\n",
468
+ " quantize_config = BitsAndBytesConfig(load_in_8bit=True)\n",
469
+ " peft_config = None\n",
470
+ " else:\n",
471
+ " quantize_config = BitsAndBytesConfig(\n",
472
+ " load_in_4bit=True,\n",
473
+ " bnb_4bit_use_double_quant=True,\n",
474
+ " bnb_4bit_quant_type=\"nf4\",\n",
475
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
476
+ " )\n",
477
+ " peft_config = LoraConfig(\n",
478
+ " lora_alpha=128,\n",
479
+ " lora_dropout=0.1,\n",
480
+ " r=64,\n",
481
+ " bias=\"none\",\n",
482
+ " task_type=\"TokenClassification\",\n",
483
+ " )\n",
484
+ " else:\n",
485
+ " quantize_config = None\n",
486
+ " peft_config = None\n",
487
+ "\n",
488
+ " # Model class selection\n",
489
+ " model_classes = {\n",
490
+ " \"Pretrained\": BertForMaskedLM,\n",
491
+ " \"GeneClassifier\": BertForTokenClassification,\n",
492
+ " \"CellClassifier\": BertForSequenceClassification,\n",
493
+ " \"MTLCellClassifier\": BertForMaskedLM\n",
494
+ " }\n",
495
+ "\n",
496
+ " model_class = model_classes.get(model_type)\n",
497
+ " if not model_class:\n",
498
+ " raise ValueError(f\"Unknown model type: {model_type}\")\n",
499
+ "\n",
500
+ " # Model loading\n",
501
+ " model_args = {\n",
502
+ " \"pretrained_model_name_or_path\": model_directory,\n",
503
+ " \"output_hidden_states\": output_hidden_states,\n",
504
+ " \"output_attentions\": False,\n",
505
+ " }\n",
506
+ "\n",
507
+ " if model_type != \"Pretrained\":\n",
508
+ " model_args[\"num_labels\"] = num_classes\n",
509
+ "\n",
510
+ " if quantize_config:\n",
511
+ " model_args[\"quantization_config\"] = quantize_config\n",
512
+ " \n",
513
+ " # Load the model\n",
514
+ " model = model_class.from_pretrained(**model_args)\n",
515
+ " ###########################\n",
516
+ "\n",
517
+ " if mode == \"eval\":\n",
518
+ " model.eval()\n",
519
+ "\n",
520
+ " # Handle device placement and PEFT\n",
521
+ " if not quantize:\n",
522
+ " # Only move non-quantized models\n",
523
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
524
+ " model = model.to(device)\n",
525
+ " elif peft_config:\n",
526
+ " # Apply PEFT for quantized models (except MTLCellClassifier)\n",
527
+ " model.enable_input_require_grads()\n",
528
+ " model = get_peft_model(model, peft_config)\n",
529
+ "\n",
530
+ " return model\n",
531
+ "\n",
532
+ "def get_default_train_args(model, classifier, data, output_dir):\n",
533
+ " num_layers = quant_layers(model)\n",
534
+ " freeze_layers_get = 0\n",
535
+ " batch_size = 12\n",
536
+ " if classifier == \"cell\":\n",
537
+ " epochs = 10\n",
538
+ " evaluation_strategy = \"epoch\"\n",
539
+ " load_best_model_at_end = True\n",
540
+ " else:\n",
541
+ " epochs = 1\n",
542
+ " evaluation_strategy = \"no\"\n",
543
+ " load_best_model_at_end = False\n",
544
+ "\n",
545
+ " if num_layers == 6:\n",
546
+ " default_training_args = {\n",
547
+ " \"learning_rate\": 5e-5,\n",
548
+ " \"lr_scheduler_type\": \"linear\",\n",
549
+ " \"warmup_steps\": 500,\n",
550
+ " \"per_device_train_batch_size\": batch_size,\n",
551
+ " \"per_device_eval_batch_size\": batch_size,\n",
552
+ " }\n",
553
+ " else:\n",
554
+ " default_training_args = {\n",
555
+ " \"per_device_train_batch_size\": batch_size,\n",
556
+ " \"per_device_eval_batch_size\": batch_size,\n",
557
+ " }\n",
558
+ "\n",
559
+ " training_args = {\n",
560
+ " \"num_train_epochs\": epochs,\n",
561
+ " \"do_train\": True,\n",
562
+ " \"do_eval\": True,\n",
563
+ " \"evaluation_strategy\": evaluation_strategy,\n",
564
+ " \"logging_steps\": np.floor(len(data) / batch_size / 8), # 8 evals per epoch\n",
565
+ " \"save_strategy\": \"epoch\",\n",
566
+ " \"group_by_length\": False,\n",
567
+ " \"length_column_name\": \"length\",\n",
568
+ " \"disable_tqdm\": False,\n",
569
+ " \"weight_decay\": 0.001,\n",
570
+ " \"load_best_model_at_end\": load_best_model_at_end,\n",
571
+ " }\n",
572
+ " training_args.update(default_training_args)\n",
573
+ "\n",
574
+ " return training_args, freeze_layers_get\n",
575
+ "\n",
576
+ "def quant_layers(model):\n",
577
+ " layer_nums = []\n",
578
+ " for name, parameter in model.named_parameters():\n",
579
+ " if \"layer\" in name:\n",
580
+ " layer_nums += [int(name.split(\"layer.\")[1].split(\".\")[0])]\n",
581
+ " return int(max(layer_nums)) + 1\n",
582
+ "\n",
583
+ "def compute_metrics(pred):\n",
584
+ " labels = pred.label_ids\n",
585
+ " preds = pred.predictions.argmax(-1)\n",
586
+ " # calculate accuracy and macro f1 using sklearn's function\n",
587
+ " acc = accuracy_score(labels, preds)\n",
588
+ " macro_f1 = f1_score(labels, preds, average='macro')\n",
589
+ " weighted_f1 = f1_score(labels, preds, average='weighted')\n",
590
+ " return {\n",
591
+ " 'accuracy': acc,\n",
592
+ " 'macro_f1': macro_f1,\n",
593
+ " 'weighted_f1': weighted_f1\n",
594
+ " }\n",
595
+ "def evaluate_model(\n",
596
+ " model,\n",
597
+ " num_classes,\n",
598
+ " id_class_dict,\n",
599
+ " eval_data,\n",
600
+ " predict=False,\n",
601
+ " output_directory=None,\n",
602
+ " output_prefix=None,\n",
603
+ "):\n",
604
+ " \"\"\"\n",
605
+ " Evaluate the fine-tuned model.\n",
606
+ "\n",
607
+ " **Parameters**\n",
608
+ "\n",
609
+ " model : nn.Module\n",
610
+ " | Loaded fine-tuned model (e.g. trainer.model)\n",
611
+ " num_classes : int\n",
612
+ " | Number of classes for classifier\n",
613
+ " id_class_dict : dict\n",
614
+ " | Loaded _id_class_dict.pkl previously prepared by Classifier.prepare_data\n",
615
+ " | (dictionary of format: numerical IDs: class_labels)\n",
616
+ " eval_data : Dataset\n",
617
+ " | Loaded evaluation .dataset input\n",
618
+ " predict : bool\n",
619
+ " | Whether or not to save eval predictions\n",
620
+ " output_directory : Path\n",
621
+ " | Path to directory where eval data will be saved\n",
622
+ " output_prefix : str\n",
623
+ " | Prefix for output files\n",
624
+ " \"\"\"\n",
625
+ "\n",
626
+ " ##### Evaluate the model #####\n",
627
+ " labels = id_class_dict.keys()\n",
628
+ " y_pred, y_true, logits_list = classifier_predict(\n",
629
+ " model, \"cell\", eval_data, 100  # cell classifier; forward_batch_size=100\n",
630
+ " )\n",
631
+ " conf_mat, macro_f1, acc, roc_metrics = get_metrics(\n",
632
+ " y_pred, y_true, logits_list, num_classes, labels\n",
633
+ " )\n",
634
+ " if predict is True:\n",
635
+ " pred_dict = {\n",
636
+ " \"pred_ids\": y_pred,\n",
637
+ " \"label_ids\": y_true,\n",
638
+ " \"predictions\": logits_list,\n",
639
+ " }\n",
640
+ " pred_dict_output_path = (\n",
641
+ " Path(output_directory) / f\"{output_prefix}_pred_dict\"\n",
642
+ " ).with_suffix(\".pkl\")\n",
643
+ " with open(pred_dict_output_path, \"wb\") as f:\n",
644
+ " pickle.dump(pred_dict, f)\n",
645
+ " return {\n",
646
+ " \"conf_mat\": conf_mat,\n",
647
+ " \"macro_f1\": macro_f1,\n",
648
+ " \"acc\": acc,\n",
649
+ " \"roc_metrics\": roc_metrics,\n",
650
+ " }\n",
651
+ " \n",
652
+ "def classifier_predict(model, classifier_type, evalset, forward_batch_size):\n",
653
+ " if classifier_type == \"gene\":\n",
654
+ " label_name = \"labels\"\n",
655
+ " elif classifier_type == \"cell\":\n",
656
+ " label_name = \"label\"\n",
657
+ "\n",
658
+ " predict_logits = []\n",
659
+ " predict_labels = []\n",
660
+ " model.eval()\n",
661
+ "\n",
662
+ " # ensure there is at least 2 examples in each batch to avoid incorrect tensor dims\n",
663
+ " evalset_len = len(evalset)\n",
664
+ " max_divisible = find_largest_div(evalset_len, forward_batch_size)\n",
665
+ " if len(evalset) - max_divisible == 1:\n",
666
+ " evalset_len = max_divisible\n",
667
+ "\n",
668
+ " max_evalset_len = max(evalset.select([i for i in range(evalset_len)])[\"length\"])\n",
669
+ "\n",
670
+ " disable_progress_bar() # disable progress bar for preprocess_classifier_batch mapping\n",
671
+ " for i in trange(0, evalset_len, forward_batch_size):\n",
672
+ " max_range = min(i + forward_batch_size, evalset_len)\n",
673
+ " batch_evalset = evalset.select([i for i in range(i, max_range)])\n",
674
+ " padded_batch = preprocess_classifier_batch(\n",
675
+ " batch_evalset, max_evalset_len, label_name\n",
676
+ " )\n",
677
+ " padded_batch.set_format(type=\"torch\")\n",
678
+ "\n",
679
+ " input_data_batch = padded_batch[\"input_ids\"]\n",
680
+ " attn_msk_batch = padded_batch[\"attention_mask\"]\n",
681
+ " label_batch = padded_batch[label_name]\n",
682
+ " with torch.no_grad():\n",
683
+ " outputs = model(\n",
684
+ " input_ids=input_data_batch.to(\"cuda\"),\n",
685
+ " attention_mask=attn_msk_batch.to(\"cuda\"),\n",
686
+ " labels=label_batch.to(\"cuda\"),\n",
687
+ " )\n",
688
+ " predict_logits += [torch.squeeze(outputs.logits.to(\"cpu\"))]\n",
689
+ " predict_labels += [torch.squeeze(label_batch.to(\"cpu\"))]\n",
690
+ "\n",
691
+ " enable_progress_bar()\n",
692
+ " logits_by_cell = torch.cat(predict_logits)\n",
693
+ " last_dim = len(logits_by_cell.shape) - 1\n",
694
+ " all_logits = logits_by_cell.reshape(-1, logits_by_cell.shape[last_dim])\n",
695
+ " labels_by_cell = torch.cat(predict_labels)\n",
696
+ " all_labels = torch.flatten(labels_by_cell)\n",
697
+ " logit_label_paired = [\n",
698
+ " item\n",
699
+ " for item in list(zip(all_logits.tolist(), all_labels.tolist()))\n",
700
+ " if item[1] != -100\n",
701
+ " ]\n",
702
+ " y_pred = [vote(item[0]) for item in logit_label_paired]\n",
703
+ " y_true = [item[1] for item in logit_label_paired]\n",
704
+ " logits_list = [item[0] for item in logit_label_paired]\n",
705
+ " return y_pred, y_true, logits_list\n",
706
+ "\n",
707
+ "def find_largest_div(N, K):\n",
708
+ " rem = N % K\n",
709
+ " if rem == 0:\n",
710
+ " return N\n",
711
+ " else:\n",
712
+ " return N - rem\n",
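+ "# worked example (annotation): find_largest_div(1000, 12) returns 996, the largest\n",
+ "# multiple of 12 <= 1000; classifier_predict trims the eval set to this length only\n",
+ "# when a leftover batch of size 1 would otherwise remain.\n",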
713
+ "def preprocess_classifier_batch(cell_batch, max_len, label_name):\n",
714
+ " if max_len is None:\n",
715
+ " max_len = max([len(i) for i in cell_batch[\"input_ids\"]])\n",
716
+ "\n",
717
+ " def pad_label_example(example):\n",
718
+ " example[label_name] = np.pad(\n",
719
+ " example[label_name],\n",
720
+ " (0, max_len - len(example[\"input_ids\"])),\n",
721
+ " mode=\"constant\",\n",
722
+ " constant_values=-100,\n",
723
+ " )\n",
724
+ " example[\"input_ids\"] = np.pad(\n",
725
+ " example[\"input_ids\"],\n",
726
+ " (0, max_len - len(example[\"input_ids\"])),\n",
727
+ " mode=\"constant\",\n",
728
+ " constant_values=gene_token_dict.get(\"<pad>\"),\n",
729
+ " )\n",
730
+ " example[\"attention_mask\"] = (\n",
731
+ " example[\"input_ids\"] != gene_token_dict.get(\"<pad>\")\n",
732
+ " ).astype(int)\n",
733
+ " return example\n",
734
+ "\n",
735
+ " padded_batch = cell_batch.map(pad_label_example)\n",
736
+ " return padded_batch\n",
737
+ "def vote(logit_list):\n",
738
+ " m = max(logit_list)\n",
740
+ " indices = [i for i, x in enumerate(logit_list) if x == m]\n",
741
+ " if len(indices) > 1:\n",
742
+ " return \"tie\"\n",
743
+ " else:\n",
744
+ " return indices[0]\n",
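+ "# worked examples (annotation): vote([0.1, 0.9, 0.3]) returns 1;\n",
+ "# vote([0.5, 0.5]) returns \"tie\" because the top logit is not unique.\n",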
745
+ "def py_softmax(vector):\n",
746
+ " e = np.exp(vector)\n",
747
+ " return e / e.sum()\n",
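+ "# worked example (annotation): py_softmax([0.0, 0.0]) returns array([0.5, 0.5]),\n",
+ "# since np.exp(0) == 1 for both entries and the result is normalized by its sum.\n",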
748
+ "def get_metrics(y_pred, y_true, logits_list, num_classes, labels):\n",
749
+ " conf_mat = confusion_matrix(y_true, y_pred, labels=list(labels))\n",
750
+ " macro_f1 = f1_score(y_true, y_pred, average=\"macro\")\n",
751
+ " acc = accuracy_score(y_true, y_pred)\n",
752
+ " roc_metrics = None # roc metrics not reported for multiclass\n",
753
+ " if num_classes == 2:\n",
754
+ " y_score = [py_softmax(item)[1] for item in logits_list]\n",
755
+ " fpr, tpr, _ = roc_curve(y_true, y_score)\n",
756
+ " mean_fpr = np.linspace(0, 1, 100)\n",
757
+ " interp_tpr = np.interp(mean_fpr, fpr, tpr)\n",
758
+ " interp_tpr[0] = 0.0\n",
759
+ " tpr_wt = len(tpr)\n",
760
+ " roc_auc = auc(fpr, tpr)\n",
761
+ " roc_metrics = {\n",
762
+ " \"fpr\": fpr,\n",
763
+ " \"tpr\": tpr,\n",
764
+ " \"interp_tpr\": interp_tpr,\n",
765
+ " \"auc\": roc_auc,\n",
766
+ " \"tpr_wt\": tpr_wt,\n",
767
+ " }\n",
768
+ " return conf_mat, macro_f1, acc, roc_metrics\n",
769
+ "def evaluate_saved_model(\n",
770
+ " model_directory,\n",
771
+ " id_class_dict_file,\n",
772
+ " test_data_file,\n",
773
+ " output_directory,\n",
774
+ " output_prefix,\n",
775
+ " predict=True,\n",
+ " quantize=False,\n",
776
+ "):\n",
777
+ " \"\"\"\n",
778
+ " Evaluate the fine-tuned model.\n",
779
+ "\n",
780
+ " **Parameters**\n",
781
+ "\n",
782
+ " model_directory : Path\n",
783
+ " | Path to directory containing model\n",
784
+ " id_class_dict_file : Path\n",
785
+ " | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data\n",
786
+ " | (dictionary of format: numerical IDs: class_labels)\n",
787
+ " test_data_file : Path\n",
788
+ " | Path to directory containing test .dataset\n",
789
+ " output_directory : Path\n",
790
+ " | Path to directory where eval data will be saved\n",
791
+ " output_prefix : str\n",
792
+ " | Prefix for output files\n",
793
+ " predict : bool\n",
794
+ " | Whether or not to save eval predictions\n",
795
+ " \"\"\"\n",
796
+ "\n",
797
+ " # load numerical id to class dictionary (id:class)\n",
798
+ " with open(id_class_dict_file, \"rb\") as f:\n",
799
+ " id_class_dict = pickle.load(f)\n",
800
+ "\n",
801
+ " # get number of classes for classifier\n",
802
+ " num_classes = get_num_classes(id_class_dict)\n",
803
+ "\n",
804
+ " # load previously filtered and prepared data\n",
805
+ " test_data = load_and_filter(None, nproc, test_data_file)\n",
806
+ "\n",
807
+ " # load previously fine-tuned model\n",
808
+ " model = load_model(\n",
809
+ " \"CellClassifier\",\n",
810
+ " num_classes,\n",
811
+ " model_directory,\n",
812
+ " \"eval\",\n",
813
+ " quantize=quantize,\n",
814
+ " )\n",
815
+ "\n",
816
+ " # evaluate the model\n",
817
+ " result = evaluate_model(\n",
818
+ " model,\n",
819
+ " num_classes,\n",
820
+ " id_class_dict,\n",
821
+ " test_data,\n",
822
+ " predict=predict,\n",
823
+ " output_directory=output_directory,\n",
824
+ " output_prefix=output_prefix,\n",
825
+ " )\n",
826
+ "\n",
827
+ " all_conf_mat_df = pd.DataFrame(\n",
828
+ " result[\"conf_mat\"],\n",
829
+ " columns=id_class_dict.values(),\n",
830
+ " index=id_class_dict.values(),\n",
831
+ " )\n",
832
+ " all_metrics = {\n",
833
+ " \"conf_matrix\": all_conf_mat_df,\n",
834
+ " \"macro_f1\": result[\"macro_f1\"],\n",
835
+ " \"acc\": result[\"acc\"],\n",
836
+ " }\n",
837
+ " all_roc_metrics = None # roc metrics not reported for multiclass\n",
838
+ "\n",
839
+ " if num_classes == 2:\n",
840
+ " mean_fpr = np.linspace(0, 1, 100)\n",
841
+ " mean_tpr = result[\"roc_metrics\"][\"interp_tpr\"]\n",
842
+ " all_roc_auc = result[\"roc_metrics\"][\"auc\"]\n",
843
+ " all_roc_metrics = {\n",
844
+ " \"mean_tpr\": mean_tpr,\n",
845
+ " \"mean_fpr\": mean_fpr,\n",
846
+ " \"all_roc_auc\": all_roc_auc,\n",
847
+ " }\n",
848
+ " all_metrics[\"all_roc_metrics\"] = all_roc_metrics\n",
849
+ " test_metrics_output_path = (\n",
850
+ " Path(output_directory) / f\"{output_prefix}_test_metrics_dict\"\n",
851
+ " ).with_suffix(\".pkl\")\n",
852
+ " with open(test_metrics_output_path, \"wb\") as f:\n",
853
+ " pickle.dump(all_metrics, f)\n",
854
+ "\n",
855
+ " return all_metrics\n",
856
+ "\n",
857
+ "def plot_conf_mat(\n",
858
+ " conf_mat_dict,\n",
859
+ " output_directory,\n",
860
+ " output_prefix,\n",
861
+ " custom_class_order=None,\n",
862
+ "):\n",
863
+ " \"\"\"\n",
864
+ " Plot confusion matrix results of evaluating the fine-tuned model.\n",
865
+ "\n",
866
+ " **Parameters**\n",
867
+ "\n",
868
+ " conf_mat_dict : dict\n",
869
+ " | Dictionary of model_name : confusion_matrix_DataFrame\n",
870
+ " | (all_metrics[\"conf_matrix\"] from self.validate)\n",
871
+ " output_directory : Path\n",
872
+ " | Path to directory where plots will be saved\n",
873
+ " output_prefix : str\n",
874
+ " | Prefix for output file\n",
875
+ " custom_class_order : None, list\n",
876
+ " | List of classes in custom order for plots.\n",
877
+ " | Same order will be used for all models.\n",
878
+ " \"\"\"\n",
879
+ "\n",
880
+ " for model_name in conf_mat_dict.keys():\n",
881
+ " plot_confusion_matrix(\n",
882
+ " conf_mat_dict[model_name],\n",
883
+ " model_name,\n",
884
+ " output_directory,\n",
885
+ " output_prefix,\n",
886
+ " custom_class_order,\n",
887
+ " )\n",
888
+ "def plot_confusion_matrix(\n",
889
+ " conf_mat_df, title, output_dir, output_prefix, custom_class_order\n",
890
+ "):\n",
891
+ " fig = plt.figure()\n",
892
+ " fig.set_size_inches(10, 10)\n",
893
+ " sns.set(font_scale=1)\n",
894
+ " sns.set_style(\"whitegrid\", {\"axes.grid\": False})\n",
895
+ " if custom_class_order is not None:\n",
896
+ " conf_mat_df = conf_mat_df.reindex(\n",
897
+ " index=custom_class_order, columns=custom_class_order\n",
898
+ " )\n",
899
+ " display_labels = generate_display_labels(conf_mat_df)\n",
900
+ " conf_mat = preprocessing.normalize(conf_mat_df.to_numpy(), norm=\"l1\")\n",
901
+ " display = ConfusionMatrixDisplay(\n",
902
+ " confusion_matrix=conf_mat, display_labels=display_labels\n",
903
+ " )\n",
904
+ " display.plot(cmap=\"Blues\", values_format=\".2g\")\n",
905
+ " plt.title(title)\n",
906
+ " plt.show()\n",
907
+ "\n",
908
+ " output_file = (Path(output_dir) / f\"{output_prefix}_conf_mat\").with_suffix(\".pdf\")\n",
909
+ " display.figure_.savefig(output_file, bbox_inches=\"tight\")\n",
910
+ "def generate_display_labels(conf_mat_df):\n",
911
+ " display_labels = []\n",
+ " for i, label in enumerate(conf_mat_df.index):\n",
+ " display_labels += [f\"{label}\\nn={conf_mat_df.iloc[i,:].sum():.0f}\"]\n",
916
+ " return display_labels\n",
917
+ "\n",
918
+ "def plot_predictions(\n",
919
+ " predictions_file,\n",
920
+ " id_class_dict_file,\n",
921
+ " title,\n",
922
+ " output_directory,\n",
923
+ " output_prefix,\n",
924
+ " custom_class_order=None,\n",
925
+ " kwargs_dict=None,\n",
926
+ "):\n",
927
+ " \"\"\"\n",
928
+ " Plot prediction results of evaluating the fine-tuned model.\n",
929
+ "\n",
930
+ " **Parameters**\n",
931
+ "\n",
932
+ " predictions_file : Path\n",
933
+ " | Path of model predictions output to plot\n",
934
+ " | (saved output from self.validate if predict_eval=True)\n",
935
+ " | (or saved output from self.evaluate_saved_model)\n",
936
+ " id_class_dict_file : Path\n",
937
+ " | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data\n",
938
+ " | (dictionary of format: numerical IDs: class_labels)\n",
939
+ " title : str\n",
940
+ " | Title for legend containing class labels.\n",
941
+ " output_directory : Path\n",
942
+ " | Path to directory where plots will be saved\n",
943
+ " output_prefix : str\n",
944
+ " | Prefix for output file\n",
945
+ " custom_class_order : None, list\n",
946
+ " | List of classes in custom order for plots.\n",
947
+ " | Same order will be used for all models.\n",
948
+ " kwargs_dict : None, dict\n",
949
+ " | Dictionary of kwargs to pass to plotting function.\n",
950
+ " \"\"\"\n",
951
+ " # load predictions\n",
952
+ " with open(predictions_file, \"rb\") as f:\n",
953
+ " predictions = pickle.load(f)\n",
954
+ "\n",
955
+ " # load numerical id to class dictionary (id:class)\n",
956
+ " with open(id_class_dict_file, \"rb\") as f:\n",
957
+ " id_class_dict = pickle.load(f)\n",
958
+ "\n",
959
+ " if isinstance(predictions, dict):\n",
960
+ " if all(\n",
961
+ " [\n",
962
+ " key in predictions.keys()\n",
963
+ " for key in [\"pred_ids\", \"label_ids\", \"predictions\"]\n",
964
+ " ]\n",
965
+ " ):\n",
966
+ " # format is output from self.evaluate_saved_model\n",
967
+ " predictions_logits = np.array(predictions[\"predictions\"])\n",
968
+ " true_ids = predictions[\"label_ids\"]\n",
969
+ " else:\n",
970
+ " # format is output from self.validate if predict_eval=True\n",
971
+ " predictions_logits = predictions.predictions\n",
972
+ " true_ids = predictions.label_ids\n",
973
+ "\n",
974
+ " num_classes = len(id_class_dict.keys())\n",
975
+ " num_predict_classes = predictions_logits.shape[1]\n",
976
+ " assert num_classes == num_predict_classes\n",
977
+ " classes = id_class_dict.values()\n",
978
+ " true_labels = [id_class_dict[idx] for idx in true_ids]\n",
979
+ " predictions_df = pd.DataFrame(predictions_logits, columns=classes)\n",
980
+ " if custom_class_order is not None:\n",
981
+ " predictions_df = predictions_df.reindex(columns=custom_class_order)\n",
982
+ " predictions_df[\"true\"] = true_labels\n",
983
+ " custom_dict = dict(zip(classes, [i for i in range(len(classes))]))\n",
984
+ " if custom_class_order is not None:\n",
985
+ " custom_dict = dict(\n",
986
+ " zip(custom_class_order, [i for i in range(len(custom_class_order))])\n",
987
+ " )\n",
988
+ " predictions_df = predictions_df.sort_values(\n",
989
+ " by=[\"true\"], key=lambda x: x.map(custom_dict)\n",
990
+ " )\n",
991
+ "\n",
992
+ " plot_predictions_eu(\n",
993
+ " predictions_df, title, output_directory, output_prefix, kwargs_dict\n",
994
+ " )\n",
995
+ "def plot_predictions_eu(predictions_df, title, output_dir, output_prefix, kwargs_dict):\n",
996
+ " sns.set(font_scale=2)\n",
997
+ " plt.figure(figsize=(10, 10), dpi=150)\n",
998
+ " label_colors, label_color_dict = make_colorbar(predictions_df, \"true\")\n",
999
+ " predictions_df = predictions_df.drop(columns=[\"true\"])\n",
1000
+ " predict_colors_list = [label_color_dict[label] for label in predictions_df.columns]\n",
1001
+ " predict_label_list = [label for label in predictions_df.columns]\n",
1002
+ " predict_colors = pd.DataFrame(\n",
1003
+ " pd.Series(predict_colors_list, index=predict_label_list), columns=[\"predicted\"]\n",
1004
+ " )\n",
1005
+ "\n",
1006
+ " default_kwargs_dict = {\n",
1007
+ " \"row_cluster\": False,\n",
1008
+ " \"col_cluster\": False,\n",
1009
+ " \"row_colors\": label_colors,\n",
1010
+ " \"col_colors\": predict_colors,\n",
1011
+ " \"linewidths\": 0,\n",
1012
+ " \"xticklabels\": False,\n",
1013
+ " \"yticklabels\": False,\n",
1014
+ " \"center\": 0,\n",
1015
+ " \"cmap\": \"vlag\",\n",
1016
+ " }\n",
1017
+ "\n",
1018
+ " if kwargs_dict is not None:\n",
1019
+ " default_kwargs_dict.update(kwargs_dict)\n",
1020
+ " g = sns.clustermap(predictions_df, **default_kwargs_dict)\n",
1021
+ "\n",
1022
+ " plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha=\"right\")\n",
1023
+ "\n",
1024
+ " for label_color in list(label_color_dict.keys()):\n",
1025
+ " g.ax_col_dendrogram.bar(\n",
1026
+ " 0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0\n",
1027
+ " )\n",
1028
+ "\n",
1029
+ " g.ax_col_dendrogram.legend(\n",
1030
+ " title=f\"{title}\",\n",
1031
+ " loc=\"lower center\",\n",
1032
+ " ncol=4,\n",
1033
+ " bbox_to_anchor=(0.5, 1),\n",
1034
+ " facecolor=\"white\",\n",
1035
+ " )\n",
1036
+ "\n",
1037
+ " output_file = (Path(output_dir) / f\"{output_prefix}_pred\").with_suffix(\".pdf\")\n",
1038
+ " plt.savefig(output_file, bbox_inches=\"tight\")\n",
1039
+ "def make_colorbar(embs_df, label):\n",
1040
+ " labels = list(embs_df[label])\n",
1041
+ "\n",
1042
+ " cell_type_colors = gen_heatmap_class_colors(labels, embs_df)\n",
1043
+ " label_colors = pd.DataFrame(cell_type_colors, columns=[label])\n",
1044
+ "\n",
1045
+ " # create dictionary for colors and classes\n",
1046
+ " label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])\n",
1047
+ " return label_colors, label_color_dict\n",
1048
+ "def gen_heatmap_class_colors(labels, df):\n",
1049
+ " pal = sns.cubehelix_palette(\n",
1050
+ " len(Counter(labels).keys()),\n",
1051
+ " light=0.9,\n",
1052
+ " dark=0.1,\n",
1053
+ " hue=1,\n",
1054
+ " reverse=True,\n",
1055
+ " start=1,\n",
1056
+ " rot=-2,\n",
1057
+ " )\n",
1058
+ " lut = dict(zip(map(str, Counter(labels).keys()), pal))\n",
1059
+ " colors = pd.Series(labels, index=df.index).map(lut)\n",
1060
+ " return colors\n",
1061
+ "def gen_heatmap_class_dict(classes, label_colors_series):\n",
1062
+ " class_color_dict_df = pd.DataFrame(\n",
1063
+ " {\"classes\": classes, \"color\": label_colors_series}\n",
1064
+ " )\n",
1065
+ " class_color_dict_df = class_color_dict_df.drop_duplicates(subset=[\"classes\"])\n",
1066
+ " return dict(zip(class_color_dict_df[\"classes\"], class_color_dict_df[\"color\"]))"
1067
+ ]
1068
+ },
1069
+ {
1070
+ "cell_type": "code",
1071
+ "execution_count": null,
1072
+ "metadata": {},
1073
+ "outputs": [
1074
+ {
1075
+ "data": {
1076
+ "application/vnd.jupyter.widget-view+json": {
1077
+ "model_id": "7a260f2ee53e46cda883751b4f9ee36f",
1078
+ "version_major": 2,
1079
+ "version_minor": 0
1080
+ },
1081
+ "text/plain": [
1082
+ "Saving the dataset (0/3 shards): 0%| | 0/115367 [00:00<?, ? examples/s]"
1083
+ ]
1084
+ },
1085
+ "metadata": {},
1086
+ "output_type": "display_data"
1087
+ },
1088
+ {
1089
+ "data": {
1090
+ "application/vnd.jupyter.widget-view+json": {
1091
+ "model_id": "56bf186783134b349bece0953132c491",
1092
+ "version_major": 2,
1093
+ "version_minor": 0
1094
+ },
1095
+ "text/plain": [
1096
+ "Saving the dataset (0/1 shards): 0%| | 0/17228 [00:00<?, ? examples/s]"
1097
+ ]
1098
+ },
1099
+ "metadata": {},
1100
+ "output_type": "display_data"
1101
+ },
1102
+ {
1103
+ "data": {
1104
+ "application/vnd.jupyter.widget-view+json": {
1105
+ "model_id": "cccf5a6fd66f4005b6ebd2aef3772229",
1106
+ "version_major": 2,
1107
+ "version_minor": 0
1108
+ },
1109
+ "text/plain": [
1110
+ " 0%| | 0/1 [00:00<?, ?it/s]"
1111
+ ]
1112
+ },
1113
+ "metadata": {},
1114
+ "output_type": "display_data"
1115
+ },
1116
+ {
1117
+ "name": "stdout",
1118
+ "output_type": "stream",
1119
+ "text": [
1120
+ "****** Validation split: 1/1 ******\n",
1121
+ "\n"
1122
+ ]
1123
+ },
1124
+ {
1125
+ "data": {
1126
+ "application/vnd.jupyter.widget-view+json": {
1127
+ "model_id": "7c1733b61dd14cb4a9e36cee4704a218",
1128
+ "version_major": 2,
1129
+ "version_minor": 0
1130
+ },
1131
+ "text/plain": [
1132
+ "Filter (num_proc=16): 0%| | 0/115367 [00:00<?, ? examples/s]"
1133
+ ]
1134
+ },
1135
+ "metadata": {},
1136
+ "output_type": "display_data"
1137
+ },
1138
+ {
1139
+ "data": {
1140
+ "application/vnd.jupyter.widget-view+json": {
1141
+ "model_id": "fb09533c6da74363a7e26f20d777fce8",
1142
+ "version_major": 2,
1143
+ "version_minor": 0
1144
+ },
1145
+ "text/plain": [
1146
+ "Filter (num_proc=16): 0%| | 0/115367 [00:00<?, ? examples/s]"
1147
+ ]
1148
+ },
1149
+ "metadata": {},
1150
+ "output_type": "display_data"
1151
+ }
1152
+ ],
1153
+ "source": [
1154
+ "corpus_dir = \"Pretrain_data\"\n",
1155
+ "with open(corpus_dir + \"/token_dictionary.pkl\", \"rb\") as fp:\n",
1156
+ " gene_token_dict = pickle.load(fp)\n",
1157
+ "token_gene_dict = {v: k for k, v in gene_token_dict.items()}\n",
1158
+ "\n",
1159
+ "filter_data_dict={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]}\n",
1160
+ "training_args = {\n",
1161
+ " \"num_train_epochs\": 0.9,\n",
1162
+ " \"learning_rate\": 0.000804,\n",
1163
+ " \"lr_scheduler_type\": \"polynomial\",\n",
1164
+ " \"warmup_steps\": 1812,\n",
1165
+ " \"weight_decay\":0.258828,\n",
1166
+ " \"per_device_train_batch_size\": 12,\n",
1167
+ " \"seed\": 73,\n",
1168
+ "}\n",
1169
+ "\n",
1170
+ "cell_state_dict = {\"state_key\": \"disease\", \"states\": \"all\"}\n",
1171
+ "classifier='cell'\n",
1172
+ "filter_data=filter_data_dict\n",
1173
+ "split_sizes={\"train\": 0.8, \"valid\": 0.1, \"test\": 0.1}\n",
1174
+ "train_size = split_sizes[\"train\"]\n",
1175
+ "valid_size = split_sizes[\"valid\"]\n",
1176
+ "oos_test_size = split_sizes[\"test\"]\n",
1177
+ "max_ncells=None\n",
1178
+ "freeze_layers = 2\n",
1179
+ "num_crossval_splits = 1\n",
1180
+ "forward_batch_size=200\n",
1181
+ "nproc=16\n",
1182
+ "rare_threshold=0\n",
1183
+ "quantize=None\n",
1184
+ "\n",
1185
+ "\n",
1186
+ "train_ids = [\"1447\", \"1600\", \"1462\", \"1558\", \"1300\", \"1508\", \"1358\", \"1678\", \"1561\", \"1304\", \"1610\", \"1430\", \"1472\", \"1707\", \"1726\", \"1504\", \"1425\", \"1617\", \"1631\", \"1735\", \"1582\", \"1722\", \"1622\", \"1630\", \"1290\", \"1479\", \"1371\", \"1549\", \"1515\"]\n",
1187
+ "eval_ids = [\"1422\", \"1510\", \"1539\", \"1606\", \"1702\"]\n",
1188
+ "test_ids = [\"1437\", \"1516\", \"1602\", \"1685\", \"1718\"]\n",
1189
+ "\n",
1190
+ "train_test_id_split_dict = {\"attr_key\": \"individual\",\n",
1191
+ " \"train\": train_ids+eval_ids,\n",
1192
+ " \"test\": test_ids}\n",
1193
+ "train_valid_id_split_dict = {\"attr_key\": \"individual\",\n",
1194
+ " \"train\": train_ids,\n",
1195
+ " \"eval\": eval_ids}\n",
1196
+ "\n",
1197
+ "# define output directory path\n",
1198
+ "current_date = datetime.datetime.now()\n",
1199
+ "datestamp = f\"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}{current_date.strftime('%X').replace(':','')}\"\n",
1200
+ "datestamp_min = f\"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}\"\n",
1201
+ "output_directory = \"output path\"\n",
1202
+ "\n",
1203
+ "if output_directory[-1:] != \"/\": # add slash for dir if not present\n",
1204
+ " output_directory = output_directory + \"/\"\n",
1205
+ "output_dir = f\"{output_directory}{datestamp}_geneformer_diseaseClassifier/\"\n",
1206
+ "output_prefix = \"cm_classifier_test\"\n",
1208
+ "os.makedirs(output_dir, exist_ok=True)\n",
1209
+ "\n",
1210
+ "prepare_data(input_data_file=\"example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\",\n",
1211
+ " output_directory=output_dir,\n",
1212
+ " output_prefix=output_prefix,\n",
1213
+ " split_id_dict=train_test_id_split_dict)\n",
1214
+ "\n",
1215
+ "with open(f\"{output_dir}/{output_prefix}_id_class_dict.pkl\", \"rb\") as f:\n",
1216
+ " id_class_dict = pickle.load(f)\n",
1217
+ "class_id_dict = {v: k for k, v in id_class_dict.items()}\n",
1218
+ "\n",
1219
+ "num_classes = get_num_classes(id_class_dict)\n",
1220
+ "\n",
1221
+ "data = load_and_filter(None, nproc, f\"{output_dir}/{output_prefix}_labeled_train.dataset\")\n",
1222
+ "data = data.shuffle(seed=42)\n",
1223
+ "\n",
1224
+ "##### (Cross-)validate the model #####\n",
1225
+ "results = []\n",
1226
+ "all_conf_mat = np.zeros((num_classes, num_classes))\n",
1227
+ "iteration_num = 1\n",
1228
+ "split_id_dict=train_valid_id_split_dict\n",
1229
+ "\n",
1230
+ "for i in trange(num_crossval_splits):\n",
1231
+ " print(\n",
1232
+ " f\"****** Validation split: {iteration_num}/{num_crossval_splits} ******\\n\"\n",
1233
+ " )\n",
1234
+ " ksplit_output_dir = os.path.join(output_dir, f\"ksplit{iteration_num}\")\n",
1235
+ " if num_crossval_splits == 1:\n",
1236
+ " # single 1-eval_size:eval_size split\n",
1237
+ " if split_id_dict is not None:\n",
1238
+ " data_dict = dict()\n",
1239
+ " data_dict[\"train\"] = filter_by_dict(\n",
1240
+ " data,\n",
1241
+ " {split_id_dict[\"attr_key\"]: split_id_dict[\"train\"]},\n",
1242
+ " nproc,\n",
1243
+ " )\n",
1244
+ " data_dict[\"test\"] = filter_by_dict(\n",
1245
+ " data,\n",
1246
+ " {split_id_dict[\"attr_key\"]: split_id_dict[\"eval\"]},\n",
1247
+ " nproc,\n",
1248
+ " )\n",
1249
+ " train_data = data_dict[\"train\"]\n",
1250
+ " eval_data = data_dict[\"test\"]"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "code",
1255
+ "execution_count": null,
1256
+ "metadata": {},
1257
+ "outputs": [
1258
+ {
1259
+ "name": "stdout",
1260
+ "output_type": "stream",
1261
+ "text": [
1262
+ "Converting training dataset...\n"
1263
+ ]
1264
+ },
1265
+ {
1266
+ "name": "stderr",
1267
+ "output_type": "stream",
1268
+ "text": [
1269
+ "Converting sequences: 100%|██████████| 93589/93589 [00:02<00:00, 41967.40seq/s] \n"
1270
+ ]
1271
+ },
1272
+ {
1273
+ "name": "stdout",
1274
+ "output_type": "stream",
1275
+ "text": [
1276
+ "Converting evaluation dataset...\n"
1277
+ ]
1278
+ },
1279
+ {
1280
+ "name": "stderr",
1281
+ "output_type": "stream",
1282
+ "text": [
1283
+ "Converting sequences: 100%|██████████| 21778/21778 [00:00<00:00, 151581.39seq/s]\n"
1284
+ ]
1285
+ },
1286
+ {
1287
+ "name": "stdout",
1288
+ "output_type": "stream",
1289
+ "text": [
1290
+ "Training RandomForest...\n",
1291
+ "Training LogisticRegression...\n",
1292
+ " Accuracy Macro F1 Weighted F1 Weighted Precision\n",
1293
+ "RandomForest 0.618055 0.457959 0.649440 0.687780\n",
1294
+ "LogisticRegression 0.592065 0.440782 0.608307 0.645992\n"
1295
+ ]
1296
+ }
1297
+ ],
1298
+ "source": [
1299
+ "from sklearn.ensemble import RandomForestClassifier\n",
1300
+ "from sklearn.linear_model import LogisticRegression\n",
1301
+ "from sklearn.svm import SVC\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
1302
+ "from sklearn.metrics import accuracy_score, f1_score, precision_score\n",
1303
+ "import numpy as np\n",
1304
+ "from tqdm import tqdm\n",
1305
+ "\n",
1306
+ "def pad_or_truncate(seq, max_len):\n",
1307
+ " if len(seq) < max_len:\n",
1308
+ " return seq + [0] * (max_len - len(seq))\n",
1309
+ " else:\n",
1310
+ " return seq[:max_len]\n",
1311
+ "\n",
1312
+ "def dataset_to_numpy(hf_dataset, max_len=256):\n",
1313
+ " X = []\n",
1314
+ " for seq in tqdm(hf_dataset[\"input_ids\"], desc=\"Converting sequences\", unit=\"seq\"):\n",
1315
+ " X.append(pad_or_truncate(seq, max_len))\n",
1316
+ " y = np.array(hf_dataset[\"label\"])\n",
1317
+ " return np.array(X), y\n",
1318
+ "\n",
1319
+ "print(\"Converting training dataset...\")\n",
1320
+ "X_train, y_train = dataset_to_numpy(train_data)\n",
1321
+ "print(\"Converting evaluation dataset...\")\n",
1322
+ "X_eval, y_eval = dataset_to_numpy(eval_data)\n",
1323
+ "\n",
1324
+ "models = {\n",
1325
+ " \"RandomForest\": RandomForestClassifier(n_estimators=100, random_state=42),\n",
1326
+ " \"LogisticRegression\": LogisticRegression(max_iter=1000, random_state=42),\n",
1327
+ " \"SVM_linear\": SVC(kernel=\"linear\", probability=True, random_state=42),\n",
+ " \"SVM_rbf\": make_pipeline(StandardScaler(), SVC(kernel=\"rbf\", probability=True, random_state=42)),\n",
1329
+ "}\n",
1330
+ "\n",
1331
+ "results = {}\n",
1332
+ "for name, model in models.items():\n",
1333
+ " print(f\"Training {name}...\")\n",
1334
+ " model.fit(X_train, y_train)\n",
1335
+ " y_pred = model.predict(X_eval)\n",
1336
+ " \n",
1337
+ " acc = accuracy_score(y_eval, y_pred)\n",
1338
+ " macro_f1 = f1_score(y_eval, y_pred, average=\"macro\")\n",
1339
+ " weighted_f1 = f1_score(y_eval, y_pred, average=\"weighted\")\n",
1340
+ " precision = precision_score(y_eval, y_pred, average=\"weighted\")\n",
1341
+ " \n",
1342
+ " results[name] = {\n",
1343
+ " \"Accuracy\": acc,\n",
1344
+ " \"Macro F1\": macro_f1,\n",
1345
+ " \"Weighted F1\": weighted_f1,\n",
1346
+ " \"Weighted Precision\": precision\n",
1347
+ " }\n",
1348
+ "\n",
1349
+ "# Display results\n",
1350
+ "import pandas as pd\n",
1351
+ "results_df = pd.DataFrame(results).T\n",
1352
+ "print(results_df)\n"
1353
+ ]
1354
+ },
1355
+ {
1356
+ "cell_type": "code",
1357
+ "execution_count": 4,
1358
+ "metadata": {},
1359
+ "outputs": [
1360
+ {
1361
+ "data": {
1362
+ "text/plain": [
1363
+ "{'RandomForest': {'Accuracy': 0.6180549178069612,\n",
1364
+ " 'Macro F1': 0.45795920359758124,\n",
1365
+ " 'Weighted F1': 0.6494402066016174,\n",
1366
+ " 'Weighted Precision': 0.687779833202143},\n",
1367
+ " 'LogisticRegression': {'Accuracy': 0.5920653870878868,\n",
1368
+ " 'Macro F1': 0.4407815175765883,\n",
1369
+ " 'Weighted F1': 0.6083068177204959,\n",
1370
+ " 'Weighted Precision': 0.6459924332028076}}"
1371
+ ]
1372
+ },
1373
+ "execution_count": 4,
1374
+ "metadata": {},
1375
+ "output_type": "execute_result"
1376
+ }
1377
+ ],
1378
+ "source": [
1379
+ "results"
1380
+ ]
1381
+ }
1382
+ ],
1383
+ "metadata": {
1384
+ "kernelspec": {
1385
+ "display_name": "Python 3",
1386
+ "language": "python",
1387
+ "name": "python3"
1388
+ },
1389
+ "language_info": {
1390
+ "codemirror_mode": {
1391
+ "name": "ipython",
1392
+ "version": 3
1393
+ },
1394
+ "file_extension": ".py",
1395
+ "mimetype": "text/x-python",
1396
+ "name": "python",
1397
+ "nbconvert_exporter": "python",
1398
+ "pygments_lexer": "ipython3",
1399
+ "version": "3.11.7"
1400
+ }
1401
+ },
1402
+ "nbformat": 4,
1403
+ "nbformat_minor": 2
1404
+ }
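Editorial aside (not part of the committed files): the `pad_or_truncate` helper in the Cardio_ML notebook above converts variable-length token sequences to a fixed width before they are handed to the scikit-learn baselines. A minimal standalone sketch of that behavior:

```python
def pad_or_truncate(seq, max_len):
    # Right-pad with 0s up to max_len, or cut the sequence down to max_len.
    if len(seq) < max_len:
        return seq + [0] * (max_len - len(seq))
    return seq[:max_len]

print(pad_or_truncate([1, 2, 3], 5))           # -> [1, 2, 3, 0, 0]
print(pad_or_truncate([1, 2, 3, 4, 5, 6], 5))  # -> [1, 2, 3, 4, 5]
```

Note that 0 is assumed here to be the padding token id, matching the notebook's `dataset_to_numpy` usage.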
Downstream_tasks/Classification/Gene_dosage.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Downstream_tasks/Classification/Gene_dosage_ML.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Downstream_tasks/Classification/Tissue_type.py ADDED
@@ -0,0 +1,457 @@
1
+ import os
2
+ from tqdm.auto import tqdm, trange
3
+ GPU_NUMBER = [0]
4
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])
5
+ os.environ["NCCL_DEBUG"] = "INFO"
6
+
7
+ # imports
8
+ from collections import Counter
9
+ import seaborn as sns; sns.set()
10
+ from datasets import load_from_disk
11
+ from sklearn.metrics import accuracy_score, f1_score
12
+ from transformers import Trainer
13
+ from transformers.training_args import TrainingArguments
14
+ import pandas as pd
15
+ from datasets.utils.logging import disable_progress_bar, enable_progress_bar
16
+ from sklearn import preprocessing
17
+ from sklearn.metrics import (
18
+ ConfusionMatrixDisplay,
19
+ accuracy_score,
20
+ auc,
21
+ confusion_matrix,
22
+ f1_score,
23
+ roc_curve,
24
+ )
25
+ from pathlib import Path
26
+
27
+ import sys
28
+ # sys.path.append('../Geneformer')
29
+ from geneformer import DataCollatorForCellClassification
34
+ import matplotlib.pyplot as plt
35
+ from geneformer.pretrainer import token_dictionary
36
+ import datetime
37
+ import time
38
+ import pickle
39
+ import random
40
+ import subprocess
41
+ import numpy as np
42
+ import pytz
43
+ import torch
44
+ from datasets import load_from_disk, Dataset
45
+ from transformers import (BertConfig, BertForMaskedLM, TrainingArguments, TrainerCallback,
46
+ Trainer, BertModel, BertPreTrainedModel, BertForSequenceClassification, BertForTokenClassification)
47
+ from geneformer import GeneformerPretrainer
48
+ from torch import Tensor
49
+ from transformers.modeling_outputs import MaskedLMOutput
50
+ from transformers.models.bert.modeling_bert import BertLMPredictionHead, BertOnlyMLMHead, BertPredictionHeadTransform
51
+ from transformers.activations import ACT2FN
52
+ from typing import List, Optional, Tuple, Union
53
+ import torch.nn.functional as F
54
+
55
+ model_path = 'model path'
56
+ prefix = 'CAB5_1M'
57
+ total_iter = 1
58
+
59
+ class CustomBertForMaskedLM(BertPreTrainedModel):
60
+ _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
61
+ _tied_weights_keys = ["decoder.weight", "bert.embeddings.word_embeddings.weight"]
62
+
63
+ def __init__(self, config):
64
+ super().__init__(config)
65
+ self.bert = BertModel(config, add_pooling_layer=False)
66
+ self.transform = BertPredictionHeadTransform(config)
67
+
68
+ self.decoder = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)
69
+
70
+ self.bias = torch.nn.Parameter(torch.zeros(config.vocab_size))
71
+
72
+ # Initialize weights
73
+ self.init_weights()
74
+
75
+ # Tie weights automatically
76
+ self.tie_weights()
77
+
78
+ # self.post_init()
79
+
80
+ def tie_weights(self):
81
+ """
82
+ Ties the weights between the input embeddings and output decoder weights.
83
+ """
84
+ self.decoder.weight = self.bert.embeddings.word_embeddings.weight
85
+
86
+ def probability_convert(self, probs: Tensor, input_ids: Tensor, labels: Tensor) -> Tensor:
87
+ device = probs.device
88
+ batch_size, seq_length, vocab_size = probs.size()
89
+ _, input_seq_length = input_ids.size()
90
+
91
+ # truncated_labels = labels[:, :input_seq_length]
92
+ # non_mask = truncated_labels == -100
93
+ non_mask = labels == -100
94
+ non_mask_indices = non_mask.nonzero(as_tuple=True)
95
+ known_gene_indices = input_ids[non_mask]
96
+
97
+ # Generate the (1-p) matrix while assigning all known genes at the beginning
98
+ zeros = torch.zeros((batch_size, 1, vocab_size), device=device)
99
+ zeros[non_mask_indices[0], 0, known_gene_indices] = 1.0
100
+ probs_shifted = torch.cat((zeros, probs[:, :-1, :]), dim=1)
101
+ inv_probs_shifted = 1 - probs_shifted
102
+
103
+ # Cumulative product to get (1-p_1)*(1-p_2)*...*(p_i)
104
+ cumprod_inv_probs = torch.cumprod(inv_probs_shifted, dim=1)
105
+ modified_probs = probs * cumprod_inv_probs
106
+
107
+ # # Since we are assigning probabilities for already known genes,
108
+ # # (1-p_1)*(1-p_2)*...*(p_i) for these genes can result in 0, due to hard assignment of probs to be 1
109
+ # # Add 1e-18 to avoid dividing modified probs by 0
110
+ # # During the debugging stage, some issues occurred in the normalization step.
111
+ # # Since probabilities in each position do not necessarily need to sum up to one, leave out normalization.
112
+ normalized_probs = modified_probs.sum(dim=-1, keepdim=True).clamp(min=1e-18)
113
+ modified_probs = modified_probs / normalized_probs # Normalization after cumulative production
114
+
115
+ return modified_probs
116
+
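Editorial aside (not part of the committed file): the shifted cumulative product in `probability_convert` above scales the probability at position i by the product of (1 - p_j) over all earlier positions j. A toy one-dimensional sketch of that idea, tracking a single candidate token instead of the full batch × seq × vocab tensor:

```python
def shifted_cumprod(probs):
    # Scale p_i by prod_{j < i} (1 - p_j), mirroring torch.cumprod over the
    # shifted (1 - p) tensor in probability_convert (single token track here).
    out, acc = [], 1.0
    for p in probs:
        out.append(p * acc)
        acc *= 1.0 - p
    return out

print(shifted_cumprod([0.5, 0.5, 0.5]))  # -> [0.5, 0.25, 0.125]
```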
117
+ def assign_known_gene_probs(self, probs: Tensor, input_ids: Tensor, labels: Tensor) -> Tensor:
118
+
119
+ device = probs.device
120
+ batch_size, seq_length, vocab_size = probs.size()
121
+ _, input_seq_length = input_ids.size()
122
+
123
+ # Truncate `labels` to match the length of `input_ids` along the sequence dimension
124
+ truncated_labels = labels[:, :input_seq_length]
125
+
126
+ non_mask = truncated_labels == -100
127
+ non_mask_indices = non_mask.nonzero(as_tuple=True)
128
+
129
+ ones = torch.ones((batch_size, seq_length, vocab_size), device=device)
130
+ zeros = torch.zeros((batch_size, seq_length, vocab_size), device=device)
131
+
132
+ known_gene_indices = input_ids[non_mask]
133
+
134
+ ones[non_mask_indices[0], non_mask_indices[1], :] = 0.0
135
+ zeros[non_mask_indices[0], non_mask_indices[1], known_gene_indices] = 1.0
136
+
137
+ # Modify already known genes' probabilities using the one-hot tensor
138
+ modified_probs = probs * ones
139
+ modified_probs = modified_probs + zeros
140
+
141
+ # Do the normalization
142
+ modified_probs = modified_probs / modified_probs.sum(dim=-1, keepdim=True).clamp(min=1e-18) # Normalize
143
+
144
+ return modified_probs
145
+
146
+ def compute_similarity_on_probs(self, probs: Tensor, labels: Tensor) -> Tensor:
147
+ """
148
+ Optimized computation of average cosine similarity across all positions in each sequence and batch.
149
+
150
+ Args:
151
+ probs (torch.Tensor): Probability tensor of shape (batch_size, seq_length, vocab_size).
152
+
153
+ Returns:
154
+ torch.Tensor: Average similarity term for loss computation.
155
+ """
156
+ batch_size, seq_length, vocab_size = probs.size()
157
+ device = probs.device
158
+
159
+ non_mask = labels == -100
160
+ non_mask_indices = non_mask.nonzero(as_tuple=True)
161
+
162
+ mask_sim = torch.ones((batch_size, seq_length, seq_length), device=device)
163
+ mask_sim[non_mask_indices[0], non_mask_indices[1], :] = 0.0
164
+
165
+ seq_mask = torch.triu(torch.ones(seq_length, seq_length, device=device), diagonal=1)
166
+ batch_mask = seq_mask.unsqueeze(0).expand(batch_size, seq_length, seq_length)
167
+ mask_sim = mask_sim * batch_mask
168
+
169
+ # Normalize along the vocab_size dimension
170
+ probs_norm = F.normalize(probs, dim=-1) # Shape: (batch_size, seq_length, vocab_size)
171
+
172
+ # Compute pairwise cosine similarity using einsum
173
+ similarities = torch.einsum("biv,bjv->bij", probs_norm, probs_norm) # Shape: (batch_size, seq_length, seq_length), listing pair-wise similarity values across all positions
174
+
175
+ # Mask out lower triangle (to consider only i < j pairs)
176
+ # mask_sim = torch.triu(torch.ones(seq_length, seq_length, device=probs.device), diagonal=1)
177
+ valid_similarities = similarities * mask_sim # Shape: (batch_size, seq_length, seq_length)
178
+
179
+ # Compute average similarity
180
+ total_similarity = valid_similarities.sum()
181
+ total_comparisons = mask_sim.sum().item()
182
+
183
+ if total_comparisons == 0:
184
+ return torch.tensor(0.0, device=device)
185
+
186
+ return total_similarity / total_comparisons
187
+
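Editorial aside (not part of the committed file): `compute_similarity_on_probs` above averages pairwise cosine similarities between the per-position probability rows; higher similarity means more redundant predictions and a larger penalty. A toy two-row sketch of the same computation:

```python
import math

def cosine(u, v):
    # Plain cosine similarity between two probability rows.
    dot = sum(a * b for a, b in zip(u, v))
    nu = math.sqrt(sum(a * a for a in u))
    nv = math.sqrt(sum(b * b for b in v))
    return dot / (nu * nv)

rows = [[0.9, 0.1], [0.1, 0.9]]  # hypothetical per-position distributions
pairs = [(i, j) for i in range(len(rows)) for j in range(len(rows)) if i < j]
penalty = sum(cosine(rows[i], rows[j]) for i, j in pairs) / len(pairs)
```

With these mirrored rows the single pair gives 0.18 / 0.82, a moderate penalty; identical rows would give 1.0.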
188
+
+ def forward(
+ self,
+ input_ids: Tensor | None = None,
+ attention_mask: Tensor | None = None,
+ token_type_ids: Tensor | None = None,
+ position_ids: Tensor | None = None,
+ head_mask: Tensor | None = None,
+ inputs_embeds: Tensor | None = None,
+ encoder_hidden_states: Tensor | None = None,
+ encoder_attention_mask: Tensor | None = None,
+ labels: Tensor | None = None,
+ output_attentions: bool | None = None,
+ output_hidden_states: bool | None = None,
+ return_dict: bool | None = None) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ hidden_transform = self.transform(hidden_states)
+ logits = self.decoder(hidden_transform) + self.bias
+
+ # temperature = 0.75
+ # logits = logits / temperature
+
+ probs = F.softmax(logits, dim=-1)
+
+ # Probability manipulations to avoid repeats of already known genes
+ probs = self.assign_known_gene_probs(probs, input_ids, labels)
+ convert_probs = self.probability_convert(probs, input_ids, labels)
+ assigned_probs = self.assign_known_gene_probs(convert_probs, input_ids, labels)
+
+ # masked_lm_loss stays None when no labels are provided
+ masked_lm_loss = None
+ if labels is not None:
+ probs_flat = assigned_probs.view(-1, self.config.vocab_size)
+ labels_flat = labels.view(-1)
+ mask = (labels != -100).float().view(-1)
+
+ # Compute masked cross-entropy loss
+ masked_lm_loss = -torch.log(torch.clamp(probs_flat[torch.arange(len(labels_flat)), labels_flat], min=1e-18)) * mask
+ masked_lm_loss = masked_lm_loss.sum() / mask.sum()
+
+ similarity_loss = self.compute_similarity_on_probs(assigned_probs, labels)
+ lambda_similarity = 5.0 # Adjust this value through experimentation
+ masked_lm_loss = masked_lm_loss + lambda_similarity * similarity_loss
+
+ if not return_dict:
+ output = (assigned_probs,) + outputs[2:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=assigned_probs,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+ input_shape = input_ids.shape
+ effective_batch_size = input_shape[0]
+
+ # add a dummy token
+ if self.config.pad_token_id is None:
+ raise ValueError("The PAD token should be defined for generation")
+
+ attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+ dummy_token = torch.full(
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+ )
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+ # load cell type dataset (includes all tissues)
+ train_dataset=load_from_disk("example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset")
+ # load evaluation dataset (includes all tissues)
+ eval_dataset=load_from_disk("example_input_files/cell_classification/cell_type_annotation/cell_type_test_data.dataset")
+
+ dataset_list = []
+ evalset_list = []
+ organ_list = []
+ target_dict_list = []
+
+ for organ in Counter(train_dataset["organ_major"]).keys():
+ # collect list of tissues for fine-tuning (immune and bone marrow are included together)
+ if organ in ["bone_marrow"]:
+ continue
+ elif organ=="immune":
+ organ_ids = ["immune","bone_marrow"]
+ organ_list += ["immune"]
+ else:
+ organ_ids = [organ]
+ organ_list += [organ]
+
+ # filter datasets for given organ
+ def if_organ(example):
+ return example["organ_major"] in organ_ids
+ trainset_organ = train_dataset.filter(if_organ, num_proc=16)
+
+ # per the published scDeepsort method, drop cell types representing <0.5% of cells
+ celltype_counter = Counter(trainset_organ["cell_type"])
+ total_cells = sum(celltype_counter.values())
+ cells_to_keep = [k for k,v in celltype_counter.items() if v>(0.005*total_cells)]
+ def if_not_rare_celltype(example):
+ return example["cell_type"] in cells_to_keep
+ trainset_organ_subset = trainset_organ.filter(if_not_rare_celltype, num_proc=16)
+
+ # shuffle datasets and rename columns
+ trainset_organ_shuffled = trainset_organ_subset.shuffle(seed=42)
+ trainset_organ_shuffled = trainset_organ_shuffled.rename_column("cell_type","label")
+ trainset_organ_shuffled = trainset_organ_shuffled.remove_columns("organ_major")
+
+ # create dictionary of cell types : label ids
+ target_names = list(Counter(trainset_organ_shuffled["label"]).keys())
+ target_name_id_dict = dict(zip(target_names, range(len(target_names))))
+ target_dict_list += [target_name_id_dict]
+
+ # change labels to numerical ids
+ def classes_to_ids(example):
+ example["label"] = target_name_id_dict[example["label"]]
+ return example
+ labeled_trainset = trainset_organ_shuffled.map(classes_to_ids, num_proc=16)
+
+ # create 80/20 train/eval splits
+ labeled_train_split = labeled_trainset.select(range(round(len(labeled_trainset)*0.8)))
+ labeled_eval_split = labeled_trainset.select(range(round(len(labeled_trainset)*0.8), len(labeled_trainset)))
+
+ # filter dataset for cell types in corresponding training set
+ trained_labels = list(Counter(labeled_train_split["label"]).keys())
+ def if_trained_label(example):
+ return example["label"] in trained_labels
+ labeled_eval_split_subset = labeled_eval_split.filter(if_trained_label, num_proc=16)
+
+ dataset_list += [labeled_train_split]
+ evalset_list += [labeled_eval_split_subset]
+
+ trainset_dict = dict(zip(organ_list,dataset_list))
+ traintargetdict_dict = dict(zip(organ_list,target_dict_list))
+
+ evalset_dict = dict(zip(organ_list,evalset_list))
+
+ def compute_metrics(pred):
+ labels = pred.label_ids
+ preds = pred.predictions.argmax(-1)
+ # calculate accuracy and f1 scores using sklearn's functions
+ acc = accuracy_score(labels, preds)
+ macro_f1 = f1_score(labels, preds, average='macro')
+ weighted_f1 = f1_score(labels, preds, average='weighted')
+ return {
+ 'accuracy': acc,
+ 'macro_f1': macro_f1,
+ 'weighted_f1': weighted_f1
+ }
+
+ # set model parameters
+ # max input size
+ max_input_size = 2 ** 11 # 2048
+
+ # set training hyperparameters
+ # max learning rate
+ max_lr = 5e-5
+ # how many pretrained layers to freeze
+ freeze_layers = 0
+ # number of GPUs
+ num_gpus = 1
+ # number of CPU cores
+ num_proc = 16
+ # batch size for training and eval
+ geneformer_batch_size = 12
+ # learning rate schedule
+ lr_schedule_fn = "linear"
+ # warmup steps
+ warmup_steps = 500
+ # number of epochs
+ epochs = 10
+ # optimizer
+ optimizer = "adamw"
+
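The linear schedule with warmup selected by these hyperparameters (`lr_schedule_fn = "linear"`, `warmup_steps = 500`) is applied internally by `transformers`; purely as a standalone sketch of the learning-rate shape it produces (the `total_steps` value here is a hypothetical stand-in for `epochs * steps_per_epoch`, not a value from this script):

```python
def linear_schedule_lr(step, max_lr=5e-5, warmup_steps=500, total_steps=10_000):
    """LR at a given optimizer step under linear warmup + linear decay."""
    if step < warmup_steps:
        return max_lr * step / warmup_steps  # ramp from 0 up to max_lr
    # after warmup: decay linearly from max_lr down to 0 at total_steps
    remaining = max(0, total_steps - step)
    return max_lr * remaining / (total_steps - warmup_steps)

# 0 at step 0, peak max_lr at the end of warmup, 0 at the final step
lrs = [linear_schedule_lr(s) for s in (0, 250, 500, 10_000)]
```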
+ for organ in organ_list:
+ print(organ)
+ organ_trainset = trainset_dict[organ]
+ organ_evalset = evalset_dict[organ]
+ organ_label_dict = traintargetdict_dict[organ]
+
+ # set logging steps
+ logging_steps = round(len(organ_trainset)/geneformer_batch_size/10)
+
+ # reload pretrained model
+ model = BertForSequenceClassification.from_pretrained(model_path,
+ num_labels=len(organ_label_dict),
+ output_attentions = False,
+ output_hidden_states = False).to("cuda")
+
+ # #############
+ pretrained_model = CustomBertForMaskedLM.from_pretrained(model_path)
+ # Extract the word embeddings from the pretrained model
+ pretrained_word_embeddings = pretrained_model.bert.embeddings.word_embeddings.weight.clone()
+ model.bert.embeddings.word_embeddings.load_state_dict({"weight": pretrained_word_embeddings})
+ # ############
+
+ # define output directory path
+ current_date = datetime.datetime.now()
+ datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
+ output_dir = f"/ibex/user/chenj0i/Geneformer/Downstream_tasks/Cell_Classify/{prefix}/{datestamp}_geneformer_CellClassifier_{organ}/"
+
+ # ensure not overwriting a previously saved model
+ saved_model_test = os.path.join(output_dir, "pytorch_model.bin")
+ if os.path.isfile(saved_model_test):
+ raise Exception("Model already saved to this directory.")
+
+ # make output directory
+ os.makedirs(output_dir, exist_ok=True)
+
+ # set training arguments
+ training_args = {
+ "learning_rate": max_lr,
+ "do_train": True,
+ "do_eval": True,
+ "evaluation_strategy": "epoch",
+ "save_strategy": "epoch",
+ "logging_steps": logging_steps,
+ "group_by_length": True,
+ "length_column_name": "length",
+ "disable_tqdm": False,
+ "lr_scheduler_type": lr_schedule_fn,
+ "warmup_steps": warmup_steps,
+ "weight_decay": 0.001,
+ "per_device_train_batch_size": geneformer_batch_size,
+ "per_device_eval_batch_size": geneformer_batch_size,
+ "num_train_epochs": epochs,
+ "load_best_model_at_end": True,
+ "output_dir": output_dir,
+ }
+
+ training_args_init = TrainingArguments(**training_args)
+
+ # create the trainer
+ trainer = Trainer(
+ model=model,
+ args=training_args_init,
+ data_collator=DataCollatorForCellClassification(),
+ train_dataset=organ_trainset,
+ eval_dataset=organ_evalset,
+ compute_metrics=compute_metrics
+ )
+ # train the cell type classifier
+ trainer.train()
+ predictions = trainer.predict(organ_evalset)
+ with open(f"{output_dir}predictions.pickle", "wb") as fp:
+ pickle.dump(predictions, fp)
+ trainer.save_metrics("eval",predictions.metrics)
+ trainer.save_model(output_dir)
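For reference, the masked cross-entropy used in `forward()` above (clamped `-log p` averaged over positions whose label is not `-100`) can be checked with a small NumPy sketch; the `masked_cross_entropy` helper below is illustrative only and is not part of the repository:

```python
import numpy as np

def masked_cross_entropy(probs, labels, ignore_index=-100, eps=1e-18):
    """Mean -log p(label) over positions whose label != ignore_index.

    probs:  (N, V) probability rows (already softmax-ed)
    labels: (N,) integer labels; ignore_index marks unmasked positions
    """
    mask = labels != ignore_index
    # Replace ignored labels with 0 before indexing (they are zeroed by the
    # mask anyway), and clamp before log to avoid -inf, as in forward()
    picked = np.clip(probs[np.arange(len(labels)), np.where(mask, labels, 0)], eps, None)
    losses = -np.log(picked) * mask
    return losses.sum() / mask.sum()

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.3, 0.3, 0.4]])
labels = np.array([0, -100, 2])
loss = masked_cross_entropy(probs, labels)
# → mean of -log(0.7) and -log(0.4) ≈ 0.6365; the middle row is ignored
```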
Downstream_tasks/Classification/Tissue_type_ML.ipynb ADDED
@@ -0,0 +1,933 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from collections import Counter\n",
+ "import datetime\n",
+ "import pickle\n",
+ "import numpy as np\n",
+ "from datasets import load_from_disk\n",
+ "from sklearn.metrics import accuracy_score, f1_score\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "# Load datasets\n",
+ "train_dataset = load_from_disk(\"example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset\")\n",
+ "eval_dataset = load_from_disk(\"example_input_files/cell_classification/cell_type_annotation/cell_type_test_data.dataset\")\n",
+ "\n",
+ "dataset_list, evalset_list, organ_list, target_dict_list = [], [], [], []\n",
+ "\n",
+ "for organ in Counter(train_dataset[\"organ_major\"]).keys():\n",
+ " if organ in [\"bone_marrow\"]: \n",
+ " continue\n",
+ " elif organ == \"immune\":\n",
+ " organ_ids = [\"immune\", \"bone_marrow\"]\n",
+ " organ_list += [\"immune\"]\n",
+ " else:\n",
+ " organ_ids = [organ]\n",
+ " organ_list += [organ]\n",
+ " \n",
+ " def if_organ(example):\n",
+ " return example[\"organ_major\"] in organ_ids\n",
+ " trainset_organ = train_dataset.filter(if_organ, num_proc=16)\n",
+ " \n",
+ " celltype_counter = Counter(trainset_organ[\"cell_type\"])\n",
+ " total_cells = sum(celltype_counter.values())\n",
+ " cells_to_keep = [k for k, v in celltype_counter.items() if v > (0.005 * total_cells)]\n",
+ " \n",
+ " def if_not_rare_celltype(example):\n",
+ " return example[\"cell_type\"] in cells_to_keep\n",
+ " trainset_organ_subset = trainset_organ.filter(if_not_rare_celltype, num_proc=16)\n",
+ " \n",
+ " trainset_organ_shuffled = trainset_organ_subset.shuffle(seed=42)\n",
+ " trainset_organ_shuffled = trainset_organ_shuffled.rename_column(\"cell_type\", \"label\")\n",
+ " trainset_organ_shuffled = trainset_organ_shuffled.remove_columns(\"organ_major\")\n",
+ " \n",
+ " target_names = list(Counter(trainset_organ_shuffled[\"label\"]).keys())\n",
+ " target_name_id_dict = dict(zip(target_names, range(len(target_names))))\n",
+ " target_dict_list.append(target_name_id_dict)\n",
+ " \n",
+ " def classes_to_ids(example):\n",
+ " example[\"label\"] = target_name_id_dict[example[\"label\"]]\n",
+ " return example\n",
+ " labeled_trainset = trainset_organ_shuffled.map(classes_to_ids, num_proc=16)\n",
+ " \n",
+ " labeled_train_split = labeled_trainset.select(range(0, round(len(labeled_trainset) * 0.8)))\n",
+ " labeled_eval_split = labeled_trainset.select(range(round(len(labeled_trainset) * 0.8), len(labeled_trainset)))\n",
+ " \n",
+ " trained_labels = list(Counter(labeled_train_split[\"label\"]).keys())\n",
+ " def if_trained_label(example):\n",
+ " return example[\"label\"] in trained_labels\n",
+ " labeled_eval_split_subset = labeled_eval_split.filter(if_trained_label, num_proc=16)\n",
+ " \n",
+ " dataset_list.append(labeled_train_split)\n",
+ " evalset_list.append(labeled_eval_split_subset)\n",
+ "\n",
+ "trainset_dict = dict(zip(organ_list, dataset_list))\n",
+ "traintargetdict_dict = dict(zip(organ_list, target_dict_list))\n",
+ "evalset_dict = dict(zip(organ_list, evalset_list))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "===== Organ: spleen =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 12330it [00:00, 76763.11it/s]\n",
+ "padding...: 3083it [00:00, 75593.59it/s]\n",
+ "spleen models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "spleen models: 50%|█████ | 1/2 [00:00<00:00, 1.99it/s]/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.5864, Macro F1: 0.1947, Weighted F1: 0.5845\n",
+ "Training LogisticRegression...\n",
+ "LogisticRegression - Acc: 0.7415, Macro F1: 0.1419, Weighted F1: 0.6331\n",
+ "\n",
+ "===== Organ: kidney =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 35199it [00:00, 54605.10it/s]\n",
+ "padding...: 8800it [00:00, 57420.64it/s]\n",
+ "kidney models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "kidney models: 50%|█████ | 1/2 [00:01<00:01, 1.65s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.1755, Macro F1: 0.0826, Weighted F1: 0.1772\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.3287, Macro F1: 0.0713, Weighted F1: 0.2267\n",
+ "\n",
+ "===== Organ: lung =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 26098it [00:00, 63650.72it/s]\n",
+ "padding...: 6525it [00:00, 61571.18it/s]\n",
+ "lung models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "lung models: 50%|█████ | 1/2 [00:00<00:00, 1.05it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.2077, Macro F1: 0.0910, Weighted F1: 0.2066\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.3099, Macro F1: 0.0761, Weighted F1: 0.2399\n",
+ "\n",
+ "===== Organ: brain =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 10656it [00:00, 67287.79it/s]\n",
+ "padding...: 2664it [00:00, 75149.65it/s]\n",
+ "brain models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "brain models: 50%|█████ | 1/2 [00:00<00:00, 2.21it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.7459, Macro F1: 0.1863, Weighted F1: 0.7495\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.8622, Macro F1: 0.1543, Weighted F1: 0.7985\n",
+ "\n",
+ "===== Organ: placenta =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 7415it [00:00, 54391.55it/s]\n",
+ "padding...: 1854it [00:00, 57379.91it/s]\n",
+ "placenta models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "placenta models: 50%|█████ | 1/2 [00:00<00:00, 1.88it/s]/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.6009, Macro F1: 0.3471, Weighted F1: 0.5959\n",
+ "Training LogisticRegression...\n",
+ "LogisticRegression - Acc: 0.7406, Macro F1: 0.2836, Weighted F1: 0.6302\n",
+ "\n",
+ "===== Organ: immune =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 20562it [00:00, 74370.86it/s]\n",
+ "padding...: 5140it [00:00, 70895.86it/s]\n",
+ "immune models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "immune models: 50%|█████ | 1/2 [00:00<00:00, 1.25it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.2008, Macro F1: 0.1312, Weighted F1: 0.2005\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.2749, Macro F1: 0.0921, Weighted F1: 0.1488\n",
+ "\n",
+ "===== Organ: large_intestine =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 39678it [00:00, 74202.67it/s]\n",
+ "padding...: 9920it [00:00, 77582.36it/s]\n",
+ "large_intestine models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "large_intestine models: 50%|█████ | 1/2 [00:01<00:01, 1.47s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.2541, Macro F1: 0.0983, Weighted F1: 0.2556\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.3095, Macro F1: 0.0843, Weighted F1: 0.2555\n",
+ "\n",
+ "===== Organ: pancreas =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 21934it [00:00, 63776.95it/s]\n",
+ "padding...: 5484it [00:00, 71125.95it/s]\n",
+ "pancreas models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "pancreas models: 50%|█████ | 1/2 [00:00<00:00, 1.19it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.2438, Macro F1: 0.1438, Weighted F1: 0.2424\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.3485, Macro F1: 0.1330, Weighted F1: 0.2601\n",
+ "\n",
+ "===== Organ: liver =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 22427it [00:00, 64230.25it/s]\n",
+ "padding...: 5607it [00:00, 62494.75it/s]\n",
+ "liver models: 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training RandomForest...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "liver models: 50%|█████ | 1/2 [00:00<00:00, 1.26it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RandomForest - Acc: 0.2814, Macro F1: 0.1262, Weighted F1: 0.2809\n",
+ "Training LogisticRegression...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+ " warnings.warn(\n",
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LogisticRegression - Acc: 0.3512, Macro F1: 0.0738, Weighted F1: 0.2633\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ }
+ ],
+ "source": [
+ "def extract_features(dataset):\n",
+ " seqs = dataset[\"input_ids\"]\n",
+ " max_len = max(len(s) for s in seqs)\n",
+ " padded = np.zeros((len(seqs), max_len), dtype=np.int64)\n",
+ " for i, s in tqdm(enumerate(seqs), desc=\"padding...\", colour=\"blue\"):\n",
+ " padded[i, :len(s)] = s\n",
+ " X = np.mean(padded, axis=1)[:, None] # simple mean pooling\n",
+ " y = np.array(dataset[\"label\"])\n",
+ " return X, y\n",
+ "\n",
+ "results = {}\n",
+ "\n",
+ "for organ in organ_list:\n",
+ " print(f\"\\n===== Organ: {organ} =====\")\n",
+ " organ_trainset = trainset_dict[organ]\n",
+ " organ_evalset = evalset_dict[organ]\n",
+ " \n",
+ " X_train, y_train = extract_features(organ_trainset)\n",
+ " X_test, y_test = extract_features(organ_evalset)\n",
+ " \n",
+ " classifiers = {\n",
+ " \"RandomForest\": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),\n",
+ " # \"SVM\": make_pipeline(StandardScaler(), SVC(kernel=\"rbf\", probability=True, random_state=42)),\n",
+ " # multinomial is the default; passing multi_class explicitly is deprecated in sklearn 1.5\n",
+ " \"LogisticRegression\": make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))\n",
+ " }\n",
+ " \n",
+ " organ_results = {}\n",
+ " for clf_name, clf in tqdm(classifiers.items(), desc=f\"{organ} models\", leave=False):\n",
+ " print(f\"Training {clf_name}...\")\n",
+ " clf.fit(X_train, y_train)\n",
+ " preds = clf.predict(X_test)\n",
+ " acc = accuracy_score(y_test, preds)\n",
+ " macro_f1 = f1_score(y_test, preds, average=\"macro\")\n",
+ " weighted_f1 = f1_score(y_test, preds, average=\"weighted\")\n",
+ " organ_results[clf_name] = {\n",
+ " \"accuracy\": acc,\n",
+ " \"macro_f1\": macro_f1,\n",
+ " \"weighted_f1\": weighted_f1\n",
+ " }\n",
+ " print(f\"{clf_name} - Acc: {acc:.4f}, Macro F1: {macro_f1:.4f}, Weighted F1: {weighted_f1:.4f}\")\n",
+ " \n",
+ " results[organ] = organ_results\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "===== Organ: spleen =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 12330it [00:00, 74149.68it/s]\n",
+ "padding...: 3083it [00:00, 79566.32it/s]\n",
+ "spleen models: 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training SVM...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SVM - Acc: 0.7434, Macro F1: 0.1421, Weighted F1: 0.6340\n",
+ "\n",
+ "===== Organ: kidney =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 35199it [00:00, 54654.42it/s]\n",
+ "padding...: 8800it [00:00, 54786.08it/s]\n",
+ "kidney models: 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training SVM...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SVM - Acc: 0.3340, Macro F1: 0.0731, Weighted F1: 0.2334\n",
+ "\n",
+ "===== Organ: lung =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 26098it [00:00, 63652.31it/s]\n",
+ "padding...: 6525it [00:00, 63915.46it/s]\n",
+ "lung models: 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training SVM...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SVM - Acc: 0.3137, Macro F1: 0.0773, Weighted F1: 0.2438\n",
+ "\n",
+ "===== Organ: brain =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 10656it [00:00, 73057.45it/s]\n",
+ "padding...: 2664it [00:00, 75210.35it/s]\n",
+ "brain models: 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training SVM...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SVM - Acc: 0.8622, Macro F1: 0.1543, Weighted F1: 0.7985\n",
+ "\n",
+ "===== Organ: placenta =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 7415it [00:00, 54724.23it/s]\n",
+ "padding...: 1854it [00:00, 57124.05it/s]\n",
+ "placenta models: 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training SVM...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SVM - Acc: 0.7406, Macro F1: 0.2836, Weighted F1: 0.6302\n",
+ "\n",
+ "===== Organ: immune =====\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "padding...: 20562it [00:00, 74360.35it/s]\n",
+ "padding...: 5140it [00:00, 73610.91it/s]\n",
+ "immune models: 0%| | 0/1 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training SVM...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
759
+ "SVM - Acc: 0.2969, Macro F1: 0.1286, Weighted F1: 0.2058\n",
760
+ "\n",
761
+ "===== Organ: large_intestine =====\n"
762
+ ]
763
+ },
764
+ {
765
+ "name": "stderr",
766
+ "output_type": "stream",
767
+ "text": [
768
+ "padding...: 39678it [00:00, 78336.69it/s]\n",
769
+ "padding...: 9920it [00:00, 77432.63it/s]\n",
770
+ "large_intestine models: 0%| | 0/1 [00:00<?, ?it/s]"
771
+ ]
772
+ },
773
+ {
774
+ "name": "stdout",
775
+ "output_type": "stream",
776
+ "text": [
777
+ "Training SVM...\n"
778
+ ]
779
+ },
780
+ {
781
+ "name": "stderr",
782
+ "output_type": "stream",
783
+ "text": [
784
+ " "
785
+ ]
786
+ },
787
+ {
788
+ "name": "stdout",
789
+ "output_type": "stream",
790
+ "text": [
791
+ "SVM - Acc: 0.3850, Macro F1: 0.1027, Weighted F1: 0.3283\n",
792
+ "\n",
793
+ "===== Organ: pancreas =====\n"
794
+ ]
795
+ },
796
+ {
797
+ "name": "stderr",
798
+ "output_type": "stream",
799
+ "text": [
800
+ "padding...: 21934it [00:00, 76007.99it/s]\n",
801
+ "padding...: 5484it [00:00, 75661.05it/s]\n",
802
+ "pancreas models: 0%| | 0/1 [00:00<?, ?it/s]"
803
+ ]
804
+ },
805
+ {
806
+ "name": "stdout",
807
+ "output_type": "stream",
808
+ "text": [
809
+ "Training SVM...\n"
810
+ ]
811
+ },
812
+ {
813
+ "name": "stderr",
814
+ "output_type": "stream",
815
+ "text": [
816
+ " "
817
+ ]
818
+ },
819
+ {
820
+ "name": "stdout",
821
+ "output_type": "stream",
822
+ "text": [
823
+ "SVM - Acc: 0.3769, Macro F1: 0.1398, Weighted F1: 0.2843\n",
824
+ "\n",
825
+ "===== Organ: liver =====\n"
826
+ ]
827
+ },
828
+ {
829
+ "name": "stderr",
830
+ "output_type": "stream",
831
+ "text": [
832
+ "padding...: 22427it [00:00, 65347.56it/s]\n",
833
+ "padding...: 5607it [00:00, 66067.53it/s]\n",
834
+ "liver models: 0%| | 0/1 [00:00<?, ?it/s]"
835
+ ]
836
+ },
837
+ {
838
+ "name": "stdout",
839
+ "output_type": "stream",
840
+ "text": [
841
+ "Training SVM...\n"
842
+ ]
843
+ },
844
+ {
845
+ "name": "stderr",
846
+ "output_type": "stream",
847
+ "text": [
848
+ " "
849
+ ]
850
+ },
851
+ {
852
+ "name": "stdout",
853
+ "output_type": "stream",
854
+ "text": [
855
+ "SVM - Acc: 0.3820, Macro F1: 0.1061, Weighted F1: 0.3183\n"
856
+ ]
857
+ },
858
+ {
859
+ "name": "stderr",
860
+ "output_type": "stream",
861
+ "text": [
862
+ "\r"
863
+ ]
864
+ }
865
+ ],
866
+ "source": [
867
+ "def extract_features(dataset):\n",
868
+ " seqs = dataset[\"input_ids\"]\n",
869
+ " max_len = max(len(s) for s in seqs)\n",
870
+ " padded = np.zeros((len(seqs), max_len), dtype=np.int64)\n",
871
+ " for i, s in tqdm(enumerate(seqs), desc=\"padding...\", colour=\"blue\"):\n",
872
+ " padded[i, :len(s)] = s\n",
873
+ " X = np.mean(padded, axis=1)[:, None] # simple mean pooling\n",
874
+ " y = np.array(dataset[\"label\"])\n",
875
+ " return X, y\n",
876
+ "\n",
877
+ "results = {}\n",
878
+ "\n",
879
+ "for organ in organ_list:\n",
880
+ " print(f\"\\n===== Organ: {organ} =====\")\n",
881
+ " organ_trainset = trainset_dict[organ]\n",
882
+ " organ_evalset = evalset_dict[organ]\n",
883
+ " \n",
884
+ " X_train, y_train = extract_features(organ_trainset)\n",
885
+ " X_test, y_test = extract_features(organ_evalset)\n",
886
+ " \n",
887
+ " classifiers = {\n",
888
+ " # \"RandomForest\": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),\n",
889
+ " \"SVM\": make_pipeline(StandardScaler(), SVC(kernel=\"rbf\", probability=True, random_state=42)),\n",
890
+ " # \"LogisticRegression\": make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, multi_class=\"multinomial\"))\n",
891
+ " }\n",
892
+ " \n",
893
+ " organ_results = {}\n",
894
+ " for clf_name, clf in tqdm(classifiers.items(), desc=f\"{organ} models\", leave=False):\n",
895
+ " print(f\"Training {clf_name}...\")\n",
896
+ " clf.fit(X_train, y_train)\n",
897
+ " preds = clf.predict(X_test)\n",
898
+ " acc = accuracy_score(y_test, preds)\n",
899
+ " macro_f1 = f1_score(y_test, preds, average=\"macro\")\n",
900
+ " weighted_f1 = f1_score(y_test, preds, average=\"weighted\")\n",
901
+ " organ_results[clf_name] = {\n",
902
+ " \"accuracy\": acc,\n",
903
+ " \"macro_f1\": macro_f1,\n",
904
+ " \"weighted_f1\": weighted_f1\n",
905
+ " }\n",
906
+ " print(f\"{clf_name} - Acc: {acc:.4f}, Macro F1: {macro_f1:.4f}, Weighted F1: {weighted_f1:.4f}\")\n",
907
+ " \n",
908
+ " results[organ] = organ_results\n"
909
+ ]
910
+ }
911
+ ],
912
+ "metadata": {
913
+ "kernelspec": {
914
+ "display_name": "Python 3",
915
+ "language": "python",
916
+ "name": "python3"
917
+ },
918
+ "language_info": {
919
+ "codemirror_mode": {
920
+ "name": "ipython",
921
+ "version": 3
922
+ },
923
+ "file_extension": ".py",
924
+ "mimetype": "text/x-python",
925
+ "name": "python",
926
+ "nbconvert_exporter": "python",
927
+ "pygments_lexer": "ipython3",
928
+ "version": "3.11.7"
929
+ }
930
+ },
931
+ "nbformat": 4,
932
+ "nbformat_minor": 2
933
+ }
Downstream_tasks/Zero_shot_batch_effect/.DS_Store ADDED
Binary file (6.15 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/.gitignore ADDED
@@ -0,0 +1,419 @@
1
+ ## Ignore specific files from this repository; below is a long list of defaults
2
+ ## to ignore from various code editors and IDEs
3
+ # Python related
4
+ __pycache__/
5
+ *.pyc
6
+ *.pyo
7
+ *.egg-info/
8
+
9
+ # Output folder with outputs of notebooks
10
+ output/
11
+
12
+ # Data should be downloaded from Zenodo, not stored in the repository
13
+ data/
14
+
15
+ # Build directory
16
+ build/
17
+
18
+ # big model files
19
+ *.pkl
20
+ *.bin
21
+
22
+ ## Ignore Visual Studio temporary files, build results, and
23
+ ## files generated by popular Visual Studio add-ons.
24
+ ##
25
+ ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
26
+
27
+ # User-specific files
28
+ *.rsuser
29
+ *.suo
30
+ *.user
31
+ *.userosscache
32
+ *.sln.docstates
33
+
34
+ # User-specific files (MonoDevelop/Xamarin Studio)
35
+ *.userprefs
36
+
37
+ # Mono auto generated files
38
+ mono_crash.*
39
+
40
+ # Build results
41
+ [Dd]ebug/
42
+ [Dd]ebugPublic/
43
+ [Rr]elease/
44
+ [Rr]eleases/
45
+ x64/
46
+ x86/
47
+ [Ww][Ii][Nn]32/
48
+ [Aa][Rr][Mm]/
49
+ [Aa][Rr][Mm]64/
50
+ bld/
51
+ [Bb]in/
52
+ [Oo]bj/
53
+ [Ll]og/
54
+ [Ll]ogs/
55
+
56
+ # Visual Studio 2015/2017 cache/options directory
57
+ .vs/
58
+ # Uncomment if you have tasks that create the project's static files in wwwroot
59
+ #wwwroot/
60
+
61
+ # Visual Studio 2017 auto generated files
62
+ Generated\ Files/
63
+
64
+ # MSTest test Results
65
+ [Tt]est[Rr]esult*/
66
+ [Bb]uild[Ll]og.*
67
+
68
+ # NUnit
69
+ *.VisualState.xml
70
+ TestResult.xml
71
+ nunit-*.xml
72
+
73
+ # Build Results of an ATL Project
74
+ [Dd]ebugPS/
75
+ [Rr]eleasePS/
76
+ dlldata.c
77
+
78
+ # Benchmark Results
79
+ BenchmarkDotNet.Artifacts/
80
+
81
+ # .NET Core
82
+ project.lock.json
83
+ project.fragment.lock.json
84
+ artifacts/
85
+
86
+ # ASP.NET Scaffolding
87
+ ScaffoldingReadMe.txt
88
+
89
+ # StyleCop
90
+ StyleCopReport.xml
91
+
92
+ # Files built by Visual Studio
93
+ *_i.c
94
+ *_p.c
95
+ *_h.h
96
+ *.ilk
97
+ *.meta
98
+ *.obj
99
+ *.iobj
100
+ *.pch
101
+ *.pdb
102
+ *.ipdb
103
+ *.pgc
104
+ *.pgd
105
+ *.rsp
106
+ *.sbr
107
+ *.tlb
108
+ *.tli
109
+ *.tlh
110
+ *.tmp
111
+ *.tmp_proj
112
+ *_wpftmp.csproj
113
+ *.log
114
+ *.tlog
115
+ *.vspscc
116
+ *.vssscc
117
+ .builds
118
+ *.pidb
119
+ *.svclog
120
+ *.scc
121
+
122
+ # Chutzpah Test files
123
+ _Chutzpah*
124
+
125
+ # Visual C++ cache files
126
+ ipch/
127
+ *.aps
128
+ *.ncb
129
+ *.opendb
130
+ *.opensdf
131
+ *.sdf
132
+ *.cachefile
133
+ *.VC.db
134
+ *.VC.VC.opendb
135
+
136
+ # Visual Studio profiler
137
+ *.psess
138
+ *.vsp
139
+ *.vspx
140
+ *.sap
141
+
142
+ # Visual Studio Trace Files
143
+ *.e2e
144
+
145
+ # TFS 2012 Local Workspace
146
+ $tf/
147
+
148
+ # Guidance Automation Toolkit
149
+ *.gpState
150
+
151
+ # ReSharper is a .NET coding add-in
152
+ _ReSharper*/
153
+ *.[Rr]e[Ss]harper
154
+ *.DotSettings.user
155
+
156
+ # TeamCity is a build add-in
157
+ _TeamCity*
158
+
159
+ # DotCover is a Code Coverage Tool
160
+ *.dotCover
161
+
162
+ # AxoCover is a Code Coverage Tool
163
+ .axoCover/*
164
+ !.axoCover/settings.json
165
+
166
+ # Coverlet is a free, cross platform Code Coverage Tool
167
+ coverage*.json
168
+ coverage*.xml
169
+ coverage*.info
170
+
171
+ # Visual Studio code coverage results
172
+ *.coverage
173
+ *.coveragexml
174
+
175
+ # NCrunch
176
+ _NCrunch_*
177
+ .*crunch*.local.xml
178
+ nCrunchTemp_*
179
+
180
+ # MightyMoose
181
+ *.mm.*
182
+ AutoTest.Net/
183
+
184
+ # Web workbench (sass)
185
+ .sass-cache/
186
+
187
+ # Installshield output folder
188
+ [Ee]xpress/
189
+
190
+ # DocProject is a documentation generator add-in
191
+ DocProject/buildhelp/
192
+ DocProject/Help/*.HxT
193
+ DocProject/Help/*.HxC
194
+ DocProject/Help/*.hhc
195
+ DocProject/Help/*.hhk
196
+ DocProject/Help/*.hhp
197
+ DocProject/Help/Html2
198
+ DocProject/Help/html
199
+
200
+ # Click-Once directory
201
+ publish/
202
+
203
+ # Publish Web Output
204
+ *.[Pp]ublish.xml
205
+ *.azurePubxml
206
+ # Note: Comment the next line if you want to checkin your web deploy settings,
207
+ # but database connection strings (with potential passwords) will be unencrypted
208
+ *.pubxml
209
+ *.publishproj
210
+
211
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
212
+ # checkin your Azure Web App publish settings, but sensitive information contained
213
+ # in these scripts will be unencrypted
214
+ PublishScripts/
215
+
216
+ # NuGet Packages
217
+ *.nupkg
218
+ # NuGet Symbol Packages
219
+ *.snupkg
220
+ # The packages folder can be ignored because of Package Restore
221
+ **/[Pp]ackages/*
222
+ # except build/, which is used as an MSBuild target.
223
+ !**/[Pp]ackages/build/
224
+ # Uncomment if necessary however generally it will be regenerated when needed
225
+ #!**/[Pp]ackages/repositories.config
226
+ # NuGet v3's project.json files produces more ignorable files
227
+ *.nuget.props
228
+ *.nuget.targets
229
+
230
+ # Microsoft Azure Build Output
231
+ csx/
232
+ *.build.csdef
233
+
234
+ # Microsoft Azure Emulator
235
+ ecf/
236
+ rcf/
237
+
238
+ # Windows Store app package directories and files
239
+ AppPackages/
240
+ BundleArtifacts/
241
+ Package.StoreAssociation.xml
242
+ _pkginfo.txt
243
+ *.appx
244
+ *.appxbundle
245
+ *.appxupload
246
+
247
+ # Visual Studio cache files
248
+ # files ending in .cache can be ignored
249
+ *.[Cc]ache
250
+ # but keep track of directories ending in .cache
251
+ !?*.[Cc]ache/
252
+
253
+ # Others
254
+ ClientBin/
255
+ ~$*
256
+ *~
257
+ *.dbmdl
258
+ *.dbproj.schemaview
259
+ *.jfm
260
+ *.pfx
261
+ *.publishsettings
262
+ orleans.codegen.cs
263
+
264
+ # Including strong name files can present a security risk
265
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
266
+ #*.snk
267
+
268
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
269
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
270
+ #bower_components/
271
+
272
+ # RIA/Silverlight projects
273
+ Generated_Code/
274
+
275
+ # Backup & report files from converting an old project file
276
+ # to a newer Visual Studio version. Backup files are not needed,
277
+ # because we have git ;-)
278
+ _UpgradeReport_Files/
279
+ Backup*/
280
+ UpgradeLog*.XML
281
+ UpgradeLog*.htm
282
+ ServiceFabricBackup/
283
+ *.rptproj.bak
284
+
285
+ # SQL Server files
286
+ *.mdf
287
+ *.ldf
288
+ *.ndf
289
+
290
+ # Business Intelligence projects
291
+ *.rdl.data
292
+ *.bim.layout
293
+ *.bim_*.settings
294
+ *.rptproj.rsuser
295
+ *- [Bb]ackup.rdl
296
+ *- [Bb]ackup ([0-9]).rdl
297
+ *- [Bb]ackup ([0-9][0-9]).rdl
298
+
299
+ # Microsoft Fakes
300
+ FakesAssemblies/
301
+
302
+ # GhostDoc plugin setting file
303
+ *.GhostDoc.xml
304
+
305
+ # Node.js Tools for Visual Studio
306
+ .ntvs_analysis.dat
307
+ node_modules/
308
+
309
+ # Visual Studio 6 build log
310
+ *.plg
311
+
312
+ # Visual Studio 6 workspace options file
313
+ *.opt
314
+
315
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
316
+ *.vbw
317
+
318
+ # Visual Studio 6 auto-generated project file (contains which files were open etc.)
319
+ *.vbp
320
+
321
+ # Visual Studio 6 workspace and project file (working project files containing files to include in project)
322
+ *.dsw
323
+ *.dsp
324
+
325
+ # Visual Studio 6 technical files
326
+ *.ncb
327
+ *.aps
328
+
329
+ # Visual Studio LightSwitch build output
330
+ **/*.HTMLClient/GeneratedArtifacts
331
+ **/*.DesktopClient/GeneratedArtifacts
332
+ **/*.DesktopClient/ModelManifest.xml
333
+ **/*.Server/GeneratedArtifacts
334
+ **/*.Server/ModelManifest.xml
335
+ _Pvt_Extensions
336
+
337
+ # Paket dependency manager
338
+ .paket/paket.exe
339
+ paket-files/
340
+
341
+ # FAKE - F# Make
342
+ .fake/
343
+
344
+ # CodeRush personal settings
345
+ .cr/personal
346
+
347
+ # Python Tools for Visual Studio (PTVS)
348
+ __pycache__/
349
+ *.pyc
350
+
351
+ # Cake - Uncomment if you are using it
352
+ # tools/**
353
+ # !tools/packages.config
354
+
355
+ # Tabs Studio
356
+ *.tss
357
+
358
+ # Telerik's JustMock configuration file
359
+ *.jmconfig
360
+
361
+ # BizTalk build output
362
+ *.btp.cs
363
+ *.btm.cs
364
+ *.odx.cs
365
+ *.xsd.cs
366
+
367
+ # OpenCover UI analysis results
368
+ OpenCover/
369
+
370
+ # Azure Stream Analytics local run output
371
+ ASALocalRun/
372
+
373
+ # MSBuild Binary and Structured Log
374
+ *.binlog
375
+
376
+ # NVidia Nsight GPU debugger configuration file
377
+ *.nvuser
378
+
379
+ # MFractors (Xamarin productivity tool) working folder
380
+ .mfractor/
381
+
382
+ # Local History for Visual Studio
383
+ .localhistory/
384
+
385
+ # Visual Studio History (VSHistory) files
386
+ .vshistory/
387
+
388
+ # BeatPulse healthcheck temp database
389
+ healthchecksdb
390
+
391
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
392
+ MigrationBackup/
393
+
394
+ # Ionide (cross platform F# VS Code tools) working folder
395
+ .ionide/
396
+
397
+ # Fody - auto-generated XML schema
398
+ FodyWeavers.xsd
399
+
400
+ # VS Code files for those working on multiple tools
401
+ .vscode/*
402
+ !.vscode/settings.json
403
+ !.vscode/tasks.json
404
+ !.vscode/launch.json
405
+ !.vscode/extensions.json
406
+ *.code-workspace
407
+
408
+ # Local History for Visual Studio Code
409
+ .history/
410
+
411
+ # Windows Installer files from build outputs
412
+ *.cab
413
+ *.msi
414
+ *.msix
415
+ *.msm
416
+ *.msp
417
+
418
+ # JetBrains Rider
419
+ *.sln.iml
Downstream_tasks/Zero_shot_batch_effect/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
Downstream_tasks/Zero_shot_batch_effect/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Downstream_tasks/Zero_shot_batch_effect/README.md ADDED
@@ -0,0 +1,162 @@
1
+ # Foundation models in single-cell biology: evaluating zero-shot capabilities
2
+
3
+ [![DOI](https://badgen.net/badge/DOI/10.1101%2F2023.10.16.561085/red)](https://www.biorxiv.org/content/10.1101/2023.10.16.561085) [![DOI](https://badgen.net/badge/figshare/10.6084%2Fm9.figshare.24747228/green)](https://doi.org/10.6084/m9.figshare.24747228)
4
+
5
+ This repository contains the code that accompanies our paper, **Assessing the limits of zero-shot foundation models in single-cell biology**. You can find the preprint of the paper [here](https://www.biorxiv.org/content/10.1101/2023.10.16.561085).
6
+
7
+ ## Project overview
8
+
9
+ In this project, we assess two proposed foundation models in the context of single-cell RNA-seq: Geneformer ([pub](https://www.nature.com/articles/s41586-023-06139-9), [code](https://huggingface.co/ctheodoris/Geneformer)) and scGPT ([pub](https://www.biorxiv.org/content/10.1101/2023.04.30.538439v2), [code](https://github.com/bowang-lab/scGPT)). We focus on evaluating the zero-shot capabilities of these models, specifically their ability to generalize beyond their original training objectives. Our evaluation targets two main tasks: cell type clustering and batch integration. In these tasks, we compare the performance of Geneformer and scGPT against two baselines: scVI ([pub](https://www.nature.com/articles/s41592-018-0229-2), [code](https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html)) and a heuristic method that selects highly variable genes (HVGs). We also investigate the performance of the models in reconstructing the gene expression profiles of cells, comparing it against simple baselines such as the mean expression value or average ranking.
10
+
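As a minimal illustration of the HVG baseline mentioned above, here is a sketch on synthetic toy data (the actual evaluation uses the datasets from figshare and scIB metrics; names such as `n_hvg` and the data shapes are illustrative assumptions): select the top-variance genes, embed with PCA, cluster, and score agreement with the cell-type labels.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

rng = np.random.default_rng(0)

# Toy expression matrix: 300 cells x 1000 genes, 3 hypothetical cell types
labels = np.repeat([0, 1, 2], 100)
X = rng.poisson(1.0, size=(300, 1000)).astype(float)
X[:100, :30] += 5.0      # type-specific signal in a few genes per type
X[100:200, 30:60] += 5.0
X[200:, 60:90] += 5.0

# HVG baseline: keep the genes with the highest variance across cells
n_hvg = 100
hvg_idx = np.argsort(X.var(axis=0))[-n_hvg:]
X_hvg = X[:, hvg_idx]

# Embed with PCA, cluster, and score against the ground-truth cell types
emb = PCA(n_components=10, random_state=0).fit_transform(X_hvg)
clusters = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(emb)
ari = adjusted_rand_score(labels, clusters)
print(f"HVG baseline ARI on toy data: {ari:.3f}")
```

The same embed-cluster-score recipe is applied to the model embeddings in the notebooks, which is what makes the comparison against the HVG and scVI baselines direct.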
11
+ ## Dependencies
12
+
13
+ Currently, the code requires a GPU supported by FlashAttention, which scGPT needs in order to run.
14
+
15
+ GPUs supported by flash attention are:
16
+
17
+ - Ampere, Ada, or Hopper GPUs (e.g., A100, RTX 3090, RTX 4090, H100).
18
+ - Turing GPUs (T4, RTX 2080)
19
+
20
+ <details>
21
+ <summary>Packages version</summary>
22
+
23
+ This code has been tested with the following versions of the packages:
24
+
25
+ - Python - tested with `3.9`
26
+ - PyTorch - tested with - `1.13`
27
+ - CUDA - tested with `11.7`
28
+ - [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) - depends on `v1.0.4`
29
+ - [scGPT](https://github.com/bowang-lab/scGPT/tree/v0.1.6) - depends on `v0.1.6`
30
+ - [Geneformer](https://huggingface.co/ctheodoris/Geneformer/tree/5d0082c1e188ab88997efa87891414fdc6e4f6ff) - depends on commit `5d0082c`
31
+ - [scIB](https://github.com/theislab/scib/tree/v1.0.4) - tested with `v1.0.4`
32
+ - [sc_foundation_evals](https://github.com/microsoft/zero-shot-scfoundation) `v0.1.0`
33
+
34
+ </details>
35
+
36
+ ## Installation
37
+
38
+ Below you can find the instructions on how to install the dependencies for this project. We provide two options: using conda/mamba or using Docker.
39
+
40
+ <details>
41
+ <summary>Conda / Mamba</summary>
42
+
43
+ ### Conda / Mamba
44
+
45
+ You can install the dependencies using conda. To do so, you need to have conda installed on your machine. If you don't have it, you can install it from [here](https://docs.conda.io/en/latest/miniconda.html).
46
+
47
+ We recommend using [mamba](https://mamba.readthedocs.io/en/latest/user_guide/mamba.html), since it is faster in our experience. You can install mamba following the guide [here](https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html#operating-system-package-managers).
48
+
49
+ To simplify installation, we provide the installation script that creates a new conda environment with all the dependencies installed. You can run the following command to create the environment:
50
+
51
+ ```bash
52
+ bash envs/installation.sh
53
+ ```
54
+
55
+ If the installation is successful, you will see the following message:
56
+
57
+ ```console
58
+ 2024-08-22 19:49:26 SUCCESS: All packages installed successfully.
59
+ ```
60
+
61
+ And you can activate the environment by running:
62
+
63
+ ```bash
64
+ conda activate sc_foundation_evals
65
+ ```
66
+
67
+ </details>
68
+
69
+ <details>
70
+ <summary>Docker</summary>
71
+
72
+ ### Docker
73
+
74
+ The docker image is available on DockerHub [here](https://hub.docker.com/repository/docker/kzkedzierska/sc_foundation_evals/general). You can pull the image by running:
75
+
76
+ ```bash
77
+ docker pull kzkedzierska/sc_foundation_evals
78
+ ```
79
+
80
+ The image is based on the `cnstark/pytorch:1.13.0-py3.9.12-cuda11.7.1-ubuntu20.04` image, and has all the dependencies installed. The Dockerfile used to build the image can be found in the `envs/docker` directory.
81
+
82
+ You can also skip pulling the image since `docker` will pull it if needed. To run the interactive session with the image, you can use the following command:
83
+
84
+ ```bash
85
+ docker run --gpus all -it kzkedzierska/sc_foundation_evals
86
+ ```
87
+
88
+ If you want to be able to run the notebooks, run the image with the following tag:
89
+
90
+ ```bash
91
+ docker run --gpus all -it --rm -p 8888:8888 -v ./:/workspace kzkedzierska/sc_foundation_evals:latest_notebook
92
+ ```
93
+
94
+ And open the link provided in the terminal in your browser. It should look like this:
95
+
96
+ ```console
97
+ [I 2024-08-23 22:15:13.015 ServerApp] Serving notebooks from local directory: /workspace
98
+ [I 2024-08-23 22:15:13.015 ServerApp] Jupyter Server 2.14.2 is running at:
99
+ [I 2024-08-23 22:15:13.015 ServerApp] http://localhost:8888/tree
100
+ [I 2024-08-23 22:15:13.015 ServerApp] http://127.0.0.1:8888/tree
101
+ ```
102
+
103
+ When running the command on a server, consult your provider's documentation on how to forward the ports properly.
104
+
105
+ </details>
106
+
107
+ ## Running the code
108
+
109
+ ### Downloading the weights
110
+
111
+ To run the notebooks, you also need to have the model weights downloaded. scGPT weights are available [here](https://github.com/bowang-lab/scGPT#pretrained-scgpt-model-zoo) and Geneformer weights are available in its repository. As per the instructions in the Geneformer repository, make sure you have `git lfs` installed before downloading the weights via repository cloning.
112
+
113
+ ### Copying this repository
114
+
115
+ To run the code, you need to clone this repository.
116
+
117
+ ```bash
118
+ git clone https://github.com/microsoft/zero-shot-scfoundation
119
+ ```
120
+
121
+ And download and unpack the data, stored at figshare (see [here](https://doi.org/10.6084/m9.figshare.24747228) for more details).
122
+
123
+ ```bash
124
+ cd zero-shot-scfoundation
125
+ # download and unpack the data
126
+ wget https://figshare.com/ndownloader/files/43480497 -O data.zip
127
+ unzip data.zip && rm data.zip
128
+ ```
129
+
130
+ ### Notebooks
131
+
132
+ To best understand the code and its organization, please have a look at the notebooks. The `notebooks` directory currently contains the following notebooks:
133
+
134
+ - [scGPT_zero_shot](notebooks/scGPT_zero_shot.ipynb) - notebook for running scGPT zero-shot evaluation
135
+ - [Geneformer_zero_shot](notebooks/Geneformer_zero_shot.ipynb) - notebook for running Geneformer zero-shot evaluation
136
+ - [Baselines_HVG_and_scVI](notebooks/Baselines_HVG_and_scVI.ipynb) - notebook for running the baselines used in the paper, i.e. HVG and scVI.
137
+
138
+ ## Any questions?
139
+
140
+ If you have any questions, or find any issues with the code, please open an issue in this repository. You can find more information on how to file an issue [here](/SUPPORT.md). We also welcome any contributions to the code - be sure to check out the **Contributing** section below.
141
+
142
+ ## Contributing
143
+
144
+ This project welcomes contributions and suggestions. Most contributions require you to agree to a
145
+ Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
146
+ the rights to use your contribution. For details, visit <https://cla.opensource.microsoft.com>.
147
+
148
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide
149
+ a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
150
+ provided by the bot. You will only need to do this once across all repos using our CLA.
151
+
152
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
153
+ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
154
+ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
155
+
156
+ ## Trademarks
157
+
158
+ This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
159
+ trademarks or logos is subject to and must follow
160
+ [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
161
+ Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
162
+ Any use of third-party trademarks or logos are subject to those third-party's policies.
Downstream_tasks/Zero_shot_batch_effect/SECURITY.md ADDED
@@ -0,0 +1,41 @@
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
Downstream_tasks/Zero_shot_batch_effect/SUPPORT.md ADDED
@@ -0,0 +1,16 @@
+ # Support
+
+ ## How to file issues and get help
+
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
+ issues before filing new issues to avoid duplicates. For new issues, file your bug, ask a question
+ or request a feature as a new Issue.
+
+ If you face an issue with installation or running the code, on top of the error message please describe
+ your environment well (which operating system you use; if you use conda or a virtual environment,
+ please list what versions of the packages are installed and available in your PATH at the time of
+ running the code). We will try to respond and help.
+
+ ## Microsoft Support Policy
+
+ Support for this PROJECT is limited to the resources listed above.
Downstream_tasks/Zero_shot_batch_effect/envs/conda_env.yml ADDED
@@ -0,0 +1,21 @@
+ name: sc_foundation_evals
+ channels:
+   - nvidia/label/cuda-11.7.0
+   - conda-forge
+   - bioconda
+   - defaults
+ dependencies:
+   - python=3.10
+   - cudatoolkit
+   - r-base=4.2.3
+   - ninja
+   - rpy2
+   - packaging
+   - gxx=11.4
+   - git-lfs
+   - pip>=21.1
+   - pip:
+     - --index-url https://download.pytorch.org/whl/cu117
+     - torch==1.13
+     - torchvision
+     - torchaudio
Downstream_tasks/Zero_shot_batch_effect/envs/docker/base_image/Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM cnstark/pytorch:1.13.0-py3.9.12-cuda11.7.1-ubuntu20.04
+
+ # NAME sc_foundation_evals
+
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ RUN apt-get update && apt-get install -y wget git git-lfs && \
+     wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb && \
+     dpkg -i cuda-keyring_1.1-1_all.deb && \
+     rm cuda-keyring_1.1-1_all.deb && \
+     apt-get update && \
+     echo "tzdata tzdata/Areas select Europe" > /tmp/preseed.txt; \
+     echo "tzdata tzdata/Zones/Europe select Warsaw" >> /tmp/preseed.txt; \
+     debconf-set-selections /tmp/preseed.txt && \
+     apt-get install -y cuda-toolkit-11-7 && \
+     apt-get install -y r-base && \
+     apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+ ENV PATH=/usr/local/cuda-11.7/bin${PATH:+:${PATH}}
+ ENV LD_LIBRARY_PATH=/usr/local/cuda-11.7/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+ RUN pip install packaging && \
+     pip install flash-attn==1.0.4 --no-build-isolation
+
+ RUN pip install scib[kBET,rpy2] colorlog PyComplexHeatmap wandb && \
+     pip install git+https://github.com/bowang-lab/scGPT.git@v0.1.6 && \
+     pip install git+https://huggingface.co/ctheodoris/Geneformer.git@5d0082c1e188ab88997efa87891414fdc6e4f6ff && \
+     pip install git+https://github.com/microsoft/zero-shot-scfoundation.git@v0.1.0
Downstream_tasks/Zero_shot_batch_effect/envs/docker/base_image/test.py ADDED
@@ -0,0 +1,66 @@
+ #! /usr/bin/env python
+
+ try:
+     from sc_foundation_evals.helpers.custom_logging import log
+ except ImportError:
+     import logging
+
+     logging.basicConfig(level=logging.DEBUG)
+     log = logging.getLogger(__name__)
+     msg = "Cannot load sc_foundation_evals custom logging module. Exiting..."
+     log.error(msg)
+     raise ImportError(msg)
+
+ log.info("Hello from the test script! This is to test the build process.")
+
+
+ def import_package(package_name):
+     """
+     Try to import a package and return the package if successful.
+     Logs and raises an error if the package is not available.
+     """
+     try:
+         package = __import__(package_name)
+         version = getattr(package, "__version__", None)
+         log.info(
+             f"Successfully imported {package_name}. "
+             f"Version: {version if version else 'unknown'}"
+         )
+         return package
+
+     except ImportError as e:
+         msg = f"Could not import required package: {package_name}"
+         log.error(f"{msg}: {e}")
+         raise ImportError(msg)
+
+
+ def test_cuda_availability():
+     """
+     Check if CUDA is available and log the result.
+     """
+     torch = import_package("torch")
+     if torch.cuda.is_available():
+         log.info("Success -- CUDA is available!")
+     else:
+         log.error(
+             "CUDA is not available. Please check your system configuration."
+         )
+
+
+ def main():
+     try:
+         log.debug("Testing CUDA availability...")
+         test_cuda_availability()
+         log.debug("Testing loading scGPT...")
+         import_package("scgpt")
+         log.debug("Testing loading Geneformer...")
+         import_package("geneformer")
+         log.info("All tests passed successfully! :)")
+
+     except Exception as e:
+         log.error(f"An error occurred during the testing process: {e}")
+         raise
+
+
+ if __name__ == "__main__":
+     main()
Downstream_tasks/Zero_shot_batch_effect/envs/docker/base_image/test_docker.sh ADDED
@@ -0,0 +1,13 @@
+ #! /bin/bash
+
+ # This script is used to test the docker image built by the Dockerfile in the same directory.
+ # The docker image is built by the following command:
+ # docker build -t kzkedzierska/sc_foundation_evals[:tag] .
+
+ # The script runs the docker image and executes the test.py script in the container.
+ # test.py checks that CUDA is available and that the scgpt and geneformer packages can be imported.
+
+ docker run \
+   --gpus all \
+   -v "$(pwd)":/workspace kzkedzierska/sc_foundation_evals \
+   python test.py
Downstream_tasks/Zero_shot_batch_effect/envs/docker/jupyter/Dockerfile ADDED
@@ -0,0 +1,12 @@
+ FROM kzkedzierska/sc_foundation_evals:latest
+
+ # Install Jupyter Notebook
+ RUN pip install notebook
+
+ WORKDIR /workspace
+
+ # Expose the port Jupyter will run on
+ EXPOSE 8888
+
+ # Set the default command to run when starting the container
+ CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"]
Downstream_tasks/Zero_shot_batch_effect/envs/installation.sh ADDED
@@ -0,0 +1,85 @@
+ #! /bin/bash
+ # exit on error
+ set -e
+
+ _script_name=$(basename "$0")
+
+ ENV_NAME="sc_foundation_evals"
+
+ warning() {
+   yellow='\033[0;33m'
+   nc='\033[0m'
+   echo -e "${yellow}$(date '+%Y-%m-%d %H:%M:%S') WARNING: $@${nc}" 1>&2
+ }
+
+ success() {
+   green='\033[0;32m'
+   nc='\033[0m'
+   echo -e "${green}$(date '+%Y-%m-%d %H:%M:%S') SUCCESS: $@${nc}"
+ }
+
+ error() {
+   red='\033[0;31m'
+   nc='\033[0m'
+   echo -e "${red}$(date '+%Y-%m-%d %H:%M:%S') ERROR: $@${nc}" 1>&2
+   usage_and_exit 1
+ }
+
+ msg() {
+   echo -e "$(date '+%Y-%m-%d %H:%M:%S') INFO: $@"
+ }
+
+ usage() {
+   echo -e "
+
+ USAGE: bash ${_script_name}
+
+ Script to install the package and set up the Conda environment.
+
+ EXAMPLES:
+ Install the package and set up the Conda environment:
+ bash ${_script_name}
+ "
+ }
+
+ usage_and_exit() {
+   usage
+   exit $1
+ }
+
+ # if mamba is available, use it
+ if command -v mamba &>/dev/null; then
+   conda_cli=mamba
+ else
+   conda_cli=conda
+ fi
+ msg "Using '${conda_cli}' as the Conda CLI."
+
+ ${conda_cli} env create -f envs/conda_env.yml -n ${ENV_NAME} ||
+   error "Failed to create the Conda environment '${ENV_NAME}'."
+ success "Conda environment '${ENV_NAME}' created successfully."
+
+ ${conda_cli} run \
+   -n ${ENV_NAME} pip install flash-attn==1.0.4 --no-build-isolation
+ success "Flash attention installed successfully."
+
+ ${conda_cli} run \
+   -n ${ENV_NAME} pip install 'setuptools>=65.2' wandb colorlog \
+   PyComplexHeatmap scib[kBET,rpy2]==1.0.4 ||
+   error "Failed to install wandb, colorlog, PyComplexHeatmap or scib."
+
+ ${conda_cli} run \
+   -n ${ENV_NAME} pip install git+https://github.com/bowang-lab/scGPT.git@v0.1.6 ||
+   error "Failed to install scGPT."
+
+ ${conda_cli} run \
+   -n ${ENV_NAME} pip install \
+   git+https://huggingface.co/ctheodoris/Geneformer.git@5d0082c1e188ab88997efa87891414fdc6e4f6ff ||
+   error "Failed to install Geneformer."
+
+ ${conda_cli} run \
+   -n ${ENV_NAME} pip install git+https://github.com/microsoft/zero-shot-scfoundation ||
+   error "Failed to install sc_foundation_evals."
+
+ success "All packages installed successfully."
Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_Geneformer.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_HVG_and_scVI.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_evaluation_aggregated.ipynb ADDED
@@ -0,0 +1,1058 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Geneformer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import logging\n",
+ "import warnings\n",
+ "import sys\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
+ "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+ "\n",
+ "from sc_foundation_evals import geneformer_forward as gf\n",
+ "from sc_foundation_evals import data, cell_embeddings, model_output\n",
+ "from sc_foundation_evals.helpers.custom_logging import log\n",
+ "log.setLevel(logging.INFO)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "geneformer_data = \"model path\"\n",
+ "# path to the pre-trained model, can work with the huggingface model hub\n",
+ "# e.g. ctheodoris/Geneformer\n",
+ "model_dir = os.path.join(geneformer_data)\n",
+ "# path to dictionaries in geneformer repo\n",
+ "dict_dir = \"Pretrain_data/\"\n",
+ "\n",
+ "# batch_size depends on available GPU memory\n",
+ "batch_size = 24\n",
+ "# output_dir is the path to which the results should be saved\n",
+ "output_dir = \"zero_shot_results/\"\n",
+ "# path to where we will store the embeddings and other evaluation outputs\n",
+ "model_out = os.path.join(output_dir, \"model_outputs\")\n",
+ "# if you can use multithreading specify num_workers; -1 means use all available\n",
+ "num_workers = -1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# specify the path to the anndata object\n",
+ "in_dataset_path = \"Zero_shot_batch_data/pbmc.h5ad\"\n",
+ "# dataset_name is inferred from in_dataset_path\n",
+ "dataset_name = os.path.basename(in_dataset_path).split(\".\")[0]\n",
+ "# specify the path for the output of the pre-processing\n",
+ "preprocessed_path = f\"zero_shot_preprocess/{dataset_name}/\"\n",
+ "# create the preprocessed path if it does not exist\n",
+ "os.makedirs(preprocessed_path, exist_ok=True)\n",
+ "# in which column in adata.var are gene names stored? if they are in the index, the index will be copied to a column with this name\n",
+ "gene_col = \"gene_symbols\"\n",
+ "# batch column found in adata.obs\n",
+ "batch_col = \"batch\"\n",
+ "# where are labels stored in adata.obs?\n",
+ "label_col = \"celltype\"  # \"str_labels\"\n",
+ "# where are the raw counts stored?\n",
+ "layer_key = \"counts\"  # \"X\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "geneform = gf.Geneformer_instance(save_dir = output_dir,\n",
+ "                                  saved_model_path = model_dir,\n",
+ "                                  explicit_save_dir = True,\n",
+ "                                  num_workers = num_workers)\n",
+ "\n",
+ "geneform.load_pretrained_model()\n",
+ "geneform.load_vocab(dict_dir)\n",
+ "# input_data = data.InputData(adata_dataset_path = in_dataset_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# input_data.preprocess_data(gene_col = gene_col,\n",
+ "#                            model_type = \"geneformer\",\n",
+ "#                            save_ext = \"loom\",\n",
+ "#                            gene_name_id_dict = geneform.gene_name_id,\n",
+ "#                            preprocessed_path = preprocessed_path)\n",
+ "\n",
+ "# geneform.tokenize_data(adata_path = os.path.join(preprocessed_path,\n",
+ "#                                                  f\"{dataset_name}.loom\"),\n",
+ "#                        dataset_path = preprocessed_path,\n",
+ "#                        cell_type_col = label_col)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[32mINFO \u001b[0m | 2025-07-17 11:57:03 | \u001b[32mLoading data from /ibex/user/chenj0i/Geneformer/zero_shot_preprocess/pbmc/pbmc.loom\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "geneform.load_tokenized_dataset(os.path.join(preprocessed_path, f\"{dataset_name}.dataset\"))\n",
+ "input_data = data.InputData(adata_dataset_path = os.path.join(preprocessed_path, f\"{dataset_name}.loom\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Embeddings extraction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c89a344776044c95bfef70882ebd4ff8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Geneformer (extracting embeddings): 0%| | 0/500 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "geneform.extract_embeddings(data = input_data,\n",
+ "                            batch_size = batch_size,\n",
+ "                            layer = -2\n",
+ "                            # layer = -1\n",
+ "                            # layer = 0\n",
+ "                            )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 11990 × 3226\n",
+ "    obs: 'adata_order', 'batch', 'celltype', 'labels', 'n_counts', 'n_genes', 'n_genes_by_counts', 'obs_names', 'str_labels', 'total_counts'\n",
+ "    var: 'ensembl_id', 'gene_symbols', 'has_ensembl_match', 'mean_counts', 'n_cells', 'n_cells_by_counts', 'n_counts', 'n_counts-0', 'n_counts-1', 'pct_dropout_by_counts', 'total_counts', 'var_names'\n",
+ "    obsm: 'geneformer'"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "input_data.adata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import Dict, Optional\n",
+ "import numpy as np\n",
+ "import scanpy as sc\n",
+ "import scib\n",
+ "from anndata import AnnData\n",
+ "from sklearn.metrics import silhouette_score\n",
+ "from tqdm import tqdm\n",
+ "import pandas as pd\n",
+ "import logging\n",
+ "\n",
+ "log = logging.getLogger(__name__)\n",
+ "\n",
+ "\n",
+ "def eval_clustering_metrics(\n",
+ "    adata: AnnData,\n",
+ "    batch_key: Optional[str] = \"str_batch\",\n",
+ "    label_key: str = \"cell_type\",\n",
+ "    embedding_key: str = \"X\",  # \"X\" for raw, or embedding key in .obsm\n",
+ "    resolutions: Optional[list] = None,\n",
+ "    use_progress_bar: bool = True,\n",
+ "    verbose: bool = False,\n",
+ "    subsample_frac: Optional[float] = 0.25,\n",
+ ") -> Dict[str, float]:\n",
+ "    \"\"\"Evaluate biological and batch mixing metrics on an embedding or raw expression.\"\"\"\n",
+ "\n",
+ "    results_dict = {}\n",
+ "\n",
+ "    if subsample_frac is not None and 0 < subsample_frac < 1:\n",
+ "        adata = adata.copy()\n",
+ "        sc.pp.subsample(adata, fraction=subsample_frac, copy=False)\n",
+ "        if verbose:\n",
+ "            log.info(f\"Subsampled adata to {subsample_frac * 100:.1f}% of original cells.\")\n",
+ "\n",
+ "    # Determine whether to use .X or .obsm[embedding_key]\n",
+ "    if embedding_key == \"X\":\n",
+ "        use_rep = \"X\"\n",
+ "        adata.obsm[\"X\"] = adata.X\n",
+ "    elif embedding_key in adata.obsm:\n",
+ "        use_rep = embedding_key\n",
+ "    else:\n",
+ "        raise ValueError(f\"embedding_key '{embedding_key}' not found in adata.obsm or is not 'X'\")\n",
+ "\n",
+ "    # Clear stale neighbors\n",
+ "    if \"neighbors\" in adata.uns:\n",
+ "        if verbose:\n",
+ "            log.warning(\"Removing stale neighbors computed from other representations.\")\n",
+ "        adata.uns.pop(\"neighbors\", None)\n",
+ "\n",
+ "    sc.pp.neighbors(adata, use_rep=use_rep)\n",
+ "\n",
+ "    # Run Louvain across multiple resolutions\n",
+ "    if resolutions is None:\n",
+ "        resolutions = [2 * i / 20 for i in range(1, 21)]  # Default: 20 steps from 0.1 to 2.0\n",
+ "        # resolutions = [4 * i / 40 for i in range(1, 41)]  # 40 steps from 0.1 to 4.0\n",
+ "\n",
+ "    best_nmi = -1\n",
+ "    best_res = None\n",
+ "    best_clustering = None\n",
+ "\n",
+ "    if verbose:\n",
+ "        log.info(f\"Searching for optimal clustering resolution on {use_rep}...\")\n",
+ "\n",
+ "    for res in tqdm(resolutions, disable=not use_progress_bar, desc=\"Louvain clustering\"):\n",
+ "        sc.tl.louvain(adata, resolution=res, key_added=\"temp_cluster\")\n",
+ "        nmi = scib.metrics.nmi(adata, \"temp_cluster\", label_key)\n",
+ "        if nmi > best_nmi:\n",
+ "            best_nmi = nmi\n",
+ "            best_res = res\n",
+ "            best_clustering = adata.obs[\"temp_cluster\"].copy()\n",
+ "        del adata.obs[\"temp_cluster\"]\n",
+ "\n",
+ "    if verbose:\n",
+ "        log.info(f\"Best resolution: {best_res:.2f} with NMI = {best_nmi:.4f}\")\n",
+ "\n",
+ "    adata.obs[\"cluster\"] = best_clustering\n",
+ "\n",
+ "    # Biological conservation metrics\n",
+ "    results_dict[\"NMI_cluster/label\"] = scib.metrics.nmi(adata, \"cluster\", label_key, \"arithmetic\")\n",
+ "    results_dict[\"ARI_cluster/label\"] = scib.metrics.ari(adata, \"cluster\", label_key)\n",
+ "    results_dict[\"ASW_label\"] = scib.metrics.silhouette(adata, label_key, use_rep, \"euclidean\")\n",
+ "\n",
+ "    # Batch effect metrics (if batch_key valid)\n",
+ "    if batch_key is not None and batch_key in adata.obs and adata.obs[batch_key].nunique() > 1:\n",
+ "        adata.obs[label_key] = adata.obs[label_key].astype(\"category\")\n",
+ "        results_dict[\"graph_conn\"] = scib.metrics.graph_connectivity(adata, label_key)\n",
+ "        results_dict[\"ASW_batch\"] = scib.metrics.silhouette(adata, batch_key, use_rep, \"euclidean\")\n",
+ "        results_dict[\"ASW_label/batch\"] = scib.metrics.silhouette_batch(\n",
+ "            adata, batch_key, label_key, embed=use_rep, metric=\"euclidean\", return_all=False\n",
+ "        )\n",
+ "        results_dict[\"PCR_batch\"] = scib.metrics.pcr(\n",
+ "            adata, covariate=batch_key, embed=use_rep, recompute_pca=True, n_comps=50, verbose=False\n",
+ "        )\n",
+ "        results_dict[\"Average_Batch_Score\"] = (\n",
+ "            results_dict[\"ASW_batch\"] + results_dict[\"PCR_batch\"]\n",
+ "        ) / 2\n",
+ "    else:\n",
+ "        if verbose:\n",
+ "            log.info(\"Skipping batch metrics — only one batch present or invalid batch_key.\")\n",
+ "\n",
+ "    results_dict[\"avg_bio\"] = np.mean([\n",
+ "        results_dict[\"NMI_cluster/label\"],\n",
+ "        results_dict[\"ARI_cluster/label\"],\n",
+ "        results_dict[\"ASW_label\"]\n",
+ "    ])\n",
+ "\n",
+ "    # Filter NaNs\n",
+ "    results_dict = {k: v for k, v in results_dict.items() if not np.isnan(v)}\n",
+ "\n",
+ "    return results_dict\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Embeddings metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Louvain clustering: 100%|██████████| 20/20 [00:02<00:00, 7.68it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mean silhouette per group: silhouette_score\n",
+ "group \n",
+ "B cells 0.990590\n",
+ "CD14+ Monocytes 0.979706\n",
+ "CD4 T cells 0.987594\n",
+ "CD8 T cells 0.991305\n",
+ "Dendritic Cells 0.958009\n",
+ "FCGR3A+ Monocytes 0.990665\n",
+ "Megakaryocytes 0.857295\n",
+ "NK cells 0.977292\n",
+ "Other 0.933587\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'NMI_cluster/label': 0.6061048617613637,\n",
+ " 'ARI_cluster/label': 0.503784927975462,\n",
+ " 'ASW_label': 0.510432125069201,\n",
+ " 'graph_conn': 0.8852579724762832,\n",
+ " 'ASW_batch': 0.5012279110960662,\n",
+ " 'ASW_label/batch': 0.9628935503212096,\n",
+ " 'PCR_batch': 0.0007131078007747846,\n",
+ " 'Average_Batch_Score': 0.25097050944842053,\n",
+ " 'avg_bio': 0.5401073049353422}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results_dict = eval_clustering_metrics(adata=input_data.adata,\n",
+ "                                       batch_key=\"batch\",\n",
+ "                                       label_key=\"celltype\",\n",
+ "                                       embedding_key=\"geneformer\",  # or \"X_scGPT\", etc.\n",
+ "                                       verbose=True)\n",
+ "results_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "12c31089634046939fc59c2ef27adb59",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/2 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Rank-Geneformer\n",
+ "0 0.805556\n"
+ ]
+ }
+ ],
+ "source": [
+ "from scGraph import scGraph\n",
+ "\n",
+ "scg = scGraph(adata=input_data.adata, batch_key=\"batch\", label_key=\"celltype\",\n",
+ "              trim_rate=0.05, thres_batch=1, thres_celltype=1)\n",
+ "scg.preprocess()\n",
+ "scg.compute()\n",
+ "results = scg.evaluate()\n",
+ "print(results)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# OOD Dataset raw metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import scanpy as sc\n",
+ "\n",
+ "# cdata = sc.read_h5ad(\"zero_shot_data/ood_celltype_data1_expand.h5ad\")\n",
+ "# adata = cdata.copy()\n",
+ "# sc.pp.subsample(adata, fraction=0.05, copy=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# use_rep = \"X\"\n",
+ "# adata.obsm[\"X\"] = adata.X\n",
+ "# adata.uns.pop(\"neighbors\", None)\n",
+ "\n",
+ "# sc.pp.neighbors(adata, use_rep=use_rep)\n",
+ "# resolutions = [2 * i / 20 for i in range(1, 21)]  # Default: 20 steps from 0.1 to 2.0\n",
+ "# best_nmi = -1\n",
+ "# best_res = None\n",
+ "# best_clustering = None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Louvain clustering: 100%|██████████| 20/20 [00:22<00:00, 1.14s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mean silhouette per group: silhouette_score\n",
+ "group \n",
+ "CL:0000077 0.951371\n",
+ "CL:0000091 0.905183\n",
+ "CL:0000099 0.856871\n",
+ "CL:0000164 0.913159\n",
+ "CL:0000189 0.934462\n",
+ "CL:0000312 0.933951\n",
+ "CL:0000453 0.966310\n",
+ "CL:0000575 0.779139\n",
+ "CL:0000750 0.991985\n",
+ "CL:0000767 0.977141\n",
+ "CL:0000771 0.893556\n",
+ "CL:0000776 0.932994\n",
+ "CL:0000810 0.913306\n",
+ "CL:0000817 0.931130\n",
+ "CL:0000837 0.967683\n",
+ "CL:0000843 0.948814\n",
+ "CL:0000861 0.841148\n",
+ "CL:0000915 0.945803\n",
+ "CL:0000957 0.970545\n",
+ "CL:0001029 0.950351\n",
+ "CL:0001057 0.946863\n",
+ "CL:0001074 0.936960\n",
+ "CL:0002028 0.935891\n",
+ "CL:0002045 0.950375\n",
+ "CL:0002064 0.926107\n",
+ "CL:0002075 0.759782\n",
+ "CL:0002201 0.973459\n",
+ "CL:0002393 0.966944\n",
+ "CL:0002518 0.911847\n",
+ "CL:0005012 0.961174\n",
+ "CL:0009009 0.957441\n",
+ "CL:0009010 0.933421\n",
+ "CL:0009017 0.952055\n",
+ "CL:0009042 0.943946\n",
+ "CL:0009095 0.863287\n",
+ "CL:0011024 0.925223\n",
+ "CL:0017000 0.943662\n",
+ "CL:1000398 0.954797\n",
+ "CL:1000487 0.973023\n",
+ "CL:1000488 0.950142\n",
+ "CL:1001432 0.984860\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'NMI_cluster/label': 0.7833172618112929,\n",
+ " 'ARI_cluster/label': 0.5728303202672791,\n",
+ " 'ASW_label': 0.4911566338564166,\n",
+ " 'graph_conn': 0.7769019941103583,\n",
+ " 'ASW_batch': 0.5006964505924973,\n",
+ " 'ASW_label/batch': 0.9306380360099057,\n",
+ " 'PCR_batch': 0.757978241899424,\n",
+ " 'Average_Batch_Score': 0.6293373462459606,\n",
+ " 'avg_bio': 0.6157680719783295}"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# label_key = \"celltype\"\n",
+ "# results_dict = {}\n",
+ "# for res in tqdm(resolutions, disable=not True, desc=\"Louvain clustering\"):\n",
+ "#     sc.tl.louvain(adata, resolution=res, key_added=\"temp_cluster\")\n",
+ "#     nmi = scib.metrics.nmi(adata, \"temp_cluster\", label_key)\n",
+ "#     if nmi > best_nmi:\n",
+ "#         best_nmi = nmi\n",
+ "#         best_res = res\n",
+ "#         best_clustering = adata.obs[\"temp_cluster\"].copy()\n",
+ "#     del adata.obs[\"temp_cluster\"]\n",
+ "\n",
+ "# adata.obs[\"cluster\"] = best_clustering\n",
+ "# # Biological conservation metrics\n",
+ "# results_dict[\"NMI_cluster/label\"] = scib.metrics.nmi(adata, \"cluster\", label_key, \"arithmetic\")\n",
+ "# results_dict[\"ARI_cluster/label\"] = scib.metrics.ari(adata, \"cluster\", label_key)\n",
+ "# results_dict[\"ASW_label\"] = scib.metrics.silhouette(adata, label_key, use_rep, \"euclidean\")\n",
+ "\n",
+ "# # Batch effect metrics (if batch_key valid)\n",
+ "# batch_key = \"batch\"\n",
+ "# if batch_key is not None and batch_key in adata.obs and adata.obs[batch_key].nunique() > 1:\n",
+ "#     adata.obs[label_key] = adata.obs[label_key].astype(\"category\")\n",
+ "#     results_dict[\"graph_conn\"] = scib.metrics.graph_connectivity(adata, label_key)\n",
+ "#     results_dict[\"ASW_batch\"] = (1 - scib.metrics.silhouette(adata, batch_key, use_rep, \"euclidean\"))\n",
+ "#     results_dict[\"ASW_label/batch\"] = scib.metrics.silhouette_batch(\n",
+ "#         adata, batch_key, label_key, embed=use_rep, metric=\"euclidean\", return_all=False\n",
+ "#     )\n",
+ "#     results_dict[\"PCR_batch\"] = scib.metrics.pcr(\n",
+ "#         adata, covariate=batch_key, embed=use_rep, recompute_pca=True, n_comps=50, verbose=False\n",
+ "#     )\n",
+ "#     results_dict[\"Average_Batch_Score\"] = (\n",
+ "#         results_dict[\"ASW_batch\"] + results_dict[\"PCR_batch\"]\n",
+ "#     ) / 2\n",
+ "# else:\n",
+ "#     if verbose:\n",
+ "#         log.info(\"Skipping batch metrics — only one batch present or invalid batch_key.\")\n",
+ "\n",
+ "# results_dict[\"avg_bio\"] = np.mean([\n",
+ "#     results_dict[\"NMI_cluster/label\"],\n",
+ "#     results_dict[\"ARI_cluster/label\"],\n",
+ "#     results_dict[\"ASW_label\"]\n",
+ "# ])\n",
+ "\n",
+ "# # Filter NaNs\n",
+ "# results_dict = {k: v for k, v in results_dict.items() if not np.isnan(v)}\n",
+ "\n",
+ "# results_dict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Raw data metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Louvain clustering: 100%|██████████| 20/20 [00:02<00:00, 6.97it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mean silhouette per group: silhouette_score\n",
+ "group \n",
+ "B cells 0.971033\n",
+ "CD14+ Monocytes 0.942456\n",
+ "CD4 T cells 0.988742\n",
+ "CD8 T cells 0.987412\n",
+ "Dendritic Cells 0.938792\n",
+ "FCGR3A+ Monocytes 0.950513\n",
+ "Megakaryocytes 0.752894\n",
+ "NK cells 0.890206\n",
+ "Other 0.914109\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'NMI_cluster/label': 0.6505152890434263,\n",
+ " 'ARI_cluster/label': 0.5759899223104351,\n",
+ " 'ASW_label': 0.5245759263634682,\n",
+ " 'graph_conn': 0.8891452955038966,\n",
+ " 'ASW_batch': 0.4964794989209622,\n",
+ " 'ASW_label/batch': 0.9262396008669715,\n",
+ " 'PCR_batch': 0.0007824623021499673,\n",
+ " 'Average_Batch_Score': 0.24863098061155608,\n",
+ " 'avg_bio': 0.5836937125724432}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results_dict_raw = eval_clustering_metrics(adata=input_data.adata,\n",
+ "                                           batch_key=\"batch\",\n",
+ "                                           label_key=\"celltype\",\n",
+ "                                           embedding_key=\"X\",  # or \"X_scGPT\", etc.\n",
+ "                                           verbose=True)\n",
+ "results_dict_raw"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scGraph import scGraph\n",
+ "\n",
+ "scg = scGraph(adata=input_data.adata, batch_key=\"batch\", label_key=\"celltype\",\n",
+ "              trim_rate=0.05, thres_batch=1, thres_celltype=1, embedding_key=\"X\")\n",
+ "scg.preprocess()\n",
+ "scg.compute()\n",
+ "results = scg.evaluate()\n",
+ "print(results)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# HVG & scVI"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## HVG"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
686
+ "outputs": [],
687
+ "source": [
688
+ "import os\n",
689
+ "import logging\n",
690
+ "\n",
691
+ "import numpy as np\n",
692
+ "import pandas as pd\n",
693
+ "import scanpy as sc\n",
694
+ "from scipy import sparse\n",
695
+ "import scvi\n",
696
+ "\n",
697
+ "import sys\n",
698
+ "sys.path.append(\"zero_shot_batch_effect\")\n",
699
+ "from sc_foundation_evals import utils\n",
700
+ "from sc_foundation_evals.helpers.custom_logging import log\n",
701
+ "\n",
702
+ "log.setLevel(logging.INFO)\n",
703
+ "\n",
704
+ "import warnings\n",
705
+ "os.environ[\"KMP_WARNINGS\"] = \"off\"\n",
706
+ "warnings.filterwarnings(\"ignore\")"
707
+ ]
708
+ },
709
+ {
710
+ "cell_type": "code",
711
+ "execution_count": null,
712
+ "metadata": {},
713
+ "outputs": [
714
+ {
715
+ "data": {
716
+ "text/plain": [
717
+ "AnnData object with n_obs × n_vars = 11990 × 3346\n",
718
+ " obs: 'n_counts', 'batch', 'labels', 'str_labels', 'celltype'\n",
719
+ " var: 'gene_symbols', 'n_counts-0', 'n_counts-1', 'n_counts'\n",
720
+ " uns: 'cell_types'\n",
721
+ " obsm: 'design', 'normalized_qc', 'qc_pc', 'raw_qc'"
722
+ ]
723
+ },
724
+ "execution_count": 18,
725
+ "metadata": {},
726
+ "output_type": "execute_result"
727
+ }
728
+ ],
729
+ "source": [
730
+ "# specify the path to anndata object\n",
731
+ "adata_path = in_dataset_path\n",
732
+ "# dataset_name is inferred from in_dataset_path\n",
733
+ "dataset_name = os.path.basename(adata_path).split(\".\")[0]\n",
734
+ "\n",
735
+ "# batch column found in adata.obs\n",
736
+ "batch_col = \"batch\"\n",
737
+ "# where are labels stored in adata.obs? \n",
738
+ "label_col = \"celltype\"\n",
739
+ "# where are the raw counts stored?\n",
740
+ "layer_key = \"counts\"\n",
741
+ "\n",
742
+ "adata = sc.read(adata_path)\n",
743
+ "adata"
744
+ ]
745
+ },
746
+ {
747
+ "cell_type": "code",
748
+ "execution_count": null,
749
+ "metadata": {},
750
+ "outputs": [],
751
+ "source": [
752
+ "if layer_key == \"X\":\n",
753
+ " adata.layers[\"counts\"] = adata.X\n",
754
+ "elif layer_key != \"counts\":\n",
755
+ " adata.layers[\"counts\"] = adata.layers[layer_key]"
756
+ ]
757
+ },
758
+ {
759
+ "cell_type": "code",
760
+ "execution_count": null,
761
+ "metadata": {},
762
+ "outputs": [],
763
+ "source": [
764
+ "sc.pp.filter_cells(adata, min_genes=10)\n",
765
+ "sc.pp.filter_genes(adata, min_cells=10)\n",
766
+ "sc.pp.normalize_total(adata, target_sum=1e4)\n",
767
+ "sc.pp.log1p(adata)"
768
+ ]
769
+ },
770
+ {
771
+ "cell_type": "code",
772
+ "execution_count": null,
773
+ "metadata": {},
774
+ "outputs": [],
775
+ "source": [
776
+ "sc.pp.highly_variable_genes(adata, flavor='seurat', subset=False, n_top_genes=2000)\n",
777
+ "\n",
778
+ "# hvg_mask = adata.var[\"highly_variable\"].values\n",
779
+ "\n",
780
+ "adata.obsm[\"X_genes\"] = adata.X[:, adata.var.highly_variable.values]\n",
781
+ "\n",
782
+ "# check if adata.obsm[\"X_genes\"] is sparse and if so, convert to dense\n",
783
+ "if sparse.issparse(adata.obsm[\"X_genes\"]):\n",
784
+ " adata.obsm[\"X_genes\"] = np.asarray(adata.obsm[\"X_genes\"].todense())"
785
+ ]
786
+ },
787
+ {
788
+ "cell_type": "code",
789
+ "execution_count": null,
790
+ "metadata": {},
791
+ "outputs": [
792
+ {
793
+ "name": "stderr",
794
+ "output_type": "stream",
795
+ "text": [
796
+ "\u001b[32mINFO \u001b[0m | 2025-06-22 14:32:11 | \u001b[32mSubsampled adata to 25.0% of original cells.\u001b[0m\n",
797
+ "\u001b[32mINFO \u001b[0m | 2025-06-22 14:32:12 | \u001b[32mSearching for optimal clustering resolution on X_genes...\u001b[0m\n",
798
+ "Louvain clustering: 100%|██████████| 20/20 [00:02<00:00, 8.92it/s]\n",
799
+ "\u001b[32mINFO \u001b[0m | 2025-06-22 14:32:14 | \u001b[32mBest resolution: 0.70 with NMI = 0.6944\u001b[0m\n"
800
+ ]
801
+ },
802
+ {
803
+ "name": "stdout",
804
+ "output_type": "stream",
805
+ "text": [
806
+ "mean silhouette per group: silhouette_score\n",
807
+ "group \n",
808
+ "B cells 0.990475\n",
809
+ "CD14+ Monocytes 0.994091\n",
810
+ "CD4 T cells 0.994429\n",
811
+ "CD8 T cells 0.996067\n",
812
+ "Dendritic Cells 0.990181\n",
813
+ "FCGR3A+ Monocytes 0.997131\n",
814
+ "Megakaryocytes 0.973109\n",
815
+ "NK cells 0.997118\n",
816
+ "Other 0.982645\n"
817
+ ]
818
+ },
819
+ {
820
+ "data": {
821
+ "text/plain": [
822
+ "{'NMI_cluster/label': 0.6944194464119003,\n",
823
+ " 'ARI_cluster/label': 0.6730602977338459,\n",
824
+ " 'ASW_label': 0.513224795460701,\n",
825
+ " 'graph_conn': 0.8757625892165339,\n",
826
+ " 'ASW_batch': 0.4997675784834428,\n",
827
+ " 'ASW_label/batch': 0.9905828886755944,\n",
828
+ " 'PCR_batch': 0.0008402505807411988,\n",
829
+ " 'Average_Batch_Score': 0.250303914532092,\n",
830
+ " 'avg_bio': 0.626901513202149}"
831
+ ]
832
+ },
833
+ "execution_count": 22,
834
+ "metadata": {},
835
+ "output_type": "execute_result"
836
+ }
837
+ ],
838
+ "source": [
839
+ "results_dict_hvg = eval_clustering_metrics(adata=adata, \n",
840
+ " batch_key=batch_col,\n",
841
+ " label_key=label_col,\n",
842
+ " embedding_key=\"X_genes\", # or \"X_scGPT\", etc.\n",
843
+ " verbose=True)\n",
844
+ "results_dict_hvg"
845
+ ]
846
+ },
847
+ {
848
+ "cell_type": "code",
849
+ "execution_count": null,
850
+ "metadata": {},
851
+ "outputs": [],
852
+ "source": [
853
+ "from scGraph import scGraph\n",
854
+ "\n",
855
+ "scg = scGraph(adata=adata, batch_key=\"batch\", label_key=\"celltype\", \n",
856
+ " trim_rate=0.05, thres_batch=1, thres_celltype=1, embedding_key=\"X_genes\")\n",
857
+ "scg.preprocess()\n",
858
+ "scg.compute()\n",
859
+ "results = scg.evaluate()\n",
860
+ "print(results)"
861
+ ]
862
+ },
863
+ {
864
+ "cell_type": "markdown",
865
+ "metadata": {},
866
+ "source": [
867
+ "## scVI"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "code",
872
+ "execution_count": null,
873
+ "metadata": {},
874
+ "outputs": [],
875
+ "source": [
876
+ "if \"counts\" not in adata.layers.keys():\n",
877
+ " adata.layers[\"counts\"] = adata.X.copy()"
878
+ ]
879
+ },
880
+ {
881
+ "cell_type": "code",
882
+ "execution_count": null,
883
+ "metadata": {},
884
+ "outputs": [
885
+ {
886
+ "data": {
887
+ "text/plain": [
888
+ "AnnData object with n_obs × n_vars = 11990 × 3345\n",
889
+ " obs: 'n_counts', 'batch', 'labels', 'str_labels', 'celltype', 'n_genes'\n",
890
+ " var: 'gene_symbols', 'n_counts-0', 'n_counts-1', 'n_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n",
891
+ " uns: 'cell_types', 'log1p', 'hvg'\n",
892
+ " obsm: 'design', 'normalized_qc', 'qc_pc', 'raw_qc', 'X_genes'\n",
893
+ " layers: 'counts'"
894
+ ]
895
+ },
896
+ "execution_count": 24,
897
+ "metadata": {},
898
+ "output_type": "execute_result"
899
+ }
900
+ ],
901
+ "source": [
902
+ "adata"
903
+ ]
904
+ },
905
+ {
906
+ "cell_type": "code",
907
+ "execution_count": null,
908
+ "metadata": {},
909
+ "outputs": [
910
+ {
911
+ "name": "stderr",
912
+ "output_type": "stream",
913
+ "text": [
914
+ "GPU available: True (cuda), used: True\n",
915
+ "TPU available: False, using: 0 TPU cores\n",
916
+ "HPU available: False, using: 0 HPUs\n",
917
+ "You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n",
918
+ "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
919
+ "SLURM auto-requeueing enabled. Setting signal handlers.\n"
920
+ ]
921
+ },
922
+ {
923
+ "data": {
924
+ "application/vnd.jupyter.widget-view+json": {
925
+ "model_id": "f654545481b64af2b59385925c0f992a",
926
+ "version_major": 2,
927
+ "version_minor": 0
928
+ },
929
+ "text/plain": [
930
+ "Training: 0%| | 0/400 [00:00<?, ?it/s]"
931
+ ]
932
+ },
933
+ "metadata": {},
934
+ "output_type": "display_data"
935
+ },
936
+ {
937
+ "name": "stderr",
938
+ "output_type": "stream",
939
+ "text": [
940
+ "`Trainer.fit` stopped: `max_epochs=400` reached.\n"
941
+ ]
942
+ }
943
+ ],
944
+ "source": [
945
+ "scvi.model.SCVI.setup_anndata(adata, layer=\"counts\", batch_key=batch_col)\n",
946
+ "model = scvi.model.SCVI(adata, n_layers=2, n_latent=30, gene_likelihood=\"nb\")\n",
947
+ "model.train()\n",
948
+ "adata.obsm[\"X_scVI\"] = model.get_latent_representation()"
949
+ ]
950
+ },
951
+ {
952
+ "cell_type": "code",
953
+ "execution_count": null,
954
+ "metadata": {},
955
+ "outputs": [],
956
+ "source": [
957
+ "adata.obsm[\"X_scVI\"] = model.get_latent_representation()"
958
+ ]
959
+ },
960
+ {
961
+ "cell_type": "code",
962
+ "execution_count": null,
963
+ "metadata": {},
964
+ "outputs": [
965
+ {
966
+ "name": "stderr",
967
+ "output_type": "stream",
968
+ "text": [
969
+ "\u001b[32mINFO \u001b[0m | 2025-06-22 14:36:48 | \u001b[32mSubsampled adata to 25.0% of original cells.\u001b[0m\n",
970
+ "\u001b[32mINFO \u001b[0m | 2025-06-22 14:36:48 | \u001b[32mSearching for optimal clustering resolution on X_scVI...\u001b[0m\n",
971
+ "Louvain clustering: 100%|██████████| 20/20 [00:02<00:00, 7.97it/s]\n",
972
+ "\u001b[32mINFO \u001b[0m | 2025-06-22 14:36:51 | \u001b[32mBest resolution: 1.20 with NMI = 0.7544\u001b[0m\n"
973
+ ]
974
+ },
975
+ {
976
+ "name": "stdout",
977
+ "output_type": "stream",
978
+ "text": [
979
+ "mean silhouette per group: silhouette_score\n",
980
+ "group \n",
981
+ "B cells 0.991501\n",
982
+ "CD14+ Monocytes 0.976939\n",
983
+ "CD4 T cells 0.987053\n",
984
+ "CD8 T cells 0.980696\n",
985
+ "Dendritic Cells 0.931121\n",
986
+ "FCGR3A+ Monocytes 0.974440\n",
987
+ "Megakaryocytes 0.910766\n",
988
+ "NK cells 0.971491\n",
989
+ "Other 0.899360\n"
990
+ ]
991
+ },
992
+ {
993
+ "data": {
994
+ "text/plain": [
995
+ "{'NMI_cluster/label': 0.7543923134993394,\n",
996
+ " 'ARI_cluster/label': 0.6471385261878778,\n",
997
+ " 'ASW_label': 0.482499361038208,\n",
998
+ " 'graph_conn': 0.9461266173017836,\n",
999
+ " 'ASW_batch': 0.5024425515439361,\n",
1000
+ " 'ASW_label/batch': 0.9581518028443176,\n",
1001
+ " 'PCR_batch': 0.00044665558752302455,\n",
1002
+ " 'Average_Batch_Score': 0.25144460356572956,\n",
1003
+ " 'avg_bio': 0.628010066908475}"
1004
+ ]
1005
+ },
1006
+ "execution_count": 27,
1007
+ "metadata": {},
1008
+ "output_type": "execute_result"
1009
+ }
1010
+ ],
1011
+ "source": [
1012
+ "results_dict_scvi = eval_clustering_metrics(adata=adata, \n",
1013
+ " batch_key=batch_col,\n",
1014
+ " label_key=label_col,\n",
1015
+ " embedding_key=\"X_scVI\", # or \"X_scGPT\", etc.\n",
1016
+ " verbose=True)\n",
1017
+ "results_dict_scvi"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": null,
1023
+ "metadata": {},
1024
+ "outputs": [],
1025
+ "source": [
1026
+ "from scGraph import scGraph\n",
1027
+ "\n",
1028
+ "scg = scGraph(adata=adata, batch_key=\"batch\", label_key=\"celltype\", \n",
1029
+ " trim_rate=0.05, thres_batch=1, thres_celltype=1, embedding_key=\"X_scVI\")\n",
1030
+ "scg.preprocess()\n",
1031
+ "scg.compute()\n",
1032
+ "results = scg.evaluate()\n",
1033
+ "print(results)"
1034
+ ]
1035
+ }
1036
+ ],
1037
+ "metadata": {
1038
+ "kernelspec": {
1039
+ "display_name": "Python 3",
1040
+ "language": "python",
1041
+ "name": "python3"
1042
+ },
1043
+ "language_info": {
1044
+ "codemirror_mode": {
1045
+ "name": "ipython",
1046
+ "version": 3
1047
+ },
1048
+ "file_extension": ".py",
1049
+ "mimetype": "text/x-python",
1050
+ "name": "python",
1051
+ "nbconvert_exporter": "python",
1052
+ "pygments_lexer": "ipython3",
1053
+ "version": "3.11.7"
1054
+ }
1055
+ },
1056
+ "nbformat": 4,
1057
+ "nbformat_minor": 2
1058
+ }
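The aggregate scores reported in the outputs above are simple means of the individual scib metrics: `avg_bio` averages the three biological-conservation metrics, and `Average_Batch_Score` averages `ASW_batch` and `PCR_batch`. A minimal sketch of that aggregation, plugging in the raw-data metric values printed above (the `results` dict here is illustrative, not part of the notebook):

```python
import numpy as np

# Metric values taken from the raw-data evaluation output above
results = {
    "NMI_cluster/label": 0.6505152890434263,
    "ARI_cluster/label": 0.5759899223104351,
    "ASW_label": 0.5245759263634682,
    "ASW_batch": 0.4964794989209622,
    "PCR_batch": 0.0007824623021499673,
}

# avg_bio: mean of the three biological-conservation metrics
avg_bio = np.mean([
    results["NMI_cluster/label"],
    results["ARI_cluster/label"],
    results["ASW_label"],
])

# Average_Batch_Score: mean of the two batch-mixing metrics
average_batch_score = (results["ASW_batch"] + results["PCR_batch"]) / 2

print(round(avg_bio, 4), round(average_batch_score, 4))  # 0.5837 0.2486
```

These reproduce the `avg_bio` and `Average_Batch_Score` entries of the raw-data results dict shown above.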
Downstream_tasks/Zero_shot_batch_effect/notebooks/zero_shot_raw_data.ipynb ADDED
@@ -0,0 +1,328 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from typing import Dict, Optional\n",
10
+ "import numpy as np\n",
11
+ "import scanpy as sc\n",
12
+ "import scib\n",
13
+ "from anndata import AnnData\n",
14
+ "from sklearn.metrics import silhouette_score\n",
15
+ "from tqdm import tqdm\n",
16
+ "import pandas as pd\n",
17
+ "import logging\n",
18
+ "\n",
19
+ "log = logging.getLogger(__name__)\n",
20
+ "\n",
21
+ "\n",
22
+ "def eval_clustering_metrics(\n",
23
+ " adata: AnnData,\n",
24
+ " batch_key: Optional[str] = \"str_batch\",\n",
25
+ " label_key: str = \"cell_type\",\n",
26
+ " embedding_key: str = \"X\", # \"X\" for raw, or embedding key in .obsm\n",
27
+ " resolutions: Optional[list] = None,\n",
28
+ " use_progress_bar: bool = True,\n",
29
+ " verbose: bool = False,\n",
30
+ ") -> Dict[str, float]:\n",
31
+ " \"\"\"Evaluate biological and batch mixing metrics on an embedding or raw expression.\"\"\"\n",
32
+ " \n",
33
+ " results_dict = {}\n",
34
+ "\n",
35
+ " # Determine whether to use .X or .obsm[embedding_key]\n",
36
+ " if embedding_key == \"X\":\n",
37
+ " use_rep = \"X\"\n",
38
+ " adata.obsm[\"X\"] = adata.X\n",
39
+ " elif embedding_key in adata.obsm:\n",
40
+ " use_rep = embedding_key\n",
41
+ " else:\n",
42
+ " raise ValueError(f\"embedding_key '{embedding_key}' not found in adata.obsm or is not 'X'\")\n",
43
+ "\n",
44
+ " # Clear stale neighbors\n",
45
+ " if \"neighbors\" in adata.uns:\n",
46
+ " if verbose:\n",
47
+ "            log.warning(\"Removing stale neighbors computed from other representations.\")\n",
48
+ " adata.uns.pop(\"neighbors\", None)\n",
49
+ "\n",
50
+ " sc.pp.neighbors(adata, use_rep=use_rep)\n",
51
+ "\n",
52
+ " # Run Louvain across multiple resolutions\n",
53
+ " if resolutions is None:\n",
54
+ " resolutions = [2 * i / 20 for i in range(1, 21)] # Default: 20 steps from 0.1 to 2.0\n",
55
+ "\n",
56
+ " best_nmi = -1\n",
57
+ " best_res = None\n",
58
+ " best_clustering = None\n",
59
+ "\n",
60
+ " if verbose:\n",
61
+ " log.info(f\"Searching for optimal clustering resolution on {use_rep}...\")\n",
62
+ "\n",
63
+ " for res in tqdm(resolutions, disable=not use_progress_bar, desc=\"Louvain clustering\"):\n",
64
+ " sc.tl.louvain(adata, resolution=res, key_added=\"temp_cluster\")\n",
65
+ " nmi = scib.metrics.nmi(adata, \"temp_cluster\", label_key)\n",
66
+ " if nmi > best_nmi:\n",
67
+ " best_nmi = nmi\n",
68
+ " best_res = res\n",
69
+ " best_clustering = adata.obs[\"temp_cluster\"].copy()\n",
70
+ " del adata.obs[\"temp_cluster\"]\n",
71
+ "\n",
72
+ " if verbose:\n",
73
+ " log.info(f\"Best resolution: {best_res:.2f} with NMI = {best_nmi:.4f}\")\n",
74
+ "\n",
75
+ " adata.obs[\"cluster\"] = best_clustering\n",
76
+ "\n",
77
+ " # Biological conservation metrics\n",
78
+ " results_dict[\"NMI_cluster/label\"] = scib.metrics.nmi(adata, \"cluster\", label_key, \"arithmetic\")\n",
79
+ " results_dict[\"ARI_cluster/label\"] = scib.metrics.ari(adata, \"cluster\", label_key)\n",
80
+ " results_dict[\"ASW_label\"] = scib.metrics.silhouette(adata, label_key, use_rep, \"euclidean\")\n",
81
+ "\n",
82
+ " # Batch effect metrics (if batch_key valid)\n",
83
+ " if batch_key is not None and batch_key in adata.obs and adata.obs[batch_key].nunique() > 1:\n",
84
+ " results_dict[\"graph_conn\"] = scib.metrics.graph_connectivity(adata, label_key)\n",
85
+ " results_dict[\"ASW_batch\"] = scib.metrics.silhouette(adata, batch_key, use_rep, \"euclidean\")\n",
86
+ " results_dict[\"ASW_label/batch\"] = scib.metrics.silhouette_batch(\n",
87
+ " adata, batch_key, label_key, embed=use_rep, metric=\"euclidean\", return_all=False\n",
88
+ " )\n",
89
+ " results_dict[\"PCR_batch\"] = scib.metrics.pcr(\n",
90
+ " adata, covariate=batch_key, embed=use_rep, recompute_pca=True, n_comps=50, verbose=False\n",
91
+ " )\n",
92
+ " else:\n",
93
+ " if verbose:\n",
94
+ " log.info(\"Skipping batch metrics — only one batch present or invalid batch_key.\")\n",
95
+ " \n",
96
+ " results_dict[\"avg_bio\"] = np.mean([\n",
97
+ " results_dict[\"NMI_cluster/label\"],\n",
98
+ " results_dict[\"ARI_cluster/label\"],\n",
99
+ " results_dict[\"ASW_label\"]\n",
100
+ " ])\n",
101
+ "\n",
102
+ " # Filter NaNs\n",
103
+ " results_dict = {k: v for k, v in results_dict.items() if not np.isnan(v)}\n",
104
+ "\n",
105
+ " return results_dict\n"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "name": "stderr",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "Louvain clustering: 100%|██████████| 20/20 [00:15<00:00, 1.32it/s]\n",
118
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
119
+ " tab = pd.value_counts(labels)\n",
120
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
121
+ " tab = pd.value_counts(labels)\n",
122
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
123
+ " tab = pd.value_counts(labels)\n",
124
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
125
+ " tab = pd.value_counts(labels)\n",
126
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
127
+ " tab = pd.value_counts(labels)\n",
128
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
129
+ " tab = pd.value_counts(labels)\n",
130
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
131
+ " tab = pd.value_counts(labels)\n",
132
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
133
+ " tab = pd.value_counts(labels)\n",
134
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
135
+ " tab = pd.value_counts(labels)\n"
136
+ ]
137
+ },
138
+ {
139
+ "name": "stdout",
140
+ "output_type": "stream",
141
+ "text": [
142
+ "mean silhouette per group: silhouette_score\n",
143
+ "group \n",
144
+ "B cells 0.986484\n",
145
+ "CD14+ Monocytes 0.943531\n",
146
+ "CD4 T cells 0.980745\n",
147
+ "CD8 T cells 0.951482\n",
148
+ "Dendritic Cells 0.956119\n",
149
+ "FCGR3A+ Monocytes 0.986242\n",
150
+ "Megakaryocytes 0.856766\n",
151
+ "NK cells 0.953083\n",
152
+ "Other 0.930244\n"
153
+ ]
154
+ },
155
+ {
156
+ "name": "stderr",
157
+ "output_type": "stream",
158
+ "text": [
159
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/anndata/_core/anndata.py:522: FutureWarning: The dtype argument is deprecated and will be removed in late 2024.\n",
160
+ " warnings.warn(\n"
161
+ ]
162
+ }
163
+ ],
164
+ "source": [
165
+ "import scanpy as sc \n",
166
+ "adata = sc.read_h5ad(\"zero_shot_batch_data/pbmc.h5ad\") \n",
167
+ "\n",
168
+ "results_dict = eval_clustering_metrics(adata=adata, \n",
169
+ " batch_key=\"batch\",\n",
170
+ " label_key=\"celltype\",\n",
171
+ " embedding_key=\"X\", # or \"X_scGPT\", etc.\n",
172
+ " verbose=True)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 12,
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "data": {
182
+ "text/plain": [
183
+ "{'NMI_cluster/label': 0.7043350648326699,\n",
184
+ " 'ARI_cluster/label': 0.6456273245075416,\n",
185
+ " 'ASW_label': 0.5333220548927784,\n",
186
+ " 'graph_conn': 0.9038879996225364,\n",
187
+ " 'ASW_batch': 0.4965497492812574,\n",
188
+ " 'ASW_label/batch': 0.9494108132303586,\n",
189
+ " 'PCR_batch': 0.0009914006163016576,\n",
190
+ " 'avg_bio': 0.6277614814109966}"
191
+ ]
192
+ },
193
+ "execution_count": 12,
194
+ "metadata": {},
195
+ "output_type": "execute_result"
196
+ }
197
+ ],
198
+ "source": [
199
+ "results_dict"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": 5,
205
+ "metadata": {},
206
+ "outputs": [
207
+ {
208
+ "name": "stderr",
209
+ "output_type": "stream",
210
+ "text": [
211
+ "/tmp/ipykernel_786097/2986997571.py:30: ImplicitModificationWarning: Setting element `.obsm['X']` of view, initializing view as actual.\n",
212
+ " adata.obsm[\"X\"] = adata.X\n",
213
+ "Louvain clustering: 100%|██████████| 20/20 [00:11<00:00, 1.68it/s]\n",
214
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
215
+ " tab = pd.value_counts(labels)\n",
216
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
217
+ " tab = pd.value_counts(labels)\n",
218
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
219
+ " tab = pd.value_counts(labels)\n",
220
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
221
+ " tab = pd.value_counts(labels)\n",
222
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
223
+ " tab = pd.value_counts(labels)\n",
224
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
225
+ " tab = pd.value_counts(labels)\n",
226
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
227
+ " tab = pd.value_counts(labels)\n",
228
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
229
+ " tab = pd.value_counts(labels)\n",
230
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
231
+ " tab = pd.value_counts(labels)\n",
232
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
233
+ " tab = pd.value_counts(labels)\n",
234
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
235
+ " tab = pd.value_counts(labels)\n",
236
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
237
+ " tab = pd.value_counts(labels)\n",
238
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
239
+ " tab = pd.value_counts(labels)\n",
240
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
241
+ " tab = pd.value_counts(labels)\n",
242
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
243
+ " tab = pd.value_counts(labels)\n",
244
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
245
+ " tab = pd.value_counts(labels)\n",
246
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
247
+ " tab = pd.value_counts(labels)\n",
248
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
249
+ " tab = pd.value_counts(labels)\n",
250
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
251
+ " tab = pd.value_counts(labels)\n",
252
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/scib/metrics/graph_connectivity.py:56: FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
253
+ " tab = pd.value_counts(labels)\n"
254
+ ]
255
+ },
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "mean silhouette per group: nan\n"
261
+ ]
262
+ },
263
+ {
264
+ "name": "stderr",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "/ibex/user/chenj0i/pretrain_gf/lib/python3.11/site-packages/anndata/_core/anndata.py:522: FutureWarning: The dtype argument is deprecated and will be removed in late 2024.\n",
268
+ " warnings.warn(\n"
269
+ ]
270
+ }
271
+ ],
272
+ "source": [
273
+ "results_dict_ood = eval_clustering_metrics(adata=adata_ood[:15000],\n",
274
+ " batch_key=\"batch\",\n",
275
+ " label_key=\"cell_type\",\n",
276
+ " embedding_key=\"X\", \n",
277
+ " verbose=True)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 6,
283
+ "metadata": {},
284
+ "outputs": [
285
+ {
286
+ "data": {
287
+ "text/plain": [
288
+ "{'NMI_cluster/label': 0.9334102174490695,\n",
289
+ " 'ARI_cluster/label': 0.9699361136567832,\n",
290
+ " 'ASW_label': 0.5538543930108312,\n",
291
+ " 'graph_conn': 0.9231509101914211,\n",
292
+ " 'ASW_batch': 0.6438532075334105,\n",
293
+ " 'PCR_batch': 0.042066597759588056,\n",
294
+ " 'avg_bio': 0.8190669080388946}"
295
+ ]
296
+ },
297
+ "execution_count": 6,
298
+ "metadata": {},
299
+ "output_type": "execute_result"
300
+ }
301
+ ],
302
+ "source": [
303
+ "results_dict_ood"
304
+ ]
305
+ }
306
+ ],
307
+ "metadata": {
308
+ "kernelspec": {
309
+ "display_name": "Python 3",
310
+ "language": "python",
311
+ "name": "python3"
312
+ },
313
+ "language_info": {
314
+ "codemirror_mode": {
315
+ "name": "ipython",
316
+ "version": 3
317
+ },
318
+ "file_extension": ".py",
319
+ "mimetype": "text/x-python",
320
+ "name": "python",
321
+ "nbconvert_exporter": "python",
322
+ "pygments_lexer": "ipython3",
323
+ "version": "3.11.7"
324
+ }
325
+ },
326
+ "nbformat": 4,
327
+ "nbformat_minor": 2
328
+ }
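The `eval_clustering_metrics` helper defined in this notebook keeps the Louvain clustering that maximizes NMI against the cell-type labels; when no `resolutions` list is passed it sweeps 20 resolutions from 0.1 to 2.0. A small standalone sketch of that default grid and the best-score selection loop (the scib NMI call is replaced by a synthetic score here, since it needs an AnnData object):

```python
# Default grid used by eval_clustering_metrics: 20 resolutions, 0.1 .. 2.0
resolutions = [2 * i / 20 for i in range(1, 21)]

best_nmi, best_res = -1.0, None
for res in resolutions:
    # In the notebook this is: nmi = scib.metrics.nmi(adata, "temp_cluster", label_key)
    nmi = 1.0 - abs(res - 0.7)  # synthetic score peaking at res = 0.7, illustration only
    if nmi > best_nmi:
        best_nmi, best_res = nmi, res

print(len(resolutions), resolutions[0], resolutions[-1], best_res)  # 20 0.1 2.0 0.7
```

The real function runs `sc.tl.louvain` at each resolution and retains the clustering with the highest NMI before computing the remaining metrics.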
Downstream_tasks/Zero_shot_batch_effect/requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ anndata==0.9.2
2
+ colorlog==6.7.0
3
+ scgpt==0.1.6
4
+ geneformer==0.0.1
5
+ PyComplexHeatmap
6
+ numpy
7
+ pandas
8
+ scanpy
9
+ scipy
10
+ seaborn
11
+ scib
12
+ scvi-tools
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__init__.py ADDED
File without changes
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (177 Bytes). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/cell_embeddings.cpython-310.pyc ADDED
Binary file (9.72 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/cell_embeddings.cpython-311.pyc ADDED
Binary file (21.4 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/data.cpython-310.pyc ADDED
Binary file (5.76 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/data.cpython-311.pyc ADDED
Binary file (11 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/geneformer_forward.cpython-310.pyc ADDED
Binary file (9.1 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/geneformer_forward.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/model_output.cpython-310.pyc ADDED
Binary file (15 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/model_output.cpython-311.pyc ADDED
Binary file (31.6 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/scgpt_forward.cpython-310.pyc ADDED
Binary file (19.5 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/utils.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/__pycache__/utils.cpython-311.pyc ADDED
Binary file (21.8 kB). View file
 
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/cell_embeddings.py ADDED
@@ -0,0 +1,417 @@
+ ## Copyright (c) Microsoft Corporation.
+ ## Licensed under the MIT license.
+
+ import os
+ from typing import List, Optional, Tuple, Dict, Union
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ plt.style.use('fivethirtyeight')
+
+ import seaborn as sns
+ import scanpy as sc
+
+ from .helpers import umap
+ from .helpers.custom_logging import log
+
+ from . import data, utils
+ from .geneformer_forward import Geneformer_instance
+ # from .scgpt_forward import scGPT_instance
+
+ class CellEmbeddingsEval():
+     def __init__(self,
+                  # model_instance: Union[scGPT_instance,
+                  #                       Geneformer_instance],
+                  model_instance: Geneformer_instance,
+                  data: data.InputData,
+                  label_key: Union[str, List[str]] = "cell_type",
+                  batch_key: Optional[str] = None,
+                  output_dir: Optional[str] = None,
+                  log_wandb: bool = False) -> None:
+
+         # test if model_instance is an instance of a supported model class
+         # if not isinstance(model_instance,
+         #                   (scGPT_instance, Geneformer_instance)):
+         #     msg = ("scgpt_instance must be an instance of "
+         #            "scGPT_instance or Geneformer_instance")
+         if not isinstance(model_instance, Geneformer_instance):
+             msg = ("model_instance must be an instance of "
+                    "Geneformer_instance")
+             log.error(msg)
+             raise ValueError(msg)
+
+         # test if instance is properly processed
+         if not hasattr(model_instance, "cell_embeddings"):
+             msg = "Cell embeddings need to be extracted first"
+             log.error(msg)
+             raise ValueError(msg)
+
+         # if wandb set to true and not initialized, throw error
+         if log_wandb and not model_instance._wandb:
+             msg = "wandb is not initialized in model_instance"
+             log.error(msg)
+             raise ValueError(msg)
+
+         self._wandb = model_instance._wandb
+
+         self.eval_instance = model_instance
+         self.data = data
+
+         if batch_key is not None:
+             if batch_key not in self.data.adata.obs.columns:
+                 msg = f"batch_key {batch_key} not found in adata.obs"
+                 log.error(msg)
+                 raise ValueError(msg)
+             else:
+                 self.batch_key = batch_key
+         else:
+             try:
+                 self.batch_key = self.data.batch_str_col
+             except AttributeError:
+                 msg = "batch_key not provided and not found in data object"
+                 log.error(msg)
+                 raise ValueError(msg)
+
+         if output_dir is not None:
+             # if output dir is provided, use it
+             self.output_dir = output_dir
+             # check if output_dir exists
+             if not os.path.exists(self.output_dir):
+                 log.warning(f"Creating the output directory {self.output_dir}")
+                 os.makedirs(self.output_dir)
+         else:
+             # use the same output_dir as the model instance
+             self.output_dir = self.eval_instance.output_dir
+
+         # if label_key is a string, convert to list
+         if isinstance(label_key, str):
+             label_key = [label_key]
+         self.label_key = label_key
+
+         # make sure that each label exists and is categorical in adata.obs
+         for label in self.label_key:
+             if label not in self.data.adata.obs.columns:
+                 msg = f"Label {label} not found in adata.obs"
+                 log.error(msg)
+                 raise ValueError(msg)
+             self.data.adata.obs[label] = self.data.adata.obs[label].astype("category")
+
+     def evaluate(self,
+                  embedding_key: str = "X_scGPT",
+                  n_cells: int = 7500) -> pd.DataFrame:
+
+         adata_ = self.data.adata.copy()
+
+         # if adata_ too big, take a subset
+         if adata_.n_obs > n_cells:
+             log.warning(f"adata_ has {adata_.n_obs} cells. "
+                         f"Taking a subset of {n_cells} cells.")
+             sc.pp.subsample(adata_, n_obs = n_cells, copy = False)
+
+         met_df = pd.DataFrame(columns = ["metric", "label", "value"])
+
+         # get unique values in self.label_key preserving the order
+         label_cols = [x for i, x in enumerate(self.label_key)
+                       if x not in self.label_key[:i]]
+         # remove label columns that are not in adata_.obs
+         label_cols = [x for x in label_cols if x in adata_.obs.columns]
+
+         if len(label_cols) == 0:
+             msg = f"No label columns {self.label_key} found in adata.obs"
+             log.error(msg)
+             raise ValueError(msg)
+
+         # check if the embeddings are in adata
+         if embedding_key not in adata_.obsm.keys():
+             msg = f"Embeddings {embedding_key} not found in adata.obsm"
+             log.error(msg)
+             raise ValueError(msg)
+
+         for label in label_cols:
+             log.debug(f"Computing metrics for {label}")
+
+             metrics = utils.eval_scib_metrics(adata_,
+                                               batch_key = self.batch_key,
+                                               label_key = label,
+                                               embedding_key = embedding_key)
+             for metric in metrics.keys():
+                 log.debug(f"{metric} for {label}: {metrics[metric]}")
+
+                 # log to wandb if initialized
+                 if self._wandb:
+                     self._wandb.log({f"{embedding_key}/{label}/{metric}": metrics[metric]})
+
+                 # add row to the dataframe
+                 met_df.loc[len(met_df)] = [metric, label, metrics[metric]]
+
+         met_df.to_csv(os.path.join(self.output_dir,
+                                    f"{embedding_key}__metrics.csv"),
+                       index = False)
+
+         if self._wandb:
+             wandb_df = self._wandb.Table(data = met_df)
+             self._wandb.log({f"{embedding_key}/metrics": wandb_df})
+         return met_df
+
+     def create_original_umap(self,
+                              out_emb: str = "X_umap_input") -> None:
+
+         sc.pp.neighbors(self.data.adata)
+         temp = sc.tl.umap(self.data.adata, min_dist = 0.3, copy = True)
+         self.data.adata.obsm[out_emb] = temp.obsm["X_umap"].copy()
+
+     # TODO: this should be a more generic function that can plot any embedding
+     def visualize(self,
+                   embedding_key: str = "X_scGPT",
+                   return_fig: bool = False,
+                   plot_size: Tuple[float, float] = (9, 7),
+                   plot_title: Optional[str] = None,
+                   plot_type: Union[List[str], str] = "simple",
+                   n_cells: int = 7500
+                   ) -> Optional[Dict[str, plt.Figure]]:
+
+         raw_emb = "X_umap_input"
+
+         if embedding_key == raw_emb:
+             # if the umap_raw embedding is used, create it first
+             self.create_original_umap(out_emb = embedding_key)
+
+         # if adata already has a umap embedding warn that it will be overwritten
+         if "X_umap" in self.data.adata.obsm.keys():
+             old_umap_name = "X_umap_old"
+             log.warning(f"Copying existing UMAP embedding to {old_umap_name} "
+                         "and overwriting X_umap.")
+             self.data.adata.obsm[old_umap_name] = self.data.adata.obsm["X_umap"].copy()
+
+         # check if the embeddings are in adata
+         if embedding_key not in self.data.adata.obsm.keys():
+             msg = f"Embeddings {embedding_key} not found in adata."
+             log.error(msg)
+             raise ValueError(msg)
+
+         # unless the raw-input UMAP is requested, compute UMAP on the embedding
+         if embedding_key != raw_emb:
+             # compute umap embeddings
+             sc.pp.neighbors(self.data.adata, use_rep = embedding_key)
+             sc.tl.umap(self.data.adata, min_dist = 0.3)
+
+         adata_ = self.data.adata.copy()
+         # if adata_ too big, take a subset
+         if adata_.n_obs > n_cells:
+             log.warning(f"adata_ has {adata_.n_obs} cells. "
+                         f"Taking a subset of {n_cells} cells.")
+             sc.pp.subsample(adata_, n_obs = n_cells, copy = False)
+             # save the subsetted adata.obs
+             adata_.obs.to_csv(os.path.join(self.output_dir,
+                                            "adata_obs_subset.csv"))
+
+         # make sure plot size is a tuple of numbers
+         try:
+             w, h = plot_size
+             if not isinstance(h, (int, float)) or not isinstance(w, (int, float)):
+                 msg = f"Height (h = {h}) or width (w = {w}) not valid."
+                 log.error(msg)
+                 raise TypeError(msg)
+         except TypeError:
+             msg = f"Plot size {plot_size} is not a tuple of numbers."
+             log.error(msg)
+             raise TypeError(msg)
+
+         # get unique values in self.label_key preserving the order
+         label_cols = self.label_key + [self.batch_key]
+         label_cols = [x for i, x in enumerate(label_cols)
+                       if x not in label_cols[:i]]
+         # remove label columns that are not in adata_.obs
+         label_cols = [x for x in label_cols
+                       if x in self.data.adata.obs.columns]
+
+         if len(label_cols) == 0:
+             msg = f"No label columns {self.label_key} found in adata.obs"
+             log.error(msg)
+             raise ValueError(msg)
+
+         # set the colors for the labels
+         labels = dict()
+         labels_colors = dict()
+         palettes = ['viridis', 'inferno',
+                     'mako', 'rocket',
+                     'tab20', 'colorblind',
+                     'tab20b', 'tab20c']
+
+         if len(label_cols) > len(palettes):
+             log.warning("More labels than palettes. Adding random colors.")
+             palettes = palettes + ["random"] * (len(label_cols) - len(palettes))
+
+         # creating palettes for the labels
+         for i, label in enumerate(label_cols):
+             labels[label] = self.data.adata.obs[label].unique()
+             if len(labels[label]) > 10:
+                 log.warning(f"More than 10 labels for {label}. "
+                             f"The plots might be hard to read.")
+             labels_colors[label] = dict(zip(labels[label],
+                                             umap.generate_pallette(n = len(labels[label]),
+                                                                    cmap = palettes[i])))
+
+         figs = {}
+
+         # if plot_type is a string, convert to list
+         if isinstance(plot_type, str):
+             plot_type = [plot_type]
+
+         plot_type = [x.lower() for x in plot_type]
+         # get unique values in plot_type
+         plot_type = [x for i, x in enumerate(plot_type)
+                      if x not in plot_type[:i]]
+         old_plot_type = plot_type
+         # check if plot_type is valid
+         valid_plot_types = ["simple", "wide", "scanpy"]
+
+         # create a subset of plot_type that is valid
+         plot_type = [x for x in plot_type if x in valid_plot_types]
+         if len(plot_type) == 0:
+             msg = f"Plot type {old_plot_type} is not valid. Valid plot types are {valid_plot_types}"
+             log.error(msg)
+             raise ValueError(msg)
+
+         # print a warning if some plot types are not valid
+         if len(plot_type) < len(old_plot_type):
+             log.warning(f"Some plot type(s) in {old_plot_type} are not valid. "
+                         f"Valid plot types are {valid_plot_types}. "
+                         f"Plotting only {plot_type}")
+
+         plt_emb = "X_umap" if embedding_key != raw_emb else embedding_key
+
+         plot_title = (plot_title
+                       if plot_title is not None
+                       else "UMAP of the cell embeddings")
+
+         if "simple" in plot_type:
+             fig, axs = plt.subplots(ncols = len(label_cols),
+                                     figsize = (len(label_cols) * w, h),
+                                     squeeze = False)
+
+             axs = axs.flatten()
+
+             # basic plotting, problematic: size of the points
+             embedding = self.data.adata.obsm[plt_emb]
+             for i, label in enumerate(label_cols):
+                 log.debug(f"Plotting the embeddings for {label}")
+                 # remove axis and grid from the plot
+                 axs[i].axis('off')
+                 # plot umap embeddings, add color by cell type
+                 axs[i].scatter(embedding[:, 0], embedding[:, 1],
+                                # make points smaller
+                                s = 0.5,
+                                c = [labels_colors[label][x] for x
+                                     in self.data.adata.obs[label]])
+                 legend_handles = [axs[i].plot([], [],
+                                               marker = "o", ls = "",
+                                               color = c, label = l)[0]
+                                   for l, c in labels_colors[label].items()]
+                 axs[i].legend(handles = legend_handles,
+                               bbox_to_anchor = (1.05, 1),
+                               loc = 'upper left')
+
+                 # Add a title to the plot
+                 axs[i].title.set_text(f"{label}")
+
+             fig.suptitle(plot_title, fontsize = 16)
+             fig.tight_layout()
+             fig.subplots_adjust(top = 0.85)
+
+             fig_savefig = os.path.join(self.output_dir,
+                                        f"umap__{embedding_key}.png")
+             fig.savefig(fig_savefig)
+
+             # if wandb initialized, log the figure
+             if self._wandb:
+                 self._wandb.log({f"umap__{embedding_key}": self._wandb.Image(fig_savefig)})
+
+             if return_fig:
+                 figs["umap"] = fig
+
+         # wide plotting
+         if "wide" in plot_type:
+             df = pd.DataFrame(self.data.adata.obsm[plt_emb],
+                               columns = ["umap_1", "umap_2"])
+             for i, label in enumerate(label_cols):
+                 if self.data.adata.obs[label].unique().shape[0] <= 10:
+                     df[label] = self.data.adata.obs[label].tolist()
+                     wide_plot = sns.relplot(data = df,
+                                             col = label,
+                                             x = "umap_1",
+                                             y = "umap_2",
+                                             hue = label,
+                                             style = label,
+                                             legend = "full",
+                                             palette = palettes[i])
+                     # switch off axes
+                     for axes in wide_plot.axes.flat:
+                         axes.set_axis_off()
+                     sns.move_legend(wide_plot, "upper left", bbox_to_anchor=(1, 1))
+                     wide_plot.fig.suptitle(plot_title, fontsize = 16)
+                     wide_plot.fig.tight_layout()
+                     wide_plot.fig.subplots_adjust(top = 0.85)
+
+                     wide_plot_savefig = os.path.join(self.output_dir,
+                                                      f"umap_wide__{embedding_key}_{label}.png")
+                     wide_plot.savefig(wide_plot_savefig)
+
+                     # if wandb initialized, log the figure
+                     if self._wandb:
+                         self._wandb.log({f"umap_wide__{embedding_key}_{label}": self._wandb.Image(wide_plot_savefig)})
+                     if return_fig:
+                         figs[label] = wide_plot
+                 else:
+                     msg = f"More than 10 labels for {label}. Skipping wide plot."
+                     log.warning(msg)
+
+         if "scanpy" in plot_type:
+             # scanpy plotting
+             labels_colors_flat = {k: v for d in labels_colors
+                                   for k, v in labels_colors[d].items()}
+             if embedding_key == raw_emb:
+                 # TODO: this needs rewriting
+                 adata_temp__ = self.data.adata.copy()
+                 adata_temp__.obsm["X_umap"] = self.data.adata.obsm[raw_emb].copy()
+                 fig2 = sc.pl.umap(adata_temp__,
+                                   color = label_cols,
+                                   add_outline = True,
+                                   layer = plt_emb,
+                                   legend_loc = 'on data',
+                                   palette = labels_colors_flat,
+                                   return_fig = True)
+                 # remove the temporary adata
+                 del adata_temp__
+             else:
+                 fig2 = sc.pl.umap(self.data.adata,
+                                   color = label_cols,
+                                   add_outline = True,
+                                   layer = plt_emb,
+                                   legend_loc = 'on data',
+                                   palette = labels_colors_flat,
+                                   return_fig = True)
+             fig2.suptitle(plot_title, fontsize = 16)
+             fig2.tight_layout()
+             fig2.subplots_adjust(top = 0.85)
+
+             fig2_savefig = os.path.join(self.output_dir,
+                                         f"umap_scanpy__{embedding_key}.png")
+             fig2.savefig(fig2_savefig)
+
+             # if wandb initialized, log the figure
+             if self._wandb:
+                 self._wandb.log({f"umap_scanpy/{embedding_key}": self._wandb.Image(fig2_savefig)})
+
+             if return_fig:
+                 figs["umap_scanpy"] = fig2
+
+         if return_fig:
+             return figs
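Both `evaluate()` and `visualize()` in the module above deduplicate label columns with the same order-preserving list comprehension. Isolated as a standalone sketch (the function name is illustrative, not part of the module):

```python
def unique_preserve_order(items):
    """Drop duplicates while keeping first-occurrence order, mirroring the
    `[x for i, x in enumerate(...) if x not in ...[:i]]` idiom used in
    evaluate() and visualize(). Quadratic, but fine for short label lists."""
    return [x for i, x in enumerate(items) if x not in items[:i]]

print(unique_preserve_order(["cell_type", "batch", "cell_type"]))
# ['cell_type', 'batch']
```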
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/data.py ADDED
@@ -0,0 +1,330 @@
+ ## Copyright (c) Microsoft Corporation.
+ ## Licensed under the MIT license.
+ import os
+ import scanpy as sc
+
+ from typing import List, Optional, Union, Dict, Literal
+
+ import numpy as np
+ # from scgpt.preprocess import Preprocessor
+
+ from .helpers.custom_logging import log
+
+ # switch off warnings
+ import warnings
+ os.environ["KMP_WARNINGS"] = "off"
+ warnings.filterwarnings('ignore')
+
+ class InputData():
+     def __init__(self,
+                  adata_dataset_path: str) -> None:
+
+         # check if the dataset exists
+         if not os.path.isfile(adata_dataset_path):
+             msg = f"Dataset {adata_dataset_path} does not exist!"
+             log.error(msg)
+             raise ValueError(msg)
+
+         msg = f"Loading data from {adata_dataset_path}"
+         log.info(msg)
+
+         self.dataset_name = os.path.basename(adata_dataset_path).split(".")[0]
+         self.adata_path = adata_dataset_path
+         # read in the dataset
+         self.adata = sc.read(adata_dataset_path)
+
+         self.data_config = dict(
+             data_path = adata_dataset_path,
+         )
+         # this will be updated if add_batch_labels is called
+         self.batch_key = None
+
+     def add_batch_labels(self,
+                          batch_key: Optional[str] = None,
+                          batch_str_col: str = "str_batch",
+                          batch_id_col: str = "batch_id") -> int:
+
+         self.batch_key = batch_key
+         self.batch_id_col = batch_id_col
+         self.batch_str_col = batch_str_col
+
+         if self.batch_key is None:
+             # try guessing which column contains batch info
+             # get the columns that contain "batch"
+             batch_cols = [col for col in
+                           self.adata.obs.columns if "batch" in col.lower()]
+             if len(batch_cols) == 1:
+                 ori_batch_col = batch_cols[0]
+                 log.info(f"Using {ori_batch_col} as batch column")
+             else:
+                 msg = "Cannot determine which column contains batch information!"
+                 log.error(msg)
+                 raise ValueError(msg)
+         else:
+             ori_batch_col = self.batch_key
+             log.info(f"Using {ori_batch_col} as batch column")
+
+         self.adata.obs[self.batch_str_col] = (
+             self
+             .adata
+             .obs[ori_batch_col]
+             .astype(str)
+         )
+         batch_id_labels = (
+             self.adata
+             .obs[self.batch_str_col]
+             .astype("category")
+             .cat
+             .codes
+             .values
+         )
+         self.adata.obs[self.batch_id_col] = batch_id_labels
+         log.debug(self.adata.obs[self.batch_id_col].value_counts())
+         num_batch_types = len(set(batch_id_labels))
+         log.debug(f"Number of batch types: {num_batch_types}")
+         return num_batch_types
+
+     def preprocess_data(self,
+                         gene_col: str = "gene_name",
+                         vocab_source: str = "model_default",
+                         fract_matching: float = 0.5,
+                         model_type: str = "scGPT",
+                         # arguments for Geneformer preprocessing
+                         gene_name_id_dict: Optional[Dict[str, str]] = None,
+                         filter_gene_by_cells: Optional[int] = 10,
+                         filter_cell_by_genes: Optional[int] = 10,
+                         preprocessed_path: Optional[str] = None,
+                         save_ext: Optional[str] = "loom",
+                         # arguments for scGPT preprocessing
+                         gene_vocab: Optional[List[str]] = None,
+                         data_is_raw: Optional[bool] = True,
+                         counts_layer: Optional[str] = "X",
+                         filter_gene_by_counts: Optional[int] = 3,
+                         filter_cell_by_counts: Optional[Union[int, bool]] = False,
+                         n_hvg: Optional[Union[int, bool]] = 1200,
+                         normalize_total: Optional[int] = 1e4,
+                         n_bins: Optional[int] = 50,
+                         **kwargs) -> None:
+
+         if gene_col not in self.adata.var.columns:
+             self.adata.var[gene_col] = self.adata.var.index.tolist()
+             log.warning("Gene names not found in var columns. Using index instead.")
+
+         self.gene_col = gene_col
+         self.data_config["gene_col"] = gene_col
+
+         # check if model_type is valid
+         model_type = model_type.lower()
+         valid_model_types = ["scgpt", "geneformer"]
+
+         if model_type not in valid_model_types:
+             msg = (f"Model type {model_type} not supported! "
+                    f"Valid options are: {valid_model_types}.")
+             log.error(msg)
+             raise ValueError(msg)
+
+         self.data_config["model_type"] = model_type
+         self.data_config["vocab_source"] = vocab_source
+
+         # note raw data shape
+         self.data_config["input__n_cells"] = self.adata.shape[0]
+         self.data_config["input__n_genes"] = self.adata.shape[1]
+
+         # check if scgpt found in lowercase model string
+         if model_type == "scgpt":
+
+             self.data_config["data_is_raw"] = data_is_raw
+             self._preprocess_data_scGPT(gene_vocab = gene_vocab,
+                                         fract_matching = fract_matching,
+                                         input_key = counts_layer,
+                                         filter_gene_by_counts = filter_gene_by_counts,
+                                         filter_cell_by_counts = filter_cell_by_counts,
+                                         normalize_total = normalize_total,
+                                         n_hvg = n_hvg,
+                                         n_bins = n_bins,
+                                         preprocessed_path = preprocessed_path,
+                                         **kwargs)
+
+         elif model_type == "geneformer":
+
+             self._preprocess_data_geneformer(preprocessed_path = preprocessed_path,
+                                              save_ext = save_ext,
+                                              gene_name_id_dict = gene_name_id_dict,
+                                              fract_matching = fract_matching,
+                                              filter_cell_by_genes = filter_cell_by_genes,
+                                              filter_gene_by_cells = filter_gene_by_cells)
+
+         # note preprocessed data shape
+         self.data_config["preprocessed__n_cells"] = self.adata.shape[0]
+         self.data_config["preprocessed__n_genes"] = self.adata.shape[1]
+
+     # def _preprocess_data_scGPT(self,
+     #                            gene_vocab: List[str],
+     #                            fract_matching: float = 0.5,
+     #                            input_key: str = "X",
+     #                            filter_gene_by_counts: int = 3,
+     #                            filter_cell_by_counts: Union[int, bool] = False,
+     #                            normalize_total: int = 1e4,
+     #                            n_hvg: Union[int, bool] = 1200,
+     #                            n_bins: int = 51,
+     #                            normed_key: str = "X_normed",
+     #                            log1p_key: str = "X_log1p",
+     #                            binned_key: str = "X_binned",
+     #                            preprocessed_path: Optional[str] = None) -> None:
+
+     #     # preprocess the data
+     #     self.adata.var["id_in_vocab"] = [
+     #         1 if gene in gene_vocab else -1
+     #         for gene in self.adata.var[self.gene_col]
+     #     ]
+     #     gene_ids_in_vocab = np.array(self.adata.var["id_in_vocab"])
+     #     fract = np.sum(gene_ids_in_vocab >= 0)/len(gene_ids_in_vocab)
+
+     #     if fract < fract_matching:
+     #         msg = f"Only {fract*100:.2f}% genes in the dataset are in the vocabulary!"
+     #         log.error(msg)
+     #         raise ValueError(msg)
+
+     #     self.adata = self.adata[:, self.adata.var["id_in_vocab"] >= 0]
+     #     self.data_config["fract_genes_in_vocab"] = fract
+
+     #     log.info(
+     #         f"Matched {np.sum(gene_ids_in_vocab >= 0)}/{len(gene_ids_in_vocab)}"
+     #         f" genes in vocabulary of size {len(gene_vocab)}."
+     #     )
+
+     #     if n_hvg < 1:
+     #         n_hvg = False
+     #     # append preprocessing parameters to run config
+     #     d_ = {
+     #         "preprocesing__input_key": input_key,
+     #         "preprocesing__filter_gene_by_counts": filter_gene_by_counts,
+     #         "preprocesing__filter_cell_by_counts": filter_cell_by_counts,
+     #         "preprocesing__normalize_total": normalize_total,
+     #         "preprocesing__normed_key": normed_key,
+     #         "preprocesing__log1p_key": log1p_key,
+     #         "preprocesing__binned_key": binned_key,
+     #         "preprocesing__n_bins": n_bins,
+     #         "preprocesing__n_hvg": n_hvg,
+     #     }
+
+     #     self.data_config.update(d_)
+
+     #     msg = "Preprocessing data"
+     #     log.info(msg)
+
+     #     # Preprocess the data following the scGPT data pre-processing pipeline
+     #     preprocessor = Preprocessor(
+     #         # the key in adata.layers to use as raw data
+     #         use_key = input_key,
+     #         # step 1
+     #         filter_gene_by_counts = filter_gene_by_counts,
+     #         # step 2
+     #         filter_cell_by_counts = filter_cell_by_counts,
+     #         # 3. whether to normalize the raw data and to what sum
+     #         normalize_total = normalize_total,
+     #         # the key in adata.layers to store the normalized data
+     #         result_normed_key = normed_key,
+     #         # 4. whether to log1p the normalized data
+     #         log1p = self.data_config["data_is_raw"],
+     #         result_log1p_key = log1p_key,
+     #         # 5. whether to subset the raw data to highly variable genes
+     #         subset_hvg = n_hvg,
+     #         hvg_flavor = ("seurat_v3"
+     #                       if self.data_config["data_is_raw"]
+     #                       else "cell_ranger"),
+     #         # 6. whether to bin the raw data and to what number of bins
+     #         binning = n_bins,
+     #         # the key in adata.layers to store the binned data
+     #         result_binned_key = binned_key,
+     #     )
+
+     #     preprocessor(self.adata, batch_key = self.batch_key)
+
+     #     if preprocessed_path is not None:
+     #         # check if path exists
+     #         if os.path.exists(preprocessed_path):
+     #             msg = (f"Saving {self.dataset_name} preprocessed data "
+     #                    f"to {preprocessed_path}")
+     #             self.adata.write(os.path.join(preprocessed_path,
+     #                                           f"{self.dataset_name}.h5ad"))
+     #         else:
+     #             msg = (f"Directory {preprocessed_path} does not exist! "
+     #                    "Skipping saving preprocessed data.")
+     #             log.warning(msg)
+
+     def _preprocess_data_geneformer(self,
+                                     preprocessed_path: str,
+                                     gene_name_id_dict: Dict[str, str],
+                                     save_ext: Literal["loom", "h5ad"] = "loom",
+                                     fract_matching: float = 0.5,
+                                     filter_cell_by_genes: int = 10,
+                                     filter_gene_by_cells: int = 10) -> None:
+
+         # for geneformer we need the path to save the data, check if exists
+         if preprocessed_path is None or not os.path.exists(preprocessed_path):
+             msg = ("For Geneformer, preprocessed_path needs to be specified "
+                    "and must exist to save the dataset. Provided path: "
+                    f"{preprocessed_path}")
+             log.error(msg)
+             raise ValueError(msg)
+
+         sc.pp.calculate_qc_metrics(self.adata,
+                                    percent_top = None,
+                                    log1p = False,
+                                    inplace = True)
+         self.adata.obs['n_counts'] = self.adata.obs['total_counts']
+         sc.pp.filter_cells(self.adata, min_genes=int(filter_cell_by_genes))
+         sc.pp.filter_genes(self.adata, min_cells=int(filter_gene_by_cells))
+
+         # for now, assuming gene names and using geneformer dictionary
+         # to match gene name to ensembl id; TODO: look into better way?
+         # this is tricky because ensembl ids change; in a way
+         # gene names are more constant, however they aren't necessarily unique
+         # and might be missing from the geneformer dictionary/be different
+         # for now, make sure to report the fraction of genes that are matched
+         # and save the matched/not matched
+
+         self.adata.var['ensembl_id'] = self.adata.var[self.gene_col].map(gene_name_id_dict)
+         self.adata.var['has_ensembl_match'] = self.adata.var['ensembl_id'].notnull()
+
+         n_all_genes = self.adata.var.shape[0]
+         n_matched = self.adata.var.has_ensembl_match.sum()
+         fract = n_matched / n_all_genes
+
+         if fract < fract_matching:
+             msg = f"Only {fract*100:.2f}% genes in the dataset are in the vocabulary!"
+             log.error(msg)
+             raise ValueError(msg)
+
+         # save the adata.var dataframe
+         self.adata.var.to_csv(os.path.join(preprocessed_path,
+                                            f"{self.dataset_name}_var.csv"),
+                               index = False)
+
+         # filter out genes that don't have a match
+         self.adata = self.adata[:, self.adata.var.has_ensembl_match]
+
+         # additionally, add the order of the samples, since they will be sorted
+         # to speed up forward pass
+         self.adata.obs['adata_order'] = self.adata.obs.index.tolist()
+
+         self.data_config["fract_genes_in_vocab"] = fract
+
+         log.info(
+             f"Matched {fract*100:.2f}% ({n_matched}/{n_all_genes}) of genes"
+             f" in vocabulary of size {len(gene_name_id_dict)}."
+         )
+
+         if save_ext == "loom":
+             self.adata.write_loom(os.path.join(preprocessed_path,
+                                                f"{self.dataset_name}.loom"))
+         elif save_ext == "h5ad":
+             self.adata.write_h5ad(os.path.join(preprocessed_path,
+                                                f"{self.dataset_name}.h5ad"))
+
+     def get_config(self):
+         return self.data_config
+
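`add_batch_labels()` above converts the batch column to strings and then to integer category codes. A minimal pandas sketch of that encoding, on a toy `obs` table (column values are illustrative):

```python
import pandas as pd

# toy stand-in for adata.obs with a batch column
obs = pd.DataFrame({"batch": ["donor1", "donor2", "donor1", "donor3"]})

# mirror add_batch_labels: string column, then integer category codes
obs["str_batch"] = obs["batch"].astype(str)
obs["batch_id"] = obs["str_batch"].astype("category").cat.codes.values
num_batch_types = len(set(obs["batch_id"]))

print(obs["batch_id"].tolist(), num_batch_types)  # [0, 1, 0, 2] 3
```

Categories are sorted alphabetically by pandas, so `donor1 → 0`, `donor2 → 1`, `donor3 → 2`, which is why the code is independent of the row order in which batches appear.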
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/geneformer_forward.py ADDED
@@ -0,0 +1,365 @@
+ ## Copyright (c) Microsoft Corporation.
+ ## Licensed under the MIT license.
+ import os
+ 
+ import importlib.util
+ import pickle
+ 
+ from typing import Dict, Optional, List
+ 
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ 
+ from transformers import BertForMaskedLM
+ from geneformer.tokenizer import TranscriptomeTokenizer
+ 
+ # from geneformer import EmbExtractor
+ from tqdm.auto import trange
+ from datasets import Dataset, load_from_disk
+ from . import utils
+ from .data import InputData
+ from .helpers.custom_logging import log
+ 
+ from GF_CAB import CustomBertForMaskedLM
+ 
+ import warnings
+ 
+ os.environ["KMP_WARNINGS"] = "off"
+ warnings.filterwarnings("ignore")
+ 
+ 
+ def pad_tensor(t: torch.Tensor,
+                max_size: int,
+                pad_token_id: int = 0) -> torch.Tensor:
+     """
+     Pad a tensor to a max size.
+     """
+     return F.pad(t, pad=(0, max_size - t.numel()),
+                  mode='constant', value=pad_token_id)
+ 
+ 
+ # get cell embeddings excluding padding
+ def mean_nonpadding_embs(embs, original_lens):
+     # mask based on padding lengths
+     mask = (torch.arange(embs.size(1)).unsqueeze(0).to(embs.device)
+             < original_lens.unsqueeze(1))
+ 
+     # extend mask dimensions to match the embeddings tensor
+     mask = mask.unsqueeze(2).expand_as(embs)
+ 
+     # use the mask to zero out the embeddings in padded areas
+     masked_embs = embs * mask.float()
+ 
+     # sum and divide by the lengths to get the mean of non-padding embs
+     mean_embs = masked_embs.sum(1) / original_lens.view(-1, 1).float()
+     return mean_embs
+ 
+ 
+ def average_embeddings(embs: torch.Tensor,
+                        org_lengths: torch.Tensor) -> torch.Tensor:
+     device = embs.device
+ 
+     # mask based on padding lengths
+     mask = (torch.arange(embs.size(1)).unsqueeze(0).to(device) <
+             org_lengths.unsqueeze(1))
+ 
+     # extend mask dimensions to match the embeddings tensor
+     if len(embs.shape) > 2:
+         mask = mask.unsqueeze(2).expand_as(embs)
+ 
+     # use the mask to compute the sum over non-padded areas
+     summed_embs = (embs * mask.float()).sum(dim=1)
+ 
+     # divide by the lengths to get the mean of non-padding embs
+     mean_embs = summed_embs / org_lengths.view(-1, 1).float()
+ 
+     return mean_embs
+ 
+ 
+ class Geneformer_instance():
+     def __init__(self,
+                  saved_model_path: Optional[str] = None,
+                  model_run: str = "pretrained",
+                  model_files: Dict[str, str] = {
+                      "model_args": "config.json",
+                      "model_training": "training_args.bin",
+                      "model_weights": "pytorch_model.bin"
+                  },
+                  save_dir: Optional[str] = None,
+                  explicit_save_dir: bool = False,
+                  num_workers: int = 0,
+                  log_wandb: bool = False,
+                  project_name: str = "Geneformer_eval",
+                  ) -> None:
+ 
+         # check if the model run is supported
+         supported_model_runs = ["pretrained"]  # , "random", "finetune", "train"]
+         if model_run not in supported_model_runs:
+             msg = f"model_run must be one of {supported_model_runs}"
+             log.error(msg)
+             raise ValueError(msg)
+         self.model_run = model_run
+ 
+         self.saved_model_path = saved_model_path
+         self.model_files = model_files
+ 
+         if num_workers == -1:
+             num_workers = len(os.sched_getaffinity(0))
+ 
+         if num_workers == 0:
+             num_workers = 1
+ 
+         self.num_workers = num_workers
+ 
+         # check if the output directory exists
+         if save_dir is not None:
+             if explicit_save_dir:
+                 self.output_dir = save_dir
+             else:
+                 self.output_dir = os.path.join(save_dir,
+                                                self.run_id)
+             # if the top output directory does not exist, create it
+             if not os.path.exists(save_dir):
+                 log.warning(f"Creating the top output directory {save_dir}")
+                 os.makedirs(save_dir)
+         else:
+             # save in the current path
+             self.output_dir = os.path.join(os.getcwd(), self.run_id)
+ 
+         # if the output directory already exists, raise an error
+         if os.path.exists(self.output_dir) and not explicit_save_dir:
+             msg = f"Output directory: {self.output_dir} exists. Something is wrong!"
+             log.error(msg)
+             raise ValueError(msg)
+ 
+         os.makedirs(self.output_dir, exist_ok=True)
+ 
+         self.device = torch.device("cuda"
+                                    if torch.cuda.is_available()
+                                    else "cpu")
+ 
+         log.info(f"Using device {self.device}")
+ 
+         self.project_name = project_name
+         if log_wandb:
+             if importlib.util.find_spec("wandb") is None:
+                 msg = "wandb is not installed. Please install wandb to log to wandb."
+                 log.error(msg)
+                 raise RuntimeError(msg)
+             import wandb
+             self._wandb = wandb
+         else:
+             self._wandb = None
+ 
+         # updated once the config is saved, so that during training it is saved only once
+         self.config_saved = False
+ 
+     def _check_attr(self,
+                     attr: str,
+                     not_none: bool = True) -> bool:
+         """
+         Check if the attribute is set on the class.
+         """
+         out = hasattr(self, attr)
+         if not_none and out:
+             out = getattr(self, attr) is not None
+         return out
+ 
+     def load_pretrained_model(self) -> None:
+         # self.model = BertForMaskedLM.from_pretrained(self.saved_model_path,
+         #                                              output_attentions=False,
+         #                                              output_hidden_states=True)
+         self.model = CustomBertForMaskedLM.from_pretrained(self.saved_model_path,
+                                                            output_attentions=False,
+                                                            output_hidden_states=True)
+ 
+         self.model = self.model.to(self.device)
+         log.info(f"Model successfully loaded from {self.saved_model_path}")
+ 
+     def load_tokenized_dataset(self,
+                                dataset_path: str) -> None:
+         self.tokenized_dataset = load_from_disk(dataset_path)
+ 
+     def tokenize_data(self,
+                       adata_path: str,
+                       dataset_path: str,
+                       cell_type_col: str = "cell_type",
+                       columns_to_keep: List[str] = ["adata_order"]):
+ 
+         dataset_name = os.path.basename(adata_path).split(".")[0]
+ 
+         cols_to_keep = dict(zip([cell_type_col] + columns_to_keep,
+                                 ['cell_type'] + columns_to_keep))
+         # initialize the tokenizer
+         self.tokenizer = TranscriptomeTokenizer(cols_to_keep,
+                                                 nproc=self.num_workers)
+ 
+         # get the extension from adata_path
+         _, ext = os.path.splitext(adata_path)
+         ext = ext.strip(".")
+ 
+         if ext not in ["loom", "h5ad"]:
+             msg = f"adata_path must be a loom or h5ad file. Got {ext}"
+             log.error(msg)
+             raise ValueError(msg)
+ 
+         if ext == "h5ad":
+             msg = ("Using an h5ad file. This sometimes causes issues. "
+                    "If it is not working, try a loom file instead.")
+             log.warning(msg)
+ 
+         # get the top directory of the adata_path
+         adata_dir = os.path.dirname(adata_path)
+ 
+         self.tokenizer.tokenize_data(adata_dir,
+                                      dataset_path,
+                                      dataset_name,
+                                      file_format=ext)
+ 
+         # the tokenizer does not return the dataset, so load it from disk
+         self.load_tokenized_dataset(os.path.join(dataset_path,
+                                                  f"{dataset_name}.dataset"))
+ 
+     def load_vocab(self,
+                    dict_paths: str) -> None:
+ 
+         token_dictionary_path = os.path.join(dict_paths,
+                                              "token_dictionary.pkl")
+         with open(token_dictionary_path, "rb") as f:
+             self.vocab = pickle.load(f)
+ 
+         self.pad_token_id = self.vocab.get("<pad>")
+ 
+         # size of the vocabulary
+         self.vocab_size = len(self.vocab)
+ 
+         gene_name_id_path = os.path.join(dict_paths,
+                                          "gene_name_id_dict.pkl")
+         with open(gene_name_id_path, "rb") as f:
+             self.gene_name_id = pickle.load(f)
+ 
+     def _extend_batch(self,
+                       batch_dataset: Dataset,
+                       return_attention_mask: bool = True):
+ 
+         max_size = max(batch_dataset['length'])
+ 
+         batch_ = [pad_tensor(x, max_size, self.pad_token_id)
+                   for x in batch_dataset['input_ids']]
+ 
+         batch_ = torch.stack(batch_).to(self.device)
+ 
+         if return_attention_mask:
+             mask_ = [[1] * l + [0] * (max_size - l)
+                      for l in batch_dataset['length']]
+             mask_ = torch.tensor(mask_).to(self.device)
+             return batch_, mask_
+ 
+         return batch_
+ 
+     def _pass_batch(self,
+                     batch_ids: torch.Tensor,
+                     attention_mask: torch.Tensor,
+                     **kwargs) -> torch.Tensor:
+         # make sure that the batch and the attention mask are on the same device
+         batch_ids = batch_ids.to(self.device)
+         attn_mask = attention_mask.to(self.device)
+ 
+         with torch.no_grad():
+             outputs = self.model(input_ids=batch_ids,
+                                  attention_mask=attn_mask,
+                                  **kwargs)
+ 
+         return outputs
+ 
+     def extract_embeddings(self,
+                            data: InputData,
+                            batch_size: int = 48,
+                            embedding_key: str = "geneformer",
+                            layer: int = -2):
+ 
+         # check if the tokenized dataset is loaded
+         if not self._check_attr("tokenized_dataset"):
+             msg = "Tokenized dataset not loaded. Please load the tokenized dataset."
+             log.error(msg)
+             raise RuntimeError(msg)
+ 
+         # check if the layer is valid
+         n_layers = self.model.config.num_hidden_layers
+         if layer >= n_layers or layer < -n_layers:
+             msg = (f"Layer {layer} is not valid. There are only {n_layers} layers. "
+                    f"Acceptable values are between {-n_layers} (if counting "
+                    f"backwards) and {n_layers - 1} (if counting forwards).")
+             log.error(msg)
+             raise ValueError(msg)
+ 
+         # save the embeddings to a subdirectory
+         embeddings_subdir = os.path.join(self.output_dir, "model_outputs")
+         os.makedirs(embeddings_subdir, exist_ok=True)
+ 
+         cell_embs_list = []
+         rankings_list = []
+ 
+         size = len(self.tokenized_dataset)
+ 
+         for i in trange(0, size, batch_size,
+                         desc="Geneformer (extracting embeddings)"):
+ 
+             max_range = min(i + batch_size, size)
+             batch_dataset = self.tokenized_dataset.select(list(range(i, max_range)))
+             batch_dataset.set_format(type='torch')
+ 
+             org_lengths = torch.tensor(batch_dataset['length']).to(self.device)
+ 
+             batch, attn_mask = self._extend_batch(batch_dataset)
+ 
+             model_output = self._pass_batch(batch,
+                                             attention_mask=attn_mask)
+ 
+             embs = model_output.hidden_states[layer]
+ 
+             # cell_embs = average_embeddings(embs, org_lengths)
+             cell_embs = mean_nonpadding_embs(embs, org_lengths)
+ 
+             # add cell embeddings to the list
+             cell_embs_list.extend(cell_embs.detach().cpu().numpy())
+ 
+             # now, get the ranking reconstruction
+             out_rankings = (model_output.logits
+                             .argmax(axis=-1)
+                             .detach().cpu().numpy())
+ 
+             # save the rankings in the original order
+             rankings_list.extend(out_rankings)
+ 
+             torch.cuda.empty_cache()
+             del model_output
+             del batch
+             del attn_mask
+             del embs
+             del cell_embs
+ 
+         self.cell_embeddings = np.array(cell_embs_list)
+ 
+         self.output_rankings = rankings_list
+         self.input_rankings = [np.array(item)
+                                for item
+                                in self.tokenized_dataset['input_ids']]
+ 
+         # add the embeddings to adata
+         data.adata.obsm[embedding_key] = self.cell_embeddings
+ 
+         # for plotting later, save data.adata.obs;
+         # the order here agrees with the order of the embeddings
+         data.adata.obs.to_csv(os.path.join(embeddings_subdir,
+                                            "adata_obs.csv"))
+
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__init__.py ADDED
File without changes
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (185 Bytes)
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (201 Bytes)
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/custom_logging.cpython-310.pyc ADDED
Binary file (625 Bytes)
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/custom_logging.cpython-311.pyc ADDED
Binary file (1.04 kB)
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/umap.cpython-310.pyc ADDED
Binary file (3.89 kB)
Downstream_tasks/Zero_shot_batch_effect/sc_foundation_evals/helpers/__pycache__/umap.cpython-311.pyc ADDED
Binary file (6.04 kB)
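For reference, the batching logic of `_extend_batch` in the diff above (pad every sequence to the batch maximum and build a matching 0/1 attention mask) can be sketched standalone; the function name and inputs here are illustrative, not part of the committed API:

```python
import torch
import torch.nn.functional as F

def extend_batch(input_ids, lengths, pad_token_id=0):
    # pad each cell's token ids to the longest sequence in the batch
    max_size = max(lengths)
    batch = torch.stack([
        F.pad(t, (0, max_size - t.numel()), value=pad_token_id)
        for t in input_ids
    ])
    # 1 for real tokens, 0 for padding, as expected by BERT-style models
    mask = torch.tensor([[1] * l + [0] * (max_size - l) for l in lengths])
    return batch, mask

ids = [torch.tensor([7, 8, 9]), torch.tensor([5, 6])]
batch, mask = extend_batch(ids, [3, 2])
print(batch.tolist())  # [[7, 8, 9], [5, 6, 0]]
print(mask.tolist())   # [[1, 1, 1], [1, 1, 0]]
```

The mask produced this way is what `_pass_batch` forwards as `attention_mask`, so padded positions are ignored by self-attention during embedding extraction.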