geneformer/classifier.py CHANGED
@@ -801,7 +801,7 @@ class Classifier:
801
  # 5-fold cross-validate
802
  num_cells = len(data)
803
  fifth_cells = int(np.floor(num_cells * 0.2))
804
- num_eval = int(min((self.eval_size * num_cells), fifth_cells))
805
  start = i * fifth_cells
806
  end = start + num_eval
807
  eval_indices = [j for j in range(start, end)]
@@ -1313,7 +1313,6 @@ class Classifier:
1313
  predict=False,
1314
  output_directory=None,
1315
  output_prefix=None,
1316
- predict_metadata=None,
1317
  ):
1318
  """
1319
  Evaluate the fine-tuned model.
@@ -1339,11 +1338,9 @@ class Classifier:
1339
 
1340
  ##### Evaluate the model #####
1341
  labels = id_class_dict.keys()
1342
-
1343
- y_pred, y_true, logits_list, predict_metadata_all = eu.classifier_predict(
1344
- model, self.classifier, eval_data, self.forward_batch_size, self.gene_token_dict, predict_metadata
1345
  )
1346
-
1347
  conf_mat, macro_f1, acc, roc_metrics = eu.get_metrics(
1348
  y_pred, y_true, logits_list, num_classes, labels
1349
  )
@@ -1353,9 +1350,6 @@ class Classifier:
1353
  "label_ids": y_true,
1354
  "predictions": logits_list,
1355
  }
1356
- if predict_metadata is not None:
1357
- pred_dict["prediction_metadata"] = predict_metadata_all
1358
-
1359
  pred_dict_output_path = (
1360
  Path(output_directory) / f"{output_prefix}_pred_dict"
1361
  ).with_suffix(".pkl")
@@ -1376,7 +1370,6 @@ class Classifier:
1376
  output_directory,
1377
  output_prefix,
1378
  predict=True,
1379
- predict_metadata=None,
1380
  ):
1381
  """
1382
  Evaluate the fine-tuned model.
@@ -1396,8 +1389,6 @@ class Classifier:
1396
  | Prefix for output files
1397
  predict : bool
1398
  | Whether or not to save eval predictions
1399
- predict_metadata : None | list
1400
- | Metadata labels to output with predictions (columns in test_data_file)
1401
  """
1402
 
1403
  # load numerical id to class dictionary (id:class)
@@ -1410,15 +1401,6 @@ class Classifier:
1410
  # load previously filtered and prepared data
1411
  test_data = pu.load_and_filter(None, self.nproc, test_data_file)
1412
 
1413
- if predict_metadata is not None:
1414
- absent_metadata = []
1415
- for predict_metadata_x in predict_metadata:
1416
- if predict_metadata_x not in test_data.features.keys():
1417
- absent_metadata += [predict_metadata_x]
1418
- if len(absent_metadata)>0:
1419
- logger.error(f"Following predict_metadata was not found as column in test_data_file: {absent_metadata}")
1420
- raise
1421
-
1422
  # load previously fine-tuned model
1423
  model = pu.load_model(
1424
  self.model_type,
@@ -1437,7 +1419,6 @@ class Classifier:
1437
  predict=predict,
1438
  output_directory=output_directory,
1439
  output_prefix=output_prefix,
1440
- predict_metadata=predict_metadata,
1441
  )
1442
 
1443
  all_conf_mat_df = pd.DataFrame(
 
801
  # 5-fold cross-validate
802
  num_cells = len(data)
803
  fifth_cells = int(np.floor(num_cells * 0.2))
804
+ num_eval = min((self.eval_size * num_cells), fifth_cells)
805
  start = i * fifth_cells
806
  end = start + num_eval
807
  eval_indices = [j for j in range(start, end)]
 
1313
  predict=False,
1314
  output_directory=None,
1315
  output_prefix=None,
 
1316
  ):
1317
  """
1318
  Evaluate the fine-tuned model.
 
1338
 
1339
  ##### Evaluate the model #####
1340
  labels = id_class_dict.keys()
1341
+ y_pred, y_true, logits_list = eu.classifier_predict(
1342
+ model, self.classifier, eval_data, self.forward_batch_size, self.gene_token_dict
 
1343
  )
 
1344
  conf_mat, macro_f1, acc, roc_metrics = eu.get_metrics(
1345
  y_pred, y_true, logits_list, num_classes, labels
1346
  )
 
1350
  "label_ids": y_true,
1351
  "predictions": logits_list,
1352
  }
 
 
 
1353
  pred_dict_output_path = (
1354
  Path(output_directory) / f"{output_prefix}_pred_dict"
1355
  ).with_suffix(".pkl")
 
1370
  output_directory,
1371
  output_prefix,
1372
  predict=True,
 
1373
  ):
1374
  """
1375
  Evaluate the fine-tuned model.
 
1389
  | Prefix for output files
1390
  predict : bool
1391
  | Whether or not to save eval predictions
 
 
1392
  """
1393
 
1394
  # load numerical id to class dictionary (id:class)
 
1401
  # load previously filtered and prepared data
1402
  test_data = pu.load_and_filter(None, self.nproc, test_data_file)
1403
 
 
 
 
 
 
 
 
 
 
1404
  # load previously fine-tuned model
1405
  model = pu.load_model(
1406
  self.model_type,
 
1419
  predict=predict,
1420
  output_directory=output_directory,
1421
  output_prefix=output_prefix,
 
1422
  )
1423
 
1424
  all_conf_mat_df = pd.DataFrame(
geneformer/classifier_utils.py CHANGED
@@ -94,7 +94,7 @@ def remove_rare(data, rare_threshold, label, nproc):
94
  return data
95
 
96
 
97
- def label_classes(classifier, data, gene_class_dict, nproc, id_class_dict=None):
98
  if classifier == "cell":
99
  label_set = set(data["label"])
100
  elif classifier == "gene":
@@ -570,27 +570,6 @@ def compute_metrics(pred):
570
  return {"accuracy": acc, "macro_f1": macro_f1}
571
 
572
 
573
- def robust_compute_objective(metrics: dict):
574
- # tries both prefixed ("eval_") and raw metric names to support different transformers versions
575
- metric_name = "macro_f1"
576
-
577
- # check for the prefixed version
578
- prefixed_metric_name = f"eval_{metric_name}"
579
- if prefixed_metric_name in metrics:
580
- return metrics[prefixed_metric_name]
581
-
582
- # fall back to the raw name
583
- elif metric_name in metrics:
584
- return metrics[metric_name]
585
-
586
- # if neither is found, raise a clear error to help with debugging
587
- raise KeyError(
588
- f"Could not find '{prefixed_metric_name}' or '{metric_name}' in the reported metrics. "
589
- f"Please check your `compute_metrics` function and `TrainingArguments`. "
590
- f"Available metrics: {list(metrics.keys())}"
591
- )
592
-
593
-
594
  def get_default_train_args(model, classifier, data, output_dir):
595
  num_layers = pu.quant_layers(model)
596
  freeze_layers = 0
 
94
  return data
95
 
96
 
97
+ def label_classes(classifier, data, gene_class_dict, nproc, id_class_dict):
98
  if classifier == "cell":
99
  label_set = set(data["label"])
100
  elif classifier == "gene":
 
570
  return {"accuracy": acc, "macro_f1": macro_f1}
571
 
572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  def get_default_train_args(model, classifier, data, output_dir):
574
  num_layers = pu.quant_layers(model)
575
  freeze_layers = 0
geneformer/emb_extractor.py CHANGED
@@ -42,8 +42,6 @@ def get_embs(
42
  special_token=False,
43
  summary_stat=None,
44
  silent=False,
45
- save_tdigest=False,
46
- tdigest_path=None,
47
  ):
48
  model_input_size = pu.get_model_input_size(model)
49
  total_batch_length = len(filtered_input_data)
@@ -182,18 +180,12 @@ def get_embs(
182
  # calculate summary stat embs from approximated tdigests
183
  elif summary_stat is not None:
184
  if emb_mode == "cell":
185
- if save_tdigest:
186
- with open(f"{tdigest_path}","wb") as fp:
187
- pickle.dump(embs_tdigests, fp)
188
  if summary_stat == "mean":
189
  summary_emb_list = tdigest_mean(embs_tdigests, emb_dims)
190
  elif summary_stat == "median":
191
  summary_emb_list = tdigest_median(embs_tdigests, emb_dims)
192
  embs_stack = torch.tensor(summary_emb_list)
193
  elif emb_mode == "gene":
194
- if save_tdigest:
195
- with open(f"{tdigest_path}","wb") as fp:
196
- pickle.dump(embs_tdigests_dict, fp)
197
  if summary_stat == "mean":
198
  [
199
  update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims)
@@ -260,7 +252,7 @@ def label_cell_embs(embs, downsampled_data, emb_labels):
260
  return embs_df
261
 
262
 
263
- def label_gene_embs(embs, downsampled_data, token_gene_dict, gene_emb_style="mean_pool"):
264
  gene_set = {
265
  element for sublist in downsampled_data["input_ids"] for element in sublist
266
  }
@@ -275,39 +267,16 @@ def label_gene_embs(embs, downsampled_data, token_gene_dict, gene_emb_style="mea
275
  )
276
  for k in dict_i.keys():
277
  gene_emb_dict[k].append(dict_i[k])
278
- if gene_emb_style != "all":
279
- for k in gene_emb_dict.keys():
280
- gene_emb_dict[k] = (
281
- torch.squeeze(torch.mean(torch.stack(gene_emb_dict[k]), dim=0), dim=0)
282
- .cpu()
283
- .numpy()
284
- )
285
- embs_df = pd.DataFrame(gene_emb_dict).T
286
- else:
287
- embs_df = dict_lol_to_df(gene_emb_dict)
288
  embs_df.index = [token_gene_dict[token] for token in embs_df.index]
289
  return embs_df
290
 
291
- def dict_lol_to_df(data_dict):
292
- # save dictionary with values being list of equal-length lists as dataframe
293
- df_data = []
294
- for key, list_of_lists in data_dict.items():
295
- for i, sublist in enumerate(list_of_lists):
296
- row_data = [key, i] + sublist.tolist()
297
- df_data.append(row_data)
298
-
299
- # determine column names based on the length of sublists
300
- # assuming all sublists have the same length
301
- num_columns_from_sublist = len(list(data_dict.values())[0][0])
302
- column_names = ['Gene', 'Identifier'] + [f'{j}' for j in range(num_columns_from_sublist)]
303
-
304
- # create the dataframe
305
- df = pd.DataFrame(df_data, columns=column_names)
306
-
307
- # set 'Gene' as the index
308
- df = df.set_index('Gene')
309
-
310
- return df
311
 
312
  def plot_umap(embs_df, emb_dims, labels_clean, output_prefix, output_directory, kwargs_dict, seed=0):
313
  only_embs_df = embs_df.iloc[:, :emb_dims]
@@ -435,7 +404,7 @@ class EmbExtractor:
435
  "num_classes": {int},
436
  "emb_mode": {"cls", "cell", "gene"},
437
  "cell_emb_style": {"mean_pool"},
438
- "gene_emb_style": {"mean_pool", "all"},
439
  "filter_data": {None, dict},
440
  "max_ncells": {None, int},
441
  "emb_layer": {-1, 0},
@@ -463,7 +432,6 @@ class EmbExtractor:
463
  forward_batch_size=100,
464
  nproc=4,
465
  summary_stat=None,
466
- save_tdigest=False,
467
  model_version="V2",
468
  token_dictionary_file=None,
469
  ):
@@ -483,9 +451,9 @@ class EmbExtractor:
483
  cell_emb_style : {"mean_pool"}
484
  | Method for summarizing cell embeddings if not using CLS token.
485
  | Currently only option is mean pooling of gene embeddings for given cell.
486
- gene_emb_style : {"mean_pool", "all}
487
  | Method for summarizing gene embeddings.
488
- | Currently only option is returning all or mean pooling of contextual gene embeddings for given gene.
489
  filter_data : None, dict
490
  | Default is to extract embeddings from all input data.
491
  | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
@@ -515,9 +483,6 @@ class EmbExtractor:
515
  | If mean or median, outputs only approximated mean or median embedding of input data.
516
  | Non-exact recommended if encountering memory constraints while generating goal embedding positions.
517
  | Non-exact is slower but more memory-efficient.
518
- save_tdigest : bool
519
- | Whether to save a dictionary of tdigests for each gene and embedding dimension
520
- | Only applies when summary_stat is not None
521
  model_version : str
522
  | To auto-select settings for model version other than current default.
523
  | Current options: V1: models pretrained on ~30M cells, V2: models pretrained on ~104M cells
@@ -561,16 +526,9 @@ class EmbExtractor:
561
  else:
562
  self.summary_stat = summary_stat
563
  self.exact_summary_stat = None
564
- self.save_tdigest = save_tdigest
565
 
566
  self.validate_options()
567
 
568
- if (summary_stat is None) and (save_tdigest is True):
569
- logger.warning(
570
- "tdigests will not be saved since summary_stat is None."
571
- )
572
- save_tdigest = False
573
-
574
  if self.model_version == "V1":
575
  from . import TOKEN_DICTIONARY_FILE_30M
576
  self.token_dictionary_file = TOKEN_DICTIONARY_FILE_30M
@@ -677,10 +635,6 @@ class EmbExtractor:
677
  self.model_type, self.num_classes, model_directory, mode="eval"
678
  )
679
  layer_to_quant = pu.quant_layers(model) + self.emb_layer
680
- if self.save_tdigest:
681
- tdigest_path = (Path(output_directory) / f"{output_prefix}_tdigest").with_suffix(".pkl")
682
- else:
683
- tdigest_path = None
684
  embs = get_embs(
685
  model=model,
686
  filtered_input_data=downsampled_data,
@@ -690,8 +644,6 @@ class EmbExtractor:
690
  forward_batch_size=self.forward_batch_size,
691
  token_gene_dict=self.token_gene_dict,
692
  summary_stat=self.summary_stat,
693
- save_tdigest=self.save_tdigest,
694
- tdigest_path=tdigest_path,
695
  )
696
 
697
  if self.emb_mode == "cell":
@@ -701,7 +653,7 @@ class EmbExtractor:
701
  embs_df = pd.DataFrame(embs.cpu().numpy()).T
702
  elif self.emb_mode == "gene":
703
  if self.summary_stat is None:
704
- embs_df = label_gene_embs(embs, downsampled_data, self.token_gene_dict, self.gene_emb_style)
705
  elif self.summary_stat is not None:
706
  embs_df = pd.DataFrame(embs).T
707
  embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]
@@ -885,14 +837,14 @@ class EmbExtractor:
885
  raise
886
 
887
  if max_ncells_to_plot is not None:
888
- if self.max_ncells is not None:
889
- if max_ncells_to_plot > self.max_ncells:
890
- max_ncells_to_plot = self.max_ncells
891
- logger.warning(
892
- "max_ncells_to_plot must be <= max_ncells. "
893
- f"Changing max_ncells_to_plot to {self.max_ncells}."
894
- )
895
- embs = embs.sample(max_ncells_to_plot, axis=0)
896
 
897
  if self.emb_label is None:
898
  label_len = 0
 
42
  special_token=False,
43
  summary_stat=None,
44
  silent=False,
 
 
45
  ):
46
  model_input_size = pu.get_model_input_size(model)
47
  total_batch_length = len(filtered_input_data)
 
180
  # calculate summary stat embs from approximated tdigests
181
  elif summary_stat is not None:
182
  if emb_mode == "cell":
 
 
 
183
  if summary_stat == "mean":
184
  summary_emb_list = tdigest_mean(embs_tdigests, emb_dims)
185
  elif summary_stat == "median":
186
  summary_emb_list = tdigest_median(embs_tdigests, emb_dims)
187
  embs_stack = torch.tensor(summary_emb_list)
188
  elif emb_mode == "gene":
 
 
 
189
  if summary_stat == "mean":
190
  [
191
  update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims)
 
252
  return embs_df
253
 
254
 
255
+ def label_gene_embs(embs, downsampled_data, token_gene_dict):
256
  gene_set = {
257
  element for sublist in downsampled_data["input_ids"] for element in sublist
258
  }
 
267
  )
268
  for k in dict_i.keys():
269
  gene_emb_dict[k].append(dict_i[k])
270
+ for k in gene_emb_dict.keys():
271
+ gene_emb_dict[k] = (
272
+ torch.squeeze(torch.mean(torch.stack(gene_emb_dict[k]), dim=0), dim=0)
273
+ .cpu()
274
+ .numpy()
275
+ )
276
+ embs_df = pd.DataFrame(gene_emb_dict).T
 
 
 
277
  embs_df.index = [token_gene_dict[token] for token in embs_df.index]
278
  return embs_df
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
  def plot_umap(embs_df, emb_dims, labels_clean, output_prefix, output_directory, kwargs_dict, seed=0):
282
  only_embs_df = embs_df.iloc[:, :emb_dims]
 
404
  "num_classes": {int},
405
  "emb_mode": {"cls", "cell", "gene"},
406
  "cell_emb_style": {"mean_pool"},
407
+ "gene_emb_style": {"mean_pool"},
408
  "filter_data": {None, dict},
409
  "max_ncells": {None, int},
410
  "emb_layer": {-1, 0},
 
432
  forward_batch_size=100,
433
  nproc=4,
434
  summary_stat=None,
 
435
  model_version="V2",
436
  token_dictionary_file=None,
437
  ):
 
451
  cell_emb_style : {"mean_pool"}
452
  | Method for summarizing cell embeddings if not using CLS token.
453
  | Currently only option is mean pooling of gene embeddings for given cell.
454
+ gene_emb_style : "mean_pool"
455
  | Method for summarizing gene embeddings.
456
+ | Currently only option is mean pooling of contextual gene embeddings for given gene.
457
  filter_data : None, dict
458
  | Default is to extract embeddings from all input data.
459
  | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
 
483
  | If mean or median, outputs only approximated mean or median embedding of input data.
484
  | Non-exact recommended if encountering memory constraints while generating goal embedding positions.
485
  | Non-exact is slower but more memory-efficient.
 
 
 
486
  model_version : str
487
  | To auto-select settings for model version other than current default.
488
  | Current options: V1: models pretrained on ~30M cells, V2: models pretrained on ~104M cells
 
526
  else:
527
  self.summary_stat = summary_stat
528
  self.exact_summary_stat = None
 
529
 
530
  self.validate_options()
531
 
 
 
 
 
 
 
532
  if self.model_version == "V1":
533
  from . import TOKEN_DICTIONARY_FILE_30M
534
  self.token_dictionary_file = TOKEN_DICTIONARY_FILE_30M
 
635
  self.model_type, self.num_classes, model_directory, mode="eval"
636
  )
637
  layer_to_quant = pu.quant_layers(model) + self.emb_layer
 
 
 
 
638
  embs = get_embs(
639
  model=model,
640
  filtered_input_data=downsampled_data,
 
644
  forward_batch_size=self.forward_batch_size,
645
  token_gene_dict=self.token_gene_dict,
646
  summary_stat=self.summary_stat,
 
 
647
  )
648
 
649
  if self.emb_mode == "cell":
 
653
  embs_df = pd.DataFrame(embs.cpu().numpy()).T
654
  elif self.emb_mode == "gene":
655
  if self.summary_stat is None:
656
+ embs_df = label_gene_embs(embs, downsampled_data, self.token_gene_dict)
657
  elif self.summary_stat is not None:
658
  embs_df = pd.DataFrame(embs).T
659
  embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]
 
837
  raise
838
 
839
  if max_ncells_to_plot is not None:
840
+ if max_ncells_to_plot > self.max_ncells:
841
+ max_ncells_to_plot = self.max_ncells
842
+ logger.warning(
843
+ "max_ncells_to_plot must be <= max_ncells. "
844
+ f"Changing max_ncells_to_plot to {self.max_ncells}."
845
+ )
846
+ elif max_ncells_to_plot < self.max_ncells:
847
+ embs = embs.sample(max_ncells_to_plot, axis=0)
848
 
849
  if self.emb_label is None:
850
  label_len = 0
geneformer/evaluation_utils.py CHANGED
@@ -77,7 +77,7 @@ def py_softmax(vector):
77
  return e / e.sum()
78
 
79
 
80
- def classifier_predict(model, classifier_type, evalset, forward_batch_size, gene_token_dict, predict_metadata=None):
81
  if classifier_type == "gene":
82
  label_name = "labels"
83
  elif classifier_type == "cell":
@@ -85,14 +85,6 @@ def classifier_predict(model, classifier_type, evalset, forward_batch_size, gene
85
 
86
  predict_logits = []
87
  predict_labels = []
88
-
89
- predict_metadata_all = None
90
-
91
- if predict_metadata is not None:
92
- predict_metadata_all = dict()
93
- for metadata_name in predict_metadata:
94
- predict_metadata_all[metadata_name] = []
95
-
96
  model.eval()
97
 
98
  # ensure there is at least 2 examples in each batch to avoid incorrect tensor dims
@@ -107,15 +99,9 @@ def classifier_predict(model, classifier_type, evalset, forward_batch_size, gene
107
  for i in trange(0, evalset_len, forward_batch_size):
108
  max_range = min(i + forward_batch_size, evalset_len)
109
  batch_evalset = evalset.select([i for i in range(i, max_range)])
110
-
111
- if predict_metadata is not None:
112
- for metadata_name in predict_metadata:
113
- predict_metadata_all[metadata_name] += batch_evalset[metadata_name]
114
-
115
  padded_batch = preprocess_classifier_batch(
116
  batch_evalset, max_evalset_len, label_name, gene_token_dict
117
  )
118
-
119
  padded_batch.set_format(type="torch")
120
 
121
  # For datasets>=4.0.0, convert to dict to avoid format issues
@@ -148,8 +134,7 @@ def classifier_predict(model, classifier_type, evalset, forward_batch_size, gene
148
  y_pred = [vote(item[0]) for item in logit_label_paired]
149
  y_true = [item[1] for item in logit_label_paired]
150
  logits_list = [item[0] for item in logit_label_paired]
151
-
152
- return y_pred, y_true, logits_list, predict_metadata_all
153
 
154
 
155
  def get_metrics(y_pred, y_true, logits_list, num_classes, labels):
 
77
  return e / e.sum()
78
 
79
 
80
+ def classifier_predict(model, classifier_type, evalset, forward_batch_size, gene_token_dict):
81
  if classifier_type == "gene":
82
  label_name = "labels"
83
  elif classifier_type == "cell":
 
85
 
86
  predict_logits = []
87
  predict_labels = []
 
 
 
 
 
 
 
 
88
  model.eval()
89
 
90
  # ensure there is at least 2 examples in each batch to avoid incorrect tensor dims
 
99
  for i in trange(0, evalset_len, forward_batch_size):
100
  max_range = min(i + forward_batch_size, evalset_len)
101
  batch_evalset = evalset.select([i for i in range(i, max_range)])
 
 
 
 
 
102
  padded_batch = preprocess_classifier_batch(
103
  batch_evalset, max_evalset_len, label_name, gene_token_dict
104
  )
 
105
  padded_batch.set_format(type="torch")
106
 
107
  # For datasets>=4.0.0, convert to dict to avoid format issues
 
134
  y_pred = [vote(item[0]) for item in logit_label_paired]
135
  y_true = [item[1] for item in logit_label_paired]
136
  logits_list = [item[0] for item in logit_label_paired]
137
+ return y_pred, y_true, logits_list
 
138
 
139
 
140
  def get_metrics(y_pred, y_true, logits_list, num_classes, labels):
geneformer/in_silico_perturber_stats.py CHANGED
@@ -726,9 +726,6 @@ class InSilicoPerturberStats:
726
  | "start_state": "dcm",
727
  | "goal_state": "nf",
728
  | "alt_states": ["hcm", "other1", "other2"]}
729
- pickle_suffix : None, str
730
- | Suffix to subselect intermediate raw files for analysis.
731
- | Default output of InSilicoPerturber uses suffix "_raw.pickle".
732
  model_version : str
733
  | To auto-select settings for model version other than current default.
734
  | Current options: V1: models pretrained on ~30M cells, V2: models pretrained on ~104M cells
 
726
  | "start_state": "dcm",
727
  | "goal_state": "nf",
728
  | "alt_states": ["hcm", "other1", "other2"]}
 
 
 
729
  model_version : str
730
  | To auto-select settings for model version other than current default.
731
  | Current options: V1: models pretrained on ~30M cells, V2: models pretrained on ~104M cells
geneformer/perturber_utils.py CHANGED
@@ -508,11 +508,6 @@ def make_perturbation_batch(
508
  def make_perturbation_batch_special(
509
  example_cell, perturb_type, tokens_to_perturb, anchor_token, combo_lvl, num_proc
510
  ) -> tuple[Dataset, List[int]]:
511
-
512
- # For datasets>=4.0.0, convert to dict to avoid format issues
513
- if int(datasets.__version__.split(".")[0]) >= 4:
514
- example_cell = example_cell[:]
515
-
516
  if combo_lvl == 0 and tokens_to_perturb == "all":
517
  if perturb_type in ["overexpress", "activate"]:
518
  range_start = 1
 
508
  def make_perturbation_batch_special(
509
  example_cell, perturb_type, tokens_to_perturb, anchor_token, combo_lvl, num_proc
510
  ) -> tuple[Dataset, List[int]]:
 
 
 
 
 
511
  if combo_lvl == 0 and tokens_to_perturb == "all":
512
  if perturb_type in ["overexpress", "activate"]:
513
  range_start = 1
geneformer/tokenizer.py CHANGED
@@ -3,7 +3,7 @@ Geneformer tokenizer.
3
 
4
  **Input data:**
5
 
6
- | *Required format:* raw counts scRNAseq data without feature selection as .loom, .h5ad, or .zarr file.
7
  | *Required row (gene) attribute:* "ensembl_id"; Ensembl ID for each gene.
8
  | *Required col (cell) attribute:* "n_counts"; total read counts in that cell.
9
 
@@ -20,9 +20,9 @@ Geneformer tokenizer.
20
 
21
  **Description:**
22
 
23
- | Input data is a directory with .loom, .h5ad, or .zarr files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.
24
 
25
- | The discussion below references the .loom file format, but the analagous labels are required for .h5ad and .zarr files, just that they will be column instead of row attributes and vice versa due to the transposed format of the file types.
26
 
27
  | Genes should be labeled with Ensembl IDs (loom row attribute "ensembl_id"), which provide a unique identifier for conversion to tokens. Other forms of gene annotations (e.g. gene names) can be converted to Ensembl IDs via Ensembl Biomart. Cells should be labeled with the total read count in the cell (loom column attribute "n_counts") to be used for normalization.
28
 
@@ -30,7 +30,7 @@ Geneformer tokenizer.
30
 
31
  | Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.
32
 
33
- | If one's data is in other formats besides .loom, .h5ad, or .zarr, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom, .h5ad, or .zarr format prior to running the transcriptome tokenizer.
34
 
35
  | OF NOTE: Use model_version to auto-select settings for model version other than current default. For V1 model series (original Geneformer pretrained in 2021 on ~30M cells), one must use correct corresponding token dictionary and gene median file, set special_token to False, and set model_input_size to 2048. This argument enables auto-selection of these settings. (For V2 model series, special_token must be True and model_input_size is 4096.)
36
 
@@ -46,7 +46,6 @@ from collections import Counter
46
  from pathlib import Path
47
  from typing import Literal
48
 
49
- import anndata as ad
50
  import loompy as lp
51
  import numpy as np
52
  import pandas as pd
@@ -201,16 +200,13 @@ def sum_ensembl_ids(
201
  dsout.add_columns(processed_array, col_attrs=view.ca)
202
  return dedup_filename
203
 
204
- elif file_format in ["h5ad", "zarr"]:
205
  """
206
  Map Ensembl IDs from gene mapping dictionary. If duplicate Ensembl IDs are found, sum counts together.
207
  Returns adata object with deduplicated Ensembl IDs.
208
  """
209
 
210
- if file_format == "h5ad":
211
- data = sc.read_h5ad(str(data_directory))
212
- else: # zarr
213
- data = ad.read_zarr(str(data_directory))
214
 
215
  if use_h5ad_index:
216
  data.var["ensembl_id"] = list(data.var.index)
@@ -240,7 +236,7 @@ def sum_ensembl_ids(
240
  gene for gene in ensembl_ids if gene in gene_token_dict.keys()
241
  ]
242
  if len(ensembl_id_check) == len(set(ensembl_id_check)):
243
- return data
244
  else:
245
  raise ValueError("Error: data Ensembl IDs non-unique.")
246
 
@@ -439,7 +435,7 @@ class TranscriptomeTokenizer:
439
  data_directory: Path | str,
440
  output_directory: Path | str,
441
  output_prefix: str,
442
- file_format: Literal["loom", "h5ad", "zarr"] = "loom",
443
  input_identifier: str = "",
444
  use_generator: bool = False,
445
  ):
@@ -455,9 +451,9 @@ class TranscriptomeTokenizer:
455
  output_prefix : str
456
  | Prefix for output .dataset
457
  file_format : str
458
- | Format of input files. Can be "loom", "h5ad", or "zarr".
459
  input_identifier : str
460
- | Substring identifier for input .loom, .h5ad, or .zarr, only matches are tokenized
461
  | Default is no identifier, tokenizes all files in provided directory.
462
  use_generator : bool
463
  | Whether to use generator or dict for tokenization.
@@ -477,7 +473,7 @@ class TranscriptomeTokenizer:
477
  tokenized_dataset.save_to_disk(str(output_path))
478
 
479
  def tokenize_files(
480
- self, data_directory, file_format: Literal["loom", "h5ad", "zarr"] = "loom", input_identifier: str = ""
481
  ):
482
  tokenized_cells = []
483
  tokenized_counts = []
@@ -489,7 +485,7 @@ class TranscriptomeTokenizer:
489
 
490
  # loops through directories to tokenize .loom files
491
  file_found = 0
492
- # loops through directories to tokenize .loom, .h5ad, or .zarr files
493
  tokenize_file_fn = (
494
  self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
495
  )
@@ -500,7 +496,7 @@ class TranscriptomeTokenizer:
500
  for file_path in data_directory.glob(file_match):
501
  file_found = 1
502
  print(f"Tokenizing {file_path}")
503
- file_tokenized_cells, file_cell_metadata, file_tokenized_counts = tokenize_file_fn(file_path, file_format=file_format)
504
  tokenized_cells += file_tokenized_cells
505
  tokenized_counts += file_tokenized_counts
506
  if self.custom_attr_name_dict is not None:
@@ -518,7 +514,7 @@ class TranscriptomeTokenizer:
518
  raise
519
  return tokenized_cells, cell_metadata, tokenized_counts
520
 
521
- def tokenize_anndata(self, adata_file_path, target_sum=10_000, file_format="h5ad"):
522
  adata = sum_ensembl_ids(
523
  adata_file_path,
524
  self.collapse_gene_ids,
@@ -526,7 +522,7 @@ class TranscriptomeTokenizer:
526
  self.gene_token_dict,
527
  self.custom_attr_name_dict,
528
  self.use_h5ad_index,
529
- file_format=file_format,
530
  chunk_size=self.chunk_size,
531
  )
532
 
@@ -616,9 +612,7 @@ class TranscriptomeTokenizer:
616
 
617
  return tokenized_cells, file_cell_metadata, tokenized_counts
618
 
619
- def tokenize_loom(self, loom_file_path, target_sum=10_000, file_format="loom"):
620
- tokenized_counts = [] # keep_counts not implemented for tokenize_loom
621
-
622
  if self.custom_attr_name_dict is not None:
623
  file_cell_metadata = {
624
  attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
@@ -633,7 +627,7 @@ class TranscriptomeTokenizer:
633
  self.gene_token_dict,
634
  self.custom_attr_name_dict,
635
  use_h5ad_index=False,
636
- file_format=file_format,
637
  chunk_size=self.chunk_size,
638
  )
639
 
@@ -706,7 +700,7 @@ class TranscriptomeTokenizer:
706
  del data.ra["ensembl_id_collapsed"]
707
 
708
 
709
- return tokenized_cells, file_cell_metadata, tokenized_counts
710
 
711
  def create_dataset(
712
  self,
 
3
 
4
  **Input data:**
5
 
6
+ | *Required format:* raw counts scRNAseq data without feature selection as a .loom or anndata file.
7
  | *Required row (gene) attribute:* "ensembl_id"; Ensembl ID for each gene.
8
  | *Required col (cell) attribute:* "n_counts"; total read counts in that cell.
9
 
 
20
 
21
  **Description:**
22
 
23
+ | Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.
24
 
25
+ | The discussion below references the .loom file format, but the analogous labels are required for .h5ad files, just that they will be column instead of row attributes and vice versa due to the transposed format of the two file types.
26
 
27
  | Genes should be labeled with Ensembl IDs (loom row attribute "ensembl_id"), which provide a unique identifier for conversion to tokens. Other forms of gene annotations (e.g. gene names) can be converted to Ensembl IDs via Ensembl Biomart. Cells should be labeled with the total read count in the cell (loom column attribute "n_counts") to be used for normalization.
28
 
 
30
 
31
  | Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.
32
 
33
+ | If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.
34
 
35
  | OF NOTE: Use model_version to auto-select settings for model version other than current default. For V1 model series (original Geneformer pretrained in 2021 on ~30M cells), one must use correct corresponding token dictionary and gene median file, set special_token to False, and set model_input_size to 2048. This argument enables auto-selection of these settings. (For V2 model series, special_token must be True and model_input_size is 4096.)
36
 
 
46
  from pathlib import Path
47
  from typing import Literal
48
 
 
49
  import loompy as lp
50
  import numpy as np
51
  import pandas as pd
 
200
  dsout.add_columns(processed_array, col_attrs=view.ca)
201
  return dedup_filename
202
 
203
+ elif file_format == "h5ad":
204
  """
205
  Map Ensembl IDs from gene mapping dictionary. If duplicate Ensembl IDs are found, sum counts together.
206
  Returns adata object with deduplicated Ensembl IDs.
207
  """
208
 
209
+ data = sc.read_h5ad(str(data_directory))
 
 
 
210
 
211
  if use_h5ad_index:
212
  data.var["ensembl_id"] = list(data.var.index)
 
236
  gene for gene in ensembl_ids if gene in gene_token_dict.keys()
237
  ]
238
  if len(ensembl_id_check) == len(set(ensembl_id_check)):
239
+ return data_directory
240
  else:
241
  raise ValueError("Error: data Ensembl IDs non-unique.")
242
 
 
435
  data_directory: Path | str,
436
  output_directory: Path | str,
437
  output_prefix: str,
438
+ file_format: Literal["loom", "h5ad"] = "loom",
439
  input_identifier: str = "",
440
  use_generator: bool = False,
441
  ):
 
451
  output_prefix : str
452
  | Prefix for output .dataset
453
  file_format : str
454
+ | Format of input files. Can be "loom" or "h5ad".
455
  input_identifier : str
456
+ | Substring identifier for input .loom or .h5ad, only matches are tokenized
457
  | Default is no identifier, tokenizes all files in provided directory.
458
  use_generator : bool
459
  | Whether to use generator or dict for tokenization.
 
473
  tokenized_dataset.save_to_disk(str(output_path))
474
 
475
  def tokenize_files(
476
+ self, data_directory, file_format: Literal["loom", "h5ad"] = "loom", input_identifier: str = ""
477
  ):
478
  tokenized_cells = []
479
  tokenized_counts = []
 
485
 
486
  # loops through directories to tokenize .loom files
487
  file_found = 0
488
+ # loops through directories to tokenize .loom or .h5ad files
489
  tokenize_file_fn = (
490
  self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
491
  )
 
496
  for file_path in data_directory.glob(file_match):
497
  file_found = 1
498
  print(f"Tokenizing {file_path}")
499
+ file_tokenized_cells, file_cell_metadata, file_tokenized_counts = tokenize_file_fn(file_path)
500
  tokenized_cells += file_tokenized_cells
501
  tokenized_counts += file_tokenized_counts
502
  if self.custom_attr_name_dict is not None:
 
514
  raise
515
  return tokenized_cells, cell_metadata, tokenized_counts
516
 
517
+ def tokenize_anndata(self, adata_file_path, target_sum=10_000):
518
  adata = sum_ensembl_ids(
519
  adata_file_path,
520
  self.collapse_gene_ids,
 
522
  self.gene_token_dict,
523
  self.custom_attr_name_dict,
524
  self.use_h5ad_index,
525
+ file_format="h5ad",
526
  chunk_size=self.chunk_size,
527
  )
528
 
 
612
 
613
  return tokenized_cells, file_cell_metadata, tokenized_counts
614
 
615
+ def tokenize_loom(self, loom_file_path, target_sum=10_000):
 
 
616
  if self.custom_attr_name_dict is not None:
617
  file_cell_metadata = {
618
  attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
 
627
  self.gene_token_dict,
628
  self.custom_attr_name_dict,
629
  use_h5ad_index=False,
630
+ file_format="loom",
631
  chunk_size=self.chunk_size,
632
  )
633
 
 
700
  del data.ra["ensembl_id_collapsed"]
701
 
702
 
703
+ return tokenized_cells, file_cell_metadata
704
 
705
  def create_dataset(
706
  self,