ctheodoris
/

Geneformer

@@ -94,14 +94,14 @@ class TranscriptomeTokenizer:
             | Keys are the names of the attributes in the loom file.
             | Values are the names of the attributes in the dataset.
         nproc : int
-            Number of processes to use for dataset mapping.
         chunk_size : int = 512
-            Chunk size for anndata tokenizer.
         gene_median_file : Path
-            Path to pickle file containing dictionary of non-zero median
-            gene expression values across Genecorpus-30M.
         token_dictionary_file : Path
-            Path to pickle file containing token dictionary (Ensembl IDs:token).
         """
         # dictionary of custom attributes {input .loom column name: output dataset column name}
         self.custom_attr_name_dict = custom_attr_name_dict
@@ -141,15 +141,15 @@ class TranscriptomeTokenizer:
         **Parameters:**
         data_directory : Path
-            Path to directory containing loom files or anndata files
         output_directory : Path
-            Path to directory where tokenized data will be saved as .dataset
         output_prefix : str
-            Prefix for output .dataset
         file_format : str
-            Format of input files. Can be "loom" or "h5ad".
         use_generator : bool
-            Whether to use generator or dict for tokenization.
         """
         tokenized_cells, cell_metadata = self.tokenize_files(
             Path(data_directory), file_format

             | Keys are the names of the attributes in the loom file.
             | Values are the names of the attributes in the dataset.
         nproc : int
+            | Number of processes to use for dataset mapping.
         chunk_size : int = 512
+            | Chunk size for anndata tokenizer.
         gene_median_file : Path
+            | Path to pickle file containing dictionary of non-zero median
+            | gene expression values across Genecorpus-30M.
         token_dictionary_file : Path
+            | Path to pickle file containing token dictionary (Ensembl IDs:token).
         """
         # dictionary of custom attributes {input .loom column name: output dataset column name}
         self.custom_attr_name_dict = custom_attr_name_dict
         **Parameters:**
         data_directory : Path
+            | Path to directory containing loom files or anndata files.
         output_directory : Path
+            | Path to directory where tokenized data will be saved as .dataset.
         output_prefix : str
+            | Prefix for output .dataset.
         file_format : str
+            | Format of input files. Can be "loom" or "h5ad".
         use_generator : bool
+            | Whether to use generator or dict for tokenization.
         """
         tokenized_cells, cell_metadata = self.tokenize_files(
             Path(data_directory), file_format