Christina Theodoris
committed on
Commit
·
e3330a6
1
Parent(s):
d1931b1
edit docstring format to highlight options
Browse files
- geneformer/tokenizer.py +10 -10
geneformer/tokenizer.py
CHANGED
|
@@ -94,14 +94,14 @@ class TranscriptomeTokenizer:
|
|
| 94 |
| Keys are the names of the attributes in the loom file.
|
| 95 |
| Values are the names of the attributes in the dataset.
|
| 96 |
nproc : int
|
| 97 |
-
Number of processes to use for dataset mapping.
|
| 98 |
chunk_size: int = 512
|
| 99 |
-
Chunk size for anndata tokenizer.
|
| 100 |
gene_median_file : Path
|
| 101 |
-
Path to pickle file containing dictionary of non-zero median
|
| 102 |
-
gene expression values across Genecorpus-30M.
|
| 103 |
token_dictionary_file : Path
|
| 104 |
-
Path to pickle file containing token dictionary (Ensembl IDs:token).
|
| 105 |
"""
|
| 106 |
# dictionary of custom attributes {output dataset column name: input .loom column name}
|
| 107 |
self.custom_attr_name_dict = custom_attr_name_dict
|
|
@@ -141,15 +141,15 @@ class TranscriptomeTokenizer:
|
|
| 141 |
**Parameters:**
|
| 142 |
|
| 143 |
data_directory : Path
|
| 144 |
-
Path to directory containing loom files or anndata files
|
| 145 |
output_directory : Path
|
| 146 |
-
Path to directory where tokenized data will be saved as .dataset
|
| 147 |
output_prefix : str
|
| 148 |
-
Prefix for output .dataset
|
| 149 |
file_format : str
|
| 150 |
-
Format of input files. Can be "loom" or "h5ad".
|
| 151 |
use_generator : bool
|
| 152 |
-
Whether to use generator or dict for tokenization.
|
| 153 |
"""
|
| 154 |
tokenized_cells, cell_metadata = self.tokenize_files(
|
| 155 |
Path(data_directory), file_format
|
|
|
|
| 94 |
| Keys are the names of the attributes in the loom file.
|
| 95 |
| Values are the names of the attributes in the dataset.
|
| 96 |
nproc : int
|
| 97 |
+
| Number of processes to use for dataset mapping.
|
| 98 |
chunk_size: int = 512
|
| 99 |
+
| Chunk size for anndata tokenizer.
|
| 100 |
gene_median_file : Path
|
| 101 |
+
| Path to pickle file containing dictionary of non-zero median
|
| 102 |
+
| gene expression values across Genecorpus-30M.
|
| 103 |
token_dictionary_file : Path
|
| 104 |
+
| Path to pickle file containing token dictionary (Ensembl IDs:token).
|
| 105 |
"""
|
| 106 |
# dictionary of custom attributes {output dataset column name: input .loom column name}
|
| 107 |
self.custom_attr_name_dict = custom_attr_name_dict
|
|
|
|
| 141 |
**Parameters:**
|
| 142 |
|
| 143 |
data_directory : Path
|
| 144 |
+
| Path to directory containing loom files or anndata files
|
| 145 |
output_directory : Path
|
| 146 |
+
| Path to directory where tokenized data will be saved as .dataset
|
| 147 |
output_prefix : str
|
| 148 |
+
| Prefix for output .dataset
|
| 149 |
file_format : str
|
| 150 |
+
| Format of input files. Can be "loom" or "h5ad".
|
| 151 |
use_generator : bool
|
| 152 |
+
| Whether to use generator or dict for tokenization.
|
| 153 |
"""
|
| 154 |
tokenized_cells, cell_metadata = self.tokenize_files(
|
| 155 |
Path(data_directory), file_format
|