Add checks for custom attributes and n_counts prior to summing Ensembl IDs
#461
by
hchen725
- opened
- geneformer/tokenizer.py +17 -0
geneformer/tokenizer.py
CHANGED
|
@@ -88,6 +88,7 @@ def sum_ensembl_ids(
|
|
| 88 |
collapse_gene_ids,
|
| 89 |
gene_mapping_dict,
|
| 90 |
gene_token_dict,
|
|
|
|
| 91 |
file_format="loom",
|
| 92 |
chunk_size=512,
|
| 93 |
):
|
|
@@ -104,6 +105,13 @@ def sum_ensembl_ids(
|
|
| 104 |
"ensembl_id_collapsed" not in data.ra.keys()
|
| 105 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
# Get the ensembl ids that exist in data
|
| 109 |
ensembl_ids = data.ra.ensembl_id
|
|
@@ -208,6 +216,13 @@ def sum_ensembl_ids(
|
|
| 208 |
assert (
|
| 209 |
"ensembl_id_collapsed" not in data.var.columns
|
| 210 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
|
| 213 |
# Get the ensembl ids that exist in data
|
|
@@ -461,6 +476,7 @@ class TranscriptomeTokenizer:
|
|
| 461 |
self.collapse_gene_ids,
|
| 462 |
self.gene_mapping_dict,
|
| 463 |
self.gene_token_dict,
|
|
|
|
| 464 |
file_format="h5ad",
|
| 465 |
chunk_size=self.chunk_size,
|
| 466 |
)
|
|
@@ -537,6 +553,7 @@ class TranscriptomeTokenizer:
|
|
| 537 |
self.collapse_gene_ids,
|
| 538 |
self.gene_mapping_dict,
|
| 539 |
self.gene_token_dict,
|
|
|
|
| 540 |
file_format="loom",
|
| 541 |
chunk_size=self.chunk_size,
|
| 542 |
)
|
|
|
|
| 88 |
collapse_gene_ids,
|
| 89 |
gene_mapping_dict,
|
| 90 |
gene_token_dict,
|
| 91 |
+
custom_attr_name_dict,
|
| 92 |
file_format="loom",
|
| 93 |
chunk_size=512,
|
| 94 |
):
|
|
|
|
| 105 |
"ensembl_id_collapsed" not in data.ra.keys()
|
| 106 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
| 107 |
|
| 108 |
+
assert (
|
| 109 |
+
"n_counts" in data.ca.keys()
|
| 110 |
+
), "'n_counts' column missing from data.ca.keys()"
|
| 111 |
+
|
| 112 |
+
if custom_attr_name_dict is not None:
|
| 113 |
+
for label in custom_attr_name_dict:
|
| 114 |
+
assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
|
| 115 |
|
| 116 |
# Get the ensembl ids that exist in data
|
| 117 |
ensembl_ids = data.ra.ensembl_id
|
|
|
|
| 216 |
assert (
|
| 217 |
"ensembl_id_collapsed" not in data.var.columns
|
| 218 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
| 219 |
+
assert (
|
| 220 |
+
"n_counts" in data.obs.columns
|
| 221 |
+
), "'n_counts' column missing from data.obs"
|
| 222 |
+
|
| 223 |
+
if custom_attr_name_dict is not None:
|
| 224 |
+
for label in custom_attr_name_dict:
|
| 225 |
+
assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
|
| 226 |
|
| 227 |
|
| 228 |
# Get the ensembl ids that exist in data
|
|
|
|
| 476 |
self.collapse_gene_ids,
|
| 477 |
self.gene_mapping_dict,
|
| 478 |
self.gene_token_dict,
|
| 479 |
+
self.custom_attr_name_dict,
|
| 480 |
file_format="h5ad",
|
| 481 |
chunk_size=self.chunk_size,
|
| 482 |
)
|
|
|
|
| 553 |
self.collapse_gene_ids,
|
| 554 |
self.gene_mapping_dict,
|
| 555 |
self.gene_token_dict,
|
| 556 |
+
self.custom_attr_name_dict,
|
| 557 |
file_format="loom",
|
| 558 |
chunk_size=self.chunk_size,
|
| 559 |
)
|