Merge remote-tracking branch 'origin/main'
Browse files
- geneformer/gene_name_id_dict_gc95M.pkl +2 -2
- geneformer/mtl/data.py +1 -1
- geneformer/tokenizer.py +17 -0
geneformer/gene_name_id_dict_gc95M.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fabfa0c2f49c598c59ae432a32c3499a5908c033756c663b5e0cddf58deea8e1
|
| 3 |
+
size 1660882
|
geneformer/mtl/data.py
CHANGED
|
@@ -112,7 +112,7 @@ def preload_and_process_data(config):
|
|
| 112 |
# Validate that the mappings match
|
| 113 |
validate_label_mappings(config)
|
| 114 |
|
| 115 |
-
return (*train_data, *val_data
|
| 116 |
|
| 117 |
|
| 118 |
def validate_label_mappings(config):
|
|
|
|
| 112 |
# Validate that the mappings match
|
| 113 |
validate_label_mappings(config)
|
| 114 |
|
| 115 |
+
return (*train_data[:2], *val_data) # Return train and val data along with mappings
|
| 116 |
|
| 117 |
|
| 118 |
def validate_label_mappings(config):
|
geneformer/tokenizer.py
CHANGED
|
@@ -88,6 +88,7 @@ def sum_ensembl_ids(
|
|
| 88 |
collapse_gene_ids,
|
| 89 |
gene_mapping_dict,
|
| 90 |
gene_token_dict,
|
|
|
|
| 91 |
file_format="loom",
|
| 92 |
chunk_size=512,
|
| 93 |
):
|
|
@@ -104,6 +105,13 @@ def sum_ensembl_ids(
|
|
| 104 |
"ensembl_id_collapsed" not in data.ra.keys()
|
| 105 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
# Get the ensembl ids that exist in data
|
| 109 |
ensembl_ids = data.ra.ensembl_id
|
|
@@ -208,6 +216,13 @@ def sum_ensembl_ids(
|
|
| 208 |
assert (
|
| 209 |
"ensembl_id_collapsed" not in data.var.columns
|
| 210 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
|
| 213 |
# Get the ensembl ids that exist in data
|
|
@@ -461,6 +476,7 @@ class TranscriptomeTokenizer:
|
|
| 461 |
self.collapse_gene_ids,
|
| 462 |
self.gene_mapping_dict,
|
| 463 |
self.gene_token_dict,
|
|
|
|
| 464 |
file_format="h5ad",
|
| 465 |
chunk_size=self.chunk_size,
|
| 466 |
)
|
|
@@ -537,6 +553,7 @@ class TranscriptomeTokenizer:
|
|
| 537 |
self.collapse_gene_ids,
|
| 538 |
self.gene_mapping_dict,
|
| 539 |
self.gene_token_dict,
|
|
|
|
| 540 |
file_format="loom",
|
| 541 |
chunk_size=self.chunk_size,
|
| 542 |
)
|
|
|
|
| 88 |
collapse_gene_ids,
|
| 89 |
gene_mapping_dict,
|
| 90 |
gene_token_dict,
|
| 91 |
+
custom_attr_name_dict,
|
| 92 |
file_format="loom",
|
| 93 |
chunk_size=512,
|
| 94 |
):
|
|
|
|
| 105 |
"ensembl_id_collapsed" not in data.ra.keys()
|
| 106 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
| 107 |
|
| 108 |
+
assert (
|
| 109 |
+
"n_counts" in data.ca.keys()
|
| 110 |
+
), "'n_counts' column missing from data.ca.keys()"
|
| 111 |
+
|
| 112 |
+
if custom_attr_name_dict is not None:
|
| 113 |
+
for label in custom_attr_name_dict:
|
| 114 |
+
assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
|
| 115 |
|
| 116 |
# Get the ensembl ids that exist in data
|
| 117 |
ensembl_ids = data.ra.ensembl_id
|
|
|
|
| 216 |
assert (
|
| 217 |
"ensembl_id_collapsed" not in data.var.columns
|
| 218 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
| 219 |
+
assert (
|
| 220 |
+
"n_counts" in data.obs.columns
|
| 221 |
+
), "'n_counts' column missing from data.obs"
|
| 222 |
+
|
| 223 |
+
if custom_attr_name_dict is not None:
|
| 224 |
+
for label in custom_attr_name_dict:
|
| 225 |
+
assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
|
| 226 |
|
| 227 |
|
| 228 |
# Get the ensembl ids that exist in data
|
|
|
|
| 476 |
self.collapse_gene_ids,
|
| 477 |
self.gene_mapping_dict,
|
| 478 |
self.gene_token_dict,
|
| 479 |
+
self.custom_attr_name_dict,
|
| 480 |
file_format="h5ad",
|
| 481 |
chunk_size=self.chunk_size,
|
| 482 |
)
|
|
|
|
| 553 |
self.collapse_gene_ids,
|
| 554 |
self.gene_mapping_dict,
|
| 555 |
self.gene_token_dict,
|
| 556 |
+
self.custom_attr_name_dict,
|
| 557 |
file_format="loom",
|
| 558 |
chunk_size=self.chunk_size,
|
| 559 |
)
|