Update geneformer/tokenizer.py
Browse files
Add ensembl_id_check under `if not collapse_gene_ids`
- geneformer/tokenizer.py +8 -2
geneformer/tokenizer.py
CHANGED
|
@@ -110,7 +110,10 @@ def sum_ensembl_ids(
|
|
| 110 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 111 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 112 |
if not collapse_gene_ids:
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
| 114 |
return data_directory
|
| 115 |
else:
|
| 116 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
|
@@ -212,7 +215,10 @@ def sum_ensembl_ids(
|
|
| 212 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 213 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 214 |
if not collapse_gene_ids:
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
| 216 |
return data_directory
|
| 217 |
else:
|
| 218 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
|
|
|
| 110 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 111 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 112 |
if not collapse_gene_ids:
|
| 113 |
+
ensembl_id_check = [
|
| 114 |
+
gene for gene in ensembl_ids if gene in gene_token_dict.keys()
|
| 115 |
+
]
|
| 116 |
+
if len(ensembl_id_check) == len(set(ensembl_id_check)):
|
| 117 |
return data_directory
|
| 118 |
else:
|
| 119 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
|
|
|
| 215 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 216 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 217 |
if not collapse_gene_ids:
|
| 218 |
+
ensembl_id_check = [
|
| 219 |
+
gene for gene in ensembl_ids if gene in gene_token_dict.keys()
|
| 220 |
+
]
|
| 221 |
+
if len(ensembl_id_check) == len(set(ensembl_id_check)):
|
| 222 |
return data_directory
|
| 223 |
else:
|
| 224 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|