Commit
·
70cd2c0
1
Parent(s):
dc0bd2a
update README
Browse files
README.md
CHANGED
|
@@ -2,9 +2,10 @@
|
|
| 2 |
# iSEEEK
|
| 3 |
A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
|
| 4 |
|
| 5 |
-
```python
|
| 6 |
## A simple pipeline for single-cell analysis
|
|
|
|
| 7 |
import torch
|
|
|
|
| 8 |
import re
|
| 9 |
from tqdm import tqdm
|
| 10 |
import numpy as np
|
|
@@ -31,8 +32,8 @@ model.eval()
|
|
| 31 |
|
| 32 |
|
| 33 |
## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
|
| 34 |
-
lines = [s.strip() for s in gzip.open("pbmc_ranking.txt.gz")]
|
| 35 |
-
labels = [s.strip() for s in gzip.open("pbmc_label.txt.gz")]
|
| 36 |
labels = np.asarray(labels)
|
| 37 |
|
| 38 |
|
|
@@ -66,3 +67,31 @@ sc.pl.umap(adata, color=['celltype','leiden'],save= "UMAP")
|
|
| 66 |
|
| 67 |
```
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
# iSEEEK
|
| 3 |
A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
|
| 4 |
|
|
|
|
| 5 |
## A simple pipeline for single-cell analysis
|
| 6 |
+
```python
|
| 7 |
import torch
|
| 8 |
+
import gzip
|
| 9 |
import re
|
| 10 |
from tqdm import tqdm
|
| 11 |
import numpy as np
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
|
| 35 |
+
lines = [s.strip().decode() for s in gzip.open("pbmc_ranking.txt.gz")]
|
| 36 |
+
labels = [s.strip().decode() for s in gzip.open("pbmc_label.txt.gz")]
|
| 37 |
labels = np.asarray(labels)
|
| 38 |
|
| 39 |
|
|
|
|
| 67 |
|
| 68 |
```
|
| 69 |
|
| 70 |
+
## Extract token representations
|
| 71 |
+
```python
|
| 72 |
+
|
| 73 |
+
cell_counts = len(lines)
|
| 74 |
+
x = np.zeros((cell_counts, len(tokenizer)), dtype=np.float16)
|
| 75 |
+
|
| 76 |
+
for a in tqdm(dl, total=len(dl)):
|
| 77 |
+
batch = tokenizer(a, max_length=128, truncation=True,
|
| 78 |
+
padding=True, return_tensors="pt")
|
| 79 |
+
|
| 80 |
+
for k, v in batch.items():
|
| 81 |
+
batch[k] = v.to(device)
|
| 82 |
+
|
| 83 |
+
with torch.no_grad():
|
| 84 |
+
out = model(**batch)
|
| 85 |
+
|
| 86 |
+
eos_idxs = batch.attention_mask.sum(dim=1) - 1
|
| 87 |
+
f = out.last_hidden_state
|
| 88 |
+
batch_size = f.shape[0]
|
| 89 |
+
input_ids = batch.input_ids
|
| 90 |
+
|
| 91 |
+
for i in range(batch_size):
|
| 92 |
+
##genes = tokenizer.batch_decode(input_ids[i])
|
| 93 |
+
token_norms = [f[i][j].norm().item() for j in range(1, eos_idxs[i])]
|
| 94 |
+
idxs = input_ids[i].tolist()[1:eos_idxs[i]]
|
| 95 |
+
x[counter, idxs] = token_norms
|
| 96 |
+
counter = counter + 1
|
| 97 |
+
```
|