TJMUCH
/

transcriptome-iseeek

Model card Files Files and versions

lixiangchun committed on Dec 10, 2021

Commit

70cd2c0

·

1 Parent(s): dc0bd2a

update README

Files changed (1) hide show

README.md +32 -3

README.md CHANGED Viewed

@@ -2,9 +2,10 @@
 # iSEEEK
 A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
-```python
 ## A simple pipeline for single-cell analysis
 import torch
 import re
 from tqdm import tqdm
 import numpy as np
@@ -31,8 +32,8 @@ model.eval()
 ## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
-lines = [s.strip() for s in gzip.open("pbmc_ranking.txt.gz")]
-labels = [s.strip() for s in gzip.open("pbmc_label.txt.gz")]
 labels = np.asarray(labels)
@@ -66,3 +67,31 @@ sc.pl.umap(adata, color=['celltype','leiden'],save= "UMAP")
 ```

 # iSEEEK
 A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
 ## A simple pipeline for single-cell analysis
+```python
 import torch
+import gzip
 import re
 from tqdm import tqdm
 import numpy as np
 ## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
+lines = [s.strip().decode() for s in gzip.open("pbmc_ranking.txt.gz")]
+labels = [s.strip().decode() for s in gzip.open("pbmc_label.txt.gz")]
 labels = np.asarray(labels)
 ```
+## Extract token representations
+```python
+cell_counts = len(lines)
+x = np.zeros((cell_counts, len(tokenizer)), dtype=np.float16)
+for a in tqdm(dl, total=len(dl)):
+    batch = tokenizer(a, max_length=128, truncation=True,
+               padding=True, return_tensors="pt")
+    for k, v in batch.items():
+        batch[k] = v.to(device)
+    with torch.no_grad():
+        out = model(**batch)
+    eos_idxs = batch.attention_mask.sum(dim=1) - 1
+    f = out.last_hidden_state
+    batch_size = f.shape[0]
+    input_ids = batch.input_ids
+    for i in range(batch_size):
+        ##genes = tokenizer.batch_decode(input_ids[i])
+        token_norms = [f[i][j].norm().item() for j in range(1, eos_idxs[i])]
+        idxs = input_ids[i].tolist()[1:eos_idxs[i]]
+        x[counter, idxs] = token_norms
+        counter = counter + 1
+```