chengCCC committed on
Commit
dc7bee8
·
verified ·
1 Parent(s): e5e9905

Add README

Browse files
Files changed (1) hide show
  1. README.md +63 -0
README.md CHANGED
@@ -1,3 +1,66 @@
1
  ---
2
  license: gpl-3.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: gpl-3.0
3
  ---
4
+
5
+ ## Using CrossDNA 8.1M
6
+ ```python
7
+ import os
8
+ os.environ.setdefault("DISABLE_TORCH_COMPILE", "1")
9
+
10
+ import torch
11
+ if hasattr(torch, "compile"):
12
+ def _no_compile(fn=None, *args, **kwargs):
13
+ if fn is None:
14
+ def deco(f): return f
15
+ return deco
16
+ return fn
17
+ torch.compile = _no_compile
18
+
19
+ import torch
20
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
21
+
22
+ # Hugging Face Remote Repository Example
23
+ # repo_id = "chengCCC/CrossDNA_pretrain"
24
+ # subdir = "8.1M"
25
+
26
+ # tok = AutoTokenizer.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True)
27
+ # model = AutoModelForMaskedLM.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True).eval()
28
+
29
+ # Local Model Example
30
+ MODEL_DIR = "/path/to/crossdna"  # replace with the directory containing the downloaded model files
31
+ tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)
32
+ model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True).eval()
33
+
34
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
+ model.to(device)
36
+
37
+ # 512bp DNA sequence
38
+ seq = "ACGT" * 128
39
+
40
+
41
+ enc = tok(seq, return_tensors="pt", add_special_tokens=False)
42
+
43
+
44
+ x = enc["input_ids"].to(device) # [1, L]
45
+
46
+ # Key: map tokenizer IDs back to the model-required range [0..4]
47
+ # A, C, G, T, N: 7..11 -> 0..4
48
+ # All other tokens (UNK/PAD/SEP/CLS/...) are treated as N (=4)
49
+ x = torch.where(x >= 7, x - 7, torch.full_like(x, 4))
50
+
51
+ # ====== embedding ======
52
+ was_pretrain = getattr(model.backbone, "pretrain", False)
53
+ was_for_repr = getattr(model.backbone, "for_representation", False)
54
+ model.backbone.pretrain = False
55
+ model.backbone.for_representation = True
56
+
57
+ with torch.inference_mode():
58
+ embeddings, _ = model.backbone(x) # [B, L, H]
59
+
60
+ print("input_ids.shape =", tuple(x.shape)) # input_ids.shape = (1, 512)
61
+ print("embeddings.shape =", tuple(embeddings.shape)) # embeddings.shape = (1, 512, 128)
62
+
63
+ model.backbone.pretrain = was_pretrain
64
+ model.backbone.for_representation = was_for_repr
65
+
66
+ ```