Add README
Browse files
README.md
CHANGED
|
@@ -1,3 +1,66 @@
|
|
| 1 |
---
|
| 2 |
license: gpl-3.0
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: gpl-3.0
|
| 3 |
---
|
| 4 |
+
|
| 5 |
+
## Using CrossDNA 8.1M
|
| 6 |
+
```python
|
| 7 |
+
import os
|
| 8 |
+
os.environ.setdefault("DISABLE_TORCH_COMPILE", "1")
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
if hasattr(torch, "compile"):
|
| 12 |
+
def _no_compile(fn=None, *args, **kwargs):
|
| 13 |
+
if fn is None:
|
| 14 |
+
def deco(f): return f
|
| 15 |
+
return deco
|
| 16 |
+
return fn
|
| 17 |
+
torch.compile = _no_compile
|
| 18 |
+
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 21 |
+
|
| 22 |
+
# Hugging Face Remote Repository Example
|
| 23 |
+
# repo_id = "chengCCC/CrossDNA_pretrain"
|
| 24 |
+
# subdir = "8.1M"
|
| 25 |
+
|
| 26 |
+
# tok = AutoTokenizer.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True)
|
| 27 |
+
# model = AutoModelForMaskedLM.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True).eval()
|
| 28 |
+
|
| 29 |
+
# Local Model Example
|
| 30 |
+
MODEL_DIR = "/path/to/crossdna"  # replace with the directory where you downloaded the model
|
| 31 |
+
tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)
|
| 32 |
+
model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True).eval()
|
| 33 |
+
|
| 34 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 35 |
+
model.to(device)
|
| 36 |
+
|
| 37 |
+
# 512bp DNA sequence
|
| 38 |
+
seq = "ACGT" * 128
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
enc = tok(seq, return_tensors="pt", add_special_tokens=False)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
x = enc["input_ids"].to(device) # [1, L]
|
| 45 |
+
|
| 46 |
+
# Key: map tokenizer IDs back to the model-required range [0..4]
|
| 47 |
+
# A, C, G, T, N: 7..11 -> 0..4
|
| 48 |
+
# All other tokens (UNK/PAD/SEP/CLS/...) are treated as N (=4)
|
| 49 |
+
x = torch.where(x >= 7, x - 7, torch.full_like(x, 4))
|
| 50 |
+
|
| 51 |
+
# ====== embedding ======
|
| 52 |
+
was_pretrain = getattr(model.backbone, "pretrain", False)
|
| 53 |
+
was_for_repr = getattr(model.backbone, "for_representation", False)
|
| 54 |
+
model.backbone.pretrain = False
|
| 55 |
+
model.backbone.for_representation = True
|
| 56 |
+
|
| 57 |
+
with torch.inference_mode():
|
| 58 |
+
embeddings, _ = model.backbone(x) # [B, L, H]
|
| 59 |
+
|
| 60 |
+
print("input_ids.shape =", tuple(x.shape)) # input_ids.shape = (1, 512)
|
| 61 |
+
print("embeddings.shape =", tuple(embeddings.shape)) # embeddings.shape = (1, 512, 128)
|
| 62 |
+
|
| 63 |
+
model.backbone.pretrain = was_pretrain
|
| 64 |
+
model.backbone.for_representation = was_for_repr
|
| 65 |
+
|
| 66 |
+
```
|