File size: 2,137 Bytes
472c9c6
 
 
dc7bee8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
---

license: gpl-3.0
---


## Using CrossDNA 8.1M
```python

import os

# Export the opt-out flag ahead of the torch import. setdefault leaves any
# value the caller has already set untouched.
# NOTE(review): presumably read by the model's remote code - confirm.
os.environ.setdefault("DISABLE_TORCH_COMPILE", "1")

import torch

# Replace torch.compile with a pass-through so that anything handed to it is
# returned as-is instead of being compiled.
if hasattr(torch, "compile"):

    def _no_compile(fn=None, *args, **kwargs):
        """Stand-in for ``torch.compile`` that performs no compilation.

        Called with a function (``torch.compile(fn, ...)`` or a bare
        ``@torch.compile``), it returns that function unchanged. Called
        without one (``@torch.compile(**options)``), it returns a decorator
        that hands back its argument unchanged. All other positional and
        keyword arguments are accepted and ignored.
        """
        if fn is not None:
            return fn

        def passthrough(f):
            return f

        return passthrough

    torch.compile = _no_compile



import torch

from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hugging Face Remote Repository Example
# NOTE: with local_files_only=True nothing is downloaded, so the repository
# files must already be in the local Hugging Face cache; drop that argument
# to fetch them from the Hub.
#
# repo_id = "chengCCC/CrossDNA_pretrain"
# subdir = "8.1M"
#
# tok = AutoTokenizer.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True, local_files_only=True)
# model = AutoModelForMaskedLM.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True, local_files_only=True).eval()

# Local Model Example
# Point MODEL_DIR at the directory that holds your local copy of the model.
MODEL_DIR = "/data/zhaol/projects/huggingface_crossdna/crossdna"

# trust_remote_code=True lets transformers run the custom code shipped with
# the checkpoint; local_files_only=True keeps loading strictly offline.
tok = AutoTokenizer.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True,
    local_files_only=True,
)
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True,
    local_files_only=True,
)
model.eval()  # evaluation mode (in place)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)



# 512bp DNA sequence
seq = "ACGT" * 128

# Tokenize without special tokens so the ID tensor lines up with the bases.
enc = tok(seq, return_tensors="pt", add_special_tokens=False)
x = enc["input_ids"].to(device)  # [1, L]

# Key: map tokenizer IDs back to the model-required range [0..4]
#   A, C, G, T, N: 7..11 -> 0..4
#   All other tokens (UNK/PAD/SEP/CLS/...) are treated as N (=4)
# Both bounds are checked: an ID above 11 would otherwise be shifted to a
# value greater than 4, i.e. outside the range the model accepts.
x = torch.where((x >= 7) & (x <= 11), x - 7, torch.full_like(x, 4))



# ======  embedding ======
# Remember the backbone's current mode flags (False when a flag is absent) so
# they can be put back after the embeddings have been extracted.
was_pretrain = getattr(model.backbone, "pretrain", False)
was_for_repr = getattr(model.backbone, "for_representation", False)

try:
    # NOTE(review): presumably these flags make the backbone return
    # per-position representations rather than pretraining outputs - confirm
    # against the model code.
    model.backbone.pretrain = False
    model.backbone.for_representation = True

    # inference_mode disables autograd tracking for the forward pass.
    with torch.inference_mode():
        embeddings, _ = model.backbone(x)   # [B, L, H]
finally:
    # Restore the flags even if the forward pass raised, so the model is never
    # left in the modified state.
    model.backbone.pretrain = was_pretrain
    model.backbone.for_representation = was_for_repr

print("input_ids.shape =", tuple(x.shape)) # input_ids.shape = (1, 512)
print("embeddings.shape =", tuple(embeddings.shape)) # embeddings.shape = (1, 512, 128)



```