---
license: gpl-3.0
---

## Using CrossDNA 519M
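The example below disables `torch.compile`, loads the tokenizer and model (with `trust_remote_code=True`, since CrossDNA ships custom modeling code), remaps tokenizer IDs into the `[0..4]` range the backbone expects, and extracts per-token embeddings for a 512 bp sequence.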
```python
import os
# Presumably read by CrossDNA's remote modeling code; set it before
# importing torch so compilation is disabled from the start.
os.environ.setdefault("DISABLE_TORCH_COMPILE", "1")

import torch

# Additionally replace torch.compile with a no-op decorator, so any
# @torch.compile usage in the model code becomes a pass-through.
if hasattr(torch, "compile"):
    def _no_compile(fn=None, *args, **kwargs):
        if fn is None:
            def deco(f):
                return f
            return deco
        return fn
    torch.compile = _no_compile

from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hugging Face Hub example
# (note: drop local_files_only=True here, or the download will be blocked)
# repo_id = "chengCCC/CrossDNA_pretrain"
# subdir = "519M"
# tok = AutoTokenizer.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True)
# model = AutoModelForMaskedLM.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True).eval()

# Local checkpoint example
MODEL_DIR = "/data/zhaol/projects/huggingface_crossdna_1024/crossdna"
tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# A 512 bp DNA sequence
seq = "ACGT" * 128

enc = tok(seq, return_tensors="pt", add_special_tokens=False)
x = enc["input_ids"].to(device)  # [1, L]

# Key step: map tokenizer IDs back into the [0..4] range the model expects.
#   A, C, G, T, N: IDs 7..11 -> 0..4
#   Every other token (UNK/PAD/SEP/CLS/...) is treated as N (= 4).
x = torch.where(x >= 7, x - 7, torch.full_like(x, 4))
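
# Optional sanity check (assumption: the tokenizer exposes get_vocab(),
# as standard Hugging Face tokenizers do) -- confirm that A/C/G/T/N
# really occupy IDs 7..11 before relying on the remap:
# vocab = tok.get_vocab()
# print({t: vocab[t] for t in "ACGTN" if t in vocab})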

# ====== Embedding extraction ======
# Flag semantics inferred from the names: pretrain=False skips the
# pretraining head, for_representation=True makes the backbone return
# hidden states. Save the old values so they can be restored afterwards.
was_pretrain = getattr(model.backbone, "pretrain", False)
was_for_repr = getattr(model.backbone, "for_representation", False)
model.backbone.pretrain = False
model.backbone.for_representation = True

with torch.inference_mode():
    embeddings, _ = model.backbone(x)  # [B, L, H]

print("input_ids.shape =", tuple(x.shape))            # (1, 512)
print("embeddings.shape =", tuple(embeddings.shape))  # (1, 512, 1024)

# Restore the original backbone flags.
model.backbone.pretrain = was_pretrain
model.backbone.for_representation = was_for_repr
```
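
The backbone returns one embedding per token. If you need a single fixed-size vector for the whole sequence, a standard option (not prescribed by CrossDNA itself, just common practice) is to mean-pool over the length dimension, continuing from the variables above:

```python
# Mean-pool the per-token embeddings into one sequence-level vector.
# `embeddings` comes from the block above and has shape [B, L, H].
seq_embedding = embeddings.mean(dim=1)  # [B, H]
print("seq_embedding.shape =", tuple(seq_embedding.shape))  # (1, 1024)
```

For batches of padded sequences, note that the ID remap above turns padding into N tokens; use the tokenizer's attention mask to exclude those positions from the average so padding does not dilute the pooled vector.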