---
license: gpl-3.0
---

## Using CrossDNA 8.1M

```python
import os

# Disable torch.compile before importing torch: the remote-code model wraps
# parts of its forward pass in torch.compile, which can fail on unsupported
# setups. The shim below turns torch.compile into a no-op in both its direct
# torch.compile(fn) form and its @torch.compile(...) decorator form.
os.environ.setdefault("DISABLE_TORCH_COMPILE", "1")

import torch

if hasattr(torch, "compile"):
    def _no_compile(fn=None, *args, **kwargs):
        if fn is None:
            def deco(f):
                return f
            return deco
        return fn
    torch.compile = _no_compile

from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hugging Face remote repository example:
# repo_id = "chengCCC/CrossDNA_pretrain"
# subdir = "8.1M"
# tok = AutoTokenizer.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True)
# model = AutoModelForMaskedLM.from_pretrained(repo_id, subfolder=subdir, trust_remote_code=True).eval()

# Local model example:
MODEL_DIR = "/data/zhaol/projects/huggingface_crossdna/crossdna"
tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 512 bp DNA sequence
seq = "ACGT" * 128
enc = tok(seq, return_tensors="pt", add_special_tokens=False)
x = enc["input_ids"].to(device)  # [1, L]

# Key step: map tokenizer IDs back to the model-required range [0..4].
# A, C, G, T, N occupy tokenizer IDs 7..11 -> 0..4.
# All other tokens (UNK/PAD/SEP/CLS/...) are treated as N (=4).
x = torch.where(x >= 7, x - 7, torch.full_like(x, 4))

# ====== embedding extraction ======
# Temporarily switch the backbone into representation mode, then restore
# the original flags afterwards.
was_pretrain = getattr(model.backbone, "pretrain", False)
was_for_repr = getattr(model.backbone, "for_representation", False)
model.backbone.pretrain = False
model.backbone.for_representation = True

with torch.inference_mode():
    embeddings, _ = model.backbone(x)  # [B, L, H]

print("input_ids.shape  =", tuple(x.shape))           # input_ids.shape  = (1, 512)
print("embeddings.shape =", tuple(embeddings.shape))  # embeddings.shape = (1, 512, 128)

model.backbone.pretrain = was_pretrain
model.backbone.for_representation = was_for_repr
```
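
The backbone returns one 128-dimensional vector per nucleotide (`[B, L, H]`). If you need a single fixed-size vector per sequence (e.g. for clustering or a downstream classifier), one common, generic option is to mean-pool over the length dimension. The sketch below continues from the variables defined above; the pooling and optional L2 normalization are our own choices, not an API provided by CrossDNA:

```python
# Minimal sketch: collapse per-token embeddings into one sequence-level vector.
# Assumes `embeddings` ([B, L, H]) from the snippet above. Mean pooling is a
# generic choice; CrossDNA does not mandate a specific pooling scheme.
seq_embedding = embeddings.mean(dim=1)  # [B, H], here (1, 128)

# Optional: L2-normalize so cosine similarity reduces to a dot product.
seq_embedding = torch.nn.functional.normalize(seq_embedding, dim=-1)

print("seq_embedding.shape =", tuple(seq_embedding.shape))  # (1, 128)
```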