Upload PlasmidLM pretrained checkpoint (v4, step 15000)

- README.md +124 -0
- config.json +29 -0
- configuration_plasmid_lm.py +35 -0
- generation_config.json +7 -0
- model.safetensors +3 -0
- modeling_plasmid_lm.py +301 -0
- special_tokens.txt +105 -0
- tokenization_plasmid_lm.py +115 -0
- vocab.json +117 -0
README.md
ADDED
@@ -0,0 +1,124 @@
---
language:
- en
license: apache-2.0
library_name: transformers
tags:
- biology
- genomics
- dna
- plasmid
- synthetic-biology
- causal-lm
- protein-engineering
datasets:
- custom
pipeline_tag: text-generation
model-index:
- name: PlasmidLM
  results:
  - task:
      type: text-generation
      name: Plasmid DNA Generation
    metrics:
    - name: Eval Loss
      type: loss
      value: 0.093
    - name: Token Accuracy
      type: accuracy
      value: 0.961
---

# PlasmidLM

A 17M-parameter transformer language model for conditional generation of synthetic plasmid DNA sequences.

## Model Description

PlasmidLM generates plasmid DNA sequences conditioned on functional component specifications. Given a prompt specifying desired elements (antibiotic resistance genes, origins of replication, promoters, reporters, etc.), it autoregressively generates a complete DNA sequence containing those elements.

**Architecture**: LLaMA-style transformer decoder with RoPE, RMSNorm, and GELU activations.

| Parameter | Value |
|-----------|-------|
| Parameters | 17M |
| Hidden size | 384 |
| Layers | 10 |
| Attention heads | 8 |
| Context length | 16,384 tokens |
| Vocabulary | 120 tokens |

The vocabulary consists of the DNA bases A, T, C, G, N (upper- and lowercase), control tokens (BOS, EOS, SEP, PAD, UNK), and ~100 categorical tokens representing functional plasmid components (e.g., `<AMR_KANAMYCIN>`, `<ORI_COLE1>`, `<PROM_T7>`).
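
Because the tokenizer keeps `<...>` component tokens atomic and splits everything else into single characters, a mixed prompt tokenizes cleanly. A minimal sketch (the expected output in the comment is inferred from the shipped `tokenization_plasmid_lm.py`, not an official example):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("McClain/PlasmidLM", trust_remote_code=True)
print(tok.tokenize("<BOS><PROM_T7><SEP>ATGC"))
# ['<BOS>', '<PROM_T7>', '<SEP>', 'A', 'T', 'G', 'C']
```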

## Training

Pretrained with causal language modeling on ~108K plasmid sequences derived from the [Addgene](https://www.addgene.org/) repository, annotated with functional components via [pLannotate](https://github.com/barricklab/pLannotate).

- **Steps**: 15,000
- **Epochs**: ~2.3
- **Eval loss**: 0.093
- **Token accuracy**: 96.1%
- **Optimizer**: AdamW
- **Precision**: bf16
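
For reference, if the eval loss is mean cross-entropy in nats, it corresponds to a per-token perplexity of exp(0.093) ≈ 1.10.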

## Intended Use

This is a **base pretrained model**. It has learned the statistical patterns of plasmid DNA sequences and their relationship to categorical component tokens. It can be used for:

- **Direct generation**: Prompt with component tokens to generate plasmid sequences
- **Fine-tuning**: Post-train with reinforcement learning (GRPO/PPO) to improve motif placement accuracy
- **Embeddings**: Use hidden states as learned representations of plasmid sequences (see the sketch after this list)
- **Research**: Study the learned structure of synthetic DNA
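
A minimal embedding sketch, assuming the `AutoModel` mapping shipped in `config.json` (the backbone's forward returns hidden states plus KV caches); mean-pooling is an illustrative choice, not something the model prescribes:

```python
import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("McClain/PlasmidLM", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("McClain/PlasmidLM", trust_remote_code=True)

inputs = tokenizer("<BOS><AMR_KANAMYCIN><ORI_COLE1><SEP>ATGCGT", return_tensors="pt")
with torch.no_grad():
    hidden_states, _ = model(inputs["input_ids"])  # (batch, seq_len, hidden_size=384)
embedding = hidden_states.mean(dim=1)  # one pooled vector per sequence
```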

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("McClain/PlasmidLM", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("McClain/PlasmidLM", trust_remote_code=True)

# Generate a plasmid with kanamycin resistance and ColE1 origin
prompt = "<BOS><AMR_KANAMYCIN><ORI_COLE1><SEP>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=4096, temperature=0.8, do_sample=True)
sequence = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(sequence)
```
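
The checkpoint's `modeling_plasmid_lm.py` also defines a `generate_simple` helper that uses a KV cache; note it samples exactly `max_new_tokens` tokens and does not stop early at `<EOS>`. A sketch of calling it directly (the sampling values are the illustrative defaults from its signature):

```python
ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
out = model.generate_simple(ids, max_new_tokens=512, temperature=0.8, top_k=50)
print(tokenizer.decode(out[0]))
```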

## Input Format

```
<BOS><TOKEN1><TOKEN2>...<SEP>
```

The model generates DNA bases (A/T/C/G) after the `<SEP>` token until it produces `<EOS>` or hits the maximum length.
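
To recover just the DNA payload from a decoded string, one option is to split on the separator tokens; `extract_dna` below is a hypothetical helper, not part of the repository:

```python
def extract_dna(decoded: str) -> str:
    body = decoded.split("<SEP>", 1)[-1]  # drop the component prompt
    body = body.split("<EOS>", 1)[0]      # drop anything after <EOS>, if present
    return "".join(c for c in body if c in "ATCGNatcgn")

print(extract_dna("<BOS><AMR_KANAMYCIN><SEP>ATGC<EOS>"))  # -> "ATGC"
```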

## Component Categories

| Category | Examples | Count |
|----------|----------|-------|
| Antibiotic Resistance (AMR) | Kanamycin, Ampicillin, Chloramphenicol, ... | 11 |
| Origin of Replication (ORI) | ColE1, F1, P15A, pSC101, SV40, ... | 7 |
| Promoter (PROM) | CMV, T7, U6, EF1a, CAG, ... | 12 |
| Reporter | EGFP, mCherry, YFP, NanoLuc, ... | 6 |
| Vector Type (VEC) | Lentiviral, CRISPR, Bacterial, AAV, ... | 11 |
| Other | Tags, elements, species, backbones | ~52 |

(Counts reflect the categorical tokens shipped in `vocab.json`.)

## Limitations

- This is a **pretrained base model** -- it learns sequence statistics but has not been optimized for motif placement accuracy. Post-training with RL significantly improves functional element fidelity.
- Generated sequences are **not experimentally validated**. Always verify computationally (e.g., with pLannotate) and experimentally before synthesis.
- The model was trained on Addgene plasmids, which are biased toward commonly deposited vectors (mammalian expression, bacterial cloning, CRISPR).
- Maximum context of 16K tokens (~16 kbp), which covers most but not all plasmids.

## Citation

```bibtex
@misc{thiel2026plasmidlm,
  title={PlasmidLM: Language Models for Conditional Plasmid DNA Generation},
  author={Thiel, McClain},
  year={2026},
  url={https://huggingface.co/McClain/PlasmidLM}
}
```

config.json
ADDED
@@ -0,0 +1,29 @@
{
  "architectures": [
    "PlasmidLMForCausalLM"
  ],
  "bos_token_id": 0,
  "dtype": "float32",
  "eos_token_id": 1,
  "hidden_act": "gelu",
  "hidden_size": 384,
  "intermediate_size": 1536,
  "max_position_embeddings": 16384,
  "model_type": "plasmid_lm",
  "num_attention_heads": 8,
  "num_hidden_layers": 10,
  "pad_token_id": 3,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "transformers_version": "4.57.6",
  "vocab_size": 120,
  "auto_map": {
    "AutoConfig": "configuration_plasmid_lm.PlasmidLMConfig",
    "AutoModel": "modeling_plasmid_lm.PlasmidLMModel",
    "AutoModelForCausalLM": "modeling_plasmid_lm.PlasmidLMForCausalLM",
    "AutoTokenizer": [
      "tokenization_plasmid_lm.PlasmidLMTokenizer",
      null
    ]
  }
}

configuration_plasmid_lm.py
ADDED
@@ -0,0 +1,35 @@
"""HuggingFace configuration for PlasmidLM."""

from transformers import PretrainedConfig


class PlasmidLMConfig(PretrainedConfig):
    model_type = "plasmid_lm"

    def __init__(
        self,
        vocab_size: int = 112,
        hidden_size: int = 384,
        num_hidden_layers: int = 10,
        num_attention_heads: int = 8,
        intermediate_size: int = 1536,
        hidden_act: str = "gelu",
        rms_norm_eps: float = 1e-5,
        max_position_embeddings: int = 16384,
        rope_theta: float = 10000.0,
        tie_word_embeddings: bool = True,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        super().__init__(
            vocab_size=vocab_size,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 3,
  "transformers_version": "4.57.6"
}

model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4429ce1399e6bab5ac053d7aae115daa7870032b0f54769859d24acb664ba91
size 71004376

modeling_plasmid_lm.py
ADDED
@@ -0,0 +1,301 @@
"""HuggingFace-compatible PlasmidLM model for use with AutoModelForCausalLM and vLLM."""

from __future__ import annotations

from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import PreTrainedModel
from transformers.cache_utils import DynamicCache
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast

from .configuration_plasmid_lm import PlasmidLMConfig


def _rope_freqs(dim: int, max_len: int, base: float) -> Tuple[torch.Tensor, torch.Tensor]:
    """Precompute cos/sin tables for rotary position embeddings."""
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    t = torch.arange(max_len).float()
    angles = torch.outer(t, freqs)
    return torch.cos(angles), torch.sin(angles)


def _apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, offset: int = 0) -> torch.Tensor:
    """Rotate interleaved (even/odd) channel pairs by position-dependent angles."""
    S = x.shape[2]
    cos = cos[offset:offset + S].unsqueeze(0).unsqueeze(0)
    sin = sin[offset:offset + S].unsqueeze(0).unsqueeze(0)
    x1, x2 = x[..., ::2], x[..., 1::2]
    return torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1).flatten(-2)


class PlasmidLMAttention(nn.Module):
    def __init__(self, config: PlasmidLMConfig):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        rope_cos: torch.Tensor,
        rope_sin: torch.Tensor,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        position_offset: int = 0,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        B, S, _ = hidden_states.shape
        q = self.q_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)

        dtype = q.dtype
        q = _apply_rope(q, rope_cos, rope_sin, offset=position_offset).to(dtype)
        k = _apply_rope(k, rope_cos, rope_sin, offset=position_offset).to(dtype)

        if past_key_value is not None:
            k = torch.cat([past_key_value[0], k], dim=2)
            v = torch.cat([past_key_value[1], v], dim=2)
        new_kv = (k, v)

        # With a cache present we decode one token at a time; the single new query
        # may attend to all cached keys, so no causal mask is needed.
        use_causal = past_key_value is None
        attn = F.scaled_dot_product_attention(q, k, v, is_causal=use_causal)
        out = attn.transpose(1, 2).reshape(B, S, -1)
        return self.o_proj(out), new_kv


class PlasmidLMMLP(nn.Module):
    def __init__(self, config: PlasmidLMConfig):
        super().__init__()
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act(self.up_proj(x)))


class PlasmidLMDecoderLayer(nn.Module):
    def __init__(self, config: PlasmidLMConfig):
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.self_attn = PlasmidLMAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = PlasmidLMMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        rope_cos: torch.Tensor,
        rope_sin: torch.Tensor,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        position_offset: int = 0,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_out, new_kv = self.self_attn(hidden_states, rope_cos, rope_sin, past_key_value, position_offset)
        hidden_states = residual + attn_out

        residual = hidden_states
        hidden_states = residual + self.mlp(self.post_attention_layernorm(hidden_states))
        return hidden_states, new_kv


class PlasmidLMPreTrainedModel(PreTrainedModel):
    config_class = PlasmidLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, PlasmidLMModel):
            module.gradient_checkpointing = value


class PlasmidLMModel(PlasmidLMPreTrainedModel):
    """Base model (backbone) — returned by AutoModel."""

    def __init__(self, config: PlasmidLMConfig):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([PlasmidLMDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        head_dim = config.hidden_size // config.num_attention_heads
        cos, sin = _rope_freqs(head_dim, config.max_position_embeddings, config.rope_theta)
        self.register_buffer("rope_cos", cos, persistent=False)
        self.register_buffer("rope_sin", sin, persistent=False)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.Tensor,
        past_key_values: Optional[list] = None,
        position_offset: int = 0,
        **kwargs,
    ) -> Tuple[torch.Tensor, list]:
        # Note: any attention_mask passed via kwargs is ignored; inputs are assumed unpadded.
        hidden_states = self.embed_tokens(input_ids)
        new_kv_caches = []
        for i, layer in enumerate(self.layers):
            past_kv = past_key_values[i] if past_key_values else None
            if self.gradient_checkpointing and self.training:
                # Gradient checkpointing recomputes activations on backward — no past_kv during training
                def make_ckpt_fn(l):
                    def fn(h, cos, sin):
                        out, kv = l(h, cos, sin, None, 0)
                        return out, kv[0], kv[1]
                    return fn
                hidden_states, k, v = torch.utils.checkpoint.checkpoint(
                    make_ckpt_fn(layer), hidden_states, self.rope_cos, self.rope_sin,
                    use_reentrant=False,
                )
                new_kv = (k, v)
            else:
                hidden_states, new_kv = layer(hidden_states, self.rope_cos, self.rope_sin, past_kv, position_offset)
            new_kv_caches.append(new_kv)
        hidden_states = self.norm(hidden_states)
        return hidden_states, new_kv_caches


class PlasmidLMForCausalLM(PlasmidLMPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: PlasmidLMConfig):
        super().__init__(config)
        self.model = PlasmidLMModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.Tensor,
        past_key_values=None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> dict:
        has_cache = False
        if past_key_values is not None:
            if isinstance(past_key_values, DynamicCache):
                has_cache = past_key_values.get_seq_length() > 0
            elif isinstance(past_key_values, list):
                has_cache = len(past_key_values) > 0 and past_key_values[0] is not None
        if has_cache:
            # Once a cache exists, only the newest token needs a forward pass.
            input_ids = input_ids[:, -1:]
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": True,
        }

    def _convert_cache_to_list(self, past_key_values) -> Optional[list]:
        """Convert DynamicCache to list of (k, v) tuples for our model."""
        if past_key_values is None:
            return None
        if isinstance(past_key_values, list):
            return past_key_values
        if isinstance(past_key_values, DynamicCache):
            if past_key_values.get_seq_length() == 0:
                return None
            return [(layer.keys, layer.values) for layer in past_key_values.layers]
        return None

    def _convert_list_to_cache(self, kv_list: list) -> DynamicCache:
        """Convert list of (k, v) tuples to DynamicCache."""
        cache = DynamicCache()
        for i, (k, v) in enumerate(kv_list):
            cache.update(k, v, layer_idx=i)
        return cache

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values=None,
        use_cache: bool = False,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        kv_list = self._convert_cache_to_list(past_key_values)

        position_offset = 0
        if kv_list is not None:
            position_offset = kv_list[0][0].shape[2]

        hidden_states, new_kv_list = self.model(input_ids, kv_list, position_offset)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Standard causal-LM shift: predict token t+1 from position t.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        new_cache = None
        if use_cache:
            new_cache = self._convert_list_to_cache(new_kv_list)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=new_cache,
        )

    @torch.no_grad()
    def generate_simple(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 512,
        temperature: float = 0.8,
        top_k: int = 50,
    ) -> torch.Tensor:
        """Simple autoregressive generation with KV cache (no early stop at <EOS>)."""
        # Prefill
        hidden_states, kv_caches = self.model(input_ids)
        logits = self.lm_head(hidden_states[:, -1:, :]).squeeze(1)
        cur_len = input_ids.shape[1]

        for _ in range(max_new_tokens):
            scaled = logits.float() / temperature
            scaled = torch.nan_to_num(scaled, nan=0.0, posinf=1e4, neginf=-1e4)
            if top_k > 0:
                k = min(top_k, scaled.size(-1))
                v, _ = torch.topk(scaled, k)
                scaled[scaled < v[:, [-1]]] = float("-inf")
            probs = F.softmax(scaled, dim=-1)
            next_token = torch.multinomial(probs, 1)
            input_ids = torch.cat([input_ids, next_token], dim=1)

            hidden_states, kv_caches = self.model(next_token, kv_caches, cur_len)
            logits = self.lm_head(hidden_states).squeeze(1)
            cur_len += 1

        return input_ids

special_tokens.txt
ADDED
@@ -0,0 +1,105 @@
<BOS>
<EOS>
<SEP>
<PAD>
<UNK>
<SEQ>
<AMR_AMPICILLIN>
<AMR_BLASTICIDIN>
<AMR_CHLORAMPHENICOL>
<AMR_GENTAMICIN>
<AMR_HYGROMYCIN>
<AMR_KANAMYCIN>
<AMR_NEOMYCIN>
<AMR_PUROMYCIN>
<AMR_SPECTINOMYCIN>
<AMR_TETRACYCLINE>
<AMR_ZEOCIN>
<BB_LENTIGUIDE_PURO>
<BB_P1316_IGG2A>
<BB_PAAV2>
<BB_PAAV>
<BB_PCDNA31+>
<BB_PCDNA31>
<BB_PCDNA3>
<BB_PCMV>
<BB_PCRII_TOPO>
<BB_PD649>
<BB_PDONR221>
<BB_PDONR223>
<BB_PEGFP_C1>
<BB_PEGFP_N1>
<BB_PET28A>
<BB_PHAGE>
<BB_PLX_TRC317>
<BB_PTT3>
<BB_PUC19>
<BB_UNKNOWN>
<COPY_HIGH>
<COPY_LOW>
<ELEM_AAV_ITR>
<ELEM_CMV_ENHANCER>
<ELEM_CMV_INTRON>
<ELEM_CPPT>
<ELEM_GRNA_SCAFFOLD>
<ELEM_IRES>
<ELEM_LTR_3>
<ELEM_LTR_5>
<ELEM_MCS>
<ELEM_POLYA_BGH>
<ELEM_POLYA_SV40>
<ELEM_PSI>
<ELEM_TRACRRNA>
<ELEM_WPRE>
<ORI_2MU>
<ORI_COLE1>
<ORI_F1>
<ORI_P15A>
<ORI_PSC101>
<ORI_RSF>
<ORI_SV40>
<PROM_AMPR>
<PROM_CAG>
<PROM_CMV>
<PROM_EF1A>
<PROM_LAC>
<PROM_RSV>
<PROM_SP6>
<PROM_SV40>
<PROM_T3>
<PROM_T5>
<PROM_T7>
<PROM_U6>
<REPORTER_EGFP>
<REPORTER_GFP>
<REPORTER_MCHERRY>
<REPORTER_MEMERALD>
<REPORTER_NANOLUC>
<REPORTER_YFP>
<SP_CELEGANS>
<SP_DROSOPHILA>
<SP_ECOLI>
<SP_HUMAN>
<SP_MOUSE>
<SP_RAT>
<SP_SYNTHETIC>
<SP_YEAST>
<SP_ZEBRAFISH>
<TAG_FLAG>
<TAG_GST>
<TAG_HA>
<TAG_HIS>
<TAG_MYC>
<TAG_NLS>
<TAG_V5>
<VEC_AAV>
<VEC_BACTERIAL>
<VEC_CRISPR>
<VEC_GATEWAY>
<VEC_INSECT>
<VEC_LENTIVIRAL>
<VEC_MAMMALIAN>
<VEC_PLANT>
<VEC_REPORTER>
<VEC_RETROVIRAL>
<VEC_YEAST>

tokenization_plasmid_lm.py
ADDED
@@ -0,0 +1,115 @@
"""HuggingFace-compatible tokenizer for PlasmidLM."""

from __future__ import annotations

import json
import os
import re
from typing import List, Optional

from transformers import PreTrainedTokenizer


DNA_BASES = list("ATCGNatcgn")


class PlasmidLMTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for plasmid sequences with special tokens."""

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "<BOS>",
        eos_token: str = "<EOS>",
        unk_token: str = "<UNK>",
        pad_token: str = "<PAD>",
        sep_token: str = "<SEP>",
        **kwargs,
    ):
        # Load vocab before calling super().__init__
        with open(vocab_file, "r") as f:
            data = json.load(f)

        # Support nested format with "token_to_id" key
        if isinstance(data, dict) and "token_to_id" in data:
            data = data["token_to_id"]

        # Ensure DNA bases are in the vocab (matching PlasmidTokenizer)
        next_id = max(data.values()) + 1 if data else 0
        for base in DNA_BASES:
            if base not in data:
                data[base] = next_id
                next_id += 1

        self._vocab = data
        self._id_to_token = {v: k for k, v in self._vocab.items()}

        # Only pass special tokens that actually exist in the vocab.
        # PreTrainedTokenizer would otherwise create new IDs for them.
        special_kwargs = {}
        for name, tok in [("bos_token", bos_token), ("eos_token", eos_token),
                          ("unk_token", unk_token), ("pad_token", pad_token),
                          ("sep_token", sep_token)]:
            if tok in self._vocab:
                special_kwargs[name] = tok

        super().__init__(**special_kwargs, **kwargs)

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    # Fallback IDs below match vocab.json (<BOS>=0, <EOS>=1, <SEP>=2, <PAD>=3)
    # and the bos/eos/pad IDs declared in config.json.
    @property
    def pad_token_id(self) -> int:
        return self._vocab.get("<PAD>", 3)

    @property
    def bos_token_id(self) -> int:
        return self._vocab.get("<BOS>", 0)

    @property
    def eos_token_id(self) -> int:
        return self._vocab.get("<EOS>", 1)

    @property
    def sep_token_id(self) -> int:
        return self._vocab.get("<SEP>", 2)

    def get_vocab(self) -> dict:
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split into special <...> tokens and individual characters."""
        parts = re.split(r"(<[^>]+>)", text)
        tokens = []
        for part in parts:
            if not part or part.isspace():
                continue
            if part.startswith("<") and part.endswith(">"):
                tokens.append(part)
            else:
                tokens.extend(c for c in part if not c.isspace())
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab.get(token, self._vocab.get("<UNK>", 0))

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, "<UNK>")

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_file, "w") as f:
            json.dump(self._vocab, f, indent=2)
        return (vocab_file,)

vocab.json
ADDED
@@ -0,0 +1,117 @@
{
  "<BOS>": 0,
  "<EOS>": 1,
  "<SEP>": 2,
  "<PAD>": 3,
  "<UNK>": 4,
  "<SEQ>": 5,
  "<AMR_AMPICILLIN>": 6,
  "<AMR_BLASTICIDIN>": 7,
  "<AMR_CHLORAMPHENICOL>": 8,
  "<AMR_GENTAMICIN>": 9,
  "<AMR_HYGROMYCIN>": 10,
  "<AMR_KANAMYCIN>": 11,
  "<AMR_NEOMYCIN>": 12,
  "<AMR_PUROMYCIN>": 13,
  "<AMR_SPECTINOMYCIN>": 14,
  "<AMR_TETRACYCLINE>": 15,
  "<AMR_ZEOCIN>": 16,
  "<BB_LENTIGUIDE_PURO>": 17,
  "<BB_P1316_IGG2A>": 18,
  "<BB_PAAV2>": 19,
  "<BB_PAAV>": 20,
  "<BB_PCDNA31+>": 21,
  "<BB_PCDNA31>": 22,
  "<BB_PCDNA3>": 23,
  "<BB_PCMV>": 24,
  "<BB_PCRII_TOPO>": 25,
  "<BB_PD649>": 26,
  "<BB_PDONR221>": 27,
  "<BB_PDONR223>": 28,
  "<BB_PEGFP_C1>": 29,
  "<BB_PEGFP_N1>": 30,
  "<BB_PET28A>": 31,
  "<BB_PHAGE>": 32,
  "<BB_PLX_TRC317>": 33,
  "<BB_PTT3>": 34,
  "<BB_PUC19>": 35,
  "<BB_UNKNOWN>": 36,
  "<COPY_HIGH>": 37,
  "<COPY_LOW>": 38,
  "<ELEM_AAV_ITR>": 39,
  "<ELEM_CMV_ENHANCER>": 40,
  "<ELEM_CMV_INTRON>": 41,
  "<ELEM_CPPT>": 42,
  "<ELEM_GRNA_SCAFFOLD>": 43,
  "<ELEM_IRES>": 44,
  "<ELEM_LTR_3>": 45,
  "<ELEM_LTR_5>": 46,
  "<ELEM_MCS>": 47,
  "<ELEM_POLYA_BGH>": 48,
  "<ELEM_POLYA_SV40>": 49,
  "<ELEM_PSI>": 50,
  "<ELEM_TRACRRNA>": 51,
  "<ELEM_WPRE>": 52,
  "<ORI_2MU>": 53,
  "<ORI_COLE1>": 54,
  "<ORI_F1>": 55,
  "<ORI_P15A>": 56,
  "<ORI_PSC101>": 57,
  "<ORI_RSF>": 58,
  "<ORI_SV40>": 59,
  "<PROM_AMPR>": 60,
  "<PROM_CAG>": 61,
  "<PROM_CMV>": 62,
  "<PROM_EF1A>": 63,
  "<PROM_LAC>": 64,
  "<PROM_RSV>": 65,
  "<PROM_SP6>": 66,
  "<PROM_SV40>": 67,
  "<PROM_T3>": 68,
  "<PROM_T5>": 69,
  "<PROM_T7>": 70,
  "<PROM_U6>": 71,
  "<REPORTER_EGFP>": 72,
  "<REPORTER_GFP>": 73,
  "<REPORTER_MCHERRY>": 74,
  "<REPORTER_MEMERALD>": 75,
  "<REPORTER_NANOLUC>": 76,
  "<REPORTER_YFP>": 77,
  "<SP_CELEGANS>": 78,
  "<SP_DROSOPHILA>": 79,
  "<SP_ECOLI>": 80,
  "<SP_HUMAN>": 81,
  "<SP_MOUSE>": 82,
  "<SP_RAT>": 83,
  "<SP_SYNTHETIC>": 84,
  "<SP_YEAST>": 85,
  "<SP_ZEBRAFISH>": 86,
  "<TAG_FLAG>": 87,
  "<TAG_GST>": 88,
  "<TAG_HA>": 89,
  "<TAG_HIS>": 90,
  "<TAG_MYC>": 91,
  "<TAG_NLS>": 92,
  "<TAG_V5>": 93,
  "<VEC_AAV>": 94,
  "<VEC_BACTERIAL>": 95,
  "<VEC_CRISPR>": 96,
  "<VEC_GATEWAY>": 97,
  "<VEC_INSECT>": 98,
  "<VEC_LENTIVIRAL>": 99,
  "<VEC_MAMMALIAN>": 100,
  "<VEC_PLANT>": 101,
  "<VEC_REPORTER>": 102,
  "<VEC_RETROVIRAL>": 103,
  "<VEC_YEAST>": 104,
  "A": 105,
  "T": 106,
  "C": 107,
  "G": 108,
  "N": 109,
  "a": 110,
  "t": 111,
  "c": 112,
  "g": 113,
  "n": 114
}