Taykhoom
/

mRNA-FM

@@ -109,3 +109,61 @@ class RnaFmTokenizer(PreTrainedTokenizer):
         if token_ids_1 is None:
             return [0] * (len(token_ids_0) + 2)
         return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 2)

         if token_ids_1 is None:
             return [0] * (len(token_ids_0) + 2)
         return [0] * (len(token_ids_0) + 2) + [0] * (len(token_ids_1) + 2)
+    @staticmethod
+    def _extract_cds(sequence, cds):
+        """Extract CDS region from a sequence, trimmed to a multiple of 3.
+
+        Args:
+            sequence: Nucleotide string to slice.
+            cds: Array-like of per-position flags in which 1 marks a codon
+                start position. Lists and tuples are accepted and converted
+                to a numpy array before use.
+
+        Returns:
+            The slice of ``sequence`` running from the first flagged position
+            through two positions past the last flagged position (i.e. the
+            end of the last flagged codon), shortened from the right so its
+            length is a multiple of 3. If the flags sum to zero the whole
+            sequence is used instead, trimmed the same way.
+        """
+        import numpy as np
+        # Coerce up front: on a plain list ``cds == 1`` is the scalar False,
+        # which would make argmax silently report position 0 as the start.
+        cds = np.asarray(cds)
+        if cds.sum() == 0:
+            # No CDS annotation: fall back to the full sequence.
+            return sequence[:len(sequence) - (len(sequence) % 3)]
+        is_start = cds == 1
+        first = int(np.argmax(is_start))
+        # Index of the last flagged codon start, plus 2 to reach the final
+        # nucleotide of that codon.
+        last = int(len(cds) - 1 - np.argmax(np.flip(is_start))) + 2
+        region = sequence[first:last + 1]
+        # Drop any trailing partial codon.
+        return region[:len(region) - (len(region) % 3)]
+    def batch_encode_with_cds(self, sequences, cds, max_length=None, **kwargs):
+        """Encode sequences with CDS extraction (k_mer=3 / mRNA-FM only).
+
+        Applies T->U, extracts the CDS region, chunks to max_length nucleotides
+        (aligned to codon boundaries), and encodes each chunk.
+
+        Args:
+            sequences: List of raw nucleotide strings (T or U).
+            cds: List of numpy arrays marking CDS codon start positions, one
+                entry per sequence.
+            max_length: Nucleotide budget per chunk (defaults to
+                (model_max_length - 2) * k_mer). Rounded down to a multiple
+                of k_mer before use.
+            **kwargs: Forwarded to batch_encode_plus (e.g. return_tensors,
+                padding, add_special_tokens).
+
+        Returns:
+            Tuple of (BatchEncoding, chunk_counts) where chunk_counts[i] is the
+            number of chunks produced for sequences[i]. The encoding is
+            whatever batch_encode_plus returns for the flat list of chunks.
+
+        Raises:
+            ValueError: If k_mer is not 3, if ``sequences`` and ``cds`` have
+                different lengths, or if the chunk budget is smaller than
+                one codon.
+        """
+        if self.k_mer != 3:
+            raise ValueError("batch_encode_with_cds requires k_mer=3 (mRNA-FM tokenizer)")
+        sequences = list(sequences)
+        cds = list(cds)
+        # zip() would silently drop the unmatched tail and leave chunk_counts
+        # misaligned with the caller's sequences.
+        if len(sequences) != len(cds):
+            raise ValueError(
+                f"sequences and cds must have the same length "
+                f"(got {len(sequences)} and {len(cds)})"
+            )
+        budget = max_length if max_length is not None else (self.model_max_length - 2) * self.k_mer
+        budget = (budget // self.k_mer) * self.k_mer
+        if budget <= 0:
+            raise ValueError(
+                f"max_length must allow at least one codon ({self.k_mer} nucleotides)"
+            )
+        to_rna = str.maketrans("Tt", "Uu")
+        all_chunks = []
+        chunk_counts = []
+        for seq, c in zip(sequences, cds):
+            seq = self._extract_cds(seq.translate(to_rna), c)
+            chunks = []
+            for start in range(0, len(seq), budget):
+                chunk = seq[start:start + budget]
+                # Defensive: keep every chunk on a codon boundary.
+                chunk = chunk[:len(chunk) - (len(chunk) % self.k_mer)]
+                if chunk:
+                    chunks.append(chunk)
+            if not chunks:
+                # Nothing left to encode: emit a single placeholder codon so
+                # every input still contributes at least one chunk.
+                chunks = ["AUG"]
+            all_chunks.extend(chunks)
+            chunk_counts.append(len(chunks))
+        enc = self.batch_encode_plus(all_chunks, **kwargs)
+        return enc, chunk_counts