Translation
Transformers
Safetensors
Kannada
English
controlmt
text2text-generation
machine-translation
kannada
english
indic
low-resource
code-mix
encoder-decoder
custom_code
Eval Results (legacy)
Instructions to use anandkaman/controlmt-v2.3 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use anandkaman/controlmt-v2.3 with Transformers:
# Use a pipeline as a high-level helper
# Warning: Pipeline type "translation" is no longer supported in transformers v5.
# You must load the model directly (see below) or downgrade to v4.x with:
# 'pip install "transformers<5.0.0"'
from transformers import pipeline

pipe = pipeline("translation", model="anandkaman/controlmt-v2.3", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("anandkaman/controlmt-v2.3", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| """ControlMT tokenizer wrapper — SentencePiece + control/direction token handling. | |
| Lets users load via: | |
| AutoTokenizer.from_pretrained("anandkaman/controlmt-v2.3", trust_remote_code=True) | |
| """ | |
| import os | |
| from typing import List, Optional, Union | |
| import sentencepiece as spm | |
| from transformers import PreTrainedTokenizer | |
# Filename of the SentencePiece model, keyed by the constructor argument
# (`vocab_file`) that receives its path. Exposed on the tokenizer class as
# `vocab_files_names` and reused by `save_vocabulary` as the output filename.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
class ControlMTTokenizer(PreTrainedTokenizer):
    """Minimal SentencePiece wrapper with ControlMT's direction + style tokens.

    The model expects input formatted as:

        [BOS] [DIRECTION_ID] [STYLE_ID] <source tokens> [EOS]

    Use the high-level ``.translate_text(...)`` convenience that builds this
    prefix, or the lower-level ``.encode(...)`` if doing it manually.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    # Fixed ids of the basic special tokens, shared by `decode` (which strips
    # them) and `translate_text` (which emits BOS/EOS).
    _PAD_ID = 0
    _BOS_ID = 1
    _EOS_ID = 2
    _UNK_ID = 3

    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        sp_model_kwargs: Optional[dict] = None,
        direction_tokens: Optional[dict] = None,
        control_tokens: Optional[dict] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register the control-token tables.

        Args:
            vocab_file: Path to the SentencePiece model file.
            bos_token: Beginning-of-sequence token string.
            eos_token: End-of-sequence token string.
            unk_token: Unknown-token string.
            pad_token: Padding token string.
            sp_model_kwargs: Extra keyword arguments forwarded to
                ``spm.SentencePieceProcessor``. Falsy values mean "none".
            direction_tokens: Mapping of direction name to token id. Falsy
                values select the built-in table.
            control_tokens: Mapping of style/format name to token id. Falsy
                values select the built-in table.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = vocab_file
        self.sp_model_kwargs = sp_model_kwargs or {}

        # The SentencePiece model is loaded before the base-class constructor
        # runs, so the vocabulary methods below are usable from that point on.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        self.direction_tokens = direction_tokens or {
            "kn2en": 4, "en2kn": 5,
            "rkn2kn": 12, "rkn2en": 13, "hi2en": 14, "en2hi": 15,
        }
        self.control_tokens = control_tokens or {
            "strict": 6, "natural": 7, "formal": 8,
            "casual": 9, "json": 10, "text": 11,
        }

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            sp_model_kwargs=self.sp_model_kwargs,
            direction_tokens=self.direction_tokens,
            control_tokens=self.control_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the SentencePiece model.

        This must be a property: `get_vocab` evaluates
        ``range(self.vocab_size)`` without calling it, which raises
        ``TypeError`` if `vocab_size` is a plain method.
        """
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return a ``{token: id}`` dict covering ids ``0 .. vocab_size - 1``."""
        return {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into SentencePiece piece strings."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a piece string to its SentencePiece id."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Map a SentencePiece id to its piece string."""
        return self.sp_model.id_to_piece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join piece strings back into plain text via SentencePiece."""
        return self.sp_model.decode(tokens)

    def encode(self, text: str, **kwargs) -> List[int]:
        """Plain SentencePiece encoding (no prefix). Used inside .translate_text().

        Extra keyword arguments are accepted for call compatibility and
        ignored.
        """
        return self.sp_model.encode(text, out_type=int)

    def decode(self, ids: List[int], **kwargs) -> str:
        """Decode token ids to text, dropping special and control ids.

        Args:
            ids: Sequence of token ids. Objects exposing ``.tolist()`` (for
                example array or tensor types) are converted first, and a
                single int is treated as a one-element sequence.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string with PAD/BOS/EOS/UNK and every configured
            direction/control id removed.
        """
        if hasattr(ids, "tolist"):
            ids = ids.tolist()
        if isinstance(ids, int):
            ids = [ids]

        # Strip any direction/style/special tokens that may have leaked.
        special = {self._PAD_ID, self._BOS_ID, self._EOS_ID, self._UNK_ID}
        special.update(self.direction_tokens.values())
        special.update(self.control_tokens.values())

        # Coerce to plain ints before the membership test: objects that hash
        # by identity would otherwise never match an entry in `special`.
        plain_ids = (int(i) for i in ids)
        kept = [i for i in plain_ids if i not in special]
        return self.sp_model.decode(kept)

    def translate_text(self, text: str, direction: str = "kn2en") -> List[int]:
        """Build the full HF-style input_ids prefix: [BOS] [DIRECTION] [CONTROL] tokens [EOS]

        v2.3 ships single-register; the control token slot is fixed to the architectural
        default (NATURAL = id 7). Future versions may surface a register selector.

        Args:
            text: Source text to encode.
            direction: Key into ``self.direction_tokens``.

        Returns:
            The id list ``[BOS, direction, control] + encoded text + [EOS]``.

        Raises:
            KeyError: If ``direction`` is not a configured direction.
        """
        try:
            dir_id = self.direction_tokens[direction]
        except KeyError:
            known = ", ".join(sorted(self.direction_tokens))
            raise KeyError(
                f"Unknown direction {direction!r}; expected one of: {known}"
            ) from None
        ctrl_id = self.control_tokens.get("natural", 7)
        body = self.encode(text)
        return [self._BOS_ID, dir_id, ctrl_id] + body + [self._EOS_ID]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        """Write the SentencePiece model file into ``save_directory``.

        Args:
            save_directory: Target directory for the model file.
            filename_prefix: Optional prefix; when given, the file is named
                ``"<prefix>-tokenizer.model"``.

        Returns:
            A one-element tuple holding the output file path.
        """
        import shutil

        prefix = f"{filename_prefix}-" if filename_prefix else ""
        out_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

        # Saving onto the file we loaded from: nothing to write.
        if os.path.abspath(self.vocab_file) == os.path.abspath(out_file):
            return (out_file,)

        if os.path.isfile(self.vocab_file):
            shutil.copy(self.vocab_file, out_file)
        else:
            # The original file is gone (e.g. a cleaned-up temp/cache path);
            # fall back to dumping the model held in memory.
            with open(out_file, "wb") as handle:
                handle.write(self.sp_model.serialized_model_proto())
        return (out_file,)