"""Tokenization classes for Arctic."""

from typing import Any, Dict, Optional

from transformers.models.llama import LlamaTokenizer
|
class ArcticTokenizer(LlamaTokenizer):
    """Slow (SentencePiece-based) tokenizer for Arctic models.

    A thin wrapper around ``LlamaTokenizer`` that only changes the
    defaults visible below (``legacy=False``, ``add_prefix_space=True``,
    no cleanup of tokenization spaces) and supplies a ChatML-style
    default chat template.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        legacy=False,
        add_prefix_space=True,
        **kwargs,
    ):
        # Every argument is forwarded verbatim to the parent class; this
        # subclass exists solely to pin the Arctic default values above.
        super().__init__(
            vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    @property
    def default_chat_template(self):
        """
        This template formats inputs in the standard Arctic format.
        """
        # ChatML-style: every message is wrapped in <|im_start|>/<|im_end|>
        # markers; an assistant header is appended when generation is requested.
        message_loop = (
            "{% for message in messages %}"
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
            "{% endfor %}"
        )
        generation_prompt = (
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\n' }}"
            "{% endif %}"
        )
        return message_loop + generation_prompt
| |
|