| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """Utils for tokenization.""" |
| import warnings |
|
|
| __all__ = ['hf_tokenizer'] |
|
|
|
|
def set_pad_token_id(tokenizer):
    """Set pad_token_id to eos_token_id if it is None.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to be set.

    """
    # Fall back to the EOS token/id whenever the PAD counterpart is missing,
    # warning so the substitution is visible in logs.
    eos_id = tokenizer.eos_token_id
    eos_tok = tokenizer.eos_token
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = eos_id
        warnings.warn(f'tokenizer.pad_token_id is None. Now set to {eos_id}')
    if tokenizer.pad_token is None:
        tokenizer.pad_token = eos_tok
        warnings.warn(f'tokenizer.pad_token is None. Now set to {eos_tok}')
|
|
|
|
def _load_vla_tokenizer(name_or_path, oft):
    """Register the OpenVLA config/processor classes and return the tokenizer
    bundled with the pretrained processor.

    Args:
        name_or_path (str): Checkpoint name or path for the processor.
        oft (bool): Use the "openvla-oft" variant modules when True,
            the plain "openvla" modules otherwise.

    Returns:
        transformers.PreTrainedTokenizer: The tokenizer attached to the
        loaded processor.

    """
    from transformers import AutoConfig, AutoProcessor
    # The two variants differ only in which package the Prismatic classes
    # come from; everything after the import is identical.
    if oft:
        from verl.utils.vla_utils.openvla_oft.configuration_prismatic import OpenVLAConfig
        from verl.utils.vla_utils.openvla_oft.processing_prismatic import PrismaticProcessor
    else:
        from verl.utils.vla_utils.openvla.configuration_prismatic import OpenVLAConfig
        from verl.utils.vla_utils.openvla.processing_prismatic import PrismaticProcessor
    print("*********USE VLA tokenizer*************")
    AutoConfig.register("openvla", OpenVLAConfig)
    AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
    processor = AutoProcessor.from_pretrained(name_or_path, trust_remote_code=True)
    return processor.tokenizer


def hf_tokenizer(name_or_path, correct_pad_token=True, correct_gemma2=True, **kwargs):
    """Create a huggingface pretrained tokenizer.

    Args:
        name_or_path (str): The name or path of the tokenizer.
        correct_pad_token (bool): Whether to correct the pad token id.
        correct_gemma2 (bool): Whether to correct the gemma2 tokenizer.
        **kwargs: The keyword arguments for the tokenizer. The special key
            ``model`` ("openvla" or "openvla-oft") selects a VLA tokenizer;
            it is consumed here and not forwarded to the tokenizer.

    Returns:
        transformers.PreTrainedTokenizer: The pretrained tokenizer.

    """
    from transformers import AutoTokenizer

    if correct_gemma2 and isinstance(name_or_path, str) and 'gemma-2-2b-it' in name_or_path:
        # Override gemma-2-2b-it's default EOS so generation stops at the
        # turn boundary token instead of the raw <eos>.
        warnings.warn('Found gemma-2-2b-it tokenizer. Set eos_token and eos_token_id to <end_of_turn> and 107.')
        kwargs['eos_token'] = '<end_of_turn>'
        kwargs['eos_token_id'] = 107

    # Fix: pop (not get) the selector key so it is never forwarded to
    # AutoTokenizer.from_pretrained as an unexpected keyword argument.
    model = kwargs.pop("model", None)

    if model == "openvla-oft":
        tokenizer = _load_vla_tokenizer(name_or_path, oft=True)
    elif model == "openvla":
        tokenizer = _load_vla_tokenizer(name_or_path, oft=False)
    else:
        tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)

    if correct_pad_token:
        set_pad_token_id(tokenizer)
    return tokenizer
|
|