Instantiation of tokenizer

#1
by devens-mn - opened

When trying to run the Google Colab example from the model repo page, I find I can't instantiate the tokenizer. The code is exactly as shown in the Google Colab example, and the error is:
TypeError Traceback (most recent call last)
in <cell line: 0>()
2 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
----> 4 tokenizer = AutoTokenizer.from_pretrained("olonok/flan-t5-base-pubmed-summarization")
5 model = AutoModelForSeq2SeqLM.from_pretrained("olonok/flan-t5-base-pubmed-summarization")

/usr/local/lib/python3.11/dist-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
1030
1031 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
-> 1032 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
1033 else:
1034 if tokenizer_class_py is not None:

/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2023 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
2024
-> 2025 return cls._from_pretrained(
2026 resolved_vocab_files,
2027 pretrained_model_name_or_path,

/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2061 # loaded directly from the GGUF file.
2062 if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file:
-> 2063 slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
2064 copy.deepcopy(resolved_vocab_files),
2065 pretrained_model_name_or_path,

/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2276 # Instantiate the tokenizer.
2277 try:
-> 2278 tokenizer = cls(*init_inputs, **init_kwargs)
2279 except import_protobuf_decode_error():
2280 logger.info(

/usr/local/lib/python3.11/dist-packages/transformers/models/t5/tokenization_t5.py in __init__(self, vocab_file, eos_token, unk_token, pad_token, extra_ids, additional_special_tokens, sp_model_kwargs, legacy, add_prefix_space, **kwargs)
148
149 self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
--> 150 self.sp_model.Load(vocab_file)
151
152 if additional_special_tokens is not None:

/usr/local/lib/python3.11/dist-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
959 if model_proto:
960 return self.LoadFromSerializedProto(model_proto)
--> 961 return self.LoadFromFile(model_file)
962
963

/usr/local/lib/python3.11/dist-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
314
315 def LoadFromFile(self, arg):
--> 316 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
317
318 def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):

TypeError: not a string
How can I fix this error? I see the same issue in my own notebook, and nothing I've tried (including `use_fast=True` and `use_fast=False`) fixes it. Thank you.

Sign up or log in to comment