kashif HF Staff committed on
Commit
47d83fe
·
1 Parent(s): 19273a6

tokenizer: expose .vocab property for fast-tokenizer-style callers (#3)

Browse files

- tokenizer: expose .vocab property for fast-tokenizer-style callers (54d1dc0a02a2fbce1fc599d4e52d54a1fb49c905)

Files changed (1) hide show
  1. tokenizer.py +8 -0
tokenizer.py CHANGED
@@ -144,6 +144,14 @@ class HybridDNATokenizer(PreTrainedTokenizer):
144
  def get_vocab(self) -> Dict[str, int]:
145
  return self._vocab.copy()
146
 
 
 
 
 
 
 
 
 
147
  def __len__(self):
148
  # Override default (len(get_vocab())) because get_vocab() deduplicates
149
  # CCCCCC which exists as both BPE (ID 91443) and DNA 6-mer (ID 154402).
 
144
  def get_vocab(self) -> Dict[str, int]:
145
  return self._vocab.copy()
146
 
147
+ @property
148
+ def vocab(self) -> Dict[str, int]:
149
+ # Compatibility shim: fast tokenizers (PreTrainedTokenizerFast) expose
150
+ # `tokenizer.vocab` as a property; slow PreTrainedTokenizer subclasses
151
+ # like this one only expose `get_vocab()`. Some downstream tools
152
+ # (e.g. llama.cpp's convert_hf_to_gguf.py) read `.vocab` directly.
153
+ return self._vocab
154
+
155
  def __len__(self):
156
  # Override default (len(get_vocab())) because get_vocab() deduplicates
157
  # CCCCCC which exists as both BPE (ID 91443) and DNA 6-mer (ID 154402).