YoussefKejue committed on
Commit
77b7971
·
verified ·
1 Parent(s): 7600df2

Upload ultravox_tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ultravox_tokenizer.py +25 -0
ultravox_tokenizer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import transformers
4
+
5
# Literal text of the audio special token: from_pretrained_text_tokenizer adds
# it to the tokenizer and get_audio_token_id encodes it to look up its id.
+ AUDIO_TOKEN = "<|audio|>"
6
+
7
+
8
def from_pretrained_text_tokenizer(
    *args, **kwargs
) -> transformers.PreTrainedTokenizerBase:
    """
    Load a text tokenizer and register the audio token on it as a special token.

    Every positional and keyword argument is forwarded unchanged to
    ``transformers.AutoTokenizer.from_pretrained``. This helper mainly exists
    so that VLLM works properly; this repo does not currently require it.

    Returns:
        The loaded tokenizer, after ``AUDIO_TOKEN`` has been added to it via
        ``add_special_tokens`` under the ``additional_special_tokens`` key.
    """
    text_tokenizer = transformers.AutoTokenizer.from_pretrained(*args, **kwargs)

    extra_special_tokens = {"additional_special_tokens": [AUDIO_TOKEN]}
    text_tokenizer.add_special_tokens(extra_special_tokens)

    # Resolve the id up front; the lookup also checks that the audio token
    # encodes to exactly one id.
    audio_id = get_audio_token_id(text_tokenizer)
    logging.info(f"Audio token id: {audio_id}")

    return text_tokenizer
20
+
21
+
22
def get_audio_token_id(tokenizer: transformers.PreTrainedTokenizerBase) -> int:
    """
    Return the id that ``AUDIO_TOKEN`` encodes to with the given tokenizer.

    Args:
        tokenizer: Tokenizer used to encode ``AUDIO_TOKEN``. Encoding is done
            with ``add_special_tokens=False`` so that the tokenizer does not
            add its own special tokens around the encoded text.

    Returns:
        The single id produced for ``AUDIO_TOKEN``.

    Raises:
        AssertionError: If the encoding does not consist of exactly one id
            (for example when the audio token has not been registered and is
            split into several pieces).
    """
    token_ids = tokenizer.encode(AUDIO_TOKEN, add_special_tokens=False)

    # Explicit raise instead of an `assert` statement: asserts are stripped
    # under `python -O`, which would let a multi-id encoding silently return
    # its first id. The exception type and message are kept as before.
    if len(token_ids) != 1:
        raise AssertionError("Audio token should be a single token")

    return token_ids[0]