bhavnicksm committed on
Commit
ddba124
·
verified ·
1 Parent(s): afa6448

Add tokie .tkz tokenizer for voyageai/voyage-3

Browse files
Files changed (6) hide show
  1. .gitattributes +2 -0
  2. README.md +12 -0
  3. tokenizer.json +0 -0
  4. tokenizer.tkz +3 -0
  5. tokenizer_config.json +40 -0
  6. tokie-banner.png +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.tkz filter=lfs diff=lfs merge=lfs -text
37
+ tokie-banner.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - tokie
4
+ ---
5
+
6
+ <p align="center">
7
+ <img src="tokie-banner.png" alt="tokie banner">
8
+ </p>
9
+
10
+ # voyage-3
11
+
12
+ Tokenizer for [voyageai/voyage-3](https://huggingface.co/voyageai/voyage-3) in the [tokie](https://github.com/chonkie-inc/tokie) `.tkz` format.
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.tkz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:948dad698bb2c032ae1cc0436c015f5b56fed0163fcdd26b2b19a620e041d372
3
+ size 10575673
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|endoftext|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
tokie-banner.png ADDED

Git LFS Details

  • SHA256: 8fc8db798c25abc3f44d4c90cc2fd8bf3c060886e31f33e2f684fd6a7a4290ac
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB