Aobangaming committed on
Commit
359d4bb
·
verified ·
1 Parent(s): 1ddd872

Upload 3 files

Browse files
aoban_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
aoban_tokenizer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Train a byte-level BPE tokenizer and save it to ``aoban_tokenizer.json``.

The script reads ``training_data.txt``, trains a 5000-entry BPE vocabulary
on top of a ByteLevel pre-tokenizer, configures a post-processor that
appends ``<|endoftext|>`` to every single-sequence encoding, and writes the
result to ``aoban_tokenizer.json``.
"""

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import TemplateProcessing

# Special tokens are named once so the trainer, the model and the
# post-processor cannot drift apart.
pad_token = "<PAD>"
unk_token = "<UNK>"
eos_token = "<|endoftext|>"

tokenizer = Tokenizer(BPE(unk_token=unk_token))

# Byte-level pre-tokenization paired with the matching decoder, so decoding
# maps the byte-level symbols back to text.
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=5000,
    min_frequency=1,  # Lowered to combine words more easily
    special_tokens=[pad_token, unk_token, eos_token],
    # Seed the vocabulary with the full byte-level alphabet. Without this,
    # any byte symbol that never occurs in the training file has no token
    # and falls back to <UNK> at inference time.
    initial_alphabet=ByteLevel.alphabet(),
)

tokenizer.train(
    files=["training_data.txt"],
    trainer=trainer,
)

# The post-processor is attached after training because it needs the id the
# trainer assigned to the end-of-text token.
tokenizer.post_processor = TemplateProcessing(
    single=f"$A {eos_token}",
    special_tokens=[
        (eos_token, tokenizer.token_to_id(eos_token)),
    ],
)

tokenizer.save("aoban_tokenizer.json")
print("Tokenizer fixed and saved.")
aoban_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a9e242fc49770cc8f4a2bc5707c33c03e8039c159383aaf3a68e548bf9198b5
3
+ size 475722284