flpelerin committed on
Commit
788217c
·
1 Parent(s): 1d49067

Update file tokenizer.cli.py

Browse files
Files changed (1) hide show
  1. tokenizer.cli.py +31 -1
tokenizer.cli.py CHANGED
@@ -1 +1,31 @@
1
- # TODO: Implement
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from argparse import ArgumentParser
3
+ from tokenizer import Tokenizer
4
+
5
+
6
+
7
+
8
# Command-line interface: train a Tokenizer on a text file and save it.
parser = ArgumentParser(
    prog='Flop Tokenizer Python code',
    description='',
)
# The input path is mandatory: without it there is nothing to read, and
# open(None) would otherwise fail with an unhelpful TypeError.
parser.add_argument(
    '-i', '--input_file',
    required=True,
    help='path of the text file read as the training dataset',
)
parser.add_argument(
    '-o', '--output_file',
    default='tokenizer.bin',
    help='path handed to Tokenizer.to_file (default: %(default)s)',
)
# type=int keeps a user-supplied value the same type as the integer default;
# without it argparse would pass the raw command-line string through.
parser.add_argument(
    '-n', '--max_vocab_size',
    type=int,
    default=32000,
    help='value passed as max_length to Tokenizer.train_rulebased '
         '(default: %(default)s)',
)


def main(argv=None) -> None:
    """Train a tokenizer from the input file and write it to the output file.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    The whole input file is read into memory, passed to
    ``Tokenizer.train_rulebased`` with ``max_length`` set to
    ``--max_vocab_size``, then saved with ``Tokenizer.to_file``. The
    resulting ``vocab_size`` is printed.
    """
    args = parser.parse_args(argv)

    tokenizer = Tokenizer()

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(args.input_file, 'r', encoding='utf-8') as f:
        dataset = f.read()

    tokenizer.train_rulebased(dataset, max_length=args.max_vocab_size)
    tokenizer.to_file(args.output_file)

    print(f"Tokenizer has vocab size: {tokenizer.vocab_size}")


if __name__ == '__main__':
    # Banner kept from the original script so its output is unchanged.
    print('Hello world')
    main()