lmz committed on
Commit
0c9c6fd
·
1 Parent(s): fd2cd21

Upload 20231029-tok.py

Browse files
Files changed (1) hide show
  1. 20231029-tok.py +9 -0
20231029-tok.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
from convert_slow_tokenizer import MarianConverter
from transformers import AutoTokenizer


# Load the slow tokenizer (use_fast=False); it is the input handed to
# MarianConverter below.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en", use_fast=False)

# Run the converter once per index and save each result to its own JSON file.
# NOTE(review): presumably index 0 is the French (source) side and index 1 the
# English (target) side, matching the output file names — confirm against the
# local MarianConverter implementation.
for index, lang in enumerate(("fr", "en")):
    fast_tokenizer = MarianConverter(tokenizer, index=index).converted()
    fast_tokenizer.save(f"tokenizer-marian-base-{lang}.json")