hezarai
/

bert-base-fa

arxyzan committed on Oct 1, 2023

Commit

ef087db

1 Parent(s): 53709b9

Hezar: Upload tokenizer_config.yaml

Files changed (1) hide show

preprocessor/tokenizer_config.yaml CHANGED Viewed

@@ -1,6 +1,5 @@
 name: wordpiece_tokenizer
 config_type: preprocessor
-pretrained_path: hezar-ai/bert-base-fa
 max_length: 512
 truncation_strategy: longest_first
 truncation_direction: right
@@ -8,22 +7,15 @@ stride: 0
 padding_strategy: longest
 padding_direction: right
 pad_to_multiple_of: 0
-pad_token_id: 0
-pad_token: '[PAD]'
 pad_token_type_id: 0
 unk_token: '[UNK]'
-special_tokens:
-- '[UNK]'
-- '[SEP]'
-- '[CLS]'
-- '[PAD]'
-- '[MASK]'
 wordpieces_prefix: '##'
-train_config:
-  name: wordpiece_tokenizer
-  config_type: preprocessor
-  vocab_size: 30000
-  min_frequency: 2
-  limit_alphabet: 1000
-  initial_alphabet: []
-  show_progress: true

 name: wordpiece_tokenizer
 config_type: preprocessor
 max_length: 512
 truncation_strategy: longest_first
 truncation_direction: right
 padding_strategy: longest
 padding_direction: right
 pad_to_multiple_of: 0
 pad_token_type_id: 0
 unk_token: '[UNK]'
+sep_token: '[SEP]'
+pad_token: '[PAD]'
+cls_token: '[CLS]'
+mask_token: '[MASK]'
 wordpieces_prefix: '##'
+vocab_size: 42000
+min_frequency: 2
+limit_alphabet: 1000
+initial_alphabet: []
+show_progress: true