StormblessedKal commited on
Commit
f5ddb7a
·
verified ·
1 Parent(s): 618a3ca

Upload new model: PL-BERT-MULTILINGUAL

Browse files
.gitattributes CHANGED
@@ -60,3 +60,4 @@ ellie-french-v1/default.wav filter=lfs diff=lfs merge=lfs -text
60
  ember-french-v1/default.wav filter=lfs diff=lfs merge=lfs -text
61
  ellie-italian-v1/default.wav filter=lfs diff=lfs merge=lfs -text
62
  ember-italian-v1/default.wav filter=lfs diff=lfs merge=lfs -text
 
 
60
  ember-french-v1/default.wav filter=lfs diff=lfs merge=lfs -text
61
  ellie-italian-v1/default.wav filter=lfs diff=lfs merge=lfs -text
62
  ember-italian-v1/default.wav filter=lfs diff=lfs merge=lfs -text
63
+ PL-BERT-MULTILINGUAL/step_1100000.t7 filter=lfs diff=lfs merge=lfs -text
PL-BERT-MULTILINGUAL/config.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Checkpoint_all_phonemes"
2
+ mixed_precision: "fp16"
3
+ data_folder: "wikipedia_20220301.en.processed"
4
+ batch_size: 32
5
+ save_interval: 20000
6
+ log_interval: 10
7
+ num_process: 1 # number of GPUs
8
+ num_steps: 2000000
9
+
10
+ dataset_params:
11
+ tokenizer: "bert-base-multilingual-cased"
12
+ token_separator: " " # token used for phoneme separator (space)
13
+ token_mask: "M" # token used for phoneme mask (M)
14
+ word_separator: 102 # token used for word separator ([SEP])
15
+ token_maps: "token_maps.pkl" # token map path
16
+
17
+ max_mel_length: 512 # max phoneme length
18
+
19
+ word_mask_prob: 0.15 # probability to mask the entire word
20
+ phoneme_mask_prob: 0.1 # probability to mask each phoneme
21
+ replace_prob: 0.2 # probability to replace phonemes
22
+
23
+ model_params:
24
+ vocab_size: 178
25
+ hidden_size: 768
26
+ num_attention_heads: 12
27
+ intermediate_size: 2048
28
+ max_position_embeddings: 512
29
+ num_hidden_layers: 12
30
+ dropout: 0.1
PL-BERT-MULTILINGUAL/step_1100000.t7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e920e029e5226284f6042bef36514323c7fd0ae73c59e0ed4ccecd1f2916fd1
3
+ size 25179228
PL-BERT-MULTILINGUAL/token_maps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb2b72e90be652f176c44fe4b101b897167bf1cd669ad76e11e9400928cc848
3
+ size 1858692
PL-BERT-MULTILINGUAL/util.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import torch
4
+ from transformers import AlbertConfig, AlbertModel
5
+
6
class CustomAlbert(AlbertModel):
    """ALBERT variant whose forward pass yields only the hidden-state tensor.

    The stock ``AlbertModel.forward`` returns a model-output object; callers
    of PL-BERT want just the final hidden states, so this subclass unwraps
    the result before returning it.
    """

    def forward(self, *args, **kwargs):
        # Delegate to the parent implementation, then strip the output
        # wrapper down to the tensor callers actually consume.
        result = super().forward(*args, **kwargs)
        return result.last_hidden_state
13
+
14
+
15
def load_plbert(log_dir):
    """Load the most recent PL-BERT checkpoint from *log_dir*.

    Reads ``config.yml`` in the directory, builds a :class:`CustomAlbert`
    from its ``model_params`` section, locates the ``step_<N>.t7`` file with
    the highest step number, and loads its weights (non-strict, so extra or
    missing keys are tolerated).

    Args:
        log_dir: Directory containing ``config.yml`` and ``step_*.t7``
            checkpoint files.

    Returns:
        A ``CustomAlbert`` model with the checkpoint weights loaded.

    Raises:
        FileNotFoundError: If ``config.yml`` or no ``step_*`` checkpoint
            file exists in *log_dir*.
    """
    config_path = os.path.join(log_dir, "config.yml")
    # Context manager closes the config file promptly (the previous
    # `yaml.safe_load(open(...))` leaked the handle).
    with open(config_path, "r", encoding="utf-8") as cfg_file:
        plbert_config = yaml.safe_load(cfg_file)

    albert_base_configuration = AlbertConfig(**plbert_config['model_params'])
    bert = CustomAlbert(albert_base_configuration)

    # Collect step numbers from files named "step_<N>.t7" (single directory
    # scan; skips subdirectories that happen to match the prefix).
    steps = [
        int(f.split('_')[-1].split('.')[0])
        for f in os.listdir(log_dir)
        if f.startswith("step_") and os.path.isfile(os.path.join(log_dir, f))
    ]
    if not steps:
        raise FileNotFoundError(
            "no 'step_*' checkpoint files found in " + str(log_dir)
        )
    latest_step = max(steps)

    # NOTE(review): torch.load unpickles arbitrary objects — only ever point
    # log_dir at trusted checkpoint files.
    checkpoint_path = os.path.join(log_dir, "step_" + str(latest_step) + ".t7")
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['net']

    # Strip DataParallel's "module." prefix and the training wrapper's
    # "encoder." prefix so the keys match the bare AlbertModel.
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove `module.`
        if name.startswith('encoder.'):
            name = name[8:]  # remove `encoder.`
        new_state_dict[name] = v
    # Position ids are a buffer the model recreates itself; drop any stale
    # copy from the checkpoint (no-op if absent).
    new_state_dict.pop("embeddings.position_ids", None)
    bert.load_state_dict(new_state_dict, strict=False)

    return bert