niobures committed on
Commit
a47d980
·
verified ·
1 Parent(s): d1c5d31

PL-BERT (code, models, paper)

Browse files
Files changed (38) hide show
  1. .gitattributes +4 -0
  2. Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions.pdf +3 -0
  3. code/PL-BERT.zip +3 -0
  4. models/ar/pl-bert (fadi77)/.gitattributes +35 -0
  5. models/ar/pl-bert (fadi77)/README.md +107 -0
  6. models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/char_indexer.py +24 -0
  7. models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/config.yml +37 -0
  8. models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/model.pth +3 -0
  9. models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/char_indexer.py +24 -0
  10. models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/config.yml +39 -0
  11. models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/model.pth +3 -0
  12. models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/config.yml +36 -0
  13. models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/model.pth +3 -0
  14. models/en/mpbert (ydqmkkx)/.gitattributes +35 -0
  15. models/en/mpbert (ydqmkkx)/PhonemeBERT.zip +3 -0
  16. models/en/mpbert (ydqmkkx)/README.md +11 -0
  17. models/en/mpbert (ydqmkkx)/config.json +28 -0
  18. models/en/mpbert (ydqmkkx)/pytorch_model.bin +3 -0
  19. models/en/mpbert (ydqmkkx)/source.txt +1 -0
  20. models/en/plbert (ydqmkkx)/.gitattributes +1 -0
  21. models/en/plbert (ydqmkkx)/PhonemeBERT.zip +3 -0
  22. models/en/plbert (ydqmkkx)/README.md +11 -0
  23. models/en/plbert (ydqmkkx)/config.json +28 -0
  24. models/en/plbert (ydqmkkx)/pytorch_model.bin +3 -0
  25. models/hi/fine-tuned-PL_BERT-hindi/.gitattributes +37 -0
  26. models/hi/fine-tuned-PL_BERT-hindi/README.md +41 -0
  27. models/hi/fine-tuned-PL_BERT-hindi/source.txt +1 -0
  28. models/hi/fine-tuned-PL_BERT-hindi/step_2500.t7 +3 -0
  29. models/hi/fine-tuned-PL_BERT-hindi/step_5000.t7 +3 -0
  30. models/multi/multilingual-pl-bert/.gitattributes +36 -0
  31. models/multi/multilingual-pl-bert/README.md +44 -0
  32. models/multi/multilingual-pl-bert/config.yml +30 -0
  33. models/multi/multilingual-pl-bert/issues.txt +46 -0
  34. models/multi/multilingual-pl-bert/languages.txt +14 -0
  35. models/multi/multilingual-pl-bert/source.txt +1 -0
  36. models/multi/multilingual-pl-bert/step_1100000.t7 +3 -0
  37. models/multi/multilingual-pl-bert/token_maps.pkl +3 -0
  38. models/multi/multilingual-pl-bert/util.py +45 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/hi/fine-tuned-PL_BERT-hindi/step_2500.t7 filter=lfs diff=lfs merge=lfs -text
37
+ models/hi/fine-tuned-PL_BERT-hindi/step_5000.t7 filter=lfs diff=lfs merge=lfs -text
38
+ models/multi/multilingual-pl-bert/step_1100000.t7 filter=lfs diff=lfs merge=lfs -text
39
+ Phoneme-Level[[:space:]]BERT[[:space:]]for[[:space:]]Enhanced[[:space:]]Prosody[[:space:]]of[[:space:]]Text-to-Speech[[:space:]]with[[:space:]]Grapheme[[:space:]]Predictions.pdf filter=lfs diff=lfs merge=lfs -text
Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:976c96b54742fbc7209bfcacb9e6cb3d1e32d57a582e84f6aab7e8c4eaed5d52
3
+ size 278130
code/PL-BERT.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02de2d67919d8bd9dd1bd83cf153aead58926d3c6f9b008d14f062779c0c4253
3
+ size 7760040
models/ar/pl-bert (fadi77)/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/ar/pl-bert (fadi77)/README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
3
+ # Doc / guide: https://huggingface.co/docs/hub/model-cards
4
+ {}
5
+ ---
6
+
7
+ # Model Card for Arabic PL-BERT Models
8
+
9
+ This model card describes a collection of three Arabic BERT models trained with different objectives and datasets for phoneme-aware language modeling.
10
+
11
+ ## Model Details
12
+
13
+ ### Model Description
14
+
15
+ These models are Arabic adaptations of the PL-BERT (Phoneme-Level BERT) approach introduced in [Li et al. (2023)](https://arxiv.org/pdf/2301.08810). The models incorporate phonemic information to enhance language understanding, with variations in training objectives and data preprocessing.
16
+
17
+ The collection includes three models:
18
+ - **mlm_p2g_non_diacritics**: Trained with both MLM (Masked Language Modeling) and P2G (Phoneme-to-Grapheme) objectives on non-diacritized Arabic text
19
+ - **mlm_only_non_diacritics**: Trained with only the MLM objective on non-diacritized Arabic text
20
+ - **mlm_only_with_diacritics**: Fine-tuned version of mlm_only_non_diacritics on diacritized Arabic text
21
+
22
+ **Developed by:** Fadi (GitHub: Fadi987)
23
+ **Model type:** Transformer-based language models (BERT variants)
24
+ **Language:** Arabic
25
+
26
+ ### Model Sources
27
+
28
+ - **Paper (PL-BERT approach):** [Li et al. (2023)](https://arxiv.org/pdf/2301.08810)
29
+
30
+ ## Training Details
31
+
32
+ ### Training Data
33
+
34
+ All models were initially trained on a cleaned version of the Arabic Wikipedia dataset. The dataset is available at [wikipedia.20231101.ar](https://huggingface.co/datasets/wikimedia/wikipedia/tree/main/20231101.ar).
35
+
36
+ For the **mlm_only_with_diacritics** model, a random sample of 200,000 entries (out of approximately 1.2 million) was selected from the Wikipedia Arabic dataset and fully diacritized using the state-of-the-art CATT diacritizer ([Abjad AI, 2024](https://github.com/abjadai/catt)), introduced in [this paper](https://arxiv.org/abs/2407.03236) and licensed under CC BY-NC 4.0.
37
+
38
+ ### Training Procedure
39
+
40
+ #### Model Architecture and Objectives
41
+
42
+ The models follow different training objectives:
43
+
44
+ 1. **mlm_p2g_non_diacritics**:
45
+ - Trained with dual objectives similar to the original PL-BERT:
46
+ - Masked Language Modeling (MLM): Standard BERT pre-training objective
47
+ - Phoneme-to-Grapheme (P2G): Predicting token IDs from phonemic representations
48
+ - Tokenization was performed using [aubmindlab/bert-base-arabertv2](https://huggingface.co/aubmindlab/bert-base-arabertv2), which uses subword tokenization
49
+ - Trained for 10 epochs on non-diacritized Wikipedia Arabic
50
+
51
+ 2. **mlm_only_non_diacritics**:
52
+ - Trained with only the MLM objective
53
+ - Removes the P2G objective, which according to ablation studies in the PL-BERT paper minimally affected performance
54
+ - This removal eliminated dependence on tokenization, which:
55
+ - Reduced the model size considerably (word/subword tokenization has a much larger vocabulary than phoneme vocabulary)
56
+ - Allowed phonemization of entire sentences at once, resulting in more accurate phonemization
57
+ - Trained on non-diacritized Wikipedia Arabic
58
+
59
+ 3. **mlm_only_with_diacritics**:
60
+ - Fine-tuned version of mlm_only_non_diacritics
61
+ - Trained for 10 epochs on diacritized Arabic text
62
+ - Uses the same MLM-only objective
63
+
64
+ ## Technical Considerations
65
+
66
+ ### Tokenization Challenges
67
+
68
+ For the **mlm_p2g_non_diacritics** model, a notable limitation was the use of subword tokenization. This approach is not ideal for pronunciation modeling because phonemizing parts of words independently loses the context of the word, which heavily affects pronunciation. The authors of the original PL-BERT paper used a word-level tokenizer for English, but a comparable high-quality word-level tokenizer was not available for Arabic. This limitation was addressed in the subsequent models by removing the P2G objective.
69
+
70
+ ### Diacritization
71
+
72
+ Arabic text can be written with or without diacritics (short vowel marks). The **mlm_only_with_diacritics** model specifically addresses this by training on fully diacritized text, which provides explicit pronunciation information that is typically absent in standard written Arabic.
73
+
74
+ ## Uses
75
+
76
+ These models can be used for Arabic natural language understanding tasks where phonemic awareness may be beneficial, such as:
77
+ - Text-to-speech
78
+ - Speech recognition post-processing
79
+ - Dialect identification
80
+ - Pronunciation-sensitive applications
81
+
82
+ For examples of how these models can be used in code, see: https://github.com/Fadi987/StyleTTS2/blob/main/Utils/PLBERT/util.py
83
+
84
+ ## Bias, Risks, and Limitations
85
+
86
+ The models are trained on Wikipedia data, which may not represent all varieties of Arabic equally. The diacritization process, while state-of-the-art, may introduce some errors or biases in the training data.
87
+
88
+ The subword tokenization approach used in the mlm_p2g_non_diacritics model has limitations for phonemic modeling as noted above.
89
+
90
+ ## Citation
91
+
92
+ **BibTeX:**
93
+ ```bibtex
94
+ @article{catt2024,
95
+ title={CATT: Character-based Arabic Tashkeel Transformer},
96
+ author={Alasmary, Faris and Zaafarani, Orjuwan and Ghannam, Ahmad},
97
+ journal={arXiv preprint arXiv:2407.03236},
98
+ year={2024}
99
+ }
100
+
101
+ @article{plbert2023,
102
+ title={Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions},
103
+ author={Li, Yinghao Aaron and Han, Cong and Jiang, Xilin and Mesgarani, Nima},
104
+ journal={arXiv preprint arXiv:2301.08810},
105
+ year={2023}
106
+ }
107
+ ```
models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/char_indexer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IPA Phonemizer: https://github.com/bootphon/phonemizer
2
+
3
+ import string
4
+
5
+ PAD = "P"
6
+ PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
7
+ LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
8
+ LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
9
+ PHONEME_MASK = "M"
10
+ PHONEME_SEPARATOR = " "
11
+ UNKNOWN='U'
12
+
13
+ # Export all symbols:
14
+ symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]
15
+
16
+ assert len(symbols) == len(set(symbols)) # no duplicates
17
+
18
+ class CharacterIndexer:
19
+ def __init__(self):
20
+ self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}
21
+
22
+ def __call__(self, text):
23
+ return [self.word_index_dictionary[char] if char in self.word_index_dictionary
24
+ else self.word_index_dictionary[UNKNOWN] for char in text]
models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/config.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ preprocess_params:
2
+ tokenizer: "aubmindlab/bert-base-arabertv2"
3
+ num_shards: 10000
4
+ max_workers: 25
5
+ max_try_count: 3
6
+ timeout: 300
7
+ phonemizer_language: "ar"
8
+ preprocess_dir: "data/pl_bert"
9
+ hf_dataset_name: "wikimedia/wikipedia"
10
+ hf_dataset_split: "20231101.ar"
11
+ cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
12
+
13
+ training_params:
14
+ output_dir: "/pl_bert/checkpoints"
15
+ mixed_precision: "fp16"
16
+ batch_size: 96
17
+ save_interval: 1000
18
+ log_interval: 10
19
+ num_process: 1 # number of GPUs
20
+ num_steps: 1000000
21
+ learning_rate: 7e-5
22
+ training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
23
+
24
+ dataset_params:
25
+ word_separator: 87 # token idx used for word separation (W)
26
+ max_seq_length: 512 # max phoneme sequence length
27
+ word_pred_prob: 0.15 # probability to select word for prediction
28
+ phoneme_mask_prob: 0.8 # probability to mask phonemes
29
+ replace_prob: 0.1 # probability to replace phonemes
30
+
31
+ model_params:
32
+ hidden_size: 768
33
+ num_attention_heads: 12
34
+ intermediate_size: 2048
35
+ max_position_embeddings: 512
36
+ num_hidden_layers: 12
37
+ dropout: 0.1
models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:331314efb0d278271bb53d5a33985da8e5a82d73dd41e3e7b4881d96322edc09
3
+ size 72568654
models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/char_indexer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IPA Phonemizer: https://github.com/bootphon/phonemizer
2
+
3
+ import string
4
+
5
+ PAD = "P"
6
+ PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
7
+ LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
8
+ LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
9
+ PHONEME_MASK = "M"
10
+ PHONEME_SEPARATOR = " "
11
+ UNKNOWN='U'
12
+
13
+ # Export all symbols:
14
+ symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]
15
+
16
+ assert len(symbols) == len(set(symbols)) # no duplicates
17
+
18
+ class CharacterIndexer:
19
+ def __init__(self):
20
+ self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}
21
+
22
+ def __call__(self, text):
23
+ return [self.word_index_dictionary[char] if char in self.word_index_dictionary
24
+ else self.word_index_dictionary[UNKNOWN] for char in text]
models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/config.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ preprocess_params:
2
+ tokenizer: "aubmindlab/bert-base-arabertv2"
3
+ num_shards: 10000
4
+ max_workers: 25
5
+ max_try_count: 3
6
+ timeout: 300
7
+ phonemizer_language: "ar"
8
+ preprocess_dir: "data/pl_bert"
9
+ hf_dataset_name: "wikimedia/wikipedia"
10
+ hf_dataset_split: "20231101.ar"
11
+ cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
12
+
13
+ training_params:
14
+ output_dir: "/pl_bert/checkpoints"
15
+ mixed_precision: "fp16"
16
+ batch_size: 96
17
+ save_interval: 1000
18
+ log_interval: 10
19
+ num_process: 1 # number of GPUs
20
+ num_steps: 1000000
21
+ learning_rate: 7e-5
22
+ training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
23
+ split: "diacritized"
24
+
25
+ dataset_params:
26
+ word_separator: 87 # token idx used for word separation (W)
27
+ max_seq_length: 512 # max phoneme sequence length
28
+ word_pred_prob: 0.15 # probability to select word for prediction
29
+ phoneme_mask_prob: 0.8 # probability to mask phonemes
30
+ replace_prob: 0.1 # probability to replace phonemes
31
+
32
+ model_params:
33
+ pretrained_model: "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth"
34
+ hidden_size: 768
35
+ num_attention_heads: 12
36
+ intermediate_size: 2048
37
+ max_position_embeddings: 512
38
+ num_hidden_layers: 12
39
+ dropout: 0.1
models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b32ed12e3a3f5a4d1da88e261c217e36a8f4500d7155fae8324efd6c5bf2803b
3
+ size 72568548
models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/config.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ preprocess_params:
2
+ tokenizer: "aubmindlab/bert-base-arabertv2"
3
+ num_shards: 10000
4
+ max_workers: 25
5
+ max_try_count: 3
6
+ timeout: 300
7
+ phonemizer_language: "ar"
8
+ preprocess_dir: "data/pl_bert"
9
+ hf_dataset_name: "wikimedia/wikipedia"
10
+ hf_dataset_split: "20231101.ar"
11
+ output_dir: "wikipedia_20231101.ar.processed"
12
+
13
+ training_params:
14
+ output_dir: "checkpoints"
15
+ mixed_precision: "fp16"
16
+ batch_size: 16
17
+ save_interval: 5000
18
+ log_interval: 10
19
+ num_process: 1 # number of GPUs
20
+ num_steps: 1000000
21
+ learning_rate: 3e-5
22
+
23
+ dataset_params:
24
+ word_separator: 87 # token idx used for word separation (W)
25
+ max_seq_length: 512 # max phoneme sequence length
26
+ word_pred_prob: 0.15 # probability to select word for prediction
27
+ phoneme_mask_prob: 0.8 # probability to mask phonemes
28
+ replace_prob: 0.1 # probability to replace phonemes
29
+
30
+ model_params:
31
+ hidden_size: 768
32
+ num_attention_heads: 12
33
+ intermediate_size: 2048
34
+ max_position_embeddings: 512
35
+ num_hidden_layers: 12
36
+ dropout: 0.1
models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c612ed18ac2283620496e11a01b95bf1432469ff2348fc58934af8b8fde888
3
+ size 709191470
models/en/mpbert (ydqmkkx)/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/en/mpbert (ydqmkkx)/PhonemeBERT.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e22ebf7e9ae759c697ceb8fc837a58bcd3c76dd5a0692504fbdb6c9d2d51cf
3
+ size 747051
models/en/mpbert (ydqmkkx)/README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ datasets:
6
+ - bookcorpus
7
+ - wikipedia
8
+ ---
9
+
10
+ # Mixed-Phoneme BERT
11
+ More information: https://github.com/ydqmkkx/PhonemeBERT
models/en/mpbert (ydqmkkx)/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MpbertForPreTraining"
4
+ ],
5
+ "vocab_size": 171,
6
+ "embedding_size": 768,
7
+ "hidden_size": 768,
8
+ "output_size": 768,
9
+ "intermediate_size": 3072,
10
+ "max_position_embeddings": 1024,
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 12,
13
+ "hidden_act": "gelu",
14
+ "attention_probs_dropout_prob": 0.1,
15
+ "hidden_dropout_prob": 0.1,
16
+ "layer_norm_eps": 1e-12,
17
+ "initializer_range": 0.02,
18
+ "pad_token_id": 0,
19
+ "position_biased_input": false,
20
+ "position_embedding_type": "relative_key_query",
21
+ "use_sup_phoneme": true,
22
+ "sup_phoneme_vocab_size": 30000,
23
+ "grapheme_max_position_embeddings": 1024,
24
+ "grapheme_vocab_size": 84481,
25
+ "use_cache": false,
26
+ "gradient_checkpointing": false,
27
+ "transformers_version": "4.41.2"
28
+ }
models/en/mpbert (ydqmkkx)/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e913f42be9014e088f50ed797132b0c04115429de10819f5c01ea38b800562d4
3
+ size 444135926
models/en/mpbert (ydqmkkx)/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/ydqmkkx/mpbert
models/en/plbert (ydqmkkx)/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
models/en/plbert (ydqmkkx)/PhonemeBERT.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e22ebf7e9ae759c697ceb8fc837a58bcd3c76dd5a0692504fbdb6c9d2d51cf
3
+ size 747051
models/en/plbert (ydqmkkx)/README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ datasets:
6
+ - bookcorpus
7
+ - wikipedia
8
+ ---
9
+
10
+ # Phoneme-Level BERT
11
+ More information: https://github.com/ydqmkkx/PhonemeBERT
models/en/plbert (ydqmkkx)/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PlbertForPreTraining"
4
+ ],
5
+ "vocab_size": 171,
6
+ "embedding_size": 768,
7
+ "hidden_size": 768,
8
+ "output_size": 768,
9
+ "intermediate_size": 3072,
10
+ "max_position_embeddings": 1024,
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 12,
13
+ "hidden_act": "gelu",
14
+ "attention_probs_dropout_prob": 0.1,
15
+ "hidden_dropout_prob": 0.1,
16
+ "layer_norm_eps": 1e-12,
17
+ "initializer_range": 0.02,
18
+ "pad_token_id": 0,
19
+ "position_biased_input": false,
20
+ "position_embedding_type": "relative_key_query",
21
+ "use_sup_phoneme": false,
22
+ "sup_phoneme_vocab_size": 30000,
23
+ "grapheme_max_position_embeddings": 1024,
24
+ "grapheme_vocab_size": 84481,
25
+ "use_cache": false,
26
+ "gradient_checkpointing": false,
27
+ "transformers_version": "4.41.2"
28
+ }
models/en/plbert (ydqmkkx)/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:110f290b2adc0ac9c4994e494f61359ae45d91e1fe299d81ed69f86090d467cc
3
+ size 611719414
models/hi/fine-tuned-PL_BERT-hindi/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ step_2500.t7 filter=lfs diff=lfs merge=lfs -text
37
+ step_5000.t7 filter=lfs diff=lfs merge=lfs -text
models/hi/fine-tuned-PL_BERT-hindi/README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PL-BERT Fine-Tuned on Hindi Wikipedia Dataset
2
+
3
+ This model is a fine-tuned version of **PL-BERT**, specifically trained on the Hindi subset of the Wiki40b dataset. The model has been optimized to understand and generate high-quality Hindi text, making it suitable for various NLP tasks in the Hindi language.
4
+ For more information about this model, check out the [GitHub](https://github.com/Ionio-io/PL-BERT-Fine-Tuned-hi-) repository.
5
+
6
+ ## Model Overview
7
+
8
+ - **Model Name:** PL-BERT (Fine-tuned on Hindi)
9
+ - **Base Model:** PL-BERT (Multilingual BERT variant)
10
+ - **Dataset:** Hindi subset from Wiki40b (51,000 cleaned Wikipedia articles)
11
+ - **Precision:** Mixed precision (FP16)
12
+
13
+ The fine-tuning process focused on improving the model's ability to handle Hindi text more effectively by leveraging a large, cleaned corpus of Wikipedia articles in Hindi.
14
+
15
+ ## Training Details
16
+
17
+ - **Model:** PL-BERT
18
+ - **Dataset:** Hindi subset from Wiki40b
19
+ - **Batch Size:** 64
20
+ - **Mixed Precision:** FP16
21
+ - **Optimizer:** AdamW
22
+ - **Training Steps:** 15,000
23
+
24
+ ### Training Progress
25
+
26
+ - **Final Loss:** 1.879
27
+ - **Vocabulary Loss:** 0.49
28
+ - **Token Loss:** 1.465
29
+
30
+ ### Validation Results
31
+
32
+ During training, we monitored performance with validation metrics:
33
+
34
+ - **Validation Loss:** 1.879
35
+ - **Vocabulary Accuracy:** 78.54%
36
+ - **Token Accuracy:** 82.30%
37
+
38
+
39
+ ---
40
+ license: apache-2.0
41
+ ---
models/hi/fine-tuned-PL_BERT-hindi/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/Ionio-ai/fine-tuned-PL_BERT-hindi
models/hi/fine-tuned-PL_BERT-hindi/step_2500.t7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32fa987b7a122dbdd18a2669065cc155b0b965cb2791dd31ff671b1bf26063ec
3
+ size 87387147
models/hi/fine-tuned-PL_BERT-hindi/step_5000.t7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fd955319e7f7bd1c0bbc81e7819810d50ddd0dcd2ea98fd63ad0d0049ba5807
3
+ size 87387147
models/multi/multilingual-pl-bert/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ step_1100000.t7 filter=lfs diff=lfs merge=lfs -text
models/multi/multilingual-pl-bert/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - styletts2-community/multilingual-phonemes-10k-alpha
5
+ language:
6
+ - fr
7
+ - en
8
+ - es
9
+ - ca
10
+ - de
11
+ - el
12
+ - fa
13
+ - fi
14
+ - pt
15
+ - pl
16
+ - ru
17
+ - sv
18
+ - uk
19
+ - zh
20
+ ---
21
+
22
+ # Multilingual PL-BERT checkpoint
23
+
24
+ The checkpoint open-sourced here is trained by Papercup using the open-source PL-BERT model found here https://github.com/yl4579/PL-BERT. It is trained to be supported by StyleTTS2, which can be found here: https://github.com/yl4579/StyleTTS2. You can see in the model card the languages that it has been trained on (the languages correspond to the crowdsourced dataset found here https://huggingface.co/datasets/styletts2-community/multilingual-phonemes-10k-alpha).
25
+
26
+ Notable differences compared to the default PL-BERT checkpoint and config available [here](https://github.com/yl4579/StyleTTS2/tree/main/Utils/PLBERT):
27
+ * Because we are working with many languages, we are using a different tokenizer now: `bert-base-multilingual-cased`.
28
+ * The PL-BERT model was trained on the data obtained from `styletts2-community/multilingual-phonemes-10k-alpha` for 1.1M iterations.
29
+ * The `token_maps.pkl` file has changed (also open-sourced here).
30
+ * We have changed the `util.py` file to deal with an error when loading `new_state_dict["embeddings.position_ids"]`.
31
+
32
+ ## How do I train StyleTTS2 with this new PL-BERT checkpoint?
33
+
34
+ * Simply create a new folder under `Utils` in your StyleTTS2 repository. Call it, for example, `PLBERT_all_languages`.
35
+ * Copy `config.yml`, `step_1100000.t7` and `util.py` into it.
36
+ * Then, in your StyleTTS2 config file, change `PLBERT_dir` to `Utils/PLBERT_all_languages`. You will also need to change your import as such:
37
+ * Change `from Utils.PLBERT.util import load_plbert`
38
+ * To `from Utils.PLBERT_all_languages.util import load_plbert`
39
+ * Alternatively, you can just replace the relevant files in `Utils/PLBERT` and not have to change any code.
40
+ * Now, you need to create train and validation files. You will need to use `espeak` to create a file in the same format as the ones that exist in the `Data` folder of the StyleTTS2 repository. Careful! You will need to change the `language` argument to phonemise your text if it's not in English. You can find the correct language codes [here](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md). For example, Latin American Spanish is `es-419`
41
+
42
+ Voila, you can now train a multilingual StyleTTS2 model!
43
+
44
+ Thank you to Aaron (Yinghao) Li for these contributions.
models/multi/multilingual-pl-bert/config.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Checkpoint_all_phonemes"
2
+ mixed_precision: "fp16"
3
+ data_folder: "wikipedia_20220301.en.processed"
4
+ batch_size: 32
5
+ save_interval: 20000
6
+ log_interval: 10
7
+ num_process: 1 # number of GPUs
8
+ num_steps: 2000000
9
+
10
+ dataset_params:
11
+ tokenizer: "bert-base-multilingual-cased"
12
+ token_separator: " " # token used for phoneme separator (space)
13
+ token_mask: "M" # token used for phoneme mask (M)
14
+ word_separator: 102 # token used for word separator ([SEP])
15
+ token_maps: "token_maps.pkl" # token map path
16
+
17
+ max_mel_length: 512 # max phoneme length
18
+
19
+ word_mask_prob: 0.15 # probability to mask the entire word
20
+ phoneme_mask_prob: 0.1 # probability to mask each phoneme
21
+ replace_prob: 0.2 # probability to replace phonemes
22
+
23
+ model_params:
24
+ vocab_size: 178
25
+ hidden_size: 768
26
+ num_attention_heads: 12
27
+ intermediate_size: 2048
28
+ max_position_embeddings: 512
29
+ num_hidden_layers: 12
30
+ dropout: 0.1
models/multi/multilingual-pl-bert/issues.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ===========================================================
2
+ [#2] How to inference it?
3
+ ===========================================================
4
+
5
+ jonathansilvasantos, 20.03.2024
6
+ Q: Can someone share an inference script for testing this model independently? I'm exploring how to adapt FastPitch for compatibility with this model or other less computationally demanding text-to-speech (TTS) models.
7
+
8
+ raph30370 (Papercup AI org), 27.03.2024
9
+ A: I recommend you check out the StyleTTS2 repo on how to do inference with PL-BERT. Specifically in the train_second.py file for example, you can look at plbert = load_plbert(BERT_path) as a starting point in that script.
10
+
11
+
12
+ ===========================================================
13
+ [#4] Language
14
+ ===========================================================
15
+
16
+ YoussefBarakat, 02.05.2024
17
+ Q: If i'm training a new language does it have to be phonemized with e-speak?
18
+
19
+ raph30370 (Papercup AI org), 02.05.2024
20
+ A: I think that's how all of these languages here have been phonemized. If you're training everything together, they probably have to be under the same phoneme space, so yeah probably, unless you phonemize everything else with a new phonemizer as well.
21
+
22
+
23
+ ===========================================================
24
+ [#5] Tokenizer Type
25
+ ===========================================================
26
+
27
+ abhinand, 15.01.2025
28
+ Q: I notice that the vocabulary contains sub-word tokens.
29
+
30
+ ...
31
+ {'word': '##م', 'token': 7030},
32
+ {'word': '##di', 'token': 7033},
33
+ {'word': '##kan', 'token': 7036},
34
+ {'word': '##ek', 'token': 7037},
35
+ {'word': '##ak', 'token': 7040},
36
+ {'word': '##ı', 'token': 7042},
37
+ {'word': '##lo', 'token': 7044},
38
+ {'word': '##ung', 'token': 7045},
39
+ ...
40
+
41
+ But for PL-BERT, isn't it necessary to have 'word-level' tokens? Or am I misinterpreting here?
42
+
43
+ Another reference thread which discussed this -> https://github.com/yl4579/StyleTTS2/issues/286#issuecomment-2383835836
44
+
45
+ So does this mean I can use a tokenizer like Indic-BERT to build my token_maps for Indic-PL-BERT??
46
+
models/multi/multilingual-pl-bert/languages.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ French
2
+ English
3
+ Spanish
4
+ Catalan
5
+ German
6
+ Greek
7
+ Persian
8
+ Finnish
9
+ Portuguese
10
+ Polish
11
+ Russian
12
+ Swedish
13
+ Ukrainian
14
+ Chinese
models/multi/multilingual-pl-bert/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/papercup-ai/multilingual-pl-bert
models/multi/multilingual-pl-bert/step_1100000.t7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e920e029e5226284f6042bef36514323c7fd0ae73c59e0ed4ccecd1f2916fd1
3
+ size 25179228
models/multi/multilingual-pl-bert/token_maps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb2b72e90be652f176c44fe4b101b897167bf1cd669ad76e11e9400928cc848
3
+ size 1858692
models/multi/multilingual-pl-bert/util.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import torch
4
+ from transformers import AlbertConfig, AlbertModel
5
+
6
class CustomAlbert(AlbertModel):
    """ALBERT model whose forward pass yields only the last hidden state."""

    def forward(self, *args, **kwargs):
        """Run the parent ``forward`` and return its ``last_hidden_state``.

        All positional and keyword arguments are passed through unchanged
        to ``AlbertModel.forward``.
        """
        # Delegate to the stock implementation, then discard every output
        # field except the last-layer hidden states.
        return super().forward(*args, **kwargs).last_hidden_state
13
+
14
+
15
def load_plbert(log_dir):
    """Build the PL-BERT encoder and load the newest checkpoint in ``log_dir``.

    Args:
        log_dir: Directory containing ``config.yml`` and one or more
            ``step_<N>.t7`` checkpoint files.

    Returns:
        A ``CustomAlbert`` instance configured from the ``model_params``
        section of ``config.yml`` with the checkpoint weights loaded
        (non-strictly) from CPU-mapped tensors.

    Raises:
        FileNotFoundError: If ``log_dir`` holds no ``step_*`` checkpoint file.
    """
    from collections import OrderedDict

    config_path = os.path.join(log_dir, "config.yml")
    # Use a context manager so the config file handle is always closed.
    with open(config_path, encoding="utf-8") as config_file:
        plbert_config = yaml.safe_load(config_file)

    albert_base_configuration = AlbertConfig(**plbert_config['model_params'])
    bert = CustomAlbert(albert_base_configuration)

    # Collect the step number of every checkpoint file: "step_1100000.t7" -> 1100000.
    steps = [
        int(f.split('_')[-1].split('.')[0])
        for f in os.listdir(log_dir)
        if f.startswith("step_") and os.path.isfile(os.path.join(log_dir, f))
    ]
    if not steps:
        raise FileNotFoundError(
            "No 'step_*' checkpoint file found in " + repr(log_dir)
        )
    latest_step = max(steps)

    checkpoint_path = os.path.join(log_dir, "step_" + str(latest_step) + ".t7")
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['net']

    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k
        # Strip the `module.` prefix only when it is actually present, so
        # checkpoints saved without it do not get their key names truncated.
        if name.startswith('module.'):
            name = name[7:]  # remove `module.`
        if name.startswith('encoder.'):
            name = name[8:]  # remove `encoder.`
        new_state_dict[name] = v

    # Drop this entry if the checkpoint carries it; loading it is reported
    # to raise an error, and it is simply skipped when absent.
    new_state_dict.pop("embeddings.position_ids", None)

    # strict=False: keys that do not match the model are ignored rather
    # than raising.
    bert.load_state_dict(new_state_dict, strict=False)

    return bert