PL-BERT (code, models, paper)
Browse files- .gitattributes +4 -0
- Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions.pdf +3 -0
- code/PL-BERT.zip +3 -0
- models/ar/pl-bert (fadi77)/.gitattributes +35 -0
- models/ar/pl-bert (fadi77)/README.md +107 -0
- models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/char_indexer.py +24 -0
- models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/config.yml +37 -0
- models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/model.pth +3 -0
- models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/char_indexer.py +24 -0
- models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/config.yml +39 -0
- models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/model.pth +3 -0
- models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/config.yml +36 -0
- models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/model.pth +3 -0
- models/en/mpbert (ydqmkkx)/.gitattributes +35 -0
- models/en/mpbert (ydqmkkx)/PhonemeBERT.zip +3 -0
- models/en/mpbert (ydqmkkx)/README.md +11 -0
- models/en/mpbert (ydqmkkx)/config.json +28 -0
- models/en/mpbert (ydqmkkx)/pytorch_model.bin +3 -0
- models/en/mpbert (ydqmkkx)/source.txt +1 -0
- models/en/plbert (ydqmkkx)/.gitattributes +1 -0
- models/en/plbert (ydqmkkx)/PhonemeBERT.zip +3 -0
- models/en/plbert (ydqmkkx)/README.md +11 -0
- models/en/plbert (ydqmkkx)/config.json +28 -0
- models/en/plbert (ydqmkkx)/pytorch_model.bin +3 -0
- models/hi/fine-tuned-PL_BERT-hindi/.gitattributes +37 -0
- models/hi/fine-tuned-PL_BERT-hindi/README.md +41 -0
- models/hi/fine-tuned-PL_BERT-hindi/source.txt +1 -0
- models/hi/fine-tuned-PL_BERT-hindi/step_2500.t7 +3 -0
- models/hi/fine-tuned-PL_BERT-hindi/step_5000.t7 +3 -0
- models/multi/multilingual-pl-bert/.gitattributes +36 -0
- models/multi/multilingual-pl-bert/README.md +44 -0
- models/multi/multilingual-pl-bert/config.yml +30 -0
- models/multi/multilingual-pl-bert/issues.txt +46 -0
- models/multi/multilingual-pl-bert/languages.txt +14 -0
- models/multi/multilingual-pl-bert/source.txt +1 -0
- models/multi/multilingual-pl-bert/step_1100000.t7 +3 -0
- models/multi/multilingual-pl-bert/token_maps.pkl +3 -0
- models/multi/multilingual-pl-bert/util.py +45 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
models/hi/fine-tuned-PL_BERT-hindi/step_2500.t7 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
models/hi/fine-tuned-PL_BERT-hindi/step_5000.t7 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/multi/multilingual-pl-bert/step_1100000.t7 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
Phoneme-Level[[:space:]]BERT[[:space:]]for[[:space:]]Enhanced[[:space:]]Prosody[[:space:]]of[[:space:]]Text-to-Speech[[:space:]]with[[:space:]]Grapheme[[:space:]]Predictions.pdf filter=lfs diff=lfs merge=lfs -text
|
Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:976c96b54742fbc7209bfcacb9e6cb3d1e32d57a582e84f6aab7e8c4eaed5d52
|
| 3 |
+
size 278130
|
code/PL-BERT.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02de2d67919d8bd9dd1bd83cf153aead58926d3c6f9b008d14f062779c0c4253
|
| 3 |
+
size 7760040
|
models/ar/pl-bert (fadi77)/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/ar/pl-bert (fadi77)/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
|
| 3 |
+
# Doc / guide: https://huggingface.co/docs/hub/model-cards
|
| 4 |
+
{}
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Model Card for Arabic PL-BERT Models
|
| 8 |
+
|
| 9 |
+
This model card describes a collection of three Arabic BERT models trained with different objectives and datasets for phoneme-aware language modeling.
|
| 10 |
+
|
| 11 |
+
## Model Details
|
| 12 |
+
|
| 13 |
+
### Model Description
|
| 14 |
+
|
| 15 |
+
These models are Arabic adaptations of the PL-BERT (Phoneme-Level BERT) approach introduced in [Li et al. (2023)](https://arxiv.org/pdf/2301.08810). The models incorporate phonemic information to enhance language understanding, with variations in training objectives and data preprocessing.
|
| 16 |
+
|
| 17 |
+
The collection includes three models:
|
| 18 |
+
- **mlm_p2g_non_diacritics**: Trained with both MLM (Masked Language Modeling) and P2G (Phoneme-to-Grapheme) objectives on non-diacritized Arabic text
|
| 19 |
+
- **mlm_only_non_diacritics**: Trained with only the MLM objective on non-diacritized Arabic text
|
| 20 |
+
- **mlm_only_with_diacritics**: Fine-tuned version of mlm_only_non_diacritics on diacritized Arabic text
|
| 21 |
+
|
| 22 |
+
**Developed by:** Fadi (GitHub: Fadi987)
|
| 23 |
+
**Model type:** Transformer-based language models (BERT variants)
|
| 24 |
+
**Language:** Arabic
|
| 25 |
+
|
| 26 |
+
### Model Sources
|
| 27 |
+
|
| 28 |
+
- **Paper (PL-BERT approach):** [Li et al. (2023)](https://arxiv.org/pdf/2301.08810)
|
| 29 |
+
|
| 30 |
+
## Training Details
|
| 31 |
+
|
| 32 |
+
### Training Data
|
| 33 |
+
|
| 34 |
+
All models were initially trained on a cleaned version of the Arabic Wikipedia dataset. The dataset is available at [wikipedia.20231101.ar](https://huggingface.co/datasets/wikimedia/wikipedia/tree/main/20231101.ar).
|
| 35 |
+
|
| 36 |
+
For the **mlm_only_with_diacritics** model, a random sample of 200,000 entries (out of approximately 1.2 million) was selected from the Wikipedia Arabic dataset and fully diacritized using the state-of-the-art CATT diacritizer ([Abjad AI, 2024](https://github.com/abjadai/catt)), introduced in [this paper](https://arxiv.org/abs/2407.03236) and licensed under CC BY-NC 4.0.
|
| 37 |
+
|
| 38 |
+
### Training Procedure
|
| 39 |
+
|
| 40 |
+
#### Model Architecture and Objectives
|
| 41 |
+
|
| 42 |
+
The models follow different training objectives:
|
| 43 |
+
|
| 44 |
+
1. **mlm_p2g_non_diacritics**:
|
| 45 |
+
- Trained with dual objectives similar to the original PL-BERT:
|
| 46 |
+
- Masked Language Modeling (MLM): Standard BERT pre-training objective
|
| 47 |
+
- Phoneme-to-Grapheme (P2G): Predicting token IDs from phonemic representations
|
| 48 |
+
- Tokenization was performed using [aubmindlab/bert-base-arabertv2](https://huggingface.co/aubmindlab/bert-base-arabertv2), which uses subword tokenization
|
| 49 |
+
- Trained for 10 epochs on non-diacritized Wikipedia Arabic
|
| 50 |
+
|
| 51 |
+
2. **mlm_only_non_diacritics**:
|
| 52 |
+
- Trained with only the MLM objective
|
| 53 |
+
- Removes the P2G objective, which according to ablation studies in the PL-BERT paper minimally affected performance
|
| 54 |
+
- This removal eliminated dependence on tokenization, which:
|
| 55 |
+
- Reduced the model size considerably (word/subword tokenization has a much larger vocabulary than phoneme vocabulary)
|
| 56 |
+
- Allowed phonemization of entire sentences at once, resulting in more accurate phonemization
|
| 57 |
+
- Trained on non-diacritized Wikipedia Arabic
|
| 58 |
+
|
| 59 |
+
3. **mlm_only_with_diacritics**:
|
| 60 |
+
- Fine-tuned version of mlm_only_non_diacritics
|
| 61 |
+
- Trained for 10 epochs on diacritized Arabic text
|
| 62 |
+
- Uses the same MLM-only objective
|
| 63 |
+
|
| 64 |
+
## Technical Considerations
|
| 65 |
+
|
| 66 |
+
### Tokenization Challenges
|
| 67 |
+
|
| 68 |
+
For the **mlm_p2g_non_diacritics** model, a notable limitation was the use of subword tokenization. This approach is not ideal for pronunciation modeling because phonemizing parts of words independently loses the context of the word, which heavily affects pronunciation. The authors of the original PL-BERT paper used a word-level tokenizer for English, but a comparable high-quality word-level tokenizer was not available for Arabic. This limitation was addressed in the subsequent models by removing the P2G objective.
|
| 69 |
+
|
| 70 |
+
### Diacritization
|
| 71 |
+
|
| 72 |
+
Arabic text can be written with or without diacritics (short vowel marks). The **mlm_only_with_diacritics** model specifically addresses this by training on fully diacritized text, which provides explicit pronunciation information that is typically absent in standard written Arabic.
|
| 73 |
+
|
| 74 |
+
## Uses
|
| 75 |
+
|
| 76 |
+
These models can be used for Arabic natural language understanding tasks where phonemic awareness may be beneficial, such as:
|
| 77 |
+
- Text-to-speech
|
| 78 |
+
- Speech recognition post-processing
|
| 79 |
+
- Dialect identification
|
| 80 |
+
- Pronunciation-sensitive applications
|
| 81 |
+
|
| 82 |
+
For examples of how these models can be used in code, see: https://github.com/Fadi987/StyleTTS2/blob/main/Utils/PLBERT/util.py
|
| 83 |
+
|
| 84 |
+
## Bias, Risks, and Limitations
|
| 85 |
+
|
| 86 |
+
The models are trained on Wikipedia data, which may not represent all varieties of Arabic equally. The diacritization process, while state-of-the-art, may introduce some errors or biases in the training data.
|
| 87 |
+
|
| 88 |
+
The subword tokenization approach used in the mlm_p2g_non_diacritics model has limitations for phonemic modeling as noted above.
|
| 89 |
+
|
| 90 |
+
## Citation
|
| 91 |
+
|
| 92 |
+
**BibTeX:**
|
| 93 |
+
```bibtex
|
| 94 |
+
@article{catt2024,
|
| 95 |
+
title={CATT: Character-based Arabic Tashkeel Transformer},
|
| 96 |
+
author={Alasmary, Faris and Zaafarani, Orjuwan and Ghannam, Ahmad},
|
| 97 |
+
journal={arXiv preprint arXiv:2407.03236},
|
| 98 |
+
year={2024}
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
@article{plbert2023,
|
| 102 |
+
title={Phoneme-Level BERT for Enhanced Prosody of Text-to-Speech with Grapheme Predictions},
|
| 103 |
+
author={Li, Yinghao Aaron and Han, Cong and Jiang, Xilin and Mesgarani, Nima},
|
| 104 |
+
journal={arXiv preprint arXiv:2301.08810},
|
| 105 |
+
year={2023}
|
| 106 |
+
}
|
| 107 |
+
```
|
models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/char_indexer.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
| 2 |
+
|
| 3 |
+
import string
|
| 4 |
+
|
| 5 |
+
PAD = "P"
|
| 6 |
+
PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
|
| 7 |
+
LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
|
| 8 |
+
LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
|
| 9 |
+
PHONEME_MASK = "M"
|
| 10 |
+
PHONEME_SEPARATOR = " "
|
| 11 |
+
UNKNOWN='U'
|
| 12 |
+
|
| 13 |
+
# Export all symbols:
|
| 14 |
+
symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]
|
| 15 |
+
|
| 16 |
+
assert len(symbols) == len(set(symbols)) # no duplicates
|
| 17 |
+
|
| 18 |
+
class CharacterIndexer:
|
| 19 |
+
def __init__(self):
|
| 20 |
+
self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}
|
| 21 |
+
|
| 22 |
+
def __call__(self, text):
|
| 23 |
+
return [self.word_index_dictionary[char] if char in self.word_index_dictionary
|
| 24 |
+
else self.word_index_dictionary[UNKNOWN] for char in text]
|
models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/config.yml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
preprocess_params:
|
| 2 |
+
tokenizer: "aubmindlab/bert-base-arabertv2"
|
| 3 |
+
num_shards: 10000
|
| 4 |
+
max_workers: 25
|
| 5 |
+
max_try_count: 3
|
| 6 |
+
timeout: 300
|
| 7 |
+
phonemizer_language: "ar"
|
| 8 |
+
preprocess_dir: "data/pl_bert"
|
| 9 |
+
hf_dataset_name: "wikimedia/wikipedia"
|
| 10 |
+
hf_dataset_split: "20231101.ar"
|
| 11 |
+
cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
|
| 12 |
+
|
| 13 |
+
training_params:
|
| 14 |
+
output_dir: "/pl_bert/checkpoints"
|
| 15 |
+
mixed_precision: "fp16"
|
| 16 |
+
batch_size: 96
|
| 17 |
+
save_interval: 1000
|
| 18 |
+
log_interval: 10
|
| 19 |
+
num_process: 1 # number of GPUs
|
| 20 |
+
num_steps: 1000000
|
| 21 |
+
learning_rate: 7e-5
|
| 22 |
+
training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
|
| 23 |
+
|
| 24 |
+
dataset_params:
|
| 25 |
+
word_separator: 87 # token idx used for word separation (W)
|
| 26 |
+
max_seq_length: 512 # max phoneme sequence length
|
| 27 |
+
word_pred_prob: 0.15 # probability to select word for prediction
|
| 28 |
+
phoneme_mask_prob: 0.8 # probability to mask phonemes
|
| 29 |
+
replace_prob: 0.1 # probability to replace phonemes
|
| 30 |
+
|
| 31 |
+
model_params:
|
| 32 |
+
hidden_size: 768
|
| 33 |
+
num_attention_heads: 12
|
| 34 |
+
intermediate_size: 2048
|
| 35 |
+
max_position_embeddings: 512
|
| 36 |
+
num_hidden_layers: 12
|
| 37 |
+
dropout: 0.1
|
models/ar/pl-bert (fadi77)/models/mlm_only_non_diacritics/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:331314efb0d278271bb53d5a33985da8e5a82d73dd41e3e7b4881d96322edc09
|
| 3 |
+
size 72568654
|
models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/char_indexer.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
| 2 |
+
|
| 3 |
+
import string
|
| 4 |
+
|
| 5 |
+
PAD = "P"
|
| 6 |
+
PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
|
| 7 |
+
LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
|
| 8 |
+
LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
|
| 9 |
+
PHONEME_MASK = "M"
|
| 10 |
+
PHONEME_SEPARATOR = " "
|
| 11 |
+
UNKNOWN='U'
|
| 12 |
+
|
| 13 |
+
# Export all symbols:
|
| 14 |
+
symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]
|
| 15 |
+
|
| 16 |
+
assert len(symbols) == len(set(symbols)) # no duplicates
|
| 17 |
+
|
| 18 |
+
class CharacterIndexer:
|
| 19 |
+
def __init__(self):
|
| 20 |
+
self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}
|
| 21 |
+
|
| 22 |
+
def __call__(self, text):
|
| 23 |
+
return [self.word_index_dictionary[char] if char in self.word_index_dictionary
|
| 24 |
+
else self.word_index_dictionary[UNKNOWN] for char in text]
|
models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/config.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
preprocess_params:
|
| 2 |
+
tokenizer: "aubmindlab/bert-base-arabertv2"
|
| 3 |
+
num_shards: 10000
|
| 4 |
+
max_workers: 25
|
| 5 |
+
max_try_count: 3
|
| 6 |
+
timeout: 300
|
| 7 |
+
phonemizer_language: "ar"
|
| 8 |
+
preprocess_dir: "data/pl_bert"
|
| 9 |
+
hf_dataset_name: "wikimedia/wikipedia"
|
| 10 |
+
hf_dataset_split: "20231101.ar"
|
| 11 |
+
cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
|
| 12 |
+
|
| 13 |
+
training_params:
|
| 14 |
+
output_dir: "/pl_bert/checkpoints"
|
| 15 |
+
mixed_precision: "fp16"
|
| 16 |
+
batch_size: 96
|
| 17 |
+
save_interval: 1000
|
| 18 |
+
log_interval: 10
|
| 19 |
+
num_process: 1 # number of GPUs
|
| 20 |
+
num_steps: 1000000
|
| 21 |
+
learning_rate: 7e-5
|
| 22 |
+
training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
|
| 23 |
+
split: "diacritized"
|
| 24 |
+
|
| 25 |
+
dataset_params:
|
| 26 |
+
word_separator: 87 # token idx used for word separation (W)
|
| 27 |
+
max_seq_length: 512 # max phoneme sequence length
|
| 28 |
+
word_pred_prob: 0.15 # probability to select word for prediction
|
| 29 |
+
phoneme_mask_prob: 0.8 # probability to mask phonemes
|
| 30 |
+
replace_prob: 0.1 # probability to replace phonemes
|
| 31 |
+
|
| 32 |
+
model_params:
|
| 33 |
+
pretrained_model: "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth"
|
| 34 |
+
hidden_size: 768
|
| 35 |
+
num_attention_heads: 12
|
| 36 |
+
intermediate_size: 2048
|
| 37 |
+
max_position_embeddings: 512
|
| 38 |
+
num_hidden_layers: 12
|
| 39 |
+
dropout: 0.1
|
models/ar/pl-bert (fadi77)/models/mlm_only_with_diacritics/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b32ed12e3a3f5a4d1da88e261c217e36a8f4500d7155fae8324efd6c5bf2803b
|
| 3 |
+
size 72568548
|
models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/config.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
preprocess_params:
|
| 2 |
+
tokenizer: "aubmindlab/bert-base-arabertv2"
|
| 3 |
+
num_shards: 10000
|
| 4 |
+
max_workers: 25
|
| 5 |
+
max_try_count: 3
|
| 6 |
+
timeout: 300
|
| 7 |
+
phonemizer_language: "ar"
|
| 8 |
+
preprocess_dir: "data/pl_bert"
|
| 9 |
+
hf_dataset_name: "wikimedia/wikipedia"
|
| 10 |
+
hf_dataset_split: "20231101.ar"
|
| 11 |
+
output_dir: "wikipedia_20231101.ar.processed"
|
| 12 |
+
|
| 13 |
+
training_params:
|
| 14 |
+
output_dir: "checkpoints"
|
| 15 |
+
mixed_precision: "fp16"
|
| 16 |
+
batch_size: 16
|
| 17 |
+
save_interval: 5000
|
| 18 |
+
log_interval: 10
|
| 19 |
+
num_process: 1 # number of GPUs
|
| 20 |
+
num_steps: 1000000
|
| 21 |
+
learning_rate: 3e-5
|
| 22 |
+
|
| 23 |
+
dataset_params:
|
| 24 |
+
word_separator: 87 # token idx used for word separation (W)
|
| 25 |
+
max_seq_length: 512 # max phoneme sequence length
|
| 26 |
+
word_pred_prob: 0.15 # probability to select word for prediction
|
| 27 |
+
phoneme_mask_prob: 0.8 # probability to mask phonemes
|
| 28 |
+
replace_prob: 0.1 # probability to replace phonemes
|
| 29 |
+
|
| 30 |
+
model_params:
|
| 31 |
+
hidden_size: 768
|
| 32 |
+
num_attention_heads: 12
|
| 33 |
+
intermediate_size: 2048
|
| 34 |
+
max_position_embeddings: 512
|
| 35 |
+
num_hidden_layers: 12
|
| 36 |
+
dropout: 0.1
|
models/ar/pl-bert (fadi77)/models/mlm_p2g_non_diacritics/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83c612ed18ac2283620496e11a01b95bf1432469ff2348fc58934af8b8fde888
|
| 3 |
+
size 709191470
|
models/en/mpbert (ydqmkkx)/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/en/mpbert (ydqmkkx)/PhonemeBERT.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4e22ebf7e9ae759c697ceb8fc837a58bcd3c76dd5a0692504fbdb6c9d2d51cf
|
| 3 |
+
size 747051
|
models/en/mpbert (ydqmkkx)/README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
datasets:
|
| 6 |
+
- bookcorpus
|
| 7 |
+
- wikipedia
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Mixed-Phoneme BERT
|
| 11 |
+
More information: https://github.com/ydqmkkx/PhonemeBERT
|
models/en/mpbert (ydqmkkx)/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"MpbertForPreTraining"
|
| 4 |
+
],
|
| 5 |
+
"vocab_size": 171,
|
| 6 |
+
"embedding_size": 768,
|
| 7 |
+
"hidden_size": 768,
|
| 8 |
+
"output_size": 768,
|
| 9 |
+
"intermediate_size": 3072,
|
| 10 |
+
"max_position_embeddings": 1024,
|
| 11 |
+
"num_attention_heads": 12,
|
| 12 |
+
"num_hidden_layers": 12,
|
| 13 |
+
"hidden_act": "gelu",
|
| 14 |
+
"attention_probs_dropout_prob": 0.1,
|
| 15 |
+
"hidden_dropout_prob": 0.1,
|
| 16 |
+
"layer_norm_eps": 1e-12,
|
| 17 |
+
"initializer_range": 0.02,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"position_biased_input": false,
|
| 20 |
+
"position_embedding_type": "relative_key_query",
|
| 21 |
+
"use_sup_phoneme": true,
|
| 22 |
+
"sup_phoneme_vocab_size": 30000,
|
| 23 |
+
"grapheme_max_position_embeddings": 1024,
|
| 24 |
+
"grapheme_vocab_size": 84481,
|
| 25 |
+
"use_cache": false,
|
| 26 |
+
"gradient_checkpointing": false,
|
| 27 |
+
"transformers_version": "4.41.2"
|
| 28 |
+
}
|
models/en/mpbert (ydqmkkx)/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e913f42be9014e088f50ed797132b0c04115429de10819f5c01ea38b800562d4
|
| 3 |
+
size 444135926
|
models/en/mpbert (ydqmkkx)/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/ydqmkkx/mpbert
|
models/en/plbert (ydqmkkx)/.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
models/en/plbert (ydqmkkx)/PhonemeBERT.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4e22ebf7e9ae759c697ceb8fc837a58bcd3c76dd5a0692504fbdb6c9d2d51cf
|
| 3 |
+
size 747051
|
models/en/plbert (ydqmkkx)/README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
datasets:
|
| 6 |
+
- bookcorpus
|
| 7 |
+
- wikipedia
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Phoneme-Level BERT
|
| 11 |
+
More information: https://github.com/ydqmkkx/PhonemeBERT
|
models/en/plbert (ydqmkkx)/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"PlbertForPreTraining"
|
| 4 |
+
],
|
| 5 |
+
"vocab_size": 171,
|
| 6 |
+
"embedding_size": 768,
|
| 7 |
+
"hidden_size": 768,
|
| 8 |
+
"output_size": 768,
|
| 9 |
+
"intermediate_size": 3072,
|
| 10 |
+
"max_position_embeddings": 1024,
|
| 11 |
+
"num_attention_heads": 12,
|
| 12 |
+
"num_hidden_layers": 12,
|
| 13 |
+
"hidden_act": "gelu",
|
| 14 |
+
"attention_probs_dropout_prob": 0.1,
|
| 15 |
+
"hidden_dropout_prob": 0.1,
|
| 16 |
+
"layer_norm_eps": 1e-12,
|
| 17 |
+
"initializer_range": 0.02,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"position_biased_input": false,
|
| 20 |
+
"position_embedding_type": "relative_key_query",
|
| 21 |
+
"use_sup_phoneme": false,
|
| 22 |
+
"sup_phoneme_vocab_size": 30000,
|
| 23 |
+
"grapheme_max_position_embeddings": 1024,
|
| 24 |
+
"grapheme_vocab_size": 84481,
|
| 25 |
+
"use_cache": false,
|
| 26 |
+
"gradient_checkpointing": false,
|
| 27 |
+
"transformers_version": "4.41.2"
|
| 28 |
+
}
|
models/en/plbert (ydqmkkx)/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:110f290b2adc0ac9c4994e494f61359ae45d91e1fe299d81ed69f86090d467cc
|
| 3 |
+
size 611719414
|
models/hi/fine-tuned-PL_BERT-hindi/.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
step_2500.t7 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
step_5000.t7 filter=lfs diff=lfs merge=lfs -text
|
models/hi/fine-tuned-PL_BERT-hindi/README.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PL-BERT Fine-Tuned on Hindi Wikipedia Dataset
|
| 2 |
+
|
| 3 |
+
This model is a fine-tuned version of **PL-BERT**, specifically trained on the Hindi subset of the Wiki40b dataset. The model has been optimized to understand and generate high-quality Hindi text, making it suitable for various NLP tasks in the Hindi language.
|
| 4 |
+
For more information about this model, check out the [GitHub](https://github.com/Ionio-io/PL-BERT-Fine-Tuned-hi-) repository.
|
| 5 |
+
|
| 6 |
+
## Model Overview
|
| 7 |
+
|
| 8 |
+
- **Model Name:** PL-BERT (Fine-tuned on Hindi)
|
| 9 |
+
- **Base Model:** PL-BERT (Multilingual BERT variant)
|
| 10 |
+
- **Dataset:** Hindi subset from Wiki40b (51,000 cleaned Wikipedia articles)
|
| 11 |
+
- **Precision:** Mixed precision (FP16)
|
| 12 |
+
|
| 13 |
+
The fine-tuning process focused on improving the model's ability to handle Hindi text more effectively by leveraging a large, cleaned corpus of Wikipedia articles in Hindi.
|
| 14 |
+
|
| 15 |
+
## Training Details
|
| 16 |
+
|
| 17 |
+
- **Model:** PL-BERT
|
| 18 |
+
- **Dataset:** Hindi subset from Wiki40b
|
| 19 |
+
- **Batch Size:** 64
|
| 20 |
+
- **Mixed Precision:** FP16
|
| 21 |
+
- **Optimizer:** AdamW
|
| 22 |
+
- **Training Steps:** 15,000
|
| 23 |
+
|
| 24 |
+
### Training Progress
|
| 25 |
+
|
| 26 |
+
- **Final Loss:** 1.879
|
| 27 |
+
- **Vocabulary Loss:** 0.49
|
| 28 |
+
- **Token Loss:** 1.465
|
| 29 |
+
|
| 30 |
+
### Validation Results
|
| 31 |
+
|
| 32 |
+
During training, we monitored performance with validation metrics:
|
| 33 |
+
|
| 34 |
+
- **Validation Loss:** 1.879
|
| 35 |
+
- **Vocabulary Accuracy:** 78.54%
|
| 36 |
+
- **Token Accuracy:** 82.30%
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
license: apache-2.0
|
| 41 |
+
---
|
models/hi/fine-tuned-PL_BERT-hindi/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/Ionio-ai/fine-tuned-PL_BERT-hindi
|
models/hi/fine-tuned-PL_BERT-hindi/step_2500.t7
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32fa987b7a122dbdd18a2669065cc155b0b965cb2791dd31ff671b1bf26063ec
|
| 3 |
+
size 87387147
|
models/hi/fine-tuned-PL_BERT-hindi/step_5000.t7
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fd955319e7f7bd1c0bbc81e7819810d50ddd0dcd2ea98fd63ad0d0049ba5807
|
| 3 |
+
size 87387147
|
models/multi/multilingual-pl-bert/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
step_1100000.t7 filter=lfs diff=lfs merge=lfs -text
|
models/multi/multilingual-pl-bert/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
datasets:
|
| 4 |
+
- styletts2-community/multilingual-phonemes-10k-alpha
|
| 5 |
+
language:
|
| 6 |
+
- fr
|
| 7 |
+
- en
|
| 8 |
+
- es
|
| 9 |
+
- ca
|
| 10 |
+
- de
|
| 11 |
+
- el
|
| 12 |
+
- fa
|
| 13 |
+
- fi
|
| 14 |
+
- pt
|
| 15 |
+
- pl
|
| 16 |
+
- ru
|
| 17 |
+
- sv
|
| 18 |
+
- uk
|
| 19 |
+
- zh
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
# Multilingual PL-BERT checkpoint
|
| 23 |
+
|
| 24 |
+
The checkpoint open-sourced here is trained by Papercup using the open-source PL-BERT model found here https://github.com/yl4579/PL-BERT. It is trained to be supported by StyleTTS2, which can be found here: https://github.com/yl4579/StyleTTS2. You can see in the model card the languages that it has been trained on (the languages correspond to the crowdsourced dataset found here https://huggingface.co/datasets/styletts2-community/multilingual-phonemes-10k-alpha).
|
| 25 |
+
|
| 26 |
+
Notable differences compared to the default PL-BERT checkpoint and config available [here](https://github.com/yl4579/StyleTTS2/tree/main/Utils/PLBERT):
|
| 27 |
+
* Because we are working with many languages, we are using a different tokenizer now: `bert-base-multilingual-cased`.
|
| 28 |
+
* The PL-BERT model was trained on the data obtained from `styletts2-community/multilingual-phonemes-10k-alpha` for 1.1M iterations.
|
| 29 |
+
* The `token_maps.pkl` file has changed (also open-sourced here).
|
| 30 |
+
* We have changed the `util.py` file to deal with an error when loading `new_state_dict["embeddings.position_ids"]`.
|
| 31 |
+
|
| 32 |
+
## How do I train StyleTTS2 with this new PL-BERT checkpoint?
|
| 33 |
+
|
| 34 |
+
* Simply create a new folder under `Utils` in your StyleTTS2 repository. Call it, for example, `PLBERT_all_languages`.
|
| 35 |
+
* Copy `config.yml`, `step_1100000.t7` and `util.py` into it.
|
| 36 |
+
* Then, in your StyleTTS2 config file, change `PLBERT_dir` to `Utils/PLBERT_all_languages`. You will also need to change your import as such:
|
| 37 |
+
* Change `from Utils.PLBERT.util import load_plbert`
|
| 38 |
+
* To `from Utils.PLBERT_all_languages.util import load_plbert`
|
| 39 |
+
* Alternatively, you can just replace the relevant files in `Utils/PLBERT` and not have to change any code.
|
| 40 |
+
* Now, you need to create train and validation files. You will need to use `espeak` to create a file in the same format as the ones that exist in the `Data` folder of the StyleTTS2 repository. Be careful: you will need to change the `language` argument to phonemise your text if it's not in English. You can find the correct language codes [here](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md). For example, Latin American Spanish is `es-419`.
|
| 41 |
+
|
| 42 |
+
Voila, you can now train a multilingual StyleTTS2 model!
|
| 43 |
+
|
| 44 |
+
Thank you to Aaron (Yinghao) Li for these contributions.
|
models/multi/multilingual-pl-bert/config.yml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_dir: "Checkpoint_all_phonemes"
|
| 2 |
+
mixed_precision: "fp16"
|
| 3 |
+
data_folder: "wikipedia_20220301.en.processed"
|
| 4 |
+
batch_size: 32
|
| 5 |
+
save_interval: 20000
|
| 6 |
+
log_interval: 10
|
| 7 |
+
num_process: 1 # number of GPUs
|
| 8 |
+
num_steps: 2000000
|
| 9 |
+
|
| 10 |
+
dataset_params:
|
| 11 |
+
tokenizer: "bert-base-multilingual-cased"
|
| 12 |
+
token_separator: " " # token used for phoneme separator (space)
|
| 13 |
+
token_mask: "M" # token used for phoneme mask (M)
|
| 14 |
+
word_separator: 102 # token used for word separator ([SEP])
|
| 15 |
+
token_maps: "token_maps.pkl" # token map path
|
| 16 |
+
|
| 17 |
+
max_mel_length: 512 # max phoneme length
|
| 18 |
+
|
| 19 |
+
word_mask_prob: 0.15 # probability to mask the entire word
|
| 20 |
+
phoneme_mask_prob: 0.1 # probability to mask each phoneme
|
| 21 |
+
replace_prob: 0.2 # probability to replace phonemes
|
| 22 |
+
|
| 23 |
+
model_params:
|
| 24 |
+
vocab_size: 178
|
| 25 |
+
hidden_size: 768
|
| 26 |
+
num_attention_heads: 12
|
| 27 |
+
intermediate_size: 2048
|
| 28 |
+
max_position_embeddings: 512
|
| 29 |
+
num_hidden_layers: 12
|
| 30 |
+
dropout: 0.1
|
models/multi/multilingual-pl-bert/issues.txt
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
===========================================================
|
| 2 |
+
[#2] How to inference it?
|
| 3 |
+
===========================================================
|
| 4 |
+
|
| 5 |
+
jonathansilvasantos, 20.03.2024
|
| 6 |
+
Q: Can someone share an inference script for testing this model independently? I'm exploring how to adapt FastPitch for compatibility with this model or other less computationally demanding text-to-speech (TTS) models.
|
| 7 |
+
|
| 8 |
+
raph30370 (Papercup AI org), 27.03.2024
|
| 9 |
+
A: I recommend you check out the StyleTTS2 repo on how to do inference with PL-BERT. Specifically in the train_second.py file for example, you can look at plbert = load_plbert(BERT_path) as a starting point in that script.
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
===========================================================
|
| 13 |
+
[#4] Language
|
| 14 |
+
===========================================================
|
| 15 |
+
|
| 16 |
+
YoussefBarakat, 02.05.2024
|
| 17 |
+
Q: If i'm training a new language does it have to be phonemized with e-speak?
|
| 18 |
+
|
| 19 |
+
raph30370 (Papercup AI org), 02.05.2024
|
| 20 |
+
A: I think that's how all of these languages here have been phonemized. If you're training everything together, they probably have to be under the same phoneme space, so yeah probably, unless you phonemize everything else with a new phonemizer as well.
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
===========================================================
|
| 24 |
+
[#5] Tokenizer Type
|
| 25 |
+
===========================================================
|
| 26 |
+
|
| 27 |
+
abhinand, 15.01.2025
|
| 28 |
+
Q: I notice that the vocabulary contains sub-word tokens.
|
| 29 |
+
|
| 30 |
+
...
|
| 31 |
+
{'word': '##م', 'token': 7030},
|
| 32 |
+
{'word': '##di', 'token': 7033},
|
| 33 |
+
{'word': '##kan', 'token': 7036},
|
| 34 |
+
{'word': '##ek', 'token': 7037},
|
| 35 |
+
{'word': '##ak', 'token': 7040},
|
| 36 |
+
{'word': '##ı', 'token': 7042},
|
| 37 |
+
{'word': '##lo', 'token': 7044},
|
| 38 |
+
{'word': '##ung', 'token': 7045},
|
| 39 |
+
...
|
| 40 |
+
|
| 41 |
+
But for PL-BERT, isn't it necessary to have 'word-level' tokens? Or am I misinterpreting here?
|
| 42 |
+
|
| 43 |
+
Another reference thread which discussed this -> https://github.com/yl4579/StyleTTS2/issues/286#issuecomment-2383835836
|
| 44 |
+
|
| 45 |
+
So does this mean I can use a tokenizer like Indic-BERT to build my token_maps for Indic-PL-BERT??
|
| 46 |
+
|
models/multi/multilingual-pl-bert/languages.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
French
|
| 2 |
+
English
|
| 3 |
+
Spanish
|
| 4 |
+
Catalan
|
| 5 |
+
German
|
| 6 |
+
Greek
|
| 7 |
+
Persian
|
| 8 |
+
Finnish
|
| 9 |
+
Portuguese
|
| 10 |
+
Polish
|
| 11 |
+
Russian
|
| 12 |
+
Swedish
|
| 13 |
+
Ukrainian
|
| 14 |
+
Chinese
|
models/multi/multilingual-pl-bert/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/papercup-ai/multilingual-pl-bert
|
models/multi/multilingual-pl-bert/step_1100000.t7
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e920e029e5226284f6042bef36514323c7fd0ae73c59e0ed4ccecd1f2916fd1
|
| 3 |
+
size 25179228
|
models/multi/multilingual-pl-bert/token_maps.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdb2b72e90be652f176c44fe4b101b897167bf1cd669ad76e11e9400928cc848
|
| 3 |
+
size 1858692
|
models/multi/multilingual-pl-bert/util.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import yaml
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import AlbertConfig, AlbertModel
|
| 5 |
+
|
| 6 |
+
class CustomAlbert(AlbertModel):
    """ALBERT model whose forward pass yields only the final hidden states."""

    def forward(self, *args, **kwargs):
        """Run the parent ``forward`` and return its ``last_hidden_state``.

        All positional and keyword arguments are passed through unchanged to
        ``AlbertModel.forward``; every other field of the parent's output is
        discarded.
        """
        return super().forward(*args, **kwargs).last_hidden_state
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def load_plbert(log_dir):
    """Build a ``CustomAlbert`` and load the newest PL-BERT checkpoint into it.

    Args:
        log_dir: Directory containing ``config.yml`` (with a ``model_params``
            mapping) and one or more ``step_<N>.t7`` checkpoint files.

    Returns:
        A ``CustomAlbert`` instance with the encoder weights of the
        highest-numbered checkpoint loaded (on CPU).

    Raises:
        FileNotFoundError: if ``config.yml`` is missing or ``log_dir``
            contains no ``step_<N>.t7`` checkpoint.
    """
    config_path = os.path.join(log_dir, "config.yml")
    # Use a context manager so the config file handle is closed promptly.
    with open(config_path, encoding="utf-8") as config_file:
        plbert_config = yaml.safe_load(config_file)

    bert = CustomAlbert(AlbertConfig(**plbert_config['model_params']))

    checkpoint_path = _find_latest_checkpoint(log_dir)
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only point this at checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # strict=False: keys of the model that the checkpoint does not provide
    # are left at their initial values instead of raising.
    bert.load_state_dict(_encoder_state_dict(checkpoint['net']), strict=False)

    return bert


def _find_latest_checkpoint(log_dir):
    """Return the path of the ``step_<N>.t7`` file in *log_dir* with the largest N."""
    prefix, suffix = "step_", ".t7"
    candidates = []
    for fname in os.listdir(log_dir):
        if not (fname.startswith(prefix) and fname.endswith(suffix)):
            continue
        step = fname[len(prefix):-len(suffix)]
        path = os.path.join(log_dir, fname)
        # Ignore directories and names whose step part is not a plain number
        # (e.g. "step_final.t7"), which would otherwise break int().
        if step.isdecimal() and os.path.isfile(path):
            # Keep the real path so zero-padded names still resolve.
            candidates.append((int(step), path))

    if not candidates:
        raise FileNotFoundError(
            f"No 'step_<N>.t7' checkpoint found in {log_dir!r}"
        )
    return max(candidates)[1]


def _encoder_state_dict(state_dict):
    """Extract the ``encoder.`` weights from a checkpoint, prefixes removed."""
    from collections import OrderedDict

    module_prefix, encoder_prefix = "module.", "encoder."
    encoder_weights = OrderedDict()
    for key, value in state_dict.items():
        # Strip `module.` only when it is actually there (presumably added by
        # a DataParallel-style wrapper at training time -- TODO confirm);
        # blindly dropping 7 characters would corrupt unwrapped keys.
        if key.startswith(module_prefix):
            key = key[len(module_prefix):]
        # Only entries under `encoder.` are kept; with strict=False any other
        # entry would be an unexpected key that loading ignores anyway.
        if key.startswith(encoder_prefix):
            encoder_weights[key[len(encoder_prefix):]] = value

    # Not every checkpoint carries this entry, so its absence is not an error.
    encoder_weights.pop("embeddings.position_ids", None)
    return encoder_weights
|