oronkam123 committed on
Commit
eabd695
·
verified ·
1 Parent(s): f93fce4

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. pt-create.py +22 -0
  2. train.ipynb +605 -0
  3. train.py +201 -0
  4. vocab.txt +181 -0
  5. weights/pl-bert-best.pt +3 -0
pt-create.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertForMaskedLM, BertConfig
2
+ import torch
3
+
4
+ # 1) Define exactly the same config you used for training
5
+ config = BertConfig(
6
+ vocab_size = 181,
7
+ hidden_size = 768,
8
+ num_hidden_layers = 12,
9
+ num_attention_heads = 12,
10
+ intermediate_size = 2048,
11
+ max_position_embeddings = 512,
12
+ hidden_dropout_prob = 0.1,
13
+ attention_probs_dropout_prob= 0.1,
14
+ )
15
+
16
+ # 2) Load the model from that checkpoint folder
17
+ checkpoint_dir = "/dev/hdd/Users/Oron/tts/pl-bert/pl-bert/checkpoint-746" # adjust to your actual path
18
+ model = BertForMaskedLM.from_pretrained(checkpoint_dir, config=config)
19
+
20
+ # 3) Save a raw .pt of its weights
21
+ torch.save(model.state_dict(), "pl-bert-interrupted.pt")
22
+ print("Saved interrupted-state weights to pl-bert-interrupted.pt")
train.ipynb ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
15
+ " warnings.warn(\n"
16
+ ]
17
+ },
18
+ {
19
+ "data": {
20
+ "text/html": [
21
+ "\n",
22
+ " <div>\n",
23
+ " \n",
24
+ " <progress value='11160' max='15240' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
25
+ " [11160/15240 45:51 < 16:46, 4.06 it/s, Epoch 7.32/10]\n",
26
+ " </div>\n",
27
+ " <table border=\"1\" class=\"dataframe\">\n",
28
+ " <thead>\n",
29
+ " <tr style=\"text-align: left;\">\n",
30
+ " <th>Epoch</th>\n",
31
+ " <th>Training Loss</th>\n",
32
+ " <th>Validation Loss</th>\n",
33
+ " <th>Accuracy</th>\n",
34
+ " <th>Perplexity</th>\n",
35
+ " </tr>\n",
36
+ " </thead>\n",
37
+ " <tbody>\n",
38
+ " <tr>\n",
39
+ " <td>1</td>\n",
40
+ " <td>0.604400</td>\n",
41
+ " <td>0.605723</td>\n",
42
+ " <td>0.726062</td>\n",
43
+ " <td>1.833437</td>\n",
44
+ " </tr>\n",
45
+ " <tr>\n",
46
+ " <td>2</td>\n",
47
+ " <td>0.554500</td>\n",
48
+ " <td>0.550530</td>\n",
49
+ " <td>0.744238</td>\n",
50
+ " <td>1.735228</td>\n",
51
+ " </tr>\n",
52
+ " <tr>\n",
53
+ " <td>3</td>\n",
54
+ " <td>0.524500</td>\n",
55
+ " <td>0.524182</td>\n",
56
+ " <td>0.757067</td>\n",
57
+ " <td>1.689540</td>\n",
58
+ " </tr>\n",
59
+ " <tr>\n",
60
+ " <td>4</td>\n",
61
+ " <td>0.242700</td>\n",
62
+ " <td>0.179025</td>\n",
63
+ " <td>0.947226</td>\n",
64
+ " <td>1.196386</td>\n",
65
+ " </tr>\n",
66
+ " <tr>\n",
67
+ " <td>5</td>\n",
68
+ " <td>0.152300</td>\n",
69
+ " <td>0.146584</td>\n",
70
+ " <td>0.953116</td>\n",
71
+ " <td>1.158002</td>\n",
72
+ " </tr>\n",
73
+ " <tr>\n",
74
+ " <td>6</td>\n",
75
+ " <td>0.141600</td>\n",
76
+ " <td>0.151233</td>\n",
77
+ " <td>0.952377</td>\n",
78
+ " <td>1.163258</td>\n",
79
+ " </tr>\n",
80
+ " <tr>\n",
81
+ " <td>7</td>\n",
82
+ " <td>0.140000</td>\n",
83
+ " <td>0.142038</td>\n",
84
+ " <td>0.954711</td>\n",
85
+ " <td>1.152668</td>\n",
86
+ " </tr>\n",
87
+ " </tbody>\n",
88
+ "</table><p>"
89
+ ],
90
+ "text/plain": [
91
+ "<IPython.core.display.HTML object>"
92
+ ]
93
+ },
94
+ "metadata": {},
95
+ "output_type": "display_data"
96
+ },
97
+ {
98
+ "name": "stderr",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
102
+ " warnings.warn(\n",
103
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
104
+ " warnings.warn(\n",
105
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
106
+ " warnings.warn(\n",
107
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
108
+ " warnings.warn(\n",
109
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
110
+ " warnings.warn(\n",
111
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
112
+ " warnings.warn(\n",
113
+ "/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
114
+ " warnings.warn(\n"
115
+ ]
116
+ },
117
+ {
118
+ "ename": "KeyboardInterrupt",
119
+ "evalue": "",
120
+ "output_type": "error",
121
+ "traceback": [
122
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
123
+ "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
124
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 198\u001b[39m\n\u001b[32m 188\u001b[39m trainer = Trainer(\n\u001b[32m 189\u001b[39m model=model,\n\u001b[32m 190\u001b[39m args=training_args,\n\u001b[32m (...)\u001b[39m\u001b[32m 194\u001b[39m compute_metrics=compute_metrics,\n\u001b[32m 195\u001b[39m )\n\u001b[32m 197\u001b[39m \u001b[38;5;66;03m# 10. Train & save\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m198\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 199\u001b[39m trainer.save_model(\u001b[33m\"\u001b[39m\u001b[33mpl-bert-final\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 200\u001b[39m tokenizer.save_pretrained(\u001b[33m\"\u001b[39m\u001b[33mpl-bert-final\u001b[39m\u001b[33m\"\u001b[39m)\n",
125
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/transformers/trainer.py:2206\u001b[39m, in \u001b[36mTrainer.train\u001b[39m\u001b[34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[39m\n\u001b[32m 2204\u001b[39m hf_hub_utils.enable_progress_bars()\n\u001b[32m 2205\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m2206\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2207\u001b[39m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m=\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2208\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2209\u001b[39m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2210\u001b[39m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m=\u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2211\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
126
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/transformers/trainer.py:2502\u001b[39m, in \u001b[36mTrainer._inner_training_loop\u001b[39m\u001b[34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[39m\n\u001b[32m 2500\u001b[39m update_step += \u001b[32m1\u001b[39m\n\u001b[32m 2501\u001b[39m num_batches = args.gradient_accumulation_steps \u001b[38;5;28;01mif\u001b[39;00m update_step != (total_updates - \u001b[32m1\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m remainder\n\u001b[32m-> \u001b[39m\u001b[32m2502\u001b[39m batch_samples, num_items_in_batch = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_batch_samples\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch_iterator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_batches\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2503\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i, inputs \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(batch_samples):\n\u001b[32m 2504\u001b[39m step += \u001b[32m1\u001b[39m\n",
127
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/transformers/trainer.py:5300\u001b[39m, in \u001b[36mTrainer.get_batch_samples\u001b[39m\u001b[34m(self, epoch_iterator, num_batches, device)\u001b[39m\n\u001b[32m 5298\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_batches):\n\u001b[32m 5299\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m5300\u001b[39m batch_samples.append(\u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mepoch_iterator\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[32m 5301\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m:\n\u001b[32m 5302\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
128
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/accelerate/data_loader.py:578\u001b[39m, in \u001b[36mDataLoaderShard.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 576\u001b[39m current_batch = send_to_device(current_batch, \u001b[38;5;28mself\u001b[39m.device, non_blocking=\u001b[38;5;28mself\u001b[39m._non_blocking)\n\u001b[32m 577\u001b[39m \u001b[38;5;28mself\u001b[39m._update_state_dict()\n\u001b[32m--> \u001b[39m\u001b[32m578\u001b[39m next_batch = \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdataloader_iter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 579\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m batch_index >= \u001b[38;5;28mself\u001b[39m.skip_batches:\n\u001b[32m 580\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m current_batch\n",
129
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py:733\u001b[39m, in \u001b[36m_BaseDataLoaderIter.__next__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 730\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 731\u001b[39m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[32m 732\u001b[39m \u001b[38;5;28mself\u001b[39m._reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m733\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 734\u001b[39m \u001b[38;5;28mself\u001b[39m._num_yielded += \u001b[32m1\u001b[39m\n\u001b[32m 735\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[32m 736\u001b[39m \u001b[38;5;28mself\u001b[39m._dataset_kind == _DatasetKind.Iterable\n\u001b[32m 737\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m._IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 738\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m._num_yielded > \u001b[38;5;28mself\u001b[39m._IterableDataset_len_called\n\u001b[32m 739\u001b[39m ):\n",
130
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py:789\u001b[39m, in \u001b[36m_SingleProcessDataLoaderIter._next_data\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 787\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 788\u001b[39m index = \u001b[38;5;28mself\u001b[39m._next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m789\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[32m 790\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._pin_memory:\n\u001b[32m 791\u001b[39m data = _utils.pin_memory.pin_memory(data, \u001b[38;5;28mself\u001b[39m._pin_memory_device)\n",
131
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py:50\u001b[39m, in \u001b[36m_MapDatasetFetcher.fetch\u001b[39m\u001b[34m(self, possibly_batched_index)\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.auto_collation:\n\u001b[32m 49\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m.dataset, \u001b[33m\"\u001b[39m\u001b[33m__getitems__\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.dataset.__getitems__:\n\u001b[32m---> \u001b[39m\u001b[32m50\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43m__getitems__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpossibly_batched_index\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 51\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 52\u001b[39m data = [\u001b[38;5;28mself\u001b[39m.dataset[idx] \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n",
132
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/arrow_dataset.py:2863\u001b[39m, in \u001b[36mDataset.__getitems__\u001b[39m\u001b[34m(self, keys)\u001b[39m\n\u001b[32m 2861\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__getitems__\u001b[39m(\u001b[38;5;28mself\u001b[39m, keys: \u001b[38;5;28mlist\u001b[39m) -> \u001b[38;5;28mlist\u001b[39m:\n\u001b[32m 2862\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Can be used to get a batch using a list of integers indices.\"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m2863\u001b[39m batch = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[34;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2864\u001b[39m n_examples = \u001b[38;5;28mlen\u001b[39m(batch[\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28miter\u001b[39m(batch))])\n\u001b[32m 2865\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m [{col: array[i] \u001b[38;5;28;01mfor\u001b[39;00m col, array \u001b[38;5;129;01min\u001b[39;00m batch.items()} \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n_examples)]\n",
133
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/arrow_dataset.py:2859\u001b[39m, in \u001b[36mDataset.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 2857\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._format_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._format_type \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m (\u001b[33m\"\u001b[39m\u001b[33marrow\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mpandas\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mpolars\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 2858\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m Column(\u001b[38;5;28mself\u001b[39m, key)\n\u001b[32m-> \u001b[39m\u001b[32m2859\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n",
134
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/arrow_dataset.py:2841\u001b[39m, in \u001b[36mDataset._getitem\u001b[39m\u001b[34m(self, key, **kwargs)\u001b[39m\n\u001b[32m 2839\u001b[39m formatter = get_formatter(format_type, features=\u001b[38;5;28mself\u001b[39m._info.features, **format_kwargs)\n\u001b[32m 2840\u001b[39m pa_subtable = query_table(\u001b[38;5;28mself\u001b[39m._data, key, indices=\u001b[38;5;28mself\u001b[39m._indices)\n\u001b[32m-> \u001b[39m\u001b[32m2841\u001b[39m formatted_output = \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2842\u001b[39m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m=\u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[32m 2843\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2844\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
135
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/formatting/formatting.py:657\u001b[39m, in \u001b[36mformat_table\u001b[39m\u001b[34m(table, key, formatter, format_columns, output_all_columns)\u001b[39m\n\u001b[32m 655\u001b[39m python_formatter = PythonFormatter(features=formatter.features)\n\u001b[32m 656\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m657\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 658\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 659\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
136
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/formatting/formatting.py:414\u001b[39m, in \u001b[36mFormatter.__call__\u001b[39m\u001b[34m(self, pa_table, query_type)\u001b[39m\n\u001b[32m 412\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.format_column(pa_table)\n\u001b[32m 413\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mbatch\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m414\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mformat_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n",
137
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/formatting/formatting.py:470\u001b[39m, in \u001b[36mPythonFormatter.format_batch\u001b[39m\u001b[34m(self, pa_table)\u001b[39m\n\u001b[32m 468\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.lazy:\n\u001b[32m 469\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m LazyBatch(pa_table, \u001b[38;5;28mself\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m470\u001b[39m batch = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpython_arrow_extractor\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mextract_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 471\u001b[39m batch = \u001b[38;5;28mself\u001b[39m.python_features_decoder.decode_batch(batch)\n\u001b[32m 472\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m batch\n",
138
+ "\u001b[36mFile \u001b[39m\u001b[32m/dev/hdd/Users/Oron/tts/tts-venv/lib/python3.12/site-packages/datasets/formatting/formatting.py:149\u001b[39m, in \u001b[36mPythonArrowExtractor.extract_batch\u001b[39m\u001b[34m(self, pa_table)\u001b[39m\n\u001b[32m 148\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mextract_batch\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa.Table) -> \u001b[38;5;28mdict\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m149\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpa_table\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_pydict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
139
+ "\u001b[31mKeyboardInterrupt\u001b[39m: "
140
+ ]
141
+ }
142
+ ],
143
+ "source": [
144
+ "#!/usr/bin/env python3\n",
145
+ "# pl-bert_training.py\n",
146
+ "\n",
147
+ "from datasets import load_dataset\n",
148
+ "from transformers import (\n",
149
+ " BertTokenizerFast,\n",
150
+ " BertConfig,\n",
151
+ " BertForMaskedLM,\n",
152
+ " DataCollatorForLanguageModeling,\n",
153
+ " Trainer,\n",
154
+ " TrainingArguments\n",
155
+ ")\n",
156
+ "from collections import Counter\n",
157
+ "import numpy as np\n",
158
+ "import math\n",
159
+ "from evaluate import load # use the 'evaluate' library for metrics\n",
160
+ "\n",
161
+ "# 1. Load exactly the first 5,000,000 lines and slice into train (0-4,699,999) and eval (4,700,000-4,999,999)\n",
162
+ "full_ds = load_dataset(\"thewh1teagle/phonikud-phonemes-data\", split=\"train[:5000000]\")\n",
163
+ "ds_train = full_ds.select(range(0, 4700000)) # first 4.7M examples\n",
164
+ "ds_eval = full_ds.select(range(4700000, 5000000)) # last 300k examples\n",
165
+ "\n",
166
+ "# 2. Split each raw line (text\\tphonemes) into its own column\n",
167
+ "def split_tab(examples):\n",
168
+ " heb, phon = [], []\n",
169
+ " for line in examples[\"text\"]:\n",
170
+ " h, p = line.split(\"\\t\")\n",
171
+ " heb.append(h)\n",
172
+ " phon.append(p)\n",
173
+ " return {\"hebrew\": heb, \"phonemes\": phon}\n",
174
+ "\n",
175
+ "# apply split_tab to both splits\n",
176
+ "ds_train = ds_train.map(\n",
177
+ " split_tab,\n",
178
+ " batched=True,\n",
179
+ " remove_columns=[\"text\"],\n",
180
+ ")\n",
181
+ "ds_eval = ds_eval.map(\n",
182
+ " split_tab,\n",
183
+ " batched=True,\n",
184
+ " remove_columns=[\"text\"],\n",
185
+ ")\n",
186
+ "\n",
187
+ "# 3. Build vocab from provided symbol sets\n",
188
+ "punctuation = ';:,.!?¡¿—…\"«»“” ’'\n",
189
+ "letters = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\"\n",
190
+ "letters_ipa = (\n",
191
+ " \"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯ혂ŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ\"\n",
192
+ ")\n",
193
+ "extend = \"\"\n",
194
+ "specials = [\"[PAD]\",\"[UNK]\",\"[CLS]\",\"[SEP]\",\"[MASK]\"]\n",
195
+ "all_chars = list(punctuation) + list(letters) + list(letters_ipa) + list(extend)\n",
196
+ "seen = set(); vocab_chars = []\n",
197
+ "for c in all_chars:\n",
198
+ " if c not in seen:\n",
199
+ " seen.add(c)\n",
200
+ " vocab_chars.append(c)\n",
201
+ "with open(\"vocab.txt\", \"w\", encoding=\"utf-8\") as vf:\n",
202
+ " for tok in specials:\n",
203
+ " vf.write(tok + \"\\n\")\n",
204
+ " for c in vocab_chars:\n",
205
+ " vf.write(c + \"\\n\")\n",
206
+ "\n",
207
+ "# 4. Init tokenizer & config\n",
208
+ "tokenizer = BertTokenizerFast(\n",
209
+ " vocab_file=\"vocab.txt\",\n",
210
+ " unk_token=\"[UNK]\", pad_token=\"[PAD]\", cls_token=\"[CLS]\",\n",
211
+ " sep_token=\"[SEP]\", mask_token=\"[MASK]\",\n",
212
+ " do_lower_case=False, strip_accents=False, tokenize_chinese_chars=False,\n",
213
+ ")\n",
214
+ "# config = BertConfig(\n",
215
+ "# vocab_size=len(tokenizer),\n",
216
+ "# hidden_size=256,\n",
217
+ "# num_hidden_layers=6,\n",
218
+ "# num_attention_heads=8,\n",
219
+ "# intermediate_size=1024,\n",
220
+ "# max_position_embeddings=512,\n",
221
+ "# )\n",
222
+ "\n",
223
+ "from transformers import BertConfig\n",
224
+ "\n",
225
+ "# config = BertConfig(\n",
226
+ "# vocab_size=len(tokenizer), # your vocab size\n",
227
+ "# hidden_size=512, # ← was 256\n",
228
+ "# num_hidden_layers=6, # same depth\n",
229
+ "# num_attention_heads=8, # 512 % 8 == 0 → 64‑dim per head\n",
230
+ "# intermediate_size=2048, # typically 4× hidden_size\n",
231
+ "# max_position_embeddings=512,\n",
232
+ "# )\n",
233
+ "\n",
234
+ "config = BertConfig(\n",
235
+ " vocab_size = len(tokenizer), # your phoneme vocab (181 entries in vocab.txt)\n",
236
+ " hidden_size = 768, # must match TTS hidden_size\n",
237
+ " num_hidden_layers = 12, # must match TTS num_hidden_layers\n",
238
+ " num_attention_heads = 12, # must match TTS num_attention_heads\n",
239
+ " intermediate_size = 2048, # feed-forward width; smaller than the typical 4× hidden_size (3072)\n",
240
+ " max_position_embeddings = 512, # same as TTS max_position_embeddings\n",
241
+ " hidden_dropout_prob = 0.1, # dropout for embeddings & attention\n",
242
+ " attention_probs_dropout_prob = 0.1, # dropout for attention scores\n",
243
+ ")\n",
244
+ "\n",
245
+ "\n",
246
+ "# config = BertConfig(\n",
247
+ "# num_hidden_layers=12,\n",
248
+ "# hidden_size=768,\n",
249
+ "# intermediate_size=2048,\n",
250
+ "# num_attention_heads=12,\n",
251
+ "# max_position_embeddings=512,\n",
252
+ "# )\n",
253
+ "\n",
254
+ "\n",
255
+ "# 5. Tokenize phonemes\n",
256
+ "def tokenize_fn(examples):\n",
257
+ " return tokenizer(\n",
258
+ " examples[\"phonemes\"],\n",
259
+ " return_attention_mask=True,\n",
260
+ " add_special_tokens=True,\n",
261
+ " )\n",
262
+ "\n",
263
+ "tokenized_train = ds_train.map(\n",
264
+ " tokenize_fn,\n",
265
+ " batched=True,\n",
266
+ " remove_columns=[\"hebrew\",\"phonemes\"]\n",
267
+ ")\n",
268
+ "tokenized_eval = ds_eval.map(\n",
269
+ " tokenize_fn,\n",
270
+ " batched=True,\n",
271
+ " remove_columns=[\"hebrew\",\"phonemes\"]\n",
272
+ ")\n",
273
+ "\n",
274
+ "# 6. Chunk into fixed-length blocks for MLM\n",
275
+ "block_size = 128\n",
276
+ "def group_texts(examples):\n",
277
+ " all_ids = sum(examples[\"input_ids\"], [])\n",
278
+ " result = {\"input_ids\":[], \"attention_mask\":[]}\n",
279
+ " for i in range(0, len(all_ids) - block_size + 1, block_size):\n",
280
+ " chunk = all_ids[i : i + block_size]\n",
281
+ " result[\"input_ids\"].append(chunk)\n",
282
+ " result[\"attention_mask\"].append([1] * block_size)\n",
283
+ " return result\n",
284
+ "\n",
285
+ "lm_train = tokenized_train.map(\n",
286
+ " group_texts,\n",
287
+ " batched=True,\n",
288
+ " remove_columns=list(tokenized_train.column_names),\n",
289
+ ")\n",
290
+ "lm_eval = tokenized_eval.map(\n",
291
+ " group_texts,\n",
292
+ " batched=True,\n",
293
+ " remove_columns=list(tokenized_eval.column_names),\n",
294
+ ")\n",
295
+ "\n",
296
+ "# 7. Data collator for MLM\n",
297
+ "data_collator = DataCollatorForLanguageModeling(\n",
298
+ " tokenizer=tokenizer, mlm=True, mlm_probability=0.15\n",
299
+ ")\n",
300
+ "\n",
301
+ "# 8. Metrics for masked-token accuracy + perplexity\n",
302
+ "accuracy_metric = load(\"accuracy\")\n",
303
+ "def compute_metrics(eval_pred):\n",
304
+ " logits, labels = eval_pred.predictions, eval_pred.label_ids\n",
305
+ " logits = logits.reshape(-1, logits.shape[-1])\n",
306
+ " labels = labels.reshape(-1)\n",
307
+ " mask = labels != -100\n",
308
+ "\n",
309
+ " preds = np.argmax(logits, axis=-1)\n",
310
+ " acc = accuracy_metric.compute(\n",
311
+ " predictions=preds[mask], references=labels[mask]\n",
312
+ " )[\"accuracy\"]\n",
313
+ "\n",
314
+ " max_logits = np.max(logits[mask], axis=-1, keepdims=True)\n",
315
+ " stable = logits[mask] - max_logits\n",
316
+ " logsumexp = max_logits.flatten() + np.log(np.exp(stable).sum(axis=-1))\n",
317
+ " true_logits = logits[mask, labels[mask]]\n",
318
+ " xent = -np.mean(true_logits - logsumexp)\n",
319
+ " ppl = float(np.exp(xent))\n",
320
+ "\n",
321
+ " return {\"accuracy\": acc, \"perplexity\": ppl}\n",
322
+ "\n",
323
+ "# 9. Model & Trainer\n",
324
+ "model = BertForMaskedLM(config)\n",
325
+ "training_args = TrainingArguments(\n",
326
+ " output_dir=\"pl-bert\",\n",
327
+ " overwrite_output_dir=True,\n",
328
+ " num_train_epochs=10,\n",
329
+ " per_device_train_batch_size=96,\n",
330
+ " warmup_steps=400,\n",
331
+ " per_device_eval_batch_size=196,\n",
332
+ " logging_strategy=\"steps\",\n",
333
+ " eval_strategy=\"epoch\",\n",
334
+ " save_strategy=\"epoch\",\n",
335
+ " logging_steps=25,\n",
336
+ " learning_rate=1e-5,\n",
337
+ " weight_decay=0.001,\n",
338
+ " push_to_hub=False,\n",
339
+ " # no_cuda=True,\n",
340
+ " eval_accumulation_steps=1,\n",
341
+ " save_total_limit=3,\n",
342
+ ")\n",
343
+ "trainer = Trainer(\n",
344
+ " model=model,\n",
345
+ " args=training_args,\n",
346
+ " data_collator=data_collator,\n",
347
+ " train_dataset=lm_train,\n",
348
+ " eval_dataset=lm_eval,\n",
349
+ " compute_metrics=compute_metrics,\n",
350
+ ")\n",
351
+ "\n",
352
+ "# 10. Train & save\n",
353
+ "trainer.train()\n",
354
+ "trainer.save_model(\"pl-bert-final\")\n",
355
+ "tokenizer.save_pretrained(\"pl-bert-final\")\n"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "metadata": {},
362
+ "outputs": [],
363
+ "source": [
364
+ "#!/usr/bin/env python3\n",
365
+ "# inference_plbert_full.py\n",
366
+ "\n",
367
+ "import torch\n",
368
+ "from transformers import BertTokenizerFast, BertModel\n",
369
+ "from phonikud import phonemize # make sure phonikud is in your PYTHONPATH\n",
370
+ "\n",
371
+ "MODEL_DIR = \"/dev/hdd/Users/Oron/tts/pl-bert/pl-bert/checkpoint-300\"\n",
372
+ "\n",
373
+ "def load_encoder(model_dir=MODEL_DIR):\n",
374
+ " tokenizer = BertTokenizerFast.from_pretrained(model_dir)\n",
375
+ " encoder = BertModel.from_pretrained(model_dir).eval()\n",
376
+ " return tokenizer, encoder\n",
377
+ "\n",
378
+ "def infer_embeddings(hebrew_niqqud: str, tokenizer, encoder):\n",
379
+ " # 1) Grapheme → Phoneme\n",
380
+ " phoneme_str = phonemize(hebrew_niqqud)\n",
381
+ " print(\"Phoneme string:\\n\", phoneme_str, \"\\n\")\n",
382
+ "\n",
383
+ " # 2) Split into single-character tokens\n",
384
+ " chars = list(phoneme_str)\n",
385
+ " inputs = tokenizer(\n",
386
+ " chars,\n",
387
+ " is_split_into_words=True,\n",
388
+ " add_special_tokens=True,\n",
389
+ " return_tensors=\"pt\"\n",
390
+ " )\n",
391
+ " print(\"Tokens:\", tokenizer.convert_ids_to_tokens(inputs.input_ids[0]), \"\\n\")\n",
392
+ "\n",
393
+ " # 3) Run through BERT encoder\n",
394
+ " with torch.no_grad():\n",
395
+ " outputs = encoder(**inputs)\n",
396
+ " embeddings = outputs.last_hidden_state # (1, seq_len, hidden_size)\n",
397
+ " print(\"Embeddings shape:\", embeddings.shape)\n",
398
+ " return embeddings\n",
399
+ "\n",
400
+ "if __name__ == \"__main__\":\n",
401
+ " tokenizer, encoder = load_encoder()\n",
402
+ "\n",
403
+ " sample_hebrew = \"הַאִם זֶה אֲנַ֫חְנוּ וְֽ|הֵם אוֹ כֻּו֯לָּ֫נוּ בְּֽיַחַד?\"\n",
404
+ " embeddings = infer_embeddings(sample_hebrew, tokenizer, encoder)\n",
405
+ "\n",
406
+ " # Optionally, save embeddings to disk:\n",
407
+ " torch.save(embeddings, \"sample_embeddings.pt\")\n",
408
+ " print(\"\\nSaved embeddings → sample_embeddings.pt\")\n"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": null,
414
+ "metadata": {},
415
+ "outputs": [],
416
+ "source": [
417
+ "#!/usr/bin/env python3\n",
418
+ "# g2p_evaluation.py\n",
419
+ "\n",
420
+ "import torch\n",
421
+ "from torch.utils.data import DataLoader\n",
422
+ "from datasets import load_dataset\n",
423
+ "from transformers import BertModel, BertTokenizerFast\n",
424
+ "from torch import nn\n",
425
+ "from tqdm import tqdm\n",
426
+ "\n",
427
+ "# ─── CONFIGURATION ────────────────────────────────────────────────────\n",
428
+ "BERT_MODEL_PATH = \"/dev/hdd/Users/Oron/tts/pl-bert/pl-bert/checkpoint-746\" # Your trained BERT model path\n",
429
+ "BATCH_SIZE = 196\n",
430
+ "EPOCHS = 5\n",
431
+ "LEARNING_RATE = 1e-4\n",
432
+ "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
433
+ "FREEZE_BERT = True # Set to True to freeze BERT weights\n",
434
+ "\n",
435
+ "# ─── LOAD DATASET ─────────────────────────────────────────────────────\n",
436
+ "full_ds = load_dataset(\"thewh1teagle/phonikud-phonemes-data\", split=\"train[:1000000]\")\n",
437
+ "ds_train = full_ds.select(range(0, 900000))\n",
438
+ "ds_eval = full_ds.select(range(900000, 1000000))\n",
439
+ "\n",
440
+ "# ─── SPLIT INTO INPUT AND TARGET ──────────────────────────────────────\n",
441
+ "def split_data(example):\n",
442
+ " text, phonemes = example[\"text\"].split(\"\\t\")\n",
443
+ " return {\"text\": text, \"phonemes\": phonemes}\n",
444
+ "\n",
445
+ "ds_train = ds_train.map(split_data, remove_columns=[\"text\"])\n",
446
+ "ds_eval = ds_eval.map(split_data, remove_columns=[\"text\"])\n",
447
+ "\n",
448
+ "# ─── TOKENIZATION ─────────────────────────────────────────────────────\n",
449
+ "tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_PATH)\n",
450
+ "\n",
451
+ "def tokenize(batch):\n",
452
+ " inputs = tokenizer(batch[\"text\"], padding='max_length', truncation=True, max_length=128)\n",
453
+ " targets = tokenizer(batch[\"phonemes\"], padding='max_length', truncation=True, max_length=128)\n",
454
+ " inputs[\"labels\"] = targets[\"input_ids\"]\n",
455
+ " return inputs\n",
456
+ "\n",
457
+ "train_enc = ds_train.map(tokenize, batched=True, remove_columns=[\"text\", \"phonemes\"])\n",
458
+ "eval_enc = ds_eval.map(tokenize, batched=True, remove_columns=[\"text\", \"phonemes\"])\n",
459
+ "\n",
460
+ "train_enc.set_format(type=\"torch\")\n",
461
+ "eval_enc.set_format(type=\"torch\")\n",
462
+ "\n",
463
+ "train_loader = DataLoader(train_enc, batch_size=BATCH_SIZE, shuffle=True)\n",
464
+ "eval_loader = DataLoader(eval_enc, batch_size=BATCH_SIZE)\n",
465
+ "\n",
466
+ "# ─── MODEL DEFINITION ──────────────────────────────────────────────────\n",
467
+ "class G2PModel(nn.Module):\n",
468
+ " def __init__(self, bert_path, vocab_size, freeze_bert):\n",
469
+ " super().__init__()\n",
470
+ " self.bert = BertModel.from_pretrained(bert_path)\n",
471
+ " if freeze_bert:\n",
472
+ " for param in self.bert.parameters():\n",
473
+ " param.requires_grad = False\n",
474
+ " self.linear = nn.Linear(self.bert.config.hidden_size, vocab_size)\n",
475
+ "\n",
476
+ " def forward(self, input_ids, attention_mask):\n",
477
+ " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
478
+ " logits = self.linear(outputs.last_hidden_state)\n",
479
+ " return logits\n",
480
+ "\n",
481
+ "model = G2PModel(BERT_MODEL_PATH, len(tokenizer), FREEZE_BERT).to(DEVICE)\n",
482
+ "\n",
483
+ "# ─── TRAINING SETUP ────────────────────────────────────────────────────\n",
484
+ "optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)\n",
485
+ "criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)\n",
486
+ "\n",
487
+ "# ─── TRAINING LOOP ─────────────────────────────────────────────────────\n",
488
+ "for epoch in range(EPOCHS):\n",
489
+ " model.train()\n",
490
+ " total_loss = 0\n",
491
+ " progress = tqdm(train_loader, desc=f\"Epoch {epoch+1}/{EPOCHS}\")\n",
492
+ " for batch in progress:\n",
493
+ " input_ids = batch[\"input_ids\"].to(DEVICE)\n",
494
+ " attention_mask = batch[\"attention_mask\"].to(DEVICE)\n",
495
+ " labels = batch[\"labels\"].to(DEVICE)\n",
496
+ "\n",
497
+ " optimizer.zero_grad()\n",
498
+ " logits = model(input_ids, attention_mask)\n",
499
+ "\n",
500
+ " loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))\n",
501
+ " loss.backward()\n",
502
+ " optimizer.step()\n",
503
+ "\n",
504
+ " total_loss += loss.item()\n",
505
+ " progress.set_postfix(avg_loss=total_loss / (progress.n + 1))\n",
506
+ "\n",
507
+ "# ─── EVALUATION LOOP ───────────────────────────────────────────────────\n",
508
+ " model.eval()\n",
509
+ " total_loss = 0\n",
510
+ " total_count = 0\n",
511
+ " top1_correct = 0\n",
512
+ " top5_correct = 0\n",
513
+ " with torch.no_grad():\n",
514
+ " progress = tqdm(eval_loader, desc=\"Evaluating\")\n",
515
+ " for batch in progress:\n",
516
+ " input_ids = batch[\"input_ids\"].to(DEVICE)\n",
517
+ " attention_mask = batch[\"attention_mask\"].to(DEVICE)\n",
518
+ " labels = batch[\"labels\"].to(DEVICE)\n",
519
+ "\n",
520
+ " logits = model(input_ids, attention_mask)\n",
521
+ " loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))\n",
522
+ " total_loss += loss.item()\n",
523
+ "\n",
524
+ " # Top‑1\n",
525
+ " top1 = logits.argmax(dim=-1)\n",
526
+ " # Top‑5 (last dimension = the 5 candidate predictions)\n",
527
+ " top5 = logits.topk(5, dim=-1).indices # [B, T, 5]\n",
528
+ "\n",
529
+ " mask = labels != tokenizer.pad_token_id # ignore [PAD] positions\n",
530
+ " total_count += mask.sum().item()\n",
531
+ "\n",
532
+ " top1_correct += ((top1 == labels) & mask).sum().item()\n",
533
+ " # reshape labels to [B, T, 1], then compare against top5\n",
534
+ " top5_correct += ((top5 == labels.unsqueeze(-1)) & mask.unsqueeze(-1)).any(dim=-1).sum().item()\n",
535
+ "\n",
536
+ " avg_loss = total_loss / (progress.n + 1)\n",
537
+ " g1_acc = top1_correct / total_count\n",
538
+ " g5_acc = top5_correct / total_count\n",
539
+ " progress.set_postfix(avg_loss=avg_loss, g1_acc=g1_acc, g5_acc=g5_acc)\n",
540
+ "\n",
541
+ " if total_count:\n",
542
+ " print(f\"Final Evaluation Loss: {avg_loss:.4f} | G1: {g1_acc:.2%} | G5: {g5_acc:.2%}\")\n"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "metadata": {},
549
+ "outputs": [],
550
+ "source": [
551
+ "from phonikud import phonemize\n",
552
+ "from transformers import BertTokenizerFast, BertModel\n",
553
+ "import torch\n",
554
+ "\n",
555
+ "# 1) Your sample\n",
556
+ "hebrew = \"שָׁלוֹם עוֹלָם\"\n",
557
+ "print(\"Hebrew text:\", hebrew)\n",
558
+ "print(\"Length (chars):\", len(hebrew))\n",
559
+ "\n",
560
+ "# 2) Phonemize\n",
561
+ "phon_str = phonemize(hebrew)\n",
562
+ "print(\"\\nPhonemizer output string:\", phon_str)\n",
563
+ "# split on spaces to get *word*‐level tokens:\n",
564
+ "word_tokens = phon_str.split()\n",
565
+ "# or to see IPA‐symbol tokens:\n",
566
+ "symbol_tokens = [c for c in phon_str.replace(\" \", \"\")]\n",
567
+ "print(\"→ Word tokens:\", word_tokens, f\"(count={len(word_tokens)})\")\n",
568
+ "print(\"→ IPA symbol tokens:\", symbol_tokens, f\"(count={len(symbol_tokens)})\")\n",
569
+ "\n",
570
+ "# 3) Load your pl-bert encoder & tokenizer\n",
571
+ "MODEL_DIR = \"/dev/hdd/Users/Oron/tts/pl-bert/pl-bert-final\"\n",
572
+ "tokenizer = BertTokenizerFast.from_pretrained(MODEL_DIR)\n",
573
+ "encoder = BertModel.from_pretrained(MODEL_DIR).eval()\n",
574
+ "\n",
575
+ "# 4) Tokenize phoneme symbols, run through encoder\n",
576
+ "inputs = tokenizer(symbol_tokens, is_split_into_words=True, return_tensors=\"pt\")\n",
577
+ "with torch.no_grad():\n",
578
+ " outputs = encoder(**inputs)\n",
579
+ "emb = outputs.last_hidden_state # (1, seq_len, hidden_size)\n",
580
+ "print(\"\\nEncoder got seq_len =\", emb.size(1), \"hidden_size =\", emb.size(2))\n"
581
+ ]
582
+ }
583
+ ],
584
+ "metadata": {
585
+ "kernelspec": {
586
+ "display_name": "tts-venv",
587
+ "language": "python",
588
+ "name": "python3"
589
+ },
590
+ "language_info": {
591
+ "codemirror_mode": {
592
+ "name": "ipython",
593
+ "version": 3
594
+ },
595
+ "file_extension": ".py",
596
+ "mimetype": "text/x-python",
597
+ "name": "python",
598
+ "nbconvert_exporter": "python",
599
+ "pygments_lexer": "ipython3",
600
+ "version": "3.12.2"
601
+ }
602
+ },
603
+ "nbformat": 4,
604
+ "nbformat_minor": 2
605
+ }
train.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # pl-bert_training.py
3
+
4
+ from datasets import load_dataset
5
+ from transformers import (
6
+ BertTokenizerFast,
7
+ BertConfig,
8
+ BertForMaskedLM,
9
+ DataCollatorForLanguageModeling,
10
+ Trainer,
11
+ TrainingArguments
12
+ )
13
+ from evaluate import load # use the 'evaluate' library for metrics
14
+ import torch
15
+ import yaml
16
+ import numpy as np
17
+
18
# 1. Load dataset and split
# This step has to be active: every later stage reads ds_train / ds_eval, so
# leaving it commented out makes the script stop with a NameError at the
# first .map() call below.
full_ds = load_dataset("thewh1teagle/phonikud-phonemes-data", split="train[:5000000]")
ds_train = full_ds.select(range(0, 4700000))        # first 4.7M examples
ds_eval = full_ds.select(range(4700000, 5000000))   # last 300k examples
22
+
23
+
24
+
25
+ # 2. Split "text" column into Hebrew and phonemes
26
def split_tab(examples):
    """Split each tab-separated "text" entry into Hebrew and phoneme halves.

    Args:
        examples: Batch mapping whose "text" value is a list of strings,
            each containing exactly one tab character.

    Returns:
        A dict with two parallel lists: "hebrew" (the part before the tab)
        and "phonemes" (the part after it).

    Raises:
        ValueError: If a line does not split into exactly two fields.
    """
    pairs = []
    for line in examples["text"]:
        # Two-target unpacking rejects lines with zero or several tabs.
        hebrew_part, phoneme_part = line.split("\t")
        pairs.append((hebrew_part, phoneme_part))

    return {
        "hebrew": [hebrew_part for hebrew_part, _ in pairs],
        "phonemes": [phoneme_part for _, phoneme_part in pairs],
    }
33
+
34
# Run the tab split over both partitions (train first, then eval). The raw
# "text" column is dropped once its two halves exist as separate columns.
ds_train, ds_eval = (
    partition.map(split_tab, batched=True, remove_columns=["text"])
    for partition in (ds_train, ds_eval)
)
36
+
37
+ # 3. Build character‐level phoneme vocab
38
+ punctuation = ';:,.!?¡¿—…"«»“” ’'
39
+ letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
40
+ letters_ipa = (
41
+ "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯ혂ŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ"
42
+ )
43
specials = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
all_chars = [*punctuation, *letters, *letters_ipa]

# De-duplicate while keeping first-occurrence order: dict keys preserve
# insertion order, so this matches a manual "seen" set walk.
seen = set(all_chars)
vocab_chars = list(dict.fromkeys(all_chars))

# One token per line: special tokens first, then the character inventory.
with open("vocab.txt", "w", encoding="utf-8") as vf:
    vf.writelines(entry + "\n" for entry in specials + vocab_chars)
57
+
58
# 4. Initialize tokenizer & model config
# Lower-casing, accent stripping and CJK splitting are all switched off so
# the characters written to vocab.txt are looked up unchanged.
_special_tokens = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = BertTokenizerFast(
    vocab_file="vocab.txt",
    do_lower_case=False,
    strip_accents=False,
    tokenize_chinese_chars=False,
    **_special_tokens,
)

# Both dropout rates share one value.
_dropout = 0.1
config = BertConfig(
    vocab_size=len(tokenizer),  # follows the vocabulary file written above
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=2048,
    max_position_embeddings=512,
    hidden_dropout_prob=_dropout,
    attention_probs_dropout_prob=_dropout,
)
79
+
80
+ # 5. Tokenize only the phoneme sequences
81
def tokenize_fn(examples):
    """Tokenize a batch of phoneme strings one character per token.

    The vocabulary file holds single characters only and has no
    "##"-prefixed continuation pieces. Passing whole strings to the
    WordPiece tokenizer would therefore turn every multi-character word
    into [UNK]. Each string is split into characters first and handed over
    as a pre-split sequence, so each character is looked up on its own.

    Args:
        examples: Batch mapping whose "phonemes" value is a list of strings.

    Returns:
        The tokenizer output for the batch (input ids and attention masks,
        with special tokens added).
    """
    # NOTE(review): whitespace characters are presumably discarded by the
    # BERT pre-tokenizer, so word boundaries would not appear as tokens —
    # confirm this is the intended behaviour.
    char_sequences = [list(phoneme_str) for phoneme_str in examples["phonemes"]]
    return tokenizer(
        char_sequences,
        is_split_into_words=True,
        return_attention_mask=True,
        add_special_tokens=True,
    )
87
+
88
# Both partitions are tokenized with identical settings; the string columns
# are dropped so that only the tokenizer outputs remain.
_tokenize_kwargs = {
    "batched": True,
    "remove_columns": ["hebrew", "phonemes"],
}
tokenized_train = ds_train.map(tokenize_fn, **_tokenize_kwargs)
tokenized_eval = ds_eval.map(tokenize_fn, **_tokenize_kwargs)
98
+
99
# 6. Group into fixed-length blocks for MLM
block_size = 128
def group_texts(examples):
    """Concatenate a batch of token-id lists and cut it into full blocks.

    All "input_ids" lists in the batch are joined end to end and sliced
    into consecutive chunks of `block_size` ids. Ids left over after the
    last full chunk are dropped.

    Args:
        examples: Batch mapping whose "input_ids" value is a list of
            token-id lists.

    Returns:
        A dict with "input_ids" (list of `block_size`-long id lists) and
        "attention_mask" (an all-ones list of the same length per chunk).
    """
    # Function-scope import: the file does not import itertools at the top.
    from itertools import chain

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies
    # the growing accumulator for every sequence in the batch.
    all_ids = list(chain.from_iterable(examples["input_ids"]))

    # Only whole blocks are kept, so stop at the last multiple of block_size.
    usable = len(all_ids) - len(all_ids) % block_size
    chunks = [all_ids[start:start + block_size] for start in range(0, usable, block_size)]

    return {
        "input_ids": chunks,
        "attention_mask": [[1] * block_size for _ in chunks],
    }
109
+
110
def _to_blocks(tokenized_split):
    """Regroup one tokenized split into fixed-length blocks via group_texts."""
    return tokenized_split.map(
        group_texts,
        batched=True,
        # Every original column goes; only the regrouped ones are kept.
        remove_columns=list(tokenized_split.column_names),
    )


lm_train = _to_blocks(tokenized_train)
lm_eval = _to_blocks(tokenized_eval)

# 7. Data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
125
+
126
+ # 8. Metrics for accuracy + perplexity
127
+ accuracy_metric = load("accuracy")
128
def compute_metrics(eval_pred):
    """Compute accuracy and perplexity over the scored label positions.

    Args:
        eval_pred: Object exposing `predictions` (logits; the last axis is
            treated as the class axis) and `label_ids` (integer labels in
            which -100 marks positions excluded from scoring).

    Returns:
        A dict with "accuracy" (fraction of scored positions whose argmax
        equals the label) and "perplexity" (exp of the mean cross-entropy
        over scored positions). Both are NaN when nothing is scored.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    logits = logits.reshape(-1, logits.shape[-1])
    labels = labels.reshape(-1)
    mask = labels != -100

    # Without this guard the means below run over empty arrays.
    if not mask.any():
        return {"accuracy": float("nan"), "perplexity": float("nan")}

    # Gather the scored rows once; float64 keeps the log-sum-exp and the
    # final exp from losing precision when the inputs are float32.
    scored_logits = logits[mask].astype(np.float64)
    scored_labels = labels[mask]

    # Direct numpy comparison replaces the call through the metric object.
    preds = np.argmax(scored_logits, axis=-1)
    acc = float(np.mean(preds == scored_labels))

    # Numerically stable log-sum-exp: subtract the row maximum before exp.
    max_logits = scored_logits.max(axis=-1, keepdims=True)
    logsumexp = max_logits[:, 0] + np.log(
        np.exp(scored_logits - max_logits).sum(axis=-1)
    )
    true_logits = scored_logits[np.arange(scored_labels.size), scored_labels]
    xent = float(np.mean(logsumexp - true_logits))

    # A very large loss yields inf rather than an overflow warning.
    with np.errstate(over="ignore"):
        ppl = float(np.exp(xent))

    return {"accuracy": acc, "perplexity": ppl}
147
+
148
# 9. Initialize model & Trainer
# The model is built from the config object, not from a saved checkpoint.
model = BertForMaskedLM(config)

# 9b) Load your best .pt checkpoint into it
# ckpt_path = "/dev/hdd/Users/Oron/tts/pl-bert/pl-bert-best1.pt"
# state_dict = torch.load(ckpt_path, map_location="cpu")
# model.load_state_dict(state_dict, strict=False)
# print(f"[✔] Loaded pretrained PL‑BERT weights from {ckpt_path}")

# Optimisation schedule.
_optim_args = {
    "num_train_epochs": 20,
    "per_device_train_batch_size": 196,
    "per_device_eval_batch_size": 196,
    "warmup_steps": 400,
    "learning_rate": 1e-5,
    "weight_decay": 0.001,
}

# Evaluate and checkpoint once per epoch; the checkpoint with the lowest
# perplexity is selected as the best model.
_checkpoint_args = {
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "perplexity",
    "greater_is_better": False,
    "save_total_limit": 3,
    "eval_accumulation_steps": 1,
}

# Console/log reporting cadence.
_logging_args = {
    "logging_strategy": "steps",
    "logging_steps": 25,
}

training_args = TrainingArguments(
    output_dir="pl-bert",
    overwrite_output_dir=True,
    push_to_hub=False,
    # fp16=True, # uncomment if you want mixed precision
    **_optim_args,
    **_checkpoint_args,
    **_logging_args,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train,
    eval_dataset=lm_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
187
+
188
# 10. Train & save best checkpoint and .pt file
trainer.train()

# Directory of the checkpoint the trainer recorded as best.
best_ckpt = trainer.state.best_model_checkpoint
print(f"Best checkpoint directory: {best_ckpt}")

# Load the best checkpoint and save a raw .pt state_dict
best_model = BertForMaskedLM.from_pretrained(best_ckpt, config=config)
best_weights = best_model.state_dict()
torch.save(best_weights, "pl-bert-best.pt")
print("[✔] Saved best model weights to pl-bert-best.pt")

# Also keep HF format: model first, then tokenizer, into the same folder.
for artifact in (best_model, tokenizer):
    artifact.save_pretrained("pl-bert-final")
vocab.txt ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ ;
7
+ :
8
+ ,
9
+ .
10
+ !
11
+ ?
12
+ ¡
13
+ ¿
14
+
15
+
16
+ "
17
+ «
18
+ »
19
+
20
+
21
+
22
+
23
+ A
24
+ B
25
+ C
26
+ D
27
+ E
28
+ F
29
+ G
30
+ H
31
+ I
32
+ J
33
+ K
34
+ L
35
+ M
36
+ N
37
+ O
38
+ P
39
+ Q
40
+ R
41
+ S
42
+ T
43
+ U
44
+ V
45
+ W
46
+ X
47
+ Y
48
+ Z
49
+ a
50
+ b
51
+ c
52
+ d
53
+ e
54
+ f
55
+ g
56
+ h
57
+ i
58
+ j
59
+ k
60
+ l
61
+ m
62
+ n
63
+ o
64
+ p
65
+ q
66
+ r
67
+ s
68
+ t
69
+ u
70
+ v
71
+ w
72
+ x
73
+ y
74
+ z
75
+ ɑ
76
+ ɐ
77
+ ɒ
78
+ æ
79
+ ɓ
80
+ ʙ
81
+ β
82
+ ɔ
83
+ ɕ
84
+ ç
85
+ ɗ
86
+ ɖ
87
+ ð
88
+ ʤ
89
+ ə
90
+ ɘ
91
+ ɚ
92
+ ɛ
93
+ ɜ
94
+ ɝ
95
+ ɞ
96
+ ɟ
97
+ ʄ
98
+ ɡ
99
+ ɠ
100
+ ɢ
101
+ ʛ
102
+ ɦ
103
+ ɧ
104
+ ħ
105
+ ɥ
106
+ ʜ
107
+ ɨ
108
+ ɪ
109
+ ʝ
110
+ ɭ
111
+ ɬ
112
+ ɫ
113
+ ɮ
114
+ ʟ
115
+ ɱ
116
+ ɯ
117
+
118
+ ŋ
119
+ ɳ
120
+ ɲ
121
+ ɴ
122
+ ø
123
+ ɵ
124
+ ɸ
125
+ θ
126
+ œ
127
+ ɶ
128
+ ʘ
129
+ ɹ
130
+ ɺ
131
+ ɾ
132
+ ɻ
133
+ ʀ
134
+ ʁ
135
+ ɽ
136
+ ʂ
137
+ ʃ
138
+ ʈ
139
+ ʧ
140
+ ʉ
141
+ ʊ
142
+ ʋ
143
+
144
+ ʌ
145
+ ɣ
146
+ ɤ
147
+ ʍ
148
+ χ
149
+ ʎ
150
+ ʏ
151
+ ʑ
152
+ ʐ
153
+ ʒ
154
+ ʔ
155
+ ʡ
156
+ ʕ
157
+ ʢ
158
+ ǀ
159
+ ǁ
160
+ ǂ
161
+ ǃ
162
+ ˈ
163
+ ˌ
164
+ ː
165
+ ˑ
166
+ ʼ
167
+ ʴ
168
+ ʰ
169
+ ʱ
170
+ ʲ
171
+ ʷ
172
+ ˠ
173
+ ˤ
174
+ ˞
175
+
176
+
177
+
178
+
179
+
180
+ ̩
181
+
weights/pl-bert-best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2341e9b1691986831540ce942fc6e434f3c4f96922ae7a16c8213c68c2047e7e
3
+ size 269262651