balochiml
/

balochi-tokenizer

Baluchi

balochi

tokenizer

Model card Files and versions

xet

Community

strickvl committed on Jun 3, 2023

Commit

b1d540a

1 Parent(s): e8868b3

add spacy trainer

Browse files

Files changed (1) hide show

src/train_tokenizer.ipynb +263 -7

src/train_tokenizer.ipynb CHANGED Viewed

@@ -47,6 +47,7 @@
    "source": [
     "import os\n",
     "\n",
     "def get_txt_file_paths(directory):\n",
     "    txt_file_paths = []\n",
     "    for root, dirs, files in os.walk(directory):\n",
@@ -56,11 +57,12 @@
     "                txt_file_paths.append(file_path)\n",
     "    return txt_file_paths\n",
     "\n",
     "# Replace \"directory_path\" with the actual path of the directory you want to search\n",
     "directory_path = \"../data/raw_text\"\n",
     "txt_paths = get_txt_file_paths(directory_path)\n",
     "\n",
-    "len(txt_paths)\n"
    ]
   },
   {
@@ -71,16 +73,17 @@
    "source": [
     "import re\n",
     "\n",
     "def clean_text(file_path):\n",
     "    # Open the file and read it into memory\n",
-    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
     "        text = file.read()\n",
     "\n",
     "    # Remove English-language characters and numbers\n",
-    "    text = re.sub(r'[a-zA-Z0-9]', '', text)\n",
     "\n",
     "    # Remove any excess whitespace\n",
-    "    text = re.sub(r'[^\\S\\n]+', ' ', text)\n",
     "\n",
     "    return text"
    ]
@@ -96,8 +99,18 @@
     "\n",
     "    # write the cleaned text to a new file with an incremented filename\n",
     "    # write the files all into the '../data/processed_text' directory\n",
-    "    with open(f'../data/processed_text/{path.split(\"/\")[-1]}', 'w', encoding='utf-8') as file:\n",
-    "        file.write(cleaned_text)\n"
    ]
   },
   {
@@ -119,6 +132,7 @@
    "outputs": [],
    "source": [
     "from tokenizers.pre_tokenizers import Whitespace\n",
     "tokenizer.pre_tokenizer = Whitespace()"
    ]
   },
@@ -266,7 +280,9 @@
     }
    ],
    "source": [
-    "sample_text = \"      آیک  جناورے اَت۔  لھتے گشیت آ سکیں کارزوالے ات کہ اگاں آزاتی دیگ بہ بیت، بازارءَ، لوگے ءَ، جاگاہ یے  ءَ،دپتر ء ُ کارگس یے  ءَ یا ھر ھما جاگاہ ءَ کہ شُت کنت مزنیں کارزوالی کنت۔گوں ھر کس ءَ جنگ ء ُ مڑ بیت۔گدء ُ پچاں  چنڈ چنڈ ء ُ راڑ راڑ کنت،کاگد ء ُ وانگیاں وارت ء ُ آدراہ کنت۔ورگی چیزاں اگاں وارت نکنت آھاں گٹ پاچیت ھراب کنت۔ایندگہ جناور چہ بندات ء َ ایشی ءِ کازوالیاں چہ وتا دیر دارگ ءِ کوشست کن اَنت۔ چیا کہ آ بازیں دگہ ھرابی ء ُ کارزوالی ھم کنت،پمیشکا کسانیں جناور  بالی مُرگ،کوہ پاچن،آسک ء ُ ایندگہ کسان کسانیں جناورچر آئی ءِ کارزوالیانی سوب ءَ آئی ءَ چہ سک باز شزار اَنت ۔\".replace(\"\\xa0\", \"\")\n",
     "sample_sentence = sample_text.split(\"۔\")[2]\n",
     "sample_sentence"
    ]
@@ -291,6 +307,246 @@
     "tokenizer.encode(sample_sentence).tokens"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,

    "source": [
     "import os\n",
     "\n",
+    "\n",
     "def get_txt_file_paths(directory):\n",
     "    txt_file_paths = []\n",
     "    for root, dirs, files in os.walk(directory):\n",
     "                txt_file_paths.append(file_path)\n",
     "    return txt_file_paths\n",
     "\n",
+    "\n",
     "# Replace \"directory_path\" with the actual path of the directory you want to search\n",
     "directory_path = \"../data/raw_text\"\n",
     "txt_paths = get_txt_file_paths(directory_path)\n",
     "\n",
+    "len(txt_paths)"
    ]
   },
   {
    "source": [
     "import re\n",
     "\n",
+    "\n",
     "def clean_text(file_path):\n",
     "    # Open the file and read it into memory\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
     "        text = file.read()\n",
     "\n",
     "    # Remove English-language characters and numbers\n",
+    "    text = re.sub(r\"[a-zA-Z0-9]\", \"\", text)\n",
     "\n",
     "    # Remove any excess whitespace\n",
+    "    text = re.sub(r\"[^\\S\\n]+\", \" \", text)\n",
     "\n",
     "    return text"
    ]
     "\n",
     "    # write the cleaned text to a new file with an incremented filename\n",
     "    # write the files all into the '../data/processed_text' directory\n",
+    "    with open(\n",
+    "        f'../data/processed_text/{path.split(\"/\")[-1]}', \"w\", encoding=\"utf-8\"\n",
+    "    ) as file:\n",
+    "        file.write(cleaned_text)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Training a Tokenizer using 🤗 Tokenizers"
    ]
   },
   {
    "outputs": [],
    "source": [
     "from tokenizers.pre_tokenizers import Whitespace\n",
+    "\n",
     "tokenizer.pre_tokenizer = Whitespace()"
    ]
   },
     }
    ],
    "source": [
+    "sample_text = \"      آیک  جناورے اَت۔  لھتے گشیت آ سکیں کارزوالے ات کہ اگاں آزاتی دیگ بہ بیت، بازارءَ، لوگے ءَ، جاگاہ یے  ءَ،دپتر ء ُ کارگس یے  ءَ یا ھر ھما جاگاہ ءَ کہ شُت کنت مزنیں کارزوالی کنت۔گوں ھر کس ءَ جنگ ء ُ مڑ بیت۔گدء ُ پچاں  چنڈ چنڈ ء ُ راڑ راڑ کنت،کاگد ء ُ وانگیاں وارت ء ُ آدراہ کنت۔ورگی چیزاں اگاں وارت نکنت آھاں گٹ پاچیت ھراب کنت۔ایندگہ جناور چہ بندات ء َ ایشی ءِ کازوالیاں چہ وتا دیر دارگ ءِ کوشست کن اَنت۔ چیا کہ آ بازیں دگہ ھرابی ء ُ کارزوالی ھم کنت،پمیشکا کسانیں جناور  بالی مُرگ،کوہ پاچن،آسک ء ُ ایندگہ کسان کسانیں جناورچر آئی ءِ کارزوالیانی سوب ءَ آئی ءَ چہ سک باز شزار اَنت ۔\".replace(\n",
+    "    \"\\xa0\", \"\"\n",
+    ")\n",
     "sample_sentence = sample_text.split(\"۔\")[2]\n",
     "sample_sentence"
    ]
     "tokenizer.encode(sample_sentence).tokens"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Training a custom tokenizer using spaCy and fastai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from fastai.text.all import *\n",
+    "files = get_text_files(\"../data/processed_text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4294"
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'*آمیتگءِ جُستءَمکن* لچّہ: *آمیتگءِ جُستءَمکن* آ میتگءَکہ من وتی شوکیں کسانی'"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "txt = files[0].open().read(); txt[:75]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(#146) ['*','آمیتگءِ','جُستءَمکن','*','لچّہ',':','*','آمیتگءِ','جُستءَمکن','*','آ','میتگءَکہ','من','وتی','شوکیں','کسانی','پیر','کُت','آ','میتگءِ','جسُتءَمکن','آ','میتگءِ','گیراں','مبو','بے','اوستیں','تاهیراں','مبو','آ'...]\n"
+     ]
+    }
+   ],
+   "source": [
+    "spacy = WordTokenizer()\n",
+    "toks = first(spacy([txt]))\n",
+    "print(coll_repr(toks, 30))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(#147) ['xxbos','*','آمیتگءِ','جُستءَمکن','*','لچّہ',':','*','آمیتگءِ','جُستءَمکن','*','آ','میتگءَکہ','من','وتی','شوکیں','کسانی','پیر','کُت','آ','میتگءِ','جسُتءَمکن','آ','میتگءِ','گیراں','مبو','بے','اوستیں','تاهیراں','مبو','آ'...]\n"
+     ]
+    }
+   ],
+   "source": [
+    "tkn = Tokenizer(spacy)\n",
+    "print(coll_repr(tkn(txt), 31))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "txts = L(o.open().read() for o in files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def subword(size: int):\n",
+    "    sp = SubwordTokenizer(vocab_sz=size)\n",
+    "    sp.setup(txts)\n",
+    "    return \" \".join(first(sp([txt]))[:40])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'▁* آ می تگ ءِ ▁جُست ءَ م ک ن * ▁لچّہ : ▁* آ می تگ ءِ ▁جُست ءَ م ک ن * ▁آ ▁میتگ ءَ کہ ▁من ▁وتی ▁ش وکیں ▁کس انی ▁پیر ▁کُت ▁آ ▁میتگ ءِ ▁ج'"
+      ]
+     },
+     "execution_count": 83,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "subword(1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'▁ * آ م ی ت گ ء ِ ▁ ج ُ س ت ء َ م ک ن * ▁ ل چ ّ ہ : ▁ * آ م ی ت گ ء ِ ▁ ج ُ س ت'"
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "subword(275)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(#147) ['xxbos','*','آمیتگءِ','جُستءَمکن','*','لچّہ',':','*','آمیتگءِ','جُستءَمکن'...]"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "toks200 = txts[:200].map(tkn)\n",
+    "toks200[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"(#4096) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','ءَ','ءِ','ءُ','۔','کہ','،','انت','من','اے','نہ','وتی','بیت','”','ات','چہ','گوں','اَنت','اِنت','پہ','بہ','‘','یک','آئی','.','آ','منی','ھم',')','کنت','بلوچی','3','تو','بلے','ئے',':','کنگ','(','بوتگ','آں','کن','؟'...]\""
+      ]
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "num = Numericalize()\n",
+    "num.setup(toks200)\n",
+    "coll_repr(num.vocab,50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "TensorText([ 156, 2340,    0,  156,  563,   43,  156, 2340,    0,  156,   33,\n",
+       "               0,   16,   19, 1490,  831,  457,  102,   33, 1031])"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nums = num(toks)[:20]; nums"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'* آمیتگءِ xxunk * لچّہ : * آمیتگءِ xxunk * آ xxunk من وتی شوکیں کسانی پیر کُت آ میتگءِ'"
+      ]
+     },
+     "execution_count": 91,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "' '.join(num.vocab[o] for o in nums)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,