balochiml
/

balochi-tokenizer

Model card Files Files and versions

strickvl committed on Jun 3, 2023

Commit

b3db2b8

·

1 Parent(s): 470e696

process and clean files

Files changed (1) hide show

src/train_tokenizer.ipynb +45 -0

src/train_tokenizer.ipynb CHANGED Viewed

@@ -20,6 +20,14 @@
     "# load_dataset(\"balochiml/balochi-language-data\", data_dir=\"data\", cache_dir=\"../data\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 13,
@@ -55,6 +63,43 @@
     "len(txt_paths)\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,

     "# load_dataset(\"balochiml/balochi-language-data\", data_dir=\"data\", cache_dir=\"../data\")"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generate the processed data without English characters"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 13,
     "len(txt_paths)\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def clean_text(file_path):\n",
+    "    # Open the file and read it into memory\n",
+    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
+    "        text = file.read()\n",
+    "\n",
+    "    # Remove English-language characters and numbers\n",
+    "    text = re.sub(r'[a-zA-Z0-9]', '', text)\n",
+    "\n",
+    "    # Collapse each run of non-newline whitespace into a single space\n",
+    "    text = re.sub(r'[^\\S\\n]+', ' ', text)\n",
+    "\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for path in txt_paths:\n",
+    "    cleaned_text = clean_text(path)\n",
+    "\n",
+    "    # write the cleaned text to a new file that keeps the original filename\n",
+    "    # write the files all into the '../data/processed_text' directory\n",
+    "    with open(f'../data/processed_text/{path.split(\"/\")[-1]}', 'w', encoding='utf-8') as file:\n",
+    "        file.write(cleaned_text)\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,