Add scripts for later job ft

Browse files

Files changed (6) hide show

notes/{data_preparation.ipynb → data_preparation_ft.ipynb} +0 -0
notes/data_preparation_pt.ipynb +626 -0
notes/fa.tar.gz +0 -3
src/fine-tuning/__init__.py +0 -0
src/{dictionary.py → fine-tuning/dictionary.py} +0 -0
src/{normalizer.py → fine-tuning/normalizer.py} +0 -0

notes/{data_preparation.ipynb → data_preparation_ft.ipynb} RENAMED Viewed

File without changes

notes/data_preparation_pt.ipynb ADDED Viewed

	@@ -0,0 +1,626 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']"
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sys.path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if \"../src\" not in sys.path:\n",
+    "    sys.path.insert(0, \"../src\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from normalizer import normalizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "سلام بر شما که می‌آیید و می‌آموزید که بی‌آرآیم \n",
+      "کتاب‌هایمان میدانی کجا‌ها ماه‌هاس که کی‌هامون و کیهان دنباله‌هاشون برای بهای هستند \n",
+      "میان‌‌افزار‌های امروزی نرم‌افزار سخت‌افزار امروز نوشت‌افزار‌ها \n",
+      "این کتاب بهترین در نوع شتر آسان‌تر هست \n",
+      "سه چیز هست که از پژوهش در این زمینه آموخته‌ام \n"
+     ]
+    }
+   ],
+   "source": [
+    "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))\n",
+    "\n",
+    "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n",
+    "print(normalizer({\"sentence\": input_text}, return_dict=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !mkdir -p /home/m3hrdadfi/code/data\n",
+    "# %cd /home/m3hrdadfi/code/data\n",
+    "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n",
+    "# %cd /home/m3hrdadfi/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import os\n",
+    "\n",
+    "# lang = \"fa\"\n",
+    "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n",
+    "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n",
+    "# print(abs_path_to_data)\n",
+    "# print(save_path)\n",
+    "# print()\n",
+    "# !ls {save_path}\n",
+    "# !ls {abs_path_to_data}/*.tsv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def normalizer_without_batch(text, pruning=False):\n",
+    "    \"\"\"Normalize one sentence by calling ``normalizer`` on a one-item batch.\n",
+    "\n",
+    "    The sentence is wrapped as ``{\"sentence\": text}`` and passed to\n",
+    "    ``normalizer`` with ``return_dict=False``.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The raw sentence to normalize.\n",
+    "        pruning: When True, a normalized sentence with three or fewer\n",
+    "            whitespace-separated tokens is discarded.\n",
+    "\n",
+    "    Returns:\n",
+    "        The normalized text, or None when the sentence was pruned or\n",
+    "        normalization raised an exception.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        text = normalizer({\"sentence\": text}, return_dict=False)\n",
+    "\n",
+    "        # Keep only sentences with more than three tokens when pruning.\n",
+    "        if pruning and len(text.split()) <= 3:\n",
+    "            text = None\n",
+    "\n",
+    "    except Exception:\n",
+    "        # Best effort: show the offending value and drop the row. Catching\n",
+    "        # Exception instead of using a bare except lets KeyboardInterrupt and\n",
+    "        # SystemExit propagate, so a long DataFrame.apply can be interrupted.\n",
+    "        print(text)\n",
+    "        text = None\n",
+    "\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n",
+    "\n",
+    "# print(f\"Step 0: {len(test_df)}\")\n",
+    "\n",
+    "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n",
+    "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+    "# test_df = test_df.dropna(subset=[\"status\"])\n",
+    "# test_df = test_df.drop(\"status\", 1)\n",
+    "# print(f\"Step 1: {len(test_df)}\")\n",
+    "\n",
+    "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n",
+    "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
+    "# test_df = test_df.dropna(subset=[\"sentence\"])\n",
+    "# print(f\"Step 2: {len(test_df)}\")\n",
+    "\n",
+    "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
+    "# test_df = test_df.drop_duplicates(subset=\"path\")\n",
+    "# print(f\"Step 3: {len(test_df)}\")\n",
+    "\n",
+    "# test_df = test_df.reset_index(drop=True)\n",
+    "# test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# _train_df = pd.concat([\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+    "# ])\n",
+    "# print(len(_train_df))\n",
+    "\n",
+    "# train_df = pd.concat([\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n",
+    "#     pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n",
+    "# ])\n",
+    "# print(f\"Step 0: {len(train_df)}\")\n",
+    "\n",
+    "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n",
+    "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n",
+    "# train_df = train_df.dropna(subset=[\"status\"])\n",
+    "# train_df = train_df.drop(\"status\", 1)\n",
+    "# print(f\"Step 1: {len(train_df)}\")\n",
+    "\n",
+    "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n",
+    "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n",
+    "# train_df = train_df.dropna(subset=[\"sentence\"])\n",
+    "# print(f\"Step 2: {len(train_df)}\")\n",
+    "\n",
+    "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n",
+    "# train_df = train_df.drop_duplicates(subset=\"path\")\n",
+    "# print(f\"Step 3: {len(train_df)}\")\n",
+    "\n",
+    "# train_df = train_df.sample(frac=1)\n",
+    "# train_df = train_df.reset_index(drop=True)\n",
+    "# train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from tqdm import tqdm\n",
+    "\n",
+    "# testset_indices = []\n",
+    "\n",
+    "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n",
+    "#     _id = row[\"path\"]\n",
+    "#     finder = train_df[train_df[\"path\"] == _id]\n",
+    "#     if len(finder) > 0:\n",
+    "#         testset_indices.extend(list(finder.index))\n",
+    "\n",
+    "# testset_indices = list(set(testset_indices))\n",
+    "# print(f\"Found #{len(testset_indices)} test data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print(len(train_df))\n",
+    "# train_df = train_df.drop(testset_indices)\n",
+    "# print(len(train_df))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pandas as pd\n",
+    "\n",
+    "# df = pd.concat([train_df, test_df], axis=0)\n",
+    "# # df = validated_df.copy()\n",
+    "# print(df.info())\n",
+    "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n",
+    "# # df = df.dropna(subset=[\"sentence\"])\n",
+    "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n",
+    "# df = df.reset_index(drop=True)\n",
+    "# print(df.info())\n",
+    "# df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import torchaudio\n",
+    "# import librosa\n",
+    "# import IPython.display as ipd\n",
+    "# import numpy as np\n",
+    "\n",
+    "# def load_audio(path):\n",
+    "#     speech, sr = torchaudio.load(path)\n",
+    "#     speech = speech[0].numpy().squeeze()    \n",
+    "#     speech = librosa.resample(np.asarray(speech), sr, 16_000)\n",
+    "    \n",
+    "#     print(speech.shape, sr)\n",
+    "    \n",
+    "#     ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n",
+    "# text = \" \".join(df[\"sentence\"].values.tolist())\n",
+    "# vocab = list(sorted(set(text)))\n",
+    "\n",
+    "# for v in main_vocab:\n",
+    "#     if v not in vocab:\n",
+    "#         print(\"v\", v)\n",
+    "\n",
+    "# print(len(main_vocab), len(vocab))\n",
+    "# print(len(vocab), vocab)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import numpy as np\n",
+    "\n",
+    "\n",
+    "# idx = np.random.randint(0, len(df))\n",
+    "# # idx = 6140\n",
+    "# sample = df.iloc[idx]\n",
+    "# ipd.display(sample)\n",
+    "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
+    "# print()\n",
+    "# print(sample[\"prev_sentence\"])\n",
+    "# print(sample[\"sentence\"])\n",
+    "# print()\n",
+    "# load_audio(sample[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_train_df = train_df.copy()\n",
+    "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n",
+    "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
+    "# print(new_train_df.info())\n",
+    "# new_train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_test_df = test_df.copy()\n",
+    "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n",
+    "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n",
+    "# print(new_test_df.info())\n",
+    "# new_test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import shutil\n",
+    "# from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !mkdir -p {save_path}/clips\n",
+    "# !mkdir -p {save_path}/augs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n",
+    "#     shutil.copy(row[\"_path\"], row[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n",
+    "#     shutil.copy(row[\"_path\"], row[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # aug_train_df = new_train_df.copy()\n",
+    "# aug_train_df = new_train_df.sample(frac=0.1)\n",
+    "# aug_train_df = aug_train_df.reset_index(drop=True)\n",
+    "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n",
+    "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n",
+    "# print(aug_train_df.info())\n",
+    "# aug_train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print(aug_train_df.iloc[0][\"_path\"])\n",
+    "# print(aug_train_df.iloc[0][\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # augmentation\n",
+    "\n",
+    "# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n",
+    "# import numpy as np\n",
+    "# import soundfile as sf\n",
+    "\n",
+    "# augment = Compose([\n",
+    "# #     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+    "# #     PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n",
+    "# #     Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n",
+    "#     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n",
+    "#     TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n",
+    "#     PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n",
+    "# ])\n",
+    "\n",
+    "# def augmented_speech_file_to_array_fn(in_path, out_path):\n",
+    "#     speech_array, sampling_rate = torchaudio.load(in_path)\n",
+    "#     speech_array = speech_array.squeeze().numpy()\n",
+    "#     speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n",
+    "#     sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n",
+    "# #     augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n",
+    "# !ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n",
+    "# new_train_aug_df = new_train_df.copy()\n",
+    "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n",
+    "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n",
+    "# print(new_train_aug_df.info())\n",
+    "# new_train_aug_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+    "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n",
+    "# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_train_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# new_test_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pandas as pd\n",
+    "\n",
+    "# import os\n",
+    "# from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n",
+    "# print(train_df.info())\n",
+    "# train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n",
+    "# print(test_df.info())\n",
+    "# test_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# non_existed_train = []\n",
+    "\n",
+    "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n",
+    "#     if not os.path.exists(row[\"path\"]):\n",
+    "#         non_existed_train.append(index)\n",
+    "#         break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import numpy as np\n",
+    "\n",
+    "\n",
+    "# idx = np.random.randint(0, len(train_df))\n",
+    "# # idx = 6140\n",
+    "# sample = train_df.iloc[idx]\n",
+    "# ipd.display(sample)\n",
+    "# # print(sample.iloc[idx][\"prev_sentence\"])\n",
+    "# print()\n",
+    "# print(sample[\"prev_sentence\"])\n",
+    "# print(sample[\"sentence\"])\n",
+    "# print()\n",
+    "# load_audio(sample[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_df_half = train_df.copy()\n",
+    "# print(train_df_half.shape)\n",
+    "# train_df_half = train_df_half.dropna()\n",
+    "# print(train_df_half.shape)\n",
+    "# train_df_half = train_df_half.drop_duplicates()\n",
+    "# print(train_df_half.shape)\n",
+    "\n",
+    "# train_df_half = train_df_half.sample(frac=0.5)\n",
+    "# train_df_half = train_df_half.reset_index(drop=True)\n",
+    "# print(train_df_half.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "transformers",
+   "name": "transformers"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  },
+  "orig_nbformat": 2
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

notes/fa.tar.gz DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9f3c53202d7d12dfe973604737fc11b0a50c9c94b85c4cae70fcc693fe2babb4
-size 7020110

src/fine-tuning/__init__.py ADDED Viewed

File without changes

src/{dictionary.py → fine-tuning/dictionary.py} RENAMED Viewed

File without changes

src/{normalizer.py → fine-tuning/normalizer.py} RENAMED Viewed

File without changes