khalid99ml
/

Banglish_translation

Model card Files Files and versions

xet

Community

khalid99ml commited on Dec 21, 2024

Commit

1623255

verified ·

1 Parent(s): 4f950e5

done

Browse files

Files changed (1) hide show

challange1.ipynb +237 -0

challange1.ipynb ADDED Viewed

	@@ -0,0 +1,237 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, DatasetDict\n",
+    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from huggingface_hub import notebook_login\n",
+    "import torch\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "619aead440a04dbd9eafa156e3713251",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "notebook_login()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = load_dataset(\"SKNahin/bengali-transliteration-data\")\n",
+    "\n",
+    "dataset = dataset[\"train\"].train_test_split(test_size=0.2, seed=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DatasetDict({\n",
+      "    train: Dataset({\n",
+      "        features: ['bn', 'rm'],\n",
+      "        num_rows: 4004\n",
+      "    })\n",
+      "    validation: Dataset({\n",
+      "        features: ['bn', 'rm'],\n",
+      "        num_rows: 1002\n",
+      "    })\n",
+      "})\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset = DatasetDict({\n",
+    "    \"train\": dataset[\"train\"],\n",
+    "    \"validation\": dataset[\"test\"]\n",
+    "})\n",
+    "print(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'MBartForConditionalGeneration' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m      1\u001b[0m model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfacebook/mbart-large-50-many-to-many-mmt\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mMBartForConditionalGeneration\u001b[49m\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfacebook/mbart-large-50-many-to-many-mmt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m      3\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m MBart50TokenizerFast\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfacebook/mbart-large-50-many-to-many-mmt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m      4\u001b[0m \u001b[38;5;66;03m#this process took huge time as it is very big compared to the internet bandwith, so the next steps could not be run until its finishing\u001b[39;00m\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'MBartForConditionalGeneration' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "model_name='facebook/mbart-large-50-many-to-many-mmt'\n",
+    "model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
+    "tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
+    "#this process took huge time as it is very big compared to the internet bandwith, and it is still running so the next steps could not be run until its finishing\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess_function(examples):\n",
+    "    inputs = examples[\"banglish\"]\n",
+    "    targets = examples[\"bangla\"]\n",
+    "    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=\"max_length\")\n",
+    "    with tokenizer.as_target_tokenizer():\n",
+    "        labels = tokenizer(targets, max_length=128, truncation=True, padding=\"max_length\")\n",
+    "    model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
+    "    return model_inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset[\"train\"].column_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_args = Seq2SeqTrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    learning_rate=2e-5,\n",
+    "    per_device_train_batch_size=16,\n",
+    "    per_device_eval_batch_size=16,\n",
+    "    weight_decay=0.01,\n",
+    "    save_total_limit=2,\n",
+    "    num_train_epochs=5,\n",
+    "    predict_with_generate=True,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_strategy=\"epoch\",\n",
+    "    save_strategy=\"epoch\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer = Seq2SeqTrainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_datasets[\"train\"],\n",
+    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    data_collator=data_collator\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#training start using the provided dataset for fine tuning\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save_pretrained(\"banglish-to-bangla-model\")\n",
+    "tokenizer.save_pretrained(\"banglish-to-bangla-model\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}