Upload RoBERTa_Fine_Tuning_Emotion_classification.ipynb

Browse files

Files changed (1) hide show

RoBERTa_Fine_Tuning_Emotion_classification.ipynb +1612 -0

RoBERTa_Fine_Tuning_Emotion_classification.ipynb ADDED Viewed

	@@ -0,0 +1,1612 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "GPU",
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "f848095d186b49e08417c293b642faed": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_4f6eac487752459b82e2a5ea7d5902c8",
+              "IPY_MODEL_f3a2348c535a47878bca775a1f5d50d5",
+              "IPY_MODEL_800b720695984617856bd1b4ec7a180c"
+            ],
+            "layout": "IPY_MODEL_a7911b3fad6a4db9b891e406745bcc19"
+          }
+        },
+        "4f6eac487752459b82e2a5ea7d5902c8": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_c03b91347c1d43dc81d1c277c9b0ac0a",
+            "placeholder": "",
+            "style": "IPY_MODEL_2df14874354f4483a63532dae109082e",
+            "value": "model.safetensors: 100%"
+          }
+        },
+        "f3a2348c535a47878bca775a1f5d50d5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_f2e2a2f73c724d77bfd0dd01c574d192",
+            "max": 331055963,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_b9ac9418f0474c33a1f40e0e86a8fe74",
+            "value": 331055963
+          }
+        },
+        "800b720695984617856bd1b4ec7a180c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_d44a2fd4cf724ef6a67662e69a626eee",
+            "placeholder": "",
+            "style": "IPY_MODEL_131a2ae47ff14ad38cc60f7434c76bfd",
+            "value": " 331M/331M [00:01&lt;00:00, 228MB/s]"
+          }
+        },
+        "a7911b3fad6a4db9b891e406745bcc19": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "c03b91347c1d43dc81d1c277c9b0ac0a": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "2df14874354f4483a63532dae109082e": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "f2e2a2f73c724d77bfd0dd01c574d192": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "b9ac9418f0474c33a1f40e0e86a8fe74": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "d44a2fd4cf724ef6a67662e69a626eee": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "131a2ae47ff14ad38cc60f7434c76bfd": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        }
+      }
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Wj6eoKzotv5I"
+      },
+      "source": [
+        "## Emotion Classification using Fine-tuned BERT model\n",
+        "\n",
+        "In this tutorial, I will show how to fine-tune a language model (LM) for emotion classification with code adapted from this [tutorial](https://zablo.net/blog/post/custom-classifier-on-bert-model-guide-polemo2-sentiment-analysis/) by MARCIN ZABŁOCKI. I adapted his tutorial and modified the code to suit the emotion classification task using a different BERT model. Please refer to his tutorial for more detailed explanations for each code block. I really liked his tutorial because of the attention to detail and the use of high-level libraries to take care of certain parts of the model such as training and finding a good learning rate.\n",
+        "\n",
+        "Before you get started, make sure to enable `GPU` in the runtime and be sure to\n",
+        "restart the runtime in this environment after installing the `pytorch-lr-finder` library.\n",
+        "\n",
+        "This tutorial is a rough draft, so if you find any issues with it or have any further questions, reach out to me via [Twitter](https://twitter.com/omarsar0).\n",
+        "\n",
+        "Note that the notebook was created a little while back, so if something breaks it is likely because the code is no longer compatible with newer versions of the libraries.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "G2tokZqttmTA"
+      },
+      "source": [
+        "%%capture\n",
+        "!pip install transformers tokenizers pytorch-lightning"
+      ],
+      "execution_count": 10,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Note: you need to Restart runtime after running this code segment"
+      ],
+      "metadata": {
+        "id": "I0jZnNegGhZj"
+      }
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "k9ZKIIGvuW5m"
+      },
+      "source": [
+        "%%capture\n",
+        "!git clone https://github.com/davidtvs/pytorch-lr-finder.git && cd pytorch-lr-finder && python setup.py install"
+      ],
+      "execution_count": 11,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "qqRRWe4UuuIh",
+        "outputId": "a12be031-4bc9-404e-e741-9d4710b57683",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        }
+      },
+      "source": [
+        "import logging\n",
+        "import os\n",
+        "from argparse import Namespace\n",
+        "from functools import lru_cache\n",
+        "from typing import List\n",
+        "\n",
+        "import pandas as pd\n",
+        "import pytorch_lightning as pl\n",
+        "import torch\n",
+        "import torch.nn.functional as F\n",
+        "from sklearn.metrics import classification_report\n",
+        "from tokenizers import ByteLevelBPETokenizer\n",
+        "from tokenizers.processors import BertProcessing\n",
+        "from torch import nn\n",
+        "from torch.utils.data import DataLoader, Dataset\n",
+        "from transformers import DistilBertTokenizer, AutoTokenizer, AutoModelWithLMHead, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n",
+        "from transformers import AutoModelForMaskedLM\n",
+        "torch.__version__"
+      ],
+      "execution_count": 12,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "'2.2.1+cu121'"
+            ],
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            }
+          },
+          "metadata": {},
+          "execution_count": 12
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_whSBDujRiga"
+      },
+      "source": [
+        "## Load the Pretrained Language Model\n",
+        "We are first going to look at a pretrained language model provided by HuggingFace. We will use a variant of BERT called DistilRoBERTa base. The `base` model has fewer parameters than the `large` model.\n",
+        "\n",
+        "[RoBERTa](https://arxiv.org/abs/1907.11692) is a variant of BERT which \"*modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates*\".\n",
+        "\n",
+        "Knowledge distillation helps to train smaller LMs with similar performance and potential."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BvHNcMckSR4M"
+      },
+      "source": [
+        "First, let's load the tokenizer for this model:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "BPbTd5lmuzQn"
+      },
+      "source": [
+        "tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')"
+      ],
+      "execution_count": 13,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7KAbKMqJSWRo"
+      },
+      "source": [
+        "Now let's load the actual model with the LM head that takes care of the prediction for the LM. When fine-tuning we don't use the head and instead use the base model. The code below shows how to do this:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "PCXYlMydzQlP",
+        "outputId": "2845314c-bfcb-47a5-9e83-fea79a4c4409",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 158,
+          "referenced_widgets": [
+            "f848095d186b49e08417c293b642faed",
+            "4f6eac487752459b82e2a5ea7d5902c8",
+            "f3a2348c535a47878bca775a1f5d50d5",
+            "800b720695984617856bd1b4ec7a180c",
+            "a7911b3fad6a4db9b891e406745bcc19",
+            "c03b91347c1d43dc81d1c277c9b0ac0a",
+            "2df14874354f4483a63532dae109082e",
+            "f2e2a2f73c724d77bfd0dd01c574d192",
+            "b9ac9418f0474c33a1f40e0e86a8fe74",
+            "d44a2fd4cf724ef6a67662e69a626eee",
+            "131a2ae47ff14ad38cc60f7434c76bfd"
+          ]
+        }
+      },
+      "source": [
+        "# AutoModelWithLMHead is deprecated. distilroberta-base is a masked LM, so it is\n",
+        "# loaded through AutoModelForMaskedLM, which resolves to the same RobertaForMaskedLM class.\n",
+        "model = AutoModelForMaskedLM.from_pretrained(\"distilroberta-base\")\n",
+        "# Keep only the encoder (the model without its LM head) for fine-tuning.\n",
+        "base_model = model.base_model"
+      ],
+      "execution_count": 14,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/modeling_auto.py:1595: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
+            "  warnings.warn(\n"
+          ]
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]"
+            ],
+            "application/vnd.jupyter.widget-view+json": {
+              "version_major": 2,
+              "version_minor": 0,
+              "model_id": "f848095d186b49e08417c293b642faed"
+            }
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+            "- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "K2_8S8BXSpNa"
+      },
+      "source": [
+        "Let's now try out the tokenizer first:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "5fidSmH-zrY_",
+        "outputId": "b396329f-341c-40c5-9294-7e4019f7adf7",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "text = \"Elvis is the king of rock!\"\n",
+        "enc = tokenizer.encode_plus(text)\n",
+        "enc.keys()"
+      ],
+      "execution_count": 15,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "dict_keys(['input_ids', 'attention_mask'])"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 15
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "m8F8yQCDTDQi",
+        "outputId": "cc768922-4463-472d-bbfd-fda843517f48",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "print(enc)"
+      ],
+      "execution_count": 16,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "{'input_ids': [0, 9682, 9578, 16, 5, 8453, 9, 3152, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "P3wSCLKW0ndh"
+      },
+      "source": [
+        "`input_ids` are the numerical encoding of the tokens in the vocabulary. `attention_mask` is an additional option used when batching sequences together and you want to tell the model which tokens should be attended to ([read more](https://huggingface.co/transformers/glossary.html#attention-mask)). The attention mask information helps when dealing with variance in the size of sequences and we need a way to tell the model that we don't want to attend to the padded indices of the sequence.\n",
+        "\n",
+        "We are only using `input_ids` and `attention_mask`\n",
+        "\n",
+        "We need to also unsqueeze to simulate batch processing\n",
+        "\n",
+        "Using DistilBertForSequenceClassification: https://huggingface.co/transformers/model_doc/distilbert.html#distilbertforsequenceclassification"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Mxsts4uT0PgA",
+        "outputId": "78dcf59f-cd7b-4d4e-8bf3-e807a9f35dbe",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "out = base_model(torch.tensor(enc[\"input_ids\"]).unsqueeze(0), torch.tensor(enc[\"attention_mask\"]).unsqueeze(0))\n",
+        "out[0].shape"
+      ],
+      "execution_count": 17,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "torch.Size([1, 10, 768])"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 17
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ZiCO-n_1AHIf",
+        "outputId": "b8498d89-c107-4077-f5c3-37c0a19ef89b",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "## size of representation of one of the tokens\n",
+        "out[0][:,0,:].shape"
+      ],
+      "execution_count": 18,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "torch.Size([1, 768])"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 18
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "srwIb9nr4g4t"
+      },
+      "source": [
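+        "`torch.Size([1, 10, 768])` represents batch_size, number of tokens in the input text (length of the tokenized text), and the model's output hidden size. Selecting a single token, as in `out[0][:,0,:]`, leaves `torch.Size([1, 768])`: batch_size and the model's output hidden size."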
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "iAsg0H6g53Bf",
+        "outputId": "1892e9cd-fd84-4978-8dd2-037d21e3dfb8",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "t = \"Elvis is the king of rock\"\n",
+        "enc = tokenizer.encode_plus(t)\n",
+        "token_representations = base_model(torch.tensor(enc[\"input_ids\"]).unsqueeze(0))[0][0]\n",
+        "print(enc[\"input_ids\"])\n",
+        "print(tokenizer.decode(enc[\"input_ids\"]))\n",
+        "print(f\"Length: {len(enc['input_ids'])}\")\n",
+        "print(token_representations.shape)"
+      ],
+      "execution_count": 19,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[0, 9682, 9578, 16, 5, 8453, 9, 3152, 2]\n",
+            "<s>Elvis is the king of rock</s>\n",
+            "Length: 9\n",
+            "torch.Size([9, 768])\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9RFifOoY7Hsc"
+      },
+      "source": [
+        "## Building Custom Classification head on top of LM base model"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vSUMm4Oq7nvR"
+      },
+      "source": [
+        "Use the Mish activation function, as proposed in the original tutorial."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "tCEDXLxq628O"
+      },
+      "source": [
+        "# from https://github.com/digantamisra98/Mish/blob/b5f006660ac0b4c46e2c6958ad0301d7f9c59651/Mish/Torch/mish.py\n",
+        "@torch.jit.script\n",
+        "def mish(input):\n",
+        "    return input * torch.tanh(F.softplus(input))\n",
+        "\n",
+        "class Mish(nn.Module):\n",
+        "    def forward(self, input):\n",
+        "        return mish(input)"
+      ],
+      "execution_count": 20,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "C6Ln6KWm74ku"
+      },
+      "source": [
+        "The model we will use to do the fine-tuning"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9VDRSRsc71H2"
+      },
+      "source": [
+        "class EmoModel(nn.Module):\n",
+        "    \"\"\"Emotion classifier: an encoder followed by a small feed-forward head.\n",
+        "\n",
+        "    Args:\n",
+        "        base_model: encoder, called as ``base_model(ids, attention_mask=mask)``.\n",
+        "        n_classes: number of logits produced by the head.\n",
+        "        base_model_output_size: feature size of the encoder output fed to the head.\n",
+        "        dropout: dropout probability applied before each linear layer of the head.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, base_model, n_classes, base_model_output_size=768, dropout=0.05):\n",
+        "        super().__init__()\n",
+        "        self.base_model = base_model\n",
+        "\n",
+        "        width = base_model_output_size\n",
+        "        head_layers = [\n",
+        "            nn.Dropout(dropout),\n",
+        "            nn.Linear(width, width),\n",
+        "            Mish(),\n",
+        "            nn.Dropout(dropout),\n",
+        "            nn.Linear(width, n_classes),\n",
+        "        ]\n",
+        "        self.classifier = nn.Sequential(*head_layers)\n",
+        "\n",
+        "        # Linear layers of the head: weights ~ Normal(0, 0.02), biases set to zero.\n",
+        "        linear_layers = [m for m in self.classifier if isinstance(m, nn.Linear)]\n",
+        "        for linear in linear_layers:\n",
+        "            nn.init.normal_(linear.weight, mean=0.0, std=0.02)\n",
+        "            if linear.bias is not None:\n",
+        "                nn.init.zeros_(linear.bias)\n",
+        "\n",
+        "    def forward(self, input_, *args):\n",
+        "        \"\"\"Return class logits for a ``(token_ids, attention_mask)`` pair.\n",
+        "\n",
+        "        Any extra positional arguments are accepted and ignored.\n",
+        "        \"\"\"\n",
+        "        token_ids, mask = input_\n",
+        "        encoder_out = self.base_model(token_ids, attention_mask=mask)\n",
+        "\n",
+        "        # Other pooling strategies (or an RNN) could be plugged in here.\n",
+        "\n",
+        "        # Use the representation at position 0, i.e. the <s> token.\n",
+        "        first_token_repr = encoder_out[0][:, 0, :]\n",
+        "        return self.classifier(first_token_repr)"
+      ],
+      "execution_count": 21,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "wjgME-3O8Yfo"
+      },
+      "source": [
+        "### Pretest the model with dummy text\n",
+        "We want to ensure that the model is returning the right information back."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Y6H9eF8A8XeV",
+        "outputId": "4bc9b2b2-9882-4218-b780-1af26e3b3969",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "# Load a fresh encoder through AutoModelForMaskedLM (AutoModelWithLMHead is deprecated)\n",
+        "# and wrap it in EmoModel with 3 output classes for this dummy-input check.\n",
+        "classifier = EmoModel(AutoModelForMaskedLM.from_pretrained(\"distilroberta-base\").base_model, 3)"
+      ],
+      "execution_count": 22,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/modeling_auto.py:1595: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
+            "  warnings.warn(\n",
+            "Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+            "- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-sjfHJ_L9iNH"
+      },
+      "source": [
+        "X = torch.tensor(enc[\"input_ids\"]).unsqueeze(0).to('cpu')\n",
+        "attn = torch.tensor(enc[\"attention_mask\"]).unsqueeze(0).to('cpu')"
+      ],
+      "execution_count": 23,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "o6QhCuEC-y2z",
+        "outputId": "eed26cf5-303f-4098-ef84-3d4ab47d6f37",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "classifier((X, attn))"
+      ],
+      "execution_count": 24,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "tensor([[-0.0993,  0.0813, -0.1939]], grad_fn=<AddmmBackward0>)"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 24
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "I-N7WSY7Cb7v"
+      },
+      "source": [
+        "## Prepare your dataset for fine-tuning"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "jDWkjaLV-5tj"
+      },
+      "source": [
+        "!mkdir -p tokenizer"
+      ],
+      "execution_count": 25,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "wMMm5Ye1Db-m",
+        "outputId": "2227ea88-5302-43eb-d876-9e4a772a391d",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "## save the pretrained tokenizer's files (vocab.json, merges.txt, ...) into ./tokenizer\n",
+        "tokenizer.save_pretrained(\"tokenizer\")"
+      ],
+      "execution_count": 26,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "('tokenizer/tokenizer_config.json',\n",
+              " 'tokenizer/special_tokens_map.json',\n",
+              " 'tokenizer/vocab.json',\n",
+              " 'tokenizer/merges.txt',\n",
+              " 'tokenizer/added_tokens.json',\n",
+              " 'tokenizer/tokenizer.json')"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 26
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "3FVtbmrzDkF8",
+        "outputId": "5d58c54e-5c35-4c79-e791-a1bc60d396e8",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "!ls tokenizer"
+      ],
+      "execution_count": 27,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "merges.txt  special_tokens_map.json  tokenizer_config.json  tokenizer.json  vocab.json\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BhTEgIaLEDRo"
+      },
+      "source": [
+        "Implement CollateFN using fast tokenizers.\n",
+        "This function basically takes care of proper tokenization and batches of sequences. This way you don't need to create your batches manually. Find out more about Tokenizers [here](https://github.com/huggingface/tokenizers/tree/master/bindings/python)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "3SCLBZsMDn4s"
+      },
+      "source": [
+        "class TokenizersCollateFn:\n",
+        "    \"\"\"Collate callable: turns a list of (text, label) pairs into padded id/mask tensors plus a label tensor.\"\"\"\n",
+        "\n",
+        "    def __init__(self, max_tokens=512):\n",
+        "        ## RoBERTa uses a byte-level BPE tokenizer (as GPT does); rebuild it from the saved files\n",
+        "        bpe = ByteLevelBPETokenizer(\"tokenizer/vocab.json\", \"tokenizer/merges.txt\")\n",
+        "        sep_pair = (\"</s>\", bpe.token_to_id(\"</s>\"))\n",
+        "        cls_pair = (\"<s>\", bpe.token_to_id(\"<s>\"))\n",
+        "        bpe._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)\n",
+        "        ## cut long inputs at max_tokens and pad every sequence to that same fixed length\n",
+        "        bpe.enable_truncation(max_tokens)\n",
+        "        bpe.enable_padding(length=max_tokens, pad_id=bpe.token_to_id(\"<pad>\"))\n",
+        "        self.tokenizer = bpe\n",
+        "\n",
+        "    def __call__(self, batch):\n",
+        "        \"\"\"Return ``((input_ids, attention_masks), labels)`` for one batch of (text, label) items.\"\"\"\n",
+        "        texts = [item[0] for item in batch]\n",
+        "        encodings = self.tokenizer.encode_batch(texts)\n",
+        "        id_rows, mask_rows = [], []\n",
+        "        for encoding in encodings:\n",
+        "            id_rows.append(encoding.ids)\n",
+        "            mask_rows.append(encoding.attention_mask)\n",
+        "        input_ids = torch.tensor(id_rows)\n",
+        "        attention = torch.tensor(mask_rows)\n",
+        "        targets = torch.tensor([item[1] for item in batch])\n",
+        "\n",
+        "        return (input_ids, attention), targets"
+      ],
+      "execution_count": 28,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4hu70Ng0Eqls"
+      },
+      "source": [
+        "## Getting the Data and Preview it\n",
+        "Below we are going to load the data and show you how to create the splits. However, we don't need to split the data manually because I have already created the splits and stored those files separately, which you can quickly download below:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "JZ3SoJH3fUsq",
+        "outputId": "45966756-4264-434d-a33d-ca6cc53aac6a",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "!wget https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt\n",
+        "!wget https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt\n",
+        "!wget https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt"
+      ],
+      "execution_count": 29,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2024-03-15 23:58:45--  https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt\n",
+            "Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212\n",
+            "Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.\n",
+            "HTTP request sent, awaiting response... 302 Found\n",
+            "Location: /s/raw/ikkqxfdbdec3fuj/test.txt [following]\n",
+            "--2024-03-15 23:58:45--  https://www.dropbox.com/s/raw/ikkqxfdbdec3fuj/test.txt\n",
+            "Reusing existing connection to www.dropbox.com:443.\n",
+            "HTTP request sent, awaiting response... 404 Not Found\n",
+            "2024-03-15 23:58:45 ERROR 404: Not Found.\n",
+            "\n",
+            "--2024-03-15 23:58:45--  https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt\n",
+            "Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212\n",
+            "Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.\n",
+            "HTTP request sent, awaiting response... 302 Found\n",
+            "Location: /s/raw/1pzkadrvffbqw6o/train.txt [following]\n",
+            "--2024-03-15 23:58:45--  https://www.dropbox.com/s/raw/1pzkadrvffbqw6o/train.txt\n",
+            "Reusing existing connection to www.dropbox.com:443.\n",
+            "HTTP request sent, awaiting response... 404 Not Found\n",
+            "2024-03-15 23:58:46 ERROR 404: Not Found.\n",
+            "\n",
+            "--2024-03-15 23:58:46--  https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt\n",
+            "Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212\n",
+            "Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.\n",
+            "HTTP request sent, awaiting response... 302 Found\n",
+            "Location: /s/raw/2mzialpsgf9k5l3/val.txt [following]\n",
+            "--2024-03-15 23:58:46--  https://www.dropbox.com/s/raw/2mzialpsgf9k5l3/val.txt\n",
+            "Reusing existing connection to www.dropbox.com:443.\n",
+            "HTTP request sent, awaiting response... 404 Not Found\n",
+            "2024-03-15 23:58:46 ERROR 404: Not Found.\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "r_03fxufWX_G"
+      },
+      "source": [
+        "## paths of the train / test / validation split files (one \"text;emotion\" pair per line)\n",
+        "## EXERCISE: Change these to the location (path) of your own files\n",
+        "\n",
+        "train_path = \"train.txt\"\n",
+        "test_path = \"test.txt\"\n",
+        "val_path = \"val.txt\"\n",
+        "\n",
+        "## emotion labels\n",
+        "label2int = {\n",
+        "  \"sadness\": 0,\n",
+        "  \"joy\": 1,\n",
+        "  \"love\": 2,\n",
+        "  \"anger\": 3,\n",
+        "  \"fear\": 4,\n",
+        "  \"surprise\": 5\n",
+        "}\n",
+        "\n",
+        "emotions = [ \"sadness\", \"joy\", \"love\", \"anger\", \"fear\", \"surprise\"]"
+      ],
+      "execution_count": 30,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### A Quick Look at the dataset\n",
+        "Below are a few code snippets to get a good idea of the dataset we are using here. You can skip this whole subsection if you like."
+      ],
+      "metadata": {
+        "id": "-FJ-wN1_zmkV"
+      }
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "t23zHggkEpc-",
+        "outputId": "3a9615d4-492f-4134-aaa4-43cf15234fb8",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "!wget https://www.dropbox.com/s/607ptdakxuh5i4s/merged_training.pkl"
+      ],
+      "execution_count": 31,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2024-03-15 23:58:46--  https://www.dropbox.com/s/607ptdakxuh5i4s/merged_training.pkl\n",
+            "Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212\n",
+            "Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.\n",
+            "HTTP request sent, awaiting response... 302 Found\n",
+            "Location: /s/raw/607ptdakxuh5i4s/merged_training.pkl [following]\n",
+            "--2024-03-15 23:58:46--  https://www.dropbox.com/s/raw/607ptdakxuh5i4s/merged_training.pkl\n",
+            "Reusing existing connection to www.dropbox.com:443.\n",
+            "HTTP request sent, awaiting response... 404 Not Found\n",
+            "2024-03-15 23:58:46 ERROR 404: Not Found.\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "PQrMSUTRF06B"
+      },
+      "source": [
+        "import pickle\n",
+        "\n",
+        "## helper function\n",
+        "def load_from_pickle(directory):\n",
+        "    \"\"\"Unpickle and return the object stored in the file at ``directory``.\n",
+        "\n",
+        "    NOTE(review): unpickling can execute arbitrary code, so only load files\n",
+        "    that come from a trusted source.\n",
+        "    \"\"\"\n",
+        "    ## the context manager closes the file handle even if unpickling fails\n",
+        "    with open(directory, \"rb\") as handle:\n",
+        "        return pickle.load(handle)"
+      ],
+      "execution_count": 32,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XGz89mNSHaYM",
+        "outputId": "ca0ffab9-8002-43fe-8761-c4f98f495482",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 305
+        }
+      },
+      "source": [
+        "data = load_from_pickle(directory=\"merged_training.pkl\")\n",
+        "\n",
+        "## keep only the rows whose label is one of the emotions we classify\n",
+        "data = data[data[\"emotions\"].isin(emotions)]\n",
+        "\n",
+        "## draw a reproducible sample; cap n so that a dataset with fewer than 20000 rows does not raise ValueError\n",
+        "data = data.sample(n=min(20000, len(data)), random_state=42)\n",
+        "\n",
+        "data.emotions.value_counts().plot.bar()"
+      ],
+      "execution_count": 33,
+      "outputs": [
+        {
+          "output_type": "error",
+          "ename": "FileNotFoundError",
+          "evalue": "[Errno 2] No such file or directory: 'merged_training.pkl'",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+            "\u001b[0;32m<ipython-input-33-b230c266f99a>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"merged_training.pkl\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m## using a sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"emotions\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0memotions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+            "\u001b[0;32m<ipython-input-32-01bb35124bd3>\u001b[0m in \u001b[0;36mload_from_pickle\u001b[0;34m(directory)\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m## helper function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdirectory\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+            "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'merged_training.pkl'"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Comaf36-Hb6X"
+      },
+      "source": [
+        "data.count()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "jYxc8fx_H3ad"
+      },
+      "source": [
+        "The data has already been preprocessed using the technique from this paper: https://www.aclweb.org/anthology/D18-1404/"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "gYKK7ujRHfRt"
+      },
+      "source": [
+        "data.head()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "JXovcl56NFPp"
+      },
+      "source": [
+        "## reset index\n",
+        "data.reset_index(drop=True, inplace=True)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "pSzoz9InH0Ta"
+      },
+      "source": [
+        "## check unique emotions in the dataset\n",
+        "data.emotions.unique()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "rJm31gKShQus"
+      },
+      "source": [
+        "## Split the data and store into individual text files\n",
+        "\n",
+        "If you are using your own dataset and want to split it for training, you can uncomment the code below. Otherwise, just skip it."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "6ooNxSnPiztL"
+      },
+      "source": [
+        "## uncomment the code below to generate the text files for your train, val, and test datasets.\n",
+        "\n",
+        "'''\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "import numpy as np\n",
+        "\n",
+        "# Creating training and validation sets using an 80-20 split\n",
+        "input_train, input_val, target_train, target_val = train_test_split(data.text.to_numpy(),\n",
+        "                                                                    data.emotions.to_numpy(),\n",
+        "                                                                    test_size=0.2)\n",
+        "\n",
+        "# Split the validation set further to obtain a holdout dataset (for testing) -- split 50:50\n",
+        "input_val, input_test, target_val, target_test = train_test_split(input_val, target_val, test_size=0.5)\n",
+        "\n",
+        "\n",
+        "## create a dataframe for each dataset\n",
+        "train_dataset = pd.DataFrame(data={\"text\": input_train, \"class\": target_train})\n",
+        "val_dataset = pd.DataFrame(data={\"text\": input_val, \"class\": target_val})\n",
+        "test_dataset = pd.DataFrame(data={\"text\": input_test, \"class\": target_test})\n",
+        "final_dataset = {\"train\": train_dataset, \"val\": val_dataset , \"test\": test_dataset }\n",
+        "\n",
+        "## write each split to its own file (validation -> val_path, test -> test_path)\n",
+        "train_dataset.to_csv(train_path, sep=\";\",header=False, index=False)\n",
+        "val_dataset.to_csv(val_path, sep=\";\",header=False, index=False)\n",
+        "test_dataset.to_csv(test_path, sep=\";\",header=False, index=False)\n",
+        "'''"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "rAD1J6c0dLp8"
+      },
+      "source": [
+        "## Create the Dataset object"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "aOOI69vwIYcN"
+      },
+      "source": [
+        "Create the Dataset object that will be used to load the different datasets."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Ktr6xeMuISin"
+      },
+      "source": [
+        "class EmoDataset(Dataset):\n",
+        "    \"\"\"Dataset over a ';'-separated text/emotion file, yielding (text, integer label) pairs.\"\"\"\n",
+        "\n",
+        "    def __init__(self, path):\n",
+        "        super().__init__()\n",
+        "        self.data_column = \"text\"\n",
+        "        self.class_column = \"class\"\n",
+        "        ## the file has no header row, so the column names are supplied here\n",
+        "        column_names = [self.data_column, self.class_column]\n",
+        "        self.data = pd.read_csv(path, sep=\";\", header=None, names=column_names, engine=\"python\")\n",
+        "\n",
+        "    def __getitem__(self, idx):\n",
+        "        \"\"\"Return the text at row ``idx`` and its emotion mapped through ``label2int``.\"\"\"\n",
+        "        text = self.data.loc[idx, self.data_column]\n",
+        "        emotion = self.data.loc[idx, self.class_column]\n",
+        "        return text, label2int[emotion]\n",
+        "\n",
+        "    def __len__(self):\n",
+        "        \"\"\"Number of rows read from the file.\"\"\"\n",
+        "        return len(self.data)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9EYQRq3qJH7n"
+      },
+      "source": [
+        "Sanity check"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "uGWw4wGEJGhJ"
+      },
+      "source": [
+        "ds = EmoDataset(train_path)\n",
+        "ds[19]"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0h6tTn9hd6v8"
+      },
+      "source": [
+        "## Training with PyTorchLightning\n",
+        "\n",
+        "[PyTorchLightning](https://www.pytorchlightning.ai/) is a library that abstracts the complexity of training neural networks with PyTorch. It is built on top of PyTorch and simplifies training.\n",
+        "\n",
+        "![](https://pytorch-lightning.readthedocs.io/en/latest/_images/pt_to_pl.png)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "RJHhNRcZK7sV"
+      },
+      "source": [
+        "## Methods required by PyTorchLightning\n",
+        "\n",
+        "class TrainingModule(pl.LightningModule):\n",
+        "    \"\"\"Lightning wrapper that fine-tunes an EmoModel built on the distilroberta-base encoder.\"\"\"\n",
+        "\n",
+        "    def __init__(self, hparams):\n",
+        "        \"\"\"Build the classifier and loss, and copy the fields of the ``hparams`` Namespace into ``self.hparams``.\"\"\"\n",
+        "        super().__init__()\n",
+        "        ## local import: transformers raises a FutureWarning for the deprecated `AutoModelWithLMHead`\n",
+        "        ## and recommends `AutoModelForMaskedLM` for masked language models such as RoBERTa\n",
+        "        from transformers import AutoModelForMaskedLM\n",
+        "        encoder = AutoModelForMaskedLM.from_pretrained(\"distilroberta-base\").base_model\n",
+        "        self.model = EmoModel(encoder, len(emotions))\n",
+        "        self.loss = nn.CrossEntropyLoss() ## combines LogSoftmax() and NLLLoss()\n",
+        "        self.hparams.update(vars(hparams))\n",
+        "        ## cache for total_steps(); filled on first use\n",
+        "        self._total_steps = None\n",
+        "\n",
+        "    def step(self, batch, step_name=\"train\"):\n",
+        "        \"\"\"Compute the loss for one ``(X, y)`` batch.\n",
+        "\n",
+        "        The loss is returned under ``loss`` for training and ``<step_name>_loss``\n",
+        "        otherwise, together with ``log`` and ``progress_bar`` entries.\n",
+        "        \"\"\"\n",
+        "        X, y = batch\n",
+        "        loss = self.loss(self.forward(X), y)\n",
+        "        loss_key = f\"{step_name}_loss\"\n",
+        "        tensorboard_logs = {loss_key: loss}\n",
+        "\n",
+        "        return { (\"loss\" if step_name == \"train\" else loss_key): loss, 'log': tensorboard_logs,\n",
+        "               \"progress_bar\": {loss_key: loss}}\n",
+        "\n",
+        "    def forward(self, X, *args):\n",
+        "        \"\"\"Delegate to the wrapped EmoModel.\"\"\"\n",
+        "        return self.model(X, *args)\n",
+        "\n",
+        "    def training_step(self, batch, batch_idx):\n",
+        "        return self.step(batch, \"train\")\n",
+        "\n",
+        "    def validation_step(self, batch, batch_idx):\n",
+        "        return self.step(batch, \"val\")\n",
+        "\n",
+        "    def validation_end(self, outputs: List[dict]):\n",
+        "        \"\"\"Average the per-batch ``val_loss`` values.\n",
+        "\n",
+        "        NOTE(review): recent Lightning releases use a differently named epoch-end hook;\n",
+        "        confirm this one is still invoked by the installed version.\n",
+        "        \"\"\"\n",
+        "        loss = torch.stack([x[\"val_loss\"] for x in outputs]).mean()\n",
+        "        return {\"val_loss\": loss}\n",
+        "\n",
+        "    def test_step(self, batch, batch_idx):\n",
+        "        return self.step(batch, \"test\")\n",
+        "\n",
+        "    def train_dataloader(self):\n",
+        "        return self.create_data_loader(self.hparams.train_path, shuffle=True)\n",
+        "\n",
+        "    def val_dataloader(self):\n",
+        "        return self.create_data_loader(self.hparams.val_path)\n",
+        "\n",
+        "    def test_dataloader(self):\n",
+        "        return self.create_data_loader(self.hparams.test_path)\n",
+        "\n",
+        "    def create_data_loader(self, ds_path: str, shuffle=False):\n",
+        "        \"\"\"Build a DataLoader over the file at ``ds_path``; batches are tokenized by TokenizersCollateFn.\"\"\"\n",
+        "        return DataLoader(\n",
+        "                    EmoDataset(ds_path),\n",
+        "                    batch_size=self.hparams.batch_size,\n",
+        "                    shuffle=shuffle,\n",
+        "                    collate_fn=TokenizersCollateFn()\n",
+        "        )\n",
+        "\n",
+        "    def total_steps(self):\n",
+        "        \"\"\"Number of optimizer steps for the whole run, computed once per instance.\"\"\"\n",
+        "        ## cached on the instance instead of with functools.lru_cache: a cache on the method\n",
+        "        ## would hold a reference to every module it was called on and keep it alive\n",
+        "        if self._total_steps is None:\n",
+        "            batches_per_epoch = len(self.train_dataloader())\n",
+        "            self._total_steps = batches_per_epoch // self.hparams.accumulate_grad_batches * self.hparams.epochs\n",
+        "        return self._total_steps\n",
+        "\n",
+        "    def configure_optimizers(self):\n",
+        "        \"\"\"Return AdamW plus a linear warmup/decay schedule that is stepped every optimizer step.\"\"\"\n",
+        "        ## use AdamW optimizer -- faster approach to training NNs\n",
+        "        ## read: https://www.fast.ai/2018/07/02/adam-weight-decay/\n",
+        "        optimizer = AdamW(self.model.parameters(), lr=self.hparams.lr)\n",
+        "        lr_scheduler = get_linear_schedule_with_warmup(\n",
+        "                    optimizer,\n",
+        "                    num_warmup_steps=self.hparams.warmup_steps,\n",
+        "                    num_training_steps=self.total_steps(),\n",
+        "        )\n",
+        "        return [optimizer], [{\"scheduler\": lr_scheduler, \"interval\": \"step\"}]"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "OGc7Vw1moHxr"
+      },
+      "source": [
+        "## Finding Learning rate for the model\n",
+        "\n",
+        "The code below runs a learning-rate range test to obtain valuable information about the optimal learning rate during a short preliminary run: determine a lower and an upper boundary and increase the learning rate linearly or exponentially between them.\n",
+        "\n",
+        "More: https://github.com/davidtvs/pytorch-lr-finder"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xL4lNPDFoFyU"
+      },
+      "source": [
+        "lr=0.1 ## upper bound LR\n",
+        "from torch_lr_finder import LRFinder\n",
+        "## temporary hyperparameters, used only to build the module for the range test\n",
+        "hparams_tmp = Namespace(\n",
+        "    train_path=train_path,\n",
+        "    val_path=val_path,\n",
+        "    test_path=test_path,\n",
+        "    batch_size=16,\n",
+        "    warmup_steps=100,\n",
+        "    epochs=1,\n",
+        "    lr=lr,\n",
+        "    accumulate_grad_batches=1,\n",
+        ")\n",
+        "module = TrainingModule(hparams_tmp)\n",
+        "criterion = nn.CrossEntropyLoss()\n",
+        "optimizer = AdamW(module.parameters(), lr=5e-7) ## lower bound LR\n",
+        "lr_finder = LRFinder(module, optimizer, criterion, device=\"cuda\")\n",
+        "## NOTE(review): the sweep below ends at end_lr=100, while `lr` above is only passed into hparams_tmp -- confirm which upper bound is intended\n",
+        "lr_finder.range_test(module.train_dataloader(), end_lr=100, num_iter=100, accumulation_steps=hparams_tmp.accumulate_grad_batches)\n",
+        "lr_finder.plot()\n",
+        "lr_finder.reset()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "YdqP56M1oXav"
+      },
+      "source": [
+        "lr = 1e-4\n",
+        "lr"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vMab6vu0Bow0"
+      },
+      "source": [
+        "lr_finder.plot(show_lr=lr)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZhHutCseBxjJ"
+      },
+      "source": [
+        "## Training the Emotion Classifier"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "q3FiLr3LBrjs"
+      },
+      "source": [
+        "hparams = Namespace(\n",
+        "    train_path=train_path,\n",
+        "    val_path=val_path,\n",
+        "    test_path=test_path,\n",
+        "    batch_size=32,\n",
+        "    warmup_steps=100,\n",
+        "    epochs=1,\n",
+        "    lr=lr,\n",
+        "    accumulate_grad_batches=1\n",
+        ")\n",
+        "module = TrainingModule(hparams)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "N8Jv_U25B37g"
+      },
+      "source": [
+        "## garbage collection\n",
+        "import gc; gc.collect()\n",
+        "torch.cuda.empty_cache()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "oRnl4HXvB5-T"
+      },
+      "source": [
+        "## train roughly for about 10-15 minutes with GPU enabled.\n",
+        "trainer = pl.Trainer(gpus=1, max_epochs=hparams.epochs, progress_bar_refresh_rate=10,\n",
+        "                     accumulate_grad_batches=hparams.accumulate_grad_batches)\n",
+        "\n",
+        "trainer.fit(module)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Y8kzE1AeB_ij"
+      },
+      "source": [
+        "## evaluate the fine-tuned model on the held-out test set\n",
+        "progress = [\"/\", \"-\", \"\\\\\", \"|\", \"/\", \"-\", \"\\\\\", \"|\"]\n",
+        "\n",
+        "## place the module explicitly: depending on the Lightning version it may no longer be on the GPU after fit()\n",
+        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+        "module.to(device)\n",
+        "module.eval()\n",
+        "\n",
+        "true_y, pred_y = [], []\n",
+        "with torch.no_grad():\n",
+        "    for i, ((X, attn), y) in enumerate(module.test_dataloader()):\n",
+        "        print(progress[i % len(progress)], end=\"\\r\")\n",
+        "        logits = module((X.to(device), attn.to(device)))\n",
+        "        ## store plain Python ints rather than 0-d tensors\n",
+        "        true_y.extend(y.tolist())\n",
+        "        pred_y.extend(torch.argmax(logits, dim=1).tolist())\n",
+        "\n",
+        "print(\"\\n\" + \"_\" * 80)\n",
+        "## `labels` keeps the rows aligned with `target_names` even if a class never occurs in the test set\n",
+        "## NOTE(review): `digits` is the number of decimals printed; tying it to len(emotions) looks accidental -- confirm\n",
+        "print(classification_report(true_y, pred_y, labels=list(label2int.values()),\n",
+        "                            target_names=list(label2int.keys()), digits=len(emotions)))"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "U0_Z_4Pkl3fc"
+      },
+      "source": [
+        "!nvidia-smi"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "ifER7sn-Htge"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}