File size: 14,750 Bytes

17e2432

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hugging Face Datasets Library\n",
    " You can find the names of the datasets provided by the glue benchmark in the video 22,23\n",
    " \n",
    "  https://huggingface.co/docs/datasets/glue.html\n",
    "\n",
    "mrpc is one of the datasets provided by this benchmark to test para-\n",
    "phrases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
       "        num_rows: 3668\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
       "        num_rows: 408\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
       "        num_rows: 1725\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load the dataset\n",
    "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
    "raw_datasets\n",
    "\n",
    "# The output is a DatasetDict object, which contains each split of the Dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
       "    num_rows: 3668\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Connect to each split by indexing!\n",
    "raw_datasets['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can see the number of training examples in the dataset as num_rows: 3668 in the above output~"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n",
       " 'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n",
       " 'label': 1,\n",
       " 'idx': 0}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Access a given element by it's index:\n",
    "raw_datasets['train'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence1': ['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n",
       "  \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n",
       "  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',\n",
       "  'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',\n",
       "  'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'],\n",
       " 'sentence2': ['Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n",
       "  \"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .\",\n",
       "  \"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .\",\n",
       "  'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .',\n",
       "  'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .'],\n",
       " 'label': [1, 0, 1, 0, 1],\n",
       " 'idx': [0, 1, 2, 3, 4]}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Access a slice of your dataset:\n",
    "raw_datasets['train'][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Access the sentence1 of the first element:\n",
    "raw_datasets['train'][0]['sentence1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n",
       " \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n",
       " 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',\n",
       " 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',\n",
       " 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Access the first 5 sentences of sentence1\n",
    "raw_datasets['train'][:5]['sentence1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence1': Value(dtype='string', id=None),\n",
       " 'sentence2': Value(dtype='string', id=None),\n",
       " 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),\n",
       " 'idx': Value(dtype='int32', id=None)}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use the features attribute to see the information your dataset contains:\n",
    "raw_datasets['train'].features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Remember features are the input variables to your model.'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''Remember features are the input variables to your model.'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "390d148b78f84283b5c3273c08fca389",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/3668 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4be5e99804ce4588aefb566219523f97",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/408 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1bbbf398ada1455bb9b726a462e4b7e5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1725 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}\n"
     ]
    }
   ],
   "source": [
    "# To preprocess all the elements of the dataset we need to tokenize them!\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-cased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     padding='max_length',\n",
    "                     truncation=True,\n",
    "                     max_length=128)\n",
    "tokenized_datasets = raw_datasets.map(tokenize_function)\n",
    "print(tokenized_datasets.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'To speed up tokenization, the map method uses multiprocessing.\\nYou could also set the batched=True'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''To speed up tokenization, the map method uses multiprocessing.\n",
    "You could also set the batched=True'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "29966e62f612498d8d2d4c54d40467e3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/408 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 3668\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 408\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1725\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-cased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     padding='max_length',\n",
    "                     truncation=True,\n",
    "                     max_length=128)\n",
    "    \n",
    "from datasets import load_dataset\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\",\"mrpc\") \n",
    "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Once done, we are almost ready for training!\\nJust remove the columns we don't need anymore by the remove columns function\\nRename the column label to labels\\nand use the format torch\""
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''Once done, we are almost ready for training!\n",
    "Just remove the columns we don't need anymore by the remove columns function\n",
    "Rename the column label to labels\n",
    "and use the format torch'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "    num_rows: 3668\n",
       "})"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_datasets = tokenized_datasets.remove_columns([\"idx\",\"sentence1\",\"sentence2\"])\n",
    "tokenized_datasets = tokenized_datasets.rename_column(\"label\",\"labels\")\n",
    "tokenized_datasets = tokenized_datasets.with_format(\"torch\") # The format could be torch/tensorflow/numpy~\n",
    "tokenized_datasets[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'If needed we can also generate a short sample of the dataset using the select method!'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''If needed we can also generate a short sample of the dataset using the select method!'''\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "    num_rows: 10\n",
       "})"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "small_train_dataset = tokenized_datasets[\"train\"].select(range(10))\n",
    "small_train_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''As you can see, the number of training examples has now reduced to 10'''"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}