{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Hugging Face Datasets Library\n", "\n", "You can find the names of the datasets provided by the GLUE benchmark in videos 22 and 23, and in the documentation:\n", "\n", "https://huggingface.co/docs/datasets/glue.html\n", "\n", "`mrpc` is one of the datasets provided by this benchmark; it is used to test paraphrase detection." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 3668\n", " })\n", " validation: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 408\n", " })\n", " test: Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 1725\n", " })\n", "})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", "# Load the dataset\n", "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n", "raw_datasets\n", "\n", "# The output is a DatasetDict object, which contains each split of the Dataset." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['sentence1', 'sentence2', 'label', 'idx'],\n", " num_rows: 3668\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Access each split by indexing the DatasetDict with the split name:\n", "raw_datasets['train']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# The number of training examples is shown as num_rows: 3668 in the output above." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " 'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n", " 'label': 1,\n", " 'idx': 0}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Access a given element by its index:\n", "raw_datasets['train'][0]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'sentence1': ['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n", " 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',\n", " 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',\n", " 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'],\n", " 'sentence2': ['Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n", " \"Yucaipa bought Dominick 's in 1995 
for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .\",\n", " \"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .\",\n", " 'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .',\n", " 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .'],\n", " 'label': [1, 0, 1, 0, 1],\n", " 'idx': [0, 1, 2, 3, 4]}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Access a slice of your dataset:\n", "raw_datasets['train'][:5]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Access the sentence1 of the first element:\n", "raw_datasets['train'][0]['sentence1']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n", " \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n", " 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',\n", " 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',\n", " 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Access the first 5 sentences of sentence1\n", "raw_datasets['train'][:5]['sentence1']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"{'sentence1': Value(dtype='string', id=None),\n", " 'sentence2': Value(dtype='string', id=None),\n", " 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),\n", " 'idx': Value(dtype='int32', id=None)}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use the features attribute to see the information your dataset contains:\n", "raw_datasets['train'].features" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Remember features are the input variables to your model.'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''Remember features are the input variables to your model.'''" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "390d148b78f84283b5c3273c08fca389", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/3668 [00:00