Sathya77 committed
Commit f3382ce · verified · 1 Parent(s): a033cd6

Upload 9 files

SMS_Spam.csv ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "architectures": [
+ "BertForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "problem_type": "single_label_classification",
+ "torch_dtype": "float32",
+ "transformers_version": "4.55.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
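
This config describes a BERT-base encoder (12 layers, hidden size 768, cased vocabulary of 28,996 tokens) with a single-label classification head. As a minimal sketch, not part of the upload, the file can be inspected with transformers from a local checkout of this repository; the directory name used here is only an assumption:

from transformers import AutoConfig

# Hypothetical local path to a download of this repository.
config = AutoConfig.from_pretrained("./spam-ham-upload")
print(config.architectures)        # ['BertForSequenceClassification']
print(config.num_hidden_layers)    # 12
print(config.vocab_size)           # 28996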
dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "validation", "test"]}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:732fd6d40215d66e9ba0fdf1530db021a764ed1a839c4279e3266a54258a0f71
+ size 433270768
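
model.safetensors is stored as a Git LFS pointer: the oid is the SHA-256 of the actual ~433 MB weight file. A small sketch for checking a downloaded copy against the pointer; the local path is an assumption:

import hashlib

sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:          # the resolved LFS file, not the pointer
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
print(sha.hexdigest() == "732fd6d40215d66e9ba0fdf1530db021a764ed1a839c4279e3266a54258a0f71")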
spam-ham-classfication.ipynb ADDED
@@ -0,0 +1,1000 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "12349750",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "{'Label': ['ham', 'ham', 'ham'],\n",
13
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
14
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit & Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning & Have A Happy Day:)',\n",
15
+ " 'Kallis is ready for bat in 2nd innings']}"
16
+ ]
17
+ },
18
+ "execution_count": 1,
19
+ "metadata": {},
20
+ "output_type": "execute_result"
21
+ }
22
+ ],
23
+ "source": [
24
+ "from datasets import load_dataset\n",
25
+ "\n",
26
+ "data_files =\"E:/Hugging_Face/SMS_Spam.csv\"\n",
27
+ "spam_data = load_dataset(\"csv\", data_files = data_files, split = \"train\")\n",
28
+ "spam_data = spam_data.train_test_split(test_size = 0.2)\n",
29
+ "spam_data[\"train\"][:3]"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "id": "35f0392d",
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "data": {
40
+ "application/vnd.jupyter.widget-view+json": {
41
+ "model_id": "e6740059d6df4ea7aceaf262ef339c94",
42
+ "version_major": 2,
43
+ "version_minor": 0
44
+ },
45
+ "text/plain": [
46
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
47
+ ]
48
+ },
49
+ "metadata": {},
50
+ "output_type": "display_data"
51
+ },
52
+ {
53
+ "data": {
54
+ "application/vnd.jupyter.widget-view+json": {
55
+ "model_id": "c2d8fd5629eb4e6aa0c91866c3ee2562",
56
+ "version_major": 2,
57
+ "version_minor": 0
58
+ },
59
+ "text/plain": [
60
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
61
+ ]
62
+ },
63
+ "metadata": {},
64
+ "output_type": "display_data"
65
+ },
66
+ {
67
+ "data": {
68
+ "text/plain": [
69
+ "DatasetDict({\n",
70
+ " train: Dataset({\n",
71
+ " features: ['Label', 'Sentence'],\n",
72
+ " num_rows: 4459\n",
73
+ " })\n",
74
+ " test: Dataset({\n",
75
+ " features: ['Label', 'Sentence'],\n",
76
+ " num_rows: 1115\n",
77
+ " })\n",
78
+ "})"
79
+ ]
80
+ },
81
+ "execution_count": 2,
82
+ "metadata": {},
83
+ "output_type": "execute_result"
84
+ }
85
+ ],
86
+ "source": [
87
+ "def lower_case(example):\n",
88
+ " return {\"Sentence\": example[\"Sentence\"].lower()}\n",
89
+ "\n",
90
+ "spam_data.map(lower_case)"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 3,
96
+ "id": "9df36294",
97
+ "metadata": {},
98
+ "outputs": [
99
+ {
100
+ "data": {
101
+ "application/vnd.jupyter.widget-view+json": {
102
+ "model_id": "1d4f5f516b024a459dba03cb2b5e764b",
103
+ "version_major": 2,
104
+ "version_minor": 0
105
+ },
106
+ "text/plain": [
107
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
108
+ ]
109
+ },
110
+ "metadata": {},
111
+ "output_type": "display_data"
112
+ },
113
+ {
114
+ "data": {
115
+ "application/vnd.jupyter.widget-view+json": {
116
+ "model_id": "a64af11c2cde4ef1b56a49b4ffb6b200",
117
+ "version_major": 2,
118
+ "version_minor": 0
119
+ },
120
+ "text/plain": [
121
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
122
+ ]
123
+ },
124
+ "metadata": {},
125
+ "output_type": "display_data"
126
+ }
127
+ ],
128
+ "source": [
129
+ "def sen_len(example):\n",
130
+ " return {\"length\": len(example[\"Sentence\"].split())}\n",
131
+ "\n",
132
+ "spam_data = spam_data.map(sen_len)"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 4,
138
+ "id": "db1d8406",
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "{'Label': ['ham', 'ham', 'ham'],\n",
145
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
146
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
147
+ " 'Kallis is ready for bat in 2nd innings'],\n",
148
+ " 'length': [11, 29, 8]}"
149
+ ]
150
+ },
151
+ "execution_count": 4,
152
+ "metadata": {},
153
+ "output_type": "execute_result"
154
+ }
155
+ ],
156
+ "source": [
157
+ "spam_data[\"train\"][:3]"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 8,
163
+ "id": "3e742939",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "spam_data = spam_data.rename_column(\"Label\", \"labels\")"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 9,
173
+ "id": "a1d7c214",
174
+ "metadata": {},
175
+ "outputs": [
176
+ {
177
+ "data": {
178
+ "application/vnd.jupyter.widget-view+json": {
179
+ "model_id": "ae1b7a15bd7e46e5aa763483d877ac5b",
180
+ "version_major": 2,
181
+ "version_minor": 0
182
+ },
183
+ "text/plain": [
184
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
185
+ ]
186
+ },
187
+ "metadata": {},
188
+ "output_type": "display_data"
189
+ },
190
+ {
191
+ "data": {
192
+ "application/vnd.jupyter.widget-view+json": {
193
+ "model_id": "e33b493b97754c588a4847d069773fc3",
194
+ "version_major": 2,
195
+ "version_minor": 0
196
+ },
197
+ "text/plain": [
198
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
199
+ ]
200
+ },
201
+ "metadata": {},
202
+ "output_type": "display_data"
203
+ }
204
+ ],
205
+ "source": [
206
+ "import html\n",
207
+ "\n",
208
+ "spam_data = spam_data.map(lambda x: {\"Sentence\": html.unescape(x[\"Sentence\"])}, batched = True)"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 10,
214
+ "id": "8fa3f455",
215
+ "metadata": {},
216
+ "outputs": [
217
+ {
218
+ "data": {
219
+ "text/plain": [
220
+ "{'labels': ['ham',\n",
221
+ " 'ham',\n",
222
+ " 'ham',\n",
223
+ " 'ham',\n",
224
+ " 'ham',\n",
225
+ " 'ham',\n",
226
+ " 'ham',\n",
227
+ " 'ham',\n",
228
+ " 'ham',\n",
229
+ " 'ham',\n",
230
+ " 'ham',\n",
231
+ " 'ham',\n",
232
+ " 'spam',\n",
233
+ " 'ham',\n",
234
+ " 'ham',\n",
235
+ " 'ham',\n",
236
+ " 'ham',\n",
237
+ " 'spam',\n",
238
+ " 'ham',\n",
239
+ " 'ham'],\n",
240
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
241
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
242
+ " 'Kallis is ready for bat in 2nd innings',\n",
243
+ " 'Gud mrng dear hav a nice day',\n",
244
+ " 'I not free today i haf 2 pick my parents up tonite...',\n",
245
+ " 'Good afternoon on this glorious anniversary day, my sweet J !! I hope this finds you happy and content, my Prey. I think of you and send a teasing kiss from across the sea coaxing images of fond souveniers ... You Cougar-Pen',\n",
246
+ " 'SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.',\n",
247
+ " 'Haha awesome, I might need to take you up on that, what you doin tonight?',\n",
248
+ " 'Ok...',\n",
249
+ " 'I am sorry it hurt you.',\n",
250
+ " 'Watching cartoon, listening music &amp; at eve had to go temple &amp; church.. What about u?',\n",
251
+ " 'Sent me de webadres for geting salary slip',\n",
252
+ " 'Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX',\n",
253
+ " \"I want snow. It's just freezing and windy.\",\n",
254
+ " ', im .. On the snowboarding trip. I was wondering if your planning to get everyone together befor we go..a meet and greet kind of affair? Cheers, ',\n",
255
+ " 'Siva is in hostel aha:-.',\n",
256
+ " 'CHEERS LOU! YEAH WAS A GOODNITE SHAME U NEVA CAME! C YA GAILxx',\n",
257
+ " 'URGENT! Your Mobile number has been awarded with a £2000 prize GUARANTEED. Call 09061790126 from land line. Claim 3030. Valid 12hrs only 150ppm',\n",
258
+ " 'Did u got that persons story',\n",
259
+ " 'Amazing : If you rearrange these letters it gives the same meaning... Dormitory = Dirty room Astronomer = Moon starer The eyes = They see Election results = Lies lets recount Mother-in-law = Woman Hitler Eleven plus two =Twelve plus one Its Amazing... !:-)'],\n",
260
+ " 'length': [11,\n",
261
+ " 29,\n",
262
+ " 8,\n",
263
+ " 7,\n",
264
+ " 12,\n",
265
+ " 42,\n",
266
+ " 8,\n",
267
+ " 15,\n",
268
+ " 1,\n",
269
+ " 6,\n",
270
+ " 16,\n",
271
+ " 8,\n",
272
+ " 22,\n",
273
+ " 8,\n",
274
+ " 27,\n",
275
+ " 5,\n",
276
+ " 13,\n",
277
+ " 23,\n",
278
+ " 6,\n",
279
+ " 44]}"
280
+ ]
281
+ },
282
+ "execution_count": 10,
283
+ "metadata": {},
284
+ "output_type": "execute_result"
285
+ }
286
+ ],
287
+ "source": [
288
+ "spam_data[\"train\"][:20]"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 13,
294
+ "id": "b59be7ac",
295
+ "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "data": {
299
+ "application/vnd.jupyter.widget-view+json": {
300
+ "model_id": "54748e89b52c45f5af2c1d96e6e6f91e",
301
+ "version_major": 2,
302
+ "version_minor": 0
303
+ },
304
+ "text/plain": [
305
+ "Casting the dataset: 0%| | 0/4459 [00:00<?, ? examples/s]"
306
+ ]
307
+ },
308
+ "metadata": {},
309
+ "output_type": "display_data"
310
+ },
311
+ {
312
+ "data": {
313
+ "application/vnd.jupyter.widget-view+json": {
314
+ "model_id": "bae9e8f5c4c84a19aa01e4bb2d65080e",
315
+ "version_major": 2,
316
+ "version_minor": 0
317
+ },
318
+ "text/plain": [
319
+ "Casting the dataset: 0%| | 0/1115 [00:00<?, ? examples/s]"
320
+ ]
321
+ },
322
+ "metadata": {},
323
+ "output_type": "display_data"
324
+ },
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ "{'labels': ClassLabel(names=['ham', 'spam']), 'Sentence': Value('string'), 'length': Value('int64')}\n"
330
+ ]
331
+ }
332
+ ],
333
+ "source": [
334
+ "from datasets import load_dataset, ClassLabel\n",
335
+ "\n",
336
+ "spam_data = spam_data.cast_column(\n",
337
+ " \"labels\", ClassLabel(names=[\"ham\", \"spam\"])\n",
338
+ ")\n",
339
+ "\n",
340
+ "print(spam_data[\"train\"].features)\n"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 14,
346
+ "id": "b8a087d1",
347
+ "metadata": {},
348
+ "outputs": [
349
+ {
350
+ "data": {
351
+ "text/plain": [
352
+ "{'labels': [0, 0, 0],\n",
353
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
354
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
355
+ " 'Kallis is ready for bat in 2nd innings'],\n",
356
+ " 'length': [11, 29, 8]}"
357
+ ]
358
+ },
359
+ "execution_count": 14,
360
+ "metadata": {},
361
+ "output_type": "execute_result"
362
+ }
363
+ ],
364
+ "source": [
365
+ "spam_data[\"train\"][:3]"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 15,
371
+ "id": "eae6b9a7",
372
+ "metadata": {},
373
+ "outputs": [
374
+ {
375
+ "data": {
376
+ "application/vnd.jupyter.widget-view+json": {
377
+ "model_id": "58ddfaa8aa3545879d58d0a955b886e4",
378
+ "version_major": 2,
379
+ "version_minor": 0
380
+ },
381
+ "text/plain": [
382
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
383
+ ]
384
+ },
385
+ "metadata": {},
386
+ "output_type": "display_data"
387
+ },
388
+ {
389
+ "data": {
390
+ "application/vnd.jupyter.widget-view+json": {
391
+ "model_id": "1ba22c48e19c4d53b37e54615139925e",
392
+ "version_major": 2,
393
+ "version_minor": 0
394
+ },
395
+ "text/plain": [
396
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
397
+ ]
398
+ },
399
+ "metadata": {},
400
+ "output_type": "display_data"
401
+ },
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "{'labels': 0,\n",
406
+ " 'Sentence': 'Are you up for the challenge? I know i am :)',\n",
407
+ " 'length': 11,\n",
408
+ " 'input_ids': [101,\n",
409
+ " 2372,\n",
410
+ " 1128,\n",
411
+ " 1146,\n",
412
+ " 1111,\n",
413
+ " 1103,\n",
414
+ " 4506,\n",
415
+ " 136,\n",
416
+ " 146,\n",
417
+ " 1221,\n",
418
+ " 178,\n",
419
+ " 1821,\n",
420
+ " 131,\n",
421
+ " 114,\n",
422
+ " 102],\n",
423
+ " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
424
+ " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
425
+ ]
426
+ },
427
+ "execution_count": 15,
428
+ "metadata": {},
429
+ "output_type": "execute_result"
430
+ }
431
+ ],
432
+ "source": [
433
+ "from transformers import AutoTokenizer, AutoModel\n",
434
+ "\n",
435
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
436
+ "\n",
437
+ "def tokenize_function(example):\n",
438
+ " return tokenizer(example[\"Sentence\"], truncation = True)\n",
439
+ "\n",
440
+ "tokenized_dataset = spam_data.map(tokenize_function, batched = True)\n",
441
+ "\n",
442
+ "tokenized_dataset[\"train\"][0]"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 16,
448
+ "id": "f04dabd4",
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/plain": [
454
+ "DatasetDict({\n",
455
+ " train: Dataset({\n",
456
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
457
+ " num_rows: 4459\n",
458
+ " })\n",
459
+ " test: Dataset({\n",
460
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
461
+ " num_rows: 1115\n",
462
+ " })\n",
463
+ "})"
464
+ ]
465
+ },
466
+ "execution_count": 16,
467
+ "metadata": {},
468
+ "output_type": "execute_result"
469
+ }
470
+ ],
471
+ "source": [
472
+ "tokenized_dataset"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 17,
478
+ "id": "73f820b8",
479
+ "metadata": {},
480
+ "outputs": [],
481
+ "source": [
482
+ "spam_data_clean = tokenized_dataset[\"train\"].train_test_split(train_size = 0.8, seed = 42)\n",
483
+ "\n",
484
+ "spam_data_clean[\"validation\"] = spam_data_clean.pop(\"test\")\n",
485
+ "\n",
486
+ "spam_data_clean[\"test\"] = tokenized_dataset[\"test\"]"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 18,
492
+ "id": "70c743a6",
493
+ "metadata": {},
494
+ "outputs": [
495
+ {
496
+ "data": {
497
+ "text/plain": [
498
+ "DatasetDict({\n",
499
+ " train: Dataset({\n",
500
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
501
+ " num_rows: 3567\n",
502
+ " })\n",
503
+ " validation: Dataset({\n",
504
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
505
+ " num_rows: 892\n",
506
+ " })\n",
507
+ " test: Dataset({\n",
508
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
509
+ " num_rows: 1115\n",
510
+ " })\n",
511
+ "})"
512
+ ]
513
+ },
514
+ "execution_count": 18,
515
+ "metadata": {},
516
+ "output_type": "execute_result"
517
+ }
518
+ ],
519
+ "source": [
520
+ "spam_data_clean"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": 19,
526
+ "id": "58ce2ac8",
527
+ "metadata": {},
528
+ "outputs": [
529
+ {
530
+ "data": {
531
+ "application/vnd.jupyter.widget-view+json": {
532
+ "model_id": "1e8d1e81615c4bda9e8c9d38e102618e",
533
+ "version_major": 2,
534
+ "version_minor": 0
535
+ },
536
+ "text/plain": [
537
+ "Saving the dataset (0/1 shards): 0%| | 0/3567 [00:00<?, ? examples/s]"
538
+ ]
539
+ },
540
+ "metadata": {},
541
+ "output_type": "display_data"
542
+ },
543
+ {
544
+ "data": {
545
+ "application/vnd.jupyter.widget-view+json": {
546
+ "model_id": "22d34c3e8185484eb5f690b926cc561e",
547
+ "version_major": 2,
548
+ "version_minor": 0
549
+ },
550
+ "text/plain": [
551
+ "Saving the dataset (0/1 shards): 0%| | 0/892 [00:00<?, ? examples/s]"
552
+ ]
553
+ },
554
+ "metadata": {},
555
+ "output_type": "display_data"
556
+ },
557
+ {
558
+ "data": {
559
+ "application/vnd.jupyter.widget-view+json": {
560
+ "model_id": "08305f0b9791416fb2053582f7da8e44",
561
+ "version_major": 2,
562
+ "version_minor": 0
563
+ },
564
+ "text/plain": [
565
+ "Saving the dataset (0/1 shards): 0%| | 0/1115 [00:00<?, ? examples/s]"
566
+ ]
567
+ },
568
+ "metadata": {},
569
+ "output_type": "display_data"
570
+ }
571
+ ],
572
+ "source": [
573
+ "spam_data_clean.save_to_disk(\"Spam-Ham-Classification\")"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": 20,
579
+ "id": "14052e09",
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "data": {
584
+ "text/plain": [
585
+ "{'labels': [0, 0, 0],\n",
586
+ " 'Sentence': ['What your plan for pongal?',\n",
587
+ " \"alright, I'll make sure the car is back tonight\",\n",
588
+ " 'Multiply the numbers independently and count decimal points then, for the division, push the decimal places like i showed you.'],\n",
589
+ " 'length': [5, 9, 20],\n",
590
+ " 'input_ids': [[101, 1327, 1240, 2197, 1111, 185, 4553, 1348, 136, 102],\n",
591
+ " [101,\n",
592
+ " 15354,\n",
593
+ " 117,\n",
594
+ " 146,\n",
595
+ " 112,\n",
596
+ " 1325,\n",
597
+ " 1294,\n",
598
+ " 1612,\n",
599
+ " 1103,\n",
600
+ " 1610,\n",
601
+ " 1110,\n",
602
+ " 1171,\n",
603
+ " 3568,\n",
604
+ " 102],\n",
605
+ " [101,\n",
606
+ " 18447,\n",
607
+ " 1643,\n",
608
+ " 1193,\n",
609
+ " 1103,\n",
610
+ " 2849,\n",
611
+ " 8942,\n",
612
+ " 1105,\n",
613
+ " 5099,\n",
614
+ " 1260,\n",
615
+ " 27924,\n",
616
+ " 1827,\n",
617
+ " 1173,\n",
618
+ " 117,\n",
619
+ " 1111,\n",
620
+ " 1103,\n",
621
+ " 2417,\n",
622
+ " 117,\n",
623
+ " 4684,\n",
624
+ " 1103,\n",
625
+ " 1260,\n",
626
+ " 27924,\n",
627
+ " 2844,\n",
628
+ " 1176,\n",
629
+ " 178,\n",
630
+ " 2799,\n",
631
+ " 1128,\n",
632
+ " 119,\n",
633
+ " 102]],\n",
634
+ " 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
635
+ " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
636
+ " [0,\n",
637
+ " 0,\n",
638
+ " 0,\n",
639
+ " 0,\n",
640
+ " 0,\n",
641
+ " 0,\n",
642
+ " 0,\n",
643
+ " 0,\n",
644
+ " 0,\n",
645
+ " 0,\n",
646
+ " 0,\n",
647
+ " 0,\n",
648
+ " 0,\n",
649
+ " 0,\n",
650
+ " 0,\n",
651
+ " 0,\n",
652
+ " 0,\n",
653
+ " 0,\n",
654
+ " 0,\n",
655
+ " 0,\n",
656
+ " 0,\n",
657
+ " 0,\n",
658
+ " 0,\n",
659
+ " 0,\n",
660
+ " 0,\n",
661
+ " 0,\n",
662
+ " 0,\n",
663
+ " 0,\n",
664
+ " 0]],\n",
665
+ " 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
666
+ " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
667
+ " [1,\n",
668
+ " 1,\n",
669
+ " 1,\n",
670
+ " 1,\n",
671
+ " 1,\n",
672
+ " 1,\n",
673
+ " 1,\n",
674
+ " 1,\n",
675
+ " 1,\n",
676
+ " 1,\n",
677
+ " 1,\n",
678
+ " 1,\n",
679
+ " 1,\n",
680
+ " 1,\n",
681
+ " 1,\n",
682
+ " 1,\n",
683
+ " 1,\n",
684
+ " 1,\n",
685
+ " 1,\n",
686
+ " 1,\n",
687
+ " 1,\n",
688
+ " 1,\n",
689
+ " 1,\n",
690
+ " 1,\n",
691
+ " 1,\n",
692
+ " 1,\n",
693
+ " 1,\n",
694
+ " 1,\n",
695
+ " 1]]}"
696
+ ]
697
+ },
698
+ "execution_count": 20,
699
+ "metadata": {},
700
+ "output_type": "execute_result"
701
+ }
702
+ ],
703
+ "source": [
704
+ "spam_data_clean[\"validation\"][:3]"
705
+ ]
706
+ },
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": 21,
710
+ "id": "0f97ef10",
711
+ "metadata": {},
712
+ "outputs": [
713
+ {
714
+ "data": {
715
+ "text/plain": [
716
+ "DatasetDict({\n",
717
+ " train: Dataset({\n",
718
+ " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
719
+ " num_rows: 3567\n",
720
+ " })\n",
721
+ " validation: Dataset({\n",
722
+ " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
723
+ " num_rows: 892\n",
724
+ " })\n",
725
+ " test: Dataset({\n",
726
+ " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
727
+ " num_rows: 1115\n",
728
+ " })\n",
729
+ "})"
730
+ ]
731
+ },
732
+ "execution_count": 21,
733
+ "metadata": {},
734
+ "output_type": "execute_result"
735
+ }
736
+ ],
737
+ "source": [
738
+ "spam_data_clean.remove_columns([\"Sentence\",\"length\"])"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 22,
744
+ "id": "06c933a6",
745
+ "metadata": {},
746
+ "outputs": [],
747
+ "source": [
748
+ "data_files = {\"train\": spam_data_clean[\"train\"], \"validation\": spam_data_clean[\"validation\"], \"test\": spam_data_clean[\"test\"]}"
749
+ ]
750
+ },
751
+ {
752
+ "cell_type": "code",
753
+ "execution_count": 35,
754
+ "id": "3959be63",
755
+ "metadata": {},
756
+ "outputs": [
757
+ {
758
+ "name": "stderr",
759
+ "output_type": "stream",
760
+ "text": [
761
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
762
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
763
+ ]
764
+ }
765
+ ],
766
+ "source": [
767
+ "from transformers import AutoModelForSequenceClassification, TrainingArguments\n",
768
+ "\n",
769
+ "training_args = TrainingArguments(\"test-trainer\",\n",
770
+ " eval_strategy = \"epoch\",\n",
771
+ " fp16 = True,\n",
772
+ " #gradient_accumulation_steps = 4,\n",
773
+ " #per_device_train_batch_size = 4,\n",
774
+ " learning_rate= 1e-5,\n",
775
+ " lr_scheduler_type = \"cosine\",)\n",
776
+ "\n",
777
+ "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels = 2)"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": 36,
783
+ "id": "bd40266e",
784
+ "metadata": {},
785
+ "outputs": [],
786
+ "source": [
787
+ "from transformers import DataCollatorWithPadding\n",
788
+ "data_collator = DataCollatorWithPadding(tokenizer = tokenizer)"
789
+ ]
790
+ },
791
+ {
792
+ "cell_type": "code",
793
+ "execution_count": 37,
794
+ "id": "3bbc3fd2",
795
+ "metadata": {},
796
+ "outputs": [],
797
+ "source": [
798
+ "import evaluate, numpy as np\n",
799
+ "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n",
800
+ "\n",
801
+ "def compute_metrics(eval_preds):\n",
802
+ " logits, labels = eval_preds\n",
803
+ " preds = np.argmax(logits, axis=-1)\n",
804
+ " return metric.compute(predictions=preds, references=labels)"
805
+ ]
806
+ },
807
+ {
808
+ "cell_type": "code",
809
+ "execution_count": 38,
810
+ "id": "e46ffe8e",
811
+ "metadata": {},
812
+ "outputs": [
813
+ {
814
+ "data": {
815
+ "text/html": [
816
+ "\n",
817
+ " <div>\n",
818
+ " \n",
819
+ " <progress value='1338' max='1338' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
820
+ " [1338/1338 02:15, Epoch 3/3]\n",
821
+ " </div>\n",
822
+ " <table border=\"1\" class=\"dataframe\">\n",
823
+ " <thead>\n",
824
+ " <tr style=\"text-align: left;\">\n",
825
+ " <th>Epoch</th>\n",
826
+ " <th>Training Loss</th>\n",
827
+ " <th>Validation Loss</th>\n",
828
+ " <th>Accuracy</th>\n",
829
+ " <th>F1</th>\n",
830
+ " <th>Precision</th>\n",
831
+ " <th>Recall</th>\n",
832
+ " </tr>\n",
833
+ " </thead>\n",
834
+ " <tbody>\n",
835
+ " <tr>\n",
836
+ " <td>1</td>\n",
837
+ " <td>No log</td>\n",
838
+ " <td>0.045297</td>\n",
839
+ " <td>0.989910</td>\n",
840
+ " <td>0.962963</td>\n",
841
+ " <td>0.983193</td>\n",
842
+ " <td>0.943548</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <td>2</td>\n",
846
+ " <td>0.095300</td>\n",
847
+ " <td>0.042776</td>\n",
848
+ " <td>0.993274</td>\n",
849
+ " <td>0.975207</td>\n",
850
+ " <td>1.000000</td>\n",
851
+ " <td>0.951613</td>\n",
852
+ " </tr>\n",
853
+ " <tr>\n",
854
+ " <td>3</td>\n",
855
+ " <td>0.021200</td>\n",
856
+ " <td>0.040522</td>\n",
857
+ " <td>0.993274</td>\n",
858
+ " <td>0.975207</td>\n",
859
+ " <td>1.000000</td>\n",
860
+ " <td>0.951613</td>\n",
861
+ " </tr>\n",
862
+ " </tbody>\n",
863
+ "</table><p>"
864
+ ],
865
+ "text/plain": [
866
+ "<IPython.core.display.HTML object>"
867
+ ]
868
+ },
869
+ "metadata": {},
870
+ "output_type": "display_data"
871
+ },
872
+ {
873
+ "data": {
874
+ "text/plain": [
875
+ "TrainOutput(global_step=1338, training_loss=0.04511010432991746, metrics={'train_runtime': 136.1512, 'train_samples_per_second': 78.596, 'train_steps_per_second': 9.827, 'total_flos': 338812011541800.0, 'train_loss': 0.04511010432991746, 'epoch': 3.0})"
876
+ ]
877
+ },
878
+ "execution_count": 38,
879
+ "metadata": {},
880
+ "output_type": "execute_result"
881
+ }
882
+ ],
883
+ "source": [
884
+ "from transformers import Trainer\n",
885
+ "\n",
886
+ "trainer = Trainer(model,\n",
887
+ " training_args,\n",
888
+ " train_dataset = spam_data_clean[\"train\"],\n",
889
+ " eval_dataset = spam_data_clean[\"validation\"],\n",
890
+ " data_collator = data_collator,\n",
891
+ " processing_class = tokenizer,\n",
892
+ " compute_metrics=compute_metrics,)\n",
893
+ "\n",
894
+ "trainer.train()"
895
+ ]
896
+ },
897
+ {
898
+ "cell_type": "code",
899
+ "execution_count": 39,
900
+ "id": "c236f093",
901
+ "metadata": {},
902
+ "outputs": [
903
+ {
904
+ "data": {
905
+ "text/html": [
906
+ "\n",
907
+ " <div>\n",
908
+ " \n",
909
+ " <progress value='112' max='112' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
910
+ " [112/112 00:04]\n",
911
+ " </div>\n",
912
+ " "
913
+ ],
914
+ "text/plain": [
915
+ "<IPython.core.display.HTML object>"
916
+ ]
917
+ },
918
+ "metadata": {},
919
+ "output_type": "display_data"
920
+ },
921
+ {
922
+ "data": {
923
+ "text/plain": [
924
+ "{'eval_loss': 0.04052222892642021,\n",
925
+ " 'eval_accuracy': 0.9932735426008968,\n",
926
+ " 'eval_f1': 0.9752066115702479,\n",
927
+ " 'eval_precision': 1.0,\n",
928
+ " 'eval_recall': 0.9516129032258065,\n",
929
+ " 'eval_runtime': 5.1761,\n",
930
+ " 'eval_samples_per_second': 172.33,\n",
931
+ " 'eval_steps_per_second': 21.638,\n",
932
+ " 'epoch': 3.0}"
933
+ ]
934
+ },
935
+ "execution_count": 39,
936
+ "metadata": {},
937
+ "output_type": "execute_result"
938
+ }
939
+ ],
940
+ "source": [
941
+ "trainer.evaluate()"
942
+ ]
943
+ },
944
+ {
945
+ "cell_type": "code",
946
+ "execution_count": 40,
947
+ "id": "1e6538eb",
948
+ "metadata": {},
949
+ "outputs": [
950
+ {
951
+ "data": {
952
+ "text/plain": [
953
+ "('spam-classifier\\\\tokenizer_config.json',\n",
954
+ " 'spam-classifier\\\\special_tokens_map.json',\n",
955
+ " 'spam-classifier\\\\vocab.txt',\n",
956
+ " 'spam-classifier\\\\added_tokens.json',\n",
957
+ " 'spam-classifier\\\\tokenizer.json')"
958
+ ]
959
+ },
960
+ "execution_count": 40,
961
+ "metadata": {},
962
+ "output_type": "execute_result"
963
+ }
964
+ ],
965
+ "source": [
966
+ "trainer.save_model(\"spam-ham-classification\")\n",
967
+ "tokenizer.save_pretrained(\"spam-classifier\")"
968
+ ]
969
+ },
970
+ {
971
+ "cell_type": "code",
972
+ "execution_count": null,
973
+ "id": "99dbfb57",
974
+ "metadata": {},
975
+ "outputs": [],
976
+ "source": []
977
+ }
978
+ ],
979
+ "metadata": {
980
+ "kernelspec": {
981
+ "display_name": "Python 3 (ipykernel)",
982
+ "language": "python",
983
+ "name": "python3"
984
+ },
985
+ "language_info": {
986
+ "codemirror_mode": {
987
+ "name": "ipython",
988
+ "version": 3
989
+ },
990
+ "file_extension": ".py",
991
+ "mimetype": "text/x-python",
992
+ "name": "python",
993
+ "nbconvert_exporter": "python",
994
+ "pygments_lexer": "ipython3",
995
+ "version": "3.11.4"
996
+ }
997
+ },
998
+ "nbformat": 4,
999
+ "nbformat_minor": 5
1000
+ }
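
The notebook fine-tunes bert-base-cased on the train split and reports metrics on the validation split only; the held-out test split it saves to disk is never scored. A minimal sketch of how the same Trainer could evaluate it, assuming the notebook's trainer and spam_data_clean variables are still in scope:

# Hypothetical follow-up cell; reuses trainer and spam_data_clean from the notebook above.
test_metrics = trainer.evaluate(eval_dataset=spam_data_clean["test"])
print(test_metrics)   # eval_loss, eval_accuracy, eval_f1, eval_precision, eval_recall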
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "extra_special_tokens": {},
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }
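
This is the standard cased BERT WordPiece tokenizer configuration (do_lower_case is false, model_max_length 512). A short sketch of loading it from the directory the notebook saves ("spam-classifier") and inspecting the special tokens it adds; the path is taken from the notebook and assumed to exist locally:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("spam-classifier")   # written by tokenizer.save_pretrained(...)
enc = tok("Are you up for the challenge? I know i am :)")
print(tok.convert_ids_to_tokens(enc["input_ids"]))       # starts with [CLS] and ends with [SEP]
print(tok.model_max_length)                              # 512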
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
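
Taken together, the uploaded files are enough to run the classifier locally. A minimal inference sketch, assuming the model directory saved by trainer.save_model ("spam-ham-classification") and the tokenizer directory ("spam-classifier") from the notebook; since config.json defines no id2label mapping, the pipeline reports LABEL_0 / LABEL_1, which correspond to ham / spam given the ClassLabel order ["ham", "spam"] used in the notebook:

from transformers import pipeline

clf = pipeline("text-classification",
               model="spam-ham-classification",     # saved by trainer.save_model(...)
               tokenizer="spam-classifier")         # saved by tokenizer.save_pretrained(...)
print(clf("URGENT! Your Mobile number has been awarded with a £2000 prize GUARANTEED."))
# e.g. [{'label': 'LABEL_1', 'score': ...}]  -> LABEL_1 = spam, LABEL_0 = ham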