Sathya77 committed
Commit f3382ce · verified · 1 Parent(s): a033cd6

Upload 9 files

SMS_Spam.csv ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "architectures": [
+ "BertForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "problem_type": "single_label_classification",
+ "torch_dtype": "float32",
+ "transformers_version": "4.55.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
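
This config describes a BERT-base encoder (12 layers, hidden size 768, cased vocabulary of 28,996 tokens) with a single-label classification head. As a minimal sketch, not part of the upload, the file can be inspected with transformers from a local checkout of this repository; the directory name used here is only an assumption:

from transformers import AutoConfig

# Hypothetical local path to a download of this repository.
config = AutoConfig.from_pretrained("./spam-ham-upload")
print(config.architectures)        # ['BertForSequenceClassification']
print(config.num_hidden_layers)    # 12
print(config.vocab_size)           # 28996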
dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "validation", "test"]}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:732fd6d40215d66e9ba0fdf1530db021a764ed1a839c4279e3266a54258a0f71
+ size 433270768
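
model.safetensors is stored as a Git LFS pointer: the oid is the SHA-256 of the actual ~433 MB weight file. A small sketch for checking a downloaded copy against the pointer; the local path is an assumption:

import hashlib

sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:          # the resolved LFS file, not the pointer
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
print(sha.hexdigest() == "732fd6d40215d66e9ba0fdf1530db021a764ed1a839c4279e3266a54258a0f71")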
spam-ham-classfication.ipynb ADDED
@@ -0,0 +1,1000 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "12349750",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "{'Label': ['ham', 'ham', 'ham'],\n",
13
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
14
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit & Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning & Have A Happy Day:)',\n",
15
+ " 'Kallis is ready for bat in 2nd innings']}"
16
+ ]
17
+ },
18
+ "execution_count": 1,
19
+ "metadata": {},
20
+ "output_type": "execute_result"
21
+ }
22
+ ],
23
+ "source": [
24
+ "from datasets import load_dataset\n",
25
+ "\n",
26
+ "data_files =\"E:/Hugging_Face/SMS_Spam.csv\"\n",
27
+ "spam_data = load_dataset(\"csv\", data_files = data_files, split = \"train\")\n",
28
+ "spam_data = spam_data.train_test_split(test_size = 0.2)\n",
29
+ "spam_data[\"train\"][:3]"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "id": "35f0392d",
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "data": {
40
+ "application/vnd.jupyter.widget-view+json": {
41
+ "model_id": "e6740059d6df4ea7aceaf262ef339c94",
42
+ "version_major": 2,
43
+ "version_minor": 0
44
+ },
45
+ "text/plain": [
46
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
47
+ ]
48
+ },
49
+ "metadata": {},
50
+ "output_type": "display_data"
51
+ },
52
+ {
53
+ "data": {
54
+ "application/vnd.jupyter.widget-view+json": {
55
+ "model_id": "c2d8fd5629eb4e6aa0c91866c3ee2562",
56
+ "version_major": 2,
57
+ "version_minor": 0
58
+ },
59
+ "text/plain": [
60
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
61
+ ]
62
+ },
63
+ "metadata": {},
64
+ "output_type": "display_data"
65
+ },
66
+ {
67
+ "data": {
68
+ "text/plain": [
69
+ "DatasetDict({\n",
70
+ " train: Dataset({\n",
71
+ " features: ['Label', 'Sentence'],\n",
72
+ " num_rows: 4459\n",
73
+ " })\n",
74
+ " test: Dataset({\n",
75
+ " features: ['Label', 'Sentence'],\n",
76
+ " num_rows: 1115\n",
77
+ " })\n",
78
+ "})"
79
+ ]
80
+ },
81
+ "execution_count": 2,
82
+ "metadata": {},
83
+ "output_type": "execute_result"
84
+ }
85
+ ],
86
+ "source": [
87
+ "def lower_case(example):\n",
88
+ " return {\"Sentence\": example[\"Sentence\"].lower()}\n",
89
+ "\n",
90
+ "spam_data.map(lower_case)"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 3,
96
+ "id": "9df36294",
97
+ "metadata": {},
98
+ "outputs": [
99
+ {
100
+ "data": {
101
+ "application/vnd.jupyter.widget-view+json": {
102
+ "model_id": "1d4f5f516b024a459dba03cb2b5e764b",
103
+ "version_major": 2,
104
+ "version_minor": 0
105
+ },
106
+ "text/plain": [
107
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
108
+ ]
109
+ },
110
+ "metadata": {},
111
+ "output_type": "display_data"
112
+ },
113
+ {
114
+ "data": {
115
+ "application/vnd.jupyter.widget-view+json": {
116
+ "model_id": "a64af11c2cde4ef1b56a49b4ffb6b200",
117
+ "version_major": 2,
118
+ "version_minor": 0
119
+ },
120
+ "text/plain": [
121
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
122
+ ]
123
+ },
124
+ "metadata": {},
125
+ "output_type": "display_data"
126
+ }
127
+ ],
128
+ "source": [
129
+ "def sen_len(example):\n",
130
+ " return {\"length\": len(example[\"Sentence\"].split())}\n",
131
+ "\n",
132
+ "spam_data = spam_data.map(sen_len)"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 4,
138
+ "id": "db1d8406",
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "{'Label': ['ham', 'ham', 'ham'],\n",
145
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
146
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
147
+ " 'Kallis is ready for bat in 2nd innings'],\n",
148
+ " 'length': [11, 29, 8]}"
149
+ ]
150
+ },
151
+ "execution_count": 4,
152
+ "metadata": {},
153
+ "output_type": "execute_result"
154
+ }
155
+ ],
156
+ "source": [
157
+ "spam_data[\"train\"][:3]"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 8,
163
+ "id": "3e742939",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "spam_data = spam_data.rename_column(\"Label\", \"labels\")"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 9,
173
+ "id": "a1d7c214",
174
+ "metadata": {},
175
+ "outputs": [
176
+ {
177
+ "data": {
178
+ "application/vnd.jupyter.widget-view+json": {
179
+ "model_id": "ae1b7a15bd7e46e5aa763483d877ac5b",
180
+ "version_major": 2,
181
+ "version_minor": 0
182
+ },
183
+ "text/plain": [
184
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
185
+ ]
186
+ },
187
+ "metadata": {},
188
+ "output_type": "display_data"
189
+ },
190
+ {
191
+ "data": {
192
+ "application/vnd.jupyter.widget-view+json": {
193
+ "model_id": "e33b493b97754c588a4847d069773fc3",
194
+ "version_major": 2,
195
+ "version_minor": 0
196
+ },
197
+ "text/plain": [
198
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
199
+ ]
200
+ },
201
+ "metadata": {},
202
+ "output_type": "display_data"
203
+ }
204
+ ],
205
+ "source": [
206
+ "import html\n",
207
+ "\n",
208
+ "spam_data = spam_data.map(lambda x: {\"Sentence\": html.unescape(x[\"Sentence\"])}, batched = True)"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 10,
214
+ "id": "8fa3f455",
215
+ "metadata": {},
216
+ "outputs": [
217
+ {
218
+ "data": {
219
+ "text/plain": [
220
+ "{'labels': ['ham',\n",
221
+ " 'ham',\n",
222
+ " 'ham',\n",
223
+ " 'ham',\n",
224
+ " 'ham',\n",
225
+ " 'ham',\n",
226
+ " 'ham',\n",
227
+ " 'ham',\n",
228
+ " 'ham',\n",
229
+ " 'ham',\n",
230
+ " 'ham',\n",
231
+ " 'ham',\n",
232
+ " 'spam',\n",
233
+ " 'ham',\n",
234
+ " 'ham',\n",
235
+ " 'ham',\n",
236
+ " 'ham',\n",
237
+ " 'spam',\n",
238
+ " 'ham',\n",
239
+ " 'ham'],\n",
240
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
241
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
242
+ " 'Kallis is ready for bat in 2nd innings',\n",
243
+ " 'Gud mrng dear hav a nice day',\n",
244
+ " 'I not free today i haf 2 pick my parents up tonite...',\n",
245
+ " 'Good afternoon on this glorious anniversary day, my sweet J !! I hope this finds you happy and content, my Prey. I think of you and send a teasing kiss from across the sea coaxing images of fond souveniers ... You Cougar-Pen',\n",
246
+ " 'SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.',\n",
247
+ " 'Haha awesome, I might need to take you up on that, what you doin tonight?',\n",
248
+ " 'Ok...',\n",
249
+ " 'I am sorry it hurt you.',\n",
250
+ " 'Watching cartoon, listening music &amp; at eve had to go temple &amp; church.. What about u?',\n",
251
+ " 'Sent me de webadres for geting salary slip',\n",
252
+ " 'Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX',\n",
253
+ " \"I want snow. It's just freezing and windy.\",\n",
254
+ " ', im .. On the snowboarding trip. I was wondering if your planning to get everyone together befor we go..a meet and greet kind of affair? Cheers, ',\n",
255
+ " 'Siva is in hostel aha:-.',\n",
256
+ " 'CHEERS LOU! YEAH WAS A GOODNITE SHAME U NEVA CAME! C YA GAILxx',\n",
257
+ " 'URGENT! Your Mobile number has been awarded with a £2000 prize GUARANTEED. Call 09061790126 from land line. Claim 3030. Valid 12hrs only 150ppm',\n",
258
+ " 'Did u got that persons story',\n",
259
+ " 'Amazing : If you rearrange these letters it gives the same meaning... Dormitory = Dirty room Astronomer = Moon starer The eyes = They see Election results = Lies lets recount Mother-in-law = Woman Hitler Eleven plus two =Twelve plus one Its Amazing... !:-)'],\n",
260
+ " 'length': [11,\n",
261
+ " 29,\n",
262
+ " 8,\n",
263
+ " 7,\n",
264
+ " 12,\n",
265
+ " 42,\n",
266
+ " 8,\n",
267
+ " 15,\n",
268
+ " 1,\n",
269
+ " 6,\n",
270
+ " 16,\n",
271
+ " 8,\n",
272
+ " 22,\n",
273
+ " 8,\n",
274
+ " 27,\n",
275
+ " 5,\n",
276
+ " 13,\n",
277
+ " 23,\n",
278
+ " 6,\n",
279
+ " 44]}"
280
+ ]
281
+ },
282
+ "execution_count": 10,
283
+ "metadata": {},
284
+ "output_type": "execute_result"
285
+ }
286
+ ],
287
+ "source": [
288
+ "spam_data[\"train\"][:20]"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 13,
294
+ "id": "b59be7ac",
295
+ "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "data": {
299
+ "application/vnd.jupyter.widget-view+json": {
300
+ "model_id": "54748e89b52c45f5af2c1d96e6e6f91e",
301
+ "version_major": 2,
302
+ "version_minor": 0
303
+ },
304
+ "text/plain": [
305
+ "Casting the dataset: 0%| | 0/4459 [00:00<?, ? examples/s]"
306
+ ]
307
+ },
308
+ "metadata": {},
309
+ "output_type": "display_data"
310
+ },
311
+ {
312
+ "data": {
313
+ "application/vnd.jupyter.widget-view+json": {
314
+ "model_id": "bae9e8f5c4c84a19aa01e4bb2d65080e",
315
+ "version_major": 2,
316
+ "version_minor": 0
317
+ },
318
+ "text/plain": [
319
+ "Casting the dataset: 0%| | 0/1115 [00:00<?, ? examples/s]"
320
+ ]
321
+ },
322
+ "metadata": {},
323
+ "output_type": "display_data"
324
+ },
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ "{'labels': ClassLabel(names=['ham', 'spam']), 'Sentence': Value('string'), 'length': Value('int64')}\n"
330
+ ]
331
+ }
332
+ ],
333
+ "source": [
334
+ "from datasets import load_dataset, ClassLabel\n",
335
+ "\n",
336
+ "spam_data = spam_data.cast_column(\n",
337
+ " \"labels\", ClassLabel(names=[\"ham\", \"spam\"])\n",
338
+ ")\n",
339
+ "\n",
340
+ "print(spam_data[\"train\"].features)\n"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 14,
346
+ "id": "b8a087d1",
347
+ "metadata": {},
348
+ "outputs": [
349
+ {
350
+ "data": {
351
+ "text/plain": [
352
+ "{'labels': [0, 0, 0],\n",
353
+ " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
354
+ " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
355
+ " 'Kallis is ready for bat in 2nd innings'],\n",
356
+ " 'length': [11, 29, 8]}"
357
+ ]
358
+ },
359
+ "execution_count": 14,
360
+ "metadata": {},
361
+ "output_type": "execute_result"
362
+ }
363
+ ],
364
+ "source": [
365
+ "spam_data[\"train\"][:3]"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 15,
371
+ "id": "eae6b9a7",
372
+ "metadata": {},
373
+ "outputs": [
374
+ {
375
+ "data": {
376
+ "application/vnd.jupyter.widget-view+json": {
377
+ "model_id": "58ddfaa8aa3545879d58d0a955b886e4",
378
+ "version_major": 2,
379
+ "version_minor": 0
380
+ },
381
+ "text/plain": [
382
+ "Map: 0%| | 0/4459 [00:00<?, ? examples/s]"
383
+ ]
384
+ },
385
+ "metadata": {},
386
+ "output_type": "display_data"
387
+ },
388
+ {
389
+ "data": {
390
+ "application/vnd.jupyter.widget-view+json": {
391
+ "model_id": "1ba22c48e19c4d53b37e54615139925e",
392
+ "version_major": 2,
393
+ "version_minor": 0
394
+ },
395
+ "text/plain": [
396
+ "Map: 0%| | 0/1115 [00:00<?, ? examples/s]"
397
+ ]
398
+ },
399
+ "metadata": {},
400
+ "output_type": "display_data"
401
+ },
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "{'labels': 0,\n",
406
+ " 'Sentence': 'Are you up for the challenge? I know i am :)',\n",
407
+ " 'length': 11,\n",
408
+ " 'input_ids': [101,\n",
409
+ " 2372,\n",
410
+ " 1128,\n",
411
+ " 1146,\n",
412
+ " 1111,\n",
413
+ " 1103,\n",
414
+ " 4506,\n",
415
+ " 136,\n",
416
+ " 146,\n",
417
+ " 1221,\n",
418
+ " 178,\n",
419
+ " 1821,\n",
420
+ " 131,\n",
421
+ " 114,\n",
422
+ " 102],\n",
423
+ " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
424
+ " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
425
+ ]
426
+ },
427
+ "execution_count": 15,
428
+ "metadata": {},
429
+ "output_type": "execute_result"
430
+ }
431
+ ],
432
+ "source": [
433
+ "from transformers import AutoTokenizer, AutoModel\n",
434
+ "\n",
435
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
436
+ "\n",
437
+ "def tokenize_function(example):\n",
438
+ " return tokenizer(example[\"Sentence\"], truncation = True)\n",
439
+ "\n",
440
+ "tokenized_dataset = spam_data.map(tokenize_function, batched = True)\n",
441
+ "\n",
442
+ "tokenized_dataset[\"train\"][0]"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 16,
448
+ "id": "f04dabd4",
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/plain": [
454
+ "DatasetDict({\n",
455
+ " train: Dataset({\n",
456
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
457
+ " num_rows: 4459\n",
458
+ " })\n",
459
+ " test: Dataset({\n",
460
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
461
+ " num_rows: 1115\n",
462
+ " })\n",
463
+ "})"
464
+ ]
465
+ },
466
+ "execution_count": 16,
467
+ "metadata": {},
468
+ "output_type": "execute_result"
469
+ }
470
+ ],
471
+ "source": [
472
+ "tokenized_dataset"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 17,
478
+ "id": "73f820b8",
479
+ "metadata": {},
480
+ "outputs": [],
481
+ "source": [
482
+ "spam_data_clean = tokenized_dataset[\"train\"].train_test_split(train_size = 0.8, seed = 42)\n",
483
+ "\n",
484
+ "spam_data_clean[\"validation\"] = spam_data_clean.pop(\"test\")\n",
485
+ "\n",
486
+ "spam_data_clean[\"test\"] = tokenized_dataset[\"test\"]"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 18,
492
+ "id": "70c743a6",
493
+ "metadata": {},
494
+ "outputs": [
495
+ {
496
+ "data": {
497
+ "text/plain": [
498
+ "DatasetDict({\n",
499
+ " train: Dataset({\n",
500
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
501
+ " num_rows: 3567\n",
502
+ " })\n",
503
+ " validation: Dataset({\n",
504
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
505
+ " num_rows: 892\n",
506
+ " })\n",
507
+ " test: Dataset({\n",
508
+ " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
509
+ " num_rows: 1115\n",
510
+ " })\n",
511
+ "})"
512
+ ]
513
+ },
514
+ "execution_count": 18,
515
+ "metadata": {},
516
+ "output_type": "execute_result"
517
+ }
518
+ ],
519
+ "source": [
520
+ "spam_data_clean"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": 19,
526
+ "id": "58ce2ac8",
527
+ "metadata": {},
528
+ "outputs": [
529
+ {
530
+ "data": {
531
+ "application/vnd.jupyter.widget-view+json": {
532
+ "model_id": "1e8d1e81615c4bda9e8c9d38e102618e",
533
+ "version_major": 2,
534
+ "version_minor": 0
535
+ },
536
+ "text/plain": [
537
+ "Saving the dataset (0/1 shards): 0%| | 0/3567 [00:00<?, ? examples/s]"
538
+ ]
539
+ },
540
+ "metadata": {},
541
+ "output_type": "display_data"
542
+ },
543
+ {
544
+ "data": {
545
+ "application/vnd.jupyter.widget-view+json": {
546
+ "model_id": "22d34c3e8185484eb5f690b926cc561e",
547
+ "version_major": 2,
548
+ "version_minor": 0
549
+ },
550
+ "text/plain": [
551
+ "Saving the dataset (0/1 shards): 0%| | 0/892 [00:00<?, ? examples/s]"
552
+ ]
553
+ },
554
+ "metadata": {},
555
+ "output_type": "display_data"
556
+ },
557
+ {
558
+ "data": {
559
+ "application/vnd.jupyter.widget-view+json": {
560
+ "model_id": "08305f0b9791416fb2053582f7da8e44",
561
+ "version_major": 2,
562
+ "version_minor": 0
563
+ },
564
+ "text/plain": [
565
+ "Saving the dataset (0/1 shards): 0%| | 0/1115 [00:00<?, ? examples/s]"
566
+ ]
567
+ },
568
+ "metadata": {},
569
+ "output_type": "display_data"
570
+ }
571
+ ],
572
+ "source": [
573
+ "spam_data_clean.save_to_disk(\"Spam-Ham-Classification\")"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": 20,
579
+ "id": "14052e09",
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "data": {
584
+ "text/plain": [
585
+ "{'labels': [0, 0, 0],\n",
586
+ " 'Sentence': ['What your plan for pongal?',\n",
587
+ " \"alright, I'll make sure the car is back tonight\",\n",
588
+ " 'Multiply the numbers independently and count decimal points then, for the division, push the decimal places like i showed you.'],\n",
589
+ " 'length': [5, 9, 20],\n",
590
+ " 'input_ids': [[101, 1327, 1240, 2197, 1111, 185, 4553, 1348, 136, 102],\n",
591
+ " [101,\n",
592
+ " 15354,\n",
593
+ " 117,\n",
594
+ " 146,\n",
595
+ " 112,\n",
596
+ " 1325,\n",
597
+ " 1294,\n",
598
+ " 1612,\n",
599
+ " 1103,\n",
600
+ " 1610,\n",
601
+ " 1110,\n",
602
+ " 1171,\n",
603
+ " 3568,\n",
604
+ " 102],\n",
605
+ " [101,\n",
606
+ " 18447,\n",
607
+ " 1643,\n",
608
+ " 1193,\n",
609
+ " 1103,\n",
610
+ " 2849,\n",
611
+ " 8942,\n",
612
+ " 1105,\n",
613
+ " 5099,\n",
614
+ " 1260,\n",
615
+ " 27924,\n",
616
+ " 1827,\n",
617
+ " 1173,\n",
618
+ " 117,\n",
619
+ " 1111,\n",
620
+ " 1103,\n",
621
+ " 2417,\n",
622
+ " 117,\n",
623
+ " 4684,\n",
624
+ " 1103,\n",
625
+ " 1260,\n",
626
+ " 27924,\n",
627
+ " 2844,\n",
628
+ " 1176,\n",
629
+ " 178,\n",
630
+ " 2799,\n",
631
+ " 1128,\n",
632
+ " 119,\n",
633
+ " 102]],\n",
634
+ " 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
635
+ " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
636
+ " [0,\n",
637
+ " 0,\n",
638
+ " 0,\n",
639
+ " 0,\n",
640
+ " 0,\n",
641
+ " 0,\n",
642
+ " 0,\n",
643
+ " 0,\n",
644
+ " 0,\n",
645
+ " 0,\n",
646
+ " 0,\n",
647
+ " 0,\n",
648
+ " 0,\n",
649
+ " 0,\n",
650
+ " 0,\n",
651
+ " 0,\n",
652
+ " 0,\n",
653
+ " 0,\n",
654
+ " 0,\n",
655
+ " 0,\n",
656
+ " 0,\n",
657
+ " 0,\n",
658
+ " 0,\n",
659
+ " 0,\n",
660
+ " 0,\n",
661
+ " 0,\n",
662
+ " 0,\n",
663
+ " 0,\n",
664
+ " 0]],\n",
665
+ " 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
666
+ " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
667
+ " [1,\n",
668
+ " 1,\n",
669
+ " 1,\n",
670
+ " 1,\n",
671
+ " 1,\n",
672
+ " 1,\n",
673
+ " 1,\n",
674
+ " 1,\n",
675
+ " 1,\n",
676
+ " 1,\n",
677
+ " 1,\n",
678
+ " 1,\n",
679
+ " 1,\n",
680
+ " 1,\n",
681
+ " 1,\n",
682
+ " 1,\n",
683
+ " 1,\n",
684
+ " 1,\n",
685
+ " 1,\n",
686
+ " 1,\n",
687
+ " 1,\n",
688
+ " 1,\n",
689
+ " 1,\n",
690
+ " 1,\n",
691
+ " 1,\n",
692
+ " 1,\n",
693
+ " 1,\n",
694
+ " 1,\n",
695
+ " 1]]}"
696
+ ]
697
+ },
698
+ "execution_count": 20,
699
+ "metadata": {},
700
+ "output_type": "execute_result"
701
+ }
702
+ ],
703
+ "source": [
704
+ "spam_data_clean[\"validation\"][:3]"
705
+ ]
706
+ },
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": 21,
710
+ "id": "0f97ef10",
711
+ "metadata": {},
712
+ "outputs": [
713
+ {
714
+ "data": {
715
+ "text/plain": [
716
+ "DatasetDict({\n",
717
+ " train: Dataset({\n",
718
+ " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
719
+ " num_rows: 3567\n",
720
+ " })\n",
721
+ " validation: Dataset({\n",
722
+ " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
723
+ " num_rows: 892\n",
724
+ " })\n",
725
+ " test: Dataset({\n",
726
+ " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
727
+ " num_rows: 1115\n",
728
+ " })\n",
729
+ "})"
730
+ ]
731
+ },
732
+ "execution_count": 21,
733
+ "metadata": {},
734
+ "output_type": "execute_result"
735
+ }
736
+ ],
737
+ "source": [
738
+ "spam_data_clean.remove_columns([\"Sentence\",\"length\"])"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 22,
744
+ "id": "06c933a6",
745
+ "metadata": {},
746
+ "outputs": [],
747
+ "source": [
748
+ "data_files = {\"train\": spam_data_clean[\"train\"], \"validation\": spam_data_clean[\"validation\"], \"test\": spam_data_clean[\"test\"]}"
749
+ ]
750
+ },
751
+ {
752
+ "cell_type": "code",
753
+ "execution_count": 35,
754
+ "id": "3959be63",
755
+ "metadata": {},
756
+ "outputs": [
757
+ {
758
+ "name": "stderr",
759
+ "output_type": "stream",
760
+ "text": [
761
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
762
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
763
+ ]
764
+ }
765
+ ],
766
+ "source": [
767
+ "from transformers import AutoModelForSequenceClassification, TrainingArguments\n",
768
+ "\n",
769
+ "training_args = TrainingArguments(\"test-trainer\",\n",
770
+ " eval_strategy = \"epoch\",\n",
771
+ " fp16 = True,\n",
772
+ " #gradient_accumulation_steps = 4,\n",
773
+ " #per_device_train_batch_size = 4,\n",
774
+ " learning_rate= 1e-5,\n",
775
+ " lr_scheduler_type = \"cosine\",)\n",
776
+ "\n",
777
+ "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels = 2)"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": 36,
783
+ "id": "bd40266e",
784
+ "metadata": {},
785
+ "outputs": [],
786
+ "source": [
787
+ "from transformers import DataCollatorWithPadding\n",
788
+ "data_collator = DataCollatorWithPadding(tokenizer = tokenizer)"
789
+ ]
790
+ },
791
+ {
792
+ "cell_type": "code",
793
+ "execution_count": 37,
794
+ "id": "3bbc3fd2",
795
+ "metadata": {},
796
+ "outputs": [],
797
+ "source": [
798
+ "import evaluate, numpy as np\n",
799
+ "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n",
800
+ "\n",
801
+ "def compute_metrics(eval_preds):\n",
802
+ " logits, labels = eval_preds\n",
803
+ " preds = np.argmax(logits, axis=-1)\n",
804
+ " return metric.compute(predictions=preds, references=labels)"
805
+ ]
806
+ },
807
+ {
808
+ "cell_type": "code",
809
+ "execution_count": 38,
810
+ "id": "e46ffe8e",
811
+ "metadata": {},
812
+ "outputs": [
813
+ {
814
+ "data": {
815
+ "text/html": [
816
+ "\n",
817
+ " <div>\n",
818
+ " \n",
819
+ " <progress value='1338' max='1338' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
820
+ " [1338/1338 02:15, Epoch 3/3]\n",
821
+ " </div>\n",
822
+ " <table border=\"1\" class=\"dataframe\">\n",
823
+ " <thead>\n",
824
+ " <tr style=\"text-align: left;\">\n",
825
+ " <th>Epoch</th>\n",
826
+ " <th>Training Loss</th>\n",
827
+ " <th>Validation Loss</th>\n",
828
+ " <th>Accuracy</th>\n",
829
+ " <th>F1</th>\n",
830
+ " <th>Precision</th>\n",
831
+ " <th>Recall</th>\n",
832
+ " </tr>\n",
833
+ " </thead>\n",
834
+ " <tbody>\n",
835
+ " <tr>\n",
836
+ " <td>1</td>\n",
837
+ " <td>No log</td>\n",
838
+ " <td>0.045297</td>\n",
839
+ " <td>0.989910</td>\n",
840
+ " <td>0.962963</td>\n",
841
+ " <td>0.983193</td>\n",
842
+ " <td>0.943548</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <td>2</td>\n",
846
+ " <td>0.095300</td>\n",
847
+ " <td>0.042776</td>\n",
848
+ " <td>0.993274</td>\n",
849
+ " <td>0.975207</td>\n",
850
+ " <td>1.000000</td>\n",
851
+ " <td>0.951613</td>\n",
852
+ " </tr>\n",
853
+ " <tr>\n",
854
+ " <td>3</td>\n",
855
+ " <td>0.021200</td>\n",
856
+ " <td>0.040522</td>\n",
857
+ " <td>0.993274</td>\n",
858
+ " <td>0.975207</td>\n",
859
+ " <td>1.000000</td>\n",
860
+ " <td>0.951613</td>\n",
861
+ " </tr>\n",
862
+ " </tbody>\n",
863
+ "</table><p>"
864
+ ],
865
+ "text/plain": [
866
+ "<IPython.core.display.HTML object>"
867
+ ]
868
+ },
869
+ "metadata": {},
870
+ "output_type": "display_data"
871
+ },
872
+ {
873
+ "data": {
874
+ "text/plain": [
875
+ "TrainOutput(global_step=1338, training_loss=0.04511010432991746, metrics={'train_runtime': 136.1512, 'train_samples_per_second': 78.596, 'train_steps_per_second': 9.827, 'total_flos': 338812011541800.0, 'train_loss': 0.04511010432991746, 'epoch': 3.0})"
876
+ ]
877
+ },
878
+ "execution_count": 38,
879
+ "metadata": {},
880
+ "output_type": "execute_result"
881
+ }
882
+ ],
883
+ "source": [
884
+ "from transformers import Trainer\n",
885
+ "\n",
886
+ "trainer = Trainer(model,\n",
887
+ " training_args,\n",
888
+ " train_dataset = spam_data_clean[\"train\"],\n",
889
+ " eval_dataset = spam_data_clean[\"validation\"],\n",
890
+ " data_collator = data_collator,\n",
891
+ " processing_class = tokenizer,\n",
892
+ " compute_metrics=compute_metrics,)\n",
893
+ "\n",
894
+ "trainer.train()"
895
+ ]
896
+ },
897
+ {
898
+ "cell_type": "code",
899
+ "execution_count": 39,
900
+ "id": "c236f093",
901
+ "metadata": {},
902
+ "outputs": [
903
+ {
904
+ "data": {
905
+ "text/html": [
906
+ "\n",
907
+ " <div>\n",
908
+ " \n",
909
+ " <progress value='112' max='112' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
910
+ " [112/112 00:04]\n",
911
+ " </div>\n",
912
+ " "
913
+ ],
914
+ "text/plain": [
915
+ "<IPython.core.display.HTML object>"
916
+ ]
917
+ },
918
+ "metadata": {},
919
+ "output_type": "display_data"
920
+ },
921
+ {
922
+ "data": {
923
+ "text/plain": [
924
+ "{'eval_loss': 0.04052222892642021,\n",
925
+ " 'eval_accuracy': 0.9932735426008968,\n",
926
+ " 'eval_f1': 0.9752066115702479,\n",
927
+ " 'eval_precision': 1.0,\n",
928
+ " 'eval_recall': 0.9516129032258065,\n",
929
+ " 'eval_runtime': 5.1761,\n",
930
+ " 'eval_samples_per_second': 172.33,\n",
931
+ " 'eval_steps_per_second': 21.638,\n",
932
+ " 'epoch': 3.0}"
933
+ ]
934
+ },
935
+ "execution_count": 39,
936
+ "metadata": {},
937
+ "output_type": "execute_result"
938
+ }
939
+ ],
940
+ "source": [
941
+ "trainer.evaluate()"
942
+ ]
943
+ },
944
+ {
945
+ "cell_type": "code",
946
+ "execution_count": 40,
947
+ "id": "1e6538eb",
948
+ "metadata": {},
949
+ "outputs": [
950
+ {
951
+ "data": {
952
+ "text/plain": [
953
+ "('spam-classifier\\\\tokenizer_config.json',\n",
954
+ " 'spam-classifier\\\\special_tokens_map.json',\n",
955
+ " 'spam-classifier\\\\vocab.txt',\n",
956
+ " 'spam-classifier\\\\added_tokens.json',\n",
957
+ " 'spam-classifier\\\\tokenizer.json')"
958
+ ]
959
+ },
960
+ "execution_count": 40,
961
+ "metadata": {},
962
+ "output_type": "execute_result"
963
+ }
964
+ ],
965
+ "source": [
966
+ "trainer.save_model(\"spam-ham-classification\")\n",
967
+ "tokenizer.save_pretrained(\"spam-classifier\")"
968
+ ]
969
+ },
970
+ {
971
+ "cell_type": "code",
972
+ "execution_count": null,
973
+ "id": "99dbfb57",
974
+ "metadata": {},
975
+ "outputs": [],
976
+ "source": []
977
+ }
978
+ ],
979
+ "metadata": {
980
+ "kernelspec": {
981
+ "display_name": "Python 3 (ipykernel)",
982
+ "language": "python",
983
+ "name": "python3"
984
+ },
985
+ "language_info": {
986
+ "codemirror_mode": {
987
+ "name": "ipython",
988
+ "version": 3
989
+ },
990
+ "file_extension": ".py",
991
+ "mimetype": "text/x-python",
992
+ "name": "python",
993
+ "nbconvert_exporter": "python",
994
+ "pygments_lexer": "ipython3",
995
+ "version": "3.11.4"
996
+ }
997
+ },
998
+ "nbformat": 4,
999
+ "nbformat_minor": 5
1000
+ }
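
The notebook fine-tunes bert-base-cased on the train split and reports metrics on the validation split only; the held-out test split it saves to disk is never scored. A minimal sketch of how the same Trainer could evaluate it, assuming the notebook's trainer and spam_data_clean variables are still in scope:

# Hypothetical follow-up cell; reuses trainer and spam_data_clean from the notebook above.
test_metrics = trainer.evaluate(eval_dataset=spam_data_clean["test"])
print(test_metrics)   # eval_loss, eval_accuracy, eval_f1, eval_precision, eval_recall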
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "extra_special_tokens": {},
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }
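
This is the standard cased BERT WordPiece tokenizer configuration (do_lower_case is false, model_max_length 512). A short sketch of loading it from the directory the notebook saves ("spam-classifier") and inspecting the special tokens it adds; the path is taken from the notebook and assumed to exist locally:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("spam-classifier")   # written by tokenizer.save_pretrained(...)
enc = tok("Are you up for the challenge? I know i am :)")
print(tok.convert_ids_to_tokens(enc["input_ids"]))       # starts with [CLS] and ends with [SEP]
print(tok.model_max_length)                              # 512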
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
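
Taken together, the uploaded files are enough to run the classifier locally. A minimal inference sketch, assuming the model directory saved by trainer.save_model ("spam-ham-classification") and the tokenizer directory ("spam-classifier") from the notebook; since config.json defines no id2label mapping, the pipeline reports LABEL_0 / LABEL_1, which correspond to ham / spam given the ClassLabel order ["ham", "spam"] used in the notebook:

from transformers import pipeline

clf = pipeline("text-classification",
               model="spam-ham-classification",     # saved by trainer.save_model(...)
               tokenizer="spam-classifier")         # saved by tokenizer.save_pretrained(...)
print(clf("URGENT! Your Mobile number has been awarded with a £2000 prize GUARANTEED."))
# e.g. [{'label': 'LABEL_1', 'score': ...}]  -> LABEL_1 = spam, LABEL_0 = ham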