In [1]:
from datasets import load_dataset

data_files ="E:/Hugging_Face/SMS_Spam.csv"
spam_data = load_dataset("csv", data_files = data_files, split = "train")
spam_data = spam_data.train_test_split(test_size = 0.2)
spam_data["train"][:3]

{'Label': ['ham', 'ham', 'ham'],
 'Sentence': ['Are you up for the challenge? I know i am :)',
  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',
  'Kallis is ready for bat in 2nd innings']}

In [2]:
def lower_case(example):
    return {"Sentence": example["Sentence"].lower()}

spam_data.map(lower_case)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Label', 'Sentence'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['Label', 'Sentence'],
        num_rows: 1115
    })
})

In [3]:
def sen_len(example):
    return {"length": len(example["Sentence"].split())}

spam_data = spam_data.map(sen_len)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [4]:
spam_data["train"][:3]

{'Label': ['ham', 'ham', 'ham'],
 'Sentence': ['Are you up for the challenge? I know i am :)',
  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',
  'Kallis is ready for bat in 2nd innings'],
 'length': [11, 29, 8]}

In [8]:
spam_data = spam_data.rename_column("Label", "labels")

In [9]:
import html

spam_data = spam_data.map(lambda x: {"Sentence": html.unescape(x["Sentence"])}, batched = True)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [10]:
spam_data["train"][:20]

{'labels': ['ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'ham',
  'spam',
  'ham',
  'ham',
  'ham',
  'ham',
  'spam',
  'ham',
  'ham'],
 'Sentence': ['Are you up for the challenge? I know i am :)',
  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',
  'Kallis is ready for bat in 2nd innings',
  'Gud mrng dear hav a nice day',
  'I not free today i haf 2 pick my parents up tonite...',
  'Good afternoon on this glorious anniversary day, my sweet J !! I hope this finds you happy and content, my Prey. I think of you and send a teasing kiss from across the sea coaxing images of fond souveniers ... You Cougar-Pen',
  'SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.',
  'Haha awesome, I might need to take you up on that, what you doin tonight?',
  'Ok...',
  'I am sorry it hurt you.',
  'Watching cartoon, listening mu

In [13]:
from datasets import load_dataset, ClassLabel

spam_data = spam_data.cast_column(
    "labels", ClassLabel(names=["ham", "spam"])
)

print(spam_data["train"].features)


Casting the dataset:   0%|          | 0/4459 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1115 [00:00<?, ? examples/s]

{'labels': ClassLabel(names=['ham', 'spam']), 'Sentence': Value('string'), 'length': Value('int64')}


In [14]:
spam_data["train"][:3]

{'labels': [0, 0, 0],
 'Sentence': ['Are you up for the challenge? I know i am :)',
  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',
  'Kallis is ready for bat in 2nd innings'],
 'length': [11, 29, 8]}

In [15]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(example):
    return tokenizer(example["Sentence"], truncation = True)

tokenized_dataset = spam_data.map(tokenize_function, batched = True)

tokenized_dataset["train"][0]

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

{'labels': 0,
 'Sentence': 'Are you up for the challenge? I know i am :)',
 'length': 11,
 'input_ids': [101,
  2372,
  1128,
  1146,
  1111,
  1103,
  4506,
  136,
  146,
  1221,
  178,
  1821,
  131,
  114,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1115
    })
})

In [17]:
spam_data_clean = tokenized_dataset["train"].train_test_split(train_size = 0.8, seed = 42)

spam_data_clean["validation"] = spam_data_clean.pop("test")

spam_data_clean["test"] = tokenized_dataset["test"]

In [18]:
spam_data_clean

DatasetDict({
    train: Dataset({
        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3567
    })
    validation: Dataset({
        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 892
    })
    test: Dataset({
        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1115
    })
})

In [19]:
spam_data_clean.save_to_disk("Spam-Ham-Classification")

Saving the dataset (0/1 shards):   0%|          | 0/3567 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/892 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1115 [00:00<?, ? examples/s]

In [20]:
spam_data_clean["validation"][:3]

{'labels': [0, 0, 0],
 'Sentence': ['What your plan for pongal?',
  "alright, I'll make sure the car is back tonight",
  'Multiply the numbers independently and count decimal points then, for the division, push the decimal places like i showed you.'],
 'length': [5, 9, 20],
 'input_ids': [[101, 1327, 1240, 2197, 1111, 185, 4553, 1348, 136, 102],
  [101,
   15354,
   117,
   146,
   112,
   1325,
   1294,
   1612,
   1103,
   1610,
   1110,
   1171,
   3568,
   102],
  [101,
   18447,
   1643,
   1193,
   1103,
   2849,
   8942,
   1105,
   5099,
   1260,
   27924,
   1827,
   1173,
   117,
   1111,
   1103,
   2417,
   117,
   4684,
   1103,
   1260,
   27924,
   2844,
   1176,
   178,
   2799,
   1128,
   119,
   102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [21]:
spam_data_clean.remove_columns(["Sentence","length"])

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3567
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 892
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1115
    })
})

In [22]:
data_files = {"train": spam_data_clean["train"], "validation": spam_data_clean["validation"], "test": spam_data_clean["test"]}

In [35]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

training_args = TrainingArguments("test-trainer",
                                 eval_strategy = "epoch",
                                 fp16 = True,
                                 #gradient_accumulation_steps = 4,
                                 #per_device_train_batch_size = 4,
                                 learning_rate= 1e-5,
                                 lr_scheduler_type = "cosine",)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [37]:
import evaluate, numpy as np
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

In [38]:
from transformers import Trainer

trainer = Trainer(model,
                  training_args,
                  train_dataset = spam_data_clean["train"],
                  eval_dataset = spam_data_clean["validation"],
                  data_collator = data_collator,
                  processing_class = tokenizer,
                 compute_metrics=compute_metrics,)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.045297,0.98991,0.962963,0.983193,0.943548
2,0.095300,0.042776,0.993274,0.975207,1.0,0.951613
3,0.021200,0.040522,0.993274,0.975207,1.0,0.951613


TrainOutput(global_step=1338, training_loss=0.04511010432991746, metrics={'train_runtime': 136.1512, 'train_samples_per_second': 78.596, 'train_steps_per_second': 9.827, 'total_flos': 338812011541800.0, 'train_loss': 0.04511010432991746, 'epoch': 3.0})

In [39]:
trainer.evaluate()

{'eval_loss': 0.04052222892642021,
 'eval_accuracy': 0.9932735426008968,
 'eval_f1': 0.9752066115702479,
 'eval_precision': 1.0,
 'eval_recall': 0.9516129032258065,
 'eval_runtime': 5.1761,
 'eval_samples_per_second': 172.33,
 'eval_steps_per_second': 21.638,
 'epoch': 3.0}

In [40]:
trainer.save_model("spam-ham-classification")
tokenizer.save_pretrained("spam-classifier")

('spam-classifier\\tokenizer_config.json',
 'spam-classifier\\special_tokens_map.json',
 'spam-classifier\\vocab.txt',
 'spam-classifier\\added_tokens.json',
 'spam-classifier\\tokenizer.json')