"""Fine-tune a binary Bangla sentiment classifier with the Hugging Face Trainer."""
import random
from dataclasses import dataclass

import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    PreTrainedTokenizer,
    EarlyStoppingCallback,
)


def process(batch: dict, tokenizer: PreTrainedTokenizer) -> dict:
    """Tokenize a batch and collapse the five polarity tags into binary labels."""
    new_labels = []
    for label in batch["Polarity"]:
        if label in ["SP", "WP"]:    # strongly/weakly positive -> 1
            new_labels.append(1)
        elif label in ["WN", "SN"]:  # weakly/strongly negative -> 0
            new_labels.append(0)
        elif label == "NU":          # neutral: keep the row, assign a random class
            new_labels.append(random.choice([1, 0]))
        else:
            raise ValueError(f"Unexpected polarity tag: {label!r}")
    inputs = tokenizer(batch["Text"], truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = new_labels
    return batch
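
# Illustrative example (assuming the dataset's "Polarity" column only uses the
# five tags handled above):
#   process({"Polarity": ["SP", "NU"], "Text": ["...", "..."]}, tokenizer)
# maps the "SP" row to label 1 and the "NU" row to a random 0 or 1, and adds
# the tokenizer's input_ids and attention_mask to the batch.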
|
|
|
|
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, and F1 from a (logits, labels) pair."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
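
# Quick sanity check (illustrative, not part of the training pipeline):
#   compute_metrics((np.array([[0.2, 0.8], [0.9, 0.1]]), np.array([1, 0])))
#   -> {"accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0}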
|
|
|
|
def pipeline(args):
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Tokenize/relabel, then carve a held-out test split off the training set.
    dataset = load_dataset(args.dataset_name)
    dataset = dataset.map(process, batched=True, fn_kwargs={"tokenizer": tokenizer})
    dataset = dataset["train"].train_test_split(test_size=args.split_ratio)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
|
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            learning_rate=args.learning_rate,
            per_device_train_batch_size=args.batch_size,
            per_device_eval_batch_size=args.batch_size,
            num_train_epochs=args.epochs,
            weight_decay=0.01,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            report_to="none",
            save_steps=500,
            eval_steps=500,
            save_total_limit=1,
            logging_steps=500,
            fp16=args.fp16,
            greater_is_better=True,
            metric_for_best_model="f1",
            hub_model_id=args.hub_location,
        ),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # Stop if "f1" fails to improve for 5 consecutive evaluations (2500 steps).
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )
|
|
    trainer.train()
    print(trainer.evaluate())
    print(trainer.predict(test_dataset).metrics)

    # Push the best checkpoint (restored by load_best_model_at_end) and the
    # tokenizer to the Hub repo named by hub_model_id; requires a prior
    # `huggingface-cli login` or an HF_TOKEN in the environment.
    trainer.push_to_hub()
    tokenizer.push_to_hub(args.hub_location)
|
|
@dataclass
class Arguments:
    model_name: str = "csebuetnlp/banglabert"
    dataset_name: str = "SayedShaun/sentigold"
    split_ratio: float = 0.1  # fraction of the train split held out for evaluation
    batch_size: int = 128
    epochs: int = 40
    learning_rate: float = 1e-5
    fp16: bool = True
    hub_location: str = "SayedShaun/bangla-classifier-binary"
|
|
if __name__ == "__main__":
    args = Arguments()
    pipeline(args)
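
# A minimal sketch of overriding the defaults from another module (the module
# name "train" is an assumption; the field names match the Arguments dataclass):
#
#   from train import Arguments, pipeline
#   pipeline(Arguments(batch_size=32, fp16=False))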