"""
Bengali Sentiment - Fine-tuning Script
=======================================

Usage: python fine_tune_script.py
"""

from functools import lru_cache

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Dataset identifier handed to ``load_dataset``.
DATASET = "Swarnadeep-28/bn_code_mix_sentiment_dataset"

# Checkpoint name used to load both the tokenizer and the classifier.
MODEL = "distilbert-base-multilingual-cased"

# Maps the dataset's string labels to the integer class ids used for training.
LABEL_MAP = {"Negative": 0, "Neutral": 1, "Positive": 2, "Mixed": 3}


@lru_cache(maxsize=None)
def _load_tokenizer(name):
    """Return the tokenizer for checkpoint ``name``, loading it only once."""
    return AutoTokenizer.from_pretrained(name)


def preprocess(examples):
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        examples: Batch mapping with a ``"text"`` column and a ``"label"``
            column whose values are keys of ``LABEL_MAP``.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding the mapped integer class ids.

    Raises:
        KeyError: If a label is not a key of ``LABEL_MAP``.
    """
    texts = [str(text) for text in examples["text"]]
    labels = [LABEL_MAP[label] for label in examples["label"]]
    # This function is invoked once per mapped batch; reuse a single cached
    # tokenizer instead of reloading it from the checkpoint on every call.
    tokenizer = _load_tokenizer(MODEL)
    # padding=True pads to the longest sequence within this batch only.
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=128)
    encoded["labels"] = labels
    return encoded


def main(train_size=2000, val_size=500):
    """Fine-tune ``MODEL`` on ``DATASET`` and save the model and tokenizer.

    Args:
        train_size: Maximum number of examples taken from the start of the
            ``train`` split.
        val_size: Maximum number of examples taken from the start of the
            ``validation`` split.
    """
    save_dir = "./bengali_sentiment_finetuned"
    ds = load_dataset(DATASET)

    def prepare(split, limit):
        # Cap at the split length so a short split does not make select()
        # fail on out-of-range indices.
        subset = ds[split].select(range(min(limit, len(ds[split]))))
        return subset.map(
            preprocess,
            batched=True,
            remove_columns=ds[split].column_names,
        )

    train = prepare("train", train_size)
    val = prepare("validation", val_size)

    # Derive the head size and label names from LABEL_MAP so the saved config
    # carries readable class names rather than generic placeholders.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL,
        num_labels=len(LABEL_MAP),
        id2label={idx: name for name, idx in LABEL_MAP.items()},
        label2id=dict(LABEL_MAP),
    )

    # DataCollatorWithPadding needs a tokenizer to pad each batch; calling it
    # with no arguments raises TypeError.
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    args = TrainingArguments(
        output_dir="./output",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        eval_strategy="epoch",
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=val,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()

    trainer.save_model(save_dir)
    # The Trainer was not given the tokenizer, so save it explicitly to make
    # the output directory loadable on its own.
    tokenizer.save_pretrained(save_dir)


if __name__ == "__main__":
    main()
|