bengali-code-mix-sentiment / fine_tune_script.py
OsamaBinLikhon's picture
Add fine_tune_script.py
51027b7 verified
#!/usr/bin/env python3
"""
Bengali Sentiment - Fine-tuning Script
=======================================
Usage: python fine_tune_script.py
"""
from datasets import load_dataset
from transformers import (
AutoTokenizer, AutoModelForSequenceClassification,
Trainer, TrainingArguments, DataCollatorWithPadding
)
# Hub identifier handed to load_dataset() for the training/validation data.
DATASET = "Swarnadeep-28/bn_code_mix_sentiment_dataset"

# Checkpoint name used for both the tokenizer and the classification model.
MODEL = "distilbert-base-multilingual-cased"

# Sentiment class name -> integer class id; ids follow the listing order.
LABEL_MAP = {
    name: index
    for index, name in enumerate(("Negative", "Neutral", "Positive", "Mixed"))
}
_TOKENIZER = None  # created on first use, then shared by every preprocess() call


def _get_tokenizer():
    """Return the tokenizer for MODEL, loading it only on the first call."""
    global _TOKENIZER
    if _TOKENIZER is None:
        _TOKENIZER = AutoTokenizer.from_pretrained(MODEL)
    return _TOKENIZER


def preprocess(examples):
    """Tokenize one batch of examples and attach integer class labels.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with a ``"text"`` column (values are coerced
            to ``str``) and a ``"label"`` column whose values are keys of
            ``LABEL_MAP``.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding the integer class ids.

    Raises:
        KeyError: If a label is not a key of ``LABEL_MAP``.
    """
    # A missing text becomes an empty string; str(None) would otherwise feed
    # the literal word "None" to the tokenizer as if it were real input.
    texts = ["" if text is None else str(text) for text in examples["text"]]
    labels = [LABEL_MAP[label] for label in examples["label"]]

    # The tokenizer is loaded once and reused instead of being re-created
    # for every mapped batch.
    tokenizer = _get_tokenizer()
    tokenized = tokenizer(texts, truncation=True, padding=True, max_length=128)
    tokenized["labels"] = labels
    return tokenized
# Download the dataset; the "train" and "validation" splits are used below.
ds = load_dataset(DATASET)

# Train/evaluate on a small subset. The sizes are capped at the split length
# so select() cannot raise on a split smaller than the requested sample.
train_split = ds["train"]
val_split = ds["validation"]
train = train_split.select(range(min(2000, len(train_split)))).map(
    preprocess, batched=True, remove_columns=train_split.column_names
)
val = val_split.select(range(min(500, len(val_split)))).map(
    preprocess, batched=True, remove_columns=val_split.column_names
)

# Derive the head size from LABEL_MAP and record the class names in the model
# config, so predictions from the saved model carry readable labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(LABEL_MAP),
    id2label={index: name for name, index in LABEL_MAP.items()},
    label2id=dict(LABEL_MAP),
)
args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    eval_strategy="epoch",
)

# DataCollatorWithPadding needs the tokenizer to know how to pad each batch;
# constructing it without one raises TypeError before training starts.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=val,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()

# Save the tokenizer beside the model so the output directory can be loaded
# on its own for inference.
trainer.save_model("./bengali_sentiment_finetuned")
tokenizer.save_pretrained("./bengali_sentiment_finetuned")