#!/usr/bin/env python3
"""
Bengali Sentiment - Fine-tuning Script
=======================================

Fine-tunes the checkpoint named by ``MODEL`` for 4-way sequence
classification on a subset of the dataset named by ``DATASET`` and saves the
result to ``./bengali_sentiment_finetuned``.

Usage: python fine_tune_script.py
"""
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

DATASET = "Swarnadeep-28/bn_code_mix_sentiment_dataset"
MODEL = "distilbert-base-multilingual-cased"
LABEL_MAP = {"Negative": 0, "Neutral": 1, "Positive": 2, "Mixed": 3}

# Upper bounds on how many rows of each split are used.
TRAIN_SAMPLES = 2000
VAL_SAMPLES = 500
MAX_LENGTH = 128


def preprocess(examples, tokenizer=None):
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        examples: A batch mapping with a ``"text"`` column and a ``"label"``
            column whose values are keys of ``LABEL_MAP``.
        tokenizer: Tokenizer to apply. When omitted, the tokenizer for
            ``MODEL`` is loaded on each call; pass one in to avoid reloading
            it for every batch.

    Returns:
        The tokenizer output with an added ``"labels"`` list.

    Raises:
        KeyError: If a label value is not a key of ``LABEL_MAP``.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
    texts = [str(text) for text in examples["text"]]
    labels = [LABEL_MAP[label] for label in examples["label"]]
    # No padding here: the data collator pads each training batch to its own
    # longest sequence, so padding a whole map batch would be wasted work.
    tokenized = tokenizer(texts, truncation=True, max_length=MAX_LENGTH)
    tokenized["labels"] = labels
    return tokenized


def _prepare_split(split, limit, tokenizer):
    """Take at most ``limit`` leading rows of ``split`` and tokenize them."""
    # Clamp so a split shorter than ``limit`` does not make select() fail.
    subset = split.select(range(min(limit, len(split))))
    return subset.map(
        preprocess,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
        remove_columns=split.column_names,
    )


def main():
    """Load the data, fine-tune the model, and save model plus tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    ds = load_dataset(DATASET)
    train = _prepare_split(ds["train"], TRAIN_SAMPLES, tokenizer)
    val = _prepare_split(ds["validation"], VAL_SAMPLES, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL,
        num_labels=len(LABEL_MAP),
        # Store the label names in the config so predictions are readable.
        id2label={index: name for name, index in LABEL_MAP.items()},
        label2id=LABEL_MAP,
    )
    args = TrainingArguments(
        output_dir="./output",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        eval_strategy="epoch",
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=val,
        # The collator needs the tokenizer to know how to pad.
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()
    trainer.save_model("./bengali_sentiment_finetuned")
    # Keep the tokenizer next to the weights so the directory loads by itself.
    tokenizer.save_pretrained("./bengali_sentiment_finetuned")


if __name__ == "__main__":
    main()