"""Fine-tune a sequence-classification model for sentiment analysis from CSV data."""
import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from training.utils import compute_metrics_sentiment

# Command-line interface: model choice, data locations, column names,
# and the core training hyperparameters.
parser = argparse.ArgumentParser()
_cli_options = [
    ("--model_name", dict(default="distilbert-base-uncased")),
    ("--train_csv", dict(required=True)),
    ("--eval_csv", dict(required=True)),
    ("--text_col", dict(default="text")),
    ("--label_col", dict(default="label")),
    ("--output_dir", dict(default="./outputs/sentiment")),
    ("--epochs", dict(type=int, default=3)),
    ("--batch_size", dict(type=int, default=16)),
    ("--lr", dict(type=float, default=5e-5)),
]
for _flag, _opts in _cli_options:
    parser.add_argument(_flag, **_opts)
args = parser.parse_args()
# Load the train/eval splits and derive a deterministic label <-> id mapping
# from the labels observed in the training data (sorted for reproducibility).
# .tolist() converts pandas/numpy scalars to plain Python values before they
# are used as mapping keys.
train_df = pd.read_csv(args.train_csv)
eval_df = pd.read_csv(args.eval_csv)

label_names = sorted(set(train_df[args.label_col].tolist()))
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}
def encode(batch):
    """Tokenize one batch of examples and attach integer class labels.

    `batch` is the dict-of-lists that ``datasets.Dataset.map(batched=True)``
    passes in, so each column is already a plain Python list.  The previous
    implementation treated it as a DataFrame and called ``.tolist()`` on the
    columns, which raises AttributeError under ``map``.

    Returns the tokenizer's encoding dict augmented with a ``labels`` list,
    which ``map`` merges into the dataset as new columns.
    """
    tok = tokenizer(list(batch[args.text_col]), truncation=True, padding=True)
    tok["labels"] = [label2id[label] for label in batch[args.label_col]]
    return tok
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

# preserve_index=False stops Dataset.from_pandas from materializing the
# DataFrame index as an extra "__index_level_0__" column; that column is not
# in df.columns and would otherwise survive remove_columns and leak into the
# tokenized dataset.  remove_columns is documented as str / List[str], so the
# pandas Index is converted to a plain list explicitly.
train_ds = Dataset.from_pandas(train_df, preserve_index=False).map(
    encode, batched=True, remove_columns=list(train_df.columns)
)
eval_ds = Dataset.from_pandas(eval_df, preserve_index=False).map(
    encode, batched=True, remove_columns=list(eval_df.columns)
)
# Classification head sized to the observed label set; storing the
# label <-> id maps in the model config means they ship with the checkpoint.
num_labels = len(label_names)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
training_args = TrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the eval
    # strategy; with the default save_strategy="steps" TrainingArguments
    # raises a ValueError at construction, so checkpoint per epoch as well.
    save_strategy="epoch",
    learning_rate=args.lr,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Requires compute_metrics_sentiment to report an "accuracy" key.
    metric_for_best_model="accuracy",
)
# Wire everything into the Trainer.  compute_metrics_sentiment presumably
# returns an "accuracy" key, since that is what metric_for_best_model selects
# on — verify against training.utils.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_ds,
    "eval_dataset": eval_ds,
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics_sentiment,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()

# Persist both model and tokenizer so output_dir is a self-contained checkpoint.
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)