"""Fine-tune a sequence-classification model on CSV data with the HF Trainer.

Reads a training and an evaluation CSV, maps the label column to integer ids
(ids are assigned from the sorted unique labels of the training CSV),
tokenizes the text column, trains, and writes the model and tokenizer to
``--output_dir``.
"""
import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from training.utils import compute_metrics_sentiment

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", default="distilbert-base-uncased")
parser.add_argument("--train_csv", required=True)
parser.add_argument("--eval_csv", required=True)
parser.add_argument("--text_col", default="text")
parser.add_argument("--label_col", default="label")
parser.add_argument("--output_dir", default="./outputs/sentiment")
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--lr", type=float, default=5e-5)
args = parser.parse_args()

train_df = pd.read_csv(args.train_csv)
eval_df = pd.read_csv(args.eval_csv)

# The label vocabulary comes from the training split only; sorting keeps the
# label -> id assignment deterministic across runs.
label_names = sorted(train_df[args.label_col].unique().tolist())
label2id = {l: i for i, l in enumerate(label_names)}
id2label = {i: l for l, i in label2id.items()}

# Fail early with a readable message instead of a bare KeyError raised from
# inside Dataset.map when the eval split holds a label never seen in training.
unknown_labels = set(eval_df[args.label_col].unique().tolist()) - set(label2id)
if unknown_labels:
    raise ValueError(
        f"Labels in {args.eval_csv} that do not occur in {args.train_csv}: "
        f"{sorted(unknown_labels, key=str)}"
    )


def encode(df):
    """Tokenize one batch of rows and attach integer labels.

    Args:
        df: A column-name -> values mapping. ``Dataset.map(batched=True)``
            passes a mapping whose values are plain Python lists; a pandas
            DataFrame also works because columns are only iterated.

    Returns:
        The tokenizer output for the text column with an added ``"labels"``
        entry holding the ``label2id`` id of every row.

    Raises:
        KeyError: If a row's label is not a key of ``label2id``.
    """
    # list(...) instead of .tolist(): batched map hands over Python lists,
    # which have no .tolist() method.
    tok = tokenizer(list(df[args.text_col]), truncation=True, padding=True)
    tok["labels"] = [label2id[l] for l in df[args.label_col]]
    return tok


# Must be bound before the .map() calls below, because encode() reads the
# module-level name `tokenizer` at call time.
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

train_raw = Dataset.from_pandas(train_df)
eval_raw = Dataset.from_pandas(eval_df)
# Drop every original column using the dataset's own column list (a list of
# str, which also covers any index column from_pandas may have added), so
# only the tokenizer outputs and "labels" remain.
train_ds = train_raw.map(
    encode, batched=True, remove_columns=train_raw.column_names
)
eval_ds = eval_raw.map(
    encode, batched=True, remove_columns=eval_raw.column_names
)

model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; without this TrainingArguments raises ValueError.
    save_strategy="epoch",
    learning_rate=args.lr,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # NOTE(review): assumes compute_metrics_sentiment returns an "accuracy"
    # key -- confirm against training.utils.
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_sentiment,
)

trainer.train()
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)