"""Fine-tune DistilBERT as a three-way sentiment classifier.

Reads ``data/vibes.csv`` (the script uses its ``text`` and ``label``
columns), maps the string labels to integer ids, tokenizes the text,
holds out 20% of the rows for evaluation, trains for five epochs and
writes the model and tokenizer to ``./model``.
"""

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import pandas as pd

# Load dataset
# NOTE(review): `df` is not used anywhere in the visible script, so the CSV
# is parsed twice; drop this read if nothing else depends on it.
df = pd.read_csv("data/vibes.csv")
dataset = load_dataset("csv", data_files="data/vibes.csv")

# Label vocabulary; the list order defines the integer class ids.
labels = ["negative", "neutral", "positive"]
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}


def encode_labels(example):
    """Replace the string in ``example["label"]`` with its integer id.

    Raises:
        KeyError: if the label is not one of ``labels``.
    """
    example["label"] = label2id[example["label"]]
    return example


dataset = dataset.map(encode_labels)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` column of a batch, truncating long inputs.

    Padding is deliberately not applied here: the Trainer is given the
    tokenizer below, so its default padding collator pads each training
    batch to that batch's longest sequence instead of padding every row
    to the longest sequence of a whole ``map`` chunk.
    """
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)

# Fixed seed so the train/eval partition is the same on every run.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),  # derived so it cannot drift from `labels`
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer=` to `processing_class=`; confirm
# against the version this project pins before changing either keyword.
training_args = TrainingArguments(
    output_dir="./model",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

trainer.train()

# Persist the final weights and the tokenizer side by side in ./model.
trainer.save_model("./model")
tokenizer.save_pretrained("./model")