import pandas as pd
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Load datasets
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    # Pad every example to a fixed length so rows from different map batches
    # stack cleanly into training batches (padding=True pads only within each
    # map batch, which can break the default collator after shuffling).
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Model: one output per distinct label in the training set
num_labels = len(train_df["label"].unique())
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

# Training args
training_args = TrainingArguments(
    output_dir="models/distilbert",
    eval_strategy="epoch",  # renamed from evaluation_strategy in recent transformers releases
    save_strategy="epoch",
    logging_dir="logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.save_model("models/distilbert")
# Save the tokenizer alongside the weights so the checkpoint is self-contained
tokenizer.save_pretrained("models/distilbert")
print("✅ Model trained and saved at models/distilbert")
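
# Optional: a minimal inference sketch, assuming the model and tokenizer were
# saved to models/distilbert above. "sample text" is a placeholder input; it
# reloads the checkpoint from disk and prints the predicted label id.
import torch

loaded_tokenizer = DistilBertTokenizerFast.from_pretrained("models/distilbert")
loaded_model = DistilBertForSequenceClassification.from_pretrained("models/distilbert")
loaded_model.eval()

inputs = loaded_tokenizer("sample text", return_tensors="pt", truncation=True, max_length=128)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
pred_id = int(logits.argmax(dim=-1))
print(f"Predicted label id: {pred_id}")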