import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
# Load datasets
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
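# Expected CSV layout (inferred from the code below, not stated explicitly):
# a "text" column with the raw input string and an integer "label" column.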
# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)
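# With batched=True, padding=True pads each mapped batch to its longest sequence.
# Dynamic padding at training time via DataCollatorWithPadding is a common
# alternative, but it is not used in this script.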
# Model
num_labels = len(train_df["label"].unique())
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)
# Training args
training_args = TrainingArguments(
    output_dir="models/distilbert",
    eval_strategy="epoch",  # renamed from evaluation_strategy in transformers >= 4.41
    save_strategy="epoch",
    logging_dir="logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)
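# Optional metric function (an addition, not in the original script): with
# eval_strategy="epoch", the Trainer reports only the eval loss unless a
# compute_metrics callable is supplied. Passing compute_metrics=compute_metrics
# to Trainer(...) below would add accuracy to each evaluation.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)  # highest-scoring class per example
    return {"accuracy": float((preds == labels).mean())}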
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()
trainer.save_model("models/distilbert")
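# Saving the tokenizer alongside the model weights (added here as a common
# practice, not in the original script) keeps models/distilbert self-contained
# for later reloading with from_pretrained.
tokenizer.save_pretrained("models/distilbert")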
print("✅ Model trained and saved at models/distilbert")