"""Fine-tune DistilBERT as a three-class risk classifier on ``data.csv``.

The CSV is read with a ``text`` column (the input sentence) and a ``label``
column holding one of the keys of ``label_map``.  The fine-tuned model and
its tokenizer are written to ``./model``.
"""

from datasets import load_dataset
import pandas as pd
from transformers import (
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Label mapping: human-readable class name -> integer class id.
label_map = {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
id_to_label = {idx: name for name, idx in label_map.items()}


def encode_labels(batch):
    """Replace the string labels of *batch* with their integer class ids.

    Raises:
        ValueError: if a row carries a label that is not a key of
            ``label_map`` (including a missing label).
    """
    try:
        return {"label": [label_map[name] for name in batch["label"]]}
    except KeyError as err:
        raise ValueError(
            f"Unknown label {err.args[0]!r}; expected one of {sorted(label_map)}"
        ) from err


# Load dataset once.  The label encoding must be applied to *this* object,
# because it is what the Trainer consumes; encoding a separate pandas copy
# of the file would leave the training labels as raw strings.
dataset = load_dataset("csv", data_files={"train": "data.csv"})
dataset = dataset.map(encode_labels, batched=True)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` column, truncating to the model's maximum length.

    Padding is deliberately not done here: padding inside a batched ``map``
    pads each map batch to its own longest row, so rows coming from different
    map batches would have different lengths.  The data collator pads each
    training batch instead.
    """
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)

# Pads every training batch to the longest sequence in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model: the class count is derived from the label map, and the name <-> id
# mapping is stored in the saved config so predictions can be decoded later.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
    id2label=id_to_label,
    label2id=label_map,
)

# Training args.  No evaluation strategy is passed: "no" is the default, and
# the keyword was renamed in newer transformers releases, so omitting it keeps
# the script working on both sides of the rename.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=data_collator,
)

trainer.train()

# Save model and tokenizer side by side so "./model" is loadable on its own.
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")