"""Fine-tune a pretrained sequence-classification model with the HF Trainer.

Running this module loads the model and tokenizer named by ``model_name``,
tokenizes the dataset stored at ``models/processed_dataset``, and trains on
its ``"train"`` split, writing checkpoints under ``models/finetuned_model``.
"""

import datasets
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

model_name = "CohereForAI/c4ai-command-r-plus-08-2024"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: A batch handed over by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, padded and truncated.
    """
    # NOTE(review): no explicit ``max_length`` is passed, so padding and
    # truncation fall back to the tokenizer's own maximum length. Confirm
    # that this is intended; it can make every example very long.
    return tokenizer(examples["text"], padding="max_length", truncation=True)


dataset = datasets.load_from_disk("models/processed_dataset")
tokenized_dataset = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="models/finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,  # keep only the two most recent checkpoints
)

# NOTE(review): presumably the stored dataset already carries a label column
# the model can consume; that is not visible here. Verify before training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# NOTE(review): only the periodic checkpoints configured above are requested;
# there is no explicit ``trainer.save_model()`` call after training. Confirm
# whether a final export into ``output_dir`` is wanted.
trainer.train()