"""Fine-tune a sequence-classification model with the Hugging Face Trainer.

Loads a pretrained checkpoint and its tokenizer, tokenizes a dataset that was
previously saved to disk, and trains for 3 epochs, writing checkpoints under
``models/finetuned_model``.
"""
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
import datasets

# NOTE(review): this checkpoint is a very large causal LM; loading it via
# AutoModelForSequenceClassification attaches a freshly initialized
# classification head (default num_labels=2) — confirm this base model and
# label count are intended.
model_name = "CohereForAI/c4ai-command-r-plus-08-2024"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Pads every sequence to the tokenizer's max length and truncates longer
    ones, so all features in a batch share one fixed length.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Dataset was previously written with save_to_disk; assumes a "text" column
# (and a label column for training) — TODO confirm against the producer.
dataset = datasets.load_from_disk('models/processed_dataset')
tokenized_dataset = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="models/finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=1000,        # checkpoint every 1000 optimizer steps
    save_total_limit=2,     # keep only the two most recent checkpoints
)

# No eval_dataset is supplied, so training runs without evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

trainer.train()