from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# IMDB movie-review dataset: train / test / unsupervised splits of raw text
# with binary sentiment labels. Downloaded (and cached) from the HF Hub.
dataset = load_dataset("imdb")
|
| | |
# Single checkpoint name drives both tokenizer and model so they cannot drift apart.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2: attach a fresh binary-classification head for the IMDB labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
|
| | |
def tokenize_func(examples):
    """Tokenize a batch of raw review texts.

    Truncates each text to the tokenizer's model maximum length and pads
    every example out to that same fixed length.
    NOTE(review): fixed `padding="max_length"` is memory-hungry; dynamic
    padding via a data collator would be cheaper — kept as-is to preserve
    behavior.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# batched=True hands tokenize_func lists of examples, letting the fast
# tokenizer process whole batches at once.
tokenized_datasets = dataset.map(tokenize_func, batched=True)
| |
|
| | |
# Fine-tune for one epoch, evaluating on the held-out split at each epoch end.
#
# Bug fix: the original passed eval_dataset to Trainer but left
# TrainingArguments at its default eval strategy ("no"), so evaluation
# never ran and the test split was dead weight. eval_strategy="epoch"
# makes the Trainer actually use it.
training_args = TrainingArguments(
    output_dir="my_model_folder",
    num_train_epochs=1,
    eval_strategy="epoch",  # NOTE: older transformers versions spell this `evaluation_strategy`
)

trainer = Trainer(
    model=model,                                  # classification model defined above
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],      # evaluated once per epoch (see eval_strategy)
)

trainer.train()
| |
|