from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_from_disk


# Load the dataset previously saved to disk with `Dataset.save_to_disk`.
dataset = load_from_disk("finn_wake_dataset")
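
# Optional sanity check (not in the original script): confirm the dataset loaded
# and that it exposes the "text" column the tokenization step below expects.
print(dataset)
print(dataset[0]["text"][:200])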
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")


# TinyLlama's tokenizer ships without a pad token, so reuse the EOS token for padding.
if tokenizer.pad_token is None:
    print("Tokenizer does not have a pad token set. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token


# Save the (now pad-aware) tokenizer next to the checkpoint so model and tokenizer
# can be reloaded together. Note the "./results" prefix, matching output_dir below.
tokenizer.save_pretrained("./results/checkpoint-12000/")
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # For causal LM fine-tuning the labels are the inputs themselves. Mask padded
    # positions with -100 so they are ignored by the loss; this matters here
    # because the pad token doubles as the EOS token.
    tokenized_inputs["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])
    ]
    return tokenized_inputs


tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Hold out 10% of the examples for evaluation.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]
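
# Optional sanity check (not in the original script): decode one training example
# to verify the text round-trips through the tokenizer.
print(tokenizer.decode(train_dataset[0]["input_ids"], skip_special_tokens=True))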
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,       # checkpoint every 500 steps...
    save_total_limit=2,   # ...keeping only the two most recent checkpoints
    use_cpu=True,         # force CPU training; drop this to use an available GPU
)
# The default data collator is sufficient here: every example is already padded
# to a fixed length and carries its own labels.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Resume from the most recent saved checkpoint rather than training from scratch.
trainer.train(resume_from_checkpoint="./results/checkpoint-10000")
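
# A minimal follow-up sketch (not in the original script): persist the fine-tuned
# model and sample from it. The "./results/final" path and the prompt are
# illustrative choices, not part of the original pipeline.
trainer.save_model("./results/final")
tokenizer.save_pretrained("./results/final")

from transformers import pipeline

generator = pipeline("text-generation", model="./results/final", tokenizer="./results/final")
print(generator("riverrun, past Eve and Adam's,", max_new_tokens=50)[0]["generated_text"])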