Spaces:
Runtime error
Runtime error
| import re | |
| import os | |
| import transformers | |
| import torch | |
| from transformers import TextDataset, DataCollatorForLanguageModeling | |
| from transformers import GPT2Tokenizer, GPT2LMHeadModel | |
| from transformers import Trainer, TrainingArguments | |
# Report up front whether PyTorch can see a CUDA device.
_cuda_visible = torch.cuda.is_available()
print(_cuda_visible)
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps, resume_from_checkpoint):
    """Fine-tune the ``malteos/gpt2-uk`` causal language model on a text file.

    The tokenizer and the base model are loaded from the hard-coded
    ``"malteos/gpt2-uk"`` checkpoint, both are saved into ``output_dir``,
    a ``Trainer`` is run over the dataset built from ``train_file_path``,
    and the trained model is saved via ``Trainer.save_model()``.

    Args:
        train_file_path: Text file used to build the training dataset.
        model_name: Accepted for interface compatibility but currently
            unused; the base checkpoint is hard-coded.
            NOTE(review): confirm whether this should select the checkpoint.
        output_dir: Directory that receives the tokenizer, the base model
            and the training outputs.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. Previously this
            argument was silently dropped, so the library default was used
            instead of the caller's value.
        resume_from_checkpoint: Forwarded unchanged to ``Trainer.train``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("malteos/gpt2-uk")
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    # Keep the tokenizer files next to the model in the output directory.
    tokenizer.save_pretrained(output_dir)

    model = AutoModelForCausalLM.from_pretrained("malteos/gpt2-uk")
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Fix: honour the caller's checkpoint interval instead of ignoring it.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    trainer.save_model()
# --- Run configuration -------------------------------------------------------
train_directory = 'H:/Finetunning/q_and_a'
train_file_path = 'H:/Finetunning/journal.txt'
output_dir = 'H:/Finetunning/custom_full_text'
model_name = train_directory

overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 51
save_steps = 50000

# Collect every keyword argument for train() in one place.
_run_settings = {
    "train_file_path": train_file_path,
    "model_name": model_name,
    "output_dir": output_dir,
    "overwrite_output_dir": overwrite_output_dir,
    "per_device_train_batch_size": per_device_train_batch_size,
    "num_train_epochs": num_train_epochs,
    "save_steps": save_steps,
    # False for the very first run, True to continue from a previous stopping point.
    "resume_from_checkpoint": True,
}

print("Починаємо навчання...")
train(**_run_settings)