import os
def read_txt(file_path):
    """Return the contents of a text file, or an empty string if it cannot be read."""
    try:
        with open(file_path, "r") as file:
            return file.read()
    except OSError:
        return ""
| with open("train.txt", "w") as f: | |
| f.write('') | |
| data = "" | |
| for filename in os.listdir("./"): | |
| file_path = os.path.join("./", filename) | |
| if file_path.endswith(".txt") and (file_path != "train.txt"): | |
| data += read_txt(file_path) | |
| data = ' '.join(data.split('\n')) | |
| with open("train.txt", "a") as f: | |
| f.write(data) | |
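# Optional sanity check (not in the original script): fail fast if no source
# .txt files were found, rather than fine-tuning on an empty corpus.
assert os.path.getsize("train.txt") > 0, "train.txt is empty; add .txt files to this directory first."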
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
)
def load_dataset(file_path, tokenizer, block_size=128):
    # Tokenize the corpus into fixed-length blocks for causal LM training.
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset
def load_data_collator(tokenizer, mlm=False):
    # mlm=False gives a causal-LM collator: labels are copies of the inputs
    # (the model shifts them internally when computing the loss).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator
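# Note: TextDataset is deprecated in recent transformers releases. A rough
# equivalent using the standalone `datasets` library (assuming it is installed)
# would look like:
#
#     from datasets import load_dataset as hf_load_dataset
#     raw = hf_load_dataset("text", data_files={"train": "train.txt"})
#     tokenized = raw.map(lambda ex: tokenizer(ex["text"]), batched=True)
#
# This script keeps the TextDataset path for simplicity.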
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Save the tokenizer (and base model weights) to output_dir up front, so
    # generation can later load everything from a single path.
    tokenizer.save_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    trainer.save_model()
train_file_path = "train.txt"
model_name = "gpt2"
output_dir = "model"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
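# Trainer writes intermediate checkpoints to model/checkpoint-<step> every
# `save_steps` optimizer steps; trainer.save_model() writes the final weights
# to `output_dir` itself.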
def load_model(model_path):
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    return GPT2Tokenizer.from_pretrained(tokenizer_path)
def generate_text(model_path, sequence, max_length):
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(sequence, return_tensors="pt")
    # Nucleus sampling: pick from the top 50 tokens within the top 95% of
    # probability mass; pad with EOS since GPT-2 has no dedicated pad token.
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))
model_path = "model"  # same directory as output_dir above
sequence = "Hello!"
max_len = 50
generate_text(model_path, sequence, max_len)
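# For less random output, beam search is one alternative (illustrative settings,
# not from the original script):
#
#     model.generate(ids, num_beams=5, max_length=max_length, early_stopping=True)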