from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
|
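# Path to the cleaned book text used for fine-tuning.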
book_data_file = "data/Computer Networking_cleaned.txt"
|
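# Read the corpus once for the manual chunking below; note that TextDataset
# further down re-reads and tokenizes the file itself.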
with open(book_data_file, "r", encoding="utf-8") as f:
    text = f.read()
|
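# Load the pretrained GPT-2 tokenizer and model; from_pretrained() also pulls
# the matching model config, so no separate GPT2Config is needed.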
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
|
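# Split the raw text into fixed-size chunks and tokenize them. The chunk size
# here is measured in characters, not tokens, and these ids are not consumed
# by the Trainer below (TextDataset builds its own token blocks); they are
# mainly useful for inspecting the corpus, e.g. its total token count.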
max_sequence_length = 1024
chunks = [text[i:i + max_sequence_length] for i in range(0, len(text), max_sequence_length)]
|
input_ids = []
for chunk in chunks:
    input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
|
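# Build the training dataset: TextDataset tokenizes the file and slices it
# into contiguous blocks of block_size tokens. (TextDataset is deprecated in
# recent transformers releases in favour of the `datasets` library, but it
# still works.)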
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=book_data_file,
    block_size=128,
    overwrite_cache=False,
)
|
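# Hyperparameters for a single quick fine-tuning pass; checkpoints are written
# to output_dir every save_steps and only the two most recent are kept.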
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    # Evaluation is left at the default ("no"): no eval_dataset is provided,
    # so a step-based evaluation schedule would fail at the first eval step.
)
|
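# mlm=False selects the causal (next-token) language-modeling objective that
# GPT-2 is trained with; the collator also derives the labels from the inputs.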
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)
|
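# Run fine-tuning; with num_train_epochs=1 this is a single pass over the corpus.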
trainer.train() |
|
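# Save the fine-tuned weights, plus the tokenizer, so that ./Cyber_LLM is a
# self-contained checkpoint that can be reloaded on its own.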
trainer.save_model("./Cyber_LLM") |
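tokenizer.save_pretrained("./Cyber_LLM")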
|
print("Training completed.") |
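
# Optional smoke test (a minimal sketch): reload the fine-tuned checkpoint and
# generate a short continuation. The prompt text is illustrative only, and
# max_new_tokens requires a reasonably recent transformers release.
from transformers import pipeline

generator = pipeline("text-generation", model="./Cyber_LLM", tokenizer="./Cyber_LLM")
print(generator("TCP congestion control works by", max_new_tokens=40)[0]["generated_text"])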