# CyberSecurity_LLM / FineTuning_Cyber_LLM.py
# (Hugging Face repo page text, commented out so the file is valid Python —
#  original upload: "Uploading the model", commit 300ea65, by at0m-b0mb)
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import os
# --- Fine-tune GPT-2 as a causal language model on a cybersecurity corpus ----

# Path to the cleaned training corpus. os.path.join keeps the path portable;
# the original used a Windows-style backslash literal, which breaks on POSIX.
book_data_file = os.path.join("data", "Computer Networking_cleaned.txt")

# Fail early with a clear error instead of a confusing one inside TextDataset.
if not os.path.isfile(book_data_file):
    raise FileNotFoundError(f"Training corpus not found: {book_data_file}")

# Load the pretrained GPT-2 model and its tokenizer.
model_name = "gpt2"  # You can choose a different model size as needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Build the dataset straight from the text file: TextDataset tokenizes the
# file and slices it into contiguous block_size-token training examples.
# NOTE(review): the original script also hand-chunked the raw text and built
# an `input_ids` list, but that result was never used by the Trainer — it was
# dead code and has been removed.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=book_data_file,
    block_size=128,
    overwrite_cache=False,
)

# Training configuration. Evaluation is disabled: no eval_dataset is supplied
# to the Trainer, so the original evaluation_strategy="steps" setting would
# make training fail at the first scheduled evaluation.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=1,  # You can adjust the number of training epochs
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,  # keep only the two most recent checkpoints
)

# Causal-LM collator: mlm=False -> next-token prediction, not masked LM.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)

# Run fine-tuning and persist the result.
trainer.train()
trainer.save_model("./Cyber_LLM")
# Save the tokenizer too — without it the exported checkpoint cannot be
# reloaded for inference on its own.
tokenizer.save_pretrained("./Cyber_LLM")
print("Training completed.")