# CyberSecurity_LLM / FineTuning_Cyber_LLM_v2.py
# Author: at0m-b0mb — commit 300ea65 ("Uploading the model")
import os
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling, Trainer, TrainingArguments
class CustomTextDataset(Dataset):
    """Flat token-stream dataset for causal-LM fine-tuning.

    All chunks are tokenized and concatenated into one long token list;
    each item is a sliding window of ``block_size`` consecutive token ids
    (windows overlap: item i starts at offset i).

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``
            returning a list of token ids (e.g. a GPT2Tokenizer).
        data_chunk: iterable of text strings to tokenize and concatenate.
        block_size: number of tokens per training example.
    """

    def __init__(self, tokenizer, data_chunk, block_size):
        self.examples = []
        for chunk in data_chunk:
            tokenized_text = tokenizer.encode(chunk, add_special_tokens=True)
            self.examples.extend(tokenized_text)
        self.block_size = block_size

    def __len__(self):
        # Clamp at 0: the original returned a negative length when the
        # tokenized corpus was shorter than block_size, which breaks
        # DataLoader iteration.
        return max(0, len(self.examples) - self.block_size)

    def __getitem__(self, i):
        # Return a window of block_size consecutive token ids as a tensor.
        return torch.tensor(self.examples[i:i + self.block_size])
# Directory holding the plain-text training corpus.
folder_path = "data"

# Read every .txt file in the folder, collecting each file's contents.
all_text_data = []
for file_name in os.listdir(folder_path):
    if not file_name.endswith(".txt"):
        continue
    with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
        all_text_data.append(f.read())

# Join the per-file texts into a single space-separated corpus string.
text = " ".join(all_text_data)
# Load the pretrained GPT-2 checkpoint and its tokenizer.
model_name = "gpt2"  # swap for a larger checkpoint (e.g. "gpt2-medium") if needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Slice the corpus into fixed-size character windows before tokenization,
# so no single encode() call sees an overly long string.
max_sequence_length = 1024
chunks = [
    text[start:start + max_sequence_length]
    for start in range(0, len(text), max_sequence_length)
]

# Build the sliding-window token dataset (128 tokens per example).
dataset = CustomTextDataset(tokenizer=tokenizer, data_chunk=chunks, block_size=128)
# Set up training arguments.
# NOTE(review): the original passed evaluation_strategy="epoch" (plus a dead
# eval_steps) while the Trainer below receives no eval_dataset — Trainer
# raises when the first evaluation is triggered. Evaluation is disabled
# here until an eval set is wired in.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",        # checkpoints and the final model go here
    overwrite_output_dir=True,
    num_train_epochs=1,              # adjust the number of training epochs as needed
    per_device_train_batch_size=32,
    save_steps=10_000,               # checkpoint every 10k optimizer steps
    save_total_limit=2,              # keep only the two most recent checkpoints
    evaluation_strategy="no",        # no eval_dataset is provided to the Trainer
)
# Collator for causal LM: mlm=False makes it produce next-token labels
# (shifted input ids) instead of masked-LM targets.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the Trainer from the model, arguments, collator, and dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=dataset,
)
# Run fine-tuning.
trainer.train()

# Persist the fine-tuned model. Saving the tokenizer alongside it (the
# original skipped this) makes ./Cyber_LLM self-contained, so it can be
# reloaded with from_pretrained() without the upstream tokenizer files.
model.save_pretrained("./Cyber_LLM")
tokenizer.save_pretrained("./Cyber_LLM")
print("Training completed.")