# CyberSecurity_LLM / FineTuning_Cyber_LLM_v3.py
# Uploaded by at0m-b0mb — "Uploading the model" (commit 300ea65)
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Location of the raw training text files.
data_folder = "data"

# Base checkpoint to fine-tune; swap for a larger GPT-2 variant if the
# hardware allows (e.g. "gpt2-medium").
model_name = "gpt2"

# Load the tokenizer, the model configuration, and the pretrained
# weights from the Hugging Face hub.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
# Build the training dataset: read every regular file under `data_folder`,
# tokenize its text, and accumulate one flat stream of token ids.
input_ids = []
for filename in os.listdir(data_folder):
    file_path = os.path.join(data_folder, filename)
    # Skip subdirectories and other non-file entries.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    # Tokenize in character-sized chunks so a single huge file cannot
    # produce one enormous encode() call.
    max_sequence_length = 1024
    for start in range(0, len(text), max_sequence_length):
        chunk = text[start:start + max_sequence_length]
        input_ids.extend(tokenizer.encode(chunk, add_special_tokens=True))

# BUG FIX: TextDataset has no `inputs` parameter — it only reads tokens
# from a file via `file_path` — so the original call raised TypeError.
# Build the language-modeling examples directly instead: contiguous,
# non-overlapping blocks of `block_size` token ids, each wrapped as
# {"input_ids": [...]} for DataCollatorForLanguageModeling. A plain list
# works as a Trainer train_dataset (it supports len() and indexing).
# Trailing tokens that do not fill a whole block are dropped, matching
# TextDataset's behavior.
block_size = 128
dataset = [
    {"input_ids": input_ids[i:i + block_size]}
    for i in range(0, len(input_ids) - block_size + 1, block_size)
]
# Training hyperparameters.
# BUG FIX: the original passed evaluation_strategy="epoch" (plus
# eval_steps), but no eval_dataset is ever given to the Trainer below —
# requesting evaluation without an eval dataset makes Trainer.__init__
# raise ValueError. Evaluation is therefore disabled here.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=3,              # adjust the number of training epochs as needed
    per_device_train_batch_size=4,   # adjust based on available GPU memory
    save_steps=10_000,               # checkpoint every 10k optimizer steps
    save_total_limit=2,              # keep only the two most recent checkpoints
)
# GPT-2 is a causal language model, so masked-LM objective is off
# (mlm=False): the collator builds next-token-prediction labels.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the Trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collator,
)
trainer.train()

# Persist the fine-tuned weights next to the Trainer checkpoints.
model.save_pretrained("./Cyber_LLM")
print("Training completed.")