"""Fine-tune GPT-2 on the concatenated contents of all .txt files in ./data."""

import os
from pathlib import Path

import torch
from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)


class CustomTextDataset(Dataset):
    """Sliding-window causal-LM dataset.

    Tokenizes every text chunk, concatenates all token ids into one flat
    stream, and serves every window of ``block_size`` consecutive tokens
    (stride 1, so consecutive examples overlap heavily).
    """

    def __init__(self, tokenizer, data_chunk, block_size):
        self.examples = []
        for chunk in data_chunk:
            # GPT-2 adds no special tokens by default; this is plain tokenization.
            tokenized_text = tokenizer.encode(chunk, add_special_tokens=True)
            self.examples.extend(tokenized_text)
        self.block_size = block_size

    def __len__(self):
        # Clamp at 0: the original returned a negative length when the corpus
        # held fewer than block_size tokens, which breaks the DataLoader.
        return max(0, len(self.examples) - self.block_size)

    def __getitem__(self, i):
        # One contiguous window of token ids as a LongTensor; the collator
        # derives the LM labels from these input ids (mlm=False).
        return torch.tensor(self.examples[i:i + self.block_size], dtype=torch.long)


def main():
    """Read ./data/*.txt, fine-tune GPT-2, and save model + tokenizer."""
    # Gather every .txt file in the data folder; sort for a deterministic
    # corpus order across runs/filesystems.
    folder_path = Path("data")
    file_list = sorted(folder_path.glob("*.txt"))

    # Read and concatenate all file contents into one text blob.
    all_text_data = [
        file_path.read_text(encoding="utf-8") for file_path in file_list
    ]
    text = " ".join(all_text_data)

    # Initialize the GPT-2 model and tokenizer.
    model_name = "gpt2"  # A larger variant (gpt2-medium, ...) also works.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # GPT-2 ships without a pad token; the data collator needs one when it
    # has to pad a batch. Reusing EOS is the standard fix.
    tokenizer.pad_token = tokenizer.eos_token
    config = GPT2Config.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

    # Split the raw text into character chunks so each tokenizer call stays
    # well under the model's context length.
    max_sequence_length = 1024
    chunks = [
        text[i:i + max_sequence_length]
        for i in range(0, len(text), max_sequence_length)
    ]

    dataset = CustomTextDataset(
        tokenizer=tokenizer, data_chunk=chunks, block_size=128
    )

    # NOTE: the original set evaluation_strategy="epoch" but never supplied an
    # eval_dataset, so evaluation would fail at runtime. Evaluation is disabled
    # until an eval split exists.
    training_args = TrainingArguments(
        output_dir="./Cyber_LLM",
        overwrite_output_dir=True,
        num_train_epochs=1,  # Adjust the number of training epochs as needed.
        per_device_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=False
        ),
        train_dataset=dataset,
    )

    trainer.train()

    # Save both model and tokenizer so the output directory is self-contained
    # and loadable with from_pretrained().
    model.save_pretrained("./Cyber_LLM")
    tokenizer.save_pretrained("./Cyber_LLM")
    print("Training completed.")


if __name__ == "__main__":
    main()