"""Fine-tune GPT-2 on every text file found in a local data folder.

Reads each regular file in ``data/``, tokenizes the text, packs the token
ids into fixed-size blocks, and trains a causal language model with the
Hugging Face ``Trainer``. The model and tokenizer are saved to
``./Cyber_LLM`` when training finishes.
"""
import os

import torch
from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

DATA_FOLDER = "data"
MODEL_NAME = "gpt2"  # you can choose a different model size as needed
OUTPUT_DIR = "./Cyber_LLM"
BLOCK_SIZE = 128  # length of each training example, in tokens


class BlockDataset(Dataset):
    """Expose a flat list of token ids as fixed-length training examples.

    The original code passed the raw id list to ``TextDataset`` via a
    non-existent ``inputs=`` keyword, which raises ``TypeError`` —
    ``TextDataset`` only accepts a ``file_path`` (and is deprecated).
    Building the dataset directly from the ids avoids both problems.
    """

    def __init__(self, token_ids, block_size):
        # Drop the trailing partial block so every example is exactly
        # block_size tokens long — no padding is ever needed, which
        # matters because GPT-2's tokenizer has no pad token.
        self.examples = [
            token_ids[i : i + block_size]
            for i in range(0, len(token_ids) - block_size + 1, block_size)
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # The LM collator (mlm=False) derives labels from input_ids.
        return {"input_ids": torch.tensor(self.examples[idx], dtype=torch.long)}


def read_corpus_ids(data_folder, tokenizer, chunk_chars=1024):
    """Tokenize every regular file in *data_folder* into one flat id list.

    Files are read as UTF-8 and tokenized in ``chunk_chars``-sized
    character slices so a very large file is never fed to the tokenizer
    in a single call.
    """
    input_ids = []
    for filename in sorted(os.listdir(data_folder)):  # sorted: deterministic order
        file_path = os.path.join(data_folder, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        for start in range(0, len(text), chunk_chars):
            chunk = text[start : start + chunk_chars]
            input_ids.extend(tokenizer.encode(chunk, add_special_tokens=True))
    return input_ids


def main():
    """Load GPT-2, build the dataset, train, and save the result."""
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    config = GPT2Config.from_pretrained(MODEL_NAME)
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=config)

    dataset = BlockDataset(read_corpus_ids(DATA_FOLDER, tokenizer), BLOCK_SIZE)

    # NOTE: the original passed evaluation_strategy="epoch" / eval_steps
    # without any eval_dataset, which makes Trainer fail at train time;
    # evaluation settings are omitted because no eval data is provided.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,  # adjust the number of training epochs
        per_device_train_batch_size=4,  # adjust based on your GPU memory
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        train_dataset=dataset,
    )

    trainer.train()

    # Save the tokenizer alongside the model so the output directory is
    # directly reloadable with from_pretrained().
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Training completed.")


if __name__ == "__main__":
    main()