from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Path to the book data file (forward slashes avoid the backslash-escape
# problem that "data\Computer..." would cause in a normal string literal)
book_data_file = "data/Computer Networking_cleaned.txt"

# Initialize a GPT-2 model and tokenizer
model_name = "gpt2"  # You can choose a different model size as needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# GPT-2 ships without a padding token; reuse the end-of-text token so the
# data collator can pad batches if it ever needs to
tokenizer.pad_token = tokenizer.eos_token

# Create a dataset for causal language modeling. TextDataset reads the
# file itself and slices it into fixed-length blocks of token IDs, so no
# manual chunking or tokenization of the raw text is needed beforehand.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=book_data_file,
    block_size=128,
    overwrite_cache=False,
)

# Set up training arguments. Evaluation is disabled here because no eval
# dataset is passed to the Trainer; enabling evaluation_strategy="steps"
# without one would raise an error at the first evaluation step.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=1,  # You can adjust the number of training epochs
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize a trainer with a collator configured for causal LM
# (mlm=False means the labels are the inputs shifted by one position,
# not masked tokens as in BERT-style pretraining)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer together so they can be reloaded as a pair
trainer.save_model("./Cyber_LLM")
tokenizer.save_pretrained("./Cyber_LLM")

print("Training completed.")
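

# --- Quick sanity check after training: a minimal sketch ---
# Loads the fine-tuned weights saved to ./Cyber_LLM above and generates a
# short continuation. The prompt string and the generation parameters
# (max_new_tokens, top_p) are illustrative choices, not part of the
# training script itself.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

ft_model = GPT2LMHeadModel.from_pretrained("./Cyber_LLM")
ft_tokenizer = GPT2Tokenizer.from_pretrained("./Cyber_LLM")
ft_model.eval()

prompt = "A router forwards packets by"  # hypothetical example prompt
inputs = ft_tokenizer(prompt, return_tensors="pt")
outputs = ft_model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.9,
    pad_token_id=ft_tokenizer.eos_token_id,  # avoids the missing-pad-token warning
)
print(ft_tokenizer.decode(outputs[0], skip_special_tokens=True))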