"""Fine-tune GPT-2 on every text file found in a local data folder.

Reads each regular file in ``data/``, tokenizes the text, packs the token
ids into fixed-size blocks, and trains a causal language model with the
Hugging Face ``Trainer``. The model and tokenizer are saved to
``./Cyber_LLM`` when training finishes.
"""
import os

import torch
from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

DATA_FOLDER = "data"
MODEL_NAME = "gpt2"  # you can choose a different model size as needed
OUTPUT_DIR = "./Cyber_LLM"
BLOCK_SIZE = 128  # length of each training example, in tokens


class BlockDataset(Dataset):
    """Expose a flat list of token ids as fixed-length training examples.

    The original code passed the raw id list to ``TextDataset`` via a
    non-existent ``inputs=`` keyword, which raises ``TypeError`` —
    ``TextDataset`` only accepts a ``file_path`` (and is deprecated).
    Building the dataset directly from the ids avoids both problems.
    """

    def __init__(self, token_ids, block_size):
        # Drop the trailing partial block so every example is exactly
        # block_size tokens long — no padding is ever needed, which
        # matters because GPT-2's tokenizer has no pad token.
        self.examples = [
            token_ids[i : i + block_size]
            for i in range(0, len(token_ids) - block_size + 1, block_size)
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # The LM collator (mlm=False) derives labels from input_ids.
        return {"input_ids": torch.tensor(self.examples[idx], dtype=torch.long)}


def read_corpus_ids(data_folder, tokenizer, chunk_chars=1024):
    """Tokenize every regular file in *data_folder* into one flat id list.

    Files are read as UTF-8 and tokenized in ``chunk_chars``-sized
    character slices so a very large file is never fed to the tokenizer
    in a single call.
    """
    input_ids = []
    for filename in sorted(os.listdir(data_folder)):  # sorted: deterministic order
        file_path = os.path.join(data_folder, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        for start in range(0, len(text), chunk_chars):
            chunk = text[start : start + chunk_chars]
            input_ids.extend(tokenizer.encode(chunk, add_special_tokens=True))
    return input_ids


def main():
    """Load GPT-2, build the dataset, train, and save the result."""
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    config = GPT2Config.from_pretrained(MODEL_NAME)
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=config)

    dataset = BlockDataset(read_corpus_ids(DATA_FOLDER, tokenizer), BLOCK_SIZE)

    # NOTE: the original passed evaluation_strategy="epoch" / eval_steps
    # without any eval_dataset, which makes Trainer fail at train time;
    # evaluation settings are omitted because no eval data is provided.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,  # adjust the number of training epochs
        per_device_train_batch_size=4,  # adjust based on your GPU memory
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        train_dataset=dataset,
    )

    trainer.train()

    # Save the tokenizer alongside the model so the output directory is
    # directly reloadable with from_pretrained().
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Training completed.")


if __name__ == "__main__":
    main()