import os
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling, Trainer, TrainingArguments

class CustomTextDataset(Dataset):
    """Concatenates tokenized text into one long token stream and serves fixed-length blocks."""

    def __init__(self, tokenizer, data_chunk, block_size):
        # Tokenize every chunk and append the token IDs to a single flat list
        self.examples = []
        for chunk in data_chunk:
            tokenized_text = tokenizer.encode(chunk, add_special_tokens=True)
            self.examples.extend(tokenized_text)

        self.block_size = block_size

    def __len__(self):
        # One training example per starting position, so consecutive blocks
        # overlap with a stride of 1 token
        return len(self.examples) - self.block_size

    def __getitem__(self, i):
        # Return block_size consecutive token IDs starting at position i
        return torch.tensor(self.examples[i:i + self.block_size])

# Define the folder containing text files
folder_path = "data"

# List all files in the folder
file_list = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

# Initialize an empty list to store all text data
all_text_data = []

# Read each text file and collect its contents
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        file_text = f.read()
        all_text_data.append(file_text)

# Concatenate all text data
text = " ".join(all_text_data)

# Initialize a GPT-2 model and tokenizer
model_name = "gpt2"  # You can choose a different model size as needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 has no pad token; reuse the EOS token so the data collator can pad if needed
tokenizer.pad_token = tokenizer.eos_token
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Split the raw text into character-level chunks so each tokenizer call stays manageable
max_sequence_length = 1024
chunks = [text[i:i + max_sequence_length] for i in range(0, len(text), max_sequence_length)]

# Initialize a custom dataset
dataset = CustomTextDataset(tokenizer=tokenizer, data_chunk=chunks, block_size=128)

# Set up training arguments (no eval dataset is provided, so evaluation is disabled)
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=1,  # You can adjust the number of training epochs
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="no",  # No eval_dataset is passed to the Trainer below
)

# Initialize a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./Cyber_LLM")
tokenizer.save_pretrained("./Cyber_LLM")

print("Training completed.")
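
# Optional sanity check (a minimal sketch, not part of the original training flow):
# reload the fine-tuned checkpoint from "./Cyber_LLM" and generate a short completion
# to confirm the saved weights and tokenizer load correctly. The prompt string below
# is an arbitrary placeholder.
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./Cyber_LLM")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./Cyber_LLM")

prompt = "Network security is"
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")
output_ids = fine_tuned_model.generate(
    input_ids,
    max_length=50,
    do_sample=True,
    top_p=0.95,
    pad_token_id=fine_tuned_tokenizer.eos_token_id,
)
print(fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True))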