at0m-b0mb committed on
Commit
300ea65
·
1 Parent(s): e8ea76f

Uploading the model

Browse files
FineTuning_Cyber_LLM.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
3
+ import os
4
+
5
+ # Define your book data file
6
+ book_data_file = "data\Computer Networking_cleaned.txt"
7
+
8
+ # Load the book data
9
+ with open(book_data_file, "r", encoding="utf-8") as f:
10
+ text = f.read()
11
+
12
+ # Initialize a GPT-2 model and tokenizer
13
+ model_name = "gpt2" # You can choose a different model size as needed
14
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
15
+ config = GPT2Config.from_pretrained(model_name)
16
+ model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
17
+
18
+ # Split the text into smaller chunks
19
+ max_sequence_length = 1024
20
+ chunks = [text[i:i+max_sequence_length] for i in range(0, len(text), max_sequence_length)]
21
+
22
+ # Initialize an empty list for input_ids
23
+ input_ids = []
24
+
25
+ # Tokenize the text data
26
+ for chunk in chunks:
27
+ input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
28
+
29
+ # Create a dataset and data collator for language modeling
30
+ dataset = TextDataset(tokenizer=tokenizer, file_path=book_data_file, block_size=128, overwrite_cache=False)
31
+
32
+ # Set up training arguments
33
+ training_args = TrainingArguments(
34
+ output_dir="./Cyber_LLM",
35
+ overwrite_output_dir=True,
36
+ num_train_epochs=1, # You can adjust the number of training epochs
37
+ per_device_train_batch_size=32,
38
+ save_steps=10_000,
39
+ save_total_limit=2,
40
+ evaluation_strategy="steps",
41
+ eval_steps=10_000,
42
+ )
43
+
44
+ # Initialize a trainer
45
+ trainer = Trainer(
46
+ model=model,
47
+ args=training_args,
48
+ data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
49
+ train_dataset=dataset,
50
+ )
51
+
52
+ # Train the model
53
+ trainer.train()
54
+
55
+ # Save the model
56
+ trainer.save_model("./Cyber_LLM")
57
+
58
+ print("Training completed.")
FineTuning_Cyber_LLM_v2.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from torch.utils.data import Dataset
4
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling, Trainer, TrainingArguments
5
+
6
class CustomTextDataset(Dataset):
    """Flat token stream over text chunks, served as overlapping windows.

    Every chunk is tokenized and the results are concatenated into a single
    token list; item ``i`` is the length-``block_size`` window starting at
    token ``i`` (stride 1), returned as a 1-D tensor for causal-LM training.
    """

    def __init__(self, tokenizer, data_chunk, block_size):
        # Tokenize each chunk and flatten everything into one token stream.
        self.examples = []
        for chunk in data_chunk:
            tokenized_text = tokenizer.encode(chunk, add_special_tokens=True)
            self.examples.extend(tokenized_text)

        self.block_size = block_size

    def __len__(self):
        # BUG FIX: guard against a stream shorter than one window — the
        # original returned a negative length, which breaks DataLoader/Trainer.
        return max(0, len(self.examples) - self.block_size)

    def __getitem__(self, i):
        # Return the window of block_size tokens starting at position i.
        return torch.tensor(self.examples[i:i + self.block_size])
21
+
22
# Folder containing the plain-text training files.
folder_path = "data"

# Collect every .txt file in the folder.
file_list = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

# Read and concatenate the contents of every file (space-joined).
all_text_data = []
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        all_text_data.append(f.read())

text = " ".join(all_text_data)

# Initialize a GPT-2 model and tokenizer.
model_name = "gpt2"  # A larger checkpoint can be substituted as needed.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Split the raw text into character chunks small enough to tokenize per call.
max_sequence_length = 1024
chunks = [text[i:i + max_sequence_length] for i in range(0, len(text), max_sequence_length)]

# Build the training dataset from the tokenized chunks.
dataset = CustomTextDataset(tokenizer=tokenizer, data_chunk=chunks, block_size=128)

# BUG FIX: evaluation disabled — no eval_dataset exists, so the original
# evaluation_strategy="epoch" makes the Trainer raise.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Adjust the number of training epochs as needed.
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="no",
)

# Causal-LM collator: mlm=False selects the next-token objective.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)

# Train the model.
trainer.train()

# Save weights AND tokenizer so ./Cyber_LLM loads with from_pretrained().
model.save_pretrained("./Cyber_LLM")
tokenizer.save_pretrained("./Cyber_LLM")

print("Training completed.")
FineTuning_Cyber_LLM_v3.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune GPT-2 on every file in the data folder (Cyber LLM, v3)."""

import os

import torch
from torch.utils.data import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)


class TokenBlockDataset(Dataset):
    """Serve a flat token list as consecutive, non-overlapping blocks."""

    def __init__(self, token_ids, block_size):
        self.token_ids = token_ids
        self.block_size = block_size

    def __len__(self):
        # Only full blocks are served; a trailing partial block is dropped.
        return len(self.token_ids) // self.block_size

    def __getitem__(self, i):
        start = i * self.block_size
        return torch.tensor(self.token_ids[start:start + self.block_size])


# Folder containing the training data files.
data_folder = "data"

# Initialize a GPT-2 model and tokenizer.
model_name = "gpt2"  # A larger checkpoint can be substituted as needed.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

input_ids = []

# Read and tokenize each regular file in the folder. (Any extension is
# accepted — this matches the original behavior; filter on ".txt" here if
# the folder ever holds non-text files.)
for filename in os.listdir(data_folder):
    file_path = os.path.join(data_folder, filename)
    if os.path.isfile(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Split into character chunks small enough to tokenize per call.
        max_sequence_length = 1024
        chunks = [text[i:i + max_sequence_length] for i in range(0, len(text), max_sequence_length)]
        for chunk in chunks:
            input_ids.extend(tokenizer.encode(chunk, add_special_tokens=True))

# BUG FIX: transformers' TextDataset has no `inputs` parameter (it only
# accepts a file_path and tokenizes the file itself), so the original call
# raised TypeError before training started. Wrap the already-tokenized ids
# in a small map-style Dataset instead.
dataset = TokenBlockDataset(input_ids, block_size=128)

# BUG FIX: evaluation disabled — no eval_dataset is provided, so the
# original evaluation_strategy="epoch" makes the Trainer raise.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust the number of training epochs as needed.
    per_device_train_batch_size=4,  # Adjust based on your GPU memory.
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="no",
)

# Causal-LM collator: mlm=False selects the next-token objective.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)

# Train the model.
trainer.train()

# Save weights AND tokenizer so ./Cyber_LLM loads with from_pretrained().
model.save_pretrained("./Cyber_LLM")
tokenizer.save_pretrained("./Cyber_LLM")

print("Training completed.")