# CyberSecurity_LLM / FineTuning_Cyber_LLM_v3.py
# Uploaded by at0m-b0mb — "Uploading the model" (commit 300ea65)
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Location of the raw training text files.
data_folder = "data"

# Base checkpoint to fine-tune; swap for a larger GPT-2 variant if the
# hardware allows (e.g. "gpt2-medium").
model_name = "gpt2"

# Load the tokenizer, the model configuration, and the pretrained
# weights from the Hugging Face hub.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
# Build the training dataset: read every regular file under `data_folder`,
# tokenize its text, and accumulate one flat stream of token ids.
input_ids = []
for filename in os.listdir(data_folder):
    file_path = os.path.join(data_folder, filename)
    # Skip subdirectories and other non-file entries.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    # Tokenize in character-sized chunks so a single huge file cannot
    # produce one enormous encode() call.
    max_sequence_length = 1024
    for start in range(0, len(text), max_sequence_length):
        chunk = text[start:start + max_sequence_length]
        input_ids.extend(tokenizer.encode(chunk, add_special_tokens=True))

# BUG FIX: TextDataset has no `inputs` parameter — it only reads tokens
# from a file via `file_path` — so the original call raised TypeError.
# Build the language-modeling examples directly instead: contiguous,
# non-overlapping blocks of `block_size` token ids, each wrapped as
# {"input_ids": [...]} for DataCollatorForLanguageModeling. A plain list
# works as a Trainer train_dataset (it supports len() and indexing).
# Trailing tokens that do not fill a whole block are dropped, matching
# TextDataset's behavior.
block_size = 128
dataset = [
    {"input_ids": input_ids[i:i + block_size]}
    for i in range(0, len(input_ids) - block_size + 1, block_size)
]
# Training hyperparameters.
# BUG FIX: the original passed evaluation_strategy="epoch" (plus
# eval_steps), but no eval_dataset is ever given to the Trainer below —
# requesting evaluation without an eval dataset makes Trainer.__init__
# raise ValueError. Evaluation is therefore disabled here.
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=3,              # adjust the number of training epochs as needed
    per_device_train_batch_size=4,   # adjust based on available GPU memory
    save_steps=10_000,               # checkpoint every 10k optimizer steps
    save_total_limit=2,              # keep only the two most recent checkpoints
)
# GPT-2 is a causal language model, so masked-LM objective is off
# (mlm=False): the collator builds next-token-prediction labels.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the Trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collator,
)
trainer.train()

# Persist the fine-tuned weights next to the Trainer checkpoints.
model.save_pretrained("./Cyber_LLM")
print("Training completed.")