from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
|
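# Path to the cleaned book text used for fine-tuning.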
book_data_file = "data/Computer Networking_cleaned.txt"
|
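# Read the corpus once for the manual chunking below; note that TextDataset
# further down re-reads and tokenizes the file itself.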
with open(book_data_file, "r", encoding="utf-8") as f:
    text = f.read()
|
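# Load the pretrained GPT-2 tokenizer and model; from_pretrained() also pulls
# the matching model config, so no separate GPT2Config is needed.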
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
|
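# Split the raw text into fixed-size chunks and tokenize them. The chunk size
# here is measured in characters, not tokens, and these ids are not consumed
# by the Trainer below (TextDataset builds its own token blocks); they are
# mainly useful for inspecting the corpus, e.g. its total token count.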
max_sequence_length = 1024
chunks = [text[i:i + max_sequence_length] for i in range(0, len(text), max_sequence_length)]
|
input_ids = []
for chunk in chunks:
    input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
|
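# Build the training dataset: TextDataset tokenizes the file and slices it
# into contiguous blocks of block_size tokens. (TextDataset is deprecated in
# recent transformers releases in favour of the `datasets` library, but it
# still works.)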
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=book_data_file,
    block_size=128,
    overwrite_cache=False,
)
|
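# Hyperparameters for a single quick fine-tuning pass; checkpoints are written
# to output_dir every save_steps and only the two most recent are kept.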
training_args = TrainingArguments(
    output_dir="./Cyber_LLM",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    # Evaluation is left at the default ("no"): no eval_dataset is provided,
    # so a step-based evaluation schedule would fail at the first eval step.
)
|
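# mlm=False selects the causal (next-token) language-modeling objective that
# GPT-2 is trained with; the collator also derives the labels from the inputs.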
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset,
)
|
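# Run fine-tuning; with num_train_epochs=1 this is a single pass over the corpus.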
trainer.train() |
|
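# Save the fine-tuned weights, plus the tokenizer, so that ./Cyber_LLM is a
# self-contained checkpoint that can be reloaded on its own.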
trainer.save_model("./Cyber_LLM") |
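tokenizer.save_pretrained("./Cyber_LLM")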
|
print("Training completed.") |
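
# Optional smoke test (a minimal sketch): reload the fine-tuned checkpoint and
# generate a short continuation. The prompt text is illustrative only, and
# max_new_tokens requires a reasonably recent transformers release.
from transformers import pipeline

generator = pipeline("text-generation", model="./Cyber_LLM", tokenizer="./Cyber_LLM")
print(generator("TCP congestion control works by", max_new_tokens=40)[0]["generated_text"])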