| from transformers import (
|
| AutoModelForSequenceClassification,
|
| AutoTokenizer,
|
| TrainingArguments,
|
| Trainer
|
| )
|
| from datasets import load_dataset
|
| import torch
|
|
|
|
|
# Pull the raw prompt dataset from the Hugging Face Hub.
# NOTE(review): downstream code assumes each example has a "prompt" column
# (see add_labels / tokenize_function) — confirm against the dataset card.
dataset = load_dataset("zxc4wewewe/offsec")
|
|
|
|
|
|
|
def add_labels(example):
    """Attach a weak-supervision binary label to a single example.

    Sets ``example["labels"]`` to 1 when the prompt contains any keyword
    from a small blocklist (case-insensitive substring match), else 0.
    Mutates and returns the same example dict.
    """
    blocklist = ('hack', 'exploit', 'crack', 'bypass', 'inject')
    lowered = example["prompt"].lower()
    flagged = False
    for word in blocklist:
        if word in lowered:
            flagged = True
            break
    example["labels"] = int(flagged)
    return example
|
|
|
# Derive the keyword-based labels for every example in every split.
dataset = dataset.map(add_labels)
|
|
|
|
|
# Tokenizer matching the base checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
# Some tokenizers (typically causal-LM ones) ship without a pad token;
# padding in tokenize_function requires one, so fall back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
|
def tokenize_function(batch):
    """Tokenize a batch of prompts and carry the labels through.

    Pads to the longest sequence in the batch and truncates at 512
    tokens; the precomputed ``labels`` column is copied onto the
    returned encoding so the model receives it alongside the inputs.
    """
    encode_kwargs = {"padding": True, "truncation": True, "max_length": 512}
    encoded = tokenizer(batch["prompt"], **encode_kwargs)
    encoded["labels"] = batch["labels"]
    return encoded
|
|
|
# Tokenize in batches, then restrict the torch-formatted view to exactly
# the columns the model's forward pass consumes.
dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
|
|
|
|
|
# Load the base checkpoint with a fresh 2-class classification head.
# BUG FIX: the original passed torch_dtype=torch.float16, but Trainer's
# fp16=True uses AMP, which needs fp32 master weights — an fp16 model makes
# the optimizer fail with "Attempting to unscale FP16 gradients". Load in
# full precision and let AMP do the half-precision compute on GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    "zxc4wewewe/blackthinking",
    num_labels=2,          # binary head: benign (0) / malicious (1)
    use_safetensors=True,  # refuse pickle-based .bin checkpoints
)
|
|
|
|
|
training_args = TrainingArguments(
    output_dir="./safetensors_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,   # effective train batch size of 8
    learning_rate=2e-5,
    logging_steps=10,
    # BUG FIX: load_best_model_at_end=True requires the evaluation strategy
    # to match save_strategy; the original left it at the default ("no"),
    # which makes TrainingArguments raise a ValueError at construction.
    # (On transformers < 4.41 this argument is named evaluation_strategy.)
    eval_strategy="epoch",
    save_strategy="epoch",
    save_safetensors=True,           # write .safetensors, not pickle .bin
    load_best_model_at_end=True,     # NOTE: also needs an eval_dataset at train time
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU exists
    report_to="none",                # disable wandb/tensorboard integrations
)
|
|
|
|
|
# Train on a fixed, reproducible 1000-sample subset; evaluate on a
# 200-sample subset only when the dataset actually has a test split.
train_subset = dataset["train"].shuffle(seed=42).select(range(1000))
eval_subset = None
if "test" in dataset:
    eval_subset = dataset["test"].shuffle(seed=42).select(range(200))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
)
|
|
|
|
|
print("Starting training with SafeTensors format...")
# Run the fine-tuning loop; per-epoch checkpoints land in output_dir.
trainer.train()

# Persist the final weights (safetensors, per save_safetensors=True) and
# config to a separate directory for later loading.
trainer.save_model("./final_safetensors_model")
print("Model saved in SafeTensors format!")
|
|
|
|
|
import os

# Sanity check: show which artifacts were actually written to disk.
# IMPROVEMENT: os.listdir order is arbitrary and platform-dependent, so
# sort the filtered names to make the printed output deterministic.
model_path = "./final_safetensors_model"
files = os.listdir(model_path)
artifacts = sorted(f for f in files if f.endswith(('.safetensors', '.json', '.txt')))
print("Saved files:", artifacts)
|
|
|