from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TrainingArguments,
Trainer
)
from datasets import load_dataset
import torch
# 1. Load dataset
# Pulls the "zxc4wewewe/offsec" dataset from the Hugging Face Hub.
# NOTE(review): later code assumes a "train" split (and optionally "test")
# with a string "prompt" column — confirm against the dataset card.
dataset = load_dataset("zxc4wewewe/offsec")
# 2. Add labels (required for classification)
# Modify based on your actual classification task:
def add_labels(example):
    """Attach a binary ``labels`` field to one dataset example.

    Placeholder heuristic: label 1 (malicious) when the prompt contains
    any suspicious keyword as a case-insensitive substring, else 0
    (benign). Replace with real annotations for a production task.
    """
    suspicious = ('hack', 'exploit', 'crack', 'bypass', 'inject')
    lowered = example["prompt"].lower()
    flagged = False
    for word in suspicious:
        if word in lowered:
            flagged = True
            break
    example["labels"] = int(flagged)
    return example
dataset = dataset.map(add_labels)  # adds the integer "labels" column to every split
# 3. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
# Causal-LM tokenizers often ship without a pad token; reuse EOS so batched
# padding (and the Trainer's collator) can work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 4. Tokenize dataset
def tokenize_function(batch):
    """Tokenize a batch of prompts and carry the labels through.

    Uses the module-level ``tokenizer``; pads each batch to its longest
    sequence and truncates to at most 512 tokens.
    """
    encoded = tokenizer(
        batch["prompt"],
        padding=True,
        truncation=True,
        max_length=512,
    )
    # `labels` must survive tokenization so the Trainer can compute the loss.
    encoded["labels"] = batch["labels"]
    return encoded
dataset = dataset.map(tokenize_function, batched=True)
# Expose only the tokenized columns as PyTorch tensors for the Trainer.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# 5. Load Model with SafeTensors support
# NOTE: the model is deliberately loaded in full (fp32) precision. Loading
# with torch_dtype=torch.float16 while also training with fp16=True (set in
# TrainingArguments below) makes every master weight fp16, and the Trainer's
# GradScaler then fails with "Attempting to unscale FP16 gradients."
# Mixed precision keeps fp32 master weights and casts during forward/backward.
model = AutoModelForSequenceClassification.from_pretrained(
    "zxc4wewewe/blackthinking",
    num_labels=2,           # binary task: benign (0) vs malicious (1)
    use_safetensors=True,   # force SafeTensors loading (no pickle .bin)
)
# 6. Training Arguments with SafeTensors saving
training_args = TrainingArguments(
    output_dir="./safetensors_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,   # effective train batch size = 4 * 2 = 8
    learning_rate=2e-5,
    logging_steps=10,
    # load_best_model_at_end requires the eval and save strategies to match;
    # without evaluation_strategy="epoch" TrainingArguments raises a
    # ValueError at construction time (default eval strategy is "no").
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # SafeTensors Configuration
    save_safetensors=True,  # Save checkpoints as .safetensors (not .bin)
    load_best_model_at_end=True,
    # Optional optimizations
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none"
)
# 7. Initialize Trainer
# Cap the subset sizes at the split length so .select() cannot raise an
# IndexError on datasets with fewer than 1000 train / 200 test examples.
train_split = dataset["train"].shuffle(seed=42)
train_split = train_split.select(range(min(1000, len(train_split))))
eval_split = None
if "test" in dataset:
    eval_split = dataset["test"].shuffle(seed=42)
    eval_split = eval_split.select(range(min(200, len(eval_split))))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)
# 8. Train and Save
print("Starting training with SafeTensors format...")
trainer.train()
# Save final model; with save_safetensors=True above, the weights are written
# as model.safetensors rather than a pickled pytorch_model.bin.
trainer.save_model("./final_safetensors_model")
print("Model saved in SafeTensors format!")
# 9. Verification - Check files
# List the interesting artifacts (weights, configs, vocab) in the output dir.
import os
model_path = "./final_safetensors_model"
files = os.listdir(model_path)
kept_suffixes = ('.safetensors', '.json', '.txt')
print("Saved files:", [name for name in files if name.endswith(kept_suffixes)])