"""Fine-tune a binary sequence classifier and save it in SafeTensors format.

Pipeline: load dataset -> heuristic labels -> tokenize -> fine-tune with
transformers.Trainer -> save model as .safetensors -> verify saved files.
"""

import os

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# 1. Load dataset
dataset = load_dataset("zxc4wewewe/offsec")


# 2. Add labels (required for classification)
def add_labels(example):
    """Attach a binary label: 1 if the prompt contains a malicious keyword, else 0.

    NOTE(review): keyword matching is a placeholder labelling heuristic —
    replace with real labels for a meaningful classifier. Assumes each
    example has a "prompt" text column — TODO confirm against the dataset.
    """
    malicious_keywords = ['hack', 'exploit', 'crack', 'bypass', 'inject']
    text_lower = example["prompt"].lower()
    example["labels"] = 1 if any(kw in text_lower for kw in malicious_keywords) else 0
    return example


dataset = dataset.map(add_labels)

# BUGFIX: load_best_model_at_end (below) requires per-epoch evaluation, which
# needs an eval dataset. If the hub dataset ships without a "test" split,
# carve one out of "train" instead of passing eval_dataset=None (which made
# the original script crash at Trainer/TrainingArguments setup).
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# 3. Load tokenizer; decoder-style checkpoints often lack a pad token, so
# fall back to the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# 4. Tokenize dataset
def tokenize_function(batch):
    """Tokenize a batch of prompts (padded/truncated to 512), keeping labels."""
    tokenized = tokenizer(
        batch["prompt"],
        padding=True,
        truncation=True,
        max_length=512,
    )
    tokenized["labels"] = batch["labels"]
    return tokenized


dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# 5. Load model with SafeTensors support.
# BUGFIX: do NOT load weights in float16 when fine-tuning with fp16=True —
# the Trainer's GradScaler raises "Attempting to unscale FP16 gradients" on
# pure-fp16 master weights. Keep fp32 weights; mixed precision is handled by
# the fp16 flag in TrainingArguments.
model = AutoModelForSequenceClassification.from_pretrained(
    "zxc4wewewe/blackthinking",
    num_labels=2,
    use_safetensors=True,  # force SafeTensors loading
)

# 6. Training arguments with SafeTensors saving.
# BUGFIX: load_best_model_at_end=True requires the eval strategy to match the
# save strategy; the original omitted eval_strategy ("no" by default), which
# TrainingArguments rejects at construction time.
training_args = TrainingArguments(
    output_dir="./safetensors_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",           # must match save_strategy for best-model tracking
    # SafeTensors configuration
    save_safetensors=True,           # save .safetensors (not .bin)
    load_best_model_at_end=True,
    # Optional optimizations
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
)

# 7. Initialize Trainer.
# BUGFIX: clamp the subset sizes — the original select(range(1000)) /
# select(range(200)) raise IndexError when a split is smaller than the
# hard-coded sample size.
train_split = dataset["train"].shuffle(seed=42)
eval_split = dataset["test"].shuffle(seed=42)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split.select(range(min(1000, len(train_split)))),
    eval_dataset=eval_split.select(range(min(200, len(eval_split)))),
    tokenizer=tokenizer,
)

# 8. Train and save
print("Starting training with SafeTensors format...")
trainer.train()

# Save final model in SafeTensors format
trainer.save_model("./final_safetensors_model")
print("Model saved in SafeTensors format!")

# 9. Verification — list the saved artifacts so the format can be confirmed.
model_path = "./final_safetensors_model"
files = os.listdir(model_path)
print("Saved files:", [f for f in files if f.endswith(('.safetensors', '.json', '.txt'))])