"""Fine-tune a binary sequence classifier and save it in SafeTensors format.

Pipeline: load dataset -> heuristic labels -> tokenize -> fine-tune with
transformers.Trainer -> save model as .safetensors -> verify saved files.
"""

import os

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# 1. Load dataset
dataset = load_dataset("zxc4wewewe/offsec")


# 2. Add labels (required for classification)
def add_labels(example):
    """Attach a binary label: 1 if the prompt contains a malicious keyword, else 0.

    NOTE(review): keyword matching is a placeholder labelling heuristic —
    replace with real labels for a meaningful classifier. Assumes each
    example has a "prompt" text column — TODO confirm against the dataset.
    """
    malicious_keywords = ['hack', 'exploit', 'crack', 'bypass', 'inject']
    text_lower = example["prompt"].lower()
    example["labels"] = 1 if any(kw in text_lower for kw in malicious_keywords) else 0
    return example


dataset = dataset.map(add_labels)

# BUGFIX: load_best_model_at_end (below) requires per-epoch evaluation, which
# needs an eval dataset. If the hub dataset ships without a "test" split,
# carve one out of "train" instead of passing eval_dataset=None (which made
# the original script crash at Trainer/TrainingArguments setup).
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# 3. Load tokenizer; decoder-style checkpoints often lack a pad token, so
# fall back to the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained("zxc4wewewe/blackthinking")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# 4. Tokenize dataset
def tokenize_function(batch):
    """Tokenize a batch of prompts (padded/truncated to 512), keeping labels."""
    tokenized = tokenizer(
        batch["prompt"],
        padding=True,
        truncation=True,
        max_length=512,
    )
    tokenized["labels"] = batch["labels"]
    return tokenized


dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# 5. Load model with SafeTensors support.
# BUGFIX: do NOT load weights in float16 when fine-tuning with fp16=True —
# the Trainer's GradScaler raises "Attempting to unscale FP16 gradients" on
# pure-fp16 master weights. Keep fp32 weights; mixed precision is handled by
# the fp16 flag in TrainingArguments.
model = AutoModelForSequenceClassification.from_pretrained(
    "zxc4wewewe/blackthinking",
    num_labels=2,
    use_safetensors=True,  # force SafeTensors loading
)

# 6. Training arguments with SafeTensors saving.
# BUGFIX: load_best_model_at_end=True requires the eval strategy to match the
# save strategy; the original omitted eval_strategy ("no" by default), which
# TrainingArguments rejects at construction time.
training_args = TrainingArguments(
    output_dir="./safetensors_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",           # must match save_strategy for best-model tracking
    # SafeTensors configuration
    save_safetensors=True,           # save .safetensors (not .bin)
    load_best_model_at_end=True,
    # Optional optimizations
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
)

# 7. Initialize Trainer.
# BUGFIX: clamp the subset sizes — the original select(range(1000)) /
# select(range(200)) raise IndexError when a split is smaller than the
# hard-coded sample size.
train_split = dataset["train"].shuffle(seed=42)
eval_split = dataset["test"].shuffle(seed=42)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split.select(range(min(1000, len(train_split)))),
    eval_dataset=eval_split.select(range(min(200, len(eval_split)))),
    tokenizer=tokenizer,
)

# 8. Train and save
print("Starting training with SafeTensors format...")
trainer.train()

# Save final model in SafeTensors format
trainer.save_model("./final_safetensors_model")
print("Model saved in SafeTensors format!")

# 9. Verification — list the saved artifacts so the format can be confirmed.
model_path = "./final_safetensors_model"
files = os.listdir(model_path)
print("Saved files:", [f for f in files if f.endswith(('.safetensors', '.json', '.txt'))])