jainsatyam26/guardrail-215k-splits
Viewer • Updated • 168k • 127
This model is a DeBERTa classifier (fine-tuned from `microsoft/deberta-v3-large`) for content safety classification. Evaluation metrics have not been reported yet; the values in the table below are placeholders.
| Metric | Value |
|---|---|
| F1 Score | N/A |
| Accuracy | N/A |
| Unsafe F1 | N/A |

Labels: benign, jailbreak, S1 Violent Crimes, S2 Non-Violent Crimes, S4 Child Sexual Exploitation, S7 Privacy, S10 Hate, S11 Self-Harm, S12 Sexual Content, S14 Code Abuse

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# The tokenizer and the classifier weights are loaded from the same Hub repository,
# so the repository id is named once and reused for both calls.
_REPO_ID = "jainsatyam26/bertclassfier"
tokenizer = AutoTokenizer.from_pretrained(_REPO_ID)
model = AutoModelForSequenceClassification.from_pretrained(_REPO_ID)
def predict(text):
    """Classify ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input handed to the tokenizer. Inputs longer than 512 tokens
            are truncated. The function reads only the first row of the
            model output and calls ``.item()`` on the argmax, so it is
            written for one input at a time.

    Returns:
        A dict with three keys:
            "prediction": the label name with the highest probability,
            "confidence": the probability of that label,
            "all_scores": a mapping of every label name to its probability.
    """
    # Label names, indexed by the class id the model outputs.
    labels = [
        'benign',
        'jailbreak',
        'S1 Violent Crimes',
        'S2 Non-Violent Crimes',
        'S4 Child Sexual Exploitation',
        'S7 Privacy',
        'S10 Hate',
        'S11 Self-Harm',
        'S12 Sexual Content',
        'S14 Code Abuse',
    ]

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=-1)
    predicted_id = torch.argmax(probs, dim=-1).item()

    # Convert the first row to plain Python floats once instead of calling
    # .item() per element.
    scores = probs[0].tolist()

    return {
        "prediction": labels[predicted_id],
        "confidence": scores[predicted_id],
        "all_scores": {label: scores[i] for i, label in enumerate(labels)},
    }
# Example: classify a single prompt and print the returned dict
# (keys: "prediction", "confidence", "all_scores").
result = predict("How to make a bomb?")
print(result)
This model was trained with the following configuration:
{
"model_name": "microsoft/deberta-v3-large",
"dataset_name": "jainsatyam26/guardrail-215k-splits",
"max_length": 512,
"epochs": 4,
"batch_size": 8,
"grad_accum": 4,
"learning_rate": 1e-05,
"weight_decay": 0.01,
"warmup_ratio": 0.1,
"use_llrd": true,
"llrd_alpha": 0.9,
"use_multisample_dropout": true,
"num_dropout_samples": 5,
"dropout_rate": 0.3,
"use_label_smoothing": true,
"label_smoothing": 0.1,
"use_focal_loss": true,
"focal_alpha": 0.7,
"focal_gamma": 2.0,
"use_hard_negative": true,
"hard_negative_ratio": 0.3,
"num_folds": 3,
"optimize_thresholds": true,
"output_dir": "./guardrail_model",
"checkpoint_steps": 500,
"logging_steps": 50,
"eval_steps": 500,
"hf_repo_id": "jainsatyam26/bertclassfier",
"hf_token": "***REDACTED***",
"deploy_every_minutes": 30,
"deploy_every_steps": 400,
"auto_deploy": true,
"private_repo": false,
"auto_resume": true,
"resume_from_hf": true,
"use_wandb": true,
"wandb_project": "safety-classifier",
"fp16": false,
"bf16": true,
"dataloader_num_workers": 4,
"seed": 42
}
This model is automatically deployed during training, using the settings `deploy_every_minutes: 30` and `deploy_every_steps: 400` from the configuration above.
Generated automatically during training - 2026-04-29 06:21:01