DATE-LM-Leaderboard / data /Applications /toxicity-homogeneous.json
github-actions[bot]
Clean Push to Hugging Face
a806362
[
{
"Rank": 0,
"Method": "GradSafe",
"Attribution Method Type": "Baseline",
"Model": "N/A",
"Model Size": "N/A",
"ToxicChat": 0.347,
"XSTest-response": 0.491,
"JailBreakBench": 0.802,
"AUPRC": 0.546,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "OpenAI Moderation",
"Attribution Method Type": "Baseline",
"Model": "N/A",
"Model Size": "N/A",
"ToxicChat": 0.243,
"XSTest-response": 0.378,
"JailBreakBench": 0.187,
"AUPRC": 0.269,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Llama-Guard-3-8B",
"Attribution Method Type": "Baseline",
"Model": "N/A",
"Model Size": "N/A",
"ToxicChat": 0.445,
"XSTest-response": 0.916,
"JailBreakBench": 0.985,
"AUPRC": 0.782,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Wildguard",
"Attribution Method Type": "Baseline",
"Model": "N/A",
"Model Size": "N/A",
"ToxicChat": 0.56,
"XSTest-response": 0.93,
"JailBreakBench": 0.989,
"AUPRC": 0.827,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "ShieldGemma-2b",
"Attribution Method Type": "Baseline",
"Model": "N/A",
"Model Size": "N/A",
"ToxicChat": 0.17,
"XSTest-response": 0.74,
"JailBreakBench": 0.664,
"AUPRC": 0.525,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "AEGIS-Defensive",
"Attribution Method Type": "Baseline",
"Model": "N/A",
"Model Size": "N/A",
"ToxicChat": 0.376,
"XSTest-response": 0.274,
"JailBreakBench": 0.346,
"AUPRC": 0.332,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Rep-Sim",
"Attribution Method Type": "Similarity",
"Model": "Pythia-1b",
"Model Size": "1B",
"ToxicChat": 0.374,
"XSTest-response": 0.657,
"JailBreakBench": 0.986,
"AUPRC": 0.672,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Grad Dot",
"Attribution Method Type": "Gradient",
"Model": "Pythia-1b",
"Model Size": "1B",
"ToxicChat": 0.084,
"XSTest-response": 0.483,
"JailBreakBench": 0.999,
"AUPRC": 0.522,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Grad Sim",
"Attribution Method Type": "Gradient",
"Model": "Pythia-1b",
"Model Size": "1B",
"ToxicChat": 0.106,
"XSTest-response": 0.647,
"JailBreakBench": 1.0,
"AUPRC": 0.584,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "LESS",
"Attribution Method Type": "Gradient",
"Model": "Pythia-1b",
"Model Size": "1B",
"ToxicChat": 0.388,
"XSTest-response": 0.724,
"JailBreakBench": 1.0,
"AUPRC": 0.704,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "DataInf",
"Attribution Method Type": "Gradient",
"Model": "Pythia-1b",
"Model Size": "1B",
"ToxicChat": 0.204,
"XSTest-response": 0.487,
"JailBreakBench": 0.999,
"AUPRC": 0.563,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "EKFAC",
"Attribution Method Type": "Gradient",
"Model": "Pythia-1b",
"Model Size": "1B",
"ToxicChat": 0.216,
"XSTest-response": 0.497,
"JailBreakBench": 1.0,
"AUPRC": 0.571,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Rep-Sim",
"Attribution Method Type": "Similarity",
"Model": "Llama-3.2-1B",
"Model Size": "1B",
"ToxicChat": 0.632,
"XSTest-response": 0.792,
"JailBreakBench": 0.854,
"AUPRC": 0.759,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Grad Dot",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.2-1B",
"Model Size": "1B",
"ToxicChat": 0.212,
"XSTest-response": 0.437,
"JailBreakBench": 1.0,
"AUPRC": 0.55,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Grad Sim",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.2-1B",
"Model Size": "1B",
"ToxicChat": 0.259,
"XSTest-response": 0.798,
"JailBreakBench": 1.0,
"AUPRC": 0.686,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "LESS",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.2-1B",
"Model Size": "1B",
"ToxicChat": 0.294,
"XSTest-response": 0.792,
"JailBreakBench": 1.0,
"AUPRC": 0.695,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "DataInf",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.2-1B",
"Model Size": "1B",
"ToxicChat": 0.215,
"XSTest-response": 0.442,
"JailBreakBench": 1.0,
"AUPRC": 0.552,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "EKFAC",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.2-1B",
"Model Size": "1B",
"ToxicChat": 0.264,
"XSTest-response": 0.562,
"JailBreakBench": 1.0,
"AUPRC": 0.609,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Rep-Sim",
"Attribution Method Type": "Similarity",
"Model": "Llama-3.1-8B",
"Model Size": "8B",
"ToxicChat": 0.989,
"XSTest-response": 0.999,
"JailBreakBench": 0.98,
"AUPRC": 0.989,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Grad Dot",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.1-8B",
"Model Size": "8B",
"ToxicChat": 0.47,
"XSTest-response": 0.368,
"JailBreakBench": 0.274,
"AUPRC": 0.371,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "Grad Sim",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.1-8B",
"Model Size": "8B",
"ToxicChat": 0.28,
"XSTest-response": 0.603,
"JailBreakBench": 0.82,
"AUPRC": 0.567,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "LESS",
"Attribution Method Type": "Gradient",
"Model": "Llama-3.1-8B",
"Model Size": "8B",
"ToxicChat": 0.499,
"XSTest-response": 0.615,
"JailBreakBench": 0.767,
"AUPRC": 0.627,
"Paper/Code/Contact Link": ""
},
{
"Rank": 0,
"Method": "test",
"Attribution Method Type": "Gradient",
"Model": "pythia",
"Model Size": "1B",
"Paper/Code/Contact Link": "test",
"ToxicChat": 0.3,
"XSTest-response": 0.3,
"JailBreakBench": 0.3,
"AUPRC": 0.3
}
]