BAD Classifier for TinyLlama/TinyLlama-1.1B-Chat-v1.0
Model Details
Detection Layer: 15
Dataset: BBQ (58942) + MMLU (20266)
Layer Performance
- Layer 11: 81.52%
- Layer 12: 83.95%
- Layer 13: 82.71%
- Layer 14: 82.92%
- Layer 15: 84.15%
- Layer 16: 83.93%
Usage
from huggingface_hub import hf_hub_download
import torch
import json
# Fetch the classifier's config and weight files from the Hugging Face Hub;
# the returned values are used below as local file paths.
config_path = hf_hub_download(
    repo_id="bitlabsdb/bad-classifier-tinyllama",
    filename="config.json",
)
model_path = hf_hub_download(
    repo_id="bitlabsdb/bad-classifier-tinyllama",
    filename="pytorch_model.bin",
)
# Load config
# Parse the downloaded JSON config into a dict. The file is opened explicitly
# as UTF-8 so the result does not depend on the platform's default encoding.
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)
# Define classifier
class BADClassifier(torch.nn.Module):
    """Single linear layer mapping an input feature vector to two logits.

    The layer is stored as ``self.linear`` so that its parameter names
    (``linear.weight`` and ``linear.bias``) are the keys that
    ``load_state_dict`` will look for.
    """

    def __init__(self, input_dim: int) -> None:
        """Create the classifier.

        Args:
            input_dim: Size of the last dimension of the tensors passed to
                ``forward``.
        """
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (un-normalised) two-class logits for ``x``."""
        return self.linear(x)
# Load
# Build the classifier with the input width recorded in the config, then
# restore the downloaded weights.
classifier = BADClassifier(config['input_dim'])
# map_location="cpu" lets the checkpoint load on machines that lack the device
# it was saved from. weights_only=True restricts unpickling to tensor data, so
# a downloaded checkpoint cannot execute arbitrary code while being loaded.
state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
classifier.load_state_dict(state_dict)
classifier.eval()  # switch to inference mode before using the classifier
Citation
@article{fairsteer2025,
title={FairSteer: Inference Time Debiasing for LLMs},
author={Li, Yichen and others},
year={2025}
}