Upload README.md with huggingface_hub
e86fe08
verified
BAD Classifier for TinyLlama/TinyLlama-1.1B-Chat-v1.0
Model Details
- Detection Layer: 14
- Validation Accuracy: 76.00%
- Dataset: BBQ (58942) + MMLU (20266)
Layer Performance
Usage
from huggingface_hub import hf_hub_download
import torch
import json
# Hub repository that hosts both the classifier config and its weights.
REPO_ID = "bitlabsdb/bad-classifier-tinyllama"

# Fetch the two files from the Hub; each call yields a local file path.
config_path = hf_hub_download(REPO_ID, "config.json")
model_path = hf_hub_download(REPO_ID, "pytorch_model.bin")

# Pass the encoding explicitly so parsing the JSON config does not depend on
# the platform's default locale encoding.
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)
class BADClassifier(torch.nn.Module):
    """Linear probe that maps an ``input_dim``-sized vector to two logits.

    NOTE(review): presumably the input is the hidden activation of the base
    model at the detection layer named in the model card -- confirm against
    the training code. The meaning of the two output classes is not shown
    here.
    """

    def __init__(self, input_dim: int) -> None:
        """Create the probe.

        Args:
            input_dim: Size of the last dimension of the input features.
        """
        super().__init__()
        # The attribute must stay named ``linear``: the state-dict keys
        # ("linear.weight", "linear.bias") are derived from it.
        self.linear = torch.nn.Linear(input_dim, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (un-normalised) two-class logits for ``x``.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 2. No softmax is applied.
        """
        return self.linear(x)
# Size the probe from the downloaded config so it matches the saved weights.
classifier = BADClassifier(config['input_dim'])

# map_location="cpu" lets the checkpoint load on machines that lack the
# device it was saved from (e.g. a CUDA-saved file on a CPU-only host).
# NOTE(review): torch.load unpickles the file. Only load checkpoints from a
# source you trust, and pass weights_only=True on PyTorch versions that
# support it.
state_dict = torch.load(model_path, map_location="cpu")
classifier.load_state_dict(state_dict)

# Switch to inference mode before using the classifier for prediction.
classifier.eval()
Citation
@article{fairsteer2025,
title={FairSteer: Inference Time Debiasing for LLMs},
author={Li, Yichen and others},
year={2025}
}