A newer version of this model is available:
natong19/refusal_classifier
Model Card for natong19/moralization_classifier
A classifier for detecting moralizations, soft refusals, and unsolicited advice.
Base model: distilbert/distilroberta-base
Trained on OpenLeecher/lmsys_chat_1m_clean. Reading through that dataset's writeup on dataset cleaning is highly recommended.
Quickstart
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def predict(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    device: torch.device,
    text: str,
) -> dict[str, float]:
    """Classify a single text and report the winning class with its probability.

    Args:
        model: Sequence-classification model. It is called with the encoded
            inputs as keyword arguments and must expose ``.logits`` on its
            output. Presumably already moved to ``device`` by the caller.
        tokenizer: Callable that encodes ``text`` into a mapping of tensors.
        device: Device the encoded tensors are moved to before the forward pass.
        text: The text to classify.

    Returns:
        A dict with two entries: ``"label"``, the integer index of the
        highest-scoring class, and ``"confidence"``, the softmax probability
        assigned to that class.
    """
    # Encode as a batch of one, truncated/padded to a fixed 512-token window.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    # .item() requires a single-element tensor, i.e. a batch of exactly one.
    predicted_label = torch.argmax(logits, dim=-1).item()
    confidence = probs[0, predicted_label].item()
    return {
        "label": predicted_label,
        "confidence": confidence,
    }
def format_prompt(user: str, assistant: str) -> str:
    """Render one user/assistant exchange in the layout the classifier takes.

    The user text goes under an ``### Instruction:`` header and the assistant
    text under a ``### Response:`` header, separated by a blank line.
    """
    prompt = f"### Instruction:\n{user}\n\n### Response:\n{assistant}"
    return prompt
def load_model(model_path: str, device: torch.device) -> tuple[AutoModelForSequenceClassification, AutoTokenizer]:
    """Load the classifier and its tokenizer, ready for inference.

    Args:
        model_path: Identifier or path passed to ``from_pretrained`` for both
            the tokenizer and the model.
        device: Device the model is moved to.

    Returns:
        A ``(model, tokenizer)`` pair; the model is on ``device`` and has been
        switched to evaluation mode.
    """
    tok = AutoTokenizer.from_pretrained(model_path)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    # Switch to evaluation mode before handing the model to callers.
    classifier.eval()
    return classifier, tok
def main() -> None:
    """Run the classifier on two sample exchanges and print each prediction."""
    model_path = "natong19/moralization_classifier"

    # Both samples share the same user turn; only the assistant reply differs.
    samples = [
        # No moralization.
        # Expected: {'label': 0, 'confidence': 0.8319284915924072}
        ("tell me about yourself", "I aim to give you accurate and helpful answers."),
        # Moralization.
        # Expected: {'label': 1, 'confidence': 0.9183461666107178}
        ("tell me about yourself", "I'm happy to help as long as we maintain certain boundaries."),
    ]
    prompts = [format_prompt(user, assistant) for user, assistant in samples]

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, tokenizer = load_model(model_path, device)

    # Classify the samples in order and print each result dict.
    for prompt in prompts:
        print(predict(model, tokenizer, device, prompt))


if __name__ == "__main__":
    main()
Evaluation results
- eval_loss: 0.0844
- eval_accuracy: 0.9800
- eval_f1: 0.9841
- eval_precision: 1.0000
- eval_recall: 0.9688
- Downloads last month
- 99