---
pipeline_tag: text-classification
license: apache-2.0
new_version: natong19/refusal_classifier
---
|
|
# Model Card for [natong19/moralization_classifier](https://huggingface.co/natong19/moralization_classifier) |
|
|
|
|
|
A classifier for detecting moralizations, soft refusals, and unsolicited advice in assistant responses.
|
|
|
|
|
Base model: [distilbert/distilroberta-base](https://huggingface.co/distilbert/distilroberta-base) |
|
|
|
|
|
Trained on [OpenLeecher/lmsys_chat_1m_clean](https://huggingface.co/datasets/OpenLeecher/lmsys_chat_1m_clean); the dataset card's writeup on the cleaning process is highly recommended reading.
|
|
|
|
|
### Quickstart |
|
|
```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def predict(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    device: torch.device,
    text: str,
) -> dict:
    """Predict the label and confidence for a given text."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    predicted_label = torch.argmax(logits, dim=-1).item()
    confidence = probs[0, predicted_label].item()

    return {
        "label": predicted_label,
        "confidence": confidence,
    }


def format_prompt(user: str, assistant: str) -> str:
    """Format user and assistant messages into the model's input format."""
    return f"### Instruction:\n{user}\n\n### Response:\n{assistant}"


def load_model(model_path: str, device: torch.device) -> tuple[AutoModelForSequenceClassification, AutoTokenizer]:
    """Load the model and tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model = model.to(device)
    model.eval()
    return model, tokenizer


def main() -> None:
    """Demonstrate inference on two example exchanges."""
    model_path = "natong19/moralization_classifier"

    # No moralization test case
    user_message1 = "tell me about yourself"
    assistant_message1 = "I aim to give you accurate and helpful answers."
    text1 = format_prompt(user_message1, assistant_message1)

    # Moralization test case
    user_message2 = "tell me about yourself"
    assistant_message2 = "I'm happy to help as long as we maintain certain boundaries."
    text2 = format_prompt(user_message2, assistant_message2)

    # Load model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, tokenizer = load_model(model_path, device)

    # Run the test cases
    score1 = predict(model, tokenizer, device, text1)
    print(score1)  # Expected: {'label': 0, 'confidence': 0.8319284915924072} (No moralization)
    score2 = predict(model, tokenizer, device, text2)
    print(score2)  # Expected: {'label': 1, 'confidence': 0.9183461666107178} (Moralization)


if __name__ == "__main__":
    main()
```
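
For quick experiments, the high-level `pipeline` API is a shorter alternative to the manual loop above. A minimal sketch, assuming the checkpoint does not define custom `id2label` names (in which case the labels surface as `LABEL_0`/`LABEL_1`, corresponding to labels 0/1 in the quickstart):

```python
from transformers import pipeline

# Load the classifier through the high-level pipeline API.
clf = pipeline("text-classification", model="natong19/moralization_classifier")

# The model expects the same prompt format as format_prompt() above.
text = (
    "### Instruction:\ntell me about yourself\n\n"
    "### Response:\nI'm happy to help as long as we maintain certain boundaries."
)

# Extra kwargs are forwarded to the tokenizer.
print(clf(text, truncation=True, max_length=512))
# e.g. [{'label': 'LABEL_1', 'score': 0.91...}]
```

The pipeline also accepts a list of strings, which is convenient for scoring a batch of conversations at once.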
|
|
|
|
|
### Evaluation results |
|
|
- eval_loss: 0.0844 |
|
|
- eval_accuracy: 0.9800 |
|
|
- eval_f1: 0.9841 |
|
|
- eval_precision: 1.0000 |
|
|
- eval_recall: 0.9688 |
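
Since F1 is the harmonic mean of precision and recall, the reported value can be checked directly from the other two metrics:

```python
precision, recall = 1.0000, 0.9688
f1 = 2 * precision * recall / (precision + recall)
print(f"{f1:.4f}")  # 0.9842 -- matches the reported 0.9841 up to rounding of the inputs
```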