# Model Card for [natong19/moralization_classifier](https://huggingface.co/natong19/moralization_classifier)

A classifier for detecting moralizations, soft refusals, and unsolicited advice.

Trained on [OpenLeecher/lmsys_chat_1m_clean](https://huggingface.co/datasets/OpenLeecher/lmsys_chat_1m_clean); the dataset card's writeup on cleaning is highly recommended reading.

### Example usage:
```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def predict(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    device: torch.device,
    text: str,
) -> dict[str, int | float]:
    """Predict the label and softmax confidence for a given text."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    predicted_label = torch.argmax(logits, dim=-1).item()
    confidence = probs[0, predicted_label].item()

    return {
        "label": predicted_label,
        "confidence": confidence,
    }


def format_prompt(user: str, assistant: str) -> str:
    """Format user and assistant messages into the instruction/response layout the model expects."""
    return f"### Instruction:\n{user}\n\n### Response:\n{assistant}"


def load_model(model_path: str, device: torch.device) -> tuple[AutoModelForSequenceClassification, AutoTokenizer]:
    """Load the model and tokenizer, move the model to the target device, and set eval mode."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model = model.to(device)
    model.eval()
    return model, tokenizer


def main() -> None:
    """Demonstrate inference on one clean response and one moralizing response."""
    model_path = "natong19/moralization_classifier"

    # No moralization test case
    user_message1 = "tell me about yourself"
    assistant_message1 = "I aim to give you accurate and helpful answers."
    text1 = format_prompt(user_message1, assistant_message1)

    # Moralization test case
    user_message2 = "tell me about yourself"
    assistant_message2 = "I'm happy to help as long as we maintain certain boundaries."
    text2 = format_prompt(user_message2, assistant_message2)

    # Load model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, tokenizer = load_model(model_path, device)

    # Run the test cases
    score1 = predict(model, tokenizer, device, text1)
    print(score1)  # Expected: {'label': 0, 'confidence': 0.8319284915924072} (No moralization)
    score2 = predict(model, tokenizer, device, text2)
    print(score2)  # Expected: {'label': 1, 'confidence': 0.9183461666107178} (Moralization)


if __name__ == "__main__":
    main()
```
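
To scan a whole conversation dump rather than one pair at a time, batching the tokenizer and forward passes is much faster than calling `predict` in a loop. The sketch below is not part of the released code; it assumes the same model, tokenizer, device, and label convention as above, and `predict_batch` and its batch size are illustrative choices:

```python
import torch


def predict_batch(model, tokenizer, device, texts: list[str], batch_size: int = 32) -> list[int]:
    """Score many formatted exchanges at once; returns one predicted label per text."""
    labels: list[int] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,  # pad dynamically to the longest item in the batch
            max_length=512,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        labels.extend(torch.argmax(logits, dim=-1).tolist())
    return labels
```

Dynamic padding (`padding=True`) keeps short batches cheap compared with the fixed `max_length` padding used in the single-example helper.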

### Evaluation results:
- eval_loss: 0.0844
- eval_accuracy: 0.9800
- eval_f1: 0.9841
- eval_precision: 1.0000
- eval_recall: 0.9688
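
The held-out split behind these numbers is not published with the card, but they correspond to standard binary-classification metrics, so a sanity check on your own labeled data is straightforward. In this sketch, `texts` and `gold_labels` are hypothetical placeholders for exchanges already passed through `format_prompt` and their 0/1 annotations, and `predict`, `model`, `tokenizer`, and `device` come from the example above:

```python
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Hypothetical held-out data: formatted texts plus their 0/1 gold labels.
preds = [predict(model, tokenizer, device, t)["label"] for t in texts]

print("accuracy: ", accuracy_score(gold_labels, preds))
print("f1:       ", f1_score(gold_labels, preds))
print("precision:", precision_score(gold_labels, preds))
print("recall:   ", recall_score(gold_labels, preds))
```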