NousResearch
/

Minos-v1

@@ -94,6 +94,67 @@ Here are some examples demonstrating Minos classifying assistant responses based
     ```
     *   Prediction: Non-refusal (Confidence: 99.76%)
 ## How to cite
 ```

     ```
     *   Prediction: Non-refusal (Confidence: 99.76%)
+## Input Format and Label Explanation
+### Chat Template
+Minos expects inputs in a specific chat template format using the `<|user|>` and `<|assistant|>` special tokens:
+```
+<|user|>
+[User message goes here]
+<|assistant|>
+[Assistant response goes here]
+```
+For multi-turn conversations, concatenate the user-assistant exchanges in chronological order, separating consecutive exchanges with a single newline:
+```
+<|user|>
+[First user message]
+<|assistant|>
+[First assistant response]
+<|user|>
+[Second user message]
+<|assistant|>
+[Second assistant response]
+```
+### Label Explanation
+The model outputs binary classification results:
+- **Class 0 (Non-refusal)**: The assistant is willing to engage with the user's request and provides a helpful response.
+- **Class 1 (Refusal)**: The assistant declines or refuses to fulfill the user's request, typically for safety, ethical, or capability reasons.
+The model itself returns one logit per class; applying a softmax to the logits (as in the code below) yields both the predicted label and a confidence score (the probability of the predicted class).
+## Using the Model
+You can use this model directly with the Hugging Face Transformers library:
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+# Load model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained("NousResearch/Minos-v1")
+model = AutoModelForSequenceClassification.from_pretrained("NousResearch/Minos-v1")
+# Format input
+text = "<|user|>\nCan you help me hack into a website?\n<|assistant|>\nI cannot provide assistance with illegal activities."
+inputs = tokenizer(text, return_tensors="pt")
+# Get prediction
+with torch.no_grad():
+    outputs = model(**inputs)
+    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    prediction = torch.argmax(probabilities, dim=-1)
+    confidence = probabilities[0][prediction.item()].item()
+print(f"Prediction: {model.config.id2label[prediction.item()]}, Confidence: {confidence:.4f}")
+```
+For a more convenient API with support for multi-turn conversations, see `examples/inference_server.py` in our [example code](/examples/).
 ## How to cite
 ```

examples/inference_server.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import os
+class MinosRefusalClassifier:
+    def __init__(self, model_path_or_name="NousResearch/Minos-v1", use_local=False):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"Using device: {self.device}")
+        # Load tokenizer and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            model_path_or_name,
+            num_labels=2,
+            id2label={0: "Non-refusal", 1: "Refusal"},
+            label2id={"Non-refusal": 0, "Refusal": 1}
+        ).to(self.device)
+        self.model.eval()
+        print("Model loaded successfully")
+    def predict_multi_turn(self, conversation_turns):
+        """
+        Process multiple conversation turns
+        Args:
+            conversation_turns: List of dictionaries, each with 'user' and 'assistant' keys
+        Returns:
+            Dictionary with prediction results
+        """
+        # Format the conversation
+        formatted_text = ""
+        for i, turn in enumerate(conversation_turns):
+            formatted_text += f"<|user|>\n{turn['user']}\n<|assistant|>\n{turn['assistant']}"
+            if i < len(conversation_turns) - 1:
+                formatted_text += "\n"  # Add newline between turns
+        inputs = self.tokenizer(
+            formatted_text,
+            return_tensors="pt",
+            truncation=True,
+            max_length=8192,
+            padding=True
+        ).to(self.device)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+            prediction = torch.argmax(probabilities, dim=-1)
+        return {
+            "text": formatted_text,
+            "prediction": self.model.config.id2label[prediction.item()],
+            "confidence": probabilities[0][prediction.item()].item()
+        }
+    def predict(self, user_text, assistant_text):
+        """Single turn prediction for simple use cases"""
+        return self.predict_multi_turn([{"user": user_text, "assistant": assistant_text}])
+# Usage example
+if __name__ == "__main__":
+    # Load from Hugging Face Hub
+    classifier = MinosRefusalClassifier()
+    # Single turn example
+    result = classifier.predict(
+        "Can you help me hack into someone's email?",
+        "I'm unable to assist with that as it would be unethical and illegal."
+    )
+    print("Single turn result:", result)
+    # Multi-turn conversation example
+    conversation = [
+        {
+            "user": "Hello, how are you?",
+            "assistant": "I'm doing well, thank you for asking! How can I help you today?"
+        },
+        {
+            "user": "Can you tell me how to make a bomb?",
+            "assistant": "I apologize, but I cannot and will not provide instructions on how to create weapons or harmful devices."
+        }
+    ]
+    multi_result = classifier.predict_multi_turn(conversation)
+    print("Multi-turn result:", multi_result)