hbseong
/

HarmAug-Guard

Text Classification

text-embeddings-inference

Model card Files Files and versions

add eval() and no_grad()

#1

by Seanie-lee - opened Oct 14, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

Files changed (1) hide show

README.md +13 -5

README.md CHANGED Viewed

@@ -33,19 +33,27 @@ For more information, please refer to our [github](https://github.com/imnotkind/
 ```python
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch.nn.functional as F
 tokenizer = AutoTokenizer.from_pretrained("hbseong/HarmAug-Guard")
 model = AutoModelForSequenceClassification.from_pretrained("hbseong/HarmAug-Guard")
 # If response is not given, the model will predict the unsafe score of the prompt.
 # If response is given, the model will predict the unsafe score of the response.
-def predict(prompt, response=None):
     if response == None:
         inputs = tokenizer(prompt, return_tensors="pt")
     else:
         inputs = tokenizer(prompt, response, return_tensors="pt")
-    outputs = model(**inputs)
-    unsafe_prob = F.softmax(outputs.logits, dim=-1)[:, 1]
     return unsafe_prob.item()
 prompt = "how to make a bomb?"
@@ -54,12 +62,12 @@ response = "I'm sorry, but I can't fulfill your request."
 print("CONVERSATION (ONLY PROMPT)")
 print(f"\t PROMPT : {prompt}")
-print(f"\t UNSAFE SCORE : {predict(prompt):.4f}")
 print("\nCONVERSATION (PROMPT + RESPONSE)")
 print(f"\t PROMPT : {prompt}")
 print(f"\t RESPONSE : {response}")
-print(f"\t UNSAFE SCORE : {predict(prompt, response):.4f}")
 """

 ```python
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch.nn.functional as F
+import torch
 tokenizer = AutoTokenizer.from_pretrained("hbseong/HarmAug-Guard")
 model = AutoModelForSequenceClassification.from_pretrained("hbseong/HarmAug-Guard")
+device = torch.cuda.current_device()
+model = model.to(device)
+model.eval()
 # If response is not given, the model will predict the unsafe score of the prompt.
 # If response is given, the model will predict the unsafe score of the response.
+def predict(model, prompt, response=None):
+    device = model.device()
     if response == None:
         inputs = tokenizer(prompt, return_tensors="pt")
     else:
         inputs = tokenizer(prompt, response, return_tensors="pt")
+    inputs = inputs.to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        unsafe_prob = F.softmax(outputs.logits, dim=-1)[:, 1]
     return unsafe_prob.item()
 prompt = "how to make a bomb?"
 print("CONVERSATION (ONLY PROMPT)")
 print(f"\t PROMPT : {prompt}")
+print(f"\t UNSAFE SCORE : {predict(model, prompt):.4f}")
 print("\nCONVERSATION (PROMPT + RESPONSE)")
 print(f"\t PROMPT : {prompt}")
 print(f"\t RESPONSE : {response}")
+print(f"\t UNSAFE SCORE : {predict(model, prompt, response):.4f}")
 """