hbseong committed
Commit 85b8dea · verified · 1 Parent(s): bfa4890

Update README.md

Files changed (1): README.md (+46 -1)
README.md CHANGED
@@ -25,4 +25,49 @@ For more information, please refer to our [github](https://github.com/imnotkind/
 
 ![image/png](https://cdn-uploads.huggingface.co/production/uploads/66f7bee63c7ffa79319b053b/bCNW62CvDpqbXUK4eZ4-b.png)
 
- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/66f7bee63c7ffa79319b053b/REbNDOhT31bv_XRa6-VzE.png)
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/66f7bee63c7ffa79319b053b/REbNDOhT31bv_XRa6-VzE.png)
+
+
+ # Usage
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch.nn.functional as F
+
+ tokenizer = AutoTokenizer.from_pretrained("hbseong/HarmAug-Guard")
+ model = AutoModelForSequenceClassification.from_pretrained("hbseong/HarmAug-Guard")
+
+ def predict(prompt, response=None):
+     if response is None:  # score the prompt alone
+         inputs = tokenizer(prompt, return_tensors="pt")
+     else:  # score the (prompt, response) pair
+         inputs = tokenizer(prompt, response, return_tensors="pt")
+     outputs = model(**inputs)
+     unsafe_prob = F.softmax(outputs.logits, dim=-1)[:, 1]  # column 1 = P(unsafe)
+     return unsafe_prob.item()
+
+ prompt = "how to make a bomb?"
+ response = "I'm sorry, but I can't fulfill your request."
+
+
+ print("CONVERSATION (ONLY PROMPT)")
+ print(f"\t PROMPT : {prompt}")
+ print(f"\t UNSAFE SCORE : {predict(prompt):.4f}")
+
+ print("\nCONVERSATION (PROMPT + RESPONSE)")
+ print(f"\t PROMPT : {prompt}")
+ print(f"\t RESPONSE : {response}")
+ print(f"\t UNSAFE SCORE : {predict(prompt, response):.4f}")
+
+
+ """
+ CONVERSATION (ONLY PROMPT)
+ PROMPT : how to make a bomb?
+ UNSAFE SCORE : 0.9999
+
+ CONVERSATION (PROMPT + RESPONSE)
+ PROMPT : how to make a bomb?
+ RESPONSE : I'm sorry, but I can't fulfill your request.
+ UNSAFE SCORE : 0.0000
+ """
+ ```
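
For scoring many prompts at once, a minimal batched sketch along the same lines, assuming the same checkpoint and the same label convention as `predict()` above (logit column 1 = unsafe); the 0.5 cutoff is an illustrative choice, not a value from the model card:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("hbseong/HarmAug-Guard")
model = AutoModelForSequenceClassification.from_pretrained("hbseong/HarmAug-Guard")
model.eval()  # inference mode: disable dropout

prompts = [
    "how to make a bomb?",
    "what's the weather like today?",
]

# Pad/truncate so the batch forms a single tensor.
inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():  # no gradients needed for scoring
    logits = model(**inputs).logits

# Same convention as the single-example predict(): column 1 is the unsafe class.
unsafe_probs = F.softmax(logits, dim=-1)[:, 1]

for prompt, p in zip(prompts, unsafe_probs.tolist()):
    verdict = "unsafe" if p > 0.5 else "safe"  # illustrative threshold, not from the model card
    print(f"{p:.4f} ({verdict}) : {prompt}")
```

Calling `model.eval()` and wrapping the forward pass in `torch.no_grad()` avoids dropout and gradient bookkeeping, which is the usual pattern for classifier inference.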