Commit ·
b2eddc3
1
Parent(s): 60fbaa9
latest
Browse files - handler.py +17 -7
handler.py
CHANGED
|
@@ -36,14 +36,24 @@ class EndpointHandler:
|
|
| 36 |
|
| 37 |
# Additional analysis
|
| 38 |
entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
adjusted_confidence = confidence * (1 - entropy/np.log(len(probabilities))) * max_prob_ratio
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
return {
|
| 49 |
"label": predicted_label,
|
|
|
|
| 36 |
|
| 37 |
# Additional analysis
|
| 38 |
entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
|
| 39 |
+
|
| 40 |
+
# Scale confidence by (1 - entropy / log(len(probabilities))): the higher the entropy, the lower the adjusted confidence
|
| 41 |
+
adjusted_confidence = confidence * (1 - entropy/np.log(len(probabilities)))
|
|
|
|
| 42 |
|
| 43 |
+
# Post-processing to better distinguish between INJECTION and JAILBREAK
|
| 44 |
+
injection_keywords = ['ignore', 'previous', 'instructions', 'don\'t', 'matter']
|
| 45 |
+
jailbreak_keywords = ['bypass', 'restrictions', 'override', 'security']
|
| 46 |
+
|
| 47 |
+
injection_score = sum(keyword in text.lower() for keyword in injection_keywords) / len(injection_keywords)
|
| 48 |
+
jailbreak_score = sum(keyword in text.lower() for keyword in jailbreak_keywords) / len(jailbreak_keywords)
|
| 49 |
+
|
| 50 |
+
if predicted_label in ['INJECTION', 'JAILBREAK']:
|
| 51 |
+
if injection_score > jailbreak_score:
|
| 52 |
+
predicted_label = 'INJECTION'
|
| 53 |
+
elif jailbreak_score > injection_score:
|
| 54 |
+
predicted_label = 'JAILBREAK'
|
| 55 |
+
|
| 56 |
+
adjusted_confidence = max(adjusted_confidence, injection_score, jailbreak_score)
|
| 57 |
|
| 58 |
return {
|
| 59 |
"label": predicted_label,
|