GitHub Actions committed on
Commit ·
e60ea8b
1
Parent(s): cdddc93
Deploy backend from GitHub 84e7db36031d7a9fb7fb65f8ae369af6f069d7df
Browse files
backend/app/services/hf_service.py
CHANGED
|
@@ -104,7 +104,14 @@ async def get_embeddings(text: str) -> list[float]:
|
|
| 104 |
|
| 105 |
|
| 106 |
async def detect_harm(text: str) -> float:
|
| 107 |
-
"""Returns probability of harmful content (0-1). Non-fatal on failure.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
if not settings.HF_HARM_CLASSIFIER:
|
| 109 |
return 0.0
|
| 110 |
|
|
@@ -112,11 +119,34 @@ async def detect_harm(text: str) -> float:
|
|
| 112 |
result = await _hf_post(settings.HF_HARM_CLASSIFIER, {"inputs": text})
|
| 113 |
if isinstance(result, list) and len(result) > 0:
|
| 114 |
labels = result[0] if isinstance(result[0], list) else result
|
|
|
|
|
|
|
| 115 |
for item in labels:
|
| 116 |
label = item.get("label", "").lower()
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
return 0.0
|
| 121 |
except Exception as e:
|
| 122 |
logger.warning("HF harm classifier failed", error=str(e))
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
async def detect_harm(text: str) -> float:
    """Return the probability that ``text`` is harmful content, in [0, 1].

    Non-fatal on failure: any error from the HF inference call is logged
    and 0.0 is returned, so callers never crash on classifier outages.

    The RoBERTa hate-speech model returns label/score pairs such as
    ``[{"label": "hate", "score": 0.93}, {"label": "nothate", "score": 0.07}]``.
    We must return the score of the HARMFUL class specifically — a naive
    substring match on "hate" would also hit "nothate".

    Args:
        text: The text to classify.

    Returns:
        Probability of harm in [0.0, 1.0]; 0.0 when no classifier is
        configured, the response shape is unrecognized, or the call fails.
    """
    # No classifier configured -> treat everything as safe.
    if not settings.HF_HARM_CLASSIFIER:
        return 0.0

    try:
        result = await _hf_post(settings.HF_HARM_CLASSIFIER, {"inputs": text})
        if isinstance(result, list) and result:
            # HF sometimes wraps the label list in an extra list: [[{...}, ...]]
            labels = result[0] if isinstance(result[0], list) else result

            # "hate" also covers "hateful" via substring match.
            harmful_keywords = ("hate", "toxic", "harmful")
            negations = ("not", "no", "non")

            # Prefer an explicit harmful label (e.g. "hate", "toxic"),
            # skipping negated forms such as "nothate" / "non-toxic".
            for item in labels:
                label = item.get("label", "").lower()
                if any(k in label for k in harmful_keywords) and not any(
                    neg in label for neg in negations
                ):
                    return float(item["score"])

            # Only negative labels present (e.g. "nothate"): invert the score
            # (95% not-harmful => 5% harmful).
            for item in labels:
                label = item.get("label", "").lower()
                if any(neg in label for neg in ("nothate", "not hate", "not harmful")):
                    return float(1.0 - item["score"])

        # Unrecognized payload shape or no recognizable labels: assume safe.
        # (The previous sorted-by-score fallback was unreachable: its match
        # condition was a subset of the first loop's, which scans all labels.)
        return 0.0
    except Exception as e:  # deliberately broad: classifier is best-effort
        logger.warning("HF harm classifier failed", error=str(e))
        # Honor the float return type and the non-fatal contract instead of
        # implicitly returning None.
        return 0.0