GitHub Actions committed on
Commit
e60ea8b
·
1 Parent(s): cdddc93

Deploy backend from GitHub 84e7db36031d7a9fb7fb65f8ae369af6f069d7df

Browse files
Files changed (1) hide show
  1. backend/app/services/hf_service.py +34 -4
backend/app/services/hf_service.py CHANGED
@@ -104,7 +104,14 @@ async def get_embeddings(text: str) -> list[float]:
104
 
105
 
106
  async def detect_harm(text: str) -> float:
107
- """Returns probability of harmful content (0-1). Non-fatal on failure."""
 
 
 
 
 
 
 
108
  if not settings.HF_HARM_CLASSIFIER:
109
  return 0.0
110
 
@@ -112,11 +119,34 @@ async def detect_harm(text: str) -> float:
112
  result = await _hf_post(settings.HF_HARM_CLASSIFIER, {"inputs": text})
113
  if isinstance(result, list) and len(result) > 0:
114
  labels = result[0] if isinstance(result[0], list) else result
 
 
115
  for item in labels:
116
  label = item.get("label", "").lower()
117
- if any(k in label for k in ("hate", "toxic", "harmful", "hateful", "target")):
118
- return float(item["score"])
119
- return float(max(labels, key=lambda x: x.get("score", 0)).get("score", 0.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  return 0.0
121
  except Exception as e:
122
  logger.warning("HF harm classifier failed", error=str(e))
 
104
 
105
 
106
async def detect_harm(text: str) -> float:
    """Return the probability (0.0-1.0) that *text* is harmful content.

    Calls the classifier configured in ``settings.HF_HARM_CLASSIFIER`` via
    ``_hf_post``. RoBERTa-family hate-speech models return labels such as:

    - ``hate`` / ``hateful`` / ``toxic`` / ``harmful``  -> harmful class
    - ``nothate`` / ``not hate`` / ``not harmful``      -> safe class

    We must return the score of the HARMFUL class specifically, not the
    score of whichever label happens to match first.

    Non-fatal on failure: any error (network, unexpected schema) is logged
    and 0.0 is returned so moderation never breaks the request path.
    """
    if not settings.HF_HARM_CLASSIFIER:
        # No classifier configured: treat everything as safe.
        return 0.0
    try:
        result = await _hf_post(settings.HF_HARM_CLASSIFIER, {"inputs": text})
        if isinstance(result, list) and result:
            # The inference API returns either [[{label, score}, ...]]
            # or [{label, score}, ...] depending on the pipeline.
            labels = result[0] if isinstance(result[0], list) else result

            # 1) An explicit harmful label wins outright.
            #    ("hate" also covers "hateful"; negation substrings like
            #    "not"/"no"/"non" exclude labels such as "nothate".)
            for item in labels:
                label = item.get("label", "").lower()
                if any(k in label for k in ("hate", "toxic", "harmful")) and not any(
                    neg in label for neg in ("not", "no", "non")
                ):
                    # Missing score defaults to 0.0 (safe) instead of
                    # raising KeyError and being logged as an API failure.
                    return float(item.get("score", 0.0))

            # 2) Only safe labels present (e.g. "nothate"):
            #    harmful probability = 1 - P(safe).
            for item in labels:
                label = item.get("label", "").lower()
                if any(neg in label for neg in ("nothate", "not hate", "not harmful")):
                    # Missing score defaults to 1.0 so the result is 0.0 (safe).
                    return float(1.0 - item.get("score", 1.0))

            # NOTE(review): the previous "sort by score and re-check the top
            # label" fallback was unreachable dead code — any label passing
            # its condition would already have returned in step 1 — so it
            # has been removed.

            # 3) Unrecognized label scheme: assume safe rather than guess.
            return 0.0
        return 0.0
    except Exception as e:
        # Best-effort by contract: log and report "safe" rather than raise.
        logger.warning("HF harm classifier failed", error=str(e))
        return 0.0