GitHub Actions committed on
Commit
89be117
·
1 Parent(s): e60ea8b

Deploy backend from GitHub b167f101798e46fb013386213c55be6f612b2b27

Browse files
backend/app/api/routes.py CHANGED
@@ -192,7 +192,15 @@ async def get_result(result_id: str):
192
 
193
  @router.post("/assist", response_model=AssistResponse)
194
  async def assist_text(request: AssistRequest):
195
- """Call Groq API to propose a rewrite of flagged text to reduce AI threat indicators."""
 
 
 
 
 
 
 
 
196
  if not settings.GROQ_API_KEY:
197
  raise HTTPException(
198
  status_code=503,
@@ -206,11 +214,18 @@ async def assist_text(request: AssistRequest):
206
  logs: list[str] = []
207
  logs.append(f"Preparing rewrite request for Groq model: {settings.GROQ_MODEL}")
208
 
 
209
  prompt = (
210
- "You are a text editor. Rewrite the following text to sound more natural and human-authored "
211
- "while preserving the original meaning and factual content. "
212
- "Return only the rewritten text without any explanation or commentary.\n\n"
213
- f"Original text:\n{request.text}"
 
 
 
 
 
 
214
  )
215
 
216
  try:
@@ -226,12 +241,20 @@ async def assist_text(request: AssistRequest):
226
  "model": settings.GROQ_MODEL,
227
  "messages": [{"role": "user", "content": prompt}],
228
  "max_tokens": 8192,
229
- "temperature": 0.7,
230
  },
231
  )
232
  response.raise_for_status()
233
  data = response.json()
234
  fixed_text = data["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
 
 
235
  logs.append("Groq model returned rewritten text successfully.")
236
  except httpx.TimeoutException:
237
  logger.warning("Groq API timeout in /api/assist")
 
192
 
193
  @router.post("/assist", response_model=AssistResponse)
194
  async def assist_text(request: AssistRequest):
195
+ """Call Groq API to propose a rewrite of flagged text to reduce AI threat indicators.
196
+
197
+ The AI Fixer should:
198
+ 1. Make text sound more human and natural
199
+ 2. Remove any harmful, offensive, or extreme language
200
+ 3. Simplify overly complex or robotic phrasing
201
+ 4. Maintain the original meaning and key information
202
+ 5. Use casual, conversational language
203
+ """
204
  if not settings.GROQ_API_KEY:
205
  raise HTTPException(
206
  status_code=503,
 
214
  logs: list[str] = []
215
  logs.append(f"Preparing rewrite request for Groq model: {settings.GROQ_MODEL}")
216
 
217
+ # Improved prompt that specifically targets reducing AI detection and harm scores
218
  prompt = (
219
+ "You are a text editor helping to make writing sound more natural and human. "
220
+ "Rewrite the following text following these rules:\n\n"
221
+ "1. Use simple, casual, conversational language like a real person would write\n"
222
+ "2. Remove any offensive, harmful, hateful, or extreme language completely\n"
223
+ "3. Avoid robotic phrases, formal tone, or AI-typical patterns\n"
224
+ "4. Add small imperfections like informal contractions (e.g., 'it's', 'don't')\n"
225
+ "5. Keep the core meaning and key facts, but make it sound authentic\n"
226
+ "6. If the text is a greeting or simple message, keep it short and friendly\n\n"
227
+ "Return ONLY the rewritten text with no explanations, comments, or meta-text.\n\n"
228
+ f"Text to rewrite:\n{request.text}"
229
  )
230
 
231
  try:
 
241
  "model": settings.GROQ_MODEL,
242
  "messages": [{"role": "user", "content": prompt}],
243
  "max_tokens": 8192,
244
+ "temperature": 0.8, # Increased from 0.7 for more natural variation
245
  },
246
  )
247
  response.raise_for_status()
248
  data = response.json()
249
  fixed_text = data["choices"][0]["message"]["content"].strip()
250
+
251
+ # Remove any meta-commentary that the model might add
252
+ if fixed_text.startswith('Here') or fixed_text.startswith('Sure'):
253
+ # Try to extract just the rewritten text
254
+ lines = fixed_text.split('\n')
255
+ if len(lines) > 1:
256
+ fixed_text = '\n'.join(lines[1:]).strip()
257
+
258
  logs.append("Groq model returned rewritten text successfully.")
259
  except httpx.TimeoutException:
260
  logger.warning("Groq API timeout in /api/assist")
backend/app/services/hf_service.py CHANGED
@@ -106,11 +106,13 @@ async def get_embeddings(text: str) -> list[float]:
106
  async def detect_harm(text: str) -> float:
107
  """Returns probability of harmful content (0-1). Non-fatal on failure.
108
 
109
- The RoBERTa hate speech model returns labels like:
110
- - 'hate' or 'hateful' for harmful content
111
- - 'nothate' or 'not hate' for safe content
112
 
113
- We need to return the score for the HARMFUL class, not just any matching label.
 
 
114
  """
115
  if not settings.HF_HARM_CLASSIFIER:
116
  return 0.0
@@ -120,33 +122,37 @@ async def detect_harm(text: str) -> float:
120
  if isinstance(result, list) and len(result) > 0:
121
  labels = result[0] if isinstance(result[0], list) else result
122
 
123
- # First, try to find explicit harmful labels
124
- for item in labels:
125
- label = item.get("label", "").lower()
126
- # Look for labels that indicate HARMFUL content
127
- if any(k in label for k in ("hate", "hateful", "toxic", "harmful")):
128
- # Make sure it's NOT a "nothate" or "not harmful" label
129
- if not any(neg in label for neg in ("not", "no", "non")):
130
- return float(item["score"])
131
 
132
- # If we only found "nothate" labels, return inverse score
133
  for item in labels:
134
  label = item.get("label", "").lower()
135
- if any(neg in label for neg in ("nothate", "not hate", "not harmful")):
136
- # Return 1 - score (if 95% not harmful, then 5% harmful)
137
- return float(1.0 - item["score"])
 
 
 
 
 
 
 
 
 
 
138
 
139
- # Fallback: If model returns generic labels, assume lower score is safer
140
- # Sort by score descending and check if highest is harmful
141
- sorted_labels = sorted(labels, key=lambda x: x.get("score", 0), reverse=True)
142
- if sorted_labels:
143
- top_label = sorted_labels[0].get("label", "").lower()
144
- if any(k in top_label for k in ("hate", "toxic", "harmful")) and \
145
- not any(neg in top_label for neg in ("not", "no", "non")):
146
- return float(sorted_labels[0]["score"])
147
 
148
- # If still no match, return 0 (safe)
 
149
  return 0.0
 
150
  return 0.0
151
  except Exception as e:
152
  logger.warning("HF harm classifier failed", error=str(e))
 
106
  async def detect_harm(text: str) -> float:
107
  """Returns probability of harmful content (0-1). Non-fatal on failure.
108
 
109
+ The RoBERTa hate speech model returns two types of labels:
110
+ - Labels indicating HARMFUL content: 'hate', 'hateful', 'toxic', 'harmful', 'offensive'
111
+ - Labels indicating SAFE content: 'nothate', 'not hate', 'not harmful', 'safe', 'neutral'
112
 
113
+ CRITICAL: We must return the probability of HARMFUL content.
114
+ If the model says "95% nothate", we return 5% (1 - 0.95).
115
+ If the model says "95% hate", we return 95%.
116
  """
117
  if not settings.HF_HARM_CLASSIFIER:
118
  return 0.0
 
122
  if isinstance(result, list) and len(result) > 0:
123
  labels = result[0] if isinstance(result[0], list) else result
124
 
125
+ # Strategy: Find the label that clearly indicates harm status
126
+ harmful_score = None
127
+ safe_score = None
 
 
 
 
 
128
 
 
129
  for item in labels:
130
  label = item.get("label", "").lower()
131
+ score = float(item.get("score", 0))
132
+
133
+ # Check if this is a HARMFUL label (without negation)
134
+ is_harmful_label = any(k in label for k in ("hate", "hateful", "toxic", "harmful", "offensive", "label_1", "class_1"))
135
+ has_negation = any(neg in label for neg in ("not", "no", "non", "nothate"))
136
+
137
+ if is_harmful_label and not has_negation:
138
+ # This is a harmful label: use its score directly
139
+ harmful_score = score
140
+ break
141
+ elif has_negation or any(safe in label for safe in ("safe", "neutral", "label_0", "class_0")):
142
+ # This is a safe label: we'll invert it if needed
143
+ safe_score = score
144
 
145
+ # Return the harmful probability
146
+ if harmful_score is not None:
147
+ return round(harmful_score, 4)
148
+ elif safe_score is not None:
149
+ # If we only have a "safe" probability, return 1 - safe_probability
150
+ return round(1.0 - safe_score, 4)
 
 
151
 
152
+ # Fallback: If we can't determine, assume safe (return 0)
153
+ logger.warning("Could not determine harm classification from labels", labels=labels)
154
  return 0.0
155
+
156
  return 0.0
157
  except Exception as e:
158
  logger.warning("HF harm classifier failed", error=str(e))