GitHub Actions committed on
Commit ·
89be117
1
Parent(s): e60ea8b
Deploy backend from GitHub b167f101798e46fb013386213c55be6f612b2b27
Browse files- backend/app/api/routes.py +29 -6
- backend/app/services/hf_service.py +31 -25
backend/app/api/routes.py
CHANGED
|
@@ -192,7 +192,15 @@ async def get_result(result_id: str):
|
|
| 192 |
|
| 193 |
@router.post("/assist", response_model=AssistResponse)
|
| 194 |
async def assist_text(request: AssistRequest):
|
| 195 |
-
"""Call Groq API to propose a rewrite of flagged text to reduce AI threat indicators.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
if not settings.GROQ_API_KEY:
|
| 197 |
raise HTTPException(
|
| 198 |
status_code=503,
|
|
@@ -206,11 +214,18 @@ async def assist_text(request: AssistRequest):
|
|
| 206 |
logs: list[str] = []
|
| 207 |
logs.append(f"Preparing rewrite request for Groq model: {settings.GROQ_MODEL}")
|
| 208 |
|
|
|
|
| 209 |
prompt = (
|
| 210 |
-
"You are a text editor
|
| 211 |
-
"
|
| 212 |
-
"
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
)
|
| 215 |
|
| 216 |
try:
|
|
@@ -226,12 +241,20 @@ async def assist_text(request: AssistRequest):
|
|
| 226 |
"model": settings.GROQ_MODEL,
|
| 227 |
"messages": [{"role": "user", "content": prompt}],
|
| 228 |
"max_tokens": 8192,
|
| 229 |
-
"temperature": 0.
|
| 230 |
},
|
| 231 |
)
|
| 232 |
response.raise_for_status()
|
| 233 |
data = response.json()
|
| 234 |
fixed_text = data["choices"][0]["message"]["content"].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
logs.append("Groq model returned rewritten text successfully.")
|
| 236 |
except httpx.TimeoutException:
|
| 237 |
logger.warning("Groq API timeout in /api/assist")
|
|
|
|
| 192 |
|
| 193 |
@router.post("/assist", response_model=AssistResponse)
|
| 194 |
async def assist_text(request: AssistRequest):
|
| 195 |
+
"""Call Groq API to propose a rewrite of flagged text to reduce AI threat indicators.
|
| 196 |
+
|
| 197 |
+
The AI Fixer should:
|
| 198 |
+
1. Make text sound more human and natural
|
| 199 |
+
2. Remove any harmful, offensive, or extreme language
|
| 200 |
+
3. Simplify overly complex or robotic phrasing
|
| 201 |
+
4. Maintain the original meaning and key information
|
| 202 |
+
5. Use casual, conversational language
|
| 203 |
+
"""
|
| 204 |
if not settings.GROQ_API_KEY:
|
| 205 |
raise HTTPException(
|
| 206 |
status_code=503,
|
|
|
|
| 214 |
logs: list[str] = []
|
| 215 |
logs.append(f"Preparing rewrite request for Groq model: {settings.GROQ_MODEL}")
|
| 216 |
|
| 217 |
+
# Improved prompt that specifically targets reducing AI detection and harm scores
|
| 218 |
prompt = (
|
| 219 |
+
"You are a text editor helping to make writing sound more natural and human. "
|
| 220 |
+
"Rewrite the following text following these rules:\n\n"
|
| 221 |
+
"1. Use simple, casual, conversational language like a real person would write\n"
|
| 222 |
+
"2. Remove any offensive, harmful, hateful, or extreme language completely\n"
|
| 223 |
+
"3. Avoid robotic phrases, formal tone, or AI-typical patterns\n"
|
| 224 |
+
"4. Add small imperfections like informal contractions (e.g., 'it's', 'don't')\n"
|
| 225 |
+
"5. Keep the core meaning and key facts, but make it sound authentic\n"
|
| 226 |
+
"6. If the text is a greeting or simple message, keep it short and friendly\n\n"
|
| 227 |
+
"Return ONLY the rewritten text with no explanations, comments, or meta-text.\n\n"
|
| 228 |
+
f"Text to rewrite:\n{request.text}"
|
| 229 |
)
|
| 230 |
|
| 231 |
try:
|
|
|
|
| 241 |
"model": settings.GROQ_MODEL,
|
| 242 |
"messages": [{"role": "user", "content": prompt}],
|
| 243 |
"max_tokens": 8192,
|
| 244 |
+
"temperature": 0.8, # Increased from 0.7 for more natural variation
|
| 245 |
},
|
| 246 |
)
|
| 247 |
response.raise_for_status()
|
| 248 |
data = response.json()
|
| 249 |
fixed_text = data["choices"][0]["message"]["content"].strip()
|
| 250 |
+
|
| 251 |
+
# Remove any meta-commentary that the model might add
|
| 252 |
+
if fixed_text.startswith('Here') or fixed_text.startswith('Sure'):
|
| 253 |
+
# Try to extract just the rewritten text
|
| 254 |
+
lines = fixed_text.split('\n')
|
| 255 |
+
if len(lines) > 1:
|
| 256 |
+
fixed_text = '\n'.join(lines[1:]).strip()
|
| 257 |
+
|
| 258 |
logs.append("Groq model returned rewritten text successfully.")
|
| 259 |
except httpx.TimeoutException:
|
| 260 |
logger.warning("Groq API timeout in /api/assist")
|
backend/app/services/hf_service.py
CHANGED
|
@@ -106,11 +106,13 @@ async def get_embeddings(text: str) -> list[float]:
|
|
| 106 |
async def detect_harm(text: str) -> float:
|
| 107 |
"""Returns probability of harmful content (0-1). Non-fatal on failure.
|
| 108 |
|
| 109 |
-
The RoBERTa hate speech model returns
|
| 110 |
-
- 'hate'
|
| 111 |
-
- 'nothate'
|
| 112 |
|
| 113 |
-
We
|
|
|
|
|
|
|
| 114 |
"""
|
| 115 |
if not settings.HF_HARM_CLASSIFIER:
|
| 116 |
return 0.0
|
|
@@ -120,33 +122,37 @@ async def detect_harm(text: str) -> float:
|
|
| 120 |
if isinstance(result, list) and len(result) > 0:
|
| 121 |
labels = result[0] if isinstance(result[0], list) else result
|
| 122 |
|
| 123 |
-
#
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
# Look for labels that indicate HARMFUL content
|
| 127 |
-
if any(k in label for k in ("hate", "hateful", "toxic", "harmful")):
|
| 128 |
-
# Make sure it's NOT a "nothate" or "not harmful" label
|
| 129 |
-
if not any(neg in label for neg in ("not", "no", "non")):
|
| 130 |
-
return float(item["score"])
|
| 131 |
|
| 132 |
-
# If we only found "nothate" labels, return inverse score
|
| 133 |
for item in labels:
|
| 134 |
label = item.get("label", "").lower()
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
#
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
not any(neg in top_label for neg in ("not", "no", "non")):
|
| 146 |
-
return float(sorted_labels[0]["score"])
|
| 147 |
|
| 148 |
-
# If
|
|
|
|
| 149 |
return 0.0
|
|
|
|
| 150 |
return 0.0
|
| 151 |
except Exception as e:
|
| 152 |
logger.warning("HF harm classifier failed", error=str(e))
|
|
|
|
| 106 |
async def detect_harm(text: str) -> float:
|
| 107 |
"""Returns probability of harmful content (0-1). Non-fatal on failure.
|
| 108 |
|
| 109 |
+
The RoBERTa hate speech model returns two types of labels:
|
| 110 |
+
- Labels indicating HARMFUL content: 'hate', 'hateful', 'toxic', 'harmful', 'offensive'
|
| 111 |
+
- Labels indicating SAFE content: 'nothate', 'not hate', 'not harmful', 'safe', 'neutral'
|
| 112 |
|
| 113 |
+
CRITICAL: We must return the probability of HARMFUL content.
|
| 114 |
+
If the model says "95% nothate", we return 5% (1 - 0.95).
|
| 115 |
+
If the model says "95% hate", we return 95%.
|
| 116 |
"""
|
| 117 |
if not settings.HF_HARM_CLASSIFIER:
|
| 118 |
return 0.0
|
|
|
|
| 122 |
if isinstance(result, list) and len(result) > 0:
|
| 123 |
labels = result[0] if isinstance(result[0], list) else result
|
| 124 |
|
| 125 |
+
# Strategy: Find the label that clearly indicates harm status
|
| 126 |
+
harmful_score = None
|
| 127 |
+
safe_score = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
|
|
|
| 129 |
for item in labels:
|
| 130 |
label = item.get("label", "").lower()
|
| 131 |
+
score = float(item.get("score", 0))
|
| 132 |
+
|
| 133 |
+
# Check if this is a HARMFUL label (without negation)
|
| 134 |
+
is_harmful_label = any(k in label for k in ("hate", "hateful", "toxic", "harmful", "offensive", "label_1", "class_1"))
|
| 135 |
+
has_negation = any(neg in label for neg in ("not", "no", "non", "nothate"))
|
| 136 |
+
|
| 137 |
+
if is_harmful_label and not has_negation:
|
| 138 |
+
# This is a harmful label: use its score directly
|
| 139 |
+
harmful_score = score
|
| 140 |
+
break
|
| 141 |
+
elif has_negation or any(safe in label for safe in ("safe", "neutral", "label_0", "class_0")):
|
| 142 |
+
# This is a safe label: we'll invert it if needed
|
| 143 |
+
safe_score = score
|
| 144 |
|
| 145 |
+
# Return the harmful probability
|
| 146 |
+
if harmful_score is not None:
|
| 147 |
+
return round(harmful_score, 4)
|
| 148 |
+
elif safe_score is not None:
|
| 149 |
+
# If we only have a "safe" probability, return 1 - safe_probability
|
| 150 |
+
return round(1.0 - safe_score, 4)
|
|
|
|
|
|
|
| 151 |
|
| 152 |
+
# Fallback: If we can't determine, assume safe (return 0)
|
| 153 |
+
logger.warning("Could not determine harm classification from labels", labels=labels)
|
| 154 |
return 0.0
|
| 155 |
+
|
| 156 |
return 0.0
|
| 157 |
except Exception as e:
|
| 158 |
logger.warning("HF harm classifier failed", error=str(e))
|