GitHub Actions committed on
Commit
89be117
·
1 Parent(s): e60ea8b

Deploy backend from GitHub b167f101798e46fb013386213c55be6f612b2b27

Browse files
backend/app/api/routes.py CHANGED
@@ -192,7 +192,15 @@ async def get_result(result_id: str):
192
 
193
  @router.post("/assist", response_model=AssistResponse)
194
  async def assist_text(request: AssistRequest):
195
- """Call Groq API to propose a rewrite of flagged text to reduce AI threat indicators."""
 
 
 
 
 
 
 
 
196
  if not settings.GROQ_API_KEY:
197
  raise HTTPException(
198
  status_code=503,
@@ -206,11 +214,18 @@ async def assist_text(request: AssistRequest):
206
  logs: list[str] = []
207
  logs.append(f"Preparing rewrite request for Groq model: {settings.GROQ_MODEL}")
208
 
 
209
  prompt = (
210
- "You are a text editor. Rewrite the following text to sound more natural and human-authored "
211
- "while preserving the original meaning and factual content. "
212
- "Return only the rewritten text without any explanation or commentary.\n\n"
213
- f"Original text:\n{request.text}"
 
 
 
 
 
 
214
  )
215
 
216
  try:
@@ -226,12 +241,20 @@ async def assist_text(request: AssistRequest):
226
  "model": settings.GROQ_MODEL,
227
  "messages": [{"role": "user", "content": prompt}],
228
  "max_tokens": 8192,
229
- "temperature": 0.7,
230
  },
231
  )
232
  response.raise_for_status()
233
  data = response.json()
234
  fixed_text = data["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
 
 
235
  logs.append("Groq model returned rewritten text successfully.")
236
  except httpx.TimeoutException:
237
  logger.warning("Groq API timeout in /api/assist")
 
192
 
193
  @router.post("/assist", response_model=AssistResponse)
194
  async def assist_text(request: AssistRequest):
195
+ """Call Groq API to propose a rewrite of flagged text to reduce AI threat indicators.
196
+
197
+ The AI Fixer should:
198
+ 1. Make text sound more human and natural
199
+ 2. Remove any harmful, offensive, or extreme language
200
+ 3. Simplify overly complex or robotic phrasing
201
+ 4. Maintain the original meaning and key information
202
+ 5. Use casual, conversational language
203
+ """
204
  if not settings.GROQ_API_KEY:
205
  raise HTTPException(
206
  status_code=503,
 
214
  logs: list[str] = []
215
  logs.append(f"Preparing rewrite request for Groq model: {settings.GROQ_MODEL}")
216
 
217
+ # Improved prompt that specifically targets reducing AI detection and harm scores
218
  prompt = (
219
+ "You are a text editor helping to make writing sound more natural and human. "
220
+ "Rewrite the following text following these rules:\n\n"
221
+ "1. Use simple, casual, conversational language like a real person would write\n"
222
+ "2. Remove any offensive, harmful, hateful, or extreme language completely\n"
223
+ "3. Avoid robotic phrases, formal tone, or AI-typical patterns\n"
224
+ "4. Add small imperfections like informal contractions (e.g., 'it's', 'don't')\n"
225
+ "5. Keep the core meaning and key facts, but make it sound authentic\n"
226
+ "6. If the text is a greeting or simple message, keep it short and friendly\n\n"
227
+ "Return ONLY the rewritten text with no explanations, comments, or meta-text.\n\n"
228
+ f"Text to rewrite:\n{request.text}"
229
  )
230
 
231
  try:
 
241
  "model": settings.GROQ_MODEL,
242
  "messages": [{"role": "user", "content": prompt}],
243
  "max_tokens": 8192,
244
+ "temperature": 0.8, # Increased from 0.7 for more natural variation
245
  },
246
  )
247
  response.raise_for_status()
248
  data = response.json()
249
  fixed_text = data["choices"][0]["message"]["content"].strip()
250
+
251
+ # Remove any meta-commentary that the model might add
252
+ if fixed_text.startswith('Here') or fixed_text.startswith('Sure'):
253
+ # Try to extract just the rewritten text
254
+ lines = fixed_text.split('\n')
255
+ if len(lines) > 1:
256
+ fixed_text = '\n'.join(lines[1:]).strip()
257
+
258
  logs.append("Groq model returned rewritten text successfully.")
259
  except httpx.TimeoutException:
260
  logger.warning("Groq API timeout in /api/assist")
backend/app/services/hf_service.py CHANGED
@@ -106,11 +106,13 @@ async def get_embeddings(text: str) -> list[float]:
106
  async def detect_harm(text: str) -> float:
107
  """Returns probability of harmful content (0-1). Non-fatal on failure.
108
 
109
- The RoBERTa hate speech model returns labels like:
110
- - 'hate' or 'hateful' for harmful content
111
- - 'nothate' or 'not hate' for safe content
112
 
113
- We need to return the score for the HARMFUL class, not just any matching label.
 
 
114
  """
115
  if not settings.HF_HARM_CLASSIFIER:
116
  return 0.0
@@ -120,33 +122,37 @@ async def detect_harm(text: str) -> float:
120
  if isinstance(result, list) and len(result) > 0:
121
  labels = result[0] if isinstance(result[0], list) else result
122
 
123
- # First, try to find explicit harmful labels
124
- for item in labels:
125
- label = item.get("label", "").lower()
126
- # Look for labels that indicate HARMFUL content
127
- if any(k in label for k in ("hate", "hateful", "toxic", "harmful")):
128
- # Make sure it's NOT a "nothate" or "not harmful" label
129
- if not any(neg in label for neg in ("not", "no", "non")):
130
- return float(item["score"])
131
 
132
- # If we only found "nothate" labels, return inverse score
133
  for item in labels:
134
  label = item.get("label", "").lower()
135
- if any(neg in label for neg in ("nothate", "not hate", "not harmful")):
136
- # Return 1 - score (if 95% not harmful, then 5% harmful)
137
- return float(1.0 - item["score"])
 
 
 
 
 
 
 
 
 
 
138
 
139
- # Fallback: If model returns generic labels, assume lower score is safer
140
- # Sort by score descending and check if highest is harmful
141
- sorted_labels = sorted(labels, key=lambda x: x.get("score", 0), reverse=True)
142
- if sorted_labels:
143
- top_label = sorted_labels[0].get("label", "").lower()
144
- if any(k in top_label for k in ("hate", "toxic", "harmful")) and \
145
- not any(neg in top_label for neg in ("not", "no", "non")):
146
- return float(sorted_labels[0]["score"])
147
 
148
- # If still no match, return 0 (safe)
 
149
  return 0.0
 
150
  return 0.0
151
  except Exception as e:
152
  logger.warning("HF harm classifier failed", error=str(e))
 
106
  async def detect_harm(text: str) -> float:
107
  """Returns probability of harmful content (0-1). Non-fatal on failure.
108
 
109
+ The RoBERTa hate speech model returns two types of labels:
110
+ - Labels indicating HARMFUL content: 'hate', 'hateful', 'toxic', 'harmful', 'offensive'
111
+ - Labels indicating SAFE content: 'nothate', 'not hate', 'not harmful', 'safe', 'neutral'
112
 
113
+ CRITICAL: We must return the probability of HARMFUL content.
114
+ If the model says "95% nothate", we return 5% (1 - 0.95).
115
+ If the model says "95% hate", we return 95%.
116
  """
117
  if not settings.HF_HARM_CLASSIFIER:
118
  return 0.0
 
122
  if isinstance(result, list) and len(result) > 0:
123
  labels = result[0] if isinstance(result[0], list) else result
124
 
125
+ # Strategy: Find the label that clearly indicates harm status
126
+ harmful_score = None
127
+ safe_score = None
 
 
 
 
 
128
 
 
129
  for item in labels:
130
  label = item.get("label", "").lower()
131
+ score = float(item.get("score", 0))
132
+
133
+ # Check if this is a HARMFUL label (without negation)
134
+ is_harmful_label = any(k in label for k in ("hate", "hateful", "toxic", "harmful", "offensive", "label_1", "class_1"))
135
+ has_negation = any(neg in label for neg in ("not", "no", "non", "nothate"))
136
+
137
+ if is_harmful_label and not has_negation:
138
+ # This is a harmful label: use its score directly
139
+ harmful_score = score
140
+ break
141
+ elif has_negation or any(safe in label for safe in ("safe", "neutral", "label_0", "class_0")):
142
+ # This is a safe label: we'll invert it if needed
143
+ safe_score = score
144
 
145
+ # Return the harmful probability
146
+ if harmful_score is not None:
147
+ return round(harmful_score, 4)
148
+ elif safe_score is not None:
149
+ # If we only have a "safe" probability, return 1 - safe_probability
150
+ return round(1.0 - safe_score, 4)
 
 
151
 
152
+ # Fallback: If we can't determine, assume safe (return 0)
153
+ logger.warning("Could not determine harm classification from labels", labels=labels)
154
  return 0.0
155
+
156
  return 0.0
157
  except Exception as e:
158
  logger.warning("HF harm classifier failed", error=str(e))