Shreya Pal committed on
Commit
4bd6a99
·
1 Parent(s): 6325644

Swap moderation endpoint from OpenAI to free HuggingFace API

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. server/app.py +43 -64
requirements.txt CHANGED
@@ -15,4 +15,5 @@ scikit-learn==1.4.2
15
  huggingface_hub==0.23.0
16
  openai
17
  openenv-core
18
- python-dotenv
 
 
15
  huggingface_hub==0.23.0
16
  openai
17
  openenv-core
18
+ python-dotenv
19
+ requests
server/app.py CHANGED
@@ -4,6 +4,7 @@ from fastapi.middleware.cors import CORSMiddleware
4
  import os
5
 
6
  import json
 
7
  from openai import OpenAI
8
  from dotenv import load_dotenv
9
 
@@ -107,74 +108,52 @@ async def state():
107
  class ModerationRequest(BaseModel):
108
  text: str
109
 
110
- def openai_moderate(text: str, hf_scores: dict) -> dict:
111
- system_prompt = """You are an expert content moderation AI.
 
 
 
 
 
112
 
113
- You will receive:
114
- 1. The original text submitted by a user
115
- 2. Toxicity scores (0.0-1.0) from a HuggingFace RoBERTa model across harm categories
116
 
117
- Your job is to make a final moderation decision based on the FULL CONTEXT and INTENT of the text -- not just individual words or scores. Consider:
118
- - Sarcasm, irony, or dark humour that may look toxic but is not genuinely harmful
119
- - Coded language or subtle threats that low scores might miss
120
- - Context that changes meaning (e.g. "I'll destroy you at chess" vs a real threat)
121
- - Whether content targets a specific person or group maliciously
122
 
123
- Respond ONLY with a valid JSON object -- no markdown fences, no extra text:
124
- {
125
- "decision": "allow" or "flag" or "remove",
126
- "confidence": <float 0.0-1.0>,
127
- "explanation": "<1-2 sentence plain-English explanation of your reasoning>"
128
- }
129
 
130
- allow = safe content, no harm intended or likely
131
- flag = ambiguous, mildly toxic, sarcastic, context-dependent -- needs human review
132
- remove = clear hate speech, credible threats, targeted harassment, highly toxic content"""
 
 
 
133
 
134
- user_prompt = f"Text to moderate: {text}\n\nToxicity Scores:\n{json.dumps(hf_scores, indent=2)}"
135
 
136
- try:
137
- response = get_openai_client().chat.completions.create(
138
- model="gpt-4o-mini",
139
- messages=[
140
- {"role": "system", "content": system_prompt},
141
- {"role": "user", "content": user_prompt}
142
- ],
143
- response_format={"type": "json_object"},
144
- temperature=0.0
145
- )
146
- content = response.choices[0].message.content.strip()
147
-
148
- # Security strip markdown edges
149
- if content.startswith("```"):
150
- lines = content.split("\n")
151
- if lines[0].startswith("```"):
152
- lines = lines[1:]
153
- if lines and lines[-1].startswith("```"):
154
- lines = lines[:-1]
155
- content = "\n".join(lines).strip()
156
-
157
- data = json.loads(content)
158
-
159
- # Hard cap enforcements
160
- decision = data.get("decision", "flag").lower()
161
- if decision not in ["allow", "flag", "remove"]:
162
- decision = "flag"
163
-
164
- try:
165
- confidence = float(data.get("confidence", 0.5))
166
- except:
167
- confidence = 0.5
168
- confidence = max(0.0, min(1.0, confidence))
169
-
170
- return {
171
- "decision": decision,
172
- "confidence": confidence,
173
- "explanation": str(data.get("explanation", "Automatically flagged due to complex parsing context."))
174
- }
175
-
176
- except Exception as e:
177
- raise RuntimeError(f"OpenAI call failed: {e}")
178
 
179
  @app.post("/moderate")
180
  def moderate(request: ModerationRequest):
@@ -208,8 +187,8 @@ def moderate(request: ModerationRequest):
208
  "obscene": float(scores.get("obscene", 0.0))
209
  }
210
 
211
- # Stage 2: OpenAI Deep Reasoning
212
- llm_result = openai_moderate(text, ai_scores)
213
 
214
  return {
215
  "decision": llm_result["decision"],
 
4
  import os
5
 
6
  import json
7
+ import requests
8
  from openai import OpenAI
9
  from dotenv import load_dotenv
10
 
 
108
  class ModerationRequest(BaseModel):
109
  text: str
110
 
111
# Hosted text-generation endpoint used for the second moderation stage.
_HF_MODEL_URL = (
    "https://api-inference.huggingface.co/models/"
    "mistralai/Mistral-7B-Instruct-v0.3"
)
# Classifier categories forwarded to the LLM; any other keys are dropped.
_SCORE_KEYS = (
    "toxicity",
    "severe_toxicity",
    "insult",
    "threat",
    "obscene",
    "identity_attack",
)
_DECISIONS = ("allow", "flag", "remove")


def _build_moderation_prompt(text: str, scores: dict) -> str:
    """Render the Mistral [INST] prompt for *text* and its toxicity *scores*."""
    # The text is untrusted: drop instruction delimiters and JSON-quote it so
    # embedded quotes or "[/INST]" cannot escape the quoted text in the prompt.
    safe_text = text.replace("[INST]", "").replace("[/INST]", "")
    quoted_text = json.dumps(safe_text, ensure_ascii=False)
    return f"""<s>[INST] You are a content moderation AI. Given the text and toxicity scores below, respond ONLY with a JSON object — no markdown, no extra text.

Text: {quoted_text}
Toxicity scores: {json.dumps(scores)}

Rules:
- "allow" = safe, no harm intended
- "flag" = ambiguous, sarcastic, or mildly toxic
- "remove" = hate speech, threats, harassment

Respond with exactly this format:
{{"decision": "allow" or "flag" or "remove", "confidence": <0.0-1.0>, "explanation": "<1 sentence reason>"}} [/INST]"""


def _extract_json_object(text_out: str) -> dict:
    """Return the first JSON object embedded in *text_out*.

    Raises:
        ValueError: if no decodable JSON object is present.
    """
    decoder = json.JSONDecoder()
    start = text_out.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete value, so braces inside string
            # values (e.g. in the explanation) do not truncate the object.
            candidate, _ = decoder.raw_decode(text_out, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text_out.find("{", start + 1)
    raise ValueError(f"No JSON found in response: {text_out}")


def hf_moderate(text: str, hf_scores: dict) -> dict:
    """Ask the hosted Mistral model for a final moderation decision.

    Args:
        text: The user-submitted text being moderated.
        hf_scores: Toxicity scores keyed by category; only the categories in
            ``_SCORE_KEYS`` that are present are forwarded to the model.

    Returns:
        The JSON object produced by the model, with ``decision`` normalised to
        "allow" / "flag" / "remove" (falling back to "flag"), ``confidence``
        clamped to [0.0, 1.0] (falling back to 0.5) and ``explanation`` always
        present as a string.

    Raises:
        RuntimeError: if the HTTP request fails, returns an error status, or
            the response body is not JSON.
        ValueError: if the generated text contains no JSON object.
    """
    api_key = os.getenv("HF_TOKEN")
    # Omit the header rather than sending the literal "Bearer None".
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    filtered_scores = {
        key: round(hf_scores[key], 3) for key in _SCORE_KEYS if key in hf_scores
    }
    prompt = _build_moderation_prompt(text, filtered_scores)

    try:
        response = requests.post(
            _HF_MODEL_URL,
            headers=headers,
            json={
                "inputs": prompt,
                "parameters": {"max_new_tokens": 100, "return_full_text": False},
            },
            timeout=30,
        )
        # Report auth / rate-limit / model-loading errors as such instead of
        # letting the error payload fail later in JSON parsing.
        response.raise_for_status()
        raw = response.json()
    except (requests.RequestException, ValueError) as err:
        raise RuntimeError(f"HuggingFace call failed: {err}") from err

    if isinstance(raw, list) and raw and isinstance(raw[0], dict):
        text_out = str(raw[0].get("generated_text", ""))
    else:
        # Unexpected payload shape: keep it in the error message raised below.
        text_out = str(raw)

    result = _extract_json_object(text_out)

    decision = str(result.get("decision", "flag")).strip().lower()
    result["decision"] = decision if decision in _DECISIONS else "flag"

    try:
        confidence = float(result.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    if confidence != confidence:  # NaN would pass through min/max unchanged
        confidence = 0.5
    result["confidence"] = min(max(confidence, 0.0), 1.0)

    result["explanation"] = str(
        result.get("explanation", "No explanation provided.")
    )
    return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  @app.post("/moderate")
159
  def moderate(request: ModerationRequest):
 
187
  "obscene": float(scores.get("obscene", 0.0))
188
  }
189
 
190
+ # Stage 2: HuggingFace Deep Reasoning
191
+ llm_result = hf_moderate(text, ai_scores)
192
 
193
  return {
194
  "decision": llm_result["decision"],