# text_moderator / app.py
# Author: Samyak000 — "Update app.py" (commit 6cab860, verified)
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline
app = FastAPI(title="AMEP Multi-Layer Moderation")
# 1. Toxicity Model (Your existing one)
toxic_moderator = pipeline("text-classification", model="unitary/toxic-bert", top_k=None)
# 2. Safety Model (Zero-Shot for Drugs/Illegal content)
# This model can detect categories you define yourself
safety_moderator = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
class Content(BaseModel):
text: str
user_id: str
def check_content_safety(text: str):
# --- Layer 1: Toxicity Check ---
toxic_results = toxic_moderator(text)[0]
for category in toxic_results:
if category['score'] > 0.7:
return {"is_safe": False, "reason": f"Toxic ({category['label']})"}
# --- Layer 2: Illegal Substance/Drug Check ---
# We ask the model to rank the text against these specific labels
candidate_labels = [
"harmful instruction",
"promotion of drugs or self-harm",
"educational or preventive discussion",
"benign academic content"
]
safety_results = safety_moderator(text, candidate_labels)
# If the top result is "drug use" or "illegal" with high confidence, reject it
top_label = safety_results['labels'][0]
top_score = safety_results['scores'][0]
if top_label in ["drug use or sales", "illegal activity"] and top_score > 0.6:
return {"is_safe": False, "reason": "Prohibited Content (Drugs/Illegal)"}
return {"is_safe": True}
@app.post("/moderate/question")
async def moderate_question(content: Content):
    """Moderate a submitted question; approve or reject via the safety layers."""
    verdict = check_content_safety(content.text)
    if verdict["is_safe"]:
        return {"status": "approved", "message": "Question is safe"}
    return {"status": "rejected", "message": f"Question flagged: {verdict['reason']}"}
# The /moderate/answer endpoint mirrors /moderate/question, reusing check_content_safety.
@app.post("/moderate/answer")
async def moderate_answer(content: Content):
    """Moderate a submitted answer; approve or reject via the safety layers."""
    verdict = check_content_safety(content.text)
    if verdict["is_safe"]:
        return {"status": "approved", "message": "Answer verified"}
    return {"status": "rejected", "message": f"Answer contains {verdict['reason']}"}