Spaces:
Sleeping
Sleeping
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI(title="AMEP Multi-Layer Moderation")

# Both models are loaded once at import time so every request reuses them.
# NOTE(review): loading happens at module import — the first startup will
# download the model weights if they are not cached locally.

# Layer 1: toxicity classifier. top_k=None returns scores for ALL labels,
# not just the single best one, so the caller can threshold each category.
toxic_moderator = pipeline(
    "text-classification", model="unitary/toxic-bert", top_k=None
)

# Layer 2: zero-shot NLI classifier for policy categories (drugs/illegal
# content). Zero-shot lets us rank text against arbitrary labels we define
# per request, without fine-tuning.
safety_moderator = pipeline(
    "zero-shot-classification", model="facebook/bart-large-mnli"
)
class Content(BaseModel):
    """Request payload for the moderation endpoints.

    Attributes:
        text: The user-submitted text to moderate.
        user_id: Identifier of the submitting user.
    """

    text: str
    user_id: str
def check_content_safety(
    text: str,
    toxicity_threshold: float = 0.7,
    safety_threshold: float = 0.6,
) -> dict:
    """Run *text* through both moderation layers and return a verdict.

    Args:
        text: The content to screen.
        toxicity_threshold: Minimum per-category score from the toxicity
            model that triggers rejection (default 0.7, as before).
        safety_threshold: Minimum zero-shot confidence on a harmful label
            that triggers rejection (default 0.6, as before).

    Returns:
        ``{"is_safe": True}`` when the text passes both layers, otherwise
        ``{"is_safe": False, "reason": <str>}``.
    """
    # --- Layer 1: Toxicity check ---
    # top_k=None makes the pipeline return one score per toxicity category;
    # [0] selects the result list for the single input text.
    toxic_results = toxic_moderator(text)[0]
    for category in toxic_results:
        if category['score'] > toxicity_threshold:
            return {"is_safe": False, "reason": f"Toxic ({category['label']})"}

    # --- Layer 2: Illegal substance / harmful-content check ---
    # The zero-shot model ranks the text against these candidate labels;
    # the first two are "harmful" outcomes, the last two are benign
    # outcomes the model can prefer instead (reduces false positives on
    # educational/academic discussion).
    candidate_labels = [
        "harmful instruction",
        "promotion of drugs or self-harm",
        "educational or preventive discussion",
        "benign academic content",
    ]
    safety_results = safety_moderator(text, candidate_labels)
    top_label = safety_results['labels'][0]
    top_score = safety_results['scores'][0]

    # BUG FIX: the original compared top_label against "drug use or sales"
    # and "illegal activity" — strings that never appear in
    # candidate_labels — so this layer could never reject anything.
    # Reject when the model's top pick is one of the harmful candidates.
    harmful_labels = ("harmful instruction", "promotion of drugs or self-harm")
    if top_label in harmful_labels and top_score > safety_threshold:
        return {"is_safe": False, "reason": "Prohibited Content (Drugs/Illegal)"}

    return {"is_safe": True}
# NOTE(review): the route decorator appears to have been lost when this
# file was pasted — the comment further down refers to these handlers as
# endpoints, so without registration they are dead code. Path inferred to
# mirror "/moderate/answer" — TODO confirm against the original deployment.
@app.post("/moderate/question")
async def moderate_question(content: Content):
    """Moderate a user-submitted question.

    Returns a status dict: ``rejected`` (with the flag reason) when
    check_content_safety fails the text, ``approved`` otherwise.
    """
    result = check_content_safety(content.text)
    if not result["is_safe"]:
        return {"status": "rejected", "message": f"Question flagged: {result['reason']}"}
    return {"status": "approved", "message": "Question is safe"}
# NOTE(review): route decorator restored — the original comment here
# ("keep the /moderate/answer endpoint the same") names this path
# explicitly, but registration was lost in the paste.
@app.post("/moderate/answer")
async def moderate_answer(content: Content):
    """Moderate a user-submitted answer.

    Same two-layer screening as the question endpoint, via
    check_content_safety; only the response messages differ.
    """
    result = check_content_safety(content.text)
    if not result["is_safe"]:
        return {"status": "rejected", "message": f"Answer contains {result['reason']}"}
    return {"status": "approved", "message": "Answer verified"}