Spaces:
Running
Running
"""
Feedback Moderation API
=======================
AI-powered microservice that detects toxic / abusive language in text
using local Hugging Face models, enhanced with an optional LLM
verification layer.

Architecture
------------
1. **Local classifier** – fast, private, runs entirely on your machine.
   * *Multilingual model* – English, French, Italian, and other languages.
   * *Dedicated Arabic model* – higher accuracy for Arabic text.
2. **LLM verification** (optional) – when ``OPENROUTER_API_KEY`` is set,
   Arabic text is also sent to an OpenAI-compatible LLM endpoint
   (``ARABIC_LLM_BASE_URL``) for a second opinion, and its score is
   blended with the local model's score. Other languages are judged by
   the local classifier alone.
3. **Language detection** – powered by *lingua-language-detector*.
"""
| import json | |
| import logging | |
| import os | |
| import re | |
| from contextlib import asynccontextmanager | |
| from dotenv import load_dotenv | |
| from fastapi import Depends, FastAPI, HTTPException, Security | |
| from fastapi.security import APIKeyHeader | |
| from lingua import Language, LanguageDetectorBuilder | |
| from openai import OpenAI | |
| from pydantic import BaseModel, Field | |
| from transformers import pipeline | |
# Root logging is configured once at import time; every record carries a
# timestamp, the level and the emitting logger's name.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Service-wide logger used by the rest of this module.
logger = logging.getLogger("moderator")
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Load a local .env file (if any) before the os.getenv() lookups below run.
load_dotenv()

# Shared secret that callers must send in the X-API-Key header.
_PLACEHOLDER_API_KEY = "change-me-before-production"
API_KEY: str = os.getenv("MODERATOR_API_KEY", _PLACEHOLDER_API_KEY)
if API_KEY == _PLACEHOLDER_API_KEY:
    # The placeholder is public knowledge; running with it means the
    # moderation endpoint is effectively unauthenticated.
    logger.warning(
        "MODERATOR_API_KEY is unset or still the placeholder value - "
        "set a strong secret before exposing this service."
    )

# Minimum toxic probability at which text is flagged, per language family.
# A non-numeric value raises ValueError at import time (fail fast).
TOXICITY_THRESHOLD: float = float(os.getenv("TOXICITY_THRESHOLD", "0.70"))
ARABIC_TOXICITY_THRESHOLD: float = float(
    os.getenv("ARABIC_TOXICITY_THRESHOLD", "0.45")
)

# Hugging Face model identifiers for the two local classifiers.
MODEL_NAME: str = os.getenv(
    "MODEL_NAME", "citizenlab/distilbert-base-multilingual-cased-toxicity"
)
ARABIC_MODEL_NAME: str = os.getenv(
    "ARABIC_MODEL_NAME", "Hate-speech-CNERG/dehatebert-mono-arabic"
)

# LLM verification settings.
# OPENROUTER_API_KEY is the credential for the client that talks to
# ARABIC_LLM_BASE_URL; when it is unset the LLM layer is disabled.
OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
ARABIC_LLM_MODEL_NAME: str = os.getenv(
    "ARABIC_LLM_MODEL_NAME", "glm-5.1"
)
ARABIC_LLM_BASE_URL: str = os.getenv(
    "ARABIC_LLM_BASE_URL",
    "https://opencode.ai/zen/go/v1",
)
# NOTE(review): the two grey-zone bounds below are not referenced by any code
# visible in this file - confirm whether they are still needed.
LLM_VERIFY_LOW: float = float(os.getenv("LLM_VERIFY_LOW", "0.40"))
LLM_VERIFY_HIGH: float = float(os.getenv("LLM_VERIFY_HIGH", "0.85"))
# ---------------------------------------------------------------------------
# Language detector – lightweight, built once at import time
# ---------------------------------------------------------------------------
# Only these four languages are candidates for detection.
_DETECTOR_LANGUAGES = (
    Language.ARABIC,
    Language.ENGLISH,
    Language.FRENCH,
    Language.ITALIAN,
)
_detector_builder = LanguageDetectorBuilder.from_languages(*_DETECTOR_LANGUAGES)
language_detector = _detector_builder.with_preloaded_language_models().build()

# ---------------------------------------------------------------------------
# AI Models – loaded once at startup via the lifespan context manager
# ---------------------------------------------------------------------------
# All three stay None until the lifespan handler populates them.
# Multilingual classifier (en / fr / it / …)
toxicity_classifier = None
# Dedicated Arabic hate-speech classifier
arabic_classifier = None
# OpenAI-compatible client used for the Arabic LLM second opinion
arabic_llm_client: OpenAI | None = None
def _load_pipeline_with_retry(task: str, model: str, max_retries: int = 5):
    """Load a HF pipeline with exponential backoff for rate-limit errors.

    Args:
        task: Pipeline task name handed to ``pipeline``.
        model: Model identifier; also passed as the tokenizer identifier.
        max_retries: Total number of load attempts. Must be at least 1.

    Returns:
        Whatever ``pipeline`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously this
            skipped the loop and silently returned ``None``).
        OSError, ValueError: Re-raised from ``pipeline`` when the error is
            not a rate limit, or when the final attempt is also rate-limited.
    """
    import time

    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            return pipeline(
                task,
                model=model,
                tokenizer=model,
            )
        except (OSError, ValueError) as exc:
            # A rate limit is recognised by "429" appearing in the error
            # text; anything else, or a failure on the last attempt, is fatal.
            if "429" not in str(exc) or attempt >= max_retries:
                raise
            # With the default of 5 attempts the waits are 2, 4, 8, 16 s
            # (the fifth failure is re-raised instead of sleeping).
            wait = 2 ** attempt
            logger.warning(
                "Rate-limited loading %s (attempt %d/%d). "
                "Retrying in %ds …",
                model, attempt, max_retries, wait,
            )
            time.sleep(wait)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load both AI models when the server starts; release on shutdown.

    Everything before ``yield`` runs at startup: both local classifiers are
    loaded and, if ``OPENROUTER_API_KEY`` is set, the LLM client is created.
    The ``finally`` clause resets the module globals to ``None`` on shutdown,
    including when startup or the running application raises.
    """
    global toxicity_classifier, arabic_classifier, arabic_llm_client

    try:
        toxicity_classifier = _load_pipeline_with_retry(
            "text-classification", MODEL_NAME,
        )
        arabic_classifier = _load_pipeline_with_retry(
            "text-classification", ARABIC_MODEL_NAME,
        )

        # Set up the OpenAI-compatible client for the Arabic LLM (optional).
        if OPENROUTER_API_KEY:
            arabic_llm_client = OpenAI(
                base_url=ARABIC_LLM_BASE_URL,
                api_key=OPENROUTER_API_KEY,
            )
            logger.info(
                "Arabic LLM verification enabled (model=%s, base_url=%s)",
                ARABIC_LLM_MODEL_NAME,
                ARABIC_LLM_BASE_URL,
            )
        else:
            arabic_llm_client = None
            logger.info(
                "Arabic LLM verification disabled – set OPENROUTER_API_KEY to enable"
            )

        yield
    finally:
        # Drop the references so the models can be garbage-collected.
        toxicity_classifier = None
        arabic_classifier = None
        arabic_llm_client = None
# ---------------------------------------------------------------------------
# FastAPI Application
# ---------------------------------------------------------------------------
# The description below is rendered as Markdown in the generated OpenAPI docs.
# Its LLM section describes what the code actually does: Arabic text is sent
# to the LLM whenever OPENROUTER_API_KEY is configured, blended 30/70.
app = FastAPI(
    title="Wasla Feedback Moderation API",
    description=(
        "## Overview\n\n"
        "AI-powered content moderation microservice that detects **toxic, abusive, "
        "hateful, and offensive language** in user-submitted text across multiple languages.\n\n"
        "### 🏗️ Architecture — Three Layers of AI\n\n"
        "| Layer | Purpose | Latency |\n"
        "| ----- | ------- | ------- |\n"
        "| **Local Multilingual Model** | Fast classification for English, French, Italian & more | ~50 ms |\n"
        "| **Dedicated Arabic Model** | Higher accuracy for Arabic text (87.8 % val. accuracy) | ~50 ms |\n"
        "| **LLM Verification** *(optional)* | Second opinion for Arabic text | ~1–3 s |\n\n"
        "### 🌍 Supported Languages\n\n"
        "| Language | ISO Code | Model |\n"
        "| -------- | -------- | ----- |\n"
        "| English | `en` | Multilingual |\n"
        "| Arabic | `ar` | Dedicated Arabic |\n"
        "| French | `fr` | Multilingual |\n"
        "| Italian | `it` | Multilingual |\n\n"
        "### 🔒 Authentication\n\n"
        "All moderation endpoints require an **API key** sent via the `X-API-Key` header. "
        "The health endpoint is public.\n\n"
        "### 🤖 LLM Verification\n\n"
        "When `OPENROUTER_API_KEY` is set, Arabic text is also forwarded to an "
        "OpenAI-compatible LLM endpoint for a second opinion, and a blended score "
        "(30 % local model + 70 % LLM) is returned. Other languages are judged by "
        "the local model alone.\n\n"
        "---\n"
        "*Data never leaves your infrastructure (except optional calls to the configured LLM endpoint).*"
    ),
    version="1.2.0",
    lifespan=lifespan,
    openapi_tags=[
        {
            "name": "Health",
            "description": "Server health and readiness checks. No authentication required.",
        },
        {
            "name": "Moderation",
            "description": (
                "Core content moderation endpoints. Submit text and receive a "
                "toxicity verdict with confidence score, detected language, and "
                "LLM verification status."
            ),
        },
    ],
    license_info={
        "name": "MIT",
        "url": "https://opensource.org/licenses/MIT",
    },
    contact={
        "name": "Wasla API Support",
    },
)
# ---------------------------------------------------------------------------
# Security – API Key via X-API-Key header
# ---------------------------------------------------------------------------
# Text shown for this security scheme in the generated API docs.
_API_KEY_HEADER_DOC = (
    "Your secret API key. Send it in the `X-API-Key` header with every "
    "request to the moderation endpoint. Set via the `MODERATOR_API_KEY` "
    "environment variable on the server."
)

# Security scheme that reads the caller's key from the X-API-Key header.
api_key_header = APIKeyHeader(
    description=_API_KEY_HEADER_DOC,
    auto_error=True,
    name="X-API-Key",
)
def verify_api_key(api_key: str = Security(api_key_header)) -> str:
    """Reject requests that do not carry a valid API key.

    Args:
        api_key: Value of the ``X-API-Key`` header, injected by FastAPI.

    Returns:
        The supplied key, unchanged, when it matches ``API_KEY``.

    Raises:
        HTTPException: 403 when the supplied key does not match.
    """
    import hmac

    # Constant-time comparison so response timing does not reveal how many
    # leading characters of a guess are correct. Both sides are encoded to
    # bytes because compare_digest rejects non-ASCII str arguments.
    supplied = api_key.encode("utf-8")
    expected = API_KEY.encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(
            status_code=403,
            detail="Invalid or missing API key.",
        )
    return api_key
# ---------------------------------------------------------------------------
# Request / Response schemas
# ---------------------------------------------------------------------------
# Sample values surfaced in the generated JSON schema for the `text` field.
_FEEDBACK_TEXT_EXAMPLES = [
    "This product is terrible and I hate everything about it!",
    "شكراً لكم على الخدمة الممتازة",
    "Merci beaucoup pour votre aide",
]

# Sample request bodies surfaced in the generated JSON schema for the model.
_FEEDBACK_BODY_EXAMPLES = [
    {"text": "You are stupid and worthless"},
    {"text": "أنت حيوان قذر"},
    {"text": "Thank you for the great service!"},
]


class FeedbackRequest(BaseModel):
    """Incoming text to moderate."""

    # Required, 1 to 5000 characters.
    text: str = Field(
        ...,
        description=(
            "The user-submitted text to analyse for toxicity. "
            "Supports English, Arabic, French, and Italian. "
            "The language is auto-detected and routed to the "
            "appropriate model. Maximum 5 000 characters."
        ),
        max_length=5000,
        min_length=1,
        json_schema_extra={"examples": _FEEDBACK_TEXT_EXAMPLES},
    )

    model_config = {"json_schema_extra": {"examples": _FEEDBACK_BODY_EXAMPLES}}
class ModerationResponse(BaseModel):
    """
    Toxicity analysis result.

    The response contains a boolean verdict (`has_bad_words`), a numeric
    confidence score, the raw classifier label, the detected language,
    and whether the optional LLM verification layer was used.
    """

    # Verdict: the endpoint sets this to `confidence >= threshold`.
    has_bad_words: bool = Field(
        ...,
        description=(
            "**True** when the text is classified as toxic with confidence "
            "at or above the configured threshold (default 0.70, or 0.45 for Arabic). "
            "Use this as the primary flag to accept or reject user content."
        ),
    )
    # Toxic probability; validated to lie within [0, 1].
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description=(
            "Probability that the text is toxic, normalised to a 0.0 – 1.0 scale. "
            "**0.0** = certainly clean, **1.0** = certainly toxic. "
            "When LLM verification is used, this is a blended score "
            "(30 % local model + 70 % LLM)."
        ),
    )
    # Classifier label; replaced by the LLM verdict when the LLM was used.
    label: str = Field(
        ...,
        description=(
            "Raw label from the classifier. Possible values depend on the model:\n"
            "- Multilingual model: `toxic` or `not_toxic`\n"
            "- Arabic model: `HATE` or `NON_HATE` "
            "(taken from the LLM verdict when LLM verification is used)"
        ),
    )
    detected_language: str = Field(
        ...,
        description=(
            "ISO 639-1 language code detected in the input text. "
            "Possible values: `en` (English), `ar` (Arabic), "
            "`fr` (French), `it` (Italian), or `unknown`."
        ),
    )
    # Defaults to False; set only when the LLM produced a usable verdict.
    llm_verified: bool = Field(
        False,
        description=(
            "**True** when the LLM verification layer was invoked to "
            "refine the prediction. This happens for Arabic text when "
            "`OPENROUTER_API_KEY` is configured and the LLM returns a usable verdict."
        ),
    )

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "has_bad_words": True,
                    "confidence": 0.9944,
                    "label": "toxic",
                    "detected_language": "en",
                    "llm_verified": False,
                },
                {
                    "has_bad_words": True,
                    "confidence": 0.7728,
                    "label": "HATE",
                    "detected_language": "ar",
                    "llm_verified": True,
                },
                {
                    "has_bad_words": False,
                    "confidence": 0.0015,
                    "label": "not_toxic",
                    "detected_language": "en",
                    "llm_verified": False,
                },
            ]
        }
    }
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
# Lookup table: lingua Language member → ISO 639-1 code.
_LANG_ISO = {
    Language.ARABIC: "ar",
    Language.ENGLISH: "en",
    Language.FRENCH: "fr",
    Language.ITALIAN: "it",
}


def _detect_language(text: str) -> str:
    """Map *text* to an ISO 639-1 code via the detector; 'unknown' if unmapped."""
    language = language_detector.detect_language_of(text)
    try:
        return _LANG_ISO[language]
    except KeyError:
        # Anything the table does not list is reported as "unknown".
        return "unknown"
def _normalise_toxic_score(label: str, score: float, is_arabic: bool) -> float:
    """
    Collapse a ``(label, score)`` prediction into one toxic probability.

    The score is returned unchanged when *label* is the model's toxic label
    (``HATE`` for the Arabic model, ``toxic`` for the multilingual one);
    for any other label the complement ``1.0 - score`` is returned.
    """
    toxic_label = "HATE" if is_arabic else "toxic"
    if label == toxic_label:
        return score
    # The score is the confidence in a non-toxic label, so invert it.
    return 1.0 - score
| _ARABIC_LLM_SYSTEM_PROMPT = ( | |
| "You are an Arabic content moderation assistant. " | |
| "Given Arabic text, decide if it contains toxic, abusive, hateful, or obscene language.\n" | |
| "IMPORTANT: Reply with ONLY a JSON object, nothing else. No explanation, no markdown, no thinking aloud.\n" | |
| "Example reply: {\"toxic\": true, \"score\": 0.92}\n" | |
| "\"toxic\" is boolean. \"score\" is your confidence from 0.0 (clean) to 1.0 (toxic).\n" | |
| "Now reply with JSON only." | |
| ) | |
| def _arabic_llm_verify(text: str) -> dict | None: | |
| """ | |
| Ask the OpenRouter Arabic LLM whether *text* is toxic. | |
| Returns ``{"toxic": bool, "score": float}`` on success, | |
| or ``None`` if the LLM is unavailable / returns garbage. | |
| """ | |
| if arabic_llm_client is None: | |
| logger.warning("arabic_llm_client is None – skipping LLM call") | |
| return None | |
| try: | |
| logger.info("Calling Arabic LLM model=%s base_url=%s", ARABIC_LLM_MODEL_NAME, ARABIC_LLM_BASE_URL) | |
| response = arabic_llm_client.chat.completions.create( | |
| model=ARABIC_LLM_MODEL_NAME, | |
| messages=[ | |
| {"role": "system", "content": _ARABIC_LLM_SYSTEM_PROMPT}, | |
| {"role": "user", "content": text}, | |
| ], | |
| max_tokens=2048, | |
| temperature=0.0, | |
| ) | |
| if not response.choices: | |
| logger.warning("No choices in LLM response") | |
| return None | |
| choice = response.choices[0] | |
| logger.info("LLM finish_reason=%s", choice.finish_reason) | |
| # The model may put the answer in reasoning_content (thinking) or content | |
| raw_raw = choice.message.content | |
| reasoning = getattr(choice.message, "reasoning_content", None) | |
| raw = None | |
| if raw_raw and raw_raw.strip(): | |
| raw = raw_raw.strip() | |
| elif reasoning and reasoning.strip(): | |
| raw = reasoning.strip() | |
| if raw is None: | |
| logger.warning("LLM returned empty content and reasoning") | |
| return None | |
| # Try to extract JSON from the response even if wrapped in markdown | |
| json_match = re.search(r"\{.*\}", raw, re.DOTALL) | |
| if not json_match: | |
| logger.warning("LLM returned non-JSON: %s", raw[:200]) | |
| return None | |
| result = json.loads(json_match.group()) | |
| if "toxic" in result and "score" in result: | |
| logger.info("LLM result: toxic=%s score=%s", result["toxic"], result["score"]) | |
| return { | |
| "toxic": bool(result["toxic"]), | |
| "score": float(result["score"]), | |
| } | |
| logger.warning("LLM JSON missing keys: %s", result) | |
| return None | |
| except Exception as exc: | |
| logger.exception("Arabic LLM verification failed: %s", exc) | |
| return None | |
| # --------------------------------------------------------------------------- | |
| # Endpoints | |
| # --------------------------------------------------------------------------- | |
# NOTE(review): no route decorator is visible above this function in this
# view of the file – confirm it is registered on `app`.
def health_check():
    """Liveness / readiness probe for orchestrators (Docker, K8s, etc.)."""
    # Each flag reports whether the matching module global is populated.
    components = {
        "multilingual_model_loaded": toxicity_classifier,
        "arabic_model_loaded": arabic_classifier,
        "arabic_llm_verification_enabled": arabic_llm_client,
    }
    # "status" is always "ok"; readiness is conveyed by the flags.
    report = {"status": "ok"}
    report.update((flag, obj is not None) for flag, obj in components.items())
    return report
# Upper bound on tokens fed to either classifier.
# presumably both default models accept at most 512 tokens – confirm if
# MODEL_NAME / ARABIC_MODEL_NAME are overridden.
_MAX_CLASSIFIER_TOKENS = 512


# NOTE(review): no route decorator is visible above this function in this
# view of the file – confirm it is registered on `app`.
def moderate_feedback(
    request: FeedbackRequest,
    _api_key: str = Depends(verify_api_key),
):
    """
    Run the AI toxicity classifier on the submitted text.

    Pipeline:
    1. Detect the language of the input text.
    2. Route Arabic → dedicated Arabic model, others → multilingual.
    3. Normalise confidence to a single toxic probability.
    4. Arabic text → always ask the LLM for moderation and blend scores.
    5. Compare the final score with the language-specific threshold.

    Raises:
        HTTPException: 503 while either local model is still unloaded.
    """
    if toxicity_classifier is None or arabic_classifier is None:
        raise HTTPException(
            status_code=503,
            detail="Models are not loaded yet. Try again shortly.",
        )

    # 1. Detect language
    lang = _detect_language(request.text)
    is_arabic = lang == "ar"

    # 2. Route to the appropriate model. Truncate at the tokenizer so that
    #    long inputs (up to 5000 characters are accepted) cannot exceed the
    #    model's maximum sequence length and fail the request.
    classifier = arabic_classifier if is_arabic else toxicity_classifier
    prediction = classifier(
        request.text,
        truncation=True,
        max_length=_MAX_CLASSIFIER_TOKENS,
    )[0]
    label: str = prediction["label"]
    score: float = prediction["score"]

    # 3. Normalise to a single toxic probability. Kept in its own variable
    #    so the log line below reports the true local score even after
    #    `label` has been replaced by the LLM verdict.
    local_score = _normalise_toxic_score(label, score, is_arabic)
    toxic_score = local_score

    # 4. Arabic text → always ask the LLM for moderation
    llm_used = False
    if is_arabic and arabic_llm_client is not None:
        llm_result = _arabic_llm_verify(request.text)
        if llm_result is not None:
            llm_used = True
            # Bound the LLM score so the blended value stays inside the
            # [0, 1] range enforced by ModerationResponse.confidence.
            llm_score = min(1.0, max(0.0, llm_result["score"]))
            # Blend: 30 % local model + 70 % LLM (LLM is the primary judge for Arabic)
            toxic_score = 0.3 * local_score + 0.7 * llm_score
            # Override label with LLM verdict
            label = "HATE" if llm_result["toxic"] else "NON_HATE"
            logger.info(
                "Arabic LLM moderation: local=%.4f llm=%.4f blended=%.4f toxic=%s",
                local_score,
                llm_score,
                toxic_score,
                llm_result["toxic"],
            )
        else:
            logger.warning("Arabic LLM returned None – falling back to local model")
    elif is_arabic:
        logger.warning("Arabic text detected but arabic_llm_client is None (missing OPENROUTER_API_KEY?)")

    # 5. Apply language-specific threshold
    threshold = ARABIC_TOXICITY_THRESHOLD if is_arabic else TOXICITY_THRESHOLD
    is_bad = toxic_score >= threshold

    return ModerationResponse(
        has_bad_words=is_bad,
        confidence=round(toxic_score, 4),
        label=label,
        detected_language=lang,
        llm_verified=llm_used,
    )