Spaces:

chariscait
/

EmoSphere

Sleeping

File size: 8,207 Bytes

"""Face Emotion Detector — Real inference using EfficientNet or MobileNet.

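Backends:
  1. transformers (HuggingFace) — the inference backend implemented in this
     module; most accurate, GPU recommended
  2. ONNX Runtime — fastest CPU inference (not wired up in this module)
  3. MediaPipe + OpenCV — lightweight fallback; only the OpenCV Haar cascade
     is used here (for face cropping), mediapipe is imported but unused

If no classifier can be loaded, the detector returns randomly simulated scores.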
"""

from __future__ import annotations

import time
import io
from pathlib import Path
from typing import Optional

import numpy as np

try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False

try:
    from PIL import Image
    HAS_PIL = True
except ImportError:
    HAS_PIL = False

try:
    from transformers import pipeline
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

try:
    import mediapipe as mp
    HAS_MEDIAPIPE = True
except ImportError:
    HAS_MEDIAPIPE = False

from models import (
    EmotionLabel, EMOTION_LABELS, EmotionScore,
    EmotionDetectionResult, CulturalRegion, CULTURAL_ADJUSTMENT,
)


# Translation table from the lowercase class names emitted by the FER-style
# classifiers to EmoSphere's own emotion labels.
FER_TO_EMOSPHERE = {
    "angry": EmotionLabel.ANGER,
    "disgust": EmotionLabel.DISGUST,
    "fear": EmotionLabel.FEAR,
    "happy": EmotionLabel.JOY,
    "sad": EmotionLabel.SADNESS,
    "surprise": EmotionLabel.SURPRISE,
    "neutral": EmotionLabel.NEUTRAL,
}

# HuggingFace checkpoints (tested, public, no auth needed).
# Index 0 is the default model; index 1 is the fallback tried when the
# requested model cannot be loaded.
FACE_MODELS = [
    # ViT, good accuracy
    "trpakov/vit-face-expression",
    # Noted as EfficientNet based — TODO confirm against the model card
    "dima806/facial_emotions_image_detection",
]


class FaceEmotionDetector:
    """Classify facial emotion with a HuggingFace image-classification pipeline.

    Typical use is ``detector.load()`` followed by repeated ``detect`` calls.
    ``detect`` never loads the model on its own: whenever no classifier is
    available (``load`` was not called, every model failed to load, or
    transformers / PIL are missing) it returns randomly simulated scores.
    """

    def __init__(self, model_name: str | None = None, device: str = "cpu"):
        """Store configuration; no model is loaded until :meth:`load` runs.

        Args:
            model_name: HuggingFace model id. Defaults to ``FACE_MODELS[0]``.
            device: Device specifier forwarded to ``transformers.pipeline``.
        """
        self.model_name = model_name or FACE_MODELS[0]
        self.device = device
        self.pipe = None
        self.face_cascade = None
        self.loaded = False

    def _build_pipeline(self, model_name: str):
        """Create an image-classification pipeline that returns every class."""
        return pipeline(
            "image-classification",
            model=model_name,
            device=self.device,
            top_k=None,  # Return all classes, not just the best one
        )

    def load(self) -> None:
        """Load the face cropper and the emotion classifier (idempotent).

        Failures are reported on stdout rather than raised; if no classifier
        can be loaded, ``self.pipe`` stays ``None`` and ``detect`` simulates.
        """
        if self.loaded:
            return

        # Face detector (OpenCV Haar cascade) used to crop the face region.
        if HAS_CV2:
            cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
            cascade = cv2.CascadeClassifier(cascade_path)
            # A missing/corrupt XML does not raise here; it yields an empty
            # classifier whose detectMultiScale would fail later. Drop it so
            # _detect_face falls back to the full image instead.
            if cascade.empty():
                print(f"[FaceDetector] Could not load face cascade: {cascade_path}")
            else:
                self.face_cascade = cascade

        # Emotion classifier
        if HAS_TRANSFORMERS:
            try:
                self.pipe = self._build_pipeline(self.model_name)
                print(f"[FaceDetector] Loaded model: {self.model_name}")
            except Exception as e:
                print(f"[FaceDetector] Failed to load {self.model_name}: {e}")
                fallback = FACE_MODELS[1]
                if fallback == self.model_name:
                    # The fallback is the model that just failed; retrying
                    # the identical load would only repeat the error.
                    print(f"[FaceDetector] All models failed: {e}")
                    print("[FaceDetector] Running in simulation mode")
                else:
                    try:
                        self.pipe = self._build_pipeline(fallback)
                        self.model_name = fallback
                        print(f"[FaceDetector] Loaded fallback: {self.model_name}")
                    except Exception as e2:
                        print(f"[FaceDetector] All models failed: {e2}")
                        print("[FaceDetector] Running in simulation mode")
        else:
            print("[FaceDetector] transformers not available, simulation mode")

        self.loaded = True

    def _decode_image(self, image_data: bytes) -> Optional[Image.Image]:
        """Decode encoded image bytes to an RGB PIL image.

        Returns ``None`` when PIL is unavailable or the bytes cannot be
        decoded (best-effort: the caller then falls back to simulation).
        """
        if not HAS_PIL:
            return None
        try:
            return Image.open(io.BytesIO(image_data)).convert("RGB")
        except Exception:
            return None

    def _array_to_image(self, array: np.ndarray) -> Optional[Image.Image]:
        """Convert a pixel array to an RGB PIL image.

        Non-integer arrays whose maximum is <= 1.0 are treated as normalized
        and scaled by 255. Integer arrays are taken to already be on the
        0-255 scale, so a near-black uint8 frame is not blown up.

        Returns ``None`` for empty arrays or arrays PIL cannot interpret.
        """
        if not HAS_PIL:
            return None
        array = np.asarray(array)
        if array.size == 0:
            return None
        if not np.issubdtype(array.dtype, np.integer) and array.max() <= 1.0:
            array = array * 255
        try:
            # Convert to RGB so the result matches _decode_image and is a
            # valid input for the RGB->gray conversion in _detect_face.
            return Image.fromarray(array.astype(np.uint8)).convert("RGB")
        except (TypeError, ValueError):
            return None

    def _detect_face(self, image: Image.Image) -> Optional[Image.Image]:
        """Crop the largest detected face (with 20% padding) out of *image*.

        Returns the unmodified image when OpenCV or the cascade is not
        available, or when no face is found.
        """
        if not HAS_CV2 or self.face_cascade is None:
            return image

        img_array = np.array(image.convert("RGB"))
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        faces = self.face_cascade.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(48, 48)
        )

        # `faces` may be a NumPy array, so test its length rather than its
        # truthiness.
        if len(faces) == 0:
            return image  # No face found, use full image

        # Largest face by area; cast NumPy integers to plain ints for PIL.
        x, y, w, h = (int(v) for v in max(faces, key=lambda f: f[2] * f[3]))
        pad = int(max(w, h) * 0.2)
        height, width = img_array.shape[:2]
        left = max(0, x - pad)
        top = max(0, y - pad)
        right = min(width, x + w + pad)
        bottom = min(height, y + h + pad)

        return image.crop((left, top, right, bottom))

    def _map_scores(
        self, predictions: list[dict], cultural_region: CulturalRegion
    ) -> dict[EmotionLabel, float]:
        """Map classifier predictions to a normalized EmoSphere distribution.

        Args:
            predictions: Items with ``"label"`` and ``"score"`` keys, as
                returned by the classification pipeline.
            cultural_region: Key into ``CULTURAL_ADJUSTMENT``; a missing key
                means no adjustment.

        Returns:
            A score for every entry of ``EMOTION_LABELS``. The scores sum to
            1 unless every score is zero, in which case they are returned
            unnormalized.
        """
        scores: dict[EmotionLabel, float] = {label: 0.0 for label in EMOTION_LABELS}

        for pred in predictions:
            model_label = pred["label"].lower().strip()
            emo_label = FER_TO_EMOSPHERE.get(model_label)
            if emo_label is not None:
                # Keep the strongest score if several model classes map to
                # the same EmoSphere label.
                scores[emo_label] = max(scores[emo_label], pred["score"])

        # The heuristics below only ever raise a score. Plain assignment
        # would discard the model's own value (e.g. a strong "angry"
        # prediction being replaced by 0.3 * disgust).
        disgust = scores[EmotionLabel.DISGUST]
        if disgust > 0.2:
            scores[EmotionLabel.ANGER] = max(
                scores.get(EmotionLabel.ANGER, 0.0), disgust * 0.3
            )

        # LOVE and CALM have no model class; derive them from JOY / NEUTRAL.
        joy = scores[EmotionLabel.JOY]
        if joy > 0.3:
            scores[EmotionLabel.LOVE] = max(
                scores.get(EmotionLabel.LOVE, 0.0), joy * 0.15
            )
            scores[EmotionLabel.CALM] = max(
                scores.get(EmotionLabel.CALM, 0.0), joy * 0.1
            )
        neutral = scores[EmotionLabel.NEUTRAL]
        if neutral > 0.4:
            scores[EmotionLabel.CALM] = max(
                scores.get(EmotionLabel.CALM, 0.0), neutral * 0.3
            )

        # Cultural adjustment: raise each score to the power 1/factor.
        # Non-positive factors are skipped, since 1/factor would divide by
        # zero or produce a negative exponent applied to zero scores.
        factor = CULTURAL_ADJUSTMENT.get(cultural_region, 1.0)
        if factor > 0 and factor != 1.0:
            exponent = 1.0 / factor
            for label in EMOTION_LABELS:
                scores[label] = min(scores[label] ** exponent, 1.0)

        # Normalize
        total = sum(scores.values())
        if total > 0:
            scores = {k: v / total for k, v in scores.items()}

        return scores

    def _simulate(self) -> dict[EmotionLabel, float]:
        """Return a random score distribution (fallback when no model runs)."""
        raw = np.random.dirichlet(np.ones(len(EMOTION_LABELS)) * 0.5)
        return {label: float(raw[i]) for i, label in enumerate(EMOTION_LABELS)}

    def detect(
        self,
        image_data: bytes | np.ndarray,
        cultural_region: CulturalRegion = CulturalRegion.UNIVERSAL,
    ) -> EmotionDetectionResult:
        """Detect emotion from a face image.

        Args:
            image_data: Encoded image bytes (any bytes-like object), or a
                pixel array. Arrays are presumably RGB-ordered, since they
                are handed straight to PIL — confirm against callers.
            cultural_region: Region used for the cultural score adjustment.

        Returns:
            An ``EmotionDetectionResult``. Scores are randomly simulated when
            no classifier is loaded, PIL is missing, or the input cannot be
            turned into an image.
        """
        start = time.perf_counter()

        scores: dict[EmotionLabel, float] | None = None
        if self.pipe is not None and HAS_PIL:
            if isinstance(image_data, (bytes, bytearray, memoryview)):
                image = self._decode_image(bytes(image_data))
            else:
                image = self._array_to_image(image_data)

            if image is not None:
                # Crop to the face (or keep the full image), then classify.
                face = self._detect_face(image)
                predictions = self.pipe(face)
                scores = self._map_scores(predictions, cultural_region)

        if scores is None:
            scores = self._simulate()

        # Build result
        emotion_scores = [
            EmotionScore(label=label, score=scores[label], confidence=scores[label] * 0.85)
            for label in EMOTION_LABELS
        ]
        dominant = max(scores, key=lambda label: scores[label])

        return EmotionDetectionResult(
            dominant=dominant,
            dominant_score=scores[dominant],
            scores=emotion_scores,
            modality="face",
            confidence=scores[dominant] * 0.80,
            processing_time_ms=(time.perf_counter() - start) * 1000,
            cultural_region=cultural_region,
        )