Spaces:
Running
Running
| """Face Emotion Detector — Real inference using EfficientNet or MobileNet. | |
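| Backends (only the first is wired into FaceEmotionDetector below): | |
| 1. transformers (HuggingFace) — most accurate, GPU recommended | |
| 2. ONNX Runtime — fastest CPU inference (not used by FaceEmotionDetector) | |
| 3. MediaPipe + OpenCV — lightweight fallback (OpenCV is used only for face cropping; MediaPipe is imported but not used by FaceEmotionDetector) | |
| When no model can be loaded, FaceEmotionDetector falls back to randomly simulated scores. | |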
| """ | |
| from __future__ import annotations | |
| import time | |
| import io | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
| try: | |
| import cv2 | |
| HAS_CV2 = True | |
| except ImportError: | |
| HAS_CV2 = False | |
| try: | |
| from PIL import Image | |
| HAS_PIL = True | |
| except ImportError: | |
| HAS_PIL = False | |
| try: | |
| from transformers import pipeline | |
| HAS_TRANSFORMERS = True | |
| except ImportError: | |
| HAS_TRANSFORMERS = False | |
| try: | |
| import mediapipe as mp | |
| HAS_MEDIAPIPE = True | |
| except ImportError: | |
| HAS_MEDIAPIPE = False | |
| from models import ( | |
| EmotionLabel, EMOTION_LABELS, EmotionScore, | |
| EmotionDetectionResult, CulturalRegion, CULTURAL_ADJUSTMENT, | |
| ) | |
# Translation table: lower-case class name emitted by the FER classifier
# -> the EmotionLabel member used by the rest of EmoSphere.
# Labels missing from this table are ignored by FaceEmotionDetector._map_scores.
FER_TO_EMOSPHERE: dict[str, EmotionLabel] = {
    "angry": EmotionLabel.ANGER,
    "disgust": EmotionLabel.DISGUST,
    "fear": EmotionLabel.FEAR,
    "happy": EmotionLabel.JOY,
    "sad": EmotionLabel.SADNESS,
    "surprise": EmotionLabel.SURPRISE,
    "neutral": EmotionLabel.NEUTRAL,
}
# HuggingFace Hub model ids (original note: tested, public, no auth needed).
# Index 0 is the default model; index 1 is the fallback that
# FaceEmotionDetector.load() tries when the requested model fails to load.
FACE_MODELS: list[str] = [
    # ViT, good accuracy
    "trpakov/vit-face-expression",
    # Original note says "EfficientNet based" — NOTE(review): confirm against the model card
    "dima806/facial_emotions_image_detection",
]
class FaceEmotionDetector:
    """Face emotion detection backed by a HuggingFace image-classification pipeline.

    Call :meth:`load` once before :meth:`detect`. While no pipeline is loaded
    (``load`` not called, ``transformers``/PIL missing, or every model failed
    to load) :meth:`detect` returns randomly simulated scores instead of real
    predictions.
    """

    def __init__(self, model_name: str | None = None, device: str = "cpu"):
        """Store configuration only; nothing is loaded until :meth:`load`.

        Args:
            model_name: HuggingFace model id. Defaults to ``FACE_MODELS[0]``.
            device: Device argument forwarded to ``transformers.pipeline``.
        """
        self.model_name = model_name or FACE_MODELS[0]
        self.device = device
        self.pipe = None  # transformers pipeline, or None => simulation mode
        self.face_cascade = None  # OpenCV Haar cascade, or None => no cropping
        self.loaded = False

    def _build_pipeline(self, model_name: str):
        """Create an image-classification pipeline for *model_name* on ``self.device``."""
        return pipeline(
            "image-classification",
            model=model_name,
            device=self.device,
            top_k=None,  # return scores for every class, not just the best one
        )

    def load(self) -> None:
        """Load the face cropper and the emotion classification pipeline.

        Idempotent: a second call returns immediately. Failures are reported
        with ``print`` and leave the detector in simulation mode rather than
        raising.
        """
        if self.loaded:
            return

        # Face detector (OpenCV Haar cascade, used only to crop the face).
        if HAS_CV2:
            cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
            cascade = cv2.CascadeClassifier(cascade_path)
            if cascade.empty():
                # A missing/unreadable cascade file yields an empty classifier
                # whose detectMultiScale() raises; fall back to no cropping.
                print("[FaceDetector] Haar cascade failed to load, using full images")
                self.face_cascade = None
            else:
                self.face_cascade = cascade

        # Emotion classifier.
        if HAS_TRANSFORMERS:
            # Broad excepts are deliberate: any load failure (network, missing
            # weights, bad device) should degrade to the fallback/simulation.
            try:
                self.pipe = self._build_pipeline(self.model_name)
                print(f"[FaceDetector] Loaded model: {self.model_name}")
            except Exception as e:
                print(f"[FaceDetector] Failed to load {self.model_name}: {e}")
                # Try fallback model
                try:
                    self.pipe = self._build_pipeline(FACE_MODELS[1])
                    self.model_name = FACE_MODELS[1]
                    print(f"[FaceDetector] Loaded fallback: {self.model_name}")
                except Exception as e2:
                    print(f"[FaceDetector] All models failed: {e2}")
                    print("[FaceDetector] Running in simulation mode")
        else:
            print("[FaceDetector] transformers not available, simulation mode")

        self.loaded = True

    def _decode_image(self, image_data: bytes) -> Optional[Image.Image]:
        """Decode encoded image bytes to an RGB PIL image.

        Returns ``None`` when PIL is unavailable or the bytes cannot be decoded.
        """
        if not HAS_PIL:
            return None
        try:
            return Image.open(io.BytesIO(image_data)).convert("RGB")
        except Exception:
            # Best effort: the caller treats None as "fall back to simulation".
            return None

    def _to_pil(self, image_data) -> Optional[Image.Image]:
        """Convert raw ``detect`` input (encoded bytes or array) to an RGB PIL image.

        Arrays whose maximum is <= 1.0 are treated as 0-1 scaled and multiplied
        by 255; all arrays are clipped to 0-255 before the uint8 cast so that
        out-of-range values saturate instead of wrapping around.

        Returns ``None`` for empty or unconvertible input.
        """
        if isinstance(image_data, (bytes, bytearray, memoryview)):
            return self._decode_image(bytes(image_data))

        array = np.asarray(image_data)
        if array.size == 0:
            return None
        try:
            if array.max() <= 1.0:
                array = array * 255
            pixels = np.clip(array, 0, 255).astype(np.uint8)
            # convert("RGB") normalises grayscale/RGBA input so that the
            # RGB->gray conversion in _detect_face always sees three channels.
            return Image.fromarray(pixels).convert("RGB")
        except (TypeError, ValueError):
            return None

    def _detect_face(self, image: Image.Image) -> Optional[Image.Image]:
        """Crop the largest detected face (with 20% padding) out of *image*.

        Returns the full image unchanged when OpenCV or the cascade is
        unavailable, or when no face is found.
        """
        if not HAS_CV2 or self.face_cascade is None:
            return image

        img_array = np.array(image)
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        faces = self.face_cascade.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(48, 48)
        )
        # detectMultiScale may return an ndarray, whose truthiness is
        # ambiguous, so the explicit length check is required here.
        if len(faces) == 0:
            return image  # No face found, use full image

        # Use the largest face by area; cast to plain ints for PIL.
        x, y, w, h = (int(v) for v in max(faces, key=lambda f: f[2] * f[3]))

        # Add 20% padding, clamped to the image bounds.
        pad = int(max(w, h) * 0.2)
        height, width = img_array.shape[0], img_array.shape[1]
        left = max(0, x - pad)
        top = max(0, y - pad)
        right = min(width, x + w + pad)
        bottom = min(height, y + h + pad)
        return image.crop((left, top, right, bottom))

    def _map_scores(
        self, predictions: list[dict], cultural_region: CulturalRegion
    ) -> dict[EmotionLabel, float]:
        """Map model predictions to normalised EmoSphere emotion scores.

        Args:
            predictions: Dicts with ``"label"`` and ``"score"`` keys, as
                returned by the classification pipeline.
            cultural_region: Key looked up in ``CULTURAL_ADJUSTMENT``.

        Returns:
            A score per entry of ``EMOTION_LABELS``; the values sum to 1 unless
            every score is zero.
        """
        scores: dict[EmotionLabel, float] = {label: 0.0 for label in EMOTION_LABELS}

        for pred in predictions:
            emo_label = FER_TO_EMOSPHERE.get(pred["label"].lower().strip())
            # `is not None` rather than truthiness: an enum member may be falsy.
            if emo_label is not None:
                # Keep the highest score seen for each target label.
                scores[emo_label] = max(scores[emo_label], pred["score"])

        # Strong disgust implies some anger, but must never lower the anger
        # score the model itself produced.
        if scores[EmotionLabel.DISGUST] > 0.2:
            scores[EmotionLabel.ANGER] = max(
                scores[EmotionLabel.ANGER], scores[EmotionLabel.DISGUST] * 0.3
            )

        # LOVE and CALM have no direct model class; derive them heuristically
        # from JOY and NEUTRAL.
        if scores[EmotionLabel.JOY] > 0.3:
            scores[EmotionLabel.LOVE] = scores[EmotionLabel.JOY] * 0.15
            scores[EmotionLabel.CALM] = scores[EmotionLabel.JOY] * 0.1
        if scores[EmotionLabel.NEUTRAL] > 0.4:
            scores[EmotionLabel.CALM] = max(
                scores[EmotionLabel.CALM], scores[EmotionLabel.NEUTRAL] * 0.3
            )

        # Cultural adjustment.
        # NOTE(review): assumes CULTURAL_ADJUSTMENT maps regions to positive
        # float factors — confirm against models.py.
        factor = CULTURAL_ADJUSTMENT.get(cultural_region, 1.0)
        if factor != 1.0:
            for label in EMOTION_LABELS:
                scores[label] = min(scores[label] ** (1.0 / factor), 1.0)

        # Normalise so the scores sum to 1.
        total = sum(scores.values())
        if total > 0:
            scores = {k: v / total for k, v in scores.items()}
        return scores

    def _simulate(self) -> dict[EmotionLabel, float]:
        """Return random Dirichlet-distributed scores when no model is available."""
        raw = np.random.dirichlet(np.ones(len(EMOTION_LABELS)) * 0.5)
        return {label: float(raw[i]) for i, label in enumerate(EMOTION_LABELS)}

    def detect(
        self,
        image_data: bytes | np.ndarray,
        cultural_region: CulturalRegion = CulturalRegion.UNIVERSAL,
    ) -> EmotionDetectionResult:
        """Detect emotion from a face image.

        Args:
            image_data: Encoded image bytes, or a NumPy image array. Arrays are
                assumed to be in RGB channel order — NOTE(review): confirm with
                callers, since OpenCV frames are BGR.
            cultural_region: Region used for the cultural score adjustment.

        Returns:
            An ``EmotionDetectionResult`` with ``modality="face"``. The scores
            are simulated when no pipeline is loaded, PIL is missing, or the
            input cannot be converted to an image.
        """
        start = time.perf_counter()  # monotonic clock for elapsed time

        scores = None
        if self.pipe is not None and HAS_PIL:
            image = self._to_pil(image_data)
            if image is not None:
                face = self._detect_face(image)
                predictions = self.pipe(face)
                scores = self._map_scores(predictions, cultural_region)
        if scores is None:
            scores = self._simulate()

        # Build result; the 0.85 / 0.80 multipliers are fixed heuristic
        # discounts applied to the raw scores.
        emotion_scores = [
            EmotionScore(label=label, score=scores[label], confidence=scores[label] * 0.85)
            for label in EMOTION_LABELS
        ]
        dominant = max(scores, key=scores.get)  # type: ignore

        return EmotionDetectionResult(
            dominant=dominant,
            dominant_score=scores[dominant],
            scores=emotion_scores,
            modality="face",
            confidence=scores[dominant] * 0.80,
            processing_time_ms=(time.perf_counter() - start) * 1000,
            cultural_region=cultural_region,
        )