"""Emotion detection using a PyTorch vision transformer (no TensorFlow needed).

Model:  dima806/face_emotions_image_detection (ViT-tiny, ~28MB)
Labels: angry, disgust, fear, happy, sad, surprise, neutral

The model is downloaded from the HuggingFace Hub on first use.
"""
from __future__ import annotations

import os
from typing import Dict, Tuple

# Backend switches that keep `transformers` on PyTorch only. They must be in
# the environment before any `transformers` import, because loading TensorFlow
# crashes on CPUs without AVX (e.g. under Rosetta).
_TORCH_ONLY_ENV = {
    "USE_TF": "0",
    "USE_JAX": "0",
    "USE_TORCH": "1",
    "TRANSFORMERS_NO_TF": "1",
    "TRANSFORMERS_NO_JAX": "1",
}
for _name, _value in _TORCH_ONLY_ENV.items():
    # setdefault: at import time, respect values the caller already exported.
    os.environ.setdefault(_name, _value)

import numpy as np  # noqa: E402  (imports deliberately follow the env setup)
import torch  # noqa: E402,F401
from PIL import Image  # noqa: E402

EMOTION_LABELS = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]
DISPLAY_LABELS = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"]

# Map model output labels (lower-cased) → display labels (capitalized).
_LABEL_MAP = {name.lower(): name.capitalize() for name in DISPLAY_LABELS}
# Noun-form variants that some label sets use instead of the adjective.
_LABEL_MAP.update({"happiness": "Happy", "sadness": "Sad", "anger": "Angry"})

MODEL_ID = "dima806/face_emotions_image_detection"


class EmotionDetector:
    """Lightweight ViT-based facial expression recognizer.

    Input : face crop as RGB numpy array (any size).
    Output: dict mapping emotion label → probability.

    The HuggingFace pipeline is created lazily on the first call to
    :meth:`predict`, so constructing a detector is cheap.
    """

    def __init__(self) -> None:
        # Lazily populated by predict(); None means "not loaded yet".
        self._pipe = None

    def _load(self):
        """Create and return the CPU image-classification pipeline.

        Importing `transformers` is deferred to this point. The PyTorch-only
        switches are force-set (not just defaulted) right before the import
        to prevent transformers from loading TensorFlow, which crashes on
        machines without AVX.
        """
        os.environ.update(_TORCH_ONLY_ENV)

        from transformers import pipeline

        print("[emotion] Loading emotion model (first run)…")
        pipe = pipeline(
            "image-classification",
            model=MODEL_ID,
            top_k=None,  # return a score for every label, not only the best
            device=-1,  # CPU
        )
        print("[emotion] Loaded.")
        return pipe

    # ── public ────────────────────────────────────────────────────────────

    def predict(self, face_rgb: np.ndarray) -> Dict[str, float]:
        """Classify a face crop.

        Args:
            face_rgb: Image data accepted by ``PIL.Image.fromarray``; it is
                converted to RGB before being passed to the pipeline.

        Returns:
            A dict such as ``{'Happy': 0.87, 'Sad': 0.05, …}``. Every entry
            of ``DISPLAY_LABELS`` is present (0.0 when the model did not
            report it); labels outside the known map are added capitalized.
        """
        if self._pipe is None:
            self._pipe = self._load()

        image = Image.fromarray(face_rgb).convert("RGB")
        results = self._pipe(image)

        # Normalise labels → Title Case and fill any missing emotions.
        scores: Dict[str, float] = dict.fromkeys(DISPLAY_LABELS, 0.0)
        for entry in results:
            raw_label = entry["label"]
            display = _LABEL_MAP.get(raw_label.lower(), raw_label.capitalize())
            scores[display] = float(entry["score"])
        return scores

    def top_emotion(self, face_rgb: np.ndarray) -> Tuple[str, float]:
        """Return the ``(label, probability)`` pair with the highest score.

        Ties resolve to the label that appears first in the dict returned by
        :meth:`predict`.
        """
        scores = self.predict(face_rgb)
        best = max(scores, key=scores.get)
        return best, scores[best]