"""EmoSphere Posture & Gesture Emotion Detector. Uses MediaPipe Pose + Hands landmarks to estimate body posture and gesture cues, then maps them to emotion probabilities via a rule-based heuristic engine. Posture signals: - Shoulder slump / elevation (sadness vs confidence) - Head tilt / drop (interest, submission, sadness) - Arm openness / crossing (comfort vs defensiveness) - Overall body tension / relaxation - Forward lean (engagement, aggression) Gesture signals: - Hand-to-face gestures (anxiety, contemplation) - Fist clenching (anger, frustration) - Open palms (openness, honesty) - Self-touching / fidgeting (anxiety, discomfort) - Hand waving / movement energy (agitation vs calm) - Pointing / directional gestures (anger, dominance) """ from __future__ import annotations import time from typing import Optional import numpy as np from models import ( EmotionLabel, EMOTION_LABELS, EmotionScore, EmotionDetectionResult, CulturalRegion, CULTURAL_ADJUSTMENT, ) # Try to import MediaPipe for real pose estimation try: import mediapipe as mp # Verify solutions module exists (missing in some versions/Python 3.13) _test = mp.solutions.pose HAS_MEDIAPIPE = True except (ImportError, AttributeError): HAS_MEDIAPIPE = False try: from PIL import Image import io HAS_PIL = True except ImportError: HAS_PIL = False class PostureEmotionDetector: """Detect emotions from body posture and hand gestures.""" def __init__(self, device: str = "cpu"): self.device = device self.loaded = False self.pose = None self.hands = None def load(self) -> None: if HAS_MEDIAPIPE: try: self.pose = mp.solutions.pose.Pose( static_image_mode=True, model_complexity=1, min_detection_confidence=0.5, ) except Exception as e: print(f"[PostureDetector] Pose init error: {e}") try: self.hands = mp.solutions.hands.Hands( static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5, ) except Exception as e: print(f"[PostureDetector] Hands init error: {e}") self.loaded = True parts = [] if self.pose: parts.append("pose") if self.hands: parts.append("hands") mode = "+".join(parts) if parts else "heuristic-simulation" print(f"[PostureDetector] Loaded ({mode})") def detect( self, image_bytes: bytes, cultural_region: CulturalRegion = CulturalRegion.UNIVERSAL ) -> EmotionDetectionResult: start = time.time() features = self._extract_features(image_bytes) raw_scores = self._features_to_emotions(features) # Cultural adjustment adj = CULTURAL_ADJUSTMENT.get(cultural_region, 1.0) for label in raw_scores: if label != EmotionLabel.NEUTRAL: raw_scores[label] *= adj # Normalize total = sum(raw_scores.values()) if total > 0: raw_scores = {k: v / total for k, v in raw_scores.items()} dominant = max(raw_scores, key=raw_scores.get) confidence = raw_scores[dominant] * features.get("detection_confidence", 0.7) scores = [ EmotionScore(label=label, score=raw_scores.get(label, 0.0), confidence=confidence) for label in EMOTION_LABELS ] return EmotionDetectionResult( dominant=dominant, dominant_score=raw_scores[dominant], scores=scores, modality="posture/gesture", confidence=min(confidence, 1.0), processing_time_ms=(time.time() - start) * 1000, cultural_region=cultural_region, ) def _extract_features(self, image_bytes: bytes) -> dict: """Extract posture + gesture features from image using MediaPipe.""" if (self.pose or self.hands) and HAS_PIL: try: import cv2 import numpy as np nparr = np.frombuffer(image_bytes, np.uint8) img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) if img is not None: rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Posture features from Pose landmarks features = {} if self.pose: pose_results = self.pose.process(rgb) if pose_results.pose_landmarks: features = self._landmarks_to_features(pose_results.pose_landmarks, img.shape) # Gesture features from Hand landmarks gesture_features = {"fist_clenching": 0.0, "open_palms": 0.0, "fidgeting": 0.0, "pointing": 0.0} if self.hands: hand_results = self.hands.process(rgb) if hand_results.multi_hand_landmarks: gesture_features = self._hand_gestures(hand_results.multi_hand_landmarks) if features: features.update(gesture_features) return features elif gesture_features.get("fist_clenching", 0) > 0 or gesture_features.get("open_palms", 0) > 0: # Have hand data but no pose — build minimal features base = self._default_features() base.update(gesture_features) base["detection_confidence"] = 0.5 return base except Exception as e: print(f"[PostureDetector] Feature extraction error: {e}") # Simulation fallback with neutral-ish features (not random) return self._default_features() def _default_features(self) -> dict: """Return neutral default features when detection fails.""" return { "shoulder_slump": 0.2, "shoulder_elevation": 0.5, "head_drop": 0.15, "head_tilt": 0.1, "arm_openness": 0.6, "arm_crossing": 0.1, "hand_face_proximity": 0.15, "body_tension": 0.35, "body_lean_forward": 0.2, "movement_energy": 0.3, "overall_openness": 0.6, "fist_clenching": 0.0, "open_palms": 0.0, "fidgeting": 0.0, "pointing": 0.0, "detection_confidence": 0.4, } def _hand_gestures(self, hand_landmarks_list) -> dict: """Extract gesture features from MediaPipe Hand landmarks. Gestures detected: - Fist clenching: all fingers curled (anger, frustration) - Open palms: all fingers extended (openness, calm) - Fidgeting: rapid small movements (anxiety) - Pointing: index extended, others curled (dominance, anger) """ fist_score = 0.0 open_score = 0.0 point_score = 0.0 n_hands = len(hand_landmarks_list) for hand_lm in hand_landmarks_list: lm = hand_lm.landmark # Finger tip indices: thumb=4, index=8, middle=12, ring=16, pinky=20 # Finger MCP indices: thumb=2, index=5, middle=9, ring=13, pinky=17 # Check if fingers are curled (tip below MCP in y) fingers_curled = 0 fingers_extended = 0 # Index finger if lm[8].y > lm[6].y: # tip below PIP fingers_curled += 1 else: fingers_extended += 1 # Middle finger if lm[12].y > lm[10].y: fingers_curled += 1 else: fingers_extended += 1 # Ring finger if lm[16].y > lm[14].y: fingers_curled += 1 else: fingers_extended += 1 # Pinky if lm[20].y > lm[18].y: fingers_curled += 1 else: fingers_extended += 1 # Fist: all 4 fingers curled if fingers_curled >= 4: fist_score += 1.0 elif fingers_curled >= 3: fist_score += 0.5 # Open palm: all 4 fingers extended if fingers_extended >= 4: open_score += 1.0 elif fingers_extended >= 3: open_score += 0.5 # Pointing: only index extended if lm[8].y < lm[6].y and fingers_curled >= 3: point_score += 1.0 # Normalize by number of hands if n_hands > 0: fist_score = min(1.0, fist_score / n_hands) open_score = min(1.0, open_score / n_hands) point_score = min(1.0, point_score / n_hands) return { "fist_clenching": float(fist_score), "open_palms": float(open_score), "fidgeting": 0.0, # requires temporal tracking (future) "pointing": float(point_score), } def _landmarks_to_features(self, landmarks, img_shape) -> dict: """Convert MediaPipe pose landmarks to posture features.""" lm = landmarks.landmark h, w = img_shape[:2] def pt(idx): return np.array([lm[idx].x * w, lm[idx].y * h, lm[idx].z * w]) # Key landmarks l_shoulder = pt(11) r_shoulder = pt(12) l_hip = pt(23) r_hip = pt(24) l_elbow = pt(13) r_elbow = pt(14) l_wrist = pt(15) r_wrist = pt(16) nose = pt(0) l_ear = pt(7) r_ear = pt(8) # Shoulder analysis shoulder_center = (l_shoulder + r_shoulder) / 2 hip_center = (l_hip + r_hip) / 2 shoulder_width = np.linalg.norm(l_shoulder[:2] - r_shoulder[:2]) torso_height = np.linalg.norm(shoulder_center[:2] - hip_center[:2]) # Shoulder slump: shoulders dropping forward (z-depth) shoulder_slump = max(0, (l_shoulder[2] + r_shoulder[2]) / 2) / (w * 0.1 + 1e-6) shoulder_slump = min(shoulder_slump, 1.0) # Shoulder elevation relative to ears ear_y = (l_ear[1] + r_ear[1]) / 2 shoulder_elevation = 1.0 - min(1.0, abs(shoulder_center[1] - ear_y) / (torso_height + 1e-6)) # Head drop: nose below shoulder line head_drop = max(0, nose[1] - shoulder_center[1]) / (torso_height * 0.3 + 1e-6) head_drop = min(head_drop, 1.0) # Head tilt: ear height difference head_tilt = abs(l_ear[1] - r_ear[1]) / (shoulder_width * 0.3 + 1e-6) head_tilt = min(head_tilt, 1.0) # Arm openness: elbows distance relative to shoulder width elbow_dist = np.linalg.norm(l_elbow[:2] - r_elbow[:2]) arm_openness = min(1.0, elbow_dist / (shoulder_width * 2.0 + 1e-6)) # Arm crossing: wrists close to opposite shoulders l_cross = np.linalg.norm(l_wrist[:2] - r_shoulder[:2]) / (shoulder_width + 1e-6) r_cross = np.linalg.norm(r_wrist[:2] - l_shoulder[:2]) / (shoulder_width + 1e-6) arm_crossing = max(0, 1.0 - min(l_cross, r_cross)) # Hand-to-face proximity face_center = nose[:2] l_hand_face = np.linalg.norm(l_wrist[:2] - face_center) / (torso_height + 1e-6) r_hand_face = np.linalg.norm(r_wrist[:2] - face_center) / (torso_height + 1e-6) hand_face_proximity = max(0, 1.0 - min(l_hand_face, r_hand_face)) # Body tension: shoulder elevation + arm tightness body_tension = (shoulder_elevation * 0.5 + (1.0 - arm_openness) * 0.5) # Forward lean body_lean_forward = max(0, shoulder_center[2] - hip_center[2]) / (w * 0.05 + 1e-6) body_lean_forward = min(body_lean_forward, 1.0) # Movement energy (approximated from landmark visibility/spread) wrist_spread = np.linalg.norm(l_wrist[:2] - r_wrist[:2]) / (shoulder_width * 3.0 + 1e-6) movement_energy = min(1.0, wrist_spread) # Overall openness overall_openness = (arm_openness * 0.4 + (1.0 - arm_crossing) * 0.3 + (1.0 - body_tension) * 0.3) # Detection confidence from landmark visibility avg_vis = np.mean([lm[i].visibility for i in [0, 7, 8, 11, 12, 13, 14, 15, 16, 23, 24]]) return { "shoulder_slump": float(shoulder_slump), "shoulder_elevation": float(shoulder_elevation), "head_drop": float(head_drop), "head_tilt": float(head_tilt), "arm_openness": float(arm_openness), "arm_crossing": float(arm_crossing), "hand_face_proximity": float(hand_face_proximity), "body_tension": float(body_tension), "body_lean_forward": float(body_lean_forward), "movement_energy": float(movement_energy), "overall_openness": float(overall_openness), # Gesture features (populated by _hand_gestures if hands detected) "fist_clenching": 0.0, "open_palms": float(arm_openness * 0.5), # approximate from arm openness "fidgeting": 0.0, "pointing": 0.0, "detection_confidence": float(avg_vis), } def _features_to_emotions(self, f: dict) -> dict: """Map posture + gesture features to emotion probabilities using clinical heuristics.""" scores = {label: 0.0 for label in EMOTION_LABELS} # Gesture features (default 0 if not available) fist = f.get("fist_clenching", 0.0) palms = f.get("open_palms", 0.0) fidget = f.get("fidgeting", 0.0) point = f.get("pointing", 0.0) # Sadness: slumped shoulders, head drop, closed posture, low energy scores[EmotionLabel.SADNESS] = ( f["shoulder_slump"] * 0.22 + f["head_drop"] * 0.22 + (1.0 - f["arm_openness"]) * 0.18 + (1.0 - f["movement_energy"]) * 0.13 + f["arm_crossing"] * 0.13 + (1.0 - palms) * 0.06 # closed hands + fidget * 0.06 ) # Joy: open posture, open palms, high energy, no tension scores[EmotionLabel.JOY] = ( f["arm_openness"] * 0.22 + f["overall_openness"] * 0.20 + f["movement_energy"] * 0.18 + (1.0 - f["shoulder_slump"]) * 0.12 + (1.0 - f["body_tension"]) * 0.10 + palms * 0.10 # open palms = positive + (1.0 - fist) * 0.08 ) # Fear: tension, shoulders elevated, arms close, self-touching, fidgeting scores[EmotionLabel.FEAR] = ( f["body_tension"] * 0.25 + f["shoulder_elevation"] * 0.15 + (1.0 - f["arm_openness"]) * 0.15 + f["hand_face_proximity"] * 0.15 # self-touching gesture + (1.0 - f["overall_openness"]) * 0.12 + fidget * 0.10 # fidgeting gesture + (1.0 - palms) * 0.08 ) # Surprise: elevated shoulders, lean forward, head tilt, open palms scores[EmotionLabel.SURPRISE] = ( f["shoulder_elevation"] * 0.22 + f["body_lean_forward"] * 0.20 + f["head_tilt"] * 0.18 + f["movement_energy"] * 0.15 + f["arm_openness"] * 0.12 + palms * 0.08 # hands open in surprise + point * 0.05 ) # Disgust: arm crossing, lean back, tension, hand-to-face (covering nose/mouth) scores[EmotionLabel.DISGUST] = ( f["arm_crossing"] * 0.25 + (1.0 - f["body_lean_forward"]) * 0.18 + f["body_tension"] * 0.18 + (1.0 - f["arm_openness"]) * 0.15 + f["hand_face_proximity"] * 0.15 # covering face gesture + (1.0 - palms) * 0.09 ) # Love: open, relaxed, leaning forward, open palms, low tension scores[EmotionLabel.LOVE] = ( f["body_lean_forward"] * 0.20 + f["overall_openness"] * 0.20 + f["arm_openness"] * 0.18 + (1.0 - f["body_tension"]) * 0.15 + (1.0 - f["arm_crossing"]) * 0.10 + palms * 0.10 # open palms = warmth + (1.0 - fist) * 0.07 ) # Calm: relaxed, neutral posture, low energy, open palms, no fidgeting scores[EmotionLabel.CALM] = ( (1.0 - f["body_tension"]) * 0.25 + (1.0 - f["movement_energy"]) * 0.18 + f["overall_openness"] * 0.15 + (1.0 - f["shoulder_elevation"]) * 0.12 + (1.0 - f["hand_face_proximity"]) * 0.10 + palms * 0.10 # relaxed open hands + (1.0 - fidget) * 0.10 ) # Anger: fist clenching, high tension, high energy, forward lean, pointing scores[EmotionLabel.ANGER] = ( fist * 0.20 # fist clenching = key anger gesture + f["body_tension"] * 0.18 + f["movement_energy"] * 0.18 + (1.0 - f["arm_openness"]) * 0.12 + f["body_lean_forward"] * 0.12 + point * 0.12 # pointing gesture = aggression + (1.0 - f["overall_openness"]) * 0.08 ) # Neutral: absence of strong signals extremes = max(scores.values()) - min(scores.values()) scores[EmotionLabel.NEUTRAL] = max(0.0, 1.0 - extremes * 2.0) * 0.5 return scores