Spaces:
Running
Running
| """EmoSphere Posture & Gesture Emotion Detector. | |
| Uses MediaPipe Pose + Hands landmarks to estimate body posture and gesture cues, | |
| then maps them to emotion probabilities via a rule-based heuristic engine. | |
| Posture signals: | |
| - Shoulder slump / elevation (sadness vs confidence) | |
| - Head tilt / drop (interest, submission, sadness) | |
| - Arm openness / crossing (comfort vs defensiveness) | |
| - Overall body tension / relaxation | |
| - Forward lean (engagement, aggression) | |
| Gesture signals: | |
| - Hand-to-face gestures (anxiety, contemplation) | |
| - Fist clenching (anger, frustration) | |
| - Open palms (openness, honesty) | |
| - Self-touching / fidgeting (anxiety, discomfort) | |
| - Hand waving / movement energy (agitation vs calm) | |
| - Pointing / directional gestures (anger, dominance) | |
| """ | |
| from __future__ import annotations | |
| import time | |
| from typing import Optional | |
| import numpy as np | |
| from models import ( | |
| EmotionLabel, EMOTION_LABELS, EmotionScore, | |
| EmotionDetectionResult, CulturalRegion, CULTURAL_ADJUSTMENT, | |
| ) | |
# Optional dependency: MediaPipe provides the pose and hand landmark models.
try:
    import mediapipe as mp

    # The ``solutions`` package is missing in some versions / on Python 3.13,
    # so touch it once here; an AttributeError means it is unusable.
    _test = mp.solutions.pose
except (ImportError, AttributeError):
    HAS_MEDIAPIPE = False
else:
    HAS_MEDIAPIPE = True

# Optional dependency: Pillow (``io`` is imported alongside it).
try:
    from PIL import Image
    import io
except ImportError:
    HAS_PIL = False
else:
    HAS_PIL = True
| class PostureEmotionDetector: | |
| """Detect emotions from body posture and hand gestures.""" | |
def __init__(self, device: str = "cpu"):
    """Create a detector with no models loaded yet.

    Args:
        device: Device identifier; stored unchanged on the instance.
    """
    self.device = device
    self.loaded = False
    # MediaPipe solution handles; both stay ``None`` until ``load`` sets them.
    self.pose = self.hands = None
def load(self) -> None:
    """Create the MediaPipe pose and hand solutions when MediaPipe is usable.

    The two solutions are built independently: if one constructor raises,
    the error is printed and the matching attribute keeps its previous
    value. ``self.loaded`` is set to ``True`` in every case, and a summary
    line naming the active back-ends (or ``heuristic-simulation`` when there
    are none) is printed.
    """
    if HAS_MEDIAPIPE:
        pose_options = dict(
            static_image_mode=True,
            model_complexity=1,
            min_detection_confidence=0.5,
        )
        hands_options = dict(
            static_image_mode=True,
            max_num_hands=2,
            min_detection_confidence=0.5,
        )
        try:
            self.pose = mp.solutions.pose.Pose(**pose_options)
        except Exception as e:
            print(f"[PostureDetector] Pose init error: {e}")
        try:
            self.hands = mp.solutions.hands.Hands(**hands_options)
        except Exception as e:
            print(f"[PostureDetector] Hands init error: {e}")

    self.loaded = True

    # Report which back-ends ended up available.
    backends = (("pose", self.pose), ("hands", self.hands))
    active = [name for name, solution in backends if solution]
    mode = "+".join(active) or "heuristic-simulation"
    print(f"[PostureDetector] Loaded ({mode})")
def detect(
    self, image_bytes: bytes, cultural_region: CulturalRegion = CulturalRegion.UNIVERSAL
) -> EmotionDetectionResult:
    """Estimate emotions from the posture and gestures visible in an image.

    Pipeline: extract features, map them to raw emotion scores, scale every
    non-neutral score by the factor registered for ``cultural_region`` in
    ``CULTURAL_ADJUSTMENT`` (1.0 when the region has no entry), then
    normalise the scores so they sum to 1 (skipped when the sum is not
    positive).

    Args:
        image_bytes: Encoded image passed on to ``_extract_features``.
        cultural_region: Region used to look up the adjustment factor.

    Returns:
        An ``EmotionDetectionResult`` holding the dominant label, one
        ``EmotionScore`` per entry of ``EMOTION_LABELS``, the overall
        confidence and the processing time in milliseconds.
    """
    # perf_counter() is monotonic, so the elapsed time cannot go negative or
    # jump when the system clock is adjusted (unlike time.time()).
    start = time.perf_counter()

    features = self._extract_features(image_bytes)
    raw_scores = self._features_to_emotions(features)

    # Cultural adjustment: neutral is left untouched.
    adj = CULTURAL_ADJUSTMENT.get(cultural_region, 1.0)
    for label in raw_scores:
        if label != EmotionLabel.NEUTRAL:
            raw_scores[label] *= adj

    # Normalize
    total = sum(raw_scores.values())
    if total > 0:
        raw_scores = {k: v / total for k, v in raw_scores.items()}

    dominant = max(raw_scores, key=raw_scores.get)
    # Clamp once so the per-label scores and the overall result report the
    # same, bounded confidence value.
    confidence = min(
        raw_scores[dominant] * features.get("detection_confidence", 0.7), 1.0
    )

    scores = [
        EmotionScore(label=label, score=raw_scores.get(label, 0.0), confidence=confidence)
        for label in EMOTION_LABELS
    ]
    return EmotionDetectionResult(
        dominant=dominant,
        dominant_score=raw_scores[dominant],
        scores=scores,
        modality="posture/gesture",
        confidence=confidence,
        processing_time_ms=(time.perf_counter() - start) * 1000,
        cultural_region=cultural_region,
    )
def _extract_features(self, image_bytes: bytes) -> dict:
    """Extract posture and gesture features from an encoded image.

    The image is decoded with OpenCV and run through the pose and hand
    solutions that are available on the instance.

    * Pose found: the pose features are returned; hand-gesture values
      replace the pose-derived gesture placeholders only when at least one
      hand was actually detected.
    * No pose, but a hand showing a fist or open palm: the default features
      are returned with the gesture values merged in and
      ``detection_confidence`` set to 0.5.
    * Anything else (no detector, undecodable image, nothing detected, or
      an exception, which is printed): the default features are returned.

    Args:
        image_bytes: Encoded image data.

    Returns:
        Feature dict with the keys produced by ``_default_features``.
    """
    # NOTE(review): this gate checks HAS_PIL although decoding below uses
    # OpenCV, not Pillow — confirm which dependency is intended.
    if (self.pose or self.hands) and HAS_PIL:
        try:
            import cv2

            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            if img is not None:
                rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                # Posture features from Pose landmarks
                features = {}
                if self.pose:
                    pose_results = self.pose.process(rgb)
                    if pose_results.pose_landmarks:
                        features = self._landmarks_to_features(
                            pose_results.pose_landmarks, img.shape
                        )

                # Gesture features from Hand landmarks; None means that no
                # hand was detected, so there is nothing to merge.
                gesture_features = None
                if self.hands:
                    hand_results = self.hands.process(rgb)
                    if hand_results.multi_hand_landmarks:
                        gesture_features = self._hand_gestures(
                            hand_results.multi_hand_landmarks
                        )

                if features:
                    # Only overwrite the pose-derived gesture placeholders
                    # (e.g. the open_palms approximation) with real hand
                    # data; merging zeros would erase them.
                    if gesture_features is not None:
                        features.update(gesture_features)
                    return features
                if gesture_features is not None and (
                    gesture_features.get("fist_clenching", 0) > 0
                    or gesture_features.get("open_palms", 0) > 0
                ):
                    # Have hand data but no pose — build minimal features
                    base = self._default_features()
                    base.update(gesture_features)
                    base["detection_confidence"] = 0.5
                    return base
        except Exception as e:
            # Deliberate best effort: report and fall back to the defaults.
            print(f"[PostureDetector] Feature extraction error: {e}")

    # Fallback with fixed neutral-ish features (not random)
    return self._default_features()
def _default_features(self) -> dict:
    """Return the fixed fallback feature set used when detection fails.

    Returns:
        A new dict on every call: posture values, zeroed gesture values and
        a ``detection_confidence`` of 0.4.
    """
    posture = dict(
        shoulder_slump=0.2,
        shoulder_elevation=0.5,
        head_drop=0.15,
        head_tilt=0.1,
        arm_openness=0.6,
        arm_crossing=0.1,
        hand_face_proximity=0.15,
        body_tension=0.35,
        body_lean_forward=0.2,
        movement_energy=0.3,
        overall_openness=0.6,
    )
    # No hand information is available in the fallback case.
    gestures = dict.fromkeys(
        ("fist_clenching", "open_palms", "fidgeting", "pointing"), 0.0
    )
    return {**posture, **gestures, "detection_confidence": 0.4}
def _hand_gestures(self, hand_landmarks_list) -> dict:
    """Score fist, open-palm and pointing gestures from hand landmarks.

    A finger counts as curled when its tip has a larger image ``y`` than its
    PIP joint (i.e. the tip is lower in the image); otherwise it counts as
    extended. Only the index, middle, ring and pinky fingers are examined.

    Per hand:
      * fist: 1.0 for four curled fingers, 0.5 for three
      * open palm: 1.0 for four extended fingers, 0.5 for three
      * pointing: 1.0 when the index tip is strictly above its PIP joint
        and at least three fingers are curled

    Args:
        hand_landmarks_list: Sequence of objects exposing ``.landmark``, an
            indexable collection whose items have a ``y`` attribute.

    Returns:
        Dict with ``fist_clenching``, ``open_palms`` and ``pointing``
        averaged over the hands and capped at 1.0, plus ``fidgeting``,
        which is always 0.0.
    """
    # (tip, PIP) landmark indices for index, middle, ring and pinky.
    finger_joints = ((8, 6), (12, 10), (16, 14), (20, 18))

    totals = {"fist": 0.0, "open": 0.0, "point": 0.0}
    for hand in hand_landmarks_list:
        pts = hand.landmark
        curled = sum(1 for tip, pip in finger_joints if pts[tip].y > pts[pip].y)
        extended = len(finger_joints) - curled

        if curled >= 4:
            totals["fist"] += 1.0
        elif curled >= 3:
            totals["fist"] += 0.5

        if extended >= 4:
            totals["open"] += 1.0
        elif extended >= 3:
            totals["open"] += 0.5

        # Pointing: index raised while the remaining fingers are curled.
        if pts[8].y < pts[6].y and curled >= 3:
            totals["point"] += 1.0

    # Average over the detected hands; with no hands the totals stay 0.0.
    hand_count = len(hand_landmarks_list)
    if hand_count > 0:
        totals = {key: min(1.0, value / hand_count) for key, value in totals.items()}

    return {
        "fist_clenching": float(totals["fist"]),
        "open_palms": float(totals["open"]),
        "fidgeting": 0.0,  # requires temporal tracking (future)
        "pointing": float(totals["point"]),
    }
def _landmarks_to_features(self, landmarks, img_shape) -> dict:
    """Derive posture features from pose landmarks.

    Landmark ``x``/``z`` are scaled by the image width and ``y`` by the
    image height before any distance is measured. Distances use the x/y
    plane only and are expressed relative to shoulder width or torso
    height, with a small epsilon guarding every division.

    Args:
        landmarks: Object exposing ``.landmark``; the entries at indices
            0, 7, 8, 11-16, 23 and 24 are read and must provide ``x``,
            ``y``, ``z`` and ``visibility``.
        img_shape: Image shape whose first two items are (height, width).

    Returns:
        Dict of float features. Gesture keys are placeholders here:
        ``open_palms`` is approximated as half the arm openness and the
        other gesture keys are 0.0. ``detection_confidence`` is the mean
        visibility of the landmarks that were read.
    """
    points = landmarks.landmark
    height, width = img_shape[:2]
    eps = 1e-6

    def pixel(index):
        # Scale one landmark to pixel units (z uses the width scale).
        p = points[index]
        return np.array([p.x * width, p.y * height, p.z * width])

    def planar_distance(a, b):
        # Euclidean distance ignoring depth.
        return np.linalg.norm(a[:2] - b[:2])

    # Key landmarks
    nose = pixel(0)
    left_ear, right_ear = pixel(7), pixel(8)
    left_shoulder, right_shoulder = pixel(11), pixel(12)
    left_elbow, right_elbow = pixel(13), pixel(14)
    left_wrist, right_wrist = pixel(15), pixel(16)
    left_hip, right_hip = pixel(23), pixel(24)

    # Reference frame: shoulder/hip midpoints and the two body scales.
    shoulder_mid = (left_shoulder + right_shoulder) / 2
    hip_mid = (left_hip + right_hip) / 2
    shoulder_span = planar_distance(left_shoulder, right_shoulder)
    torso_len = planar_distance(shoulder_mid, hip_mid)

    # Shoulder slump: positive mean shoulder depth, relative to 10% of width.
    shoulder_slump = min(max(0, shoulder_mid[2]) / (width * 0.1 + eps), 1.0)

    # Shoulder elevation: the closer the shoulders are to ear height, the higher.
    ear_level = (left_ear[1] + right_ear[1]) / 2
    ear_gap = abs(shoulder_mid[1] - ear_level) / (torso_len + eps)
    shoulder_elevation = 1.0 - min(1.0, ear_gap)

    # Head drop: how far the nose sits below the shoulder line.
    head_drop = min(max(0, nose[1] - shoulder_mid[1]) / (torso_len * 0.3 + eps), 1.0)

    # Head tilt: height difference between the ears.
    head_tilt = min(abs(left_ear[1] - right_ear[1]) / (shoulder_span * 0.3 + eps), 1.0)

    # Arm openness: elbow separation relative to twice the shoulder width.
    elbow_gap = planar_distance(left_elbow, right_elbow)
    arm_openness = min(1.0, elbow_gap / (shoulder_span * 2.0 + eps))

    # Arm crossing: a wrist close to the opposite shoulder.
    left_reach = planar_distance(left_wrist, right_shoulder) / (shoulder_span + eps)
    right_reach = planar_distance(right_wrist, left_shoulder) / (shoulder_span + eps)
    arm_crossing = max(0, 1.0 - min(left_reach, right_reach))

    # Hand-to-face proximity: a wrist close to the nose.
    left_to_face = planar_distance(left_wrist, nose) / (torso_len + eps)
    right_to_face = planar_distance(right_wrist, nose) / (torso_len + eps)
    hand_face_proximity = max(0, 1.0 - min(left_to_face, right_to_face))

    # Body tension: raised shoulders combined with arms held close.
    body_tension = (shoulder_elevation * 0.5 + (1.0 - arm_openness) * 0.5)

    # Forward lean: shoulder depth beyond hip depth, relative to 5% of width.
    lean_depth = max(0, shoulder_mid[2] - hip_mid[2])
    body_lean_forward = min(lean_depth / (width * 0.05 + eps), 1.0)

    # Movement energy: a single frame has no motion, so wrist spread stands in.
    wrist_gap = planar_distance(left_wrist, right_wrist) / (shoulder_span * 3.0 + eps)
    movement_energy = min(1.0, wrist_gap)

    # Overall openness
    overall_openness = (
        arm_openness * 0.4 + (1.0 - arm_crossing) * 0.3 + (1.0 - body_tension) * 0.3
    )

    # Detection confidence from landmark visibility
    tracked = [0, 7, 8, 11, 12, 13, 14, 15, 16, 23, 24]
    mean_visibility = np.mean([points[i].visibility for i in tracked])

    measured = {
        "shoulder_slump": shoulder_slump,
        "shoulder_elevation": shoulder_elevation,
        "head_drop": head_drop,
        "head_tilt": head_tilt,
        "arm_openness": arm_openness,
        "arm_crossing": arm_crossing,
        "hand_face_proximity": hand_face_proximity,
        "body_tension": body_tension,
        "body_lean_forward": body_lean_forward,
        "movement_energy": movement_energy,
        "overall_openness": overall_openness,
        # Gesture placeholders
        "fist_clenching": 0.0,
        "open_palms": arm_openness * 0.5,  # approximate from arm openness
        "fidgeting": 0.0,
        "pointing": 0.0,
        "detection_confidence": mean_visibility,
    }
    return {name: float(value) for name, value in measured.items()}
def _features_to_emotions(self, f: dict) -> dict:
    """Map posture and gesture features to unnormalised emotion scores.

    Each of the eight scored emotions is a weighted sum of features (or of
    their complements ``1 - feature``); the weights of every emotion add up
    to 1.0. The neutral score is then derived from how similar those eight
    scores are: the smaller their spread, the higher neutral, up to 0.5.

    Args:
        f: Feature dict. The posture keys are required (a missing one
            raises ``KeyError``); the four gesture keys default to 0.0.

    Returns:
        Dict with an entry for every label in ``EMOTION_LABELS`` (0.0 when
        the label is not scored here), the eight scored emotions and
        ``EmotionLabel.NEUTRAL``.
    """
    scores = {label: 0.0 for label in EMOTION_LABELS}

    # Gesture features (default 0 if not available)
    fist = f.get("fist_clenching", 0.0)
    palms = f.get("open_palms", 0.0)
    fidget = f.get("fidgeting", 0.0)
    point = f.get("pointing", 0.0)

    # Scores for the explicitly modelled emotions are collected separately
    # so that the neutral spread below is not affected by 0.0 placeholders.
    signal = {}

    # Sadness: slumped shoulders, head drop, closed posture, low energy
    signal[EmotionLabel.SADNESS] = (
        f["shoulder_slump"] * 0.22
        + f["head_drop"] * 0.22
        + (1.0 - f["arm_openness"]) * 0.18
        + (1.0 - f["movement_energy"]) * 0.13
        + f["arm_crossing"] * 0.13
        + (1.0 - palms) * 0.06  # closed hands
        + fidget * 0.06
    )
    # Joy: open posture, open palms, high energy, no tension
    signal[EmotionLabel.JOY] = (
        f["arm_openness"] * 0.22
        + f["overall_openness"] * 0.20
        + f["movement_energy"] * 0.18
        + (1.0 - f["shoulder_slump"]) * 0.12
        + (1.0 - f["body_tension"]) * 0.10
        + palms * 0.10  # open palms = positive
        + (1.0 - fist) * 0.08
    )
    # Fear: tension, shoulders elevated, arms close, self-touching, fidgeting
    signal[EmotionLabel.FEAR] = (
        f["body_tension"] * 0.25
        + f["shoulder_elevation"] * 0.15
        + (1.0 - f["arm_openness"]) * 0.15
        + f["hand_face_proximity"] * 0.15  # self-touching gesture
        + (1.0 - f["overall_openness"]) * 0.12
        + fidget * 0.10  # fidgeting gesture
        + (1.0 - palms) * 0.08
    )
    # Surprise: elevated shoulders, lean forward, head tilt, open palms
    signal[EmotionLabel.SURPRISE] = (
        f["shoulder_elevation"] * 0.22
        + f["body_lean_forward"] * 0.20
        + f["head_tilt"] * 0.18
        + f["movement_energy"] * 0.15
        + f["arm_openness"] * 0.12
        + palms * 0.08  # hands open in surprise
        + point * 0.05
    )
    # Disgust: arm crossing, lean back, tension, hand-to-face (covering nose/mouth)
    signal[EmotionLabel.DISGUST] = (
        f["arm_crossing"] * 0.25
        + (1.0 - f["body_lean_forward"]) * 0.18
        + f["body_tension"] * 0.18
        + (1.0 - f["arm_openness"]) * 0.15
        + f["hand_face_proximity"] * 0.15  # covering face gesture
        + (1.0 - palms) * 0.09
    )
    # Love: open, relaxed, leaning forward, open palms, low tension
    signal[EmotionLabel.LOVE] = (
        f["body_lean_forward"] * 0.20
        + f["overall_openness"] * 0.20
        + f["arm_openness"] * 0.18
        + (1.0 - f["body_tension"]) * 0.15
        + (1.0 - f["arm_crossing"]) * 0.10
        + palms * 0.10  # open palms = warmth
        + (1.0 - fist) * 0.07
    )
    # Calm: relaxed, neutral posture, low energy, open palms, no fidgeting
    signal[EmotionLabel.CALM] = (
        (1.0 - f["body_tension"]) * 0.25
        + (1.0 - f["movement_energy"]) * 0.18
        + f["overall_openness"] * 0.15
        + (1.0 - f["shoulder_elevation"]) * 0.12
        + (1.0 - f["hand_face_proximity"]) * 0.10
        + palms * 0.10  # relaxed open hands
        + (1.0 - fidget) * 0.10
    )
    # Anger: fist clenching, high tension, high energy, forward lean, pointing
    signal[EmotionLabel.ANGER] = (
        fist * 0.20  # fist clenching = key anger gesture
        + f["body_tension"] * 0.18
        + f["movement_energy"] * 0.18
        + (1.0 - f["arm_openness"]) * 0.12
        + f["body_lean_forward"] * 0.12
        + point * 0.12  # pointing gesture = aggression
        + (1.0 - f["overall_openness"]) * 0.08
    )

    # Neutral: absence of strong signals. The spread is taken over the
    # scored emotions only; including the 0.0 placeholders held in `scores`
    # would pin the minimum to 0 and turn the spread into the plain maximum.
    spread = max(signal.values()) - min(signal.values())
    scores.update(signal)
    scores[EmotionLabel.NEUTRAL] = max(0.0, 1.0 - spread * 2.0) * 0.5
    return scores