# EmoSphere / posture_detector.py
# chariscait's picture
# Add gesture detection with MediaPipe Hands (fist, open palm, pointing)
# b655a99 verified
"""EmoSphere Posture & Gesture Emotion Detector.
Uses MediaPipe Pose + Hands landmarks to estimate body posture and gesture cues,
then maps them to emotion probabilities via a rule-based heuristic engine.
Posture signals:
- Shoulder slump / elevation (sadness vs confidence)
- Head tilt / drop (interest, submission, sadness)
- Arm openness / crossing (comfort vs defensiveness)
- Overall body tension / relaxation
- Forward lean (engagement, aggression)
Gesture signals:
- Hand-to-face gestures (anxiety, contemplation)
- Fist clenching (anger, frustration)
- Open palms (openness, honesty)
- Self-touching / fidgeting (anxiety, discomfort); fidgeting is currently always 0.0 because it requires temporal tracking
- Hand waving / movement energy (agitation vs calm)
- Pointing / directional gestures (anger, dominance)
"""
from __future__ import annotations
import time
from typing import Optional
import numpy as np
from models import (
EmotionLabel, EMOTION_LABELS, EmotionScore,
EmotionDetectionResult, CulturalRegion, CULTURAL_ADJUSTMENT,
)
# Try to import MediaPipe for real pose estimation
try:
import mediapipe as mp
# Verify solutions module exists (missing in some versions/Python 3.13)
_test = mp.solutions.pose
HAS_MEDIAPIPE = True
except (ImportError, AttributeError):
HAS_MEDIAPIPE = False
try:
from PIL import Image
import io
HAS_PIL = True
except ImportError:
HAS_PIL = False
class PostureEmotionDetector:
"""Detect emotions from body posture and hand gestures."""
def __init__(self, device: str = "cpu"):
self.device = device
self.loaded = False
self.pose = None
self.hands = None
def load(self) -> None:
    """Create the MediaPipe Pose and Hands solutions when MediaPipe is usable.

    The two solutions are initialised independently: a failure in one is
    printed and leaves the corresponding attribute untouched without
    preventing the other from loading. ``self.loaded`` is set to ``True``
    in every case, and a one-line summary of the active mode is printed.
    """
    if HAS_MEDIAPIPE:
        pose_options = dict(
            static_image_mode=True,
            model_complexity=1,
            min_detection_confidence=0.5,
        )
        try:
            self.pose = mp.solutions.pose.Pose(**pose_options)
        except Exception as e:
            print(f"[PostureDetector] Pose init error: {e}")

        hands_options = dict(
            static_image_mode=True,
            max_num_hands=2,
            min_detection_confidence=0.5,
        )
        try:
            self.hands = mp.solutions.hands.Hands(**hands_options)
        except Exception as e:
            print(f"[PostureDetector] Hands init error: {e}")

    self.loaded = True

    # Summarise which real detectors are available; with none, the detector
    # runs purely on the default (simulated) features.
    handles = (("pose", self.pose), ("hands", self.hands))
    active = [name for name, handle in handles if handle]
    mode = "+".join(active) or "heuristic-simulation"
    print(f"[PostureDetector] Loaded ({mode})")
def detect(
    self, image_bytes: bytes, cultural_region: CulturalRegion = CulturalRegion.UNIVERSAL
) -> EmotionDetectionResult:
    """Estimate emotions from the posture and gestures visible in an image.

    Args:
        image_bytes: Encoded image data handed to ``_extract_features``.
        cultural_region: Key looked up in ``CULTURAL_ADJUSTMENT``; the
            multiplier found there (1.0 when the region has no entry) is
            applied to every emotion score except NEUTRAL.

    Returns:
        An ``EmotionDetectionResult`` holding the dominant emotion, one
        ``EmotionScore`` per entry of ``EMOTION_LABELS``, the overall
        confidence and the elapsed processing time in milliseconds.
    """
    # perf_counter() is monotonic, so the elapsed time cannot be distorted
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()

    features = self._extract_features(image_bytes)
    raw_scores = self._features_to_emotions(features)

    # Cultural adjustment (NEUTRAL is deliberately left unscaled).
    adj = CULTURAL_ADJUSTMENT.get(cultural_region, 1.0)
    for label in raw_scores:
        if label != EmotionLabel.NEUTRAL:
            raw_scores[label] *= adj

    # Normalize to a probability-like distribution; skipped when every
    # score is zero to avoid dividing by zero.
    total = sum(raw_scores.values())
    if total > 0:
        raw_scores = {k: v / total for k, v in raw_scores.items()}

    dominant = max(raw_scores, key=raw_scores.get)

    # Clamp once so the per-label scores and the overall result always
    # report the same confidence value.
    confidence = min(
        raw_scores[dominant] * features.get("detection_confidence", 0.7), 1.0
    )

    scores = [
        EmotionScore(label=label, score=raw_scores.get(label, 0.0), confidence=confidence)
        for label in EMOTION_LABELS
    ]
    return EmotionDetectionResult(
        dominant=dominant,
        dominant_score=raw_scores[dominant],
        scores=scores,
        modality="posture/gesture",
        confidence=confidence,
        processing_time_ms=(time.perf_counter() - start) * 1000,
        cultural_region=cultural_region,
    )
def _extract_features(self, image_bytes: bytes) -> dict:
"""Extract posture + gesture features from image using MediaPipe."""
if (self.pose or self.hands) and HAS_PIL:
try:
import cv2
import numpy as np
nparr = np.frombuffer(image_bytes, np.uint8)
img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
if img is not None:
rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# Posture features from Pose landmarks
features = {}
if self.pose:
pose_results = self.pose.process(rgb)
if pose_results.pose_landmarks:
features = self._landmarks_to_features(pose_results.pose_landmarks, img.shape)
# Gesture features from Hand landmarks
gesture_features = {"fist_clenching": 0.0, "open_palms": 0.0,
"fidgeting": 0.0, "pointing": 0.0}
if self.hands:
hand_results = self.hands.process(rgb)
if hand_results.multi_hand_landmarks:
gesture_features = self._hand_gestures(hand_results.multi_hand_landmarks)
if features:
features.update(gesture_features)
return features
elif gesture_features.get("fist_clenching", 0) > 0 or gesture_features.get("open_palms", 0) > 0:
# Have hand data but no pose — build minimal features
base = self._default_features()
base.update(gesture_features)
base["detection_confidence"] = 0.5
return base
except Exception as e:
print(f"[PostureDetector] Feature extraction error: {e}")
# Simulation fallback with neutral-ish features (not random)
return self._default_features()
def _default_features(self) -> dict:
"""Return neutral default features when detection fails."""
return {
"shoulder_slump": 0.2,
"shoulder_elevation": 0.5,
"head_drop": 0.15,
"head_tilt": 0.1,
"arm_openness": 0.6,
"arm_crossing": 0.1,
"hand_face_proximity": 0.15,
"body_tension": 0.35,
"body_lean_forward": 0.2,
"movement_energy": 0.3,
"overall_openness": 0.6,
"fist_clenching": 0.0,
"open_palms": 0.0,
"fidgeting": 0.0,
"pointing": 0.0,
"detection_confidence": 0.4,
}
def _hand_gestures(self, hand_landmarks_list) -> dict:
"""Extract gesture features from MediaPipe Hand landmarks.
Gestures detected:
- Fist clenching: all fingers curled (anger, frustration)
- Open palms: all fingers extended (openness, calm)
- Fidgeting: rapid small movements (anxiety)
- Pointing: index extended, others curled (dominance, anger)
"""
fist_score = 0.0
open_score = 0.0
point_score = 0.0
n_hands = len(hand_landmarks_list)
for hand_lm in hand_landmarks_list:
lm = hand_lm.landmark
# Finger tip indices: thumb=4, index=8, middle=12, ring=16, pinky=20
# Finger MCP indices: thumb=2, index=5, middle=9, ring=13, pinky=17
# Check if fingers are curled (tip below MCP in y)
fingers_curled = 0
fingers_extended = 0
# Index finger
if lm[8].y > lm[6].y: # tip below PIP
fingers_curled += 1
else:
fingers_extended += 1
# Middle finger
if lm[12].y > lm[10].y:
fingers_curled += 1
else:
fingers_extended += 1
# Ring finger
if lm[16].y > lm[14].y:
fingers_curled += 1
else:
fingers_extended += 1
# Pinky
if lm[20].y > lm[18].y:
fingers_curled += 1
else:
fingers_extended += 1
# Fist: all 4 fingers curled
if fingers_curled >= 4:
fist_score += 1.0
elif fingers_curled >= 3:
fist_score += 0.5
# Open palm: all 4 fingers extended
if fingers_extended >= 4:
open_score += 1.0
elif fingers_extended >= 3:
open_score += 0.5
# Pointing: only index extended
if lm[8].y < lm[6].y and fingers_curled >= 3:
point_score += 1.0
# Normalize by number of hands
if n_hands > 0:
fist_score = min(1.0, fist_score / n_hands)
open_score = min(1.0, open_score / n_hands)
point_score = min(1.0, point_score / n_hands)
return {
"fist_clenching": float(fist_score),
"open_palms": float(open_score),
"fidgeting": 0.0, # requires temporal tracking (future)
"pointing": float(point_score),
}
def _landmarks_to_features(self, landmarks, img_shape) -> dict:
"""Convert MediaPipe pose landmarks to posture features."""
lm = landmarks.landmark
h, w = img_shape[:2]
def pt(idx):
return np.array([lm[idx].x * w, lm[idx].y * h, lm[idx].z * w])
# Key landmarks
l_shoulder = pt(11)
r_shoulder = pt(12)
l_hip = pt(23)
r_hip = pt(24)
l_elbow = pt(13)
r_elbow = pt(14)
l_wrist = pt(15)
r_wrist = pt(16)
nose = pt(0)
l_ear = pt(7)
r_ear = pt(8)
# Shoulder analysis
shoulder_center = (l_shoulder + r_shoulder) / 2
hip_center = (l_hip + r_hip) / 2
shoulder_width = np.linalg.norm(l_shoulder[:2] - r_shoulder[:2])
torso_height = np.linalg.norm(shoulder_center[:2] - hip_center[:2])
# Shoulder slump: shoulders dropping forward (z-depth)
shoulder_slump = max(0, (l_shoulder[2] + r_shoulder[2]) / 2) / (w * 0.1 + 1e-6)
shoulder_slump = min(shoulder_slump, 1.0)
# Shoulder elevation relative to ears
ear_y = (l_ear[1] + r_ear[1]) / 2
shoulder_elevation = 1.0 - min(1.0, abs(shoulder_center[1] - ear_y) / (torso_height + 1e-6))
# Head drop: nose below shoulder line
head_drop = max(0, nose[1] - shoulder_center[1]) / (torso_height * 0.3 + 1e-6)
head_drop = min(head_drop, 1.0)
# Head tilt: ear height difference
head_tilt = abs(l_ear[1] - r_ear[1]) / (shoulder_width * 0.3 + 1e-6)
head_tilt = min(head_tilt, 1.0)
# Arm openness: elbows distance relative to shoulder width
elbow_dist = np.linalg.norm(l_elbow[:2] - r_elbow[:2])
arm_openness = min(1.0, elbow_dist / (shoulder_width * 2.0 + 1e-6))
# Arm crossing: wrists close to opposite shoulders
l_cross = np.linalg.norm(l_wrist[:2] - r_shoulder[:2]) / (shoulder_width + 1e-6)
r_cross = np.linalg.norm(r_wrist[:2] - l_shoulder[:2]) / (shoulder_width + 1e-6)
arm_crossing = max(0, 1.0 - min(l_cross, r_cross))
# Hand-to-face proximity
face_center = nose[:2]
l_hand_face = np.linalg.norm(l_wrist[:2] - face_center) / (torso_height + 1e-6)
r_hand_face = np.linalg.norm(r_wrist[:2] - face_center) / (torso_height + 1e-6)
hand_face_proximity = max(0, 1.0 - min(l_hand_face, r_hand_face))
# Body tension: shoulder elevation + arm tightness
body_tension = (shoulder_elevation * 0.5 + (1.0 - arm_openness) * 0.5)
# Forward lean
body_lean_forward = max(0, shoulder_center[2] - hip_center[2]) / (w * 0.05 + 1e-6)
body_lean_forward = min(body_lean_forward, 1.0)
# Movement energy (approximated from landmark visibility/spread)
wrist_spread = np.linalg.norm(l_wrist[:2] - r_wrist[:2]) / (shoulder_width * 3.0 + 1e-6)
movement_energy = min(1.0, wrist_spread)
# Overall openness
overall_openness = (arm_openness * 0.4 + (1.0 - arm_crossing) * 0.3 + (1.0 - body_tension) * 0.3)
# Detection confidence from landmark visibility
avg_vis = np.mean([lm[i].visibility for i in [0, 7, 8, 11, 12, 13, 14, 15, 16, 23, 24]])
return {
"shoulder_slump": float(shoulder_slump),
"shoulder_elevation": float(shoulder_elevation),
"head_drop": float(head_drop),
"head_tilt": float(head_tilt),
"arm_openness": float(arm_openness),
"arm_crossing": float(arm_crossing),
"hand_face_proximity": float(hand_face_proximity),
"body_tension": float(body_tension),
"body_lean_forward": float(body_lean_forward),
"movement_energy": float(movement_energy),
"overall_openness": float(overall_openness),
# Gesture features (populated by _hand_gestures if hands detected)
"fist_clenching": 0.0,
"open_palms": float(arm_openness * 0.5), # approximate from arm openness
"fidgeting": 0.0,
"pointing": 0.0,
"detection_confidence": float(avg_vis),
}
def _features_to_emotions(self, f: dict) -> dict:
    """Map posture + gesture features to unnormalized emotion scores.

    Each emotion is a weighted sum of features (or their complements);
    the weights of every emotion add up to 1.0. NEUTRAL is derived from
    how little the eight scored emotions differ from one another.

    Args:
        f: Feature dict. The posture keys are required; the gesture keys
            (``fist_clenching``, ``open_palms``, ``fidgeting``,
            ``pointing``) default to 0.0 when absent.

    Returns:
        Dict with an entry for every label in ``EMOTION_LABELS`` plus the
        emotions scored here.

    Raises:
        KeyError: If a required posture feature is missing from ``f``.
    """
    scores = {label: 0.0 for label in EMOTION_LABELS}

    # Gesture features (default 0 if not available)
    fist = f.get("fist_clenching", 0.0)
    palms = f.get("open_palms", 0.0)
    fidget = f.get("fidgeting", 0.0)
    point = f.get("pointing", 0.0)

    # Collected separately so the NEUTRAL spread below only sees emotions
    # that were actually scored, never the 0.0 placeholders in ``scores``.
    signals = {}

    # Sadness: slumped shoulders, head drop, closed posture, low energy
    signals[EmotionLabel.SADNESS] = (
        f["shoulder_slump"] * 0.22
        + f["head_drop"] * 0.22
        + (1.0 - f["arm_openness"]) * 0.18
        + (1.0 - f["movement_energy"]) * 0.13
        + f["arm_crossing"] * 0.13
        + (1.0 - palms) * 0.06  # closed hands
        + fidget * 0.06
    )

    # Joy: open posture, open palms, high energy, no tension
    signals[EmotionLabel.JOY] = (
        f["arm_openness"] * 0.22
        + f["overall_openness"] * 0.20
        + f["movement_energy"] * 0.18
        + (1.0 - f["shoulder_slump"]) * 0.12
        + (1.0 - f["body_tension"]) * 0.10
        + palms * 0.10  # open palms = positive
        + (1.0 - fist) * 0.08
    )

    # Fear: tension, shoulders elevated, arms close, self-touching, fidgeting
    signals[EmotionLabel.FEAR] = (
        f["body_tension"] * 0.25
        + f["shoulder_elevation"] * 0.15
        + (1.0 - f["arm_openness"]) * 0.15
        + f["hand_face_proximity"] * 0.15  # self-touching gesture
        + (1.0 - f["overall_openness"]) * 0.12
        + fidget * 0.10  # fidgeting gesture
        + (1.0 - palms) * 0.08
    )

    # Surprise: elevated shoulders, lean forward, head tilt, open palms
    signals[EmotionLabel.SURPRISE] = (
        f["shoulder_elevation"] * 0.22
        + f["body_lean_forward"] * 0.20
        + f["head_tilt"] * 0.18
        + f["movement_energy"] * 0.15
        + f["arm_openness"] * 0.12
        + palms * 0.08  # hands open in surprise
        + point * 0.05
    )

    # Disgust: arm crossing, lean back, tension, hand-to-face (covering nose/mouth)
    signals[EmotionLabel.DISGUST] = (
        f["arm_crossing"] * 0.25
        + (1.0 - f["body_lean_forward"]) * 0.18
        + f["body_tension"] * 0.18
        + (1.0 - f["arm_openness"]) * 0.15
        + f["hand_face_proximity"] * 0.15  # covering face gesture
        + (1.0 - palms) * 0.09
    )

    # Love: open, relaxed, leaning forward, open palms, low tension
    signals[EmotionLabel.LOVE] = (
        f["body_lean_forward"] * 0.20
        + f["overall_openness"] * 0.20
        + f["arm_openness"] * 0.18
        + (1.0 - f["body_tension"]) * 0.15
        + (1.0 - f["arm_crossing"]) * 0.10
        + palms * 0.10  # open palms = warmth
        + (1.0 - fist) * 0.07
    )

    # Calm: relaxed, neutral posture, low energy, open palms, no fidgeting
    signals[EmotionLabel.CALM] = (
        (1.0 - f["body_tension"]) * 0.25
        + (1.0 - f["movement_energy"]) * 0.18
        + f["overall_openness"] * 0.15
        + (1.0 - f["shoulder_elevation"]) * 0.12
        + (1.0 - f["hand_face_proximity"]) * 0.10
        + palms * 0.10  # relaxed open hands
        + (1.0 - fidget) * 0.10
    )

    # Anger: fist clenching, high tension, high energy, forward lean, pointing
    signals[EmotionLabel.ANGER] = (
        fist * 0.20  # fist clenching = key anger gesture
        + f["body_tension"] * 0.18
        + f["movement_energy"] * 0.18
        + (1.0 - f["arm_openness"]) * 0.12
        + f["body_lean_forward"] * 0.12
        + point * 0.12  # pointing gesture = aggression
        + (1.0 - f["overall_openness"]) * 0.08
    )

    scores.update(signals)

    # Neutral: absence of strong signals, measured as a small spread
    # between the strongest and weakest scored emotion.
    extremes = max(signals.values()) - min(signals.values())
    scores[EmotionLabel.NEUTRAL] = max(0.0, 1.0 - extremes * 2.0) * 0.5
    return scores