"""
Multimodal Emotion Recognition v4 — Gradio LIVE UI (always-on capture)
Continuously streams webcam + mic in background threads, shows a live mirror
of the user's own video, and runs inference on a rolling 4-second clip every
few seconds — predict() is unchanged from the one-shot version.
"""
import json
import os
import subprocess
import tempfile
import threading
import time
from collections import deque
# Load .env BEFORE any HuggingFace imports (model.py needs HF_TOKEN)
from dotenv import load_dotenv
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
os.environ["HF_TOKEN"] = HF_TOKEN
import cv2
import numpy as np
import soundfile as sf
import sounddevice as sd
import torch
import gradio as gr
import mediapipe as mp
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from PIL import Image
import torchvision.transforms.functional as TF
from model import SERModel, FERModel, MultimodalEmotionModel
from groq import Groq
import collections
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum
import concurrent.futures
import re
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
groq_client = Groq(api_key=GROQ_API_KEY)
GROQ_MODEL = "llama-3.1-8b-instant"
# =========================================================
# EMOTIONAL STATE TRACKER + CONTEXT ENGINE + LLM RESPONSE
# =========================================================
@dataclass
class EmotionSnapshot:
label : str
confidence : float
gate : float
timestamp : float = field(default_factory=time.time)
class EmotionalStateTracker:
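    """Rolling window of recent EmotionSnapshots. Exposes the dominant emotion,
    stability, intensity and dominant modality of the last few predictions, and
    decides when a new LLM response is warranted (should_respond / mark_responded)."""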
def __init__(self, window: int = 3):
self.history : collections.deque = collections.deque(maxlen=window)
self.last_response_emotion : Optional[str] = None
self.last_response_time : float = 0.0
self.previous_emotion : Optional[str] = None
def update(self, label: str, confidence: float, gate: float):
if self.history and self.history[-1].label != label:
self.previous_emotion = self.history[-1].label
self.history.append(EmotionSnapshot(label, confidence, gate))
@property
def dominant_emotion(self) -> Optional[str]:
if not self.history:
return None
return collections.Counter(s.label for s in self.history).most_common(1)[0][0]
@property
def is_stable(self) -> bool:
if len(self.history) < 2:
return False
return len(set(s.label for s in list(self.history)[-2:])) == 1
@property
def just_shifted(self) -> bool:
if len(self.history) < 2:
return False
h = list(self.history)
return h[-2].label != h[-1].label
@property
def mean_confidence(self) -> float:
if not self.history:
return 0.0
return sum(s.confidence for s in self.history) / len(self.history)
@property
def intensity(self) -> str:
c = self.mean_confidence
if c > 0.75: return "high"
if c > 0.50: return "medium"
return "low"
@property
def dominant_modality(self) -> str:
if not self.history:
return "balanced"
avg = sum(s.gate for s in self.history) / len(self.history)
if avg > 0.65: return "voice" # Only call it "voice" if it's very high
if avg < 0.35: return "face" # Only call it "face" if it's very low
return "balanced (trusting both)"
def should_respond(self, cooldown_secs: float = 8.0) -> bool:
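        """True when a reply is warranted: the emotion just shifted (and more than
        3 s have passed since the last reply), or the state is stable and either
        the cooldown has elapsed or the dominant emotion differs from the last reply."""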
now = time.time()
cooldown = (now - self.last_response_time) > cooldown_secs
shifted = self.dominant_emotion != self.last_response_emotion
if self.just_shifted and (now - self.last_response_time) > 3.0:
return True
return self.is_stable and (cooldown or shifted)
def mark_responded(self):
self.last_response_time = time.time()
self.last_response_emotion = self.dominant_emotion
class ResponseType(Enum):
NONE = "none"
ACKNOWLEDGE = "acknowledge"
EMPATHISE = "empathise"
ENCOURAGE = "encourage"
CHECK_IN = "check_in"
SHIFT_NOTED = "shift_noted"
def decide_response_type(tracker: EmotionalStateTracker) -> ResponseType:
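    """Map the tracker state to a ResponseType: acknowledge sudden shifts first,
    stay silent when should_respond() is False, check in after a full window of
    neutral, empathise with high-intensity anger/sadness (otherwise acknowledge),
    and encourage happiness."""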
if tracker.just_shifted and len(tracker.history) >= 2:
if tracker.should_respond():
return ResponseType.SHIFT_NOTED
if not tracker.should_respond():
return ResponseType.NONE
emotion = tracker.dominant_emotion
if emotion == "neutral":
if len(tracker.history) == tracker.history.maxlen:
if all(s.label == "neutral" for s in tracker.history):
return ResponseType.CHECK_IN
if emotion in ("angry", "sad"):
return ResponseType.EMPATHISE if tracker.intensity == "high" \
else ResponseType.ACKNOWLEDGE
if emotion == "happy":
return ResponseType.ENCOURAGE
return ResponseType.ACKNOWLEDGE
_conversation_history : list[dict] = []
_pending_response : Optional[concurrent.futures.Future] = None
_groq_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
MAX_HISTORY = 6
def generate_response(tracker: EmotionalStateTracker,
response_type: ResponseType) -> Optional[str]:
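    """Ask the Groq LLM for a short empathetic line for the current tracker state.
    Returns None for ResponseType.NONE or on API failure; otherwise records the
    exchange in _conversation_history and calls tracker.mark_responded()."""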
if response_type == ResponseType.NONE:
return None
emotion = tracker.dominant_emotion
intensity = tracker.intensity
modality = tracker.dominant_modality
shifted = tracker.just_shifted
system_prompt = """\
You are a calm, empathetic emotion-aware assistant embedded in a real-time \
emotion recognition system.
Your role is to acknowledge the user's detected emotional state in a way \
that feels natural and supportive — like a thoughtful colleague, not a therapist.
Rules you must always follow:
- Keep responses to 1-2 sentences maximum. Never longer.
- Never say "I detected" or "the system shows" — speak directly to the user.
- Never repeat a response you have already given in this conversation.
- Do not ask multiple questions in one response. One question maximum.
- Match your tone to the intensity: low intensity = gentle observation, \
high intensity = more direct empathy.
- If the emotion is neutral, keep it light — do not over-interpret calmness.
- Never use bullet points, lists, or markdown formatting.
- Sound like a human, not an AI assistant.
- When the user's emotion has just shifted drastically (e.g., happy to sad), \
acknowledge the change itself — ask gently what happened rather than just \
responding to the new emotion in isolation."""
shift_context = ""
if response_type == ResponseType.SHIFT_NOTED and tracker.previous_emotion:
shift_context = f"\n- Shifted FROM: {tracker.previous_emotion} \u2192 TO: {emotion}"
shift_context += "\n- IMPORTANT: Acknowledge this sudden change. Ask what happened."
context_message = f"""\
Current emotional context:
- Detected emotion: {emotion}
- Intensity: {intensity}
- Detected primarily through: {modality}
- Emotion just shifted from previous state: {shifted}{shift_context}
- Response type needed: {response_type.value}
- Recent emotion history: {[s.label for s in tracker.history]}
Generate a single natural response appropriate for this context."""
messages = (
[{"role": "system", "content": system_prompt}]
+ _conversation_history[-MAX_HISTORY:]
+ [{"role": "user", "content": context_message}]
)
try:
completion = groq_client.chat.completions.create(
model=GROQ_MODEL,
messages=messages,
temperature=0.7,
max_tokens=80,
top_p=0.9,
)
response = completion.choices[0].message.content.strip()
except Exception as e:
print(f"[Groq] API call failed: {e}")
return None
_conversation_history.append({"role": "user", "content": context_message})
_conversation_history.append({"role": "assistant", "content": response})
tracker.mark_responded()
return response
tracker = EmotionalStateTracker(window=3)
def _parse_gate_value(gate_desc: str) -> float:
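    """Extract the first float from a gate description such as "0.512 · leans audio"; defaults to 0.5."""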
match = re.search(r"([0-9]+\.[0-9]+)", gate_desc)
return float(match.group(1)) if match else 0.5
# =========================================================
# CONFIG
# =========================================================
MODEL_PATH = "multimodal_final_v3.pt"
SAMPLE_RATE = 48000
MAX_DURATION = 3.5
MAX_SAMPLES = int(SAMPLE_RATE * MAX_DURATION)
NUM_FRAMES = 32
IMG_SIZE = 224
VISUAL_DIM = 768
EMOTIONS = ["angry", "happy", "neutral", "sad"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# --- vit-face-expression expects 0.5 mean/std normalization (not ImageNet stats) ---
_NORM_MEAN = [0.5, 0.5, 0.5]
_NORM_STD = [0.5, 0.5, 0.5]
# --- LIVE CAPTURE ---
CLIP_SECONDS = 3.0
CAM_FPS = 20
MIRROR_FPS = 10
TICK_INTERVAL = 5.0
HISTORY_SECONDS = 60
WEBCAM_INDEX = 0
# ── UI: pastel green accent ──────────────────────────────
EMOTION_COLORS = {
"angry": "#e07070",
"happy": "#a8c5a0",
"neutral": "#8a857a",
"sad": "#6b9bd1",
}
# =========================================================
# AUDIO PIPELINE
# =========================================================
def _resample_linear(wav, orig_sr, target_sr):
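    """Resample `wav` from orig_sr to target_sr by simple linear interpolation (np.interp)."""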
if orig_sr == target_sr or len(wav) < 2:
return wav.astype(np.float32)
target_len = int(round(len(wav) * target_sr / orig_sr))
x_old = np.linspace(0.0, 1.0, num=len(wav), endpoint=False)
x_new = np.linspace(0.0, 1.0, num=target_len, endpoint=False)
return np.interp(x_new, x_old, wav).astype(np.float32)
def load_wav_from_file(path, sr=SAMPLE_RATE):
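    """Decode `path` to mono float32 at `sr`: try soundfile first, then fall back
    to ffmpeg for containers soundfile cannot read (e.g. the mp4 clips)."""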
try:
wav, orig_sr = sf.read(path, dtype="float32", always_2d=False)
if wav.ndim > 1:
wav = wav.mean(axis=1)
return _resample_linear(wav, orig_sr, sr)
except Exception:
pass
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp_path = tmp.name
try:
        subprocess.run(
            ["ffmpeg", "-y", "-loglevel", "error",
             "-i", path, "-vn", "-ac", "1", "-ar", str(sr),
             "-f", "wav", "-acodec", "pcm_s16le", tmp_path],
            check=True, capture_output=True,
        )
wav, _ = sf.read(tmp_path, dtype="float32", always_2d=False)
if wav.ndim > 1:
wav = wav.mean(axis=1)
return wav.astype(np.float32)
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
def vad_trim(wav, thr=0.05, frame_ms=20):
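    """Crude energy-based VAD: trim leading/trailing frames whose RMS falls below thr * the peak frame RMS."""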
hop = int(SAMPLE_RATE * frame_ms / 1000)
if len(wav) < hop:
return wav
rms = np.array([
np.sqrt(np.mean(wav[i:i + hop] ** 2))
for i in range(0, len(wav), hop)
])
if len(rms) == 0 or rms.max() <= 1e-8:
return wav
mask = rms > thr * rms.max()
if mask.sum() == 0:
return wav
idx = np.where(mask)[0]
return wav[idx[0] * hop : min(len(wav), idx[-1] * hop + hop)]
def normalize_amplitude(wav, peak=0.95):
mx = np.abs(wav).max()
return wav * (peak / mx) if mx > 1e-6 else wav
def segment_and_pad(wav, length=MAX_SAMPLES):
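    """Center-crop to `length` samples if longer, otherwise zero-pad at the end."""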
if len(wav) >= length:
start = (len(wav) - length) // 2
return wav[start:start + length]
return np.pad(wav, (0, length - len(wav)))
def preprocess_audio(path):
try:
wav = load_wav_from_file(path)
wav = vad_trim(wav)
wav = normalize_amplitude(wav)
return segment_and_pad(wav)
except Exception as e:
print(f"[audio] failed: {e}")
return None
# =========================================================
# FACE / CLIP PIPELINE
# =========================================================
from mediapipe.tasks import python as mp_python
from mediapipe.tasks.python import vision as mp_vision
_tflite_path = os.path.join(os.path.dirname(__file__), 'blaze_face_short_range.tflite')
_base_options = mp_python.BaseOptions(model_asset_path=_tflite_path)
_options = mp_vision.FaceDetectorOptions(base_options=_base_options, min_detection_confidence=0.3)
_face_detector = mp_vision.FaceDetector.create_from_options(_options)
def _clip_eval_transform(frames):
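    """Resize, tensorize and normalize a list of PIL face crops; returns a (T, C, H, W) tensor."""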
processed = []
for f in frames:
f = TF.resize(f, (IMG_SIZE, IMG_SIZE),
interpolation=TF.InterpolationMode.BICUBIC)
t = TF.to_tensor(f)
        t = TF.normalize(t, _NORM_MEAN, _NORM_STD)
processed.append(t)
return torch.stack(processed)
def extract_face_crop(frame_bgr, margin_frac=0.1, min_confidence=0.3):
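    """Detect a face with MediaPipe BlazeFace, expand the box by margin_frac and
    return (PIL crop, detection score), or (None, 0.0) if no usable face is found."""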
rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
res = _face_detector.detect(mp_image)
if not res.detections:
return None, 0.0
det = res.detections[0]
score = det.categories[0].score if det.categories else 0.0
bbox = det.bounding_box
ih, iw = frame_bgr.shape[:2]
x = bbox.origin_x
y = bbox.origin_y
w = bbox.width
h = bbox.height
mx = int(w * margin_frac)
my = int(h * margin_frac)
x1, y1 = max(0, x - mx), max(0, y - my)
x2, y2 = min(iw, x + w + mx), min(ih, y + h + my)
if (x2 - x1) < 40 or (y2 - y1) < 40:
return None, 0.0
return Image.fromarray(rgb[y1:y2, x1:x2]), score
def read_all_frames(video_path):
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
return []
frames = []
while True:
ok, frame = cap.read()
if not ok:
break
frames.append(frame)
cap.release()
return frames
def build_clip_from_video(video_path, num_frames=NUM_FRAMES):
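    """Uniformly sample num_frames frames, crop the detected faces and return
    (faces, detection scores, n_detected). Bails out with empty lists if fewer
    than half the sampled frames contain a face; pads by repeating the last face."""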
frames_bgr = read_all_frames(video_path)
if len(frames_bgr) == 0:
        return [], [], 0  # nothing decoded from the clip
total = len(frames_bgr)
positions = np.linspace(0.1, 0.9, num_frames)
sampled_idx = [min(int(total * p), total - 1) for p in positions]
faces = []
    scores = []  # face-detection confidence for each kept frame
for idx in sampled_idx:
face, score = extract_face_crop(frames_bgr[idx])
if face is not None:
faces.append(face)
scores.append(score)
n_detected = len(faces)
if n_detected < num_frames // 2:
return [], [], n_detected
# Pad if we have fewer than num_frames
while len(faces) < num_frames:
faces.append(faces[-1])
scores.append(scores[-1])
return faces[:num_frames], scores[:num_frames], n_detected
# =========================================================
# MODEL LOAD
# =========================================================
def load_model():
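    """Instantiate the SER/FER backbones, restore the fused checkpoint and its
    config, and return (model, emotion labels, stored results, num_frames)."""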
ser = SERModel(n_class=4, load_pretrained=False)
fer = FERModel(
n_class=4, n_frames=NUM_FRAMES,
embed_dim=VISUAL_DIM, img_size=IMG_SIZE,
load_pretrained=False,
)
ckpt = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
cfg = ckpt.get("config", {})
net = MultimodalEmotionModel(
ser=ser, fer=fer,
fusion_dim=cfg.get("fusion_dim", 512),
n_classes=cfg.get("n_classes", 4),
visual_dim=cfg.get("visual_dim", VISUAL_DIM),
dropout=cfg.get("dropout", 0.3),
        gate_hidden_mult=2,
)
net.load_state_dict(ckpt["model_state"], strict=False)
net.to(DEVICE).eval()
labels = cfg.get("emotion_labels", EMOTIONS)
n_frames = cfg.get("num_frames", NUM_FRAMES)
return net, labels, ckpt.get("results", {}), n_frames
print(f"[model] loading on {DEVICE}...")
MODEL, EMOTION_LABELS, RESULTS, CKPT_NUM_FRAMES = load_model()
print(f"[model] ready. labels = {EMOTION_LABELS}, n_frames = {CKPT_NUM_FRAMES}")
# =========================================================
# predict()
# =========================================================
def predict(video_path):
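    """One-shot multimodal inference on a video file. Returns (per-class probabilities,
    predicted label, gate description, status string); on failure the probabilities are
    zeroed and the remaining fields explain what went wrong."""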
empty = {label: 0.0 for label in EMOTION_LABELS}
# -----------------------------
# Safety checks
# -----------------------------
if video_path is None or not os.path.exists(video_path):
return empty, "—", "awaiting input", ""
try:
# -----------------------------------------
# 1. Feature extraction
# -----------------------------------------
faces, scores, n_detected = build_clip_from_video(
video_path,
num_frames=CKPT_NUM_FRAMES
)
audio = preprocess_audio(video_path)
if faces is None or len(faces) == 0:
return (
empty,
"—",
"no face detected",
f"only {n_detected}/{CKPT_NUM_FRAMES} frames had a face"
)
if audio is None:
return empty, "—", "audio extraction failed", ""
# -----------------------------------------
# 2. Prepare tensors
# -----------------------------------------
wav_tensor = (
torch.from_numpy(audio)
.float()
.unsqueeze(0)
.to(DEVICE)
)
clip_tensor = (
_clip_eval_transform(faces)
.unsqueeze(0)
.to(DEVICE)
)
scores_tensor = (
torch.tensor(scores, dtype=torch.float32)
.unsqueeze(0)
.to(DEVICE)
)
# -----------------------------------------
# 3. Inference
# -----------------------------------------
with torch.no_grad():
logits, gate = MODEL(
wav_tensor,
clip_tensor,
img_scores=scores_tensor,
return_gate=True
)
gate_mean = float(gate.mean().item())
# =================================================
# BIAS CORRECTION ON LOGITS (NOT PROBABILITIES)
# tune these empirically if needed
# =================================================
        label_idx = {lab: i for i, lab in enumerate(EMOTION_LABELS)}
if "sad" in label_idx:
logits[:, label_idx["sad"]] -= 0.30
if "neutral" in label_idx:
logits[:, label_idx["neutral"]] += 0.15
if "happy" in label_idx:
logits[:, label_idx["happy"]] += 0.15
# -----------------------------------------
# Optional temperature calibration
# >1 smoother, <1 sharper
# -----------------------------------------
TEMPERATURE = 1.2
logits = logits / TEMPERATURE
probs = torch.softmax(
logits,
dim=1
).cpu().numpy()[0]
# -----------------------------------------
# 4. Map probabilities
# -----------------------------------------
probs_dict = {
label: float(probs[i])
for i, label in enumerate(EMOTION_LABELS)
}
# -----------------------------------------
# 5. Confidence-margin stabilization
# prevents noisy label flipping
# -----------------------------------------
sorted_preds = sorted(
probs_dict.items(),
key=lambda x: x[1],
reverse=True
)
top_label, top_prob = sorted_preds[0]
second_prob = sorted_preds[1][1]
confidence_gap = top_prob - second_prob
# if prediction is weak/ambiguous -> neutral fallback
if confidence_gap < 0.08 and "neutral" in probs_dict:
pred_label = "neutral"
else:
pred_label = top_label
# -----------------------------------------
# 6. Gate interpretation
# assumes:
# fused = gate*audio + (1-gate)*visual
# -----------------------------------------
if gate_mean > 0.65:
gate_desc = f"{gate_mean:.3f} · leans audio"
elif gate_mean < 0.35:
gate_desc = f"{gate_mean:.3f} · leans visual"
else:
gate_desc = (
f"{gate_mean:.3f} · balanced (trusting both)"
)
# -----------------------------------------
# 7. Status
# -----------------------------------------
status = (
f"{n_detected}/{CKPT_NUM_FRAMES} frames detected "
f"· ViT + WavLM Gated Fusion"
)
return (
probs_dict,
pred_label,
gate_desc,
status
)
except Exception as e:
return (
empty,
"—",
"inference failed",
str(e)
)
# =========================================================
# CONTINUOUS BACKGROUND CAPTURE
# =========================================================
_cam_lock = threading.Lock()
_aud_lock = threading.Lock()
FRAME_BUFFER = deque(maxlen=int((CLIP_SECONDS + 1.0) * CAM_FPS))
AUDIO_BUFFER_MAXLEN = int((CLIP_SECONDS + 1.0) * SAMPLE_RATE)
AUDIO_BUFFER = deque(maxlen=AUDIO_BUFFER_MAXLEN)
_capture_running = threading.Event()
_cam_thread = {"t": None}
_aud_stream = {"s": None}
_cam_frame_shape = {"wh": (640, 480)}
def _cam_loop():
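    """Background thread: read webcam frames at ~CAM_FPS and append (timestamp, frame) pairs to FRAME_BUFFER until capture stops."""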
cap = cv2.VideoCapture(WEBCAM_INDEX)
if not cap.isOpened():
print("[cam] could not open webcam")
_capture_running.clear()
return
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 640
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 480
_cam_frame_shape["wh"] = (w, h)
interval = 1.0 / CAM_FPS
next_t = time.time()
while _capture_running.is_set():
ok, frame = cap.read()
if not ok:
time.sleep(0.01)
continue
ts = time.time()
with _cam_lock:
FRAME_BUFFER.append((ts, frame))
next_t += interval
sleep_for = next_t - time.time()
if sleep_for > 0:
time.sleep(sleep_for)
else:
next_t = time.time()
cap.release()
def _audio_callback(indata, frames, time_info, status):
mono = indata[:, 0] if indata.ndim > 1 else indata
with _aud_lock:
AUDIO_BUFFER.extend(mono.tolist())
def start_capture_threads():
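    """Start the webcam reader thread and the sounddevice input stream (no-op if capture is already running)."""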
if _capture_running.is_set():
return True
_capture_running.set()
t = threading.Thread(target=_cam_loop, daemon=True)
_cam_thread["t"] = t
t.start()
try:
stream = sd.InputStream(
            device=9,  # NOTE: machine-specific input device index; None would select the system default
samplerate=SAMPLE_RATE,
channels=1,
dtype="float32",
callback=_audio_callback,
blocksize=1024,
)
stream.start()
_aud_stream["s"] = stream
except Exception as e:
print(f"[audio] stream failed: {e}")
_aud_stream["s"] = None
time.sleep(0.5)
return True
def stop_capture_threads():
_capture_running.clear()
if _aud_stream["s"] is not None:
try:
_aud_stream["s"].stop()
_aud_stream["s"].close()
except Exception:
pass
_aud_stream["s"] = None
with _cam_lock:
FRAME_BUFFER.clear()
with _aud_lock:
AUDIO_BUFFER.clear()
def snapshot_clip_to_mp4(seconds=CLIP_SECONDS, fps=CAM_FPS, sr=SAMPLE_RATE):
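    """Dump the last `seconds` of buffered frames (and audio, if any) to a temporary
    mp4 muxed with ffmpeg; returns the file path, or None while the buffer is still filling."""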
with _cam_lock:
if len(FRAME_BUFFER) == 0:
return None
newest_t = FRAME_BUFFER[-1][0]
cutoff = newest_t - seconds
clip_frames = [(ts, f.copy()) for (ts, f) in FRAME_BUFFER if ts >= cutoff]
if len(clip_frames) < max(4, int(seconds * fps) // 4):
return None
with _aud_lock:
audio_samples = np.array(AUDIO_BUFFER, dtype=np.float32)
target_len = int(seconds * sr)
if len(audio_samples) >= target_len:
audio_samples = audio_samples[-target_len:]
w, h = _cam_frame_shape["wh"]
tmp_video = tempfile.NamedTemporaryFile(suffix="_v.mp4", delete=False).name
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(tmp_video, fourcc, fps, (w, h))
for _, frame in clip_frames:
writer.write(frame)
writer.release()
tmp_audio = None
if audio_samples.size > 0:
tmp_audio = tempfile.NamedTemporaryFile(suffix="_a.wav", delete=False).name
sf.write(tmp_audio, audio_samples, sr, subtype="PCM_16")
tmp_final = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
try:
if tmp_audio is not None:
            subprocess.run(
                ["ffmpeg", "-y", "-loglevel", "error",
                 "-i", tmp_video, "-i", tmp_audio,
                 "-c:v", "copy", "-c:a", "aac", "-shortest",
                 tmp_final],
                check=True, capture_output=True,
            )
else:
            subprocess.run(
                ["ffmpeg", "-y", "-loglevel", "error",
                 "-i", tmp_video, "-c:v", "copy", tmp_final],
                check=True, capture_output=True,
            )
except Exception as e:
print(f"[mux] ffmpeg failed: {e}")
if os.path.exists(tmp_final):
try: os.remove(tmp_final)
except Exception: pass
tmp_final = tmp_video
for p in (tmp_video, tmp_audio):
if p and p != tmp_final and os.path.exists(p):
try: os.remove(p)
except Exception: pass
return tmp_final
# =========================================================
# HISTORY & TIMELINE PLOT
# =========================================================
HISTORY = deque(maxlen=500)
SESSION_START = {"t": None}
def _make_timeline_plot():
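    """Render the per-emotion confidence history as a dark-themed matplotlib line chart (or an 'awaiting data' placeholder)."""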
fig, ax = plt.subplots(figsize=(10, 3.2), dpi=110)
bg = "#1a1a18"
grid = "#2d2b27"
ink = "#f5f2ea"
ink_dim = "#8a857a"
accent = "#a8c5a0"
fig.patch.set_facecolor(bg)
ax.set_facecolor(bg)
if len(HISTORY) == 0:
ax.text(0.5, 0.5, "awaiting data",
ha="center", va="center",
color=ink_dim, fontsize=14,
fontstyle="italic", family="serif",
transform=ax.transAxes)
for spine in ax.spines.values():
spine.set_visible(False)
ax.set_xticks([]); ax.set_yticks([])
plt.tight_layout()
return fig
times = np.array([h[0] for h in HISTORY])
for emo in EMOTION_LABELS:
ys = np.array([h[1].get(emo, 0.0) for h in HISTORY])
ax.plot(times, ys,
color=EMOTION_COLORS.get(emo, accent),
linewidth=2.0, marker="o", markersize=4,
label=emo)
ax.set_ylim(-0.02, 1.02)
ax.set_xlabel("seconds", color=ink_dim, fontsize=10, family="monospace")
ax.set_ylabel("confidence", color=ink_dim, fontsize=10, family="monospace")
ax.tick_params(colors=ink_dim, labelsize=9)
ax.grid(True, color=grid, linewidth=0.5, alpha=0.6)
for spine in ax.spines.values():
spine.set_color(grid); spine.set_linewidth(0.8)
leg = ax.legend(
loc="upper left", ncol=4, frameon=False, fontsize=10,
labelcolor=ink, prop={"family": "monospace"},
bbox_to_anchor=(0.0, 1.18),
)
for t in leg.get_texts():
t.set_color(ink)
xmax = float(times[-1])
xmin = max(0.0, xmax - HISTORY_SECONDS)
ax.set_xlim(xmin, max(xmax, xmin + 1))
plt.tight_layout()
return fig
# =========================================================
# UI helpers
# =========================================================
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Fraunces:ital,wght@0,300;0,400;0,600;1,400;1,600&family=JetBrains+Mono:wght@400;500&display=swap');
:root {
--bg: #0f0f0e;
--bg-card: #1a1a18;
--bg-elev: #242421;
--ink: #f5f2ea;
--ink-dim: #8a857a;
--ink-faint: #4a4740;
--accent: #a8c5a0;
--accent-hover: #bdd4b8;
--rule: #2d2b27;
}
* { box-sizing: border-box; }
html, body, .gradio-container { background: var(--bg) !important; }
.gradio-container {
max-width: 1180px !important;
margin: 0 auto !important;
font-family: 'Fraunces', serif !important;
color: var(--ink) !important;
padding: 48px 32px !important;
}
.hero {
padding: 0 0 40px 0;
border-bottom: 1px solid var(--rule);
margin-bottom: 40px;
text-align: center;
width: 100%;
}
.hero-eyebrow {
font-family: 'JetBrains Mono', monospace;
font-size: 11px; letter-spacing: 0.22em;
text-transform: uppercase; color: var(--accent);
margin-bottom: 20px;
}
.hero-title {
font-family: 'Fraunces', serif; font-weight: 300;
font-size: clamp(36px, 5vw, 64px); line-height: 1.05;
letter-spacing: -0.025em; color: var(--ink);
margin: 0 0 16px 0;
}
.hero-title em { font-style: italic; font-weight: 400; color: var(--accent); }
.hero-sub {
font-family: 'JetBrains Mono', monospace;
font-size: 13px; color: var(--ink-dim);
line-height: 1.7;
max-width: 100%;
margin: 0 auto;
}
.section-label {
font-family: 'JetBrains Mono', monospace;
font-size: 10px; letter-spacing: 0.24em;
text-transform: uppercase; color: var(--ink-dim);
margin: 0 0 14px 0;
display: flex; align-items: center; gap: 12px;
}
.section-label::before {
content: ''; width: 28px; height: 1px;
background: var(--accent); display: inline-block;
}
.primary button, button.primary, .gr-button-primary {
background: var(--accent) !important; color: var(--bg) !important;
border: none !important; border-radius: 2px !important;
font-family: 'JetBrains Mono', monospace !important;
font-size: 12px !important; font-weight: 500 !important;
letter-spacing: 0.14em !important; text-transform: uppercase !important;
padding: 14px 28px !important; transition: all 0.2s ease !important;
box-shadow: none !important;
}
.primary button:hover, button.primary:hover {
background: var(--accent-hover) !important; transform: translateY(-1px);
}
.secondary button, button.secondary {
background: transparent !important; color: var(--ink-dim) !important;
border: 1px solid var(--rule) !important; border-radius: 2px !important;
font-family: 'JetBrains Mono', monospace !important;
font-size: 11px !important; letter-spacing: 0.14em !important;
text-transform: uppercase !important; padding: 14px 24px !important;
}
.secondary button:hover, button.secondary:hover {
border-color: var(--accent) !important; color: var(--accent) !important;
}
.prediction-card {
background: var(--bg-card); border: 1px solid var(--rule);
border-left: 2px solid var(--accent);
padding: 36px 30px; border-radius: 2px;
min-height: 220px;
display: flex; flex-direction: column; justify-content: center;
}
.prediction-eyebrow {
font-family: 'JetBrains Mono', monospace;
font-size: 10px; letter-spacing: 0.24em;
text-transform: uppercase; color: var(--ink-dim);
margin-bottom: 14px;
}
.prediction-value {
font-family: 'Fraunces', serif; font-style: italic;
font-weight: 400; font-size: 72px; line-height: 1;
letter-spacing: -0.025em; color: var(--ink); margin: 0;
}
.prediction-placeholder {
font-family: 'Fraunces', serif; font-style: italic;
font-weight: 300; font-size: 42px;
color: var(--ink-faint); letter-spacing: -0.02em;
}
.gate-row {
display: flex; align-items: baseline; justify-content: space-between;
padding: 16px 22px; margin-top: 12px;
background: var(--bg-card); border: 1px solid var(--rule); border-radius: 2px;
font-family: 'JetBrains Mono', monospace; font-size: 12px;
}
.gate-label { color: var(--ink-dim); letter-spacing: 0.18em; text-transform: uppercase; font-size: 10px; }
.gate-value { color: var(--accent); font-weight: 500; letter-spacing: 0.04em; }
.metrics-strip {
display: flex; gap: 0;
border: 1px solid var(--rule); border-radius: 2px;
overflow: hidden; background: var(--bg-card);
margin-top: 16px;
}
.metric { flex: 1; padding: 22px 24px; border-right: 1px solid var(--rule); }
.metric:last-child { border-right: none; }
.metric-label {
font-family: 'JetBrains Mono', monospace;
font-size: 10px; letter-spacing: 0.22em;
text-transform: uppercase; color: var(--ink-dim); margin-bottom: 8px;
}
.metric-value {
font-family: 'Fraunces', serif; font-style: italic;
font-weight: 400; font-size: 32px; color: var(--ink);
line-height: 1; letter-spacing: -0.02em;
}
.metric-value .unit { font-size: 14px; color: var(--ink-dim); font-style: normal; margin-left: 2px; }
.metric--accent .metric-value { color: var(--accent); }
.mirror, .mirror > div {
background: var(--bg-card) !important; border: 1px solid var(--rule) !important;
border-radius: 2px !important;
}
.mirror img { border-radius: 2px !important; background: var(--bg-elev) !important; }
.output-class, .gr-label, .label-container {
background: var(--bg-card) !important; border: 1px solid var(--rule) !important;
border-radius: 2px !important; padding: 20px !important;
}
.label-wrap, .confidence-set { font-family: 'JetBrains Mono', monospace !important; }
.label-wrap .confidence-set .bar, .gr-label .bar {
background: var(--accent) !important; border-radius: 0 !important;
}
.status-line {
font-family: 'JetBrains Mono', monospace;
font-size: 11px; color: var(--ink-dim); letter-spacing: 0.06em;
padding: 14px 0 0 0; border-top: 1px dashed var(--rule); margin-top: 18px;
}
.status-line::before { content: '› '; color: var(--accent); }
.live-dot {
display: inline-block; width: 8px; height: 8px;
background: var(--accent); border-radius: 50%;
margin-right: 8px; box-shadow: 0 0 10px var(--accent);
animation: pulse 1.2s ease-in-out infinite;
vertical-align: middle;
}
@keyframes pulse {
0%, 100% { opacity: 1; transform: scale(1); }
50% { opacity: 0.4; transform: scale(0.85); }
}
.button-row { margin-top: 16px !important; gap: 12px !important; }
footer, .show-api, .built-with { display: none !important; }
::-webkit-scrollbar { width: 10px; height: 10px; }
::-webkit-scrollbar-track { background: var(--bg); }
::-webkit-scrollbar-thumb { background: var(--rule); border-radius: 0; }
::-webkit-scrollbar-thumb:hover { background: var(--ink-faint); }
.plot-panel, .plot-panel > div {
background: var(--bg-card) !important;
border: 1px solid var(--rule) !important;
border-radius: 2px !important;
}
"""
def wrap_prediction(pred_text, live=False):
if pred_text in ("—", "", None):
eyebrow = '<span class="live-dot"></span>Listening…' if live else "Predicted Emotion"
return f"""<div class="prediction-card">
<div class="prediction-eyebrow">{eyebrow}</div>
<div class="prediction-placeholder">awaiting input</div>
</div>"""
eyebrow = '<span class="live-dot"></span>Predicted · live' if live else "Predicted Emotion"
return f"""<div class="prediction-card">
<div class="prediction-eyebrow">{eyebrow}</div>
<div class="prediction-value">{pred_text.lower()}</div>
</div>"""
def wrap_gate(gate_text):
return f"""<div class="gate-row">
<span class="gate-label">Modality Gate</span>
<span class="gate-value">{gate_text}</span>
</div>"""
def wrap_status(status_text):
if not status_text:
return '<div class="status-line">Click Start — the webcam will stay on and predictions update continuously.</div>'
return f'<div class="status-line">{status_text}</div>'
def wrap_context(text: str) -> str:
if not text:
return ""
return f"""<div style="
margin-top:16px;
padding:18px 22px;
background:#1a1a18;
border:1px solid #2d2b27;
border-left:2px solid #a8c5a0;
border-radius:2px;
font-family:'Fraunces',serif;
font-style:italic;
font-size:1.05em;
color:#f5f2ea;
line-height:1.6;
">💬 {text}</div>"""
# =========================================================
# UI CALLBACKS
# =========================================================
def get_latest_mirror_frame():
with _cam_lock:
if len(FRAME_BUFFER) == 0:
return None
_, frame = FRAME_BUFFER[-1]
frame = frame.copy()
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
return Image.fromarray(rgb)
def tick_inference():
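    """Timer callback: snapshot the rolling buffers to an mp4, run predict(), update
    the tracker/history and emotion_state.json, and poll/submit the async Groq response.
    Returns the six outputs for the live tab."""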
if not _capture_running.is_set():
return (
{label: 0.0 for label in EMOTION_LABELS},
wrap_prediction("—"),
wrap_gate("capture not running"),
wrap_status("press Start to begin"),
_make_timeline_plot(),
"",
)
if SESSION_START["t"] is None:
SESSION_START["t"] = time.time()
clip_path = snapshot_clip_to_mp4()
if clip_path is None:
return (
{label: 0.0 for label in EMOTION_LABELS},
wrap_prediction("—", live=True),
wrap_gate("buffering…"),
wrap_status("filling buffer — first prediction in a moment"),
_make_timeline_plot(),
"",
)
try:
probs, pred, gate_desc, status = predict(clip_path)
finally:
if os.path.exists(clip_path):
try: os.remove(clip_path)
except Exception: pass
global _pending_response
if pred not in ("—", "", None) and sum(probs.values()) > 0:
elapsed = time.time() - SESSION_START["t"]
HISTORY.append((elapsed, dict(probs)))
gate_val = _parse_gate_value(gate_desc)
confidence = probs.get(pred, 0.0)
tracker.update(pred, confidence, gate_val)
# ── Write predicted emotion to shared file for bot_walk.py ───────
try:
_emotion_state_file = os.path.join(os.path.dirname(__file__), "emotion_state.json")
with open(_emotion_state_file, "w") as f:
json.dump({"emotion": pred, "confidence": confidence, "timestamp": time.time()}, f)
except Exception:
pass
response_text = ""
if _pending_response is not None and _pending_response.done():
result = _pending_response.result()
if result:
response_text = result
response_type = decide_response_type(tracker)
if response_type != ResponseType.NONE:
_pending_response = _groq_executor.submit(
generate_response, tracker, response_type
)
else:
response_text = ""
return (
probs,
wrap_prediction(pred, live=True),
wrap_gate(gate_desc),
wrap_status(status),
_make_timeline_plot(),
wrap_context(response_text),
)
def start_session():
tracker.history.clear()
_conversation_history.clear()
HISTORY.clear()
SESSION_START["t"] = time.time()
start_capture_threads()
return (
gr.Timer(active=True),
gr.Timer(active=True),
wrap_status(f"live · {CLIP_SECONDS:.0f}s rolling window · predictions every {TICK_INTERVAL:.0f}s"),
_make_timeline_plot(),
)
def stop_session():
stop_capture_threads()
return (
gr.Timer(active=False),
gr.Timer(active=False),
wrap_status("stopped · press Start to resume"),
)
def clear_history():
tracker.history.clear()
_conversation_history.clear()
HISTORY.clear()
SESSION_START["t"] = time.time() if _capture_running.is_set() else None
return (
{label: 0.0 for label in EMOTION_LABELS},
wrap_prediction("—", live=_capture_running.is_set()),
wrap_gate("awaiting input"),
wrap_status("history cleared"),
_make_timeline_plot(),
"",
)
def analyze_uploaded_video(video_file):
empty = {label: 0.0 for label in EMOTION_LABELS}
if video_file is None:
return (
empty,
wrap_prediction("—"),
wrap_gate("awaiting upload"),
wrap_status(""),
)
probs, pred, gate_desc, status = predict(video_file)
return (
probs,
wrap_prediction(pred, live=False),
wrap_gate(gate_desc),
wrap_status(status),
)
# =========================================================
# UI
# =========================================================
with gr.Blocks(title="Emotion Recognition v4", css=CUSTOM_CSS, theme=gr.themes.Base()) as demo:
gr.HTML(
"""
<div class="hero">
<h1 class="hero-title">Emotion <em>recognition</em><br/>in real time.</h1>
<div class="hero-sub">
WavLM-Base+ and ViT-Face-Expression with confidence-weighted temporal pooling,
fused by a learned gate.<br/>
Use the Live tab for real-time analysis, or upload a pre-recorded video.
</div>
</div>
"""
)
with gr.Tabs() as tabs:
# ============================================================
# TAB 1: LIVE ANALYSIS
# ============================================================
with gr.Tab("🎙️ Live Analysis", id="live"):
mirror_timer = gr.Timer(1.0 / MIRROR_FPS, active=False)
infer_timer = gr.Timer(TICK_INTERVAL, active=False)
with gr.Row(equal_height=False):
with gr.Column(scale=5):
gr.HTML('<div class="section-label">01 · Live Feed</div>')
mirror_out = gr.Image(
show_label=False,
elem_classes="mirror",
height=360,
type="pil",
)
with gr.Row(elem_classes="button-row"):
start_btn = gr.Button("Start", elem_classes="primary", scale=2)
stop_btn = gr.Button("Stop", elem_classes="secondary", scale=1)
clear_btn = gr.Button("Clear", elem_classes="secondary", scale=1)
status_html = gr.HTML(wrap_status(""))
with gr.Column(scale=4):
gr.HTML('<div class="section-label">02 · Current Prediction</div>')
pred_html = gr.HTML(wrap_prediction("—"))
gate_html = gr.HTML(wrap_gate("awaiting input"))
context_html = gr.HTML("")
gr.HTML('<div class="section-label" style="margin-top:24px;">03 · Class Probabilities</div>')
label_out = gr.Label(
num_top_classes=4,
show_label=False,
value={label: 0.0 for label in EMOTION_LABELS},
)
gr.HTML('<div class="section-label" style="margin-top:36px;">04 · Emotion Timeline</div>')
plot_out = gr.Plot(value=_make_timeline_plot(), show_label=False,
elem_classes="plot-panel")
mirror_timer.tick(
fn=get_latest_mirror_frame,
inputs=None,
outputs=[mirror_out],
show_progress="hidden",
)
infer_timer.tick(
fn=tick_inference,
inputs=None,
outputs=[label_out, pred_html, gate_html, status_html, plot_out, context_html],
show_progress="hidden",
)
start_btn.click(
fn=start_session,
inputs=None,
outputs=[mirror_timer, infer_timer, status_html, plot_out],
)
stop_btn.click(
fn=stop_session,
inputs=None,
outputs=[mirror_timer, infer_timer, status_html],
)
clear_btn.click(
fn=clear_history,
inputs=None,
outputs=[label_out, pred_html, gate_html, status_html, plot_out, context_html],
)
# ============================================================
# TAB 2: UPLOAD VIDEO
# ============================================================
with gr.Tab("📁 Upload Video", id="upload"):
gr.HTML('<div class="section-label">Upload a pre-recorded video for analysis</div>')
gr.HTML('<div style="font-family: JetBrains Mono, monospace; font-size: 12px; color: #8a857a; margin-bottom: 18px;">'
'Upload a video file containing both audio and video. '
'The model will run a one-shot multimodal analysis (no live LLM responses).</div>')
with gr.Row(equal_height=False):
with gr.Column(scale=5):
upload_input = gr.Video(label="Upload Video", sources=["upload"])
analyze_btn = gr.Button("Analyze", elem_classes="primary")
upload_status_html = gr.HTML(wrap_status(""))
with gr.Column(scale=4):
gr.HTML('<div class="section-label">Prediction</div>')
upload_pred_html = gr.HTML(wrap_prediction("—"))
upload_gate_html = gr.HTML(wrap_gate("awaiting upload"))
gr.HTML('<div class="section-label" style="margin-top:24px;">Class Probabilities</div>')
upload_label_out = gr.Label(
num_top_classes=4,
show_label=False,
value={label: 0.0 for label in EMOTION_LABELS},
)
analyze_btn.click(
fn=analyze_uploaded_video,
inputs=[upload_input],
outputs=[upload_label_out, upload_pred_html, upload_gate_html, upload_status_html],
)
if __name__ == '__main__':
demo.launch(server_name="0.0.0.0", server_port=7860)