Fayza38 committed on
Commit
9570783
·
verified ·
1 Parent(s): 4ec144d

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +311 -659
pipeline.py CHANGED
@@ -1,659 +1,311 @@
1
- import os
2
- import subprocess
3
- import cv2
4
- import json
5
- import math
6
- import torch
7
- import librosa
8
- import ffmpeg
9
- import numpy as np
10
- import soundfile as sf
11
- import mediapipe as mp
12
- from PIL import Image
13
- from transformers import AutoImageProcessor, AutoModelForImageClassification, pipeline
14
- from sentence_transformers import SentenceTransformer, CrossEncoder
15
- from sklearn.metrics.pairwise import cosine_similarity
16
- from mediapipe.tasks import python
17
- from mediapipe.tasks.python import vision
18
-
19
- # Ignore unnecessary warnings
20
- import warnings
21
- warnings.filterwarnings("ignore", category=UserWarning)
22
- warnings.filterwarnings("ignore", category=FutureWarning)
23
-
24
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
-
26
- # 2. Download and Initialize Mediapipe once (Global)
27
- MODEL_PATH = "face_landmarker.task"
28
- if not os.path.exists(MODEL_PATH):
29
- os.system(f"wget -O {MODEL_PATH} -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task")
30
-
31
- # 3. Initialize Models
32
- asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)
33
- semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
34
- cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
35
-
36
- FACE_MODEL_NAME = "dima806/facial_emotions_image_detection"
37
- face_processor = AutoImageProcessor.from_pretrained(FACE_MODEL_NAME)
38
- face_model = AutoModelForImageClassification.from_pretrained(FACE_MODEL_NAME).to(device).eval()
39
-
40
- # Emotion Mapping for Wheel
41
- emotion_va = {
42
- "happy": (0.8, 0.2), "fear": (0.2, 0.8), "angry": (-0.7, 0.65),
43
- "sad": (-0.65, -0.55), "surprise": (0.1, -0.75), "disgust": (0.6, -0.4), "neutral": (0.0, 0.0)
44
- }
45
- EMOTION_RING = [
46
- ("Happy", 0, 0.84), ("Surprise", 45, 0.84), ("Fear", 100, 0.84),
47
- ("Sad", 160, 0.84), ("Disgust", 215, 0.84), ("Angry", 270, 0.84)
48
- ]
49
-
50
- ##Utility functions
51
-
52
- def normalize(v, mn, mx):
53
- return np.clip((v - mn) / (mx - mn), 0, 1) if mx - mn != 0 else 0.0
54
-
55
- def extract_audio(v_in, a_out):
56
- ffmpeg.input(v_in).output(a_out, ac=1, ar=16000).overwrite_output().run(quiet=True)
57
-
58
- def merge_audio_video(v_in, a_in, v_out):
59
- ffmpeg.output(ffmpeg.input(v_in).video, ffmpeg.input(a_in).audio, v_out, vcodec="libx264", acodec="aac").overwrite_output().run(quiet=True)
60
-
61
- def draw_face_box(frame, x, y, w, h, emotion_name=""):
62
- color, th, cl = (0, 255, 100), 2, 20 # Green color
63
- cv2.rectangle(frame, (x, y), (x+w, y+h), color, 1)
64
-
65
- # Add emotion name above face box
66
- if emotion_name:
67
- cv2.putText(
68
- frame,
69
- emotion_name.upper(),
70
- (x + 10, y - 15),
71
- cv2.FONT_HERSHEY_DUPLEX,
72
- 0.7,
73
- (0, 255, 100),
74
- 2,
75
- cv2.LINE_AA
76
- )
77
-
78
- # Corners
79
- for px, py, dx, dy in [(x,y,cl,0), (x,y,0,cl), (x+w,y,-cl,0), (x+w,y,0,cl), (x,y+h,cl,0), (x,y+h,0,-cl), (x+w,y+h,-cl,0), (x+w,y+h,0,-cl)]:
80
- cv2.line(frame, (px, py), (px+dx, py+dy), color, 5)
81
- return frame
82
-
83
- def compute_eye_contact_ratio(frame, landmarks):
84
- h, w, _ = frame.shape
85
- def ear(idx):
86
- p = [np.array([landmarks[i].x * w, landmarks[i].y * h]) for i in idx]
87
- return (np.linalg.norm(p[1]-p[5]) + np.linalg.norm(p[2]-p[4])) / (2.0 * np.linalg.norm(p[0]-p[3]))
88
- avg_ear = (ear([33, 160, 158, 133, 153, 144]) + ear([362, 385, 387, 263, 373, 380])) / 2.0
89
- return min(max(avg_ear * 3, 0), 1)
90
-
91
- def analyze_face_emotion(frame):
92
- img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
93
- inputs = face_processor(images=img, return_tensors="pt").to(device)
94
- with torch.no_grad():
95
- outputs = face_model(**inputs)
96
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
97
- return {face_model.config.id2label[i].lower(): float(probs[i]) for i in range(len(probs))}
98
-
99
- ##Audio analysis
100
-
101
- def extract_audio_features(y, sr):
102
- duration = librosa.get_duration(y=y, sr=sr)
103
- if duration == 0:
104
- return {"pitch_std": 0, "jitter": 0, "energy_std": 0, "pause_ratio": 0, "speech_rate": 0}
105
-
106
- # Pitch & Jitter
107
- f0 = librosa.yin(y, fmin=75, fmax=300, sr=sr)
108
- f0 = f0[~np.isnan(f0)]
109
- pitch_std = np.std(f0) if len(f0) else 0
110
- jitter = np.mean(np.abs(np.diff(f0)) / np.maximum(f0[:-1], 1e-6)) if len(f0) > 1 else 0
111
-
112
- # Energy
113
- rms = librosa.feature.rms(y=y)[0]
114
- energy_std = np.std(rms)
115
-
116
- intervals = librosa.effects.split(y, top_db=20)
117
- speech_duration = sum((e - s) for s, e in intervals) / sr
118
- pause_ratio = 1 - (speech_duration / duration) if duration > 0 else 0
119
-
120
- # Speech Rate
121
- oenv = librosa.onset.onset_strength(y=y, sr=sr)
122
- onsets = librosa.onset.onset_detect(onset_envelope=oenv, sr=sr)
123
- speech_rate = len(onsets) / duration if duration > 0 else 0
124
-
125
- return {
126
- "pitch_std": pitch_std,
127
- "jitter": jitter,
128
- "energy_std": energy_std,
129
- "pause_ratio": pause_ratio,
130
- "speech_rate": speech_rate
131
- }
132
-
133
-
134
- def compute_audio_scores(features, baseline=None):
135
- """
136
- Fairness-aware audio scoring with personal baseline comparison
137
- """
138
- # Use standard defaults if no baseline provided
139
- if baseline is None:
140
- baseline = {"pitch_std": 30.0, "energy_std": 0.05, "jitter": 0.02, "pause_ratio": 0.2, "speech_rate": 4.0}
141
-
142
- # Calculate Relative Ratios (Current / Baseline)
143
- pitch_ratio = features["pitch_std"] / max(baseline["pitch_std"], 1e-6)
144
- energy_ratio = features["energy_std"] / max(baseline["energy_std"], 1e-6)
145
- rate_ratio = features["speech_rate"] / max(baseline["speech_rate"], 1e-6)
146
-
147
- # Stress Score (Relative)
148
- pitch_dev = abs(1 - pitch_ratio)
149
- energy_dev = abs(1 - energy_ratio)
150
- stress_val = (pitch_dev * 0.4 + energy_dev * 0.4 + features["jitter"] * 0.2) * 150
151
- stress = np.clip(stress_val + 20, 0, 100)
152
-
153
- # Clarity Score (Relative)
154
- pause_dev = max(0, features["pause_ratio"] - baseline["pause_ratio"])
155
- clarity = 100 - (pause_dev * 120 + features["jitter"] * 400)
156
-
157
- # Confidence Score (Relative)
158
- rate_dev = abs(1 - rate_ratio)
159
- confidence_audio = 100 - (rate_dev * 40 + energy_dev * 30 + features["pause_ratio"] * 50)
160
-
161
- # Tone classification based on relative shifts
162
- tones = {
163
- "Confident": confidence_audio,
164
- "Hesitant": features["pause_ratio"] * 150,
165
- "Excited": (energy_ratio - 1) * 100 if energy_ratio > 1 else 0,
166
- "Unstable": stress,
167
- "Natural": 100 - (pitch_dev * 60 + rate_dev * 40)
168
- }
169
-
170
- dominant_tone = max(tones, key=tones.get)
171
-
172
- return {
173
- "confidence_audio": round(float(np.clip(confidence_audio, 0, 100)), 2),
174
- "clarity": round(float(np.clip(clarity, 0, 100)), 2),
175
- "stress": round(float(np.clip(stress, 0, 100)), 2),
176
- "pauses": round(float(features["pause_ratio"] * 100), 2),
177
- "tone_of_voice": dominant_tone
178
- }
179
-
180
- def analyze_audio_segment(audio_path, baseline=None):
181
- """
182
- Main entry point for audio segment analysis
183
- """
184
- y, sr = librosa.load(audio_path, sr=16000)
185
- features = extract_audio_features(y, sr)
186
- return compute_audio_scores(features, baseline)
187
-
188
-
189
- ##Text analysis
190
-
191
- def get_user_answer(audio_path):
192
- """Transcribe audio using Whisper"""
193
- result = asr(audio_path, chunk_length_s=20)
194
- return result["text"].strip()
195
-
196
-
197
- def compute_similarity_score(user_answer, ideal_answer):
198
- emb = semantic_model.encode([user_answer, ideal_answer])
199
- sim = cosine_similarity([emb[0]], [emb[1]])[0][0]
200
- score = float(sim * 100)
201
- return round(max(0, score), 2)
202
-
203
- def compute_relevance_score(question, user_answer):
204
- raw_score = cross_encoder.predict([(question, user_answer)])[0]
205
- prob = 1 / (1 + np.exp(-raw_score))
206
- score = float(prob * 100)
207
- return round(max(0, score), 2)
208
-
209
- ##Video
210
-
211
- # Eye indices
212
- LEFT_EYE = [33, 160, 158, 133, 153, 144]
213
- RIGHT_EYE = [362, 385, 387, 263, 373, 380]
214
-
215
- # Eye Contact Function
216
- def compute_eye_contact_ratio(frame, landmarks):
217
- """
218
- Compute eye contact ratio from detected face landmarks
219
- """
220
-
221
- if not landmarks:
222
- return 0.5
223
-
224
- h, w, _ = frame.shape
225
-
226
- def ear(indices):
227
- points = [
228
- np.array([
229
- landmarks[i].x * w,
230
- landmarks[i].y * h
231
- ])
232
- for i in indices
233
- ]
234
-
235
- v1 = np.linalg.norm(points[1] - points[5])
236
- v2 = np.linalg.norm(points[2] - points[4])
237
- h_dist = np.linalg.norm(points[0] - points[3])
238
-
239
- return (v1 + v2) / (2.0 * h_dist)
240
-
241
- ear_left = ear(LEFT_EYE)
242
- ear_right = ear(RIGHT_EYE)
243
-
244
- avg_ear = (ear_left + ear_right) / 2.0
245
-
246
- eye_score = min(max(avg_ear * 3, 0), 1)
247
-
248
- return eye_score
249
-
250
- def analyze_face_emotion(frame):
251
- """
252
- Predict facial emotion probabilities from single frame
253
- """
254
-
255
- # Convert BGR to RGB
256
- rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
257
- image = Image.fromarray(rgb)
258
-
259
- # Preprocess
260
- inputs = face_processor(images=image, return_tensors="pt").to(device)
261
-
262
- with torch.no_grad():
263
- outputs = face_model(**inputs)
264
-
265
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
266
- labels = face_model.config.id2label
267
-
268
- emotion_probs = {
269
- labels[i].lower(): float(probs[i])
270
- for i in range(len(probs))
271
- }
272
-
273
- return emotion_probs
274
-
275
- def draw_face_box(frame, x, y, w, h, emotion_label="Neutral"):
276
- """
277
- Draw face bounding box with emotion label above it
278
- """
279
-
280
- # Green color for face box
281
- color = (0, 255, 0)
282
-
283
- thickness = 2
284
- corner_len = 22
285
-
286
- # Main rectangle
287
- cv2.rectangle(frame, (x, y), (x+w, y+h), color, thickness)
288
-
289
- # Decorative corner lines
290
- for (px, py, dx, dy) in [
291
- (x, y, corner_len, 0), (x, y, 0, corner_len),
292
- (x+w, y, -corner_len, 0), (x+w, y, 0, corner_len),
293
- (x, y+h, corner_len, 0), (x, y+h, 0, -corner_len),
294
- (x+w, y+h, -corner_len, 0), (x+w, y+h, 0, -corner_len),
295
- ]:
296
- cv2.line(frame, (px, py), (px+dx, py+dy), color, 4)
297
-
298
- # Draw emotion text above the face box
299
- label_text = emotion_label.capitalize()
300
-
301
- (tw, th), _ = cv2.getTextSize(
302
- label_text,
303
- cv2.FONT_HERSHEY_SIMPLEX,
304
- 0.7,
305
- 2
306
- )
307
-
308
- text_x = x + (w - tw) // 2
309
- text_y = y - 10
310
-
311
- cv2.putText(
312
- frame,
313
- label_text,
314
- (text_x, text_y),
315
- cv2.FONT_HERSHEY_SIMPLEX,
316
- 0.7,
317
- (0, 255, 0),
318
- 2,
319
- cv2.LINE_AA
320
- )
321
-
322
- return frame
323
-
324
- def compute_valence_arousal_from_probs(emotion_probs):
325
- """Computing Valence and Arousal from emotion probabilities"""
326
- v, a, total = 0.0, 0.0, 0.0
327
-
328
- for emo, score in emotion_probs.items():
329
- emo = emo.lower()
330
- if emo in emotion_va:
331
- v += emotion_va[emo][0] * score
332
- a += emotion_va[emo][1] * score
333
- total += score
334
-
335
- if total == 0:
336
- return 0.0, 0.0
337
-
338
- return v / total, a / total
339
-
340
- def draw_full_emotion_wheel(panel, center, radius, valence, arousal,
341
- dominant_emotion="neutral"):
342
- cx, cy = center
343
-
344
- # Circle background
345
- cv2.circle(panel, center, radius + 5, (15, 15, 25), -1)
346
- cv2.circle(panel, center, radius, (60, 60, 85), 2)
347
- for rf in [0.33, 0.66]:
348
- cv2.circle(panel, center, int(radius * rf), (35, 35, 50), 1)
349
-
350
- # Drawing dividing lines between emotions
351
- for angle_deg in range(0, 360, 60):
352
- rad = math.radians(angle_deg)
353
- x1 = int(cx + radius * math.cos(rad))
354
- y1 = int(cy - radius * math.sin(rad))
355
- cv2.line(panel, (cx, cy), (x1, y1), (40, 40, 60), 1)
356
-
357
- # Drawing emotion labels
358
- ef, es, et = cv2.FONT_HERSHEY_SIMPLEX, 0.40, 1
359
- for emotion_data in EMOTION_RING:
360
- if emotion_data[1] is None:
361
- continue
362
-
363
- label, angle_deg, rf = emotion_data
364
- rad = math.radians(angle_deg)
365
- lx = int(cx + rf * radius * math.cos(rad))
366
- ly = int(cy - rf * radius * math.sin(rad))
367
- (tw, th), _ = cv2.getTextSize(label, ef, es, et)
368
- tx, ty = lx - tw//2, ly + th//2
369
-
370
- # Highlight active emotion
371
- if label.lower() == dominant_emotion.lower():
372
- cv2.putText(panel, label, (tx, ty), ef, es+0.08, (0, 255, 200), 2, cv2.LINE_AA)
373
- else:
374
- cv2.putText(panel, label, (tx, ty), ef, es, (190, 190, 255), et, cv2.LINE_AA)
375
-
376
- # Neutral in center
377
- nc = (0, 255, 200) if dominant_emotion == "neutral" else (160, 160, 160)
378
- (tw, th), _ = cv2.getTextSize("Neutral", ef, es, et)
379
- cv2.putText(panel, "Neutral", (cx-tw//2, cy+th//2), ef, es, nc, et, cv2.LINE_AA)
380
-
381
- # Animated dot with glow
382
- dot_x = int(cx + valence * radius * 0.88)
383
- dot_y = int(cy - arousal * radius * 0.88)
384
- cv2.circle(panel, (dot_x, dot_y), 15, (160, 120, 0), -1)
385
- cv2.circle(panel, (dot_x, dot_y), 11, (220, 180, 0), -1)
386
- cv2.circle(panel, (dot_x, dot_y), 7, (255, 230, 60), -1)
387
-
388
- return panel
389
-
390
- BAR_CONFIGS = [
391
- ("Confidence", (70, 180, 255), (30, 50, 100)), # light blue
392
- ("Clarity", (100, 220, 150), (25, 70, 50)), # light cyan
393
- ("Stress", (255, 120, 100), (100, 40, 30)), # light coral
394
- ]
395
-
396
- def draw_metric_bars(panel,
397
- bars_x_start,
398
- bar_y_top,
399
- bar_height,
400
- bar_width,
401
- bar_gap,
402
- confidence,
403
- clarity,
404
- stress):
405
- """
406
- Draw horizontal metric bars with label above each bar
407
- """
408
-
409
- values = [confidence, clarity, stress]
410
- labels_list = ["Confidence", "Clarity", "Stress"]
411
-
412
- # Extra vertical space for labels
413
- label_space = 20
414
-
415
- for i, value in enumerate(values):
416
-
417
- label, fill_color, bg_color = BAR_CONFIGS[i]
418
-
419
- # Each bar block height = label + bar + gap
420
- y = bar_y_top + i * (bar_height + label_space + bar_gap)
421
-
422
- x_right = bars_x_start + bar_width
423
-
424
- filled = int((value / 100) * bar_width)
425
-
426
- # Draw label above bar
427
- cv2.putText(
428
- panel,
429
- label,
430
- (bars_x_start, y),
431
- cv2.FONT_HERSHEY_DUPLEX,
432
- 0.6,
433
- (230, 230, 230),
434
- 1,
435
- cv2.LINE_AA
436
- )
437
-
438
- # Move bar slightly down to leave space for label
439
- bar_y = y + 8
440
-
441
- # Draw background bar
442
- cv2.rectangle(
443
- panel,
444
- (bars_x_start, bar_y),
445
- (x_right, bar_y + bar_height),
446
- bg_color,
447
- -1
448
- )
449
-
450
- # Draw filled portion
451
- cv2.rectangle(
452
- panel,
453
- (bars_x_start, bar_y),
454
- (bars_x_start + filled, bar_y + bar_height),
455
- fill_color,
456
- -1
457
- )
458
-
459
- # Draw percentage text
460
- cv2.putText(
461
- panel,
462
- f"{int(value)}%",
463
- (bars_x_start + 12, bar_y + bar_height - 6),
464
- cv2.FONT_HERSHEY_SIMPLEX,
465
- 0.6,
466
- (255, 255, 255),
467
- 2,
468
- cv2.LINE_AA
469
- )
470
-
471
- return panel
472
-
473
- ##Integrated Video Processing (Analysis + Annotation)
474
-
475
- def process_video_segment(video_path, output_dir, segment_id, audio_scores_global=None):
476
- base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
477
- options = vision.FaceLandmarkerOptions(base_options=base_options, running_mode=vision.RunningMode.VIDEO, num_faces=1)
478
-
479
- cap = cv2.VideoCapture(video_path)
480
- fps, width, height = cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
481
- temp_video = os.path.join(output_dir, f"temp_annotated_{segment_id}.mp4")
482
- # out = cv2.VideoWriter(temp_video, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
483
- # Use 'avc1' or 'H264' for web compatibility
484
- fourcc = cv2.VideoWriter_fourcc(*'XVID')
485
- out = cv2.VideoWriter(temp_video, fourcc, fps, (width, height))
486
-
487
- face_conf_accum, eye_accum, frame_idx = [], [], 0
488
- smooth_v, smooth_a, dom_emo = 0.0, 0.0, "neutral"
489
-
490
- # --- Optimization Variables ---
491
- frame_stride = 3 # Process AI every 3 frames
492
- last_results = None
493
- last_emotions = None
494
- last_eye_s = 0.5
495
- last_lm = None
496
- # ------------------------------
497
-
498
- b_conf = audio_scores_global.get("confidence_audio", 50)
499
- b_clar = audio_scores_global.get("clarity", 50)
500
- b_stress = audio_scores_global.get("stress", 20)
501
-
502
- with vision.FaceLandmarker.create_from_options(options) as landmarker:
503
- while cap.isOpened():
504
- ret, frame = cap.read()
505
- if not ret:
506
- break
507
-
508
- # 1. RUN HEAVY AI ONLY ON STRIDE FRAMES
509
- if frame_idx % frame_stride == 0:
510
- mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
511
- last_results = landmarker.detect_for_video(mp_image, int((frame_idx/fps)*1000))
512
-
513
- if last_results.face_landmarks:
514
- last_lm = last_results.face_landmarks[0]
515
- last_emotions = analyze_face_emotion(frame)
516
- last_eye_s = compute_eye_contact_ratio(frame, last_lm)
517
-
518
- # 2. USE LAST KNOWN DATA FOR CALCULATIONS & DRAWING
519
- d_conf, d_clar, d_stress = b_conf, b_clar, b_stress
520
-
521
- if last_results and last_results.face_landmarks:
522
- # Use current local variables from 'last' successful AI run
523
- curr_f_conf = (last_emotions.get("neutral", 0) + last_emotions.get("happy", 0)) * 100
524
- d_conf = (b_conf * 0.7) + (curr_f_conf * 0.3)
525
- d_clar = (b_clar * 0.8) + (last_eye_s * 100 * 0.2)
526
- d_stress = (b_stress * 0.7) + ((last_emotions.get("sad",0)+last_emotions.get("angry",0))*30)
527
-
528
- # Update accumulators only on stride frames to keep averages accurate
529
- if frame_idx % frame_stride == 0:
530
- face_conf_accum.append(curr_f_conf)
531
- eye_accum.append(last_eye_s)
532
-
533
- dom_emo = max(last_emotions, key=last_emotions.get)
534
- v_t = sum(emotion_va[e][0]*s for e,s in last_emotions.items() if e in emotion_va)
535
- a_t = sum(emotion_va[e][1]*s for e,s in last_emotions.items() if e in emotion_va)
536
-
537
- # Keep smoothing every frame for fluid movement
538
- smooth_v += 0.15 * (v_t - smooth_v)
539
- smooth_a += 0.15 * (a_t - smooth_a)
540
-
541
- # Draw face box using the last known landmarks
542
- xs, ys = [l.x*width for l in last_lm], [l.y*height for l in last_lm]
543
- draw_face_box(
544
- frame,
545
- int(min(xs)), int(min(ys)),
546
- int(max(xs) - min(xs)), int(max(ys) - min(ys)),
547
- dom_emo
548
- )
549
-
550
- # 3. ALWAYS DRAW UI (Wheel and Bars)
551
- frame = draw_full_emotion_wheel(frame, (width-130, height-100), 90, smooth_v, smooth_a, dom_emo)
552
- frame = draw_metric_bars(frame, 30, height-160, 28, 200, 6, d_conf, d_clar, d_stress)
553
-
554
- out.write(frame)
555
- frame_idx += 1
556
-
557
- cap.release()
558
- out.release()
559
- return temp_video, np.mean(face_conf_accum) if face_conf_accum else 50, np.mean(eye_accum)*100 if eye_accum else 50
560
-
561
- ##Main pipeline
562
- def run_intervision_pipeline(video_path, questions_config, output_dir):
563
- if not os.path.exists(video_path):
564
- return f"Error: Video file not found at {video_path}"
565
-
566
- os.makedirs(output_dir, exist_ok=True)
567
-
568
- # Establish baseline from first 10s
569
- try:
570
- y_b, sr_b = librosa.load(video_path, sr=16000, duration=10)
571
- baseline = extract_audio_features(y_b, sr_b)
572
- except Exception as e:
573
- print(f"Baseline Load Warning: {e}. Using defaults.")
574
- baseline = None
575
-
576
- final_reports, segments = [], []
577
-
578
- for q in questions_config:
579
- q_id = q['question_id']
580
- raw_seg = os.path.join(output_dir, f"q{q_id}_raw.mp4")
581
- wav_p = os.path.join(output_dir, f"q{q_id}.wav")
582
-
583
- # Precise FFmpeg cutting with error handling
584
- duration = q["end_time"] - q["start_time"]
585
- try:
586
- subprocess.run([
587
- 'ffmpeg', '-y', '-ss', str(q["start_time"]), '-t', str(duration),
588
- '-i', video_path, '-c:v', 'libx264', '-c:a', 'aac', '-strict', 'experimental', raw_seg
589
- ], check=True, capture_output=True)
590
- except subprocess.CalledProcessError as e:
591
- print(f"Skipping Question {q_id}: Time range might be out of video bounds.")
592
- continue
593
-
594
- # Audio Extraction
595
- try:
596
- y, sr = librosa.load(raw_seg, sr=16000)
597
- import soundfile as sf
598
- sf.write(wav_p, y, sr)
599
- except Exception as e:
600
- print(f"Error extracting audio for Q{q_id}: {e}")
601
- continue
602
-
603
- # Audio Analysis
604
- a_scores = compute_audio_scores(extract_audio_features(y, sr), baseline)
605
-
606
- # Whisper Transcription
607
- try:
608
- transcription_data = asr(wav_p, chunk_length_s=30, return_timestamps=True)
609
- transcription = transcription_data["text"].strip()
610
- except:
611
- transcription = "[Transcription Error]"
612
-
613
- similarity_score = compute_similarity_score(transcription, q["ideal_answer"])
614
- relevance_score = compute_relevance_score(q["question_text"], transcription)
615
-
616
- # Visual Analysis
617
- try:
618
- ann_v, f_c, e_c = process_video_segment(raw_seg, output_dir, q_id, a_scores)
619
-
620
- final_v = os.path.join(output_dir, f"q{q_id}_final.mp4")
621
- subprocess.run([
622
- 'ffmpeg', '-y', '-i', ann_v, '-i', raw_seg, '-map', '0:v', '-map', '1:a',
623
- '-c:v', 'copy', '-c:a', 'aac', final_v
624
- ], check=True, capture_output=True)
625
-
626
- segments.append(final_v)
627
-
628
- final_reports.append({
629
- "questionId": q_id,
630
- "userAnswerText": transcription,
631
- "toneOfVoice": a_scores["tone_of_voice"],
632
- "clarity": a_scores["clarity"],
633
- "stress": a_scores["stress"],
634
- "confidence": round((a_scores["confidence_audio"] + f_c + e_c) / 3, 2),
635
- "pauses": a_scores["pauses"],
636
- "score": similarity_score,
637
- "relevance": relevance_score
638
- })
639
- except Exception as e:
640
- print(f"Visual analysis failed for Q{q_id}: {e}")
641
-
642
- torch.cuda.empty_cache()
643
-
644
- # Final concatenation
645
- if segments:
646
- list_path = os.path.join(output_dir, "list.txt")
647
- with open(list_path, "w") as f:
648
- for s in segments:
649
- f.write(f"file '{os.path.abspath(s)}'\n")
650
-
651
- final_output = os.path.join(output_dir, "Intervision_Final_Result.mp4")
652
- os.system(f"ffmpeg -f concat -safe 0 -i {list_path} -c:v libx264 -preset superfast -crf 23 -c:a aac -y {final_output}")
653
-
654
- with open(os.path.join(output_dir, "report.json"), "w") as f:
655
- json.dump({"listOfAnswerReport": final_reports}, f, indent=4)
656
-
657
- return f"Successfully processed {len(segments)} questions."
658
- else:
659
- return "No segments were processed. Check your video time ranges."
 
1
+ import os
2
+ import subprocess
3
+ import cv2
4
+ import json
5
+ import math
6
+ import torch
7
+ import librosa
8
+ import numpy as np
9
+ import mediapipe as mp
10
+ from PIL import Image
11
+ from transformers import AutoImageProcessor, AutoModelForImageClassification, pipeline
12
+ from sentence_transformers import SentenceTransformer, CrossEncoder
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ from mediapipe.tasks import python
15
+ from mediapipe.tasks.python import vision
16
+ import warnings
17
+
18
+ # Suppress unnecessary warnings for a cleaner console output
19
+ warnings.filterwarnings("ignore", category=UserWarning)
20
+ warnings.filterwarnings("ignore", category=FutureWarning)
21
+
22
+ # Set device to GPU if available
23
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
+
25
# --- Configuration & Mappings ---

# Backend tone IDs: the final report stores an integer code, not the tone name.
# 0: Hesitant, 1: Confident, 2: Unstable, 3: Natural
TONE_MAPPING = {
    "Hesitant": 0,
    "Confident": 1,
    "Unstable": 2,
    "Natural": 3,
    "Excited": 3  # Excitement is treated as a high-energy Natural/Positive state
}

# (valence, arousal) coordinates per emotion label; used to position the
# animated dot on the on-screen emotion wheel.
emotion_va = {
    "happy": (0.8, 0.2), "fear": (0.2, 0.8), "angry": (-0.7, 0.65),
    "sad": (-0.65, -0.55), "surprise": (0.1, -0.75), "disgust": (0.6, -0.4), "neutral": (0.0, 0.0)
}

# (label, angle in degrees, radius fraction) tuples for drawing the static
# emotion labels around the wheel's ring.
EMOTION_RING = [
    ("Happy", 0, 0.84), ("Surprise", 45, 0.84), ("Fear", 100, 0.84),
    ("Sad", 160, 0.84), ("Disgust", 215, 0.84), ("Angry", 270, 0.84)
]
47
+
48
# --- Model Initialization ---

# Mediapipe Face Landmarker task file: fetched once on first run.
# NOTE(review): downloaded via os.system/wget with no checksum or error
# handling — confirm this is acceptable for the deployment environment.
MODEL_PATH = "face_landmarker.task"
if not os.path.exists(MODEL_PATH):
    print("[INFO] Downloading Mediapipe Face Landmarker model...")
    os.system(f"wget -O {MODEL_PATH} -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task")

# NLP models: Whisper ASR for transcription, a sentence embedder for
# answer similarity, and a cross-encoder for question/answer relevance.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Visual emotion classifier (ViT), moved to `device` and set to eval mode.
FACE_MODEL_NAME = "dima806/facial_emotions_image_detection"
face_processor = AutoImageProcessor.from_pretrained(FACE_MODEL_NAME)
face_model = AutoModelForImageClassification.from_pretrained(FACE_MODEL_NAME).to(device).eval()
65
+
66
+ # --- Audio Analysis Modules ---
67
+
68
def extract_audio_features(y, sr):
    """Derive prosodic statistics from a mono waveform.

    Parameters:
        y:  1-D audio sample array.
        sr: sample rate in Hz.

    Returns:
        dict with pitch_std, jitter, energy_std, pause_ratio and speech_rate;
        all zeros when the signal has zero duration.
    """
    total_sec = librosa.get_duration(y=y, sr=sr)
    if total_sec == 0:
        return {"pitch_std": 0, "jitter": 0, "energy_std": 0, "pause_ratio": 0, "speech_rate": 0}

    # Fundamental frequency via the YIN algorithm; drop unvoiced (NaN) frames.
    f0_track = librosa.yin(y, fmin=75, fmax=300, sr=sr)
    f0_track = f0_track[~np.isnan(f0_track)]
    pitch_spread = np.std(f0_track) if len(f0_track) else 0
    if len(f0_track) > 1:
        # Jitter: mean relative frame-to-frame pitch change (guarded against /0).
        jitter_val = np.mean(np.abs(np.diff(f0_track)) / np.maximum(f0_track[:-1], 1e-6))
    else:
        jitter_val = 0

    # Loudness variability from the RMS envelope.
    energy_spread = np.std(librosa.feature.rms(y=y)[0])

    # Fraction of the clip spent silent (below the 20 dB split threshold).
    voiced = librosa.effects.split(y, top_db=20)
    voiced_sec = sum(end - start for start, end in voiced) / sr
    silence_frac = 1 - (voiced_sec / total_sec) if total_sec > 0 else 0

    # Onsets per second as a proxy for speaking rate.
    envelope = librosa.onset.onset_strength(y=y, sr=sr)
    onset_frames = librosa.onset.onset_detect(onset_envelope=envelope, sr=sr)
    rate = len(onset_frames) / total_sec if total_sec > 0 else 0

    return {
        "pitch_std": pitch_spread, "jitter": jitter_val, "energy_std": energy_spread,
        "pause_ratio": silence_frac, "speech_rate": rate
    }
98
+
99
def compute_audio_scores(features, baseline=None):
    """Map raw prosodic features to 0-100 behavioral scores plus a tone ID.

    Parameters:
        features: dict from extract_audio_features.
        baseline: personal reference feature dict; standard defaults when None.

    Returns:
        dict with confidence_audio, clarity, stress, pauses (all 0-100 floats)
        and tone_of_voice as the backend integer ID.
    """
    if baseline is None:
        baseline = {"pitch_std": 30.0, "energy_std": 0.05, "jitter": 0.02, "pause_ratio": 0.2, "speech_rate": 4.0}

    def rel(key):
        # Ratio of the current feature to its baseline value (guarded against /0).
        return features[key] / max(baseline[key], 1e-6)

    pitch_dev = abs(1 - rel("pitch_std"))
    energy_dev = abs(1 - rel("energy_std"))
    rate_dev = abs(1 - rel("speech_rate"))

    # Behavioral formulas: deviation-weighted sums, clipped to 0-100.
    stress = np.clip((pitch_dev * 0.4 + energy_dev * 0.4 + features["jitter"] * 0.2) * 150 + 20, 0, 100)
    pause_excess = max(0, features["pause_ratio"] - baseline["pause_ratio"])
    clarity = np.clip(100 - (pause_excess * 120 + features["jitter"] * 400), 0, 100)
    conf_audio = np.clip(100 - (rate_dev * 40 + energy_dev * 30 + features["pause_ratio"] * 50), 0, 100)

    # Score each candidate tone, pick the leader, translate to the backend ID.
    tone_scores = {
        "Confident": conf_audio,
        "Hesitant": features["pause_ratio"] * 150,
        "Unstable": stress,
        "Natural": 100 - (pitch_dev * 60 + rate_dev * 40)
    }
    leading_tone = max(tone_scores, key=tone_scores.get)

    return {
        "confidence_audio": round(float(conf_audio), 2),
        "clarity": round(float(clarity), 2),
        "stress": round(float(stress), 2),
        "pauses": round(float(features["pause_ratio"] * 100), 2),
        "tone_of_voice": TONE_MAPPING.get(leading_tone, 3)  # Backend Integer ID
    }
130
+
131
+ # --- Visual Analysis & Drawing ---
132
+
133
def compute_eye_contact_ratio(frame, landmarks):
    """Estimate eye openness/contact via the Eye Aspect Ratio (EAR).

    Parameters:
        frame:     image array; only its height/width are used to scale landmarks.
        landmarks: sequence of normalized landmarks with .x/.y attributes
                   (Mediapipe face-mesh indexing), or falsy when no face found.

    Returns:
        float in [0, 1]; 0.5 is the neutral fallback with no landmarks.
    """
    if not landmarks:
        return 0.5
    h, w, _ = frame.shape

    def ear(idx):
        p = [np.array([landmarks[i].x * w, landmarks[i].y * h]) for i in idx]
        # Guard the eye-corner distance: coincident corner landmarks would
        # otherwise divide by zero and propagate NaN through the final clamp.
        h_dist = max(np.linalg.norm(p[0] - p[3]), 1e-6)
        return (np.linalg.norm(p[1] - p[5]) + np.linalg.norm(p[2] - p[4])) / (2.0 * h_dist)

    # Mediapipe mesh indices for the left / right eye contours.
    avg_ear = (ear([33, 160, 158, 133, 153, 144]) + ear([362, 385, 387, 263, 373, 380])) / 2.0
    return min(max(avg_ear * 3, 0), 1)
143
+
144
def analyze_face_emotion(frame):
    """Classify the emotion in one BGR frame with the ViT model.

    Returns:
        dict mapping each lowercased class label to its softmax probability.
    """
    pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    batch = face_processor(images=pil_img, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = face_model(**batch).logits
    scores = torch.nn.functional.softmax(logits, dim=-1)[0]
    id2label = face_model.config.id2label
    return {id2label[i].lower(): float(scores[i]) for i in range(len(scores))}
154
+
155
def draw_face_ui(frame, x, y, w, h, emotion_label):
    """Overlay a thin bounding box, thick corner accents, and the emotion name."""
    green = (0, 255, 0)
    cv2.rectangle(frame, (x, y), (x + w, y + h), green, 1)
    # Corner accents: (anchor_x, anchor_y, dx, dy) for each of the 8 arms.
    arm = 20
    corners = [
        (x, y, arm, 0), (x, y, 0, arm),
        (x + w, y, -arm, 0), (x + w, y, 0, arm),
        (x, y + h, arm, 0), (x, y + h, 0, -arm),
        (x + w, y + h, -arm, 0), (x + w, y + h, 0, -arm),
    ]
    for ax, ay, dx, dy in corners:
        cv2.line(frame, (ax, ay), (ax + dx, ay + dy), green, 4)
    # Uppercased emotion name just above the box.
    cv2.putText(frame, emotion_label.upper(), (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, green, 2)
    return frame
165
+
166
def draw_emotion_wheel(panel, center, radius, valence, arousal, dominant_emo):
    """Render the circular valence/arousal gauge: ring, labels, and state dot."""
    cx, cy = center
    # Outer ring.
    cv2.circle(panel, center, radius, (60, 60, 85), 2)
    # Place each emotion label on the ring; highlight the dominant one.
    for name, angle_deg, ring_frac in EMOTION_RING:
        theta = math.radians(angle_deg)
        label_x = int(cx + ring_frac * radius * math.cos(theta))
        label_y = int(cy - ring_frac * radius * math.sin(theta))
        if name.lower() == dominant_emo:
            tint = (0, 255, 200)
        else:
            tint = (180, 180, 180)
        cv2.putText(panel, name, (label_x - 20, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.35, tint, 1)
    # Current emotional state as a filled dot inside the wheel.
    dot = (int(cx + valence * radius * 0.8), int(cy - arousal * radius * 0.8))
    cv2.circle(panel, dot, 8, (255, 230, 60), -1)
    return panel
180
+
181
def draw_metric_bars(frame, x, y, confidence, clarity, stress):
    """Render labeled horizontal bars for confidence, clarity and stress (0-100)."""
    palette = {
        "Confidence": (confidence, (70, 180, 255)),
        "Clarity": (clarity, (100, 220, 150)),
        "Stress": (stress, (255, 120, 100)),
    }
    row = 0
    for label, (val, col) in palette.items():
        top = y + row * 40
        cv2.putText(frame, label, (x, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        cv2.rectangle(frame, (x, top), (x + 200, top + 15), (50, 50, 50), -1)  # track (background)
        cv2.rectangle(frame, (x, top), (x + int(val * 2), top + 15), col, -1)  # fill: 2 px per point
        row += 1
    return frame
190
+
191
+ # --- Core Video Processing ---
192
+
193
def process_video_segment(video_path, output_dir, q_id, audio_scores):
    """Analyzes visual data and generates annotated video.

    Runs MediaPipe face landmarking plus the face-emotion classifier on every
    third frame, overlays the face box, emotion wheel and audio metric bars,
    and writes all frames to a new (silent) mp4.

    Args:
        video_path: Path of the clipped question segment to analyze.
        output_dir: Directory where the annotated mp4 is written.
        q_id: Question id, used in the output filename.
        audio_scores: Dict providing 'confidence_audio', 'clarity' and
            'stress' (rendered as the metric bars; presumably 0-100 values —
            TODO confirm against compute_audio_scores).

    Returns:
        Tuple ``(annotated_video_path, mean_face_confidence_pct,
        mean_eye_contact_pct)``; both means fall back to 50 when no face was
        ever detected in the segment.
    """
    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    options = vision.FaceLandmarkerOptions(base_options=base_options, running_mode=vision.RunningMode.VIDEO)

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    w, h = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    temp_v = os.path.join(output_dir, f"annotated_{q_id}.mp4")
    out = cv2.VideoWriter(temp_v, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

    # Accumulators: per-sample face-confidence percentages and eye-contact ratios.
    frame_idx, face_accum, eye_accum = 0, [], []
    # Smoothed valence/arousal state and last dominant emotion label.
    s_v, s_a, dom_emo = 0.0, 0.0, "neutral"

    with vision.FaceLandmarker.create_from_options(options) as landmarker:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret: break

            # Optimization: Run AI every 3 frames
            if frame_idx % 3 == 0:
                mp_img = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                # Timestamp in milliseconds, required by VIDEO running mode.
                res = landmarker.detect_for_video(mp_img, int((frame_idx/fps)*1000))

                if res.face_landmarks:
                    # Only the first detected face is analyzed.
                    lm = res.face_landmarks[0]
                    emotions = analyze_face_emotion(frame)
                    eye_s = compute_eye_contact_ratio(frame, lm)

                    # Statistical accumulation
                    # "Face confidence" proxy: combined neutral+happy probability, in %.
                    face_accum.append((emotions.get("neutral", 0) + emotions.get("happy", 0)) * 100)
                    eye_accum.append(eye_s)

                    dom_emo = max(emotions, key=emotions.get)
                    # Probability-weighted valence/arousal via the emotion_va map.
                    v_t = sum(emotion_va[e][0]*s for e,s in emotions.items() if e in emotion_va)
                    a_t = sum(emotion_va[e][1]*s for e,s in emotions.items() if e in emotion_va)

                    # Moving average for smooth UI animation
                    s_v += 0.2 * (v_t - s_v)
                    s_a += 0.2 * (a_t - s_a)

                    # Draw Bounding Box from the landmark extents (normalized coords scaled to pixels).
                    xs, ys = [l.x*w for l in lm], [l.y*h for l in lm]
                    draw_face_ui(frame, int(min(xs)), int(min(ys)), int(max(xs)-min(xs)), int(max(ys)-min(ys)), dom_emo)

                    # UI Overlays
                    # NOTE(review): overlays are drawn only on detection frames
                    # (every 3rd frame with a visible face); other frames are
                    # written without UI — confirm this flicker is intended.
                    draw_emotion_wheel(frame, (w-100, h-100), 70, s_v, s_a, dom_emo)
                    draw_metric_bars(frame, 30, h-120, audio_scores["confidence_audio"], audio_scores["clarity"], audio_scores["stress"])

            out.write(frame)
            frame_idx += 1

    cap.release()
    out.release()
    # Neutral 50% fallback when the segment never produced a face detection.
    return temp_v, np.mean(face_accum) if face_accum else 50, np.mean(eye_accum)*100 if eye_accum else 50
249
+
250
+ # --- Main Entry Point ---
251
+
252
def run_intervision_pipeline(video_path, questions_config, output_dir):
    """Main pipeline execution for all interview questions.

    For each configured question, clips the segment from the source video,
    scores the audio (tone/clarity/stress), runs ASR + NLP scoring
    (semantic similarity against the ideal answer, cross-encoder relevance
    to the question), generates the annotated visual segment, then
    concatenates all segments and writes a JSON report.

    Args:
        video_path: Path to the full interview recording.
        questions_config: Iterable of dicts with keys 'question_id',
            'start_time', 'end_time', 'question_text', 'ideal_answer'.
        output_dir: Directory for intermediate and final artifacts.

    Returns:
        Status string on successful completion.

    Raises:
        subprocess.CalledProcessError: If any FFmpeg invocation fails.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Speaker baseline for fairness: audio features of the first 10 seconds.
    try:
        yb, srb = librosa.load(video_path, sr=16000, duration=10)
        baseline = extract_audio_features(yb, srb)
    except Exception:
        # Best-effort: scoring proceeds without a per-speaker baseline.
        baseline = None

    final_reports, video_segments = [], []

    for q in questions_config:
        q_id = q['question_id']
        raw_seg = os.path.join(output_dir, f"q{q_id}_raw.mp4")

        # Clip segment using FFmpeg (stream copy, no re-encode).
        # BUGFIX: subprocess.run() has no 'quiet' kwarg (it raised TypeError);
        # silence FFmpeg via DEVNULL and fail fast with check=True.
        dur = q["end_time"] - q["start_time"]
        subprocess.run(
            ['ffmpeg', '-y', '-ss', str(q["start_time"]), '-t', str(dur),
             '-i', video_path, '-c', 'copy', raw_seg],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

        # Audio scoring relative to the baseline.
        y, sr = librosa.load(raw_seg, sr=16000)
        a_scores = compute_audio_scores(extract_audio_features(y, sr), baseline)

        # NLP Analysis: transcribe, then score vs. ideal answer and question.
        transcription = asr(raw_seg)["text"].strip()
        # BUGFIX: cosine_similarity on a 2-row matrix returns a 2x2 pairwise
        # matrix; [0][0] is the answer's self-similarity (always 1.0, i.e. a
        # constant score of 100). [0][1] is answer-vs-ideal, which is intended.
        emb = semantic_model.encode([transcription, q["ideal_answer"]])
        sim_score = round(float(cosine_similarity(emb)[0][1] * 100), 2)
        # Sigmoid-squash the cross-encoder logit into a 0-100 relevance score.
        logit = cross_encoder.predict([(q["question_text"], transcription)])[0]
        rel_score = round(float(1 / (1 + np.exp(-logit)) * 100), 2)

        # Visual analysis produces a silent annotated video; mux the original
        # segment audio back in.
        ann_v, f_conf, e_conf = process_video_segment(raw_seg, output_dir, q_id, a_scores)
        final_v = os.path.join(output_dir, f"q{q_id}_final.mp4")
        subprocess.run(
            ['ffmpeg', '-y', '-i', ann_v, '-i', raw_seg, '-map', '0:v',
             '-map', '1:a', '-c:v', 'copy', '-c:a', 'aac', final_v],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

        video_segments.append(final_v)
        final_reports.append({
            "questionId": q_id,
            "userAnswerText": transcription,
            "toneOfVoice": a_scores["tone_of_voice"],  # Integer ID (0-3)
            "clarity": a_scores["clarity"],
            "stress": a_scores["stress"],
            # Overall confidence: mean of audio, face and eye-contact scores.
            "confidence": round((a_scores["confidence_audio"] + f_conf + e_conf) / 3, 2),
            "pauses": a_scores["pauses"],
            "score": sim_score,
            "relevance": rel_score
        })

    # Final Concatenation of all annotated segments into one result video.
    if video_segments:
        list_p = os.path.join(output_dir, "list.txt")
        with open(list_p, "w") as f:
            for s in video_segments:
                f.write(f"file '{os.path.abspath(s)}'\n")

        final_out = os.path.join(output_dir, "Intervision_Final_Result.mp4")
        # Argument-list subprocess call (shell=False) instead of os.system so
        # paths containing spaces or shell metacharacters are handled safely.
        subprocess.run(
            ['ffmpeg', '-f', 'concat', '-safe', '0', '-i', list_p,
             '-c', 'copy', '-y', final_out],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

    with open(os.path.join(output_dir, "report.json"), "w") as f:
        json.dump({"listOfAnswerReport": final_reports}, f, indent=4)

    return "Pipeline Finished Successfully"