Upload 5 files
Browse files- .gitattributes +1 -0
- Dockerfile +21 -0
- app.py +281 -0
- face_landmarker.task +3 -0
- pipeline.py +659 -0
- requirements.txt +23 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
face_landmarker.task filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime with GPU support if needed, or standard CPU
FROM python:3.9

# Set working directory
WORKDIR /code

# Install system dependencies for OpenCV and MediaPipe
# (libGL + glib are required by cv2's native libraries on slim images).
RUN apt-get update && apt-get install -y \
    libgl1-mesa-glx \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install
# Done before copying the app so the pip layer is cached across code changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the application
COPY . .

# Command to run the application
# Port 7860 is the Hugging Face Spaces convention; app.py's own
# __main__ block uses 8000 for local runs instead.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import requests
|
| 4 |
+
import cloudinary
|
| 5 |
+
import cloudinary.uploader
|
| 6 |
+
from requests.adapters import HTTPAdapter
|
| 7 |
+
from urllib3.util.retry import Retry
|
| 8 |
+
from fastapi import FastAPI, Body, HTTPException, BackgroundTasks
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from pipeline import run_intervision_pipeline
|
| 12 |
+
|
| 13 |
+
# --- Setup Retry Strategy ---
# Shared HTTP session with automatic retries (3 attempts, exponential
# backoff) on transient status codes; used for the large video downloads
# in background_processing.
retry_strategy = Retry(
    total=3,
    backoff_factor=1, # Wait 1s, 2s, 4s between retries
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

# Load environment variables from .env file
load_dotenv()

app = FastAPI(title="Intervision AI Engine")

# Cloudinary Configuration
# Credentials come from the environment; getenv returns None when unset,
# in which case uploads will fail at call time rather than at import time.
cloudinary.config(
    cloud_name = os.getenv("CLOUDINARY_CLOUD_NAME"),
    api_key = os.getenv("CLOUDINARY_API_KEY"),
    api_secret = os.getenv("CLOUDINARY_API_SECRET")
)

# Directory Setup
# Scratch directories for downloaded inputs and pipeline outputs;
# created eagerly so later os.path.join writes cannot fail on a
# missing directory.
RESULT_DIR = "temp_data/results"
UPLOAD_DIR = "temp_data/uploads"
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
def time_to_seconds(t_str: str) -> int:
    """Convert an "HH:MM:SS" timestamp into total seconds.

    Backward-compatible generalization: a bare "MM:SS" (or "SS") value is
    also accepted by left-padding missing fields with zeros, and surrounding
    whitespace is tolerated. Empty/None input still returns 0, and a full
    "HH:MM:SS" string behaves exactly as before.

    Raises:
        ValueError: if a field is not an integer (unchanged from the
            original behavior for malformed numeric input).
    """
    if not t_str:
        return 0
    parts = [int(p) for p in t_str.strip().split(':')]
    # Left-pad so "MM:SS" is interpreted with hours = 0.
    while len(parts) < 3:
        parts.insert(0, 0)
    h, m, s = parts
    return h * 3600 + m * 60 + s
|
| 47 |
+
|
| 48 |
+
def background_processing(session_data: dict) -> None:
    """
    Handles heavy AI processing: video download, pipeline execution,
    result upload, and backend notification (callback).

    Runs as a FastAPI background task. Expected keys in *session_data*:
    'sessionId', 'originalVideoUrl', 'callbackBaseUrl', and 'answers'
    (a list of per-question dicts) — presumably matching the backend's
    session schema; see the sample payload at the bottom of this file.
    """
    session_id = session_data.get('sessionId')
    video_url = session_data.get('originalVideoUrl')
    callback_url = session_data.get('callbackBaseUrl')

    print(f"[LOG] Processing started for session: {session_id}")

    # 1. Download the original video from the provided URL
    local_input_path = os.path.join(UPLOAD_DIR, f"{session_id}_input.mp4")
    # 1. Download with increased timeout and Retry logic
    try:
        print(f"[LOG] Downloading video: {video_url}")
        # Increased timeout to 300s (5 minutes) for large files
        response = http.get(video_url, stream=True, timeout=300)
        response.raise_for_status()
        with open(local_input_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024*1024):
                f.write(chunk)
    except Exception as e:
        print(f"[DOWNLOAD ERROR]: {e}")
        # Notify backend that it failed due to download
        # NOTE(review): despite the comment above, no callback is actually
        # sent on download failure — the backend never learns the session
        # failed. Consider POSTing a failure payload before returning.
        return

    # 2. Prepare question list for the AI Pipeline
    final_questions = []
    skipped_failed_reports = []

    for q in session_data.get('answers', []):
        if q.get('isAnswered'):
            # Answered questions go to the AI pipeline with their time window.
            final_questions.append({
                "question_id": q['aiQuestionId'],
                "question_text": q['questionText'],
                "ideal_answer": q['expectedAnswer'],
                "start_time": time_to_seconds(q['startedAt']),
                "end_time": time_to_seconds(q['submittedAt'])
            })
        else:
            # Handle questions that weren't answered during the session
            # by emitting an all-zero report entry directly.
            skipped_failed_reports.append({
                "questionId": q['aiQuestionId'],
                "userAnswerText": "N/A",
                "score": 0.0,
                "relevance": 0.0,
                "confidence": 0.0,
                "stress": 0.0,
                "clarity": 0.0,
                "pauses": 0.0,
                "toneOfVoice": "N/A",
                "status": "skipped" if q.get('isSkipped') else "failed"
            })

    # 3. Execute AI Pipeline (Analysis & Visualization)
    ai_results = []
    if final_questions:
        # run_intervision_pipeline generates Intervision_Final_Result.mp4
        run_intervision_pipeline(local_input_path, final_questions, RESULT_DIR)
        report_path = os.path.join(RESULT_DIR, "report.json")
        if os.path.exists(report_path):
            with open(report_path, "r") as f:
                ai_results = json.load(f).get("listOfAnswerReport", [])

    # 4. Upload the processed video to Cloudinary
    final_video_path = os.path.join(RESULT_DIR, "Intervision_Final_Result.mp4")
    final_video_url = None
    if os.path.exists(final_video_path):
        try:
            upload_res = cloudinary.uploader.upload(
                final_video_path,
                public_id=f"res_{session_id}",
                folder="intervision_results",
                resource_type="video",
                chunk_size=6000000
            )
            final_video_url = upload_res.get("secure_url")
        except Exception as e:
            # Best-effort: a failed upload still sends the report with a
            # null finalVideoUrl below.
            print(f"[UPLOAD ERROR]: {e}")

    # 5. Construct final payload and notify Backend via Callback
    final_payload = {
        "sessionId": session_id,
        "finalVideoUrl": final_video_url,
        "report": ai_results + skipped_failed_reports
    }

    try:
        # Notify backend that processing is complete
        cb_response = requests.post(f"{callback_url}/api/ai-callback", json=final_payload, timeout=30)
        print(f"[LOG] Callback sent to {callback_url}. Status: {cb_response.status_code}")

        # 6. Local Cleanup: Remove files to save disk space
        # NOTE(review): cleanup lives inside the callback try-block, so a
        # failed callback also leaves both temp files on disk.
        if os.path.exists(local_input_path): os.remove(local_input_path)
        if os.path.exists(final_video_path): os.remove(final_video_path)

    except Exception as e:
        print(f"[CALLBACK ERROR]: {e}")
|
| 147 |
+
|
| 148 |
+
@app.post("/process-interview/")
async def process_interview(background_tasks: BackgroundTasks, data: dict = Body(...)):
    """Entry point to start the AI analysis asynchronously.

    Schedules background_processing on FastAPI's BackgroundTasks and
    returns immediately; the final report is delivered later via the
    'callbackBaseUrl' contained in *data*. The request body is accepted
    as a raw dict — presumably validated upstream by the backend; no
    schema checking happens here (TODO confirm).
    """
    background_tasks.add_task(background_processing, data)
    return {"message": "Processing started", "sessionId": data.get('sessionId')}
|
| 153 |
+
|
| 154 |
+
@app.post("/delete-video-by-url/")
async def delete_video_by_url(data: dict = Body(...)):
    """
    Deletes a video from Cloudinary based on its URL.
    Input JSON: {"videoUrl": "https://..."}

    Returns a success/failed status dict; raises HTTP 400 when the URL is
    missing and HTTP 500 on any Cloudinary/parsing error.
    """
    video_url = data.get("videoUrl")
    if not video_url:
        raise HTTPException(status_code=400, detail="videoUrl is required")

    try:
        # Logic to extract the public_id from a Cloudinary URL
        # Example: .../folder/public_id.mp4 -> folder/public_id
        url_parts = video_url.split('/')
        filename_with_ext = url_parts[-1]
        filename = filename_with_ext.split('.')[0]

        # Check if the video is inside the results folder
        folder = url_parts[-2] if "intervision_results" in url_parts[-2] else ""
        # BUG FIX: the public_id must include the extracted filename; the
        # previous hard-coded "(unknown)" placeholder could never match an
        # existing asset, so in-folder deletions always failed.
        public_id = f"{folder}/{filename}" if folder else filename

        # Trigger deletion from Cloudinary
        result = cloudinary.uploader.destroy(public_id, resource_type="video")

        if result.get("result") == "ok":
            return {"status": "success", "message": f"Deleted {public_id}"}
        return {"status": "failed", "details": result}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 184 |
+
|
| 185 |
+
if __name__ == "__main__":
    # Local development entry point on port 8000; the Docker image instead
    # launches uvicorn on port 7860 via its CMD.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 188 |
+
|
| 189 |
+
# @app.post("/process-interview-test/")
|
| 190 |
+
# async def process_test(data: dict = Body(...)):
|
| 191 |
+
# try:
|
| 192 |
+
# print(f"--- [TEST LOG] Processing Session: {data['sessionId']} ---")
|
| 193 |
+
|
| 194 |
+
# # 1. Path Check
|
| 195 |
+
# local_path = r"D:\FayzaAhmed\Graduation_project\models\MultiModal\deployment\interview_test.mp4"
|
| 196 |
+
# if not os.path.exists(local_path):
|
| 197 |
+
# return {"error": f"Video file not found at {local_path}"}
|
| 198 |
+
|
| 199 |
+
# # 2. Prepare Data
|
| 200 |
+
# final_questions = []
|
| 201 |
+
# for q in data['answers']:
|
| 202 |
+
# if q.get('isAnswered'):
|
| 203 |
+
# final_questions.append({
|
| 204 |
+
# "question_id": q['aiQuestionId'],
|
| 205 |
+
# "question_text": q['questionText'],
|
| 206 |
+
# "ideal_answer": q['expectedAnswer'],
|
| 207 |
+
# "start_time": time_to_seconds(q['startedAt']),
|
| 208 |
+
# "end_time": time_to_seconds(q['submittedAt'])
|
| 209 |
+
# })
|
| 210 |
+
|
| 211 |
+
# # 3. Run Pipeline
|
| 212 |
+
# print("[LOG] Running AI Pipeline...")
|
| 213 |
+
# run_intervision_pipeline(local_path, final_questions, RESULT_DIR)
|
| 214 |
+
|
| 215 |
+
# # 4. Upload
|
| 216 |
+
# print("[LOG] Uploading to Cloudinary...")
|
| 217 |
+
# final_video_path = os.path.join(RESULT_DIR, "Intervision_Final_Result.mp4")
|
| 218 |
+
# upload_res = cloudinary.uploader.upload(
|
| 219 |
+
# final_video_path,
|
| 220 |
+
# public_id=f"{data['sessionId']}_test",
|
| 221 |
+
# folder="intervision_tests",
|
| 222 |
+
# resource_type="video" # This is the important part
|
| 223 |
+
# )
|
| 224 |
+
|
| 225 |
+
# # 5. Load Report
|
| 226 |
+
# report_path = os.path.join(RESULT_DIR, "report.json")
|
| 227 |
+
# if not os.path.exists(report_path):
|
| 228 |
+
# return {"error": "Pipeline finished but report.json was not created."}
|
| 229 |
+
|
| 230 |
+
# with open(report_path, "r") as f:
|
| 231 |
+
# ai_results = json.load(f)["listOfAnswerReport"]
|
| 232 |
+
|
| 233 |
+
# return {
|
| 234 |
+
# "status": "Success",
|
| 235 |
+
# "videoUrl": upload_res.get("secure_url"),
|
| 236 |
+
# "report": ai_results
|
| 237 |
+
# }
|
| 238 |
+
|
| 239 |
+
# except Exception as e:
|
| 240 |
+
# print(f"[CRITICAL ERROR]: {str(e)}")
|
| 241 |
+
# return {"error": str(e), "traceback": "Check Terminal for details"}
|
| 242 |
+
|
| 243 |
+
"""QUESTIONS_CONFIG =
|
| 244 |
+
[
|
| 245 |
+
{
|
| 246 |
+
"question_id": 1,
|
| 247 |
+
"question_text": "how do you describe yourself",
|
| 248 |
+
"ideal_answer": "Being different means you have to work at belonging...",
|
| 249 |
+
"start_time": 0,
|
| 250 |
+
"end_time": 15,
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"question_id": 2,
|
| 254 |
+
"question_text": "Tell us about your biggest achievement",
|
| 255 |
+
"ideal_answer": "I am proud of accomplishing...",
|
| 256 |
+
"start_time": 15,
|
| 257 |
+
"end_time": 24,
|
| 258 |
+
}
|
| 259 |
+
]
|
| 260 |
+
"""
|
| 261 |
+
|
| 262 |
+
"""
|
| 263 |
+
{
|
| 264 |
+
"sessionId": "test-session-123",
|
| 265 |
+
"originalVideoUrl": "local_test_no_url",
|
| 266 |
+
"callbackBaseUrl": "http://localhost:8000",
|
| 267 |
+
"answers": [
|
| 268 |
+
{
|
| 269 |
+
"questionId": "q-1",
|
| 270 |
+
"aiQuestionId": 1,
|
| 271 |
+
"questionText": "How does the speaker encourage people to deal with their differences and uniqueness?",
|
| 272 |
+
"expectedAnswer": "When you're different, you have to work at the longing. Everybody wants to feel valued and accepted, and we think it should happen spontaneously, but it doesn't. Sometimes society tells us, and we tell ourselves, we don't fit the mold. Take a piece of paper and write down what makes you different. And I want you to celebrate it today and every day. Shout it from the rooftops. What makes me different is what has made me stand out and be successful. I also encourage you to be curious and ask, what is on other people's pieces of paper? What makes them different? Let's celebrate those imperfections that make us special. I hope that it teaches you that nobody has a claim on the word normal. We are all different. We are all quirky and unique, and that is what makes us wonderful.",
|
| 273 |
+
"isAnswered": true,
|
| 274 |
+
"isSkipped": false,
|
| 275 |
+
"isFailed": false,
|
| 276 |
+
"startedAt": "00:00:00",
|
| 277 |
+
"submittedAt": "00:00:55"
|
| 278 |
+
}
|
| 279 |
+
]
|
| 280 |
+
}
|
| 281 |
+
"""
|
face_landmarker.task
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
|
| 3 |
+
size 3758596
|
pipeline.py
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import cv2
|
| 4 |
+
import json
|
| 5 |
+
import math
|
| 6 |
+
import torch
|
| 7 |
+
import librosa
|
| 8 |
+
import ffmpeg
|
| 9 |
+
import numpy as np
|
| 10 |
+
import soundfile as sf
|
| 11 |
+
import mediapipe as mp
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from transformers import AutoImageProcessor, AutoModelForImageClassification, pipeline
|
| 14 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 15 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 16 |
+
from mediapipe.tasks import python
|
| 17 |
+
from mediapipe.tasks.python import vision
|
| 18 |
+
|
| 19 |
+
# Ignore unnecessary warnings
|
| 20 |
+
import warnings
|
| 21 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
| 22 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 23 |
+
|
| 24 |
+
# Run all torch models on GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Download and Initialize Mediapipe once (Global)
MODEL_PATH = "face_landmarker.task"
if not os.path.exists(MODEL_PATH):
    # NOTE(review): os.system + wget fails silently if wget is absent or
    # the download errors; consider requests/urllib with an explicit check.
    os.system(f"wget -O {MODEL_PATH} -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task")

# 3. Initialize Models
# All models are loaded once at import time and shared by every request;
# the first import therefore downloads several model checkpoints.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

FACE_MODEL_NAME = "dima806/facial_emotions_image_detection"
face_processor = AutoImageProcessor.from_pretrained(FACE_MODEL_NAME)
face_model = AutoModelForImageClassification.from_pretrained(FACE_MODEL_NAME).to(device).eval()

# Emotion Mapping for Wheel
# Per-emotion (valence, arousal) coordinates used by
# compute_valence_arousal_from_probs to place a point on the emotion wheel.
emotion_va = {
    "happy": (0.8, 0.2), "fear": (0.2, 0.8), "angry": (-0.7, 0.65),
    "sad": (-0.65, -0.55), "surprise": (0.1, -0.75), "disgust": (0.6, -0.4), "neutral": (0.0, 0.0)
}
# Ring layout: (label, angle in degrees, radius) — presumably consumed by
# the wheel-drawing code later in this file (not visible here).
EMOTION_RING = [
    ("Happy", 0, 0.84), ("Surprise", 45, 0.84), ("Fear", 100, 0.84),
    ("Sad", 160, 0.84), ("Disgust", 215, 0.84), ("Angry", 270, 0.84)
]
|
| 49 |
+
|
| 50 |
+
##Utility functions
|
| 51 |
+
|
| 52 |
+
def normalize(v, mn, mx):
    """Rescale *v* from the range [mn, mx] into [0, 1], clamping out-of-range
    values; returns 0.0 when the range is degenerate (mn == mx)."""
    span = mx - mn
    if span == 0:
        return 0.0
    return np.clip((v - mn) / span, 0, 1)
|
| 54 |
+
|
| 55 |
+
def extract_audio(v_in, a_out):
    """Extract the soundtrack of video *v_in* into *a_out* as mono 16 kHz audio."""
    stream = ffmpeg.input(v_in)
    stream = stream.output(a_out, ac=1, ar=16000)
    stream.overwrite_output().run(quiet=True)
|
| 57 |
+
|
| 58 |
+
def merge_audio_video(v_in, a_in, v_out):
    """Mux the video stream of *v_in* with the audio of *a_in* into *v_out*
    (H.264 video, AAC audio), overwriting any existing output file."""
    video_stream = ffmpeg.input(v_in).video
    audio_stream = ffmpeg.input(a_in).audio
    job = ffmpeg.output(video_stream, audio_stream, v_out, vcodec="libx264", acodec="aac")
    job.overwrite_output().run(quiet=True)
|
| 60 |
+
|
| 61 |
+
def draw_face_box(frame, x, y, w, h, emotion_name=""):
    """Draw a green face box with corner accents and an optional emotion label.

    NOTE(review): this definition is shadowed by a later draw_face_box
    further down this file, so it is dead code at import time.
    """
    color, th, cl = (0, 255, 100), 2, 20 # Green color
    # th is assigned but never used below (rectangle uses thickness 1).
    cv2.rectangle(frame, (x, y), (x+w, y+h), color, 1)

    # Add emotion name above face box
    if emotion_name:
        cv2.putText(
            frame,
            emotion_name.upper(),
            (x + 10, y - 15),
            cv2.FONT_HERSHEY_DUPLEX,
            0.7,
            (0, 255, 100),
            2,
            cv2.LINE_AA
        )

    # Corners
    # Each tuple is (anchor_x, anchor_y, delta_x, delta_y) for one tick line.
    for px, py, dx, dy in [(x,y,cl,0), (x,y,0,cl), (x+w,y,-cl,0), (x+w,y,0,cl), (x,y+h,cl,0), (x,y+h,0,-cl), (x+w,y+h,-cl,0), (x+w,y+h,0,-cl)]:
        cv2.line(frame, (px, py), (px+dx, py+dy), color, 5)
    return frame
|
| 82 |
+
|
| 83 |
+
def compute_eye_contact_ratio(frame, landmarks):
    """Eye-openness score in [0, 1] from face-mesh landmarks (EAR * 3, clamped).

    NOTE(review): shadowed by a later definition of the same name in this
    file, so this version is dead code at import time. Unlike the later
    one, it does not guard against an empty landmark list.
    """
    h, w, _ = frame.shape
    def ear(idx):
        # Eye-aspect-ratio over six landmarks: (|p1-p5| + |p2-p4|) / (2|p0-p3|).
        p = [np.array([landmarks[i].x * w, landmarks[i].y * h]) for i in idx]
        return (np.linalg.norm(p[1]-p[5]) + np.linalg.norm(p[2]-p[4])) / (2.0 * np.linalg.norm(p[0]-p[3]))
    # Left then right eye (MediaPipe FaceMesh indices), averaged.
    avg_ear = (ear([33, 160, 158, 133, 153, 144]) + ear([362, 385, 387, 263, 373, 380])) / 2.0
    return min(max(avg_ear * 3, 0), 1)
|
| 90 |
+
|
| 91 |
+
def analyze_face_emotion(frame):
    """Classify facial emotion on one BGR frame; returns {label: probability}.

    NOTE(review): shadowed by a later analyze_face_emotion in this file,
    so this version is dead code at import time.
    """
    # cv2 frames are BGR; the HF image processor expects RGB.
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = face_processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = face_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    return {face_model.config.id2label[i].lower(): float(probs[i]) for i in range(len(probs))}
|
| 98 |
+
|
| 99 |
+
##Audio analysis
|
| 100 |
+
|
| 101 |
+
def extract_audio_features(y, sr):
    """Extract prosodic features from a mono waveform.

    Args:
        y: audio samples as a 1-D array (as returned by librosa.load).
        sr: sample rate in Hz.

    Returns:
        dict with keys pitch_std, jitter, energy_std, pause_ratio,
        speech_rate — all zeros for a zero-length signal.
    """
    duration = librosa.get_duration(y=y, sr=sr)
    if duration == 0:
        return {"pitch_std": 0, "jitter": 0, "energy_std": 0, "pause_ratio": 0, "speech_rate": 0}

    # Pitch & Jitter
    # YIN f0 estimate restricted to the typical speech range 75-300 Hz;
    # unvoiced frames come back NaN and are dropped.
    f0 = librosa.yin(y, fmin=75, fmax=300, sr=sr)
    f0 = f0[~np.isnan(f0)]
    pitch_std = np.std(f0) if len(f0) else 0
    # Jitter: mean relative frame-to-frame pitch change (guarded against /0).
    jitter = np.mean(np.abs(np.diff(f0)) / np.maximum(f0[:-1], 1e-6)) if len(f0) > 1 else 0

    # Energy
    rms = librosa.feature.rms(y=y)[0]
    energy_std = np.std(rms)

    # Pause ratio: fraction of the clip not detected as speech
    # (non-silent intervals split at 20 dB below peak).
    intervals = librosa.effects.split(y, top_db=20)
    speech_duration = sum((e - s) for s, e in intervals) / sr
    pause_ratio = 1 - (speech_duration / duration) if duration > 0 else 0

    # Speech Rate: onset events per second (a rough syllable-rate proxy).
    oenv = librosa.onset.onset_strength(y=y, sr=sr)
    onsets = librosa.onset.onset_detect(onset_envelope=oenv, sr=sr)
    speech_rate = len(onsets) / duration if duration > 0 else 0

    return {
        "pitch_std": pitch_std,
        "jitter": jitter,
        "energy_std": energy_std,
        "pause_ratio": pause_ratio,
        "speech_rate": speech_rate
    }
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def compute_audio_scores(features, baseline=None):
    """
    Fairness-aware audio scoring relative to a personal baseline.

    Args:
        features: feature dict as produced by extract_audio_features.
        baseline: optional dict with the same keys; population-level
            defaults are used when omitted.

    Returns:
        dict with confidence_audio, clarity, stress, pauses (floats,
        0-100) and tone_of_voice (the dominant tone label).
    """
    reference = baseline if baseline is not None else {
        "pitch_std": 30.0,
        "energy_std": 0.05,
        "jitter": 0.02,
        "pause_ratio": 0.2,
        "speech_rate": 4.0,
    }

    # Ratio of each current feature to its baseline (guarded against /0).
    def ratio(key):
        return features[key] / max(reference[key], 1e-6)

    pitch_ratio = ratio("pitch_std")
    energy_ratio = ratio("energy_std")
    rate_ratio = ratio("speech_rate")

    # Deviation from the "no change" point (ratio == 1).
    pitch_dev = abs(1 - pitch_ratio)
    energy_dev = abs(1 - energy_ratio)
    rate_dev = abs(1 - rate_ratio)

    # Stress: weighted pitch/energy deviation plus jitter, offset by +20,
    # clipped to [0, 100].
    stress = np.clip((pitch_dev * 0.4 + energy_dev * 0.4 + features["jitter"] * 0.2) * 150 + 20, 0, 100)

    # Clarity: penalise pausing beyond baseline and jitter.
    excess_pause = max(0, features["pause_ratio"] - reference["pause_ratio"])
    clarity = 100 - (excess_pause * 120 + features["jitter"] * 400)

    # Confidence: penalise rate/energy deviation and overall pausing.
    confidence_audio = 100 - (rate_dev * 40 + energy_dev * 30 + features["pause_ratio"] * 50)

    # Candidate tone scores based on the relative shifts; largest wins.
    tones = {
        "Confident": confidence_audio,
        "Hesitant": features["pause_ratio"] * 150,
        "Excited": (energy_ratio - 1) * 100 if energy_ratio > 1 else 0,
        "Unstable": stress,
        "Natural": 100 - (pitch_dev * 60 + rate_dev * 40),
    }
    dominant_tone = max(tones, key=tones.get)

    return {
        "confidence_audio": round(float(np.clip(confidence_audio, 0, 100)), 2),
        "clarity": round(float(np.clip(clarity, 0, 100)), 2),
        "stress": round(float(np.clip(stress, 0, 100)), 2),
        "pauses": round(float(features["pause_ratio"] * 100), 2),
        "tone_of_voice": dominant_tone,
    }
|
| 179 |
+
|
| 180 |
+
def analyze_audio_segment(audio_path, baseline=None):
    """
    Main entry point for audio segment analysis: load the file at 16 kHz,
    extract prosodic features, and score them against *baseline*.
    """
    samples, rate = librosa.load(audio_path, sr=16000)
    return compute_audio_scores(extract_audio_features(samples, rate), baseline)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
##Text analysis
|
| 190 |
+
|
| 191 |
+
def get_user_answer(audio_path):
    """Transcribe spoken audio to text with the global Whisper ASR pipeline."""
    transcription = asr(audio_path, chunk_length_s=20)
    text = transcription["text"]
    return text.strip()
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def compute_similarity_score(user_answer, ideal_answer):
    """Semantic similarity (0-100) between the user's answer and the ideal one,
    via sentence-embedding cosine similarity."""
    vectors = semantic_model.encode([user_answer, ideal_answer])
    similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]
    percentage = float(similarity * 100)
    return round(max(0, percentage), 2)
|
| 202 |
+
|
| 203 |
+
def compute_relevance_score(question, user_answer):
    """Question/answer relevance (0-100): cross-encoder logit squashed through
    a sigmoid and scaled to a percentage."""
    logit = cross_encoder.predict([(question, user_answer)])[0]
    probability = 1 / (1 + np.exp(-logit))
    percentage = float(probability * 100)
    return round(max(0, percentage), 2)
|
| 208 |
+
|
| 209 |
+
##Video
|
| 210 |
+
|
| 211 |
+
# Per-eye landmark indices (MediaPipe FaceMesh numbering).
LEFT_EYE = [33, 160, 158, 133, 153, 144]
RIGHT_EYE = [362, 385, 387, 263, 373, 380]

# Eye Contact Function
def compute_eye_contact_ratio(frame, landmarks):
    """
    Estimate an eye-contact score in [0, 1] from detected face landmarks.

    Returns the neutral value 0.5 when no landmarks are available.
    """
    if not landmarks:
        return 0.5

    h, w, _ = frame.shape

    def eye_aspect_ratio(indices):
        # Pixel-space coordinates of the six eye landmarks.
        pts = [np.array([landmarks[i].x * w, landmarks[i].y * h]) for i in indices]
        # EAR = (|p1-p5| + |p2-p4|) / (2 * |p0-p3|)
        vertical = np.linalg.norm(pts[1] - pts[5]) + np.linalg.norm(pts[2] - pts[4])
        horizontal = np.linalg.norm(pts[0] - pts[3])
        return vertical / (2.0 * horizontal)

    mean_ear = (eye_aspect_ratio(LEFT_EYE) + eye_aspect_ratio(RIGHT_EYE)) / 2.0

    # Scale and clamp into [0, 1].
    return min(max(mean_ear * 3, 0), 1)
|
| 249 |
+
|
| 250 |
+
def analyze_face_emotion(frame):
    """
    Predict facial emotion probabilities from a single BGR frame.

    Returns a {emotion_name: probability} dict with lower-cased labels.
    """
    # cv2 frames are BGR; the HF image processor expects RGB PIL images.
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    batch = face_processor(images=pil_image, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = face_model(**batch).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    id2label = face_model.config.id2label

    return {id2label[idx].lower(): float(p) for idx, p in enumerate(probabilities)}
|
| 274 |
+
|
| 275 |
+
def draw_face_box(frame, x, y, w, h, emotion_label="Neutral"):
    """
    Draw a face bounding box with the emotion label centered above it.

    Args:
        frame: BGR image, modified in place.
        x, y: top-left corner of the box in pixels.
        w, h: box width and height in pixels.
        emotion_label: text rendered (capitalized) above the box.

    Returns:
        The same frame, for call chaining.
    """

    # Green color for face box
    color = (0, 255, 0)

    thickness = 2
    corner_len = 22

    # Main rectangle
    cv2.rectangle(frame, (x, y), (x + w, y + h), color, thickness)

    # Decorative corner lines (thicker strokes at all four corners)
    for (px, py, dx, dy) in [
        (x, y, corner_len, 0), (x, y, 0, corner_len),
        (x + w, y, -corner_len, 0), (x + w, y, 0, corner_len),
        (x, y + h, corner_len, 0), (x, y + h, 0, -corner_len),
        (x + w, y + h, -corner_len, 0), (x + w, y + h, 0, -corner_len),
    ]:
        cv2.line(frame, (px, py), (px + dx, py + dy), color, 4)

    # Draw emotion text above the face box
    label_text = emotion_label.capitalize()

    (tw, th), _ = cv2.getTextSize(
        label_text,
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        2
    )

    text_x = x + (w - tw) // 2
    # Fix: the original used y - 10 unconditionally, which pushes the
    # label above the frame (invisible) when the face touches the top
    # edge; clamp so the text baseline stays inside the image.
    text_y = max(y - 10, th + 4)

    cv2.putText(
        frame,
        label_text,
        (text_x, text_y),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        (0, 255, 0),
        2,
        cv2.LINE_AA
    )

    return frame
|
| 323 |
+
|
| 324 |
+
def compute_valence_arousal_from_probs(emotion_probs):
    """
    Probability-weighted average of per-emotion (valence, arousal) pairs.

    Emotions missing from the `emotion_va` table are ignored; when no
    recognized emotion carries weight, returns the neutral origin (0, 0).
    """
    weighted_v = 0.0
    weighted_a = 0.0
    mass = 0.0

    for name, prob in emotion_probs.items():
        va = emotion_va.get(name.lower())
        if va is None:
            continue  # emotion not in the valence/arousal table
        weighted_v += va[0] * prob
        weighted_a += va[1] * prob
        mass += prob

    if mass == 0:
        return 0.0, 0.0

    return weighted_v / mass, weighted_a / mass
|
| 339 |
+
|
| 340 |
+
def draw_full_emotion_wheel(panel, center, radius, valence, arousal,
                            dominant_emotion="neutral"):
    """
    Draw a circular valence/arousal "emotion wheel" overlay onto `panel`.

    Args:
        panel: BGR image, drawn on in place.
        center: (cx, cy) pixel center of the wheel.
        radius: wheel radius in pixels.
        valence: horizontal position of the state dot; positive moves right.
        arousal: vertical position of the state dot; positive moves up.
            NOTE(review): both appear scaled for a [-1, 1] range (the dot
            is placed at 0.88 * radius) — confirm against callers.
        dominant_emotion: ring label to highlight (case-insensitive).

    Returns:
        The same panel, for call chaining.
    """
    cx, cy = center

    # Circle background: filled dark disc, outer rim, and two faint
    # concentric guide rings at 1/3 and 2/3 of the radius.
    cv2.circle(panel, center, radius + 5, (15, 15, 25), -1)
    cv2.circle(panel, center, radius, (60, 60, 85), 2)
    for rf in [0.33, 0.66]:
        cv2.circle(panel, center, int(radius * rf), (35, 35, 50), 1)

    # Drawing dividing lines between emotions (six 60-degree sectors)
    for angle_deg in range(0, 360, 60):
        rad = math.radians(angle_deg)
        x1 = int(cx + radius * math.cos(rad))
        # minus sin: image y grows downward, wheel angles grow upward
        y1 = int(cy - radius * math.sin(rad))
        cv2.line(panel, (cx, cy), (x1, y1), (40, 40, 60), 1)

    # Drawing emotion labels.
    # EMOTION_RING entries unpack as (label, angle_deg, radius_fraction);
    # an entry with angle None is skipped (not placed on the ring).
    ef, es, et = cv2.FONT_HERSHEY_SIMPLEX, 0.40, 1
    for emotion_data in EMOTION_RING:
        if emotion_data[1] is None:
            continue

        label, angle_deg, rf = emotion_data
        rad = math.radians(angle_deg)
        lx = int(cx + rf * radius * math.cos(rad))
        ly = int(cy - rf * radius * math.sin(rad))
        # Center the text on its anchor point.
        (tw, th), _ = cv2.getTextSize(label, ef, es, et)
        tx, ty = lx - tw//2, ly + th//2

        # Highlight active emotion (slightly larger, teal, bolder stroke)
        if label.lower() == dominant_emotion.lower():
            cv2.putText(panel, label, (tx, ty), ef, es+0.08, (0, 255, 200), 2, cv2.LINE_AA)
        else:
            cv2.putText(panel, label, (tx, ty), ef, es, (190, 190, 255), et, cv2.LINE_AA)

    # Neutral in center (teal when dominant, grey otherwise)
    nc = (0, 255, 200) if dominant_emotion == "neutral" else (160, 160, 160)
    (tw, th), _ = cv2.getTextSize("Neutral", ef, es, et)
    cv2.putText(panel, "Neutral", (cx-tw//2, cy+th//2), ef, es, nc, et, cv2.LINE_AA)

    # Animated dot with glow: three concentric discs of decreasing radius
    # and increasing brightness at the (valence, arousal) position.
    dot_x = int(cx + valence * radius * 0.88)
    dot_y = int(cy - arousal * radius * 0.88)
    cv2.circle(panel, (dot_x, dot_y), 15, (160, 120, 0), -1)
    cv2.circle(panel, (dot_x, dot_y), 11, (220, 180, 0), -1)
    cv2.circle(panel, (dot_x, dot_y), 7, (255, 230, 60), -1)

    return panel
|
| 389 |
+
|
| 390 |
+
# (label, fill BGR, background BGR) for each metric bar, consumed by
# draw_metric_bars. NOTE: OpenCV uses BGR channel order, so the rendered
# hues differ from what the previous comments claimed — e.g.
# (70, 180, 255) is B=70, G=180, R=255, a warm orange.
BAR_CONFIGS = [
    ("Confidence", (70, 180, 255), (30, 50, 100)),  # warm orange fill
    ("Clarity", (100, 220, 150), (25, 70, 50)),  # soft green fill
    ("Stress", (255, 120, 100), (100, 40, 30)),  # light blue fill
]
|
| 395 |
+
|
| 396 |
+
def draw_metric_bars(panel,
                     bars_x_start,
                     bar_y_top,
                     bar_height,
                     bar_width,
                     bar_gap,
                     confidence,
                     clarity,
                     stress):
    """
    Draw horizontal metric bars (Confidence, Clarity, Stress) with a
    label above each bar and the percentage inside it.

    Args:
        panel: BGR image, drawn on in place.
        bars_x_start: left x coordinate of every bar, in pixels.
        bar_y_top: y coordinate of the first label's baseline.
        bar_height: bar thickness in pixels.
        bar_width: full bar length in pixels (i.e. 100%).
        bar_gap: extra vertical gap between consecutive bar blocks.
        confidence, clarity, stress: metric values, nominally 0-100;
            out-of-range values are clamped for display.

    Returns:
        The same panel, for call chaining.
    """

    values = [confidence, clarity, stress]

    # Extra vertical space for labels
    label_space = 20

    for i, value in enumerate(values):

        label, fill_color, bg_color = BAR_CONFIGS[i]

        # Fix: blended metrics can exceed 100 (e.g. stress mixes a 0-100
        # baseline with an additive emotion term), which made the filled
        # rectangle overflow the bar and the text read ">100%". Clamp.
        value = min(max(value, 0), 100)

        # Each bar block height = label + bar + gap
        y = bar_y_top + i * (bar_height + label_space + bar_gap)

        x_right = bars_x_start + bar_width

        filled = int((value / 100) * bar_width)

        # Draw label above bar
        cv2.putText(
            panel,
            label,
            (bars_x_start, y),
            cv2.FONT_HERSHEY_DUPLEX,
            0.6,
            (230, 230, 230),
            1,
            cv2.LINE_AA
        )

        # Move bar slightly down to leave space for label
        bar_y = y + 8

        # Draw background (empty) bar
        cv2.rectangle(
            panel,
            (bars_x_start, bar_y),
            (x_right, bar_y + bar_height),
            bg_color,
            -1
        )

        # Draw filled portion proportional to the metric value
        cv2.rectangle(
            panel,
            (bars_x_start, bar_y),
            (bars_x_start + filled, bar_y + bar_height),
            fill_color,
            -1
        )

        # Draw percentage text inside the bar
        cv2.putText(
            panel,
            f"{int(value)}%",
            (bars_x_start + 12, bar_y + bar_height - 6),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            (255, 255, 255),
            2,
            cv2.LINE_AA
        )

    return panel
|
| 472 |
+
|
| 473 |
+
##Integrated Video Processing (Analysis + Annotation)
|
| 474 |
+
|
| 475 |
+
def process_video_segment(video_path, output_dir, segment_id, audio_scores_global=None):
    """
    Annotate one video segment with a face box, the emotion wheel, and
    metric bars, writing a temporary annotated video (no audio).

    Heavy AI (MediaPipe landmarks + emotion classifier) runs only every
    `frame_stride` frames; intermediate frames reuse the last results so
    processing stays fast while the overlay animates every frame.

    Args:
        video_path: path of the raw segment to annotate.
        output_dir: directory to place the temporary annotated video in.
        segment_id: identifier embedded in the temp file name.
        audio_scores_global: dict with "confidence_audio", "clarity" and
            "stress" keys; may be None, in which case defaults are used.

    Returns:
        (temp_video_path, mean_face_confidence, mean_eye_contact_pct);
        the means fall back to 50 when no face was ever detected.
    """
    # Fix: the original called .get() on the default None, raising
    # AttributeError whenever the argument was omitted.
    if audio_scores_global is None:
        audio_scores_global = {}

    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    options = vision.FaceLandmarkerOptions(
        base_options=base_options,
        running_mode=vision.RunningMode.VIDEO,
        num_faces=1,
    )

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Fix: some containers report FPS as 0, which made the timestamp
    # computation below divide by zero. Fall back to a sane default.
    if not fps or fps <= 0:
        fps = 30.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    temp_video = os.path.join(output_dir, f"temp_annotated_{segment_id}.mp4")
    # Use 'avc1' or 'H264' for web compatibility
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(temp_video, fourcc, fps, (width, height))

    face_conf_accum, eye_accum, frame_idx = [], [], 0
    smooth_v, smooth_a, dom_emo = 0.0, 0.0, "neutral"

    # --- Optimization Variables ---
    frame_stride = 3  # Process AI every 3 frames
    last_results = None
    last_emotions = None
    last_eye_s = 0.5
    last_lm = None
    # ------------------------------

    # Audio baselines blended into the per-frame visual metrics below.
    b_conf = audio_scores_global.get("confidence_audio", 50)
    b_clar = audio_scores_global.get("clarity", 50)
    b_stress = audio_scores_global.get("stress", 20)

    with vision.FaceLandmarker.create_from_options(options) as landmarker:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # 1. RUN HEAVY AI ONLY ON STRIDE FRAMES
            if frame_idx % frame_stride == 0:
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB,
                                    data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                last_results = landmarker.detect_for_video(mp_image, int((frame_idx / fps) * 1000))

                if last_results.face_landmarks:
                    last_lm = last_results.face_landmarks[0]
                    last_emotions = analyze_face_emotion(frame)
                    last_eye_s = compute_eye_contact_ratio(frame, last_lm)

            # 2. USE LAST KNOWN DATA FOR CALCULATIONS & DRAWING
            d_conf, d_clar, d_stress = b_conf, b_clar, b_stress

            if last_results and last_results.face_landmarks:
                # Blend the audio baseline with the latest face-derived values.
                curr_f_conf = (last_emotions.get("neutral", 0) + last_emotions.get("happy", 0)) * 100
                d_conf = (b_conf * 0.7) + (curr_f_conf * 0.3)
                d_clar = (b_clar * 0.8) + (last_eye_s * 100 * 0.2)
                d_stress = (b_stress * 0.7) + ((last_emotions.get("sad", 0) + last_emotions.get("angry", 0)) * 30)

                # Update accumulators only on stride frames to keep averages accurate
                if frame_idx % frame_stride == 0:
                    face_conf_accum.append(curr_f_conf)
                    eye_accum.append(last_eye_s)

                dom_emo = max(last_emotions, key=last_emotions.get)
                v_t = sum(emotion_va[e][0] * s for e, s in last_emotions.items() if e in emotion_va)
                a_t = sum(emotion_va[e][1] * s for e, s in last_emotions.items() if e in emotion_va)

                # Keep smoothing every frame for fluid movement
                smooth_v += 0.15 * (v_t - smooth_v)
                smooth_a += 0.15 * (a_t - smooth_a)

                # Draw face box using the last known landmarks' bounding extent
                xs, ys = [l.x * width for l in last_lm], [l.y * height for l in last_lm]
                draw_face_box(
                    frame,
                    int(min(xs)), int(min(ys)),
                    int(max(xs) - min(xs)), int(max(ys) - min(ys)),
                    dom_emo
                )

            # 3. ALWAYS DRAW UI (Wheel and Bars)
            frame = draw_full_emotion_wheel(frame, (width - 130, height - 100), 90, smooth_v, smooth_a, dom_emo)
            frame = draw_metric_bars(frame, 30, height - 160, 28, 200, 6, d_conf, d_clar, d_stress)

            out.write(frame)
            frame_idx += 1

    cap.release()
    out.release()
    return (temp_video,
            np.mean(face_conf_accum) if face_conf_accum else 50,
            np.mean(eye_accum) * 100 if eye_accum else 50)
|
| 560 |
+
|
| 561 |
+
##Main pipeline
|
| 562 |
+
def run_intervision_pipeline(video_path, questions_config, output_dir):
    """
    End-to-end pipeline: cut the interview video per question, analyze
    audio + transcript + visuals, annotate each segment, then concatenate
    the annotated segments and write a JSON report.

    Args:
        video_path: path to the full interview recording.
        questions_config: list of dicts with keys 'question_id',
            'start_time', 'end_time', 'question_text', 'ideal_answer'.
        output_dir: directory for all intermediate and final outputs.

    Returns:
        A human-readable status string.
    """
    import soundfile as sf  # deferred import kept local, hoisted out of the loop

    if not os.path.exists(video_path):
        return f"Error: Video file not found at {video_path}"

    os.makedirs(output_dir, exist_ok=True)

    # Establish baseline from first 10s
    try:
        y_b, sr_b = librosa.load(video_path, sr=16000, duration=10)
        baseline = extract_audio_features(y_b, sr_b)
    except Exception as e:
        print(f"Baseline Load Warning: {e}. Using defaults.")
        baseline = None

    final_reports, segments = [], []

    for q in questions_config:
        q_id = q['question_id']
        raw_seg = os.path.join(output_dir, f"q{q_id}_raw.mp4")
        wav_p = os.path.join(output_dir, f"q{q_id}.wav")

        # Precise FFmpeg cutting with error handling
        duration = q["end_time"] - q["start_time"]
        try:
            subprocess.run([
                'ffmpeg', '-y', '-ss', str(q["start_time"]), '-t', str(duration),
                '-i', video_path, '-c:v', 'libx264', '-c:a', 'aac', '-strict', 'experimental', raw_seg
            ], check=True, capture_output=True)
        except subprocess.CalledProcessError:
            print(f"Skipping Question {q_id}: Time range might be out of video bounds.")
            continue

        # Audio Extraction (re-sample segment audio to 16 kHz mono wav)
        try:
            y, sr = librosa.load(raw_seg, sr=16000)
            sf.write(wav_p, y, sr)
        except Exception as e:
            print(f"Error extracting audio for Q{q_id}: {e}")
            continue

        # Audio Analysis
        a_scores = compute_audio_scores(extract_audio_features(y, sr), baseline)

        # Whisper Transcription
        try:
            transcription_data = asr(wav_p, chunk_length_s=30, return_timestamps=True)
            transcription = transcription_data["text"].strip()
        # Fix: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            transcription = "[Transcription Error]"

        similarity_score = compute_similarity_score(transcription, q["ideal_answer"])
        relevance_score = compute_relevance_score(q["question_text"], transcription)

        # Visual Analysis
        try:
            ann_v, f_c, e_c = process_video_segment(raw_seg, output_dir, q_id, a_scores)

            # Mux: annotated video stream + original segment's audio stream.
            final_v = os.path.join(output_dir, f"q{q_id}_final.mp4")
            subprocess.run([
                'ffmpeg', '-y', '-i', ann_v, '-i', raw_seg, '-map', '0:v', '-map', '1:a',
                '-c:v', 'copy', '-c:a', 'aac', final_v
            ], check=True, capture_output=True)

            segments.append(final_v)

            final_reports.append({
                "questionId": q_id,
                "userAnswerText": transcription,
                "toneOfVoice": a_scores["tone_of_voice"],
                "clarity": a_scores["clarity"],
                "stress": a_scores["stress"],
                "confidence": round((a_scores["confidence_audio"] + f_c + e_c) / 3, 2),
                "pauses": a_scores["pauses"],
                "score": similarity_score,
                "relevance": relevance_score
            })
        except Exception as e:
            print(f"Visual analysis failed for Q{q_id}: {e}")

        torch.cuda.empty_cache()

    # Final concatenation
    if segments:
        list_path = os.path.join(output_dir, "list.txt")
        with open(list_path, "w") as f:
            for s in segments:
                f.write(f"file '{os.path.abspath(s)}'\n")

        final_output = os.path.join(output_dir, "Intervision_Final_Result.mp4")
        # Fix: the original shelled out via os.system with an f-string,
        # which breaks on paths containing spaces and is injection-prone.
        # Use an argv list like every other ffmpeg call in this function
        # (check=False mirrors os.system's ignore-exit-code behavior).
        subprocess.run([
            'ffmpeg', '-f', 'concat', '-safe', '0', '-i', list_path,
            '-c:v', 'libx264', '-preset', 'superfast', '-crf', '23',
            '-c:a', 'aac', '-y', final_output
        ], check=False, capture_output=True)

        with open(os.path.join(output_dir, "report.json"), "w") as f:
            json.dump({"listOfAnswerReport": final_reports}, f, indent=4)

        return f"Successfully processed {len(segments)} questions."
    else:
        return "No segments were processed. Check your video time ranges."
|
requirements.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
python-multipart
|
| 4 |
+
ffmpeg-python
|
| 5 |
+
transformers
|
| 6 |
+
timm
|
| 7 |
+
mediapipe
|
| 8 |
+
deepface
|
| 9 |
+
sentence-transformers
|
| 10 |
+
librosa
|
| 11 |
+
torchaudio
|
| 12 |
+
opencv-python
|
| 13 |
+
numpy
|
| 14 |
+
soundfile
|
| 15 |
+
pillow
|
| 16 |
+
scikit-learn
|
| 17 |
+
# ffmpeg  # NOTE: the PyPI "ffmpeg" package is NOT the ffmpeg binary; ffmpeg-python (above) provides the bindings and the binary must come from the OS image
|
| 18 |
+
# python-multipart  # duplicate — already listed near the top of this file
|
| 19 |
+
cloudinary
|
| 20 |
+
requests
|
| 21 |
+
python-dotenv
|
| 22 |
+
opencv-python-headless
|
| 23 |
+
openai-whisper
|