Fayza38 committed on
Commit
9b5af42
·
verified ·
1 Parent(s): e31e956

Upload 5 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +21 -0
  3. app.py +281 -0
  4. face_landmarker.task +3 -0
  5. pipeline.py +659 -0
  6. requirements.txt +23 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ face_landmarker.task filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: the official CPython 3.9 image (CPU only; it ships no CUDA runtime).
FROM python:3.9

# Set working directory
WORKDIR /code

# System dependencies:
#   libgl1, libglib2.0-0 - shared libraries required by OpenCV / MediaPipe
#   ffmpeg               - the binary that pipeline.py drives through ffmpeg-python
# `libgl1` is used instead of `libgl1-mesa-glx`: the latter is a transitional
# package that recent Debian releases (the base of this image) no longer provide.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is cached across code changes
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the application
COPY . .

# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import cloudinary
5
+ import cloudinary.uploader
6
+ from requests.adapters import HTTPAdapter
7
+ from urllib3.util.retry import Retry
8
+ from fastapi import FastAPI, Body, HTTPException, BackgroundTasks
9
+ from dotenv import load_dotenv
10
+ from datetime import datetime
11
+ from pipeline import run_intervision_pipeline
12
+
13
# --- Shared HTTP session with automatic retries ---
# Up to 3 retries with exponential back-off; the listed status codes are
# treated as retryable server-side failures.
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
for _scheme in ("https://", "http://"):
    http.mount(_scheme, adapter)

# Environment variables (.env) must be loaded before Cloudinary is configured.
load_dotenv()

app = FastAPI(title="Intervision AI Engine")

# Cloudinary credentials come from the environment.
cloudinary.config(
    cloud_name=os.getenv("CLOUDINARY_CLOUD_NAME"),
    api_key=os.getenv("CLOUDINARY_API_KEY"),
    api_secret=os.getenv("CLOUDINARY_API_SECRET"),
)

# Working directories for downloaded inputs and pipeline outputs.
RESULT_DIR = "temp_data/results"
UPLOAD_DIR = "temp_data/uploads"
for _directory in (RESULT_DIR, UPLOAD_DIR):
    os.makedirs(_directory, exist_ok=True)
41
+
42
def time_to_seconds(t_str: str) -> int:
    """Convert a clock-style timestamp to whole seconds.

    Accepts ``HH:MM:SS`` (the original format) as well as ``MM:SS`` and a
    bare ``SS``. Any field may carry a fractional part (``"00:00:55.7"``);
    the result is truncated to an int.

    Args:
        t_str: Timestamp string. Empty or ``None`` yields ``0``.

    Returns:
        Total number of seconds as an int.

    Raises:
        ValueError: If a field is not numeric or there are more than three
            colon-separated fields.
    """
    if not t_str:
        return 0

    fields = str(t_str).strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Unsupported timestamp format: {t_str!r}")

    # Fold left to right: each step promotes the running total by one unit
    # (hours -> minutes -> seconds), so shorter forms need no special-casing.
    total = 0.0
    for field in fields:
        total = total * 60 + float(field)
    return int(total)
47
+
48
def _remove_quietly(path: str) -> None:
    """Delete *path* if it exists; log instead of raising when deletion fails."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"[CLEANUP ERROR]: {e}")


def background_processing(session_data: dict):
    """Run the full analysis for one interview session.

    Steps: download the source video, build the question list, run the AI
    pipeline, upload the annotated video to Cloudinary, POST the report to
    the backend callback, and finally remove the temporary files.

    Args:
        session_data: Request payload. Keys read here: ``sessionId``,
            ``originalVideoUrl``, ``callbackBaseUrl`` and ``answers`` (a list
            of dicts with ``isAnswered``, ``isSkipped``, ``aiQuestionId``,
            ``questionText``, ``expectedAnswer``, ``startedAt``,
            ``submittedAt``).

    Returns:
        None. Download, upload and callback errors are logged rather than
        raised; errors raised by the pipeline itself still propagate (after
        the temporary files have been cleaned up).
    """
    session_id = session_data.get('sessionId')
    video_url = session_data.get('originalVideoUrl')
    callback_url = session_data.get('callbackBaseUrl')

    print(f"[LOG] Processing started for session: {session_id}")

    local_input_path = os.path.join(UPLOAD_DIR, f"{session_id}_input.mp4")
    final_video_path = os.path.join(RESULT_DIR, "Intervision_Final_Result.mp4")
    report_path = os.path.join(RESULT_DIR, "report.json")

    # The output names are fixed, so anything left behind by an earlier run
    # could otherwise be read or uploaded as this session's result.
    # NOTE(review): because these names are shared, two sessions processed
    # concurrently would still overwrite each other's outputs - consider a
    # per-session result directory.
    for stale_path in (report_path, final_video_path):
        _remove_quietly(stale_path)

    try:
        # 1. Download the original video (5-minute timeout, retried by the
        #    shared session's adapter).
        try:
            print(f"[LOG] Downloading video: {video_url}")
            with http.get(video_url, stream=True, timeout=300) as response:
                response.raise_for_status()
                with open(local_input_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024*1024):
                        f.write(chunk)
        except Exception as e:
            print(f"[DOWNLOAD ERROR]: {e}")
            # TODO(review): the backend is not notified of download failures.
            return

        # 2. Prepare question list for the AI Pipeline
        final_questions = []
        skipped_failed_reports = []

        for q in session_data.get('answers', []):
            if q.get('isAnswered'):
                final_questions.append({
                    "question_id": q['aiQuestionId'],
                    "question_text": q['questionText'],
                    "ideal_answer": q['expectedAnswer'],
                    "start_time": time_to_seconds(q['startedAt']),
                    "end_time": time_to_seconds(q['submittedAt'])
                })
            else:
                # Questions that were not answered get a zeroed report entry.
                skipped_failed_reports.append({
                    "questionId": q['aiQuestionId'],
                    "userAnswerText": "N/A",
                    "score": 0.0,
                    "relevance": 0.0,
                    "confidence": 0.0,
                    "stress": 0.0,
                    "clarity": 0.0,
                    "pauses": 0.0,
                    "toneOfVoice": "N/A",
                    "status": "skipped" if q.get('isSkipped') else "failed"
                })

        # 3. Execute AI Pipeline (Analysis & Visualization)
        ai_results = []
        if final_questions:
            # run_intervision_pipeline generates Intervision_Final_Result.mp4
            run_intervision_pipeline(local_input_path, final_questions, RESULT_DIR)
            if os.path.exists(report_path):
                with open(report_path, "r") as f:
                    ai_results = json.load(f).get("listOfAnswerReport", [])

        # 4. Upload the processed video to Cloudinary
        final_video_url = None
        if os.path.exists(final_video_path):
            try:
                upload_res = cloudinary.uploader.upload(
                    final_video_path,
                    public_id=f"res_{session_id}",
                    folder="intervision_results",
                    resource_type="video",
                    chunk_size=6000000
                )
                final_video_url = upload_res.get("secure_url")
            except Exception as e:
                print(f"[UPLOAD ERROR]: {e}")

        # 5. Construct final payload and notify Backend via Callback
        final_payload = {
            "sessionId": session_id,
            "finalVideoUrl": final_video_url,
            "report": ai_results + skipped_failed_reports
        }

        try:
            # Sent through the shared session so connection-level failures
            # go through the retry adapter mounted on it.
            cb_response = http.post(f"{callback_url}/api/ai-callback", json=final_payload, timeout=30)
            print(f"[LOG] Callback sent to {callback_url}. Status: {cb_response.status_code}")
        except Exception as e:
            print(f"[CALLBACK ERROR]: {e}")
    finally:
        # 6. Local cleanup runs on every exit path (success, early return or
        #    exception) so failed sessions do not fill the disk.
        for temp_path in (local_input_path, final_video_path, report_path):
            _remove_quietly(temp_path)
147
+
148
@app.post("/process-interview/")
async def process_interview(background_tasks: BackgroundTasks, data: dict = Body(...)):
    """Queue the analysis of one session and acknowledge immediately.

    The heavy work is handed to ``background_processing`` via FastAPI's
    background tasks; the response only echoes the session id.
    """
    session_id = data.get('sessionId')
    background_tasks.add_task(background_processing, data)
    return {"message": "Processing started", "sessionId": session_id}
153
+
154
@app.post("/delete-video-by-url/")
async def delete_video_by_url(data: dict = Body(...)):
    """Delete a video from Cloudinary given its delivery URL.

    Input JSON: {"videoUrl": "https://..."}

    The Cloudinary public id is derived from the URL path: the last path
    segment without its file extension, prefixed by the parent segment when
    that segment contains ``intervision_results``
    (``.../intervision_results/res_1.mp4`` -> ``intervision_results/res_1``).

    Returns:
        ``{"status": "success", ...}`` when Cloudinary reports ``ok``,
        otherwise ``{"status": "failed", "details": <cloudinary response>}``.

    Raises:
        HTTPException: 400 if ``videoUrl`` is missing or has no file name;
            500 if the Cloudinary call fails.
    """
    from urllib.parse import urlparse

    video_url = data.get("videoUrl")
    if not video_url:
        raise HTTPException(status_code=400, detail="videoUrl is required")

    # Work on the URL *path* only, so a query string or fragment can never
    # leak into the public id; empty segments (e.g. a trailing slash) are dropped.
    path_segments = [seg for seg in urlparse(str(video_url)).path.split('/') if seg]
    if not path_segments:
        raise HTTPException(status_code=400, detail="videoUrl does not contain a file name")

    # Strip only the final extension so ids that contain dots stay intact.
    filename = path_segments[-1].rsplit('.', 1)[0]
    if not filename:
        raise HTTPException(status_code=400, detail="videoUrl does not contain a file name")

    # Check if the video is inside the results folder
    parent = path_segments[-2] if len(path_segments) > 1 else ""
    folder = parent if "intervision_results" in parent else ""
    public_id = f"{folder}/{filename}" if folder else filename

    try:
        # Trigger deletion from Cloudinary
        result = cloudinary.uploader.destroy(public_id, resource_type="video")

        if result.get("result") == "ok":
            return {"status": "success", "message": f"Deleted {public_id}"}
        return {"status": "failed", "details": result}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
184
+
185
if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn on all
    # interfaces, port 8000.
    import uvicorn

    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)
188
+
189
+ # @app.post("/process-interview-test/")
190
+ # async def process_test(data: dict = Body(...)):
191
+ # try:
192
+ # print(f"--- [TEST LOG] Processing Session: {data['sessionId']} ---")
193
+
194
+ # # 1. Path Check
195
+ # local_path = r"D:\FayzaAhmed\Graduation_project\models\MultiModal\deployment\interview_test.mp4"
196
+ # if not os.path.exists(local_path):
197
+ # return {"error": f"Video file not found at {local_path}"}
198
+
199
+ # # 2. Prepare Data
200
+ # final_questions = []
201
+ # for q in data['answers']:
202
+ # if q.get('isAnswered'):
203
+ # final_questions.append({
204
+ # "question_id": q['aiQuestionId'],
205
+ # "question_text": q['questionText'],
206
+ # "ideal_answer": q['expectedAnswer'],
207
+ # "start_time": time_to_seconds(q['startedAt']),
208
+ # "end_time": time_to_seconds(q['submittedAt'])
209
+ # })
210
+
211
+ # # 3. Run Pipeline
212
+ # print("[LOG] Running AI Pipeline...")
213
+ # run_intervision_pipeline(local_path, final_questions, RESULT_DIR)
214
+
215
+ # # 4. Upload
216
+ # print("[LOG] Uploading to Cloudinary...")
217
+ # final_video_path = os.path.join(RESULT_DIR, "Intervision_Final_Result.mp4")
218
+ # upload_res = cloudinary.uploader.upload(
219
+ # final_video_path,
220
+ # public_id=f"{data['sessionId']}_test",
221
+ # folder="intervision_tests",
222
+ # resource_type="video" # This is the important part
223
+ # )
224
+
225
+ # # 5. Load Report
226
+ # report_path = os.path.join(RESULT_DIR, "report.json")
227
+ # if not os.path.exists(report_path):
228
+ # return {"error": "Pipeline finished but report.json was not created."}
229
+
230
+ # with open(report_path, "r") as f:
231
+ # ai_results = json.load(f)["listOfAnswerReport"]
232
+
233
+ # return {
234
+ # "status": "Success",
235
+ # "videoUrl": upload_res.get("secure_url"),
236
+ # "report": ai_results
237
+ # }
238
+
239
+ # except Exception as e:
240
+ # print(f"[CRITICAL ERROR]: {str(e)}")
241
+ # return {"error": str(e), "traceback": "Check Terminal for details"}
242
+
243
+ """QUESTIONS_CONFIG =
244
+ [
245
+ {
246
+ "question_id": 1,
247
+ "question_text": "how do you describe yourself",
248
+ "ideal_answer": "Being different means you have to work at belonging...",
249
+ "start_time": 0,
250
+ "end_time": 15,
251
+ },
252
+ {
253
+ "question_id": 2,
254
+ "question_text": "Tell us about your biggest achievement",
255
+ "ideal_answer": "I am proud of accomplishing...",
256
+ "start_time": 15,
257
+ "end_time": 24,
258
+ }
259
+ ]
260
+ """
261
+
262
+ """
263
+ {
264
+ "sessionId": "test-session-123",
265
+ "originalVideoUrl": "local_test_no_url",
266
+ "callbackBaseUrl": "http://localhost:8000",
267
+ "answers": [
268
+ {
269
+ "questionId": "q-1",
270
+ "aiQuestionId": 1,
271
+ "questionText": "How does the speaker encourage people to deal with their differences and uniqueness?",
272
+ "expectedAnswer": "When you're different, you have to work at the longing. Everybody wants to feel valued and accepted, and we think it should happen spontaneously, but it doesn't. Sometimes society tells us, and we tell ourselves, we don't fit the mold. Take a piece of paper and write down what makes you different. And I want you to celebrate it today and every day. Shout it from the rooftops. What makes me different is what has made me stand out and be successful. I also encourage you to be curious and ask, what is on other people's pieces of paper? What makes them different? Let's celebrate those imperfections that make us special. I hope that it teaches you that nobody has a claim on the word normal. We are all different. We are all quirky and unique, and that is what makes us wonderful.",
273
+ "isAnswered": true,
274
+ "isSkipped": false,
275
+ "isFailed": false,
276
+ "startedAt": "00:00:00",
277
+ "submittedAt": "00:00:55"
278
+ }
279
+ ]
280
+ }
281
+ """
face_landmarker.task ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596
pipeline.py ADDED
@@ -0,0 +1,659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import cv2
4
+ import json
5
+ import math
6
+ import torch
7
+ import librosa
8
+ import ffmpeg
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import mediapipe as mp
12
+ from PIL import Image
13
+ from transformers import AutoImageProcessor, AutoModelForImageClassification, pipeline
14
+ from sentence_transformers import SentenceTransformer, CrossEncoder
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ from mediapipe.tasks import python
17
+ from mediapipe.tasks.python import vision
18
+
19
+ # Ignore unnecessary warnings
20
+ import warnings
21
+ warnings.filterwarnings("ignore", category=UserWarning)
22
+ warnings.filterwarnings("ignore", category=FutureWarning)
23
+
24
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Download the MediaPipe face-landmarker model once (module import time)
MODEL_PATH = "face_landmarker.task"
MODEL_URL = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
if not os.path.exists(MODEL_PATH):
    # Download to a side file and rename only on success. Writing straight to
    # MODEL_PATH would leave an empty/partial file behind when wget fails,
    # and the existence check above would then skip every later attempt.
    _partial_path = MODEL_PATH + ".part"
    try:
        _wget = subprocess.run(["wget", "-q", "-O", _partial_path, MODEL_URL], check=False)
        _downloaded = _wget.returncode == 0 and os.path.getsize(_partial_path) > 0
    except OSError as _exc:
        # wget is not installed, or the side file was never created.
        print(f"[MODEL DOWNLOAD ERROR]: {_exc}")
        _downloaded = False

    if _downloaded:
        os.replace(_partial_path, MODEL_PATH)
    else:
        if os.path.exists(_partial_path):
            os.remove(_partial_path)
        print(f"[MODEL DOWNLOAD ERROR]: could not fetch {MODEL_PATH}")

# 3. Initialize Models (loaded once and shared by every request)
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

FACE_MODEL_NAME = "dima806/facial_emotions_image_detection"
face_processor = AutoImageProcessor.from_pretrained(FACE_MODEL_NAME)
face_model = AutoModelForImageClassification.from_pretrained(FACE_MODEL_NAME).to(device).eval()

# Emotion -> (valence, arousal) coordinates used to place the dot on the wheel.
# NOTE(review): these coordinates do not line up with the label angles in
# EMOTION_RING below (e.g. "surprise" has negative arousal here but its label
# sits at 45 degrees) - confirm which of the two tables is intended.
emotion_va = {
    "happy": (0.8, 0.2), "fear": (0.2, 0.8), "angry": (-0.7, 0.65),
    "sad": (-0.65, -0.55), "surprise": (0.1, -0.75), "disgust": (0.6, -0.4), "neutral": (0.0, 0.0)
}
# (label, angle in degrees, fraction of the wheel radius) for each ring label.
EMOTION_RING = [
    ("Happy", 0, 0.84), ("Surprise", 45, 0.84), ("Fear", 100, 0.84),
    ("Sad", 160, 0.84), ("Disgust", 215, 0.84), ("Angry", 270, 0.84)
]
49
+
50
+ ##Utility functions
51
+
52
def normalize(v, mn, mx):
    """Scale *v* from the range [mn, mx] into [0, 1], clipping at both ends.

    Returns 0.0 when the range is empty (``mx == mn``).
    """
    span = mx - mn
    if span == 0:
        return 0.0
    return np.clip((v - mn) / span, 0, 1)
54
+
55
def extract_audio(v_in, a_out):
    """Write the audio track of *v_in* to *a_out* as mono, 16 kHz.

    Any existing file at *a_out* is overwritten; ffmpeg output is silenced.
    """
    source = ffmpeg.input(v_in)
    target = source.output(a_out, ac=1, ar=16000)
    target.overwrite_output().run(quiet=True)
57
+
58
def merge_audio_video(v_in, a_in, v_out):
    """Mux the video stream of *v_in* with the audio stream of *a_in* into *v_out*.

    The output is encoded with libx264 video and AAC audio; an existing
    *v_out* is overwritten and ffmpeg output is silenced.
    """
    video_stream = ffmpeg.input(v_in).video
    audio_stream = ffmpeg.input(a_in).audio
    muxed = ffmpeg.output(video_stream, audio_stream, v_out, vcodec="libx264", acodec="aac")
    muxed.overwrite_output().run(quiet=True)
60
+
61
def draw_face_box(frame, x, y, w, h, emotion_name=""):
    """Draw a thin face rectangle with thick corner marks and an optional label.

    NOTE: a function with the same name is defined again further down in this
    module; at import time that later definition replaces this one.

    Args:
        frame: Image to draw on (modified in place and also returned).
        x, y, w, h: Top-left corner, width and height of the box in pixels.
        emotion_name: Text drawn in upper case above the box; skipped if empty.
    """
    green = (0, 255, 100)
    arm = 20
    right, bottom = x + w, y + h

    cv2.rectangle(frame, (x, y), (right, bottom), green, 1)

    # Add emotion name above face box
    if emotion_name:
        cv2.putText(
            frame,
            emotion_name.upper(),
            (x + 10, y - 15),
            cv2.FONT_HERSHEY_DUPLEX,
            0.7,
            (0, 255, 100),
            2,
            cv2.LINE_AA
        )

    # Corners: one horizontal and one vertical arm, both pointing into the box
    for cx, cy, sx, sy in (
        (x, y, 1, 1),
        (right, y, -1, 1),
        (x, bottom, 1, -1),
        (right, bottom, -1, -1),
    ):
        cv2.line(frame, (cx, cy), (cx + sx * arm, cy), green, 5)
        cv2.line(frame, (cx, cy), (cx, cy + sy * arm), green, 5)
    return frame
82
+
83
def compute_eye_contact_ratio(frame, landmarks):
    """Return an eye score in [0, 1] derived from the eye aspect ratio.

    NOTE: a function with the same name is defined again further down in this
    module; at import time that later definition replaces this one.

    Args:
        frame: Image whose ``shape`` supplies height and width for scaling.
        landmarks: Indexable face landmarks exposing normalised ``.x``/``.y``.
    """
    frame_h, frame_w, _ = frame.shape

    def _aspect_ratio(indices):
        # Six points per eye: [0]/[3] are the horizontal pair,
        # [1]-[5] and [2]-[4] are the two vertical pairs.
        pts = [
            np.array([landmarks[k].x * frame_w, landmarks[k].y * frame_h])
            for k in indices
        ]
        vertical = np.linalg.norm(pts[1] - pts[5]) + np.linalg.norm(pts[2] - pts[4])
        horizontal = np.linalg.norm(pts[0] - pts[3])
        return vertical / (2.0 * horizontal)

    left_ratio = _aspect_ratio([33, 160, 158, 133, 153, 144])
    right_ratio = _aspect_ratio([362, 385, 387, 263, 373, 380])
    mean_ratio = (left_ratio + right_ratio) / 2.0
    return min(max(mean_ratio * 3, 0), 1)
90
+
91
def analyze_face_emotion(frame):
    """Classify the emotion shown in a BGR frame.

    NOTE: a function with the same name is defined again further down in this
    module; at import time that later definition replaces this one.

    Returns:
        Dict mapping each lower-cased model label to its softmax probability.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)
    model_inputs = face_processor(images=pil_image, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = face_model(**model_inputs).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    id2label = face_model.config.id2label
    return {
        id2label[idx].lower(): float(probabilities[idx])
        for idx in range(len(probabilities))
    }
98
+
99
+ ##Audio analysis
100
+
101
def extract_audio_features(y, sr):
    """Extract prosodic features from a waveform.

    Args:
        y: Audio samples as accepted by librosa.
        sr: Sample rate of *y*.

    Returns:
        Dict with ``pitch_std``, ``jitter``, ``energy_std``, ``pause_ratio``
        and ``speech_rate``. All values are 0 for zero-length audio.
    """
    duration = librosa.get_duration(y=y, sr=sr)
    if duration == 0:
        return {"pitch_std": 0, "jitter": 0, "energy_std": 0, "pause_ratio": 0, "speech_rate": 0}

    # Pitch track (YIN, 75-300 Hz) with NaN frames discarded
    pitch_track = librosa.yin(y, fmin=75, fmax=300, sr=sr)
    pitch_track = pitch_track[~np.isnan(pitch_track)]

    pitch_std = np.std(pitch_track) if len(pitch_track) else 0

    # Jitter: mean relative change between consecutive pitch frames
    if len(pitch_track) > 1:
        jitter = np.mean(np.abs(np.diff(pitch_track)) / np.maximum(pitch_track[:-1], 1e-6))
    else:
        jitter = 0

    # Energy variability from the RMS envelope
    rms_envelope = librosa.feature.rms(y=y)[0]
    energy_std = np.std(rms_envelope)

    # Pause ratio: share of the clip outside the non-silent intervals
    voiced_intervals = librosa.effects.split(y, top_db=20)
    speech_duration = sum((end - start) for start, end in voiced_intervals) / sr
    pause_ratio = 1 - (speech_duration / duration) if duration > 0 else 0

    # Speech rate: detected onsets per second
    onset_envelope = librosa.onset.onset_strength(y=y, sr=sr)
    onset_frames = librosa.onset.onset_detect(onset_envelope=onset_envelope, sr=sr)
    speech_rate = len(onset_frames) / duration if duration > 0 else 0

    return {
        "pitch_std": pitch_std,
        "jitter": jitter,
        "energy_std": energy_std,
        "pause_ratio": pause_ratio,
        "speech_rate": speech_rate
    }
132
+
133
+
134
def compute_audio_scores(features, baseline=None):
    """Turn raw audio features into 0-100 scores relative to a baseline.

    Args:
        features: Dict with ``pitch_std``, ``energy_std``, ``speech_rate``,
            ``jitter`` and ``pause_ratio``.
        baseline: Dict with the same keys describing the speaker's reference
            values; built-in defaults are used when omitted.

    Returns:
        Dict with ``confidence_audio``, ``clarity``, ``stress`` (each clipped
        to 0-100 and rounded to 2 decimals), ``pauses`` (pause ratio as a
        percentage) and ``tone_of_voice`` (name of the highest-scoring tone).
    """
    if baseline is None:
        # Generic reference values used when no personal baseline exists.
        baseline = {"pitch_std": 30.0, "energy_std": 0.05, "jitter": 0.02, "pause_ratio": 0.2, "speech_rate": 4.0}

    jitter = features["jitter"]
    pause_ratio = features["pause_ratio"]

    # Ratios of current value to baseline (baseline floored to avoid /0)
    pitch_ratio = features["pitch_std"] / max(baseline["pitch_std"], 1e-6)
    energy_ratio = features["energy_std"] / max(baseline["energy_std"], 1e-6)
    rate_ratio = features["speech_rate"] / max(baseline["speech_rate"], 1e-6)

    # Deviations from the baseline (1.0 means "same as baseline")
    pitch_dev = abs(1 - pitch_ratio)
    energy_dev = abs(1 - energy_ratio)
    rate_dev = abs(1 - rate_ratio)

    # Stress: weighted deviations plus a fixed offset of 20
    stress_val = (pitch_dev * 0.4 + energy_dev * 0.4 + jitter * 0.2) * 150
    stress = np.clip(stress_val + 20, 0, 100)

    # Clarity: penalise pauses beyond the baseline and jitter
    pause_dev = max(0, pause_ratio - baseline["pause_ratio"])
    clarity = 100 - (pause_dev * 120 + jitter * 400)

    # Confidence: penalise rate/energy deviation and pauses
    confidence_audio = 100 - (rate_dev * 40 + energy_dev * 30 + pause_ratio * 50)

    # Candidate tones; the first highest-scoring entry wins
    if energy_ratio > 1:
        excited_score = (energy_ratio - 1) * 100
    else:
        excited_score = 0
    tones = {
        "Confident": confidence_audio,
        "Hesitant": pause_ratio * 150,
        "Excited": excited_score,
        "Unstable": stress,
        "Natural": 100 - (pitch_dev * 60 + rate_dev * 40)
    }
    dominant_tone = max(tones, key=tones.get)

    def _bounded(score):
        return round(float(np.clip(score, 0, 100)), 2)

    return {
        "confidence_audio": _bounded(confidence_audio),
        "clarity": _bounded(clarity),
        "stress": _bounded(stress),
        "pauses": round(float(pause_ratio * 100), 2),
        "tone_of_voice": dominant_tone
    }
179
+
180
def analyze_audio_segment(audio_path, baseline=None):
    """Load an audio file at 16 kHz and score it.

    Features are extracted with ``extract_audio_features`` and scored with
    ``compute_audio_scores`` against the optional *baseline*.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    return compute_audio_scores(extract_audio_features(waveform, sample_rate), baseline)
187
+
188
+
189
+ ##Text analysis
190
+
191
def get_user_answer(audio_path):
    """Transcribe *audio_path* with the module-level Whisper ASR pipeline.

    The audio is processed in 20-second chunks; the returned text has
    leading and trailing whitespace removed.
    """
    transcription = asr(audio_path, chunk_length_s=20)
    text = transcription["text"]
    return text.strip()
195
+
196
+
197
def compute_similarity_score(user_answer, ideal_answer):
    """Score how close *user_answer* is to *ideal_answer*.

    Both texts are embedded with the sentence-transformer model; the cosine
    similarity is expressed as a percentage, floored at 0 and rounded to
    2 decimals.
    """
    embeddings = semantic_model.encode([user_answer, ideal_answer])
    user_vec, ideal_vec = embeddings[0], embeddings[1]
    similarity = cosine_similarity([user_vec], [ideal_vec])[0][0]
    percent = float(similarity * 100)
    return round(max(0, percent), 2)
202
+
203
def compute_relevance_score(question, user_answer):
    """Score how relevant *user_answer* is to *question*.

    The cross-encoder's raw score for the pair is squashed with a logistic
    function and returned as a percentage rounded to 2 decimals.
    """
    pair_scores = cross_encoder.predict([(question, user_answer)])
    raw_score = pair_scores[0]
    probability = 1 / (1 + np.exp(-raw_score))
    percent = float(probability * 100)
    return round(max(0, percent), 2)
208
+
209
+ ##Video
210
+
211
# Face-landmark indices of the six points used per eye, ordered as
# [corner, upper, upper, opposite corner, lower, lower].
LEFT_EYE = [33, 160, 158, 133, 153, 144]
RIGHT_EYE = [362, 385, 387, 263, 373, 380]


def compute_eye_contact_ratio(frame, landmarks):
    """Compute an eye score in [0, 1] from detected face landmarks.

    The score is the mean eye aspect ratio of both eyes multiplied by 3 and
    clamped to [0, 1].
    NOTE(review): this measures how open the eyes are, not gaze direction -
    confirm that is what "eye contact" is meant to capture.

    Args:
        frame: Image whose ``shape`` is ``(height, width, channels)``; used to
            scale the normalised landmark coordinates to pixels.
        landmarks: Indexable face landmarks exposing normalised ``.x``/``.y``.

    Returns:
        Float in [0, 1]. The neutral value 0.5 is returned when no landmarks
        are given or when an eye has zero width (degenerate detection), so a
        bad frame cannot inject NaN/inf into the running averages.
    """
    if not landmarks:
        return 0.5

    h, w, _ = frame.shape

    def ear(indices):
        """Eye aspect ratio for one eye, or None if the eye has zero width."""
        points = [
            np.array([landmarks[i].x * w, landmarks[i].y * h])
            for i in indices
        ]

        v1 = np.linalg.norm(points[1] - points[5])
        v2 = np.linalg.norm(points[2] - points[4])
        h_dist = np.linalg.norm(points[0] - points[3])

        if h_dist == 0:
            # Both eye corners coincide: the ratio is undefined.
            return None
        return (v1 + v2) / (2.0 * h_dist)

    ear_left = ear(LEFT_EYE)
    ear_right = ear(RIGHT_EYE)
    if ear_left is None or ear_right is None:
        return 0.5

    avg_ear = (ear_left + ear_right) / 2.0

    return float(min(max(avg_ear * 3, 0), 1))
249
+
250
def analyze_face_emotion(frame):
    """Predict facial-emotion probabilities for a single BGR frame.

    The frame is converted to RGB, run through the module-level image
    processor and classifier, and the logits are soft-maxed.

    Returns:
        Dict mapping each lower-cased model label to its probability.
    """
    pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    batch = face_processor(images=pil_frame, return_tensors="pt").to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        prediction = face_model(**batch)

    scores = torch.nn.functional.softmax(prediction.logits, dim=-1)[0]
    label_names = face_model.config.id2label

    emotion_probs = {}
    for class_idx in range(len(scores)):
        emotion_probs[label_names[class_idx].lower()] = float(scores[class_idx])
    return emotion_probs
274
+
275
def draw_face_box(frame, x, y, w, h, emotion_label="Neutral"):
    """Draw a face bounding box with corner marks and a centred label above it.

    Args:
        frame: Image to draw on (modified in place and also returned).
        x, y, w, h: Top-left corner, width and height of the box in pixels.
        emotion_label: Text shown above the box (capitalised before drawing).
    """
    box_color = (0, 255, 0)
    arm = 22
    right, bottom = x + w, y + h

    # Main rectangle
    cv2.rectangle(frame, (x, y), (right, bottom), box_color, 2)

    # Corner marks: a horizontal and a vertical arm per corner, pointing inward
    for corner_x, corner_y, sx, sy in (
        (x, y, 1, 1),
        (right, y, -1, 1),
        (x, bottom, 1, -1),
        (right, bottom, -1, -1),
    ):
        cv2.line(frame, (corner_x, corner_y), (corner_x + sx * arm, corner_y), box_color, 4)
        cv2.line(frame, (corner_x, corner_y), (corner_x, corner_y + sy * arm), box_color, 4)

    # Label, horizontally centred on the box and 10 px above its top edge
    caption = emotion_label.capitalize()
    font, scale, weight = cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2
    (caption_w, _caption_h), _ = cv2.getTextSize(caption, font, scale, weight)
    origin = (x + (w - caption_w) // 2, y - 10)

    cv2.putText(frame, caption, origin, font, scale, (0, 255, 0), weight, cv2.LINE_AA)

    return frame
323
+
324
def compute_valence_arousal_from_probs(emotion_probs):
    """Return the probability-weighted mean (valence, arousal).

    Only emotions present in ``emotion_va`` contribute; names are matched
    case-insensitively. Returns ``(0.0, 0.0)`` when their total weight is 0.
    """
    weighted_v = 0.0
    weighted_a = 0.0
    mass = 0.0

    for name, prob in emotion_probs.items():
        coords = emotion_va.get(name.lower())
        if coords is None:
            # Emotion has no position on the valence/arousal plane
            continue
        weighted_v += coords[0] * prob
        weighted_a += coords[1] * prob
        mass += prob

    if mass == 0:
        return 0.0, 0.0
    return weighted_v / mass, weighted_a / mass
339
+
340
def draw_full_emotion_wheel(panel, center, radius, valence, arousal,
                            dominant_emotion="neutral"):
    """Draw the emotion wheel with its labels and the valence/arousal dot.

    Args:
        panel: Image to draw on (modified in place and also returned).
        center: ``(x, y)`` pixel position of the wheel centre.
        radius: Wheel radius in pixels.
        valence, arousal: Dot position; positive valence moves it right,
            positive arousal moves it up.
        dominant_emotion: Name of the emotion whose label is highlighted.
    """
    cx, cy = center

    def _on_wheel(angle_deg, dist):
        # Polar -> pixel coordinates; the image y axis points down.
        theta = math.radians(angle_deg)
        return int(cx + dist * math.cos(theta)), int(cy - dist * math.sin(theta))

    # Background disc, outer ring and two inner guide rings
    cv2.circle(panel, center, radius + 5, (15, 15, 25), -1)
    cv2.circle(panel, center, radius, (60, 60, 85), 2)
    for ring_fraction in [0.33, 0.66]:
        cv2.circle(panel, center, int(radius * ring_fraction), (35, 35, 50), 1)

    # Spokes every 60 degrees
    for spoke_angle in range(0, 360, 60):
        cv2.line(panel, (cx, cy), _on_wheel(spoke_angle, radius), (40, 40, 60), 1)

    # Emotion labels around the ring
    font, base_scale, base_weight = cv2.FONT_HERSHEY_SIMPLEX, 0.40, 1
    for entry in EMOTION_RING:
        if entry[1] is None:
            continue

        name, angle_deg, ring_fraction = entry
        anchor_x, anchor_y = _on_wheel(angle_deg, ring_fraction * radius)
        (text_w, text_h), _ = cv2.getTextSize(name, font, base_scale, base_weight)
        origin = (anchor_x - text_w // 2, anchor_y + text_h // 2)

        # The dominant emotion is drawn larger, bolder and in the accent colour
        is_dominant = name.lower() == dominant_emotion.lower()
        if is_dominant:
            cv2.putText(panel, name, origin, font, base_scale + 0.08, (0, 255, 200), 2, cv2.LINE_AA)
        else:
            cv2.putText(panel, name, origin, font, base_scale, (190, 190, 255), base_weight, cv2.LINE_AA)

    # "Neutral" label in the centre
    if dominant_emotion == "neutral":
        neutral_color = (0, 255, 200)
    else:
        neutral_color = (160, 160, 160)
    (text_w, text_h), _ = cv2.getTextSize("Neutral", font, base_scale, base_weight)
    cv2.putText(panel, "Neutral", (cx - text_w // 2, cy + text_h // 2), font, base_scale,
                neutral_color, base_weight, cv2.LINE_AA)

    # Dot: three concentric discs, largest first, for a glow effect
    dot_center = (int(cx + valence * radius * 0.88), int(cy - arousal * radius * 0.88))
    for dot_radius, dot_color in ((15, (160, 120, 0)), (11, (220, 180, 0)), (7, (255, 230, 60))):
        cv2.circle(panel, dot_center, dot_radius, dot_color, -1)

    return panel
389
+
390
# One entry per metric bar: (label, fill colour, background colour).
# NOTE(review): the tuples are handed to OpenCV unchanged; on BGR frames the
# channel order is blue-green-red - confirm the colours look as intended.
BAR_CONFIGS = [
    ("Confidence", (70, 180, 255), (30, 50, 100)),
    ("Clarity", (100, 220, 150), (25, 70, 50)),
    ("Stress", (255, 120, 100), (100, 40, 30)),
]


def draw_metric_bars(panel,
                     bars_x_start,
                     bar_y_top,
                     bar_height,
                     bar_width,
                     bar_gap,
                     confidence,
                     clarity,
                     stress):
    """Draw three stacked horizontal metric bars, each with a label above it.

    Args:
        panel: Image to draw on (modified in place and also returned).
        bars_x_start: Left edge of the bars in pixels.
        bar_y_top: Baseline y of the first label.
        bar_height, bar_width: Size of each bar in pixels.
        bar_gap: Extra vertical gap between bar blocks.
        confidence, clarity, stress: Values on a 0-100 scale.
    """
    # Vertical room reserved for the label of each bar
    label_space = 20
    block_height = bar_height + label_space + bar_gap
    x_right = bars_x_start + bar_width

    metric_values = (confidence, clarity, stress)
    for row, (value, (label, fill_color, bg_color)) in enumerate(zip(metric_values, BAR_CONFIGS)):
        label_y = bar_y_top + row * block_height
        fill_width = int((value / 100) * bar_width)

        # Label above the bar
        cv2.putText(panel, label, (bars_x_start, label_y),
                    cv2.FONT_HERSHEY_DUPLEX, 0.6, (230, 230, 230), 1, cv2.LINE_AA)

        # The bar sits 8 px below the label baseline
        bar_top = label_y + 8
        bar_bottom = bar_top + bar_height

        # Background track, then the filled portion on top of it
        cv2.rectangle(panel, (bars_x_start, bar_top), (x_right, bar_bottom), bg_color, -1)
        cv2.rectangle(panel, (bars_x_start, bar_top), (bars_x_start + fill_width, bar_bottom), fill_color, -1)

        # Percentage text inside the bar
        cv2.putText(panel, f"{int(value)}%", (bars_x_start + 12, bar_bottom - 6),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2, cv2.LINE_AA)

    return panel
472
+
473
+ ##Integrated Video Processing (Analysis + Annotation)
474
+
475
def process_video_segment(video_path, output_dir, segment_id, audio_scores_global=None):
    """Analyse one video segment and write an annotated copy of it.

    Face landmarks, emotion and eye score are computed on every third frame
    and reused for the frames in between; every frame gets the face box, the
    emotion wheel and the metric bars drawn on it.

    Args:
        video_path: Path of the segment to read.
        output_dir: Directory that receives ``temp_annotated_<segment_id>.mp4``.
        segment_id: Identifier used in the output file name.
        audio_scores_global: Optional dict with ``confidence_audio``,
            ``clarity`` and ``stress`` used as the base values of the bars.
            Missing keys (or ``None``) fall back to 50 / 50 / 20.

    Returns:
        Tuple ``(annotated_video_path, mean_face_confidence, mean_eye_score)``.
        The two means are on a 0-100 scale and default to 50 when no face was
        detected in any analysed frame.
    """
    # The parameter defaults to None, so normalise before calling .get().
    audio_scores_global = audio_scores_global or {}
    b_conf = audio_scores_global.get("confidence_audio", 50)
    b_clar = audio_scores_global.get("clarity", 50)
    b_stress = audio_scores_global.get("stress", 20)

    base_options = python.BaseOptions(model_asset_path=MODEL_PATH)
    options = vision.FaceLandmarkerOptions(base_options=base_options, running_mode=vision.RunningMode.VIDEO, num_faces=1)

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    if not math.isfinite(fps) or fps <= 0:
        # Frame rate unavailable; fps is used as a divisor for the timestamps
        # below, so fall back to a nominal value.
        # NOTE(review): 30 is an assumption - confirm a sensible default.
        fps = 30.0

    temp_video = os.path.join(output_dir, f"temp_annotated_{segment_id}.mp4")
    # 'mp4v' matches the .mp4 container of this intermediate file.
    # NOTE(review): the original requested 'XVID' while its comment asked for
    # 'avc1'/'H264'; confirm which codec the deployed OpenCV build supports.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_video, fourcc, fps, (width, height))

    face_conf_accum, eye_accum, frame_idx = [], [], 0
    smooth_v, smooth_a, dom_emo = 0.0, 0.0, "neutral"

    # --- Optimization Variables ---
    frame_stride = 3  # Run the heavy models every 3rd frame only
    last_results = None
    last_emotions = None
    last_eye_s = 0.5
    last_lm = None
    # ------------------------------

    try:
        with vision.FaceLandmarker.create_from_options(options) as landmarker:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                is_stride_frame = frame_idx % frame_stride == 0

                # 1. RUN HEAVY AI ONLY ON STRIDE FRAMES
                if is_stride_frame:
                    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    last_results = landmarker.detect_for_video(mp_image, int((frame_idx / fps) * 1000))

                    if last_results.face_landmarks:
                        last_lm = last_results.face_landmarks[0]
                        last_emotions = analyze_face_emotion(frame)
                        last_eye_s = compute_eye_contact_ratio(frame, last_lm)

                # 2. USE LAST KNOWN DATA FOR CALCULATIONS & DRAWING
                d_conf, d_clar, d_stress = b_conf, b_clar, b_stress

                if last_results and last_results.face_landmarks:
                    # Blend the audio base values with the latest facial cues
                    curr_f_conf = (last_emotions.get("neutral", 0) + last_emotions.get("happy", 0)) * 100
                    d_conf = (b_conf * 0.7) + (curr_f_conf * 0.3)
                    d_clar = (b_clar * 0.8) + (last_eye_s * 100 * 0.2)
                    d_stress = (b_stress * 0.7) + ((last_emotions.get("sad", 0) + last_emotions.get("angry", 0)) * 30)

                    # Accumulate only on stride frames so reused values are
                    # not counted several times in the averages
                    if is_stride_frame:
                        face_conf_accum.append(curr_f_conf)
                        eye_accum.append(last_eye_s)

                    dom_emo = max(last_emotions, key=last_emotions.get)
                    v_t = sum(emotion_va[e][0] * s for e, s in last_emotions.items() if e in emotion_va)
                    a_t = sum(emotion_va[e][1] * s for e, s in last_emotions.items() if e in emotion_va)

                    # Exponential smoothing every frame for fluid dot movement
                    smooth_v += 0.15 * (v_t - smooth_v)
                    smooth_a += 0.15 * (a_t - smooth_a)

                    # Face box from the bounding extent of the last landmarks
                    xs, ys = [l.x * width for l in last_lm], [l.y * height for l in last_lm]
                    draw_face_box(
                        frame,
                        int(min(xs)), int(min(ys)),
                        int(max(xs) - min(xs)), int(max(ys) - min(ys)),
                        dom_emo
                    )

                # 3. ALWAYS DRAW UI (Wheel and Bars)
                frame = draw_full_emotion_wheel(frame, (width - 130, height - 100), 90, smooth_v, smooth_a, dom_emo)
                frame = draw_metric_bars(frame, 30, height - 160, 28, 200, 6, d_conf, d_clar, d_stress)

                out.write(frame)
                frame_idx += 1
    finally:
        # Release the capture and writer even if a frame fails mid-way, so
        # the output file is closed and the handles are not leaked.
        cap.release()
        out.release()

    return temp_video, np.mean(face_conf_accum) if face_conf_accum else 50, np.mean(eye_accum) * 100 if eye_accum else 50
560
+
561
+ # Main pipeline
562
def run_intervision_pipeline(video_path, questions_config, output_dir):
    """Analyse an interview video question by question and build the outputs.

    For every entry in ``questions_config`` the function cuts the matching
    time range out of ``video_path`` with ffmpeg, extracts and scores the
    audio, transcribes it, scores the transcript against the ideal answer
    and the question text, renders an annotated video segment and muxes the
    original audio back in. All successfully processed segments are then
    concatenated into ``Intervision_Final_Result.mp4`` and the per-question
    metrics are written to ``report.json`` (both inside ``output_dir``).

    Args:
        video_path: Path to the source video file.
        questions_config: Iterable of dicts. Each dict is read for the keys
            ``question_id``, ``start_time``, ``end_time``, ``ideal_answer``
            and ``question_text``.
        output_dir: Directory for intermediate and final artefacts; created
            if it does not exist.

    Returns:
        A human-readable status string. Messages starting with ``"Error:"``
        signal that the input video was missing or that the final
        concatenation failed.
    """
    if not os.path.exists(video_path):
        return f"Error: Video file not found at {video_path}"

    os.makedirs(output_dir, exist_ok=True)

    # Establish baseline from first 10s. Best effort: a failure here only
    # means the audio scores are computed without a baseline.
    try:
        y_b, sr_b = librosa.load(video_path, sr=16000, duration=10)
        baseline = extract_audio_features(y_b, sr_b)
    except Exception as e:
        print(f"Baseline Load Warning: {e}. Using defaults.")
        baseline = None

    final_reports, segments = [], []

    for q in questions_config:
        q_id = q['question_id']
        raw_seg = os.path.join(output_dir, f"q{q_id}_raw.mp4")
        wav_p = os.path.join(output_dir, f"q{q_id}.wav")

        # Reject empty/inverted ranges up front instead of relying on
        # ffmpeg to fail on a non-positive '-t' value.
        duration = q["end_time"] - q["start_time"]
        if duration <= 0:
            print(f"Skipping Question {q_id}: end_time must be greater than start_time.")
            continue

        # Precise FFmpeg cutting with error handling
        try:
            subprocess.run([
                'ffmpeg', '-y', '-ss', str(q["start_time"]), '-t', str(duration),
                '-i', video_path, '-c:v', 'libx264', '-c:a', 'aac', '-strict', 'experimental', raw_seg
            ], check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            print(f"Skipping Question {q_id}: Time range might be out of video bounds.")
            if e.stderr:
                # Surface the tail of ffmpeg's own diagnostics for debugging.
                print(e.stderr.decode(errors="replace")[-500:])
            continue

        # Audio Extraction
        try:
            y, sr = librosa.load(raw_seg, sr=16000)
            # Imported lazily so a missing soundfile only skips the question.
            import soundfile as sf
            sf.write(wav_p, y, sr)
        except Exception as e:
            print(f"Error extracting audio for Q{q_id}: {e}")
            continue

        # Audio Analysis
        a_scores = compute_audio_scores(extract_audio_features(y, sr), baseline)

        # Whisper Transcription. Catch Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            transcription_data = asr(wav_p, chunk_length_s=30, return_timestamps=True)
            transcription = transcription_data["text"].strip()
        except Exception as e:
            print(f"Transcription failed for Q{q_id}: {e}")
            transcription = "[Transcription Error]"

        similarity_score = compute_similarity_score(transcription, q["ideal_answer"])
        relevance_score = compute_relevance_score(q["question_text"], transcription)

        # Visual Analysis
        try:
            ann_v, f_c, e_c = process_video_segment(raw_seg, output_dir, q_id, a_scores)

            # Mux annotated video (input 0) with the original audio (input 1).
            final_v = os.path.join(output_dir, f"q{q_id}_final.mp4")
            subprocess.run([
                'ffmpeg', '-y', '-i', ann_v, '-i', raw_seg, '-map', '0:v', '-map', '1:a',
                '-c:v', 'copy', '-c:a', 'aac', final_v
            ], check=True, capture_output=True)

            segments.append(final_v)

            # float() keeps the value JSON-serialisable even if the visual
            # scores come back as NumPy scalars.
            confidence = round(float((a_scores["confidence_audio"] + f_c + e_c) / 3), 2)
            final_reports.append({
                "questionId": q_id,
                "userAnswerText": transcription,
                "toneOfVoice": a_scores["tone_of_voice"],
                "clarity": a_scores["clarity"],
                "stress": a_scores["stress"],
                "confidence": confidence,
                "pauses": a_scores["pauses"],
                "score": similarity_score,
                "relevance": relevance_score
            })
        except Exception as e:
            print(f"Visual analysis failed for Q{q_id}: {e}")

        torch.cuda.empty_cache()

    if not segments:
        return "No segments were processed. Check your video time ranges."

    # Write the report first so the metrics survive a concatenation failure.
    with open(os.path.join(output_dir, "report.json"), "w", encoding="utf-8") as f:
        json.dump({"listOfAnswerReport": final_reports}, f, indent=4)

    # Final concatenation
    list_path = os.path.join(output_dir, "list.txt")
    with open(list_path, "w", encoding="utf-8") as f:
        for s in segments:
            # Inside a single-quoted concat entry a literal quote is
            # written as '\'' (close quote, escaped quote, reopen quote).
            escaped = os.path.abspath(s).replace("'", "'\\''")
            f.write(f"file '{escaped}'\n")

    final_output = os.path.join(output_dir, "Intervision_Final_Result.mp4")
    try:
        # Argument list (no shell): paths with spaces or shell
        # metacharacters are passed through verbatim, and a non-zero exit
        # status is detected instead of being silently ignored.
        subprocess.run([
            'ffmpeg', '-f', 'concat', '-safe', '0', '-i', list_path,
            '-c:v', 'libx264', '-preset', 'superfast', '-crf', '23',
            '-c:a', 'aac', '-y', final_output
        ], check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        if e.stderr:
            print(e.stderr.decode(errors="replace")[-500:])
        return f"Error: Failed to concatenate {len(segments)} segments into {final_output}"

    return f"Successfully processed {len(segments)} questions."
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-multipart
4
+ ffmpeg-python
5
+ transformers
6
+ timm
7
+ mediapipe
8
+ deepface
9
+ sentence-transformers
10
+ librosa
11
+ torchaudio
12
+ opencv-python
13
+ numpy
14
+ soundfile
15
+ pillow
16
+ scikit-learn
17
+ ffmpeg
18
+ python-multipart
19
+ cloudinary
20
+ requests
21
+ python-dotenv
22
+ opencv-python-headless
23
+ openai-whisper