Matb09 commited on
Commit
51a005e
·
1 Parent(s): 78bd840
Files changed (3) hide show
  1. call_endpoint.py +101 -0
  2. handler.py +291 -0
  3. requirements.txt +22 -0
call_endpoint.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# File: call_endpoint.py

import argparse
import base64
import io
import json
import os

import pandas as pd
import requests
9
+
10
+
11
def call_emotion_endpoint(video_path: str, output_prefix: str, analysis_type: str, api_url: str, api_token: str,
                          timeout: float = 600.0):
    """
    Calls the multimodal emotion analysis endpoint.

    Args:
        video_path (str): Path to the input video file.
        output_prefix (str): Prefix for saving output files (e.g., 'my_analysis').
        analysis_type (str): The type of analysis to perform ('audio', 'facial', 'text').
        api_url (str): The URL of the inference endpoint.
        api_token (str): Your Hugging Face API token.
        timeout (float): Seconds to wait for the endpoint before giving up.
            Video analysis is slow, so the default is generous (600 s).
    """
    # 1. Prepare headers and read/encode video
    headers = {"Authorization": f"Bearer {api_token}", "Content-Type": "application/json"}
    try:
        with open(video_path, "rb") as f:
            video_bytes = f.read()
        encoded_video = base64.b64encode(video_bytes).decode("utf-8")
        print(f"Successfully read and encoded '{video_path}'")
    except FileNotFoundError:
        print(f"Error: Input file not found at '{video_path}'")
        return

    # 2. Construct the JSON payload (the handler expects it under 'inputs')
    payload = {
        "inputs": {
            "video": encoded_video,
            "analysis_type": analysis_type
        }
    }

    # 3. Make the POST request.
    # FIX: the original call had no timeout, so a hung endpoint would block this
    # script forever — requests does not time out by default. Connection-level
    # failures (DNS, refused connection, timeout) are now reported cleanly
    # instead of raising an unhandled traceback.
    print(f"Sending request for '{analysis_type}' analysis to endpoint: {api_url}")
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error: request to endpoint failed: {e}")
        return

    # 4. Process the response
    if response.status_code == 200:
        try:
            response_data = response.json()

            # The handler returns HTTP 200 with a structured error payload on
            # analysis failure — surface it instead of trying to parse results.
            if response_data.get("status") == "error":
                print(f"Endpoint returned an error: {response_data.get('message')}")
                return

            print("Success! Processing response...")

            # Save any DataFrame results (serialized with orient='split') to XLSX
            for key, value in response_data.items():
                if key.endswith("_data") and value:
                    # Wrap in StringIO: passing a literal JSON string to
                    # read_json is deprecated in modern pandas.
                    df = pd.read_json(io.StringIO(value), orient='split')
                    output_path = f"{output_prefix}_{key}.xlsx"
                    df.to_excel(output_path, index=False)
                    print(f"Saved DataFrame to '{output_path}'")

            # Save any base64 encoded files
            if "processed_video" in response_data:
                video_b64 = response_data["processed_video"]
                decoded_video_bytes = base64.b64decode(video_b64)
                output_path = f"{output_prefix}_processed_video.mp4"
                with open(output_path, "wb") as f:
                    f.write(decoded_video_bytes)
                print(f"Saved processed video to '{output_path}'")

        except (requests.exceptions.JSONDecodeError, KeyError, TypeError) as e:
            print(f"Error processing the response: {e}")
            print("Response content:", response.text)
    else:
        print(f"Error: Endpoint returned status code {response.status_code}")
        print("Response content:", response.text)
80
+
81
if __name__ == "__main__":
    # Command-line entry point: collect arguments, validate the token, and
    # delegate to call_emotion_endpoint.
    arg_parser = argparse.ArgumentParser(description="Call the multimodal emotion analysis endpoint.")
    arg_parser.add_argument("video_path", type=str, help="Path to the input video file.")
    arg_parser.add_argument("analysis_type", type=str, choices=['audio', 'facial', 'text'], help="Type of analysis to run.")
    arg_parser.add_argument("--output_prefix", type=str, default="analysis_result", help="Prefix for output files.")
    arg_parser.add_argument("--api_url", type=str, required=True, help="The URL of the inference endpoint.")
    arg_parser.add_argument("--api_token", type=str, default=os.environ.get("HF_API_TOKEN"),
                            help="Your HF API token (or set HF_API_TOKEN env var).")

    cli_args = arg_parser.parse_args()

    # Fail fast when no token was supplied on the CLI or via the environment.
    if not cli_args.api_token:
        raise ValueError("Hugging Face API token is required.")

    call_emotion_endpoint(
        video_path=cli_args.video_path,
        output_prefix=cli_args.output_prefix,
        analysis_type=cli_args.analysis_type,
        api_url=cli_args.api_url,
        api_token=cli_args.api_token,
    )
handler.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: handler.py
2
+
3
+ import base64
4
+ import io
5
+ import os
6
+ import tempfile
7
+ from typing import Dict, Any
8
+
9
+ import cv2
10
+ import librosa
11
+ import numpy as np
12
+ import pandas as pd
13
+ import torch
14
+ import whisper_timestamped as whisper
15
+ from fer import FER
16
+ from moviepy.editor import VideoFileClip, AudioFileClip
17
+ from torch.nn.functional import softmax
18
+ from transformers import AutoModelForAudioClassification, pipeline
19
+ from translate import Translator
20
+
21
+
22
class EndpointHandler:
    """Hugging Face Inference Endpoint handler for multimodal emotion analysis.

    Accepts a base64-encoded video and runs one of three pipelines over it:
    audio emotion recognition on 1-second windows, per-frame facial emotion
    detection with an annotated output video, or transcription + translation +
    per-segment text emotion classification. All DataFrame results are
    serialized with ``to_json(orient='split')`` so the client can rebuild them.
    """

    def __init__(self, path: str = ""):
        """
        Loads all models onto the device. This is called once when the endpoint starts.
        """
        print("Loading models...")
        # Use GPU if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # 1. Audio Emotion Model (speech emotion recognition on raw waveforms)
        self.audio_model = AutoModelForAudioClassification.from_pretrained(
            "3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True
        ).to(self.device)
        # Waveform normalization statistics published in the model's config.
        self.audio_mean = self.audio_model.config.mean
        self.audio_std = self.audio_model.config.std

        # 2. Facial Emotion Model (MTCNN-based face detection + emotion scoring)
        self.face_detector = FER(mtcnn=True)

        # 3. Text Emotion Model (top_k=None returns scores for every label)
        self.text_classifier = pipeline(
            "text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None, device=self.device
        )

        # 4. Transcription Model
        self.transcription_model = whisper.load_model("medium", device=self.device)

        # 5. Translator — Korean -> English, feeding the English-only text classifier.
        self.translator = Translator(from_lang='ko', to_lang='en')

        print("All models loaded successfully.")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handles an inference request.

        Args:
            data (Dict[str, Any]): Dictionary containing request parameters. Expected keys:
                'video': Base64 encoded video string.
                'analysis_type': One of "audio", "facial", or "text".

        Returns:
            Dict[str, Any]: ``{"status": "success", **analysis_result}`` on success,
            ``{"status": "error", "message": ...}`` if the analysis raised.

        Raises:
            ValueError: If 'video' is missing or 'analysis_type' is invalid.
        """
        print("Received inference request.")

        # --- 1. Parameter Extraction ---
        # HF endpoints wrap the payload under 'inputs'; also accept a flat dict
        # for direct invocation.
        if 'inputs' in data and isinstance(data['inputs'], dict):
            params = data['inputs']
        else:
            params = data

        b64_video = params.get("video")
        if not b64_video:
            raise ValueError("Missing 'video' parameter (base64 encoded string)")

        analysis_type = params.get("analysis_type")
        if analysis_type not in ["audio", "facial", "text"]:
            raise ValueError("Missing or invalid 'analysis_type'. Must be 'audio', 'facial', or 'text'.")

        # --- 2. Video Decoding ---
        video_bytes = base64.b64decode(b64_video)

        # Use a temporary file to store the video, as the original functions expect a path.
        # delete=True: the file (and hence video_path) is only valid inside this block,
        # so the dispatch below must stay nested in the `with`.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as temp_video_file:
            temp_video_file.write(video_bytes)
            temp_video_file.flush()  # Ensure all data is written
            video_path = temp_video_file.name
            print(f"Video saved to temporary file: {video_path}")

            # --- 3. Dispatch to correct analysis function ---
            try:
                if analysis_type == "audio":
                    result = self._analyze_audio_emotions(video_path)
                elif analysis_type == "facial":
                    result = self._detect_faces_and_emotions(video_path)
                elif analysis_type == "text":
                    result = self._process_video_text(video_path)

                print("Analysis completed successfully.")
                return {"status": "success", **result}

            except Exception as e:
                print(f"Error during {analysis_type} analysis: {e}")
                # It's good practice to return a structured error
                return {"status": "error", "message": str(e)}

    # ===================================================================
    # REFACTORED ANALYSIS FUNCTIONS
    # ===================================================================

    def _analyze_audio_emotions(self, video_path: str) -> Dict:
        """Classify speech emotions over 1-second windows of the video's audio.

        Returns:
            Dict: ``{'emotions_data': <JSON orient='split'>}`` — one row per
            second with a 'Time(s)' column plus per-emotion probabilities.
        """
        temp_audio_path = None
        try:
            # Extract audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
                VideoFileClip(video_path).audio.write_audiofile(temp_audio_path, codec="pcm_s16le", logger=None)

            # Resample to the model's expected rate, then apply the model's
            # published mean/std normalization (epsilon guards zero std).
            raw_wav, _ = librosa.load(temp_audio_path, sr=self.audio_model.config.sampling_rate)
            norm_wav = (raw_wav - self.audio_mean) / (self.audio_std + 1e-6)

            times, emotions_dfs = [], []
            # One window per second: sampling_rate samples == 1 s of audio.
            for start_time in range(0, len(norm_wav), self.audio_model.config.sampling_rate):
                audio_segment = norm_wav[start_time:start_time + self.audio_model.config.sampling_rate]

                # Process segment
                audio_np = np.array(audio_segment)
                mask = torch.ones(1, len(audio_np)).to(self.device)
                wavs = torch.tensor(audio_np).unsqueeze(0).to(self.device)
                with torch.no_grad():
                    pred = self.audio_model(wavs, mask)
                    logits = pred.logits if hasattr(pred, 'logits') else pred[0]
                # NOTE(review): index 6 is deliberately absent here — presumably a
                # class this pipeline does not report; verify against the model's
                # id2label config.
                labels = {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 7: 'Neutral'}
                probabilities = softmax(logits, dim=-1).squeeze(0)[[0, 1, 2, 3, 4, 5, 7]]
                # Renormalize so the retained classes sum to 1.
                probabilities = probabilities / probabilities.sum()
                df = pd.DataFrame([probabilities.cpu().numpy()], columns=labels.values())

                times.append(start_time / self.audio_model.config.sampling_rate)
                emotions_dfs.append(df)

            emotions_df = pd.concat(emotions_dfs, ignore_index=True)
            emotions_df.insert(0, "Time(s)", times)
            # Rename to the lower-case label names used by the other pipelines.
            emotion_rename_map = {'Angry': 'anger', 'Sad': 'sadness', 'Happy': 'happy', 'Surprise': 'surprise',
                                  'Fear': 'fear', 'Disgust': 'disgust', 'Neutral': 'neutral'}
            emotions_df.rename(columns=emotion_rename_map, inplace=True)

            # Return DataFrame as JSON
            return {"emotions_data": emotions_df.to_json(orient='split')}

        finally:
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)

    def _detect_faces_and_emotions(self, video_path: str) -> Dict:
        """Run per-frame facial emotion detection and render an annotated video.

        Returns:
            Dict: ``'emotions_data'`` (per-second mean emotion scores, JSON
            orient='split') and ``'processed_video'`` (base64 MP4 with face
            boxes and emotion labels drawn on every frame).
        """
        emotions_data = []
        output_video_path = None
        try:
            # Create a temporary file for the output video
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_out_video:
                output_video_path = temp_out_video.name

            original_video = VideoFileClip(video_path)
            cap = cv2.VideoCapture(video_path)
            # NOTE(review): int() truncates fractional frame rates, and an
            # unreadable stream reports fps == 0 which would divide by zero
            # below — assumes a valid video; confirm upstream validation.
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            # Use a temporary path for the video writer intermediate file
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video_writer_file:
                temp_video_writer_path = temp_video_writer_file.name

            out = cv2.VideoWriter(temp_video_writer_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

            frame_number = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret: break

                time_seconds = round(frame_number / fps)
                result = self.face_detector.detect_emotions(frame)

                # Annotate every detected face and collect its emotion scores.
                for face in result:
                    box = face["box"]
                    emotions = face["emotions"]
                    emotions["Time(s)"] = time_seconds
                    emotions_data.append(emotions)
                    cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 155, 255), 2)
                    for i, (emotion, score) in enumerate(emotions.items()):
                        if emotion == "Time(s)": continue
                        # Grey out near-zero scores so dominant emotions stand out.
                        color = (211, 211, 211) if score < 0.01 else (255, 0, 0)
                        cv2.putText(frame, f"{emotion}: {score:.2f}", (box[0], box[1] + box[3] + 30 + i * 15),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
                out.write(frame)
                frame_number += 1

            cap.release()
            out.release()

            # Combine processed video frames with original audio
            processed_video_clip = VideoFileClip(temp_video_writer_path)
            final_clip = processed_video_clip.set_audio(original_video.audio)
            final_clip.write_videofile(output_video_path, codec='libx264', logger=None)
            os.remove(temp_video_writer_path)  # Clean up intermediate video

            # Read the final video bytes and encode to base64
            with open(output_video_path, "rb") as f:
                processed_video_b64 = base64.b64encode(f.read()).decode("utf-8")

            # Process DataFrame: average scores per second, then fill any
            # seconds with no detected face with zeros.
            emotions_df = pd.DataFrame(emotions_data)
            if not emotions_df.empty:
                emotions_df['Time(s)'] = emotions_df['Time(s)'].round().astype(int)
                max_time = emotions_df['Time(s)'].max()
                all_times = pd.DataFrame({'Time(s)': range(max_time + 1)})
                avg_scores = emotions_df.groupby("Time(s)").mean().reset_index()
                df_merged = pd.merge(all_times, avg_scores, on='Time(s)', how='left').fillna(0)
                df_merged['Time(s)'] = df_merged['Time(s)'].astype(str) + " sec"
            else:
                df_merged = pd.DataFrame()

            return {
                "emotions_data": df_merged.to_json(orient='split'),
                "processed_video": processed_video_b64
            }
        finally:
            if output_video_path and os.path.exists(output_video_path):
                os.remove(output_video_path)

    def _process_video_text(self, video_path: str) -> Dict:
        """Transcribe speech, translate it, and classify per-segment emotions.

        Pipeline: whisper transcription with word timestamps -> ko->en
        translation -> per-segment emotion scores -> per-second word grouping
        with each second inheriting its containing segment's emotion scores.

        Returns:
            Dict: ``'words_data'`` (per-second words + inherited emotions) and
            ``'segments_data'`` (segments with translation + emotion columns),
            both serialized with ``to_json(orient='split')``.
        """
        temp_audio_path = None
        try:
            video_clip = VideoFileClip(video_path)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
                video_clip.audio.write_audiofile(temp_audio_path, logger=None)

            audio = whisper.load_audio(temp_audio_path)
            result = whisper.transcribe(self.transcription_model, audio)

            segments_data = [{'text': seg['text'], 'start': seg['start'], 'end': seg['end']} for seg in
                             result['segments']]
            segments_df = pd.DataFrame(segments_data)

            # No speech detected: return empty frames under the expected keys.
            if segments_df.empty:
                return {"words_data": pd.DataFrame().to_json(orient='split'),
                        "segments_data": pd.DataFrame().to_json(orient='split')}

            # Translate (source assumed Korean) so the English-only classifier can score it.
            segments_df['Translated_Text'] = segments_df['text'].apply(lambda x: self.translator.translate(x))
            segments_df['Sentiment_Scores'] = segments_df['Translated_Text'].apply(
                lambda x: {entry['label']: entry['score'] for entry in self.text_classifier(x)[0]})
            # Expand the per-segment score dicts into one column per emotion.
            sentiment_df = segments_df['Sentiment_Scores'].apply(pd.Series)
            final_segments_df = pd.concat([segments_df.drop(columns=['Sentiment_Scores']), sentiment_df], axis=1)

            # Process words data
            word_texts, word_starts, word_ends = [], [], []
            for segment in result['segments']:
                for word in segment['words']:
                    word_texts.append(word['text'])
                    word_starts.append(word['start'])
                    word_ends.append(word['end'])

            words_df = pd.DataFrame({'text': word_texts, 'start': word_starts, 'end': word_ends})
            # Bucket each word into the second it starts in (ceiling of start time).
            words_df['second'] = words_df['start'].apply(lambda x: int(np.ceil(x)))
            words_grouped = words_df.groupby('second').agg(
                {'text': lambda x: ' '.join(x), 'start': 'min', 'end': 'max'}).reset_index()

            # Ensure every second of the video has a row, even silent ones.
            max_second = int(video_clip.duration)
            all_seconds = pd.DataFrame({'second': np.arange(0, max_second + 1)})
            words_grouped = all_seconds.merge(words_grouped, on='second', how='left').fillna(
                {'text': '', 'start': 0, 'end': 0})

            # Copy each containing segment's emotion scores onto its seconds.
            emotion_columns = final_segments_df.columns.difference(['text', 'start', 'end', 'Translated_Text'])
            for col in emotion_columns:
                words_grouped[col] = np.nan

            for i, row in words_grouped.iterrows():
                matching_segment = final_segments_df[
                    (final_segments_df['start'] <= row['start']) & (final_segments_df['end'] >= row['end'])]
                if not matching_segment.empty:
                    for emotion in emotion_columns:
                        words_grouped.at[i, emotion] = matching_segment.iloc[0][emotion]

            # Seconds not covered by any segment keep zero scores.
            words_grouped[emotion_columns] = words_grouped[emotion_columns].fillna(0)

            return {
                "words_data": words_grouped.to_json(orient='split'),
                "segments_data": final_segments_df.to_json(orient='split')
            }
        finally:
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: requirements.txt
2
+
3
+ # Core ML/AI
4
+ torch
5
+ transformers
6
+ tf-keras
7
+ tensorflow==2.15.0
8
+ fer
9
+ whisper_timestamped
10
+ git+https://github.com/openai/whisper.git@v20231117
11
+
12
+ # Data and Video/Audio Processing
13
+ pandas
14
+ moviepy
15
+ librosa
16
+ opencv-python-headless
17
+ numpy
18
+ Pillow
19
+ openpyxl
20
+
21
+ # Other
22
+ translate