- call_endpoint.py +101 -0
- handler.py +291 -0
- requirements.txt +22 -0
call_endpoint.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# File: call_endpoint.py
|
| 2 |
+
|
| 3 |
+
import argparse
import base64
import io
import json
import os

import pandas as pd
import requests
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def call_emotion_endpoint(video_path: str, output_prefix: str, analysis_type: str, api_url: str, api_token: str,
                          timeout: float = 600.0):
    """
    Calls the multimodal emotion analysis endpoint.

    Args:
        video_path (str): Path to the input video file.
        output_prefix (str): Prefix for saving output files (e.g., 'my_analysis').
        analysis_type (str): The type of analysis to perform ('audio', 'facial', 'text').
        api_url (str): The URL of the inference endpoint.
        api_token (str): Your Hugging Face API token.
        timeout (float): Seconds to wait for the endpoint response. Video analysis
            is slow, so the default is generous; pass a smaller value for quick checks.

    Returns:
        None. Results are written to disk (XLSX tables and/or an MP4) and
        progress/errors are printed to stdout.
    """
    # 1. Prepare headers and read/encode video
    headers = {"Authorization": f"Bearer {api_token}", "Content-Type": "application/json"}
    try:
        with open(video_path, "rb") as f:
            video_bytes = f.read()
        encoded_video = base64.b64encode(video_bytes).decode("utf-8")
        print(f"Successfully read and encoded '{video_path}'")
    except FileNotFoundError:
        print(f"Error: Input file not found at '{video_path}'")
        return

    # 2. Construct the JSON payload (handler expects everything under "inputs")
    payload = {
        "inputs": {
            "video": encoded_video,
            "analysis_type": analysis_type
        }
    }

    # 3. Make the POST request. A timeout is essential: without one, a stalled
    # endpoint would hang this client forever. Network failures are reported
    # cleanly instead of crashing with a traceback.
    print(f"Sending request for '{analysis_type}' analysis to endpoint: {api_url}")
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error: request to endpoint failed: {e}")
        return

    # 4. Process the response
    if response.status_code == 200:
        try:
            response_data = response.json()

            # The handler returns HTTP 200 even for application-level errors,
            # signalled via a "status" field — surface those explicitly.
            if response_data.get("status") == "error":
                print(f"Endpoint returned an error: {response_data.get('message')}")
                return

            print("Success! Processing response...")

            # Save any DataFrame results (keys ending in "_data") to XLSX.
            for key, value in response_data.items():
                if key.endswith("_data") and value:
                    # Wrap in StringIO: passing a raw JSON string to read_json
                    # is deprecated in modern pandas.
                    df = pd.read_json(io.StringIO(value), orient='split')
                    output_path = f"{output_prefix}_{key}.xlsx"
                    df.to_excel(output_path, index=False)
                    print(f"Saved DataFrame to '{output_path}'")

            # Save any base64 encoded files (only 'facial' analysis returns one)
            if "processed_video" in response_data:
                video_b64 = response_data["processed_video"]
                decoded_video_bytes = base64.b64decode(video_b64)
                output_path = f"{output_prefix}_processed_video.mp4"
                with open(output_path, "wb") as f:
                    f.write(decoded_video_bytes)
                print(f"Saved processed video to '{output_path}'")

        except (requests.exceptions.JSONDecodeError, KeyError, TypeError) as e:
            print(f"Error processing the response: {e}")
            print("Response content:", response.text)
    else:
        print(f"Error: Endpoint returned status code {response.status_code}")
        print("Response content:", response.text)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
    # Command-line front end for call_emotion_endpoint.
    arg_parser = argparse.ArgumentParser(description="Call the multimodal emotion analysis endpoint.")

    # Required positional arguments.
    arg_parser.add_argument("video_path", type=str, help="Path to the input video file.")
    arg_parser.add_argument("analysis_type", type=str, choices=['audio', 'facial', 'text'],
                            help="Type of analysis to run.")

    # Optional flags; the token may also come from the environment.
    arg_parser.add_argument("--output_prefix", type=str, default="analysis_result",
                            help="Prefix for output files.")
    arg_parser.add_argument("--api_url", type=str, required=True,
                            help="The URL of the inference endpoint.")
    arg_parser.add_argument("--api_token", type=str, default=os.environ.get("HF_API_TOKEN"),
                            help="Your HF API token (or set HF_API_TOKEN env var).")

    cli_args = arg_parser.parse_args()

    # Refuse to run without credentials.
    if not cli_args.api_token:
        raise ValueError("Hugging Face API token is required.")

    call_emotion_endpoint(
        cli_args.video_path,
        cli_args.output_prefix,
        cli_args.analysis_type,
        cli_args.api_url,
        cli_args.api_token,
    )
|
handler.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# File: handler.py
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
from typing import Dict, Any
|
| 8 |
+
|
| 9 |
+
import cv2
|
| 10 |
+
import librosa
|
| 11 |
+
import numpy as np
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import torch
|
| 14 |
+
import whisper_timestamped as whisper
|
| 15 |
+
from fer import FER
|
| 16 |
+
from moviepy.editor import VideoFileClip, AudioFileClip
|
| 17 |
+
from torch.nn.functional import softmax
|
| 18 |
+
from transformers import AutoModelForAudioClassification, pipeline
|
| 19 |
+
from translate import Translator
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class EndpointHandler:
    """Hugging Face Inference Endpoints handler for multimodal emotion analysis.

    Loads audio, facial, and text emotion models once at startup, then each
    request supplies a base64-encoded video and an 'analysis_type' of
    'audio', 'facial', or 'text'; the matching private method produces
    per-second emotion tables (serialized DataFrames) and, for facial
    analysis, an annotated video.
    """

    def __init__(self, path=""):
        """
        Loads all models onto the device. This is called once when the endpoint starts.

        Args:
            path: Model directory supplied by the Inference Endpoints runtime.
                Unused here — every model is pulled from the Hub by name.
        """
        print("Loading models...")
        # Use GPU if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # 1. Audio Emotion Model (speech emotion recognition on raw waveforms).
        # trust_remote_code is required: the model ships custom modeling code.
        self.audio_model = AutoModelForAudioClassification.from_pretrained(
            "3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True
        ).to(self.device)
        # Normalization statistics published in the model config, applied to
        # the waveform before inference (see _analyze_audio_emotions).
        self.audio_mean = self.audio_model.config.mean
        self.audio_std = self.audio_model.config.std

        # 2. Facial Emotion Model (MTCNN face detection + emotion classifier)
        self.face_detector = FER(mtcnn=True)

        # 3. Text Emotion Model (English-only; see translator below)
        self.text_classifier = pipeline(
            "text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None, device=self.device
        )

        # 4. Transcription Model (whisper-timestamped for word-level timings)
        self.transcription_model = whisper.load_model("medium", device=self.device)

        # 5. Translator: Korean -> English, so transcripts can be scored by the
        # English-only text classifier.
        # NOTE(review): the 'translate' package typically calls an external HTTP
        # API per request — confirm latency and rate limits for production use.
        self.translator = Translator(from_lang='ko', to_lang='en')

        print("All models loaded successfully.")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handles an inference request.

        Args:
            data (Dict[str, Any]): Dictionary containing request parameters. Expected keys:
                'video': Base64 encoded video string.
                'analysis_type': One of "audio", "facial", or "text".

        Returns:
            Dict with "status": "success" plus analysis payload, or
            "status": "error" plus a "message" string on failure.

        Raises:
            ValueError: if 'video' is missing or 'analysis_type' is invalid.
        """
        print("Received inference request.")

        # --- 1. Parameter Extraction ---
        # Accept both the Inference Endpoints envelope ({"inputs": {...}}) and
        # a flat payload.
        if 'inputs' in data and isinstance(data['inputs'], dict):
            params = data['inputs']
        else:
            params = data

        b64_video = params.get("video")
        if not b64_video:
            raise ValueError("Missing 'video' parameter (base64 encoded string)")

        analysis_type = params.get("analysis_type")
        if analysis_type not in ["audio", "facial", "text"]:
            raise ValueError("Missing or invalid 'analysis_type'. Must be 'audio', 'facial', or 'text'.")

        # --- 2. Video Decoding ---
        video_bytes = base64.b64decode(b64_video)

        # Use a temporary file to store the video, as the original functions expect a path.
        # delete=True: the file vanishes when the 'with' block exits, so all
        # analysis must happen inside it.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as temp_video_file:
            temp_video_file.write(video_bytes)
            temp_video_file.flush()  # Ensure all data is written
            video_path = temp_video_file.name
            print(f"Video saved to temporary file: {video_path}")

            # --- 3. Dispatch to correct analysis function ---
            try:
                if analysis_type == "audio":
                    result = self._analyze_audio_emotions(video_path)
                elif analysis_type == "facial":
                    result = self._detect_faces_and_emotions(video_path)
                elif analysis_type == "text":
                    result = self._process_video_text(video_path)

                print("Analysis completed successfully.")
                return {"status": "success", **result}

            except Exception as e:
                print(f"Error during {analysis_type} analysis: {e}")
                # It's good practice to return a structured error
                return {"status": "error", "message": str(e)}

    # ===================================================================
    # REFACTORED ANALYSIS FUNCTIONS
    # ===================================================================

    def _analyze_audio_emotions(self, video_path: str) -> Dict:
        """Run speech-emotion recognition over the video's audio track.

        The waveform is normalized with the model's published mean/std, then
        scored in 1-second windows (sampling_rate samples per step).

        Returns:
            {"emotions_data": <DataFrame JSON, orient='split'>} with one row
            per second and one column per emotion.
        """
        temp_audio_path = None
        try:
            # Extract audio to a standalone WAV file (PCM so librosa reads it losslessly).
            # NOTE(review): if the video has no audio track, .audio is None and
            # this raises AttributeError — caught by __call__ as a generic error.
            # NOTE(review): the VideoFileClip handle is never close()d; moviepy
            # may leave an ffmpeg reader open — consider closing it.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
                VideoFileClip(video_path).audio.write_audiofile(temp_audio_path, codec="pcm_s16le", logger=None)

            # Resample to the model's expected rate and z-normalize
            # (epsilon guards against a zero std).
            raw_wav, _ = librosa.load(temp_audio_path, sr=self.audio_model.config.sampling_rate)
            norm_wav = (raw_wav - self.audio_mean) / (self.audio_std + 1e-6)

            # Score the waveform one second at a time.
            # NOTE(review): a zero-length waveform skips the loop entirely and
            # pd.concat below raises on the empty list — confirm acceptable.
            times, emotions_dfs = [], []
            for start_time in range(0, len(norm_wav), self.audio_model.config.sampling_rate):
                # Final window may be shorter than one second; the model accepts it.
                audio_segment = norm_wav[start_time:start_time + self.audio_model.config.sampling_rate]

                # Process segment
                audio_np = np.array(audio_segment)
                mask = torch.ones(1, len(audio_np)).to(self.device)
                wavs = torch.tensor(audio_np).unsqueeze(0).to(self.device)
                with torch.no_grad():
                    pred = self.audio_model(wavs, mask)
                    # Remote-code models may return a ModelOutput or a plain tuple.
                    logits = pred.logits if hasattr(pred, 'logits') else pred[0]
                # Keep 7 of the model's classes; index 6 is deliberately skipped.
                # NOTE(review): presumably class 6 is an unwanted category
                # (e.g. 'Contempt') — verify against the model card.
                labels = {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 7: 'Neutral'}
                probabilities = softmax(logits, dim=-1).squeeze(0)[[0, 1, 2, 3, 4, 5, 7]]
                # Renormalize so the 7 kept probabilities sum to 1 again.
                probabilities = probabilities / probabilities.sum()
                df = pd.DataFrame([probabilities.cpu().numpy()], columns=labels.values())

                times.append(start_time / self.audio_model.config.sampling_rate)
                emotions_dfs.append(df)

            emotions_df = pd.concat(emotions_dfs, ignore_index=True)
            emotions_df.insert(0, "Time(s)", times)
            # Lowercase names to match the column scheme used by the other analyses.
            emotion_rename_map = {'Angry': 'anger', 'Sad': 'sadness', 'Happy': 'happy', 'Surprise': 'surprise',
                                  'Fear': 'fear', 'Disgust': 'disgust', 'Neutral': 'neutral'}
            emotions_df.rename(columns=emotion_rename_map, inplace=True)

            # Return DataFrame as JSON
            return {"emotions_data": emotions_df.to_json(orient='split')}

        finally:
            # Always remove the extracted WAV, even on failure.
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)

    def _detect_faces_and_emotions(self, video_path: str) -> Dict:
        """Detect faces frame-by-frame, annotate them, and aggregate emotions.

        Draws a bounding box and per-emotion scores on every detected face,
        re-muxes the annotated frames with the original audio, and averages
        emotion scores per second.

        Returns:
            {"emotions_data": <DataFrame JSON, orient='split'>,
             "processed_video": <base64-encoded annotated MP4>}
        """
        emotions_data = []
        output_video_path = None
        try:
            # Create a temporary file for the output video (delete=False: we
            # only want a unique path; cleanup happens in 'finally').
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_out_video:
                output_video_path = temp_out_video.name

            # Open the source twice: moviepy for the audio track, OpenCV for frames.
            # NOTE(review): neither original_video nor processed_video_clip below
            # is ever close()d, and cap/out are not released on exception —
            # potential handle/subprocess leaks.
            original_video = VideoFileClip(video_path)
            cap = cv2.VideoCapture(video_path)
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            # Use a temporary path for the video writer intermediate file.
            # NOTE(review): this path is removed only on the success path below;
            # an exception in the loop leaks the file.
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video_writer_file:
                temp_video_writer_path = temp_video_writer_file.name

            out = cv2.VideoWriter(temp_video_writer_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

            frame_number = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret: break

                # NOTE(review): fps can be 0 if the container metadata is broken,
                # which makes this a ZeroDivisionError — confirm inputs are sane.
                time_seconds = round(frame_number / fps)
                result = self.face_detector.detect_emotions(frame)

                # Annotate each detected face and record its emotion scores.
                for face in result:
                    box = face["box"]
                    emotions = face["emotions"]
                    emotions["Time(s)"] = time_seconds
                    emotions_data.append(emotions)
                    cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 155, 255), 2)
                    for i, (emotion, score) in enumerate(emotions.items()):
                        # Skip the timestamp key we just injected into the dict.
                        if emotion == "Time(s)": continue
                        # Grey out negligible scores; highlight the rest in blue.
                        color = (211, 211, 211) if score < 0.01 else (255, 0, 0)
                        cv2.putText(frame, f"{emotion}: {score:.2f}", (box[0], box[1] + box[3] + 30 + i * 15),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
                out.write(frame)
                frame_number += 1

            cap.release()
            out.release()

            # Combine processed video frames with original audio
            processed_video_clip = VideoFileClip(temp_video_writer_path)
            final_clip = processed_video_clip.set_audio(original_video.audio)
            final_clip.write_videofile(output_video_path, codec='libx264', logger=None)
            os.remove(temp_video_writer_path)  # Clean up intermediate video

            # Read the final video bytes and encode to base64
            with open(output_video_path, "rb") as f:
                processed_video_b64 = base64.b64encode(f.read()).decode("utf-8")

            # Process DataFrame: average per-face scores per second and fill
            # seconds with no detections with zeros.
            emotions_df = pd.DataFrame(emotions_data)
            if not emotions_df.empty:
                emotions_df['Time(s)'] = emotions_df['Time(s)'].round().astype(int)
                max_time = emotions_df['Time(s)'].max()
                all_times = pd.DataFrame({'Time(s)': range(max_time + 1)})
                avg_scores = emotions_df.groupby("Time(s)").mean().reset_index()
                df_merged = pd.merge(all_times, avg_scores, on='Time(s)', how='left').fillna(0)
                df_merged['Time(s)'] = df_merged['Time(s)'].astype(str) + " sec"
            else:
                # No faces detected anywhere: return an empty table.
                df_merged = pd.DataFrame()

            return {
                "emotions_data": df_merged.to_json(orient='split'),
                "processed_video": processed_video_b64
            }
        finally:
            # Always remove the final output file; its bytes have already been
            # captured in the base64 payload on the success path.
            if output_video_path and os.path.exists(output_video_path):
                os.remove(output_video_path)

    def _process_video_text(self, video_path: str) -> Dict:
        """Transcribe speech, translate it, and score text emotions per second.

        Pipeline: extract audio -> whisper-timestamped transcription (segments
        and word timings) -> Korean-to-English translation per segment ->
        emotion classification of the translated text -> per-second word table
        joined with its segment's emotion scores.

        Returns:
            {"words_data": <per-second DataFrame JSON, orient='split'>,
             "segments_data": <per-segment DataFrame JSON, orient='split'>}
        """
        temp_audio_path = None
        try:
            # NOTE(review): video_clip is never close()d, and a missing audio
            # track makes .audio None (AttributeError) — caught by __call__.
            video_clip = VideoFileClip(video_path)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
                video_clip.audio.write_audiofile(temp_audio_path, logger=None)

            # Word-level timestamped transcription.
            audio = whisper.load_audio(temp_audio_path)
            result = whisper.transcribe(self.transcription_model, audio)

            segments_data = [{'text': seg['text'], 'start': seg['start'], 'end': seg['end']} for seg in
                             result['segments']]
            segments_df = pd.DataFrame(segments_data)

            # Nothing transcribed: return empty tables in the same shape.
            if segments_df.empty:
                return {"words_data": pd.DataFrame().to_json(orient='split'),
                        "segments_data": pd.DataFrame().to_json(orient='split')}

            # Translate each segment (ko -> en), then score emotions on the
            # translation with the English-only classifier; top_k=None means
            # every label/score pair is returned.
            segments_df['Translated_Text'] = segments_df['text'].apply(lambda x: self.translator.translate(x))
            segments_df['Sentiment_Scores'] = segments_df['Translated_Text'].apply(
                lambda x: {entry['label']: entry['score'] for entry in self.text_classifier(x)[0]})
            # Explode the score dicts into one column per emotion.
            sentiment_df = segments_df['Sentiment_Scores'].apply(pd.Series)
            final_segments_df = pd.concat([segments_df.drop(columns=['Sentiment_Scores']), sentiment_df], axis=1)

            # Process words data: flatten whisper's per-segment word timings.
            word_texts, word_starts, word_ends = [], [], []
            for segment in result['segments']:
                for word in segment['words']:
                    word_texts.append(word['text'])
                    word_starts.append(word['start'])
                    word_ends.append(word['end'])

            words_df = pd.DataFrame({'text': word_texts, 'start': word_starts, 'end': word_ends})
            # Bucket each word into the second it starts in (ceiling of start time).
            words_df['second'] = words_df['start'].apply(lambda x: int(np.ceil(x)))
            words_grouped = words_df.groupby('second').agg(
                {'text': lambda x: ' '.join(x), 'start': 'min', 'end': 'max'}).reset_index()

            # Reindex onto every second of the video so silent seconds appear
            # as empty rows rather than being missing.
            max_second = int(video_clip.duration)
            all_seconds = pd.DataFrame({'second': np.arange(0, max_second + 1)})
            words_grouped = all_seconds.merge(words_grouped, on='second', how='left').fillna(
                {'text': '', 'start': 0, 'end': 0})

            # Everything that isn't a bookkeeping column is an emotion score.
            emotion_columns = final_segments_df.columns.difference(['text', 'start', 'end', 'Translated_Text'])
            for col in emotion_columns:
                words_grouped[col] = np.nan

            # Copy each second's emotion scores from the segment that fully
            # contains its word span (first match wins).
            for i, row in words_grouped.iterrows():
                matching_segment = final_segments_df[
                    (final_segments_df['start'] <= row['start']) & (final_segments_df['end'] >= row['end'])]
                if not matching_segment.empty:
                    for emotion in emotion_columns:
                        words_grouped.at[i, emotion] = matching_segment.iloc[0][emotion]

            # Seconds with no containing segment get zero scores.
            words_grouped[emotion_columns] = words_grouped[emotion_columns].fillna(0)

            return {
                "words_data": words_grouped.to_json(orient='split'),
                "segments_data": final_segments_df.to_json(orient='split')
            }
        finally:
            # Always remove the extracted WAV, even on failure.
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# File: requirements.txt
|
| 2 |
+
|
| 3 |
+
# Core ML/AI
|
| 4 |
+
torch
|
| 5 |
+
transformers
|
| 6 |
+
tf-keras
|
| 7 |
+
tensorflow==2.15.0
|
| 8 |
+
fer
|
| 9 |
+
whisper_timestamped
|
| 10 |
+
git+https://github.com/openai/whisper.git@v20231117
|
| 11 |
+
|
| 12 |
+
# Data and Video/Audio Processing
|
| 13 |
+
pandas
|
| 14 |
+
moviepy
|
| 15 |
+
librosa
|
| 16 |
+
opencv-python-headless
|
| 17 |
+
numpy
|
| 18 |
+
Pillow
|
| 19 |
+
openpyxl
|
| 20 |
+
|
| 21 |
+
# Other
|
| 22 |
+
translate
|