Matb09 commited on
Commit
51a005e
·
1 Parent(s): 78bd840
Files changed (3) hide show
  1. call_endpoint.py +101 -0
  2. handler.py +291 -0
  3. requirements.txt +22 -0
call_endpoint.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# File: call_endpoint.py

import argparse
import base64
import io
import json
import os

import pandas as pd
import requests
9
+
10
+
11
def call_emotion_endpoint(video_path: str, output_prefix: str, analysis_type: str, api_url: str, api_token: str,
                          timeout: float = 600.0):
    """
    Calls the multimodal emotion analysis endpoint.

    Args:
        video_path (str): Path to the input video file.
        output_prefix (str): Prefix for saving output files (e.g., 'my_analysis').
        analysis_type (str): The type of analysis to perform ('audio', 'facial', 'text').
        api_url (str): The URL of the inference endpoint.
        api_token (str): Your Hugging Face API token.
        timeout (float): Seconds to wait for the endpoint before giving up.
            Video analysis is slow, so the default is generous (600 s).
    """
    # 1. Prepare headers and read/encode video
    headers = {"Authorization": f"Bearer {api_token}", "Content-Type": "application/json"}
    try:
        with open(video_path, "rb") as f:
            video_bytes = f.read()
        encoded_video = base64.b64encode(video_bytes).decode("utf-8")
        print(f"Successfully read and encoded '{video_path}'")
    except FileNotFoundError:
        print(f"Error: Input file not found at '{video_path}'")
        return

    # 2. Construct the JSON payload (the handler expects it under 'inputs')
    payload = {
        "inputs": {
            "video": encoded_video,
            "analysis_type": analysis_type
        }
    }

    # 3. Make the POST request.
    # FIX: the original call had no timeout, so a hung endpoint would block this
    # script forever — requests does not time out by default. Connection-level
    # failures (DNS, refused connection, timeout) are now reported cleanly
    # instead of raising an unhandled traceback.
    print(f"Sending request for '{analysis_type}' analysis to endpoint: {api_url}")
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error: request to endpoint failed: {e}")
        return

    # 4. Process the response
    if response.status_code == 200:
        try:
            response_data = response.json()

            # The handler returns HTTP 200 with a structured error payload on
            # analysis failure — surface it instead of trying to parse results.
            if response_data.get("status") == "error":
                print(f"Endpoint returned an error: {response_data.get('message')}")
                return

            print("Success! Processing response...")

            # Save any DataFrame results (serialized with orient='split') to XLSX
            for key, value in response_data.items():
                if key.endswith("_data") and value:
                    # Wrap in StringIO: passing a literal JSON string to
                    # read_json is deprecated in modern pandas.
                    df = pd.read_json(io.StringIO(value), orient='split')
                    output_path = f"{output_prefix}_{key}.xlsx"
                    df.to_excel(output_path, index=False)
                    print(f"Saved DataFrame to '{output_path}'")

            # Save any base64 encoded files
            if "processed_video" in response_data:
                video_b64 = response_data["processed_video"]
                decoded_video_bytes = base64.b64decode(video_b64)
                output_path = f"{output_prefix}_processed_video.mp4"
                with open(output_path, "wb") as f:
                    f.write(decoded_video_bytes)
                print(f"Saved processed video to '{output_path}'")

        except (requests.exceptions.JSONDecodeError, KeyError, TypeError) as e:
            print(f"Error processing the response: {e}")
            print("Response content:", response.text)
    else:
        print(f"Error: Endpoint returned status code {response.status_code}")
        print("Response content:", response.text)
80
+
81
if __name__ == "__main__":
    # Command-line entry point: collect arguments, validate the token, and
    # delegate to call_emotion_endpoint.
    arg_parser = argparse.ArgumentParser(description="Call the multimodal emotion analysis endpoint.")
    arg_parser.add_argument("video_path", type=str, help="Path to the input video file.")
    arg_parser.add_argument("analysis_type", type=str, choices=['audio', 'facial', 'text'], help="Type of analysis to run.")
    arg_parser.add_argument("--output_prefix", type=str, default="analysis_result", help="Prefix for output files.")
    arg_parser.add_argument("--api_url", type=str, required=True, help="The URL of the inference endpoint.")
    arg_parser.add_argument("--api_token", type=str, default=os.environ.get("HF_API_TOKEN"),
                            help="Your HF API token (or set HF_API_TOKEN env var).")

    cli_args = arg_parser.parse_args()

    # Fail fast when no token was supplied on the CLI or via the environment.
    if not cli_args.api_token:
        raise ValueError("Hugging Face API token is required.")

    call_emotion_endpoint(
        video_path=cli_args.video_path,
        output_prefix=cli_args.output_prefix,
        analysis_type=cli_args.analysis_type,
        api_url=cli_args.api_url,
        api_token=cli_args.api_token,
    )
handler.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: handler.py
2
+
3
+ import base64
4
+ import io
5
+ import os
6
+ import tempfile
7
+ from typing import Dict, Any
8
+
9
+ import cv2
10
+ import librosa
11
+ import numpy as np
12
+ import pandas as pd
13
+ import torch
14
+ import whisper_timestamped as whisper
15
+ from fer import FER
16
+ from moviepy.editor import VideoFileClip, AudioFileClip
17
+ from torch.nn.functional import softmax
18
+ from transformers import AutoModelForAudioClassification, pipeline
19
+ from translate import Translator
20
+
21
+
22
class EndpointHandler:
    """Hugging Face Inference Endpoint handler for multimodal emotion analysis.

    Accepts a base64-encoded video and runs one of three pipelines over it:
    audio emotion recognition on 1-second windows, per-frame facial emotion
    detection with an annotated output video, or transcription + translation +
    per-segment text emotion classification. All DataFrame results are
    serialized with ``to_json(orient='split')`` so the client can rebuild them.
    """

    def __init__(self, path: str = ""):
        """
        Loads all models onto the device. This is called once when the endpoint starts.
        """
        print("Loading models...")
        # Use GPU if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # 1. Audio Emotion Model (speech emotion recognition on raw waveforms)
        self.audio_model = AutoModelForAudioClassification.from_pretrained(
            "3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True
        ).to(self.device)
        # Waveform normalization statistics published in the model's config.
        self.audio_mean = self.audio_model.config.mean
        self.audio_std = self.audio_model.config.std

        # 2. Facial Emotion Model (MTCNN-based face detection + emotion scoring)
        self.face_detector = FER(mtcnn=True)

        # 3. Text Emotion Model (top_k=None returns scores for every label)
        self.text_classifier = pipeline(
            "text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None, device=self.device
        )

        # 4. Transcription Model
        self.transcription_model = whisper.load_model("medium", device=self.device)

        # 5. Translator — Korean -> English, feeding the English-only text classifier.
        self.translator = Translator(from_lang='ko', to_lang='en')

        print("All models loaded successfully.")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handles an inference request.

        Args:
            data (Dict[str, Any]): Dictionary containing request parameters. Expected keys:
                'video': Base64 encoded video string.
                'analysis_type': One of "audio", "facial", or "text".

        Returns:
            Dict[str, Any]: ``{"status": "success", **analysis_result}`` on success,
            ``{"status": "error", "message": ...}`` if the analysis raised.

        Raises:
            ValueError: If 'video' is missing or 'analysis_type' is invalid.
        """
        print("Received inference request.")

        # --- 1. Parameter Extraction ---
        # HF endpoints wrap the payload under 'inputs'; also accept a flat dict
        # for direct invocation.
        if 'inputs' in data and isinstance(data['inputs'], dict):
            params = data['inputs']
        else:
            params = data

        b64_video = params.get("video")
        if not b64_video:
            raise ValueError("Missing 'video' parameter (base64 encoded string)")

        analysis_type = params.get("analysis_type")
        if analysis_type not in ["audio", "facial", "text"]:
            raise ValueError("Missing or invalid 'analysis_type'. Must be 'audio', 'facial', or 'text'.")

        # --- 2. Video Decoding ---
        video_bytes = base64.b64decode(b64_video)

        # Use a temporary file to store the video, as the original functions expect a path.
        # delete=True: the file (and hence video_path) is only valid inside this block,
        # so the dispatch below must stay nested in the `with`.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as temp_video_file:
            temp_video_file.write(video_bytes)
            temp_video_file.flush()  # Ensure all data is written
            video_path = temp_video_file.name
            print(f"Video saved to temporary file: {video_path}")

            # --- 3. Dispatch to correct analysis function ---
            try:
                if analysis_type == "audio":
                    result = self._analyze_audio_emotions(video_path)
                elif analysis_type == "facial":
                    result = self._detect_faces_and_emotions(video_path)
                elif analysis_type == "text":
                    result = self._process_video_text(video_path)

                print("Analysis completed successfully.")
                return {"status": "success", **result}

            except Exception as e:
                print(f"Error during {analysis_type} analysis: {e}")
                # It's good practice to return a structured error
                return {"status": "error", "message": str(e)}

    # ===================================================================
    # REFACTORED ANALYSIS FUNCTIONS
    # ===================================================================

    def _analyze_audio_emotions(self, video_path: str) -> Dict:
        """Classify speech emotions over 1-second windows of the video's audio.

        Returns:
            Dict: ``{'emotions_data': <JSON orient='split'>}`` — one row per
            second with a 'Time(s)' column plus per-emotion probabilities.
        """
        temp_audio_path = None
        try:
            # Extract audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
                VideoFileClip(video_path).audio.write_audiofile(temp_audio_path, codec="pcm_s16le", logger=None)

            # Resample to the model's expected rate, then apply the model's
            # published mean/std normalization (epsilon guards zero std).
            raw_wav, _ = librosa.load(temp_audio_path, sr=self.audio_model.config.sampling_rate)
            norm_wav = (raw_wav - self.audio_mean) / (self.audio_std + 1e-6)

            times, emotions_dfs = [], []
            # One window per second: sampling_rate samples == 1 s of audio.
            for start_time in range(0, len(norm_wav), self.audio_model.config.sampling_rate):
                audio_segment = norm_wav[start_time:start_time + self.audio_model.config.sampling_rate]

                # Process segment
                audio_np = np.array(audio_segment)
                mask = torch.ones(1, len(audio_np)).to(self.device)
                wavs = torch.tensor(audio_np).unsqueeze(0).to(self.device)
                with torch.no_grad():
                    pred = self.audio_model(wavs, mask)
                    logits = pred.logits if hasattr(pred, 'logits') else pred[0]
                # NOTE(review): index 6 is deliberately absent here — presumably a
                # class this pipeline does not report; verify against the model's
                # id2label config.
                labels = {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 7: 'Neutral'}
                probabilities = softmax(logits, dim=-1).squeeze(0)[[0, 1, 2, 3, 4, 5, 7]]
                # Renormalize so the retained classes sum to 1.
                probabilities = probabilities / probabilities.sum()
                df = pd.DataFrame([probabilities.cpu().numpy()], columns=labels.values())

                times.append(start_time / self.audio_model.config.sampling_rate)
                emotions_dfs.append(df)

            emotions_df = pd.concat(emotions_dfs, ignore_index=True)
            emotions_df.insert(0, "Time(s)", times)
            # Rename to the lower-case label names used by the other pipelines.
            emotion_rename_map = {'Angry': 'anger', 'Sad': 'sadness', 'Happy': 'happy', 'Surprise': 'surprise',
                                  'Fear': 'fear', 'Disgust': 'disgust', 'Neutral': 'neutral'}
            emotions_df.rename(columns=emotion_rename_map, inplace=True)

            # Return DataFrame as JSON
            return {"emotions_data": emotions_df.to_json(orient='split')}

        finally:
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)

    def _detect_faces_and_emotions(self, video_path: str) -> Dict:
        """Run per-frame facial emotion detection and render an annotated video.

        Returns:
            Dict: ``'emotions_data'`` (per-second mean emotion scores, JSON
            orient='split') and ``'processed_video'`` (base64 MP4 with face
            boxes and emotion labels drawn on every frame).
        """
        emotions_data = []
        output_video_path = None
        try:
            # Create a temporary file for the output video
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_out_video:
                output_video_path = temp_out_video.name

            original_video = VideoFileClip(video_path)
            cap = cv2.VideoCapture(video_path)
            # NOTE(review): int() truncates fractional frame rates, and an
            # unreadable stream reports fps == 0 which would divide by zero
            # below — assumes a valid video; confirm upstream validation.
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            # Use a temporary path for the video writer intermediate file
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video_writer_file:
                temp_video_writer_path = temp_video_writer_file.name

            out = cv2.VideoWriter(temp_video_writer_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

            frame_number = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret: break

                time_seconds = round(frame_number / fps)
                result = self.face_detector.detect_emotions(frame)

                # Annotate every detected face and collect its emotion scores.
                for face in result:
                    box = face["box"]
                    emotions = face["emotions"]
                    emotions["Time(s)"] = time_seconds
                    emotions_data.append(emotions)
                    cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 155, 255), 2)
                    for i, (emotion, score) in enumerate(emotions.items()):
                        if emotion == "Time(s)": continue
                        # Grey out near-zero scores so dominant emotions stand out.
                        color = (211, 211, 211) if score < 0.01 else (255, 0, 0)
                        cv2.putText(frame, f"{emotion}: {score:.2f}", (box[0], box[1] + box[3] + 30 + i * 15),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
                out.write(frame)
                frame_number += 1

            cap.release()
            out.release()

            # Combine processed video frames with original audio
            processed_video_clip = VideoFileClip(temp_video_writer_path)
            final_clip = processed_video_clip.set_audio(original_video.audio)
            final_clip.write_videofile(output_video_path, codec='libx264', logger=None)
            os.remove(temp_video_writer_path)  # Clean up intermediate video

            # Read the final video bytes and encode to base64
            with open(output_video_path, "rb") as f:
                processed_video_b64 = base64.b64encode(f.read()).decode("utf-8")

            # Process DataFrame: average scores per second, then fill any
            # seconds with no detected face with zeros.
            emotions_df = pd.DataFrame(emotions_data)
            if not emotions_df.empty:
                emotions_df['Time(s)'] = emotions_df['Time(s)'].round().astype(int)
                max_time = emotions_df['Time(s)'].max()
                all_times = pd.DataFrame({'Time(s)': range(max_time + 1)})
                avg_scores = emotions_df.groupby("Time(s)").mean().reset_index()
                df_merged = pd.merge(all_times, avg_scores, on='Time(s)', how='left').fillna(0)
                df_merged['Time(s)'] = df_merged['Time(s)'].astype(str) + " sec"
            else:
                df_merged = pd.DataFrame()

            return {
                "emotions_data": df_merged.to_json(orient='split'),
                "processed_video": processed_video_b64
            }
        finally:
            if output_video_path and os.path.exists(output_video_path):
                os.remove(output_video_path)

    def _process_video_text(self, video_path: str) -> Dict:
        """Transcribe speech, translate it, and classify per-segment emotions.

        Pipeline: whisper transcription with word timestamps -> ko->en
        translation -> per-segment emotion scores -> per-second word grouping
        with each second inheriting its containing segment's emotion scores.

        Returns:
            Dict: ``'words_data'`` (per-second words + inherited emotions) and
            ``'segments_data'`` (segments with translation + emotion columns),
            both serialized with ``to_json(orient='split')``.
        """
        temp_audio_path = None
        try:
            video_clip = VideoFileClip(video_path)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
                video_clip.audio.write_audiofile(temp_audio_path, logger=None)

            audio = whisper.load_audio(temp_audio_path)
            result = whisper.transcribe(self.transcription_model, audio)

            segments_data = [{'text': seg['text'], 'start': seg['start'], 'end': seg['end']} for seg in
                             result['segments']]
            segments_df = pd.DataFrame(segments_data)

            # No speech detected: return empty frames under the expected keys.
            if segments_df.empty:
                return {"words_data": pd.DataFrame().to_json(orient='split'),
                        "segments_data": pd.DataFrame().to_json(orient='split')}

            # Translate (source assumed Korean) so the English-only classifier can score it.
            segments_df['Translated_Text'] = segments_df['text'].apply(lambda x: self.translator.translate(x))
            segments_df['Sentiment_Scores'] = segments_df['Translated_Text'].apply(
                lambda x: {entry['label']: entry['score'] for entry in self.text_classifier(x)[0]})
            # Expand the per-segment score dicts into one column per emotion.
            sentiment_df = segments_df['Sentiment_Scores'].apply(pd.Series)
            final_segments_df = pd.concat([segments_df.drop(columns=['Sentiment_Scores']), sentiment_df], axis=1)

            # Process words data
            word_texts, word_starts, word_ends = [], [], []
            for segment in result['segments']:
                for word in segment['words']:
                    word_texts.append(word['text'])
                    word_starts.append(word['start'])
                    word_ends.append(word['end'])

            words_df = pd.DataFrame({'text': word_texts, 'start': word_starts, 'end': word_ends})
            # Bucket each word into the second it starts in (ceiling of start time).
            words_df['second'] = words_df['start'].apply(lambda x: int(np.ceil(x)))
            words_grouped = words_df.groupby('second').agg(
                {'text': lambda x: ' '.join(x), 'start': 'min', 'end': 'max'}).reset_index()

            # Ensure every second of the video has a row, even silent ones.
            max_second = int(video_clip.duration)
            all_seconds = pd.DataFrame({'second': np.arange(0, max_second + 1)})
            words_grouped = all_seconds.merge(words_grouped, on='second', how='left').fillna(
                {'text': '', 'start': 0, 'end': 0})

            # Copy each containing segment's emotion scores onto its seconds.
            emotion_columns = final_segments_df.columns.difference(['text', 'start', 'end', 'Translated_Text'])
            for col in emotion_columns:
                words_grouped[col] = np.nan

            for i, row in words_grouped.iterrows():
                matching_segment = final_segments_df[
                    (final_segments_df['start'] <= row['start']) & (final_segments_df['end'] >= row['end'])]
                if not matching_segment.empty:
                    for emotion in emotion_columns:
                        words_grouped.at[i, emotion] = matching_segment.iloc[0][emotion]

            # Seconds not covered by any segment keep zero scores.
            words_grouped[emotion_columns] = words_grouped[emotion_columns].fillna(0)

            return {
                "words_data": words_grouped.to_json(orient='split'),
                "segments_data": final_segments_df.to_json(orient='split')
            }
        finally:
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: requirements.txt
2
+
3
+ # Core ML/AI
4
+ torch
5
+ transformers
6
+ tf-keras
7
+ tensorflow==2.15.0
8
+ fer
9
+ whisper_timestamped
10
+ git+https://github.com/openai/whisper.git@v20231117
11
+
12
+ # Data and Video/Audio Processing
13
+ pandas
14
+ moviepy
15
+ librosa
16
+ opencv-python-headless
17
+ numpy
18
+ Pillow
19
+ openpyxl
20
+
21
+ # Other
22
+ translate