Spaces:

vyluong
/

PoC_ASR_v6_dev

Sleeping

File size: 3,841 Bytes

857b1b2

import os
import sys
import time
import logging
import librosa
import numpy as np
import torch

from torch.nn import functional as F
from huggingface_hub import hf_hub_download

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Short model key -> Hugging Face Hub repo id that hosts the model files.
AVAILABLE_MODELS = dict(dual_emotion="vyluong/emo_dual_classi")

# Emotion class names. EmotionService.predict_from_mfcc indexes this list
# with the argmax of the model output, so the order is significant.
emotion_labels = ["Angry", "Anxiety", "Happy", "Sad", "Neutral"]

# Per-emotion presentation metadata: an emoji and a hex colour string.
EMOTION_META = {
    "Angry": dict(emoji="😡", color="#ff4d4f"),
    "Anxiety": dict(emoji="😰", color="#faad14"),
    "Happy": dict(emoji="😊", color="#52c41a"),
    "Sad": dict(emoji="😢", color="#1890ff"),
    "Neutral": dict(emoji="😐", color="#d9d9d9"),
}



class EmotionService:
    """Speech-emotion classification using a model fetched from the Hugging Face Hub.

    The class is used only through class/static methods. Loaded models are
    cached on the class in ``_models`` so that each model is downloaded and
    built at most once per process.
    """

    # Cache of loaded models, keyed by the names used in AVAILABLE_MODELS.
    _models = {}

    # Module-level tables re-exported so callers can reach them via the class.
    emotion_labels = emotion_labels
    meta = EMOTION_META

    @classmethod
    def load_dual_model(cls, repo_id, device):
        """Download and build the ``Dual`` model hosted in a Hub repo.

        The repo must contain ``pytorch_model.bin`` (the state dict) and
        ``model.py`` (a module defining a ``Dual`` class that can be
        constructed without arguments).

        Args:
            repo_id: Hugging Face Hub repository id.
            device: Device the weights are mapped to and the model is moved to.

        Returns:
            The ``Dual`` instance with weights loaded, on ``device``, in eval mode.

        Raises:
            ImportError: If ``model.py`` cannot be turned into a module spec.
        """
        import importlib.util

        logger.info("Downloading model from HF: %s", repo_id)

        model_file = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
        model_code = hf_hub_download(repo_id=repo_id, filename="model.py")

        # Load model.py directly from its downloaded path (sys.path is not
        # modified and the module is not registered in sys.modules).
        # SECURITY NOTE: this executes Python code downloaded from the Hub;
        # only point AVAILABLE_MODELS at repositories you trust.
        spec = importlib.util.spec_from_file_location("hf_model", model_code)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load model definition from {model_code!r}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        model = module.Dual()

        # SECURITY NOTE: torch.load unpickles the checkpoint. Depending on the
        # installed torch version this may allow arbitrary code execution;
        # consider weights_only=True if the checkpoint is a plain state dict
        # and the torch version supports that argument.
        state_dict = torch.load(model_file, map_location=device)
        model.load_state_dict(state_dict)

        model.to(device)
        model.eval()

        logger.info("Emotion model loaded successfully")

        return model

    @classmethod
    def get_model(cls, model_name="dual_emotion"):
        """Return the cached model for ``model_name``, loading it on first use.

        Args:
            model_name: Key into ``AVAILABLE_MODELS``.

        Returns:
            The loaded model (CUDA if available, otherwise CPU).

        Raises:
            KeyError: If ``model_name`` is not a key of ``AVAILABLE_MODELS``.
        """
        # Check the cache first so the hot path does no device probing.
        if model_name in cls._models:
            return cls._models[model_name]

        if model_name not in AVAILABLE_MODELS:
            raise KeyError(
                f"Unknown emotion model {model_name!r}; "
                f"available: {sorted(AVAILABLE_MODELS)}"
            )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = cls.load_dual_model(AVAILABLE_MODELS[model_name], device)
        cls._models[model_name] = model

        return model

    @classmethod
    def preload_model(cls):
        """Load the default model eagerly so later predictions hit the cache."""
        logger.info("Preloading emotion model...")

        cls.get_model()

        logger.info("Emotion model ready")

    @staticmethod
    def extract_mfcc_segment(
        audio: np.ndarray,
        sr: int,
        start: float,
        end: float,
        duration: float = 5.0,
        n_mfcc: int = 128,
        n_fft: int = 2048,
        hop_length: int = 512
    ) -> "np.ndarray | None":
        """Compute MFCCs for the ``[start, end)`` slice of ``audio``.

        The slice is forced to exactly ``duration`` seconds: shorter slices
        are extended by symmetric (mirror) padding at the end, longer ones
        are truncated.

        Args:
            audio: Waveform samples (assumed 1-D mono; TODO confirm with callers).
            sr: Sample rate of ``audio`` in Hz.
            start: Segment start time in seconds.
            end: Segment end time in seconds.
            duration: Fixed length, in seconds, that the segment is padded or cut to.
            n_mfcc: Number of MFCC coefficients.
            n_fft: FFT window size passed to librosa.
            hop_length: Hop length passed to librosa.

        Returns:
            The array returned by ``librosa.feature.mfcc``, or ``None`` when
            the requested slice contains no samples.
        """
        # Clamp to 0: a negative time would otherwise become a negative index
        # and silently wrap around to the end of the waveform.
        start_sample = max(0, int(start * sr))
        end_sample = max(0, int(end * sr))

        segment = audio[start_sample:end_sample]

        if len(segment) == 0:
            return None

        target_len = int(sr * duration)
        missing = target_len - len(segment)

        if missing > 0:
            segment = np.pad(segment, (0, missing), mode="symmetric")
        else:
            segment = segment[:target_len]

        return librosa.feature.mfcc(
            y=segment,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
        )

    @classmethod
    def predict_from_mfcc(cls, mfcc):
        """Classify one MFCC array and return its emotion label.

        Args:
            mfcc: NumPy array of MFCC features, e.g. the output of
                ``extract_mfcc_segment``.

        Returns:
            The entry of ``emotion_labels`` with the highest softmax probability.
        """
        model = cls.get_model()

        # Two leading singleton axes are added before the forward pass
        # (presumably batch and channel; confirm against the Dual model).
        tensor = torch.from_numpy(mfcc).unsqueeze(0).unsqueeze(0).float()

        # Run on whatever device the model's parameters live on.
        device = next(model.parameters()).device
        tensor = tensor.to(device)

        with torch.no_grad():
            output = model(tensor)

        probs = F.softmax(output.squeeze(), dim=0).cpu().numpy()

        return cls.emotion_labels[int(np.argmax(probs))]

    @classmethod
    def predict_segment(cls, audio, sr, start, end):
        """Predict the emotion label for the ``[start, end)`` slice of ``audio``.

        Args:
            audio: Waveform samples.
            sr: Sample rate of ``audio`` in Hz.
            start: Segment start time in seconds.
            end: Segment end time in seconds.

        Returns:
            The predicted label, or ``"Neutral"`` when the slice is empty.
        """
        mfcc = cls.extract_mfcc_segment(audio, sr, start, end)

        if mfcc is None:
            # Nothing to classify; fall back to the neutral label.
            return "Neutral"

        return cls.predict_from_mfcc(mfcc)