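# NOTE: the next three lines appear to be web-page header text captured along
# with the file (not part of the program); kept as comments so the module parses.
# vyluong's picture
# PoC deployment
# 857b1b2 verified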
import os
import sys
import time
import logging
import librosa
import numpy as np
import torch
from torch.nn import functional as F
from huggingface_hub import hf_hub_download
logger = logging.getLogger(__name__)

# Short model name -> Hugging Face Hub repository id that hosts the weights
# (pytorch_model.bin) and the model definition (model.py).
AVAILABLE_MODELS = {
    "dual_emotion": "vyluong/emo_dual_classi",
}

# Emotion class names. EmotionService indexes this list with the argmax of the
# model output, so the order must match the order of the model's output units.
emotion_labels = ["Angry", "Anxiety", "Happy", "Sad", "Neutral"]

# Per-label presentation metadata: an emoji and a hex color string.
# The emoji are written as \U escapes so the file stays pure ASCII; the
# previous literals had been corrupted by a wrong-codec round trip (UTF-8 bytes
# decoded through a single-byte code page), leaving multi-character garbage.
EMOTION_META = {
    "Angry": {"emoji": "\U0001F621", "color": "#ff4d4f"},    # pouting face
    "Anxiety": {"emoji": "\U0001F630", "color": "#faad14"},  # anxious face with sweat
    "Happy": {"emoji": "\U0001F60A", "color": "#52c41a"},    # smiling face with smiling eyes
    "Sad": {"emoji": "\U0001F622", "color": "#1890ff"},      # crying face
    "Neutral": {"emoji": "\U0001F610", "color": "#d9d9d9"},  # neutral face
}
class EmotionService:
    """Speech-emotion inference helpers built around a lazily loaded model.

    Every method is a classmethod or staticmethod; loaded models are cached on
    the class in ``_models`` so the download/initialisation happens once per
    process and model name.
    """

    # Cache of ready-to-use models, keyed by the names in AVAILABLE_MODELS.
    _models = {}
    # Label list indexed by the argmax of the model output (see predict_from_mfcc).
    emotion_labels = emotion_labels
    # Per-label presentation metadata (emoji / color).
    meta = EMOTION_META

    @classmethod
    def load_dual_model(cls, repo_id, device):
        """Download and initialise the ``Dual`` model from a Hugging Face repo.

        Fetches ``pytorch_model.bin`` (weights) and ``model.py`` (model
        definition) from ``repo_id``, imports ``model.py`` from its downloaded
        path, instantiates its ``Dual`` class with no arguments, loads the
        weights, moves the model to ``device`` and switches it to eval mode.

        Args:
            repo_id: Hugging Face Hub repository id.
            device: ``torch.device`` (or device string) to place the model on.

        Returns:
            The initialised model in eval mode.
        """
        import importlib.util

        logger.info(f"Downloading model from HF: {repo_id}")
        model_file = hf_hub_download(
            repo_id=repo_id,
            filename="pytorch_model.bin",
        )
        model_code = hf_hub_download(
            repo_id=repo_id,
            filename="model.py",
        )

        # Import model.py directly from its downloaded path (sys.path is not
        # modified).
        # NOTE(security): this executes Python code fetched from the Hub with
        # the privileges of this process. Only use trusted repositories and
        # consider pinning a `revision` in the hf_hub_download calls.
        spec = importlib.util.spec_from_file_location("hf_model", model_code)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        model = module.Dual()

        # NOTE(security): torch.load unpickles the file; on PyTorch versions
        # where `weights_only` does not default to True, a malicious checkpoint
        # can run arbitrary code. Passing weights_only=True is the safer
        # option if the checkpoint is a plain state dict — confirm before
        # changing, since it rejects checkpoints containing other objects.
        state_dict = torch.load(model_file, map_location=device)
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()
        logger.info("Emotion model loaded successfully")
        return model

    @classmethod
    def get_model(cls, model_name="dual_emotion"):
        """Return the cached model for ``model_name``, loading it on first use.

        Args:
            model_name: Key into ``AVAILABLE_MODELS``.

        Returns:
            The loaded model.

        Raises:
            KeyError: If ``model_name`` is not in ``AVAILABLE_MODELS``.
        """
        cached = cls._models.get(model_name)
        if cached is not None:
            return cached

        repo_id = AVAILABLE_MODELS[model_name]
        # Resolve the device only on a cache miss; cached models already live
        # on the device chosen when they were loaded.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = cls.load_dual_model(repo_id, device)
        cls._models[model_name] = model
        return model

    @classmethod
    def preload_model(cls):
        """Load the default model eagerly so the first prediction is not delayed."""
        logger.info("Preloading emotion model...")
        cls.get_model()
        logger.info("Emotion model ready")

    @staticmethod
    def extract_mfcc_segment(
        audio: np.ndarray,
        sr: int,
        start: float,
        end: float,
        duration: float = 5.0,
        n_mfcc: int = 128,
        n_fft: int = 2048,
        hop_length: int = 512,
    ):
        """Compute MFCC features for the ``[start, end)`` slice of ``audio``.

        The slice is brought to exactly ``int(sr * duration)`` samples before
        feature extraction: shorter slices are extended at the end with
        ``np.pad(..., mode="symmetric")``, longer ones are truncated.

        Args:
            audio: Audio samples (assumed 1-D mono — TODO confirm with callers).
            sr: Sample rate in Hz.
            start: Segment start time in seconds. Negative values are treated
                as 0.
            end: Segment end time in seconds. Negative values are treated as 0.
            duration: Fixed segment length in seconds fed to the MFCC.
            n_mfcc: Number of MFCC coefficients.
            n_fft: FFT window size passed to librosa.
            hop_length: Hop length passed to librosa.

        Returns:
            The array returned by ``librosa.feature.mfcc``, or ``None`` when
            the requested slice contains no samples.
        """
        # Clamp to 0: NumPy interprets a negative index as an offset from the
        # END of the array, which would silently select the wrong audio.
        start_sample = max(0, int(start * sr))
        end_sample = max(0, int(end * sr))
        segment = audio[start_sample:end_sample]
        if len(segment) == 0:
            return None

        target_len = int(sr * duration)
        missing = target_len - len(segment)
        if missing > 0:
            segment = np.pad(segment, (0, missing), mode="symmetric")
        else:
            segment = segment[:target_len]

        return librosa.feature.mfcc(
            y=segment,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
        )

    @classmethod
    def predict_from_mfcc(cls, mfcc):
        """Classify one MFCC array and return its emotion label.

        Args:
            mfcc: NumPy array of MFCC features (as produced by
                ``extract_mfcc_segment``).

        Returns:
            The entry of ``emotion_labels`` with the highest predicted
            probability.
        """
        model = cls.get_model()
        # Two leading singleton axes are added before the forward pass
        # (presumably batch and channel — confirm against model.py).
        tensor = torch.from_numpy(mfcc).unsqueeze(0).unsqueeze(0).float()
        # Run on whatever device the model parameters live on.
        device = next(model.parameters()).device
        tensor = tensor.to(device)
        with torch.no_grad():
            output = model(tensor)
        # Assumes the squeezed output is a 1-D vector with one score per label.
        probs = F.softmax(output.squeeze(), dim=0).cpu().numpy()
        return cls.emotion_labels[int(np.argmax(probs))]

    @classmethod
    def predict_segment(cls, audio, sr, start, end):
        """Predict the emotion label for the ``[start, end)`` slice of ``audio``.

        Returns ``"Neutral"`` when the slice contains no samples; otherwise the
        label predicted from the slice's MFCC features.
        """
        mfcc = cls.extract_mfcc_segment(audio, sr, start, end)
        if mfcc is None:
            return "Neutral"
        return cls.predict_from_mfcc(mfcc)