ustwo-api / scripts /eval_audio_on_subset.py
asdfasdfqrqwer's picture
Deploy from GitHub 2026-04-23T03:56:31Z
c857b85
Raw
History Blame Contribute Delete
5.4 kB
#!/usr/bin/env python3
"""Evaluate audio model (LoRA ONNX or base) on a subset of val manifest.
Usage:
python scripts/eval_audio_on_subset.py \
--val-manifest data/lora_dataset/val_manifest.json \
--source ravdess \
--model lora_onnx \
--onnx data/models/lora_emotion2vec_7class/model.onnx
"""
from __future__ import annotations
import argparse
import json
import logging
from collections import Counter
from pathlib import Path
import numpy as np
# Timestamped, INFO-level logging for the progress messages this script emits.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Emotion classes that predictions are scored against; this is also the order
# used when per-class results are reported.
PROJECT_LABELS = [
    "neutral",
    "joy",
    "sadness",
    "anger",
    "surprise",
    "fear",
    "disgust",
]

# Label names paired positionally with the LoRA ONNX model's output scores
# (see predict_lora_onnx).
LORA_LABELS = [
    "happiness",
    "anger",
    "disgust",
    "fear",
    "neutral",
    "sadness",
    "surprise",
]

# LoRA label name -> project label name; only "happiness" is renamed.
LORA_TO_PROJECT = {
    "happiness": "joy",
    "anger": "anger",
    "disgust": "disgust",
    "fear": "fear",
    "neutral": "neutral",
    "sadness": "sadness",
    "surprise": "surprise",
}

# Label strings looked up for the base model's output records -> project label.
BASE_LABEL_MAP = {
    # Plain English label names.
    "angry": "anger",
    "disgusted": "disgust",
    "fearful": "fear",
    "happy": "joy",
    "neutral": "neutral",
    "sad": "sadness",
    "surprised": "surprise",
    # Catch-all labels are folded into "neutral".
    "other": "neutral",
    "unknown": "neutral",
    # Bilingual "<Chinese>/<English>" label names.
    "生气/angry": "anger",
    "厌恶/disgusted": "disgust",
    "恐惧/fearful": "fear",
    "开心/happy": "joy",
    "中立/neutral": "neutral",
    "难过/sad": "sadness",
    "吃惊/surprised": "surprise",
    "其他/other": "neutral",
    "<unk>": "neutral",
}
def predict_lora_onnx(audio_path: str, session, max_seconds: float = 15.0):
    """Classify one audio file with the LoRA ONNX model.

    The file is read as float32, collapsed to one channel by averaging over
    axis 1 when the array is 2-D, resampled to 16 kHz if needed, and cut to at
    most ``max_seconds`` before being fed to the model as a (1, n) batch.

    Args:
        audio_path: Path of the audio file to read.
        session: Object exposing ``run(None, {"waveform": array})`` whose first
            output holds the class logits (``main`` passes an
            ``onnxruntime.InferenceSession``).
        max_seconds: Upper bound on the audio duration passed to the model.

    Returns:
        The project label with the highest softmax probability.
    """
    import soundfile as sf

    target_rate = 16000
    samples, rate = sf.read(audio_path, dtype="float32")
    if samples.ndim == 2:
        samples = samples.mean(axis=1)
    if rate != target_rate:
        # librosa is only needed for files that are not already at 16 kHz.
        import librosa

        samples = librosa.resample(samples, orig_sr=rate, target_sr=target_rate)

    # Slicing is a no-op for clips that are already short enough.
    samples = samples[: int(max_seconds * target_rate)]
    batch = samples.reshape(1, -1).astype(np.float32)

    logits = session.run(None, {"waveform": batch})[0]
    # Softmax, shifted by the row maximum for numerical stability.
    shifted = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probabilities = (shifted / shifted.sum(axis=-1, keepdims=True)).squeeze()

    # Start every project label at 0.0 so the key order (and therefore
    # tie-breaking in max) follows PROJECT_LABELS.
    scores = dict.fromkeys(PROJECT_LABELS, 0.0)
    scores.update(
        (LORA_TO_PROJECT[name], float(probability))
        for name, probability in zip(LORA_LABELS, probabilities)
    )
    return max(scores, key=scores.get)
def predict_base(audio_path: str, funasr_model):
    """Classify one audio file with the base (FunASR) emotion model.

    Args:
        audio_path: Path of the audio file, passed straight to the model.
        funasr_model: Object exposing ``generate(path, granularity=...,
            extract_embedding=...)`` that returns a list of records carrying
            parallel ``"labels"`` and ``"scores"`` sequences.

    Returns:
        The project label with the highest accumulated score. ``"neutral"`` is
        returned when inference raises (the failure is logged) and also when
        the model produces no usable output, since it is the first label in
        ``PROJECT_LABELS`` and all scores are then tied at 0.0.
    """
    try:
        output = funasr_model.generate(
            audio_path, granularity="utterance", extract_embedding=False
        )
    except Exception:
        # Best-effort: one unreadable file must not abort the whole evaluation,
        # but it must leave a trace, otherwise failures are indistinguishable
        # from genuine "neutral" predictions in the reported metrics.
        logger.exception(
            "Base model inference failed for %s; falling back to 'neutral'",
            audio_path,
        )
        return "neutral"

    scores = dict.fromkeys(PROJECT_LABELS, 0.0)
    if output and isinstance(output, list):
        rec = output[0]
        for native_label, score in zip(rec.get("labels", []), rec.get("scores", [])):
            # Several native labels can map to the same project label (e.g.
            # "other" and "neutral"), so scores are accumulated, not assigned.
            scores[BASE_LABEL_MAP.get(native_label, "neutral")] += float(score)
    return max(scores, key=scores.get)
def f1_score(y_true, y_pred, label):
    """Return the one-vs-rest F1 score of ``label``.

    Args:
        y_true: Sequence of reference labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
        label: The class to score.

    Returns:
        The harmonic mean of precision and recall for ``label``, or 0.0 when
        the label was never predicted, never present, or never matched.
    """
    true_pos = false_pos = false_neg = 0
    for actual, predicted in zip(y_true, y_pred):
        if predicted == label:
            if actual == label:
                true_pos += 1
            else:
                false_pos += 1
        elif actual == label:
            false_neg += 1

    predicted_total = true_pos + false_pos
    actual_total = true_pos + false_neg
    if not predicted_total or not actual_total:
        return 0.0

    precision = true_pos / predicted_total
    recall = true_pos / actual_total
    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0.0
def main():
    """Evaluate the selected model on one source subset of the val manifest.

    Reads command-line options, loads the JSON manifest, keeps the entries
    whose ``"source"`` equals ``--source``, runs the chosen predictor on each
    entry's ``"path"`` and prints macro F1, accuracy and per-class F1.

    Raises:
        SystemExit: If no manifest entry matches ``--source``. Checked before
            any model is loaded, so a mistyped source fails fast with a clear
            message instead of a ZeroDivisionError after inference setup.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--val-manifest", default="data/lora_dataset/val_manifest.json")
    parser.add_argument("--source", default="ravdess", help="Filter by source: ravdess, 263, 71631")
    parser.add_argument("--model", choices=["lora_onnx", "base"], default="lora_onnx")
    parser.add_argument("--onnx", default="data/models/lora_emotion2vec_7class/model.onnx")
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(args.val_manifest, encoding="utf-8") as f:
        val = json.load(f)
    samples = [s for s in val if s["source"] == args.source]
    if not samples:
        raise SystemExit(
            f"No samples with source {args.source!r} found in {args.val_manifest}"
        )
    logger.info("Evaluating %s on %d %s samples", args.model, len(samples), args.source)

    # Normalize labels: happiness → joy
    for s in samples:
        if s["label"] == "happiness":
            s["label"] = "joy"

    # Heavy inference dependencies are imported only for the selected model.
    if args.model == "lora_onnx":
        import onnxruntime as ort

        session = ort.InferenceSession(args.onnx, providers=["CPUExecutionProvider"])

        def predict_fn(path):
            return predict_lora_onnx(path, session)
    else:
        from funasr import AutoModel

        model = AutoModel(model="iic/emotion2vec_plus_base", device="cpu", hub="hf")

        def predict_fn(path):
            return predict_base(path, model)

    y_true, y_pred = [], []
    for i, s in enumerate(samples):
        y_true.append(s["label"])
        y_pred.append(predict_fn(s["path"]))
        if (i + 1) % 100 == 0:
            logger.info("Progress: %d / %d", i + 1, len(samples))

    # Per-class F1
    f1s = {label: f1_score(y_true, y_pred, label) for label in PROJECT_LABELS}
    # NOTE: the macro average runs over every project label, including labels
    # with no samples in this subset (they contribute 0.0).
    macro_f1 = np.mean(list(f1s.values()))
    acc = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)

    print()
    print(f"=== {args.model} on {args.source} ({len(samples)} samples) ===")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print("Per-class F1:")
    # Count each label's support once instead of rescanning y_true per label.
    support_by_label = Counter(y_true)
    for label, f1 in f1s.items():
        support = support_by_label[label]
        if support > 0:
            print(f" {label:<12} {f1:.4f} (n={support})")


if __name__ == "__main__":
    main()