#!/usr/bin/env python3
"""Evaluate audio model (LoRA ONNX or base) on a subset of val manifest.

Usage:
    python scripts/eval_audio_on_subset.py \
        --val-manifest data/lora_dataset/val_manifest.json \
        --source ravdess \
        --model lora_onnx \
        --onnx data/models/lora_emotion2vec_7class/model.onnx
"""

from __future__ import annotations

import argparse
import json
import logging
from collections import Counter
from pathlib import Path

import numpy as np

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# Sample rate (Hz) audio is resampled to before ONNX inference.
TARGET_SAMPLE_RATE = 16000

# Label vocabulary in which predictions and metrics are reported.
PROJECT_LABELS = ["neutral", "joy", "sadness", "anger", "surprise", "fear", "disgust"]

# Labels paired positionally with the ONNX model's output probabilities.
LORA_LABELS = ["happiness", "anger", "disgust", "fear", "neutral", "sadness", "surprise"]

# LoRA label -> project label; only "happiness" is actually renamed.
LORA_TO_PROJECT = {
    "happiness": "joy",
    "anger": "anger",
    "disgust": "disgust",
    "fear": "fear",
    "neutral": "neutral",
    "sadness": "sadness",
    "surprise": "surprise",
}

# Native label of the base model -> project label. Labels missing from this
# table are treated as "neutral" by predict_base.
BASE_LABEL_MAP = {
    "angry": "anger",
    "disgusted": "disgust",
    "fearful": "fear",
    "happy": "joy",
    "neutral": "neutral",
    "sad": "sadness",
    "surprised": "surprise",
    "other": "neutral",
    "unknown": "neutral",
    "生气/angry": "anger",
    "厌恶/disgusted": "disgust",
    "恐惧/fearful": "fear",
    "开心/happy": "joy",
    "中立/neutral": "neutral",
    "难过/sad": "sadness",
    "吃惊/surprised": "surprise",
    "其他/other": "neutral",
    "<unk>": "neutral",
}


def predict_lora_onnx(audio_path: str, session, max_seconds: float = 15.0):
    """Predict the project emotion label for one audio file with the ONNX model.

    Args:
        audio_path: Path of the audio file, read with ``soundfile``.
        session: Object exposing ``run(None, {"waveform": array})``; ``main``
            passes an ``onnxruntime.InferenceSession``.
        max_seconds: Audio longer than this (after resampling) is truncated.

    Returns:
        The label from ``PROJECT_LABELS`` with the highest softmax probability.
    """
    import soundfile as sf

    audio, sr = sf.read(audio_path, dtype="float32")
    if audio.ndim == 2:
        # Down-mix multi-channel audio by averaging the channels.
        audio = audio.mean(axis=1)
    if sr != TARGET_SAMPLE_RATE:
        import librosa

        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)

    max_samples = int(max_seconds * TARGET_SAMPLE_RATE)
    if len(audio) > max_samples:
        audio = audio[:max_samples]

    waveform = audio.reshape(1, -1).astype(np.float32)
    logits = session.run(None, {"waveform": waveform})[0]

    # Softmax; subtracting the row maximum first keeps exp() from overflowing.
    exp_logits = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs = (exp_logits / exp_logits.sum(axis=-1, keepdims=True)).squeeze()

    scores = dict.fromkeys(PROJECT_LABELS, 0.0)
    for lora_label, prob in zip(LORA_LABELS, probs):
        scores[LORA_TO_PROJECT[lora_label]] = float(prob)
    return max(scores, key=scores.get)


def predict_base(audio_path: str, funasr_model):
    """Predict the project emotion label for one audio file with the base model.

    Args:
        audio_path: Path of the audio file, handed to ``funasr_model.generate``.
        funasr_model: Object exposing ``generate(path, granularity=...,
            extract_embedding=...)`` that returns a list whose first item is a
            mapping with ``"labels"`` and ``"scores"``.

    Returns:
        The label from ``PROJECT_LABELS`` with the highest accumulated score.
        Falls back to ``"neutral"`` when inference raises (the failure is
        logged) or when the model returns no scores.
    """
    try:
        output = funasr_model.generate(
            audio_path, granularity="utterance", extract_embedding=False
        )
    except Exception:
        # Best-effort: keep evaluating, but make the failure visible instead of
        # letting it silently count as a "neutral" prediction.
        logger.warning(
            "Base model inference failed for %s; predicting 'neutral'",
            audio_path,
            exc_info=True,
        )
        return "neutral"

    scores = dict.fromkeys(PROJECT_LABELS, 0.0)
    if output and isinstance(output, list):
        rec = output[0]
        for native_label, score in zip(rec.get("labels", []), rec.get("scores", [])):
            # Several native labels can map to one project label, so accumulate.
            project_label = BASE_LABEL_MAP.get(native_label, "neutral")
            scores[project_label] += float(score)
    # With all scores at 0.0, max() returns the first key, i.e. "neutral".
    return max(scores, key=scores.get)


def f1_score(y_true, y_pred, label):
    """Return the one-vs-rest F1 score of ``label``.

    Args:
        y_true: Sequence of reference labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
        label: The class treated as positive.

    Returns:
        F1 in ``[0, 1]``; ``0.0`` when the label is never predicted or never
        present in ``y_true``.
    """
    pairs = list(zip(y_true, y_pred))
    tp = sum(1 for t, p in pairs if t == label and p == label)
    fp = sum(1 for t, p in pairs if t != label and p == label)
    fn = sum(1 for t, p in pairs if t == label and p != label)
    if tp + fp == 0 or tp + fn == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0.0


def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Returns ``0.0`` for empty input instead of dividing by zero.
    """
    if not y_true:
        return 0.0
    correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    return correct / len(y_true)


def main():
    """Parse CLI arguments, run the chosen model on the subset, print metrics."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--val-manifest", default="data/lora_dataset/val_manifest.json")
    parser.add_argument("--source", default="ravdess", help="Filter by source: ravdess, 263, 71631")
    parser.add_argument("--model", choices=["lora_onnx", "base"], default="lora_onnx")
    parser.add_argument("--onnx", default="data/models/lora_emotion2vec_7class/model.onnx")
    args = parser.parse_args()

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(args.val_manifest, encoding="utf-8") as f:
        val = json.load(f)

    samples = [s for s in val if s["source"] == args.source]
    if not samples:
        # Fail early with a clear message, before loading any model, rather
        # than hitting a ZeroDivisionError when computing accuracy.
        raise SystemExit(
            f"No samples with source={args.source!r} found in {args.val_manifest}"
        )
    logger.info("Evaluating %s on %d %s samples", args.model, len(samples), args.source)

    # Normalize labels: happiness → joy
    for s in samples:
        s["label"] = LORA_TO_PROJECT.get(s["label"], s["label"])

    if args.model == "lora_onnx":
        import onnxruntime as ort

        session = ort.InferenceSession(args.onnx, providers=["CPUExecutionProvider"])

        def predict_fn(path):
            return predict_lora_onnx(path, session)

    else:
        from funasr import AutoModel

        model = AutoModel(model="iic/emotion2vec_plus_base", device="cpu", hub="hf")

        def predict_fn(path):
            return predict_base(path, model)

    y_true, y_pred = [], []
    for i, s in enumerate(samples):
        y_true.append(s["label"])
        y_pred.append(predict_fn(s["path"]))
        if (i + 1) % 100 == 0:
            logger.info("Progress: %d / %d", i + 1, len(samples))

    # Per-class F1
    # The macro average runs over every project label, including labels with no
    # support in this subset (only supported labels are printed below).
    f1s = {label: f1_score(y_true, y_pred, label) for label in PROJECT_LABELS}
    macro_f1 = np.mean(list(f1s.values()))
    acc = accuracy_score(y_true, y_pred)

    print()
    print(f"=== {args.model} on {args.source} ({len(samples)} samples) ===")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print("Per-class F1:")
    for label, f1 in f1s.items():
        support = sum(1 for t in y_true if t == label)
        if support > 0:
            print(f" {label:<12} {f1:.4f} (n={support})")


if __name__ == "__main__":
    main()