| |
| """Evaluate audio model (LoRA ONNX or base) on a subset of val manifest. |
| |
| Usage: |
| python scripts/eval_audio_on_subset.py \ |
| --val-manifest data/lora_dataset/val_manifest.json \ |
| --source ravdess \ |
| --model lora_onnx \ |
| --onnx data/models/lora_emotion2vec_7class/model.onnx |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import logging |
| from collections import Counter |
| from pathlib import Path |
|
|
| import numpy as np |
|
|
# Script-wide logging: INFO level, each record prefixed with time and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)


# Emotion classes reported by this script. The predict_* functions build
# their score tables in this order, so on a tie the earliest label wins.
PROJECT_LABELS = [
    "neutral",
    "joy",
    "sadness",
    "anger",
    "surprise",
    "fear",
    "disgust",
]


# Class names paired index-by-index with the LoRA model's output
# probabilities in predict_lora_onnx, so the order here is significant.
LORA_LABELS = [
    "happiness",
    "anger",
    "disgust",
    "fear",
    "neutral",
    "sadness",
    "surprise",
]

# LoRA class name -> project label; only "happiness" is actually renamed.
LORA_TO_PROJECT = {
    "happiness": "joy",
    "anger": "anger",
    "disgust": "disgust",
    "fear": "fear",
    "neutral": "neutral",
    "sadness": "sadness",
    "surprise": "surprise",
}


# Native label strings looked up by predict_base -> project label.
BASE_LABEL_MAP = {
    # Plain English label names.
    "angry": "anger",
    "disgusted": "disgust",
    "fearful": "fear",
    "happy": "joy",
    "neutral": "neutral",
    "sad": "sadness",
    "surprised": "surprise",
    # Catch-all classes are folded into "neutral".
    "other": "neutral",
    "unknown": "neutral",
    # Bilingual "<Chinese>/<English>" label names.
    "生气/angry": "anger",
    "厌恶/disgusted": "disgust",
    "恐惧/fearful": "fear",
    "开心/happy": "joy",
    "中立/neutral": "neutral",
    "难过/sad": "sadness",
    "吃惊/surprised": "surprise",
    "其他/other": "neutral",
    "<unk>": "neutral",
}
|
|
|
|
def predict_lora_onnx(audio_path: str, session, max_seconds: float = 15.0):
    """Classify one audio file with the LoRA ONNX model.

    The file is read as float32, averaged across channels when it is 2-D,
    resampled to 16 kHz if its sample rate differs, truncated to
    ``max_seconds`` and passed to ``session`` as a ``(1, n)`` float32 array
    under the input name ``"waveform"``.

    Args:
        audio_path: Path of the audio file, read with ``soundfile``.
        session: Object whose ``run(None, {"waveform": ...})`` returns the
            class logits as its first output.
        max_seconds: Maximum clip length, in seconds, fed to the model.

    Returns:
        The project label with the highest softmax probability.
    """
    import soundfile as sf

    target_sr = 16000
    samples, native_sr = sf.read(audio_path, dtype="float32")
    if samples.ndim == 2:
        # Down-mix multi-channel audio by averaging the channels.
        samples = samples.mean(axis=1)
    if native_sr != target_sr:
        import librosa

        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)

    # Slicing beyond the end is a no-op, so shorter clips pass through as-is.
    samples = samples[: int(max_seconds * target_sr)]

    batch = samples.reshape(1, -1).astype(np.float32)
    logits = session.run(None, {"waveform": batch})[0]

    # Softmax with the row maximum subtracted first to avoid overflow.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    probs = (weights / weights.sum(axis=-1, keepdims=True)).squeeze()

    # Start from zeros in PROJECT_LABELS order, then fill in model outputs.
    scores = dict.fromkeys(PROJECT_LABELS, 0.0)
    scores.update(
        (LORA_TO_PROJECT[name], float(prob))
        for name, prob in zip(LORA_LABELS, probs)
    )
    return max(scores, key=scores.get)
|
|
|
|
def predict_base(audio_path: str, funasr_model):
    """Classify one audio file with the base model.

    Calls ``funasr_model.generate`` at utterance granularity, maps each
    returned native label through ``BASE_LABEL_MAP`` (unmapped labels count
    as ``"neutral"``) and sums the scores per project label.

    Args:
        audio_path: Path of the audio file handed to ``generate``.
        funasr_model: Object exposing ``generate(path, granularity=...,
            extract_embedding=...)``; the first element of its result is
            read with ``.get("labels")`` / ``.get("scores")``.

    Returns:
        The project label with the highest summed score. ``"neutral"`` is
        returned when inference raises or yields no scores.
    """
    try:
        output = funasr_model.generate(
            audio_path, granularity="utterance", extract_embedding=False
        )
    except Exception as exc:
        # Best-effort on purpose: one bad file must not abort the run. The
        # fallback is logged so failures are not silently counted as
        # genuine "neutral" predictions.
        logger.warning(
            "Base model inference failed for %s; predicting 'neutral': %s",
            audio_path,
            exc,
        )
        return "neutral"

    scores = {label: 0.0 for label in PROJECT_LABELS}
    if isinstance(output, list) and output:
        rec = output[0]
        for native_label, score in zip(rec.get("labels", []), rec.get("scores", [])):
            # Several native labels can map to one project label, so add up.
            scores[BASE_LABEL_MAP.get(native_label, "neutral")] += float(score)
    return max(scores, key=scores.get)
|
|
|
|
def f1_score(y_true, y_pred, label):
    """Return the one-vs-rest F1 score of ``label``.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
        label: The class treated as positive.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        the label was never predicted, never occurs in ``y_true``, or has no
        true positives.
    """
    tp = fp = fn = 0
    for truth, guess in zip(y_true, y_pred):
        is_truth = truth == label
        is_guess = guess == label
        if is_truth and is_guess:
            tp += 1
        elif is_guess:
            fp += 1
        elif is_truth:
            fn += 1

    predicted = tp + fp
    actual = tp + fn
    if predicted == 0 or actual == 0:
        return 0.0

    precision = tp / predicted
    recall = tp / actual
    total = precision + recall
    if total > 0:
        return 2 * precision * recall / total
    return 0.0
|
|
|
|
def main():
    """Evaluate the selected model on one source's slice of the manifest.

    Parses the command line, loads the JSON manifest, keeps the entries
    whose ``"source"`` equals ``--source``, predicts a label for each entry's
    ``"path"`` and prints macro F1, accuracy and per-class F1.

    Raises:
        SystemExit: If no manifest entry matches ``--source``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--val-manifest", default="data/lora_dataset/val_manifest.json")
    parser.add_argument("--source", default="ravdess", help="Filter by source: ravdess, 263, 71631")
    parser.add_argument("--model", choices=["lora_onnx", "base"], default="lora_onnx")
    parser.add_argument("--onnx", default="data/models/lora_emotion2vec_7class/model.onnx")
    args = parser.parse_args()

    # Read as UTF-8 explicitly instead of relying on the locale's encoding.
    with open(args.val_manifest, encoding="utf-8") as f:
        val = json.load(f)
    samples = [s for s in val if s["source"] == args.source]
    if not samples:
        # Stop before loading a model: the accuracy computation below
        # divides by len(samples) and would fail on an empty subset.
        raise SystemExit(
            f"No samples with source={args.source!r} in {args.val_manifest}"
        )
    logger.info("Evaluating %s on %d %s samples", args.model, len(samples), args.source)

    # Manifest entries labelled "happiness" use the project's name "joy".
    for s in samples:
        if s["label"] == "happiness":
            s["label"] = "joy"

    if args.model == "lora_onnx":
        import onnxruntime as ort

        session = ort.InferenceSession(args.onnx, providers=["CPUExecutionProvider"])

        def predict_fn(path):
            return predict_lora_onnx(path, session)
    else:
        from funasr import AutoModel

        model = AutoModel(model="iic/emotion2vec_plus_base", device="cpu", hub="hf")

        def predict_fn(path):
            return predict_base(path, model)

    y_true, y_pred = [], []
    for i, s in enumerate(samples, start=1):
        y_true.append(s["label"])
        y_pred.append(predict_fn(s["path"]))
        if i % 100 == 0:
            logger.info("Progress: %d / %d", i, len(samples))

    f1s = {label: f1_score(y_true, y_pred, label) for label in PROJECT_LABELS}
    # NOTE(review): the macro average spans every project label, including
    # classes with no true samples in this subset, while the per-class
    # listing below hides those classes.
    macro_f1 = np.mean(list(f1s.values()))
    acc = sum(t == p for t, p in zip(y_true, y_pred)) / len(samples)
    support = Counter(y_true)

    print()
    print(f"=== {args.model} on {args.source} ({len(samples)} samples) ===")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print("Per-class F1:")
    for label, f1 in f1s.items():
        if support[label] > 0:
            print(f" {label:<12} {f1:.4f} (n={support[label]})")


if __name__ == "__main__":
    main()
|
|