# -*- coding: utf-8 -*- """ 音源质量评分模块 提供多维度的音频质量评估,用于筛选最佳样本 支持时长、音量稳定性、音高稳定性三个评估维度 """ import logging import numpy as np from typing import Dict, List, Optional, Tuple logger = logging.getLogger(__name__) def duration_score(duration: float) -> float: """ 时长评分:适中时长得分最高 参数: duration: 音频时长(秒) 返回: 0~1 的分数 评分逻辑: - 过短(<0.2s):发音不完整,低分 - 最佳范围(0.3~0.8s):满分 - 过长(>1.0s):可能包含多字或拖音,递减 """ if duration < 0.2: return duration / 0.2 * 0.5 # 0~0.5分 elif duration <= 0.8: return 1.0 # 满分 elif duration <= 1.2: return 1.0 - (duration - 0.8) / 0.4 * 0.3 # 0.7~1.0分 else: return max(0.3, 0.7 - (duration - 1.2) * 0.2) # 递减,最低0.3 def rms_variance_score(audio: np.ndarray, sr: int, frame_ms: int = 20) -> float: """ 音量稳定性评分:RMS 方差越小越好 参数: audio: 音频数据(numpy 数组) sr: 采样率 frame_ms: 帧长度(毫秒) 返回: 0~1 的分数 计算步骤: 1. 将音频分帧 2. 计算每帧的 RMS 能量 3. 计算 RMS 序列的方差 4. 归一化到 0~1 分数 """ frame_size = int(sr * frame_ms / 1000) if frame_size <= 0: return 0.5 frames = len(audio) // frame_size if frames < 2: return 0.5 # 太短无法评估 rms_values = [] for i in range(frames): frame = audio[i * frame_size : (i + 1) * frame_size] rms = np.sqrt(np.mean(frame.astype(np.float64) ** 2)) rms_values.append(rms) if len(rms_values) < 2: return 0.5 # 归一化 RMS 值(避免绝对值影响) rms_array = np.array(rms_values) mean_rms = np.mean(rms_array) if mean_rms > 0: rms_normalized = rms_array / mean_rms variance = np.var(rms_normalized) else: variance = 0 # 归一化:方差越小分数越高 # 经验阈值:方差 < 0.01 为优秀,> 0.5 为较差 score = max(0, 1.0 - variance * 2) return min(1.0, score) def f0_variance_score(audio: np.ndarray, sr: int) -> float: """ 音高稳定性评分:F0 方差越小越好 参数: audio: 音频数据(numpy 数组) sr: 采样率 返回: 0~1 的分数 计算步骤: 1. 使用 librosa.pyin 提取 F0 2. 过滤无声帧(F0=NaN) 3. 转换为音分计算方差 4. 归一化到 0~1 分数 """ try: import librosa except ImportError: logger.warning("librosa 未安装,无法计算 F0 方差") return 0.5 try: # 提取 F0(使用 pyin 算法) f0, voiced_flag, voiced_probs = librosa.pyin( audio.astype(np.float32), fmin=librosa.note_to_hz('C2'), # ~65Hz fmax=librosa.note_to_hz('C6'), # ~1047Hz sr=sr ) # 过滤无效值 valid_f0 = f0[~np.isnan(f0)] if len(valid_f0) < 3: return 0.5 # 无法评估 # 转换为音分(cents)计算方差,避免频率绝对值影响 # cents = 1200 * log2(f / f_ref) median_f0 = np.median(valid_f0) if median_f0 <= 0: return 0.5 f0_cents = 1200 * np.log2(valid_f0 / median_f0) variance = np.var(f0_cents) # 归一化:方差 < 100 cents² 为优秀,> 10000 cents² 为较差 # 100 cents ≈ 1个半音 score = max(0, 1.0 - variance / 10000) return min(1.0, score) except Exception as e: logger.warning(f"F0 计算失败: {e}") return 0.5 class QualityScorer: """ 音频质量评分器 支持多维度评估和加权综合评分 """ # 默认权重 DEFAULT_WEIGHTS = { "duration": 0.3, "rms": 0.3, "f0": 0.4 } def __init__( self, enabled_metrics: Optional[List[str]] = None, weights: Optional[Dict[str, float]] = None ): """ 初始化评分器 参数: enabled_metrics: 启用的评分维度,如 ["duration", "rms", "f0"] weights: 各维度权重 """ self.enabled_metrics = enabled_metrics or ["duration"] self.weights = weights or self.DEFAULT_WEIGHTS.copy() def score( self, audio: np.ndarray, sr: int, duration: Optional[float] = None ) -> Dict[str, float]: """ 计算音频质量分数 参数: audio: 音频数据 sr: 采样率 duration: 音频时长(秒),如不提供则自动计算 返回: 包含各维度分数和综合分数的字典 """ if duration is None: duration = len(audio) / sr scores = {} if "duration" in self.enabled_metrics: scores["duration"] = duration_score(duration) if "rms" in self.enabled_metrics: scores["rms"] = rms_variance_score(audio, sr) if "f0" in self.enabled_metrics: scores["f0"] = f0_variance_score(audio, sr) # 计算加权综合分数 if scores: total_weight = sum(self.weights.get(k, 0) for k in scores.keys()) if total_weight > 0: combined = sum( scores[k] * self.weights.get(k, 0) for k in scores.keys() ) / total_weight else: combined = sum(scores.values()) / len(scores) scores["combined"] = combined else: scores["combined"] = 0.5 return scores def score_from_file(self, wav_path: str) -> Dict[str, float]: """ 从文件计算质量分数 参数: wav_path: 音频文件路径 返回: 包含各维度分数和综合分数的字典 """ try: import soundfile as sf audio, sr = sf.read(wav_path) # 转换为单声道 if len(audio.shape) > 1: audio = audio.mean(axis=1) return self.score(audio, sr) except Exception as e: logger.error(f"读取音频文件失败 {wav_path}: {e}") return {"combined": 0.5} def calculate_quality_score( audio: np.ndarray, sr: int, weights: Optional[Dict[str, float]] = None, enabled_metrics: Optional[List[str]] = None ) -> float: """ 便捷函数:计算综合质量评分 参数: audio: 音频数据 sr: 采样率 weights: 各维度权重 enabled_metrics: 启用的评分维度 返回: 0~1 的综合分数 """ scorer = QualityScorer(enabled_metrics=enabled_metrics, weights=weights) scores = scorer.score(audio, sr) return scores.get("combined", 0.5)