""" 评估指标模块 Metrics for PAD Predictor Evaluation 该模块包含了PAD预测器的各种评估指标,包括: - 回归指标:MAE、RMSE、R² - 置信度评估指标:ECE(Expected Calibration Error) - 可靠性图表功能 """ import torch import torch.nn.functional as F import numpy as np from typing import Dict, List, Tuple, Optional, Any import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score import logging class RegressionMetrics: """回归评估指标类""" def __init__(self): self.logger = logging.getLogger(__name__) @staticmethod def mae(y_true: torch.Tensor, y_pred: torch.Tensor, reduction: str = 'mean') -> torch.Tensor: """ 平均绝对误差 (Mean Absolute Error) Args: y_true: 真实值 y_pred: 预测值 reduction: 聚合方式 ('mean', 'sum', 'none') Returns: MAE值 """ mae = torch.mean(torch.abs(y_pred - y_true), dim=0) if reduction == 'mean': return torch.mean(mae) elif reduction == 'sum': return torch.sum(mae) else: return mae @staticmethod def rmse(y_true: torch.Tensor, y_pred: torch.Tensor, reduction: str = 'mean') -> torch.Tensor: """ 均方根误差 (Root Mean Square Error) Args: y_true: 真实值 y_pred: 预测值 reduction: 聚合方式 ('mean', 'sum', 'none') Returns: RMSE值 """ mse = torch.mean((y_pred - y_true) ** 2, dim=0) rmse = torch.sqrt(mse) if reduction == 'mean': return torch.mean(rmse) elif reduction == 'sum': return torch.sum(rmse) else: return rmse @staticmethod def r2_score(y_true: torch.Tensor, y_pred: torch.Tensor, reduction: str = 'mean') -> torch.Tensor: """ R²决定系数 (Coefficient of Determination) Args: y_true: 真实值 y_pred: 预测值 reduction: 聚合方式 ('mean', 'sum', 'none') Returns: R²值 """ # 计算总平方和 ss_tot = torch.sum((y_true - torch.mean(y_true, dim=0)) ** 2, dim=0) # 计算残差平方和 ss_res = torch.sum((y_true - y_pred) ** 2, dim=0) # 避免除零 r2 = 1 - (ss_res / (ss_tot + 1e-8)) if reduction == 'mean': return torch.mean(r2) elif reduction == 'sum': return torch.sum(r2) else: return r2 @staticmethod def robust_r2(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor: """ 稳健R²决定系数(Robust R² for Multi-Output Regression) 先对所有维度求和SS_res和SS_tot,然后计算一个总的R²。 这种方法更适合多目标回归,因为它考虑了所有目标的总方差。 公式:R²_robust = 1 - Σ(SS_res_all) / Σ(SS_tot_all) Args: y_true: 真实值,形状为 (batch_size, output_dim) y_pred: 预测值,形状为 (batch_size, output_dim) Returns: 稳健R²值(标量) """ # 对所有维度和样本求和的残差平方和 ss_res_total = torch.sum((y_true - y_pred) ** 2) # 对所有维度和样本求和的总平方和 ss_tot_total = torch.sum((y_true - torch.mean(y_true, dim=0)) ** 2) # 避免除零 r2_robust = 1 - (ss_res_total / (ss_tot_total + 1e-8)) return r2_robust @staticmethod def mape(y_true: torch.Tensor, y_pred: torch.Tensor, reduction: str = 'mean') -> torch.Tensor: """ 平均绝对百分比误差 (Mean Absolute Percentage Error) Args: y_true: 真实值 y_pred: 预测值 reduction: 聚合方式 Returns: MAPE值 """ # 避免除零 mape = torch.mean(torch.abs((y_pred - y_true) / (y_true + 1e-8)), dim=0) if reduction == 'mean': return torch.mean(mape) elif reduction == 'sum': return torch.sum(mape) else: return mape def compute_all_metrics(self, y_true: torch.Tensor, y_pred: torch.Tensor, component_names: List[str] = None) -> Dict[str, Dict[str, float]]: """ 计算所有回归指标 Args: y_true: 真实值,形状为 (batch_size, output_dim) y_pred: 预测值,形状为 (batch_size, output_dim) component_names: 组件名称列表 Returns: 包含所有指标的嵌套字典 """ if component_names is None: component_names = ['delta_pad_p', 'delta_pad_a', 'delta_pad_d'] # 3维输出(移除confidence和delta_pressure) metrics = {} # 计算整体指标 metrics['overall'] = { 'mae': self.mae(y_true, y_pred).item(), 'rmse': self.rmse(y_true, y_pred).item(), 'r2': self.r2_score(y_true, y_pred).item(), 'r2_robust': self.robust_r2(y_true, y_pred).item(), # 新增稳健R² 'mape': self.mape(y_true, y_pred).item() } # 计算各组件指标 component_metrics = {} for i, name in enumerate(component_names): if i < y_true.size(1): component_metrics[name] = { 'mae': self.mae(y_true[:, i], y_pred[:, i]).item(), 'rmse': self.rmse(y_true[:, i], y_pred[:, i]).item(), 'r2': self.r2_score(y_true[:, i], y_pred[:, i]).item(), 'mape': self.mape(y_true[:, i], y_pred[:, i]).item() } metrics['components'] = component_metrics return metrics def print_diagnostic_metrics(self, y_true: torch.Tensor, y_pred: torch.Tensor, component_names: List[str] = None) -> None: """ 打印诊断模式下的详细指标(每个维度的独立得分) Args: y_true: 真实值,形状为 (batch_size, output_dim) y_pred: 预测值,形状为 (batch_size, output_dim) component_names: 组件名称列表 """ if component_names is None: component_names = ['ΔPAD_P', 'ΔPAD_A', 'ΔPAD_D'] # 3维输出 print("\n" + "="*80) print("🔍 诊断模式:各维度独立指标") print("="*80) # 计算稳健R² r2_robust = self.robust_r2(y_true, y_pred).item() r2_mean = self.r2_score(y_true, y_pred).item() print(f"\n📊 整体指标:") print(f" 稳健 R² (Robust R²): {r2_robust:.6f} ← 所有维度总方差比") print(f" 平均 R² (Mean R²) : {r2_mean:.6f} ← 各维度R²的算术平均") print(f" 差异 : {r2_robust - r2_mean:+.6f}") print(f"\n📐 各维度详细指标:") print(f"{'维度':<15} {'R²':<12} {'MAE':<12} {'RMSE':<12} {'MAPE':<12}") print("-" * 80) for i, name in enumerate(component_names): if i < y_true.size(1): mae = self.mae(y_true[:, i], y_pred[:, i]).item() rmse = self.rmse(y_true[:, i], y_pred[:, i]).item() r2 = self.r2_score(y_true[:, i], y_pred[:, i]).item() mape = self.mape(y_true[:, i], y_pred[:, i]).item() # R²值颜色标记 r2_str = f"{r2:.6f}" if r2 >= 0.8: r2_str = f"✅ {r2_str}" elif r2 >= 0.5: r2_str = f"⚠️ {r2_str}" else: r2_str = f"❌ {r2_str}" print(f"{name:<15} {r2_str:<12} {mae:<12.6f} {rmse:<12.6f} {mape:<12.6f}") print("="*80 + "\n") class CalibrationMetrics: """置信度校准评估指标类""" def __init__(self, n_bins: int = 10): """ 初始化校准指标 Args: n_bins: 分箱数量 """ self.n_bins = n_bins self.logger = logging.getLogger(__name__) def expected_calibration_error(self, predictions: torch.Tensor, targets: torch.Tensor, confidences: torch.Tensor) -> Tuple[float, List[Tuple]]: """ 计算期望校准误差 (Expected Calibration Error) Args: predictions: 预测值,形状为 (batch_size, 4) targets: 真实值,形状为 (batch_size, 4) confidences: 置信度,形状为 (batch_size, 1) Returns: ECE值和分箱信息 """ # 计算预测误差 errors = torch.mean((predictions - targets) ** 2, dim=1, keepdim=True) # 将置信度归一化到[0,1] confidences_norm = torch.sigmoid(confidences) # 分箱 bin_boundaries = torch.linspace(0, 1, self.n_bins + 1) bin_lowers = bin_boundaries[:-1] bin_uppers = bin_boundaries[1:] ece = torch.tensor(0.0, device=confidences_norm.device) bin_info = [] for bin_lower, bin_upper in zip(bin_lowers, bin_uppers): # 找到在当前分箱中的样本 in_bin = (confidences_norm > bin_lower) & (confidences_norm <= bin_upper) prop_in_bin = in_bin.float().mean() if prop_in_bin > 0: # 计算当前分箱的平均置信度和平均误差 avg_confidence_in_bin = confidences_norm[in_bin].mean() avg_error_in_bin = errors[in_bin].mean() # 计算ECE贡献 ece += torch.abs(avg_confidence_in_bin - avg_error_in_bin) * prop_in_bin bin_info.append({ 'bin_lower': bin_lower.item(), 'bin_upper': bin_upper.item(), 'count': in_bin.sum().item(), 'avg_confidence': avg_confidence_in_bin.item(), 'avg_error': avg_error_in_bin.item(), 'accuracy': (1 - avg_error_in_bin).item() }) return ece.item(), bin_info def reliability_diagram(self, predictions: torch.Tensor, targets: torch.Tensor, confidences: torch.Tensor, save_path: Optional[str] = None) -> None: """ 绘制可靠性图表 Args: predictions: 预测值 targets: 真实值 confidences: 置信度 save_path: 保存路径 """ ece, bin_info = self.expected_calibration_error(predictions, targets, confidences) # 提取分箱信息 bin_lowers = [info['bin_lower'] for info in bin_info] bin_uppers = [info['bin_upper'] for info in bin_info] avg_confidences = [info['avg_confidence'] for info in bin_info] accuracies = [info['accuracy'] for info in bin_info] counts = [info['count'] for info in bin_info] # 计算分箱中心 bin_centers = [(lower + upper) / 2 for lower, upper in zip(bin_lowers, bin_uppers)] # 创建图表 plt.figure(figsize=(10, 6)) # 绘制可靠性图表 plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration') plt.plot(bin_centers, accuracies, 'bo-', label='Model', linewidth=2, markersize=8) # 添加柱状图显示样本数量 ax2 = plt.gca().twinx() ax2.bar(bin_centers, counts, width=0.1, alpha=0.3, color='gray', label='Sample Count') ax2.set_ylabel('Sample Count', fontsize=12) ax2.set_ylim(0, max(counts) * 1.2 if counts else 1) # 设置图表属性 plt.xlabel('Confidence', fontsize=12) plt.ylabel('Accuracy', fontsize=12) plt.title(f'Reliability Diagram (ECE = {ece:.4f})', fontsize=14) plt.legend(loc='upper left') plt.grid(True, alpha=0.3) plt.xlim(0, 1) plt.ylim(0, 1) # 保存图表 if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight') self.logger.info(f"可靠性图表已保存到: {save_path}") plt.show() def sharpness(self, confidences: torch.Tensor) -> float: """ 计算置信度的锐度 (Sharpness) Args: confidences: 置信度 Returns: 锐度值(置信度的标准差) """ confidences_norm = torch.sigmoid(confidences) return torch.std(confidences_norm).item() class PADMetrics: """PAD特定的评估指标类""" def __init__(self): self.regression_metrics = RegressionMetrics() self.calibration_metrics = CalibrationMetrics() self.logger = logging.getLogger(__name__) def evaluate_predictions(self, predictions: torch.Tensor, targets: torch.Tensor, component_names: List[str] = None) -> Dict[str, Any]: """ 全面评估预测结果 Args: predictions: 预测值,形状为 (batch_size, 4) 或 (4,) targets: 真实值,形状为 (batch_size, 4) 或 (4,) component_names: 组件名称列表 Returns: 包含所有评估指标的字典 """ if component_names is None: component_names = ['delta_pad_p', 'delta_pad_a', 'delta_pad_d'] # 3维输出 # 确保张量至少是2维的 if predictions.dim() == 1: predictions = predictions.unsqueeze(0) if targets.dim() == 1: targets = targets.unsqueeze(0) results = {} # 1. 回归指标 regression_results = self.regression_metrics.compute_all_metrics( predictions, targets, component_names ) results['regression'] = regression_results # 添加稳健R²到顶层结果中方便访问 results['r2_robust'] = regression_results['overall']['r2_robust'] results['r2_mean'] = regression_results['overall']['r2'] # 2. PAD特定的指标 # 计算PAD向量的角度误差 delta_pad_pred = predictions[:, :3] delta_pad_true = targets[:, :3] # 计算余弦相似度 cos_sim = F.cosine_similarity(delta_pad_pred, delta_pad_true, dim=1) angle_error = torch.acos(torch.clamp(cos_sim, -1 + 1e-8, 1 - 1e-8)) * 180 / np.pi results['pad_specific'] = { 'cosine_similarity_mean': cos_sim.mean().item(), 'cosine_similarity_std': cos_sim.std().item(), 'angle_error_mean': angle_error.mean().item(), 'angle_error_std': angle_error.std().item() } return results def evaluate_predictions_diagnostic(self, predictions: torch.Tensor, targets: torch.Tensor, component_names: List[str] = None) -> Dict[str, Any]: """ 诊断模式评估:打印详细指标并返回结果 Args: predictions: 预测值 targets: 真实值 component_names: 组件名称列表 Returns: 包含所有评估指标的字典 """ # 先打印诊断指标 self.regression_metrics.print_diagnostic_metrics(predictions, targets, component_names) # 然后返回完整结果 return self.evaluate_predictions(predictions, targets, component_names) def generate_evaluation_report(self, predictions: torch.Tensor, targets: torch.Tensor, save_path: Optional[str] = None) -> str: """ 生成评估报告 Args: predictions: 预测值 targets: 真实值 save_path: 报告保存路径 Returns: 评估报告文本 """ results = self.evaluate_predictions(predictions, targets) # 生成报告 report = [] report.append("=" * 60) report.append("PAD预测器评估报告") report.append("=" * 60) # 整体回归指标 report.append("\n1. 整体回归指标:") overall = results['regression']['overall'] report.append(f" MAE: {overall['mae']:.6f}") report.append(f" RMSE: {overall['rmse']:.6f}") report.append(f" R² (平均): {overall['r2']:.6f}") report.append(f" R² (稳健): {overall['r2_robust']:.6f} ← 所有维度总方差比") report.append(f" MAPE: {overall['mape']:.6f}") # 组件回归指标 report.append("\n2. 各组件回归指标:") components = results['regression']['components'] for name, metrics in components.items(): report.append(f" {name}:") report.append(f" MAE: {metrics['mae']:.6f}") report.append(f" RMSE: {metrics['rmse']:.6f}") report.append(f" R²: {metrics['r2']:.6f}") # 校准指标(已移除 - Confidence 不再作为输出维度) # 注:置信度现在通过 MC Dropout 动态计算,不包含在评估报告中 # report.append("\n3. 置信度校准指标:") # calibration = results.get('calibration', {}) # report.append(f" ECE: {calibration.get('ece', 0):.6f}") # report.append(f" Sharpness: {calibration.get('sharpness', 0):.6f}") # PAD特定指标 report.append("\n3. PAD特定指标:") pad_specific = results['pad_specific'] report.append(f" 余弦相似度 (均值±标准差): {pad_specific['cosine_similarity_mean']:.4f} ± {pad_specific['cosine_similarity_std']:.4f}") report.append(f" 角度误差 (均值±标准差): {pad_specific['angle_error_mean']:.2f}° ± {pad_specific['angle_error_std']:.2f}°") report.append("\n" + "=" * 60) report_text = "\n".join(report) # 保存报告 if save_path: with open(save_path, 'w', encoding='utf-8') as f: f.write(report_text) self.logger.info(f"评估报告已保存到: {save_path}") return report_text def create_metrics(metric_type: str = 'pad', **kwargs) -> Any: """ 创建评估指标的工厂函数 Args: metric_type: 指标类型 ('regression', 'calibration', 'pad') **kwargs: 指标参数 Returns: 指标实例 """ if metric_type == 'regression': return RegressionMetrics() elif metric_type == 'calibration': return CalibrationMetrics(**kwargs) elif metric_type == 'pad': return PADMetrics() else: raise ValueError(f"不支持的指标类型: {metric_type}") if __name__ == "__main__": # 测试代码 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # 创建测试数据 batch_size = 100 predictions = torch.randn(batch_size, 5).to(device) targets = torch.randn(batch_size, 5).to(device) print("测试评估指标:") print(f"输入形状: {predictions.shape}") # 测试回归指标 regression_metrics = RegressionMetrics() regression_results = regression_metrics.compute_all_metrics(predictions, targets) print(f"\n整体回归指标:") for key, value in regression_results['overall'].items(): print(f" {key}: {value:.6f}") # 测试校准指标 calibration_metrics = CalibrationMetrics(n_bins=10) pred_components = predictions[:, :4] target_components = targets[:, :4] pred_confidence = predictions[:, 4:5] ece, bin_info = calibration_metrics.expected_calibration_error( pred_components, target_components, pred_confidence ) print(f"\nECE: {ece:.6f}") # 测试PAD指标 pad_metrics = PADMetrics() full_results = pad_metrics.evaluate_predictions(predictions, targets) print(f"\n校准指标:") calibration = full_results['calibration'] print(f" ECE: {calibration['ece']:.6f}") print(f" Sharpness: {calibration['sharpness']:.6f}") print(f"\nPAD特定指标:") pad_specific = full_results['pad_specific'] for key, value in pad_specific.items(): print(f" {key}: {value:.6f}") # 生成评估报告 report = pad_metrics.generate_evaluation_report(predictions, targets) print(f"\n评估报告:") print(report) print("\n评估指标测试完成!")