| """ |
| Visualization Support for AegisLM Scoring System. |
| |
| Provides calibration curve data, reliability graph data, |
| and other visualization-ready data for advanced metrics. |
| """ |
|
|
| import numpy as np |
| from typing import List, Dict, Any, Tuple, Optional |
| from dataclasses import dataclass |
| from datetime import datetime |
| import logging |
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class CalibrationCurvePoint:
    """One point of a calibration curve (a single confidence bin).

    NOTE(review): nothing in this module constructs this type, so the
    field descriptions below are inferred from the field names only —
    confirm against the producer before relying on them.
    """

    # Presumably the representative confidence value of the bin.
    confidence_bin: float
    # Presumably the observed fraction of correct predictions in the bin.
    accuracy: float
    # Presumably the number of samples that fell into the bin.
    sample_count: int
    # Presumably the confidence a perfectly calibrated model would show.
    ideal_confidence: float
    # Presumably the gap between `accuracy` and the bin's confidence.
    calibration_error: float
|
|
|
|
@dataclass
class ReliabilityDiagramData:
    """Per-bin series for a reliability diagram / calibration curve.

    All list fields are parallel: index ``i`` in each list describes the
    same confidence bin. An instance with every list empty and
    ``ece == 0.0`` is what ``generate_calibration_curve_data`` returns for
    empty or mismatched input.
    """

    # Bin centers on the confidence axis.
    bins: List[float]
    # Mean correctness of the samples in each bin (0.0 for an empty bin).
    accuracies: List[float]
    # Mean confidence of the samples in each bin (bin center when empty).
    confidences: List[float]
    # Number of samples that fell into each bin.
    sample_counts: List[int]
    # Reference series for perfect calibration (the bin centers again).
    ideal_line: List[float]
    # Expected Calibration Error summarising the whole curve.
    ece: float
|
|
|
|
@dataclass
class ConsistencyHeatmapData:
    """Matrix, axis labels and presentation hints for a consistency heatmap."""

    # Row-major grid of cell values; ``generate_consistency_heatmap_data``
    # fills it with pairwise similarities and 1.0 on the diagonal.
    matrix: List[List[float]]
    # One label per row/column of `matrix`.
    labels: List[str]
    # Chart title shown with the heatmap.
    title: str
    # Name of the color scale to render with (this module uses "RdYlBu").
    color_scale: str
|
|
|
|
@dataclass
class MetricsTimeSeries:
    """Parallel series of metric scores over time.

    Index ``i`` of every list belongs to the same history entry.
    """

    # When each history entry was recorded.
    timestamps: List[datetime]
    # Calibration score per entry.
    calibration_scores: List[float]
    # Reliability score per entry.
    reliability_scores: List[float]
    # Consistency score per entry.
    consistency_scores: List[float]
    # Overall quality score per entry.
    overall_scores: List[float]
|
|
|
|
class ScoringVisualizer:
    """
    Visualization support for scoring system metrics.

    Generates chart-ready data (plain lists, dicts and small dataclasses)
    for calibration curves, reliability diagrams, consistency heatmaps,
    metric time series and dashboards, and exports chart data as a JSON
    envelope, a Chart.js configuration, or CSV-style headers and rows.
    """

    def __init__(self):
        """Initialize scoring visualizer with its default color palette."""
        # Hex colors used for dataset borders and fills in Chart.js exports.
        self.color_palette = [
            "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
            "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
        ]

    def generate_calibration_curve_data(
        self,
        confidences: List[float],
        correctness: List[bool],
        n_bins: int = 10
    ) -> "ReliabilityDiagramData":
        """
        Generate data for calibration curve visualization.

        The interval [0, 1] is split into ``n_bins`` equal-width bins. Bins
        are half-open ``(lower, upper]`` except the first, which is closed
        on both sides so that a confidence of exactly 0.0 is counted.
        Confidences outside [0, 1] fall into no bin, but they still count
        toward the total used to weight the ECE.

        Args:
            confidences: List of confidence scores
            correctness: List of correctness indicators, parallel to
                ``confidences``
            n_bins: Number of bins for calibration curve

        Returns:
            ReliabilityDiagramData: Per-bin accuracy, mean confidence and
            sample count plus the ECE, all as plain Python numbers. Empty
            bins report accuracy 0.0 and the bin center as confidence.
            Empty or length-mismatched input yields empty lists and an ECE
            of 0.0.
        """
        if len(confidences) != len(correctness) or not confidences:
            return ReliabilityDiagramData(
                bins=[],
                accuracies=[],
                confidences=[],
                sample_counts=[],
                ideal_line=[],
                ece=0.0
            )

        # Convert once up front instead of rebuilding the arrays per bin.
        confidence_values = np.asarray(confidences, dtype=float)
        correctness_values = np.asarray(correctness, dtype=float)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]
        bin_centers = (bin_lowers + bin_uppers) / 2

        bin_accuracies = []
        bin_confidences = []
        bin_counts = []

        for i in range(n_bins):
            # The first bin includes its lower edge; otherwise a confidence
            # of exactly 0.0 would be dropped from every bin.
            if i == 0:
                above_lower = confidence_values >= bin_lowers[i]
            else:
                above_lower = confidence_values > bin_lowers[i]
            bin_mask = above_lower & (confidence_values <= bin_uppers[i])

            bin_samples = int(np.sum(bin_mask))

            if bin_samples > 0:
                # float() keeps NumPy scalar types out of the chart data.
                bin_accuracies.append(
                    float(np.mean(correctness_values[bin_mask]))
                )
                bin_confidences.append(
                    float(np.mean(confidence_values[bin_mask]))
                )
                bin_counts.append(bin_samples)
            else:
                bin_accuracies.append(0.0)
                bin_confidences.append(float(bin_centers[i]))
                bin_counts.append(0)

        ece = self._calculate_ece_from_bins(
            bin_centers.tolist(), bin_accuracies, bin_counts, len(confidences)
        )

        return ReliabilityDiagramData(
            bins=bin_centers.tolist(),
            accuracies=bin_accuracies,
            confidences=bin_confidences,
            sample_counts=bin_counts,
            # Perfect calibration means accuracy equals confidence, so the
            # reference series is the bin centers themselves.
            ideal_line=bin_centers.tolist(),
            ece=ece
        )

    def generate_reliability_graph_data(
        self,
        reliability_metrics: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate data for reliability graph visualization.

        Args:
            reliability_metrics: Reliability analysis metrics. Reads the
                ``"reliability_data"`` list, whose items must provide
                ``"confidence_avg"``, ``"accuracy"`` and ``"sample_count"``,
                and the optional ``"issues"`` list.

        Returns:
            Dict[str, Any]: Dict with ``"reliability_diagram"``,
            ``"confidence_distribution"`` and ``"issue_breakdown"``
            sections. All sections are empty when there is no
            reliability data.
        """
        reliability_data = reliability_metrics.get("reliability_data", [])

        if not reliability_data:
            return {
                "reliability_diagram": {
                    "bins": [],
                    "accuracies": [],
                    "confidences": [],
                    "sample_counts": []
                },
                "confidence_distribution": {
                    "labels": [],
                    "values": []
                },
                "issue_breakdown": {
                    "labels": [],
                    "values": []
                }
            }

        # The x positions ("bins") and "confidences" are both taken from
        # each point's average confidence.
        confidences = [point["confidence_avg"] for point in reliability_data]

        return {
            "reliability_diagram": {
                "bins": list(confidences),
                "accuracies": [point["accuracy"] for point in reliability_data],
                "confidences": confidences,
                "sample_counts": [
                    point["sample_count"] for point in reliability_data
                ]
            },
            "confidence_distribution": self._generate_confidence_distribution(
                reliability_data
            ),
            "issue_breakdown": self._generate_issue_breakdown(
                reliability_metrics
            )
        }

    def generate_consistency_heatmap_data(
        self,
        consistency_results: List[Any]
    ) -> "ConsistencyHeatmapData":
        """
        Generate data for consistency heatmap visualization.

        Args:
            consistency_results: List of consistency test results

        Returns:
            ConsistencyHeatmapData: Square matrix of pairwise similarities
            (1.0 on the diagonal) with labels "Test 1", "Test 2", ...; an
            empty matrix when there are no results.
        """
        if not consistency_results:
            return ConsistencyHeatmapData(
                matrix=[],
                labels=[],
                title="Consistency Heatmap",
                color_scale="RdYlBu"
            )

        n_tests = len(consistency_results)

        matrix = [
            [
                # A test is always fully consistent with itself.
                1.0 if i == j else self._calculate_test_similarity(
                    consistency_results[i], consistency_results[j]
                )
                for j in range(n_tests)
            ]
            for i in range(n_tests)
        ]

        labels = [f"Test {i+1}" for i in range(n_tests)]

        return ConsistencyHeatmapData(
            matrix=matrix,
            labels=labels,
            title="Response Consistency Heatmap",
            color_scale="RdYlBu"
        )

    def generate_metrics_time_series(
        self,
        metrics_history: List[Dict[str, Any]]
    ) -> "MetricsTimeSeries":
        """
        Generate time series data for metrics visualization.

        Args:
            metrics_history: List of historical metrics data. Each entry
                may carry a ``"timestamp"`` (ISO-format string or a value
                used as-is) and the four score keys.

        Returns:
            MetricsTimeSeries: Parallel lists, one element per history
            entry. A missing timestamp is replaced by the current UTC time
            and a missing score by 0.5.

        Raises:
            ValueError: If a string timestamp is not valid ISO format.
        """
        if not metrics_history:
            return MetricsTimeSeries(
                timestamps=[],
                calibration_scores=[],
                reliability_scores=[],
                consistency_scores=[],
                overall_scores=[]
            )

        timestamps = []
        calibration_scores = []
        reliability_scores = []
        consistency_scores = []
        overall_scores = []

        for metrics in metrics_history:
            if "timestamp" in metrics:
                timestamp = metrics["timestamp"]
                if isinstance(timestamp, str):
                    timestamp = datetime.fromisoformat(timestamp)
            else:
                timestamp = datetime.utcnow()

            timestamps.append(timestamp)

            calibration_scores.append(metrics.get("calibration_score", 0.5))
            reliability_scores.append(metrics.get("reliability_score", 0.5))
            consistency_scores.append(metrics.get("consistency_score", 0.5))
            overall_scores.append(metrics.get("overall_quality_score", 0.5))

        return MetricsTimeSeries(
            timestamps=timestamps,
            calibration_scores=calibration_scores,
            reliability_scores=reliability_scores,
            consistency_scores=consistency_scores,
            overall_scores=overall_scores
        )

    def generate_advanced_metrics_dashboard(
        self,
        advanced_metrics: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate comprehensive dashboard data for advanced metrics.

        Args:
            advanced_metrics: Advanced metrics data

        Returns:
            Dict[str, Any]: Dashboard data with ``"overview"``,
            ``"detailed_metrics"``, ``"breakdowns"`` and
            ``"recommendations"`` sections. Missing scores default to 0.0,
            the grade to "N/A", breakdowns to ``{}`` and recommendations
            to ``[]``.
        """
        dashboard_data = {
            "overview": {
                "overall_score": advanced_metrics.get("overall_quality_score", 0.0),
                "quality_grade": advanced_metrics.get("quality_grade", "N/A"),
                "calibration_score": advanced_metrics.get("calibration_score", 0.0),
                "reliability_score": advanced_metrics.get("reliability_score", 0.0),
                "consistency_score": advanced_metrics.get("consistency_score", 0.0)
            },
            "detailed_metrics": {
                "confidence_quality": advanced_metrics.get("confidence_quality", 0.0),
                "prediction_stability": advanced_metrics.get("prediction_stability", 0.0),
                "response_coherence": advanced_metrics.get("response_coherence", 0.0)
            },
            "breakdowns": {
                "calibration": advanced_metrics.get("calibration_breakdown", {}),
                "reliability": advanced_metrics.get("reliability_breakdown", {}),
                "consistency": advanced_metrics.get("consistency_breakdown", {})
            },
            "recommendations": advanced_metrics.get("improvement_suggestions", [])
        }

        return dashboard_data

    def _calculate_ece_from_bins(
        self,
        bin_centers: List[float],
        bin_accuracies: List[float],
        bin_counts: List[int],
        total_samples: int
    ) -> float:
        """
        Calculate Expected Calibration Error from bin data.

        Each non-empty bin contributes ``|accuracy - center|`` weighted by
        its share of ``total_samples``. The bin center stands in for the
        bin's confidence.

        Args:
            bin_centers: Bin center values
            bin_accuracies: Bin accuracies
            bin_counts: Bin sample counts
            total_samples: Total number of samples

        Returns:
            float: Expected Calibration Error; 0.0 when ``total_samples``
            is not positive.
        """
        # Avoid dividing by zero when there is nothing to weight by.
        if total_samples <= 0:
            return 0.0

        ece = 0.0
        for center, accuracy, count in zip(
            bin_centers, bin_accuracies, bin_counts
        ):
            if count > 0:
                ece += (count / total_samples) * abs(accuracy - center)

        return float(ece)

    def _generate_confidence_distribution(
        self,
        reliability_data: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Generate confidence distribution data.

        Sums each point's ``"sample_count"`` into one of five confidence
        ranges chosen by its ``"confidence_avg"``. Range upper bounds are
        inclusive, and anything above 0.8 lands in the last range.

        Args:
            reliability_data: Reliability data points

        Returns:
            Dict[str, Any]: ``"labels"`` (the five range names) and
            ``"values"`` (total sample count per range)
        """
        # Inclusive upper bound of every range except the last catch-all.
        bounded_ranges = [
            (0.2, "0.0-0.2"),
            (0.4, "0.2-0.4"),
            (0.6, "0.4-0.6"),
            (0.8, "0.6-0.8"),
        ]
        confidence_ranges = {label: 0 for _, label in bounded_ranges}
        confidence_ranges["0.8-1.0"] = 0

        for point in reliability_data:
            confidence = point["confidence_avg"]
            count = point["sample_count"]

            for upper, label in bounded_ranges:
                if confidence <= upper:
                    confidence_ranges[label] += count
                    break
            else:
                confidence_ranges["0.8-1.0"] += count

        return {
            "labels": list(confidence_ranges.keys()),
            "values": list(confidence_ranges.values())
        }

    def _generate_issue_breakdown(
        self,
        reliability_metrics: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate issue breakdown data.

        Args:
            reliability_metrics: Reliability metrics; reads the optional
                ``"issues"`` list

        Returns:
            Dict[str, Any]: ``"labels"`` (distinct issues as strings, in
            first-seen order) and ``"values"`` (how often each occurred)
        """
        # `or []` also covers an "issues" key that is present but None.
        issues = reliability_metrics.get("issues", []) or []

        issue_counts = {}
        for issue in issues:
            issue_str = str(issue)
            issue_counts[issue_str] = issue_counts.get(issue_str, 0) + 1

        return {
            "labels": list(issue_counts.keys()),
            "values": list(issue_counts.values())
        }

    def _calculate_test_similarity(
        self,
        test1: Any,
        test2: Any
    ) -> float:
        """
        Calculate similarity between two test results.

        Similarity is ``1 - |score1 - score2|`` clamped to [0, 1], where
        each score is the object's ``consistency_score`` attribute (0.5
        when the attribute is absent).

        Args:
            test1: First test result
            test2: Second test result

        Returns:
            float: Similarity score (0-1); 0.5 when the scores cannot be
            compared numerically
        """
        score1 = getattr(test1, 'consistency_score', 0.5)
        score2 = getattr(test2, 'consistency_score', 0.5)

        try:
            similarity = 1.0 - abs(score1 - score2)
        except (TypeError, ValueError):
            # Best effort: keep the heatmap renderable, but leave a trace
            # instead of swallowing the problem silently.
            logging.getLogger(__name__).debug(
                "Non-numeric consistency scores %r and %r; using 0.5",
                score1, score2
            )
            return 0.5

        return max(0.0, min(1.0, similarity))

    def export_chart_data(
        self,
        chart_type: str,
        data: Dict[str, Any],
        format: str = "json"
    ) -> Dict[str, Any]:
        """
        Export chart data in specified format.

        Args:
            chart_type: Type of chart
            data: Chart data. The 'chartjs' and 'csv' formats also accept
                a dataclass instance, which is read by field name.
            format: Export format ('json', 'csv', 'chartjs')

        Returns:
            Dict[str, Any]: Exported data. 'json' wraps ``data`` unchanged
            together with the chart type and a UTC export timestamp;
            'chartjs' and 'csv' return the converted structure, or a dict
            with an ``"error"`` key for an unsupported chart type.

        Raises:
            ValueError: If ``format`` is not one of the supported formats.
        """
        if format == "json":
            return {
                "chart_type": chart_type,
                "data": data,
                "export_timestamp": datetime.utcnow().isoformat(),
                "format": "json"
            }

        if format == "chartjs":
            return self._convert_to_chartjs_format(chart_type, data)

        if format == "csv":
            return self._convert_to_csv_format(chart_type, data)

        raise ValueError(f"Unsupported export format: {format}")

    @staticmethod
    def _chart_data_as_dict(data: Any) -> Dict[str, Any]:
        """Return ``data`` as a dict, converting dataclass instances."""
        # Imported locally: the module only imports `dataclass` itself.
        from dataclasses import asdict, is_dataclass

        # is_dataclass() is also true for dataclass *types*; only
        # instances can be converted.
        if is_dataclass(data) and not isinstance(data, type):
            return asdict(data)
        return data

    @staticmethod
    def _chartjs_options(title: str, x_axis_title: str) -> Dict[str, Any]:
        """Build the Chart.js options shared by the accuracy charts."""
        return {
            "responsive": True,
            "plugins": {
                "title": {
                    "display": True,
                    "text": title
                }
            },
            "scales": {
                "x": {
                    "title": {
                        "display": True,
                        "text": x_axis_title
                    }
                },
                "y": {
                    "title": {
                        "display": True,
                        "text": "Accuracy"
                    },
                    # Accuracy is a fraction, so the axis is pinned to [0, 1].
                    "min": 0,
                    "max": 1
                }
            }
        }

    def _convert_to_chartjs_format(
        self,
        chart_type: str,
        data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Convert data to Chart.js format.

        Args:
            chart_type: Chart type ('calibration_curve' or
                'reliability_diagram')
            data: Raw data (dict or dataclass instance) providing
                ``bins``, ``accuracies`` and, for the calibration curve,
                ``ideal_line``. Missing series default to empty lists.

        Returns:
            Dict[str, Any]: Chart.js formatted data, or a dict with an
            ``"error"`` key for any other chart type
        """
        data = self._chart_data_as_dict(data)

        if chart_type == "calibration_curve":
            return {
                "type": "line",
                "data": {
                    "labels": data.get("bins", []),
                    "datasets": [
                        {
                            "label": "Actual Calibration",
                            "data": data.get("accuracies", []),
                            "borderColor": self.color_palette[0],
                            # Two hex digits appended to the color —
                            # presumably an alpha channel for the fill.
                            "backgroundColor": self.color_palette[0] + "20",
                            "borderWidth": 2
                        },
                        {
                            "label": "Ideal Calibration",
                            "data": data.get("ideal_line", []),
                            "borderColor": self.color_palette[1],
                            "backgroundColor": self.color_palette[1] + "20",
                            "borderWidth": 2,
                            "borderDash": [5, 5]
                        }
                    ]
                },
                "options": self._chartjs_options(
                    "Calibration Curve", "Confidence"
                )
            }

        if chart_type == "reliability_diagram":
            return {
                "type": "bar",
                "data": {
                    "labels": data.get("bins", []),
                    "datasets": [
                        {
                            "label": "Accuracy",
                            "data": data.get("accuracies", []),
                            "backgroundColor": self.color_palette[0] + "80",
                            "borderColor": self.color_palette[0],
                            "borderWidth": 1
                        }
                    ]
                },
                "options": self._chartjs_options(
                    "Reliability Diagram", "Confidence Bins"
                )
            }

        return {"error": f"Unsupported chart type: {chart_type}"}

    def _convert_to_csv_format(
        self,
        chart_type: str,
        data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Convert data to CSV format.

        Args:
            chart_type: Chart type (only 'calibration_curve' is supported)
            data: Raw data (dict or dataclass instance) providing
                ``bins``, ``accuracies``, ``confidences`` and
                ``ideal_line``

        Returns:
            Dict[str, Any]: ``"headers"``, ``"rows"`` (one row per bin)
            and ``"format"``. A column that is missing or shorter than
            ``bins`` is padded with ``None``. Any other chart type yields
            a dict with an ``"error"`` key.
        """
        if chart_type != "calibration_curve":
            return {"error": f"CSV format not supported for chart type: {chart_type}"}

        data = self._chart_data_as_dict(data)

        headers = ["Bin", "Accuracy", "Confidence", "Ideal"]
        bins = data.get("bins", [])
        value_columns = [
            data.get("accuracies", []),
            data.get("confidences", []),
            data.get("ideal_line", []),
        ]

        rows = []
        for i, bin_value in enumerate(bins):
            row = [bin_value]
            for column in value_columns:
                # Pad ragged or missing columns rather than raising
                # IndexError part-way through the export.
                row.append(column[i] if i < len(column) else None)
            rows.append(row)

        return {
            "headers": headers,
            "rows": rows,
            "format": "csv"
        }
|
|
|
|
| |
# Shared module-level instance; get_scoring_visualizer() returns this object.
scoring_visualizer = ScoringVisualizer()
|
|
|
|
def get_scoring_visualizer() -> ScoringVisualizer:
    """
    Return the shared module-level scoring visualizer.

    Every call returns the same ``scoring_visualizer`` object created when
    this module is imported; no new instance is constructed.

    Returns:
        ScoringVisualizer: Global instance
    """
    return scoring_visualizer
|
|