| """ | |
| Comprehensive Evaluation Summary Generator | |
| Creates detailed evaluation summaries with insights, trends, and recommendations | |
| for system optimization and quality improvement. | |
| """ | |
| import json | |
| import os | |
| from datetime import datetime | |
| from typing import Any, Dict, List | |
| class EvaluationSummaryGenerator: | |
| """Generate executive summaries and detailed insights from evaluation results.""" | |

    def __init__(self, results_file: str):
        """Initialize with evaluation results."""
        self.results_file = results_file
        self.results = self._load_results()

    def _load_results(self) -> Dict[str, Any]:
        """Load evaluation results from file."""
        try:
            with open(self.results_file, "r") as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading results: {e}")
            return {}
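
    # Illustrative shape of the results file (field names inferred from the keys
    # read by the methods below; the values shown are hypothetical):
    # {
    #     "summary": {
    #         "n_questions": 25,
    #         "success_rate": 1.0,
    #         "avg_latency_s": 2.5,
    #         "avg_groundedness_score": 0.95,
    #         "avg_citation_accuracy": 0.6
    #     },
    #     "results": [ ... one record per evaluated question ... ]
    # }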

    def generate_executive_summary(self) -> Dict[str, Any]:
        """Generate executive summary for stakeholders."""
        if not self.results:
            return {"error": "No results available"}

        summary = self.results.get("summary", {})
        results = self.results.get("results", [])

        # Calculate key metrics
        total_questions = summary.get("n_questions", 0)
        success_rate = summary.get("success_rate", 0)
        avg_latency = summary.get("avg_latency_s", 0)
        groundedness = summary.get("avg_groundedness_score", 1.0)
        citation_accuracy = summary.get("avg_citation_accuracy", 0)

        # Calculate composite scores
        performance_score = self._calculate_performance_score(
            success_rate, avg_latency, groundedness, citation_accuracy
        )
        quality_grade = self._calculate_quality_grade(performance_score)

        # Generate insights
        key_insights = self._generate_key_insights(summary, results)
        recommendations = self._generate_recommendations(summary, results)

        return {
            "evaluation_date": datetime.now().isoformat(),
            "system_performance": {
                "overall_grade": quality_grade["grade"],
                "performance_score": performance_score,
                "status": quality_grade["status"],
                "confidence": quality_grade["confidence"],
            },
            "key_metrics": {
                "questions_evaluated": total_questions,
                "system_reliability": f"{success_rate * 100:.1f}%",
                "average_response_time": f"{avg_latency:.2f}s",
                "content_accuracy": f"{groundedness * 100:.1f}%",
                "source_attribution": f"{citation_accuracy * 100:.1f}%",
            },
            "key_insights": key_insights,
            "recommendations": recommendations,
            "risk_assessment": self._assess_risks(summary, results),
            "next_actions": self._generate_next_actions(summary, results),
        }

    def _calculate_performance_score(
        self, success_rate: float, latency: float, groundedness: float, citation: float
    ) -> float:
        """Calculate composite performance score."""
        # Normalize latency (assume 10s is worst case, 1s is best case)
        latency_score = max(0, min(1, (10 - latency) / 9))

        # Weighted scoring
        weights = {
            "reliability": 0.25,  # System uptime and success rate
            "speed": 0.25,  # Response time performance
            "accuracy": 0.30,  # Content quality and groundedness
            "attribution": 0.20,  # Citation and source accuracy
        }

        score = (
            success_rate * weights["reliability"]
            + latency_score * weights["speed"]
            + groundedness * weights["accuracy"]
            + citation * weights["attribution"]
        )

        return round(score, 3)
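
    # Worked example for the score above (hypothetical inputs): success_rate=1.0,
    # latency=2.5s, groundedness=0.95, citation=0.6 gives
    # latency_score = (10 - 2.5) / 9 ≈ 0.833 and
    # score = 1.0*0.25 + 0.833*0.25 + 0.95*0.30 + 0.6*0.20 ≈ 0.863.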

    def _calculate_quality_grade(self, performance_score: float) -> Dict[str, Any]:
        """Convert performance score to letter grade."""
        if performance_score >= 0.95:
            return {"grade": "A+", "status": "Exceptional", "confidence": "Very High"}
        elif performance_score >= 0.90:
            return {"grade": "A", "status": "Excellent", "confidence": "High"}
        elif performance_score >= 0.80:
            return {"grade": "B+", "status": "Very Good", "confidence": "High"}
        elif performance_score >= 0.70:
            return {"grade": "B", "status": "Good", "confidence": "Medium"}
        elif performance_score >= 0.60:
            return {"grade": "C+", "status": "Fair", "confidence": "Medium"}
        elif performance_score >= 0.50:
            return {"grade": "C", "status": "Acceptable", "confidence": "Low"}
        else:
            return {"grade": "D", "status": "Needs Improvement", "confidence": "Low"}

    def _generate_key_insights(self, summary: Dict, results: List) -> List[Dict[str, Any]]:
        """Generate key insights from evaluation data."""
        insights = []

        success_rate = summary.get("success_rate", 0)
        avg_latency = summary.get("avg_latency_s", 0)
        groundedness = summary.get("avg_groundedness_score", 1.0)
        citation_accuracy = summary.get("avg_citation_accuracy", 0)

        # System reliability insight
        if success_rate == 1.0:
            insights.append(
                {
                    "type": "strength",
                    "category": "reliability",
                    "title": "Perfect System Reliability",
                    "description": "100% of evaluation queries completed successfully with no system failures.",
                    "impact": "high",
                    "confidence": 1.0,
                }
            )
        elif success_rate >= 0.95:
            insights.append(
                {
                    "type": "strength",
                    "category": "reliability",
                    "title": "Excellent System Reliability",
                    "description": (
                        f"System achieved {success_rate * 100:.1f}% success rate, "
                        "exceeding industry standards."
                    ),
                    "impact": "medium",
                    "confidence": 0.9,
                }
            )
        else:
            insights.append(
                {
                    "type": "concern",
                    "category": "reliability",
                    "title": "System Reliability Issues",
                    "description": (
                        f"Success rate of {success_rate * 100:.1f}% indicates "
                        "reliability concerns requiring attention."
                    ),
                    "impact": "high",
                    "confidence": 0.8,
                }
            )

        # Response time insight
        if avg_latency <= 3:
            insights.append(
                {
                    "type": "strength",
                    "category": "performance",
                    "title": "Fast Response Times",
                    "description": f"Average response time of {avg_latency:.1f}s meets user experience expectations.",
                    "impact": "medium",
                    "confidence": 0.9,
                }
            )
        elif avg_latency <= 6:
            insights.append(
                {
                    "type": "opportunity",
                    "category": "performance",
                    "title": "Response Time Optimization Opportunity",
                    "description": (
                        f"Response time of {avg_latency:.1f}s has room for improvement "
                        "to enhance user experience."
                    ),
                    "impact": "medium",
                    "confidence": 0.8,
                }
            )
        else:
            insights.append(
                {
                    "type": "concern",
                    "category": "performance",
                    "title": "Slow Response Times",
                    "description": (
                        f"Average response time of {avg_latency:.1f}s "
                        "significantly impacts user experience."
                    ),
                    "impact": "high",
                    "confidence": 0.9,
                }
            )

        # Content quality insight
        if groundedness >= 0.95:
            insights.append(
                {
                    "type": "strength",
                    "category": "quality",
                    "title": "Exceptional Content Quality",
                    "description": (
                        f"Content groundedness of {groundedness * 100:.1f}% indicates "
                        "highly accurate, fact-based responses."
                    ),
                    "impact": "high",
                    "confidence": 1.0,
                }
            )
        elif groundedness >= 0.8:
            insights.append(
                {
                    "type": "strength",
                    "category": "quality",
                    "title": "Good Content Quality",
                    "description": f"Content groundedness of {groundedness * 100:.1f}% shows reliable factual accuracy.",
                    "impact": "medium",
                    "confidence": 0.8,
                }
            )
        else:
            insights.append(
                {
                    "type": "concern",
                    "category": "quality",
                    "title": "Content Quality Issues",
                    "description": (
                        f"Groundedness score of {groundedness * 100:.1f}% indicates "
                        "potential factual accuracy problems."
                    ),
                    "impact": "high",
                    "confidence": 0.9,
                }
            )

        # Citation quality insight
        if citation_accuracy >= 0.8:
            insights.append(
                {
                    "type": "strength",
                    "category": "attribution",
                    "title": "Excellent Source Attribution",
                    "description": f"Citation accuracy of {citation_accuracy * 100:.1f}% provides strong source transparency.",
                    "impact": "medium",
                    "confidence": 0.9,
                }
            )
        elif citation_accuracy >= 0.5:
            insights.append(
                {
                    "type": "opportunity",
                    "category": "attribution",
                    "title": "Citation Accuracy Improvement Needed",
                    "description": f"Citation accuracy of {citation_accuracy * 100:.1f}% has significant room for improvement.",
                    "impact": "medium",
                    "confidence": 0.8,
                }
            )
        else:
            insights.append(
                {
                    "type": "concern",
                    "category": "attribution",
                    "title": "Poor Source Attribution",
                    "description": (
                        f"Citation accuracy of {citation_accuracy * 100:.1f}% is "
                        "critically low and needs immediate attention."
                    ),
                    "impact": "high",
                    "confidence": 0.95,
                }
            )

        return insights

    def _generate_recommendations(self, summary: Dict, results: List) -> List[Dict[str, Any]]:
        """Generate actionable recommendations."""
        recommendations = []

        citation_accuracy = summary.get("avg_citation_accuracy", 0)
        avg_latency = summary.get("avg_latency_s", 0)

        # Citation improvement recommendation
        if citation_accuracy < 0.5:
            recommendations.append(
                {
                    "priority": "high",
                    "category": "attribution",
                    "title": "Implement Enhanced Citation Matching",
                    "description": "Develop improved algorithms for matching generated content to source documents.",
                    "estimated_effort": "2-3 weeks",
                    "expected_impact": "80% improvement in citation accuracy",
                    "implementation_steps": [
                        "Analyze current citation extraction patterns",
                        "Implement fuzzy matching for source attribution",
                        "Add semantic similarity scoring for citations",
                        "Test and validate improved citation logic",
                    ],
                }
            )

        # Performance optimization recommendation
        if avg_latency > 4:
            recommendations.append(
                {
                    "priority": "medium",
                    "category": "performance",
                    "title": "Optimize Response Time Performance",
                    "description": "Implement caching and optimization strategies to reduce average response time.",
                    "estimated_effort": "3-4 weeks",
                    "expected_impact": "40% reduction in response time",
                    "implementation_steps": [
                        "Implement query result caching",
                        "Optimize vector search performance",
                        "Consider parallel processing for document retrieval",
                        "Profile and optimize LLM integration",
                    ],
                }
            )

        # Monitoring recommendation (always relevant)
        recommendations.append(
            {
                "priority": "medium",
                "category": "monitoring",
                "title": "Enhance Real-time Monitoring",
                "description": "Implement comprehensive monitoring and alerting for proactive system management.",
                "estimated_effort": "1-2 weeks",
                "expected_impact": "Improved system reliability and faster issue detection",
                "implementation_steps": [
                    "Set up performance threshold alerting",
                    "Implement quality degradation detection",
                    "Add user experience monitoring",
                    "Create automated reporting dashboards",
                ],
            }
        )

        return recommendations

    def _assess_risks(self, summary: Dict, results: List) -> List[Dict[str, Any]]:
        """Assess potential risks and their mitigation strategies."""
        risks = []

        citation_accuracy = summary.get("avg_citation_accuracy", 0)
        avg_latency = summary.get("avg_latency_s", 0)
        success_rate = summary.get("success_rate", 1.0)

        # Citation accuracy risk
        if citation_accuracy < 0.3:
            risks.append(
                {
                    "risk_level": "high",
                    "category": "compliance",
                    "title": "Poor Source Attribution Risk",
                    "description": "Low citation accuracy may impact user trust and regulatory compliance.",
                    "probability": "high",
                    "impact": "high",
                    "mitigation": "Prioritize citation algorithm improvements and manual review processes.",
                }
            )

        # Performance risk
        if avg_latency > 8:
            risks.append(
                {
                    "risk_level": "medium",
                    "category": "user_experience",
                    "title": "User Experience Degradation Risk",
                    "description": "Slow response times may lead to user abandonment and reduced adoption.",
                    "probability": "medium",
                    "impact": "medium",
                    "mitigation": "Implement performance optimization and caching strategies.",
                }
            )

        # Reliability risk
        if success_rate < 0.9:
            risks.append(
                {
                    "risk_level": "high",
                    "category": "system_reliability",
                    "title": "System Reliability Risk",
                    "description": "System failures impact user confidence and business continuity.",
                    "probability": "medium",
                    "impact": "high",
                    "mitigation": "Improve error handling, implement circuit breakers, and enhance monitoring.",
                }
            )

        return risks

    def _generate_next_actions(self, summary: Dict, results: List) -> List[Dict[str, Any]]:
        """Generate specific next actions with timelines."""
        actions = []

        citation_accuracy = summary.get("avg_citation_accuracy", 0)
        avg_latency = summary.get("avg_latency_s", 0)

        # Immediate actions (1-2 weeks)
        if citation_accuracy < 0.2:
            actions.append(
                {
                    "timeline": "immediate",
                    "priority": "critical",
                    "action": "Investigate Citation Algorithm Failure",
                    "owner": "Engineering Team",
                    "deliverable": "Root cause analysis and emergency fix for citation matching",
                }
            )

        # Short-term actions (2-4 weeks)
        if citation_accuracy < 0.6:
            actions.append(
                {
                    "timeline": "short_term",
                    "priority": "high",
                    "action": "Redesign Citation Matching System",
                    "owner": "Engineering Team",
                    "deliverable": "Enhanced citation algorithm with >80% accuracy",
                }
            )

        if avg_latency > 6:
            actions.append(
                {
                    "timeline": "short_term",
                    "priority": "high",
                    "action": "Implement Response Time Optimization",
                    "owner": "Engineering Team",
                    "deliverable": "Performance improvements achieving <4s average response time",
                }
            )

        # Medium-term actions (1-3 months)
        actions.append(
            {
                "timeline": "medium_term",
                "priority": "medium",
                "action": "Enhance Evaluation Framework",
                "owner": "Engineering Team",
                "deliverable": "Automated quality monitoring and regression detection system",
            }
        )

        return actions

    def generate_markdown_summary(self) -> str:
        """Generate markdown executive summary."""
        exec_summary = self.generate_executive_summary()

        if "error" in exec_summary:
            return f"# Evaluation Summary\n\nError: {exec_summary['error']}"

        system_perf = exec_summary["system_performance"]
        key_metrics = exec_summary["key_metrics"]

        # Pre-compute status icons for the KPI table
        reliability_status = "✅" if "100" in key_metrics["system_reliability"] else "⚠️"
        latency_status = "✅" if float(key_metrics["average_response_time"][:-1]) <= 3 else "⚠️"
        accuracy_status = "✅" if "100" in key_metrics["content_accuracy"] else "⚠️"
        attribution_status = "✅" if float(key_metrics["source_attribution"][:-1]) >= 80 else "❌"

        markdown = f"""# RAG System Evaluation - Executive Summary

## Overall Assessment

**System Grade:** {system_perf['overall_grade']} ({system_perf['status']})
**Performance Score:** {system_perf['performance_score']}/1.0
**Evaluation Date:** {exec_summary['evaluation_date'][:10]}

## Key Performance Indicators

| Metric | Value | Status |
|--------|-------|--------|
| Questions Evaluated | {key_metrics['questions_evaluated']} | ✅ Complete |
| System Reliability | {key_metrics['system_reliability']} | {reliability_status} |
| Average Response Time | {key_metrics['average_response_time']} | {latency_status} |
| Content Accuracy | {key_metrics['content_accuracy']} | {accuracy_status} |
| Source Attribution | {key_metrics['source_attribution']} | {attribution_status} |

## Key Insights

"""

        # Add insights by category
        insights = exec_summary["key_insights"]
        for insight in insights:
            icon = "✅" if insight["type"] == "strength" else "⚠️" if insight["type"] == "opportunity" else "❌"
            markdown += f"### {icon} {insight['title']}\n{insight['description']}\n\n"

        markdown += "## Priority Recommendations\n\n"

        # Add top recommendations
        recommendations = exec_summary["recommendations"][:3]  # Top 3
        for i, rec in enumerate(recommendations, 1):
            priority_icon = "🔴" if rec["priority"] == "high" else "🟡" if rec["priority"] == "medium" else "🟢"
            markdown += f"### {i}. {priority_icon} {rec['title']}\n"
            markdown += f"**Effort:** {rec['estimated_effort']} | **Impact:** {rec['expected_impact']}\n\n"
            markdown += f"{rec['description']}\n\n"

        markdown += "## Risk Assessment\n\n"

        # Add critical risks
        risks = exec_summary["risk_assessment"]
        for risk in risks:
            risk_icon = "🔴" if risk["risk_level"] == "high" else "🟡"
            markdown += f"### {risk_icon} {risk['title']}\n"
            markdown += f"**Impact:** {risk['impact']} | **Probability:** {risk['probability']}\n\n"
            markdown += f"{risk['description']}\n\n"
            markdown += f"**Mitigation:** {risk['mitigation']}\n\n"

        return markdown


def main():
    """Generate and display executive summary."""
    results_file = "/Users/sethmcknight/Developer/msse-ai-engineering/evaluation/enhanced_results.json"

    if not os.path.exists(results_file):
        print(f"Results file not found: {results_file}")
        return

    print("Generating executive summary...")
    generator = EvaluationSummaryGenerator(results_file)
    exec_summary = generator.generate_executive_summary()

    if "error" in exec_summary:
        print(f"❌ Error: {exec_summary['error']}")
        return

    # Save executive summary
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = f"/Users/sethmcknight/Developer/msse-ai-engineering/evaluation/executive_summary_{timestamp}.json"

    with open(summary_file, "w") as f:
        json.dump(exec_summary, f, indent=2)

    # Generate markdown version
    markdown_summary = generator.generate_markdown_summary()
    markdown_file = summary_file.replace(".json", ".md")

    with open(markdown_file, "w") as f:
        f.write(markdown_summary)

    print(f"Executive summary saved: {summary_file}")
    print(f"Markdown summary saved: {markdown_file}")

    # Display key findings
    print(f"\n{'=' * 60}")
    print("🎯 EXECUTIVE SUMMARY")
    print(f"{'=' * 60}")

    # Get system performance from exec_summary
    system_performance = exec_summary.get("system_performance", {})
    print(
        f"Overall Grade: {system_performance.get('overall_grade', 'N/A')} "
        f"({system_performance.get('status', 'Unknown')})"
    )
    print(f"Performance Score: {system_performance.get('performance_score', 0)}/1.0")
    print(f"Confidence Level: {system_performance.get('confidence', 'N/A')}")

    print("\nKEY METRICS:")
    for metric, value in exec_summary["key_metrics"].items():
        print(f"  • {metric.replace('_', ' ').title()}: {value}")

    print("\nTOP INSIGHTS:")
    for insight in exec_summary["key_insights"][:3]:
        icon = "✅" if insight["type"] == "strength" else "⚠️" if insight["type"] == "opportunity" else "❌"
        print(f"  {icon} {insight['title']}")

    print("\nPRIORITY ACTIONS:")
    for action in exec_summary["next_actions"][:3]:
        print(f"  • {action['action']} ({action['timeline']})")

    print("\n✅ Executive summary complete!")


if __name__ == "__main__":
    main()