Spaces:
Sleeping
Sleeping
| """ | |
| Visual Evaluator Agent for CoDA. | |
| Assesses generated visualizations across multiple quality dimensions | |
| using multimodal LLM capabilities to analyze the output image. | |
| """ | |
| import logging | |
| from pathlib import Path | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| from coda.core.base_agent import AgentContext, BaseAgent | |
| from coda.core.llm import LLMProvider | |
| from coda.core.memory import SharedMemory | |
| logger = logging.getLogger(__name__) | |
class QualityScores(BaseModel):
    """Quality scores for different dimensions.

    Every dimension is a float validated by pydantic to the inclusive
    range [0, 10] via ``Field(ge=0, le=10)``; out-of-range values raise
    a ValidationError at construction time.
    """

    # Holistic judgment across all dimensions; drives pass/fail gating.
    overall: float = Field(ge=0, le=10, description="Overall quality score")
    readability: float = Field(ge=0, le=10, description="How easy to read and understand")
    accuracy: float = Field(ge=0, le=10, description="How accurately it represents the data")
    aesthetics: float = Field(ge=0, le=10, description="Visual appeal and design quality")
    layout: float = Field(ge=0, le=10, description="Layout and spacing quality")
    correctness: float = Field(ge=0, le=10, description="Technical correctness")
class VisualEvaluation(BaseModel):
    """Structured output from the Visual Evaluator."""

    # Until a real evaluation is parsed, every dimension defaults to the
    # neutral mid-scale value 5.0.
    scores: QualityScores = Field(
        default_factory=lambda: QualityScores(
            **{
                dim: 5.0
                for dim in (
                    "overall",
                    "readability",
                    "accuracy",
                    "aesthetics",
                    "layout",
                    "correctness",
                )
            }
        )
    )
    # Free-text findings from the evaluator, each empty by default.
    strengths: list[str] = Field(default_factory=list)
    issues: list[str] = Field(default_factory=list)
    priority_fixes: list[str] = Field(default_factory=list)
    # Maps each TODO item to whether the visualization satisfied it.
    todo_completion: dict[str, bool] = Field(default_factory=dict)
    recommendations: list[str] = Field(default_factory=list)
    # True only when the overall score clears the configured threshold.
    passes_threshold: bool = Field(default=False)
class VisualEvaluatorAgent(BaseAgent[VisualEvaluation]):
    """
    Evaluates visualization quality using multimodal analysis.

    Analyzes the output image against the original requirements
    and provides detailed feedback for iterative refinement.
    """

    # Shared-memory key under which this agent stores its result.
    MEMORY_KEY = "visual_evaluation"

    # Score dimensions expected in the LLM's JSON reply; used to backfill
    # any dimension the model omitted with a neutral 5.0.
    _SCORE_DIMENSIONS = (
        "overall", "readability", "accuracy", "aesthetics", "layout", "correctness",
    )

    def __init__(
        self,
        llm: LLMProvider,
        memory: SharedMemory,
        min_overall_score: float = 7.0,
        name: Optional[str] = None,
    ) -> None:
        """
        Args:
            llm: Provider used for multimodal (image + text) completion;
                must expose ``complete_with_image``.
            memory: Shared memory holding upstream agent outputs.
            min_overall_score: Overall score (0-10) required for the
                evaluation to pass the quality threshold.
            name: Optional display name; defaults to "VisualEvaluator".
        """
        super().__init__(llm, memory, name or "VisualEvaluator")
        self._min_score = min_overall_score

    @staticmethod
    def _zero_score_evaluation(
        issue: str, fix: str, recommendation: str
    ) -> VisualEvaluation:
        """Build an all-zero failing evaluation for pre-analysis failures.

        Used when the visualization never rendered (execution failed or the
        output file is missing), so there is nothing to score.
        """
        return VisualEvaluation(
            scores=QualityScores(
                overall=0, readability=0, accuracy=0,
                aesthetics=0, layout=0, correctness=0,
            ),
            strengths=[],
            issues=[issue],
            priority_fixes=[fix],
            todo_completion={},
            recommendations=[recommendation],
            passes_threshold=False,
        )

    def execute(self, context: AgentContext) -> VisualEvaluation:
        """Execute visual evaluation using the vision model.

        Reads the upstream "execution_result" from shared memory, sends the
        rendered image plus requirement context to the multimodal LLM, and
        parses the structured reply. Never raises: rendering failures yield
        an all-zero evaluation and LLM/parsing failures yield a neutral
        fallback, so the refinement loop can always continue.
        """
        logger.info("[%s] Evaluating visualization quality", self._name)
        execution_result = self._get_from_memory("execution_result")
        if not execution_result or not execution_result.get("success"):
            return self._zero_score_evaluation(
                issue="Visualization generation failed",
                fix="Fix code execution errors",
                recommendation="Debug and fix code errors first",
            )
        output_file = execution_result.get("output_file")
        if not output_file or not Path(output_file).exists():
            return self._zero_score_evaluation(
                issue="Output file not found",
                fix="Ensure code saves output correctly",
                recommendation="Check savefig call in code",
            )
        prompt = self._build_evaluation_prompt(context)
        system_prompt = self._get_system_prompt()
        try:
            response = self._llm.complete_with_image(
                prompt=prompt,
                image_path=output_file,
                system_prompt=system_prompt,
            )
            result = self._parse_response(response.content)
            self._store_result(result)
            logger.info(
                "[%s] Evaluation complete: overall=%s, passes=%s",
                self._name, result.scores.overall, result.passes_threshold,
            )
            return result
        except Exception as e:
            # Return a fallback evaluation instead of crashing; use
            # logger.exception so the traceback is preserved in the logs.
            logger.exception("[%s] Evaluation failed: %s", self._name, e)
            fallback = VisualEvaluation(
                scores=QualityScores(
                    overall=5.0, readability=5.0, accuracy=5.0,
                    aesthetics=5.0, layout=5.0, correctness=5.0,
                ),
                strengths=["Backup evaluation (parsing failed)"],
                issues=[f"Evaluation parsing error: {str(e)}"],
                priority_fixes=[],
                todo_completion={},
                recommendations=[],
                passes_threshold=False,
            )
            self._store_result(fallback)
            return fallback

    def _get_system_prompt(self) -> str:
        """Return the static system prompt defining the evaluator persona."""
        return """You are a Visualization Quality Evaluator specialist.
Your expertise is in assessing data visualizations for quality, effectiveness, and adherence to best practices.
Evaluate visualizations on these dimensions:
1. Readability: Clear labels, appropriate font sizes, uncluttered design
2. Accuracy: Correct representation of data, appropriate scales
3. Aesthetics: Visual appeal, harmonious colors, professional appearance
4. Layout: Good use of space, proper alignment, balanced composition
5. Correctness: Technically correct chart type, proper axis handling
Be rigorous but fair in your assessment. Provide specific, actionable feedback.
Always respond with a valid JSON object matching the required schema."""

    def _build_evaluation_prompt(self, context: AgentContext) -> str:
        """Assemble the user prompt from upstream agent outputs in memory.

        Missing upstream results degrade gracefully to empty dicts/lists so
        evaluation still proceeds with whatever context is available.
        """
        query_analysis = self._get_from_memory("query_analysis") or {}
        visual_mapping = self._get_from_memory("visual_mapping") or {}
        design_spec = self._get_from_memory("design_spec") or {}
        todo_list = query_analysis.get("todo_list", [])
        return f"""Evaluate this visualization against the original requirements.
Original Query: {context.query}
Requirements:
- Visualization Type: {visual_mapping.get('chart_type', 'Unknown')}
- Goals: {visual_mapping.get('visualization_goals', [])}
- TODO Items: {todo_list}
Design Specifications:
- Color Scheme: {design_spec.get('color_scheme', {})}
- Success Indicators: {design_spec.get('success_indicators', [])}
Evaluate the visualization image and provide a JSON response with:
- scores: {{
"overall": 0-10,
"readability": 0-10,
"accuracy": 0-10,
"aesthetics": 0-10,
"layout": 0-10,
"correctness": 0-10
}}
- strengths: List of positive aspects
- issues: List of problems found
- priority_fixes: Most important fixes (max 3)
- todo_completion: {{"todo_item": true/false}} for each TODO
- recommendations: Improvement suggestions
- passes_threshold: true if overall >= {self._min_score}
JSON Response:"""

    def _build_prompt(self, context: AgentContext) -> str:
        """BaseAgent hook; delegates to the evaluation prompt builder."""
        return self._build_evaluation_prompt(context)

    def _parse_response(self, response: str) -> VisualEvaluation:
        """Parse the model's JSON reply into a VisualEvaluation.

        Repairs malformed fields in place rather than raising: missing or
        non-dict "scores" become neutral 5.0 values, bare strings become
        one-element lists, and passes_threshold is derived from the overall
        score when the model omits it.
        """
        data = self._extract_json(response)
        scores_data = data.get("scores")
        if not isinstance(scores_data, dict):
            # Fix: previously a non-dict "scores" (e.g. a list) was left in
            # data, so VisualEvaluation(**data) raised and the whole
            # otherwise-valid response was discarded via the caller's
            # exception fallback. Repair to neutral defaults instead.
            scores_data = {}
        for dim in self._SCORE_DIMENSIONS:
            scores_data.setdefault(dim, 5.0)
        data["scores"] = QualityScores(**scores_data)
        # Coerce list-valued fields: accept a bare string as a 1-element
        # list; anything else malformed becomes an empty list.
        for key in ("strengths", "issues", "priority_fixes", "recommendations"):
            if not isinstance(data.get(key), list):
                data[key] = [data[key]] if isinstance(data.get(key), str) else []
        if not isinstance(data.get("todo_completion"), dict):
            data["todo_completion"] = {}
        # Derive the pass/fail flag from the overall score when omitted.
        if "passes_threshold" not in data:
            data["passes_threshold"] = data["scores"].overall >= self._min_score
        return VisualEvaluation(**data)

    def _get_output_key(self) -> str:
        """BaseAgent hook: key used by _store_result for shared memory."""
        return self.MEMORY_KEY