# SCoDA: coda/agents/visual_evaluator.py
"""
Visual Evaluator Agent for CoDA.
Assesses generated visualizations across multiple quality dimensions
using multimodal LLM capabilities to analyze the output image.
"""
import logging
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, Field

from coda.core.base_agent import AgentContext, BaseAgent
from coda.core.llm import LLMProvider
from coda.core.memory import SharedMemory

logger = logging.getLogger(__name__)


class QualityScores(BaseModel):
    """Quality scores for different dimensions."""

    overall: float = Field(ge=0, le=10, description="Overall quality score")
    readability: float = Field(ge=0, le=10, description="How easy to read and understand")
    accuracy: float = Field(ge=0, le=10, description="How accurately it represents the data")
    aesthetics: float = Field(ge=0, le=10, description="Visual appeal and design quality")
    layout: float = Field(ge=0, le=10, description="Layout and spacing quality")
    correctness: float = Field(ge=0, le=10, description="Technical correctness")


class VisualEvaluation(BaseModel):
    """Structured output from the Visual Evaluator."""

    scores: QualityScores = Field(default_factory=lambda: QualityScores(
        overall=5.0, readability=5.0, accuracy=5.0, aesthetics=5.0, layout=5.0, correctness=5.0
    ))
    strengths: list[str] = Field(default_factory=list)
    issues: list[str] = Field(default_factory=list)
    priority_fixes: list[str] = Field(default_factory=list)
    todo_completion: dict[str, bool] = Field(default_factory=dict)
    recommendations: list[str] = Field(default_factory=list)
    passes_threshold: bool = Field(default=False)
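
# Note: the field defaults above describe a neutral "midpoint" evaluation (all
# scores at 5.0, empty lists, passes_threshold False). The same neutral shape is
# reused by the fallback path in VisualEvaluatorAgent.execute() when the LLM
# call or response parsing fails, so downstream agents always receive a
# well-formed evaluation object.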


class VisualEvaluatorAgent(BaseAgent[VisualEvaluation]):
    """
    Evaluates visualization quality using multimodal analysis.

    Analyzes the output image against the original requirements
    and provides detailed feedback for iterative refinement.
    """

    MEMORY_KEY = "visual_evaluation"
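
    # Shared-memory interaction, as implemented below: execute() reads
    # "execution_result" (and, via _build_evaluation_prompt, "query_analysis",
    # "visual_mapping", and "design_spec") from SharedMemory, stores its result
    # with _store_result(), and exposes MEMORY_KEY through _get_output_key().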

    def __init__(
        self,
        llm: LLMProvider,
        memory: SharedMemory,
        min_overall_score: float = 7.0,
        name: Optional[str] = None,
    ) -> None:
        super().__init__(llm, memory, name or "VisualEvaluator")
        self._min_score = min_overall_score

    def execute(self, context: AgentContext) -> VisualEvaluation:
        """Execute visual evaluation using the vision model."""
        logger.info(f"[{self._name}] Evaluating visualization quality")

        execution_result = self._get_from_memory("execution_result")
        if not execution_result or not execution_result.get("success"):
            return VisualEvaluation(
                scores=QualityScores(
                    overall=0, readability=0, accuracy=0,
                    aesthetics=0, layout=0, correctness=0
                ),
                strengths=[],
                issues=["Visualization generation failed"],
                priority_fixes=["Fix code execution errors"],
                todo_completion={},
                recommendations=["Debug and fix code errors first"],
                passes_threshold=False,
            )

        output_file = execution_result.get("output_file")
        if not output_file or not Path(output_file).exists():
            return VisualEvaluation(
                scores=QualityScores(
                    overall=0, readability=0, accuracy=0,
                    aesthetics=0, layout=0, correctness=0
                ),
                strengths=[],
                issues=["Output file not found"],
                priority_fixes=["Ensure code saves output correctly"],
                todo_completion={},
                recommendations=["Check savefig call in code"],
                passes_threshold=False,
            )

        prompt = self._build_evaluation_prompt(context)
        system_prompt = self._get_system_prompt()

        try:
            response = self._llm.complete_with_image(
                prompt=prompt,
                image_path=output_file,
                system_prompt=system_prompt,
            )
            result = self._parse_response(response.content)
            self._store_result(result)
            logger.info(
                f"[{self._name}] Evaluation complete: "
                f"overall={result.scores.overall}, passes={result.passes_threshold}"
            )
            return result
        except Exception as e:
            logger.error(f"[{self._name}] Evaluation failed: {e}")
            # Return a neutral fallback evaluation instead of crashing. The
            # failure may come from the LLM call itself or from response parsing.
            fallback = VisualEvaluation(
                scores=QualityScores(
                    overall=5.0, readability=5.0, accuracy=5.0,
                    aesthetics=5.0, layout=5.0, correctness=5.0
                ),
                strengths=[],
                issues=[f"Evaluation could not be completed: {e}"],
                priority_fixes=[],
                todo_completion={},
                recommendations=[],
                passes_threshold=False,
            )
            self._store_result(fallback)
            return fallback

    def _get_system_prompt(self) -> str:
        return """You are a Visualization Quality Evaluator specialist.
Your expertise is in assessing data visualizations for quality, effectiveness, and adherence to best practices.

Evaluate visualizations on these dimensions:
1. Readability: Clear labels, appropriate font sizes, uncluttered design
2. Accuracy: Correct representation of data, appropriate scales
3. Aesthetics: Visual appeal, harmonious colors, professional appearance
4. Layout: Good use of space, proper alignment, balanced composition
5. Correctness: Technically correct chart type, proper axis handling

Be rigorous but fair in your assessment. Provide specific, actionable feedback.
Always respond with a valid JSON object matching the required schema."""

    def _build_evaluation_prompt(self, context: AgentContext) -> str:
        query_analysis = self._get_from_memory("query_analysis") or {}
        visual_mapping = self._get_from_memory("visual_mapping") or {}
        design_spec = self._get_from_memory("design_spec") or {}
        todo_list = query_analysis.get("todo_list", [])

        return f"""Evaluate this visualization against the original requirements.

Original Query: {context.query}

Requirements:
- Visualization Type: {visual_mapping.get('chart_type', 'Unknown')}
- Goals: {visual_mapping.get('visualization_goals', [])}
- TODO Items: {todo_list}

Design Specifications:
- Color Scheme: {design_spec.get('color_scheme', {})}
- Success Indicators: {design_spec.get('success_indicators', [])}

Evaluate the visualization image and provide a JSON response with:
- scores: {{
    "overall": 0-10,
    "readability": 0-10,
    "accuracy": 0-10,
    "aesthetics": 0-10,
    "layout": 0-10,
    "correctness": 0-10
  }}
- strengths: List of positive aspects
- issues: List of problems found
- priority_fixes: Most important fixes (max 3)
- todo_completion: {{"todo_item": true/false}} for each TODO
- recommendations: Improvement suggestions
- passes_threshold: true if overall >= {self._min_score}

JSON Response:"""
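
    # Illustrative only: a response of roughly this shape is what the prompt above
    # asks the vision model to produce and what _parse_response() below expects.
    # The concrete values are hypothetical, not output from any real model:
    #
    #   {
    #     "scores": {"overall": 7.5, "readability": 8, "accuracy": 8, "aesthetics": 7,
    #                "layout": 6.5, "correctness": 8},
    #     "strengths": ["Clear title and axis labels"],
    #     "issues": ["Legend overlaps the last data series"],
    #     "priority_fixes": ["Move the legend outside the plot area"],
    #     "todo_completion": {"plot monthly totals": true},
    #     "recommendations": ["Use a colorblind-safe palette"],
    #     "passes_threshold": true
    #   }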

    def _build_prompt(self, context: AgentContext) -> str:
        return self._build_evaluation_prompt(context)

    def _parse_response(self, response: str) -> VisualEvaluation:
        data = self._extract_json(response)

        # Ensure scores exists and is properly formatted
        scores_data = data.get("scores", {})
        if isinstance(scores_data, dict):
            # Ensure all required fields have defaults
            scores_data.setdefault("overall", 5.0)
            scores_data.setdefault("readability", 5.0)
            scores_data.setdefault("accuracy", 5.0)
            scores_data.setdefault("aesthetics", 5.0)
            scores_data.setdefault("layout", 5.0)
            scores_data.setdefault("correctness", 5.0)
            data["scores"] = QualityScores(**scores_data)

        # Ensure list fields are lists
        for field in ["strengths", "issues", "priority_fixes", "recommendations"]:
            if field not in data or not isinstance(data[field], list):
                data[field] = [data[field]] if isinstance(data.get(field), str) else []

        # Ensure todo_completion is a dict
        if not isinstance(data.get("todo_completion"), dict):
            data["todo_completion"] = {}

        # Calculate passes_threshold if not provided
        if "passes_threshold" not in data:
            scores_obj = data.get("scores")
            if isinstance(scores_obj, QualityScores):
                data["passes_threshold"] = scores_obj.overall >= self._min_score
            else:
                data["passes_threshold"] = False

        return VisualEvaluation(**data)

    def _get_output_key(self) -> str:
        return self.MEMORY_KEY
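

# Minimal, non-authoritative sketch: it exercises only the pydantic models
# defined in this file, so it can run without an LLM backend. Wiring a real
# LLMProvider and SharedMemory (and calling VisualEvaluatorAgent.execute) is
# project-specific and not shown; the values below are invented for illustration.
if __name__ == "__main__":
    demo = VisualEvaluation(
        scores=QualityScores(
            overall=8.0, readability=8.5, accuracy=9.0,
            aesthetics=7.0, layout=7.5, correctness=9.0,
        ),
        strengths=["Readable labels", "Appropriate chart type"],
        issues=["Legend partially covers the plot area"],
        priority_fixes=["Move the legend outside the axes"],
        todo_completion={"plot monthly revenue": True},
        recommendations=["Increase tick label font size"],
        passes_threshold=True,
    )
    print(demo)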