# Source: Design-System-Automation / tests / test_agent_evals_live.py
# (upstream commit abab3e7 — "v3: ReAct multi-agent architecture with parallel execution")
#!/usr/bin/env python3
"""
Live LLM Agent Evaluations with DeepEval
==========================================
Tests the 4 AI agents with REAL HuggingFace API calls + DeepEval metrics.
Unlike test_agent_evals.py (mock), this hits live LLMs and evaluates output quality.
WHAT THIS TESTS:
- Does the LLM return valid JSON? (not just our parser)
- Is the brand identification sensible for known colors?
- Does the benchmark advisor pick a relevant system?
- Are priority fixes ranked by actual impact?
- Does NEXUS reference all 3 upstream agents?
- Are self-evaluation confidence scores honest?
REQUIRES:
- HF_TOKEN env var set (HuggingFace Pro $9/month)
- pip install deepeval (optional — falls back to manual assertions)
RUN:
# With DeepEval dashboard:
deepeval test run tests/test_agent_evals_live.py -v
# With plain pytest:
pytest tests/test_agent_evals_live.py -v -s --timeout=120
# Skip if no HF_TOKEN:
pytest tests/test_agent_evals_live.py -v -k "not live"
COST: ~$0.003 per full run (4 agent calls)
TIME: ~30s sequential, ~10s with parallelized agents
"""
import asyncio
import json
import os
import sys
from typing import Optional
import pytest
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Skip all tests if no HF_TOKEN
HF_TOKEN = os.getenv("HF_TOKEN", "")
SKIP_REASON = "HF_TOKEN not set — skipping live LLM evals (set HF_TOKEN to run)"
pytestmark = pytest.mark.skipif(not HF_TOKEN, reason=SKIP_REASON)
from agents.llm_agents import (
BrandIdentifierAgent,
BenchmarkAdvisorAgent,
BestPracticesValidatorAgent,
HeadSynthesizerAgent,
BrandIdentification,
BenchmarkAdvice,
BestPracticesResult,
HeadSynthesis,
)
# Try importing DeepEval (optional dependency — tests fall back to plain asserts)
try:
    from deepeval import assert_test
    from deepeval.test_case import LLMTestCase
    from deepeval.metrics import GEval
    # BUG FIX: DeepEval exposes the evaluation-parameter enum as
    # LLMTestCaseParams on deepeval.test_case — there is no
    # deepeval.metrics.g_eval.GEvalParameter module/class, so the old import
    # always raised ImportError and silently disabled every G-Eval test even
    # when DeepEval was installed.  Alias it so the rest of the file can keep
    # using the GEvalParameter name unchanged.
    from deepeval.test_case import LLMTestCaseParams as GEvalParameter
    HAS_DEEPEVAL = True
except ImportError:
    HAS_DEEPEVAL = False
# =============================================================================
# LIVE HF CLIENT
# =============================================================================
def get_live_client():
    """Return the shared HuggingFace inference client used for all live calls."""
    # Imported lazily so merely collecting the tests doesn't require core/.
    from core.hf_inference import get_inference_client
    client = get_inference_client()
    return client
# =============================================================================
# REALISTIC TEST DATA (simulates a real website extraction)
# =============================================================================
# Simulates tokens extracted from a SaaS dashboard website
LIVE_COLOR_TOKENS = {
"primary-button": {"value": "#2563eb", "frequency": 45, "context": "buttons, links, CTAs"},
"secondary-button": {"value": "#7c3aed", "frequency": 18, "context": "secondary actions"},
"success": {"value": "#16a34a", "frequency": 12, "context": "success states, badges"},
"warning": {"value": "#eab308", "frequency": 8, "context": "warnings, alerts"},
"error": {"value": "#dc2626", "frequency": 6, "context": "error states"},
"text-primary": {"value": "#111827", "frequency": 200, "context": "headings, body text"},
"text-secondary": {"value": "#6b7280", "frequency": 150, "context": "secondary text, labels"},
"text-muted": {"value": "#9ca3af", "frequency": 80, "context": "placeholders, disabled"},
"bg-white": {"value": "#ffffff", "frequency": 300, "context": "page background"},
"bg-gray-50": {"value": "#f9fafb", "frequency": 100, "context": "card backgrounds"},
"bg-gray-100": {"value": "#f3f4f6", "frequency": 60, "context": "section backgrounds"},
"border": {"value": "#e5e7eb", "frequency": 90, "context": "borders, dividers"},
"light-accent": {"value": "#bfdbfe", "frequency": 15, "context": "highlights, selected"},
}
LIVE_SEMANTIC_ANALYSIS = {
"brand": [
{"hex": "#2563eb", "name": "primary-button", "context": "buttons, links, CTAs"},
{"hex": "#7c3aed", "name": "secondary-button", "context": "secondary actions"},
],
"text": [
{"hex": "#111827", "name": "text-primary"},
{"hex": "#6b7280", "name": "text-secondary"},
],
"status": [
{"hex": "#16a34a", "name": "success"},
{"hex": "#dc2626", "name": "error"},
],
}
# Mock benchmark comparison objects (same structure as real pipeline)
class _BenchmarkSystem:
def __init__(self, name, icon, scale_ratio, base_size, spacing_base, best_for):
self.name = name
self.icon = icon
self.typography = {"scale_ratio": scale_ratio, "base_size": base_size}
self.spacing = {"base": spacing_base}
self.best_for = best_for
class _BenchmarkComparison:
def __init__(self, benchmark, similarity_score, overall_match_pct, type_ratio_diff, base_size_diff, spacing_grid_diff):
self.benchmark = benchmark
self.similarity_score = similarity_score
self.overall_match_pct = overall_match_pct
self.type_ratio_diff = type_ratio_diff
self.base_size_diff = base_size_diff
self.spacing_grid_diff = spacing_grid_diff
LIVE_BENCHMARK_COMPARISONS = [
_BenchmarkComparison(
benchmark=_BenchmarkSystem("Shopify Polaris", "🟢", 1.2, 16, 4, ["e-commerce", "admin"]),
similarity_score=0.15, overall_match_pct=85, type_ratio_diff=0.05, base_size_diff=0, spacing_grid_diff=0,
),
_BenchmarkComparison(
benchmark=_BenchmarkSystem("Material Design 3", "🔵", 1.25, 16, 8, ["mobile", "web"]),
similarity_score=0.20, overall_match_pct=80, type_ratio_diff=0.1, base_size_diff=0, spacing_grid_diff=4,
),
_BenchmarkComparison(
benchmark=_BenchmarkSystem("Atlassian Design System", "🔷", 1.143, 14, 8, ["enterprise", "tools"]),
similarity_score=0.25, overall_match_pct=75, type_ratio_diff=0.007, base_size_diff=2, spacing_grid_diff=4,
),
]
# Mock RuleEngineResults (realistic values)
class _MockTypography:
detected_ratio = 1.15
base_size = 16.0
sizes_px = [12, 14, 16, 18, 20, 24, 30, 36, 48]
is_consistent = False
variance = 0.18
scale_name = "Major Second"
closest_standard_ratio = 1.125
recommendation = 1.25
recommendation_name = "Major Third"
def to_dict(self):
return {"detected_ratio": self.detected_ratio, "base_size": self.base_size, "sizes_px": self.sizes_px}
class _MockSpacing:
detected_base = 4
is_aligned = True
alignment_percentage = 92.0
misaligned_values = [6, 10]
recommendation = 4
recommendation_reason = "4px grid with 92% alignment"
current_values = [4, 8, 12, 16, 20, 24, 32, 48, 64]
suggested_scale = [0, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64]
def to_dict(self):
return {"detected_base": self.detected_base, "alignment_percentage": self.alignment_percentage}
class _MockColorStats:
total_count = 42
unique_count = 13
duplicate_count = 29
gray_count = 5
saturated_count = 5
near_duplicates = [("#f3f4f6", "#f9fafb", 0.02)]
hue_distribution = {"blue": 3, "purple": 1, "green": 1, "red": 1, "yellow": 1, "gray": 6}
def to_dict(self):
return {"total": self.total_count, "unique": self.unique_count}
class _MockAccessibility:
def __init__(self, hex_color, name, passes, contrast_white, fix=None, fix_contrast=None):
self.hex_color = hex_color
self.name = name
self.passes_aa_normal = passes
self.contrast_on_white = contrast_white
self.contrast_on_black = 21.0 - contrast_white # approximate
self.suggested_fix = fix
self.suggested_fix_contrast = fix_contrast
def to_dict(self):
return {"color": self.hex_color, "aa_normal": self.passes_aa_normal}
LIVE_ACCESSIBILITY = [
_MockAccessibility("#2563eb", "primary-button", True, 4.68),
_MockAccessibility("#7c3aed", "secondary-button", True, 5.32),
_MockAccessibility("#9ca3af", "text-muted", False, 2.85, "#6b7280", 4.56),
_MockAccessibility("#eab308", "warning", False, 2.09, "#a16207", 4.52),
_MockAccessibility("#bfdbfe", "light-accent", False, 1.51, "#3b82f6", 4.68),
]
class MockRuleEngineResults:
    """Aggregate mock mirroring the real rule-engine results interface.

    Class-level attributes are shared across instances — fine here because
    the mocks are read-only fixtures.
    """
    typography = _MockTypography()
    spacing = _MockSpacing()
    color_stats = _MockColorStats()
    accessibility = LIVE_ACCESSIBILITY
    aa_failures = 3
    consistency_score = 68

    def to_dict(self):
        """Serialize all sub-analyses plus a summary block, as the real engine does."""
        payload = {
            "typography": self.typography.to_dict(),
            "spacing": self.spacing.to_dict(),
            "color_stats": self.color_stats.to_dict(),
            "accessibility": [entry.to_dict() for entry in self.accessibility],
        }
        payload["summary"] = {
            "aa_failures": self.aa_failures,
            "consistency_score": self.consistency_score,
        }
        return payload
# =============================================================================
# HELPER: Run async in pytest
# =============================================================================
def run_async(coro):
    """Run *coro* to completion on a fresh event loop and return its result.

    Uses asyncio.run(), the idiomatic replacement for the manual
    new_event_loop()/run_until_complete()/close() dance: it creates a fresh
    loop, runs the coroutine, then tears the loop down (including cancelling
    stray tasks and resetting loop state) even if the coroutine raises.
    Must be called from sync code with no event loop already running.
    """
    return asyncio.run(coro)
# =============================================================================
# LIVE TESTS: AURORA (Brand Identifier)
# =============================================================================
class TestAuroraLive:
    """Live evaluation of AURORA — Brand Identifier agent.

    The class-scoped fixture makes exactly ONE real LLM call; every test in
    this class asserts against that shared result.
    """
    @pytest.fixture(scope="class")
    def aurora_result(self):
        # Single live call, shared across the class (scope="class").
        client = get_live_client()
        agent = BrandIdentifierAgent(client)
        return run_async(agent.analyze(
            color_tokens=LIVE_COLOR_TOKENS,
            semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
        ))

    def test_returns_brand_identification(self, aurora_result):
        """AURORA returns a BrandIdentification dataclass."""
        assert isinstance(aurora_result, BrandIdentification)

    def test_identifies_primary_as_blue(self, aurora_result):
        """AURORA should identify #2563eb (blue) as brand primary — it has highest frequency in buttons."""
        bp = aurora_result.brand_primary
        assert isinstance(bp, dict), f"Expected dict, got {type(bp)}"
        color = bp.get("color", "").lower()
        # Should be blue (#2563eb) — the dominant CTA color
        # NOTE(review): exact-match against a live LLM answer is brittle; this
        # assumes the agent normalizes hex output — confirm in BrandIdentifierAgent.
        assert color == "#2563eb", f"Expected #2563eb as primary, got {color}"

    def test_confidence_is_high(self, aurora_result):
        """With 45 button usages, confidence should be high."""
        bp = aurora_result.brand_primary
        confidence = bp.get("confidence", "").lower()
        assert confidence in ("high", "very high"), f"Expected high confidence, got '{confidence}'"

    def test_palette_strategy_identified(self, aurora_result):
        """Palette strategy should be identified (blue + purple = near-analogous)."""
        assert aurora_result.palette_strategy != ""
        # Closed set of strategy labels the agent may return.
        assert aurora_result.palette_strategy in (
            "analogous", "complementary", "triadic", "monochromatic",
            "split-complementary", "near-analogous", "random",
        )

    def test_cohesion_score_reasonable(self, aurora_result):
        """Cohesion score is 1-10; this palette is decent, so require at least 4."""
        score = aurora_result.cohesion_score
        assert 1 <= score <= 10, f"Cohesion score out of range: {score}"
        assert score >= 4, f"Expected 4+ for a decent SaaS palette, got {score}"

    def test_self_evaluation_present(self, aurora_result):
        """Self-evaluation includes confidence and data_quality."""
        se = aurora_result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se, f"Missing confidence in self_evaluation: {se}"

    def test_json_serializable(self, aurora_result):
        """Output is fully JSON-serializable."""
        d = aurora_result.to_dict()
        json_str = json.dumps(d)
        assert len(json_str) > 50

    def test_deepeval_quality(self, aurora_result):
        """DeepEval G-Eval: Is the brand analysis coherent and useful?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed — run: pip install deepeval")
        test_case = LLMTestCase(
            input=f"Analyze brand colors: primary-button=#2563eb (45 uses), secondary=#7c3aed (18 uses), 13 total colors",
            actual_output=json.dumps(aurora_result.to_dict(), indent=2),
        )
        # An LLM judge scores the serialized output against the criteria;
        # threshold=0.6 is the pass bar.
        coherence_metric = GEval(
            name="Brand Analysis Coherence",
            criteria="The brand analysis should correctly identify the most-used button color as primary, provide a valid palette strategy, and include reasoning that references usage frequency.",
            evaluation_params=[GEvalParameter.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [coherence_metric])
# =============================================================================
# LIVE TESTS: ATLAS (Benchmark Advisor)
# =============================================================================
class TestAtlasLive:
    """Live evaluation of ATLAS — Benchmark Advisor agent.

    One live LLM call (class-scoped fixture), shared by every test below.
    """
    @pytest.fixture(scope="class")
    def atlas_result(self):
        # Single live call; the 1.15 ratio / 16px base / 4px spacing mirror
        # the mock rule-engine values used elsewhere in this module.
        client = get_live_client()
        agent = BenchmarkAdvisorAgent(client)
        return run_async(agent.analyze(
            user_ratio=1.15,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
        ))

    def test_returns_benchmark_advice(self, atlas_result):
        """ATLAS returns a BenchmarkAdvice dataclass."""
        assert isinstance(atlas_result, BenchmarkAdvice)

    def test_recommends_known_benchmark(self, atlas_result):
        """Should recommend one of the provided benchmarks."""
        rec = atlas_result.recommended_benchmark.lower()
        assert any(name in rec for name in ["polaris", "material", "atlassian"]), \
            f"Unexpected benchmark: {atlas_result.recommended_benchmark}"

    def test_reasoning_non_empty(self, atlas_result):
        """Reasoning explains WHY this benchmark fits."""
        assert len(atlas_result.reasoning) > 20, \
            f"Reasoning too short: '{atlas_result.reasoning}'"

    def test_alignment_changes_actionable(self, atlas_result):
        """Alignment changes should be a list of specific steps."""
        changes = atlas_result.alignment_changes
        assert isinstance(changes, list)
        assert len(changes) >= 1, "Expected at least 1 alignment change"

    def test_pros_and_cons_present(self, atlas_result):
        """Both pros and cons should be listed."""
        # NOTE(review): only pros are asserted here; cons are never checked —
        # confirm the cons field name on BenchmarkAdvice before tightening.
        assert isinstance(atlas_result.pros_of_alignment, list)
        assert len(atlas_result.pros_of_alignment) >= 1

    def test_self_evaluation_present(self, atlas_result):
        """Self-evaluation dict with a confidence entry is present."""
        se = atlas_result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se

    def test_deepeval_quality(self, atlas_result):
        """DeepEval G-Eval: Is the benchmark recommendation well-reasoned?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed")
        test_case = LLMTestCase(
            input="Compare against: Polaris (85%), Material 3 (80%), Atlassian (75%)",
            actual_output=json.dumps(atlas_result.to_dict(), indent=2),
        )
        # Judge criteria: picks the best match, explains structurally, lists changes.
        relevance_metric = GEval(
            name="Benchmark Recommendation Relevance",
            criteria="The recommendation should pick the highest-matching benchmark, explain why structurally, and list concrete alignment changes needed.",
            evaluation_params=[GEvalParameter.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [relevance_metric])
# =============================================================================
# LIVE TESTS: SENTINEL (Best Practices Validator)
# =============================================================================
class TestSentinelLive:
    """Live evaluation of SENTINEL — Best Practices Validator agent.

    One live LLM call (class-scoped fixture) over the mock rule-engine
    results (3 AA failures, inconsistent type scale, 92%-aligned 4px grid).
    """
    @pytest.fixture(scope="class")
    def sentinel_result(self):
        # Single live call, shared across the class.
        client = get_live_client()
        agent = BestPracticesValidatorAgent(client)
        return run_async(agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        ))

    def test_returns_best_practices_result(self, sentinel_result):
        """SENTINEL returns a BestPracticesResult dataclass."""
        assert isinstance(sentinel_result, BestPracticesResult)

    def test_score_in_range(self, sentinel_result):
        """Score should be 0-100."""
        assert 0 <= sentinel_result.overall_score <= 100

    def test_score_reflects_failures(self, sentinel_result):
        """With 3 AA failures and inconsistent type scale, score should be < 85."""
        assert sentinel_result.overall_score < 85, \
            f"Score {sentinel_result.overall_score} seems too high for 3 AA failures + inconsistent type"

    def test_priority_fixes_ranked(self, sentinel_result):
        """Priority fixes should exist and be ranked."""
        fixes = sentinel_result.priority_fixes
        assert isinstance(fixes, list)
        assert len(fixes) >= 1, "Expected at least 1 priority fix"
        # First fix should address accessibility (most impactful)
        # The dict check guards against the agent returning plain strings.
        if isinstance(fixes[0], dict):
            first_issue = str(fixes[0].get("issue", "")).lower()
            # Should mention contrast/accessibility/AA in top fixes
            assert any(kw in first_issue for kw in ("contrast", "aa", "accessib", "color")), \
                f"Top fix doesn't address accessibility: '{first_issue}'"

    def test_checks_cover_key_areas(self, sentinel_result):
        """Checks should cover contrast, type scale, spacing."""
        if sentinel_result.checks:
            # Flatten the check names so keyword matching is order-independent.
            check_keys = " ".join(str(k).lower() for k in sentinel_result.checks.keys())
            # At least 2 of these should appear
            areas_found = sum(1 for area in ["contrast", "type", "spacing", "color"]
                              if area in check_keys)
            assert areas_found >= 2, f"Only {areas_found} key areas in checks: {list(sentinel_result.checks.keys())}"

    def test_self_evaluation_present(self, sentinel_result):
        """Self-evaluation dict is present."""
        se = sentinel_result.self_evaluation
        assert isinstance(se, dict)

    def test_deepeval_quality(self, sentinel_result):
        """DeepEval G-Eval: Are priority fixes correctly ordered by impact?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed")
        test_case = LLMTestCase(
            input="Rule engine: 3 AA failures, inconsistent type scale (variance=0.18), 4px grid 92% aligned, 13 unique colors",
            actual_output=json.dumps(sentinel_result.to_dict(), indent=2),
        )
        # Judge criteria: accessibility first, then type scale / color consolidation.
        impact_metric = GEval(
            name="Priority Fix Impact Ordering",
            criteria="Accessibility failures should be ranked highest priority since they affect legal compliance and usability. Type scale inconsistency and color consolidation should follow.",
            evaluation_params=[GEvalParameter.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [impact_metric])
# =============================================================================
# LIVE TESTS: NEXUS (Head Synthesizer)
# =============================================================================
class TestNexusLive:
    """Live evaluation of NEXUS — Head Synthesizer agent.

    The fixture runs the three upstream agents CONCURRENTLY (they are
    independent), then feeds their real outputs to NEXUS — this is the
    "~10s with parallelized agents" path promised by the module docstring;
    the previous sequential version took ~3x as long.
    """
    @pytest.fixture(scope="class")
    def nexus_result(self):
        """Run upstream agents in parallel, then NEXUS, returning its synthesis."""
        client = get_live_client()
        aurora_agent = BrandIdentifierAgent(client)
        atlas_agent = BenchmarkAdvisorAgent(client)
        sentinel_agent = BestPracticesValidatorAgent(client)

        async def _run_upstream():
            # AURORA / ATLAS / SENTINEL have no data dependencies on each
            # other, so run them concurrently on one event loop.
            return await asyncio.gather(
                aurora_agent.analyze(
                    color_tokens=LIVE_COLOR_TOKENS,
                    semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
                ),
                atlas_agent.analyze(
                    user_ratio=1.15,
                    user_base=16,
                    user_spacing=4,
                    benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
                ),
                sentinel_agent.analyze(
                    rule_engine_results=MockRuleEngineResults(),
                ),
            )

        aurora_result, atlas_result, sentinel_result = run_async(_run_upstream())
        # Now run NEXUS with real upstream outputs
        nexus_agent = HeadSynthesizerAgent(client)
        return run_async(nexus_agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
            brand_identification=aurora_result,
            benchmark_advice=atlas_result,
            best_practices=sentinel_result,
        ))

    def test_returns_head_synthesis(self, nexus_result):
        """NEXUS returns a HeadSynthesis dataclass."""
        assert isinstance(nexus_result, HeadSynthesis)

    def test_executive_summary_substantial(self, nexus_result):
        """Executive summary should be a meaningful paragraph."""
        assert len(nexus_result.executive_summary) > 50, \
            f"Summary too short ({len(nexus_result.executive_summary)} chars): '{nexus_result.executive_summary}'"

    def test_top_3_actions_present(self, nexus_result):
        """Should provide 3 action items (assert tolerates 2+)."""
        assert isinstance(nexus_result.top_3_actions, list)
        assert len(nexus_result.top_3_actions) >= 2, \
            f"Expected 2+ actions, got {len(nexus_result.top_3_actions)}"

    def test_scores_present(self, nexus_result):
        """Overall scores dict should have key metrics."""
        scores = nexus_result.scores
        assert isinstance(scores, dict)
        assert len(scores) >= 1, "Expected at least 1 score dimension"

    def test_color_recommendations_present(self, nexus_result):
        """Should include color-specific recommendations."""
        recs = nexus_result.color_recommendations
        assert isinstance(recs, list)
        # With 3 AA failures, should have some color recs
        # (may be empty if NEXUS consolidates into actions instead)

    def test_references_all_agents(self, nexus_result):
        """Full NEXUS output should reference brand + benchmark + practices findings."""
        # Check the full serialized output, not just the summary, since the
        # summary might be concise.
        to_dict = json.dumps(nexus_result.to_dict()).lower()
        has_brand = any(kw in to_dict for kw in ("brand", "primary", "color"))
        has_benchmark = any(kw in to_dict for kw in ("benchmark", "polaris", "material", "system"))
        has_practices = any(kw in to_dict for kw in ("accessibility", "contrast", "score", "fix"))
        assert has_brand, "NEXUS output missing brand analysis references"
        # BUG FIX: has_benchmark was computed but never asserted, contradicting
        # this test's stated intent of covering all 3 upstream agents.
        assert has_benchmark, "NEXUS output missing benchmark references"
        assert has_practices, "NEXUS output missing best practices references"

    def test_self_evaluation_present(self, nexus_result):
        """Self-evaluation dict is present."""
        se = nexus_result.self_evaluation
        assert isinstance(se, dict)

    def test_json_serializable(self, nexus_result):
        """Output is fully JSON-serializable and non-trivial in size."""
        d = nexus_result.to_dict()
        json_str = json.dumps(d)
        assert len(json_str) > 100

    def test_deepeval_synthesis_quality(self, nexus_result):
        """DeepEval G-Eval: Does NEXUS produce a coherent synthesis?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed")
        test_case = LLMTestCase(
            input="Synthesize: AURORA found blue primary (#2563eb), ATLAS recommends Polaris (85% match), SENTINEL found 3 AA failures, score 68/100",
            actual_output=json.dumps(nexus_result.to_dict(), indent=2),
        )
        synthesis_metric = GEval(
            name="Synthesis Quality",
            criteria="The synthesis should: (1) reference findings from all 3 upstream agents, (2) prioritize actionable recommendations, (3) include an executive summary that a non-technical stakeholder could understand, (4) not contradict upstream agent findings.",
            evaluation_params=[GEvalParameter.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [synthesis_metric])
# =============================================================================
# CROSS-AGENT CONSISTENCY TEST
# =============================================================================
class TestCrossAgentConsistency:
    """Tests that verify consistency across all 4 agents.

    The three upstream agents are independent, so the fixture runs them
    concurrently on one event loop (matching the module docstring's
    "parallelized agents" timing) before invoking NEXUS on their outputs.
    """
    @pytest.fixture(scope="class")
    def all_results(self):
        """Run all 4 agents (upstream 3 in parallel) and return their results keyed by name."""
        client = get_live_client()

        async def _run_upstream():
            # No data dependencies between AURORA / ATLAS / SENTINEL.
            return await asyncio.gather(
                BrandIdentifierAgent(client).analyze(
                    color_tokens=LIVE_COLOR_TOKENS,
                    semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
                ),
                BenchmarkAdvisorAgent(client).analyze(
                    user_ratio=1.15, user_base=16, user_spacing=4,
                    benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
                ),
                BestPracticesValidatorAgent(client).analyze(
                    rule_engine_results=MockRuleEngineResults(),
                ),
            )

        aurora, atlas, sentinel = run_async(_run_upstream())
        # NEXUS depends on all three upstream outputs, so it runs after them.
        nexus = run_async(HeadSynthesizerAgent(client).synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
            brand_identification=aurora,
            benchmark_advice=atlas,
            best_practices=sentinel,
        ))
        return {"aurora": aurora, "atlas": atlas, "sentinel": sentinel, "nexus": nexus}

    def test_all_agents_return_results(self, all_results):
        """All 4 agents should return non-None results."""
        for name, result in all_results.items():
            assert result is not None, f"{name} returned None"

    def test_all_have_self_evaluation(self, all_results):
        """Every agent should include self-evaluation."""
        for name, result in all_results.items():
            se = result.self_evaluation
            assert isinstance(se, dict), f"{name} self_evaluation is not dict: {type(se)}"

    def test_validation_passes(self, all_results):
        """All agent outputs pass schema validation."""
        from core.validation import validate_agent_output
        # The fixture's keys already match the validator's agent names, so
        # iterate it directly (the old code built a redundant copy).
        for agent_name, result in all_results.items():
            is_valid, error = validate_agent_output(result, agent_name)
            assert is_valid, f"{agent_name} validation failed: {error}"

    def test_nexus_score_near_sentinel(self, all_results):
        """NEXUS overall score should be within 25 points of SENTINEL score."""
        sentinel_score = all_results["sentinel"].overall_score
        nexus_scores = all_results["nexus"].scores
        # Only comparable when NEXUS reports an "overall" dimension.
        if "overall" in nexus_scores:
            nexus_score = nexus_scores["overall"]
            diff = abs(nexus_score - sentinel_score)
            assert diff <= 25, \
                f"NEXUS ({nexus_score}) and SENTINEL ({sentinel_score}) scores differ by {diff} — should be within 25"
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s", "--timeout=120"])