#!/usr/bin/env python3
"""
Live LLM Agent Evaluations with DeepEval
==========================================
Tests the 4 AI agents with REAL HuggingFace API calls + DeepEval metrics.
Unlike test_agent_evals.py (mock), this hits live LLMs and evaluates output quality.
WHAT THIS TESTS:
- Does the LLM return valid JSON? (not just our parser)
- Is the brand identification sensible for known colors?
- Does the benchmark advisor pick a relevant system?
- Are priority fixes ranked by actual impact?
- Does NEXUS reference all 3 upstream agents?
- Are self-evaluation confidence scores honest?
REQUIRES:
- HF_TOKEN env var set (HuggingFace Pro $9/month)
- pip install deepeval (optional β falls back to manual assertions)
RUN:
# With DeepEval dashboard:
deepeval test run tests/test_agent_evals_live.py -v
# With plain pytest:
pytest tests/test_agent_evals_live.py -v -s --timeout=120
# Skip if no HF_TOKEN:
pytest tests/test_agent_evals_live.py -v -k "not live"
COST: ~$0.003 per full run (4 agent calls)
TIME: ~30s sequential, ~10s with parallelized agents
"""
import asyncio
import json
import os
import sys

import pytest
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Skip all tests if no HF_TOKEN
HF_TOKEN = os.getenv("HF_TOKEN", "")
SKIP_REASON = "HF_TOKEN not set – skipping live LLM evals (set HF_TOKEN to run)"
pytestmark = pytest.mark.skipif(not HF_TOKEN, reason=SKIP_REASON)
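
# pytestmark applies module-wide: without HF_TOKEN the whole file is skipped
# as a group rather than erroring on the first live call.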

from agents.llm_agents import (
    BrandIdentifierAgent,
    BenchmarkAdvisorAgent,
    BestPracticesValidatorAgent,
    HeadSynthesizerAgent,
    BrandIdentification,
    BenchmarkAdvice,
    BestPracticesResult,
    HeadSynthesis,
)

# Try importing DeepEval (optional dependency). Note: evaluation params live
# in deepeval.test_case as LLMTestCaseParams, not in deepeval.metrics.
try:
    from deepeval import assert_test
    from deepeval.test_case import LLMTestCase, LLMTestCaseParams
    from deepeval.metrics import GEval

    HAS_DEEPEVAL = True
except ImportError:
    HAS_DEEPEVAL = False

# =============================================================================
# LIVE HF CLIENT
# =============================================================================

def get_live_client():
    """Get the real HF inference client."""
    from core.hf_inference import get_inference_client
    return get_inference_client()

# =============================================================================
# REALISTIC TEST DATA (simulates a real website extraction)
# =============================================================================

# Simulates tokens extracted from a SaaS dashboard website
LIVE_COLOR_TOKENS = {
    "primary-button": {"value": "#2563eb", "frequency": 45, "context": "buttons, links, CTAs"},
    "secondary-button": {"value": "#7c3aed", "frequency": 18, "context": "secondary actions"},
    "success": {"value": "#16a34a", "frequency": 12, "context": "success states, badges"},
    "warning": {"value": "#eab308", "frequency": 8, "context": "warnings, alerts"},
    "error": {"value": "#dc2626", "frequency": 6, "context": "error states"},
    "text-primary": {"value": "#111827", "frequency": 200, "context": "headings, body text"},
    "text-secondary": {"value": "#6b7280", "frequency": 150, "context": "secondary text, labels"},
    "text-muted": {"value": "#9ca3af", "frequency": 80, "context": "placeholders, disabled"},
    "bg-white": {"value": "#ffffff", "frequency": 300, "context": "page background"},
    "bg-gray-50": {"value": "#f9fafb", "frequency": 100, "context": "card backgrounds"},
    "bg-gray-100": {"value": "#f3f4f6", "frequency": 60, "context": "section backgrounds"},
    "border": {"value": "#e5e7eb", "frequency": 90, "context": "borders, dividers"},
    "light-accent": {"value": "#bfdbfe", "frequency": 15, "context": "highlights, selected"},
}

LIVE_SEMANTIC_ANALYSIS = {
    "brand": [
        {"hex": "#2563eb", "name": "primary-button", "context": "buttons, links, CTAs"},
        {"hex": "#7c3aed", "name": "secondary-button", "context": "secondary actions"},
    ],
    "text": [
        {"hex": "#111827", "name": "text-primary"},
        {"hex": "#6b7280", "name": "text-secondary"},
    ],
    "status": [
        {"hex": "#16a34a", "name": "success"},
        {"hex": "#dc2626", "name": "error"},
    ],
}

# Mock benchmark comparison objects (same structure as real pipeline)
class _BenchmarkSystem:
    def __init__(self, name, icon, scale_ratio, base_size, spacing_base, best_for):
        self.name = name
        self.icon = icon
        self.typography = {"scale_ratio": scale_ratio, "base_size": base_size}
        self.spacing = {"base": spacing_base}
        self.best_for = best_for


class _BenchmarkComparison:
    def __init__(self, benchmark, similarity_score, overall_match_pct, type_ratio_diff, base_size_diff, spacing_grid_diff):
        self.benchmark = benchmark
        self.similarity_score = similarity_score
        self.overall_match_pct = overall_match_pct
        self.type_ratio_diff = type_ratio_diff
        self.base_size_diff = base_size_diff
        self.spacing_grid_diff = spacing_grid_diff

LIVE_BENCHMARK_COMPARISONS = [
    _BenchmarkComparison(
        benchmark=_BenchmarkSystem("Shopify Polaris", "🟢", 1.2, 16, 4, ["e-commerce", "admin"]),
        similarity_score=0.15, overall_match_pct=85, type_ratio_diff=0.05, base_size_diff=0, spacing_grid_diff=0,
    ),
    _BenchmarkComparison(
        benchmark=_BenchmarkSystem("Material Design 3", "🔵", 1.25, 16, 8, ["mobile", "web"]),
        similarity_score=0.20, overall_match_pct=80, type_ratio_diff=0.1, base_size_diff=0, spacing_grid_diff=4,
    ),
    _BenchmarkComparison(
        benchmark=_BenchmarkSystem("Atlassian Design System", "🔷", 1.143, 14, 8, ["enterprise", "tools"]),
        similarity_score=0.25, overall_match_pct=75, type_ratio_diff=0.007, base_size_diff=2, spacing_grid_diff=4,
    ),
]
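
# Fixture self-check (illustrative, not part of the original suite): in these
# mocks overall_match_pct is just (1 - similarity_score) * 100, so the two
# fields can't silently drift apart when someone edits one of them.
assert all(
    abs(c.overall_match_pct - (1 - c.similarity_score) * 100) < 1e-6
    for c in LIVE_BENCHMARK_COMPARISONS
)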

# Mock RuleEngineResults (realistic values)
class _MockTypography:
    detected_ratio = 1.15
    base_size = 16.0
    sizes_px = [12, 14, 16, 18, 20, 24, 30, 36, 48]
    is_consistent = False
    variance = 0.18
    scale_name = "Major Second"
    closest_standard_ratio = 1.125
    recommendation = 1.25
    recommendation_name = "Major Third"

    def to_dict(self):
        return {"detected_ratio": self.detected_ratio, "base_size": self.base_size, "sizes_px": self.sizes_px}

class _MockSpacing:
    detected_base = 4
    is_aligned = True
    alignment_percentage = 92.0
    misaligned_values = [6, 10]
    recommendation = 4
    recommendation_reason = "4px grid with 92% alignment"
    current_values = [4, 8, 12, 16, 20, 24, 32, 48, 64]
    suggested_scale = [0, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64]

    def to_dict(self):
        return {"detected_base": self.detected_base, "alignment_percentage": self.alignment_percentage}

class _MockColorStats:
    total_count = 42
    unique_count = 13
    duplicate_count = 29
    gray_count = 5
    saturated_count = 5
    near_duplicates = [("#f3f4f6", "#f9fafb", 0.02)]
    hue_distribution = {"blue": 3, "purple": 1, "green": 1, "red": 1, "yellow": 1, "gray": 6}

    def to_dict(self):
        return {"total": self.total_count, "unique": self.unique_count}

class _MockAccessibility:
    def __init__(self, hex_color, name, passes, contrast_white, fix=None, fix_contrast=None):
        self.hex_color = hex_color
        self.name = name
        self.passes_aa_normal = passes
        self.contrast_on_white = contrast_white
        # WCAG contrast ratios against white and black multiply to 21
        # (1.05 / 0.05), so the black-background ratio is the quotient,
        # not 21 minus the white-background ratio.
        self.contrast_on_black = 21.0 / contrast_white
        self.suggested_fix = fix
        self.suggested_fix_contrast = fix_contrast

    def to_dict(self):
        return {"color": self.hex_color, "aa_normal": self.passes_aa_normal}

LIVE_ACCESSIBILITY = [
    _MockAccessibility("#2563eb", "primary-button", True, 4.68),
    _MockAccessibility("#7c3aed", "secondary-button", True, 5.32),
    _MockAccessibility("#9ca3af", "text-muted", False, 2.85, "#6b7280", 4.56),
    _MockAccessibility("#eab308", "warning", False, 2.09, "#a16207", 4.52),
    _MockAccessibility("#bfdbfe", "light-accent", False, 1.51, "#3b82f6", 4.68),
]

class MockRuleEngineResults:
    typography = _MockTypography()
    spacing = _MockSpacing()
    color_stats = _MockColorStats()
    accessibility = LIVE_ACCESSIBILITY
    aa_failures = 3
    consistency_score = 68

    def to_dict(self):
        return {
            "typography": self.typography.to_dict(),
            "spacing": self.spacing.to_dict(),
            "color_stats": self.color_stats.to_dict(),
            "accessibility": [a.to_dict() for a in self.accessibility],
            "summary": {"aa_failures": self.aa_failures, "consistency_score": self.consistency_score},
        }
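
# Fixture self-checks (illustrative additions): keep the mock's summary
# numbers in sync with its detail lists so the agents never receive
# contradictory inputs.
assert _MockColorStats.unique_count + _MockColorStats.duplicate_count == _MockColorStats.total_count
assert sum(1 for a in LIVE_ACCESSIBILITY if not a.passes_aa_normal) == MockRuleEngineResults.aa_failures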

# =============================================================================
# HELPER: Run async in pytest
# =============================================================================

def run_async(coro):
    """Run an async function in a sync context."""
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
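
# The module docstring estimates ~10s with parallelized agents. A minimal
# sketch of that fan-out, assuming each agent's .analyze() is a coroutine as
# used in the fixtures below (which stay sequential for readability):
def run_upstream_parallel(client):
    """Run the 3 independent upstream agents concurrently (illustrative)."""
    async def _gather():
        # The agents have no data dependencies on each other, so gather()
        # overlaps their HTTP round-trips instead of awaiting them one by one.
        return await asyncio.gather(
            BrandIdentifierAgent(client).analyze(
                color_tokens=LIVE_COLOR_TOKENS,
                semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
            ),
            BenchmarkAdvisorAgent(client).analyze(
                user_ratio=1.15, user_base=16, user_spacing=4,
                benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
            ),
            BestPracticesValidatorAgent(client).analyze(
                rule_engine_results=MockRuleEngineResults(),
            ),
        )
    return run_async(_gather())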

# =============================================================================
# LIVE TESTS: AURORA (Brand Identifier)
# =============================================================================

class TestAuroraLive:
    """Live evaluation of AURORA – Brand Identifier agent."""

    @pytest.fixture(scope="class")
    def aurora_result(self):
        client = get_live_client()
        agent = BrandIdentifierAgent(client)
        return run_async(agent.analyze(
            color_tokens=LIVE_COLOR_TOKENS,
            semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
        ))

    def test_returns_brand_identification(self, aurora_result):
        """AURORA returns a BrandIdentification dataclass."""
        assert isinstance(aurora_result, BrandIdentification)

    def test_identifies_primary_as_blue(self, aurora_result):
        """AURORA should identify #2563eb (blue) as brand primary – it has the highest frequency in buttons."""
        bp = aurora_result.brand_primary
        assert isinstance(bp, dict), f"Expected dict, got {type(bp)}"
        color = bp.get("color", "").lower()
        # Should be blue (#2563eb) – the dominant CTA color
        assert color == "#2563eb", f"Expected #2563eb as primary, got {color}"

    def test_confidence_is_high(self, aurora_result):
        """With 45 button usages, confidence should be high."""
        bp = aurora_result.brand_primary
        confidence = bp.get("confidence", "").lower()
        assert confidence in ("high", "very high"), f"Expected high confidence, got '{confidence}'"

    def test_palette_strategy_identified(self, aurora_result):
        """Palette strategy should be identified (blue + purple = near-analogous)."""
        assert aurora_result.palette_strategy != ""
        assert aurora_result.palette_strategy in (
            "analogous", "complementary", "triadic", "monochromatic",
            "split-complementary", "near-analogous", "random",
        )

    def test_cohesion_score_reasonable(self, aurora_result):
        """Cohesion score is 1-10; this palette is decent, so expect 4+."""
        score = aurora_result.cohesion_score
        assert 1 <= score <= 10, f"Cohesion score out of range: {score}"
        assert score >= 4, f"Expected 4+ for a decent SaaS palette, got {score}"

    def test_self_evaluation_present(self, aurora_result):
        """Self-evaluation includes confidence and data_quality."""
        se = aurora_result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se, f"Missing confidence in self_evaluation: {se}"

    def test_json_serializable(self, aurora_result):
        """Output is fully JSON-serializable."""
        d = aurora_result.to_dict()
        json_str = json.dumps(d)
        assert len(json_str) > 50

    def test_deepeval_quality(self, aurora_result):
        """DeepEval G-Eval: Is the brand analysis coherent and useful?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed – run: pip install deepeval")
        test_case = LLMTestCase(
            input="Analyze brand colors: primary-button=#2563eb (45 uses), secondary=#7c3aed (18 uses), 13 total colors",
            actual_output=json.dumps(aurora_result.to_dict(), indent=2),
        )
        coherence_metric = GEval(
            name="Brand Analysis Coherence",
            criteria="The brand analysis should correctly identify the most-used button color as primary, provide a valid palette strategy, and include reasoning that references usage frequency.",
            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [coherence_metric])

# =============================================================================
# LIVE TESTS: ATLAS (Benchmark Advisor)
# =============================================================================

class TestAtlasLive:
    """Live evaluation of ATLAS – Benchmark Advisor agent."""

    @pytest.fixture(scope="class")
    def atlas_result(self):
        client = get_live_client()
        agent = BenchmarkAdvisorAgent(client)
        return run_async(agent.analyze(
            user_ratio=1.15,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
        ))

    def test_returns_benchmark_advice(self, atlas_result):
        assert isinstance(atlas_result, BenchmarkAdvice)

    def test_recommends_known_benchmark(self, atlas_result):
        """Should recommend one of the provided benchmarks."""
        rec = atlas_result.recommended_benchmark.lower()
        assert any(name in rec for name in ["polaris", "material", "atlassian"]), \
            f"Unexpected benchmark: {atlas_result.recommended_benchmark}"

    def test_reasoning_non_empty(self, atlas_result):
        """Reasoning explains WHY this benchmark fits."""
        assert len(atlas_result.reasoning) > 20, \
            f"Reasoning too short: '{atlas_result.reasoning}'"

    def test_alignment_changes_actionable(self, atlas_result):
        """Alignment changes should be a list of specific steps."""
        changes = atlas_result.alignment_changes
        assert isinstance(changes, list)
        assert len(changes) >= 1, "Expected at least 1 alignment change"

    def test_pros_of_alignment_present(self, atlas_result):
        """At least one pro of alignment should be listed."""
        assert isinstance(atlas_result.pros_of_alignment, list)
        assert len(atlas_result.pros_of_alignment) >= 1

    def test_self_evaluation_present(self, atlas_result):
        se = atlas_result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se

    def test_deepeval_quality(self, atlas_result):
        """DeepEval G-Eval: Is the benchmark recommendation well-reasoned?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed")
        test_case = LLMTestCase(
            input="Compare against: Polaris (85%), Material 3 (80%), Atlassian (75%)",
            actual_output=json.dumps(atlas_result.to_dict(), indent=2),
        )
        relevance_metric = GEval(
            name="Benchmark Recommendation Relevance",
            criteria="The recommendation should pick the highest-matching benchmark, explain why structurally, and list concrete alignment changes needed.",
            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [relevance_metric])

# =============================================================================
# LIVE TESTS: SENTINEL (Best Practices Validator)
# =============================================================================

class TestSentinelLive:
    """Live evaluation of SENTINEL – Best Practices Validator agent."""

    @pytest.fixture(scope="class")
    def sentinel_result(self):
        client = get_live_client()
        agent = BestPracticesValidatorAgent(client)
        return run_async(agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        ))

    def test_returns_best_practices_result(self, sentinel_result):
        assert isinstance(sentinel_result, BestPracticesResult)

    def test_score_in_range(self, sentinel_result):
        """Score should be 0-100."""
        assert 0 <= sentinel_result.overall_score <= 100

    def test_score_reflects_failures(self, sentinel_result):
        """With 3 AA failures and an inconsistent type scale, the score should stay below 85."""
        assert sentinel_result.overall_score < 85, \
            f"Score {sentinel_result.overall_score} seems too high for 3 AA failures + inconsistent type"

    def test_priority_fixes_ranked(self, sentinel_result):
        """Priority fixes should exist and be ranked."""
        fixes = sentinel_result.priority_fixes
        assert isinstance(fixes, list)
        assert len(fixes) >= 1, "Expected at least 1 priority fix"
        # First fix should address accessibility (most impactful)
        if isinstance(fixes[0], dict):
            first_issue = str(fixes[0].get("issue", "")).lower()
            # Should mention contrast/accessibility/AA in top fixes
            assert any(kw in first_issue for kw in ("contrast", "aa", "accessib", "color")), \
                f"Top fix doesn't address accessibility: '{first_issue}'"

    def test_checks_cover_key_areas(self, sentinel_result):
        """Checks should cover contrast, type scale, spacing."""
        if sentinel_result.checks:
            check_keys = " ".join(str(k).lower() for k in sentinel_result.checks.keys())
            # At least 2 of these should appear
            areas_found = sum(1 for area in ["contrast", "type", "spacing", "color"]
                              if area in check_keys)
            assert areas_found >= 2, f"Only {areas_found} key areas in checks: {list(sentinel_result.checks.keys())}"

    def test_self_evaluation_present(self, sentinel_result):
        se = sentinel_result.self_evaluation
        assert isinstance(se, dict)

    def test_deepeval_quality(self, sentinel_result):
        """DeepEval G-Eval: Are priority fixes correctly ordered by impact?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed")
        test_case = LLMTestCase(
            input="Rule engine: 3 AA failures, inconsistent type scale (variance=0.18), 4px grid 92% aligned, 13 unique colors",
            actual_output=json.dumps(sentinel_result.to_dict(), indent=2),
        )
        impact_metric = GEval(
            name="Priority Fix Impact Ordering",
            criteria="Accessibility failures should be ranked highest priority since they affect legal compliance and usability. Type scale inconsistency and color consolidation should follow.",
            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [impact_metric])

# =============================================================================
# LIVE TESTS: NEXUS (Head Synthesizer)
# =============================================================================

class TestNexusLive:
    """Live evaluation of NEXUS – Head Synthesizer agent."""

    @pytest.fixture(scope="class")
    def nexus_result(self):
        client = get_live_client()
        # First run the 3 upstream agents
        aurora_agent = BrandIdentifierAgent(client)
        atlas_agent = BenchmarkAdvisorAgent(client)
        sentinel_agent = BestPracticesValidatorAgent(client)
        aurora_result = run_async(aurora_agent.analyze(
            color_tokens=LIVE_COLOR_TOKENS,
            semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
        ))
        atlas_result = run_async(atlas_agent.analyze(
            user_ratio=1.15,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
        ))
        sentinel_result = run_async(sentinel_agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        ))
        # Now run NEXUS with real upstream outputs
        nexus_agent = HeadSynthesizerAgent(client)
        return run_async(nexus_agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
            brand_identification=aurora_result,
            benchmark_advice=atlas_result,
            best_practices=sentinel_result,
        ))

    def test_returns_head_synthesis(self, nexus_result):
        assert isinstance(nexus_result, HeadSynthesis)

    def test_executive_summary_substantial(self, nexus_result):
        """Executive summary should be a meaningful paragraph."""
        assert len(nexus_result.executive_summary) > 50, \
            f"Summary too short ({len(nexus_result.executive_summary)} chars): '{nexus_result.executive_summary}'"

    def test_top_3_actions_present(self, nexus_result):
        """Should provide 3 action items (the assertion tolerates 2 as a floor)."""
        assert isinstance(nexus_result.top_3_actions, list)
        assert len(nexus_result.top_3_actions) >= 2, \
            f"Expected 2+ actions, got {len(nexus_result.top_3_actions)}"

    def test_scores_present(self, nexus_result):
        """Overall scores dict should have key metrics."""
        scores = nexus_result.scores
        assert isinstance(scores, dict)
        assert len(scores) >= 1, "Expected at least 1 score dimension"

    def test_color_recommendations_present(self, nexus_result):
        """Should include color-specific recommendations."""
        recs = nexus_result.color_recommendations
        assert isinstance(recs, list)
        # With 3 AA failures there should usually be color recs, though the
        # list may be empty if NEXUS consolidates them into actions instead

    def test_references_all_agents(self, nexus_result):
        """Full output should reference brand + benchmark + practices."""
        to_dict = json.dumps(nexus_result.to_dict()).lower()
        # NEXUS should incorporate insights from all 3 agents.
        # Check the full output, since the summary alone might be concise.
        has_brand = any(kw in to_dict for kw in ("brand", "primary", "color"))
        has_benchmark = any(kw in to_dict for kw in ("benchmark", "polaris", "material", "system"))
        has_practices = any(kw in to_dict for kw in ("accessibility", "contrast", "score", "fix"))
        assert has_brand, "NEXUS output missing brand analysis references"
        assert has_benchmark, "NEXUS output missing benchmark references"
        assert has_practices, "NEXUS output missing best practices references"

    def test_self_evaluation_present(self, nexus_result):
        se = nexus_result.self_evaluation
        assert isinstance(se, dict)

    def test_json_serializable(self, nexus_result):
        d = nexus_result.to_dict()
        json_str = json.dumps(d)
        assert len(json_str) > 100

    def test_deepeval_synthesis_quality(self, nexus_result):
        """DeepEval G-Eval: Does NEXUS produce a coherent synthesis?"""
        if not HAS_DEEPEVAL:
            pytest.skip("DeepEval not installed")
        test_case = LLMTestCase(
            input="Synthesize: AURORA found blue primary (#2563eb), ATLAS recommends Polaris (85% match), SENTINEL found 3 AA failures, score 68/100",
            actual_output=json.dumps(nexus_result.to_dict(), indent=2),
        )
        synthesis_metric = GEval(
            name="Synthesis Quality",
            criteria="The synthesis should: (1) reference findings from all 3 upstream agents, (2) prioritize actionable recommendations, (3) include an executive summary that a non-technical stakeholder could understand, (4) not contradict upstream agent findings.",
            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
            threshold=0.6,
        )
        assert_test(test_case, [synthesis_metric])

# =============================================================================
# CROSS-AGENT CONSISTENCY TEST
# =============================================================================

class TestCrossAgentConsistency:
    """Tests that verify consistency across all 4 agents."""

    @pytest.fixture(scope="class")
    def all_results(self):
        """Run all 4 agents and return results."""
        client = get_live_client()
        aurora = run_async(BrandIdentifierAgent(client).analyze(
            color_tokens=LIVE_COLOR_TOKENS,
            semantic_analysis=LIVE_SEMANTIC_ANALYSIS,
        ))
        atlas = run_async(BenchmarkAdvisorAgent(client).analyze(
            user_ratio=1.15, user_base=16, user_spacing=4,
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
        ))
        sentinel = run_async(BestPracticesValidatorAgent(client).analyze(
            rule_engine_results=MockRuleEngineResults(),
        ))
        nexus = run_async(HeadSynthesizerAgent(client).synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS,
            brand_identification=aurora,
            benchmark_advice=atlas,
            best_practices=sentinel,
        ))
        return {"aurora": aurora, "atlas": atlas, "sentinel": sentinel, "nexus": nexus}

    def test_all_agents_return_results(self, all_results):
        """All 4 agents should return non-None results."""
        for name, result in all_results.items():
            assert result is not None, f"{name} returned None"

    def test_all_have_self_evaluation(self, all_results):
        """Every agent should include self-evaluation."""
        for name, result in all_results.items():
            se = result.self_evaluation
            assert isinstance(se, dict), f"{name} self_evaluation is not dict: {type(se)}"

    def test_validation_passes(self, all_results):
        """All agent outputs pass schema validation."""
        from core.validation import validate_agent_output
        # all_results is already keyed by agent name, so iterate it directly
        for agent_name, result in all_results.items():
            is_valid, error = validate_agent_output(result, agent_name)
            assert is_valid, f"{agent_name} validation failed: {error}"

    def test_nexus_score_near_sentinel(self, all_results):
        """NEXUS overall score should be within 25 points of the SENTINEL score."""
        sentinel_score = all_results["sentinel"].overall_score
        nexus_scores = all_results["nexus"].scores
        if "overall" in nexus_scores:
            nexus_score = nexus_scores["overall"]
            diff = abs(nexus_score - sentinel_score)
            assert diff <= 25, \
                f"NEXUS ({nexus_score}) and SENTINEL ({sentinel_score}) scores differ by {diff} – should be within 25"

if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s", "--timeout=120"])