Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| LLM Agent Evaluation Tests | |
| ============================ | |
| Evaluates the 4 named AI agents using mock HF client responses. | |
| Tests schema compliance, output correctness, and consistency. | |
| Attempts to import DeepEval (availability is recorded in HAS_DEEPEVAL); all checks in this file use plain assertions. | |
| Run: pytest tests/test_agent_evals.py -v | |
| """ | |
| import asyncio | |
| import json | |
| import os | |
| import sys | |
| from dataclasses import asdict | |
| from typing import Optional | |
| import pytest | |
| # Add parent directory to path | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from agents.llm_agents import ( | |
| BrandIdentifierAgent, | |
| BenchmarkAdvisorAgent, | |
| BestPracticesValidatorAgent, | |
| HeadSynthesizerAgent, | |
| BrandIdentification, | |
| BenchmarkAdvice, | |
| BestPracticesResult, | |
| HeadSynthesis, | |
| ) | |
| # Try importing DeepEval | |
| try: | |
| from deepeval import assert_test | |
| from deepeval.test_case import LLMTestCase | |
| from deepeval.metrics import JsonSchemaMetric | |
| HAS_DEEPEVAL = True | |
| except ImportError: | |
| HAS_DEEPEVAL = False | |
| # ============================================================================= | |
| # MOCK HF CLIENT | |
| # ============================================================================= | |
| # Canned JSON responses that each agent would return | |
# Canned AURORA (brand identifier) reply, already serialized to JSON text.
AURORA_RESPONSE = json.dumps(
    {
        # Detected brand colors, each with a confidence label and rationale.
        "brand_primary": {
            "color": "#06b2c4",
            "confidence": "high",
            "reasoning": "Used in 33 buttons and 12 CTAs — dominant interactive color",
            "usage_count": 45,
        },
        "brand_secondary": {
            "color": "#c1df1f",
            "confidence": "medium",
            "reasoning": "Used in highlights and badges",
            "usage_count": 23,
        },
        # No accent color in this fixture; serializes to JSON null.
        "brand_accent": None,
        # Palette-level assessment.
        "palette_strategy": "complementary",
        "cohesion_score": 6,
        "cohesion_notes": "Primary and secondary are near-complementary on the color wheel. Reasonable coherence but accent is missing.",
        # Hex value -> semantic token name.
        "semantic_names": {
            "#06b2c4": "brand.primary",
            "#c1df1f": "brand.secondary",
            "#1a1a1a": "text.primary",
            "#666666": "text.secondary",
        },
        # The agent's own confidence report.
        "self_evaluation": {
            "confidence": 8,
            "reasoning": "Clear dominant primary from button usage. Secondary less certain.",
            "data_quality": "good",
            "flags": [],
        },
    }
)
# Canned ATLAS (benchmark advisor) reply, already serialized to JSON text.
ATLAS_RESPONSE = json.dumps(
    {
        # Recommended design system and the rationale for picking it.
        "recommended_benchmark": "shopify_polaris",
        "recommended_benchmark_name": "Shopify Polaris",
        "reasoning": "87% structural match. Polaris uses similar type scale and spacing grid approach.",
        # Concrete changes needed to align with the recommended benchmark.
        "alignment_changes": [
            {
                "change": "Adopt 1.25 Major Third type scale",
                "from": "1.18 random",
                "to": "1.25",
                "effort": "low",
            },
            {
                "change": "Standardize to 4px spacing grid",
                "from": "mixed",
                "to": "4px",
                "effort": "medium",
            },
        ],
        "pros_of_alignment": [
            "Industry-standard component patterns",
            "Strong accessibility built-in",
        ],
        "cons_of_alignment": [
            "May feel generic without customization",
        ],
        # Runner-up benchmarks.
        "alternative_benchmarks": [
            {
                "name": "Material Design 3",
                "reason": "77% match, stronger theming support",
            },
            {
                "name": "Atlassian Design System",
                "reason": "76% match, similar enterprise focus",
            },
        ],
        # The agent's own confidence report.
        "self_evaluation": {
            "confidence": 7,
            "reasoning": "Good structural match but benchmark comparison limited to 8 systems",
            "data_quality": "good",
            "flags": [],
        },
    }
)
# Canned SENTINEL (best practices validator) reply, already serialized to JSON text.
SENTINEL_RESPONSE = json.dumps(
    {
        "overall_score": 62,
        # Per-practice verdicts: status plus a short note.
        "checks": {
            "color_contrast": {
                "status": "fail",
                "note": "67 AA failures including brand primary",
            },
            "type_scale": {
                "status": "warn",
                "note": "Near-consistent but not standard ratio",
            },
            "spacing_grid": {
                "status": "pass",
                "note": "4px grid detected with 85% alignment",
            },
            "color_count": {
                "status": "warn",
                "note": "143 unique colors — recommend consolidation to ~20",
            },
        },
        # Fixes listed in ascending rank; rank 1 comes first.
        "priority_fixes": [
            {
                "rank": 1,
                "issue": "Brand primary fails AA contrast",
                "impact": "high",
                "effort": "low",
                "action": "Darken #06b2c4 to #048391",
            },
            {
                "rank": 2,
                "issue": "143 colors too many",
                "impact": "medium",
                "effort": "medium",
                "action": "Consolidate to semantic palette",
            },
            {
                "rank": 3,
                "issue": "Type scale inconsistent",
                "impact": "medium",
                "effort": "low",
                "action": "Adopt 1.25 Major Third",
            },
        ],
        "passing_practices": ["spacing_grid", "font_family_consistency"],
        "failing_practices": ["color_contrast", "color_count"],
        # The agent's own confidence report.
        "self_evaluation": {
            "confidence": 8,
            "reasoning": "Rule engine data is clear. Priority ordering based on impact analysis.",
            "data_quality": "good",
            "flags": [],
        },
    }
)
# Canned NEXUS (head synthesizer) reply, already serialized to JSON text.
NEXUS_RESPONSE = json.dumps(
    {
        "executive_summary": "Design system shows strong structural foundation (4px grid, consistent typography) but needs critical accessibility fixes. Brand primary #06b2c4 fails AA — recommend darkened variant. 87% aligned to Polaris.",
        # Headline scores.
        "scores": {
            "overall": 62,
            "accessibility": 45,
            "consistency": 72,
            "organization": 68,
        },
        # Closest benchmark and how to get closer.
        "benchmark_fit": {
            "closest": "Shopify Polaris",
            "similarity": 87,
            "recommendation": "Align type scale and consolidate colors for 95%+ match",
        },
        # Brand color summary.
        "brand_analysis": {
            "primary": "#06b2c4",
            "secondary": "#c1df1f",
            "cohesion": 6,
        },
        # The three highest-value follow-ups.
        "top_3_actions": [
            {
                "action": "Fix brand primary contrast",
                "impact": "high",
                "effort": "low",
                "details": "Darken to #048391 for AA 4.5:1",
            },
            {
                "action": "Consolidate color palette",
                "impact": "medium",
                "effort": "medium",
                "details": "Reduce 143 → ~20 semantic colors",
            },
            {
                "action": "Standardize type scale",
                "impact": "medium",
                "effort": "low",
                "details": "Adopt 1.25 Major Third ratio",
            },
        ],
        "color_recommendations": [
            {
                "role": "brand-primary",
                "current": "#06b2c4",
                "suggested": "#048391",
                "reason": "AA compliance",
                "accept": True,
            },
        ],
        "type_scale_recommendation": {
            "current_ratio": 1.18,
            "recommended_ratio": 1.25,
            "name": "Major Third",
        },
        "spacing_recommendation": {
            "current_base": 4,
            "recommended_base": 8,
            "reason": "Simpler system with fewer decisions",
        },
        # The agent's own confidence report.
        "self_evaluation": {
            "confidence": 8,
            "reasoning": "Strong data from rule engine and all 3 agents. Minor disagreement on spacing resolved by averaging.",
            "data_quality": "good",
            "flags": [],
        },
    }
)
class MockHFClient:
    """Stand-in for the HF Inference client that replays a canned reply per agent."""

    # Agent name -> pre-serialized JSON reply. Both best-practices keys map to
    # the same SENTINEL payload.
    AGENT_RESPONSES = {
        "brand_identifier": AURORA_RESPONSE,
        "benchmark_advisor": ATLAS_RESPONSE,
        "best_practices": SENTINEL_RESPONSE,
        "best_practices_validator": SENTINEL_RESPONSE,
        "head_synthesizer": NEXUS_RESPONSE,
    }

    async def complete_async(
        self,
        agent_name: str,
        system_prompt: str,
        user_message: str,
        max_tokens: int = 2000,
        json_mode: bool = True,
    ) -> str:
        """Look up the canned reply for ``agent_name``.

        Only ``agent_name`` is consulted; the prompt, token limit and JSON-mode
        arguments are accepted and ignored. An unregistered agent name yields
        the empty JSON object ``"{}"``.
        """
        try:
            return self.AGENT_RESPONSES[agent_name]
        except KeyError:
            return "{}"
| # ============================================================================= | |
| # TEST DATA | |
| # ============================================================================= | |
# Color tokens fed to AURORA: token name -> hex value, usage frequency, usage context.
MOCK_COLOR_TOKENS = {
    token: {"value": hex_value, "frequency": frequency, "context": context}
    for token, hex_value, frequency, context in (
        ("brand-primary", "#06b2c4", 45, "buttons, links"),
        ("brand-secondary", "#c1df1f", 23, "highlights"),
        ("text-primary", "#1a1a1a", 120, "headings, body"),
        ("text-secondary", "#666666", 80, "captions"),
        ("background", "#ffffff", 200, "page background"),
    )
}

# Semantic grouping fed to AURORA: category -> list of {hex, name} entries.
MOCK_SEMANTIC_ANALYSIS = {
    "brand": [
        {"hex": "#06b2c4", "name": "brand-primary"},
    ],
    "text": [
        {"hex": "#1a1a1a", "name": "text-primary"},
    ],
}
class MockBenchmarkSystem:
    """Mock of a benchmark design system (the object exposed as ``c.benchmark``)."""

    def __init__(self, name, icon, scale_ratio, base_size, spacing_base, best_for):
        """Store the identity fields and fold the numeric settings into dicts."""
        self.name, self.icon = name, icon
        # Typography and spacing are exposed as plain dicts, not attributes.
        self.typography = dict(scale_ratio=scale_ratio, base_size=base_size)
        self.spacing = dict(base=spacing_base)
        self.best_for = best_for
class MockBenchmarkComparison:
    """Mock of one benchmark comparison row (the shape ATLAS._format_comparisons expects)."""

    def __init__(
        self,
        benchmark,
        similarity_score,
        overall_match_pct,
        type_ratio_diff,
        base_size_diff,
        spacing_grid_diff,
    ):
        """Store every argument unchanged on the instance under the same name."""
        # The benchmark being compared against, plus the two headline scores.
        self.benchmark = benchmark
        self.similarity_score, self.overall_match_pct = similarity_score, overall_match_pct
        # Per-dimension differences.
        self.type_ratio_diff = type_ratio_diff
        self.base_size_diff, self.spacing_grid_diff = base_size_diff, spacing_grid_diff
# Three benchmark comparison rows, listed from highest to lowest overall_match_pct.
MOCK_BENCHMARK_COMPARISONS = [
    MockBenchmarkComparison(
        benchmark=MockBenchmarkSystem(
            name="Shopify Polaris",
            icon="🟢",
            scale_ratio=1.25,
            base_size=16,
            spacing_base=4,
            best_for=["e-commerce", "admin"],
        ),
        similarity_score=0.13,
        overall_match_pct=87,
        type_ratio_diff=0.07,
        base_size_diff=0,
        spacing_grid_diff=0,
    ),
    MockBenchmarkComparison(
        benchmark=MockBenchmarkSystem(
            name="Material Design 3",
            icon="🔵",
            scale_ratio=1.25,
            base_size=16,
            spacing_base=8,
            best_for=["mobile", "web"],
        ),
        similarity_score=0.23,
        overall_match_pct=77,
        type_ratio_diff=0.07,
        base_size_diff=0,
        spacing_grid_diff=4,
    ),
    MockBenchmarkComparison(
        benchmark=MockBenchmarkSystem(
            name="Atlassian",
            icon="🔷",
            scale_ratio=1.2,
            base_size=14,
            spacing_base=8,
            best_for=["enterprise", "tools"],
        ),
        similarity_score=0.24,
        overall_match_pct=76,
        type_ratio_diff=0.02,
        base_size_diff=2,
        spacing_grid_diff=4,
    ),
]
| # Mock RuleEngineResults for SENTINEL and NEXUS | |
class MockTypography:
    """Mock typography findings, exposed as class-level attributes."""

    # What was detected.
    detected_ratio = 1.18
    base_size = 16.0
    sizes_px = [12, 14, 16, 18, 22, 28, 36, 48]
    is_consistent = False
    variance = 0.22
    # Nearest named scale.
    scale_name = "Minor Third"
    closest_standard_ratio = 1.2
    # What is recommended instead.
    recommendation = 1.25
    recommendation_name = "Major Third"

    def to_dict(self):
        """Return only the detected ratio and base size as a dict."""
        exported = ("detected_ratio", "base_size")
        return {field: getattr(self, field) for field in exported}
class MockSpacing:
    """Mock spacing findings, exposed as class-level attributes."""

    # What was detected.
    detected_base = 4
    is_aligned = True
    alignment_percentage = 85.0
    misaligned_values = [5, 10]
    # What is recommended instead.
    recommendation = 8
    recommendation_reason = "Simpler grid"
    # Observed values and the proposed replacement scale.
    current_values = [4, 8, 12, 16, 24, 32]
    suggested_scale = [0, 4, 8, 12, 16, 24, 32, 48]

    def to_dict(self):
        """Return only the detected base and alignment percentage as a dict."""
        exported = ("detected_base", "alignment_percentage")
        return {field: getattr(self, field) for field in exported}
class MockColorStats:
    """Mock color statistics, exposed as class-level attributes."""

    # Counts.
    total_count = 160
    unique_count = 143
    duplicate_count = 17
    gray_count = 22
    saturated_count = 45
    # One near-duplicate entry: two hex values followed by a number.
    near_duplicates = [("#06b2c4", "#07b3c5", 0.01)]
    hue_distribution = {"cyan": 5, "gray": 22, "green": 3}

    def to_dict(self):
        """Return the total and unique counts under the keys ``total`` and ``unique``."""
        return dict(total=self.total_count, unique=self.unique_count)
class MockAccessibility:
    """Mock accessibility finding for one color that fails AA."""

    def __init__(self):
        """Populate the fixed brand-primary finding on the instance."""
        # Which color this finding is about.
        self.hex_color, self.name = "#06b2c4", "brand-primary"
        self.passes_aa_normal = False
        # Contrast values on white and on black.
        self.contrast_on_white, self.contrast_on_black = 2.57, 8.18
        # Suggested replacement color and its contrast value.
        self.suggested_fix, self.suggested_fix_contrast = "#048391", 4.5

    def to_dict(self):
        """Return the color and its AA-normal verdict under ``color`` / ``aa_normal``."""
        return dict(color=self.hex_color, aa_normal=self.passes_aa_normal)
class MockRuleEngineResults:
    """Mock rule-engine results bundling the typography, spacing, color and accessibility mocks."""

    # Sub-results are created once at class-definition time, so every
    # instance shares the same objects.
    typography = MockTypography()
    spacing = MockSpacing()
    color_stats = MockColorStats()
    accessibility = [MockAccessibility()]
    # Summary numbers.
    aa_failures = 67
    consistency_score = 52

    def to_dict(self):
        """Return each section's ``to_dict()`` plus a ``summary`` entry.

        The ``accessibility`` list is not included in the result.
        """
        report = {
            section: getattr(self, section).to_dict()
            for section in ("typography", "spacing", "color_stats")
        }
        report["summary"] = {
            "aa_failures": self.aa_failures,
            "consistency_score": self.consistency_score,
        }
        return report
| # ============================================================================= | |
| # SCHEMA COMPLIANCE TESTS | |
| # ============================================================================= | |
class TestAuroraSchemaCompliance:
    """AURORA (Brand Identifier) output schema validation.

    Each test runs ``BrandIdentifierAgent.analyze`` against ``MockHFClient``
    and inspects the returned ``BrandIdentification``.
    """

    # Must be a registered fixture: every test below requests it through its
    # ``agent`` parameter, and pytest only injects names it knows as fixtures.
    @pytest.fixture
    def agent(self):
        """Provide a BrandIdentifierAgent backed by the canned-response client."""
        return BrandIdentifierAgent(MockHFClient())

    # The tests are coroutines; the asyncio marker lets pytest-asyncio await them.
    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """AURORA output has all required BrandIdentification fields."""
        result = await agent.analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )
        assert isinstance(result, BrandIdentification)
        # Required fields present
        assert hasattr(result, "brand_primary")
        assert hasattr(result, "palette_strategy")
        assert hasattr(result, "cohesion_score")
        assert hasattr(result, "self_evaluation")

    @pytest.mark.asyncio
    async def test_brand_primary_detected(self, agent):
        """AURORA correctly identifies brand primary from high-usage color."""
        result = await agent.analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )
        bp = result.brand_primary
        assert isinstance(bp, dict)
        assert bp.get("color") == "#06b2c4"
        assert bp.get("confidence") in ("high", "medium", "low")

    @pytest.mark.asyncio
    async def test_palette_strategy_valid(self, agent):
        """Palette strategy is a recognized value."""
        result = await agent.analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )
        # The empty string is accepted alongside the named strategies.
        valid_strategies = ["complementary", "analogous", "triadic", "monochromatic", "split-complementary", "random", ""]
        assert result.palette_strategy in valid_strategies

    @pytest.mark.asyncio
    async def test_to_dict_serializable(self, agent):
        """Output is JSON-serializable."""
        result = await agent.analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )
        d = result.to_dict()
        json_str = json.dumps(d)
        # Guard against a trivially empty serialization such as "{}".
        assert len(json_str) > 10
class TestAtlasSchemaCompliance:
    """ATLAS (Benchmark Advisor) output schema validation.

    Each test runs ``BenchmarkAdvisorAgent.analyze`` against ``MockHFClient``
    and inspects the returned ``BenchmarkAdvice``.
    """

    # Must be a registered fixture: every test below requests it through its
    # ``agent`` parameter, and pytest only injects names it knows as fixtures.
    @pytest.fixture
    def agent(self):
        """Provide a BenchmarkAdvisorAgent backed by the canned-response client."""
        return BenchmarkAdvisorAgent(MockHFClient())

    # The tests are coroutines; the asyncio marker lets pytest-asyncio await them.
    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """ATLAS output has all required BenchmarkAdvice fields."""
        result = await agent.analyze(
            user_ratio=1.18,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
        )
        assert isinstance(result, BenchmarkAdvice)
        assert hasattr(result, "recommended_benchmark")
        assert hasattr(result, "reasoning")
        assert hasattr(result, "alignment_changes")
        assert hasattr(result, "self_evaluation")

    @pytest.mark.asyncio
    async def test_benchmark_recommended(self, agent):
        """ATLAS recommends a valid benchmark."""
        result = await agent.analyze(
            user_ratio=1.18,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
        )
        assert result.recommended_benchmark != ""
        assert result.reasoning != ""

    @pytest.mark.asyncio
    async def test_alignment_changes_structured(self, agent):
        """Alignment changes are structured dicts."""
        result = await agent.analyze(
            user_ratio=1.18,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
        )
        assert isinstance(result.alignment_changes, list)
        # Only the first entry is inspected, and only when the list is non-empty.
        if result.alignment_changes:
            change = result.alignment_changes[0]
            assert isinstance(change, dict)
            assert "change" in change
class TestSentinelSchemaCompliance:
    """SENTINEL (Best Practices Validator) output schema validation.

    Each test runs ``BestPracticesValidatorAgent.analyze`` against
    ``MockHFClient`` and inspects the returned ``BestPracticesResult``.
    """

    # Must be a registered fixture: every test below requests it through its
    # ``agent`` parameter, and pytest only injects names it knows as fixtures.
    @pytest.fixture
    def agent(self):
        """Provide a BestPracticesValidatorAgent backed by the canned-response client."""
        return BestPracticesValidatorAgent(MockHFClient())

    # The tests are coroutines; the asyncio marker lets pytest-asyncio await them.
    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """SENTINEL output has all required BestPracticesResult fields."""
        result = await agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        )
        assert isinstance(result, BestPracticesResult)
        assert hasattr(result, "overall_score")
        assert hasattr(result, "priority_fixes")
        assert hasattr(result, "self_evaluation")

    @pytest.mark.asyncio
    async def test_score_in_range(self, agent):
        """Overall score is between 0-100."""
        result = await agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        )
        assert 0 <= result.overall_score <= 100

    @pytest.mark.asyncio
    async def test_priority_fixes_ranked(self, agent):
        """Priority fixes are a list whose first entry, when ranked, has rank 1."""
        result = await agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        )
        assert isinstance(result.priority_fixes, list)
        # The rank check applies only when there are at least two fixes and
        # the first one is a dict that carries a "rank" key.
        if len(result.priority_fixes) >= 2:
            # First fix should be highest priority
            first = result.priority_fixes[0]
            if isinstance(first, dict) and "rank" in first:
                assert first["rank"] == 1
class TestNexusSchemaCompliance:
    """NEXUS (Head Synthesizer) output schema validation.

    Each test runs ``HeadSynthesizerAgent.synthesize`` against
    ``MockHFClient`` and inspects the returned ``HeadSynthesis``.
    """

    # Must be a registered fixture: every test below requests it through its
    # ``agent`` parameter, and pytest only injects names it knows as fixtures.
    @pytest.fixture
    def agent(self):
        """Provide a HeadSynthesizerAgent backed by the canned-response client."""
        return HeadSynthesizerAgent(MockHFClient())

    # The tests are coroutines; the asyncio marker lets pytest-asyncio await them.
    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """NEXUS output has all required HeadSynthesis fields."""
        # This test passes partially populated upstream results; the other
        # tests in this class pass default-constructed ones.
        result = await agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
            brand_identification=BrandIdentification(
                brand_primary={"color": "#06b2c4", "confidence": "high"},
                palette_strategy="complementary",
                cohesion_score=6,
            ),
            benchmark_advice=BenchmarkAdvice(
                recommended_benchmark="shopify_polaris",
                reasoning="87% structural match",
            ),
            best_practices=BestPracticesResult(
                overall_score=62,
                priority_fixes=[{"issue": "AA contrast", "impact": "high"}],
            ),
        )
        assert isinstance(result, HeadSynthesis)
        assert hasattr(result, "executive_summary")
        assert hasattr(result, "top_3_actions")
        assert hasattr(result, "scores")
        assert hasattr(result, "self_evaluation")

    @pytest.mark.asyncio
    async def test_executive_summary_non_empty(self, agent):
        """NEXUS produces a non-empty executive summary."""
        result = await agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
            brand_identification=BrandIdentification(),
            benchmark_advice=BenchmarkAdvice(),
            best_practices=BestPracticesResult(),
        )
        assert result.executive_summary != ""

    @pytest.mark.asyncio
    async def test_top_3_actions_present(self, agent):
        """NEXUS provides at least one top action item in a list."""
        result = await agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
            brand_identification=BrandIdentification(),
            benchmark_advice=BenchmarkAdvice(),
            best_practices=BestPracticesResult(),
        )
        assert isinstance(result.top_3_actions, list)
        assert len(result.top_3_actions) >= 1
| # ============================================================================= | |
| # SELF-EVALUATION TESTS | |
| # ============================================================================= | |
class TestSelfEvaluation:
    """All agents should include self_evaluation with confidence scoring.

    Each test builds its own agent around ``MockHFClient`` and checks that the
    result's ``self_evaluation`` is a dict containing a ``confidence`` key.
    """

    # The tests are coroutines; the asyncio marker lets pytest-asyncio await them.
    @pytest.mark.asyncio
    async def test_aurora_self_evaluation(self):
        """AURORA reports both confidence and data_quality."""
        agent = BrandIdentifierAgent(MockHFClient())
        result = await agent.analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )
        se = result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se
        assert "data_quality" in se

    @pytest.mark.asyncio
    async def test_atlas_self_evaluation(self):
        """ATLAS reports confidence."""
        agent = BenchmarkAdvisorAgent(MockHFClient())
        result = await agent.analyze(
            user_ratio=1.18,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
        )
        se = result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se

    @pytest.mark.asyncio
    async def test_sentinel_self_evaluation(self):
        """SENTINEL reports confidence."""
        agent = BestPracticesValidatorAgent(MockHFClient())
        result = await agent.analyze(
            rule_engine_results=MockRuleEngineResults(),
        )
        se = result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se

    @pytest.mark.asyncio
    async def test_nexus_self_evaluation(self):
        """NEXUS reports confidence when given default-constructed upstream results."""
        agent = HeadSynthesizerAgent(MockHFClient())
        result = await agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
            brand_identification=BrandIdentification(),
            benchmark_advice=BenchmarkAdvice(),
            best_practices=BestPracticesResult(),
        )
        se = result.self_evaluation
        assert isinstance(se, dict)
        assert "confidence" in se
| # ============================================================================= | |
| # VALIDATION MODULE TESTS | |
| # ============================================================================= | |
class TestValidationModule:
    """Exercise ``validate_agent_output`` from core/validation.py.

    The validator is imported inside each test rather than at module level.
    """

    def test_validate_aurora_output(self):
        """A payload with the AURORA fields validates."""
        from core.validation import validate_agent_output

        payload = {
            "brand_primary": {"color": "#06b2c4"},
            "palette_strategy": "complementary",
            "cohesion_score": 6,
        }
        ok, _ = validate_agent_output(payload, "aurora")
        assert ok

    def test_validate_aurora_missing_required(self):
        """A payload lacking brand_primary and palette_strategy is rejected with an error."""
        from core.validation import validate_agent_output

        # Missing brand_primary and palette_strategy
        payload = {"cohesion_score": 6}
        ok, problem = validate_agent_output(payload, "aurora")
        assert not ok
        assert problem is not None

    def test_validate_nexus_output(self):
        """A payload with the NEXUS fields validates."""
        from core.validation import validate_agent_output

        payload = {
            "executive_summary": "Test summary",
            "top_3_actions": [{"action": "Fix contrast"}],
            "scores": {"overall": 62},
        }
        ok, _ = validate_agent_output(payload, "nexus")
        assert ok

    def test_validate_unknown_agent_passes(self):
        """An agent name with no schema validates whatever it is given."""
        from core.validation import validate_agent_output

        ok, _ = validate_agent_output({"anything": True}, "unknown_agent")
        # No schema = pass
        assert ok

    def test_validate_dataclass(self):
        """A BrandIdentification instance is accepted as well as a plain dict."""
        from core.validation import validate_agent_output

        candidate = BrandIdentification(
            brand_primary={"color": "#06b2c4"},
            palette_strategy="complementary",
        )
        ok, _ = validate_agent_output(candidate, "aurora")
        assert ok
if __name__ == "__main__":
    # Run this file's tests verbosely and propagate pytest's exit code, so
    # running the file as a script reports failures through its exit status.
    raise SystemExit(pytest.main([__file__, "-v"]))