Spaces:
Sleeping
Sleeping
Upload test_stage2_pipeline.py
Browse files- tests/test_stage2_pipeline.py +662 -0
tests/test_stage2_pipeline.py
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Stage 2 Pipeline Test Script
|
| 4 |
+
============================
|
| 5 |
+
|
| 6 |
+
Tests the new Stage 2 architecture:
|
| 7 |
+
- Layer 1: Rule Engine
|
| 8 |
+
- Layer 2: Benchmark Research
|
| 9 |
+
- Layer 3: LLM Agents
|
| 10 |
+
- Layer 4: HEAD Synthesizer
|
| 11 |
+
|
| 12 |
+
Run: python tests/test_stage2_pipeline.py
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import asyncio
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
from typing import Optional
|
| 21 |
+
|
| 22 |
+
# Add parent directory to path
|
| 23 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# =============================================================================
# TEST DATA - Mock extracted tokens
# =============================================================================

# Typography tokens keyed by semantic role. Values mirror the shape that the
# Stage 1 extractor emits (font_size / font_weight / line_height / font_family),
# following a roughly 1.2x size progression from caption (12px) to heading-1 (48px).
MOCK_TYPOGRAPHY_TOKENS = {
    "heading-1": {"font_size": "48px", "font_weight": "700", "line_height": "1.2", "font_family": "Inter"},
    "heading-2": {"font_size": "36px", "font_weight": "600", "line_height": "1.25", "font_family": "Inter"},
    "heading-3": {"font_size": "28px", "font_weight": "600", "line_height": "1.3", "font_family": "Inter"},
    "heading-4": {"font_size": "22px", "font_weight": "500", "line_height": "1.35", "font_family": "Inter"},
    "body-large": {"font_size": "18px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "body": {"font_size": "16px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "body-small": {"font_size": "14px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "caption": {"font_size": "12px", "font_weight": "400", "line_height": "1.4", "font_family": "Inter"},
}

# Color tokens with a usage frequency count and the UI context they were seen
# in. The final two entries deliberately fail WCAG AA contrast so the
# accessibility analyzer has failures (and suggested fixes) to report.
MOCK_COLOR_TOKENS = {
    "brand-primary": {"value": "#06b2c4", "frequency": 45, "context": "buttons, links"},
    "brand-secondary": {"value": "#c1df1f", "frequency": 23, "context": "highlights, badges"},
    "text-primary": {"value": "#1a1a1a", "frequency": 120, "context": "headings, body"},
    "text-secondary": {"value": "#666666", "frequency": 80, "context": "captions, muted"},
    "text-tertiary": {"value": "#999999", "frequency": 40, "context": "placeholders"},
    "background-primary": {"value": "#ffffff", "frequency": 200, "context": "page background"},
    "background-secondary": {"value": "#f5f5f5", "frequency": 60, "context": "cards, sections"},
    "background-tertiary": {"value": "#e8e8e8", "frequency": 30, "context": "dividers"},
    "border-default": {"value": "#dddddd", "frequency": 50, "context": "borders"},
    "border-focus": {"value": "#06b2c4", "frequency": 15, "context": "focus rings"},
    "success": {"value": "#22c55e", "frequency": 10, "context": "success states"},
    "warning": {"value": "#f59e0b", "frequency": 8, "context": "warning states"},
    "error": {"value": "#ef4444", "frequency": 12, "context": "error states"},
    "info": {"value": "#3b82f6", "frequency": 6, "context": "info states"},
    # Some problematic colors for testing
    "light-cyan": {"value": "#7dd3fc", "frequency": 5, "context": "light accent"},  # Fails AA
    "light-lime": {"value": "#d9f99d", "frequency": 3, "context": "light highlight"},  # Fails AA
}

# Spacing tokens on a mostly 4/8px grid. The two "odd" entries (5px, 10px)
# are intentionally off-grid so the spacing-grid analyzer reports imperfect
# alignment rather than a trivial 100%.
MOCK_SPACING_TOKENS = {
    "space-1": {"value": "4px", "value_px": 4, "frequency": 30},
    "space-2": {"value": "8px", "value_px": 8, "frequency": 80},
    "space-3": {"value": "12px", "value_px": 12, "frequency": 45},
    "space-4": {"value": "16px", "value_px": 16, "frequency": 60},
    "space-5": {"value": "20px", "value_px": 20, "frequency": 25},
    "space-6": {"value": "24px", "value_px": 24, "frequency": 40},
    "space-8": {"value": "32px", "value_px": 32, "frequency": 20},
    "space-10": {"value": "40px", "value_px": 40, "frequency": 15},
    "space-12": {"value": "48px", "value_px": 48, "frequency": 10},
    # Some misaligned values for testing
    "space-odd-1": {"value": "5px", "value_px": 5, "frequency": 3},
    "space-odd-2": {"value": "10px", "value_px": 10, "frequency": 5},
}

# Colors pre-grouped by semantic role (brand / text / background / border /
# feedback), as an upstream semantic-analysis step would produce.
# NOTE(review): not referenced by the tests visible in this file — presumably
# reserved for future assertions; confirm before removing.
MOCK_SEMANTIC_ANALYSIS = {
    "brand": [{"hex": "#06b2c4", "name": "brand-primary"}, {"hex": "#c1df1f", "name": "brand-secondary"}],
    "text": [{"hex": "#1a1a1a", "name": "text-primary"}, {"hex": "#666666", "name": "text-secondary"}],
    "background": [{"hex": "#ffffff", "name": "background-primary"}, {"hex": "#f5f5f5", "name": "background-secondary"}],
    "border": [{"hex": "#dddddd", "name": "border-default"}],
    "feedback": [{"hex": "#22c55e", "name": "success"}, {"hex": "#ef4444", "name": "error"}],
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# =============================================================================
|
| 86 |
+
# TEST HELPERS
|
| 87 |
+
# =============================================================================
|
| 88 |
+
|
| 89 |
+
class TestLogger:
    """In-memory log collector for the test suite.

    Records every message handed to :meth:`log`, in arrival order; when
    constructed with ``verbose=True`` (the default) each message is also
    echoed to stdout.
    """

    def __init__(self, verbose: bool = True):
        # Whether to echo each message to stdout in addition to recording it.
        self.verbose = verbose
        # All messages received so far, oldest first.
        self.logs = []

    def log(self, msg: str):
        """Record *msg*; echo it to stdout when verbose."""
        self.logs.append(msg)
        if not self.verbose:
            return
        print(msg)

    def get_logs(self) -> str:
        """Return every recorded message joined with newlines."""
        return "\n".join(self.logs)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def print_section(title: str):
|
| 106 |
+
"""Print a section header."""
|
| 107 |
+
print("\n" + "=" * 60)
|
| 108 |
+
print(f" {title}")
|
| 109 |
+
print("=" * 60 + "\n")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def print_result(name: str, passed: bool, details: str = ""):
    """Print one pass/fail result line, with an optional indented detail line."""
    # NOTE(review): the status glyphs were mojibake-garbled in the extracted
    # source; reconstructed as the standard pass/fail emoji — confirm upstream.
    status_icon = "✅" if passed else "❌"
    print(f"  {status_icon} {name}")
    if details:
        print(f"     {details}")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# =============================================================================
|
| 121 |
+
# LAYER 1: RULE ENGINE TESTS
|
| 122 |
+
# =============================================================================
|
| 123 |
+
|
| 124 |
+
def test_rule_engine():
    """Exercise Layer 1 (Rule Engine).

    Runs each analyzer (type scale, accessibility, spacing grid, color
    statistics) against the module-level mock tokens, then the combined
    ``run_rule_engine`` entry point with a quiet TestLogger.

    Returns:
        bool: True when every sub-check passed; False if any failed.
        An import failure short-circuits immediately with False.
    """
    print_section("LAYER 1: RULE ENGINE TESTS")

    all_passed = True

    try:
        from core.rule_engine import (
            run_rule_engine,
            analyze_type_scale,
            analyze_accessibility,
            analyze_spacing_grid,
            analyze_color_statistics,
        )
        print_result("Import rule_engine", True)
    except Exception as e:
        # Nothing below can run without the module, so bail out early.
        print_result("Import rule_engine", False, str(e))
        return False

    # Test Type Scale Analysis
    try:
        typo_result = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)

        assert typo_result.detected_ratio > 0, "Ratio should be positive"
        assert typo_result.closest_standard_ratio > 0, "Standard ratio should be positive"
        assert typo_result.scale_name != "", "Scale name should not be empty"
        assert len(typo_result.sizes_px) > 0, "Should detect sizes"

        print_result(
            "Type Scale Analysis",
            True,
            f"ratio={typo_result.detected_ratio:.3f}, consistent={typo_result.is_consistent}"
        )
    except Exception as e:
        print_result("Type Scale Analysis", False, str(e))
        all_passed = False

    # Test Accessibility Analysis
    try:
        access_result = analyze_accessibility(MOCK_COLOR_TOKENS)

        assert len(access_result) > 0, "Should analyze colors"

        # MOCK_COLOR_TOKENS intentionally contains colors that fail AA,
        # so both partitions should normally be non-empty.
        failures = [a for a in access_result if not a.passes_aa_normal]
        passes = len(access_result) - len(failures)

        # Check that fixes are generated for failures
        fixes_generated = sum(1 for a in failures if a.suggested_fix)

        print_result(
            "Accessibility Analysis",
            True,
            f"total={len(access_result)}, pass={passes}, fail={len(failures)}, fixes={fixes_generated}"
        )
    except Exception as e:
        print_result("Accessibility Analysis", False, str(e))
        all_passed = False

    # Test Spacing Grid Analysis
    try:
        spacing_result = analyze_spacing_grid(MOCK_SPACING_TOKENS)

        assert spacing_result.detected_base > 0, "Base should be positive"
        assert len(spacing_result.current_values) > 0, "Should detect values"
        assert len(spacing_result.suggested_scale) > 0, "Should suggest scale"

        print_result(
            "Spacing Grid Analysis",
            True,
            f"base={spacing_result.detected_base}px, aligned={spacing_result.alignment_percentage:.0f}%"
        )
    except Exception as e:
        print_result("Spacing Grid Analysis", False, str(e))
        all_passed = False

    # Test Color Statistics
    try:
        color_stats = analyze_color_statistics(MOCK_COLOR_TOKENS)

        assert color_stats.total_count > 0, "Should count colors"
        assert color_stats.unique_count > 0, "Should count unique"

        print_result(
            "Color Statistics",
            True,
            f"total={color_stats.total_count}, unique={color_stats.unique_count}, grays={color_stats.gray_count}"
        )
    except Exception as e:
        print_result("Color Statistics", False, str(e))
        all_passed = False

    # Test Full Rule Engine
    try:
        # verbose=False keeps the engine's log output out of the test report;
        # the messages are still captured for the line-count check below.
        logger = TestLogger(verbose=False)

        full_result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS,
            log_callback=logger.log,
        )

        assert full_result.typography is not None
        assert full_result.accessibility is not None
        assert full_result.spacing is not None
        assert full_result.color_stats is not None
        assert 0 <= full_result.consistency_score <= 100

        print_result(
            "Full Rule Engine",
            True,
            f"consistency_score={full_result.consistency_score}, aa_failures={full_result.aa_failures}"
        )

        # Check logs were generated
        # NOTE(review): >10 lines fails only the printed check, not all_passed —
        # presumably intentional as a soft signal; confirm.
        log_lines = len(logger.logs)
        print_result("Log Generation", log_lines > 10, f"{log_lines} log lines")

    except Exception as e:
        print_result("Full Rule Engine", False, str(e))
        all_passed = False

    return all_passed
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# =============================================================================
|
| 250 |
+
# LAYER 2: BENCHMARK RESEARCH TESTS
|
| 251 |
+
# =============================================================================
|
| 252 |
+
|
| 253 |
+
def test_benchmark_research():
    """Exercise Layer 2 (Benchmark Research).

    Validates the static design-system source/fallback tables, the
    BenchmarkCache set/get round-trip, the listing helpers, researcher
    construction without external clients, and the comparison/sorting
    logic against fallback benchmark data.

    Returns:
        bool: True when every sub-check passed; False if any failed.
        An import failure short-circuits immediately with False.
    """
    print_section("LAYER 2: BENCHMARK RESEARCH TESTS")

    all_passed = True

    try:
        from agents.benchmark_researcher import (
            BenchmarkResearcher,
            BenchmarkCache,
            DESIGN_SYSTEM_SOURCES,
            FALLBACK_BENCHMARKS,
            get_available_benchmarks,
            get_benchmark_choices,
        )
        print_result("Import benchmark_researcher", True)
    except Exception as e:
        # Nothing below can run without the module, so bail out early.
        print_result("Import benchmark_researcher", False, str(e))
        return False

    # Test Design System Sources
    try:
        assert len(DESIGN_SYSTEM_SOURCES) >= 6, "Should have at least 6 design systems"

        required_systems = ["material_design_3", "shopify_polaris", "atlassian_design"]
        # BUG FIX: the loop variable was named `sys`, shadowing this module's
        # `import sys` inside the function; renamed to `system_key`.
        # The interpolated messages are unchanged (they render the key string).
        for system_key in required_systems:
            assert system_key in DESIGN_SYSTEM_SOURCES, f"Missing {system_key}"
            assert "urls" in DESIGN_SYSTEM_SOURCES[system_key], f"Missing URLs for {system_key}"
            assert "best_for" in DESIGN_SYSTEM_SOURCES[system_key], f"Missing best_for for {system_key}"

        print_result("Design System Sources", True, f"{len(DESIGN_SYSTEM_SOURCES)} systems defined")
    except Exception as e:
        print_result("Design System Sources", False, str(e))
        all_passed = False

    # Test Fallback Benchmarks
    try:
        assert len(FALLBACK_BENCHMARKS) >= 6, "Should have fallbacks"

        for key, fallback in FALLBACK_BENCHMARKS.items():
            assert "typography" in fallback, f"Missing typography for {key}"
            assert "spacing" in fallback, f"Missing spacing for {key}"
            assert fallback["typography"].get("scale_ratio"), f"Missing scale_ratio for {key}"

        print_result("Fallback Benchmarks", True, f"{len(FALLBACK_BENCHMARKS)} fallbacks defined")
    except Exception as e:
        print_result("Fallback Benchmarks", False, str(e))
        all_passed = False

    # Test Cache
    try:
        cache = BenchmarkCache()

        # Test set/get
        from agents.benchmark_researcher import BenchmarkData
        test_data = BenchmarkData(
            key="test_system",
            name="Test System",
            short_name="Test",
            vendor="Test Vendor",
            icon="🧪",
            typography={"scale_ratio": 1.25, "base_size": 16},
            spacing={"base": 8},
            fetched_at=datetime.now().isoformat(),
            confidence="high",
        )

        cache.set("test_system", test_data)
        retrieved = cache.get("test_system")

        assert retrieved is not None, "Should retrieve cached data"
        assert retrieved.typography.get("scale_ratio") == 1.25, "Data should match"

        print_result("Benchmark Cache", True, "set/get working")
    except Exception as e:
        print_result("Benchmark Cache", False, str(e))
        all_passed = False

    # Test Helper Functions
    try:
        benchmarks = get_available_benchmarks()
        assert len(benchmarks) >= 6, "Should list benchmarks"
        assert all("key" in b and "name" in b for b in benchmarks)

        choices = get_benchmark_choices()
        assert len(choices) >= 6, "Should have choices"
        assert all(isinstance(c, tuple) and len(c) == 2 for c in choices)

        print_result("Helper Functions", True, f"{len(benchmarks)} benchmarks available")
    except Exception as e:
        print_result("Helper Functions", False, str(e))
        all_passed = False

    # Test Researcher Initialization
    try:
        # Both external clients are optional; None exercises the offline path.
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)
        assert researcher.cache is not None

        print_result("Researcher Initialization", True, "initialized without clients")
    except Exception as e:
        print_result("Researcher Initialization", False, str(e))
        all_passed = False

    # Test Comparison Logic (with fallback data)
    try:
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)

        # Create mock benchmark data from the static fallback tables.
        from agents.benchmark_researcher import BenchmarkData
        mock_benchmarks = []
        for key in ["material_design_3", "shopify_polaris", "atlassian_design"]:
            source = DESIGN_SYSTEM_SOURCES[key]
            fallback = FALLBACK_BENCHMARKS[key]
            mock_benchmarks.append(BenchmarkData(
                key=key,
                name=source["name"],
                short_name=source["short_name"],
                vendor=source["vendor"],
                icon=source["icon"],
                typography=fallback["typography"],
                spacing=fallback["spacing"],
                fetched_at=datetime.now().isoformat(),
                confidence="fallback",
                best_for=source["best_for"],
            ))

        comparisons = researcher.compare_to_benchmarks(
            your_ratio=1.18,
            your_base_size=16,
            your_spacing_grid=8,
            benchmarks=mock_benchmarks,
            log_callback=lambda x: None,
        )

        assert len(comparisons) == 3, "Should have 3 comparisons"
        assert comparisons[0].similarity_score <= comparisons[1].similarity_score, "Should be sorted"

        print_result(
            "Comparison Logic",
            True,
            f"closest={comparisons[0].benchmark.short_name}, score={comparisons[0].similarity_score:.2f}"
        )
    except Exception as e:
        print_result("Comparison Logic", False, str(e))
        all_passed = False

    return all_passed
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
# =============================================================================
|
| 403 |
+
# LAYER 3: LLM AGENTS TESTS
|
| 404 |
+
# =============================================================================
|
| 405 |
+
|
| 406 |
+
def test_llm_agents():
    """Exercise Layer 3 (LLM Agents) without making any LLM calls.

    Checks that the result dataclasses round-trip through ``to_dict``, that
    every agent can be constructed with ``hf_client=None``, and that each
    agent class carries a PROMPT_TEMPLATE with the expected placeholders.

    Returns:
        bool: True when every sub-check passed; False if any failed.
        An import failure short-circuits immediately with False.
    """
    print_section("LAYER 3: LLM AGENTS TESTS")

    all_passed = True

    try:
        from agents.llm_agents import (
            BrandIdentifierAgent,
            BenchmarkAdvisorAgent,
            BestPracticesValidatorAgent,
            HeadSynthesizerAgent,
            BrandIdentification,
            BenchmarkAdvice,
            BestPracticesResult,
            HeadSynthesis,
        )
        print_result("Import llm_agents", True)
    except Exception as e:
        # Nothing below can run without the module, so bail out early.
        print_result("Import llm_agents", False, str(e))
        return False

    # Test Data Classes
    try:
        brand = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high"},
            cohesion_score=7,
        )
        assert brand.to_dict()["brand_primary"]["color"] == "#06b2c4"

        advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            reasoning="Best fit for e-commerce",
        )
        assert advice.to_dict()["recommended_benchmark"] == "shopify_polaris"

        practices = BestPracticesResult(
            overall_score=65,
            priority_fixes=[{"issue": "AA compliance", "impact": "high"}],
        )
        assert practices.to_dict()["overall_score"] == 65

        synthesis = HeadSynthesis(
            executive_summary="Test summary",
            scores={"overall": 60},
        )
        assert synthesis.to_dict()["scores"]["overall"] == 60

        print_result("Data Classes", True, "all serializable")
    except Exception as e:
        print_result("Data Classes", False, str(e))
        all_passed = False

    # Test Agent Initialization (without HF client)
    # Smoke test only: instances are constructed but never invoked here.
    try:
        brand_agent = BrandIdentifierAgent(hf_client=None)
        benchmark_agent = BenchmarkAdvisorAgent(hf_client=None)
        practices_agent = BestPracticesValidatorAgent(hf_client=None)
        head_agent = HeadSynthesizerAgent(hf_client=None)

        print_result("Agent Initialization", True, "all agents created")
    except Exception as e:
        print_result("Agent Initialization", False, str(e))
        all_passed = False

    # Test Prompt Templates exist
    try:
        assert hasattr(BrandIdentifierAgent, 'PROMPT_TEMPLATE')
        assert hasattr(BenchmarkAdvisorAgent, 'PROMPT_TEMPLATE')
        assert hasattr(BestPracticesValidatorAgent, 'PROMPT_TEMPLATE')
        assert hasattr(HeadSynthesizerAgent, 'PROMPT_TEMPLATE')

        # Check templates have placeholders
        assert "{color_data}" in BrandIdentifierAgent.PROMPT_TEMPLATE
        assert "{user_ratio}" in BenchmarkAdvisorAgent.PROMPT_TEMPLATE
        assert "{type_ratio}" in BestPracticesValidatorAgent.PROMPT_TEMPLATE
        assert "{type_ratio}" in HeadSynthesizerAgent.PROMPT_TEMPLATE

        print_result("Prompt Templates", True, "all templates defined with placeholders")
    except Exception as e:
        print_result("Prompt Templates", False, str(e))
        all_passed = False

    return all_passed
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
# =============================================================================
|
| 493 |
+
# INTEGRATION TEST
|
| 494 |
+
# =============================================================================
|
| 495 |
+
|
| 496 |
+
async def test_integration():
    """Test the full pipeline integration (without actual LLM calls).

    Chains Layer 1 output into Layer 2 comparisons, builds mock Layer 3
    results by hand, and verifies the combined payload JSON-serializes.

    NOTE(review): declared ``async`` (main() wraps it in asyncio.run) but the
    body contains no awaits — presumably kept async for parity with the real
    pipeline; confirm.

    Returns:
        bool: True when all four steps succeeded, False on any exception.
    """
    print_section("INTEGRATION TEST")

    all_passed = True

    # Test full Rule Engine + Benchmark comparison flow
    try:
        from core.rule_engine import run_rule_engine
        from agents.benchmark_researcher import (
            BenchmarkResearcher,
            BenchmarkData,
            DESIGN_SYSTEM_SOURCES,
            FALLBACK_BENCHMARKS
        )
        # NOTE(review): HeadSynthesis is imported but not used below.
        from agents.llm_agents import (
            BrandIdentification,
            BenchmarkAdvice,
            BestPracticesResult,
            HeadSynthesis,
        )

        logger = TestLogger(verbose=False)

        # Step 1: Run Rule Engine
        rule_results = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS,
            log_callback=logger.log,
        )

        print_result("Step 1: Rule Engine", True, f"score={rule_results.consistency_score}")

        # Step 2: Benchmark Research (using fallbacks)
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)

        # Build benchmark records from the static fallback tables so no
        # network fetch is needed.
        mock_benchmarks = []
        for key in ["material_design_3", "shopify_polaris", "atlassian_design"]:
            source = DESIGN_SYSTEM_SOURCES[key]
            fallback = FALLBACK_BENCHMARKS[key]
            mock_benchmarks.append(BenchmarkData(
                key=key,
                name=source["name"],
                short_name=source["short_name"],
                vendor=source["vendor"],
                icon=source["icon"],
                typography=fallback["typography"],
                spacing=fallback["spacing"],
                fetched_at=datetime.now().isoformat(),
                confidence="fallback",
                best_for=source["best_for"],
            ))

        # Feed Layer 1 findings into the comparison; falls back to a 16px
        # base size when no sizes were detected.
        comparisons = researcher.compare_to_benchmarks(
            your_ratio=rule_results.typography.detected_ratio,
            your_base_size=int(rule_results.typography.sizes_px[0]) if rule_results.typography.sizes_px else 16,
            your_spacing_grid=rule_results.spacing.detected_base,
            benchmarks=mock_benchmarks,
            log_callback=logger.log,
        )

        print_result("Step 2: Benchmark Comparison", True, f"closest={comparisons[0].benchmark.short_name}")

        # Step 3: Mock LLM results (simulating what agents would return)
        brand_result = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high", "reasoning": "Most used on CTAs"},
            brand_secondary={"color": "#c1df1f", "confidence": "medium"},
            palette_strategy="complementary",
            cohesion_score=7,
        )

        benchmark_advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            recommended_benchmark_name="Shopify Polaris",
            reasoning="Best match for e-commerce UX",
            alignment_changes=[
                {"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}
            ],
        )

        best_practices = BestPracticesResult(
            overall_score=58,
            checks={
                "type_scale_standard": {"status": "warn", "note": "1.18 close to Minor Third"},
                "aa_compliance": {"status": "fail", "note": "2 colors fail AA"},
            },
            priority_fixes=[
                {"rank": 1, "issue": "Brand primary fails AA", "impact": "high", "effort": "low"},
            ],
        )

        print_result("Step 3: Mock LLM Results", True, "all results created")

        # Step 4: Verify data can be serialized
        output = {
            "rule_engine": rule_results.to_dict(),
            "benchmarks": [c.to_dict() for c in comparisons],
            "brand": brand_result.to_dict(),
            "advice": benchmark_advice.to_dict(),
            "practices": best_practices.to_dict(),
        }

        json_str = json.dumps(output, indent=2)
        assert len(json_str) > 100, "Should produce substantial output"

        print_result("Step 4: Serialization", True, f"{len(json_str)} bytes")

        # Final summary
        # NOTE(review): the leading glyph below appears mojibake-garbled in
        # the extracted source; preserved as-is — confirm upstream.
        print("\n π Integration Summary:")
        print(f" - Rule Engine Score: {rule_results.consistency_score}/100")
        print(f" - AA Failures: {rule_results.aa_failures}")
        print(f" - Closest Benchmark: {comparisons[0].benchmark.name}")
        print(f" - Match: {comparisons[0].overall_match_pct:.0f}%")

        # Redundant with the initialization above, but kept byte-identical.
        all_passed = True

    except Exception as e:
        import traceback
        print_result("Integration Test", False, str(e))
        traceback.print_exc()
        all_passed = False

    return all_passed
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
# =============================================================================
|
| 623 |
+
# MAIN
|
| 624 |
+
# =============================================================================
|
| 625 |
+
|
| 626 |
+
def main():
    """Run all four test layers and print a summary.

    Returns:
        int: 0 when every layer passed, 1 otherwise (used as the process
        exit status by the ``__main__`` guard).
    """
    # NOTE(review): the banner and status glyphs in this function appear
    # mojibake-garbled in the extracted source; preserved as-is — confirm
    # the intended glyphs against the repository.
    print("\n" + "β" * 60)
    print(" STAGE 2 PIPELINE TEST SUITE")
    print("β" * 60)
    print(f"\n Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    results = {}

    # Run tests
    results["Rule Engine"] = test_rule_engine()
    results["Benchmark Research"] = test_benchmark_research()
    results["LLM Agents"] = test_llm_agents()
    # test_integration is a coroutine; run it to completion on a fresh loop.
    results["Integration"] = asyncio.run(test_integration())

    # Summary
    print_section("TEST SUMMARY")

    total = len(results)
    passed = sum(1 for v in results.values() if v)

    for name, result in results.items():
        icon = "β…" if result else "β"
        print(f" {icon} {name}")

    print(f"\n Total: {passed}/{total} passed")

    if passed == total:
        print("\n π All tests passed!")
        return 0
    else:
        print("\n β οΈ Some tests failed")
        return 1
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
if __name__ == "__main__":
    # Propagate main()'s 0/1 result as the process exit status.
    sys.exit(main())
|