riazmo commited on
Commit
f7fb352
·
verified ·
1 Parent(s): 8d4cb01

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tests/__pycache__/test_stage1_extraction.cpython-314-pytest-9.0.2.pyc filter=lfs diff=lfs merge=lfs -text
tests/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (163 Bytes). View file
 
tests/__pycache__/test_agent_evals.cpython-314-pytest-9.0.2.pyc ADDED
Binary file (64.9 kB). View file
 
tests/__pycache__/test_stage1_extraction.cpython-314-pytest-9.0.2.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c920bbdc9cb32c20edb958653c1fd8e065b1bcf34f52613f70c25d538c5bc8c
3
+ size 114191
tests/__pycache__/test_stage2_pipeline.cpython-314-pytest-9.0.2.pyc ADDED
Binary file (57.7 kB). View file
 
tests/test_agent_evals.py ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Agent Evaluation Tests
4
+ ============================
5
+
6
+ Evaluates the 4 named AI agents using mock HF client responses.
7
+ Tests schema compliance, output correctness, and consistency.
8
+
9
+ Uses DeepEval when available, falls back to manual assertions.
10
+
11
+ Run: pytest tests/test_agent_evals.py -v
12
+ """
13
+
14
+ import asyncio
15
+ import json
16
+ import os
17
+ import sys
18
+ from dataclasses import asdict
19
+ from typing import Optional
20
+
21
+ import pytest
22
+
23
+ # Add parent directory to path
24
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
25
+
26
+ from agents.llm_agents import (
27
+ BrandIdentifierAgent,
28
+ BenchmarkAdvisorAgent,
29
+ BestPracticesValidatorAgent,
30
+ HeadSynthesizerAgent,
31
+ BrandIdentification,
32
+ BenchmarkAdvice,
33
+ BestPracticesResult,
34
+ HeadSynthesis,
35
+ )
36
+
37
+ # Try importing DeepEval
38
+ try:
39
+ from deepeval import assert_test
40
+ from deepeval.test_case import LLMTestCase
41
+ from deepeval.metrics import JsonSchemaMetric
42
+
43
+ HAS_DEEPEVAL = True
44
+ except ImportError:
45
+ HAS_DEEPEVAL = False
46
+
47
+
48
+ # =============================================================================
49
+ # MOCK HF CLIENT
50
+ # =============================================================================
51
+
52
# Canned JSON responses that each agent would return
# AURORA (brand identifier): canned payload shaped like the BrandIdentification schema.
AURORA_RESPONSE = json.dumps({
    "brand_primary": {
        "color": "#06b2c4",
        "confidence": "high",
        "reasoning": "Used in 33 buttons and 12 CTAs — dominant interactive color",
        "usage_count": 45,
    },
    "brand_secondary": {
        "color": "#c1df1f",
        "confidence": "medium",
        "reasoning": "Used in highlights and badges",
        "usage_count": 23,
    },
    # Deliberately absent accent so tests cover the "missing accent" path.
    "brand_accent": None,
    "palette_strategy": "complementary",
    "cohesion_score": 6,
    "cohesion_notes": "Primary and secondary are near-complementary on the color wheel. Reasonable coherence but accent is missing.",
    # Hex -> dotted semantic token name mapping.
    "semantic_names": {
        "#06b2c4": "brand.primary",
        "#c1df1f": "brand.secondary",
        "#1a1a1a": "text.primary",
        "#666666": "text.secondary",
    },
    # Every agent payload carries a self_evaluation with confidence/data_quality.
    "self_evaluation": {
        "confidence": 8,
        "reasoning": "Clear dominant primary from button usage. Secondary less certain.",
        "data_quality": "good",
        "flags": [],
    },
})
83
+
84
# ATLAS (benchmark advisor): canned payload shaped like the BenchmarkAdvice schema.
ATLAS_RESPONSE = json.dumps({
    "recommended_benchmark": "shopify_polaris",
    "recommended_benchmark_name": "Shopify Polaris",
    "reasoning": "87% structural match. Polaris uses similar type scale and spacing grid approach.",
    # Concrete migration steps with before/after values and effort estimates.
    "alignment_changes": [
        {"change": "Adopt 1.25 Major Third type scale", "from": "1.18 random", "to": "1.25", "effort": "low"},
        {"change": "Standardize to 4px spacing grid", "from": "mixed", "to": "4px", "effort": "medium"},
    ],
    "pros_of_alignment": [
        "Industry-standard component patterns",
        "Strong accessibility built-in",
    ],
    "cons_of_alignment": [
        "May feel generic without customization",
    ],
    # Runner-up systems, mirroring MOCK_BENCHMARK_COMPARISONS below.
    "alternative_benchmarks": [
        {"name": "Material Design 3", "reason": "77% match, stronger theming support"},
        {"name": "Atlassian Design System", "reason": "76% match, similar enterprise focus"},
    ],
    "self_evaluation": {
        "confidence": 7,
        "reasoning": "Good structural match but benchmark comparison limited to 8 systems",
        "data_quality": "good",
        "flags": [],
    },
})
110
+
111
# SENTINEL (best-practices validator): canned payload shaped like the
# BestPracticesResult schema.
SENTINEL_RESPONSE = json.dumps({
    "overall_score": 62,
    # Per-check verdicts use a pass/warn/fail status plus a short note.
    "checks": {
        "color_contrast": {"status": "fail", "note": "67 AA failures including brand primary"},
        "type_scale": {"status": "warn", "note": "Near-consistent but not standard ratio"},
        "spacing_grid": {"status": "pass", "note": "4px grid detected with 85% alignment"},
        "color_count": {"status": "warn", "note": "143 unique colors — recommend consolidation to ~20"},
    },
    # Ranked fixes; rank 1 is asserted to be first by the test suite.
    "priority_fixes": [
        {"rank": 1, "issue": "Brand primary fails AA contrast", "impact": "high", "effort": "low", "action": "Darken #06b2c4 to #048391"},
        {"rank": 2, "issue": "143 colors too many", "impact": "medium", "effort": "medium", "action": "Consolidate to semantic palette"},
        {"rank": 3, "issue": "Type scale inconsistent", "impact": "medium", "effort": "low", "action": "Adopt 1.25 Major Third"},
    ],
    "passing_practices": ["spacing_grid", "font_family_consistency"],
    "failing_practices": ["color_contrast", "color_count"],
    "self_evaluation": {
        "confidence": 8,
        "reasoning": "Rule engine data is clear. Priority ordering based on impact analysis.",
        "data_quality": "good",
        "flags": [],
    },
})
133
+
134
# NEXUS (head synthesizer): canned payload shaped like the HeadSynthesis schema.
# Values intentionally agree with the other canned payloads (62 overall score,
# Polaris at 87%, brand primary #06b2c4) so cross-agent consistency can be tested.
NEXUS_RESPONSE = json.dumps({
    "executive_summary": "Design system shows strong structural foundation (4px grid, consistent typography) but needs critical accessibility fixes. Brand primary #06b2c4 fails AA — recommend darkened variant. 87% aligned to Polaris.",
    "scores": {
        "overall": 62,
        "accessibility": 45,
        "consistency": 72,
        "organization": 68,
    },
    "benchmark_fit": {
        "closest": "Shopify Polaris",
        "similarity": 87,
        "recommendation": "Align type scale and consolidate colors for 95%+ match",
    },
    "brand_analysis": {
        "primary": "#06b2c4",
        "secondary": "#c1df1f",
        "cohesion": 6,
    },
    # Exactly three ranked actions; tests assert the list is non-empty.
    "top_3_actions": [
        {"action": "Fix brand primary contrast", "impact": "high", "effort": "low", "details": "Darken to #048391 for AA 4.5:1"},
        {"action": "Consolidate color palette", "impact": "medium", "effort": "medium", "details": "Reduce 143 → ~20 semantic colors"},
        {"action": "Standardize type scale", "impact": "medium", "effort": "low", "details": "Adopt 1.25 Major Third ratio"},
    ],
    "color_recommendations": [
        {"role": "brand-primary", "current": "#06b2c4", "suggested": "#048391", "reason": "AA compliance", "accept": True},
    ],
    "type_scale_recommendation": {
        "current_ratio": 1.18,
        "recommended_ratio": 1.25,
        "name": "Major Third",
    },
    "spacing_recommendation": {
        "current_base": 4,
        "recommended_base": 8,
        "reason": "Simpler system with fewer decisions",
    },
    "self_evaluation": {
        "confidence": 8,
        "reasoning": "Strong data from rule engine and all 3 agents. Minor disagreement on spacing resolved by averaging.",
        "data_quality": "good",
        "flags": [],
    },
})
177
+
178
+
179
class MockHFClient:
    """Mock HF Inference client that returns canned responses per agent."""

    # Maps the agent name passed to complete_async() to its canned JSON payload.
    # "best_practices" and "best_practices_validator" are aliases for the same
    # SENTINEL payload.
    AGENT_RESPONSES = {
        "brand_identifier": AURORA_RESPONSE,
        "benchmark_advisor": ATLAS_RESPONSE,
        "best_practices": SENTINEL_RESPONSE,
        "best_practices_validator": SENTINEL_RESPONSE,
        "head_synthesizer": NEXUS_RESPONSE,
    }

    async def complete_async(
        self,
        agent_name: str,
        system_prompt: str,   # accepted for signature compatibility; unused by the mock
        user_message: str,    # unused by the mock
        max_tokens: int = 2000,   # unused by the mock
        json_mode: bool = True,   # unused by the mock
    ) -> str:
        """Return canned response for the agent.

        Unknown agent names fall back to an empty JSON object string ("{}").
        """
        return self.AGENT_RESPONSES.get(agent_name, "{}")
200
+
201
+
202
+ # =============================================================================
203
+ # TEST DATA
204
+ # =============================================================================
205
+
206
# Color-token fixture: token name -> {value, frequency, context}, with
# "brand-primary" the highest-frequency interactive color (asserted by tests).
MOCK_COLOR_TOKENS = {
    "brand-primary": {"value": "#06b2c4", "frequency": 45, "context": "buttons, links"},
    "brand-secondary": {"value": "#c1df1f", "frequency": 23, "context": "highlights"},
    "text-primary": {"value": "#1a1a1a", "frequency": 120, "context": "headings, body"},
    "text-secondary": {"value": "#666666", "frequency": 80, "context": "captions"},
    "background": {"value": "#ffffff", "frequency": 200, "context": "page background"},
}
213
+
214
# Semantic grouping fixture: category -> list of {hex, name} entries.
MOCK_SEMANTIC_ANALYSIS = {
    "brand": [{"hex": "#06b2c4", "name": "brand-primary"}],
    "text": [{"hex": "#1a1a1a", "name": "text-primary"}],
}
218
+
219
class MockBenchmarkSystem:
    """Minimal stand-in for a benchmark design-system record (what c.benchmark returns)."""

    def __init__(self, name, icon, scale_ratio, base_size, spacing_base, best_for):
        # Identity fields plus the free-form "best for" use-case list.
        self.name, self.icon, self.best_for = name, icon, best_for
        # Typography and spacing are exposed as plain dicts, like the real object.
        self.typography = dict(scale_ratio=scale_ratio, base_size=base_size)
        self.spacing = dict(base=spacing_base)
227
+
228
+
229
class MockBenchmarkComparison:
    """Stand-in for a comparison row (what ATLAS._format_comparisons expects)."""

    def __init__(self, benchmark, similarity_score, overall_match_pct, type_ratio_diff, base_size_diff, spacing_grid_diff):
        # Plain attribute bag; assign uniformly via setattr to keep it tabular.
        for attr, value in (
            ("benchmark", benchmark),
            ("similarity_score", similarity_score),
            ("overall_match_pct", overall_match_pct),
            ("type_ratio_diff", type_ratio_diff),
            ("base_size_diff", base_size_diff),
            ("spacing_grid_diff", spacing_grid_diff),
        ):
            setattr(self, attr, value)
238
+
239
+
240
# Comparison fixture, best match first: lower similarity_score pairs with a
# higher overall_match_pct (Polaris 0.13/87% down to Atlassian 0.24/76%).
MOCK_BENCHMARK_COMPARISONS = [
    MockBenchmarkComparison(
        benchmark=MockBenchmarkSystem("Shopify Polaris", "🟢", 1.25, 16, 4, ["e-commerce", "admin"]),
        similarity_score=0.13, overall_match_pct=87, type_ratio_diff=0.07, base_size_diff=0, spacing_grid_diff=0,
    ),
    MockBenchmarkComparison(
        benchmark=MockBenchmarkSystem("Material Design 3", "🔵", 1.25, 16, 8, ["mobile", "web"]),
        similarity_score=0.23, overall_match_pct=77, type_ratio_diff=0.07, base_size_diff=0, spacing_grid_diff=4,
    ),
    MockBenchmarkComparison(
        benchmark=MockBenchmarkSystem("Atlassian", "🔷", 1.2, 14, 8, ["enterprise", "tools"]),
        similarity_score=0.24, overall_match_pct=76, type_ratio_diff=0.02, base_size_diff=2, spacing_grid_diff=4,
    ),
]
254
+
255
+
256
+ # Mock RuleEngineResults for SENTINEL and NEXUS
257
class MockTypography:
    """Canned typography analysis results fed to SENTINEL/NEXUS via MockRuleEngineResults."""

    def __init__(self):
        # Detected 1.18 ratio is deliberately non-standard; closest standard is 1.2
        # ("Minor Third") and the recommended target is 1.25 ("Major Third").
        self.detected_ratio = 1.18
        self.base_size = 16.0
        self.sizes_px = [12, 14, 16, 18, 22, 28, 36, 48]
        self.is_consistent = False
        self.variance = 0.22
        self.scale_name = "Minor Third"
        self.closest_standard_ratio = 1.2
        self.recommendation = 1.25
        self.recommendation_name = "Major Third"

    def to_dict(self):
        """Compact dict view: just the detected ratio and base size."""
        return {"detected_ratio": self.detected_ratio, "base_size": self.base_size}
270
+
271
+
272
class MockSpacing:
    """Canned spacing-grid analysis: a 4px base with two off-grid values."""

    def __init__(self):
        self.detected_base = 4
        self.is_aligned = True
        self.alignment_percentage = 85.0
        # 5 and 10 are the values that fall off the detected 4px grid.
        self.misaligned_values = [5, 10]
        self.recommendation = 8
        self.recommendation_reason = "Simpler grid"
        self.current_values = [4, 8, 12, 16, 24, 32]
        self.suggested_scale = [0, 4, 8, 12, 16, 24, 32, 48]

    def to_dict(self):
        """Compact dict view: detected base and grid alignment percentage."""
        return {"detected_base": self.detected_base, "alignment_percentage": self.alignment_percentage}
284
+
285
+
286
class MockColorStats:
    """Canned color statistics: 143 unique colors out of 160 total."""

    def __init__(self):
        self.total_count = 160
        self.unique_count = 143
        self.duplicate_count = 17
        self.gray_count = 22
        self.saturated_count = 45
        # One near-duplicate pair: (color_a, color_b, distance).
        self.near_duplicates = [("#06b2c4", "#07b3c5", 0.01)]
        self.hue_distribution = {"cyan": 5, "gray": 22, "green": 3}

    def to_dict(self):
        """Compact dict view: total and unique color counts."""
        return {"total": self.total_count, "unique": self.unique_count}
297
+
298
+
299
class MockAccessibility:
    """Canned WCAG finding for brand primary: fails AA on white (2.57 < 4.5)."""

    # All values are immutable constants, so they can live at class level.
    hex_color = "#06b2c4"
    name = "brand-primary"
    passes_aa_normal = False
    contrast_on_white = 2.57
    contrast_on_black = 8.18
    # Darkened replacement that reaches the AA normal-text threshold.
    suggested_fix = "#048391"
    suggested_fix_contrast = 4.5

    def to_dict(self):
        """Compact dict view: the color and its AA-normal verdict."""
        return {"color": self.hex_color, "aa_normal": self.passes_aa_normal}
311
+
312
+
313
class MockRuleEngineResults:
    """Aggregated canned rule-engine output consumed by SENTINEL and NEXUS tests."""

    # Shared class-level fixture instances; the tests only read from them.
    typography = MockTypography()
    spacing = MockSpacing()
    color_stats = MockColorStats()
    accessibility = [MockAccessibility()]
    # Headline numbers echoed in the canned agent payloads above.
    aa_failures = 67
    consistency_score = 52

    def to_dict(self):
        # Flatten the nested mock sections into one plain, JSON-friendly dict.
        return {
            "typography": self.typography.to_dict(),
            "spacing": self.spacing.to_dict(),
            "color_stats": self.color_stats.to_dict(),
            "summary": {"aa_failures": self.aa_failures, "consistency_score": self.consistency_score},
        }
328
+
329
+
330
+ # =============================================================================
331
+ # SCHEMA COMPLIANCE TESTS
332
+ # =============================================================================
333
+
334
class TestAuroraSchemaCompliance:
    """AURORA (Brand Identifier) output schema validation."""

    @pytest.fixture
    def agent(self):
        """Brand identifier agent wired to the canned-response mock client."""
        return BrandIdentifierAgent(MockHFClient())

    async def _analyze(self, agent):
        """Run the agent once against the standard mock token fixtures."""
        return await agent.analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )

    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """AURORA output has all required BrandIdentification fields."""
        result = await self._analyze(agent)
        assert isinstance(result, BrandIdentification)
        # Required fields present
        for required in ("brand_primary", "palette_strategy", "cohesion_score", "self_evaluation"):
            assert hasattr(result, required)

    @pytest.mark.asyncio
    async def test_brand_primary_detected(self, agent):
        """AURORA correctly identifies brand primary from high-usage color."""
        primary = (await self._analyze(agent)).brand_primary
        assert isinstance(primary, dict)
        assert primary.get("color") == "#06b2c4"
        assert primary.get("confidence") in ("high", "medium", "low")

    @pytest.mark.asyncio
    async def test_palette_strategy_valid(self, agent):
        """Palette strategy is a recognized value."""
        result = await self._analyze(agent)
        assert result.palette_strategy in [
            "complementary", "analogous", "triadic",
            "monochromatic", "split-complementary", "random", "",
        ]

    @pytest.mark.asyncio
    async def test_to_dict_serializable(self, agent):
        """Output is JSON-serializable."""
        result = await self._analyze(agent)
        serialized = json.dumps(result.to_dict())
        assert len(serialized) > 10
387
+
388
+
389
class TestAtlasSchemaCompliance:
    """ATLAS (Benchmark Advisor) output schema validation."""

    @pytest.fixture
    def agent(self):
        """Benchmark advisor agent wired to the canned-response mock client."""
        return BenchmarkAdvisorAgent(MockHFClient())

    async def _advise(self, agent):
        """Run ATLAS once against the standard mock comparison fixtures."""
        return await agent.analyze(
            user_ratio=1.18,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
        )

    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """ATLAS output has all required BenchmarkAdvice fields."""
        advice = await self._advise(agent)
        assert isinstance(advice, BenchmarkAdvice)
        for required in ("recommended_benchmark", "reasoning", "alignment_changes", "self_evaluation"):
            assert hasattr(advice, required)

    @pytest.mark.asyncio
    async def test_benchmark_recommended(self, agent):
        """ATLAS recommends a valid benchmark."""
        advice = await self._advise(agent)
        assert advice.recommended_benchmark != ""
        assert advice.reasoning != ""

    @pytest.mark.asyncio
    async def test_alignment_changes_structured(self, agent):
        """Alignment changes are structured dicts."""
        changes = (await self._advise(agent)).alignment_changes
        assert isinstance(changes, list)
        if changes:
            first_change = changes[0]
            assert isinstance(first_change, dict)
            assert "change" in first_change
437
+
438
+
439
class TestSentinelSchemaCompliance:
    """SENTINEL (Best Practices Validator) output schema validation."""

    @pytest.fixture
    def agent(self):
        """Validator agent wired to the canned-response mock client."""
        return BestPracticesValidatorAgent(MockHFClient())

    async def _validate(self, agent):
        """Run SENTINEL once against the canned rule-engine results."""
        return await agent.analyze(rule_engine_results=MockRuleEngineResults())

    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """SENTINEL output has all required BestPracticesResult fields."""
        result = await self._validate(agent)
        assert isinstance(result, BestPracticesResult)
        for required in ("overall_score", "priority_fixes", "self_evaluation"):
            assert hasattr(result, required)

    @pytest.mark.asyncio
    async def test_score_in_range(self, agent):
        """Overall score is between 0-100."""
        result = await self._validate(agent)
        assert 0 <= result.overall_score <= 100

    @pytest.mark.asyncio
    async def test_priority_fixes_ranked(self, agent):
        """Priority fixes are a list with high-impact items first."""
        fixes = (await self._validate(agent)).priority_fixes
        assert isinstance(fixes, list)
        if len(fixes) >= 2:
            # First fix should be highest priority when an explicit rank exists.
            top = fixes[0]
            if isinstance(top, dict) and "rank" in top:
                assert top["rank"] == 1
477
+
478
+
479
class TestNexusSchemaCompliance:
    """NEXUS (Head Synthesizer) output schema validation."""

    @pytest.fixture
    def agent(self):
        """Synthesizer agent wired to the canned-response mock client."""
        return HeadSynthesizerAgent(MockHFClient())

    async def _synthesize(self, agent, brand, advice, practices):
        """Run NEXUS with the canned rule-engine/benchmark fixtures and the given agent outputs."""
        return await agent.synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
            brand_identification=brand,
            benchmark_advice=advice,
            best_practices=practices,
        )

    @pytest.mark.asyncio
    async def test_schema_compliance(self, agent):
        """NEXUS output has all required HeadSynthesis fields."""
        brand = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high"},
            palette_strategy="complementary",
            cohesion_score=6,
        )
        advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            reasoning="87% structural match",
        )
        practices = BestPracticesResult(
            overall_score=62,
            priority_fixes=[{"issue": "AA contrast", "impact": "high"}],
        )
        result = await self._synthesize(agent, brand, advice, practices)
        assert isinstance(result, HeadSynthesis)
        for required in ("executive_summary", "top_3_actions", "scores", "self_evaluation"):
            assert hasattr(result, required)

    @pytest.mark.asyncio
    async def test_executive_summary_non_empty(self, agent):
        """NEXUS produces a non-empty executive summary."""
        result = await self._synthesize(
            agent, BrandIdentification(), BenchmarkAdvice(), BestPracticesResult()
        )
        assert result.executive_summary != ""

    @pytest.mark.asyncio
    async def test_top_3_actions_present(self, agent):
        """NEXUS provides top 3 action items."""
        result = await self._synthesize(
            agent, BrandIdentification(), BenchmarkAdvice(), BestPracticesResult()
        )
        assert isinstance(result.top_3_actions, list)
        assert len(result.top_3_actions) >= 1
536
+
537
+
538
+ # =============================================================================
539
+ # SELF-EVALUATION TESTS
540
+ # =============================================================================
541
+
542
class TestSelfEvaluation:
    """All agents should include self_evaluation with confidence scoring."""

    @staticmethod
    def _check(self_eval, *required_keys):
        """Assert the self_evaluation payload is a dict carrying the given keys."""
        assert isinstance(self_eval, dict)
        for key in required_keys:
            assert key in self_eval

    @pytest.mark.asyncio
    async def test_aurora_self_evaluation(self):
        result = await BrandIdentifierAgent(MockHFClient()).analyze(
            color_tokens=MOCK_COLOR_TOKENS,
            semantic_analysis=MOCK_SEMANTIC_ANALYSIS,
        )
        self._check(result.self_evaluation, "confidence", "data_quality")

    @pytest.mark.asyncio
    async def test_atlas_self_evaluation(self):
        result = await BenchmarkAdvisorAgent(MockHFClient()).analyze(
            user_ratio=1.18,
            user_base=16,
            user_spacing=4,
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
        )
        self._check(result.self_evaluation, "confidence")

    @pytest.mark.asyncio
    async def test_sentinel_self_evaluation(self):
        result = await BestPracticesValidatorAgent(MockHFClient()).analyze(
            rule_engine_results=MockRuleEngineResults(),
        )
        self._check(result.self_evaluation, "confidence")

    @pytest.mark.asyncio
    async def test_nexus_self_evaluation(self):
        result = await HeadSynthesizerAgent(MockHFClient()).synthesize(
            rule_engine_results=MockRuleEngineResults(),
            benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS,
            brand_identification=BrandIdentification(),
            benchmark_advice=BenchmarkAdvice(),
            best_practices=BestPracticesResult(),
        )
        self._check(result.self_evaluation, "confidence")
593
+
594
+
595
+ # =============================================================================
596
+ # VALIDATION MODULE TESTS
597
+ # =============================================================================
598
+
599
class TestValidationModule:
    """Test the core/validation.py module."""

    def test_validate_aurora_output(self):
        from core.validation import validate_agent_output

        payload = {
            "brand_primary": {"color": "#06b2c4"},
            "palette_strategy": "complementary",
            "cohesion_score": 6,
        }
        ok, _err = validate_agent_output(payload, "aurora")
        assert ok

    def test_validate_aurora_missing_required(self):
        from core.validation import validate_agent_output

        # brand_primary and palette_strategy deliberately omitted.
        ok, err = validate_agent_output({"cohesion_score": 6}, "aurora")
        assert not ok
        assert err is not None

    def test_validate_nexus_output(self):
        from core.validation import validate_agent_output

        payload = {
            "executive_summary": "Test summary",
            "top_3_actions": [{"action": "Fix contrast"}],
            "scores": {"overall": 62},
        }
        ok, _err = validate_agent_output(payload, "nexus")
        assert ok

    def test_validate_unknown_agent_passes(self):
        from core.validation import validate_agent_output

        ok, _err = validate_agent_output({"anything": True}, "unknown_agent")
        assert ok  # No schema = pass

    def test_validate_dataclass(self):
        from core.validation import validate_agent_output

        fixture = BrandIdentification(
            brand_primary={"color": "#06b2c4"},
            palette_strategy="complementary",
        )
        ok, _err = validate_agent_output(fixture, "aurora")
        assert ok
647
+
648
+
649
if __name__ == "__main__":
    # Allow running this suite directly (python tests/test_agent_evals.py).
    pytest.main([__file__, "-v"])
tests/test_stage1_extraction.py ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Stage 1 Test Suite — Extraction, Normalization & Rule Engine
4
+ =============================================================
5
+
6
+ Tests the deterministic (free) layer:
7
+ - Color utilities: hex normalization, deduplication, categorization
8
+ - Rule Engine: WCAG contrast, type scale detection, spacing grid, consistency score
9
+ - Edge cases and boundary conditions
10
+
11
+ Run: pytest tests/test_stage1_extraction.py -v
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import pytest
17
+
18
+ # Add parent directory to path
19
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
+
21
+ from core.color_utils import (
22
+ normalize_hex,
23
+ parse_color,
24
+ deduplicate_colors,
25
+ are_colors_similar,
26
+ color_distance,
27
+ categorize_color,
28
+ get_contrast_ratio,
29
+ check_wcag_compliance,
30
+ generate_color_ramp,
31
+ hex_to_rgb,
32
+ rgb_to_hex,
33
+ )
34
+ from core.rule_engine import (
35
+ analyze_type_scale,
36
+ analyze_accessibility,
37
+ analyze_spacing_grid,
38
+ analyze_color_statistics,
39
+ run_rule_engine,
40
+ get_contrast_ratio as re_get_contrast_ratio,
41
+ get_relative_luminance,
42
+ hex_to_rgb as re_hex_to_rgb,
43
+ is_gray,
44
+ color_distance as re_color_distance,
45
+ find_aa_compliant_color,
46
+ parse_size_to_px,
47
+ STANDARD_SCALES,
48
+ )
49
+
50
+
51
+ # =============================================================================
52
+ # TEST DATA
53
+ # =============================================================================
54
+
55
# Typography-token fixture: name -> {font_size, font_weight}, spanning a
# 12px caption up to a 48px heading with mixed weights.
MOCK_TYPOGRAPHY_TOKENS = {
    "heading-1": {"font_size": "48px", "font_weight": "700"},
    "heading-2": {"font_size": "36px", "font_weight": "600"},
    "heading-3": {"font_size": "28px", "font_weight": "600"},
    "heading-4": {"font_size": "22px", "font_weight": "500"},
    "body-large": {"font_size": "18px", "font_weight": "400"},
    "body": {"font_size": "16px", "font_weight": "400"},
    "body-small": {"font_size": "14px", "font_weight": "400"},
    "caption": {"font_size": "12px", "font_weight": "400"},
}
65
+
66
# Color-token fixture for the rule engine: includes two light colors that are
# expected to fail WCAG AA contrast against a white background.
MOCK_COLOR_TOKENS = {
    "brand-primary": {"value": "#06b2c4"},
    "brand-secondary": {"value": "#c1df1f"},
    "text-primary": {"value": "#1a1a1a"},
    "text-secondary": {"value": "#666666"},
    "background": {"value": "#ffffff"},
    "light-cyan": {"value": "#7dd3fc"},  # Fails AA on white
    "light-lime": {"value": "#d9f99d"},  # Fails AA on white
}
75
+
76
# Spacing fixture where every value is a multiple of 4px (clean grid).
MOCK_SPACING_TOKENS_ALIGNED = {
    "space-1": {"value_px": 4},
    "space-2": {"value_px": 8},
    "space-3": {"value_px": 16},
    "space-4": {"value_px": 24},
    "space-5": {"value_px": 32},
    "space-6": {"value_px": 48},
}
84
+
85
# Spacing fixture with values (5, 10, 15, 22, 33) that share no 4px/8px grid.
MOCK_SPACING_TOKENS_MISALIGNED = {
    "space-1": {"value_px": 5},
    "space-2": {"value_px": 10},
    "space-3": {"value_px": 15},
    "space-4": {"value_px": 22},
    "space-5": {"value_px": 33},
}
92
+
93
+
94
+ # =============================================================================
95
+ # TEST CLASS: Color Utilities — Normalization & Deduplication
96
+ # =============================================================================
97
+
98
class TestColorNormalization:
    """Test color parsing, normalization and deduplication."""

    def test_normalize_hex_6digit(self):
        """6-digit hex stays lowercase."""
        for given, expected in (("#FF0000", "#ff0000"), ("#ffffff", "#ffffff")):
            assert normalize_hex(given) == expected

    def test_normalize_hex_3digit(self):
        """3-digit hex expands to 6-digit."""
        for given, expected in (("#fff", "#ffffff"), ("#000", "#000000"), ("#f00", "#ff0000")):
            assert normalize_hex(given) == expected

    def test_parse_color_hex(self):
        """Parse hex color to ParsedColor."""
        red = parse_color("#ff0000")
        assert red is not None
        assert red.hex == "#ff0000"
        assert red.rgb == (255, 0, 0)

    def test_parse_color_rgb(self):
        """Parse rgb() string."""
        parsed = parse_color("rgb(0, 128, 255)")
        assert parsed is not None
        assert parsed.rgb == (0, 128, 255)

    def test_parse_color_invalid(self):
        """Invalid color returns None."""
        for bad_input in ("not-a-color", ""):
            assert parse_color(bad_input) is None

    def test_hex_to_rgb_and_back(self):
        """Round-trip hex → RGB → hex."""
        assert rgb_to_hex(*hex_to_rgb("#1a2b3c")) == "#1a2b3c"

    def test_deduplicate_exact_duplicates(self):
        """Exact same colors are deduplicated."""
        deduped = deduplicate_colors(
            ["#ff0000", "#ff0000", "#00ff00", "#00ff00", "#0000ff"], threshold=1.0
        )
        assert len(deduped) == 3

    def test_deduplicate_near_duplicates(self):
        """Near-duplicate colors (within threshold) are deduplicated."""
        deduped = deduplicate_colors(["#ff0000", "#fe0101", "#00ff00"], threshold=10.0)
        # #ff0000 and #fe0101 collapse into a single entry.
        assert len(deduped) == 2

    def test_deduplicate_preserves_distinct(self):
        """Distinct colors are preserved."""
        deduped = deduplicate_colors(["#ff0000", "#00ff00", "#0000ff"], threshold=10.0)
        assert len(deduped) == 3

    def test_are_colors_similar_identical(self):
        """Same color is similar."""
        assert are_colors_similar("#ff0000", "#ff0000")

    def test_are_colors_similar_different(self):
        """Very different colors are not similar."""
        assert not are_colors_similar("#ff0000", "#0000ff", threshold=10.0)

    def test_color_distance_identical(self):
        """Same color has distance 0."""
        assert color_distance("#ff0000", "#ff0000") == 0.0

    def test_color_distance_symmetric(self):
        """Distance is symmetric."""
        forward = color_distance("#ff0000", "#00ff00")
        backward = color_distance("#00ff00", "#ff0000")
        assert forward == backward
171
+
172
+
173
+ # =============================================================================
174
+ # TEST CLASS: Color Categorization
175
+ # =============================================================================
176
+
177
class TestColorCategorization:
    """Test semantic color classification."""

    def test_categorize_red(self):
        """Pure red maps to the 'red' category."""
        assert categorize_color("#ff0000") == "red"

    def test_categorize_blue(self):
        """Pure blue maps to the 'blue' category."""
        assert categorize_color("#0000ff") == "blue"

    def test_categorize_green(self):
        """Pure green maps to the 'green' category."""
        assert categorize_color("#00ff00") == "green"

    def test_categorize_neutral_white(self):
        """White is classified as neutral (no hue)."""
        assert categorize_color("#ffffff") == "neutral"

    def test_categorize_neutral_black(self):
        """Black is classified as neutral (no hue)."""
        assert categorize_color("#000000") == "neutral"

    def test_categorize_neutral_gray(self):
        """Mid gray is classified as neutral (zero saturation)."""
        assert categorize_color("#808080") == "neutral"

    def test_categorize_cyan(self):
        """Brand color #06b2c4 should be cyan."""
        assert categorize_color("#06b2c4") == "cyan"
201
+
202
+
203
+ # =============================================================================
204
+ # TEST CLASS: WCAG Contrast (Rule Engine)
205
+ # =============================================================================
206
+
207
class TestWCAGContrast:
    """Test WCAG contrast ratio calculations — core math.

    Contrast ratio is (L1 + 0.05) / (L2 + 0.05) with L the relative
    luminance, so the range is [1, 21].
    """

    def test_black_on_white_is_21(self):
        """Black on white should be 21:1 (maximum contrast)."""
        ratio = re_get_contrast_ratio("#000000", "#ffffff")
        assert abs(ratio - 21.0) < 0.1

    def test_white_on_black_is_21(self):
        """White on black is also 21:1 (symmetric)."""
        ratio = re_get_contrast_ratio("#ffffff", "#000000")
        assert abs(ratio - 21.0) < 0.1

    def test_same_color_is_1(self):
        """Same color on same color should be 1:1."""
        ratio = re_get_contrast_ratio("#ff0000", "#ff0000")
        assert abs(ratio - 1.0) < 0.01

    def test_contrast_ratio_symmetric(self):
        """Contrast ratio is symmetric."""
        r1 = re_get_contrast_ratio("#06b2c4", "#ffffff")
        r2 = re_get_contrast_ratio("#ffffff", "#06b2c4")
        assert abs(r1 - r2) < 0.01

    def test_brand_primary_fails_aa_on_white(self):
        """Brand color #06b2c4 fails AA on white (contrast ~2.6)."""
        ratio = re_get_contrast_ratio("#06b2c4", "#ffffff")
        assert ratio < 4.5  # Fails AA normal
        assert ratio > 2.0  # But has some contrast

    def test_dark_text_passes_aa(self):
        """Dark text #1a1a1a passes AA on white."""
        ratio = re_get_contrast_ratio("#1a1a1a", "#ffffff")
        assert ratio >= 4.5

    def test_luminance_black_is_zero(self):
        """Black has luminance ~0."""
        lum = get_relative_luminance("#000000")
        assert abs(lum) < 0.001

    def test_luminance_white_is_one(self):
        """White has luminance ~1."""
        lum = get_relative_luminance("#ffffff")
        assert abs(lum - 1.0) < 0.001

    def test_find_aa_compliant_preserves_passing(self):
        """Color already passing AA is returned unchanged."""
        result = find_aa_compliant_color("#1a1a1a", "#ffffff", 4.5)
        assert result == "#1a1a1a"

    def test_find_aa_compliant_fixes_failing(self):
        """Failing color gets a fix that passes AA."""
        fixed = find_aa_compliant_color("#06b2c4", "#ffffff", 4.5)
        fixed_ratio = re_get_contrast_ratio(fixed, "#ffffff")
        assert fixed_ratio >= 4.5

    def test_analyze_accessibility_finds_failures(self):
        """analyze_accessibility identifies colors that fail AA on BOTH white and black."""
        results = analyze_accessibility(MOCK_COLOR_TOKENS)
        # passes_aa_normal is True if contrast >= 4.5 on white OR black.
        # Light colors pass because they have good contrast on black.
        # Medium-contrast colors like #06b2c4 or #666666 may fail on both.
        # At minimum, all results should be analyzed
        assert len(results) >= 5  # At least the colors with valid hex
        # Check that brand-primary #06b2c4 has low contrast on white
        brand = [r for r in results if r.hex_color == "#06b2c4"]
        assert len(brand) == 1
        assert brand[0].contrast_on_white < 4.5

    def test_analyze_accessibility_suggests_fixes(self):
        """AA failures get suggested fixes."""
        results = analyze_accessibility(MOCK_COLOR_TOKENS)
        failures = [r for r in results if not r.passes_aa_normal]
        for f in failures:
            # Every failure must carry a concrete, AA-passing replacement
            assert f.suggested_fix is not None
            assert f.suggested_fix_contrast is not None
            assert f.suggested_fix_contrast >= 4.5

    def test_fg_bg_pair_check(self):
        """FG/BG pairs are checked for contrast."""
        pairs = [
            {"foreground": "#06b2c4", "background": "#ffffff", "element": "button"},
        ]
        results = analyze_accessibility({}, fg_bg_pairs=pairs)
        # #06b2c4 on white fails AA (contrast ~2.6)
        pair_failures = [r for r in results if r.name.startswith("fg:")]
        assert len(pair_failures) == 1

    def test_fg_bg_same_color_skipped(self):
        """Same-color FG/BG pairs are skipped (invisible text)."""
        pairs = [
            {"foreground": "#ffffff", "background": "#ffffff", "element": "hidden"},
        ]
        results = analyze_accessibility({}, fg_bg_pairs=pairs)
        assert len(results) == 0
302
+
303
+
304
+ # =============================================================================
305
+ # TEST CLASS: Type Scale Detection
306
+ # =============================================================================
307
+
308
class TestTypeScaleDetection:
    """Type scale ratio detection and recommendation behaviour."""

    def test_detect_ratio_from_tokens(self):
        """Mock typography yields a plausible ratio strictly inside (1, 2)."""
        analysis = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)
        # Sizes 12..48 step through mixed ratios, all well under 2x
        assert 1.0 < analysis.detected_ratio < 2.0

    def test_consistent_scale(self):
        """A clean Major Third (1.25) progression is flagged consistent."""
        # Major Third (1.25): 12, 15, 18.75, 23.4, 29.3
        major_third = {}
        for step in range(5):
            major_third[f"size-{step}"] = {"font_size": f"{12 * (1.25 ** step):.1f}px"}
        analysis = analyze_type_scale(major_third)
        assert analysis.is_consistent
        assert abs(analysis.detected_ratio - 1.25) < 0.05

    def test_inconsistent_scale(self):
        """Arbitrary sizes produce high variance or an inconsistent verdict."""
        ragged = {
            "a": {"font_size": "10px"},
            "b": {"font_size": "17px"},
            "c": {"font_size": "31px"},
            "d": {"font_size": "42px"},
        }
        analysis = analyze_type_scale(ragged)
        assert analysis.variance > 0.15 or not analysis.is_consistent

    def test_single_size(self):
        """One size is not enough to infer a scale."""
        analysis = analyze_type_scale({"body": {"font_size": "16px"}})
        assert analysis.scale_name == "Unknown"
        # Falls back to the Major Third default recommendation
        assert analysis.recommendation == 1.25

    def test_no_sizes(self):
        """No tokens at all also yields the Unknown scale."""
        assert analyze_type_scale({}).scale_name == "Unknown"

    def test_rem_conversion(self):
        """rem sizes convert at the conventional 16px root."""
        tokens = {
            "body": {"font_size": "1rem"},
            "heading": {"font_size": "2rem"},
        }
        analysis = analyze_type_scale(tokens)
        assert 16.0 in analysis.sizes_px
        assert 32.0 in analysis.sizes_px

    def test_base_size_detection(self):
        """The detected base size sits near the conventional 16px."""
        analysis = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)
        assert 14 <= analysis.base_size <= 18

    def test_standard_scales_defined(self):
        """Major Third, Perfect Fourth and Golden Ratio are all known."""
        for ratio in (1.25, 1.333, 1.618):
            assert ratio in STANDARD_SCALES

    def test_parse_size_to_px(self):
        """px / rem / em strings and bare numbers parse; junk returns None."""
        assert parse_size_to_px("16px") == 16.0
        assert parse_size_to_px("1rem") == 16.0
        assert parse_size_to_px("1.5em") == 24.0
        assert parse_size_to_px(16) == 16.0
        assert parse_size_to_px("abc") is None
381
+
382
+
383
+ # =============================================================================
384
+ # TEST CLASS: Spacing Grid Analysis
385
+ # =============================================================================
386
+
387
class TestSpacingGrid:
    """Spacing grid detection and the underlying GCD math."""

    def test_aligned_to_4px(self):
        """Values all divisible by 4 register as grid-aligned."""
        analysis = analyze_spacing_grid(MOCK_SPACING_TOKENS_ALIGNED)
        assert analysis.is_aligned
        # Every value (4..48) divides by both 4 and 8, so either base is fine
        assert analysis.recommendation in [4, 8]

    def test_8px_grid_detected(self):
        """Pure multiples of 8 detect an 8px base with full alignment."""
        eights = {f"s{n}": {"value_px": n * 8} for n in range(1, 5)}
        analysis = analyze_spacing_grid(eights)
        assert analysis.detected_base == 8
        assert analysis.is_aligned
        assert analysis.alignment_percentage == 100.0

    def test_misaligned_detected(self):
        """Values with GCD 1 are flagged as off-grid."""
        analysis = analyze_spacing_grid(MOCK_SPACING_TOKENS_MISALIGNED)
        # gcd(5, 10, 15, 22, 33) == 1, so no usable base exists
        assert analysis.detected_base == 1
        assert not analysis.is_aligned

    def test_empty_spacing(self):
        """No tokens: fall back to recommending an 8px grid."""
        analysis = analyze_spacing_grid({})
        assert analysis.recommendation == 8
        assert not analysis.is_aligned

    def test_single_value(self):
        """A lone value acts as its own base."""
        analysis = analyze_spacing_grid({"s1": {"value_px": 8}})
        assert analysis.detected_base == 8

    def test_gcd_calculation(self):
        """The base is the GCD of all values (12 for 12/24/36)."""
        twelves = {
            "s1": {"value_px": 12},
            "s2": {"value_px": 24},
            "s3": {"value_px": 36},
        }
        assert analyze_spacing_grid(twelves).detected_base == 12

    def test_suggested_scale_generated(self):
        """A non-empty suggested scale (starting at 0) is produced."""
        analysis = analyze_spacing_grid(MOCK_SPACING_TOKENS_ALIGNED)
        assert len(analysis.suggested_scale) > 0
        assert 0 in analysis.suggested_scale

    def test_string_values_parsed(self):
        """'16px'-style string values are parsed into numbers."""
        stringy = {
            "s1": {"value": "8px"},
            "s2": {"value": "16px"},
        }
        assert analyze_spacing_grid(stringy).current_values == [8, 16]
453
+
454
+
455
+ # =============================================================================
456
+ # TEST CLASS: Color Statistics
457
+ # =============================================================================
458
+
459
class TestColorStatistics:
    """Palette statistics: counts, grays, near-dupes, hue buckets."""

    def test_counts_correct(self):
        """Total, unique and duplicate counts add up."""
        palette = {
            "a": {"value": "#ff0000"},
            "b": {"value": "#ff0000"},  # exact duplicate of "a"
            "c": {"value": "#00ff00"},
        }
        stats = analyze_color_statistics(palette)
        assert stats.total_count == 3
        assert stats.unique_count == 2
        assert stats.duplicate_count == 1

    def test_gray_detection(self):
        """Low-saturation colors (white/gray/black) count as grays."""
        palette = {
            "white": {"value": "#ffffff"},
            "gray": {"value": "#808080"},
            "black": {"value": "#000000"},
            "red": {"value": "#ff0000"},
        }
        assert analyze_color_statistics(palette).gray_count >= 3

    def test_near_duplicates_found(self):
        """Perceptually-close pairs show up in near_duplicates."""
        palette = {
            "red1": {"value": "#ff0000"},
            "red2": {"value": "#fe0101"},  # within the similarity threshold
            "blue": {"value": "#0000ff"},
        }
        stats = analyze_color_statistics(palette, similarity_threshold=0.05)
        assert len(stats.near_duplicates) >= 1

    def test_hue_distribution(self):
        """Primary colors land in their own hue buckets."""
        palette = {
            "red": {"value": "#ff0000"},
            "blue": {"value": "#0000ff"},
            "green": {"value": "#00ff00"},
        }
        stats = analyze_color_statistics(palette)
        for hue in ("red", "blue", "green"):
            assert hue in stats.hue_distribution

    def test_empty_tokens(self):
        """Empty input produces zero counts, not an error."""
        stats = analyze_color_statistics({})
        assert stats.total_count == 0
        assert stats.unique_count == 0
512
+
513
+
514
+ # =============================================================================
515
+ # TEST CLASS: Rule Engine Integration
516
+ # =============================================================================
517
+
518
class TestRuleEngineIntegration:
    """Test the full run_rule_engine() function."""

    def test_returns_all_components(self):
        """Rule engine returns all analysis components."""
        result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
        )
        # Each sub-analysis must be populated, even if its findings are empty
        assert result.typography is not None
        assert result.accessibility is not None
        assert result.spacing is not None
        assert result.color_stats is not None

    def test_consistency_score_bounds(self):
        """Consistency score is between 0 and 100."""
        result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
        )
        assert 0 <= result.consistency_score <= 100

    def test_aa_failures_counted(self):
        """AA failures are counted in summary."""
        result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
        )
        assert result.aa_failures >= 0

    def test_to_dict_serializable(self):
        """to_dict() returns JSON-serializable data."""
        import json
        result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
        )
        d = result.to_dict()
        # json.dumps raises TypeError on non-serializable values, so a
        # successful round-trip is the whole assertion here
        json_str = json.dumps(d)
        assert len(json_str) > 0

    def test_log_callback_called(self):
        """Log callback receives messages."""
        logs = []
        run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
            log_callback=lambda msg: logs.append(msg),
        )
        assert len(logs) > 0
        # Should contain rule engine header
        assert any("RULE ENGINE" in log for log in logs)

    def test_with_fg_bg_pairs(self):
        """FG/BG pairs are analyzed when provided."""
        pairs = [
            {"foreground": "#06b2c4", "background": "#ffffff", "element": "button"},
            {"foreground": "#1a1a1a", "background": "#ffffff", "element": "heading"},
        ]
        result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
            fg_bg_pairs=pairs,
        )
        # Should have accessibility results including pair checks
        assert len(result.accessibility) > 0

    def test_empty_tokens_no_crash(self):
        """Empty tokens don't crash the rule engine."""
        result = run_rule_engine(
            typography_tokens={},
            color_tokens={},
            spacing_tokens={},
        )
        assert result.consistency_score >= 0

    def test_perfect_score_possible(self):
        """A well-organized design system scores high."""
        # All 8px-aligned spacing
        spacing = {f"s{i}": {"value_px": i * 8} for i in range(1, 7)}
        # Consistent type scale (Major Third 1.25)
        typo = {
            f"t{i}": {"font_size": f"{16 * (1.25 ** i):.0f}px"}
            for i in range(5)
        }
        # AA-passing colors only
        colors = {
            "dark": {"value": "#1a1a1a"},
            "medium": {"value": "#333333"},
        }
        result = run_rule_engine(
            typography_tokens=typo,
            color_tokens=colors,
            spacing_tokens=spacing,
        )
        assert result.consistency_score >= 50  # Should be reasonably high
620
+
621
+
622
+ # =============================================================================
623
+ # TEST CLASS: Color Ramp Generation
624
+ # =============================================================================
625
+
626
class TestColorRampGeneration:
    """Generating a 50-900 shade ramp from a single base color."""

    def test_ramp_has_all_shades(self):
        """All ten standard shade keys are present."""
        ramp = generate_color_ramp("#06b2c4")
        for shade in ("50", "500", "900"):
            assert shade in ramp
        assert len(ramp) == 10

    def test_ramp_500_is_base(self):
        """The 500 shade is the unmodified base color."""
        assert generate_color_ramp("#06b2c4")["500"] == "#06b2c4"

    def test_ramp_lightness_order(self):
        """Lower shade numbers are lighter (higher HSL lightness)."""
        ramp = generate_color_ramp("#06b2c4")
        lightest = parse_color(ramp["50"])
        darkest = parse_color(ramp["900"])
        # HSL lightness is the third component
        assert lightest.hsl[2] > darkest.hsl[2]

    def test_ramp_empty_on_invalid(self):
        """Unparseable input yields an empty ramp, not an exception."""
        assert generate_color_ramp("not-a-color") == {}
653
+
654
+
655
+ # =============================================================================
656
+ # TEST CLASS: Edge Cases
657
+ # =============================================================================
658
+
659
class TestEdgeCases:
    """Boundary conditions across the analysis helpers."""

    def test_is_gray_pure_white(self):
        """White has zero saturation, so it qualifies as gray."""
        assert is_gray("#ffffff")

    def test_is_gray_pure_black(self):
        """Black likewise has zero saturation."""
        assert is_gray("#000000")

    def test_is_gray_red_is_not(self):
        """Fully saturated red must not be classed as gray."""
        assert not is_gray("#ff0000")

    def test_color_distance_black_white(self):
        """Black-to-white is near the maximum normalized distance (~1.0)."""
        assert re_color_distance("#000000", "#ffffff") > 0.9

    def test_very_large_spacing(self):
        """A huge single spacing value is handled without error."""
        analysis = analyze_spacing_grid({"huge": {"value_px": 10000}})
        assert analysis.detected_base == 10000

    def test_typography_mixed_units(self):
        """px, rem and em sizes normalize to a common px list."""
        mixed = {
            "a": {"font_size": "16px"},
            "b": {"font_size": "1.5rem"},
            "c": {"font_size": "2em"},
        }
        analysis = analyze_type_scale(mixed)
        assert len(analysis.sizes_px) == 3
        for px in (16.0, 24.0, 32.0):
            assert px in analysis.sizes_px

    def test_duplicate_sizes_deduped(self):
        """Repeated font sizes collapse to a single entry."""
        repeated = {
            "a": {"font_size": "16px"},
            "b": {"font_size": "16px"},
            "c": {"font_size": "24px"},
        }
        # Only 16 and 24 remain after dedup
        assert len(analyze_type_scale(repeated).sizes_px) == 2

    def test_hex_to_rgb_shorthand(self):
        """3-digit shorthand expands before conversion."""
        expected = {
            "#fff": (255, 255, 255),
            "#000": (0, 0, 0),
            "#f00": (255, 0, 0),
        }
        for short, rgb in expected.items():
            assert re_hex_to_rgb(short) == rgb
713
+
714
+
715
if __name__ == "__main__":
    # Allow running this file directly (python test_agent_evals.py) in
    # addition to the usual pytest collection.
    pytest.main([__file__, "-v"])