riazmo commited on
Commit
cf79147
Β·
verified Β·
1 Parent(s): ba4c870

Upload test_stage2_pipeline.py

Browse files
Files changed (1) hide show
  1. tests/test_stage2_pipeline.py +662 -0
tests/test_stage2_pipeline.py ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Stage 2 Pipeline Test Script
4
+ ============================
5
+
6
+ Tests the new Stage 2 architecture:
7
+ - Layer 1: Rule Engine
8
+ - Layer 2: Benchmark Research
9
+ - Layer 3: LLM Agents
10
+ - Layer 4: HEAD Synthesizer
11
+
12
+ Run: python tests/test_stage2_pipeline.py
13
+ """
14
+
15
+ import asyncio
16
+ import json
17
+ import os
18
+ import sys
19
+ from datetime import datetime
20
+ from typing import Optional
21
+
22
+ # Add parent directory to path
23
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
24
+
25
+
26
# =============================================================================
# TEST DATA - Mock extracted tokens
# =============================================================================
# These dictionaries mimic the token structures produced by Stage 1 extraction,
# so every layer can be exercised without running the real extraction pipeline.

# Typography tokens keyed by role; each carries raw CSS-like string values.
MOCK_TYPOGRAPHY_TOKENS = {
    "heading-1": {"font_size": "48px", "font_weight": "700", "line_height": "1.2", "font_family": "Inter"},
    "heading-2": {"font_size": "36px", "font_weight": "600", "line_height": "1.25", "font_family": "Inter"},
    "heading-3": {"font_size": "28px", "font_weight": "600", "line_height": "1.3", "font_family": "Inter"},
    "heading-4": {"font_size": "22px", "font_weight": "500", "line_height": "1.35", "font_family": "Inter"},
    "body-large": {"font_size": "18px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "body": {"font_size": "16px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "body-small": {"font_size": "14px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "caption": {"font_size": "12px", "font_weight": "400", "line_height": "1.4", "font_family": "Inter"},
}

# Color tokens: hex value plus usage frequency and a free-text usage context.
MOCK_COLOR_TOKENS = {
    "brand-primary": {"value": "#06b2c4", "frequency": 45, "context": "buttons, links"},
    "brand-secondary": {"value": "#c1df1f", "frequency": 23, "context": "highlights, badges"},
    "text-primary": {"value": "#1a1a1a", "frequency": 120, "context": "headings, body"},
    "text-secondary": {"value": "#666666", "frequency": 80, "context": "captions, muted"},
    "text-tertiary": {"value": "#999999", "frequency": 40, "context": "placeholders"},
    "background-primary": {"value": "#ffffff", "frequency": 200, "context": "page background"},
    "background-secondary": {"value": "#f5f5f5", "frequency": 60, "context": "cards, sections"},
    "background-tertiary": {"value": "#e8e8e8", "frequency": 30, "context": "dividers"},
    "border-default": {"value": "#dddddd", "frequency": 50, "context": "borders"},
    "border-focus": {"value": "#06b2c4", "frequency": 15, "context": "focus rings"},
    "success": {"value": "#22c55e", "frequency": 10, "context": "success states"},
    "warning": {"value": "#f59e0b", "frequency": 8, "context": "warning states"},
    "error": {"value": "#ef4444", "frequency": 12, "context": "error states"},
    "info": {"value": "#3b82f6", "frequency": 6, "context": "info states"},
    # Some problematic colors for testing
    "light-cyan": {"value": "#7dd3fc", "frequency": 5, "context": "light accent"},  # Fails AA
    "light-lime": {"value": "#d9f99d", "frequency": 3, "context": "light highlight"},  # Fails AA
}

# Spacing tokens: string value, numeric pixel value, and usage frequency.
# Values through space-12 sit on a 4px grid; the "odd" entries deliberately
# do not, to exercise the grid-alignment analysis.
MOCK_SPACING_TOKENS = {
    "space-1": {"value": "4px", "value_px": 4, "frequency": 30},
    "space-2": {"value": "8px", "value_px": 8, "frequency": 80},
    "space-3": {"value": "12px", "value_px": 12, "frequency": 45},
    "space-4": {"value": "16px", "value_px": 16, "frequency": 60},
    "space-5": {"value": "20px", "value_px": 20, "frequency": 25},
    "space-6": {"value": "24px", "value_px": 24, "frequency": 40},
    "space-8": {"value": "32px", "value_px": 32, "frequency": 20},
    "space-10": {"value": "40px", "value_px": 40, "frequency": 15},
    "space-12": {"value": "48px", "value_px": 48, "frequency": 10},
    # Some misaligned values for testing
    "space-odd-1": {"value": "5px", "value_px": 5, "frequency": 3},
    "space-odd-2": {"value": "10px", "value_px": 10, "frequency": 5},
}

# Semantic grouping of colors by role (brand / text / background / border /
# feedback), each entry pairing a hex value with its token name.
MOCK_SEMANTIC_ANALYSIS = {
    "brand": [{"hex": "#06b2c4", "name": "brand-primary"}, {"hex": "#c1df1f", "name": "brand-secondary"}],
    "text": [{"hex": "#1a1a1a", "name": "text-primary"}, {"hex": "#666666", "name": "text-secondary"}],
    "background": [{"hex": "#ffffff", "name": "background-primary"}, {"hex": "#f5f5f5", "name": "background-secondary"}],
    "border": [{"hex": "#dddddd", "name": "border-default"}],
    "feedback": [{"hex": "#22c55e", "name": "success"}, {"hex": "#ef4444", "name": "error"}],
}
83
+
84
+
85
+ # =============================================================================
86
+ # TEST HELPERS
87
+ # =============================================================================
88
+
89
class TestLogger:
    """Simple logger for tests.

    Accumulates every message in ``logs`` and, when ``verbose`` is set,
    mirrors each one to stdout as it arrives.
    """

    def __init__(self, verbose: bool = True):
        self.verbose = verbose
        self.logs = []

    def log(self, msg: str):
        """Record *msg*, echoing it to stdout when verbose."""
        self.logs.append(msg)
        if not self.verbose:
            return
        print(msg)

    def get_logs(self) -> str:
        """Return all recorded messages as one newline-joined string."""
        joined = "\n".join(self.logs)
        return joined
103
+
104
+
105
def print_section(title: str):
    """Print a section header: the title framed by 60-char '=' rules."""
    bar = "=" * 60
    print(f"\n{bar}")
    print(f" {title}")
    print(f"{bar}\n")
110
+
111
+
112
def print_result(name: str, passed: bool, details: str = ""):
    """Print a pass/fail test result line, plus an optional detail line."""
    marker = "βœ…" if passed else "❌"
    print(f" {marker} {name}")
    if not details:
        return
    print(f" {details}")
118
+
119
+
120
+ # =============================================================================
121
+ # LAYER 1: RULE ENGINE TESTS
122
+ # =============================================================================
123
+
124
def test_rule_engine():
    """Test the Rule Engine layer.

    Exercises each analyzer (type scale, accessibility, spacing grid,
    color statistics) against the mock token data, then runs the full
    engine end-to-end with a quiet TestLogger.

    Returns:
        bool: True only if every sub-test passed; False if the import
        failed or any analyzer raised/asserted.
    """
    print_section("LAYER 1: RULE ENGINE TESTS")

    all_passed = True

    try:
        from core.rule_engine import (
            run_rule_engine,
            analyze_type_scale,
            analyze_accessibility,
            analyze_spacing_grid,
            analyze_color_statistics,
        )
        print_result("Import rule_engine", True)
    except Exception as e:
        # Without the module nothing below can run, so bail out immediately.
        print_result("Import rule_engine", False, str(e))
        return False

    # Test Type Scale Analysis
    try:
        typo_result = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)

        assert typo_result.detected_ratio > 0, "Ratio should be positive"
        assert typo_result.closest_standard_ratio > 0, "Standard ratio should be positive"
        assert typo_result.scale_name != "", "Scale name should not be empty"
        assert typo_result.sizes_px, "Should detect sizes"

        print_result(
            "Type Scale Analysis",
            True,
            f"ratio={typo_result.detected_ratio:.3f}, consistent={typo_result.is_consistent}"
        )
    except Exception as e:
        print_result("Type Scale Analysis", False, str(e))
        all_passed = False

    # Test Accessibility Analysis
    try:
        access_result = analyze_accessibility(MOCK_COLOR_TOKENS)

        assert len(access_result) > 0, "Should analyze colors"

        # MOCK_COLOR_TOKENS deliberately contains two light colors expected
        # to fail WCAG AA, so both branches below get exercised.
        failures = [a for a in access_result if not a.passes_aa_normal]
        passes = len(access_result) - len(failures)

        # Check that fixes are generated for failures
        fixes_generated = sum(1 for a in failures if a.suggested_fix)

        print_result(
            "Accessibility Analysis",
            True,
            f"total={len(access_result)}, pass={passes}, fail={len(failures)}, fixes={fixes_generated}"
        )
    except Exception as e:
        print_result("Accessibility Analysis", False, str(e))
        all_passed = False

    # Test Spacing Grid Analysis
    try:
        spacing_result = analyze_spacing_grid(MOCK_SPACING_TOKENS)

        assert spacing_result.detected_base > 0, "Base should be positive"
        assert len(spacing_result.current_values) > 0, "Should detect values"
        assert len(spacing_result.suggested_scale) > 0, "Should suggest scale"

        print_result(
            "Spacing Grid Analysis",
            True,
            f"base={spacing_result.detected_base}px, aligned={spacing_result.alignment_percentage:.0f}%"
        )
    except Exception as e:
        print_result("Spacing Grid Analysis", False, str(e))
        all_passed = False

    # Test Color Statistics
    try:
        color_stats = analyze_color_statistics(MOCK_COLOR_TOKENS)

        assert color_stats.total_count > 0, "Should count colors"
        assert color_stats.unique_count > 0, "Should count unique"

        print_result(
            "Color Statistics",
            True,
            f"total={color_stats.total_count}, unique={color_stats.unique_count}, grays={color_stats.gray_count}"
        )
    except Exception as e:
        print_result("Color Statistics", False, str(e))
        all_passed = False

    # Test Full Rule Engine
    try:
        # verbose=False keeps engine log lines out of the test output;
        # they are still collected for the log-count check below.
        logger = TestLogger(verbose=False)

        full_result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS,
            log_callback=logger.log,
        )

        assert full_result.typography is not None
        assert full_result.accessibility is not None
        assert full_result.spacing is not None
        assert full_result.color_stats is not None
        assert 0 <= full_result.consistency_score <= 100

        print_result(
            "Full Rule Engine",
            True,
            f"consistency_score={full_result.consistency_score}, aa_failures={full_result.aa_failures}"
        )

        # Check logs were generated
        log_lines = len(logger.logs)
        print_result("Log Generation", log_lines > 10, f"{log_lines} log lines")

    except Exception as e:
        print_result("Full Rule Engine", False, str(e))
        all_passed = False

    return all_passed
247
+
248
+
249
+ # =============================================================================
250
+ # LAYER 2: BENCHMARK RESEARCH TESTS
251
+ # =============================================================================
252
+
253
def test_benchmark_research():
    """Test the Benchmark Research layer.

    Verifies the static design-system catalogs, the benchmark cache,
    the helper lookup functions, researcher construction without any
    external clients, and the offline comparison logic built from
    fallback benchmark data.

    Returns:
        bool: True only if every sub-test passed; False if the import
        failed or any check raised/asserted.
    """
    print_section("LAYER 2: BENCHMARK RESEARCH TESTS")

    all_passed = True

    try:
        from agents.benchmark_researcher import (
            BenchmarkResearcher,
            BenchmarkCache,
            DESIGN_SYSTEM_SOURCES,
            FALLBACK_BENCHMARKS,
            get_available_benchmarks,
            get_benchmark_choices,
        )
        print_result("Import benchmark_researcher", True)
    except Exception as e:
        # Nothing below can run without the module.
        print_result("Import benchmark_researcher", False, str(e))
        return False

    # Test Design System Sources
    try:
        assert len(DESIGN_SYSTEM_SOURCES) >= 6, "Should have at least 6 design systems"

        required_systems = ["material_design_3", "shopify_polaris", "atlassian_design"]
        # Fix: loop variable was previously named ``sys``, which shadowed the
        # module-level ``import sys`` inside this function's scope.
        for system_key in required_systems:
            assert system_key in DESIGN_SYSTEM_SOURCES, f"Missing {system_key}"
            assert "urls" in DESIGN_SYSTEM_SOURCES[system_key], f"Missing URLs for {system_key}"
            assert "best_for" in DESIGN_SYSTEM_SOURCES[system_key], f"Missing best_for for {system_key}"

        print_result("Design System Sources", True, f"{len(DESIGN_SYSTEM_SOURCES)} systems defined")
    except Exception as e:
        print_result("Design System Sources", False, str(e))
        all_passed = False

    # Test Fallback Benchmarks
    try:
        assert len(FALLBACK_BENCHMARKS) >= 6, "Should have fallbacks"

        for key, fallback in FALLBACK_BENCHMARKS.items():
            assert "typography" in fallback, f"Missing typography for {key}"
            assert "spacing" in fallback, f"Missing spacing for {key}"
            assert fallback["typography"].get("scale_ratio"), f"Missing scale_ratio for {key}"

        print_result("Fallback Benchmarks", True, f"{len(FALLBACK_BENCHMARKS)} fallbacks defined")
    except Exception as e:
        print_result("Fallback Benchmarks", False, str(e))
        all_passed = False

    # Test Cache
    try:
        cache = BenchmarkCache()

        # Test set/get with a minimal synthetic BenchmarkData entry.
        from agents.benchmark_researcher import BenchmarkData
        test_data = BenchmarkData(
            key="test_system",
            name="Test System",
            short_name="Test",
            vendor="Test Vendor",
            icon="πŸ§ͺ",
            typography={"scale_ratio": 1.25, "base_size": 16},
            spacing={"base": 8},
            fetched_at=datetime.now().isoformat(),
            confidence="high",
        )

        cache.set("test_system", test_data)
        retrieved = cache.get("test_system")

        assert retrieved is not None, "Should retrieve cached data"
        assert retrieved.typography.get("scale_ratio") == 1.25, "Data should match"

        print_result("Benchmark Cache", True, "set/get working")
    except Exception as e:
        print_result("Benchmark Cache", False, str(e))
        all_passed = False

    # Test Helper Functions
    try:
        benchmarks = get_available_benchmarks()
        assert len(benchmarks) >= 6, "Should list benchmarks"
        assert all("key" in b and "name" in b for b in benchmarks)

        choices = get_benchmark_choices()
        assert len(choices) >= 6, "Should have choices"
        assert all(isinstance(c, tuple) and len(c) == 2 for c in choices)

        print_result("Helper Functions", True, f"{len(benchmarks)} benchmarks available")
    except Exception as e:
        print_result("Helper Functions", False, str(e))
        all_passed = False

    # Test Researcher Initialization
    try:
        # Both external clients are optional; the researcher must still
        # construct (and expose a cache) without them.
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)
        assert researcher.cache is not None

        print_result("Researcher Initialization", True, "initialized without clients")
    except Exception as e:
        print_result("Researcher Initialization", False, str(e))
        all_passed = False

    # Test Comparison Logic (with fallback data)
    try:
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)

        # Create mock benchmark data from the static fallback catalogs so no
        # network fetch is needed.
        from agents.benchmark_researcher import BenchmarkData
        mock_benchmarks = []
        for key in ["material_design_3", "shopify_polaris", "atlassian_design"]:
            source = DESIGN_SYSTEM_SOURCES[key]
            fallback = FALLBACK_BENCHMARKS[key]
            mock_benchmarks.append(BenchmarkData(
                key=key,
                name=source["name"],
                short_name=source["short_name"],
                vendor=source["vendor"],
                icon=source["icon"],
                typography=fallback["typography"],
                spacing=fallback["spacing"],
                fetched_at=datetime.now().isoformat(),
                confidence="fallback",
                best_for=source["best_for"],
            ))

        comparisons = researcher.compare_to_benchmarks(
            your_ratio=1.18,
            your_base_size=16,
            your_spacing_grid=8,
            benchmarks=mock_benchmarks,
            log_callback=lambda x: None,
        )

        assert len(comparisons) == 3, "Should have 3 comparisons"
        assert comparisons[0].similarity_score <= comparisons[1].similarity_score, "Should be sorted"

        print_result(
            "Comparison Logic",
            True,
            f"closest={comparisons[0].benchmark.short_name}, score={comparisons[0].similarity_score:.2f}"
        )
    except Exception as e:
        print_result("Comparison Logic", False, str(e))
        all_passed = False

    return all_passed
400
+
401
+
402
+ # =============================================================================
403
+ # LAYER 3: LLM AGENTS TESTS
404
+ # =============================================================================
405
+
406
def test_llm_agents():
    """Test the LLM Agents layer.

    Verifies the result data classes serialize via ``to_dict``, that all
    four agents construct without an HF client, and that each agent class
    carries a PROMPT_TEMPLATE with the expected placeholders. No actual
    LLM calls are made.

    Returns:
        bool: True only if every sub-test passed; False if the import
        failed or any check raised/asserted.
    """
    print_section("LAYER 3: LLM AGENTS TESTS")

    all_passed = True

    try:
        from agents.llm_agents import (
            BrandIdentifierAgent,
            BenchmarkAdvisorAgent,
            BestPracticesValidatorAgent,
            HeadSynthesizerAgent,
            BrandIdentification,
            BenchmarkAdvice,
            BestPracticesResult,
            HeadSynthesis,
        )
        print_result("Import llm_agents", True)
    except Exception as e:
        # Nothing below can run without the module.
        print_result("Import llm_agents", False, str(e))
        return False

    # Test Data Classes
    try:
        brand = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high"},
            cohesion_score=7,
        )
        assert brand.to_dict()["brand_primary"]["color"] == "#06b2c4"

        advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            reasoning="Best fit for e-commerce",
        )
        assert advice.to_dict()["recommended_benchmark"] == "shopify_polaris"

        practices = BestPracticesResult(
            overall_score=65,
            priority_fixes=[{"issue": "AA compliance", "impact": "high"}],
        )
        assert practices.to_dict()["overall_score"] == 65

        synthesis = HeadSynthesis(
            executive_summary="Test summary",
            scores={"overall": 60},
        )
        assert synthesis.to_dict()["scores"]["overall"] == 60

        print_result("Data Classes", True, "all serializable")
    except Exception as e:
        print_result("Data Classes", False, str(e))
        all_passed = False

    # Test Agent Initialization (without HF client)
    try:
        brand_agent = BrandIdentifierAgent(hf_client=None)
        benchmark_agent = BenchmarkAdvisorAgent(hf_client=None)
        practices_agent = BestPracticesValidatorAgent(hf_client=None)
        head_agent = HeadSynthesizerAgent(hf_client=None)

        print_result("Agent Initialization", True, "all agents created")
    except Exception as e:
        print_result("Agent Initialization", False, str(e))
        all_passed = False

    # Test Prompt Templates exist
    try:
        assert hasattr(BrandIdentifierAgent, 'PROMPT_TEMPLATE')
        assert hasattr(BenchmarkAdvisorAgent, 'PROMPT_TEMPLATE')
        assert hasattr(BestPracticesValidatorAgent, 'PROMPT_TEMPLATE')
        assert hasattr(HeadSynthesizerAgent, 'PROMPT_TEMPLATE')

        # Check templates have the placeholders each agent's format() call
        # is expected to fill.
        assert "{color_data}" in BrandIdentifierAgent.PROMPT_TEMPLATE
        assert "{user_ratio}" in BenchmarkAdvisorAgent.PROMPT_TEMPLATE
        assert "{type_ratio}" in BestPracticesValidatorAgent.PROMPT_TEMPLATE
        assert "{type_ratio}" in HeadSynthesizerAgent.PROMPT_TEMPLATE

        print_result("Prompt Templates", True, "all templates defined with placeholders")
    except Exception as e:
        print_result("Prompt Templates", False, str(e))
        all_passed = False

    return all_passed
490
+
491
+
492
+ # =============================================================================
493
+ # INTEGRATION TEST
494
+ # =============================================================================
495
+
496
async def test_integration():
    """Test the full pipeline integration (without actual LLM calls).

    Chains the layers together: Rule Engine over the mock tokens, then
    benchmark comparison against fallback benchmark data, then hand-built
    LLM-result objects, and finally JSON serialization of the combined
    output.

    NOTE(review): declared ``async`` so ``main()`` can drive it via
    ``asyncio.run``, but the body currently contains no ``await``.

    Returns:
        bool: True if the whole flow ran and serialized; False on any
        exception (with a traceback printed).
    """
    print_section("INTEGRATION TEST")

    all_passed = True

    # Test full Rule Engine + Benchmark comparison flow
    try:
        from core.rule_engine import run_rule_engine
        from agents.benchmark_researcher import (
            BenchmarkResearcher,
            BenchmarkData,
            DESIGN_SYSTEM_SOURCES,
            FALLBACK_BENCHMARKS
        )
        # NOTE(review): HeadSynthesis is imported here but never used below.
        from agents.llm_agents import (
            BrandIdentification,
            BenchmarkAdvice,
            BestPracticesResult,
            HeadSynthesis,
        )

        logger = TestLogger(verbose=False)

        # Step 1: Run Rule Engine
        rule_results = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS,
            log_callback=logger.log,
        )

        print_result("Step 1: Rule Engine", True, f"score={rule_results.consistency_score}")

        # Step 2: Benchmark Research (using fallbacks)
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)

        mock_benchmarks = []
        for key in ["material_design_3", "shopify_polaris", "atlassian_design"]:
            source = DESIGN_SYSTEM_SOURCES[key]
            fallback = FALLBACK_BENCHMARKS[key]
            mock_benchmarks.append(BenchmarkData(
                key=key,
                name=source["name"],
                short_name=source["short_name"],
                vendor=source["vendor"],
                icon=source["icon"],
                typography=fallback["typography"],
                spacing=fallback["spacing"],
                fetched_at=datetime.now().isoformat(),
                confidence="fallback",
                best_for=source["best_for"],
            ))

        # Feed Step 1 measurements into the comparison; default the base
        # size to 16 when no sizes were detected.
        comparisons = researcher.compare_to_benchmarks(
            your_ratio=rule_results.typography.detected_ratio,
            your_base_size=int(rule_results.typography.sizes_px[0]) if rule_results.typography.sizes_px else 16,
            your_spacing_grid=rule_results.spacing.detected_base,
            benchmarks=mock_benchmarks,
            log_callback=logger.log,
        )

        print_result("Step 2: Benchmark Comparison", True, f"closest={comparisons[0].benchmark.short_name}")

        # Step 3: Mock LLM results (simulating what agents would return)
        brand_result = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high", "reasoning": "Most used on CTAs"},
            brand_secondary={"color": "#c1df1f", "confidence": "medium"},
            palette_strategy="complementary",
            cohesion_score=7,
        )

        benchmark_advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            recommended_benchmark_name="Shopify Polaris",
            reasoning="Best match for e-commerce UX",
            alignment_changes=[
                {"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}
            ],
        )

        best_practices = BestPracticesResult(
            overall_score=58,
            checks={
                "type_scale_standard": {"status": "warn", "note": "1.18 close to Minor Third"},
                "aa_compliance": {"status": "fail", "note": "2 colors fail AA"},
            },
            priority_fixes=[
                {"rank": 1, "issue": "Brand primary fails AA", "impact": "high", "effort": "low"},
            ],
        )

        print_result("Step 3: Mock LLM Results", True, "all results created")

        # Step 4: Verify data can be serialized
        output = {
            "rule_engine": rule_results.to_dict(),
            "benchmarks": [c.to_dict() for c in comparisons],
            "brand": brand_result.to_dict(),
            "advice": benchmark_advice.to_dict(),
            "practices": best_practices.to_dict(),
        }

        json_str = json.dumps(output, indent=2)
        assert len(json_str) > 100, "Should produce substantial output"

        print_result("Step 4: Serialization", True, f"{len(json_str)} bytes")

        # Final summary
        print("\n πŸ“Š Integration Summary:")
        print(f" - Rule Engine Score: {rule_results.consistency_score}/100")
        print(f" - AA Failures: {rule_results.aa_failures}")
        print(f" - Closest Benchmark: {comparisons[0].benchmark.name}")
        print(f" - Match: {comparisons[0].overall_match_pct:.0f}%")

        all_passed = True

    except Exception as e:
        import traceback
        print_result("Integration Test", False, str(e))
        traceback.print_exc()
        all_passed = False

    return all_passed
620
+
621
+
622
+ # =============================================================================
623
+ # MAIN
624
+ # =============================================================================
625
+
626
def main():
    """Run all tests.

    Executes each layer's test suite plus the integration test, prints a
    summary, and returns a process exit code (0 on full success, 1 if
    anything failed).
    """
    banner = "β–ˆ" * 60
    print("\n" + banner)
    print(" STAGE 2 PIPELINE TEST SUITE")
    print(banner)
    print(f"\n Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Run each suite in order; values are booleans.
    results = {
        "Rule Engine": test_rule_engine(),
        "Benchmark Research": test_benchmark_research(),
        "LLM Agents": test_llm_agents(),
        "Integration": asyncio.run(test_integration()),
    }

    print_section("TEST SUMMARY")

    total = len(results)
    passed = sum(map(bool, results.values()))

    for name, ok in results.items():
        print(f" {'βœ…' if ok else '❌'} {name}")

    print(f"\n Total: {passed}/{total} passed")

    if passed == total:
        print("\n πŸŽ‰ All tests passed!")
        return 0
    print("\n ⚠️ Some tests failed")
    return 1
659
+
660
+
661
if __name__ == "__main__":
    # Propagate the suite result as the process exit code (0 = success).
    sys.exit(main())