""" Stage 2 LLM Agents — v3 Agentic Architecture ============================================== Each agent: - Researches ALL token types (colors, typography, spacing, radius, shadows) - Uses ReAct framework: THINK → ACT → OBSERVE → VERIFY - Returns visible reasoning chain for the UI - Has a Python-based critic for validation Agents run IN PARALLEL (asyncio.gather), then NEXUS compiles. Agent Responsibilities: - AURORA: Brand identity + semantic naming for ALL colors + notes on all token types - SENTINEL: Best practices audit across ALL token types, grounded in rule-engine data - ATLAS: Benchmark comparison for ALL token types - NEXUS (HEAD): Tree-of-Thought synthesis, compiles all agent outputs """ import json import re from dataclasses import dataclass, field from typing import Optional, Callable, Any from datetime import datetime # ============================================================================= # DATA CLASSES — v3: includes reasoning_trace + naming_map # ============================================================================= @dataclass class BrandIdentification: """Results from AURORA — Brand Identifier (ReAct).""" brand_primary: dict = field(default_factory=dict) brand_secondary: dict = field(default_factory=dict) brand_accent: dict = field(default_factory=dict) palette_strategy: str = "" cohesion_score: int = 5 cohesion_notes: str = "" # v3: naming_map covers ALL colors, not just top 10 naming_map: dict = field(default_factory=dict) # {hex: "color.brand.primary"} or {hex: "color.blue.500"} semantic_names: dict = field(default_factory=dict) # backward compat self_evaluation: dict = field(default_factory=dict) # v3: reasoning trace visible to user reasoning_trace: list = field(default_factory=list) validation_passed: bool = False retry_count: int = 0 # v3: per-token-type observations typography_notes: str = "" spacing_notes: str = "" radius_notes: str = "" shadow_notes: str = "" def to_dict(self) -> dict: return { "brand_primary": 
self.brand_primary, "brand_secondary": self.brand_secondary, "brand_accent": self.brand_accent, "palette_strategy": self.palette_strategy, "cohesion_score": self.cohesion_score, "cohesion_notes": self.cohesion_notes, "naming_map": self.naming_map, "semantic_names": self.semantic_names, "self_evaluation": self.self_evaluation, "typography_notes": self.typography_notes, "spacing_notes": self.spacing_notes, "radius_notes": self.radius_notes, "shadow_notes": self.shadow_notes, } @dataclass class BenchmarkAdvice: """Results from ATLAS — Benchmark Advisor (ReAct).""" recommended_benchmark: str = "" recommended_benchmark_name: str = "" reasoning: str = "" alignment_changes: list = field(default_factory=list) pros_of_alignment: list = field(default_factory=list) cons_of_alignment: list = field(default_factory=list) alternative_benchmarks: list = field(default_factory=list) self_evaluation: dict = field(default_factory=dict) # v3: per-token-type benchmark comparison typography_comparison: dict = field(default_factory=dict) spacing_comparison: dict = field(default_factory=dict) color_comparison: dict = field(default_factory=dict) radius_comparison: dict = field(default_factory=dict) shadow_comparison: dict = field(default_factory=dict) reasoning_trace: list = field(default_factory=list) def to_dict(self) -> dict: return { "recommended_benchmark": self.recommended_benchmark, "recommended_benchmark_name": self.recommended_benchmark_name, "reasoning": self.reasoning, "alignment_changes": self.alignment_changes, "pros": self.pros_of_alignment, "cons": self.cons_of_alignment, "alternatives": self.alternative_benchmarks, "self_evaluation": self.self_evaluation, "typography_comparison": self.typography_comparison, "spacing_comparison": self.spacing_comparison, "color_comparison": self.color_comparison, "radius_comparison": self.radius_comparison, "shadow_comparison": self.shadow_comparison, } @dataclass class BestPracticesResult: """Results from SENTINEL — Best Practices Auditor 
@dataclass
class BestPracticesResult:
    """Container for the output of SENTINEL — Best Practices Auditor (ReAct)."""

    overall_score: int = 50
    checks: dict = field(default_factory=dict)
    priority_fixes: list = field(default_factory=list)
    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)
    self_evaluation: dict = field(default_factory=dict)

    # v3: per-token-type assessments
    color_assessment: dict = field(default_factory=dict)
    typography_assessment: dict = field(default_factory=dict)
    spacing_assessment: dict = field(default_factory=dict)
    radius_assessment: dict = field(default_factory=dict)
    shadow_assessment: dict = field(default_factory=dict)
    reasoning_trace: list = field(default_factory=list)
    validation_passed: bool = False

    def to_dict(self) -> dict:
        """Return the exported fields as a plain dict.

        ``passing_practices`` / ``failing_practices`` are exported as
        ``passing`` / ``failing``. ``reasoning_trace`` and
        ``validation_passed`` are not exported.
        """
        # (output key, attribute name) — order defines the dict's key order.
        layout = (
            ("overall_score", "overall_score"),
            ("checks", "checks"),
            ("priority_fixes", "priority_fixes"),
            ("passing", "passing_practices"),
            ("failing", "failing_practices"),
            ("self_evaluation", "self_evaluation"),
            ("color_assessment", "color_assessment"),
            ("typography_assessment", "typography_assessment"),
            ("spacing_assessment", "spacing_assessment"),
            ("radius_assessment", "radius_assessment"),
            ("shadow_assessment", "shadow_assessment"),
        )
        return {out_key: getattr(self, attr) for out_key, attr in layout}
choice_reasoning: str = "" reasoning_trace: list = field(default_factory=list) def to_dict(self) -> dict: return { "executive_summary": self.executive_summary, "scores": self.scores, "benchmark_fit": self.benchmark_fit, "brand_analysis": self.brand_analysis, "top_3_actions": self.top_3_actions, "color_recommendations": self.color_recommendations, "type_scale_recommendation": self.type_scale_recommendation, "spacing_recommendation": self.spacing_recommendation, "radius_recommendation": self.radius_recommendation, "shadow_recommendation": self.shadow_recommendation, "self_evaluation": self.self_evaluation, "chosen_perspective": self.chosen_perspective, "choice_reasoning": self.choice_reasoning, } # ============================================================================= # SHARED HELPERS — format token data for prompts # ============================================================================= def _fmt_colors(tokens: dict, limit: int = 40) -> str: """Format color tokens for any agent prompt.""" if not tokens: return "No color data" lines = [] for name, t in list(tokens.items())[:limit]: d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} hex_val = d.get("value", "") freq = d.get("frequency", 0) hint = d.get("role_hint", "") ctx = ", ".join((d.get("contexts") or [])[:3]) els = ", ".join((d.get("elements") or [])[:3]) hint_s = f" [hint:{hint}]" if hint else "" lines.append(f"- {hex_val}: {freq}x, ctx=[{ctx}], el=[{els}]{hint_s}") return "\n".join(lines) def _fmt_typography(tokens: dict, limit: int = 15) -> str: if not tokens: return "No typography data" lines = [] for name, t in list(tokens.items())[:limit]: d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} fam = d.get("font_family", "?") sz = d.get("font_size", "?") w = d.get("font_weight", 400) lh = d.get("line_height", "?") freq = d.get("frequency", 0) els = ", ".join((d.get("elements") or [])[:3]) lines.append(f"- {fam} {sz} w{w} lh={lh} ({freq}x) 
[{els}]") return "\n".join(lines) def _fmt_spacing(tokens: dict, limit: int = 15) -> str: if not tokens: return "No spacing data" lines = [] for name, t in list(tokens.items())[:limit]: d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} val = d.get("value", "?") px = d.get("value_px", "?") freq = d.get("frequency", 0) ctx = ", ".join((d.get("contexts") or [])[:3]) lines.append(f"- {val} ({px}px) {freq}x [{ctx}]") return "\n".join(lines) def _fmt_radius(tokens: dict, limit: int = 10) -> str: if not tokens: return "No radius data" lines = [] for name, t in list(tokens.items())[:limit]: d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} val = d.get("value", "?") px = d.get("value_px", "?") freq = d.get("frequency", 0) b4 = d.get("fits_base_4", False) b8 = d.get("fits_base_8", False) els = ", ".join((d.get("elements") or [])[:3]) lines.append(f"- {name}: {val} (base4={b4}, base8={b8}, {freq}x) [{els}]") return "\n".join(lines) def _fmt_shadows(tokens: dict, limit: int = 10) -> str: if not tokens: return "No shadow data" lines = [] for name, t in list(tokens.items())[:limit]: d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} blur = d.get("blur_px", "?") y = d.get("y_offset_px", "?") freq = d.get("frequency", 0) els = ", ".join((d.get("elements") or [])[:3]) lines.append(f"- {name}: blur={blur}px y={y}px ({freq}x) [{els}]") return "\n".join(lines) def _log_reasoning(steps: list, log_fn: Callable): """Log ReAct reasoning steps with full content (no truncation).""" icons = {"THINK": "🧠", "ACT": "⚡", "OBSERVE": "👁️", "VERIFY": "✅"} for step in (steps or []): if isinstance(step, dict): st = step.get("step", "?") area = step.get("area", "") content = step.get("content", "") icon = icons.get(st, "📝") # Show full reasoning — wrap long lines for readability if len(content) > 120: log_fn(f" {icon} [{st}] {area}:") # Word-wrap at ~100 chars per line words = content.split() line = " " for word in 
words: if len(line) + len(word) + 1 > 105: log_fn(line) line = " " + word else: line = line + " " + word if line.strip() else " " + word if line.strip(): log_fn(line) else: log_fn(f" {icon} [{st}] {area}: {content}") def _extract_hexes(tokens: dict) -> list: """Get list of hex values from color token dict.""" hexes = [] for name, t in tokens.items(): if isinstance(t, dict): h = t.get("value", "") else: h = getattr(t, "value", "") if h: hexes.append(h.lower()) return hexes # ============================================================================= # AURORA — Brand Identifier (ReAct Framework) # ============================================================================= class BrandIdentifierAgent: """ AURORA — Senior Brand & Visual Identity Analyst. v3.1: ADVISORY ONLY — does NOT name colors (rule-based classifier does that). Provides brand insights, palette strategy, cohesion assessment. Model: Qwen 72B · Temperature: 0.4 """ SYSTEM_PROMPT = """You are AURORA, a Senior Brand & Visual Identity Analyst. ## YOUR ROLE (v3.1: Advisory Only) Color NAMING is handled by a rule-based classifier. Do NOT output naming_map. Your job is to provide INSIGHTS about the brand identity and design cohesion. ## REASONING FRAMEWORK (ReAct) Structure your response with explicit reasoning steps. For each area: THINK → ACT → OBSERVE → VERIFY. ## ANALYZE ALL TOKEN TYPES: ### 1. COLORS — Identify brand strategy (complementary? analogous? monochromatic?) ### 2. TYPOGRAPHY — Identify heading vs body hierarchy, font pairing quality ### 3. SPACING — Identify grid system, note consistency ### 4. RADIUS — Identify radius strategy (sharp/rounded/pill) ### 5. SHADOWS — Identify elevation strategy, blur progression ## QUALITY RULES - Brand Primary MUST cite usage evidence (e.g. "47x on buttons") - Cohesion 1-10: most sites score 5-7. Use the full range. - Do NOT invent names. Focus on analysis and insights. 
# =============================================================================
# AURORA — Brand Identifier (ReAct Framework)
# =============================================================================

class BrandIdentifierAgent:
    """
    AURORA — Senior Brand & Visual Identity Analyst.

    v3.1: ADVISORY ONLY — does NOT name colors (rule-based classifier does that).
    Provides brand insights, palette strategy, cohesion assessment.

    Model: Qwen 72B · Temperature: 0.4
    """

    # NOTE(review): the prompt text is reproduced exactly as it appears in this
    # file — sections are separated by single spaces, not line breaks. Confirm
    # whether line breaks were intended before reformatting the prompts.
    SYSTEM_PROMPT = (
        "You are AURORA, a Senior Brand & Visual Identity Analyst. "
        "## YOUR ROLE (v3.1: Advisory Only) "
        "Color NAMING is handled by a rule-based classifier. Do NOT output naming_map. "
        "Your job is to provide INSIGHTS about the brand identity and design cohesion. "
        "## REASONING FRAMEWORK (ReAct) "
        "Structure your response with explicit reasoning steps. "
        "For each area: THINK → ACT → OBSERVE → VERIFY. "
        "## ANALYZE ALL TOKEN TYPES: "
        "### 1. COLORS — Identify brand strategy (complementary? analogous? monochromatic?) "
        "### 2. TYPOGRAPHY — Identify heading vs body hierarchy, font pairing quality "
        "### 3. SPACING — Identify grid system, note consistency "
        "### 4. RADIUS — Identify radius strategy (sharp/rounded/pill) "
        "### 5. SHADOWS — Identify elevation strategy, blur progression "
        "## QUALITY RULES "
        '- Brand Primary MUST cite usage evidence (e.g. "47x on buttons") '
        "- Cohesion 1-10: most sites score 5-7. Use the full range. "
        "- Do NOT invent names. Focus on analysis and insights. "
        "## OUTPUT (JSON) "
        "{ "
        '"reasoning_steps": [ '
        '{"step": "THINK", "area": "colors", "content": "..."}, '
        '{"step": "ACT", "area": "colors", "content": "..."}, '
        '{"step": "OBSERVE", "area": "typography", "content": "..."}, '
        '{"step": "ACT", "area": "spacing", "content": "..."}, '
        '{"step": "ACT", "area": "radius", "content": "..."}, '
        '{"step": "ACT", "area": "shadows", "content": "..."}, '
        '{"step": "VERIFY", "area": "all", "content": "Cross-checking consistency..."} '
        "], "
        '"brand_primary": {"color": "#hex", "confidence": "high|medium|low", '
        '"reasoning": "cite evidence", "usage_count": N}, '
        '"brand_secondary": {"color": "#hex", "confidence": "...", "reasoning": "..."}, '
        '"brand_accent": {"color": "#hex or null", "confidence": "...", "reasoning": "..."}, '
        '"palette_strategy": "complementary|analogous|triadic|monochromatic|random", '
        '"cohesion_score": N, '
        '"cohesion_notes": "...", '
        '"naming_map": {}, '
        "// Optional: ONLY semantic role suggestions (brand.primary, text.secondary, etc.) "
        '"typography_notes": "Heading: Inter 700, Body: Inter 400. Clean hierarchy.", '
        '"spacing_notes": "8px grid, 92% aligned.", '
        '"radius_notes": "Rounded style: 4px inputs, 8px cards.", '
        '"shadow_notes": "3-level elevation: blur 4/8/24px.", '
        '"self_evaluation": {"confidence": N, "reasoning": "...", '
        '"data_quality": "good|fair|poor", "flags": []} '
        "} "
        "Return ONLY valid JSON."
    )

    PROMPT_TEMPLATE = (
        "Analyze the complete design system. "
        "## COLORS (with role_hints) {color_data} "
        "## TYPOGRAPHY {typography_data} "
        "## SPACING {spacing_data} "
        "## RADIUS {radius_data} "
        "## SHADOWS {shadow_data} "
        "Use ReAct for each area. "
        "If you see clear semantic roles (brand primary, text color, etc.), "
        "suggest them in naming_map. "
        "Otherwise leave naming_map empty — the rule-based classifier handles naming."
    )

    def __init__(self, hf_client):
        """Store the client used for completions (must expose ``complete_async``)."""
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        typography_tokens: Optional[dict] = None,
        spacing_tokens: Optional[dict] = None,
        radius_tokens: Optional[dict] = None,
        shadow_tokens: Optional[dict] = None,
        log_callback: Optional[Callable] = None,
    ) -> BrandIdentification:
        """Run the AURORA analysis over all token types.

        The model response is parsed and checked by ``validate_aurora_output``;
        when the critic rejects it, one retry is made with the critic's errors
        appended to the prompt.

        Args:
            color_tokens: Color tokens (name -> dict/object).
            typography_tokens: Typography tokens, optional.
            spacing_tokens: Spacing tokens, optional.
            radius_tokens: Radius tokens, optional.
            shadow_tokens: Shadow tokens, optional.
            log_callback: Optional callable receiving progress lines.

        Returns:
            The parsed ``BrandIdentification``; a default instance when the
            completion call or post-processing raises.
        """
        def log(msg):
            if log_callback:
                log_callback(msg)

        log(" 🎨 AURORA — Brand & Visual Identity (Qwen 72B)")
        log(" └─ ReAct: Analyzing colors + typography + spacing + radius + shadows...")

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=_fmt_colors(color_tokens),
            typography_data=_fmt_typography(typography_tokens),
            spacing_data=_fmt_spacing(spacing_tokens),
            radius_data=_fmt_radius(radius_tokens),
            shadow_data=_fmt_shadows(shadow_tokens),
        )

        try:
            start = datetime.now()
            result = self._parse(await self._complete(prompt))

            # Critic validation
            input_hexes = _extract_hexes(color_tokens)
            passed, errors = validate_aurora_output(result, input_hexes)
            result.validation_passed = passed

            if not passed and result.retry_count == 0:
                log(f" ⚠️ Critic: {len(errors)} issues — retrying with feedback...")
                for e in errors[:3]:
                    log(f" └─ {e}")

                retry_prompt = (
                    prompt
                    + "\n\n## CRITIC FEEDBACK — Fix:\n"
                    + "\n".join(errors[:10])
                )
                result = self._parse(await self._complete(retry_prompt))
                result.retry_count = 1
                p2, e2 = validate_aurora_output(result, input_hexes)
                result.validation_passed = p2
                if not p2:
                    log(f" ⚠️ Retry: still {len(e2)} issues — using normalizer fallback names")

            # Measured after the optional retry so the logged time covers both calls.
            dur = (datetime.now() - start).total_seconds()

            # Log reasoning chain
            log(" ─────────────────────────────────────────")
            log(f" 🎨 AURORA — COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')})")
            log(f" ├─ Palette: {result.palette_strategy} · Cohesion: {result.cohesion_score}/10")
            log(f" ├─ Colors Named: {len(result.naming_map)}/{len(input_hexes)}")
            log(f" ├─ Typography: {result.typography_notes or 'N/A'}")
            log(f" ├─ Spacing: {result.spacing_notes or 'N/A'}")
            log(f" ├─ Radius: {result.radius_notes or 'N/A'}")
            log(f" ├─ Shadows: {result.shadow_notes or 'N/A'}")
            log(f" └─ Critic: {'✅ PASSED' if result.validation_passed else '⚠️ FALLBACK'}")
            return result

        except Exception as e:
            log(f" ⚠️ AURORA failed: {str(e)[:120]}")
            return BrandIdentification()

    async def _complete(self, user_message: str) -> str:
        """Send one completion request with AURORA's fixed settings."""
        return await self.hf_client.complete_async(
            agent_name="brand_identifier",
            system_prompt=self.SYSTEM_PROMPT,
            user_message=user_message,
            max_tokens=2000,
            json_mode=True,
        )

    @staticmethod
    def _coerce(data: dict, key: str, kind: type):
        """Return ``data[key]`` when it is a ``kind`` instance, else an empty ``kind``."""
        value = data.get(key)
        return value if isinstance(value, kind) else kind()

    def _parse(self, response: str) -> BrandIdentification:
        """Build a ``BrandIdentification`` from the model's raw response.

        The outermost ``{...}`` span is decoded as JSON. Dict/list fields that
        are missing, ``null`` or of the wrong type become empty containers so
        later ``.get()`` / ``len()`` calls on the result cannot raise.

        Returns:
            The parsed result, or a default instance when no JSON object is
            found or decoding fails.
        """
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                # One object feeds both fields, as the backward-compat alias expects.
                naming_map = self._coerce(d, "naming_map", dict)
                return BrandIdentification(
                    brand_primary=self._coerce(d, "brand_primary", dict),
                    brand_secondary=self._coerce(d, "brand_secondary", dict),
                    brand_accent=self._coerce(d, "brand_accent", dict),
                    palette_strategy=d.get("palette_strategy", "unknown"),
                    cohesion_score=d.get("cohesion_score", 5),
                    cohesion_notes=d.get("cohesion_notes", ""),
                    naming_map=naming_map,
                    semantic_names=naming_map,
                    self_evaluation=self._coerce(d, "self_evaluation", dict),
                    reasoning_trace=self._coerce(d, "reasoning_steps", list),
                    typography_notes=d.get("typography_notes", ""),
                    spacing_notes=d.get("spacing_notes", ""),
                    radius_notes=d.get("radius_notes", ""),
                    shadow_notes=d.get("shadow_notes", ""),
                )
        except Exception:
            # Deliberate best-effort: an undecodable response falls through to
            # the default result, which the caller then runs through the critic.
            pass
        return BrandIdentification()
# =============================================================================
# ATLAS — Benchmark Advisor (ReAct Framework)
# =============================================================================

class BenchmarkAdvisorAgent:
    """
    ATLAS — Senior Design System Benchmark Analyst.

    ReAct comparison of ALL token types against industry benchmarks.

    Model: Llama 3.3 70B · Temperature: 0.25
    """

    # NOTE(review): the prompt text is reproduced exactly as it appears in this
    # file — sections are separated by single spaces, not line breaks. Confirm
    # whether line breaks were intended before reformatting the prompts.
    SYSTEM_PROMPT = (
        "You are ATLAS, a Senior Design System Benchmark Analyst. "
        "## REASONING FRAMEWORK (ReAct) "
        "For EACH token type: THINK → ACT → OBSERVE → VERIFY. "
        "Compare the user's values against benchmarks for: "
        "1. TYPOGRAPHY — ratio, base size, scale pattern "
        "2. SPACING — grid base, alignment, scale "
        "3. COLORS — palette size, brand color usage "
        "4. RADIUS — strategy (sharp/rounded/pill), tier count "
        "5. SHADOWS — elevation levels, blur range "
        "Then pick the BEST OVERALL FIT benchmark. "
        'Max 4 alignment changes. If >85% match, say "already well-aligned". '
        "## OUTPUT (JSON) "
        "{ "
        '"reasoning_steps": [ '
        '{"step": "THINK", "area": "typography", "content": "User ratio 1.18 vs Material 1.25..."}, '
        '{"step": "ACT", "area": "typography", "content": "Material closest for type"}, '
        '{"step": "THINK", "area": "spacing", "content": "8px matches Material and Polaris"}, '
        '{"step": "ACT", "area": "spacing", "content": "Both aligned"}, '
        '{"step": "THINK", "area": "colors", "content": "25 colors vs Polaris 18..."}, '
        '{"step": "THINK", "area": "radius", "content": "4/8px tiers..."}, '
        '{"step": "THINK", "area": "shadows", "content": "3 levels vs Material 5..."}, '
        '{"step": "VERIFY", "area": "overall", "content": "Material best: 4/5 areas align"} '
        "], "
        '"recommended_benchmark": "material_design_3", '
        '"recommended_benchmark_name": "Material Design 3", '
        '"reasoning": "Best fit across all token types — cite data", '
        '"alignment_changes": [ '
        '{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium", '
        '"token_type": "typography"} '
        "], "
        '"typography_comparison": {"user": "1.18", "benchmark": "1.25", "gap": "minor"}, '
        '"spacing_comparison": {"user": "8px", "benchmark": "8px", "gap": "aligned"}, '
        '"color_comparison": {"user": "25", "benchmark": "18", "gap": "reduce"}, '
        '"radius_comparison": {"user": "2 tiers", "benchmark": "3 tiers", "gap": "add xl"}, '
        '"shadow_comparison": {"user": "3 levels", "benchmark": "5 levels", "gap": "add 2"}, '
        '"pros_of_alignment": ["..."], '
        '"cons_of_alignment": ["..."], '
        '"alternative_benchmarks": [{"name": "Polaris", "reason": "..."}], '
        '"self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []} '
        "} "
        "Return ONLY valid JSON."
    )

    PROMPT_TEMPLATE = (
        "Compare this design system against benchmarks — ALL token types. "
        "## CURRENT VALUES "
        "- Type Scale Ratio: {user_ratio} | Base: {user_base}px | Sizes: {user_sizes} "
        "- Spacing Grid: {user_spacing}px | Values: {spacing_values} "
        "- Colors: {color_count} unique | Brand: {brand_info} "
        "- Radius: {radius_data} "
        "- Shadows: {shadow_data} "
        "## BENCHMARKS {benchmark_comparison} "
        "Use ReAct per token type. Pick the best overall fit."
    )

    def __init__(self, hf_client):
        """Store the client used for completions (must expose ``complete_async``)."""
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        color_count: int = 0,
        brand_info: str = "",
        user_sizes: str = "",
        spacing_values: str = "",
        radius_data: str = "",
        shadow_data: str = "",
        log_callback: Optional[Callable] = None,
    ) -> BenchmarkAdvice:
        """Ask ATLAS which benchmark fits the current values best.

        Args:
            user_ratio: Detected type-scale ratio.
            user_base: Base font size in px.
            user_spacing: Spacing grid base in px.
            benchmark_comparisons: Comparison objects rendered by
                ``_fmt_benchmarks`` (only the first five are used).
            color_count: Number of unique colors.
            brand_info: Free-text brand summary; ``"N/A"`` when empty.
            user_sizes: Pre-formatted font sizes; ``"N/A"`` when empty.
            spacing_values: Pre-formatted spacing values; ``"N/A"`` when empty.
            radius_data: Pre-formatted radius data.
            shadow_data: Pre-formatted shadow data.
            log_callback: Optional callable receiving progress lines.

        Returns:
            The parsed ``BenchmarkAdvice``; a default instance when the
            completion call or post-processing raises.
        """
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)")
        log(" └─ ReAct: Comparing typography + spacing + colors + radius + shadows...")

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio,
            user_base=user_base,
            user_spacing=user_spacing,
            user_sizes=user_sizes or "N/A",
            spacing_values=spacing_values or "N/A",
            color_count=color_count,
            brand_info=brand_info or "N/A",
            radius_data=radius_data or "No radius data",
            shadow_data=shadow_data or "No shadow data",
            benchmark_comparison=self._fmt_benchmarks(benchmark_comparisons),
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1500,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            log(" ─────────────────────────────────────────")
            log(f" 🏢 ATLAS — COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes: {len(result.alignment_changes)}")
            log(f" ├─ Typography: {result.typography_comparison}")
            log(f" ├─ Spacing: {result.spacing_comparison}")
            log(f" ├─ Colors: {result.color_comparison}")
            log(f" ├─ Radius: {result.radius_comparison}")
            log(f" └─ Shadows: {result.shadow_comparison}")
            return result

        except Exception as e:
            log(f" ⚠️ ATLAS failed: {str(e)[:120]}")
            return BenchmarkAdvice()

    def _fmt_benchmarks(self, comparisons: list) -> str:
        """Render up to five benchmark comparisons as numbered prompt lines.

        Each entry is read through ``c.benchmark`` (``icon``, ``name``,
        ``typography``, ``spacing``, ``best_for``) and ``c.overall_match_pct``.
        Returns ``"No benchmark data"`` for an empty or ``None`` list.
        """
        lines = []
        for rank, c in enumerate((comparisons or [])[:5], start=1):
            b = c.benchmark
            lines.append(
                f"{rank}. {b.icon} {b.name} — Match: {c.overall_match_pct:.0f}%"
                f" | Type: {b.typography.get('scale_ratio', '?')}"
                f" | Spacing: {b.spacing.get('base', '?')}px"
                f" | Best for: {', '.join(b.best_for)}"
            )
        return "\n".join(lines) if lines else "No benchmark data"

    @staticmethod
    def _coerce(data: dict, key: str, kind: type):
        """Return ``data[key]`` when it is a ``kind`` instance, else an empty ``kind``."""
        value = data.get(key)
        return value if isinstance(value, kind) else kind()

    def _parse(self, response: str) -> BenchmarkAdvice:
        """Build a ``BenchmarkAdvice`` from the model's raw response.

        The outermost ``{...}`` span is decoded as JSON. List/dict fields that
        are missing, ``null`` or of the wrong type become empty containers so
        ``len()`` on the result cannot raise.

        Returns:
            The parsed result, or a default instance when no JSON object is
            found or decoding fails.
        """
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return BenchmarkAdvice(
                    recommended_benchmark=d.get("recommended_benchmark", ""),
                    recommended_benchmark_name=d.get("recommended_benchmark_name", ""),
                    reasoning=d.get("reasoning", ""),
                    alignment_changes=self._coerce(d, "alignment_changes", list),
                    pros_of_alignment=self._coerce(d, "pros_of_alignment", list),
                    cons_of_alignment=self._coerce(d, "cons_of_alignment", list),
                    alternative_benchmarks=self._coerce(d, "alternative_benchmarks", list),
                    self_evaluation=self._coerce(d, "self_evaluation", dict),
                    typography_comparison=self._coerce(d, "typography_comparison", dict),
                    spacing_comparison=self._coerce(d, "spacing_comparison", dict),
                    color_comparison=self._coerce(d, "color_comparison", dict),
                    radius_comparison=self._coerce(d, "radius_comparison", dict),
                    shadow_comparison=self._coerce(d, "shadow_comparison", dict),
                    reasoning_trace=self._coerce(d, "reasoning_steps", list),
                )
        except Exception:
            # Deliberate best-effort: an undecodable response yields the default result.
            pass
        return BenchmarkAdvice()
# =============================================================================
# SENTINEL — Best Practices Auditor (ReAct + Grounded Scoring)
# =============================================================================

class BestPracticesValidatorAgent:
    """
    SENTINEL — Design System Best Practices Auditor.

    ReAct: Grounds EVERY score in actual rule-engine data. Audits ALL token types.

    Model: Qwen 72B · Temperature: 0.2
    """

    # NOTE(review): the prompt text is reproduced exactly as it appears in this
    # file — sections are separated by single spaces, not line breaks. Confirm
    # whether line breaks were intended before reformatting the prompts.
    SYSTEM_PROMPT = (
        "You are SENTINEL, a Design System Best Practices Auditor. "
        "## REASONING FRAMEWORK (ReAct + Grounded) "
        "For EACH check: THINK → ACT (cite data) → OBSERVE → VERIFY. "
        "You MUST CITE the exact input data for every score. "
        "## AUDIT ALL TOKEN TYPES: "
        "### COLORS (25 pts) "
        "- aa_compliance: CITE AA pass/fail count "
        "- color_count: < 20 semantic colors ideal "
        "- near_duplicates: should be 0 "
        "### TYPOGRAPHY (25 pts) "
        "- type_scale_standard: nearest standard ratio "
        "- type_scale_consistent: variance check "
        "- base_size_accessible: >= 16px "
        "### SPACING (20 pts) "
        "- spacing_grid: 4px or 8px consistency "
        "- spacing_alignment: > 80% target "
        "### RADIUS (15 pts) "
        "- radius_consistency: base-4/8 grid, clear tiers "
        "### SHADOWS (15 pts) "
        "- shadow_system: elevation hierarchy, blur progression "
        '## CRITICAL: If data says 7 AA failures, you CANNOT say "pass". '
        "## OUTPUT (JSON) "
        "{ "
        '"reasoning_steps": [ '
        '{"step": "THINK", "area": "colors", "content": "7/25 fail AA = 28%"}, '
        '{"step": "ACT", "area": "colors", "content": "aa_compliance = FAIL"}, '
        '{"step": "THINK", "area": "typography", "content": "ratio 1.18, variance 0.22"}, '
        '{"step": "ACT", "area": "typography", "content": "type_scale_consistent = WARN"}, '
        '{"step": "THINK", "area": "spacing", "content": "8px base, 85% aligned"}, '
        '{"step": "ACT", "area": "spacing", "content": "spacing_grid = PASS"}, '
        '{"step": "THINK", "area": "radius", "content": "4px,8px,16px all base-4"}, '
        '{"step": "ACT", "area": "radius", "content": "radius_consistency = PASS"}, '
        '{"step": "THINK", "area": "shadows", "content": "3 levels, blur 4→8→24"}, '
        '{"step": "ACT", "area": "shadows", "content": "shadow_system = WARN"}, '
        '{"step": "VERIFY", "area": "scoring", "content": "3 pass, 2 warn, 1 fail → 62/100"} '
        "], "
        '"overall_score": N, '
        '"checks": { '
        '"aa_compliance": {"status": "pass|warn|fail", "note": "CITE: 7/25 fail AA"}, '
        '"type_scale_standard": {"status": "...", "note": "CITE: ratio 1.18 nearest 1.2"}, '
        '"type_scale_consistent": {"status": "...", "note": "CITE: variance 0.22 > 0.15"}, '
        '"base_size_accessible": {"status": "...", "note": "CITE: base = Npx"}, '
        '"spacing_grid": {"status": "...", "note": "CITE: N% aligned to Npx"}, '
        '"color_count": {"status": "...", "note": "CITE: N unique colors"}, '
        '"near_duplicates": {"status": "...", "note": "CITE: N pairs"}, '
        '"radius_consistency": {"status": "...", "note": "CITE: tiers and grid"}, '
        '"shadow_system": {"status": "...", "note": "CITE: N levels, progression"} '
        "}, "
        '"color_assessment": {"aa_pass_rate": "72%", "palette_size": 25, "verdict": "needs work"}, '
        '"typography_assessment": {"ratio": 1.18, "consistent": false, "base_ok": true, '
        '"verdict": "fair"}, '
        '"spacing_assessment": {"grid": "8px", "alignment": "85%", "verdict": "good"}, '
        '"radius_assessment": {"tiers": 3, "base_aligned": true, "verdict": "good"}, '
        '"shadow_assessment": {"levels": 3, "progression": "non-linear", "verdict": "fair"}, '
        '"priority_fixes": [ '
        '{"rank": 1, "issue": "...", "impact": "high", "effort": "low", '
        '"action": "Specific fix", "token_type": "color"} '
        "], "
        '"passing_practices": ["spacing_grid"], '
        '"failing_practices": ["aa_compliance"], '
        '"self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []} '
        "} "
        "Return ONLY valid JSON."
    )

    PROMPT_TEMPLATE = (
        "Audit this design system. CITE the data for every score. "
        "## RULE ENGINE FACTS (verified) "
        "### Typography "
        "- Ratio: {type_ratio} ({type_consistent}) | Base: {base_size}px | Sizes: {sizes} "
        "### Accessibility "
        "- Total: {total_colors} | AA Pass: {aa_pass} | AA Fail: {aa_fail} "
        "- Failing: {failing_colors} "
        "### Spacing "
        "- Base: {spacing_base}px | Aligned: {spacing_aligned}% | Values: {spacing_values} "
        "### Color Stats "
        "- Unique: {unique_colors} | Near-Duplicates: {near_duplicates} "
        "### Radius {radius_data} "
        "### Shadows {shadow_data} "
        "CITE the EXACT numbers above for every check."
    )

    # Icon shown next to each check status in the log summary.
    _STATUS_ICONS = {"pass": "✅", "warn": "⚠️", "fail": "❌"}

    def __init__(self, hf_client):
        """Store the client used for completions (must expose ``complete_async``)."""
        self.hf_client = hf_client

    async def analyze(
        self,
        rule_engine_results: Any,
        radius_tokens: Optional[dict] = None,
        shadow_tokens: Optional[dict] = None,
        log_callback: Optional[Callable] = None,
    ) -> BestPracticesResult:
        """Audit the design system against the rule-engine facts.

        The prompt is built from ``rule_engine_results`` (its ``typography``,
        ``spacing``, ``color_stats`` and ``accessibility`` attributes). The
        parsed response is cross-checked by ``validate_sentinel_output``; when
        that fails, ``_apply_sentinel_fixes`` produces the returned result.

        Args:
            rule_engine_results: Rule-engine output object (attributes above).
            radius_tokens: Radius tokens, optional.
            shadow_tokens: Shadow tokens, optional.
            log_callback: Optional callable receiving progress lines.

        Returns:
            The audited ``BestPracticesResult``; a default instance when the
            completion call or post-processing raises. Errors while reading
            ``rule_engine_results`` for the prompt are not caught here.
        """
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("")
        log(" ✅ SENTINEL — Best Practices Auditor (Qwen 72B)")
        log(" └─ ReAct: Auditing colors + typography + spacing + radius + shadows...")

        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        failures = [a for a in accessibility if not a.passes_aa_normal]
        failing_str = ", ".join(
            f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:8]
        )
        sizes_str = ", ".join(f"{s}px" for s in typo.sizes_px[:8]) if typo.sizes_px else "N/A"
        # The spacing result may not carry current_values at all.
        current_values = getattr(spacing, "current_values", None)
        sp_vals = ", ".join(f"{v}px" for v in current_values[:10]) if current_values else "N/A"

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_consistent="consistent" if typo.is_consistent else f"inconsistent (var={typo.variance:.2f})",
            # NOTE(review): treats the first entry of sizes_px as the base size — confirm ordering.
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            sizes=sizes_str,
            total_colors=len(accessibility),
            aa_pass=len(accessibility) - len(failures),
            aa_fail=len(failures),
            failing_colors=failing_str or "None",
            spacing_base=spacing.detected_base,
            spacing_aligned=f"{spacing.alignment_percentage:.0f}",
            spacing_values=sp_vals,
            unique_colors=color_stats.unique_count,
            near_duplicates=len(color_stats.near_duplicates),
            radius_data=_fmt_radius(radius_tokens) if radius_tokens else "No radius data",
            shadow_data=_fmt_shadows(shadow_tokens) if shadow_tokens else "No shadow data",
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="best_practices_validator",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=2000,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            # Critic cross-reference
            passed, errors = validate_sentinel_output(result, rule_engine_results)
            result.validation_passed = passed
            if not passed:
                log(f" ⚠️ Critic: {len(errors)} issues — applying fixes...")
                for e in errors[:3]:
                    log(f" └─ {e}")
                result = _apply_sentinel_fixes(result, rule_engine_results, errors)

            log(" ─────────────────────────────────────────")
            log(f" ✅ SENTINEL — COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            log(f" ├─ Overall Score: {result.overall_score}/100")
            for cn, cv in (result.checks or {}).items():
                if isinstance(cv, dict):
                    s = cv.get("status", "?")
                    # A non-string status cannot be looked up; show the default marker.
                    si = self._STATUS_ICONS.get(s, "?") if isinstance(s, str) else "?"
                    log(f" │ {si} {cn}: {s}")
            log(f" ├─ Priority Fixes: {len(result.priority_fixes)}")
            log(f" └─ Critic: {'✅ PASSED' if result.validation_passed else '⚠️ FIXED'}")
            return result

        except Exception as e:
            log(f" ⚠️ SENTINEL failed: {str(e)[:120]}")
            return BestPracticesResult()

    @staticmethod
    def _coerce(data: dict, key: str, kind: type):
        """Return ``data[key]`` when it is a ``kind`` instance, else an empty ``kind``."""
        value = data.get(key)
        return value if isinstance(value, kind) else kind()

    def _parse(self, response: str) -> BestPracticesResult:
        """Build a ``BestPracticesResult`` from the model's raw response.

        The outermost ``{...}`` span is decoded as JSON. List/dict fields that
        are missing, ``null`` or of the wrong type become empty containers so
        ``len()`` / ``.items()`` on the result cannot raise.

        Returns:
            The parsed result, or a default instance when no JSON object is
            found or decoding fails.
        """
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return BestPracticesResult(
                    overall_score=d.get("overall_score", 50),
                    checks=self._coerce(d, "checks", dict),
                    priority_fixes=self._coerce(d, "priority_fixes", list),
                    passing_practices=self._coerce(d, "passing_practices", list),
                    failing_practices=self._coerce(d, "failing_practices", list),
                    self_evaluation=self._coerce(d, "self_evaluation", dict),
                    color_assessment=self._coerce(d, "color_assessment", dict),
                    typography_assessment=self._coerce(d, "typography_assessment", dict),
                    spacing_assessment=self._coerce(d, "spacing_assessment", dict),
                    radius_assessment=self._coerce(d, "radius_assessment", dict),
                    shadow_assessment=self._coerce(d, "shadow_assessment", dict),
                    reasoning_trace=self._coerce(d, "reasoning_steps", list),
                )
        except Exception:
            # Deliberate best-effort: an undecodable response yields the default result.
            pass
        return BestPracticesResult()
## SYNTHESIZE ALL TOKEN TYPES: - Colors: AURORA brand + SENTINEL AA findings → color recommendations - Typography: ATLAS benchmark match + SENTINEL scale audit → type scale rec - Spacing: ATLAS grid comparison + SENTINEL alignment → spacing rec - Radius: SENTINEL consistency + ATLAS benchmark → radius rec - Shadows: SENTINEL elevation + ATLAS benchmark → shadow rec ## OUTPUT (JSON) { "reasoning_steps": [ {"step": "THINK", "area": "perspective_a", "content": "Accessibility-first weighting..."}, {"step": "ACT", "area": "perspective_a", "content": "Score: overall=52..."}, {"step": "THINK", "area": "perspective_b", "content": "Balanced weighting..."}, {"step": "ACT", "area": "perspective_b", "content": "Score: overall=63..."}, {"step": "OBSERVE", "area": "comparison", "content": "A shows severity of AA failures..."}, {"step": "VERIFY", "area": "decision", "content": "Choosing A — honest about AA issues"} ], "perspective_a": {"scores": {"overall": 52, "accessibility": 38, "consistency": 72, "organization": 68}, "reasoning": "..."}, "perspective_b": {"scores": {"overall": 63, "accessibility": 45, "consistency": 72, "organization": 68}, "reasoning": "..."}, "chosen_perspective": "A", "choice_reasoning": "AA failures affect real users — lower score is more honest", "executive_summary": "Your design system scores X/100...", "scores": {"overall": 52, "accessibility": 38, "consistency": 72, "organization": 68}, "top_3_actions": [ {"action": "Fix AA compliance", "impact": "high", "effort": "medium", "details": "#X→#Y", "token_type": "color"} ], "color_recommendations": [ {"role": "brand.primary", "current": "#hex", "suggested": "#hex", "reason": "AA", "accept": true} ], "type_scale_recommendation": {"current_ratio": 1.18, "recommended_ratio": 1.25, "reason": "..."}, "spacing_recommendation": {"current": "8px", "recommended": "8px", "reason": "Already aligned"}, "radius_recommendation": {"current": "3 tiers", "recommended": "Add xl tier", "reason": "..."}, 
"shadow_recommendation": {"current": "3 levels", "recommended": "Add 2 more", "reason": "..."}, "benchmark_fit": {"closest": "Material", "similarity": "78%", "recommendation": "..."}, "brand_analysis": {"primary": "#hex", "secondary": "#hex", "cohesion": 7}, "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []} } Return ONLY valid JSON.""" PROMPT_TEMPLATE = """Synthesize all analysis into a final report. ## RULE ENGINE FACTS - Type: {type_ratio} ({type_status}) | Base: {base_size}px - AA Failures: {aa_failures}/{total_colors} - Spacing: {spacing_status} - Colors: {unique_colors} unique | Consistency: {consistency_score}/100 - Radius: {radius_facts} - Shadows: {shadow_facts} ## AURORA — Brand Analysis - Primary: {brand_primary} ({brand_confidence}) | Secondary: {brand_secondary} - Palette: {palette_strategy} | Cohesion: {cohesion_score}/10 - Typography: {aurora_typo} - Spacing: {aurora_spacing} - Radius: {aurora_radius} - Shadows: {aurora_shadows} ## ATLAS — Benchmark - Closest: {closest_benchmark} ({match_pct}%) - Typo: {atlas_typo} | Spacing: {atlas_spacing} | Colors: {atlas_colors} - Radius: {atlas_radius} | Shadows: {atlas_shadows} - Changes: {benchmark_changes} ## SENTINEL — Audit - Score: {best_practices_score}/100 - Color: {sentinel_color} | Typo: {sentinel_typo} | Spacing: {sentinel_spacing} - Radius: {sentinel_radius} | Shadows: {sentinel_shadows} - Fixes: {priority_fixes} ## AA FIXES NEEDED {accessibility_fixes} Evaluate from TWO perspectives (Tree of Thought). Choose one. 
Recommend for ALL token types.""" def __init__(self, hf_client): self.hf_client = hf_client async def synthesize( self, rule_engine_results: Any, benchmark_comparisons: list, brand_identification: BrandIdentification, benchmark_advice: BenchmarkAdvice, best_practices: BestPracticesResult, log_callback: Callable = None, ) -> HeadSynthesis: def log(msg): if log_callback: log_callback(msg) log("") log("═" * 60) log("🧠 NEXUS — HEAD SYNTHESIZER (Tree of Thought)") log("═" * 60) log(" Evaluating Perspective A (Accessibility-First) vs B (Balanced)...") log(" Compiling: Rule Engine + AURORA + ATLAS + SENTINEL...") typo = rule_engine_results.typography spacing = rule_engine_results.spacing color_stats = rule_engine_results.color_stats accessibility = rule_engine_results.accessibility failures = [a for a in accessibility if not a.passes_aa_normal] aa_fixes_str = "\n".join([ f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)" for a in failures[:8] if a.suggested_fix ]) closest = benchmark_comparisons[0] if benchmark_comparisons else None def _s(obj): """Safely stringify a dict/value for prompt.""" if isinstance(obj, dict): parts = [f"{k}={v}" for k, v in list(obj.items())[:4]] return ", ".join(parts) if parts else "N/A" return str(obj) if obj else "N/A" prompt = self.PROMPT_TEMPLATE.format( type_ratio=f"{typo.detected_ratio:.3f}", type_status="consistent" if typo.is_consistent else "inconsistent", base_size=typo.sizes_px[0] if typo.sizes_px else 16, aa_failures=len(failures), total_colors=len(accessibility), spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned", unique_colors=color_stats.unique_count, consistency_score=rule_engine_results.consistency_score, radius_facts=_s(best_practices.radius_assessment) or "N/A", shadow_facts=_s(best_practices.shadow_assessment) or "N/A", brand_primary=brand_identification.brand_primary.get("color", "?"), 
brand_confidence=brand_identification.brand_primary.get("confidence", "?"), brand_secondary=brand_identification.brand_secondary.get("color", "?"), palette_strategy=brand_identification.palette_strategy, cohesion_score=brand_identification.cohesion_score, aurora_typo=brand_identification.typography_notes or "N/A", aurora_spacing=brand_identification.spacing_notes or "N/A", aurora_radius=brand_identification.radius_notes or "N/A", aurora_shadows=brand_identification.shadow_notes or "N/A", closest_benchmark=closest.benchmark.name if closest else "?", match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0", atlas_typo=_s(benchmark_advice.typography_comparison), atlas_spacing=_s(benchmark_advice.spacing_comparison), atlas_colors=_s(benchmark_advice.color_comparison), atlas_radius=_s(benchmark_advice.radius_comparison), atlas_shadows=_s(benchmark_advice.shadow_comparison), benchmark_changes="; ".join([c.get("change", "") for c in benchmark_advice.alignment_changes[:4]]), best_practices_score=best_practices.overall_score, sentinel_color=_s(best_practices.color_assessment), sentinel_typo=_s(best_practices.typography_assessment), sentinel_spacing=_s(best_practices.spacing_assessment), sentinel_radius=_s(best_practices.radius_assessment), sentinel_shadows=_s(best_practices.shadow_assessment), priority_fixes="; ".join([f.get("issue", "") for f in best_practices.priority_fixes[:5]]), accessibility_fixes=aa_fixes_str or "None needed", ) try: start = datetime.now() response = await self.hf_client.complete_async( agent_name="head_synthesizer", system_prompt=self.SYSTEM_PROMPT, user_message=prompt, max_tokens=2500, json_mode=True, ) dur = (datetime.now() - start).total_seconds() result = self._parse(response) log("") log(f" 🧠 NEXUS — COMPLETE ({dur:.1f}s)") _log_reasoning(result.reasoning_trace, log) pa = result.perspective_a.get("scores", {}).get("overall", "?") if result.perspective_a else "?" 
pb = result.perspective_b.get("scores", {}).get("overall", "?") if result.perspective_b else "?" log(f" ├─ Perspective A: {pa}/100") log(f" ├─ Perspective B: {pb}/100") log(f" ├─ Chosen: {result.chosen_perspective}") log(f" ├─ Why: {result.choice_reasoning or 'N/A'}") log(f" ├─ Final Score: {result.scores.get('overall', '?')}/100" if result.scores else " ├─ Scores: N/A") log(f" ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}") log(f" ├─ Typography: {_s(result.type_scale_recommendation)}") log(f" ├─ Spacing: {_s(result.spacing_recommendation)}") log(f" ├─ Radius: {_s(result.radius_recommendation)}") log(f" └─ Shadows: {_s(result.shadow_recommendation)}") log("") return result except Exception as e: log(f" ⚠️ NEXUS failed: {str(e)[:120]}") return HeadSynthesis() def _parse(self, response: str) -> HeadSynthesis: try: m = re.search(r'\{[\s\S]*\}', response) if m: d = json.loads(m.group()) return HeadSynthesis( executive_summary=d.get("executive_summary", ""), scores=d.get("scores", {}), benchmark_fit=d.get("benchmark_fit", {}), brand_analysis=d.get("brand_analysis", {}), top_3_actions=d.get("top_3_actions", []), color_recommendations=d.get("color_recommendations", []), type_scale_recommendation=d.get("type_scale_recommendation", {}), spacing_recommendation=d.get("spacing_recommendation", {}), radius_recommendation=d.get("radius_recommendation", {}), shadow_recommendation=d.get("shadow_recommendation", {}), self_evaluation=d.get("self_evaluation", {}), perspective_a=d.get("perspective_a", {}), perspective_b=d.get("perspective_b", {}), chosen_perspective=d.get("chosen_perspective", ""), choice_reasoning=d.get("choice_reasoning", ""), reasoning_trace=d.get("reasoning_steps", []), ) except Exception: pass return HeadSynthesis() # ============================================================================= # CRITIC / VALIDATOR FUNCTIONS (Rule-based, no LLM) # 
# =============================================================================


def _is_number(value) -> bool:
    """True for real int/float values; bools and NaN are rejected."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return value == value  # NaN is the only number unequal to itself


def _coerce_score(value, default=50):
    """Return ``value`` as a number, parsing numeric strings; else ``default``."""
    if _is_number(value):
        return value
    if isinstance(value, str):
        try:
            parsed = float(value.strip())
        except ValueError:
            return default
        if parsed != parsed:  # "nan"
            return default
        return int(parsed) if parsed.is_integer() else parsed
    return default


def validate_aurora_output(output: "BrandIdentification", input_hexes: list) -> tuple:
    """Validate AURORA naming_map. Returns (passed, errors).

    Checks, in this order: every input hex has a name (hex comparison is
    case-insensitive), no word-based shade parts, no duplicate names, and
    every name follows ``color.X.Y`` (starts with ``color.`` and has at
    least three dot-separated parts). Non-string names are reported instead
    of raising.
    """
    errors = []
    nm = output.naming_map or {}

    # All input colors must have names
    known_hexes = {str(key).lower() for key in nm}
    for h in input_hexes:
        if str(h).lower() not in known_hexes:
            errors.append(f"Missing name for {h}")

    # No word-based shades
    bad_words = {"light", "dark", "base", "muted", "deep", "lighter", "darker"}
    for h, name in nm.items():
        if not isinstance(name, str):
            errors.append(f"Non-string name for {h}: {name!r}")
            continue
        for part in name.split("."):
            if part.lower() in bad_words:
                errors.append(f"Word shade '{part}' in {name}")

    # The remaining checks only make sense for string names; non-strings
    # were already reported above.
    names = [n for n in nm.values() if isinstance(n, str)]

    # No duplicates
    seen = set()
    for n in names:
        if n in seen:
            errors.append(f"Duplicate: {n}")
        seen.add(n)

    # Convention: color.X.Y
    for name in names:
        if not name.startswith("color."):
            errors.append(f"'{name}' must start with 'color.'")
        if len(name.split(".")) < 3:
            errors.append(f"'{name}' needs 3+ parts")

    return len(errors) == 0, errors


def validate_sentinel_output(output: "BestPracticesResult", rule_engine) -> tuple:
    """Cross-reference SENTINEL scores against rule engine data.

    Returns ``(passed, errors)``. Flags: ``aa_compliance`` marked "pass"
    while the rule engine reports AA failures; an ``overall_score`` that is
    not a number or lies outside 0-100; a score above 70 with three or more
    failed checks; ``base_size_accessible`` marked "pass" while the first
    detected font size is below 16px.
    """
    errors = []
    checks = output.checks if isinstance(output.checks, dict) else {}

    aa_failures = sum(1 for a in rule_engine.accessibility if not a.passes_aa_normal)
    aa_check = checks.get("aa_compliance", {})
    if aa_failures > 0 and isinstance(aa_check, dict) and aa_check.get("status") == "pass":
        errors.append(f"aa_compliance='pass' but {aa_failures} fail AA")

    # The score comes straight from LLM JSON and may be a string or null;
    # report that instead of letting the comparison raise TypeError.
    score = output.overall_score
    if not _is_number(score):
        errors.append(f"Score {score!r} is not numeric")
        score = None
    elif not (0 <= score <= 100):
        errors.append(f"Score {score} out of 0-100 range")

    fail_count = sum(
        1 for c in checks.values() if isinstance(c, dict) and c.get("status") == "fail"
    )
    if score is not None and fail_count >= 3 and score > 70:
        errors.append(f"Score {score} too high with {fail_count} failures")

    typo = rule_engine.typography
    base_size = typo.sizes_px[0] if typo.sizes_px else 16
    base_check = checks.get("base_size_accessible", {})
    if base_size < 16 and isinstance(base_check, dict) and base_check.get("status") == "pass":
        errors.append(f"base_size 'pass' but {base_size}px < 16")

    return len(errors) == 0, errors


def _apply_sentinel_fixes(result: "BestPracticesResult", rule_engine, errors: list) -> "BestPracticesResult":
    """Deterministic fixes when critic finds issues.

    Mutates and returns ``result``. Every error kind produced by
    ``validate_sentinel_output`` is corrected, so the returned object passes
    that validator again:

    * ``aa_compliance`` / ``base_size_accessible`` wrongly marked "pass"
      are set to "fail" with a ``CORRECTED`` note;
    * a non-numeric score is parsed if possible, otherwise replaced by 50;
    * the score is capped at ``max(30, 100 - 15 * failed_checks)`` when it
      is too high for the number of failed checks (counted *after* the
      status corrections above);
    * the score is clamped to 0-100.
    """
    failures = [a for a in rule_engine.accessibility if not a.passes_aa_normal]
    checks = result.checks if isinstance(result.checks, dict) else {}
    score = _coerce_score(result.overall_score, default=50)

    # Pass 1: status corrections (they can change the failed-check count).
    for err in errors:
        if "aa_compliance" in err and "pass" in err:
            aa_check = checks.get("aa_compliance")
            if isinstance(aa_check, dict):
                aa_check["status"] = "fail"
                aa_check["note"] = f"CORRECTED: {len(failures)} fail AA"
        if err.startswith("base_size") and "pass" in err:
            base_check = checks.get("base_size_accessible")
            if isinstance(base_check, dict):
                typo = rule_engine.typography
                base_size = typo.sizes_px[0] if typo.sizes_px else 16
                base_check["status"] = "fail"
                base_check["note"] = f"CORRECTED: base size {base_size}px < 16"

    # Pass 2: score cap, using the corrected statuses.
    fail_count = sum(
        1 for c in checks.values() if isinstance(c, dict) and c.get("status") == "fail"
    )
    too_high_reported = any("too high" in err.lower() for err in errors)
    if too_high_reported or (fail_count >= 3 and score > 70):
        score = min(score, max(30, 100 - fail_count * 15))

    result.overall_score = max(0, min(100, score))
    result.validation_passed = True
    return result


def filter_aurora_naming_map(aurora: "BrandIdentification") -> dict:
    """Filter AURORA naming_map to only keep semantic role assignments.

    AURORA is a secondary naming authority — it can assign semantic roles
    (brand.primary, text.secondary, bg.primary, feedback.error, etc.)
    but cannot override palette names (blue.500, neutral.700, etc.).
    The color_classifier is the primary naming authority.

    Keys are normalized to stripped lower-case hex strings; entries whose
    key does not start with ``#`` or whose name is empty or not a string
    are dropped. Names lacking the ``color.`` prefix get it added.

    Returns:
        Dict of hex -> semantic_name (only role-based names).
    """
    SEMANTIC_PREFIXES = ('brand.', 'text.', 'bg.', 'border.', 'feedback.')
    COLOR_PREFIX = 'color.'

    filtered = {}
    for hex_val, name in (aurora.naming_map or {}).items():
        hex_clean = str(hex_val).strip().lower()
        if not hex_clean.startswith('#') or not name or not isinstance(name, str):
            continue
        clean_name = name if name.startswith(COLOR_PREFIX) else f'{COLOR_PREFIX}{name}'
        # Part after "color.": "brand.primary", "blue.500", etc.
        role = clean_name[len(COLOR_PREFIX):]
        if role.startswith(SEMANTIC_PREFIXES):
            filtered[hex_clean] = clean_name
    return filtered


def post_validate_stage2(
    aurora: "BrandIdentification",
    sentinel: "BestPracticesResult",
    nexus: "HeadSynthesis",
    rule_engine: Any,
) -> list:
    """Final deterministic checks after ALL agents. Returns issues list.

    Checks AURORA names against ``color.<word>.<word>``, NEXUS scores
    against the 0-100 range, the NEXUS accessibility score against the
    rule engine's AA failure count, and that colour recommendations use
    ``#``-prefixed values. ``sentinel`` is accepted for interface
    compatibility but not inspected. Malformed LLM values (non-string
    names, non-dict recommendations, non-numeric scores) are reported or
    skipped rather than raising.
    """
    issues = []

    for name in (aurora.naming_map or {}).values():
        if not isinstance(name, str) or not re.match(r'^color\.\w+\.[\w]+$', name):
            issues.append(f"Bad name: {name}")

    scores = nexus.scores if isinstance(nexus.scores, dict) else {}
    for key, val in scores.items():
        if isinstance(val, (int, float)) and not (0 <= val <= 100):
            issues.append(f"Score {key}={val} OOB")

    aa_failures = sum(1 for a in rule_engine.accessibility if not a.passes_aa_normal)
    n_acc = scores.get("accessibility", 50)
    if _is_number(n_acc) and aa_failures > 3 and n_acc > 85:
        issues.append(f"Nexus accessibility={n_acc} but {aa_failures} AA failures")

    for rec in (nexus.color_recommendations or []):
        if not isinstance(rec, dict):
            continue
        # "key", not "field": avoids shadowing dataclasses.field.
        for key in ("current", "suggested"):
            v = rec.get(key, "")
            if v and not str(v).startswith("#"):
                issues.append(f"Color rec {key} missing #: {v}")

    return issues