""" Stage 2 LLM Agents — Specialized Analysis Tasks ================================================= These agents handle tasks that REQUIRE LLM reasoning: - Brand Identifier: Identify brand colors from usage context - Benchmark Advisor: Recommend best-fit design system - Best Practices Validator: Prioritize fixes by business impact - HEAD Synthesizer: Combine all outputs into final recommendations Each agent has a focused prompt for its specific task. """ import json import re from dataclasses import dataclass, field from typing import Optional, Callable, Any from datetime import datetime # ============================================================================= # DATA CLASSES # ============================================================================= @dataclass class BrandIdentification: """Results from Brand Identifier agent (AURORA).""" brand_primary: dict = field(default_factory=dict) # {color, confidence, reasoning, usage_count} brand_secondary: dict = field(default_factory=dict) brand_accent: dict = field(default_factory=dict) palette_strategy: str = "" # complementary, analogous, triadic, monochromatic, random cohesion_score: int = 5 # 1-10 cohesion_notes: str = "" semantic_names: dict = field(default_factory=dict) # {hex_color: suggested_name} self_evaluation: dict = field(default_factory=dict) # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []} def to_dict(self) -> dict: return { "brand_primary": self.brand_primary, "brand_secondary": self.brand_secondary, "brand_accent": self.brand_accent, "palette_strategy": self.palette_strategy, "cohesion_score": self.cohesion_score, "cohesion_notes": self.cohesion_notes, "semantic_names": self.semantic_names, "self_evaluation": self.self_evaluation, } @dataclass class BenchmarkAdvice: """Results from Benchmark Advisor agent.""" recommended_benchmark: str = "" recommended_benchmark_name: str = "" reasoning: str = "" alignment_changes: list = field(default_factory=list) # [{change, 
from, to, effort}] pros_of_alignment: list = field(default_factory=list) cons_of_alignment: list = field(default_factory=list) alternative_benchmarks: list = field(default_factory=list) # [{name, reason}] self_evaluation: dict = field(default_factory=dict) # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []} def to_dict(self) -> dict: return { "recommended_benchmark": self.recommended_benchmark, "recommended_benchmark_name": self.recommended_benchmark_name, "reasoning": self.reasoning, "alignment_changes": self.alignment_changes, "pros": self.pros_of_alignment, "cons": self.cons_of_alignment, "alternatives": self.alternative_benchmarks, "self_evaluation": self.self_evaluation, } @dataclass class BestPracticesResult: """Results from Best Practices Validator agent.""" overall_score: int = 50 # 0-100 checks: dict = field(default_factory=dict) # {check_name: {status: pass/warn/fail, note: str}} priority_fixes: list = field(default_factory=list) # [{rank, issue, impact, effort, action}] passing_practices: list = field(default_factory=list) failing_practices: list = field(default_factory=list) self_evaluation: dict = field(default_factory=dict) # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []} def to_dict(self) -> dict: return { "overall_score": self.overall_score, "checks": self.checks, "priority_fixes": self.priority_fixes, "passing": self.passing_practices, "failing": self.failing_practices, "self_evaluation": self.self_evaluation, } @dataclass class HeadSynthesis: """Final synthesized output from HEAD agent.""" executive_summary: str = "" scores: dict = field(default_factory=dict) # {overall, accessibility, consistency, organization} benchmark_fit: dict = field(default_factory=dict) # {closest, similarity, recommendation} brand_analysis: dict = field(default_factory=dict) # {primary, secondary, cohesion} top_3_actions: list = field(default_factory=list) # [{action, impact, effort, details}] color_recommendations: 
# =============================================================================
# BRAND IDENTIFIER AGENT
# =============================================================================

class BrandIdentifierAgent:
    """
    AURORA — Senior Brand Color Analyst.

    Identifies brand colors from usage context using creative/visual reasoning.

    Model: Qwen 72B (strong creative reasoning, color harmony assessment)
    Temperature: 0.4 (allows creative interpretation of color stories)

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    SYSTEM_PROMPT = """You are AURORA, a Senior Brand Color Analyst specializing in visual identity systems.

## YOUR ROLE IN THE PIPELINE
You are Agent 1 of 4 in the Design System Analysis pipeline.
- INPUT: Raw color tokens with usage counts + semantic CSS analysis from Stage 1 extraction
- OUTPUT: Brand color identification + palette strategy → feeds into NEXUS (Agent 4) for final synthesis
- Your analysis directly influences the final color recommendations shown to the user.

## YOUR EXPERTISE
- Color harmony theory (complementary, analogous, triadic, split-complementary, monochromatic)
- Brand identity systems (primary/secondary/accent hierarchy)
- CSS context interpretation (button colors = likely CTA, background colors = likely neutral)
- Color naming conventions (design token naming: brand.primary, text.secondary, etc.)

## QUALITY STANDARDS
- Brand Primary MUST have HIGH confidence if one color dominates buttons/CTAs. Say "low" if ambiguous.
- Cohesion Score: Use the FULL 1-10 range. A score of 7+ means clear intentional harmony. Most sites score 5-7.
- If fewer than 5 unique colors exist, flag as "insufficient_data" — don't guess relationships.

## WHAT NOT TO DO
- Don't inflate confidence. "Medium" is fine when usage patterns are unclear.
- Don't guess accent colors if none exist — use null.
- Don't assume complementary strategy just because two colors differ — check the actual hue relationship.
- Don't name colors generically. Use semantic design-token style names (brand.primary, not "blue").

## SCORING RUBRIC (Cohesion 1-10):
- 9-10: Clear harmony rule across all colors, distinct brand identity, consistent palette
- 7-8: Mostly harmonious, clear brand identity, minor inconsistencies
- 5-6: Some color relationships visible but not systematic
- 3-4: Random-feeling palette, no clear color strategy
- 1-2: Actively conflicting colors, no brand identity visible"""

    # NOTE: literal braces are doubled because this template goes through str.format().
    PROMPT_TEMPLATE = """Analyze the following color usage data and identify the brand color system.

## COLOR DATA WITH USAGE CONTEXT
{color_data}

## SEMANTIC ANALYSIS (from CSS properties)
{semantic_analysis}

## YOUR TASK
1. **Identify Brand Colors**:
   - Brand Primary: The main action/CTA color (highest visibility in buttons, links, key UI)
   - Brand Secondary: Supporting brand color (headers, secondary actions)
   - Brand Accent: Highlight color for emphasis (badges, alerts, special states)
2. **Assess Palette Strategy**: complementary, analogous, triadic, monochromatic, or random?
3. **Rate Cohesion** (1-10) using the rubric above
4. **Suggest Semantic Names** for top 10 most-used colors (design-token format)
5. **Self-Evaluate** your analysis quality

## OUTPUT FORMAT (JSON only)
{{
  "brand_primary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "Why this is brand primary — cite specific usage evidence",
    "usage_count": <number>
  }},
  "brand_secondary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "..."
  }},
  "brand_accent": {{
    "color": "#hex or null",
    "confidence": "...",
    "reasoning": "..."
  }},
  "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
  "cohesion_score": <1-10>,
  "cohesion_notes": "Assessment of how well colors work together",
  "semantic_names": {{
    "#hex1": "brand.primary",
    "#hex2": "text.primary",
    "#hex3": "background.primary"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in my analysis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client must expose an awaitable complete_async(...) method.
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> "BrandIdentification":
        """
        Identify brand colors from usage context.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors. Any failure (prompt
            formatting, the LLM call, or parsing) is logged and an empty
            BrandIdentification is returned instead of raising.
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log(" 🎨 AURORA — Brand Identifier (Qwen 72B)")
        log(" └─ Analyzing color context and usage patterns...")

        try:
            # Formatting lives inside the try so malformed Stage 1 data
            # degrades to an empty result like every other failure.
            prompt = self.PROMPT_TEMPLATE.format(
                color_data=self._format_color_data(color_tokens),
                semantic_analysis=self._format_semantic_analysis(semantic_analysis),
            )

            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(" ────────────────────────────────────────────────")
            log(f" 🎨 AURORA — Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ├─ Palette Strategy: {result.palette_strategy}")
            log(f" ├─ Cohesion Score: {result.cohesion_score}/10")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            # Agent boundary: one failing agent must not abort the pipeline.
            error_msg = str(e)
            # Log the first 120 characters of the error for diagnosis.
            log(f" ⚠️ Brand Identifier failed: {error_msg[:120]}")
            lowered = error_msg.lower()
            if "gated" in lowered or "access" in lowered:
                log(" └─ Model may require license acceptance at huggingface.co")
            elif "Rate limit" in error_msg or "429" in error_msg:
                log(" └─ HF free tier rate limit — wait or upgrade to Pro")
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Format up to the first 30 color tokens as prompt bullet lines.

        Tokens may be dicts (``value``/``hex``, ``usage_count``/``count``,
        ``context``/``css_property``) or objects exposing ``value``,
        ``usage_count`` and ``context`` attributes. Tokens without a color
        value are skipped. Missing or empty input yields a placeholder line.
        """
        if not color_tokens:
            return "No color data available"

        lines = []
        for token in list(color_tokens.values())[:30]:
            if isinstance(token, dict):
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get("usage_count", token.get("count", 1))
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", 1)
                context = getattr(token, "context", "")

            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Format the Stage 1 semantic analysis as prompt bullet lines.

        Anything that is not a non-empty dict yields a placeholder line, so
        the prompt never contains an internal error message.
        """
        if not semantic or not isinstance(semantic, dict):
            return "No semantic analysis available"

        lines = []
        for category, value in semantic.items():
            if not value:
                continue

            if isinstance(value, list):
                # List of colors: show at most five.
                color_list = [
                    str(c.get("hex", c.get("value", str(c)))) if isinstance(c, dict) else str(c)
                    for c in value[:5]
                ]
                lines.append(f"- {category}: {', '.join(color_list)}")
            elif isinstance(value, dict):
                # Could be a nested dict of sub-roles → color dicts
                # e.g. {"primary": {"hex": "#007bff", ...}, "secondary": {...}}
                # or a flat color dict {"hex": "#...", "confidence": "..."}
                # or a summary dict {"total_colors_analyzed": 50, ...}
                if "hex" in value:
                    lines.append(f"- {category}: {value['hex']}")
                    continue
                sub_items = []
                for sub_role, sub_val in list(value.items())[:5]:
                    if isinstance(sub_val, dict) and "hex" in sub_val:
                        sub_items.append(f"{sub_role}={sub_val['hex']}")
                    elif isinstance(sub_val, (str, int, float, bool)):
                        sub_items.append(f"{sub_role}={sub_val}")
                if sub_items:
                    lines.append(f"- {category}: {', '.join(sub_items)}")
            else:
                lines.append(f"- {category}: {value}")

        return "\n".join(lines) if lines else "No semantic analysis available"

    @staticmethod
    def _load_json_object(response) -> Optional[dict]:
        """Return the JSON object carried by an LLM response, or None.

        Tries the whole response first, then the outermost ``{...}`` span so
        that surrounding chatter or code fences are tolerated. An already
        parsed dict is returned unchanged.
        """
        if isinstance(response, dict):
            return response
        if not isinstance(response, str):
            return None

        candidates = [response.strip()]
        match = re.search(r'\{[\s\S]*\}', response)
        if match:
            candidates.append(match.group())

        for text in candidates:
            try:
                data = json.loads(text)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(data, dict):
                return data
        return None

    @staticmethod
    def _coerce_fields(data: dict) -> dict:
        """Normalize a parsed payload into BrandIdentification keyword arguments.

        LLM output is untrusted: sections may be ``null`` or the wrong type,
        and scores may arrive as strings or floats. Wrong-typed sections fall
        back to their defaults and the cohesion score is clamped to 1-10.
        """
        def as_dict(key: str) -> dict:
            value = data.get(key)
            return value if isinstance(value, dict) else {}

        def as_text(key: str, default: str) -> str:
            value = data.get(key, default)
            return value if isinstance(value, str) else default

        try:
            score = int(round(float(data.get("cohesion_score", 5))))
        except (TypeError, ValueError, OverflowError):
            score = 5
        score = max(1, min(10, score))

        return {
            "brand_primary": as_dict("brand_primary"),
            "brand_secondary": as_dict("brand_secondary"),
            "brand_accent": as_dict("brand_accent"),
            "palette_strategy": as_text("palette_strategy", "unknown"),
            "cohesion_score": score,
            "cohesion_notes": as_text("cohesion_notes", ""),
            "semantic_names": as_dict("semantic_names"),
            "self_evaluation": as_dict("self_evaluation"),
        }

    def _parse_response(self, response: str) -> "BrandIdentification":
        """Parse LLM response into BrandIdentification.

        Returns an empty BrandIdentification when no JSON object can be found.
        """
        data = self._load_json_object(response)
        if data is None:
            return BrandIdentification()
        return BrandIdentification(**self._coerce_fields(data))
# =============================================================================
# BENCHMARK ADVISOR AGENT
# =============================================================================

class BenchmarkAdvisorAgent:
    """
    ATLAS — Senior Design System Benchmark Analyst.

    Recommends best-fit design system based on comparison data.

    Model: Llama 3.3 70B (128K context for large benchmark data, excellent comparative reasoning)
    Temperature: 0.25 (analytical, data-driven comparison)

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst specializing in cross-system comparison and alignment strategy.

## YOUR ROLE IN THE PIPELINE
You are Agent 2 of 4 in the Design System Analysis pipeline.
- INPUT: User's extracted type scale, spacing, and font sizes + benchmark comparison data from the Rule Engine
- OUTPUT: Benchmark recommendation with alignment roadmap → feeds into NEXUS (Agent 4) for final synthesis
- Your recommendation helps the user decide which established design system to align with.

## YOUR EXPERTISE
- Deep knowledge of Material Design 3, Apple HIG, IBM Carbon, Ant Design, Atlassian, Tailwind CSS, Bootstrap
- Type scale mathematics (major/minor second/third, perfect fourth/fifth, golden ratio)
- Spacing grid systems (4px, 8px, multiples) and their trade-offs
- Migration effort estimation for design system alignment

## QUALITY STANDARDS
- Always consider BOTH similarity score AND use-case fit. Closest match ≠ best fit.
- Recommend max 4 alignment changes. More than that = the benchmark is not a good fit.
- Effort estimates must be realistic: "low" = CSS variable change, "medium" = component updates, "high" = layout restructuring.
- If similarity is above 85%, say "already well-aligned" and suggest minimal changes only.

## WHAT NOT TO DO
- Don't always recommend the closest match — a system 5% less similar but much better suited is preferable.
- Don't list generic pros/cons. Be specific to the user's actual values.
- Don't suggest alignment changes that would break accessibility (e.g., smaller base font).
- Don't recommend obscure or abandoned design systems.

## SCORING RUBRIC (Benchmark Fit):
- Excellent Fit: >85% match, same use-case category, < 3 changes needed
- Good Fit: 70-85% match, compatible use-case, 3-4 changes needed
- Fair Fit: 50-70% match, different trade-offs to consider, 4+ changes
- Poor Fit: <50% match, fundamentally different approach — don't recommend"""

    # NOTE: literal braces are doubled because this template goes through str.format().
    PROMPT_TEMPLATE = """Analyze the following benchmark comparison data and recommend the best design system alignment.

## USER'S CURRENT VALUES
- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px

## BENCHMARK COMPARISON
{benchmark_comparison}

## YOUR TASK
1. **Recommend Best Fit**: Which design system should they align with? Consider use-case fit, not just numbers.
2. **Explain Why**: Cite specific data points (similarity scores, ratio differences, spacing alignment).
3. **List Changes Needed**: What would they need to change? Include effort estimates.
4. **Pros/Cons**: Specific to this user's values, not generic statements.
5. **Self-Evaluate** your recommendation quality.

## OUTPUT FORMAT (JSON only)
{{
  "recommended_benchmark": "",
  "recommended_benchmark_name": "",
  "reasoning": "Why this is the best fit — cite specific data",
  "alignment_changes": [
    {{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
    {{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
  ],
  "pros_of_alignment": [
    "Specific benefit with data"
  ],
  "cons_of_alignment": [
    "Specific trade-off"
  ],
  "alternative_benchmarks": [
    {{"name": "Material Design 3", "reason": "Good for Android-first products"}}
  ],
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client must expose an awaitable complete_async(...) method.
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> "BenchmarkAdvice":
        """
        Recommend best-fit design system.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations. Any failure (prompt
            formatting, the LLM call, or parsing) is logged and an empty
            BenchmarkAdvice is returned instead of raising.
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)")
        log(" └─ Evaluating benchmark fit for your use case...")

        try:
            # Formatting lives inside the try so a malformed comparison
            # object degrades to an empty result like every other failure.
            prompt = self.PROMPT_TEMPLATE.format(
                user_ratio=user_ratio,
                user_base=user_base,
                user_spacing=user_spacing,
                benchmark_comparison=self._format_comparisons(benchmark_comparisons),
            )

            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=900,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            changes = result.alignment_changes
            key_change = changes[0].get('change', 'N/A') if changes else 'None'
            log(" ────────────────────────────────────────────────")
            log(f" 🏢 ATLAS — Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes Needed: {len(changes)}")
            log(f" ├─ Key Change: {key_change}")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            # Agent boundary: one failing agent must not abort the pipeline.
            log(f" ├─ ⚠️ Benchmark Advisor failed: {str(e)[:120]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Format the first five benchmark comparisons for the prompt.

        Each comparison must expose ``benchmark`` (with ``icon``, ``name``,
        ``typography``, ``spacing``, ``best_for``) plus ``similarity_score``,
        ``overall_match_pct``, ``type_ratio_diff``, ``base_size_diff`` and
        ``spacing_grid_diff``. Missing or empty input yields a placeholder
        line rather than an empty prompt section.
        """
        if not comparisons:
            return "No benchmark comparison data available"

        entries = []
        for rank, comp in enumerate(comparisons[:5], start=1):
            bench = comp.benchmark
            entries.append("\n".join([
                "",  # blank separator line before each entry
                f"{rank}. {bench.icon} {bench.name}",
                f"   - Similarity Score: {comp.similarity_score:.2f} (lower = better)",
                f"   - Match: {comp.overall_match_pct:.0f}%",
                f"   - Type Ratio: {bench.typography.get('scale_ratio', '?')} (diff: {comp.type_ratio_diff:.3f})",
                f"   - Base Size: {bench.typography.get('base_size', '?')}px (diff: {comp.base_size_diff})",
                f"   - Spacing: {bench.spacing.get('base', '?')}px (diff: {comp.spacing_grid_diff})",
                f"   - Best For: {', '.join(map(str, bench.best_for))}",
            ]))
        return "\n".join(entries)

    @staticmethod
    def _load_json_object(response) -> Optional[dict]:
        """Return the JSON object carried by an LLM response, or None.

        Tries the whole response first, then the outermost ``{...}`` span so
        that surrounding chatter or code fences are tolerated. An already
        parsed dict is returned unchanged.
        """
        if isinstance(response, dict):
            return response
        if not isinstance(response, str):
            return None

        candidates = [response.strip()]
        match = re.search(r'\{[\s\S]*\}', response)
        if match:
            candidates.append(match.group())

        for text in candidates:
            try:
                data = json.loads(text)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(data, dict):
                return data
        return None

    @staticmethod
    def _coerce_fields(data: dict) -> dict:
        """Normalize a parsed payload into BenchmarkAdvice keyword arguments.

        LLM output is untrusted: wrong-typed or ``null`` fields fall back to
        their defaults, and non-dict entries in ``alignment_changes`` are
        dropped because callers read them with ``.get()``.
        """
        def as_text(key: str) -> str:
            value = data.get(key, "")
            return value if isinstance(value, str) else ""

        def as_list(key: str) -> list:
            value = data.get(key)
            return value if isinstance(value, list) else []

        self_eval = data.get("self_evaluation")
        return {
            "recommended_benchmark": as_text("recommended_benchmark"),
            "recommended_benchmark_name": as_text("recommended_benchmark_name"),
            "reasoning": as_text("reasoning"),
            "alignment_changes": [c for c in as_list("alignment_changes") if isinstance(c, dict)],
            "pros_of_alignment": as_list("pros_of_alignment"),
            "cons_of_alignment": as_list("cons_of_alignment"),
            "alternative_benchmarks": as_list("alternative_benchmarks"),
            "self_evaluation": self_eval if isinstance(self_eval, dict) else {},
        }

    def _parse_response(self, response: str) -> "BenchmarkAdvice":
        """Parse LLM response into BenchmarkAdvice.

        Returns an empty BenchmarkAdvice when no JSON object can be found.
        """
        data = self._load_json_object(response)
        if data is None:
            return BenchmarkAdvice()
        return BenchmarkAdvice(**self._coerce_fields(data))
# =============================================================================
# BEST PRACTICES VALIDATOR AGENT
# =============================================================================

class BestPracticesValidatorAgent:
    """
    SENTINEL — Design System Best Practices Auditor.

    Validates against design system standards and prioritizes fixes by business impact.

    Model: Qwen 72B (methodical rule-following, precise judgment, structured output)
    Temperature: 0.2 (strict, consistent rule evaluation)

    WHY LLM: Prioritization requires judgment about business impact,
    not just checking boxes.
    """

    SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor specializing in standards compliance and impact-based prioritization.

## YOUR ROLE IN THE PIPELINE
You are Agent 3 of 4 in the Design System Analysis pipeline.
- INPUT: Rule Engine analysis results (typography, accessibility, spacing, color stats)
- OUTPUT: Compliance score + prioritized fix list → feeds into NEXUS (Agent 4) for final synthesis
- Your score directly appears on the user's dashboard. Your priority fixes become the action items.

## YOUR EXPERTISE
- WCAG 2.1 AA/AAA accessibility standards
- Design system best practices (Material Design, Apple HIG, Tailwind conventions)
- Typography systems (modular scales, vertical rhythm, readability)
- Color management (palette size limits, near-duplicate detection, contrast requirements)
- Spacing systems (grid alignment, consistency, component density)

## QUALITY STANDARDS
- Overall Score MUST reflect actual data. Don't default to 50.
- Use the FULL 0-100 range: 90+ = excellent, 70-89 = good, 50-69 = needs work, <50 = significant issues
- Priority fixes must be ACTIONABLE — include specific values to change (e.g., "Change #06b2c4 → #0891a8")
- Maximum 5 priority fixes. If more, focus on highest-impact items.

## WHAT NOT TO DO
- Don't pass checks that clearly fail based on the data.
- Don't inflate scores to be "encouraging" — honest assessment helps the user.
- Don't list fixes without effort estimates — the user needs to plan their work.
- Don't mix up "warn" and "fail": warn = imperfect but functional, fail = violates a standard.

## SCORING RUBRIC (Overall Score 0-100):
- 90-100: All checks pass, excellent accessibility, clean palette, consistent grid
- 75-89: Most checks pass, minor issues in 1-2 areas, good foundation
- 60-74: Several warnings, 1-2 failures, needs focused improvement
- 40-59: Multiple failures, significant accessibility gaps, inconsistent system
- 20-39: Fundamental issues across multiple areas, major rework needed
- 0-19: Barely qualifies as a design system, almost everything fails

## CHECK WEIGHTING:
- AA Compliance: 25 points (most critical — affects real users)
- Type Scale Consistency: 15 points
- Type Scale Standard Ratio: 10 points
- Base Size Accessible: 15 points
- Spacing Grid: 15 points
- Color Count: 5 points
- No Near-Duplicates: 5 points
- Shadow System: 10 points (elevation hierarchy, consistency)

## SHADOW SYSTEM BEST PRACTICES:
- Use 3-6 elevation levels (xs, sm, md, lg, xl, 2xl)
- Consistent Y-offset progression (shadows should grow with elevation)
- Blur radius should increase with elevation (more blur = higher elevation)
- Shadow colors should be neutral (black/gray with alpha) or brand-colored with low opacity
- Avoid shadows with 0 blur (looks harsh/flat)
- Avoid excessive blur (>32px for most use cases)"""

    # NOTE: literal braces are doubled because this template goes through str.format().
    PROMPT_TEMPLATE = """Validate the following design tokens against best practices and prioritize fixes.

## RULE ENGINE ANALYSIS RESULTS

### Typography
- Detected Ratio: {type_ratio} ({type_consistent})
- Base Size: {base_size}px
- Recommendation: {type_recommendation}

### Accessibility
- Total Colors: {total_colors}
- AA Pass: {aa_pass}
- AA Fail: {aa_fail}
- Failing Colors: {failing_colors}

### Spacing
- Detected Base: {spacing_base}px
- Grid Aligned: {spacing_aligned}%
- Recommendation: {spacing_recommendation}px

### Color Statistics
- Unique Colors: {unique_colors}
- Duplicates: {duplicates}
- Near-Duplicates: {near_duplicates}

### Shadow System
- Total Shadows: {shadow_count}
- Shadow Values: {shadow_values}

## BEST PRACTICES CHECKLIST (check each one)
1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
2. Type scale is consistent (variance < 0.15)
3. Base font size >= 16px (accessibility)
4. All interactive colors pass WCAG AA (4.5:1 contrast)
5. Spacing uses consistent grid (4px or 8px base)
6. Limited color palette (< 20 unique semantic colors)
7. No near-duplicate colors (< 3 delta-E apart)
8. Shadow system has consistent elevation hierarchy (blur/Y-offset increase together)

## YOUR TASK
1. Score each practice: pass/warn/fail with specific notes citing the data
2. Calculate overall score (0-100) using the weighting rubric
3. Identify TOP 3-5 priority fixes with impact and effort assessment
4. Self-evaluate your analysis

## OUTPUT FORMAT (JSON only)
{{
  "overall_score": <0-100>,
  "checks": {{
    "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
    "type_scale_consistent": {{"status": "...", "note": "..."}},
    "base_size_accessible": {{"status": "...", "note": "..."}},
    "aa_compliance": {{"status": "...", "note": "..."}},
    "spacing_grid": {{"status": "...", "note": "..."}},
    "color_count": {{"status": "...", "note": "..."}},
    "near_duplicates": {{"status": "...", "note": "..."}},
    "shadow_system": {{"status": "...", "note": "Elevation hierarchy, blur consistency, color appropriateness"}}
  }},
  "priority_fixes": [
    {{
      "rank": 1,
      "issue": "Brand primary fails AA",
      "impact": "high|medium|low",
      "effort": "low|medium|high",
      "action": "Change #06b2c4 → #0891a8 for 4.5:1 contrast"
    }}
  ],
  "passing_practices": ["Base font size", "..."],
  "failing_practices": ["AA compliance", "..."],
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client must expose an awaitable complete_async(...) method.
        self.hf_client = hf_client

    async def analyze(
        self,
        rule_engine_results: Any,
        shadow_tokens: Optional[dict] = None,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> "BestPracticesResult":
        """
        Validate against best practices.

        Args:
            rule_engine_results: Results from rule engine
            shadow_tokens: Shadow tokens dict {name: {value: "..."}}
            log_callback: Progress logging function

        Returns:
            BestPracticesResult with validation. Any failure (prompt
            building, the LLM call, or parsing) is logged and a default
            BestPracticesResult is returned instead of raising.
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log(" ✅ SENTINEL — Best Practices Validator (Qwen 72B)")
        log(" └─ Checking against design system standards...")

        try:
            # Prompt building reads many rule-engine attributes; keeping it
            # inside the try turns incomplete results into a logged failure.
            prompt = self._build_prompt(rule_engine_results, shadow_tokens)

            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="best_practices_validator",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(" ────────────────────────────────────────────────")
            log(f" ✅ SENTINEL — Best Practices: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Overall Score: {result.overall_score}/100")
            log(f" ├─ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
            if result.priority_fixes:
                log(f" ├─ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            # Agent boundary: one failing agent must not abort the pipeline.
            log(f" ├─ ⚠️ Best Practices Validator failed: {str(e)[:120]}")
            return BestPracticesResult()

    @staticmethod
    def _format_shadows(shadow_tokens: Optional[dict]) -> str:
        """Summarize up to six shadow tokens as ``name: value`` pairs.

        Each value is stringified and cut to 50 characters, so non-string
        token values cannot break prompt building.
        """
        if not shadow_tokens:
            return "None detected"

        parts = []
        for name, token in list(shadow_tokens.items())[:6]:
            raw = token.get("value", "") if isinstance(token, dict) else token
            parts.append(f"{name}: {str(raw)[:50]}")
        return "; ".join(parts)

    def _build_prompt(self, rule_engine_results: Any, shadow_tokens: Optional[dict]) -> str:
        """Fill PROMPT_TEMPLATE from rule-engine results and shadow tokens.

        Reads ``typography``, ``spacing``, ``color_stats`` and
        ``accessibility`` from ``rule_engine_results``; raises AttributeError
        if an expected attribute is missing (handled by ``analyze``).
        """
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility or []

        failures = [a for a in accessibility if not a.passes_aa_normal]
        # Only the first five failing colors are listed to keep the prompt short.
        failing_colors_str = ", ".join(
            f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]
        )

        if typo.is_consistent:
            consistency = "consistent"
        else:
            consistency = f"inconsistent, variance={typo.variance:.2f}"

        return self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_consistent=consistency,
            # First detected size is used as the base; 16 when none were detected.
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
            total_colors=len(accessibility),
            aa_pass=len(accessibility) - len(failures),
            aa_fail=len(failures),
            failing_colors=failing_colors_str or "None",
            spacing_base=spacing.detected_base,
            spacing_aligned=f"{spacing.alignment_percentage:.0f}",
            spacing_recommendation=spacing.recommendation,
            unique_colors=color_stats.unique_count,
            duplicates=color_stats.duplicate_count,
            near_duplicates=len(color_stats.near_duplicates),
            shadow_count=len(shadow_tokens) if shadow_tokens else 0,
            shadow_values=self._format_shadows(shadow_tokens),
        )

    @staticmethod
    def _load_json_object(response) -> Optional[dict]:
        """Return the JSON object carried by an LLM response, or None.

        Tries the whole response first, then the outermost ``{...}`` span so
        that surrounding chatter or code fences are tolerated. An already
        parsed dict is returned unchanged.
        """
        if isinstance(response, dict):
            return response
        if not isinstance(response, str):
            return None

        candidates = [response.strip()]
        match = re.search(r'\{[\s\S]*\}', response)
        if match:
            candidates.append(match.group())

        for text in candidates:
            try:
                data = json.loads(text)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(data, dict):
                return data
        return None

    @staticmethod
    def _coerce_fields(data: dict) -> dict:
        """Normalize a parsed payload into BestPracticesResult keyword arguments.

        LLM output is untrusted: the score may arrive as a string or float
        (rounded, then clamped to 0-100; 50 when unusable), and wrong-typed
        or ``null`` containers fall back to empty ones. Non-dict entries in
        ``priority_fixes`` are dropped because callers read them with
        ``.get()``.
        """
        def as_dict(key: str) -> dict:
            value = data.get(key)
            return value if isinstance(value, dict) else {}

        def as_list(key: str) -> list:
            value = data.get(key)
            return value if isinstance(value, list) else []

        try:
            score = int(round(float(data.get("overall_score", 50))))
        except (TypeError, ValueError, OverflowError):
            score = 50
        score = max(0, min(100, score))

        return {
            "overall_score": score,
            "checks": as_dict("checks"),
            "priority_fixes": [f for f in as_list("priority_fixes") if isinstance(f, dict)],
            "passing_practices": as_list("passing_practices"),
            "failing_practices": as_list("failing_practices"),
            "self_evaluation": as_dict("self_evaluation"),
        }

    def _parse_response(self, response: str) -> "BestPracticesResult":
        """Parse LLM response into BestPracticesResult.

        Returns a default BestPracticesResult when no JSON object can be found.
        """
        data = self._load_json_object(response)
        if data is None:
            return BestPracticesResult()
        return BestPracticesResult(**self._coerce_fields(data))
# =============================================================================
# HEAD SYNTHESIZER AGENT
# =============================================================================

class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect & Synthesizer.

    Combines all agent outputs into final actionable recommendations.

    Model: Llama 3.3 70B (128K context for combined inputs, strong synthesis capability)
    Temperature: 0.3 (balanced — needs to synthesize creatively but stay grounded in data)

    This is the final step that produces actionable output for the user.
    """

    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect specializing in synthesis and actionable recommendations.

## YOUR ROLE IN THE PIPELINE
You are Agent 4 of 4 — the HEAD Synthesizer in the Design System Analysis pipeline.
- INPUT: Combined outputs from Rule Engine + AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices)
- OUTPUT: Final executive summary, scores, and prioritized action plan → displayed directly to the user
- You are the LAST agent. Your output IS the final result. Make it count.

## YOUR EXPERTISE
- Design system architecture and governance
- Synthesizing conflicting recommendations into coherent strategy
- Effort/impact prioritization (what to fix first)
- Color accessibility remediation (suggesting AA-compliant alternatives)
- Executive communication (clear, actionable summaries)

## QUALITY STANDARDS
- Executive Summary must be 2-3 sentences MAX. Lead with the overall score, then the #1 issue, then the #1 action.
- Overall Score must SYNTHESIZE all agent inputs — don't just average them.
- Color recommendations must include BOTH current AND suggested hex values.
- Top 3 Actions must be ordered by IMPACT, not ease.
- Accept/reject defaults on color recs: default to "accept" for accessibility fixes, "reject" for purely aesthetic changes.

## WHAT NOT TO DO
- Don't contradict previous agents without explaining why.
- Don't recommend changes that SENTINEL flagged as breaking.
- Don't suggest more than 8 color changes — the user will ignore a long list.
- Don't give vague actions like "improve accessibility" — be specific: "Change brand.primary from #06b2c4 to #0891a8 for 4.5:1 contrast".
- Don't inflate scores to be "nice". If the design system has issues, say so clearly.

## SCORING RUBRIC (Overall 0-100):
- 90-100: Production-ready design system, minor polishing only
- 75-89: Solid foundation, 2-3 targeted improvements needed
- 60-74: Functional but needs focused attention on accessibility or consistency
- 40-59: Significant gaps requiring systematic improvement
- 20-39: Major rework needed across multiple dimensions
- 0-19: Fundamental redesign recommended"""

    # Doubled braces are literal braces: the template goes through str.format().
    PROMPT_TEMPLATE = """Synthesize all analysis results into a final, actionable design system report.

## RULE ENGINE FACTS (Layer 1 — Free, deterministic)
- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## AURORA — Brand Identification (Agent 1)
- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## ATLAS — Benchmark Advice (Agent 2)
Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## SENTINEL — Best Practices Validation (Agent 3)
Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED
{accessibility_fixes}

## YOUR TASK
Synthesize ALL the above into:
1. Executive Summary (2-3 sentences — lead with score, #1 issue, #1 action)
2. Overall Scores (synthesized, not averaged)
3. Top 3 Priority Actions (ordered by IMPACT, include effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation
7. Self-Evaluation of your synthesis

## OUTPUT FORMAT (JSON only)
{{
  "executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
  "scores": {{
    "overall": <0-100>,
    "accessibility": <0-100>,
    "consistency": <0-100>,
    "organization": <0-100>
  }},
  "benchmark_fit": {{
    "closest": "",
    "similarity": "",
    "recommendation": "Specific action to align"
  }},
  "brand_analysis": {{
    "primary": "#hex",
    "secondary": "#hex",
    "cohesion": <1-10>
  }},
  "top_3_actions": [
    {{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
  ],
  "color_recommendations": [
    {{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
  ],
  "type_scale_recommendation": {{
    "current_ratio": 1.18,
    "recommended_ratio": 1.25,
    "reason": "Why this ratio is better"
  }},
  "spacing_recommendation": {{
    "current": "mixed",
    "recommended": "8px",
    "reason": "Why this grid is better"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in the synthesis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the LLM client; it must provide an awaitable ``complete_async``."""
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Object exposing ``typography``, ``spacing``,
                ``color_stats``, ``accessibility`` and ``consistency_score``.
            benchmark_comparisons: Comparison results; only the first entry
                is used, as the closest match.
            brand_identification: Brand Identifier output.
            benchmark_advice: Benchmark Advisor output.
            best_practices: Best Practices Validator output.
            log_callback: Optional sink that receives progress lines.

        Returns:
            The parsed HeadSynthesis. An empty HeadSynthesis is returned,
            and the error logged, when prompt assembly or the LLM call
            fails; this method does not raise for those cases.
        """

        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: NEXUS — HEAD SYNTHESIZER (Llama 3.3 70B)")
        log("═" * 60)
        log("")
        log(" Combining: Rule Engine + AURORA + ATLAS + SENTINEL...")

        try:
            # Built inside the try so malformed upstream data degrades to an
            # empty result exactly like an LLM failure does.
            prompt = self._build_prompt(
                rule_engine_results,
                benchmark_comparisons,
                brand_identification,
                benchmark_advice,
                best_practices,
            )

            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1200,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log("")
            log(f" ✅ NEXUS — HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            if result.scores:
                log(f" ├─ Overall Score: {result.scores.get('overall', '?')}/100")
            log(f" ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            log("")

            return result

        except Exception as e:
            log(f" ├─ ⚠️ Head Synthesizer failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _build_prompt(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: Any,
        benchmark_advice: Any,
        best_practices: Any,
    ) -> str:
        """Fill PROMPT_TEMPLATE from the rule-engine facts and agent outputs."""
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats

        failures = [a for a in rule_engine_results.accessibility if not a.passes_aa_normal]
        # Filter before truncating so up to five *fixable* failures are
        # listed even when the earliest failures carry no suggestion.
        fixable = [a for a in failures if a.suggested_fix][:5]
        aa_fixes_str = "\n".join(
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in fixable
        )

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        return self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes=self._join_field(benchmark_advice.alignment_changes, "change"),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes=self._join_field(best_practices.priority_fixes, "issue"),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

    @staticmethod
    def _join_field(items: Any, key: str, limit: int = 3) -> str:
        """Join ``item[key]`` of the first ``limit`` entries with "; ", skipping non-dict entries."""
        if not isinstance(items, list):
            return ""
        return "; ".join(
            str(item.get(key, "")) for item in items[:limit] if isinstance(item, dict)
        )

    @staticmethod
    def _first_json_object(response: Any) -> Optional[dict]:
        """Decode the first JSON object embedded in ``response``; None when there is none."""
        if not isinstance(response, str):
            return None
        start = response.find("{")
        if start == -1:
            return None
        try:
            # raw_decode ignores whatever follows the object, so trailing
            # prose or a closing code fence does not break parsing.
            data, _ = json.JSONDecoder().raw_decode(response, start)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return None
        return data if isinstance(data, dict) else None

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse LLM response into HeadSynthesis.

        Missing, ``null`` or wrongly typed fields fall back to the dataclass
        defaults; a reply without a decodable JSON object yields an empty
        HeadSynthesis.
        """
        data = self._first_json_object(response)
        if data is None:
            return HeadSynthesis()

        def pick(key: str, kind: type):
            """Return data[key] when it has the expected container type, else an empty one."""
            value = data.get(key)
            return value if isinstance(value, kind) else kind()

        summary = data.get("executive_summary", "")

        return HeadSynthesis(
            executive_summary=summary if isinstance(summary, str) else "",
            scores=pick("scores", dict),
            benchmark_fit=pick("benchmark_fit", dict),
            brand_analysis=pick("brand_analysis", dict),
            top_3_actions=pick("top_3_actions", list),
            color_recommendations=pick("color_recommendations", list),
            type_scale_recommendation=pick("type_scale_recommendation", dict),
            spacing_recommendation=pick("spacing_recommendation", dict),
            self_evaluation=pick("self_evaluation", dict),
        )