riazmo commited on
Commit
93b9760
·
verified ·
1 Parent(s): 52b0a45

Upload llm_agents.py

Browse files
Files changed (1) hide show
  1. agents/llm_agents.py +1124 -0
agents/llm_agents.py ADDED
@@ -0,0 +1,1124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 2 LLM Agents β€” Specialized Analysis Tasks
3
+ =================================================
4
+
5
+ These agents handle tasks that REQUIRE LLM reasoning:
6
+ - Brand Identifier: Identify brand colors from usage context
7
+ - Benchmark Advisor: Recommend best-fit design system
8
+ - Best Practices Validator: Prioritize fixes by business impact
9
+ - HEAD Synthesizer: Combine all outputs into final recommendations
10
+
11
+ Each agent has a focused prompt for its specific task.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Optional, Callable, Any
18
+ from datetime import datetime
19
+
20
+
21
+ # =============================================================================
22
+ # DATA CLASSES
23
+ # =============================================================================
24
+
25
@dataclass
class BrandIdentification:
    """Output of the Brand Identifier agent (AURORA).

    Each brand_* field is a dict of the shape
    {color, confidence, reasoning, usage_count} as returned by the LLM.
    """
    brand_primary: dict = field(default_factory=dict)
    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)

    # One of: complementary, analogous, triadic, monochromatic, random
    palette_strategy: str = ""
    cohesion_score: int = 5  # 1 (chaotic) .. 10 (highly cohesive)
    cohesion_notes: str = ""

    # {hex_color: suggested design-token name}
    semantic_names: dict = field(default_factory=dict)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "brand_primary",
            "brand_secondary",
            "brand_accent",
            "palette_strategy",
            "cohesion_score",
            "cohesion_notes",
            "semantic_names",
            "self_evaluation",
        )
        return {name: getattr(self, name) for name in exported}
55
+
56
+
57
@dataclass
class BenchmarkAdvice:
    """Output of the Benchmark Advisor agent (ATLAS)."""
    recommended_benchmark: str = ""       # machine key of the recommended system
    recommended_benchmark_name: str = ""  # human-readable name
    reasoning: str = ""

    # [{change, from, to, effort}]
    alignment_changes: list = field(default_factory=list)

    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)

    # [{name, reason}]
    alternative_benchmarks: list = field(default_factory=list)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        NOTE: three export keys are intentionally shorter than their
        field names (pros / cons / alternatives) — downstream consumers
        rely on these exact keys.
        """
        export_map = (
            ("recommended_benchmark", "recommended_benchmark"),
            ("recommended_benchmark_name", "recommended_benchmark_name"),
            ("reasoning", "reasoning"),
            ("alignment_changes", "alignment_changes"),
            ("pros", "pros_of_alignment"),
            ("cons", "cons_of_alignment"),
            ("alternatives", "alternative_benchmarks"),
            ("self_evaluation", "self_evaluation"),
        )
        return {key: getattr(self, attr) for key, attr in export_map}
87
+
88
+
89
@dataclass
class BestPracticesResult:
    """Output of the Best Practices Validator agent (SENTINEL)."""
    overall_score: int = 50  # 0-100 compliance score

    # {check_name: {status: pass/warn/fail, note: str}}
    checks: dict = field(default_factory=dict)

    # [{rank, issue, impact, effort, action}]
    priority_fixes: list = field(default_factory=list)

    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        NOTE: "passing"/"failing" export keys are intentionally shorter
        than the *_practices field names — keep them stable.
        """
        export_map = (
            ("overall_score", "overall_score"),
            ("checks", "checks"),
            ("priority_fixes", "priority_fixes"),
            ("passing", "passing_practices"),
            ("failing", "failing_practices"),
            ("self_evaluation", "self_evaluation"),
        )
        return {key: getattr(self, attr) for key, attr in export_map}
115
+
116
+
117
@dataclass
class HeadSynthesis:
    """Final synthesized output from the HEAD agent."""
    executive_summary: str = ""

    # {overall, accessibility, consistency, organization}
    scores: dict = field(default_factory=dict)

    # {closest, similarity, recommendation}
    benchmark_fit: dict = field(default_factory=dict)

    # {primary, secondary, cohesion}
    brand_analysis: dict = field(default_factory=dict)

    # [{action, impact, effort, details}]
    top_3_actions: list = field(default_factory=list)

    # [{role, current, suggested, reason, accept}]
    color_recommendations: list = field(default_factory=list)

    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "executive_summary",
            "scores",
            "benchmark_fit",
            "brand_analysis",
            "top_3_actions",
            "color_recommendations",
            "type_scale_recommendation",
            "spacing_recommendation",
            "self_evaluation",
        )
        return {name: getattr(self, name) for name in exported}
155
+
156
+
157
+ # =============================================================================
158
+ # BRAND IDENTIFIER AGENT
159
+ # =============================================================================
160
+
161
class BrandIdentifierAgent:
    """
    AURORA — Senior Brand Color Analyst.

    Identifies brand colors from usage context using creative/visual reasoning.
    Model: Qwen 72B (strong creative reasoning, color harmony assessment)
    Temperature: 0.4 (allows creative interpretation of color stories)

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    # Persona/system prompt sent verbatim to the LLM on every call.
    # Do not reword casually: the downstream JSON parser depends on the
    # output contract spelled out here.
    SYSTEM_PROMPT = """You are AURORA, a Senior Brand Color Analyst specializing in visual identity systems.

## YOUR ROLE IN THE PIPELINE
You are Agent 1 of 4 in the Design System Analysis pipeline.
- INPUT: Raw color tokens with usage counts + semantic CSS analysis from Stage 1 extraction
- OUTPUT: Brand color identification + palette strategy → feeds into NEXUS (Agent 4) for final synthesis
- Your analysis directly influences the final color recommendations shown to the user.

## YOUR EXPERTISE
- Color harmony theory (complementary, analogous, triadic, split-complementary, monochromatic)
- Brand identity systems (primary/secondary/accent hierarchy)
- CSS context interpretation (button colors = likely CTA, background colors = likely neutral)
- Color naming conventions (design token naming: brand.primary, text.secondary, etc.)

## QUALITY STANDARDS
- Brand Primary MUST have HIGH confidence if one color dominates buttons/CTAs. Say "low" if ambiguous.
- Cohesion Score: Use the FULL 1-10 range. A score of 7+ means clear intentional harmony. Most sites score 5-7.
- If fewer than 5 unique colors exist, flag as "insufficient_data" — don't guess relationships.

## WHAT NOT TO DO
- Don't inflate confidence. "Medium" is fine when usage patterns are unclear.
- Don't guess accent colors if none exist — use null.
- Don't assume complementary strategy just because two colors differ — check the actual hue relationship.
- Don't name colors generically. Use semantic design-token style names (brand.primary, not "blue").

## SCORING RUBRIC (Cohesion 1-10):
- 9-10: Clear harmony rule across all colors, distinct brand identity, consistent palette
- 7-8: Mostly harmonious, clear brand identity, minor inconsistencies
- 5-6: Some color relationships visible but not systematic
- 3-4: Random-feeling palette, no clear color strategy
- 1-2: Actively conflicting colors, no brand identity visible"""

    # User-message template, filled via .format() in analyze(). Doubled
    # braces ({{ }}) are literal braces in the JSON example output.
    PROMPT_TEMPLATE = """Analyze the following color usage data and identify the brand color system.

## COLOR DATA WITH USAGE CONTEXT

{color_data}

## SEMANTIC ANALYSIS (from CSS properties)

{semantic_analysis}

## YOUR TASK

1. **Identify Brand Colors**:
   - Brand Primary: The main action/CTA color (highest visibility in buttons, links, key UI)
   - Brand Secondary: Supporting brand color (headers, secondary actions)
   - Brand Accent: Highlight color for emphasis (badges, alerts, special states)

2. **Assess Palette Strategy**: complementary, analogous, triadic, monochromatic, or random?

3. **Rate Cohesion** (1-10) using the rubric above

4. **Suggest Semantic Names** for top 10 most-used colors (design-token format)

5. **Self-Evaluate** your analysis quality

## OUTPUT FORMAT (JSON only)

{{
  "brand_primary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "Why this is brand primary — cite specific usage evidence",
    "usage_count": <number>
  }},
  "brand_secondary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "..."
  }},
  "brand_accent": {{
    "color": "#hex or null",
    "confidence": "...",
    "reasoning": "..."
  }},
  "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
  "cohesion_score": <1-10>,
  "cohesion_notes": "Assessment of how well colors work together",
  "semantic_names": {{
    "#hex1": "brand.primary",
    "#hex2": "text.primary",
    "#hex3": "background.primary"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in my analysis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client: inference client; must expose complete_async(
        # agent_name, system_prompt, user_message, max_tokens, json_mode)
        # — presumably a project wrapper around the HF API; confirm at caller.
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> BrandIdentification:
        """
        Identify brand colors from usage context.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors; an empty (default)
            BrandIdentification on any LLM/parsing failure — never raises.
        """
        # Local no-op-safe logger so callers may omit log_callback.
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log(" 🎨 AURORA — Brand Identifier (Qwen 72B)")
        log(" └─ Analyzing color context and usage patterns...")

        # Format color data
        color_data = self._format_color_data(color_tokens)
        semantic_str = self._format_semantic_analysis(semantic_analysis)

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=color_data,
            semantic_analysis=semantic_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            # Parse response
            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🎨 AURORA — Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ├─ Palette Strategy: {result.palette_strategy}")
            log(f" ├─ Cohesion Score: {result.cohesion_score}/10")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            error_msg = str(e)
            # Always log full error for diagnosis
            log(f" ⚠️ Brand Identifier failed: {error_msg[:120]}")
            # Heuristic hints for the two most common HF failure modes.
            if "gated" in error_msg.lower() or "access" in error_msg.lower():
                log(f" └─ Model may require license acceptance at huggingface.co")
            elif "Rate limit" in error_msg or "429" in error_msg:
                log(f" └─ HF free tier rate limit — wait or upgrade to Pro")
            # Degrade gracefully: pipeline continues with empty defaults.
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Format color tokens for prompt.

        Caps at the first 30 tokens to bound prompt size. Accepts either
        dict-shaped tokens or objects with value/usage_count/context attrs.
        """
        lines = []
        for name, token in list(color_tokens.items())[:30]:
            if isinstance(token, dict):
                # Tolerate both extractor schemas: value/hex, usage_count/count, ...
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get("usage_count", token.get("count", 1))
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", 1)
                context = getattr(token, "context", "")

            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Format semantic analysis for prompt.

        Handles three value shapes per category: a list of colors, a flat
        color dict ({"hex": ...}), or a nested dict of sub-role -> color/scalar.
        Never raises: formatting errors are reported as a placeholder string.
        """
        if not semantic:
            return "No semantic analysis available"

        lines = []
        try:
            for category, value in semantic.items():
                if not value:
                    continue

                if isinstance(value, list):
                    # List of colors
                    color_list = []
                    for c in value[:5]:
                        if isinstance(c, dict):
                            color_list.append(c.get("hex", c.get("value", str(c))))
                        else:
                            color_list.append(str(c))
                    lines.append(f"- {category}: {', '.join(color_list)}")

                elif isinstance(value, dict):
                    # Could be a nested dict of sub-roles → color dicts
                    # e.g. {"primary": {"hex": "#007bff", ...}, "secondary": {...}}
                    # or a flat color dict {"hex": "#...", "confidence": "..."}
                    # or a summary dict {"total_colors_analyzed": 50, ...}
                    if "hex" in value:
                        # Flat color dict
                        lines.append(f"- {category}: {value['hex']}")
                    else:
                        # Nested dict — iterate sub-roles
                        sub_items = []
                        for sub_role, sub_val in list(value.items())[:5]:
                            if isinstance(sub_val, dict) and "hex" in sub_val:
                                sub_items.append(f"{sub_role}={sub_val['hex']}")
                            elif isinstance(sub_val, (str, int, float, bool)):
                                sub_items.append(f"{sub_role}={sub_val}")
                        if sub_items:
                            lines.append(f"- {category}: {', '.join(sub_items)}")
                else:
                    lines.append(f"- {category}: {value}")
        except Exception as e:
            return f"Error formatting semantic analysis: {str(e)[:50]}"

        return "\n".join(lines) if lines else "No semantic analysis available"

    def _parse_response(self, response: str) -> BrandIdentification:
        """Parse LLM response into BrandIdentification.

        Extracts the outermost {...} span (greedy) so that prose or code
        fences around the JSON are tolerated; any parse failure falls back
        to an empty BrandIdentification rather than raising.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return BrandIdentification(
                    brand_primary=data.get("brand_primary", {}),
                    brand_secondary=data.get("brand_secondary", {}),
                    brand_accent=data.get("brand_accent", {}),
                    palette_strategy=data.get("palette_strategy", "unknown"),
                    cohesion_score=data.get("cohesion_score", 5),
                    cohesion_notes=data.get("cohesion_notes", ""),
                    semantic_names=data.get("semantic_names", {}),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Deliberate best-effort: malformed JSON yields the default object.
            pass

        return BrandIdentification()
425
+
426
+
427
+ # =============================================================================
428
+ # BENCHMARK ADVISOR AGENT
429
+ # =============================================================================
430
+
431
class BenchmarkAdvisorAgent:
    """
    ATLAS — Senior Design System Benchmark Analyst.

    Recommends best-fit design system based on comparison data.
    Model: Llama 3.3 70B (128K context for large benchmark data, excellent comparative reasoning)
    Temperature: 0.25 (analytical, data-driven comparison)

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    # Persona/system prompt sent verbatim to the LLM. The JSON output
    # contract below is what _parse_response expects — keep them in sync.
    SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst specializing in cross-system comparison and alignment strategy.

## YOUR ROLE IN THE PIPELINE
You are Agent 2 of 4 in the Design System Analysis pipeline.
- INPUT: User's extracted type scale, spacing, and font sizes + benchmark comparison data from the Rule Engine
- OUTPUT: Benchmark recommendation with alignment roadmap → feeds into NEXUS (Agent 4) for final synthesis
- Your recommendation helps the user decide which established design system to align with.

## YOUR EXPERTISE
- Deep knowledge of Material Design 3, Apple HIG, IBM Carbon, Ant Design, Atlassian, Tailwind CSS, Bootstrap
- Type scale mathematics (major/minor second/third, perfect fourth/fifth, golden ratio)
- Spacing grid systems (4px, 8px, multiples) and their trade-offs
- Migration effort estimation for design system alignment

## QUALITY STANDARDS
- Always consider BOTH similarity score AND use-case fit. Closest match ≠ best fit.
- Recommend max 4 alignment changes. More than that = the benchmark is not a good fit.
- Effort estimates must be realistic: "low" = CSS variable change, "medium" = component updates, "high" = layout restructuring.
- If similarity is above 85%, say "already well-aligned" and suggest minimal changes only.

## WHAT NOT TO DO
- Don't always recommend the closest match — a system 5% less similar but much better suited is preferable.
- Don't list generic pros/cons. Be specific to the user's actual values.
- Don't suggest alignment changes that would break accessibility (e.g., smaller base font).
- Don't recommend obscure or abandoned design systems.

## SCORING RUBRIC (Benchmark Fit):
- Excellent Fit: >85% match, same use-case category, < 3 changes needed
- Good Fit: 70-85% match, compatible use-case, 3-4 changes needed
- Fair Fit: 50-70% match, different trade-offs to consider, 4+ changes
- Poor Fit: <50% match, fundamentally different approach — don't recommend"""

    # User-message template, filled via .format() in analyze(). Doubled
    # braces ({{ }}) are literal braces in the JSON example output.
    PROMPT_TEMPLATE = """Analyze the following benchmark comparison data and recommend the best design system alignment.

## USER'S CURRENT VALUES

- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px

## BENCHMARK COMPARISON

{benchmark_comparison}

## YOUR TASK

1. **Recommend Best Fit**: Which design system should they align with? Consider use-case fit, not just numbers.
2. **Explain Why**: Cite specific data points (similarity scores, ratio differences, spacing alignment).
3. **List Changes Needed**: What would they need to change? Include effort estimates.
4. **Pros/Cons**: Specific to this user's values, not generic statements.
5. **Self-Evaluate** your recommendation quality.

## OUTPUT FORMAT (JSON only)

{{
  "recommended_benchmark": "<system_key>",
  "recommended_benchmark_name": "<full name>",
  "reasoning": "Why this is the best fit — cite specific data",
  "alignment_changes": [
    {{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
    {{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
  ],
  "pros_of_alignment": [
    "Specific benefit with data"
  ],
  "cons_of_alignment": [
    "Specific trade-off"
  ],
  "alternative_benchmarks": [
    {{"name": "Material Design 3", "reason": "Good for Android-first products"}}
  ],
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client: inference client; must expose complete_async(
        # agent_name, system_prompt, user_message, max_tokens, json_mode).
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> BenchmarkAdvice:
        """
        Recommend best-fit design system.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations; an empty (default)
            BenchmarkAdvice on any LLM/parsing failure — never raises.
        """
        # Local no-op-safe logger so callers may omit log_callback.
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)")
        log(" └─ Evaluating benchmark fit for your use case...")

        # Format comparison data
        comparison_str = self._format_comparisons(benchmark_comparisons)

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio,
            user_base=user_base,
            user_spacing=user_spacing,
            benchmark_comparison=comparison_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=900,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🏢 ATLAS — Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes Needed: {len(result.alignment_changes)}")
            log(f" ├─ Key Change: {result.alignment_changes[0].get('change', 'N/A') if result.alignment_changes else 'None'}")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            # Degrade gracefully: pipeline continues with empty defaults.
            log(f" ├─ ⚠️ Benchmark Advisor failed: {str(e)[:120]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Format benchmark comparisons for prompt.

        Caps at the 5 first comparisons to bound prompt size. Each item is
        expected to expose .benchmark (with icon/name/typography/spacing/
        best_for) plus diff/score attributes — see BenchmarkComparison.
        """
        lines = []
        for i, c in enumerate(comparisons[:5]):
            b = c.benchmark
            lines.append(f"""
{i+1}. {b.icon} {b.name}
 - Similarity Score: {c.similarity_score:.2f} (lower = better)
 - Match: {c.overall_match_pct:.0f}%
 - Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
 - Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
 - Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
 - Best For: {', '.join(b.best_for)}""")

        return "\n".join(lines)

    def _parse_response(self, response: str) -> BenchmarkAdvice:
        """Parse LLM response into BenchmarkAdvice.

        Extracts the outermost {...} span (greedy) so that prose or code
        fences around the JSON are tolerated; any parse failure falls back
        to an empty BenchmarkAdvice rather than raising.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return BenchmarkAdvice(
                    recommended_benchmark=data.get("recommended_benchmark", ""),
                    recommended_benchmark_name=data.get("recommended_benchmark_name", ""),
                    reasoning=data.get("reasoning", ""),
                    alignment_changes=data.get("alignment_changes", []),
                    pros_of_alignment=data.get("pros_of_alignment", []),
                    cons_of_alignment=data.get("cons_of_alignment", []),
                    alternative_benchmarks=data.get("alternative_benchmarks", []),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Deliberate best-effort: malformed JSON yields the default object.
            pass

        return BenchmarkAdvice()
632
+
633
+
634
+ # =============================================================================
635
+ # BEST PRACTICES VALIDATOR AGENT
636
+ # =============================================================================
637
+
638
+ class BestPracticesValidatorAgent:
639
+ """
640
+ SENTINEL β€” Design System Best Practices Auditor.
641
+
642
+ Validates against design system standards and prioritizes fixes by business impact.
643
+ Model: Qwen 72B (methodical rule-following, precise judgment, structured output)
644
+ Temperature: 0.2 (strict, consistent rule evaluation)
645
+
646
+ WHY LLM: Prioritization requires judgment about business impact,
647
+ not just checking boxes.
648
+ """
649
+
650
+ SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor specializing in standards compliance and impact-based prioritization.
651
+
652
+ ## YOUR ROLE IN THE PIPELINE
653
+ You are Agent 3 of 4 in the Design System Analysis pipeline.
654
+ - INPUT: Rule Engine analysis results (typography, accessibility, spacing, color stats)
655
+ - OUTPUT: Compliance score + prioritized fix list β†’ feeds into NEXUS (Agent 4) for final synthesis
656
+ - Your score directly appears on the user's dashboard. Your priority fixes become the action items.
657
+
658
+ ## YOUR EXPERTISE
659
+ - WCAG 2.1 AA/AAA accessibility standards
660
+ - Design system best practices (Material Design, Apple HIG, Tailwind conventions)
661
+ - Typography systems (modular scales, vertical rhythm, readability)
662
+ - Color management (palette size limits, near-duplicate detection, contrast requirements)
663
+ - Spacing systems (grid alignment, consistency, component density)
664
+
665
+ ## QUALITY STANDARDS
666
+ - Overall Score MUST reflect actual data. Don't default to 50.
667
+ - Use the FULL 0-100 range: 90+ = excellent, 70-89 = good, 50-69 = needs work, <50 = significant issues
668
+ - Priority fixes must be ACTIONABLE β€” include specific values to change (e.g., "Change #06b2c4 β†’ #0891a8")
669
+ - Maximum 5 priority fixes. If more, focus on highest-impact items.
670
+
671
+ ## WHAT NOT TO DO
672
+ - Don't pass checks that clearly fail based on the data.
673
+ - Don't inflate scores to be "encouraging" β€” honest assessment helps the user.
674
+ - Don't list fixes without effort estimates β€” the user needs to plan their work.
675
+ - Don't mix up "warn" and "fail": warn = imperfect but functional, fail = violates a standard.
676
+
677
+ ## SCORING RUBRIC (Overall Score 0-100):
678
+ - 90-100: All checks pass, excellent accessibility, clean palette, consistent grid
679
+ - 75-89: Most checks pass, minor issues in 1-2 areas, good foundation
680
+ - 60-74: Several warnings, 1-2 failures, needs focused improvement
681
+ - 40-59: Multiple failures, significant accessibility gaps, inconsistent system
682
+ - 20-39: Fundamental issues across multiple areas, major rework needed
683
+ - 0-19: Barely qualifies as a design system, almost everything fails
684
+
685
+ ## CHECK WEIGHTING:
686
+ - AA Compliance: 25 points (most critical β€” affects real users)
687
+ - Type Scale Consistency: 15 points
688
+ - Type Scale Standard Ratio: 10 points
689
+ - Base Size Accessible: 15 points
690
+ - Spacing Grid: 15 points
691
+ - Color Count: 10 points
692
+ - No Near-Duplicates: 10 points"""
693
+
694
+ PROMPT_TEMPLATE = """Validate the following design tokens against best practices and prioritize fixes.
695
+
696
+ ## RULE ENGINE ANALYSIS RESULTS
697
+
698
+ ### Typography
699
+ - Detected Ratio: {type_ratio} ({type_consistent})
700
+ - Base Size: {base_size}px
701
+ - Recommendation: {type_recommendation}
702
+
703
+ ### Accessibility
704
+ - Total Colors: {total_colors}
705
+ - AA Pass: {aa_pass}
706
+ - AA Fail: {aa_fail}
707
+ - Failing Colors: {failing_colors}
708
+
709
+ ### Spacing
710
+ - Detected Base: {spacing_base}px
711
+ - Grid Aligned: {spacing_aligned}%
712
+ - Recommendation: {spacing_recommendation}px
713
+
714
+ ### Color Statistics
715
+ - Unique Colors: {unique_colors}
716
+ - Duplicates: {duplicates}
717
+ - Near-Duplicates: {near_duplicates}
718
+
719
+ ## BEST PRACTICES CHECKLIST (check each one)
720
+
721
+ 1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
722
+ 2. Type scale is consistent (variance < 0.15)
723
+ 3. Base font size >= 16px (accessibility)
724
+ 4. All interactive colors pass WCAG AA (4.5:1 contrast)
725
+ 5. Spacing uses consistent grid (4px or 8px base)
726
+ 6. Limited color palette (< 20 unique semantic colors)
727
+ 7. No near-duplicate colors (< 3 delta-E apart)
728
+
729
+ ## YOUR TASK
730
+
731
+ 1. Score each practice: pass/warn/fail with specific notes citing the data
732
+ 2. Calculate overall score (0-100) using the weighting rubric
733
+ 3. Identify TOP 3-5 priority fixes with impact and effort assessment
734
+ 4. Self-evaluate your analysis
735
+
736
+ ## OUTPUT FORMAT (JSON only)
737
+
738
+ {{
739
+ "overall_score": <0-100>,
740
+ "checks": {{
741
+ "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
742
+ "type_scale_consistent": {{"status": "...", "note": "..."}},
743
+ "base_size_accessible": {{"status": "...", "note": "..."}},
744
+ "aa_compliance": {{"status": "...", "note": "..."}},
745
+ "spacing_grid": {{"status": "...", "note": "..."}},
746
+ "color_count": {{"status": "...", "note": "..."}},
747
+ "near_duplicates": {{"status": "...", "note": "..."}}
748
+ }},
749
+ "priority_fixes": [
750
+ {{
751
+ "rank": 1,
752
+ "issue": "Brand primary fails AA",
753
+ "impact": "high|medium|low",
754
+ "effort": "low|medium|high",
755
+ "action": "Change #06b2c4 β†’ #0891a8 for 4.5:1 contrast"
756
+ }}
757
+ ],
758
+ "passing_practices": ["Base font size", "..."],
759
+ "failing_practices": ["AA compliance", "..."],
760
+ "self_evaluation": {{
761
+ "confidence": <1-10>,
762
+ "reasoning": "Why I am this confident",
763
+ "data_quality": "good|fair|poor",
764
+ "flags": []
765
+ }}
766
+ }}
767
+
768
+ Return ONLY valid JSON."""
769
+
770
+ def __init__(self, hf_client):
771
+ self.hf_client = hf_client
772
+
773
+ async def analyze(
774
+ self,
775
+ rule_engine_results: Any,
776
+ log_callback: Callable = None,
777
+ ) -> BestPracticesResult:
778
+ """
779
+ Validate against best practices.
780
+
781
+ Args:
782
+ rule_engine_results: Results from rule engine
783
+ log_callback: Progress logging function
784
+
785
+ Returns:
786
+ BestPracticesResult with validation
787
+ """
788
+ def log(msg: str):
789
+ if log_callback:
790
+ log_callback(msg)
791
+
792
+ log("")
793
+ log(" βœ… SENTINEL β€” Best Practices Validator (Qwen 72B)")
794
+ log(" └─ Checking against design system standards...")
795
+
796
+ # Extract data from rule engine
797
+ typo = rule_engine_results.typography
798
+ spacing = rule_engine_results.spacing
799
+ color_stats = rule_engine_results.color_stats
800
+ accessibility = rule_engine_results.accessibility
801
+
802
+ failures = [a for a in accessibility if not a.passes_aa_normal]
803
+ failing_colors_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]])
804
+
805
+ prompt = self.PROMPT_TEMPLATE.format(
806
+ type_ratio=f"{typo.detected_ratio:.3f}",
807
+ type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
808
+ base_size=typo.sizes_px[0] if typo.sizes_px else 16,
809
+ type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
810
+ total_colors=len(accessibility),
811
+ aa_pass=len(accessibility) - len(failures),
812
+ aa_fail=len(failures),
813
+ failing_colors=failing_colors_str or "None",
814
+ spacing_base=spacing.detected_base,
815
+ spacing_aligned=f"{spacing.alignment_percentage:.0f}",
816
+ spacing_recommendation=spacing.recommendation,
817
+ unique_colors=color_stats.unique_count,
818
+ duplicates=color_stats.duplicate_count,
819
+ near_duplicates=len(color_stats.near_duplicates),
820
+ )
821
+
822
+ try:
823
+ start_time = datetime.now()
824
+
825
+ response = await self.hf_client.complete_async(
826
+ agent_name="best_practices_validator",
827
+ system_prompt=self.SYSTEM_PROMPT,
828
+ user_message=prompt,
829
+ max_tokens=1000,
830
+ json_mode=True,
831
+ )
832
+
833
+ duration = (datetime.now() - start_time).total_seconds()
834
+
835
+ result = self._parse_response(response)
836
+
837
+ log(f" ────────────────────────────────────────────────")
838
+ log(f" βœ… SENTINEL β€” Best Practices: COMPLETE ({duration:.1f}s)")
839
+ log(f" β”œβ”€ Overall Score: {result.overall_score}/100")
840
+ log(f" β”œβ”€ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
841
+ if result.priority_fixes:
842
+ log(f" β”œβ”€ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")
843
+ se = result.self_evaluation
844
+ if se:
845
+ log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
846
+
847
+ return result
848
+
849
+ except Exception as e:
850
+ log(f" β”œβ”€ ⚠️ Best Practices Validator failed: {str(e)[:120]}")
851
+ return BestPracticesResult()
852
+
853
+ def _parse_response(self, response: str) -> BestPracticesResult:
854
+ """Parse LLM response into BestPracticesResult."""
855
+ try:
856
+ json_match = re.search(r'\{[\s\S]*\}', response)
857
+ if json_match:
858
+ data = json.loads(json_match.group())
859
+ return BestPracticesResult(
860
+ overall_score=data.get("overall_score", 50),
861
+ checks=data.get("checks", {}),
862
+ priority_fixes=data.get("priority_fixes", []),
863
+ passing_practices=data.get("passing_practices", []),
864
+ failing_practices=data.get("failing_practices", []),
865
+ self_evaluation=data.get("self_evaluation", {}),
866
+ )
867
+ except Exception:
868
+ pass
869
+
870
+ return BestPracticesResult()
871
+
872
+
873
# =============================================================================
# HEAD SYNTHESIZER AGENT
# =============================================================================


class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect & Synthesizer.

    Combines all agent outputs into final actionable recommendations.
    Model: Llama 3.3 70B (128K context for combined inputs, strong synthesis capability)
    Temperature: 0.3 (balanced — needs to synthesize creatively but stay grounded in data)

    This is the final step that produces actionable output for the user.
    """

    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect specializing in synthesis and actionable recommendations.

## YOUR ROLE IN THE PIPELINE
You are Agent 4 of 4 — the HEAD Synthesizer in the Design System Analysis pipeline.
- INPUT: Combined outputs from Rule Engine + AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices)
- OUTPUT: Final executive summary, scores, and prioritized action plan → displayed directly to the user
- You are the LAST agent. Your output IS the final result. Make it count.

## YOUR EXPERTISE
- Design system architecture and governance
- Synthesizing conflicting recommendations into coherent strategy
- Effort/impact prioritization (what to fix first)
- Color accessibility remediation (suggesting AA-compliant alternatives)
- Executive communication (clear, actionable summaries)

## QUALITY STANDARDS
- Executive Summary must be 2-3 sentences MAX. Lead with the overall score, then the #1 issue, then the #1 action.
- Overall Score must SYNTHESIZE all agent inputs — don't just average them.
- Color recommendations must include BOTH current AND suggested hex values.
- Top 3 Actions must be ordered by IMPACT, not ease.
- Accept/reject defaults on color recs: default to "accept" for accessibility fixes, "reject" for purely aesthetic changes.

## WHAT NOT TO DO
- Don't contradict previous agents without explaining why.
- Don't recommend changes that SENTINEL flagged as breaking.
- Don't suggest more than 8 color changes — the user will ignore a long list.
- Don't give vague actions like "improve accessibility" — be specific: "Change brand.primary from #06b2c4 to #0891a8 for 4.5:1 contrast".
- Don't inflate scores to be "nice". If the design system has issues, say so clearly.

## SCORING RUBRIC (Overall 0-100):
- 90-100: Production-ready design system, minor polishing only
- 75-89: Solid foundation, 2-3 targeted improvements needed
- 60-74: Functional but needs focused attention on accessibility or consistency
- 40-59: Significant gaps requiring systematic improvement
- 20-39: Major rework needed across multiple dimensions
- 0-19: Fundamental redesign recommended"""

    # Per-run user prompt; placeholders are filled in synthesize(). Literal
    # braces in the JSON example are doubled because this goes through .format().
    PROMPT_TEMPLATE = """Synthesize all analysis results into a final, actionable design system report.

## RULE ENGINE FACTS (Layer 1 — Free, deterministic)

- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## AURORA — Brand Identification (Agent 1)

- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## ATLAS — Benchmark Advice (Agent 2)

Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## SENTINEL — Best Practices Validation (Agent 3)

Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED

{accessibility_fixes}

## YOUR TASK

Synthesize ALL the above into:
1. Executive Summary (2-3 sentences — lead with score, #1 issue, #1 action)
2. Overall Scores (synthesized, not averaged)
3. Top 3 Priority Actions (ordered by IMPACT, include effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation
7. Self-Evaluation of your synthesis

## OUTPUT FORMAT (JSON only)

{{
  "executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
  "scores": {{
    "overall": <0-100>,
    "accessibility": <0-100>,
    "consistency": <0-100>,
    "organization": <0-100>
  }},
  "benchmark_fit": {{
    "closest": "<name>",
    "similarity": "<X%>",
    "recommendation": "Specific action to align"
  }},
  "brand_analysis": {{
    "primary": "#hex",
    "secondary": "#hex",
    "cohesion": <1-10>
  }},
  "top_3_actions": [
    {{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
  ],
  "color_recommendations": [
    {{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
  ],
  "type_scale_recommendation": {{
    "current_ratio": 1.18,
    "recommended_ratio": 1.25,
    "reason": "Why this ratio is better"
  }},
  "spacing_recommendation": {{
    "current": "mixed",
    "recommended": "8px",
    "reason": "Why this grid is better"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in the synthesis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the shared Hugging Face inference client used for all LLM calls."""
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Deterministic Layer-1 measurements.
            benchmark_comparisons: Benchmark matches, best match first —
                TODO confirm callers pre-sort by match percentage.
            brand_identification: AURORA output (brand colors, cohesion).
            benchmark_advice: ATLAS output (alignment changes).
            best_practices: SENTINEL output (score, priority fixes).
            log_callback: Optional progress logging function.

        Returns:
            HeadSynthesis with the final report (empty on LLM failure).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: NEXUS — HEAD SYNTHESIZER (Llama 3.3 70B)")
        log("═" * 60)
        log("")
        log("  Combining: Rule Engine + AURORA + ATLAS + SENTINEL...")

        # Extract the rule-engine facts referenced by the prompt.
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Up to 5 AA failures that already carry a suggested replacement color.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join(
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:5] if a.suggested_fix
        )

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes="; ".join(c.get("change", "") for c in benchmark_advice.alignment_changes[:3]),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes="; ".join(f.get("issue", "") for f in best_practices.priority_fixes[:3]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1200,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log("")
            log(f"  ✅ NEXUS — HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            if result.scores:
                log(f"  ├─ Overall Score: {result.scores.get('overall', '?')}/100")
            log(f"  ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            se = result.self_evaluation
            if se:
                log(f"  └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            log("")

            return result

        except Exception as e:
            # Degrade gracefully: an empty synthesis is better than a crash
            # at the very last pipeline stage.
            log(f"  ├─ ⚠️ Head Synthesizer failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse the LLM's reply into a HeadSynthesis.

        Extracts the outermost {...} span (models sometimes wrap the JSON in
        prose) and falls back to a default-constructed result on any failure.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return HeadSynthesis(
                    executive_summary=data.get("executive_summary", ""),
                    scores=data.get("scores", {}),
                    benchmark_fit=data.get("benchmark_fit", {}),
                    brand_analysis=data.get("brand_analysis", {}),
                    top_3_actions=data.get("top_3_actions", []),
                    color_recommendations=data.get("color_recommendations", []),
                    type_scale_recommendation=data.get("type_scale_recommendation", {}),
                    spacing_recommendation=data.get("spacing_recommendation", {}),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Malformed JSON from the model — fall through to the default.
            pass

        return HeadSynthesis()