"""
Stage 2 LLM Agents β€” v3 Agentic Architecture
==============================================

Each agent:
- Researches ALL token types (colors, typography, spacing, radius, shadows)
- Uses ReAct framework: THINK β†’ ACT β†’ OBSERVE β†’ VERIFY
- Returns visible reasoning chain for the UI
- Has a Python-based critic for validation

Agents run IN PARALLEL (asyncio.gather), then NEXUS compiles.

Agent Responsibilities:
- AURORA: Brand identity + semantic naming for ALL colors + notes on all token types
- SENTINEL: Best practices audit across ALL token types, grounded in rule-engine data
- ATLAS: Benchmark comparison for ALL token types
- NEXUS (HEAD): Tree-of-Thought synthesis, compiles all agent outputs
"""

import json
import re
from dataclasses import dataclass, field
from typing import Callable, Any
from datetime import datetime


# =============================================================================
# DATA CLASSES — v3: includes reasoning_trace + naming_map
# =============================================================================

@dataclass
class BrandIdentification:
    """Results from AURORA β€” Brand Identifier (ReAct)."""
    brand_primary: dict = field(default_factory=dict)
    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)
    palette_strategy: str = ""
    cohesion_score: int = 5
    cohesion_notes: str = ""

    # v3: naming_map covers ALL colors, not just top 10
    naming_map: dict = field(default_factory=dict)
    # {hex: "color.brand.primary"} or {hex: "color.blue.500"}

    semantic_names: dict = field(default_factory=dict)  # backward compat
    self_evaluation: dict = field(default_factory=dict)

    # v3: reasoning trace visible to user
    reasoning_trace: list = field(default_factory=list)
    validation_passed: bool = False
    retry_count: int = 0

    # v3: per-token-type observations
    typography_notes: str = ""
    spacing_notes: str = ""
    radius_notes: str = ""
    shadow_notes: str = ""

    def to_dict(self) -> dict:
        return {
            "brand_primary": self.brand_primary,
            "brand_secondary": self.brand_secondary,
            "brand_accent": self.brand_accent,
            "palette_strategy": self.palette_strategy,
            "cohesion_score": self.cohesion_score,
            "cohesion_notes": self.cohesion_notes,
            "naming_map": self.naming_map,
            "semantic_names": self.semantic_names,
            "self_evaluation": self.self_evaluation,
            "typography_notes": self.typography_notes,
            "spacing_notes": self.spacing_notes,
            "radius_notes": self.radius_notes,
            "shadow_notes": self.shadow_notes,
        }


@dataclass
class BenchmarkAdvice:
    """Results from ATLAS β€” Benchmark Advisor (ReAct)."""
    recommended_benchmark: str = ""
    recommended_benchmark_name: str = ""
    reasoning: str = ""
    alignment_changes: list = field(default_factory=list)
    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)
    alternative_benchmarks: list = field(default_factory=list)
    self_evaluation: dict = field(default_factory=dict)

    # v3: per-token-type benchmark comparison
    typography_comparison: dict = field(default_factory=dict)
    spacing_comparison: dict = field(default_factory=dict)
    color_comparison: dict = field(default_factory=dict)
    radius_comparison: dict = field(default_factory=dict)
    shadow_comparison: dict = field(default_factory=dict)

    reasoning_trace: list = field(default_factory=list)

    def to_dict(self) -> dict:
        return {
            "recommended_benchmark": self.recommended_benchmark,
            "recommended_benchmark_name": self.recommended_benchmark_name,
            "reasoning": self.reasoning,
            "alignment_changes": self.alignment_changes,
            "pros": self.pros_of_alignment,
            "cons": self.cons_of_alignment,
            "alternatives": self.alternative_benchmarks,
            "self_evaluation": self.self_evaluation,
            "typography_comparison": self.typography_comparison,
            "spacing_comparison": self.spacing_comparison,
            "color_comparison": self.color_comparison,
            "radius_comparison": self.radius_comparison,
            "shadow_comparison": self.shadow_comparison,
        }


@dataclass
class BestPracticesResult:
    """Results from SENTINEL β€” Best Practices Auditor (ReAct)."""
    overall_score: int = 50
    checks: dict = field(default_factory=dict)
    priority_fixes: list = field(default_factory=list)
    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)
    self_evaluation: dict = field(default_factory=dict)

    # v3: per-token-type assessments
    color_assessment: dict = field(default_factory=dict)
    typography_assessment: dict = field(default_factory=dict)
    spacing_assessment: dict = field(default_factory=dict)
    radius_assessment: dict = field(default_factory=dict)
    shadow_assessment: dict = field(default_factory=dict)

    reasoning_trace: list = field(default_factory=list)
    validation_passed: bool = False

    def to_dict(self) -> dict:
        return {
            "overall_score": self.overall_score,
            "checks": self.checks,
            "priority_fixes": self.priority_fixes,
            "passing": self.passing_practices,
            "failing": self.failing_practices,
            "self_evaluation": self.self_evaluation,
            "color_assessment": self.color_assessment,
            "typography_assessment": self.typography_assessment,
            "spacing_assessment": self.spacing_assessment,
            "radius_assessment": self.radius_assessment,
            "shadow_assessment": self.shadow_assessment,
        }


@dataclass
class HeadSynthesis:
    """Results from NEXUS β€” HEAD Synthesizer (Tree of Thought)."""
    executive_summary: str = ""
    scores: dict = field(default_factory=dict)
    benchmark_fit: dict = field(default_factory=dict)
    brand_analysis: dict = field(default_factory=dict)
    top_3_actions: list = field(default_factory=list)
    color_recommendations: list = field(default_factory=list)
    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)
    radius_recommendation: dict = field(default_factory=dict)
    shadow_recommendation: dict = field(default_factory=dict)
    self_evaluation: dict = field(default_factory=dict)

    # v3: ToT branches visible to user
    perspective_a: dict = field(default_factory=dict)
    perspective_b: dict = field(default_factory=dict)
    chosen_perspective: str = ""
    choice_reasoning: str = ""

    reasoning_trace: list = field(default_factory=list)

    def to_dict(self) -> dict:
        return {
            "executive_summary": self.executive_summary,
            "scores": self.scores,
            "benchmark_fit": self.benchmark_fit,
            "brand_analysis": self.brand_analysis,
            "top_3_actions": self.top_3_actions,
            "color_recommendations": self.color_recommendations,
            "type_scale_recommendation": self.type_scale_recommendation,
            "spacing_recommendation": self.spacing_recommendation,
            "radius_recommendation": self.radius_recommendation,
            "shadow_recommendation": self.shadow_recommendation,
            "self_evaluation": self.self_evaluation,
            "chosen_perspective": self.chosen_perspective,
            "choice_reasoning": self.choice_reasoning,
        }


# =============================================================================
# SHARED HELPERS — format token data for prompts
# =============================================================================

def _fmt_colors(tokens: dict, limit: int = 40) -> str:
    """Format color tokens for any agent prompt."""
    if not tokens:
        return "No color data"
    lines = []
    for name, t in list(tokens.items())[:limit]:
        d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {}
        hex_val = d.get("value", "")
        freq = d.get("frequency", 0)
        hint = d.get("role_hint", "")
        ctx = ", ".join((d.get("contexts") or [])[:3])
        els = ", ".join((d.get("elements") or [])[:3])
        hint_s = f" [hint:{hint}]" if hint else ""
        lines.append(f"- {hex_val}: {freq}x, ctx=[{ctx}], el=[{els}]{hint_s}")
    return "\n".join(lines)


def _fmt_typography(tokens: dict, limit: int = 15) -> str:
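    """Format typography tokens for an agent prompt (one line per text style)."""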
    if not tokens:
        return "No typography data"
    lines = []
    for name, t in list(tokens.items())[:limit]:
        d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {}
        fam = d.get("font_family", "?")
        sz = d.get("font_size", "?")
        w = d.get("font_weight", 400)
        lh = d.get("line_height", "?")
        freq = d.get("frequency", 0)
        els = ", ".join((d.get("elements") or [])[:3])
        lines.append(f"- {fam} {sz} w{w} lh={lh} ({freq}x) [{els}]")
    return "\n".join(lines)


def _fmt_spacing(tokens: dict, limit: int = 15) -> str:
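    """Format spacing tokens for an agent prompt (value, px, frequency, contexts)."""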
    if not tokens:
        return "No spacing data"
    lines = []
    for name, t in list(tokens.items())[:limit]:
        d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {}
        val = d.get("value", "?")
        px = d.get("value_px", "?")
        freq = d.get("frequency", 0)
        ctx = ", ".join((d.get("contexts") or [])[:3])
        lines.append(f"- {val} ({px}px) {freq}x [{ctx}]")
    return "\n".join(lines)


def _fmt_radius(tokens: dict, limit: int = 10) -> str:
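    """Format radius tokens for an agent prompt, noting base-4/base-8 grid fit."""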
    if not tokens:
        return "No radius data"
    lines = []
    for name, t in list(tokens.items())[:limit]:
        d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {}
        val = d.get("value", "?")
        px = d.get("value_px", "?")
        freq = d.get("frequency", 0)
        b4 = d.get("fits_base_4", False)
        b8 = d.get("fits_base_8", False)
        els = ", ".join((d.get("elements") or [])[:3])
        lines.append(f"- {name}: {val} (base4={b4}, base8={b8}, {freq}x) [{els}]")
    return "\n".join(lines)


def _fmt_shadows(tokens: dict, limit: int = 10) -> str:
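    """Format shadow tokens for an agent prompt (blur, y-offset, frequency)."""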
    if not tokens:
        return "No shadow data"
    lines = []
    for name, t in list(tokens.items())[:limit]:
        d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {}
        blur = d.get("blur_px", "?")
        y = d.get("y_offset_px", "?")
        freq = d.get("frequency", 0)
        els = ", ".join((d.get("elements") or [])[:3])
        lines.append(f"- {name}: blur={blur}px y={y}px ({freq}x) [{els}]")
    return "\n".join(lines)


def _log_reasoning(steps: list, log_fn: Callable):
    """Log ReAct reasoning steps with full content (no truncation)."""
    icons = {"THINK": "🧠", "ACT": "⚑", "OBSERVE": "πŸ‘οΈ", "VERIFY": "βœ…"}
    for step in (steps or []):
        if isinstance(step, dict):
            st = step.get("step", "?")
            area = step.get("area", "")
            content = step.get("content", "")
            icon = icons.get(st, "📝")
            # Show full reasoning — wrap long lines for readability
            if len(content) > 120:
                log_fn(f"   {icon} [{st}] {area}:")
                # Word-wrap at ~100 chars per line
                words = content.split()
                line = "      "
                for word in words:
                    if len(line) + len(word) + 1 > 105:
                        log_fn(line)
                        line = "      " + word
                    else:
                        line = line + " " + word if line.strip() else "      " + word
                if line.strip():
                    log_fn(line)
            else:
                log_fn(f"   {icon} [{st}] {area}: {content}")


def _extract_hexes(tokens: dict) -> list:
    """Get list of hex values from color token dict."""
    hexes = []
    for name, t in tokens.items():
        if isinstance(t, dict):
            h = t.get("value", "")
        else:
            h = getattr(t, "value", "")
        if h:
            hexes.append(h.lower())
    return hexes


# =============================================================================
# AURORA — Brand Identifier (ReAct Framework)
# =============================================================================

class BrandIdentifierAgent:
    """
    AURORA — Senior Brand & Visual Identity Analyst.
    v3.1: ADVISORY ONLY — does NOT name colors (rule-based classifier does that).
    Provides brand insights, palette strategy, cohesion assessment.
    Model: Qwen 72B · Temperature: 0.4
    """

    SYSTEM_PROMPT = """You are AURORA, a Senior Brand & Visual Identity Analyst.

## YOUR ROLE (v3.1: Advisory Only)
Color NAMING is handled by a rule-based classifier. Do NOT output naming_map.
Your job is to provide INSIGHTS about the brand identity and design cohesion.

## REASONING FRAMEWORK (ReAct)
Structure your response with explicit reasoning steps.
For each area: THINK → ACT → OBSERVE → VERIFY.

## ANALYZE ALL TOKEN TYPES:

### 1. COLORS — Identify brand strategy (complementary? analogous? monochromatic?)
### 2. TYPOGRAPHY — Identify heading vs body hierarchy, font pairing quality
### 3. SPACING — Identify grid system, note consistency
### 4. RADIUS — Identify radius strategy (sharp/rounded/pill)
### 5. SHADOWS — Identify elevation strategy, blur progression

## QUALITY RULES
- Brand Primary MUST cite usage evidence (e.g. "47x on buttons")
- Cohesion 1-10: most sites score 5-7. Use the full range.
- Do NOT invent names. Focus on analysis and insights.

## OUTPUT (JSON)

{
  "reasoning_steps": [
    {"step": "THINK", "area": "colors", "content": "..."},
    {"step": "ACT", "area": "colors", "content": "..."},
    {"step": "OBSERVE", "area": "typography", "content": "..."},
    {"step": "ACT", "area": "spacing", "content": "..."},
    {"step": "ACT", "area": "radius", "content": "..."},
    {"step": "ACT", "area": "shadows", "content": "..."},
    {"step": "VERIFY", "area": "all", "content": "Cross-checking consistency..."}
  ],
  "brand_primary": {"color": "#hex", "confidence": "high|medium|low", "reasoning": "cite evidence", "usage_count": N},
  "brand_secondary": {"color": "#hex", "confidence": "...", "reasoning": "..."},
  "brand_accent": {"color": "#hex or null", "confidence": "...", "reasoning": "..."},
  "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
  "cohesion_score": N,
  "cohesion_notes": "...",
  "naming_map": {},  // Optional: ONLY semantic role suggestions (brand.primary, text.secondary, etc.)
  "typography_notes": "Heading: Inter 700, Body: Inter 400. Clean hierarchy.",
  "spacing_notes": "8px grid, 92% aligned.",
  "radius_notes": "Rounded style: 4px inputs, 8px cards.",
  "shadow_notes": "3-level elevation: blur 4/8/24px.",
  "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "good|fair|poor", "flags": []}
}

Return ONLY valid JSON."""

    PROMPT_TEMPLATE = """Analyze the complete design system.

## COLORS (with role_hints)
{color_data}

## TYPOGRAPHY
{typography_data}

## SPACING
{spacing_data}

## RADIUS
{radius_data}

## SHADOWS
{shadow_data}

Use ReAct for each area. If you see clear semantic roles (brand primary, text color, etc.), suggest them in naming_map. Otherwise leave naming_map empty — the rule-based classifier handles naming."""

    def __init__(self, hf_client):
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        typography_tokens: dict = None,
        spacing_tokens: dict = None,
        radius_tokens: dict = None,
        shadow_tokens: dict = None,
        log_callback: Callable = None,
    ) -> BrandIdentification:
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("   🎨 AURORA β€” Brand & Visual Identity (Qwen 72B)")
        log("   └─ ReAct: Analyzing colors + typography + spacing + radius + shadows...")

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=_fmt_colors(color_tokens),
            typography_data=_fmt_typography(typography_tokens),
            spacing_data=_fmt_spacing(spacing_tokens),
            radius_data=_fmt_radius(radius_tokens),
            shadow_data=_fmt_shadows(shadow_tokens),
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=2000,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            # Critic validation
            input_hexes = _extract_hexes(color_tokens)
            passed, errors = validate_aurora_output(result, input_hexes)
            result.validation_passed = passed

            if not passed and result.retry_count == 0:
                log(f"   ⚠️ Critic: {len(errors)} issues β€” retrying with feedback...")
                for e in errors[:3]:
                    log(f"      └─ {e}")
                retry_prompt = prompt + "\n\n## CRITIC FEEDBACK β€” Fix:\n" + "\n".join(errors[:10])
                resp2 = await self.hf_client.complete_async(
                    agent_name="brand_identifier",
                    system_prompt=self.SYSTEM_PROMPT,
                    user_message=retry_prompt,
                    max_tokens=2000,
                    json_mode=True,
                )
                result = self._parse(resp2)
                result.retry_count = 1
                p2, e2 = validate_aurora_output(result, input_hexes)
                result.validation_passed = p2
                if not p2:
                    log(f"   ⚠️ Retry: still {len(e2)} issues β€” using normalizer fallback names")

            # Log reasoning chain
            log(f"   ─────────────────────────────────────────")
            log(f"   🎨 AURORA β€” COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            log(f"   β”œβ”€ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')})")
            log(f"   β”œβ”€ Palette: {result.palette_strategy} Β· Cohesion: {result.cohesion_score}/10")
            log(f"   β”œβ”€ Colors Named: {len(result.naming_map)}/{len(input_hexes)}")
            log(f"   β”œβ”€ Typography: {result.typography_notes or 'N/A'}")
            log(f"   β”œβ”€ Spacing: {result.spacing_notes or 'N/A'}")
            log(f"   β”œβ”€ Radius: {result.radius_notes or 'N/A'}")
            log(f"   β”œβ”€ Shadows: {result.shadow_notes or 'N/A'}")
            log(f"   └─ Critic: {'βœ… PASSED' if result.validation_passed else '⚠️ FALLBACK'}")
            return result

        except Exception as e:
            log(f"   ⚠️ AURORA failed: {str(e)[:120]}")
            return BrandIdentification()

    def _parse(self, response: str) -> BrandIdentification:
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return BrandIdentification(
                    brand_primary=d.get("brand_primary", {}),
                    brand_secondary=d.get("brand_secondary", {}),
                    brand_accent=d.get("brand_accent", {}),
                    palette_strategy=d.get("palette_strategy", "unknown"),
                    cohesion_score=d.get("cohesion_score", 5),
                    cohesion_notes=d.get("cohesion_notes", ""),
                    naming_map=d.get("naming_map", {}),
                    semantic_names=d.get("naming_map", {}),
                    self_evaluation=d.get("self_evaluation", {}),
                    reasoning_trace=d.get("reasoning_steps", []),
                    typography_notes=d.get("typography_notes", ""),
                    spacing_notes=d.get("spacing_notes", ""),
                    radius_notes=d.get("radius_notes", ""),
                    shadow_notes=d.get("shadow_notes", ""),
                )
        except Exception:
            pass
        return BrandIdentification()


# =============================================================================
# ATLAS — Benchmark Advisor (ReAct Framework)
# =============================================================================

class BenchmarkAdvisorAgent:
    """
    ATLAS — Senior Design System Benchmark Analyst.
    ReAct comparison of ALL token types against industry benchmarks.
    Model: Llama 3.3 70B · Temperature: 0.25
    """

    SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst.

## REASONING FRAMEWORK (ReAct)
For EACH token type: THINK → ACT → OBSERVE → VERIFY.

Compare the user's values against benchmarks for:
1. TYPOGRAPHY — ratio, base size, scale pattern
2. SPACING — grid base, alignment, scale
3. COLORS — palette size, brand color usage
4. RADIUS — strategy (sharp/rounded/pill), tier count
5. SHADOWS — elevation levels, blur range

Then pick the BEST OVERALL FIT benchmark.
Max 4 alignment changes. If >85% match, say "already well-aligned".

## OUTPUT (JSON)

{
  "reasoning_steps": [
    {"step": "THINK", "area": "typography", "content": "User ratio 1.18 vs Material 1.25..."},
    {"step": "ACT", "area": "typography", "content": "Material closest for type"},
    {"step": "THINK", "area": "spacing", "content": "8px matches Material and Polaris"},
    {"step": "ACT", "area": "spacing", "content": "Both aligned"},
    {"step": "THINK", "area": "colors", "content": "25 colors vs Polaris 18..."},
    {"step": "THINK", "area": "radius", "content": "4/8px tiers..."},
    {"step": "THINK", "area": "shadows", "content": "3 levels vs Material 5..."},
    {"step": "VERIFY", "area": "overall", "content": "Material best: 4/5 areas align"}
  ],
  "recommended_benchmark": "material_design_3",
  "recommended_benchmark_name": "Material Design 3",
  "reasoning": "Best fit across all token types β€” cite data",
  "alignment_changes": [
    {"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium", "token_type": "typography"}
  ],
  "typography_comparison": {"user": "1.18", "benchmark": "1.25", "gap": "minor"},
  "spacing_comparison": {"user": "8px", "benchmark": "8px", "gap": "aligned"},
  "color_comparison": {"user": "25", "benchmark": "18", "gap": "reduce"},
  "radius_comparison": {"user": "2 tiers", "benchmark": "3 tiers", "gap": "add xl"},
  "shadow_comparison": {"user": "3 levels", "benchmark": "5 levels", "gap": "add 2"},
  "pros_of_alignment": ["..."],
  "cons_of_alignment": ["..."],
  "alternative_benchmarks": [{"name": "Polaris", "reason": "..."}],
  "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []}
}

Return ONLY valid JSON."""

    PROMPT_TEMPLATE = """Compare this design system against benchmarks β€” ALL token types.

## CURRENT VALUES
- Type Scale Ratio: {user_ratio} | Base: {user_base}px | Sizes: {user_sizes}
- Spacing Grid: {user_spacing}px | Values: {spacing_values}
- Colors: {color_count} unique | Brand: {brand_info}
- Radius: {radius_data}
- Shadows: {shadow_data}

## BENCHMARKS
{benchmark_comparison}

Use ReAct per token type. Pick the best overall fit."""

    def __init__(self, hf_client):
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float, user_base: int, user_spacing: int,
        benchmark_comparisons: list,
        color_count: int = 0, brand_info: str = "",
        user_sizes: str = "", spacing_values: str = "",
        radius_data: str = "", shadow_data: str = "",
        log_callback: Callable = None,
    ) -> BenchmarkAdvice:
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("")
        log("   🏒 ATLAS β€” Benchmark Advisor (Llama 3.3 70B)")
        log("   └─ ReAct: Comparing typography + spacing + colors + radius + shadows...")

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio, user_base=user_base, user_spacing=user_spacing,
            user_sizes=user_sizes or "N/A",
            spacing_values=spacing_values or "N/A",
            color_count=color_count, brand_info=brand_info or "N/A",
            radius_data=radius_data or "No radius data",
            shadow_data=shadow_data or "No shadow data",
            benchmark_comparison=self._fmt_benchmarks(benchmark_comparisons),
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1500,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            log(f"   ─────────────────────────────────────────")
            log(f"   🏒 ATLAS β€” COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            log(f"   β”œβ”€ Recommended: {result.recommended_benchmark_name}")
            log(f"   β”œβ”€ Changes: {len(result.alignment_changes)}")
            log(f"   β”œβ”€ Typography: {result.typography_comparison}")
            log(f"   β”œβ”€ Spacing: {result.spacing_comparison}")
            log(f"   β”œβ”€ Colors: {result.color_comparison}")
            log(f"   β”œβ”€ Radius: {result.radius_comparison}")
            log(f"   └─ Shadows: {result.shadow_comparison}")
            return result

        except Exception as e:
            log(f"   ⚠️ ATLAS failed: {str(e)[:120]}")
            return BenchmarkAdvice()

    def _fmt_benchmarks(self, comparisons: list) -> str:
        lines = []
        for i, c in enumerate(comparisons[:5]):
            b = c.benchmark
            lines.append(f"{i+1}. {b.icon} {b.name} β€” Match: {c.overall_match_pct:.0f}%"
                         f" | Type: {b.typography.get('scale_ratio', '?')}"
                         f" | Spacing: {b.spacing.get('base', '?')}px"
                         f" | Best for: {', '.join(b.best_for)}")
        return "\n".join(lines) if lines else "No benchmark data"

    def _parse(self, response: str) -> BenchmarkAdvice:
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return BenchmarkAdvice(
                    recommended_benchmark=d.get("recommended_benchmark", ""),
                    recommended_benchmark_name=d.get("recommended_benchmark_name", ""),
                    reasoning=d.get("reasoning", ""),
                    alignment_changes=d.get("alignment_changes", []),
                    pros_of_alignment=d.get("pros_of_alignment", []),
                    cons_of_alignment=d.get("cons_of_alignment", []),
                    alternative_benchmarks=d.get("alternative_benchmarks", []),
                    self_evaluation=d.get("self_evaluation", {}),
                    typography_comparison=d.get("typography_comparison", {}),
                    spacing_comparison=d.get("spacing_comparison", {}),
                    color_comparison=d.get("color_comparison", {}),
                    radius_comparison=d.get("radius_comparison", {}),
                    shadow_comparison=d.get("shadow_comparison", {}),
                    reasoning_trace=d.get("reasoning_steps", []),
                )
        except Exception:
            pass
        return BenchmarkAdvice()


# =============================================================================
# SENTINEL — Best Practices Auditor (ReAct + Grounded Scoring)
# =============================================================================

class BestPracticesValidatorAgent:
    """
    SENTINEL — Design System Best Practices Auditor.
    ReAct: Grounds EVERY score in actual rule-engine data. Audits ALL token types.
    Model: Qwen 72B · Temperature: 0.2
    """

    SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor.

## REASONING FRAMEWORK (ReAct + Grounded)
For EACH check: THINK → ACT (cite data) → OBSERVE → VERIFY.
You MUST CITE the exact input data for every score.

## AUDIT ALL TOKEN TYPES:

### COLORS (25 pts)
- aa_compliance: CITE AA pass/fail count
- color_count: < 20 semantic colors ideal
- near_duplicates: should be 0

### TYPOGRAPHY (25 pts)
- type_scale_standard: nearest standard ratio
- type_scale_consistent: variance check
- base_size_accessible: >= 16px

### SPACING (20 pts)
- spacing_grid: 4px or 8px consistency
- spacing_alignment: > 80% target

### RADIUS (15 pts)
- radius_consistency: base-4/8 grid, clear tiers

### SHADOWS (15 pts)
- shadow_system: elevation hierarchy, blur progression

## CRITICAL: If data says 7 AA failures, you CANNOT say "pass".

## OUTPUT (JSON)

{
  "reasoning_steps": [
    {"step": "THINK", "area": "colors", "content": "7/25 fail AA = 28%"},
    {"step": "ACT", "area": "colors", "content": "aa_compliance = FAIL"},
    {"step": "THINK", "area": "typography", "content": "ratio 1.18, variance 0.22"},
    {"step": "ACT", "area": "typography", "content": "type_scale_consistent = WARN"},
    {"step": "THINK", "area": "spacing", "content": "8px base, 85% aligned"},
    {"step": "ACT", "area": "spacing", "content": "spacing_grid = PASS"},
    {"step": "THINK", "area": "radius", "content": "4px,8px,16px all base-4"},
    {"step": "ACT", "area": "radius", "content": "radius_consistency = PASS"},
    {"step": "THINK", "area": "shadows", "content": "3 levels, blur 4β†’8β†’24"},
    {"step": "ACT", "area": "shadows", "content": "shadow_system = WARN"},
    {"step": "VERIFY", "area": "scoring", "content": "3 pass, 2 warn, 1 fail β†’ 62/100"}
  ],
  "overall_score": N,
  "checks": {
    "aa_compliance": {"status": "pass|warn|fail", "note": "CITE: 7/25 fail AA"},
    "type_scale_standard": {"status": "...", "note": "CITE: ratio 1.18 nearest 1.2"},
    "type_scale_consistent": {"status": "...", "note": "CITE: variance 0.22 > 0.15"},
    "base_size_accessible": {"status": "...", "note": "CITE: base = Npx"},
    "spacing_grid": {"status": "...", "note": "CITE: N% aligned to Npx"},
    "color_count": {"status": "...", "note": "CITE: N unique colors"},
    "near_duplicates": {"status": "...", "note": "CITE: N pairs"},
    "radius_consistency": {"status": "...", "note": "CITE: tiers and grid"},
    "shadow_system": {"status": "...", "note": "CITE: N levels, progression"}
  },
  "color_assessment": {"aa_pass_rate": "72%", "palette_size": 25, "verdict": "needs work"},
  "typography_assessment": {"ratio": 1.18, "consistent": false, "base_ok": true, "verdict": "fair"},
  "spacing_assessment": {"grid": "8px", "alignment": "85%", "verdict": "good"},
  "radius_assessment": {"tiers": 3, "base_aligned": true, "verdict": "good"},
  "shadow_assessment": {"levels": 3, "progression": "non-linear", "verdict": "fair"},
  "priority_fixes": [
    {"rank": 1, "issue": "...", "impact": "high", "effort": "low", "action": "Specific fix", "token_type": "color"}
  ],
  "passing_practices": ["spacing_grid"],
  "failing_practices": ["aa_compliance"],
  "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []}
}

Return ONLY valid JSON."""

    PROMPT_TEMPLATE = """Audit this design system. CITE the data for every score.

## RULE ENGINE FACTS (verified)

### Typography
- Ratio: {type_ratio} ({type_consistent}) | Base: {base_size}px | Sizes: {sizes}

### Accessibility
- Total: {total_colors} | AA Pass: {aa_pass} | AA Fail: {aa_fail}
- Failing: {failing_colors}

### Spacing
- Base: {spacing_base}px | Aligned: {spacing_aligned}% | Values: {spacing_values}

### Color Stats
- Unique: {unique_colors} | Near-Duplicates: {near_duplicates}

### Radius
{radius_data}

### Shadows
{shadow_data}

CITE the EXACT numbers above for every check."""

    def __init__(self, hf_client):
        self.hf_client = hf_client

    async def analyze(
        self,
        rule_engine_results: Any,
        radius_tokens: dict = None,
        shadow_tokens: dict = None,
        log_callback: Callable = None,
    ) -> BestPracticesResult:
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("")
        log("   βœ… SENTINEL β€” Best Practices Auditor (Qwen 72B)")
        log("   └─ ReAct: Auditing colors + typography + spacing + radius + shadows...")

        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility
        failures = [a for a in accessibility if not a.passes_aa_normal]
        failing_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:8]])
        sizes_str = ", ".join([f"{s}px" for s in typo.sizes_px[:8]]) if typo.sizes_px else "N/A"
        sp_vals = ", ".join([f"{v}px" for v in spacing.current_values[:10]]) if hasattr(spacing, 'current_values') and spacing.current_values else "N/A"

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_consistent="consistent" if typo.is_consistent else f"inconsistent (var={typo.variance:.2f})",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            sizes=sizes_str,
            total_colors=len(accessibility),
            aa_pass=len(accessibility) - len(failures),
            aa_fail=len(failures),
            failing_colors=failing_str or "None",
            spacing_base=spacing.detected_base,
            spacing_aligned=f"{spacing.alignment_percentage:.0f}",
            spacing_values=sp_vals,
            unique_colors=color_stats.unique_count,
            near_duplicates=len(color_stats.near_duplicates),
            radius_data=_fmt_radius(radius_tokens) if radius_tokens else "No radius data",
            shadow_data=_fmt_shadows(shadow_tokens) if shadow_tokens else "No shadow data",
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="best_practices_validator",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=2000,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            # Critic cross-reference
            passed, errors = validate_sentinel_output(result, rule_engine_results)
            result.validation_passed = passed
            if not passed:
                log(f"   ⚠️ Critic: {len(errors)} issues β€” applying fixes...")
                for e in errors[:3]:
                    log(f"      └─ {e}")
                result = _apply_sentinel_fixes(result, rule_engine_results, errors)

            log(f"   ─────────────────────────────────────────")
            log(f"   βœ… SENTINEL β€” COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            log(f"   β”œβ”€ Overall Score: {result.overall_score}/100")
            for cn, cv in (result.checks or {}).items():
                if isinstance(cv, dict):
                    s = cv.get("status", "?")
                    si = {"pass": "βœ…", "warn": "⚠️", "fail": "❌"}.get(s, "?")
                    log(f"   β”‚  {si} {cn}: {s}")
            log(f"   β”œβ”€ Priority Fixes: {len(result.priority_fixes)}")
            log(f"   └─ Critic: {'βœ… PASSED' if result.validation_passed else '⚠️ FIXED'}")
            return result

        except Exception as e:
            log(f"   ⚠️ SENTINEL failed: {str(e)[:120]}")
            return BestPracticesResult()

    def _parse(self, response: str) -> BestPracticesResult:
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return BestPracticesResult(
                    overall_score=d.get("overall_score", 50),
                    checks=d.get("checks", {}),
                    priority_fixes=d.get("priority_fixes", []),
                    passing_practices=d.get("passing_practices", []),
                    failing_practices=d.get("failing_practices", []),
                    self_evaluation=d.get("self_evaluation", {}),
                    color_assessment=d.get("color_assessment", {}),
                    typography_assessment=d.get("typography_assessment", {}),
                    spacing_assessment=d.get("spacing_assessment", {}),
                    radius_assessment=d.get("radius_assessment", {}),
                    shadow_assessment=d.get("shadow_assessment", {}),
                    reasoning_trace=d.get("reasoning_steps", []),
                )
        except Exception:
            pass
        return BestPracticesResult()


# =============================================================================
# NEXUS — HEAD Synthesizer (Tree of Thought)
# =============================================================================

class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect.
    Tree of Thought: 2 perspectives, picks best, compiles all agent outputs.
    Recommendations for ALL token types.
    Model: Llama 3.3 70B · Temperature: 0.3
    """

    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect β€” the final synthesizer.

## REASONING FRAMEWORK (Tree of Thought)
Evaluate TWO perspectives:

### PERSPECTIVE A — Accessibility-First
Weights: accessibility=40%, consistency=30%, organization=30%
Penalize heavily for AA failures.

### PERSPECTIVE B — Balanced
Weights: accessibility=30%, consistency=35%, organization=35%
Equal emphasis across areas.

For each: calculate scores, determine top 3 actions.
Then CHOOSE the perspective that better reflects reality.

## SYNTHESIZE ALL TOKEN TYPES:
- Colors: AURORA brand + SENTINEL AA findings → color recommendations
- Typography: ATLAS benchmark match + SENTINEL scale audit → type scale rec
- Spacing: ATLAS grid comparison + SENTINEL alignment → spacing rec
- Radius: SENTINEL consistency + ATLAS benchmark → radius rec
- Shadows: SENTINEL elevation + ATLAS benchmark → shadow rec

## OUTPUT (JSON)

{
  "reasoning_steps": [
    {"step": "THINK", "area": "perspective_a", "content": "Accessibility-first weighting..."},
    {"step": "ACT", "area": "perspective_a", "content": "Score: overall=52..."},
    {"step": "THINK", "area": "perspective_b", "content": "Balanced weighting..."},
    {"step": "ACT", "area": "perspective_b", "content": "Score: overall=63..."},
    {"step": "OBSERVE", "area": "comparison", "content": "A shows severity of AA failures..."},
    {"step": "VERIFY", "area": "decision", "content": "Choosing A β€” honest about AA issues"}
  ],
  "perspective_a": {"scores": {"overall": 52, "accessibility": 38, "consistency": 72, "organization": 68}, "reasoning": "..."},
  "perspective_b": {"scores": {"overall": 63, "accessibility": 45, "consistency": 72, "organization": 68}, "reasoning": "..."},
  "chosen_perspective": "A",
  "choice_reasoning": "AA failures affect real users β€” lower score is more honest",
  "executive_summary": "Your design system scores X/100...",
  "scores": {"overall": 52, "accessibility": 38, "consistency": 72, "organization": 68},
  "top_3_actions": [
    {"action": "Fix AA compliance", "impact": "high", "effort": "medium", "details": "#X→#Y", "token_type": "color"}
  ],
  "color_recommendations": [
    {"role": "brand.primary", "current": "#hex", "suggested": "#hex", "reason": "AA", "accept": true}
  ],
  "type_scale_recommendation": {"current_ratio": 1.18, "recommended_ratio": 1.25, "reason": "..."},
  "spacing_recommendation": {"current": "8px", "recommended": "8px", "reason": "Already aligned"},
  "radius_recommendation": {"current": "3 tiers", "recommended": "Add xl tier", "reason": "..."},
  "shadow_recommendation": {"current": "3 levels", "recommended": "Add 2 more", "reason": "..."},
  "benchmark_fit": {"closest": "Material", "similarity": "78%", "recommendation": "..."},
  "brand_analysis": {"primary": "#hex", "secondary": "#hex", "cohesion": 7},
  "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []}
}

Return ONLY valid JSON."""

    PROMPT_TEMPLATE = """Synthesize all analysis into a final report.

## RULE ENGINE FACTS
- Type: {type_ratio} ({type_status}) | Base: {base_size}px
- AA Failures: {aa_failures}/{total_colors}
- Spacing: {spacing_status}
- Colors: {unique_colors} unique | Consistency: {consistency_score}/100
- Radius: {radius_facts}
- Shadows: {shadow_facts}

## AURORA β€” Brand Analysis
- Primary: {brand_primary} ({brand_confidence}) | Secondary: {brand_secondary}
- Palette: {palette_strategy} | Cohesion: {cohesion_score}/10
- Typography: {aurora_typo}
- Spacing: {aurora_spacing}
- Radius: {aurora_radius}
- Shadows: {aurora_shadows}

## ATLAS β€” Benchmark
- Closest: {closest_benchmark} ({match_pct}%)
- Typo: {atlas_typo} | Spacing: {atlas_spacing} | Colors: {atlas_colors}
- Radius: {atlas_radius} | Shadows: {atlas_shadows}
- Changes: {benchmark_changes}

## SENTINEL β€” Audit
- Score: {best_practices_score}/100
- Color: {sentinel_color} | Typo: {sentinel_typo} | Spacing: {sentinel_spacing}
- Radius: {sentinel_radius} | Shadows: {sentinel_shadows}
- Fixes: {priority_fixes}

## AA FIXES NEEDED
{accessibility_fixes}

Evaluate from TWO perspectives (Tree of Thought). Choose one. Recommend for ALL token types."""

    def __init__(self, hf_client):
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 NEXUS β€” HEAD SYNTHESIZER (Tree of Thought)")
        log("═" * 60)
        log("   Evaluating Perspective A (Accessibility-First) vs B (Balanced)...")
        log("   Compiling: Rule Engine + AURORA + ATLAS + SENTINEL...")

        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join([
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) β†’ {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:8] if a.suggested_fix
        ])
        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        def _s(obj):
            """Safely stringify a dict/value for prompt."""
            if isinstance(obj, dict):
                parts = [f"{k}={v}" for k, v in list(obj.items())[:4]]
                return ", ".join(parts) if parts else "N/A"
            return str(obj) if obj else "N/A"

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures), total_colors=len(accessibility),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            radius_facts=_s(best_practices.radius_assessment) or "N/A",
            shadow_facts=_s(best_practices.shadow_assessment) or "N/A",
            brand_primary=brand_identification.brand_primary.get("color", "?"),
            brand_confidence=brand_identification.brand_primary.get("confidence", "?"),
            brand_secondary=brand_identification.brand_secondary.get("color", "?"),
            palette_strategy=brand_identification.palette_strategy,
            cohesion_score=brand_identification.cohesion_score,
            aurora_typo=brand_identification.typography_notes or "N/A",
            aurora_spacing=brand_identification.spacing_notes or "N/A",
            aurora_radius=brand_identification.radius_notes or "N/A",
            aurora_shadows=brand_identification.shadow_notes or "N/A",
            closest_benchmark=closest.benchmark.name if closest else "?",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            atlas_typo=_s(benchmark_advice.typography_comparison),
            atlas_spacing=_s(benchmark_advice.spacing_comparison),
            atlas_colors=_s(benchmark_advice.color_comparison),
            atlas_radius=_s(benchmark_advice.radius_comparison),
            atlas_shadows=_s(benchmark_advice.shadow_comparison),
            benchmark_changes="; ".join([c.get("change", "") for c in benchmark_advice.alignment_changes[:4]]),
            best_practices_score=best_practices.overall_score,
            sentinel_color=_s(best_practices.color_assessment),
            sentinel_typo=_s(best_practices.typography_assessment),
            sentinel_spacing=_s(best_practices.spacing_assessment),
            sentinel_radius=_s(best_practices.radius_assessment),
            sentinel_shadows=_s(best_practices.shadow_assessment),
            priority_fixes="; ".join([f.get("issue", "") for f in best_practices.priority_fixes[:5]]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=2500,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            log("")
            log(f"   🧠 NEXUS β€” COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            pa = result.perspective_a.get("scores", {}).get("overall", "?") if result.perspective_a else "?"
            pb = result.perspective_b.get("scores", {}).get("overall", "?") if result.perspective_b else "?"
            log(f"   β”œβ”€ Perspective A: {pa}/100")
            log(f"   β”œβ”€ Perspective B: {pb}/100")
            log(f"   β”œβ”€ Chosen: {result.chosen_perspective}")
            log(f"   β”œβ”€ Why: {result.choice_reasoning or 'N/A'}")
            log(f"   β”œβ”€ Final Score: {result.scores.get('overall', '?')}/100" if result.scores else "   β”œβ”€ Scores: N/A")
            log(f"   β”œβ”€ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            log(f"   β”œβ”€ Typography: {_s(result.type_scale_recommendation)}")
            log(f"   β”œβ”€ Spacing: {_s(result.spacing_recommendation)}")
            log(f"   β”œβ”€ Radius: {_s(result.radius_recommendation)}")
            log(f"   └─ Shadows: {_s(result.shadow_recommendation)}")
            log("")
            return result

        except Exception as e:
            log(f"   ⚠️ NEXUS failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _parse(self, response: str) -> HeadSynthesis:
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return HeadSynthesis(
                    executive_summary=d.get("executive_summary", ""),
                    scores=d.get("scores", {}),
                    benchmark_fit=d.get("benchmark_fit", {}),
                    brand_analysis=d.get("brand_analysis", {}),
                    top_3_actions=d.get("top_3_actions", []),
                    color_recommendations=d.get("color_recommendations", []),
                    type_scale_recommendation=d.get("type_scale_recommendation", {}),
                    spacing_recommendation=d.get("spacing_recommendation", {}),
                    radius_recommendation=d.get("radius_recommendation", {}),
                    shadow_recommendation=d.get("shadow_recommendation", {}),
                    self_evaluation=d.get("self_evaluation", {}),
                    perspective_a=d.get("perspective_a", {}),
                    perspective_b=d.get("perspective_b", {}),
                    chosen_perspective=d.get("chosen_perspective", ""),
                    choice_reasoning=d.get("choice_reasoning", ""),
                    reasoning_trace=d.get("reasoning_steps", []),
                )
        except Exception:
            pass
        return HeadSynthesis()


# =============================================================================
# CRITIC / VALIDATOR FUNCTIONS (Rule-based, no LLM)
# =============================================================================

def validate_aurora_output(output: BrandIdentification, input_hexes: list) -> tuple:
    """Validate AURORA naming_map. Returns (passed, errors)."""
    errors = []
    nm = output.naming_map or {}

    # All input colors must have names
    for h in input_hexes:
        if h not in nm and h.lower() not in nm:
            errors.append(f"Missing name for {h}")

    # No word-based shades
    bad_words = {"light", "dark", "base", "muted", "deep", "lighter", "darker"}
    for h, name in nm.items():
        for part in name.split("."):
            if part.lower() in bad_words:
                errors.append(f"Word shade '{part}' in {name}")

    # No duplicates
    seen = set()
    for n in nm.values():
        if n in seen:
            errors.append(f"Duplicate: {n}")
        seen.add(n)

    # Convention: color.X.Y
    for h, name in nm.items():
        if not name.startswith("color."):
            errors.append(f"'{name}' must start with 'color.'")
        if len(name.split(".")) < 3:
            errors.append(f"'{name}' needs 3+ parts")

    return len(errors) == 0, errors


def validate_sentinel_output(output: BestPracticesResult, rule_engine) -> tuple:
    """Cross-reference SENTINEL scores against rule engine data."""
    errors = []
    checks = output.checks or {}
    accessibility = rule_engine.accessibility

    aa_failures = len([a for a in accessibility if not a.passes_aa_normal])
    aa_check = checks.get("aa_compliance", {})
    if aa_failures > 0 and isinstance(aa_check, dict) and aa_check.get("status") == "pass":
        errors.append(f"aa_compliance='pass' but {aa_failures} fail AA")

    score = output.overall_score
    if not (0 <= score <= 100):
        errors.append(f"Score {score} out of 0-100 range")

    fail_count = sum(1 for c in checks.values() if isinstance(c, dict) and c.get("status") == "fail")
    if fail_count >= 3 and score > 70:
        errors.append(f"Score {score} too high with {fail_count} failures")

    typo = rule_engine.typography
    base_size = typo.sizes_px[0] if typo.sizes_px else 16
    base_check = checks.get("base_size_accessible", {})
    if base_size < 16 and isinstance(base_check, dict) and base_check.get("status") == "pass":
        errors.append(f"base_size 'pass' but {base_size}px < 16")

    return len(errors) == 0, errors


def _apply_sentinel_fixes(result: BestPracticesResult, rule_engine, errors: list) -> BestPracticesResult:
    """Deterministic fixes when critic finds issues."""
    accessibility = rule_engine.accessibility
    failures = [a for a in accessibility if not a.passes_aa_normal]

    for err in errors:
        if "aa_compliance" in err and "pass" in err:
            if "aa_compliance" in result.checks:
                result.checks["aa_compliance"]["status"] = "fail"
                result.checks["aa_compliance"]["note"] = f"CORRECTED: {len(failures)} fail AA"

        if "too high" in err.lower():
            fail_count = sum(1 for c in result.checks.values() if isinstance(c, dict) and c.get("status") == "fail")
            max_s = max(30, 100 - fail_count * 15)
            if result.overall_score > max_s:
                result.overall_score = max_s

    result.overall_score = max(0, min(100, result.overall_score))
    result.validation_passed = True
    return result


def filter_aurora_naming_map(aurora: BrandIdentification) -> dict:
    """Filter AURORA naming_map to only keep semantic role assignments.

    AURORA is a secondary naming authority — it can assign semantic roles
    (brand.primary, text.secondary, bg.primary, feedback.error, etc.)
    but cannot override palette names (blue.500, neutral.700, etc.).

    The color_classifier is the primary naming authority.

    Returns:
        Dict of hex -> semantic_name (only role-based names).
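
    Example (illustrative):
        {"#0066ff": "color.brand.primary"}  -> kept (semantic role)
        {"#3b82f6": "color.blue.500"}       -> dropped (palette name)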
    """
    SEMANTIC_PREFIXES = ('brand.', 'text.', 'bg.', 'border.', 'feedback.')
    filtered = {}

    for hex_val, name in (aurora.naming_map or {}).items():
        hex_clean = str(hex_val).strip().lower()
        if not hex_clean.startswith('#') or not name:
            continue
        clean_name = name if name.startswith('color.') else f'color.{name}'
        # Extract the part after "color."
        after_prefix = clean_name[6:]  # "brand.primary", "blue.500", etc.
        if any(after_prefix.startswith(sp) for sp in SEMANTIC_PREFIXES):
            filtered[hex_clean] = clean_name

    return filtered


def post_validate_stage2(
    aurora: BrandIdentification,
    sentinel: BestPracticesResult,
    nexus: HeadSynthesis,
    rule_engine: Any,
) -> list:
    """Final deterministic checks after ALL agents. Returns issues list."""
    issues = []

    for h, name in (aurora.naming_map or {}).items():
        if not re.match(r'^color\.\w+\.[\w]+$', name):
            issues.append(f"Bad name: {name}")

    for key, val in (nexus.scores or {}).items():
        if isinstance(val, (int, float)) and not (0 <= val <= 100):
            issues.append(f"Score {key}={val} OOB")

    aa_failures = len([a for a in rule_engine.accessibility if not a.passes_aa_normal])
    n_acc = nexus.scores.get("accessibility", 50) if nexus.scores else 50
    if aa_failures > 3 and n_acc > 85:
        issues.append(f"Nexus accessibility={n_acc} but {aa_failures} AA failures")

    for rec in (nexus.color_recommendations or []):
        for key in ("current", "suggested"):  # 'key' avoids shadowing dataclasses.field
            v = rec.get(key, "")
            if v and not v.startswith("#"):
                issues.append(f"Color rec {key} missing #: {v}")

    return issues
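

# =============================================================================
# ORCHESTRATION SKETCH (illustrative)
# =============================================================================

async def run_stage2_sketch(
    hf_client: Any,
    color_tokens: dict,
    typography_tokens: dict,
    spacing_tokens: dict,
    radius_tokens: dict,
    shadow_tokens: dict,
    rule_engine_results: Any,
    benchmark_comparisons: list,
    log_callback: Callable = None,
) -> tuple:
    """Minimal sketch of the fan-out/fan-in flow from the module docstring:
    run AURORA, ATLAS and SENTINEL concurrently via asyncio.gather, then hand
    their results to NEXUS and finish with the deterministic post-validation.

    The real pipeline owns its own orchestration; the argument shapes here
    are assumptions derived from the agent signatures above.
    """
    import asyncio  # local import: only this sketch needs it

    typo = rule_engine_results.typography
    spacing = rule_engine_results.spacing

    # Fan-out: the three analyst agents are independent, so gather them.
    brand, advice, practices = await asyncio.gather(
        BrandIdentifierAgent(hf_client).analyze(
            color_tokens, typography_tokens, spacing_tokens,
            radius_tokens, shadow_tokens, log_callback,
        ),
        BenchmarkAdvisorAgent(hf_client).analyze(
            user_ratio=typo.detected_ratio,
            user_base=typo.sizes_px[0] if typo.sizes_px else 16,
            user_spacing=spacing.detected_base,
            benchmark_comparisons=benchmark_comparisons,
            color_count=len(color_tokens or {}),
            radius_data=_fmt_radius(radius_tokens),
            shadow_data=_fmt_shadows(shadow_tokens),
            log_callback=log_callback,
        ),
        BestPracticesValidatorAgent(hf_client).analyze(
            rule_engine_results, radius_tokens, shadow_tokens, log_callback,
        ),
    )

    # Fan-in: NEXUS compiles everything, then the rule-based critics get
    # the last word on the combined output.
    synthesis = await HeadSynthesizerAgent(hf_client).synthesize(
        rule_engine_results, benchmark_comparisons,
        brand, advice, practices, log_callback,
    )
    issues = post_validate_stage2(brand, practices, synthesis, rule_engine_results)
    return synthesis, issues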