riazmo commited on
Commit
a6c864a
·
verified ·
1 Parent(s): c653719

Upload 2 files

Browse files
Files changed (2) hide show
  1. agents/benchmark_researcher.py +717 -0
  2. agents/llm_agents.py +865 -0
agents/benchmark_researcher.py ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark Research Agent
3
+ =========================
4
+ Fetches LIVE data from design system documentation sites
5
+ using Firecrawl, with 24-hour caching.
6
+
7
+ This agent:
8
+ 1. Fetches official documentation from design system sites
9
+ 2. Extracts typography, spacing, color specifications using LLM
10
+ 3. Caches results for 24 hours
11
+ 4. Compares user's tokens to researched benchmarks
12
+ """
13
+
14
+ import asyncio
15
+ import json
16
+ import os
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime, timedelta
19
+ from typing import Optional, Callable
20
+ import hashlib
21
+
22
+
23
+ # =============================================================================
24
+ # DESIGN SYSTEM SOURCES (Official Documentation URLs)
25
+ # =============================================================================
26
+
27
# Registry of supported design systems.  Each entry maps a stable key to:
#   name / short_name / vendor / icon — display metadata used in logs and UI,
#   urls     — official documentation pages keyed by spec category
#              (typography / spacing / colors),
#   best_for — product contexts where the system is typically a good fit.
# NOTE: not every system lists every url category (e.g. "apple_hig" has no
# "colors" entry), so consumers must treat categories as optional.
DESIGN_SYSTEM_SOURCES = {
    "material_design_3": {
        "name": "Material Design 3",
        "short_name": "Material 3",
        "vendor": "Google",
        "urls": {
            "typography": "https://m3.material.io/styles/typography/type-scale-tokens",
            "spacing": "https://m3.material.io/foundations/layout/understanding-layout/spacing",
            "colors": "https://m3.material.io/styles/color/the-color-system/key-colors-tones",
        },
        "best_for": ["Android apps", "Web apps", "Enterprise software"],
        "icon": "🟢",
    },
    "apple_hig": {
        "name": "Apple Human Interface Guidelines",
        "short_name": "Apple HIG",
        "vendor": "Apple",
        "urls": {
            # No "colors" page for this system — see NOTE above.
            "typography": "https://developer.apple.com/design/human-interface-guidelines/typography",
            "spacing": "https://developer.apple.com/design/human-interface-guidelines/layout",
        },
        "best_for": ["iOS apps", "macOS apps", "Premium consumer products"],
        "icon": "🍎",
    },
    "shopify_polaris": {
        "name": "Shopify Polaris",
        "short_name": "Polaris",
        "vendor": "Shopify",
        "urls": {
            "typography": "https://polaris.shopify.com/design/typography",
            "spacing": "https://polaris.shopify.com/design/spacing",
            "colors": "https://polaris.shopify.com/design/colors",
        },
        "best_for": ["E-commerce", "Admin dashboards", "Merchant tools"],
        "icon": "🛒",
    },
    "atlassian_design": {
        "name": "Atlassian Design System",
        "short_name": "Atlassian",
        "vendor": "Atlassian",
        "urls": {
            "typography": "https://atlassian.design/foundations/typography",
            "spacing": "https://atlassian.design/foundations/spacing",
            "colors": "https://atlassian.design/foundations/color",
        },
        "best_for": ["Productivity tools", "Dense interfaces", "Enterprise B2B"],
        "icon": "🔵",
    },
    "ibm_carbon": {
        "name": "IBM Carbon Design System",
        "short_name": "Carbon",
        "vendor": "IBM",
        "urls": {
            "typography": "https://carbondesignsystem.com/guidelines/typography/overview",
            "spacing": "https://carbondesignsystem.com/guidelines/spacing/overview",
            "colors": "https://carbondesignsystem.com/guidelines/color/overview",
        },
        "best_for": ["Enterprise software", "Data-heavy applications", "IBM products"],
        "icon": "🔷",
    },
    "tailwind_css": {
        "name": "Tailwind CSS",
        "short_name": "Tailwind",
        "vendor": "Tailwind Labs",
        "urls": {
            "typography": "https://tailwindcss.com/docs/font-size",
            "spacing": "https://tailwindcss.com/docs/customizing-spacing",
            "colors": "https://tailwindcss.com/docs/customizing-colors",
        },
        "best_for": ["Web applications", "Startups", "Rapid prototyping"],
        "icon": "🌊",
    },
    "ant_design": {
        "name": "Ant Design",
        "short_name": "Ant Design",
        "vendor": "Ant Group",
        "urls": {
            "typography": "https://ant.design/docs/spec/font",
            "spacing": "https://ant.design/docs/spec/layout",
            "colors": "https://ant.design/docs/spec/colors",
        },
        "best_for": ["Enterprise B2B", "Admin panels", "Chinese market"],
        "icon": "🐜",
    },
    "chakra_ui": {
        "name": "Chakra UI",
        "short_name": "Chakra",
        "vendor": "Chakra UI",
        "urls": {
            "typography": "https://chakra-ui.com/docs/styled-system/theme#typography",
            "spacing": "https://chakra-ui.com/docs/styled-system/theme#spacing",
            "colors": "https://chakra-ui.com/docs/styled-system/theme#colors",
        },
        "best_for": ["React applications", "Startups", "Accessible products"],
        "icon": "⚡",
    },
}
124
+
125
+
126
+ # =============================================================================
127
+ # DATA CLASSES
128
+ # =============================================================================
129
+
130
@dataclass
class BenchmarkData:
    """Researched benchmark data from a design system.

    Instances are serialized to the JSON cache via :meth:`to_dict` and
    rebuilt from it by ``BenchmarkCache.get``, so ``to_dict`` must emit
    every field the cache reader consumes.
    """
    key: str
    name: str
    short_name: str
    vendor: str
    icon: str

    # Extracted specifications
    typography: dict = field(default_factory=dict)
    # Expected: {scale_ratio, base_size, sizes[], font_family, line_height_body}

    spacing: dict = field(default_factory=dict)
    # Expected: {base, scale[], grid}

    colors: dict = field(default_factory=dict)
    # Expected: {palette_size, uses_ramps, ramp_steps}

    # Metadata
    fetched_at: str = ""  # ISO timestamp; drives the cache TTL
    confidence: str = "low"  # high, medium, low (or "fallback")
    source_urls: list = field(default_factory=list)
    best_for: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (the cache format).

        Fix: ``source_urls`` is now included — previously it was dropped
        here even though ``BenchmarkCache.get`` reads it back, so cached
        entries silently lost their source URLs on round-trip.
        """
        return {
            "key": self.key,
            "name": self.name,
            "short_name": self.short_name,
            "vendor": self.vendor,
            "icon": self.icon,
            "typography": self.typography,
            "spacing": self.spacing,
            "colors": self.colors,
            "fetched_at": self.fetched_at,
            "confidence": self.confidence,
            "source_urls": self.source_urls,
            "best_for": self.best_for,
        }
169
+
170
+
171
@dataclass
class BenchmarkComparison:
    """Comparison result between user's tokens and a benchmark."""
    benchmark: BenchmarkData
    similarity_score: float  # Lower = more similar

    # Individual comparisons
    type_ratio_diff: float
    base_size_diff: int
    spacing_grid_diff: int

    # Match percentages
    type_match_pct: float
    spacing_match_pct: float
    overall_match_pct: float

    def to_dict(self) -> dict:
        """Flatten this comparison into a JSON-friendly dict.

        Rounding: similarity to 2 decimals, ratio diff to 3, and all
        match percentages to 1.
        """
        bench = self.benchmark

        # Per-dimension diffs and match percentages.
        comparison = {
            "type_ratio": {
                "diff": round(self.type_ratio_diff, 3),
                "match_pct": round(self.type_match_pct, 1),
            },
            "base_size": {
                "diff": self.base_size_diff,
            },
            "spacing_grid": {
                "diff": self.spacing_grid_diff,
                "match_pct": round(self.spacing_match_pct, 1),
            },
        }

        # Reference values pulled from the benchmark's researched specs.
        benchmark_values = {
            "type_ratio": bench.typography.get("scale_ratio"),
            "base_size": bench.typography.get("base_size"),
            "spacing_grid": bench.spacing.get("base"),
        }

        return {
            "name": bench.name,
            "short_name": bench.short_name,
            "icon": bench.icon,
            "similarity_score": round(self.similarity_score, 2),
            "overall_match_pct": round(self.overall_match_pct, 1),
            "comparison": comparison,
            "benchmark_values": benchmark_values,
            "best_for": bench.best_for,
            "confidence": bench.confidence,
        }
215
+
216
+
217
+ # =============================================================================
218
+ # CACHE MANAGER
219
+ # =============================================================================
220
+
221
class BenchmarkCache:
    """Manages 24-hour caching of benchmark research results.

    All entries live in a single JSON file (``storage/benchmark_cache.json``
    by default). Each entry is a serialized ``BenchmarkData`` dict whose
    ``fetched_at`` ISO timestamp enforces the TTL.
    """

    # Cache time-to-live; entries older than this are treated as misses.
    TTL = timedelta(hours=24)

    def __init__(self, cache_dir: Optional[str] = None):
        """
        Args:
            cache_dir: Directory holding the cache file. Defaults to the
                ``storage/`` directory that sits next to this package.
        """
        if cache_dir is None:
            cache_dir = os.path.join(os.path.dirname(__file__), "..", "storage")
        self.cache_file = os.path.join(cache_dir, "benchmark_cache.json")
        self._ensure_cache_dir()

    def _ensure_cache_dir(self) -> None:
        """Ensure cache directory exists."""
        os.makedirs(os.path.dirname(self.cache_file), exist_ok=True)

    def _load_cache(self) -> dict:
        """Load the full cache dict from disk; empty dict on any read error."""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            except (OSError, json.JSONDecodeError):
                # Unreadable or corrupt cache file — a stale cache is not
                # worth crashing over; behave as if empty.
                return {}
        return {}

    def _save_cache(self, cache: dict) -> None:
        """Persist the full cache dict; caching is deliberately best-effort."""
        try:
            with open(self.cache_file, 'w') as f:
                json.dump(cache, f, indent=2)
        except (OSError, TypeError):
            # A failed write only costs a re-fetch next run.
            pass

    @staticmethod
    def _parse_timestamp(value) -> Optional[datetime]:
        """Parse an ISO timestamp; None when missing or malformed.

        Previously a malformed ``fetched_at`` string crashed ``get`` /
        ``get_cache_status`` with ValueError; now it degrades to a miss.
        """
        try:
            return datetime.fromisoformat(value)
        except (TypeError, ValueError):
            return None

    def get(self, key: str) -> Optional["BenchmarkData"]:
        """Get cached benchmark if valid (< 24 hours old), else None."""
        cache = self._load_cache()

        if key not in cache:
            return None

        entry = cache[key]
        fetched_at = self._parse_timestamp(entry.get("fetched_at", ""))

        # Missing/malformed timestamp or expired entry -> cache miss.
        if fetched_at is None or datetime.now() - fetched_at > self.TTL:
            return None

        # Reconstruct BenchmarkData, falling back to the static source
        # registry for display metadata missing from older cache entries.
        source = DESIGN_SYSTEM_SOURCES.get(key, {})
        return BenchmarkData(
            key=key,
            name=entry.get("name", source.get("name", key)),
            short_name=entry.get("short_name", source.get("short_name", key)),
            vendor=entry.get("vendor", source.get("vendor", "")),
            icon=entry.get("icon", source.get("icon", "📦")),
            typography=entry.get("typography", {}),
            spacing=entry.get("spacing", {}),
            colors=entry.get("colors", {}),
            fetched_at=entry.get("fetched_at", ""),
            confidence=entry.get("confidence", "low"),
            source_urls=entry.get("source_urls", []),
            best_for=entry.get("best_for", source.get("best_for", [])),
        )

    def set(self, key: str, data: "BenchmarkData") -> None:
        """Cache benchmark data (read-modify-write of the whole file)."""
        cache = self._load_cache()
        cache[key] = data.to_dict()
        self._save_cache(cache)

    def get_cache_status(self) -> dict:
        """Report {cached, valid, age_hours} for every known system key."""
        cache = self._load_cache()
        status = {}

        for key in DESIGN_SYSTEM_SOURCES.keys():
            entry = cache.get(key)
            fetched_at = (
                self._parse_timestamp(entry.get("fetched_at", ""))
                if entry is not None else None
            )
            if fetched_at is not None:
                age_hours = (datetime.now() - fetched_at).total_seconds() / 3600
                status[key] = {
                    "cached": True,
                    "valid": datetime.now() - fetched_at < self.TTL,
                    "age_hours": round(age_hours, 1),
                }
            elif entry is not None:
                # Entry exists but its timestamp is unusable: cached, stale.
                status[key] = {"cached": True, "valid": False}
            else:
                status[key] = {"cached": False, "valid": False}

        return status
308
+
309
+
310
+ # =============================================================================
311
+ # FALLBACK DATA (Used when research fails)
312
+ # =============================================================================
313
+
314
# Static snapshot of each system's core metrics, used whenever live research
# (Firecrawl fetch + LLM extraction) is unavailable or fails.
# Shapes mirror BenchmarkData.typography / .spacing / .colors:
#   typography: {scale_ratio, base_size, font_family, line_height_body}
#   spacing:    {base, scale (px steps), grid (label)}
#   colors:     {palette_size, uses_ramps}
# NOTE(review): values look like approximations of the published guidelines —
# verify against the official docs in DESIGN_SYSTEM_SOURCES before relying
# on any single number.
FALLBACK_BENCHMARKS = {
    "material_design_3": {
        "typography": {"scale_ratio": 1.2, "base_size": 16, "font_family": "Roboto", "line_height_body": 1.5},
        "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 48, 64], "grid": "8px"},
        "colors": {"palette_size": 13, "uses_ramps": True},
    },
    "apple_hig": {
        "typography": {"scale_ratio": 1.19, "base_size": 17, "font_family": "SF Pro", "line_height_body": 1.47},
        "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40], "grid": "4px"},
        "colors": {"palette_size": 9, "uses_ramps": True},
    },
    "shopify_polaris": {
        "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "Inter", "line_height_body": 1.5},
        "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64], "grid": "4px"},
        "colors": {"palette_size": 11, "uses_ramps": True},
    },
    "atlassian_design": {
        "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "Inter", "line_height_body": 1.43},
        "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"},
        "colors": {"palette_size": 15, "uses_ramps": True},
    },
    "ibm_carbon": {
        "typography": {"scale_ratio": 1.25, "base_size": 14, "font_family": "IBM Plex Sans", "line_height_body": 1.5},
        "spacing": {"base": 8, "scale": [0, 2, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"},
        "colors": {"palette_size": 12, "uses_ramps": True},
    },
    "tailwind_css": {
        "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5},
        "spacing": {"base": 4, "scale": [0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32], "grid": "4px"},
        "colors": {"palette_size": 22, "uses_ramps": True},
    },
    "ant_design": {
        "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "system-ui", "line_height_body": 1.57},
        "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48], "grid": "8px"},
        "colors": {"palette_size": 13, "uses_ramps": True},
    },
    "chakra_ui": {
        "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5},
        "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 56, 64], "grid": "4px"},
        "colors": {"palette_size": 15, "uses_ramps": True},
    },
}
356
+
357
+
358
+ # =============================================================================
359
+ # BENCHMARK RESEARCHER
360
+ # =============================================================================
361
+
362
class BenchmarkResearcher:
    """
    Research agent that fetches live design system specifications.

    Uses Firecrawl to fetch documentation and LLM to extract specs.
    Results are cached for 24 hours.

    Both clients are optional: without Firecrawl no pages are fetched, and
    without the HF client fetched pages cannot be distilled into specs —
    either way the agent falls back to the static FALLBACK_BENCHMARKS table.
    """

    def __init__(self, firecrawl_client=None, hf_client=None):
        """
        Initialize researcher.

        Args:
            firecrawl_client: Firecrawl API client for fetching docs
                (must expose ``scrape_url``; None disables live fetching)
            hf_client: HuggingFace client for LLM extraction
                (must expose ``complete_async``; None disables extraction)
        """
        self.firecrawl = firecrawl_client
        self.hf_client = hf_client
        self.cache = BenchmarkCache()

    async def research_benchmark(
        self,
        system_key: str,
        log_callback: Optional[Callable] = None,
        force_refresh: bool = False,
    ) -> BenchmarkData:
        """
        Research a specific design system.

        Args:
            system_key: Key from DESIGN_SYSTEM_SOURCES
            log_callback: Function to log progress (called with one str)
            force_refresh: Bypass cache and fetch fresh

        Returns:
            BenchmarkData with extracted specifications; the result is
            written to the 24h cache before returning.

        Raises:
            ValueError: If system_key is not in DESIGN_SYSTEM_SOURCES.
        """
        # Small shim so every log site can stay unconditional.
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        if system_key not in DESIGN_SYSTEM_SOURCES:
            raise ValueError(f"Unknown design system: {system_key}")

        source = DESIGN_SYSTEM_SOURCES[system_key]

        # Check cache first (unless force refresh)
        if not force_refresh:
            cached = self.cache.get(system_key)
            if cached:
                log(f" ├─ {source['icon']} {source['short_name']}: Using cached data ✅")
                return cached

        log(f" ├─ {source['icon']} {source['short_name']}: Fetching documentation...")

        # Try to fetch and extract. Confidence ladder:
        #   low (nothing fetched) -> medium (typography fetched)
        #   -> high (typography AND spacing fetched) -> "fallback" (static data).
        raw_content = ""
        confidence = "low"

        if self.firecrawl:
            try:
                # Fetch typography docs (truncated to bound LLM prompt size)
                typo_url = source["urls"].get("typography")
                if typo_url:
                    log(f" │ ├─ Fetching {typo_url[:50]}...")
                    typo_content = await self._fetch_url(typo_url)
                    if typo_content:
                        raw_content += f"\n\n=== TYPOGRAPHY ===\n{typo_content[:4000]}"
                        confidence = "medium"

                # Fetch spacing docs
                spacing_url = source["urls"].get("spacing")
                if spacing_url:
                    log(f" │ ├─ Fetching spacing docs...")
                    spacing_content = await self._fetch_url(spacing_url)
                    if spacing_content:
                        raw_content += f"\n\n=== SPACING ===\n{spacing_content[:3000]}"
                        if confidence == "medium":
                            confidence = "high"

            except Exception as e:
                # Best-effort: a failed fetch degrades to fallback data below.
                log(f" │ ├─ ⚠️ Fetch error: {str(e)[:50]}")

        # Extract specs with LLM (or use fallback)
        if raw_content and self.hf_client:
            log(f" │ ├─ Extracting specifications...")
            extracted = await self._extract_specs_with_llm(source["name"], raw_content)
        else:
            log(f" │ ├─ Using fallback data (fetch unavailable)")
            extracted = FALLBACK_BENCHMARKS.get(system_key, {})
            confidence = "fallback"

        # Build result. Each spec section independently falls back to the
        # static table when the LLM extraction left it empty/missing.
        result = BenchmarkData(
            key=system_key,
            name=source["name"],
            short_name=source["short_name"],
            vendor=source["vendor"],
            icon=source["icon"],
            typography=extracted.get("typography", FALLBACK_BENCHMARKS.get(system_key, {}).get("typography", {})),
            spacing=extracted.get("spacing", FALLBACK_BENCHMARKS.get(system_key, {}).get("spacing", {})),
            colors=extracted.get("colors", FALLBACK_BENCHMARKS.get(system_key, {}).get("colors", {})),
            fetched_at=datetime.now().isoformat(),
            confidence=confidence,
            source_urls=list(source["urls"].values()),
            best_for=source["best_for"],
        )

        # Cache result
        self.cache.set(system_key, result)

        # Summarize the headline numbers in the log line.
        ratio = result.typography.get("scale_ratio", "?")
        base = result.typography.get("base_size", "?")
        grid = result.spacing.get("base", "?")
        log(f" │ └─ ✅ ratio={ratio}, base={base}px, grid={grid}px [{confidence}]")

        return result

    async def _fetch_url(self, url: str) -> Optional[str]:
        """Fetch URL content using Firecrawl.

        Returns the page as markdown (preferred) or raw content, or None
        when no client is configured, the response has neither field, or
        the request fails.
        """
        if not self.firecrawl:
            return None

        try:
            # Firecrawl scrape
            # NOTE(review): scrape_url is called synchronously inside an
            # async method — presumably the client is sync and fast enough;
            # confirm it does not block the event loop for long fetches.
            result = self.firecrawl.scrape_url(
                url,
                params={"formats": ["markdown"]}
            )

            if result and result.get("markdown"):
                return result["markdown"]
            elif result and result.get("content"):
                return result["content"]

        except Exception as e:
            # Deliberate best-effort: any scrape failure means "no content".
            pass

        return None

    async def _extract_specs_with_llm(self, system_name: str, raw_content: str) -> dict:
        """Extract structured specs from documentation using LLM.

        Returns a dict with optional "typography"/"spacing"/"colors" keys,
        or {} when no client is configured, the response contains no JSON,
        or the call/parse fails.
        """
        if not self.hf_client:
            return {}

        prompt = f"""Extract the design system specifications from this documentation.

DESIGN SYSTEM: {system_name}

DOCUMENTATION:
{raw_content[:6000]}

Return ONLY a JSON object with these exact fields (use null if not found):
{{
"typography": {{
"scale_ratio": <number like 1.2 or 1.25>,
"base_size": <number in px>,
"font_family": "<font name>",
"sizes": [<list of sizes in px>],
"line_height_body": <number like 1.5>
}},
"spacing": {{
"base": <base unit in px like 4 or 8>,
"scale": [<spacing values>],
"grid": "<description>"
}},
"colors": {{
"palette_size": <number>,
"uses_ramps": <true/false>
}}
}}

Return ONLY valid JSON, no explanation."""

        try:
            response = await self.hf_client.complete_async(
                agent_name="benchmark_extractor",
                system_prompt="You are a design system specification extractor. Extract only the factual specifications.",
                user_message=prompt,
                max_tokens=600,
                json_mode=True,
            )

            # Parse JSON from response: grab the outermost {...} span in
            # case the model wrapped the JSON in prose despite json_mode.
            # (local import — `re` is not imported at module level)
            import re
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                return json.loads(json_match.group())

        except Exception as e:
            # Best-effort: extraction failure falls through to {} so the
            # caller substitutes FALLBACK_BENCHMARKS values.
            pass

        return {}

    async def research_selected_benchmarks(
        self,
        selected_keys: list[str],
        log_callback: Optional[Callable] = None,
    ) -> list[BenchmarkData]:
        """
        Research multiple selected design systems.

        Unknown keys are silently skipped; per-system research errors are
        logged and replaced with fallback data, so the returned list never
        fails wholesale because of one system.

        Args:
            selected_keys: List of system keys to research
            log_callback: Function to log progress

        Returns:
            List of BenchmarkData (one per recognized key)
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🔬 LAYER 2: BENCHMARK RESEARCH (Firecrawl + Cache)")
        log("═" * 60)
        log("")
        log(f" Selected systems: {', '.join(selected_keys)}")
        log("")

        results = []

        for key in selected_keys:
            if key in DESIGN_SYSTEM_SOURCES:
                try:
                    result = await self.research_benchmark(key, log_callback)
                    results.append(result)
                except Exception as e:
                    log(f" ├─ ⚠️ Error researching {key}: {e}")
                    # Use fallback
                    source = DESIGN_SYSTEM_SOURCES[key]
                    fallback = FALLBACK_BENCHMARKS.get(key, {})
                    results.append(BenchmarkData(
                        key=key,
                        name=source["name"],
                        short_name=source["short_name"],
                        vendor=source["vendor"],
                        icon=source["icon"],
                        typography=fallback.get("typography", {}),
                        spacing=fallback.get("spacing", {}),
                        colors=fallback.get("colors", {}),
                        fetched_at=datetime.now().isoformat(),
                        confidence="fallback",
                        best_for=source["best_for"],
                    ))

        log("")
        log(f" ✅ Researched {len(results)}/{len(selected_keys)} design systems")

        return results

    def compare_to_benchmarks(
        self,
        your_ratio: float,
        your_base_size: int,
        your_spacing_grid: int,
        benchmarks: list[BenchmarkData],
        log_callback: Optional[Callable] = None,
    ) -> list[BenchmarkComparison]:
        """
        Compare user's tokens to researched benchmarks.

        Args:
            your_ratio: Detected type scale ratio
            your_base_size: Detected base font size
            your_spacing_grid: Detected spacing grid base
            benchmarks: List of researched BenchmarkData
            log_callback: Function to log progress

        Returns:
            List of BenchmarkComparison sorted by similarity (best first);
            benchmarks missing a metric are compared against defaults
            (ratio 1.25, base 16px, grid 8px).
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log(" 📊 BENCHMARK COMPARISON")
        log(" " + "─" * 40)
        log(f" Your values: ratio={your_ratio:.2f}, base={your_base_size}px, grid={your_spacing_grid}px")
        log("")

        comparisons = []

        for b in benchmarks:
            # Defaults stand in for specs the research step could not fill.
            b_ratio = b.typography.get("scale_ratio", 1.25)
            b_base = b.typography.get("base_size", 16)
            b_grid = b.spacing.get("base", 8)

            # Calculate differences
            ratio_diff = abs(your_ratio - b_ratio)
            base_diff = abs(your_base_size - b_base)
            grid_diff = abs(your_spacing_grid - b_grid)

            # Calculate match percentages
            type_match = max(0, 100 - (ratio_diff * 100))  # 0.1 diff = 90% match
            spacing_match = max(0, 100 - (grid_diff * 10))  # 4px diff = 60% match

            # Weighted similarity score (lower = more similar);
            # ratio dominates, then base size, then grid.
            similarity = (ratio_diff * 10) + (base_diff * 0.5) + (grid_diff * 0.3)

            # Overall match percentage: 50% type, 30% spacing, 20% base size,
            # clamped to [0, 100].
            overall_match = (type_match * 0.5) + (spacing_match * 0.3) + (100 - base_diff * 5) * 0.2
            overall_match = max(0, min(100, overall_match))

            comparisons.append(BenchmarkComparison(
                benchmark=b,
                similarity_score=similarity,
                type_ratio_diff=ratio_diff,
                base_size_diff=base_diff,
                spacing_grid_diff=grid_diff,
                type_match_pct=type_match,
                spacing_match_pct=spacing_match,
                overall_match_pct=overall_match,
            ))

        # Sort by similarity (lower = better)
        comparisons.sort(key=lambda x: x.similarity_score)

        # Log results: medals for the top 3, at most 5 lines.
        medals = ["🥇", "🥈", "🥉"]
        for i, c in enumerate(comparisons[:5]):
            medal = medals[i] if i < 3 else " "
            b = c.benchmark
            log(f" {medal} {b.icon} {b.short_name}: {c.overall_match_pct:.0f}% match (score: {c.similarity_score:.2f})")
            log(f" └─ ratio={b.typography.get('scale_ratio')}, base={b.typography.get('base_size')}px, grid={b.spacing.get('base')}px")

        return comparisons
691
+
692
+
693
+ # =============================================================================
694
+ # HELPER FUNCTIONS
695
+ # =============================================================================
696
+
697
def get_available_benchmarks() -> list[dict]:
    """Get list of available design systems for UI dropdown.

    Each item carries the registry key plus the display metadata
    (name, short_name, icon, vendor, best_for) from DESIGN_SYSTEM_SOURCES.
    """
    metadata_keys = ("name", "short_name", "icon", "vendor", "best_for")
    return [
        {"key": key, **{k: source[k] for k in metadata_keys}}
        for key, source in DESIGN_SYSTEM_SOURCES.items()
    ]
710
+
711
+
712
def get_benchmark_choices() -> list[tuple[str, str]]:
    """Get choices for Gradio dropdown.

    Returns (label, value) pairs where the label is
    "<icon> <short_name> (<vendor>)" and the value is the registry key.
    """
    choices = []
    for key, source in DESIGN_SYSTEM_SOURCES.items():
        label = f"{source['icon']} {source['short_name']} ({source['vendor']})"
        choices.append((label, key))
    return choices
agents/llm_agents.py ADDED
@@ -0,0 +1,865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 2 LLM Agents — Specialized Analysis Tasks
3
+ =================================================
4
+
5
+ These agents handle tasks that REQUIRE LLM reasoning:
6
+ - Brand Identifier: Identify brand colors from usage context
7
+ - Benchmark Advisor: Recommend best-fit design system
8
+ - Best Practices Validator: Prioritize fixes by business impact
9
+ - HEAD Synthesizer: Combine all outputs into final recommendations
10
+
11
+ Each agent has a focused prompt for its specific task.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Optional, Callable, Any
18
+ from datetime import datetime
19
+
20
+
21
+ # =============================================================================
22
+ # DATA CLASSES
23
+ # =============================================================================
24
+
25
@dataclass
class BrandIdentification:
    """Results from Brand Identifier agent."""
    brand_primary: dict = field(default_factory=dict)
    # {color, confidence, reasoning, usage_count}

    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)

    palette_strategy: str = ""  # complementary, analogous, triadic, monochromatic, random
    cohesion_score: int = 5  # 1-10
    cohesion_notes: str = ""

    semantic_names: dict = field(default_factory=dict)
    # {hex_color: suggested_name}

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "brand_primary",
            "brand_secondary",
            "brand_accent",
            "palette_strategy",
            "cohesion_score",
            "cohesion_notes",
            "semantic_names",
        )
        return {attr: getattr(self, attr) for attr in exported}
51
+
52
+
53
@dataclass
class BenchmarkAdvice:
    """Results from Benchmark Advisor agent."""
    recommended_benchmark: str = ""
    recommended_benchmark_name: str = ""
    reasoning: str = ""

    alignment_changes: list = field(default_factory=list)
    # [{change, from, to, effort}]

    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)

    alternative_benchmarks: list = field(default_factory=list)
    # [{name, reason}]

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        Note the shortened output keys: pros_of_alignment -> "pros",
        cons_of_alignment -> "cons", alternative_benchmarks -> "alternatives".
        """
        out = {
            "recommended_benchmark": self.recommended_benchmark,
            "recommended_benchmark_name": self.recommended_benchmark_name,
            "reasoning": self.reasoning,
            "alignment_changes": self.alignment_changes,
        }
        out["pros"] = self.pros_of_alignment
        out["cons"] = self.cons_of_alignment
        out["alternatives"] = self.alternative_benchmarks
        return out
79
+
80
+
81
@dataclass
class BestPracticesResult:
    """Results from Best Practices Validator agent."""
    overall_score: int = 50  # 0-100

    checks: dict = field(default_factory=dict)
    # {check_name: {status: pass/warn/fail, note: str}}

    priority_fixes: list = field(default_factory=list)
    # [{rank, issue, impact, effort, action}]

    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        The practice lists are exported under shortened keys:
        passing_practices -> "passing", failing_practices -> "failing".
        """
        return dict(
            overall_score=self.overall_score,
            checks=self.checks,
            priority_fixes=self.priority_fixes,
            passing=self.passing_practices,
            failing=self.failing_practices,
        )
103
+
104
+
105
@dataclass
class HeadSynthesis:
    """Final synthesized output from HEAD agent."""
    executive_summary: str = ""

    scores: dict = field(default_factory=dict)
    # {overall, accessibility, consistency, organization}

    benchmark_fit: dict = field(default_factory=dict)
    # {closest, similarity, recommendation}

    brand_analysis: dict = field(default_factory=dict)
    # {primary, secondary, cohesion}

    top_3_actions: list = field(default_factory=list)
    # [{action, impact, effort, details}]

    color_recommendations: list = field(default_factory=list)
    # [{role, current, suggested, reason, accept}]

    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "executive_summary",
            "scores",
            "benchmark_fit",
            "brand_analysis",
            "top_3_actions",
            "color_recommendations",
            "type_scale_recommendation",
            "spacing_recommendation",
        )
        return {attr: getattr(self, attr) for attr in exported}
139
+
140
+
141
+ # =============================================================================
142
+ # BRAND IDENTIFIER AGENT
143
+ # =============================================================================
144
+
145
class BrandIdentifierAgent:
    """
    Identifies brand colors from usage context.

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a senior design system analyst. Identify the brand colors from this color usage data.

## COLOR DATA WITH USAGE CONTEXT

{color_data}

## SEMANTIC ANALYSIS (from CSS properties)

{semantic_analysis}

## YOUR TASK

1. **Identify Brand Colors**:
   - Brand Primary: The main action/CTA color (highest visibility)
   - Brand Secondary: Supporting brand color
   - Brand Accent: Highlight color for emphasis

2. **Assess Palette Strategy**:
   - Is it complementary, analogous, triadic, monochromatic, or random?

3. **Rate Cohesion** (1-10):
   - Do the colors work together?
   - Is there a clear color story?

4. **Suggest Semantic Names** for top 10 most-used colors

## OUTPUT FORMAT (JSON only)

{{
  "brand_primary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "Why this is brand primary",
    "usage_count": <number>
  }},
  "brand_secondary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "..."
  }},
  "brand_accent": {{
    "color": "#hex or null",
    "confidence": "...",
    "reasoning": "..."
  }},
  "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
  "cohesion_score": <1-10>,
  "cohesion_notes": "Assessment of how well colors work together",
  "semantic_names": {{
    "#hex1": "brand.primary",
    "#hex2": "text.primary",
    "#hex3": "background.primary"
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Callable = None,
    ) -> BrandIdentification:
        """
        Identify brand colors from usage context.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors; empty defaults on any
            failure (this agent is best-effort and never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log(" 🎨 Brand Identifier (Llama 70B)")
        log(" └─ Analyzing color context and usage patterns...")

        # Render structured inputs into prompt-friendly text before spending
        # an LLM call, so formatting failures surface early.
        color_data = self._format_color_data(color_tokens)
        semantic_str = self._format_semantic_analysis(semantic_analysis)

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=color_data,
            semantic_analysis=semantic_str,
        )

        try:
            start_time = datetime.now()

            # Use the correct method signature
            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt="You are a senior design system analyst specializing in brand color identification.",
                user_message=prompt,
                max_tokens=800,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            # Parse response
            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🎨 Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ├─ Palette Strategy: {result.palette_strategy}")
            log(f" └─ Cohesion Score: {result.cohesion_score}/10")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Render up to 30 color tokens as '- #hex: used Nx, context: ...' lines.

        Accepts both dict-shaped tokens and attribute-style objects, since
        callers pass either form.
        """
        lines = []
        for name, token in list(color_tokens.items())[:30]:
            if isinstance(token, dict):
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get("usage_count", token.get("count", 1))
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", 1)
                context = getattr(token, "context", "")

            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Render category -> first-5-colors summary lines for the prompt."""
        if not semantic:
            return "No semantic analysis available"

        lines = []
        for category, colors in semantic.items():
            if colors:
                # Entries may be plain hex strings or dicts with a "hex" key.
                color_list = [c.get("hex", c) if isinstance(c, dict) else c for c in colors[:5]]
                lines.append(f"- {category}: {', '.join(str(c) for c in color_list)}")

        return "\n".join(lines) if lines else "No semantic analysis available"

    def _parse_response(self, response: str) -> BrandIdentification:
        """Parse the LLM's JSON reply into BrandIdentification.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return BrandIdentification(
                    brand_primary=data.get("brand_primary", {}),
                    brand_secondary=data.get("brand_secondary", {}),
                    brand_accent=data.get("brand_accent", {}),
                    palette_strategy=data.get("palette_strategy", "unknown"),
                    cohesion_score=data.get("cohesion_score", 5),
                    cohesion_notes=data.get("cohesion_notes", ""),
                    semantic_names=data.get("semantic_names", {}),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return BrandIdentification()
326
+
327
+
328
+ # =============================================================================
329
+ # BENCHMARK ADVISOR AGENT
330
+ # =============================================================================
331
+
332
class BenchmarkAdvisorAgent:
    """
    Recommends best-fit design system based on comparison data.

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a senior design system consultant. Recommend the best design system alignment.

## USER'S CURRENT VALUES

- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px

## BENCHMARK COMPARISON

{benchmark_comparison}

## YOUR TASK

1. **Recommend Best Fit**: Which design system should they align with?
2. **Explain Why**: Consider similarity scores AND use-case fit
3. **List Changes Needed**: What would they need to change to align?
4. **Pros/Cons**: Benefits and drawbacks of alignment

## OUTPUT FORMAT (JSON only)

{{
  "recommended_benchmark": "<system_key>",
  "recommended_benchmark_name": "<full name>",
  "reasoning": "Why this is the best fit for their use case",
  "alignment_changes": [
    {{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
    {{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
  ],
  "pros_of_alignment": [
    "Familiar patterns for users",
    "Well-tested accessibility"
  ],
  "cons_of_alignment": [
    "May lose brand uniqueness"
  ],
  "alternative_benchmarks": [
    {{"name": "Material Design 3", "reason": "Good for Android-first products"}}
  ]
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Callable = None,
    ) -> BenchmarkAdvice:
        """
        Recommend best-fit design system.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations; empty defaults on any
            failure (this agent is best-effort and never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 Benchmark Advisor (Qwen 72B)")
        log(" └─ Evaluating benchmark fit for your use case...")

        # Format comparison data
        comparison_str = self._format_comparisons(benchmark_comparisons)

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio,
            user_base=user_base,
            user_spacing=user_spacing,
            benchmark_comparison=comparison_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt="You are a senior design system consultant specializing in design system architecture.",
                user_message=prompt,
                max_tokens=700,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🏢 Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes Needed: {len(result.alignment_changes)}")
            log(f" └─ Key Change: {result.alignment_changes[0].get('change', 'N/A') if result.alignment_changes else 'None'}")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Render the top 5 benchmark comparisons as prompt-friendly text.

        Each comparison is expected to expose `.benchmark` (with icon, name,
        typography, spacing, best_for) plus the precomputed diff fields.
        """
        lines = []
        for i, c in enumerate(comparisons[:5]):
            b = c.benchmark
            lines.append(f"""
{i+1}. {b.icon} {b.name}
   - Similarity Score: {c.similarity_score:.2f} (lower = better)
   - Match: {c.overall_match_pct:.0f}%
   - Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
   - Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
   - Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
   - Best For: {', '.join(b.best_for)}""")

        return "\n".join(lines)

    def _parse_response(self, response: str) -> BenchmarkAdvice:
        """Parse the LLM's JSON reply into BenchmarkAdvice.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return BenchmarkAdvice(
                    recommended_benchmark=data.get("recommended_benchmark", ""),
                    recommended_benchmark_name=data.get("recommended_benchmark_name", ""),
                    reasoning=data.get("reasoning", ""),
                    alignment_changes=data.get("alignment_changes", []),
                    pros_of_alignment=data.get("pros_of_alignment", []),
                    cons_of_alignment=data.get("cons_of_alignment", []),
                    alternative_benchmarks=data.get("alternative_benchmarks", []),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return BenchmarkAdvice()
487
+
488
+
489
+ # =============================================================================
490
+ # BEST PRACTICES VALIDATOR AGENT
491
+ # =============================================================================
492
+
493
class BestPracticesValidatorAgent:
    """
    Validates against design system best practices and prioritizes fixes.

    WHY LLM: Prioritization requires judgment about business impact,
    not just checking boxes.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a design system auditor. Validate these tokens against best practices.

## RULE ENGINE ANALYSIS RESULTS

### Typography
- Detected Ratio: {type_ratio} ({type_consistent})
- Base Size: {base_size}px
- Recommendation: {type_recommendation}

### Accessibility
- Total Colors: {total_colors}
- AA Pass: {aa_pass}
- AA Fail: {aa_fail}
- Failing Colors: {failing_colors}

### Spacing
- Detected Base: {spacing_base}px
- Grid Aligned: {spacing_aligned}%
- Recommendation: {spacing_recommendation}px

### Color Statistics
- Unique Colors: {unique_colors}
- Duplicates: {duplicates}
- Near-Duplicates: {near_duplicates}

## BEST PRACTICES CHECKLIST

1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
2. Type scale is consistent (variance < 0.15)
3. Base font size >= 16px (accessibility)
4. Line height >= 1.5 for body text
5. All interactive colors pass AA (4.5:1)
6. Spacing uses consistent grid (4px or 8px)
7. Limited color palette (< 20 unique semantic colors)
8. No near-duplicate colors

## YOUR TASK

1. Score each practice: pass/warn/fail
2. Calculate overall score (0-100)
3. Identify TOP 3 priority fixes with impact assessment

## OUTPUT FORMAT (JSON only)

{{
  "overall_score": <0-100>,
  "checks": {{
    "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
    "type_scale_consistent": {{"status": "...", "note": "..."}},
    "base_size_accessible": {{"status": "...", "note": "..."}},
    "aa_compliance": {{"status": "...", "note": "..."}},
    "spacing_grid": {{"status": "...", "note": "..."}},
    "color_count": {{"status": "...", "note": "..."}}
  }},
  "priority_fixes": [
    {{
      "rank": 1,
      "issue": "Brand primary fails AA",
      "impact": "high|medium|low",
      "effort": "low|medium|high",
      "action": "Change #06b2c4 → #0891a8"
    }}
  ],
  "passing_practices": ["Base font size", "..."],
  "failing_practices": ["AA compliance", "..."]
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def analyze(
        self,
        rule_engine_results: Any,
        log_callback: Callable = None,
    ) -> BestPracticesResult:
        """
        Validate against best practices.

        Args:
            rule_engine_results: Results from rule engine; must expose
                .typography, .spacing, .color_stats and .accessibility
            log_callback: Progress logging function

        Returns:
            BestPracticesResult with validation; empty defaults on any
            failure (this agent is best-effort and never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" ✅ Best Practices Validator (Qwen 72B)")
        log(" └─ Checking against design system standards...")

        # Extract data from rule engine
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Summarize AA failures (first 5) for the prompt.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        failing_colors_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]])

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
            total_colors=len(accessibility),
            aa_pass=len(accessibility) - len(failures),
            aa_fail=len(failures),
            failing_colors=failing_colors_str or "None",
            spacing_base=spacing.detected_base,
            spacing_aligned=f"{spacing.alignment_percentage:.0f}",
            spacing_recommendation=spacing.recommendation,
            unique_colors=color_stats.unique_count,
            duplicates=color_stats.duplicate_count,
            near_duplicates=len(color_stats.near_duplicates),
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="best_practices_validator",
                system_prompt="You are a design system auditor specializing in best practices validation.",
                user_message=prompt,
                max_tokens=800,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" ✅ Best Practices: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Overall Score: {result.overall_score}/100")
            log(f" ├─ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
            if result.priority_fixes:
                log(f" └─ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BestPracticesResult()

    def _parse_response(self, response: str) -> BestPracticesResult:
        """Parse the LLM's JSON reply into BestPracticesResult.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return BestPracticesResult(
                    overall_score=data.get("overall_score", 50),
                    checks=data.get("checks", {}),
                    priority_fixes=data.get("priority_fixes", []),
                    passing_practices=data.get("passing_practices", []),
                    failing_practices=data.get("failing_practices", []),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return BestPracticesResult()
667
+
668
+
669
+ # =============================================================================
670
+ # HEAD SYNTHESIZER AGENT
671
+ # =============================================================================
672
+
673
class HeadSynthesizerAgent:
    """
    Combines all agent outputs into final recommendations.

    This is the final step that produces actionable output for the user.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a senior design system architect. Synthesize these analysis results into final recommendations.

## RULE ENGINE FACTS

- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## BENCHMARK COMPARISON

Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## BRAND IDENTIFICATION

- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## BEST PRACTICES VALIDATION

Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED

{accessibility_fixes}

## YOUR TASK

Synthesize ALL the above into:
1. Executive Summary (2-3 sentences)
2. Overall Scores
3. Top 3 Priority Actions (with effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation

## OUTPUT FORMAT (JSON only)

{{
  "executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
  "scores": {{
    "overall": <0-100>,
    "accessibility": <0-100>,
    "consistency": <0-100>,
    "organization": <0-100>
  }},
  "benchmark_fit": {{
    "closest": "<name>",
    "similarity": "<X%>",
    "recommendation": "Align type scale to 1.25"
  }},
  "brand_analysis": {{
    "primary": "#hex",
    "secondary": "#hex",
    "cohesion": <1-10>
  }},
  "top_3_actions": [
    {{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
  ],
  "color_recommendations": [
    {{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
  ],
  "type_scale_recommendation": {{
    "current_ratio": 1.18,
    "recommended_ratio": 1.25,
    "reason": "Align with industry standard"
  }},
  "spacing_recommendation": {{
    "current": "mixed",
    "recommended": "8px",
    "reason": "Consistent grid improves maintainability"
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Deterministic analysis (typography, spacing,
                color_stats, accessibility attributes are read).
            benchmark_comparisons: Comparison objects; the first entry is
                treated as the closest match (assumes pre-sorted input).
            brand_identification: Output of BrandIdentifierAgent.
            benchmark_advice: Output of BenchmarkAdvisorAgent.
            best_practices: Output of BestPracticesValidatorAgent.
            log_callback: Progress logging function.

        Returns:
            HeadSynthesis with the final payload; empty defaults on any
            failure (best-effort, never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: HEAD SYNTHESIZER")
        log("═" * 60)
        log("")
        log(" Combining: Rule Engine + Benchmarks + Brand + Best Practices...")

        # Extract data
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Summarize up to 5 AA failures that have a concrete suggested fix.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join([
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:5] if a.suggested_fix
        ])

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes="; ".join([c.get("change", "") for c in benchmark_advice.alignment_changes[:3]]),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes="; ".join([f.get("issue", "") for f in best_practices.priority_fixes[:3]]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt="You are a senior design system architect specializing in synthesis and recommendations.",
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log("")
            log(f" ✅ HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            log("")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return HeadSynthesis()

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse the LLM's JSON reply into HeadSynthesis.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return HeadSynthesis(
                    executive_summary=data.get("executive_summary", ""),
                    scores=data.get("scores", {}),
                    benchmark_fit=data.get("benchmark_fit", {}),
                    brand_analysis=data.get("brand_analysis", {}),
                    top_3_actions=data.get("top_3_actions", []),
                    color_recommendations=data.get("color_recommendations", []),
                    type_scale_recommendation=data.get("type_scale_recommendation", {}),
                    spacing_recommendation=data.get("spacing_recommendation", {}),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return HeadSynthesis()