""" Benchmark Research Agent ========================= Fetches LIVE data from design system documentation sites using Firecrawl, with 24-hour caching. This agent: 1. Fetches official documentation from design system sites 2. Extracts typography, spacing, color specifications using LLM 3. Caches results for 24 hours 4. Compares user's tokens to researched benchmarks """ import asyncio import json import os from dataclasses import dataclass, field from datetime import datetime, timedelta from typing import Optional, Callable import hashlib # ============================================================================= # DESIGN SYSTEM SOURCES (Official Documentation URLs) # ============================================================================= DESIGN_SYSTEM_SOURCES = { "material_design_3": { "name": "Material Design 3", "short_name": "Material 3", "vendor": "Google", "urls": { "typography": "https://m3.material.io/styles/typography/type-scale-tokens", "spacing": "https://m3.material.io/foundations/layout/understanding-layout/spacing", "colors": "https://m3.material.io/styles/color/the-color-system/key-colors-tones", }, "best_for": ["Android apps", "Web apps", "Enterprise software"], "icon": "🟢", }, "apple_hig": { "name": "Apple Human Interface Guidelines", "short_name": "Apple HIG", "vendor": "Apple", "urls": { "typography": "https://developer.apple.com/design/human-interface-guidelines/typography", "spacing": "https://developer.apple.com/design/human-interface-guidelines/layout", }, "best_for": ["iOS apps", "macOS apps", "Premium consumer products"], "icon": "🍎", }, "shopify_polaris": { "name": "Shopify Polaris", "short_name": "Polaris", "vendor": "Shopify", "urls": { "typography": "https://polaris.shopify.com/design/typography", "spacing": "https://polaris.shopify.com/design/spacing", "colors": "https://polaris.shopify.com/design/colors", }, "best_for": ["E-commerce", "Admin dashboards", "Merchant tools"], "icon": "🛒", }, "atlassian_design": { "name": "Atlassian Design System", "short_name": "Atlassian", "vendor": "Atlassian", "urls": { "typography": "https://atlassian.design/foundations/typography", "spacing": "https://atlassian.design/foundations/spacing", "colors": "https://atlassian.design/foundations/color", }, "best_for": ["Productivity tools", "Dense interfaces", "Enterprise B2B"], "icon": "🔵", }, "ibm_carbon": { "name": "IBM Carbon Design System", "short_name": "Carbon", "vendor": "IBM", "urls": { "typography": "https://carbondesignsystem.com/guidelines/typography/overview", "spacing": "https://carbondesignsystem.com/guidelines/spacing/overview", "colors": "https://carbondesignsystem.com/guidelines/color/overview", }, "best_for": ["Enterprise software", "Data-heavy applications", "IBM products"], "icon": "🔷", }, "tailwind_css": { "name": "Tailwind CSS", "short_name": "Tailwind", "vendor": "Tailwind Labs", "urls": { "typography": "https://tailwindcss.com/docs/font-size", "spacing": "https://tailwindcss.com/docs/customizing-spacing", "colors": "https://tailwindcss.com/docs/customizing-colors", }, "best_for": ["Web applications", "Startups", "Rapid prototyping"], "icon": "🌊", }, "ant_design": { "name": "Ant Design", "short_name": "Ant Design", "vendor": "Ant Group", "urls": { "typography": "https://ant.design/docs/spec/font", "spacing": "https://ant.design/docs/spec/layout", "colors": "https://ant.design/docs/spec/colors", }, "best_for": ["Enterprise B2B", "Admin panels", "Chinese market"], "icon": "🐜", }, "chakra_ui": { "name": "Chakra UI", "short_name": "Chakra", "vendor": "Chakra UI", "urls": { "typography": "https://chakra-ui.com/docs/styled-system/theme#typography", "spacing": "https://chakra-ui.com/docs/styled-system/theme#spacing", "colors": "https://chakra-ui.com/docs/styled-system/theme#colors", }, "best_for": ["React applications", "Startups", "Accessible products"], "icon": "⚡", }, } # ============================================================================= # DATA CLASSES # ============================================================================= @dataclass class BenchmarkData: """Researched benchmark data from a design system.""" key: str name: str short_name: str vendor: str icon: str # Extracted specifications typography: dict = field(default_factory=dict) # Expected: {scale_ratio, base_size, sizes[], font_family, line_height_body} spacing: dict = field(default_factory=dict) # Expected: {base, scale[], grid} colors: dict = field(default_factory=dict) # Expected: {palette_size, uses_ramps, ramp_steps} # Metadata fetched_at: str = "" confidence: str = "low" # high, medium, low source_urls: list = field(default_factory=list) best_for: list = field(default_factory=list) def to_dict(self) -> dict: return { "key": self.key, "name": self.name, "short_name": self.short_name, "vendor": self.vendor, "icon": self.icon, "typography": self.typography, "spacing": self.spacing, "colors": self.colors, "fetched_at": self.fetched_at, "confidence": self.confidence, "best_for": self.best_for, } @dataclass class BenchmarkComparison: """Comparison result between user's tokens and a benchmark.""" benchmark: BenchmarkData similarity_score: float # Lower = more similar # Individual comparisons type_ratio_diff: float base_size_diff: int spacing_grid_diff: int # Match percentages type_match_pct: float spacing_match_pct: float overall_match_pct: float def to_dict(self) -> dict: return { "name": self.benchmark.name, "short_name": self.benchmark.short_name, "icon": self.benchmark.icon, "similarity_score": round(self.similarity_score, 2), "overall_match_pct": round(self.overall_match_pct, 1), "comparison": { "type_ratio": { "diff": round(self.type_ratio_diff, 3), "match_pct": round(self.type_match_pct, 1), }, "base_size": { "diff": self.base_size_diff, }, "spacing_grid": { "diff": self.spacing_grid_diff, "match_pct": round(self.spacing_match_pct, 1), }, }, "benchmark_values": { "type_ratio": self.benchmark.typography.get("scale_ratio"), "base_size": self.benchmark.typography.get("base_size"), "spacing_grid": self.benchmark.spacing.get("base"), }, "best_for": self.benchmark.best_for, "confidence": self.benchmark.confidence, } # ============================================================================= # CACHE MANAGER # ============================================================================= class BenchmarkCache: """Manages 24-hour caching of benchmark research results.""" def __init__(self, cache_dir: str = None): if cache_dir is None: cache_dir = os.path.join(os.path.dirname(__file__), "..", "storage") self.cache_file = os.path.join(cache_dir, "benchmark_cache.json") self._ensure_cache_dir() def _ensure_cache_dir(self): """Ensure cache directory exists.""" os.makedirs(os.path.dirname(self.cache_file), exist_ok=True) def _load_cache(self) -> dict: """Load cache from file.""" if os.path.exists(self.cache_file): try: with open(self.cache_file, 'r') as f: return json.load(f) except Exception: return {} return {} def _save_cache(self, cache: dict): """Save cache to file.""" try: with open(self.cache_file, 'w') as f: json.dump(cache, f, indent=2) except Exception: pass def get(self, key: str) -> Optional[BenchmarkData]: """Get cached benchmark if valid (< 24 hours old).""" cache = self._load_cache() if key not in cache: return None entry = cache[key] fetched_at = datetime.fromisoformat(entry.get("fetched_at", "2000-01-01")) # Check if expired (24 hours) if datetime.now() - fetched_at > timedelta(hours=24): return None # Reconstruct BenchmarkData source = DESIGN_SYSTEM_SOURCES.get(key, {}) return BenchmarkData( key=key, name=entry.get("name", source.get("name", key)), short_name=entry.get("short_name", source.get("short_name", key)), vendor=entry.get("vendor", source.get("vendor", "")), icon=entry.get("icon", source.get("icon", "📦")), typography=entry.get("typography", {}), spacing=entry.get("spacing", {}), colors=entry.get("colors", {}), fetched_at=entry.get("fetched_at", ""), confidence=entry.get("confidence", "low"), source_urls=entry.get("source_urls", []), best_for=entry.get("best_for", source.get("best_for", [])), ) def set(self, key: str, data: BenchmarkData): """Cache benchmark data.""" cache = self._load_cache() cache[key] = data.to_dict() self._save_cache(cache) def get_cache_status(self) -> dict: """Get status of all cached items.""" cache = self._load_cache() status = {} for key in DESIGN_SYSTEM_SOURCES.keys(): if key in cache: fetched_at = datetime.fromisoformat(cache[key].get("fetched_at", "2000-01-01")) age_hours = (datetime.now() - fetched_at).total_seconds() / 3600 is_valid = age_hours < 24 status[key] = { "cached": True, "valid": is_valid, "age_hours": round(age_hours, 1), } else: status[key] = {"cached": False, "valid": False} return status # ============================================================================= # FALLBACK DATA (Used when research fails) # ============================================================================= FALLBACK_BENCHMARKS = { "material_design_3": { "typography": {"scale_ratio": 1.2, "base_size": 16, "font_family": "Roboto", "line_height_body": 1.5}, "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 48, 64], "grid": "8px"}, "colors": {"palette_size": 13, "uses_ramps": True}, }, "apple_hig": { "typography": {"scale_ratio": 1.19, "base_size": 17, "font_family": "SF Pro", "line_height_body": 1.47}, "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40], "grid": "4px"}, "colors": {"palette_size": 9, "uses_ramps": True}, }, "shopify_polaris": { "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "Inter", "line_height_body": 1.5}, "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64], "grid": "4px"}, "colors": {"palette_size": 11, "uses_ramps": True}, }, "atlassian_design": { "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "Inter", "line_height_body": 1.43}, "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"}, "colors": {"palette_size": 15, "uses_ramps": True}, }, "ibm_carbon": { "typography": {"scale_ratio": 1.25, "base_size": 14, "font_family": "IBM Plex Sans", "line_height_body": 1.5}, "spacing": {"base": 8, "scale": [0, 2, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"}, "colors": {"palette_size": 12, "uses_ramps": True}, }, "tailwind_css": { "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5}, "spacing": {"base": 4, "scale": [0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32], "grid": "4px"}, "colors": {"palette_size": 22, "uses_ramps": True}, }, "ant_design": { "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "system-ui", "line_height_body": 1.57}, "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48], "grid": "8px"}, "colors": {"palette_size": 13, "uses_ramps": True}, }, "chakra_ui": { "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5}, "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 56, 64], "grid": "4px"}, "colors": {"palette_size": 15, "uses_ramps": True}, }, } # ============================================================================= # BENCHMARK RESEARCHER # ============================================================================= class BenchmarkResearcher: """ Research agent that fetches live design system specifications. Uses Firecrawl to fetch documentation and LLM to extract specs. Results are cached for 24 hours. """ def __init__(self, firecrawl_client=None, hf_client=None): """ Initialize researcher. Args: firecrawl_client: Firecrawl API client for fetching docs hf_client: HuggingFace client for LLM extraction """ self.firecrawl = firecrawl_client self.hf_client = hf_client self.cache = BenchmarkCache() async def research_benchmark( self, system_key: str, log_callback: Callable = None, force_refresh: bool = False, ) -> BenchmarkData: """ Research a specific design system. Args: system_key: Key from DESIGN_SYSTEM_SOURCES log_callback: Function to log progress force_refresh: Bypass cache and fetch fresh Returns: BenchmarkData with extracted specifications """ def log(msg: str): if log_callback: log_callback(msg) if system_key not in DESIGN_SYSTEM_SOURCES: raise ValueError(f"Unknown design system: {system_key}") source = DESIGN_SYSTEM_SOURCES[system_key] # Check cache first (unless force refresh) if not force_refresh: cached = self.cache.get(system_key) if cached: log(f" ├─ {source['icon']} {source['short_name']}: Using cached data ✅") return cached log(f" ├─ {source['icon']} {source['short_name']}: Fetching documentation...") # Try to fetch and extract raw_content = "" confidence = "low" if self.firecrawl: try: # Fetch typography docs typo_url = source["urls"].get("typography") if typo_url: log(f" │ ├─ Fetching {typo_url[:50]}...") typo_content = await self._fetch_url(typo_url) if typo_content: raw_content += f"\n\n=== TYPOGRAPHY ===\n{typo_content[:4000]}" confidence = "medium" # Fetch spacing docs spacing_url = source["urls"].get("spacing") if spacing_url: log(f" │ ├─ Fetching spacing docs...") spacing_content = await self._fetch_url(spacing_url) if spacing_content: raw_content += f"\n\n=== SPACING ===\n{spacing_content[:3000]}" if confidence == "medium": confidence = "high" except Exception as e: log(f" │ ├─ ⚠️ Fetch error: {str(e)[:50]}") # Extract specs with LLM (or use fallback) if raw_content and self.hf_client: log(f" │ ├─ Extracting specifications...") extracted = await self._extract_specs_with_llm(source["name"], raw_content) else: log(f" │ ├─ Using fallback data (fetch unavailable)") extracted = FALLBACK_BENCHMARKS.get(system_key, {}) confidence = "fallback" # Build result result = BenchmarkData( key=system_key, name=source["name"], short_name=source["short_name"], vendor=source["vendor"], icon=source["icon"], typography=extracted.get("typography", FALLBACK_BENCHMARKS.get(system_key, {}).get("typography", {})), spacing=extracted.get("spacing", FALLBACK_BENCHMARKS.get(system_key, {}).get("spacing", {})), colors=extracted.get("colors", FALLBACK_BENCHMARKS.get(system_key, {}).get("colors", {})), fetched_at=datetime.now().isoformat(), confidence=confidence, source_urls=list(source["urls"].values()), best_for=source["best_for"], ) # Cache result self.cache.set(system_key, result) ratio = result.typography.get("scale_ratio", "?") base = result.typography.get("base_size", "?") grid = result.spacing.get("base", "?") log(f" │ └─ ✅ ratio={ratio}, base={base}px, grid={grid}px [{confidence}]") return result async def _fetch_url(self, url: str) -> Optional[str]: """Fetch URL content using Firecrawl.""" if not self.firecrawl: return None try: # Firecrawl scrape result = self.firecrawl.scrape_url( url, params={"formats": ["markdown"]} ) if result and result.get("markdown"): return result["markdown"] elif result and result.get("content"): return result["content"] except Exception as e: pass return None async def _extract_specs_with_llm(self, system_name: str, raw_content: str) -> dict: """Extract structured specs from documentation using LLM.""" if not self.hf_client: return {} prompt = f"""Extract the design system specifications from this documentation. DESIGN SYSTEM: {system_name} DOCUMENTATION: {raw_content[:6000]} Return ONLY a JSON object with these exact fields (use null if not found): {{ "typography": {{ "scale_ratio": , "base_size": , "font_family": "", "sizes": [], "line_height_body": }}, "spacing": {{ "base": , "scale": [], "grid": "" }}, "colors": {{ "palette_size": , "uses_ramps": }} }} Return ONLY valid JSON, no explanation.""" try: response = await self.hf_client.complete_async( agent_name="benchmark_extractor", system_prompt="You are a design system specification extractor. Extract only the factual specifications.", user_message=prompt, max_tokens=600, json_mode=True, ) # Parse JSON from response import re json_match = re.search(r'\{[\s\S]*\}', response) if json_match: return json.loads(json_match.group()) except Exception as e: pass return {} async def research_selected_benchmarks( self, selected_keys: list[str], log_callback: Callable = None, ) -> list[BenchmarkData]: """ Research multiple selected design systems. Args: selected_keys: List of system keys to research log_callback: Function to log progress Returns: List of BenchmarkData """ def log(msg: str): if log_callback: log_callback(msg) log("") log("═" * 60) log("🔬 LAYER 2: BENCHMARK RESEARCH (Firecrawl + Cache)") log("═" * 60) log("") log(f" Selected systems: {', '.join(selected_keys)}") log("") results = [] for key in selected_keys: if key in DESIGN_SYSTEM_SOURCES: try: result = await self.research_benchmark(key, log_callback) results.append(result) except Exception as e: log(f" ├─ ⚠️ Error researching {key}: {e}") # Use fallback source = DESIGN_SYSTEM_SOURCES[key] fallback = FALLBACK_BENCHMARKS.get(key, {}) results.append(BenchmarkData( key=key, name=source["name"], short_name=source["short_name"], vendor=source["vendor"], icon=source["icon"], typography=fallback.get("typography", {}), spacing=fallback.get("spacing", {}), colors=fallback.get("colors", {}), fetched_at=datetime.now().isoformat(), confidence="fallback", best_for=source["best_for"], )) log("") log(f" ✅ Researched {len(results)}/{len(selected_keys)} design systems") return results def compare_to_benchmarks( self, your_ratio: float, your_base_size: int, your_spacing_grid: int, benchmarks: list[BenchmarkData], log_callback: Callable = None, ) -> list[BenchmarkComparison]: """ Compare user's tokens to researched benchmarks. Args: your_ratio: Detected type scale ratio your_base_size: Detected base font size your_spacing_grid: Detected spacing grid base benchmarks: List of researched BenchmarkData log_callback: Function to log progress Returns: List of BenchmarkComparison sorted by similarity """ def log(msg: str): if log_callback: log_callback(msg) log("") log(" 📊 BENCHMARK COMPARISON") log(" " + "─" * 40) log(f" Your values: ratio={your_ratio:.2f}, base={your_base_size}px, grid={your_spacing_grid}px") log("") comparisons = [] for b in benchmarks: b_ratio = b.typography.get("scale_ratio", 1.25) b_base = b.typography.get("base_size", 16) b_grid = b.spacing.get("base", 8) # Calculate differences ratio_diff = abs(your_ratio - b_ratio) base_diff = abs(your_base_size - b_base) grid_diff = abs(your_spacing_grid - b_grid) # Calculate match percentages type_match = max(0, 100 - (ratio_diff * 100)) # 0.1 diff = 90% match spacing_match = max(0, 100 - (grid_diff * 10)) # 4px diff = 60% match # Weighted similarity score (lower = more similar) similarity = (ratio_diff * 10) + (base_diff * 0.5) + (grid_diff * 0.3) # Overall match percentage overall_match = (type_match * 0.5) + (spacing_match * 0.3) + (100 - base_diff * 5) * 0.2 overall_match = max(0, min(100, overall_match)) comparisons.append(BenchmarkComparison( benchmark=b, similarity_score=similarity, type_ratio_diff=ratio_diff, base_size_diff=base_diff, spacing_grid_diff=grid_diff, type_match_pct=type_match, spacing_match_pct=spacing_match, overall_match_pct=overall_match, )) # Sort by similarity (lower = better) comparisons.sort(key=lambda x: x.similarity_score) # Log results medals = ["🥇", "🥈", "🥉"] for i, c in enumerate(comparisons[:5]): medal = medals[i] if i < 3 else " " b = c.benchmark log(f" {medal} {b.icon} {b.short_name}: {c.overall_match_pct:.0f}% match (score: {c.similarity_score:.2f})") log(f" └─ ratio={b.typography.get('scale_ratio')}, base={b.typography.get('base_size')}px, grid={b.spacing.get('base')}px") return comparisons # ============================================================================= # HELPER FUNCTIONS # ============================================================================= def get_available_benchmarks() -> list[dict]: """Get list of available design systems for UI dropdown.""" return [ { "key": key, "name": source["name"], "short_name": source["short_name"], "icon": source["icon"], "vendor": source["vendor"], "best_for": source["best_for"], } for key, source in DESIGN_SYSTEM_SOURCES.items() ] def get_benchmark_choices() -> list[tuple[str, str]]: """Get choices for Gradio dropdown.""" return [ (f"{source['icon']} {source['short_name']} ({source['vendor']})", key) for key, source in DESIGN_SYSTEM_SOURCES.items() ]