Spaces:
Sleeping
Sleeping
| """ | |
| Stage 2 LLM Agents — Specialized Analysis Tasks | |
| ================================================= | |
| These agents handle tasks that REQUIRE LLM reasoning: | |
| - Brand Identifier: Identify brand colors from usage context | |
| - Benchmark Advisor: Recommend best-fit design system | |
| - Best Practices Validator: Prioritize fixes by business impact | |
| - HEAD Synthesizer: Combine all outputs into final recommendations | |
| Each agent has a focused prompt for its specific task. | |
| """ | |
| import json | |
| import re | |
| from dataclasses import dataclass, field | |
| from typing import Optional, Callable, Any | |
| from datetime import datetime | |
| # ============================================================================= | |
| # DATA CLASSES | |
| # ============================================================================= | |
@dataclass
class BrandIdentification:
    """Results from Brand Identifier agent (AURORA).

    Every field has a default, so ``BrandIdentification()`` is a valid
    "nothing identified" value that the agent returns when the LLM call or
    response parsing fails.
    """

    # {color, confidence, reasoning, usage_count}
    brand_primary: dict = field(default_factory=dict)
    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)
    # complementary, analogous, triadic, monochromatic, random
    palette_strategy: str = ""
    cohesion_score: int = 5  # 1-10
    cohesion_notes: str = ""
    # {hex_color: suggested_name}
    semantic_names: dict = field(default_factory=dict)
    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the fields as a plain dict keyed by field name."""
        return {
            "brand_primary": self.brand_primary,
            "brand_secondary": self.brand_secondary,
            "brand_accent": self.brand_accent,
            "palette_strategy": self.palette_strategy,
            "cohesion_score": self.cohesion_score,
            "cohesion_notes": self.cohesion_notes,
            "semantic_names": self.semantic_names,
            "self_evaluation": self.self_evaluation,
        }
@dataclass
class BenchmarkAdvice:
    """Results from Benchmark Advisor agent.

    Every field has a default, so ``BenchmarkAdvice()`` is a valid empty
    recommendation that the agent returns when the LLM call or response
    parsing fails.
    """

    recommended_benchmark: str = ""
    recommended_benchmark_name: str = ""
    reasoning: str = ""
    # [{change, from, to, effort}]
    alignment_changes: list = field(default_factory=list)
    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)
    # [{name, reason}]
    alternative_benchmarks: list = field(default_factory=list)
    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the fields as a plain dict.

        Note that three keys are shortened relative to the attribute names:
        ``pros``, ``cons`` and ``alternatives``.
        """
        return {
            "recommended_benchmark": self.recommended_benchmark,
            "recommended_benchmark_name": self.recommended_benchmark_name,
            "reasoning": self.reasoning,
            "alignment_changes": self.alignment_changes,
            "pros": self.pros_of_alignment,
            "cons": self.cons_of_alignment,
            "alternatives": self.alternative_benchmarks,
            "self_evaluation": self.self_evaluation,
        }
@dataclass
class BestPracticesResult:
    """Results from Best Practices Validator agent.

    Every field has a default, so ``BestPracticesResult()`` can be built
    without arguments; its ``overall_score`` is then the neutral value 50.
    """

    overall_score: int = 50  # 0-100
    # {check_name: {status: pass/warn/fail, note: str}}
    checks: dict = field(default_factory=dict)
    # [{rank, issue, impact, effort, action}]
    priority_fixes: list = field(default_factory=list)
    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)
    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the fields as a plain dict.

        ``passing_practices`` / ``failing_practices`` are exported under the
        shorter keys ``passing`` / ``failing``.
        """
        return {
            "overall_score": self.overall_score,
            "checks": self.checks,
            "priority_fixes": self.priority_fixes,
            "passing": self.passing_practices,
            "failing": self.failing_practices,
            "self_evaluation": self.self_evaluation,
        }
@dataclass
class HeadSynthesis:
    """Final synthesized output from HEAD agent.

    Every field has a default, so ``HeadSynthesis()`` can be built without
    arguments and yields an empty synthesis.
    """

    executive_summary: str = ""
    # {overall, accessibility, consistency, organization}
    scores: dict = field(default_factory=dict)
    # {closest, similarity, recommendation}
    benchmark_fit: dict = field(default_factory=dict)
    # {primary, secondary, cohesion}
    brand_analysis: dict = field(default_factory=dict)
    # [{action, impact, effort, details}]
    top_3_actions: list = field(default_factory=list)
    # [{role, current, suggested, reason, accept}]
    color_recommendations: list = field(default_factory=list)
    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)
    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the fields as a plain dict keyed by field name."""
        return {
            "executive_summary": self.executive_summary,
            "scores": self.scores,
            "benchmark_fit": self.benchmark_fit,
            "brand_analysis": self.brand_analysis,
            "top_3_actions": self.top_3_actions,
            "color_recommendations": self.color_recommendations,
            "type_scale_recommendation": self.type_scale_recommendation,
            "spacing_recommendation": self.spacing_recommendation,
            "self_evaluation": self.self_evaluation,
        }
| # ============================================================================= | |
| # BRAND IDENTIFIER AGENT | |
| # ============================================================================= | |
class BrandIdentifierAgent:
    """
    AURORA — Senior Brand Color Analyst.

    Identifies brand colors from usage context using creative/visual reasoning.

    Model: Qwen 72B (strong creative reasoning, color harmony assessment)
    Temperature: 0.4 (allows creative interpretation of color stories)

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    # NOTE(review): several glyphs inside the prompt and log strings below look
    # mis-encoded (e.g. "β"). They are kept exactly as found because the
    # intended characters cannot be confirmed from this file.
    SYSTEM_PROMPT = """You are AURORA, a Senior Brand Color Analyst specializing in visual identity systems.
## YOUR ROLE IN THE PIPELINE
You are Agent 1 of 4 in the Design System Analysis pipeline.
- INPUT: Raw color tokens with usage counts + semantic CSS analysis from Stage 1 extraction
- OUTPUT: Brand color identification + palette strategy β feeds into NEXUS (Agent 4) for final synthesis
- Your analysis directly influences the final color recommendations shown to the user.
## YOUR EXPERTISE
- Color harmony theory (complementary, analogous, triadic, split-complementary, monochromatic)
- Brand identity systems (primary/secondary/accent hierarchy)
- CSS context interpretation (button colors = likely CTA, background colors = likely neutral)
- Color naming conventions (design token naming: brand.primary, text.secondary, etc.)
## QUALITY STANDARDS
- Brand Primary MUST have HIGH confidence if one color dominates buttons/CTAs. Say "low" if ambiguous.
- Cohesion Score: Use the FULL 1-10 range. A score of 7+ means clear intentional harmony. Most sites score 5-7.
- If fewer than 5 unique colors exist, flag as "insufficient_data" β don't guess relationships.
## WHAT NOT TO DO
- Don't inflate confidence. "Medium" is fine when usage patterns are unclear.
- Don't guess accent colors if none exist β use null.
- Don't assume complementary strategy just because two colors differ β check the actual hue relationship.
- Don't name colors generically. Use semantic design-token style names (brand.primary, not "blue").
## SCORING RUBRIC (Cohesion 1-10):
- 9-10: Clear harmony rule across all colors, distinct brand identity, consistent palette
- 7-8: Mostly harmonious, clear brand identity, minor inconsistencies
- 5-6: Some color relationships visible but not systematic
- 3-4: Random-feeling palette, no clear color strategy
- 1-2: Actively conflicting colors, no brand identity visible"""

    PROMPT_TEMPLATE = """Analyze the following color usage data and identify the brand color system.
## COLOR DATA WITH USAGE CONTEXT
{color_data}
## SEMANTIC ANALYSIS (from CSS properties)
{semantic_analysis}
## YOUR TASK
1. **Identify Brand Colors**:
- Brand Primary: The main action/CTA color (highest visibility in buttons, links, key UI)
- Brand Secondary: Supporting brand color (headers, secondary actions)
- Brand Accent: Highlight color for emphasis (badges, alerts, special states)
2. **Assess Palette Strategy**: complementary, analogous, triadic, monochromatic, or random?
3. **Rate Cohesion** (1-10) using the rubric above
4. **Suggest Semantic Names** for top 10 most-used colors (design-token format)
5. **Self-Evaluate** your analysis quality
## OUTPUT FORMAT (JSON only)
{{
"brand_primary": {{
"color": "#hex",
"confidence": "high|medium|low",
"reasoning": "Why this is brand primary β cite specific usage evidence",
"usage_count": <number>
}},
"brand_secondary": {{
"color": "#hex",
"confidence": "high|medium|low",
"reasoning": "..."
}},
"brand_accent": {{
"color": "#hex or null",
"confidence": "...",
"reasoning": "..."
}},
"palette_strategy": "complementary|analogous|triadic|monochromatic|random",
"cohesion_score": <1-10>,
"cohesion_notes": "Assessment of how well colors work together",
"semantic_names": {{
"#hex1": "brand.primary",
"#hex2": "text.primary",
"#hex3": "background.primary"
}},
"self_evaluation": {{
"confidence": <1-10>,
"reasoning": "Why I am this confident in my analysis",
"data_quality": "good|fair|poor",
"flags": []
}}
}}
Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the inference client used to run the AURORA prompt.

        Args:
            hf_client: Object exposing an awaitable ``complete_async`` that
                accepts ``agent_name``, ``system_prompt``, ``user_message``,
                ``max_tokens`` and ``json_mode`` keyword arguments.
        """
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> "BrandIdentification":
        """
        Identify brand colors from usage context.

        Any failure (prompt formatting, the LLM call, or response parsing)
        is logged and converted into a default ``BrandIdentification``.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log(" π¨ AURORA β Brand Identifier (Qwen 72B)")
        log(" ββ Analyzing color context and usage patterns...")

        try:
            # Prompt construction lives inside the try so malformed input
            # degrades to the default result instead of escaping the agent.
            prompt = self.PROMPT_TEMPLATE.format(
                color_data=self._format_color_data(color_tokens),
                semantic_analysis=self._format_semantic_analysis(semantic_analysis),
            )

            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(" ββββββββββββββββββββββββββββββββββββββββββββββββ")
            log(f" π¨ AURORA β Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ββ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ββ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ββ Palette Strategy: {result.palette_strategy}")
            log(f" ββ Cohesion Score: {result.cohesion_score}/10")
            se = result.self_evaluation
            if se:
                log(f" ββ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            return result
        except Exception as e:
            error_msg = str(e)
            lowered = error_msg.lower()
            # Log the first 120 characters of the error for diagnosis.
            log(f" β οΈ Brand Identifier failed: {error_msg[:120]}")
            # Both hints match case-insensitively.
            if "gated" in lowered or "access" in lowered:
                log(" ββ Model may require license acceptance at huggingface.co")
            elif "rate limit" in lowered or "429" in lowered:
                log(" ββ HF free tier rate limit β wait or upgrade to Pro")
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Format color tokens for prompt.

        Only the first 30 tokens are used. Each token may be a dict (keys
        ``value``/``hex``, ``usage_count``/``count``, ``context``/
        ``css_property``) or an object with ``value``, ``usage_count`` and
        ``context`` attributes. Tokens without a color value are skipped.
        """
        if not color_tokens:
            return "No color data available"

        lines = []
        for token in list(color_tokens.values())[:30]:
            if isinstance(token, dict):
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get("usage_count", token.get("count", 1))
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", 1)
                context = getattr(token, "context", "")
            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")
        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Format semantic analysis for prompt.

        Handles three value shapes per category: a list of colors (first 5),
        a dict (flat color dict with ``hex``, or a nested dict of sub-roles
        and scalar summary values, first 5), and any other scalar. Falsy
        values are skipped. Formatting errors are reported in the returned
        text rather than raised.
        """
        if not semantic:
            return "No semantic analysis available"

        lines = []
        try:
            for category, value in semantic.items():
                if not value:
                    continue
                if isinstance(value, list):
                    color_list = []
                    for c in value[:5]:
                        if isinstance(c, dict):
                            # str() guards against non-string hex/value entries
                            # that would break the join below.
                            color_list.append(str(c.get("hex", c.get("value", str(c)))))
                        else:
                            color_list.append(str(c))
                    lines.append(f"- {category}: {', '.join(color_list)}")
                elif isinstance(value, dict):
                    if "hex" in value:
                        # Flat color dict {"hex": "#...", "confidence": "..."}
                        lines.append(f"- {category}: {value['hex']}")
                    else:
                        # Nested sub-roles {"primary": {"hex": ...}, ...} or a
                        # summary dict {"total_colors_analyzed": 50, ...}
                        sub_items = []
                        for sub_role, sub_val in list(value.items())[:5]:
                            if isinstance(sub_val, dict) and "hex" in sub_val:
                                sub_items.append(f"{sub_role}={sub_val['hex']}")
                            elif isinstance(sub_val, (str, int, float, bool)):
                                sub_items.append(f"{sub_role}={sub_val}")
                        if sub_items:
                            lines.append(f"- {category}: {', '.join(sub_items)}")
                else:
                    lines.append(f"- {category}: {value}")
        except Exception as e:
            # Deliberate best effort: the prompt still gets sent, carrying a
            # short description of what went wrong.
            return f"Error formatting semantic analysis: {str(e)[:50]}"
        return "\n".join(lines) if lines else "No semantic analysis available"

    @staticmethod
    def _extract_json(response) -> Optional[dict]:
        """Return the JSON object embedded in *response*, or None.

        The span from the first ``{`` to the last ``}`` is decoded; None is
        returned when *response* is not a string, holds no such span, or the
        span is not valid JSON.
        """
        if not isinstance(response, str):
            return None
        json_match = re.search(r'\{[\s\S]*\}', response)
        if not json_match:
            return None
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_dict(value) -> dict:
        """Return *value* if it is a dict, else an empty dict (covers JSON null)."""
        return value if isinstance(value, dict) else {}

    def _parse_response(self, response: str) -> "BrandIdentification":
        """Parse LLM response into BrandIdentification.

        Unparseable responses yield a default ``BrandIdentification``. Fields
        the model returned as ``null`` or with the wrong container type fall
        back to their defaults, so one bad field does not discard the rest.
        """
        data = self._extract_json(response)
        if data is None:
            return BrandIdentification()

        cohesion_score = data.get("cohesion_score")
        return BrandIdentification(
            brand_primary=self._as_dict(data.get("brand_primary")),
            brand_secondary=self._as_dict(data.get("brand_secondary")),
            brand_accent=self._as_dict(data.get("brand_accent")),
            palette_strategy=data.get("palette_strategy") or "unknown",
            cohesion_score=5 if cohesion_score is None else cohesion_score,
            cohesion_notes=data.get("cohesion_notes") or "",
            semantic_names=self._as_dict(data.get("semantic_names")),
            self_evaluation=self._as_dict(data.get("self_evaluation")),
        )
| # ============================================================================= | |
| # BENCHMARK ADVISOR AGENT | |
| # ============================================================================= | |
class BenchmarkAdvisorAgent:
    """
    ATLAS — Senior Design System Benchmark Analyst.

    Recommends best-fit design system based on comparison data.

    Model: Llama 3.3 70B (128K context for large benchmark data, excellent comparative reasoning)
    Temperature: 0.25 (analytical, data-driven comparison)

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    # NOTE(review): several glyphs inside the prompt and log strings below look
    # mis-encoded (e.g. "β"). They are kept exactly as found because the
    # intended characters cannot be confirmed from this file.
    SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst specializing in cross-system comparison and alignment strategy.
## YOUR ROLE IN THE PIPELINE
You are Agent 2 of 4 in the Design System Analysis pipeline.
- INPUT: User's extracted type scale, spacing, and font sizes + benchmark comparison data from the Rule Engine
- OUTPUT: Benchmark recommendation with alignment roadmap β feeds into NEXUS (Agent 4) for final synthesis
- Your recommendation helps the user decide which established design system to align with.
## YOUR EXPERTISE
- Deep knowledge of Material Design 3, Apple HIG, IBM Carbon, Ant Design, Atlassian, Tailwind CSS, Bootstrap
- Type scale mathematics (major/minor second/third, perfect fourth/fifth, golden ratio)
- Spacing grid systems (4px, 8px, multiples) and their trade-offs
- Migration effort estimation for design system alignment
## QUALITY STANDARDS
- Always consider BOTH similarity score AND use-case fit. Closest match β best fit.
- Recommend max 4 alignment changes. More than that = the benchmark is not a good fit.
- Effort estimates must be realistic: "low" = CSS variable change, "medium" = component updates, "high" = layout restructuring.
- If similarity is above 85%, say "already well-aligned" and suggest minimal changes only.
## WHAT NOT TO DO
- Don't always recommend the closest match β a system 5% less similar but much better suited is preferable.
- Don't list generic pros/cons. Be specific to the user's actual values.
- Don't suggest alignment changes that would break accessibility (e.g., smaller base font).
- Don't recommend obscure or abandoned design systems.
## SCORING RUBRIC (Benchmark Fit):
- Excellent Fit: >85% match, same use-case category, < 3 changes needed
- Good Fit: 70-85% match, compatible use-case, 3-4 changes needed
- Fair Fit: 50-70% match, different trade-offs to consider, 4+ changes
- Poor Fit: <50% match, fundamentally different approach β don't recommend"""

    PROMPT_TEMPLATE = """Analyze the following benchmark comparison data and recommend the best design system alignment.
## USER'S CURRENT VALUES
- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px
## BENCHMARK COMPARISON
{benchmark_comparison}
## YOUR TASK
1. **Recommend Best Fit**: Which design system should they align with? Consider use-case fit, not just numbers.
2. **Explain Why**: Cite specific data points (similarity scores, ratio differences, spacing alignment).
3. **List Changes Needed**: What would they need to change? Include effort estimates.
4. **Pros/Cons**: Specific to this user's values, not generic statements.
5. **Self-Evaluate** your recommendation quality.
## OUTPUT FORMAT (JSON only)
{{
"recommended_benchmark": "<system_key>",
"recommended_benchmark_name": "<full name>",
"reasoning": "Why this is the best fit β cite specific data",
"alignment_changes": [
{{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
{{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
],
"pros_of_alignment": [
"Specific benefit with data"
],
"cons_of_alignment": [
"Specific trade-off"
],
"alternative_benchmarks": [
{{"name": "Material Design 3", "reason": "Good for Android-first products"}}
],
"self_evaluation": {{
"confidence": <1-10>,
"reasoning": "Why I am this confident",
"data_quality": "good|fair|poor",
"flags": []
}}
}}
Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the inference client used to run the ATLAS prompt.

        Args:
            hf_client: Object exposing an awaitable ``complete_async`` that
                accepts ``agent_name``, ``system_prompt``, ``user_message``,
                ``max_tokens`` and ``json_mode`` keyword arguments.
        """
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> "BenchmarkAdvice":
        """
        Recommend best-fit design system.

        Any failure (prompt formatting, the LLM call, or response parsing)
        is logged and converted into a default ``BenchmarkAdvice``.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log(" π’ ATLAS β Benchmark Advisor (Llama 3.3 70B)")
        log(" ββ Evaluating benchmark fit for your use case...")

        try:
            # Prompt construction lives inside the try so malformed comparison
            # objects degrade to the default result instead of escaping.
            prompt = self.PROMPT_TEMPLATE.format(
                user_ratio=user_ratio,
                user_base=user_base,
                user_spacing=user_spacing,
                benchmark_comparison=self._format_comparisons(benchmark_comparisons),
            )

            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=900,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            # The model may return change entries that are not dicts; never
            # let the progress log discard an otherwise valid result.
            changes = result.alignment_changes
            if not changes:
                key_change = "None"
            elif isinstance(changes[0], dict):
                key_change = changes[0].get("change", "N/A")
            else:
                key_change = str(changes[0])

            log(" ββββββββββββββββββββββββββββββββββββββββββββββββ")
            log(f" π’ ATLAS β Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ββ Recommended: {result.recommended_benchmark_name}")
            log(f" ββ Changes Needed: {len(changes)}")
            log(f" ββ Key Change: {key_change}")
            se = result.self_evaluation
            if se:
                log(f" ββ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            return result
        except Exception as e:
            log(f" ββ β οΈ Benchmark Advisor failed: {str(e)[:120]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Format benchmark comparisons for prompt.

        Only the first 5 comparisons are rendered. Each comparison is read
        through its ``benchmark`` attribute (``icon``, ``name``,
        ``typography``, ``spacing``, ``best_for``) plus its own
        ``similarity_score``, ``overall_match_pct``, ``type_ratio_diff``,
        ``base_size_diff`` and ``spacing_grid_diff``. An empty or missing
        list yields an explicit placeholder rather than a blank section.
        """
        if not comparisons:
            return "No benchmark comparison data available"

        entries = []
        for rank, c in enumerate(comparisons[:5], start=1):
            b = c.benchmark
            # The leading empty string keeps a blank line before each entry.
            entries.append("\n".join([
                "",
                f"{rank}. {b.icon} {b.name}",
                f"- Similarity Score: {c.similarity_score:.2f} (lower = better)",
                f"- Match: {c.overall_match_pct:.0f}%",
                f"- Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})",
                f"- Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})",
                f"- Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})",
                f"- Best For: {', '.join(b.best_for)}",
            ]))
        return "\n".join(entries)

    @staticmethod
    def _extract_json(response) -> Optional[dict]:
        """Return the JSON object embedded in *response*, or None.

        The span from the first ``{`` to the last ``}`` is decoded; None is
        returned when *response* is not a string, holds no such span, or the
        span is not valid JSON.
        """
        if not isinstance(response, str):
            return None
        json_match = re.search(r'\{[\s\S]*\}', response)
        if not json_match:
            return None
        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_list(value) -> list:
        """Return *value* if it is a list, else an empty list (covers JSON null)."""
        return value if isinstance(value, list) else []

    def _parse_response(self, response: str) -> "BenchmarkAdvice":
        """Parse LLM response into BenchmarkAdvice.

        Unparseable responses yield a default ``BenchmarkAdvice``. Fields the
        model returned as ``null`` or with the wrong container type fall back
        to their defaults, so one bad field does not discard the rest.
        """
        data = self._extract_json(response)
        if data is None:
            return BenchmarkAdvice()

        self_evaluation = data.get("self_evaluation")
        return BenchmarkAdvice(
            recommended_benchmark=data.get("recommended_benchmark") or "",
            recommended_benchmark_name=data.get("recommended_benchmark_name") or "",
            reasoning=data.get("reasoning") or "",
            alignment_changes=self._as_list(data.get("alignment_changes")),
            pros_of_alignment=self._as_list(data.get("pros_of_alignment")),
            cons_of_alignment=self._as_list(data.get("cons_of_alignment")),
            alternative_benchmarks=self._as_list(data.get("alternative_benchmarks")),
            self_evaluation=self_evaluation if isinstance(self_evaluation, dict) else {},
        )
| # ============================================================================= | |
| # BEST PRACTICES VALIDATOR AGENT | |
| # ============================================================================= | |
| class BestPracticesValidatorAgent: | |
| """ | |
| SENTINEL β Design System Best Practices Auditor. | |
| Validates against design system standards and prioritizes fixes by business impact. | |
| Model: Qwen 72B (methodical rule-following, precise judgment, structured output) | |
| Temperature: 0.2 (strict, consistent rule evaluation) | |
| WHY LLM: Prioritization requires judgment about business impact, | |
| not just checking boxes. | |
| """ | |
| SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor specializing in standards compliance and impact-based prioritization. | |
| ## YOUR ROLE IN THE PIPELINE | |
| You are Agent 3 of 4 in the Design System Analysis pipeline. | |
| - INPUT: Rule Engine analysis results (typography, accessibility, spacing, color stats) | |
| - OUTPUT: Compliance score + prioritized fix list β feeds into NEXUS (Agent 4) for final synthesis | |
| - Your score directly appears on the user's dashboard. Your priority fixes become the action items. | |
| ## YOUR EXPERTISE | |
| - WCAG 2.1 AA/AAA accessibility standards | |
| - Design system best practices (Material Design, Apple HIG, Tailwind conventions) | |
| - Typography systems (modular scales, vertical rhythm, readability) | |
| - Color management (palette size limits, near-duplicate detection, contrast requirements) | |
| - Spacing systems (grid alignment, consistency, component density) | |
| ## QUALITY STANDARDS | |
| - Overall Score MUST reflect actual data. Don't default to 50. | |
| - Use the FULL 0-100 range: 90+ = excellent, 70-89 = good, 50-69 = needs work, <50 = significant issues | |
| - Priority fixes must be ACTIONABLE β include specific values to change (e.g., "Change #06b2c4 β #0891a8") | |
| - Maximum 5 priority fixes. If more, focus on highest-impact items. | |
| ## WHAT NOT TO DO | |
| - Don't pass checks that clearly fail based on the data. | |
| - Don't inflate scores to be "encouraging" β honest assessment helps the user. | |
| - Don't list fixes without effort estimates β the user needs to plan their work. | |
| - Don't mix up "warn" and "fail": warn = imperfect but functional, fail = violates a standard. | |
| ## SCORING RUBRIC (Overall Score 0-100): | |
| - 90-100: All checks pass, excellent accessibility, clean palette, consistent grid | |
| - 75-89: Most checks pass, minor issues in 1-2 areas, good foundation | |
| - 60-74: Several warnings, 1-2 failures, needs focused improvement | |
| - 40-59: Multiple failures, significant accessibility gaps, inconsistent system | |
| - 20-39: Fundamental issues across multiple areas, major rework needed | |
| - 0-19: Barely qualifies as a design system, almost everything fails | |
| ## CHECK WEIGHTING: | |
| - AA Compliance: 25 points (most critical β affects real users) | |
| - Type Scale Consistency: 15 points | |
| - Type Scale Standard Ratio: 10 points | |
| - Base Size Accessible: 15 points | |
| - Spacing Grid: 15 points | |
| - Color Count: 5 points | |
| - No Near-Duplicates: 5 points | |
| - Shadow System: 10 points (elevation hierarchy, consistency) | |
| ## SHADOW SYSTEM BEST PRACTICES: | |
| - Use 3-6 elevation levels (xs, sm, md, lg, xl, 2xl) | |
| - Consistent Y-offset progression (shadows should grow with elevation) | |
| - Blur radius should increase with elevation (more blur = higher elevation) | |
| - Shadow colors should be neutral (black/gray with alpha) or brand-colored with low opacity | |
| - Avoid shadows with 0 blur (looks harsh/flat) | |
| - Avoid excessive blur (>32px for most use cases)""" | |
| PROMPT_TEMPLATE = """Validate the following design tokens against best practices and prioritize fixes. | |
| ## RULE ENGINE ANALYSIS RESULTS | |
| ### Typography | |
| - Detected Ratio: {type_ratio} ({type_consistent}) | |
| - Base Size: {base_size}px | |
| - Recommendation: {type_recommendation} | |
| ### Accessibility | |
| - Total Colors: {total_colors} | |
| - AA Pass: {aa_pass} | |
| - AA Fail: {aa_fail} | |
| - Failing Colors: {failing_colors} | |
| ### Spacing | |
| - Detected Base: {spacing_base}px | |
| - Grid Aligned: {spacing_aligned}% | |
| - Recommendation: {spacing_recommendation}px | |
| ### Color Statistics | |
| - Unique Colors: {unique_colors} | |
| - Duplicates: {duplicates} | |
| - Near-Duplicates: {near_duplicates} | |
| ### Shadow System | |
| - Total Shadows: {shadow_count} | |
| - Shadow Values: {shadow_values} | |
| ## BEST PRACTICES CHECKLIST (check each one) | |
| 1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618) | |
| 2. Type scale is consistent (variance < 0.15) | |
| 3. Base font size >= 16px (accessibility) | |
| 4. All interactive colors pass WCAG AA (4.5:1 contrast) | |
| 5. Spacing uses consistent grid (4px or 8px base) | |
| 6. Limited color palette (< 20 unique semantic colors) | |
| 7. No near-duplicate colors (< 3 delta-E apart) | |
| 8. Shadow system has consistent elevation hierarchy (blur/Y-offset increase together) | |
| ## YOUR TASK | |
| 1. Score each practice: pass/warn/fail with specific notes citing the data | |
| 2. Calculate overall score (0-100) using the weighting rubric | |
| 3. Identify TOP 3-5 priority fixes with impact and effort assessment | |
| 4. Self-evaluate your analysis | |
| ## OUTPUT FORMAT (JSON only) | |
| {{ | |
| "overall_score": <0-100>, | |
| "checks": {{ | |
| "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}}, | |
| "type_scale_consistent": {{"status": "...", "note": "..."}}, | |
| "base_size_accessible": {{"status": "...", "note": "..."}}, | |
| "aa_compliance": {{"status": "...", "note": "..."}}, | |
| "spacing_grid": {{"status": "...", "note": "..."}}, | |
| "color_count": {{"status": "...", "note": "..."}}, | |
| "near_duplicates": {{"status": "...", "note": "..."}}, | |
| "shadow_system": {{"status": "...", "note": "Elevation hierarchy, blur consistency, color appropriateness"}} | |
| }}, | |
| "priority_fixes": [ | |
| {{ | |
| "rank": 1, | |
| "issue": "Brand primary fails AA", | |
| "impact": "high|medium|low", | |
| "effort": "low|medium|high", | |
| "action": "Change #06b2c4 β #0891a8 for 4.5:1 contrast" | |
| }} | |
| ], | |
| "passing_practices": ["Base font size", "..."], | |
| "failing_practices": ["AA compliance", "..."], | |
| "self_evaluation": {{ | |
| "confidence": <1-10>, | |
| "reasoning": "Why I am this confident", | |
| "data_quality": "good|fair|poor", | |
| "flags": [] | |
| }} | |
| }} | |
| Return ONLY valid JSON.""" | |
    def __init__(self, hf_client):
        """Store the inference client used to run the SENTINEL prompt.

        Args:
            hf_client: Client object; ``analyze`` awaits its
                ``complete_async`` method to obtain the model response.
        """
        self.hf_client = hf_client
async def analyze(
    self,
    rule_engine_results: Any,
    shadow_tokens: Optional[dict] = None,
    log_callback: Optional[Callable] = None,
) -> BestPracticesResult:
    """
    Validate the rule-engine findings against best practices via the LLM.

    Formats ``PROMPT_TEMPLATE`` from the rule-engine results and shadow
    tokens, awaits ``hf_client.complete_async`` and parses the reply with
    ``_parse_response``.

    Args:
        rule_engine_results: Object exposing ``typography``, ``spacing``,
            ``color_stats`` and ``accessibility`` attributes.
        shadow_tokens: Shadow tokens dict {name: {value: "..."}}; entries
            that are not dicts are rendered with ``str()``.
        log_callback: Optional progress logging function taking one string.

    Returns:
        BestPracticesResult parsed from the LLM reply, or an empty
        ``BestPracticesResult()`` when the call, parsing or summary
        logging raises.
    """
    def log(msg: str):
        if log_callback:
            log_callback(msg)

    log("")
    log(" β SENTINEL β Best Practices Validator (Qwen 72B)")
    log(" ββ Checking against design system standards...")

    # Extract data from rule engine
    typo = rule_engine_results.typography
    spacing = rule_engine_results.spacing
    color_stats = rule_engine_results.color_stats
    accessibility = rule_engine_results.accessibility

    failures = [a for a in accessibility if not a.passes_aa_normal]
    # Only the first five failing colors are quoted in the prompt.
    failing_colors_str = ", ".join(
        f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]
    )

    # Format shadow data for the prompt (at most six tokens, 50 chars each)
    shadow_count = len(shadow_tokens) if shadow_tokens else 0
    shadow_values_str = "None detected"
    if shadow_tokens:
        shadow_list = []
        for name, s in list(shadow_tokens.items())[:6]:
            val = s.get("value", "") if isinstance(s, dict) else s
            # str() first: a None or numeric "value" cannot be sliced.
            shadow_list.append(f"{name}: {str(val)[:50]}")
        shadow_values_str = "; ".join(shadow_list)

    if typo.is_consistent:
        type_consistent = "consistent"
    else:
        type_consistent = f"inconsistent, variance={typo.variance:.2f}"

    prompt = self.PROMPT_TEMPLATE.format(
        type_ratio=f"{typo.detected_ratio:.3f}",
        type_consistent=type_consistent,
        base_size=typo.sizes_px[0] if typo.sizes_px else 16,
        type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
        total_colors=len(accessibility),
        aa_pass=len(accessibility) - len(failures),
        aa_fail=len(failures),
        failing_colors=failing_colors_str or "None",
        spacing_base=spacing.detected_base,
        spacing_aligned=f"{spacing.alignment_percentage:.0f}",
        spacing_recommendation=spacing.recommendation,
        unique_colors=color_stats.unique_count,
        duplicates=color_stats.duplicate_count,
        near_duplicates=len(color_stats.near_duplicates),
        shadow_count=shadow_count,
        shadow_values=shadow_values_str,
    )

    try:
        start_time = datetime.now()
        response = await self.hf_client.complete_async(
            agent_name="best_practices_validator",
            system_prompt=self.SYSTEM_PROMPT,
            user_message=prompt,
            max_tokens=1000,
            json_mode=True,
        )
        duration = (datetime.now() - start_time).total_seconds()
        result = self._parse_response(response)

        log(" ββββββββββββββββββββββββββββββββββββββββββββββββ")
        log(f" β SENTINEL β Best Practices: COMPLETE ({duration:.1f}s)")
        log(f" ββ Overall Score: {result.overall_score}/100")
        log(f" ββ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
        if result.priority_fixes:
            log(f" ββ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")
        se = result.self_evaluation
        if se:
            log(f" ββ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
        return result
    except Exception as e:
        # Best effort: any failure degrades to an empty result so the
        # pipeline can continue without this agent's output.
        log(f" ββ β οΈ Best Practices Validator failed: {str(e)[:120]}")
        return BestPracticesResult()
def _parse_response(self, response: str) -> BestPracticesResult:
    """Parse LLM response into BestPracticesResult.

    The first ``{`` through the last ``}`` of ``response`` is decoded as
    JSON. Fields that are missing, JSON ``null`` or of the wrong type fall
    back to their defaults, so later ``len(...)`` and ``.get(...)`` calls
    on the result are safe.

    Args:
        response: Raw text returned by the LLM client.

    Returns:
        A populated BestPracticesResult, or an empty
        ``BestPracticesResult()`` when no JSON object can be decoded.
    """
    def typed(value, expected, fallback):
        # dict.get(key, default) still returns None for an explicit JSON
        # null, so the type is checked instead of relying on the default.
        return value if isinstance(value, expected) else fallback

    if not isinstance(response, str):
        return BestPracticesResult()

    json_match = re.search(r'\{[\s\S]*\}', response)
    if not json_match:
        return BestPracticesResult()

    try:
        data = json.loads(json_match.group())
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; malformed output is
        # treated as "no result" rather than an error.
        return BestPracticesResult()
    if not isinstance(data, dict):
        return BestPracticesResult()

    score = data.get("overall_score")
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        score = 50

    # Consumers call .get("issue") on each fix, so keep dict entries only.
    fixes = [
        fix for fix in typed(data.get("priority_fixes"), list, [])
        if isinstance(fix, dict)
    ]

    return BestPracticesResult(
        overall_score=score,
        checks=typed(data.get("checks"), dict, {}),
        priority_fixes=fixes,
        passing_practices=typed(data.get("passing_practices"), list, []),
        failing_practices=typed(data.get("failing_practices"), list, []),
        self_evaluation=typed(data.get("self_evaluation"), dict, {}),
    )
| # ============================================================================= | |
| # HEAD SYNTHESIZER AGENT | |
| # ============================================================================= | |
class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect & Synthesizer.

    Combines all agent outputs into final actionable recommendations.
    This is the final step that produces actionable output for the user.

    Intended model: Llama 3.3 70B (128K context for combined inputs, strong
    synthesis capability), temperature 0.3. This class does not set either
    value itself; it only passes ``agent_name="head_synthesizer"`` to the
    client.
    """

    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect specializing in synthesis and actionable recommendations.
## YOUR ROLE IN THE PIPELINE
You are Agent 4 of 4 β the HEAD Synthesizer in the Design System Analysis pipeline.
- INPUT: Combined outputs from Rule Engine + AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices)
- OUTPUT: Final executive summary, scores, and prioritized action plan β displayed directly to the user
- You are the LAST agent. Your output IS the final result. Make it count.
## YOUR EXPERTISE
- Design system architecture and governance
- Synthesizing conflicting recommendations into coherent strategy
- Effort/impact prioritization (what to fix first)
- Color accessibility remediation (suggesting AA-compliant alternatives)
- Executive communication (clear, actionable summaries)
## QUALITY STANDARDS
- Executive Summary must be 2-3 sentences MAX. Lead with the overall score, then the #1 issue, then the #1 action.
- Overall Score must SYNTHESIZE all agent inputs β don't just average them.
- Color recommendations must include BOTH current AND suggested hex values.
- Top 3 Actions must be ordered by IMPACT, not ease.
- Accept/reject defaults on color recs: default to "accept" for accessibility fixes, "reject" for purely aesthetic changes.
## WHAT NOT TO DO
- Don't contradict previous agents without explaining why.
- Don't recommend changes that SENTINEL flagged as breaking.
- Don't suggest more than 8 color changes β the user will ignore a long list.
- Don't give vague actions like "improve accessibility" β be specific: "Change brand.primary from #06b2c4 to #0891a8 for 4.5:1 contrast".
- Don't inflate scores to be "nice". If the design system has issues, say so clearly.
## SCORING RUBRIC (Overall 0-100):
- 90-100: Production-ready design system, minor polishing only
- 75-89: Solid foundation, 2-3 targeted improvements needed
- 60-74: Functional but needs focused attention on accessibility or consistency
- 40-59: Significant gaps requiring systematic improvement
- 20-39: Major rework needed across multiple dimensions
- 0-19: Fundamental redesign recommended"""

    PROMPT_TEMPLATE = """Synthesize all analysis results into a final, actionable design system report.
## RULE ENGINE FACTS (Layer 1 β Free, deterministic)
- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100
## AURORA β Brand Identification (Agent 1)
- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10
## ATLAS β Benchmark Advice (Agent 2)
Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}
## SENTINEL β Best Practices Validation (Agent 3)
Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}
## ACCESSIBILITY FIXES NEEDED
{accessibility_fixes}
## YOUR TASK
Synthesize ALL the above into:
1. Executive Summary (2-3 sentences β lead with score, #1 issue, #1 action)
2. Overall Scores (synthesized, not averaged)
3. Top 3 Priority Actions (ordered by IMPACT, include effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation
7. Self-Evaluation of your synthesis
## OUTPUT FORMAT (JSON only)
{{
"executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
"scores": {{
"overall": <0-100>,
"accessibility": <0-100>,
"consistency": <0-100>,
"organization": <0-100>
}},
"benchmark_fit": {{
"closest": "<name>",
"similarity": "<X%>",
"recommendation": "Specific action to align"
}},
"brand_analysis": {{
"primary": "#hex",
"secondary": "#hex",
"cohesion": <1-10>
}},
"top_3_actions": [
{{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
],
"color_recommendations": [
{{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
],
"type_scale_recommendation": {{
"current_ratio": 1.18,
"recommended_ratio": 1.25,
"reason": "Why this ratio is better"
}},
"spacing_recommendation": {{
"current": "mixed",
"recommended": "8px",
"reason": "Why this grid is better"
}},
"self_evaluation": {{
"confidence": <1-10>,
"reasoning": "Why I am this confident in the synthesis",
"data_quality": "good|fair|poor",
"flags": []
}}
}}
Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the LLM client whose ``complete_async`` runs the prompt."""
        self.hf_client = hf_client

    @staticmethod
    def _labels(items: Any, key: str, limit: int = 3) -> str:
        """Join ``key`` of the first ``limit`` entries of ``items`` with "; ".

        Dict entries contribute ``entry.get(key, "")``; any other entry is
        rendered with ``str()``. A non-list ``items`` yields "".
        """
        if not isinstance(items, (list, tuple)):
            return ""
        return "; ".join(
            str(item.get(key, "")) if isinstance(item, dict) else str(item)
            for item in items[:limit]
        )

    @staticmethod
    def _typed(value: Any, expected: type, fallback: Any) -> Any:
        """Return ``value`` if it is an ``expected`` instance, else ``fallback``."""
        return value if isinstance(value, expected) else fallback

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Optional[Callable] = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Formats ``PROMPT_TEMPLATE`` from the rule-engine results and the
        three upstream agent outputs, awaits ``hf_client.complete_async``
        and parses the reply with ``_parse_response``.

        Args:
            rule_engine_results: Object exposing ``typography``, ``spacing``,
                ``color_stats``, ``accessibility`` and ``consistency_score``.
            benchmark_comparisons: Comparison list; only the first entry
                (``.benchmark.name``, ``.overall_match_pct``) is used.
            brand_identification: AURORA output.
            benchmark_advice: ATLAS output (``alignment_changes`` is read).
            best_practices: SENTINEL output.
            log_callback: Optional progress logging function.

        Returns:
            HeadSynthesis parsed from the LLM reply, or an empty
            ``HeadSynthesis()`` when the call, parsing or summary logging
            raises.
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("β" * 60)
        log("π§ LAYER 4: NEXUS β HEAD SYNTHESIZER (Llama 3.3 70B)")
        log("β" * 60)
        log("")
        log(" Combining: Rule Engine + AURORA + ATLAS + SENTINEL...")

        # Extract data
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        failures = [a for a in accessibility if not a.passes_aa_normal]
        # Filter BEFORE capping at five: slicing first would drop fixable
        # failures whenever earlier entries have no suggested fix.
        fixable = [a for a in failures if a.suggested_fix][:5]
        aa_fixes_str = "\n".join(
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) β {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in fixable
        )

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes=self._labels(benchmark_advice.alignment_changes, "change"),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes=self._labels(best_practices.priority_fixes, "issue"),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1200,
                json_mode=True,
            )
            duration = (datetime.now() - start_time).total_seconds()
            result = self._parse_response(response)

            log("")
            log(f" β NEXUS β HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            if result.scores:
                log(f" ββ Overall Score: {result.scores.get('overall', '?')}/100")
            log(f" ββ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            se = result.self_evaluation
            if se:
                log(f" ββ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            log("")
            return result
        except Exception as e:
            # Best effort: any failure degrades to an empty synthesis.
            log(f" ββ β οΈ Head Synthesizer failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse LLM response into HeadSynthesis.

        The first ``{`` through the last ``}`` of ``response`` is decoded as
        JSON. Fields that are missing, JSON ``null`` or of the wrong type
        fall back to an empty value of the expected type.

        Returns:
            A populated HeadSynthesis, or an empty ``HeadSynthesis()`` when
            no JSON object can be decoded.
        """
        if not isinstance(response, str):
            return HeadSynthesis()

        json_match = re.search(r'\{[\s\S]*\}', response)
        if not json_match:
            return HeadSynthesis()

        try:
            data = json.loads(json_match.group())
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; malformed output is
            # treated as "no result" rather than an error.
            return HeadSynthesis()
        if not isinstance(data, dict):
            return HeadSynthesis()

        typed = self._typed
        return HeadSynthesis(
            executive_summary=typed(data.get("executive_summary"), str, ""),
            scores=typed(data.get("scores"), dict, {}),
            benchmark_fit=typed(data.get("benchmark_fit"), dict, {}),
            brand_analysis=typed(data.get("brand_analysis"), dict, {}),
            top_3_actions=typed(data.get("top_3_actions"), list, []),
            color_recommendations=typed(data.get("color_recommendations"), list, []),
            type_scale_recommendation=typed(data.get("type_scale_recommendation"), dict, {}),
            spacing_recommendation=typed(data.get("spacing_recommendation"), dict, {}),
            self_evaluation=typed(data.get("self_evaluation"), dict, {}),
        )