Spaces:

riazmo
/

Design-System-Extractor-2

Sleeping

App Files Files Community

Design-System-Extractor-2 / agents /llm_agents.py

riazmo

Upload llm_agents.py

e8110e0 verified about 1 month ago

raw

history blame contribute delete

46.9 kB

	"""
	Stage 2 LLM Agents — Specialized Analysis Tasks
	=================================================

	These agents handle tasks that REQUIRE LLM reasoning:
	- Brand Identifier: Identify brand colors from usage context
	- Benchmark Advisor: Recommend best-fit design system
	- Best Practices Validator: Prioritize fixes by business impact
	- HEAD Synthesizer: Combine all outputs into final recommendations

	Each agent has a focused prompt for its specific task.
	"""

	import json
	import re
	from dataclasses import dataclass, field
	from typing import Optional, Callable, Any
	from datetime import datetime


	# =============================================================================
	# DATA CLASSES
	# =============================================================================

	@dataclass
	class BrandIdentification:
	    """Structured output of the Brand Identifier agent (AURORA).

	    Every field has a default, so the class can be instantiated with no
	    arguments.
	    """

	    # Brand role entries. Prompt schema: {color, confidence, reasoning, usage_count}
	    brand_primary: dict = field(default_factory=dict)
	    brand_secondary: dict = field(default_factory=dict)
	    brand_accent: dict = field(default_factory=dict)

	    # One of: complementary, analogous, triadic, monochromatic, random
	    palette_strategy: str = ""
	    # Rated on a 1-10 scale
	    cohesion_score: int = 5
	    cohesion_notes: str = ""

	    # {hex_color: suggested_name}
	    semantic_names: dict = field(default_factory=dict)

	    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
	    self_evaluation: dict = field(default_factory=dict)

	    def to_dict(self) -> dict:
	        """Return the fields as a plain dict keyed by field name.

	        Values are handed out by reference, not copied.
	        """
	        exported = (
	            "brand_primary",
	            "brand_secondary",
	            "brand_accent",
	            "palette_strategy",
	            "cohesion_score",
	            "cohesion_notes",
	            "semantic_names",
	            "self_evaluation",
	        )
	        return {attr: getattr(self, attr) for attr in exported}


	@dataclass
	class BenchmarkAdvice:
	    """Structured output of the Benchmark Advisor agent."""

	    recommended_benchmark: str = ""
	    recommended_benchmark_name: str = ""
	    reasoning: str = ""

	    # [{change, from, to, effort}]
	    alignment_changes: list = field(default_factory=list)

	    pros_of_alignment: list = field(default_factory=list)
	    cons_of_alignment: list = field(default_factory=list)

	    # [{name, reason}]
	    alternative_benchmarks: list = field(default_factory=list)

	    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
	    self_evaluation: dict = field(default_factory=dict)

	    def to_dict(self) -> dict:
	        """Return the advice as a plain dict.

	        Three keys are shortened relative to the attribute names:
	        ``pros``, ``cons`` and ``alternatives``. Values are not copied.
	        """
	        return dict(
	            recommended_benchmark=self.recommended_benchmark,
	            recommended_benchmark_name=self.recommended_benchmark_name,
	            reasoning=self.reasoning,
	            alignment_changes=self.alignment_changes,
	            pros=self.pros_of_alignment,
	            cons=self.cons_of_alignment,
	            alternatives=self.alternative_benchmarks,
	            self_evaluation=self.self_evaluation,
	        )


	@dataclass
	class BestPracticesResult:
	    """Structured output of the Best Practices Validator agent."""

	    # Scored on a 0-100 scale
	    overall_score: int = 50

	    # {check_name: {status: pass/warn/fail, note: str}}
	    checks: dict = field(default_factory=dict)

	    # [{rank, issue, impact, effort, action}]
	    priority_fixes: list = field(default_factory=list)

	    passing_practices: list = field(default_factory=list)
	    failing_practices: list = field(default_factory=list)

	    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
	    self_evaluation: dict = field(default_factory=dict)

	    def to_dict(self) -> dict:
	        """Return the result as a plain dict.

	        ``passing_practices`` / ``failing_practices`` are exported under the
	        shorter keys ``passing`` / ``failing``. Values are not copied.
	        """
	        key_to_attr = (
	            ("overall_score", "overall_score"),
	            ("checks", "checks"),
	            ("priority_fixes", "priority_fixes"),
	            ("passing", "passing_practices"),
	            ("failing", "failing_practices"),
	            ("self_evaluation", "self_evaluation"),
	        )
	        return {key: getattr(self, attr) for key, attr in key_to_attr}


	@dataclass
	class HeadSynthesis:
	    """Final synthesized output produced by the HEAD agent."""

	    executive_summary: str = ""

	    # {overall, accessibility, consistency, organization}
	    scores: dict = field(default_factory=dict)

	    # {closest, similarity, recommendation}
	    benchmark_fit: dict = field(default_factory=dict)

	    # {primary, secondary, cohesion}
	    brand_analysis: dict = field(default_factory=dict)

	    # [{action, impact, effort, details}]
	    top_3_actions: list = field(default_factory=list)

	    # [{role, current, suggested, reason, accept}]
	    color_recommendations: list = field(default_factory=list)

	    type_scale_recommendation: dict = field(default_factory=dict)
	    spacing_recommendation: dict = field(default_factory=dict)

	    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
	    self_evaluation: dict = field(default_factory=dict)

	    def to_dict(self) -> dict:
	        """Return the synthesis as a plain dict keyed by field name.

	        Values are handed out by reference, not copied.
	        """
	        payload = {}
	        # Headline material first ...
	        payload["executive_summary"] = self.executive_summary
	        payload["scores"] = self.scores
	        payload["benchmark_fit"] = self.benchmark_fit
	        payload["brand_analysis"] = self.brand_analysis
	        # ... then the concrete recommendations ...
	        payload["top_3_actions"] = self.top_3_actions
	        payload["color_recommendations"] = self.color_recommendations
	        payload["type_scale_recommendation"] = self.type_scale_recommendation
	        payload["spacing_recommendation"] = self.spacing_recommendation
	        # ... and the agent's own confidence report last.
	        payload["self_evaluation"] = self.self_evaluation
	        return payload


	# =============================================================================
	# BRAND IDENTIFIER AGENT
	# =============================================================================

	class BrandIdentifierAgent:
	"""
	AURORA — Senior Brand Color Analyst.

	Identifies brand colors from usage context using creative/visual reasoning.
	Model: Qwen 72B (strong creative reasoning, color harmony assessment)
	Temperature: 0.4 (allows creative interpretation of color stories)

	WHY LLM: Requires understanding context (33 buttons = likely brand primary),
	not just color math.
	"""

	SYSTEM_PROMPT = """You are AURORA, a Senior Brand Color Analyst specializing in visual identity systems.

	## YOUR ROLE IN THE PIPELINE
	You are Agent 1 of 4 in the Design System Analysis pipeline.
	- INPUT: Raw color tokens with usage counts + semantic CSS analysis from Stage 1 extraction
	- OUTPUT: Brand color identification + palette strategy → feeds into NEXUS (Agent 4) for final synthesis
	- Your analysis directly influences the final color recommendations shown to the user.

	## YOUR EXPERTISE
	- Color harmony theory (complementary, analogous, triadic, split-complementary, monochromatic)
	- Brand identity systems (primary/secondary/accent hierarchy)
	- CSS context interpretation (button colors = likely CTA, background colors = likely neutral)
	- Color naming conventions (design token naming: brand.primary, text.secondary, etc.)

	## QUALITY STANDARDS
	- Brand Primary MUST have HIGH confidence if one color dominates buttons/CTAs. Say "low" if ambiguous.
	- Cohesion Score: Use the FULL 1-10 range. A score of 7+ means clear intentional harmony. Most sites score 5-7.
	- If fewer than 5 unique colors exist, flag as "insufficient_data" — don't guess relationships.

	## WHAT NOT TO DO
	- Don't inflate confidence. "Medium" is fine when usage patterns are unclear.
	- Don't guess accent colors if none exist — use null.
	- Don't assume complementary strategy just because two colors differ — check the actual hue relationship.
	- Don't name colors generically. Use semantic design-token style names (brand.primary, not "blue").

	## SCORING RUBRIC (Cohesion 1-10):
	- 9-10: Clear harmony rule across all colors, distinct brand identity, consistent palette
	- 7-8: Mostly harmonious, clear brand identity, minor inconsistencies
	- 5-6: Some color relationships visible but not systematic
	- 3-4: Random-feeling palette, no clear color strategy
	- 1-2: Actively conflicting colors, no brand identity visible"""

	PROMPT_TEMPLATE = """Analyze the following color usage data and identify the brand color system.

	## COLOR DATA WITH USAGE CONTEXT

	{color_data}

	## SEMANTIC ANALYSIS (from CSS properties)

	{semantic_analysis}

	## YOUR TASK

	1. Identify Brand Colors:
	- Brand Primary: The main action/CTA color (highest visibility in buttons, links, key UI)
	- Brand Secondary: Supporting brand color (headers, secondary actions)
	- Brand Accent: Highlight color for emphasis (badges, alerts, special states)

	2. Assess Palette Strategy: complementary, analogous, triadic, monochromatic, or random?

	3. Rate Cohesion (1-10) using the rubric above

	4. Suggest Semantic Names for top 10 most-used colors (design-token format)

	5. Self-Evaluate your analysis quality

	## OUTPUT FORMAT (JSON only)

	{{
	"brand_primary": {{
	"color": "#hex",
	"confidence": "high\|medium\|low",
	"reasoning": "Why this is brand primary — cite specific usage evidence",
	"usage_count": <number>
	}},
	"brand_secondary": {{
	"color": "#hex",
	"confidence": "high\|medium\|low",
	"reasoning": "..."
	}},
	"brand_accent": {{
	"color": "#hex or null",
	"confidence": "...",
	"reasoning": "..."
	}},
	"palette_strategy": "complementary\|analogous\|triadic\|monochromatic\|random",
	"cohesion_score": <1-10>,
	"cohesion_notes": "Assessment of how well colors work together",
	"semantic_names": {{
	"#hex1": "brand.primary",
	"#hex2": "text.primary",
	"#hex3": "background.primary"
	}},
	"self_evaluation": {{
	"confidence": <1-10>,
	"reasoning": "Why I am this confident in my analysis",
	"data_quality": "good\|fair\|poor",
	"flags": []
	}}
	}}

	Return ONLY valid JSON."""

	def __init__(self, hf_client):
	    """Store the LLM client used by this agent.

	    Args:
	        hf_client: Object whose ``complete_async(...)`` coroutine is awaited
	            by ``analyze``.
	    """
	    self.hf_client = hf_client

	async def analyze(
	    self,
	    color_tokens: dict,
	    semantic_analysis: dict,
	    log_callback: Optional[Callable] = None,
	) -> BrandIdentification:
	    """
	    Identify brand colors from usage context.

	    Builds the prompt from the Stage 1 data, awaits
	    ``self.hf_client.complete_async`` and parses the reply. A failure while
	    building the prompt, calling the model or parsing the reply is logged
	    and answered with an empty ``BrandIdentification`` instead of being
	    raised.

	    Args:
	        color_tokens: Dict of color tokens with usage data
	        semantic_analysis: Semantic categorization from Stage 1
	        log_callback: Optional progress logging function (called with one string)

	    Returns:
	        BrandIdentification with identified colors (all defaults on failure)
	    """
	    def log(msg: str):
	        if log_callback:
	            log_callback(msg)

	    def shown(entry, key: str):
	        # The reply is model output: a role may come back as null or a bare
	        # string, and a summary line must never discard a usable result.
	        return entry.get(key, '?') if isinstance(entry, dict) else '?'

	    log(" 🎨 AURORA — Brand Identifier (Qwen 72B)")
	    log(" └─ Analyzing color context and usage patterns...")

	    try:
	        # Formatting is inside the try so malformed Stage 1 data degrades to
	        # the empty result, the same way an LLM failure does.
	        prompt = self.PROMPT_TEMPLATE.format(
	            color_data=self._format_color_data(color_tokens),
	            semantic_analysis=self._format_semantic_analysis(semantic_analysis),
	        )

	        start_time = datetime.now()

	        response = await self.hf_client.complete_async(
	            agent_name="brand_identifier",
	            system_prompt=self.SYSTEM_PROMPT,
	            user_message=prompt,
	            max_tokens=1000,
	            json_mode=True,
	        )

	        duration = (datetime.now() - start_time).total_seconds()

	        result = self._parse_response(response)

	    except Exception as e:
	        error_msg = str(e)
	        # Log the first 120 characters of the error for diagnosis
	        log(f" ⚠️ Brand Identifier failed: {error_msg[:120]}")
	        lowered = error_msg.lower()
	        if "gated" in lowered or "access" in lowered:
	            log(" └─ Model may require license acceptance at huggingface.co")
	        elif "Rate limit" in error_msg or "429" in error_msg:
	            log(" └─ HF free tier rate limit — wait or upgrade to Pro")
	        return BrandIdentification()

	    # Summary logging happens after the try block: a formatting problem here
	    # must not turn a successful analysis into a reported failure.
	    log(" ────────────────────────────────────────────────")
	    log(f" 🎨 AURORA — Brand Identifier: COMPLETE ({duration:.1f}s)")
	    log(f" ├─ Brand Primary: {shown(result.brand_primary, 'color')} ({shown(result.brand_primary, 'confidence')} confidence)")
	    log(f" ├─ Brand Secondary: {shown(result.brand_secondary, 'color')}")
	    log(f" ├─ Palette Strategy: {result.palette_strategy}")
	    log(f" ├─ Cohesion Score: {result.cohesion_score}/10")
	    se = result.self_evaluation
	    if se:
	        log(f" └─ Self-Eval: confidence={shown(se, 'confidence')}/10, data={shown(se, 'data_quality')}")

	    return result

	def _format_color_data(self, color_tokens: dict) -> str:
	"""Format color tokens for prompt."""
	lines = []
	for name, token in list(color_tokens.items())[:30]:
	if isinstance(token, dict):
	hex_val = token.get("value", token.get("hex", ""))
	usage = token.get("usage_count", token.get("count", 1))
	context = token.get("context", token.get("css_property", ""))
	else:
	hex_val = getattr(token, "value", "")
	usage = getattr(token, "usage_count", 1)
	context = getattr(token, "context", "")

	if hex_val:
	lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

	return "\n".join(lines) if lines else "No color data available"

	def _format_semantic_analysis(self, semantic: dict) -> str:
	"""Format semantic analysis for prompt."""
	if not semantic:
	return "No semantic analysis available"

	lines = []
	try:
	for category, value in semantic.items():
	if not value:
	continue

	if isinstance(value, list):
	# List of colors
	color_list = []
	for c in value[:5]:
	if isinstance(c, dict):
	color_list.append(c.get("hex", c.get("value", str(c))))
	else:
	color_list.append(str(c))
	lines.append(f"- {category}: {', '.join(color_list)}")

	elif isinstance(value, dict):
	# Could be a nested dict of sub-roles → color dicts
	# e.g. {"primary": {"hex": "#007bff", ...}, "secondary": {...}}
	# or a flat color dict {"hex": "#...", "confidence": "..."}
	# or a summary dict {"total_colors_analyzed": 50, ...}
	if "hex" in value:
	# Flat color dict
	lines.append(f"- {category}: {value['hex']}")
	else:
	# Nested dict — iterate sub-roles
	sub_items = []
	for sub_role, sub_val in list(value.items())[:5]:
	if isinstance(sub_val, dict) and "hex" in sub_val:
	sub_items.append(f"{sub_role}={sub_val['hex']}")
	elif isinstance(sub_val, (str, int, float, bool)):
	sub_items.append(f"{sub_role}={sub_val}")
	if sub_items:
	lines.append(f"- {category}: {', '.join(sub_items)}")
	else:
	lines.append(f"- {category}: {value}")
	except Exception as e:
	return f"Error formatting semantic analysis: {str(e)[:50]}"

	return "\n".join(lines) if lines else "No semantic analysis available"

	def _parse_response(self, response: str) -> BrandIdentification:
	    """Parse LLM response into BrandIdentification.

	    The outermost ``{...}`` span of the reply is decoded as JSON. Fields
	    that are missing, ``null`` or of the wrong container type fall back to
	    the dataclass-style defaults; the prompt explicitly allows ``null`` for
	    a missing accent color, so ``null`` is an expected value here.

	    Args:
	        response: Raw text returned by the model.

	    Returns:
	        The parsed result, or an all-default ``BrandIdentification`` when
	        the reply contains no decodable JSON object.
	    """
	    def as_dict(value) -> dict:
	        # JSON null / strings / lists must not leak into dict-typed fields.
	        return value if isinstance(value, dict) else {}

	    try:
	        json_match = re.search(r'\{[\s\S]*\}', response)
	        if json_match is None:
	            return BrandIdentification()
	        data = json.loads(json_match.group())
	    except (TypeError, ValueError):
	        # TypeError: reply was not text; ValueError: malformed JSON
	        # (json.JSONDecodeError is a ValueError subclass).
	        return BrandIdentification()

	    score = data.get("cohesion_score")
	    return BrandIdentification(
	        brand_primary=as_dict(data.get("brand_primary")),
	        brand_secondary=as_dict(data.get("brand_secondary")),
	        brand_accent=as_dict(data.get("brand_accent")),
	        palette_strategy=data.get("palette_strategy") or "unknown",
	        cohesion_score=5 if score is None else score,
	        cohesion_notes=data.get("cohesion_notes") or "",
	        semantic_names=as_dict(data.get("semantic_names")),
	        self_evaluation=as_dict(data.get("self_evaluation")),
	    )


	# =============================================================================
	# BENCHMARK ADVISOR AGENT
	# =============================================================================

	class BenchmarkAdvisorAgent:
	"""
	ATLAS — Senior Design System Benchmark Analyst.

	Recommends best-fit design system based on comparison data.
	Model: Llama 3.3 70B (128K context for large benchmark data, excellent comparative reasoning)
	Temperature: 0.25 (analytical, data-driven comparison)

	WHY LLM: Requires reasoning about trade-offs and use-case fit,
	not just similarity scores.
	"""

	SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst specializing in cross-system comparison and alignment strategy.

	## YOUR ROLE IN THE PIPELINE
	You are Agent 2 of 4 in the Design System Analysis pipeline.
	- INPUT: User's extracted type scale, spacing, and font sizes + benchmark comparison data from the Rule Engine
	- OUTPUT: Benchmark recommendation with alignment roadmap → feeds into NEXUS (Agent 4) for final synthesis
	- Your recommendation helps the user decide which established design system to align with.

	## YOUR EXPERTISE
	- Deep knowledge of Material Design 3, Apple HIG, IBM Carbon, Ant Design, Atlassian, Tailwind CSS, Bootstrap
	- Type scale mathematics (major/minor second/third, perfect fourth/fifth, golden ratio)
	- Spacing grid systems (4px, 8px, multiples) and their trade-offs
	- Migration effort estimation for design system alignment

	## QUALITY STANDARDS
	- Always consider BOTH similarity score AND use-case fit. Closest match ≠ best fit.
	- Recommend max 4 alignment changes. More than that = the benchmark is not a good fit.
	- Effort estimates must be realistic: "low" = CSS variable change, "medium" = component updates, "high" = layout restructuring.
	- If similarity is above 85%, say "already well-aligned" and suggest minimal changes only.

	## WHAT NOT TO DO
	- Don't always recommend the closest match — a system 5% less similar but much better suited is preferable.
	- Don't list generic pros/cons. Be specific to the user's actual values.
	- Don't suggest alignment changes that would break accessibility (e.g., smaller base font).
	- Don't recommend obscure or abandoned design systems.

	## SCORING RUBRIC (Benchmark Fit):
	- Excellent Fit: >85% match, same use-case category, < 3 changes needed
	- Good Fit: 70-85% match, compatible use-case, 3-4 changes needed
	- Fair Fit: 50-70% match, different trade-offs to consider, 4+ changes
	- Poor Fit: <50% match, fundamentally different approach — don't recommend"""

	PROMPT_TEMPLATE = """Analyze the following benchmark comparison data and recommend the best design system alignment.

	## USER'S CURRENT VALUES

	- Type Scale Ratio: {user_ratio}
	- Base Font Size: {user_base}px
	- Spacing Grid: {user_spacing}px

	## BENCHMARK COMPARISON

	{benchmark_comparison}

	## YOUR TASK

	1. Recommend Best Fit: Which design system should they align with? Consider use-case fit, not just numbers.
	2. Explain Why: Cite specific data points (similarity scores, ratio differences, spacing alignment).
	3. List Changes Needed: What would they need to change? Include effort estimates.
	4. Pros/Cons: Specific to this user's values, not generic statements.
	5. Self-Evaluate your recommendation quality.

	## OUTPUT FORMAT (JSON only)

	{{
	"recommended_benchmark": "<system_key>",
	"recommended_benchmark_name": "<full name>",
	"reasoning": "Why this is the best fit — cite specific data",
	"alignment_changes": [
	{{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
	{{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
	],
	"pros_of_alignment": [
	"Specific benefit with data"
	],
	"cons_of_alignment": [
	"Specific trade-off"
	],
	"alternative_benchmarks": [
	{{"name": "Material Design 3", "reason": "Good for Android-first products"}}
	],
	"self_evaluation": {{
	"confidence": <1-10>,
	"reasoning": "Why I am this confident",
	"data_quality": "good\|fair\|poor",
	"flags": []
	}}
	}}

	Return ONLY valid JSON."""

	def __init__(self, hf_client):
	    """Store the LLM client used by this agent.

	    Args:
	        hf_client: Object whose ``complete_async(...)`` coroutine is awaited
	            by ``analyze``.
	    """
	    self.hf_client = hf_client

	async def analyze(
	    self,
	    user_ratio: float,
	    user_base: int,
	    user_spacing: int,
	    benchmark_comparisons: list,
	    log_callback: Optional[Callable] = None,
	) -> BenchmarkAdvice:
	    """
	    Recommend best-fit design system.

	    Builds the prompt from the user's values and the benchmark comparisons,
	    awaits ``self.hf_client.complete_async`` and parses the reply. A failure
	    while building the prompt, calling the model or parsing the reply is
	    logged and answered with an empty ``BenchmarkAdvice`` instead of being
	    raised.

	    Args:
	        user_ratio: User's detected type scale ratio
	        user_base: User's base font size
	        user_spacing: User's spacing grid base
	        benchmark_comparisons: List of BenchmarkComparison objects
	        log_callback: Optional progress logging function (called with one string)

	    Returns:
	        BenchmarkAdvice with recommendations (all defaults on failure)
	    """
	    def log(msg: str):
	        if log_callback:
	            log_callback(msg)

	    log("")
	    log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)")
	    log(" └─ Evaluating benchmark fit for your use case...")

	    try:
	        # Formatting is inside the try so malformed comparison data degrades
	        # to the empty result, the same way an LLM failure does.
	        prompt = self.PROMPT_TEMPLATE.format(
	            user_ratio=user_ratio,
	            user_base=user_base,
	            user_spacing=user_spacing,
	            benchmark_comparison=self._format_comparisons(benchmark_comparisons),
	        )

	        start_time = datetime.now()

	        response = await self.hf_client.complete_async(
	            agent_name="benchmark_advisor",
	            system_prompt=self.SYSTEM_PROMPT,
	            user_message=prompt,
	            max_tokens=900,
	            json_mode=True,
	        )

	        duration = (datetime.now() - start_time).total_seconds()

	        result = self._parse_response(response)

	    except Exception as e:
	        log(f" ├─ ⚠️ Benchmark Advisor failed: {str(e)[:120]}")
	        return BenchmarkAdvice()

	    # The reply is model output, so the shape of alignment_changes is not
	    # guaranteed. Summarise defensively: a log line must never discard a
	    # parsed recommendation.
	    changes = result.alignment_changes if isinstance(result.alignment_changes, list) else []
	    if not changes:
	        key_change = 'None'
	    elif isinstance(changes[0], dict):
	        key_change = changes[0].get('change', 'N/A')
	    else:
	        key_change = 'N/A'

	    log(" ────────────────────────────────────────────────")
	    log(f" 🏢 ATLAS — Benchmark Advisor: COMPLETE ({duration:.1f}s)")
	    log(f" ├─ Recommended: {result.recommended_benchmark_name}")
	    log(f" ├─ Changes Needed: {len(changes)}")
	    log(f" ├─ Key Change: {key_change}")
	    se = result.self_evaluation
	    if isinstance(se, dict) and se:
	        log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

	    return result

	def _format_comparisons(self, comparisons: list) -> str:
	"""Format benchmark comparisons for prompt."""
	lines = []
	for i, c in enumerate(comparisons[:5]):
	b = c.benchmark
	lines.append(f"""
	{i+1}. {b.icon} {b.name}
	- Similarity Score: {c.similarity_score:.2f} (lower = better)
	- Match: {c.overall_match_pct:.0f}%
	- Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
	- Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
	- Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
	- Best For: {', '.join(b.best_for)}""")

	return "\n".join(lines)

	def _parse_response(self, response: str) -> BenchmarkAdvice:
	    """Parse LLM response into BenchmarkAdvice.

	    The outermost ``{...}`` span of the reply is decoded as JSON. Fields
	    that are missing, ``null`` or of the wrong container type fall back to
	    empty defaults, so list/dict fields of the result are always real
	    lists/dicts.

	    Args:
	        response: Raw text returned by the model.

	    Returns:
	        The parsed advice, or an all-default ``BenchmarkAdvice`` when the
	        reply contains no decodable JSON object.
	    """
	    def as_list(value) -> list:
	        return value if isinstance(value, list) else []

	    try:
	        json_match = re.search(r'\{[\s\S]*\}', response)
	        if json_match is None:
	            return BenchmarkAdvice()
	        data = json.loads(json_match.group())
	    except (TypeError, ValueError):
	        # TypeError: reply was not text; ValueError: malformed JSON
	        # (json.JSONDecodeError is a ValueError subclass).
	        return BenchmarkAdvice()

	    evaluation = data.get("self_evaluation")
	    return BenchmarkAdvice(
	        recommended_benchmark=data.get("recommended_benchmark") or "",
	        recommended_benchmark_name=data.get("recommended_benchmark_name") or "",
	        reasoning=data.get("reasoning") or "",
	        alignment_changes=as_list(data.get("alignment_changes")),
	        pros_of_alignment=as_list(data.get("pros_of_alignment")),
	        cons_of_alignment=as_list(data.get("cons_of_alignment")),
	        alternative_benchmarks=as_list(data.get("alternative_benchmarks")),
	        self_evaluation=evaluation if isinstance(evaluation, dict) else {},
	    )


	# =============================================================================
	# BEST PRACTICES VALIDATOR AGENT
	# =============================================================================

	class BestPracticesValidatorAgent:
	"""
	SENTINEL — Design System Best Practices Auditor.

	Validates against design system standards and prioritizes fixes by business impact.
	Model: Qwen 72B (methodical rule-following, precise judgment, structured output)
	Temperature: 0.2 (strict, consistent rule evaluation)

	WHY LLM: Prioritization requires judgment about business impact,
	not just checking boxes.
	"""

	SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor specializing in standards compliance and impact-based prioritization.

	## YOUR ROLE IN THE PIPELINE
	You are Agent 3 of 4 in the Design System Analysis pipeline.
	- INPUT: Rule Engine analysis results (typography, accessibility, spacing, color stats)
	- OUTPUT: Compliance score + prioritized fix list → feeds into NEXUS (Agent 4) for final synthesis
	- Your score directly appears on the user's dashboard. Your priority fixes become the action items.

	## YOUR EXPERTISE
	- WCAG 2.1 AA/AAA accessibility standards
	- Design system best practices (Material Design, Apple HIG, Tailwind conventions)
	- Typography systems (modular scales, vertical rhythm, readability)
	- Color management (palette size limits, near-duplicate detection, contrast requirements)
	- Spacing systems (grid alignment, consistency, component density)

	## QUALITY STANDARDS
	- Overall Score MUST reflect actual data. Don't default to 50.
	- Use the FULL 0-100 range: 90+ = excellent, 70-89 = good, 50-69 = needs work, <50 = significant issues
	- Priority fixes must be ACTIONABLE — include specific values to change (e.g., "Change #06b2c4 → #0891a8")
	- Maximum 5 priority fixes. If more, focus on highest-impact items.

	## WHAT NOT TO DO
	- Don't pass checks that clearly fail based on the data.
	- Don't inflate scores to be "encouraging" — honest assessment helps the user.
	- Don't list fixes without effort estimates — the user needs to plan their work.
	- Don't mix up "warn" and "fail": warn = imperfect but functional, fail = violates a standard.

	## SCORING RUBRIC (Overall Score 0-100):
	- 90-100: All checks pass, excellent accessibility, clean palette, consistent grid
	- 75-89: Most checks pass, minor issues in 1-2 areas, good foundation
	- 60-74: Several warnings, 1-2 failures, needs focused improvement
	- 40-59: Multiple failures, significant accessibility gaps, inconsistent system
	- 20-39: Fundamental issues across multiple areas, major rework needed
	- 0-19: Barely qualifies as a design system, almost everything fails

	## CHECK WEIGHTING:
	- AA Compliance: 25 points (most critical — affects real users)
	- Type Scale Consistency: 15 points
	- Type Scale Standard Ratio: 10 points
	- Base Size Accessible: 15 points
	- Spacing Grid: 15 points
	- Color Count: 5 points
	- No Near-Duplicates: 5 points
	- Shadow System: 10 points (elevation hierarchy, consistency)

	## SHADOW SYSTEM BEST PRACTICES:
	- Use 3-6 elevation levels (xs, sm, md, lg, xl, 2xl)
	- Consistent Y-offset progression (shadows should grow with elevation)
	- Blur radius should increase with elevation (more blur = higher elevation)
	- Shadow colors should be neutral (black/gray with alpha) or brand-colored with low opacity
	- Avoid shadows with 0 blur (looks harsh/flat)
	- Avoid excessive blur (>32px for most use cases)"""

	PROMPT_TEMPLATE = """Validate the following design tokens against best practices and prioritize fixes.

	## RULE ENGINE ANALYSIS RESULTS

	### Typography
	- Detected Ratio: {type_ratio} ({type_consistent})
	- Base Size: {base_size}px
	- Recommendation: {type_recommendation}

	### Accessibility
	- Total Colors: {total_colors}
	- AA Pass: {aa_pass}
	- AA Fail: {aa_fail}
	- Failing Colors: {failing_colors}

	### Spacing
	- Detected Base: {spacing_base}px
	- Grid Aligned: {spacing_aligned}%
	- Recommendation: {spacing_recommendation}px

	### Color Statistics
	- Unique Colors: {unique_colors}
	- Duplicates: {duplicates}
	- Near-Duplicates: {near_duplicates}

	### Shadow System
	- Total Shadows: {shadow_count}
	- Shadow Values: {shadow_values}

	## BEST PRACTICES CHECKLIST (check each one)

	1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
	2. Type scale is consistent (variance < 0.15)
	3. Base font size >= 16px (accessibility)
	4. All interactive colors pass WCAG AA (4.5:1 contrast)
	5. Spacing uses consistent grid (4px or 8px base)
	6. Limited color palette (< 20 unique semantic colors)
	7. No near-duplicate colors (< 3 delta-E apart)
	8. Shadow system has consistent elevation hierarchy (blur/Y-offset increase together)

	## YOUR TASK

	1. Score each practice: pass/warn/fail with specific notes citing the data
	2. Calculate overall score (0-100) using the weighting rubric
	3. Identify TOP 3-5 priority fixes with impact and effort assessment
	4. Self-evaluate your analysis

	## OUTPUT FORMAT (JSON only)

	{{
	"overall_score": <0-100>,
	"checks": {{
	"type_scale_standard": {{"status": "pass\|warn\|fail", "note": "..."}},
	"type_scale_consistent": {{"status": "...", "note": "..."}},
	"base_size_accessible": {{"status": "...", "note": "..."}},
	"aa_compliance": {{"status": "...", "note": "..."}},
	"spacing_grid": {{"status": "...", "note": "..."}},
	"color_count": {{"status": "...", "note": "..."}},
	"near_duplicates": {{"status": "...", "note": "..."}},
	"shadow_system": {{"status": "...", "note": "Elevation hierarchy, blur consistency, color appropriateness"}}
	}},
	"priority_fixes": [
	{{
	"rank": 1,
	"issue": "Brand primary fails AA",
	"impact": "high\|medium\|low",
	"effort": "low\|medium\|high",
	"action": "Change #06b2c4 → #0891a8 for 4.5:1 contrast"
	}}
	],
	"passing_practices": ["Base font size", "..."],
	"failing_practices": ["AA compliance", "..."],
	"self_evaluation": {{
	"confidence": <1-10>,
	"reasoning": "Why I am this confident",
	"data_quality": "good\|fair\|poor",
	"flags": []
	}}
	}}

	Return ONLY valid JSON."""

	def __init__(self, hf_client):
	    """Store the LLM client used by this agent.

	    Args:
	        hf_client: Object whose ``complete_async(...)`` coroutine is awaited
	            by ``analyze``.
	    """
	    self.hf_client = hf_client

	async def analyze(
	    self,
	    rule_engine_results: Any,
	    shadow_tokens: Optional[dict] = None,
	    log_callback: Optional[Callable] = None,
	) -> BestPracticesResult:
	    """
	    Validate against best practices.

	    Reads ``typography``, ``spacing``, ``color_stats`` and ``accessibility``
	    from ``rule_engine_results``, builds the prompt, awaits
	    ``self.hf_client.complete_async`` and parses the reply. A failure while
	    reading the rule-engine data, building the prompt, calling the model or
	    parsing the reply is logged and answered with an empty
	    ``BestPracticesResult`` instead of being raised.

	    Args:
	        rule_engine_results: Results from rule engine
	        shadow_tokens: Shadow tokens dict {name: {value: "..."}}
	        log_callback: Optional progress logging function (called with one string)

	    Returns:
	        BestPracticesResult with validation (all defaults on failure)
	    """
	    def log(msg: str):
	        if log_callback:
	            log_callback(msg)

	    def count(items) -> int:
	        # Model output: the practice lists may come back as null.
	        return len(items) if isinstance(items, (list, tuple)) else 0

	    log("")
	    log(" ✅ SENTINEL — Best Practices Validator (Qwen 72B)")
	    log(" └─ Checking against design system standards...")

	    try:
	        # Data extraction and prompt building are inside the try so that
	        # incomplete rule-engine output degrades to the empty result, the
	        # same way an LLM failure does.
	        typo = rule_engine_results.typography
	        spacing = rule_engine_results.spacing
	        color_stats = rule_engine_results.color_stats
	        accessibility = rule_engine_results.accessibility

	        failures = [a for a in accessibility if not a.passes_aa_normal]
	        # Only the first five failing colors are listed in the prompt.
	        failing_colors_str = ", ".join(
	            f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]
	        )

	        # Format shadow data for the prompt (first six tokens, 50 chars each)
	        shadow_count = len(shadow_tokens) if shadow_tokens else 0
	        shadow_values_str = "None detected"
	        if shadow_count > 0:
	            shadow_list = []
	            for name, s in list(shadow_tokens.items())[:6]:
	                if isinstance(s, dict):
	                    # str(...) so a non-string value cannot break the slice
	                    val = str(s.get("value") or "")
	                else:
	                    val = str(s)
	                shadow_list.append(f"{name}: {val[:50]}")
	            shadow_values_str = "; ".join(shadow_list)

	        prompt = self.PROMPT_TEMPLATE.format(
	            type_ratio=f"{typo.detected_ratio:.3f}",
	            type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
	            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
	            type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
	            total_colors=len(accessibility),
	            aa_pass=len(accessibility) - len(failures),
	            aa_fail=len(failures),
	            failing_colors=failing_colors_str or "None",
	            spacing_base=spacing.detected_base,
	            spacing_aligned=f"{spacing.alignment_percentage:.0f}",
	            spacing_recommendation=spacing.recommendation,
	            unique_colors=color_stats.unique_count,
	            duplicates=color_stats.duplicate_count,
	            near_duplicates=len(color_stats.near_duplicates),
	            shadow_count=shadow_count,
	            shadow_values=shadow_values_str,
	        )

	        start_time = datetime.now()

	        response = await self.hf_client.complete_async(
	            agent_name="best_practices_validator",
	            system_prompt=self.SYSTEM_PROMPT,
	            user_message=prompt,
	            max_tokens=1000,
	            json_mode=True,
	        )

	        duration = (datetime.now() - start_time).total_seconds()

	        result = self._parse_response(response)

	    except Exception as e:
	        log(f" ├─ ⚠️ Best Practices Validator failed: {str(e)[:120]}")
	        return BestPracticesResult()

	    # Summary logging happens after the try block and tolerates odd shapes:
	    # a log line must never discard a parsed result.
	    log(" ────────────────────────────────────────────────")
	    log(f" ✅ SENTINEL — Best Practices: COMPLETE ({duration:.1f}s)")
	    log(f" ├─ Overall Score: {result.overall_score}/100")
	    log(f" ├─ Passing: {count(result.passing_practices)} | Failing: {count(result.failing_practices)}")
	    if isinstance(result.priority_fixes, list) and result.priority_fixes:
	        top_fix = result.priority_fixes[0]
	        issue = top_fix.get('issue', 'N/A') if isinstance(top_fix, dict) else top_fix
	        log(f" ├─ Top Fix: {issue}")
	    se = result.self_evaluation
	    if isinstance(se, dict) and se:
	        log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

	    return result

	def _parse_response(self, response: str) -> BestPracticesResult:
	    """Parse LLM response into BestPracticesResult.

	    The outermost ``{...}`` span of the reply is decoded as JSON. Fields
	    that are missing, ``null`` or of the wrong container type fall back to
	    the defaults (score 50, empty containers), so list/dict fields of the
	    result are always real lists/dicts.

	    Args:
	        response: Raw text returned by the model.

	    Returns:
	        The parsed result, or an all-default ``BestPracticesResult`` when
	        the reply contains no decodable JSON object.
	    """
	    def as_list(value) -> list:
	        return value if isinstance(value, list) else []

	    def as_dict(value) -> dict:
	        return value if isinstance(value, dict) else {}

	    try:
	        json_match = re.search(r'\{[\s\S]*\}', response)
	        if json_match is None:
	            return BestPracticesResult()
	        data = json.loads(json_match.group())
	    except (TypeError, ValueError):
	        # TypeError: reply was not text; ValueError: malformed JSON
	        # (json.JSONDecodeError is a ValueError subclass).
	        return BestPracticesResult()

	    score = data.get("overall_score")
	    return BestPracticesResult(
	        overall_score=50 if score is None else score,
	        checks=as_dict(data.get("checks")),
	        priority_fixes=as_list(data.get("priority_fixes")),
	        passing_practices=as_list(data.get("passing_practices")),
	        failing_practices=as_list(data.get("failing_practices")),
	        self_evaluation=as_dict(data.get("self_evaluation")),
	    )


	# =============================================================================
	# HEAD SYNTHESIZER AGENT
	# =============================================================================

	class HeadSynthesizerAgent:
	    """
	    NEXUS — Senior Design System Architect & Synthesizer.

	    Final stage of the pipeline: folds the rule-engine facts and the outputs
	    of the three upstream agents (AURORA, ATLAS, SENTINEL) into one prompt,
	    sends it through the LLM client and parses the JSON reply into a
	    HeadSynthesis that is returned to the caller.

	    Intended model: Llama 3.3 70B at temperature 0.3. Neither is set in this
	    class. NOTE(review): presumably the client resolves them from
	    agent_name="head_synthesizer" — confirm against the client config.
	    """

	    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect specializing in synthesis and actionable recommendations.

	## YOUR ROLE IN THE PIPELINE
	You are Agent 4 of 4 — the HEAD Synthesizer in the Design System Analysis pipeline.
	- INPUT: Combined outputs from Rule Engine + AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices)
	- OUTPUT: Final executive summary, scores, and prioritized action plan → displayed directly to the user
	- You are the LAST agent. Your output IS the final result. Make it count.

	## YOUR EXPERTISE
	- Design system architecture and governance
	- Synthesizing conflicting recommendations into coherent strategy
	- Effort/impact prioritization (what to fix first)
	- Color accessibility remediation (suggesting AA-compliant alternatives)
	- Executive communication (clear, actionable summaries)

	## QUALITY STANDARDS
	- Executive Summary must be 2-3 sentences MAX. Lead with the overall score, then the #1 issue, then the #1 action.
	- Overall Score must SYNTHESIZE all agent inputs — don't just average them.
	- Color recommendations must include BOTH current AND suggested hex values.
	- Top 3 Actions must be ordered by IMPACT, not ease.
	- Accept/reject defaults on color recs: default to "accept" for accessibility fixes, "reject" for purely aesthetic changes.

	## WHAT NOT TO DO
	- Don't contradict previous agents without explaining why.
	- Don't recommend changes that SENTINEL flagged as breaking.
	- Don't suggest more than 8 color changes — the user will ignore a long list.
	- Don't give vague actions like "improve accessibility" — be specific: "Change brand.primary from #06b2c4 to #0891a8 for 4.5:1 contrast".
	- Don't inflate scores to be "nice". If the design system has issues, say so clearly.

	## SCORING RUBRIC (Overall 0-100):
	- 90-100: Production-ready design system, minor polishing only
	- 75-89: Solid foundation, 2-3 targeted improvements needed
	- 60-74: Functional but needs focused attention on accessibility or consistency
	- 40-59: Significant gaps requiring systematic improvement
	- 20-39: Major rework needed across multiple dimensions
	- 0-19: Fundamental redesign recommended"""

	    # Filled with str.format(): single braces are placeholders, doubled
	    # braces are the literal JSON braces shown to the model.
	    PROMPT_TEMPLATE = """Synthesize all analysis results into a final, actionable design system report.

	## RULE ENGINE FACTS (Layer 1 — Free, deterministic)

	- Type Scale: {type_ratio} ({type_status})
	- Base Size: {base_size}px
	- AA Failures: {aa_failures}
	- Spacing Grid: {spacing_status}
	- Unique Colors: {unique_colors}
	- Consistency Score: {consistency_score}/100

	## AURORA — Brand Identification (Agent 1)

	- Brand Primary: {brand_primary}
	- Brand Secondary: {brand_secondary}
	- Palette Cohesion: {cohesion_score}/10

	## ATLAS — Benchmark Advice (Agent 2)

	Closest Match: {closest_benchmark}
	Match Percentage: {match_pct}%
	Recommended Changes: {benchmark_changes}

	## SENTINEL — Best Practices Validation (Agent 3)

	Overall Score: {best_practices_score}/100
	Priority Fixes: {priority_fixes}

	## ACCESSIBILITY FIXES NEEDED

	{accessibility_fixes}

	## YOUR TASK

	Synthesize ALL the above into:
	1. Executive Summary (2-3 sentences — lead with score, #1 issue, #1 action)
	2. Overall Scores (synthesized, not averaged)
	3. Top 3 Priority Actions (ordered by IMPACT, include effort estimates)
	4. Specific Color Recommendations (with accept/reject defaults)
	5. Type Scale Recommendation
	6. Spacing Recommendation
	7. Self-Evaluation of your synthesis

	## OUTPUT FORMAT (JSON only)

	{{
	"executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
	"scores": {{
	"overall": <0-100>,
	"accessibility": <0-100>,
	"consistency": <0-100>,
	"organization": <0-100>
	}},
	"benchmark_fit": {{
	"closest": "<name>",
	"similarity": "<X%>",
	"recommendation": "Specific action to align"
	}},
	"brand_analysis": {{
	"primary": "#hex",
	"secondary": "#hex",
	"cohesion": <1-10>
	}},
	"top_3_actions": [
	{{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
	],
	"color_recommendations": [
	{{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
	],
	"type_scale_recommendation": {{
	"current_ratio": 1.18,
	"recommended_ratio": 1.25,
	"reason": "Why this ratio is better"
	}},
	"spacing_recommendation": {{
	"current": "mixed",
	"recommended": "8px",
	"reason": "Why this grid is better"
	}},
	"self_evaluation": {{
	"confidence": <1-10>,
	"reasoning": "Why I am this confident in the synthesis",
	"data_quality": "good|fair|poor",
	"flags": []
	}}
	}}

	Return ONLY valid JSON."""

	    def __init__(self, hf_client):
	        """Keep the LLM client; synthesize() awaits its complete_async()."""
	        self.hf_client = hf_client

	    @staticmethod
	    def _join_field(items: Any, key: str, limit: int = 3) -> str:
	        """Join the ``key`` value of the first ``limit`` entries with "; ".

	        The entries come from upstream LLM agents, so their shape is not
	        guaranteed: dict entries contribute ``entry[key]`` (missing or null
	        becomes ""), bare strings contribute themselves, anything else
	        contributes "". Non-string values are stringified so the join
	        cannot raise.
	        """
	        if not isinstance(items, (list, tuple)):
	            return ""
	        parts = []
	        for entry in items[:limit]:
	            if isinstance(entry, dict):
	                value = entry.get(key)
	            elif isinstance(entry, str):
	                value = entry
	            else:
	                value = None
	            parts.append("" if value is None else str(value))
	        return "; ".join(parts)

	    @staticmethod
	    def _brand_color(role: Any) -> Any:
	        """Return ``role["color"]``, or "Unknown" when absent or not a dict."""
	        if isinstance(role, dict):
	            return role.get("color", "Unknown")
	        return "Unknown"

	    async def synthesize(
	        self,
	        rule_engine_results: Any,
	        benchmark_comparisons: list,
	        brand_identification: BrandIdentification,
	        benchmark_advice: BenchmarkAdvice,
	        best_practices: BestPracticesResult,
	        log_callback: Optional[Callable] = None,
	    ) -> HeadSynthesis:
	        """
	        Synthesize all results into final recommendations.

	        Args:
	            rule_engine_results: Object exposing ``typography``, ``spacing``,
	                ``color_stats``, ``accessibility`` and ``consistency_score``.
	            benchmark_comparisons: Benchmark matches; only the first entry
	                is used, as the closest match.
	            brand_identification: AURORA output.
	            benchmark_advice: ATLAS output.
	            best_practices: SENTINEL output.
	            log_callback: Optional callable that receives each progress line.

	        Returns:
	            The parsed HeadSynthesis. If the LLM call or the handling of
	            its reply raises, the failure is logged and an empty
	            ``HeadSynthesis()`` is returned. Errors raised while reading
	            ``rule_engine_results`` propagate to the caller.
	        """
	        def log(msg: str):
	            if log_callback:
	                log_callback(msg)

	        log("")
	        log("═" * 60)
	        log("🧠 LAYER 4: NEXUS — HEAD SYNTHESIZER (Llama 3.3 70B)")
	        log("═" * 60)
	        log("")
	        log(" Combining: Rule Engine + AURORA + ATLAS + SENTINEL...")

	        # Extract data
	        typo = rule_engine_results.typography
	        spacing = rule_engine_results.spacing
	        color_stats = rule_engine_results.color_stats
	        accessibility = rule_engine_results.accessibility

	        failures = [a for a in accessibility if not a.passes_aa_normal]
	        # Only the first five failures are considered, and of those only the
	        # ones that carry a suggested replacement colour are listed.
	        aa_fixes_str = "\n".join([
	            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
	            for a in failures[:5] if a.suggested_fix
	        ])

	        closest = benchmark_comparisons[0] if benchmark_comparisons else None

	        prompt = self.PROMPT_TEMPLATE.format(
	            type_ratio=f"{typo.detected_ratio:.3f}",
	            type_status="consistent" if typo.is_consistent else "inconsistent",
	            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
	            aa_failures=len(failures),
	            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
	            unique_colors=color_stats.unique_count,
	            consistency_score=rule_engine_results.consistency_score,
	            closest_benchmark=closest.benchmark.name if closest else "Unknown",
	            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
	            benchmark_changes=self._join_field(benchmark_advice.alignment_changes, "change"),
	            brand_primary=self._brand_color(brand_identification.brand_primary),
	            brand_secondary=self._brand_color(brand_identification.brand_secondary),
	            cohesion_score=brand_identification.cohesion_score,
	            best_practices_score=best_practices.overall_score,
	            priority_fixes=self._join_field(best_practices.priority_fixes, "issue"),
	            accessibility_fixes=aa_fixes_str or "None needed",
	        )

	        try:
	            start_time = datetime.now()

	            response = await self.hf_client.complete_async(
	                agent_name="head_synthesizer",
	                system_prompt=self.SYSTEM_PROMPT,
	                user_message=prompt,
	                max_tokens=1200,
	                json_mode=True,
	            )

	            duration = (datetime.now() - start_time).total_seconds()

	            result = self._parse_response(response)

	            log("")
	            log(f" ✅ NEXUS — HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
	            if result.scores:
	                log(f" ├─ Overall Score: {result.scores.get('overall', '?')}/100")
	            log(f" ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
	            se = result.self_evaluation
	            if se:
	                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
	            log("")

	            return result

	        except Exception as e:
	            # Agent boundary: a failed LLM call must not abort the pipeline.
	            log(f" ├─ ⚠️ Head Synthesizer failed: {str(e)[:120]}")
	            return HeadSynthesis()

	    def _parse_response(self, response: str) -> HeadSynthesis:
	        """Parse the raw LLM reply into a HeadSynthesis.

	        The first-``{`` to last-``}`` span of ``response`` is decoded as
	        JSON. Parsing is best-effort: if no JSON object can be extracted, an
	        empty ``HeadSynthesis()`` is returned instead of raising.

	        Fields that are absent, JSON ``null`` or of the wrong container type
	        fall back to their defaults, because synthesize() calls ``len()``
	        and ``.get()`` on them immediately after parsing.
	        """
	        try:
	            json_match = re.search(r'\{[\s\S]*\}', response)
	            data = json.loads(json_match.group()) if json_match else None
	        except (TypeError, ValueError):
	            # TypeError: response is not a string (e.g. None).
	            # ValueError: the extracted span is not valid JSON.
	            data = None

	        if not isinstance(data, dict):
	            return HeadSynthesis()

	        def pick(key, expected, default):
	            # dict.get() only applies its default when the key is missing;
	            # an explicit null or a wrong-typed value is replaced as well.
	            value = data.get(key)
	            return value if isinstance(value, expected) else default

	        executive_summary = data.get("executive_summary")
	        if executive_summary is None:
	            executive_summary = ""

	        return HeadSynthesis(
	            executive_summary=executive_summary,
	            scores=pick("scores", dict, {}),
	            benchmark_fit=pick("benchmark_fit", dict, {}),
	            brand_analysis=pick("brand_analysis", dict, {}),
	            top_3_actions=pick("top_3_actions", list, []),
	            color_recommendations=pick("color_recommendations", list, []),
	            type_scale_recommendation=pick("type_scale_recommendation", dict, {}),
	            spacing_recommendation=pick("spacing_recommendation", dict, {}),
	            self_evaluation=pick("self_evaluation", dict, {}),
	        )