riazmo commited on
Commit
93b9760
·
verified ·
1 Parent(s): 52b0a45

Upload llm_agents.py

Browse files
Files changed (1) hide show
  1. agents/llm_agents.py +1124 -0
agents/llm_agents.py ADDED
@@ -0,0 +1,1124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 2 LLM Agents β€” Specialized Analysis Tasks
3
+ =================================================
4
+
5
+ These agents handle tasks that REQUIRE LLM reasoning:
6
+ - Brand Identifier: Identify brand colors from usage context
7
+ - Benchmark Advisor: Recommend best-fit design system
8
+ - Best Practices Validator: Prioritize fixes by business impact
9
+ - HEAD Synthesizer: Combine all outputs into final recommendations
10
+
11
+ Each agent has a focused prompt for its specific task.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Optional, Callable, Any
18
+ from datetime import datetime
19
+
20
+
21
+ # =============================================================================
22
+ # DATA CLASSES
23
+ # =============================================================================
24
+
25
@dataclass
class BrandIdentification:
    """Output of the Brand Identifier agent (AURORA).

    Each brand_* field is a dict of the shape
    {color, confidence, reasoning, usage_count} as returned by the LLM.
    """
    brand_primary: dict = field(default_factory=dict)
    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)

    # One of: complementary, analogous, triadic, monochromatic, random
    palette_strategy: str = ""
    cohesion_score: int = 5  # 1 (chaotic) .. 10 (highly cohesive)
    cohesion_notes: str = ""

    # {hex_color: suggested design-token name}
    semantic_names: dict = field(default_factory=dict)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "brand_primary",
            "brand_secondary",
            "brand_accent",
            "palette_strategy",
            "cohesion_score",
            "cohesion_notes",
            "semantic_names",
            "self_evaluation",
        )
        return {name: getattr(self, name) for name in exported}
55
+
56
+
57
@dataclass
class BenchmarkAdvice:
    """Output of the Benchmark Advisor agent (ATLAS)."""
    recommended_benchmark: str = ""       # machine key of the recommended system
    recommended_benchmark_name: str = ""  # human-readable name
    reasoning: str = ""

    # [{change, from, to, effort}]
    alignment_changes: list = field(default_factory=list)

    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)

    # [{name, reason}]
    alternative_benchmarks: list = field(default_factory=list)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        NOTE: three export keys are intentionally shorter than their
        field names (pros / cons / alternatives) — downstream consumers
        rely on these exact keys.
        """
        export_map = (
            ("recommended_benchmark", "recommended_benchmark"),
            ("recommended_benchmark_name", "recommended_benchmark_name"),
            ("reasoning", "reasoning"),
            ("alignment_changes", "alignment_changes"),
            ("pros", "pros_of_alignment"),
            ("cons", "cons_of_alignment"),
            ("alternatives", "alternative_benchmarks"),
            ("self_evaluation", "self_evaluation"),
        )
        return {key: getattr(self, attr) for key, attr in export_map}
87
+
88
+
89
@dataclass
class BestPracticesResult:
    """Output of the Best Practices Validator agent (SENTINEL)."""
    overall_score: int = 50  # 0-100 compliance score

    # {check_name: {status: pass/warn/fail, note: str}}
    checks: dict = field(default_factory=dict)

    # [{rank, issue, impact, effort, action}]
    priority_fixes: list = field(default_factory=list)

    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        NOTE: "passing"/"failing" export keys are intentionally shorter
        than the *_practices field names — keep them stable.
        """
        export_map = (
            ("overall_score", "overall_score"),
            ("checks", "checks"),
            ("priority_fixes", "priority_fixes"),
            ("passing", "passing_practices"),
            ("failing", "failing_practices"),
            ("self_evaluation", "self_evaluation"),
        )
        return {key: getattr(self, attr) for key, attr in export_map}
115
+
116
+
117
@dataclass
class HeadSynthesis:
    """Final synthesized output from the HEAD agent."""
    executive_summary: str = ""

    # {overall, accessibility, consistency, organization}
    scores: dict = field(default_factory=dict)

    # {closest, similarity, recommendation}
    benchmark_fit: dict = field(default_factory=dict)

    # {primary, secondary, cohesion}
    brand_analysis: dict = field(default_factory=dict)

    # [{action, impact, effort, details}]
    top_3_actions: list = field(default_factory=list)

    # [{role, current, suggested, reason, accept}]
    color_recommendations: list = field(default_factory=list)

    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)

    # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
    self_evaluation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "executive_summary",
            "scores",
            "benchmark_fit",
            "brand_analysis",
            "top_3_actions",
            "color_recommendations",
            "type_scale_recommendation",
            "spacing_recommendation",
            "self_evaluation",
        )
        return {name: getattr(self, name) for name in exported}
155
+
156
+
157
+ # =============================================================================
158
+ # BRAND IDENTIFIER AGENT
159
+ # =============================================================================
160
+
161
class BrandIdentifierAgent:
    """
    AURORA — Senior Brand Color Analyst.

    Identifies brand colors from usage context using creative/visual reasoning.
    Model: Qwen 72B (strong creative reasoning, color harmony assessment)
    Temperature: 0.4 (allows creative interpretation of color stories)

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    # Persona/system prompt sent verbatim to the LLM on every call.
    # Do not reword casually: the downstream JSON parser depends on the
    # output contract spelled out here.
    SYSTEM_PROMPT = """You are AURORA, a Senior Brand Color Analyst specializing in visual identity systems.

## YOUR ROLE IN THE PIPELINE
You are Agent 1 of 4 in the Design System Analysis pipeline.
- INPUT: Raw color tokens with usage counts + semantic CSS analysis from Stage 1 extraction
- OUTPUT: Brand color identification + palette strategy → feeds into NEXUS (Agent 4) for final synthesis
- Your analysis directly influences the final color recommendations shown to the user.

## YOUR EXPERTISE
- Color harmony theory (complementary, analogous, triadic, split-complementary, monochromatic)
- Brand identity systems (primary/secondary/accent hierarchy)
- CSS context interpretation (button colors = likely CTA, background colors = likely neutral)
- Color naming conventions (design token naming: brand.primary, text.secondary, etc.)

## QUALITY STANDARDS
- Brand Primary MUST have HIGH confidence if one color dominates buttons/CTAs. Say "low" if ambiguous.
- Cohesion Score: Use the FULL 1-10 range. A score of 7+ means clear intentional harmony. Most sites score 5-7.
- If fewer than 5 unique colors exist, flag as "insufficient_data" — don't guess relationships.

## WHAT NOT TO DO
- Don't inflate confidence. "Medium" is fine when usage patterns are unclear.
- Don't guess accent colors if none exist — use null.
- Don't assume complementary strategy just because two colors differ — check the actual hue relationship.
- Don't name colors generically. Use semantic design-token style names (brand.primary, not "blue").

## SCORING RUBRIC (Cohesion 1-10):
- 9-10: Clear harmony rule across all colors, distinct brand identity, consistent palette
- 7-8: Mostly harmonious, clear brand identity, minor inconsistencies
- 5-6: Some color relationships visible but not systematic
- 3-4: Random-feeling palette, no clear color strategy
- 1-2: Actively conflicting colors, no brand identity visible"""

    # User-message template, filled via .format() in analyze(). Doubled
    # braces ({{ }}) are literal braces in the JSON example output.
    PROMPT_TEMPLATE = """Analyze the following color usage data and identify the brand color system.

## COLOR DATA WITH USAGE CONTEXT

{color_data}

## SEMANTIC ANALYSIS (from CSS properties)

{semantic_analysis}

## YOUR TASK

1. **Identify Brand Colors**:
   - Brand Primary: The main action/CTA color (highest visibility in buttons, links, key UI)
   - Brand Secondary: Supporting brand color (headers, secondary actions)
   - Brand Accent: Highlight color for emphasis (badges, alerts, special states)

2. **Assess Palette Strategy**: complementary, analogous, triadic, monochromatic, or random?

3. **Rate Cohesion** (1-10) using the rubric above

4. **Suggest Semantic Names** for top 10 most-used colors (design-token format)

5. **Self-Evaluate** your analysis quality

## OUTPUT FORMAT (JSON only)

{{
  "brand_primary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "Why this is brand primary — cite specific usage evidence",
    "usage_count": <number>
  }},
  "brand_secondary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "..."
  }},
  "brand_accent": {{
    "color": "#hex or null",
    "confidence": "...",
    "reasoning": "..."
  }},
  "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
  "cohesion_score": <1-10>,
  "cohesion_notes": "Assessment of how well colors work together",
  "semantic_names": {{
    "#hex1": "brand.primary",
    "#hex2": "text.primary",
    "#hex3": "background.primary"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in my analysis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client: inference client; must expose complete_async(
        # agent_name, system_prompt, user_message, max_tokens, json_mode)
        # — presumably a project wrapper around the HF API; confirm at caller.
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> BrandIdentification:
        """
        Identify brand colors from usage context.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors; an empty (default)
            BrandIdentification on any LLM/parsing failure — never raises.
        """
        # Local no-op-safe logger so callers may omit log_callback.
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log(" 🎨 AURORA — Brand Identifier (Qwen 72B)")
        log(" └─ Analyzing color context and usage patterns...")

        # Format color data
        color_data = self._format_color_data(color_tokens)
        semantic_str = self._format_semantic_analysis(semantic_analysis)

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=color_data,
            semantic_analysis=semantic_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            # Parse response
            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🎨 AURORA — Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ├─ Palette Strategy: {result.palette_strategy}")
            log(f" ├─ Cohesion Score: {result.cohesion_score}/10")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            error_msg = str(e)
            # Always log full error for diagnosis
            log(f" ⚠️ Brand Identifier failed: {error_msg[:120]}")
            # Heuristic hints for the two most common HF failure modes.
            if "gated" in error_msg.lower() or "access" in error_msg.lower():
                log(f" └─ Model may require license acceptance at huggingface.co")
            elif "Rate limit" in error_msg or "429" in error_msg:
                log(f" └─ HF free tier rate limit — wait or upgrade to Pro")
            # Degrade gracefully: pipeline continues with empty defaults.
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Format color tokens for prompt.

        Caps at the first 30 tokens to bound prompt size. Accepts either
        dict-shaped tokens or objects with value/usage_count/context attrs.
        """
        lines = []
        for name, token in list(color_tokens.items())[:30]:
            if isinstance(token, dict):
                # Tolerate both extractor schemas: value/hex, usage_count/count, ...
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get("usage_count", token.get("count", 1))
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", 1)
                context = getattr(token, "context", "")

            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Format semantic analysis for prompt.

        Handles three value shapes per category: a list of colors, a flat
        color dict ({"hex": ...}), or a nested dict of sub-role -> color/scalar.
        Never raises: formatting errors are reported as a placeholder string.
        """
        if not semantic:
            return "No semantic analysis available"

        lines = []
        try:
            for category, value in semantic.items():
                if not value:
                    continue

                if isinstance(value, list):
                    # List of colors
                    color_list = []
                    for c in value[:5]:
                        if isinstance(c, dict):
                            color_list.append(c.get("hex", c.get("value", str(c))))
                        else:
                            color_list.append(str(c))
                    lines.append(f"- {category}: {', '.join(color_list)}")

                elif isinstance(value, dict):
                    # Could be a nested dict of sub-roles → color dicts
                    # e.g. {"primary": {"hex": "#007bff", ...}, "secondary": {...}}
                    # or a flat color dict {"hex": "#...", "confidence": "..."}
                    # or a summary dict {"total_colors_analyzed": 50, ...}
                    if "hex" in value:
                        # Flat color dict
                        lines.append(f"- {category}: {value['hex']}")
                    else:
                        # Nested dict — iterate sub-roles
                        sub_items = []
                        for sub_role, sub_val in list(value.items())[:5]:
                            if isinstance(sub_val, dict) and "hex" in sub_val:
                                sub_items.append(f"{sub_role}={sub_val['hex']}")
                            elif isinstance(sub_val, (str, int, float, bool)):
                                sub_items.append(f"{sub_role}={sub_val}")
                        if sub_items:
                            lines.append(f"- {category}: {', '.join(sub_items)}")
                else:
                    lines.append(f"- {category}: {value}")
        except Exception as e:
            return f"Error formatting semantic analysis: {str(e)[:50]}"

        return "\n".join(lines) if lines else "No semantic analysis available"

    def _parse_response(self, response: str) -> BrandIdentification:
        """Parse LLM response into BrandIdentification.

        Extracts the outermost {...} span (greedy) so that prose or code
        fences around the JSON are tolerated; any parse failure falls back
        to an empty BrandIdentification rather than raising.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return BrandIdentification(
                    brand_primary=data.get("brand_primary", {}),
                    brand_secondary=data.get("brand_secondary", {}),
                    brand_accent=data.get("brand_accent", {}),
                    palette_strategy=data.get("palette_strategy", "unknown"),
                    cohesion_score=data.get("cohesion_score", 5),
                    cohesion_notes=data.get("cohesion_notes", ""),
                    semantic_names=data.get("semantic_names", {}),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Deliberate best-effort: malformed JSON yields the default object.
            pass

        return BrandIdentification()
425
+
426
+
427
+ # =============================================================================
428
+ # BENCHMARK ADVISOR AGENT
429
+ # =============================================================================
430
+
431
class BenchmarkAdvisorAgent:
    """
    ATLAS — Senior Design System Benchmark Analyst.

    Recommends best-fit design system based on comparison data.
    Model: Llama 3.3 70B (128K context for large benchmark data, excellent comparative reasoning)
    Temperature: 0.25 (analytical, data-driven comparison)

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    # Persona/system prompt sent verbatim to the LLM. The JSON output
    # contract below is what _parse_response expects — keep them in sync.
    SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst specializing in cross-system comparison and alignment strategy.

## YOUR ROLE IN THE PIPELINE
You are Agent 2 of 4 in the Design System Analysis pipeline.
- INPUT: User's extracted type scale, spacing, and font sizes + benchmark comparison data from the Rule Engine
- OUTPUT: Benchmark recommendation with alignment roadmap → feeds into NEXUS (Agent 4) for final synthesis
- Your recommendation helps the user decide which established design system to align with.

## YOUR EXPERTISE
- Deep knowledge of Material Design 3, Apple HIG, IBM Carbon, Ant Design, Atlassian, Tailwind CSS, Bootstrap
- Type scale mathematics (major/minor second/third, perfect fourth/fifth, golden ratio)
- Spacing grid systems (4px, 8px, multiples) and their trade-offs
- Migration effort estimation for design system alignment

## QUALITY STANDARDS
- Always consider BOTH similarity score AND use-case fit. Closest match ≠ best fit.
- Recommend max 4 alignment changes. More than that = the benchmark is not a good fit.
- Effort estimates must be realistic: "low" = CSS variable change, "medium" = component updates, "high" = layout restructuring.
- If similarity is above 85%, say "already well-aligned" and suggest minimal changes only.

## WHAT NOT TO DO
- Don't always recommend the closest match — a system 5% less similar but much better suited is preferable.
- Don't list generic pros/cons. Be specific to the user's actual values.
- Don't suggest alignment changes that would break accessibility (e.g., smaller base font).
- Don't recommend obscure or abandoned design systems.

## SCORING RUBRIC (Benchmark Fit):
- Excellent Fit: >85% match, same use-case category, < 3 changes needed
- Good Fit: 70-85% match, compatible use-case, 3-4 changes needed
- Fair Fit: 50-70% match, different trade-offs to consider, 4+ changes
- Poor Fit: <50% match, fundamentally different approach — don't recommend"""

    # User-message template, filled via .format() in analyze(). Doubled
    # braces ({{ }}) are literal braces in the JSON example output.
    PROMPT_TEMPLATE = """Analyze the following benchmark comparison data and recommend the best design system alignment.

## USER'S CURRENT VALUES

- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px

## BENCHMARK COMPARISON

{benchmark_comparison}

## YOUR TASK

1. **Recommend Best Fit**: Which design system should they align with? Consider use-case fit, not just numbers.
2. **Explain Why**: Cite specific data points (similarity scores, ratio differences, spacing alignment).
3. **List Changes Needed**: What would they need to change? Include effort estimates.
4. **Pros/Cons**: Specific to this user's values, not generic statements.
5. **Self-Evaluate** your recommendation quality.

## OUTPUT FORMAT (JSON only)

{{
  "recommended_benchmark": "<system_key>",
  "recommended_benchmark_name": "<full name>",
  "reasoning": "Why this is the best fit — cite specific data",
  "alignment_changes": [
    {{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
    {{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
  ],
  "pros_of_alignment": [
    "Specific benefit with data"
  ],
  "cons_of_alignment": [
    "Specific trade-off"
  ],
  "alternative_benchmarks": [
    {{"name": "Material Design 3", "reason": "Good for Android-first products"}}
  ],
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client: inference client; must expose complete_async(
        # agent_name, system_prompt, user_message, max_tokens, json_mode).
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> BenchmarkAdvice:
        """
        Recommend best-fit design system.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations; an empty (default)
            BenchmarkAdvice on any LLM/parsing failure — never raises.
        """
        # Local no-op-safe logger so callers may omit log_callback.
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)")
        log(" └─ Evaluating benchmark fit for your use case...")

        # Format comparison data
        comparison_str = self._format_comparisons(benchmark_comparisons)

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio,
            user_base=user_base,
            user_spacing=user_spacing,
            benchmark_comparison=comparison_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=900,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🏢 ATLAS — Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes Needed: {len(result.alignment_changes)}")
            log(f" ├─ Key Change: {result.alignment_changes[0].get('change', 'N/A') if result.alignment_changes else 'None'}")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")

            return result

        except Exception as e:
            # Degrade gracefully: pipeline continues with empty defaults.
            log(f" ├─ ⚠️ Benchmark Advisor failed: {str(e)[:120]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Format benchmark comparisons for prompt.

        Caps at the 5 first comparisons to bound prompt size. Each item is
        expected to expose .benchmark (with icon/name/typography/spacing/
        best_for) plus diff/score attributes — see BenchmarkComparison.
        """
        lines = []
        for i, c in enumerate(comparisons[:5]):
            b = c.benchmark
            lines.append(f"""
{i+1}. {b.icon} {b.name}
 - Similarity Score: {c.similarity_score:.2f} (lower = better)
 - Match: {c.overall_match_pct:.0f}%
 - Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
 - Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
 - Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
 - Best For: {', '.join(b.best_for)}""")

        return "\n".join(lines)

    def _parse_response(self, response: str) -> BenchmarkAdvice:
        """Parse LLM response into BenchmarkAdvice.

        Extracts the outermost {...} span (greedy) so that prose or code
        fences around the JSON are tolerated; any parse failure falls back
        to an empty BenchmarkAdvice rather than raising.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return BenchmarkAdvice(
                    recommended_benchmark=data.get("recommended_benchmark", ""),
                    recommended_benchmark_name=data.get("recommended_benchmark_name", ""),
                    reasoning=data.get("reasoning", ""),
                    alignment_changes=data.get("alignment_changes", []),
                    pros_of_alignment=data.get("pros_of_alignment", []),
                    cons_of_alignment=data.get("cons_of_alignment", []),
                    alternative_benchmarks=data.get("alternative_benchmarks", []),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Deliberate best-effort: malformed JSON yields the default object.
            pass

        return BenchmarkAdvice()
632
+
633
+
634
+ # =============================================================================
635
+ # BEST PRACTICES VALIDATOR AGENT
636
+ # =============================================================================
637
+
638
+ class BestPracticesValidatorAgent:
639
+ """
640
+ SENTINEL β€” Design System Best Practices Auditor.
641
+
642
+ Validates against design system standards and prioritizes fixes by business impact.
643
+ Model: Qwen 72B (methodical rule-following, precise judgment, structured output)
644
+ Temperature: 0.2 (strict, consistent rule evaluation)
645
+
646
+ WHY LLM: Prioritization requires judgment about business impact,
647
+ not just checking boxes.
648
+ """
649
+
650
+ SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor specializing in standards compliance and impact-based prioritization.
651
+
652
+ ## YOUR ROLE IN THE PIPELINE
653
+ You are Agent 3 of 4 in the Design System Analysis pipeline.
654
+ - INPUT: Rule Engine analysis results (typography, accessibility, spacing, color stats)
655
+ - OUTPUT: Compliance score + prioritized fix list β†’ feeds into NEXUS (Agent 4) for final synthesis
656
+ - Your score directly appears on the user's dashboard. Your priority fixes become the action items.
657
+
658
+ ## YOUR EXPERTISE
659
+ - WCAG 2.1 AA/AAA accessibility standards
660
+ - Design system best practices (Material Design, Apple HIG, Tailwind conventions)
661
+ - Typography systems (modular scales, vertical rhythm, readability)
662
+ - Color management (palette size limits, near-duplicate detection, contrast requirements)
663
+ - Spacing systems (grid alignment, consistency, component density)
664
+
665
+ ## QUALITY STANDARDS
666
+ - Overall Score MUST reflect actual data. Don't default to 50.
667
+ - Use the FULL 0-100 range: 90+ = excellent, 70-89 = good, 50-69 = needs work, <50 = significant issues
668
+ - Priority fixes must be ACTIONABLE β€” include specific values to change (e.g., "Change #06b2c4 β†’ #0891a8")
669
+ - Maximum 5 priority fixes. If more, focus on highest-impact items.
670
+
671
+ ## WHAT NOT TO DO
672
+ - Don't pass checks that clearly fail based on the data.
673
+ - Don't inflate scores to be "encouraging" β€” honest assessment helps the user.
674
+ - Don't list fixes without effort estimates β€” the user needs to plan their work.
675
+ - Don't mix up "warn" and "fail": warn = imperfect but functional, fail = violates a standard.
676
+
677
+ ## SCORING RUBRIC (Overall Score 0-100):
678
+ - 90-100: All checks pass, excellent accessibility, clean palette, consistent grid
679
+ - 75-89: Most checks pass, minor issues in 1-2 areas, good foundation
680
+ - 60-74: Several warnings, 1-2 failures, needs focused improvement
681
+ - 40-59: Multiple failures, significant accessibility gaps, inconsistent system
682
+ - 20-39: Fundamental issues across multiple areas, major rework needed
683
+ - 0-19: Barely qualifies as a design system, almost everything fails
684
+
685
+ ## CHECK WEIGHTING:
686
+ - AA Compliance: 25 points (most critical β€” affects real users)
687
+ - Type Scale Consistency: 15 points
688
+ - Type Scale Standard Ratio: 10 points
689
+ - Base Size Accessible: 15 points
690
+ - Spacing Grid: 15 points
691
+ - Color Count: 10 points
692
+ - No Near-Duplicates: 10 points"""
693
+
694
+ PROMPT_TEMPLATE = """Validate the following design tokens against best practices and prioritize fixes.
695
+
696
+ ## RULE ENGINE ANALYSIS RESULTS
697
+
698
+ ### Typography
699
+ - Detected Ratio: {type_ratio} ({type_consistent})
700
+ - Base Size: {base_size}px
701
+ - Recommendation: {type_recommendation}
702
+
703
+ ### Accessibility
704
+ - Total Colors: {total_colors}
705
+ - AA Pass: {aa_pass}
706
+ - AA Fail: {aa_fail}
707
+ - Failing Colors: {failing_colors}
708
+
709
+ ### Spacing
710
+ - Detected Base: {spacing_base}px
711
+ - Grid Aligned: {spacing_aligned}%
712
+ - Recommendation: {spacing_recommendation}px
713
+
714
+ ### Color Statistics
715
+ - Unique Colors: {unique_colors}
716
+ - Duplicates: {duplicates}
717
+ - Near-Duplicates: {near_duplicates}
718
+
719
+ ## BEST PRACTICES CHECKLIST (check each one)
720
+
721
+ 1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
722
+ 2. Type scale is consistent (variance < 0.15)
723
+ 3. Base font size >= 16px (accessibility)
724
+ 4. All interactive colors pass WCAG AA (4.5:1 contrast)
725
+ 5. Spacing uses consistent grid (4px or 8px base)
726
+ 6. Limited color palette (< 20 unique semantic colors)
727
+ 7. No near-duplicate colors (< 3 delta-E apart)
728
+
729
+ ## YOUR TASK
730
+
731
+ 1. Score each practice: pass/warn/fail with specific notes citing the data
732
+ 2. Calculate overall score (0-100) using the weighting rubric
733
+ 3. Identify TOP 3-5 priority fixes with impact and effort assessment
734
+ 4. Self-evaluate your analysis
735
+
736
+ ## OUTPUT FORMAT (JSON only)
737
+
738
+ {{
739
+ "overall_score": <0-100>,
740
+ "checks": {{
741
+ "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
742
+ "type_scale_consistent": {{"status": "...", "note": "..."}},
743
+ "base_size_accessible": {{"status": "...", "note": "..."}},
744
+ "aa_compliance": {{"status": "...", "note": "..."}},
745
+ "spacing_grid": {{"status": "...", "note": "..."}},
746
+ "color_count": {{"status": "...", "note": "..."}},
747
+ "near_duplicates": {{"status": "...", "note": "..."}}
748
+ }},
749
+ "priority_fixes": [
750
+ {{
751
+ "rank": 1,
752
+ "issue": "Brand primary fails AA",
753
+ "impact": "high|medium|low",
754
+ "effort": "low|medium|high",
755
+ "action": "Change #06b2c4 β†’ #0891a8 for 4.5:1 contrast"
756
+ }}
757
+ ],
758
+ "passing_practices": ["Base font size", "..."],
759
+ "failing_practices": ["AA compliance", "..."],
760
+ "self_evaluation": {{
761
+ "confidence": <1-10>,
762
+ "reasoning": "Why I am this confident",
763
+ "data_quality": "good|fair|poor",
764
+ "flags": []
765
+ }}
766
+ }}
767
+
768
+ Return ONLY valid JSON."""
769
+
770
+ def __init__(self, hf_client):
771
+ self.hf_client = hf_client
772
+
773
+ async def analyze(
774
+ self,
775
+ rule_engine_results: Any,
776
+ log_callback: Callable = None,
777
+ ) -> BestPracticesResult:
778
+ """
779
+ Validate against best practices.
780
+
781
+ Args:
782
+ rule_engine_results: Results from rule engine
783
+ log_callback: Progress logging function
784
+
785
+ Returns:
786
+ BestPracticesResult with validation
787
+ """
788
+ def log(msg: str):
789
+ if log_callback:
790
+ log_callback(msg)
791
+
792
+ log("")
793
+ log(" βœ… SENTINEL β€” Best Practices Validator (Qwen 72B)")
794
+ log(" └─ Checking against design system standards...")
795
+
796
+ # Extract data from rule engine
797
+ typo = rule_engine_results.typography
798
+ spacing = rule_engine_results.spacing
799
+ color_stats = rule_engine_results.color_stats
800
+ accessibility = rule_engine_results.accessibility
801
+
802
+ failures = [a for a in accessibility if not a.passes_aa_normal]
803
+ failing_colors_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]])
804
+
805
+ prompt = self.PROMPT_TEMPLATE.format(
806
+ type_ratio=f"{typo.detected_ratio:.3f}",
807
+ type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
808
+ base_size=typo.sizes_px[0] if typo.sizes_px else 16,
809
+ type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
810
+ total_colors=len(accessibility),
811
+ aa_pass=len(accessibility) - len(failures),
812
+ aa_fail=len(failures),
813
+ failing_colors=failing_colors_str or "None",
814
+ spacing_base=spacing.detected_base,
815
+ spacing_aligned=f"{spacing.alignment_percentage:.0f}",
816
+ spacing_recommendation=spacing.recommendation,
817
+ unique_colors=color_stats.unique_count,
818
+ duplicates=color_stats.duplicate_count,
819
+ near_duplicates=len(color_stats.near_duplicates),
820
+ )
821
+
822
+ try:
823
+ start_time = datetime.now()
824
+
825
+ response = await self.hf_client.complete_async(
826
+ agent_name="best_practices_validator",
827
+ system_prompt=self.SYSTEM_PROMPT,
828
+ user_message=prompt,
829
+ max_tokens=1000,
830
+ json_mode=True,
831
+ )
832
+
833
+ duration = (datetime.now() - start_time).total_seconds()
834
+
835
+ result = self._parse_response(response)
836
+
837
+ log(f" ────────────────────────────────────────────────")
838
+ log(f" βœ… SENTINEL β€” Best Practices: COMPLETE ({duration:.1f}s)")
839
+ log(f" β”œβ”€ Overall Score: {result.overall_score}/100")
840
+ log(f" β”œβ”€ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
841
+ if result.priority_fixes:
842
+ log(f" β”œβ”€ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")
843
+ se = result.self_evaluation
844
+ if se:
845
+ log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
846
+
847
+ return result
848
+
849
+ except Exception as e:
850
+ log(f" β”œβ”€ ⚠️ Best Practices Validator failed: {str(e)[:120]}")
851
+ return BestPracticesResult()
852
+
853
+ def _parse_response(self, response: str) -> BestPracticesResult:
854
+ """Parse LLM response into BestPracticesResult."""
855
+ try:
856
+ json_match = re.search(r'\{[\s\S]*\}', response)
857
+ if json_match:
858
+ data = json.loads(json_match.group())
859
+ return BestPracticesResult(
860
+ overall_score=data.get("overall_score", 50),
861
+ checks=data.get("checks", {}),
862
+ priority_fixes=data.get("priority_fixes", []),
863
+ passing_practices=data.get("passing_practices", []),
864
+ failing_practices=data.get("failing_practices", []),
865
+ self_evaluation=data.get("self_evaluation", {}),
866
+ )
867
+ except Exception:
868
+ pass
869
+
870
+ return BestPracticesResult()
871
+
872
+
873
# =============================================================================
# HEAD SYNTHESIZER AGENT
# =============================================================================


class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect & Synthesizer.

    Combines all agent outputs into final actionable recommendations.
    Model: Llama 3.3 70B (128K context for combined inputs, strong synthesis capability)
    Temperature: 0.3 (balanced — needs to synthesize creatively but stay grounded in data)

    This is the final step that produces actionable output for the user.
    """

    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect specializing in synthesis and actionable recommendations.

## YOUR ROLE IN THE PIPELINE
You are Agent 4 of 4 — the HEAD Synthesizer in the Design System Analysis pipeline.
- INPUT: Combined outputs from Rule Engine + AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices)
- OUTPUT: Final executive summary, scores, and prioritized action plan → displayed directly to the user
- You are the LAST agent. Your output IS the final result. Make it count.

## YOUR EXPERTISE
- Design system architecture and governance
- Synthesizing conflicting recommendations into coherent strategy
- Effort/impact prioritization (what to fix first)
- Color accessibility remediation (suggesting AA-compliant alternatives)
- Executive communication (clear, actionable summaries)

## QUALITY STANDARDS
- Executive Summary must be 2-3 sentences MAX. Lead with the overall score, then the #1 issue, then the #1 action.
- Overall Score must SYNTHESIZE all agent inputs — don't just average them.
- Color recommendations must include BOTH current AND suggested hex values.
- Top 3 Actions must be ordered by IMPACT, not ease.
- Accept/reject defaults on color recs: default to "accept" for accessibility fixes, "reject" for purely aesthetic changes.

## WHAT NOT TO DO
- Don't contradict previous agents without explaining why.
- Don't recommend changes that SENTINEL flagged as breaking.
- Don't suggest more than 8 color changes — the user will ignore a long list.
- Don't give vague actions like "improve accessibility" — be specific: "Change brand.primary from #06b2c4 to #0891a8 for 4.5:1 contrast".
- Don't inflate scores to be "nice". If the design system has issues, say so clearly.

## SCORING RUBRIC (Overall 0-100):
- 90-100: Production-ready design system, minor polishing only
- 75-89: Solid foundation, 2-3 targeted improvements needed
- 60-74: Functional but needs focused attention on accessibility or consistency
- 40-59: Significant gaps requiring systematic improvement
- 20-39: Major rework needed across multiple dimensions
- 0-19: Fundamental redesign recommended"""

    # Per-run user prompt; placeholders are filled in synthesize(). Literal
    # braces in the JSON example are doubled because this goes through .format().
    PROMPT_TEMPLATE = """Synthesize all analysis results into a final, actionable design system report.

## RULE ENGINE FACTS (Layer 1 — Free, deterministic)

- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## AURORA — Brand Identification (Agent 1)

- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## ATLAS — Benchmark Advice (Agent 2)

Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## SENTINEL — Best Practices Validation (Agent 3)

Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED

{accessibility_fixes}

## YOUR TASK

Synthesize ALL the above into:
1. Executive Summary (2-3 sentences — lead with score, #1 issue, #1 action)
2. Overall Scores (synthesized, not averaged)
3. Top 3 Priority Actions (ordered by IMPACT, include effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation
7. Self-Evaluation of your synthesis

## OUTPUT FORMAT (JSON only)

{{
  "executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
  "scores": {{
    "overall": <0-100>,
    "accessibility": <0-100>,
    "consistency": <0-100>,
    "organization": <0-100>
  }},
  "benchmark_fit": {{
    "closest": "<name>",
    "similarity": "<X%>",
    "recommendation": "Specific action to align"
  }},
  "brand_analysis": {{
    "primary": "#hex",
    "secondary": "#hex",
    "cohesion": <1-10>
  }},
  "top_3_actions": [
    {{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
  ],
  "color_recommendations": [
    {{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
  ],
  "type_scale_recommendation": {{
    "current_ratio": 1.18,
    "recommended_ratio": 1.25,
    "reason": "Why this ratio is better"
  }},
  "spacing_recommendation": {{
    "current": "mixed",
    "recommended": "8px",
    "reason": "Why this grid is better"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in the synthesis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the shared Hugging Face inference client used for all LLM calls."""
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Deterministic Layer-1 measurements.
            benchmark_comparisons: Benchmark matches, best match first —
                TODO confirm callers pre-sort by match percentage.
            brand_identification: AURORA output (brand colors, cohesion).
            benchmark_advice: ATLAS output (alignment changes).
            best_practices: SENTINEL output (score, priority fixes).
            log_callback: Optional progress logging function.

        Returns:
            HeadSynthesis with the final report (empty on LLM failure).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: NEXUS — HEAD SYNTHESIZER (Llama 3.3 70B)")
        log("═" * 60)
        log("")
        log("  Combining: Rule Engine + AURORA + ATLAS + SENTINEL...")

        # Extract the rule-engine facts referenced by the prompt.
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Up to 5 AA failures that already carry a suggested replacement color.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join(
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:5] if a.suggested_fix
        )

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes="; ".join(c.get("change", "") for c in benchmark_advice.alignment_changes[:3]),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes="; ".join(f.get("issue", "") for f in best_practices.priority_fixes[:3]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1200,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log("")
            log(f"  ✅ NEXUS — HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            if result.scores:
                log(f"  ├─ Overall Score: {result.scores.get('overall', '?')}/100")
            log(f"  ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            se = result.self_evaluation
            if se:
                log(f"  └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            log("")

            return result

        except Exception as e:
            # Degrade gracefully: an empty synthesis is better than a crash
            # at the very last pipeline stage.
            log(f"  ├─ ⚠️ Head Synthesizer failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse the LLM's reply into a HeadSynthesis.

        Extracts the outermost {...} span (models sometimes wrap the JSON in
        prose) and falls back to a default-constructed result on any failure.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return HeadSynthesis(
                    executive_summary=data.get("executive_summary", ""),
                    scores=data.get("scores", {}),
                    benchmark_fit=data.get("benchmark_fit", {}),
                    brand_analysis=data.get("brand_analysis", {}),
                    top_3_actions=data.get("top_3_actions", []),
                    color_recommendations=data.get("color_recommendations", []),
                    type_scale_recommendation=data.get("type_scale_recommendation", {}),
                    spacing_recommendation=data.get("spacing_recommendation", {}),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Malformed JSON from the model — fall through to the default.
            pass

        return HeadSynthesis()