riazmo committed on
Commit
0c5fda9
·
verified ·
1 Parent(s): ce0a129

Delete llm_agents.py

Browse files
Files changed (1) hide show
  1. llm_agents.py +0 -1124
llm_agents.py DELETED
@@ -1,1124 +0,0 @@
1
- """
2
- Stage 2 LLM Agents — Specialized Analysis Tasks
3
- =================================================
4
-
5
- These agents handle tasks that REQUIRE LLM reasoning:
6
- - Brand Identifier: Identify brand colors from usage context
7
- - Benchmark Advisor: Recommend best-fit design system
8
- - Best Practices Validator: Prioritize fixes by business impact
9
- - HEAD Synthesizer: Combine all outputs into final recommendations
10
-
11
- Each agent has a focused prompt for its specific task.
12
- """
13
-
14
- import json
15
- import re
16
- from dataclasses import dataclass, field
17
- from typing import Optional, Callable, Any
18
- from datetime import datetime
19
-
20
-
21
- # =============================================================================
22
- # DATA CLASSES
23
- # =============================================================================
24
-
25
- @dataclass
26
- class BrandIdentification:
27
- """Results from Brand Identifier agent (AURORA)."""
28
- brand_primary: dict = field(default_factory=dict)
29
- # {color, confidence, reasoning, usage_count}
30
-
31
- brand_secondary: dict = field(default_factory=dict)
32
- brand_accent: dict = field(default_factory=dict)
33
-
34
- palette_strategy: str = "" # complementary, analogous, triadic, monochromatic, random
35
- cohesion_score: int = 5 # 1-10
36
- cohesion_notes: str = ""
37
-
38
- semantic_names: dict = field(default_factory=dict)
39
- # {hex_color: suggested_name}
40
-
41
- self_evaluation: dict = field(default_factory=dict)
42
- # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
43
-
44
- def to_dict(self) -> dict:
45
- return {
46
- "brand_primary": self.brand_primary,
47
- "brand_secondary": self.brand_secondary,
48
- "brand_accent": self.brand_accent,
49
- "palette_strategy": self.palette_strategy,
50
- "cohesion_score": self.cohesion_score,
51
- "cohesion_notes": self.cohesion_notes,
52
- "semantic_names": self.semantic_names,
53
- "self_evaluation": self.self_evaluation,
54
- }
55
-
56
-
57
- @dataclass
58
- class BenchmarkAdvice:
59
- """Results from Benchmark Advisor agent."""
60
- recommended_benchmark: str = ""
61
- recommended_benchmark_name: str = ""
62
- reasoning: str = ""
63
-
64
- alignment_changes: list = field(default_factory=list)
65
- # [{change, from, to, effort}]
66
-
67
- pros_of_alignment: list = field(default_factory=list)
68
- cons_of_alignment: list = field(default_factory=list)
69
-
70
- alternative_benchmarks: list = field(default_factory=list)
71
- # [{name, reason}]
72
-
73
- self_evaluation: dict = field(default_factory=dict)
74
- # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
75
-
76
- def to_dict(self) -> dict:
77
- return {
78
- "recommended_benchmark": self.recommended_benchmark,
79
- "recommended_benchmark_name": self.recommended_benchmark_name,
80
- "reasoning": self.reasoning,
81
- "alignment_changes": self.alignment_changes,
82
- "pros": self.pros_of_alignment,
83
- "cons": self.cons_of_alignment,
84
- "alternatives": self.alternative_benchmarks,
85
- "self_evaluation": self.self_evaluation,
86
- }
87
-
88
-
89
- @dataclass
90
- class BestPracticesResult:
91
- """Results from Best Practices Validator agent."""
92
- overall_score: int = 50 # 0-100
93
-
94
- checks: dict = field(default_factory=dict)
95
- # {check_name: {status: pass/warn/fail, note: str}}
96
-
97
- priority_fixes: list = field(default_factory=list)
98
- # [{rank, issue, impact, effort, action}]
99
-
100
- passing_practices: list = field(default_factory=list)
101
- failing_practices: list = field(default_factory=list)
102
-
103
- self_evaluation: dict = field(default_factory=dict)
104
- # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
105
-
106
- def to_dict(self) -> dict:
107
- return {
108
- "overall_score": self.overall_score,
109
- "checks": self.checks,
110
- "priority_fixes": self.priority_fixes,
111
- "passing": self.passing_practices,
112
- "failing": self.failing_practices,
113
- "self_evaluation": self.self_evaluation,
114
- }
115
-
116
-
117
- @dataclass
118
- class HeadSynthesis:
119
- """Final synthesized output from HEAD agent."""
120
- executive_summary: str = ""
121
-
122
- scores: dict = field(default_factory=dict)
123
- # {overall, accessibility, consistency, organization}
124
-
125
- benchmark_fit: dict = field(default_factory=dict)
126
- # {closest, similarity, recommendation}
127
-
128
- brand_analysis: dict = field(default_factory=dict)
129
- # {primary, secondary, cohesion}
130
-
131
- top_3_actions: list = field(default_factory=list)
132
- # [{action, impact, effort, details}]
133
-
134
- color_recommendations: list = field(default_factory=list)
135
- # [{role, current, suggested, reason, accept}]
136
-
137
- type_scale_recommendation: dict = field(default_factory=dict)
138
- spacing_recommendation: dict = field(default_factory=dict)
139
-
140
- self_evaluation: dict = field(default_factory=dict)
141
- # {confidence: 1-10, reasoning: str, data_quality: good|fair|poor, flags: []}
142
-
143
- def to_dict(self) -> dict:
144
- return {
145
- "executive_summary": self.executive_summary,
146
- "scores": self.scores,
147
- "benchmark_fit": self.benchmark_fit,
148
- "brand_analysis": self.brand_analysis,
149
- "top_3_actions": self.top_3_actions,
150
- "color_recommendations": self.color_recommendations,
151
- "type_scale_recommendation": self.type_scale_recommendation,
152
- "spacing_recommendation": self.spacing_recommendation,
153
- "self_evaluation": self.self_evaluation,
154
- }
155
-
156
-
157
- # =============================================================================
158
- # BRAND IDENTIFIER AGENT
159
- # =============================================================================
160
-
161
- class BrandIdentifierAgent:
162
- """
163
- AURORA — Senior Brand Color Analyst.
164
-
165
- Identifies brand colors from usage context using creative/visual reasoning.
166
- Model: Qwen 72B (strong creative reasoning, color harmony assessment)
167
- Temperature: 0.4 (allows creative interpretation of color stories)
168
-
169
- WHY LLM: Requires understanding context (33 buttons = likely brand primary),
170
- not just color math.
171
- """
172
-
173
- SYSTEM_PROMPT = """You are AURORA, a Senior Brand Color Analyst specializing in visual identity systems.
174
-
175
- ## YOUR ROLE IN THE PIPELINE
176
- You are Agent 1 of 4 in the Design System Analysis pipeline.
177
- - INPUT: Raw color tokens with usage counts + semantic CSS analysis from Stage 1 extraction
178
- - OUTPUT: Brand color identification + palette strategy → feeds into NEXUS (Agent 4) for final synthesis
179
- - Your analysis directly influences the final color recommendations shown to the user.
180
-
181
- ## YOUR EXPERTISE
182
- - Color harmony theory (complementary, analogous, triadic, split-complementary, monochromatic)
183
- - Brand identity systems (primary/secondary/accent hierarchy)
184
- - CSS context interpretation (button colors = likely CTA, background colors = likely neutral)
185
- - Color naming conventions (design token naming: brand.primary, text.secondary, etc.)
186
-
187
- ## QUALITY STANDARDS
188
- - Brand Primary MUST have HIGH confidence if one color dominates buttons/CTAs. Say "low" if ambiguous.
189
- - Cohesion Score: Use the FULL 1-10 range. A score of 7+ means clear intentional harmony. Most sites score 5-7.
190
- - If fewer than 5 unique colors exist, flag as "insufficient_data" — don't guess relationships.
191
-
192
- ## WHAT NOT TO DO
193
- - Don't inflate confidence. "Medium" is fine when usage patterns are unclear.
194
- - Don't guess accent colors if none exist — use null.
195
- - Don't assume complementary strategy just because two colors differ — check the actual hue relationship.
196
- - Don't name colors generically. Use semantic design-token style names (brand.primary, not "blue").
197
-
198
- ## SCORING RUBRIC (Cohesion 1-10):
199
- - 9-10: Clear harmony rule across all colors, distinct brand identity, consistent palette
200
- - 7-8: Mostly harmonious, clear brand identity, minor inconsistencies
201
- - 5-6: Some color relationships visible but not systematic
202
- - 3-4: Random-feeling palette, no clear color strategy
203
- - 1-2: Actively conflicting colors, no brand identity visible"""
204
-
205
- PROMPT_TEMPLATE = """Analyze the following color usage data and identify the brand color system.
206
-
207
- ## COLOR DATA WITH USAGE CONTEXT
208
-
209
- {color_data}
210
-
211
- ## SEMANTIC ANALYSIS (from CSS properties)
212
-
213
- {semantic_analysis}
214
-
215
- ## YOUR TASK
216
-
217
- 1. **Identify Brand Colors**:
218
- - Brand Primary: The main action/CTA color (highest visibility in buttons, links, key UI)
219
- - Brand Secondary: Supporting brand color (headers, secondary actions)
220
- - Brand Accent: Highlight color for emphasis (badges, alerts, special states)
221
-
222
- 2. **Assess Palette Strategy**: complementary, analogous, triadic, monochromatic, or random?
223
-
224
- 3. **Rate Cohesion** (1-10) using the rubric above
225
-
226
- 4. **Suggest Semantic Names** for top 10 most-used colors (design-token format)
227
-
228
- 5. **Self-Evaluate** your analysis quality
229
-
230
- ## OUTPUT FORMAT (JSON only)
231
-
232
- {{
233
- "brand_primary": {{
234
- "color": "#hex",
235
- "confidence": "high|medium|low",
236
- "reasoning": "Why this is brand primary — cite specific usage evidence",
237
- "usage_count": <number>
238
- }},
239
- "brand_secondary": {{
240
- "color": "#hex",
241
- "confidence": "high|medium|low",
242
- "reasoning": "..."
243
- }},
244
- "brand_accent": {{
245
- "color": "#hex or null",
246
- "confidence": "...",
247
- "reasoning": "..."
248
- }},
249
- "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
250
- "cohesion_score": <1-10>,
251
- "cohesion_notes": "Assessment of how well colors work together",
252
- "semantic_names": {{
253
- "#hex1": "brand.primary",
254
- "#hex2": "text.primary",
255
- "#hex3": "background.primary"
256
- }},
257
- "self_evaluation": {{
258
- "confidence": <1-10>,
259
- "reasoning": "Why I am this confident in my analysis",
260
- "data_quality": "good|fair|poor",
261
- "flags": []
262
- }}
263
- }}
264
-
265
- Return ONLY valid JSON."""
266
-
267
- def __init__(self, hf_client):
268
- self.hf_client = hf_client
269
-
270
- async def analyze(
271
- self,
272
- color_tokens: dict,
273
- semantic_analysis: dict,
274
- log_callback: Callable = None,
275
- ) -> BrandIdentification:
276
- """
277
- Identify brand colors from usage context.
278
-
279
- Args:
280
- color_tokens: Dict of color tokens with usage data
281
- semantic_analysis: Semantic categorization from Stage 1
282
- log_callback: Progress logging function
283
-
284
- Returns:
285
- BrandIdentification with identified colors
286
- """
287
- def log(msg: str):
288
- if log_callback:
289
- log_callback(msg)
290
-
291
- log(" 🎨 AURORA — Brand Identifier (Qwen 72B)")
292
- log(" └─ Analyzing color context and usage patterns...")
293
-
294
- # Format color data
295
- color_data = self._format_color_data(color_tokens)
296
- semantic_str = self._format_semantic_analysis(semantic_analysis)
297
-
298
- prompt = self.PROMPT_TEMPLATE.format(
299
- color_data=color_data,
300
- semantic_analysis=semantic_str,
301
- )
302
-
303
- try:
304
- start_time = datetime.now()
305
-
306
- response = await self.hf_client.complete_async(
307
- agent_name="brand_identifier",
308
- system_prompt=self.SYSTEM_PROMPT,
309
- user_message=prompt,
310
- max_tokens=1000,
311
- json_mode=True,
312
- )
313
-
314
- duration = (datetime.now() - start_time).total_seconds()
315
-
316
- # Parse response
317
- result = self._parse_response(response)
318
-
319
- log(f" ────────────────────────────────────────────────")
320
- log(f" 🎨 AURORA — Brand Identifier: COMPLETE ({duration:.1f}s)")
321
- log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
322
- log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
323
- log(f" ├─ Palette Strategy: {result.palette_strategy}")
324
- log(f" ├─ Cohesion Score: {result.cohesion_score}/10")
325
- se = result.self_evaluation
326
- if se:
327
- log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
328
-
329
- return result
330
-
331
- except Exception as e:
332
- error_msg = str(e)
333
- # Always log full error for diagnosis
334
- log(f" ⚠️ Brand Identifier failed: {error_msg[:120]}")
335
- if "gated" in error_msg.lower() or "access" in error_msg.lower():
336
- log(f" └─ Model may require license acceptance at huggingface.co")
337
- elif "Rate limit" in error_msg or "429" in error_msg:
338
- log(f" └─ HF free tier rate limit — wait or upgrade to Pro")
339
- return BrandIdentification()
340
-
341
- def _format_color_data(self, color_tokens: dict) -> str:
342
- """Format color tokens for prompt."""
343
- lines = []
344
- for name, token in list(color_tokens.items())[:30]:
345
- if isinstance(token, dict):
346
- hex_val = token.get("value", token.get("hex", ""))
347
- usage = token.get("usage_count", token.get("count", 1))
348
- context = token.get("context", token.get("css_property", ""))
349
- else:
350
- hex_val = getattr(token, "value", "")
351
- usage = getattr(token, "usage_count", 1)
352
- context = getattr(token, "context", "")
353
-
354
- if hex_val:
355
- lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")
356
-
357
- return "\n".join(lines) if lines else "No color data available"
358
-
359
- def _format_semantic_analysis(self, semantic: dict) -> str:
360
- """Format semantic analysis for prompt."""
361
- if not semantic:
362
- return "No semantic analysis available"
363
-
364
- lines = []
365
- try:
366
- for category, value in semantic.items():
367
- if not value:
368
- continue
369
-
370
- if isinstance(value, list):
371
- # List of colors
372
- color_list = []
373
- for c in value[:5]:
374
- if isinstance(c, dict):
375
- color_list.append(c.get("hex", c.get("value", str(c))))
376
- else:
377
- color_list.append(str(c))
378
- lines.append(f"- {category}: {', '.join(color_list)}")
379
-
380
- elif isinstance(value, dict):
381
- # Could be a nested dict of sub-roles → color dicts
382
- # e.g. {"primary": {"hex": "#007bff", ...}, "secondary": {...}}
383
- # or a flat color dict {"hex": "#...", "confidence": "..."}
384
- # or a summary dict {"total_colors_analyzed": 50, ...}
385
- if "hex" in value:
386
- # Flat color dict
387
- lines.append(f"- {category}: {value['hex']}")
388
- else:
389
- # Nested dict — iterate sub-roles
390
- sub_items = []
391
- for sub_role, sub_val in list(value.items())[:5]:
392
- if isinstance(sub_val, dict) and "hex" in sub_val:
393
- sub_items.append(f"{sub_role}={sub_val['hex']}")
394
- elif isinstance(sub_val, (str, int, float, bool)):
395
- sub_items.append(f"{sub_role}={sub_val}")
396
- if sub_items:
397
- lines.append(f"- {category}: {', '.join(sub_items)}")
398
- else:
399
- lines.append(f"- {category}: {value}")
400
- except Exception as e:
401
- return f"Error formatting semantic analysis: {str(e)[:50]}"
402
-
403
- return "\n".join(lines) if lines else "No semantic analysis available"
404
-
405
- def _parse_response(self, response: str) -> BrandIdentification:
406
- """Parse LLM response into BrandIdentification."""
407
- try:
408
- json_match = re.search(r'\{[\s\S]*\}', response)
409
- if json_match:
410
- data = json.loads(json_match.group())
411
- return BrandIdentification(
412
- brand_primary=data.get("brand_primary", {}),
413
- brand_secondary=data.get("brand_secondary", {}),
414
- brand_accent=data.get("brand_accent", {}),
415
- palette_strategy=data.get("palette_strategy", "unknown"),
416
- cohesion_score=data.get("cohesion_score", 5),
417
- cohesion_notes=data.get("cohesion_notes", ""),
418
- semantic_names=data.get("semantic_names", {}),
419
- self_evaluation=data.get("self_evaluation", {}),
420
- )
421
- except Exception:
422
- pass
423
-
424
- return BrandIdentification()
425
-
426
-
427
- # =============================================================================
428
- # BENCHMARK ADVISOR AGENT
429
- # =============================================================================
430
-
431
- class BenchmarkAdvisorAgent:
432
- """
433
- ATLAS — Senior Design System Benchmark Analyst.
434
-
435
- Recommends best-fit design system based on comparison data.
436
- Model: Llama 3.3 70B (128K context for large benchmark data, excellent comparative reasoning)
437
- Temperature: 0.25 (analytical, data-driven comparison)
438
-
439
- WHY LLM: Requires reasoning about trade-offs and use-case fit,
440
- not just similarity scores.
441
- """
442
-
443
- SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst specializing in cross-system comparison and alignment strategy.
444
-
445
- ## YOUR ROLE IN THE PIPELINE
446
- You are Agent 2 of 4 in the Design System Analysis pipeline.
447
- - INPUT: User's extracted type scale, spacing, and font sizes + benchmark comparison data from the Rule Engine
448
- - OUTPUT: Benchmark recommendation with alignment roadmap → feeds into NEXUS (Agent 4) for final synthesis
449
- - Your recommendation helps the user decide which established design system to align with.
450
-
451
- ## YOUR EXPERTISE
452
- - Deep knowledge of Material Design 3, Apple HIG, IBM Carbon, Ant Design, Atlassian, Tailwind CSS, Bootstrap
453
- - Type scale mathematics (major/minor second/third, perfect fourth/fifth, golden ratio)
454
- - Spacing grid systems (4px, 8px, multiples) and their trade-offs
455
- - Migration effort estimation for design system alignment
456
-
457
- ## QUALITY STANDARDS
458
- - Always consider BOTH similarity score AND use-case fit. Closest match ≠ best fit.
459
- - Recommend max 4 alignment changes. More than that = the benchmark is not a good fit.
460
- - Effort estimates must be realistic: "low" = CSS variable change, "medium" = component updates, "high" = layout restructuring.
461
- - If similarity is above 85%, say "already well-aligned" and suggest minimal changes only.
462
-
463
- ## WHAT NOT TO DO
464
- - Don't always recommend the closest match — a system 5% less similar but much better suited is preferable.
465
- - Don't list generic pros/cons. Be specific to the user's actual values.
466
- - Don't suggest alignment changes that would break accessibility (e.g., smaller base font).
467
- - Don't recommend obscure or abandoned design systems.
468
-
469
- ## SCORING RUBRIC (Benchmark Fit):
470
- - Excellent Fit: >85% match, same use-case category, < 3 changes needed
471
- - Good Fit: 70-85% match, compatible use-case, 3-4 changes needed
472
- - Fair Fit: 50-70% match, different trade-offs to consider, 4+ changes
473
- - Poor Fit: <50% match, fundamentally different approach — don't recommend"""
474
-
475
- PROMPT_TEMPLATE = """Analyze the following benchmark comparison data and recommend the best design system alignment.
476
-
477
- ## USER'S CURRENT VALUES
478
-
479
- - Type Scale Ratio: {user_ratio}
480
- - Base Font Size: {user_base}px
481
- - Spacing Grid: {user_spacing}px
482
-
483
- ## BENCHMARK COMPARISON
484
-
485
- {benchmark_comparison}
486
-
487
- ## YOUR TASK
488
-
489
- 1. **Recommend Best Fit**: Which design system should they align with? Consider use-case fit, not just numbers.
490
- 2. **Explain Why**: Cite specific data points (similarity scores, ratio differences, spacing alignment).
491
- 3. **List Changes Needed**: What would they need to change? Include effort estimates.
492
- 4. **Pros/Cons**: Specific to this user's values, not generic statements.
493
- 5. **Self-Evaluate** your recommendation quality.
494
-
495
- ## OUTPUT FORMAT (JSON only)
496
-
497
- {{
498
- "recommended_benchmark": "<system_key>",
499
- "recommended_benchmark_name": "<full name>",
500
- "reasoning": "Why this is the best fit — cite specific data",
501
- "alignment_changes": [
502
- {{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
503
- {{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
504
- ],
505
- "pros_of_alignment": [
506
- "Specific benefit with data"
507
- ],
508
- "cons_of_alignment": [
509
- "Specific trade-off"
510
- ],
511
- "alternative_benchmarks": [
512
- {{"name": "Material Design 3", "reason": "Good for Android-first products"}}
513
- ],
514
- "self_evaluation": {{
515
- "confidence": <1-10>,
516
- "reasoning": "Why I am this confident",
517
- "data_quality": "good|fair|poor",
518
- "flags": []
519
- }}
520
- }}
521
-
522
- Return ONLY valid JSON."""
523
-
524
- def __init__(self, hf_client):
525
- self.hf_client = hf_client
526
-
527
- async def analyze(
528
- self,
529
- user_ratio: float,
530
- user_base: int,
531
- user_spacing: int,
532
- benchmark_comparisons: list,
533
- log_callback: Callable = None,
534
- ) -> BenchmarkAdvice:
535
- """
536
- Recommend best-fit design system.
537
-
538
- Args:
539
- user_ratio: User's detected type scale ratio
540
- user_base: User's base font size
541
- user_spacing: User's spacing grid base
542
- benchmark_comparisons: List of BenchmarkComparison objects
543
- log_callback: Progress logging function
544
-
545
- Returns:
546
- BenchmarkAdvice with recommendations
547
- """
548
- def log(msg: str):
549
- if log_callback:
550
- log_callback(msg)
551
-
552
- log("")
553
- log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)")
554
- log(" └─ Evaluating benchmark fit for your use case...")
555
-
556
- # Format comparison data
557
- comparison_str = self._format_comparisons(benchmark_comparisons)
558
-
559
- prompt = self.PROMPT_TEMPLATE.format(
560
- user_ratio=user_ratio,
561
- user_base=user_base,
562
- user_spacing=user_spacing,
563
- benchmark_comparison=comparison_str,
564
- )
565
-
566
- try:
567
- start_time = datetime.now()
568
-
569
- response = await self.hf_client.complete_async(
570
- agent_name="benchmark_advisor",
571
- system_prompt=self.SYSTEM_PROMPT,
572
- user_message=prompt,
573
- max_tokens=900,
574
- json_mode=True,
575
- )
576
-
577
- duration = (datetime.now() - start_time).total_seconds()
578
-
579
- result = self._parse_response(response)
580
-
581
- log(f" ────────────────────────────────────────────────")
582
- log(f" 🏢 ATLAS — Benchmark Advisor: COMPLETE ({duration:.1f}s)")
583
- log(f" ├─ Recommended: {result.recommended_benchmark_name}")
584
- log(f" ├─ Changes Needed: {len(result.alignment_changes)}")
585
- log(f" ├─ Key Change: {result.alignment_changes[0].get('change', 'N/A') if result.alignment_changes else 'None'}")
586
- se = result.self_evaluation
587
- if se:
588
- log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
589
-
590
- return result
591
-
592
- except Exception as e:
593
- log(f" ├─ ⚠️ Benchmark Advisor failed: {str(e)[:120]}")
594
- return BenchmarkAdvice()
595
-
596
- def _format_comparisons(self, comparisons: list) -> str:
597
- """Format benchmark comparisons for prompt."""
598
- lines = []
599
- for i, c in enumerate(comparisons[:5]):
600
- b = c.benchmark
601
- lines.append(f"""
602
- {i+1}. {b.icon} {b.name}
603
- - Similarity Score: {c.similarity_score:.2f} (lower = better)
604
- - Match: {c.overall_match_pct:.0f}%
605
- - Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
606
- - Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
607
- - Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
608
- - Best For: {', '.join(b.best_for)}""")
609
-
610
- return "\n".join(lines)
611
-
612
- def _parse_response(self, response: str) -> BenchmarkAdvice:
613
- """Parse LLM response into BenchmarkAdvice."""
614
- try:
615
- json_match = re.search(r'\{[\s\S]*\}', response)
616
- if json_match:
617
- data = json.loads(json_match.group())
618
- return BenchmarkAdvice(
619
- recommended_benchmark=data.get("recommended_benchmark", ""),
620
- recommended_benchmark_name=data.get("recommended_benchmark_name", ""),
621
- reasoning=data.get("reasoning", ""),
622
- alignment_changes=data.get("alignment_changes", []),
623
- pros_of_alignment=data.get("pros_of_alignment", []),
624
- cons_of_alignment=data.get("cons_of_alignment", []),
625
- alternative_benchmarks=data.get("alternative_benchmarks", []),
626
- self_evaluation=data.get("self_evaluation", {}),
627
- )
628
- except Exception:
629
- pass
630
-
631
- return BenchmarkAdvice()
632
-
633
-
634
- # =============================================================================
635
- # BEST PRACTICES VALIDATOR AGENT
636
- # =============================================================================
637
-
638
- class BestPracticesValidatorAgent:
639
- """
640
- SENTINEL — Design System Best Practices Auditor.
641
-
642
- Validates against design system standards and prioritizes fixes by business impact.
643
- Model: Qwen 72B (methodical rule-following, precise judgment, structured output)
644
- Temperature: 0.2 (strict, consistent rule evaluation)
645
-
646
- WHY LLM: Prioritization requires judgment about business impact,
647
- not just checking boxes.
648
- """
649
-
650
- SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor specializing in standards compliance and impact-based prioritization.
651
-
652
- ## YOUR ROLE IN THE PIPELINE
653
- You are Agent 3 of 4 in the Design System Analysis pipeline.
654
- - INPUT: Rule Engine analysis results (typography, accessibility, spacing, color stats)
655
- - OUTPUT: Compliance score + prioritized fix list → feeds into NEXUS (Agent 4) for final synthesis
656
- - Your score directly appears on the user's dashboard. Your priority fixes become the action items.
657
-
658
- ## YOUR EXPERTISE
659
- - WCAG 2.1 AA/AAA accessibility standards
660
- - Design system best practices (Material Design, Apple HIG, Tailwind conventions)
661
- - Typography systems (modular scales, vertical rhythm, readability)
662
- - Color management (palette size limits, near-duplicate detection, contrast requirements)
663
- - Spacing systems (grid alignment, consistency, component density)
664
-
665
- ## QUALITY STANDARDS
666
- - Overall Score MUST reflect actual data. Don't default to 50.
667
- - Use the FULL 0-100 range: 90+ = excellent, 70-89 = good, 50-69 = needs work, <50 = significant issues
668
- - Priority fixes must be ACTIONABLE — include specific values to change (e.g., "Change #06b2c4 → #0891a8")
669
- - Maximum 5 priority fixes. If more, focus on highest-impact items.
670
-
671
- ## WHAT NOT TO DO
672
- - Don't pass checks that clearly fail based on the data.
673
- - Don't inflate scores to be "encouraging" — honest assessment helps the user.
674
- - Don't list fixes without effort estimates — the user needs to plan their work.
675
- - Don't mix up "warn" and "fail": warn = imperfect but functional, fail = violates a standard.
676
-
677
- ## SCORING RUBRIC (Overall Score 0-100):
678
- - 90-100: All checks pass, excellent accessibility, clean palette, consistent grid
679
- - 75-89: Most checks pass, minor issues in 1-2 areas, good foundation
680
- - 60-74: Several warnings, 1-2 failures, needs focused improvement
681
- - 40-59: Multiple failures, significant accessibility gaps, inconsistent system
682
- - 20-39: Fundamental issues across multiple areas, major rework needed
683
- - 0-19: Barely qualifies as a design system, almost everything fails
684
-
685
- ## CHECK WEIGHTING:
686
- - AA Compliance: 25 points (most critical — affects real users)
687
- - Type Scale Consistency: 15 points
688
- - Type Scale Standard Ratio: 10 points
689
- - Base Size Accessible: 15 points
690
- - Spacing Grid: 15 points
691
- - Color Count: 10 points
692
- - No Near-Duplicates: 10 points"""
693
-
694
- PROMPT_TEMPLATE = """Validate the following design tokens against best practices and prioritize fixes.
695
-
696
- ## RULE ENGINE ANALYSIS RESULTS
697
-
698
- ### Typography
699
- - Detected Ratio: {type_ratio} ({type_consistent})
700
- - Base Size: {base_size}px
701
- - Recommendation: {type_recommendation}
702
-
703
- ### Accessibility
704
- - Total Colors: {total_colors}
705
- - AA Pass: {aa_pass}
706
- - AA Fail: {aa_fail}
707
- - Failing Colors: {failing_colors}
708
-
709
- ### Spacing
710
- - Detected Base: {spacing_base}px
711
- - Grid Aligned: {spacing_aligned}%
712
- - Recommendation: {spacing_recommendation}px
713
-
714
- ### Color Statistics
715
- - Unique Colors: {unique_colors}
716
- - Duplicates: {duplicates}
717
- - Near-Duplicates: {near_duplicates}
718
-
719
- ## BEST PRACTICES CHECKLIST (check each one)
720
-
721
- 1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
722
- 2. Type scale is consistent (variance < 0.15)
723
- 3. Base font size >= 16px (accessibility)
724
- 4. All interactive colors pass WCAG AA (4.5:1 contrast)
725
- 5. Spacing uses consistent grid (4px or 8px base)
726
- 6. Limited color palette (< 20 unique semantic colors)
727
- 7. No near-duplicate colors (< 3 delta-E apart)
728
-
729
- ## YOUR TASK
730
-
731
- 1. Score each practice: pass/warn/fail with specific notes citing the data
732
- 2. Calculate overall score (0-100) using the weighting rubric
733
- 3. Identify TOP 3-5 priority fixes with impact and effort assessment
734
- 4. Self-evaluate your analysis
735
-
736
- ## OUTPUT FORMAT (JSON only)
737
-
738
- {{
739
- "overall_score": <0-100>,
740
- "checks": {{
741
- "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
742
- "type_scale_consistent": {{"status": "...", "note": "..."}},
743
- "base_size_accessible": {{"status": "...", "note": "..."}},
744
- "aa_compliance": {{"status": "...", "note": "..."}},
745
- "spacing_grid": {{"status": "...", "note": "..."}},
746
- "color_count": {{"status": "...", "note": "..."}},
747
- "near_duplicates": {{"status": "...", "note": "..."}}
748
- }},
749
- "priority_fixes": [
750
- {{
751
- "rank": 1,
752
- "issue": "Brand primary fails AA",
753
- "impact": "high|medium|low",
754
- "effort": "low|medium|high",
755
- "action": "Change #06b2c4 → #0891a8 for 4.5:1 contrast"
756
- }}
757
- ],
758
- "passing_practices": ["Base font size", "..."],
759
- "failing_practices": ["AA compliance", "..."],
760
- "self_evaluation": {{
761
- "confidence": <1-10>,
762
- "reasoning": "Why I am this confident",
763
- "data_quality": "good|fair|poor",
764
- "flags": []
765
- }}
766
- }}
767
-
768
- Return ONLY valid JSON."""
769
-
770
- def __init__(self, hf_client):
771
- self.hf_client = hf_client
772
-
773
- async def analyze(
774
- self,
775
- rule_engine_results: Any,
776
- log_callback: Callable = None,
777
- ) -> BestPracticesResult:
778
- """
779
- Validate against best practices.
780
-
781
- Args:
782
- rule_engine_results: Results from rule engine
783
- log_callback: Progress logging function
784
-
785
- Returns:
786
- BestPracticesResult with validation
787
- """
788
- def log(msg: str):
789
- if log_callback:
790
- log_callback(msg)
791
-
792
- log("")
793
- log(" ✅ SENTINEL — Best Practices Validator (Qwen 72B)")
794
- log(" └─ Checking against design system standards...")
795
-
796
- # Extract data from rule engine
797
- typo = rule_engine_results.typography
798
- spacing = rule_engine_results.spacing
799
- color_stats = rule_engine_results.color_stats
800
- accessibility = rule_engine_results.accessibility
801
-
802
- failures = [a for a in accessibility if not a.passes_aa_normal]
803
- failing_colors_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]])
804
-
805
- prompt = self.PROMPT_TEMPLATE.format(
806
- type_ratio=f"{typo.detected_ratio:.3f}",
807
- type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
808
- base_size=typo.sizes_px[0] if typo.sizes_px else 16,
809
- type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
810
- total_colors=len(accessibility),
811
- aa_pass=len(accessibility) - len(failures),
812
- aa_fail=len(failures),
813
- failing_colors=failing_colors_str or "None",
814
- spacing_base=spacing.detected_base,
815
- spacing_aligned=f"{spacing.alignment_percentage:.0f}",
816
- spacing_recommendation=spacing.recommendation,
817
- unique_colors=color_stats.unique_count,
818
- duplicates=color_stats.duplicate_count,
819
- near_duplicates=len(color_stats.near_duplicates),
820
- )
821
-
822
- try:
823
- start_time = datetime.now()
824
-
825
- response = await self.hf_client.complete_async(
826
- agent_name="best_practices_validator",
827
- system_prompt=self.SYSTEM_PROMPT,
828
- user_message=prompt,
829
- max_tokens=1000,
830
- json_mode=True,
831
- )
832
-
833
- duration = (datetime.now() - start_time).total_seconds()
834
-
835
- result = self._parse_response(response)
836
-
837
- log(f" ────────────────────────────────────────────────")
838
- log(f" ✅ SENTINEL — Best Practices: COMPLETE ({duration:.1f}s)")
839
- log(f" ├─ Overall Score: {result.overall_score}/100")
840
- log(f" ├─ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
841
- if result.priority_fixes:
842
- log(f" ├─ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")
843
- se = result.self_evaluation
844
- if se:
845
- log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
846
-
847
- return result
848
-
849
- except Exception as e:
850
- log(f" ├─ ⚠️ Best Practices Validator failed: {str(e)[:120]}")
851
- return BestPracticesResult()
852
-
853
- def _parse_response(self, response: str) -> BestPracticesResult:
854
- """Parse LLM response into BestPracticesResult."""
855
- try:
856
- json_match = re.search(r'\{[\s\S]*\}', response)
857
- if json_match:
858
- data = json.loads(json_match.group())
859
- return BestPracticesResult(
860
- overall_score=data.get("overall_score", 50),
861
- checks=data.get("checks", {}),
862
- priority_fixes=data.get("priority_fixes", []),
863
- passing_practices=data.get("passing_practices", []),
864
- failing_practices=data.get("failing_practices", []),
865
- self_evaluation=data.get("self_evaluation", {}),
866
- )
867
- except Exception:
868
- pass
869
-
870
- return BestPracticesResult()
871
-
872
-
873
- # =============================================================================
874
- # HEAD SYNTHESIZER AGENT
875
- # =============================================================================
876
-
877
class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect & Synthesizer.

    Combines all agent outputs into final actionable recommendations.
    Model: Llama 3.3 70B (128K context for combined inputs, strong synthesis capability)
    Temperature: 0.3 (balanced — needs to synthesize creatively but stay grounded in data)

    This is the final step that produces actionable output for the user.
    """

    # Fixed persona/instructions sent as the system message on every call.
    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect specializing in synthesis and actionable recommendations.

## YOUR ROLE IN THE PIPELINE
You are Agent 4 of 4 — the HEAD Synthesizer in the Design System Analysis pipeline.
- INPUT: Combined outputs from Rule Engine + AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices)
- OUTPUT: Final executive summary, scores, and prioritized action plan → displayed directly to the user
- You are the LAST agent. Your output IS the final result. Make it count.

## YOUR EXPERTISE
- Design system architecture and governance
- Synthesizing conflicting recommendations into coherent strategy
- Effort/impact prioritization (what to fix first)
- Color accessibility remediation (suggesting AA-compliant alternatives)
- Executive communication (clear, actionable summaries)

## QUALITY STANDARDS
- Executive Summary must be 2-3 sentences MAX. Lead with the overall score, then the #1 issue, then the #1 action.
- Overall Score must SYNTHESIZE all agent inputs — don't just average them.
- Color recommendations must include BOTH current AND suggested hex values.
- Top 3 Actions must be ordered by IMPACT, not ease.
- Accept/reject defaults on color recs: default to "accept" for accessibility fixes, "reject" for purely aesthetic changes.

## WHAT NOT TO DO
- Don't contradict previous agents without explaining why.
- Don't recommend changes that SENTINEL flagged as breaking.
- Don't suggest more than 8 color changes — the user will ignore a long list.
- Don't give vague actions like "improve accessibility" — be specific: "Change brand.primary from #06b2c4 to #0891a8 for 4.5:1 contrast".
- Don't inflate scores to be "nice". If the design system has issues, say so clearly.

## SCORING RUBRIC (Overall 0-100):
- 90-100: Production-ready design system, minor polishing only
- 75-89: Solid foundation, 2-3 targeted improvements needed
- 60-74: Functional but needs focused attention on accessibility or consistency
- 40-59: Significant gaps requiring systematic improvement
- 20-39: Major rework needed across multiple dimensions
- 0-19: Fundamental redesign recommended"""

    # User-message template, filled via .format() in synthesize().
    # Doubled braces ({{ }}) are literal braces in the JSON example; single
    # braces are format placeholders.
    PROMPT_TEMPLATE = """Synthesize all analysis results into a final, actionable design system report.

## RULE ENGINE FACTS (Layer 1 — Free, deterministic)

- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## AURORA — Brand Identification (Agent 1)

- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## ATLAS — Benchmark Advice (Agent 2)

Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## SENTINEL — Best Practices Validation (Agent 3)

Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED

{accessibility_fixes}

## YOUR TASK

Synthesize ALL the above into:
1. Executive Summary (2-3 sentences — lead with score, #1 issue, #1 action)
2. Overall Scores (synthesized, not averaged)
3. Top 3 Priority Actions (ordered by IMPACT, include effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation
7. Self-Evaluation of your synthesis

## OUTPUT FORMAT (JSON only)

{{
  "executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
  "scores": {{
    "overall": <0-100>,
    "accessibility": <0-100>,
    "consistency": <0-100>,
    "organization": <0-100>
  }},
  "benchmark_fit": {{
    "closest": "<name>",
    "similarity": "<X%>",
    "recommendation": "Specific action to align"
  }},
  "brand_analysis": {{
    "primary": "#hex",
    "secondary": "#hex",
    "cohesion": <1-10>
  }},
  "top_3_actions": [
    {{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
  ],
  "color_recommendations": [
    {{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
  ],
  "type_scale_recommendation": {{
    "current_ratio": 1.18,
    "recommended_ratio": 1.25,
    "reason": "Why this ratio is better"
  }},
  "spacing_recommendation": {{
    "current": "mixed",
    "recommended": "8px",
    "reason": "Why this grid is better"
  }},
  "self_evaluation": {{
    "confidence": <1-10>,
    "reasoning": "Why I am this confident in the synthesis",
    "data_quality": "good|fair|poor",
    "flags": []
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # hf_client: async inference client exposing complete_async(...).
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Optional[Callable] = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Layer-1 deterministic output; the
                ``typography``, ``spacing``, ``color_stats``,
                ``accessibility`` and ``consistency_score`` fields are read.
            benchmark_comparisons: Comparison results; the first entry is
                treated as the closest benchmark match (list may be empty).
            brand_identification: AURORA output (brand colors, cohesion).
            benchmark_advice: ATLAS output (alignment changes).
            best_practices: SENTINEL output (score, priority fixes).
            log_callback: Optional progress-logging function.

        Returns:
            HeadSynthesis with the final report; an empty HeadSynthesis on
            any LLM failure (graceful degradation — pipeline still returns).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: NEXUS — HEAD SYNTHESIZER (Llama 3.3 70B)")
        log("═" * 60)
        log("")
        log(" Combining: Rule Engine + AURORA + ATLAS + SENTINEL...")

        # Extract data from the rule-engine result object.
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Colors failing WCAG AA for normal text; include the suggested
        # AA-compliant replacement (at most five, only those with a fix).
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join([
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:5] if a.suggested_fix
        ])

        # First comparison is assumed pre-sorted as the closest match.
        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes="; ".join([c.get("change", "") for c in benchmark_advice.alignment_changes[:3]]),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes="; ".join([f.get("issue", "") for f in best_practices.priority_fixes[:3]]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=1200,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log("")
            log(f" ✅ NEXUS — HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            if result.scores:
                log(f" ├─ Overall Score: {result.scores.get('overall', '?')}/100")
            log(f" ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            se = result.self_evaluation
            if se:
                log(f" └─ Self-Eval: confidence={se.get('confidence', '?')}/10, data={se.get('data_quality', '?')}")
            log("")

            return result

        except Exception as e:
            # Graceful degradation: never crash the pipeline on LLM failure.
            log(f" ├─ ⚠️ Head Synthesizer failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse LLM response into HeadSynthesis.

        Extracts the outermost JSON object from the response text; returns
        an empty HeadSynthesis when no JSON is found or decoding fails.
        """
        try:
            # First '{' to last '}' — models often wrap JSON in prose/fences.
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                data = json.loads(json_match.group())
                return HeadSynthesis(
                    executive_summary=data.get("executive_summary", ""),
                    scores=data.get("scores", {}),
                    benchmark_fit=data.get("benchmark_fit", {}),
                    brand_analysis=data.get("brand_analysis", {}),
                    top_3_actions=data.get("top_3_actions", []),
                    color_recommendations=data.get("color_recommendations", []),
                    type_scale_recommendation=data.get("type_scale_recommendation", {}),
                    spacing_recommendation=data.get("spacing_recommendation", {}),
                    self_evaluation=data.get("self_evaluation", {}),
                )
        except Exception:
            # Best-effort parse; fall through to the empty default.
            pass

        return HeadSynthesis()