riazmo commited on
Commit
a6c864a
·
verified ·
1 Parent(s): c653719

Upload 2 files

Browse files
Files changed (2) hide show
  1. agents/benchmark_researcher.py +717 -0
  2. agents/llm_agents.py +865 -0
agents/benchmark_researcher.py ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark Research Agent
3
+ =========================
4
+ Fetches LIVE data from design system documentation sites
5
+ using Firecrawl, with 24-hour caching.
6
+
7
+ This agent:
8
+ 1. Fetches official documentation from design system sites
9
+ 2. Extracts typography, spacing, color specifications using LLM
10
+ 3. Caches results for 24 hours
11
+ 4. Compares user's tokens to researched benchmarks
12
+ """
13
+
14
+ import asyncio
15
+ import json
16
+ import os
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime, timedelta
19
+ from typing import Optional, Callable
20
+ import hashlib
21
+
22
+
23
+ # =============================================================================
24
+ # DESIGN SYSTEM SOURCES (Official Documentation URLs)
25
+ # =============================================================================
26
+
27
# Registry of supported design systems.  Each entry maps a stable key to:
#   name / short_name / vendor / icon — display metadata used in logs and UI,
#   urls     — official documentation pages keyed by spec category
#              (typography / spacing / colors),
#   best_for — product contexts where the system is typically a good fit.
# NOTE: not every system lists every url category (e.g. "apple_hig" has no
# "colors" entry), so consumers must treat categories as optional.
DESIGN_SYSTEM_SOURCES = {
    "material_design_3": {
        "name": "Material Design 3",
        "short_name": "Material 3",
        "vendor": "Google",
        "urls": {
            "typography": "https://m3.material.io/styles/typography/type-scale-tokens",
            "spacing": "https://m3.material.io/foundations/layout/understanding-layout/spacing",
            "colors": "https://m3.material.io/styles/color/the-color-system/key-colors-tones",
        },
        "best_for": ["Android apps", "Web apps", "Enterprise software"],
        "icon": "🟢",
    },
    "apple_hig": {
        "name": "Apple Human Interface Guidelines",
        "short_name": "Apple HIG",
        "vendor": "Apple",
        "urls": {
            # No "colors" page for this system — see NOTE above.
            "typography": "https://developer.apple.com/design/human-interface-guidelines/typography",
            "spacing": "https://developer.apple.com/design/human-interface-guidelines/layout",
        },
        "best_for": ["iOS apps", "macOS apps", "Premium consumer products"],
        "icon": "🍎",
    },
    "shopify_polaris": {
        "name": "Shopify Polaris",
        "short_name": "Polaris",
        "vendor": "Shopify",
        "urls": {
            "typography": "https://polaris.shopify.com/design/typography",
            "spacing": "https://polaris.shopify.com/design/spacing",
            "colors": "https://polaris.shopify.com/design/colors",
        },
        "best_for": ["E-commerce", "Admin dashboards", "Merchant tools"],
        "icon": "🛒",
    },
    "atlassian_design": {
        "name": "Atlassian Design System",
        "short_name": "Atlassian",
        "vendor": "Atlassian",
        "urls": {
            "typography": "https://atlassian.design/foundations/typography",
            "spacing": "https://atlassian.design/foundations/spacing",
            "colors": "https://atlassian.design/foundations/color",
        },
        "best_for": ["Productivity tools", "Dense interfaces", "Enterprise B2B"],
        "icon": "🔵",
    },
    "ibm_carbon": {
        "name": "IBM Carbon Design System",
        "short_name": "Carbon",
        "vendor": "IBM",
        "urls": {
            "typography": "https://carbondesignsystem.com/guidelines/typography/overview",
            "spacing": "https://carbondesignsystem.com/guidelines/spacing/overview",
            "colors": "https://carbondesignsystem.com/guidelines/color/overview",
        },
        "best_for": ["Enterprise software", "Data-heavy applications", "IBM products"],
        "icon": "🔷",
    },
    "tailwind_css": {
        "name": "Tailwind CSS",
        "short_name": "Tailwind",
        "vendor": "Tailwind Labs",
        "urls": {
            "typography": "https://tailwindcss.com/docs/font-size",
            "spacing": "https://tailwindcss.com/docs/customizing-spacing",
            "colors": "https://tailwindcss.com/docs/customizing-colors",
        },
        "best_for": ["Web applications", "Startups", "Rapid prototyping"],
        "icon": "🌊",
    },
    "ant_design": {
        "name": "Ant Design",
        "short_name": "Ant Design",
        "vendor": "Ant Group",
        "urls": {
            "typography": "https://ant.design/docs/spec/font",
            "spacing": "https://ant.design/docs/spec/layout",
            "colors": "https://ant.design/docs/spec/colors",
        },
        "best_for": ["Enterprise B2B", "Admin panels", "Chinese market"],
        "icon": "🐜",
    },
    "chakra_ui": {
        "name": "Chakra UI",
        "short_name": "Chakra",
        "vendor": "Chakra UI",
        "urls": {
            "typography": "https://chakra-ui.com/docs/styled-system/theme#typography",
            "spacing": "https://chakra-ui.com/docs/styled-system/theme#spacing",
            "colors": "https://chakra-ui.com/docs/styled-system/theme#colors",
        },
        "best_for": ["React applications", "Startups", "Accessible products"],
        "icon": "⚡",
    },
}
124
+
125
+
126
+ # =============================================================================
127
+ # DATA CLASSES
128
+ # =============================================================================
129
+
130
@dataclass
class BenchmarkData:
    """Researched benchmark data from a design system.

    Instances are serialized to the JSON cache via :meth:`to_dict` and
    rebuilt from it by ``BenchmarkCache.get``, so ``to_dict`` must emit
    every field the cache reader consumes.
    """
    key: str
    name: str
    short_name: str
    vendor: str
    icon: str

    # Extracted specifications
    typography: dict = field(default_factory=dict)
    # Expected: {scale_ratio, base_size, sizes[], font_family, line_height_body}

    spacing: dict = field(default_factory=dict)
    # Expected: {base, scale[], grid}

    colors: dict = field(default_factory=dict)
    # Expected: {palette_size, uses_ramps, ramp_steps}

    # Metadata
    fetched_at: str = ""  # ISO timestamp; drives the cache TTL
    confidence: str = "low"  # high, medium, low (or "fallback")
    source_urls: list = field(default_factory=list)
    best_for: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (the cache format).

        Fix: ``source_urls`` is now included — previously it was dropped
        here even though ``BenchmarkCache.get`` reads it back, so cached
        entries silently lost their source URLs on round-trip.
        """
        return {
            "key": self.key,
            "name": self.name,
            "short_name": self.short_name,
            "vendor": self.vendor,
            "icon": self.icon,
            "typography": self.typography,
            "spacing": self.spacing,
            "colors": self.colors,
            "fetched_at": self.fetched_at,
            "confidence": self.confidence,
            "source_urls": self.source_urls,
            "best_for": self.best_for,
        }
169
+
170
+
171
@dataclass
class BenchmarkComparison:
    """Comparison result between user's tokens and a benchmark."""
    benchmark: BenchmarkData
    similarity_score: float  # Lower = more similar

    # Individual comparisons
    type_ratio_diff: float
    base_size_diff: int
    spacing_grid_diff: int

    # Match percentages
    type_match_pct: float
    spacing_match_pct: float
    overall_match_pct: float

    def to_dict(self) -> dict:
        """Flatten this comparison into a JSON-friendly dict.

        Rounding: similarity to 2 decimals, ratio diff to 3, and all
        match percentages to 1.
        """
        bench = self.benchmark

        # Per-dimension diffs and match percentages.
        comparison = {
            "type_ratio": {
                "diff": round(self.type_ratio_diff, 3),
                "match_pct": round(self.type_match_pct, 1),
            },
            "base_size": {
                "diff": self.base_size_diff,
            },
            "spacing_grid": {
                "diff": self.spacing_grid_diff,
                "match_pct": round(self.spacing_match_pct, 1),
            },
        }

        # Reference values pulled from the benchmark's researched specs.
        benchmark_values = {
            "type_ratio": bench.typography.get("scale_ratio"),
            "base_size": bench.typography.get("base_size"),
            "spacing_grid": bench.spacing.get("base"),
        }

        return {
            "name": bench.name,
            "short_name": bench.short_name,
            "icon": bench.icon,
            "similarity_score": round(self.similarity_score, 2),
            "overall_match_pct": round(self.overall_match_pct, 1),
            "comparison": comparison,
            "benchmark_values": benchmark_values,
            "best_for": bench.best_for,
            "confidence": bench.confidence,
        }
215
+
216
+
217
+ # =============================================================================
218
+ # CACHE MANAGER
219
+ # =============================================================================
220
+
221
class BenchmarkCache:
    """Manages 24-hour caching of benchmark research results.

    All entries live in a single JSON file (``storage/benchmark_cache.json``
    by default). Each entry is a serialized ``BenchmarkData`` dict whose
    ``fetched_at`` ISO timestamp enforces the TTL.
    """

    # Cache time-to-live; entries older than this are treated as misses.
    TTL = timedelta(hours=24)

    def __init__(self, cache_dir: Optional[str] = None):
        """
        Args:
            cache_dir: Directory holding the cache file. Defaults to the
                ``storage/`` directory that sits next to this package.
        """
        if cache_dir is None:
            cache_dir = os.path.join(os.path.dirname(__file__), "..", "storage")
        self.cache_file = os.path.join(cache_dir, "benchmark_cache.json")
        self._ensure_cache_dir()

    def _ensure_cache_dir(self) -> None:
        """Ensure cache directory exists."""
        os.makedirs(os.path.dirname(self.cache_file), exist_ok=True)

    def _load_cache(self) -> dict:
        """Load the full cache dict from disk; empty dict on any read error."""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            except (OSError, json.JSONDecodeError):
                # Unreadable or corrupt cache file — a stale cache is not
                # worth crashing over; behave as if empty.
                return {}
        return {}

    def _save_cache(self, cache: dict) -> None:
        """Persist the full cache dict; caching is deliberately best-effort."""
        try:
            with open(self.cache_file, 'w') as f:
                json.dump(cache, f, indent=2)
        except (OSError, TypeError):
            # A failed write only costs a re-fetch next run.
            pass

    @staticmethod
    def _parse_timestamp(value) -> Optional[datetime]:
        """Parse an ISO timestamp; None when missing or malformed.

        Previously a malformed ``fetched_at`` string crashed ``get`` /
        ``get_cache_status`` with ValueError; now it degrades to a miss.
        """
        try:
            return datetime.fromisoformat(value)
        except (TypeError, ValueError):
            return None

    def get(self, key: str) -> Optional["BenchmarkData"]:
        """Get cached benchmark if valid (< 24 hours old), else None."""
        cache = self._load_cache()

        if key not in cache:
            return None

        entry = cache[key]
        fetched_at = self._parse_timestamp(entry.get("fetched_at", ""))

        # Missing/malformed timestamp or expired entry -> cache miss.
        if fetched_at is None or datetime.now() - fetched_at > self.TTL:
            return None

        # Reconstruct BenchmarkData, falling back to the static source
        # registry for display metadata missing from older cache entries.
        source = DESIGN_SYSTEM_SOURCES.get(key, {})
        return BenchmarkData(
            key=key,
            name=entry.get("name", source.get("name", key)),
            short_name=entry.get("short_name", source.get("short_name", key)),
            vendor=entry.get("vendor", source.get("vendor", "")),
            icon=entry.get("icon", source.get("icon", "📦")),
            typography=entry.get("typography", {}),
            spacing=entry.get("spacing", {}),
            colors=entry.get("colors", {}),
            fetched_at=entry.get("fetched_at", ""),
            confidence=entry.get("confidence", "low"),
            source_urls=entry.get("source_urls", []),
            best_for=entry.get("best_for", source.get("best_for", [])),
        )

    def set(self, key: str, data: "BenchmarkData") -> None:
        """Cache benchmark data (read-modify-write of the whole file)."""
        cache = self._load_cache()
        cache[key] = data.to_dict()
        self._save_cache(cache)

    def get_cache_status(self) -> dict:
        """Report {cached, valid, age_hours} for every known system key."""
        cache = self._load_cache()
        status = {}

        for key in DESIGN_SYSTEM_SOURCES.keys():
            entry = cache.get(key)
            fetched_at = (
                self._parse_timestamp(entry.get("fetched_at", ""))
                if entry is not None else None
            )
            if fetched_at is not None:
                age_hours = (datetime.now() - fetched_at).total_seconds() / 3600
                status[key] = {
                    "cached": True,
                    "valid": datetime.now() - fetched_at < self.TTL,
                    "age_hours": round(age_hours, 1),
                }
            elif entry is not None:
                # Entry exists but its timestamp is unusable: cached, stale.
                status[key] = {"cached": True, "valid": False}
            else:
                status[key] = {"cached": False, "valid": False}

        return status
308
+
309
+
310
+ # =============================================================================
311
+ # FALLBACK DATA (Used when research fails)
312
+ # =============================================================================
313
+
314
# Static snapshot of each system's core metrics, used whenever live research
# (Firecrawl fetch + LLM extraction) is unavailable or fails.
# Shapes mirror BenchmarkData.typography / .spacing / .colors:
#   typography: {scale_ratio, base_size, font_family, line_height_body}
#   spacing:    {base, scale (px steps), grid (label)}
#   colors:     {palette_size, uses_ramps}
# NOTE(review): values look like approximations of the published guidelines —
# verify against the official docs in DESIGN_SYSTEM_SOURCES before relying
# on any single number.
FALLBACK_BENCHMARKS = {
    "material_design_3": {
        "typography": {"scale_ratio": 1.2, "base_size": 16, "font_family": "Roboto", "line_height_body": 1.5},
        "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 48, 64], "grid": "8px"},
        "colors": {"palette_size": 13, "uses_ramps": True},
    },
    "apple_hig": {
        "typography": {"scale_ratio": 1.19, "base_size": 17, "font_family": "SF Pro", "line_height_body": 1.47},
        "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40], "grid": "4px"},
        "colors": {"palette_size": 9, "uses_ramps": True},
    },
    "shopify_polaris": {
        "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "Inter", "line_height_body": 1.5},
        "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64], "grid": "4px"},
        "colors": {"palette_size": 11, "uses_ramps": True},
    },
    "atlassian_design": {
        "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "Inter", "line_height_body": 1.43},
        "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"},
        "colors": {"palette_size": 15, "uses_ramps": True},
    },
    "ibm_carbon": {
        "typography": {"scale_ratio": 1.25, "base_size": 14, "font_family": "IBM Plex Sans", "line_height_body": 1.5},
        "spacing": {"base": 8, "scale": [0, 2, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"},
        "colors": {"palette_size": 12, "uses_ramps": True},
    },
    "tailwind_css": {
        "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5},
        "spacing": {"base": 4, "scale": [0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32], "grid": "4px"},
        "colors": {"palette_size": 22, "uses_ramps": True},
    },
    "ant_design": {
        "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "system-ui", "line_height_body": 1.57},
        "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48], "grid": "8px"},
        "colors": {"palette_size": 13, "uses_ramps": True},
    },
    "chakra_ui": {
        "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5},
        "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 56, 64], "grid": "4px"},
        "colors": {"palette_size": 15, "uses_ramps": True},
    },
}
356
+
357
+
358
+ # =============================================================================
359
+ # BENCHMARK RESEARCHER
360
+ # =============================================================================
361
+
362
class BenchmarkResearcher:
    """
    Research agent that fetches live design system specifications.

    Uses Firecrawl to fetch documentation and LLM to extract specs.
    Results are cached for 24 hours.

    Both clients are optional: without Firecrawl no pages are fetched, and
    without the HF client fetched pages cannot be distilled into specs —
    either way the agent falls back to the static FALLBACK_BENCHMARKS table.
    """

    def __init__(self, firecrawl_client=None, hf_client=None):
        """
        Initialize researcher.

        Args:
            firecrawl_client: Firecrawl API client for fetching docs
                (must expose ``scrape_url``; None disables live fetching)
            hf_client: HuggingFace client for LLM extraction
                (must expose ``complete_async``; None disables extraction)
        """
        self.firecrawl = firecrawl_client
        self.hf_client = hf_client
        self.cache = BenchmarkCache()

    async def research_benchmark(
        self,
        system_key: str,
        log_callback: Optional[Callable] = None,
        force_refresh: bool = False,
    ) -> BenchmarkData:
        """
        Research a specific design system.

        Args:
            system_key: Key from DESIGN_SYSTEM_SOURCES
            log_callback: Function to log progress (called with one str)
            force_refresh: Bypass cache and fetch fresh

        Returns:
            BenchmarkData with extracted specifications; the result is
            written to the 24h cache before returning.

        Raises:
            ValueError: If system_key is not in DESIGN_SYSTEM_SOURCES.
        """
        # Small shim so every log site can stay unconditional.
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        if system_key not in DESIGN_SYSTEM_SOURCES:
            raise ValueError(f"Unknown design system: {system_key}")

        source = DESIGN_SYSTEM_SOURCES[system_key]

        # Check cache first (unless force refresh)
        if not force_refresh:
            cached = self.cache.get(system_key)
            if cached:
                log(f" ├─ {source['icon']} {source['short_name']}: Using cached data ✅")
                return cached

        log(f" ├─ {source['icon']} {source['short_name']}: Fetching documentation...")

        # Try to fetch and extract. Confidence ladder:
        #   low (nothing fetched) -> medium (typography fetched)
        #   -> high (typography AND spacing fetched) -> "fallback" (static data).
        raw_content = ""
        confidence = "low"

        if self.firecrawl:
            try:
                # Fetch typography docs (truncated to bound LLM prompt size)
                typo_url = source["urls"].get("typography")
                if typo_url:
                    log(f" │ ├─ Fetching {typo_url[:50]}...")
                    typo_content = await self._fetch_url(typo_url)
                    if typo_content:
                        raw_content += f"\n\n=== TYPOGRAPHY ===\n{typo_content[:4000]}"
                        confidence = "medium"

                # Fetch spacing docs
                spacing_url = source["urls"].get("spacing")
                if spacing_url:
                    log(f" │ ├─ Fetching spacing docs...")
                    spacing_content = await self._fetch_url(spacing_url)
                    if spacing_content:
                        raw_content += f"\n\n=== SPACING ===\n{spacing_content[:3000]}"
                        if confidence == "medium":
                            confidence = "high"

            except Exception as e:
                # Best-effort: a failed fetch degrades to fallback data below.
                log(f" │ ├─ ⚠️ Fetch error: {str(e)[:50]}")

        # Extract specs with LLM (or use fallback)
        if raw_content and self.hf_client:
            log(f" │ ├─ Extracting specifications...")
            extracted = await self._extract_specs_with_llm(source["name"], raw_content)
        else:
            log(f" │ ├─ Using fallback data (fetch unavailable)")
            extracted = FALLBACK_BENCHMARKS.get(system_key, {})
            confidence = "fallback"

        # Build result. Each spec section independently falls back to the
        # static table when the LLM extraction left it empty/missing.
        result = BenchmarkData(
            key=system_key,
            name=source["name"],
            short_name=source["short_name"],
            vendor=source["vendor"],
            icon=source["icon"],
            typography=extracted.get("typography", FALLBACK_BENCHMARKS.get(system_key, {}).get("typography", {})),
            spacing=extracted.get("spacing", FALLBACK_BENCHMARKS.get(system_key, {}).get("spacing", {})),
            colors=extracted.get("colors", FALLBACK_BENCHMARKS.get(system_key, {}).get("colors", {})),
            fetched_at=datetime.now().isoformat(),
            confidence=confidence,
            source_urls=list(source["urls"].values()),
            best_for=source["best_for"],
        )

        # Cache result
        self.cache.set(system_key, result)

        # Summarize the headline numbers in the log line.
        ratio = result.typography.get("scale_ratio", "?")
        base = result.typography.get("base_size", "?")
        grid = result.spacing.get("base", "?")
        log(f" │ └─ ✅ ratio={ratio}, base={base}px, grid={grid}px [{confidence}]")

        return result

    async def _fetch_url(self, url: str) -> Optional[str]:
        """Fetch URL content using Firecrawl.

        Returns the page as markdown (preferred) or raw content, or None
        when no client is configured, the response has neither field, or
        the request fails.
        """
        if not self.firecrawl:
            return None

        try:
            # Firecrawl scrape
            # NOTE(review): scrape_url is called synchronously inside an
            # async method — presumably the client is sync and fast enough;
            # confirm it does not block the event loop for long fetches.
            result = self.firecrawl.scrape_url(
                url,
                params={"formats": ["markdown"]}
            )

            if result and result.get("markdown"):
                return result["markdown"]
            elif result and result.get("content"):
                return result["content"]

        except Exception as e:
            # Deliberate best-effort: any scrape failure means "no content".
            pass

        return None

    async def _extract_specs_with_llm(self, system_name: str, raw_content: str) -> dict:
        """Extract structured specs from documentation using LLM.

        Returns a dict with optional "typography"/"spacing"/"colors" keys,
        or {} when no client is configured, the response contains no JSON,
        or the call/parse fails.
        """
        if not self.hf_client:
            return {}

        prompt = f"""Extract the design system specifications from this documentation.

DESIGN SYSTEM: {system_name}

DOCUMENTATION:
{raw_content[:6000]}

Return ONLY a JSON object with these exact fields (use null if not found):
{{
"typography": {{
"scale_ratio": <number like 1.2 or 1.25>,
"base_size": <number in px>,
"font_family": "<font name>",
"sizes": [<list of sizes in px>],
"line_height_body": <number like 1.5>
}},
"spacing": {{
"base": <base unit in px like 4 or 8>,
"scale": [<spacing values>],
"grid": "<description>"
}},
"colors": {{
"palette_size": <number>,
"uses_ramps": <true/false>
}}
}}

Return ONLY valid JSON, no explanation."""

        try:
            response = await self.hf_client.complete_async(
                agent_name="benchmark_extractor",
                system_prompt="You are a design system specification extractor. Extract only the factual specifications.",
                user_message=prompt,
                max_tokens=600,
                json_mode=True,
            )

            # Parse JSON from response: grab the outermost {...} span in
            # case the model wrapped the JSON in prose despite json_mode.
            # (local import — `re` is not imported at module level)
            import re
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                return json.loads(json_match.group())

        except Exception as e:
            # Best-effort: extraction failure falls through to {} so the
            # caller substitutes FALLBACK_BENCHMARKS values.
            pass

        return {}

    async def research_selected_benchmarks(
        self,
        selected_keys: list[str],
        log_callback: Optional[Callable] = None,
    ) -> list[BenchmarkData]:
        """
        Research multiple selected design systems.

        Unknown keys are silently skipped; per-system research errors are
        logged and replaced with fallback data, so the returned list never
        fails wholesale because of one system.

        Args:
            selected_keys: List of system keys to research
            log_callback: Function to log progress

        Returns:
            List of BenchmarkData (one per recognized key)
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🔬 LAYER 2: BENCHMARK RESEARCH (Firecrawl + Cache)")
        log("═" * 60)
        log("")
        log(f" Selected systems: {', '.join(selected_keys)}")
        log("")

        results = []

        for key in selected_keys:
            if key in DESIGN_SYSTEM_SOURCES:
                try:
                    result = await self.research_benchmark(key, log_callback)
                    results.append(result)
                except Exception as e:
                    log(f" ├─ ⚠️ Error researching {key}: {e}")
                    # Use fallback
                    source = DESIGN_SYSTEM_SOURCES[key]
                    fallback = FALLBACK_BENCHMARKS.get(key, {})
                    results.append(BenchmarkData(
                        key=key,
                        name=source["name"],
                        short_name=source["short_name"],
                        vendor=source["vendor"],
                        icon=source["icon"],
                        typography=fallback.get("typography", {}),
                        spacing=fallback.get("spacing", {}),
                        colors=fallback.get("colors", {}),
                        fetched_at=datetime.now().isoformat(),
                        confidence="fallback",
                        best_for=source["best_for"],
                    ))

        log("")
        log(f" ✅ Researched {len(results)}/{len(selected_keys)} design systems")

        return results

    def compare_to_benchmarks(
        self,
        your_ratio: float,
        your_base_size: int,
        your_spacing_grid: int,
        benchmarks: list[BenchmarkData],
        log_callback: Optional[Callable] = None,
    ) -> list[BenchmarkComparison]:
        """
        Compare user's tokens to researched benchmarks.

        Args:
            your_ratio: Detected type scale ratio
            your_base_size: Detected base font size
            your_spacing_grid: Detected spacing grid base
            benchmarks: List of researched BenchmarkData
            log_callback: Function to log progress

        Returns:
            List of BenchmarkComparison sorted by similarity (best first);
            benchmarks missing a metric are compared against defaults
            (ratio 1.25, base 16px, grid 8px).
        """
        def log(msg: str) -> None:
            if log_callback:
                log_callback(msg)

        log("")
        log(" 📊 BENCHMARK COMPARISON")
        log(" " + "─" * 40)
        log(f" Your values: ratio={your_ratio:.2f}, base={your_base_size}px, grid={your_spacing_grid}px")
        log("")

        comparisons = []

        for b in benchmarks:
            # Defaults stand in for specs the research step could not fill.
            b_ratio = b.typography.get("scale_ratio", 1.25)
            b_base = b.typography.get("base_size", 16)
            b_grid = b.spacing.get("base", 8)

            # Calculate differences
            ratio_diff = abs(your_ratio - b_ratio)
            base_diff = abs(your_base_size - b_base)
            grid_diff = abs(your_spacing_grid - b_grid)

            # Calculate match percentages
            type_match = max(0, 100 - (ratio_diff * 100))  # 0.1 diff = 90% match
            spacing_match = max(0, 100 - (grid_diff * 10))  # 4px diff = 60% match

            # Weighted similarity score (lower = more similar);
            # ratio dominates, then base size, then grid.
            similarity = (ratio_diff * 10) + (base_diff * 0.5) + (grid_diff * 0.3)

            # Overall match percentage: 50% type, 30% spacing, 20% base size,
            # clamped to [0, 100].
            overall_match = (type_match * 0.5) + (spacing_match * 0.3) + (100 - base_diff * 5) * 0.2
            overall_match = max(0, min(100, overall_match))

            comparisons.append(BenchmarkComparison(
                benchmark=b,
                similarity_score=similarity,
                type_ratio_diff=ratio_diff,
                base_size_diff=base_diff,
                spacing_grid_diff=grid_diff,
                type_match_pct=type_match,
                spacing_match_pct=spacing_match,
                overall_match_pct=overall_match,
            ))

        # Sort by similarity (lower = better)
        comparisons.sort(key=lambda x: x.similarity_score)

        # Log results: medals for the top 3, at most 5 lines.
        medals = ["🥇", "🥈", "🥉"]
        for i, c in enumerate(comparisons[:5]):
            medal = medals[i] if i < 3 else " "
            b = c.benchmark
            log(f" {medal} {b.icon} {b.short_name}: {c.overall_match_pct:.0f}% match (score: {c.similarity_score:.2f})")
            log(f" └─ ratio={b.typography.get('scale_ratio')}, base={b.typography.get('base_size')}px, grid={b.spacing.get('base')}px")

        return comparisons
691
+
692
+
693
+ # =============================================================================
694
+ # HELPER FUNCTIONS
695
+ # =============================================================================
696
+
697
def get_available_benchmarks() -> list[dict]:
    """Get list of available design systems for UI dropdown.

    Each item carries the registry key plus the display metadata
    (name, short_name, icon, vendor, best_for) from DESIGN_SYSTEM_SOURCES.
    """
    metadata_keys = ("name", "short_name", "icon", "vendor", "best_for")
    return [
        {"key": key, **{k: source[k] for k in metadata_keys}}
        for key, source in DESIGN_SYSTEM_SOURCES.items()
    ]
710
+
711
+
712
def get_benchmark_choices() -> list[tuple[str, str]]:
    """Get choices for Gradio dropdown.

    Returns (label, value) pairs where the label is
    "<icon> <short_name> (<vendor>)" and the value is the registry key.
    """
    choices = []
    for key, source in DESIGN_SYSTEM_SOURCES.items():
        label = f"{source['icon']} {source['short_name']} ({source['vendor']})"
        choices.append((label, key))
    return choices
agents/llm_agents.py ADDED
@@ -0,0 +1,865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 2 LLM Agents — Specialized Analysis Tasks
3
+ =================================================
4
+
5
+ These agents handle tasks that REQUIRE LLM reasoning:
6
+ - Brand Identifier: Identify brand colors from usage context
7
+ - Benchmark Advisor: Recommend best-fit design system
8
+ - Best Practices Validator: Prioritize fixes by business impact
9
+ - HEAD Synthesizer: Combine all outputs into final recommendations
10
+
11
+ Each agent has a focused prompt for its specific task.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Optional, Callable, Any
18
+ from datetime import datetime
19
+
20
+
21
+ # =============================================================================
22
+ # DATA CLASSES
23
+ # =============================================================================
24
+
25
@dataclass
class BrandIdentification:
    """Results from Brand Identifier agent."""
    brand_primary: dict = field(default_factory=dict)
    # {color, confidence, reasoning, usage_count}

    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)

    palette_strategy: str = ""  # complementary, analogous, triadic, monochromatic, random
    cohesion_score: int = 5  # 1-10
    cohesion_notes: str = ""

    semantic_names: dict = field(default_factory=dict)
    # {hex_color: suggested_name}

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "brand_primary",
            "brand_secondary",
            "brand_accent",
            "palette_strategy",
            "cohesion_score",
            "cohesion_notes",
            "semantic_names",
        )
        return {attr: getattr(self, attr) for attr in exported}
51
+
52
+
53
@dataclass
class BenchmarkAdvice:
    """Results from Benchmark Advisor agent."""
    recommended_benchmark: str = ""
    recommended_benchmark_name: str = ""
    reasoning: str = ""

    alignment_changes: list = field(default_factory=list)
    # [{change, from, to, effort}]

    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)

    alternative_benchmarks: list = field(default_factory=list)
    # [{name, reason}]

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        Note the shortened output keys: pros_of_alignment -> "pros",
        cons_of_alignment -> "cons", alternative_benchmarks -> "alternatives".
        """
        out = {
            "recommended_benchmark": self.recommended_benchmark,
            "recommended_benchmark_name": self.recommended_benchmark_name,
            "reasoning": self.reasoning,
            "alignment_changes": self.alignment_changes,
        }
        out["pros"] = self.pros_of_alignment
        out["cons"] = self.cons_of_alignment
        out["alternatives"] = self.alternative_benchmarks
        return out
79
+
80
+
81
@dataclass
class BestPracticesResult:
    """Results from Best Practices Validator agent."""
    overall_score: int = 50  # 0-100

    checks: dict = field(default_factory=dict)
    # {check_name: {status: pass/warn/fail, note: str}}

    priority_fixes: list = field(default_factory=list)
    # [{rank, issue, impact, effort, action}]

    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        The practice lists are exported under shortened keys:
        passing_practices -> "passing", failing_practices -> "failing".
        """
        return dict(
            overall_score=self.overall_score,
            checks=self.checks,
            priority_fixes=self.priority_fixes,
            passing=self.passing_practices,
            failing=self.failing_practices,
        )
103
+
104
+
105
@dataclass
class HeadSynthesis:
    """Final synthesized output from HEAD agent."""
    executive_summary: str = ""

    scores: dict = field(default_factory=dict)
    # {overall, accessibility, consistency, organization}

    benchmark_fit: dict = field(default_factory=dict)
    # {closest, similarity, recommendation}

    brand_analysis: dict = field(default_factory=dict)
    # {primary, secondary, cohesion}

    top_3_actions: list = field(default_factory=list)
    # [{action, impact, effort, details}]

    color_recommendations: list = field(default_factory=list)
    # [{role, current, suggested, reason, accept}]

    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; keys mirror the field names."""
        exported = (
            "executive_summary",
            "scores",
            "benchmark_fit",
            "brand_analysis",
            "top_3_actions",
            "color_recommendations",
            "type_scale_recommendation",
            "spacing_recommendation",
        )
        return {attr: getattr(self, attr) for attr in exported}
139
+
140
+
141
+ # =============================================================================
142
+ # BRAND IDENTIFIER AGENT
143
+ # =============================================================================
144
+
145
class BrandIdentifierAgent:
    """
    Identifies brand colors from usage context.

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a senior design system analyst. Identify the brand colors from this color usage data.

## COLOR DATA WITH USAGE CONTEXT

{color_data}

## SEMANTIC ANALYSIS (from CSS properties)

{semantic_analysis}

## YOUR TASK

1. **Identify Brand Colors**:
   - Brand Primary: The main action/CTA color (highest visibility)
   - Brand Secondary: Supporting brand color
   - Brand Accent: Highlight color for emphasis

2. **Assess Palette Strategy**:
   - Is it complementary, analogous, triadic, monochromatic, or random?

3. **Rate Cohesion** (1-10):
   - Do the colors work together?
   - Is there a clear color story?

4. **Suggest Semantic Names** for top 10 most-used colors

## OUTPUT FORMAT (JSON only)

{{
  "brand_primary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "Why this is brand primary",
    "usage_count": <number>
  }},
  "brand_secondary": {{
    "color": "#hex",
    "confidence": "high|medium|low",
    "reasoning": "..."
  }},
  "brand_accent": {{
    "color": "#hex or null",
    "confidence": "...",
    "reasoning": "..."
  }},
  "palette_strategy": "complementary|analogous|triadic|monochromatic|random",
  "cohesion_score": <1-10>,
  "cohesion_notes": "Assessment of how well colors work together",
  "semantic_names": {{
    "#hex1": "brand.primary",
    "#hex2": "text.primary",
    "#hex3": "background.primary"
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Callable = None,
    ) -> BrandIdentification:
        """
        Identify brand colors from usage context.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors; empty defaults on any
            failure (this agent is best-effort and never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log(" 🎨 Brand Identifier (Llama 70B)")
        log(" └─ Analyzing color context and usage patterns...")

        # Render structured inputs into prompt-friendly text before spending
        # an LLM call, so formatting failures surface early.
        color_data = self._format_color_data(color_tokens)
        semantic_str = self._format_semantic_analysis(semantic_analysis)

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=color_data,
            semantic_analysis=semantic_str,
        )

        try:
            start_time = datetime.now()

            # Use the correct method signature
            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt="You are a senior design system analyst specializing in brand color identification.",
                user_message=prompt,
                max_tokens=800,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            # Parse response
            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🎨 Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ├─ Palette Strategy: {result.palette_strategy}")
            log(f" └─ Cohesion Score: {result.cohesion_score}/10")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Render up to 30 color tokens as '- #hex: used Nx, context: ...' lines.

        Accepts both dict-shaped tokens and attribute-style objects, since
        callers pass either form.
        """
        lines = []
        for name, token in list(color_tokens.items())[:30]:
            if isinstance(token, dict):
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get("usage_count", token.get("count", 1))
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", 1)
                context = getattr(token, "context", "")

            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Render category -> first-5-colors summary lines for the prompt."""
        if not semantic:
            return "No semantic analysis available"

        lines = []
        for category, colors in semantic.items():
            if colors:
                # Entries may be plain hex strings or dicts with a "hex" key.
                color_list = [c.get("hex", c) if isinstance(c, dict) else c for c in colors[:5]]
                lines.append(f"- {category}: {', '.join(str(c) for c in color_list)}")

        return "\n".join(lines) if lines else "No semantic analysis available"

    def _parse_response(self, response: str) -> BrandIdentification:
        """Parse the LLM's JSON reply into BrandIdentification.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return BrandIdentification(
                    brand_primary=data.get("brand_primary", {}),
                    brand_secondary=data.get("brand_secondary", {}),
                    brand_accent=data.get("brand_accent", {}),
                    palette_strategy=data.get("palette_strategy", "unknown"),
                    cohesion_score=data.get("cohesion_score", 5),
                    cohesion_notes=data.get("cohesion_notes", ""),
                    semantic_names=data.get("semantic_names", {}),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return BrandIdentification()
326
+
327
+
328
+ # =============================================================================
329
+ # BENCHMARK ADVISOR AGENT
330
+ # =============================================================================
331
+
332
class BenchmarkAdvisorAgent:
    """
    Recommends best-fit design system based on comparison data.

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a senior design system consultant. Recommend the best design system alignment.

## USER'S CURRENT VALUES

- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px

## BENCHMARK COMPARISON

{benchmark_comparison}

## YOUR TASK

1. **Recommend Best Fit**: Which design system should they align with?
2. **Explain Why**: Consider similarity scores AND use-case fit
3. **List Changes Needed**: What would they need to change to align?
4. **Pros/Cons**: Benefits and drawbacks of alignment

## OUTPUT FORMAT (JSON only)

{{
  "recommended_benchmark": "<system_key>",
  "recommended_benchmark_name": "<full name>",
  "reasoning": "Why this is the best fit for their use case",
  "alignment_changes": [
    {{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
    {{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
  ],
  "pros_of_alignment": [
    "Familiar patterns for users",
    "Well-tested accessibility"
  ],
  "cons_of_alignment": [
    "May lose brand uniqueness"
  ],
  "alternative_benchmarks": [
    {{"name": "Material Design 3", "reason": "Good for Android-first products"}}
  ]
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Callable = None,
    ) -> BenchmarkAdvice:
        """
        Recommend best-fit design system.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations; empty defaults on any
            failure (this agent is best-effort and never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 Benchmark Advisor (Qwen 72B)")
        log(" └─ Evaluating benchmark fit for your use case...")

        # Format comparison data
        comparison_str = self._format_comparisons(benchmark_comparisons)

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio,
            user_base=user_base,
            user_spacing=user_spacing,
            benchmark_comparison=comparison_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt="You are a senior design system consultant specializing in design system architecture.",
                user_message=prompt,
                max_tokens=700,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" 🏢 Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes Needed: {len(result.alignment_changes)}")
            log(f" └─ Key Change: {result.alignment_changes[0].get('change', 'N/A') if result.alignment_changes else 'None'}")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Render the top 5 benchmark comparisons as prompt-friendly text.

        Each comparison is expected to expose `.benchmark` (with icon, name,
        typography, spacing, best_for) plus the precomputed diff fields.
        """
        lines = []
        for i, c in enumerate(comparisons[:5]):
            b = c.benchmark
            lines.append(f"""
{i+1}. {b.icon} {b.name}
   - Similarity Score: {c.similarity_score:.2f} (lower = better)
   - Match: {c.overall_match_pct:.0f}%
   - Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
   - Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
   - Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
   - Best For: {', '.join(b.best_for)}""")

        return "\n".join(lines)

    def _parse_response(self, response: str) -> BenchmarkAdvice:
        """Parse the LLM's JSON reply into BenchmarkAdvice.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return BenchmarkAdvice(
                    recommended_benchmark=data.get("recommended_benchmark", ""),
                    recommended_benchmark_name=data.get("recommended_benchmark_name", ""),
                    reasoning=data.get("reasoning", ""),
                    alignment_changes=data.get("alignment_changes", []),
                    pros_of_alignment=data.get("pros_of_alignment", []),
                    cons_of_alignment=data.get("cons_of_alignment", []),
                    alternative_benchmarks=data.get("alternative_benchmarks", []),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return BenchmarkAdvice()
487
+
488
+
489
+ # =============================================================================
490
+ # BEST PRACTICES VALIDATOR AGENT
491
+ # =============================================================================
492
+
493
class BestPracticesValidatorAgent:
    """
    Validates against design system best practices and prioritizes fixes.

    WHY LLM: Prioritization requires judgment about business impact,
    not just checking boxes.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a design system auditor. Validate these tokens against best practices.

## RULE ENGINE ANALYSIS RESULTS

### Typography
- Detected Ratio: {type_ratio} ({type_consistent})
- Base Size: {base_size}px
- Recommendation: {type_recommendation}

### Accessibility
- Total Colors: {total_colors}
- AA Pass: {aa_pass}
- AA Fail: {aa_fail}
- Failing Colors: {failing_colors}

### Spacing
- Detected Base: {spacing_base}px
- Grid Aligned: {spacing_aligned}%
- Recommendation: {spacing_recommendation}px

### Color Statistics
- Unique Colors: {unique_colors}
- Duplicates: {duplicates}
- Near-Duplicates: {near_duplicates}

## BEST PRACTICES CHECKLIST

1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
2. Type scale is consistent (variance < 0.15)
3. Base font size >= 16px (accessibility)
4. Line height >= 1.5 for body text
5. All interactive colors pass AA (4.5:1)
6. Spacing uses consistent grid (4px or 8px)
7. Limited color palette (< 20 unique semantic colors)
8. No near-duplicate colors

## YOUR TASK

1. Score each practice: pass/warn/fail
2. Calculate overall score (0-100)
3. Identify TOP 3 priority fixes with impact assessment

## OUTPUT FORMAT (JSON only)

{{
  "overall_score": <0-100>,
  "checks": {{
    "type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
    "type_scale_consistent": {{"status": "...", "note": "..."}},
    "base_size_accessible": {{"status": "...", "note": "..."}},
    "aa_compliance": {{"status": "...", "note": "..."}},
    "spacing_grid": {{"status": "...", "note": "..."}},
    "color_count": {{"status": "...", "note": "..."}}
  }},
  "priority_fixes": [
    {{
      "rank": 1,
      "issue": "Brand primary fails AA",
      "impact": "high|medium|low",
      "effort": "low|medium|high",
      "action": "Change #06b2c4 → #0891a8"
    }}
  ],
  "passing_practices": ["Base font size", "..."],
  "failing_practices": ["AA compliance", "..."]
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def analyze(
        self,
        rule_engine_results: Any,
        log_callback: Callable = None,
    ) -> BestPracticesResult:
        """
        Validate against best practices.

        Args:
            rule_engine_results: Results from rule engine; must expose
                .typography, .spacing, .color_stats and .accessibility
            log_callback: Progress logging function

        Returns:
            BestPracticesResult with validation; empty defaults on any
            failure (this agent is best-effort and never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" ✅ Best Practices Validator (Qwen 72B)")
        log(" └─ Checking against design system standards...")

        # Extract data from rule engine
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Summarize AA failures (first 5) for the prompt.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        failing_colors_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]])

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
            total_colors=len(accessibility),
            aa_pass=len(accessibility) - len(failures),
            aa_fail=len(failures),
            failing_colors=failing_colors_str or "None",
            spacing_base=spacing.detected_base,
            spacing_aligned=f"{spacing.alignment_percentage:.0f}",
            spacing_recommendation=spacing.recommendation,
            unique_colors=color_stats.unique_count,
            duplicates=color_stats.duplicate_count,
            near_duplicates=len(color_stats.near_duplicates),
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="best_practices_validator",
                system_prompt="You are a design system auditor specializing in best practices validation.",
                user_message=prompt,
                max_tokens=800,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log(f" ────────────────────────────────────────────────")
            log(f" ✅ Best Practices: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Overall Score: {result.overall_score}/100")
            log(f" ├─ Passing: {len(result.passing_practices)} | Failing: {len(result.failing_practices)}")
            if result.priority_fixes:
                log(f" └─ Top Fix: {result.priority_fixes[0].get('issue', 'N/A')}")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BestPracticesResult()

    def _parse_response(self, response: str) -> BestPracticesResult:
        """Parse the LLM's JSON reply into BestPracticesResult.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return BestPracticesResult(
                    overall_score=data.get("overall_score", 50),
                    checks=data.get("checks", {}),
                    priority_fixes=data.get("priority_fixes", []),
                    passing_practices=data.get("passing_practices", []),
                    failing_practices=data.get("failing_practices", []),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return BestPracticesResult()
667
+
668
+
669
+ # =============================================================================
670
+ # HEAD SYNTHESIZER AGENT
671
+ # =============================================================================
672
+
673
class HeadSynthesizerAgent:
    """
    Combines all agent outputs into final recommendations.

    This is the final step that produces actionable output for the user.
    """

    # Compiled once at class creation: grabs the outermost {...} span from an
    # LLM reply that may carry prose before/after the JSON payload.
    _JSON_RE = re.compile(r'\{[\s\S]*\}')

    PROMPT_TEMPLATE = """You are a senior design system architect. Synthesize these analysis results into final recommendations.

## RULE ENGINE FACTS

- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## BENCHMARK COMPARISON

Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## BRAND IDENTIFICATION

- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## BEST PRACTICES VALIDATION

Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED

{accessibility_fixes}

## YOUR TASK

Synthesize ALL the above into:
1. Executive Summary (2-3 sentences)
2. Overall Scores
3. Top 3 Priority Actions (with effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation

## OUTPUT FORMAT (JSON only)

{{
  "executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
  "scores": {{
    "overall": <0-100>,
    "accessibility": <0-100>,
    "consistency": <0-100>,
    "organization": <0-100>
  }},
  "benchmark_fit": {{
    "closest": "<name>",
    "similarity": "<X%>",
    "recommendation": "Align type scale to 1.25"
  }},
  "brand_analysis": {{
    "primary": "#hex",
    "secondary": "#hex",
    "cohesion": <1-10>
  }},
  "top_3_actions": [
    {{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
  ],
  "color_recommendations": [
    {{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
  ],
  "type_scale_recommendation": {{
    "current_ratio": 1.18,
    "recommended_ratio": 1.25,
    "reason": "Align with industry standard"
  }},
  "spacing_recommendation": {{
    "current": "mixed",
    "recommended": "8px",
    "reason": "Consistent grid improves maintainability"
  }}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client wrapper exposing complete_async(); injected for testability.
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Deterministic analysis (typography, spacing,
                color_stats, accessibility attributes are read).
            benchmark_comparisons: Comparison objects; the first entry is
                treated as the closest match (assumes pre-sorted input).
            brand_identification: Output of BrandIdentifierAgent.
            benchmark_advice: Output of BenchmarkAdvisorAgent.
            best_practices: Output of BestPracticesValidatorAgent.
            log_callback: Progress logging function.

        Returns:
            HeadSynthesis with the final payload; empty defaults on any
            failure (best-effort, never aborts the pipeline).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: HEAD SYNTHESIZER")
        log("═" * 60)
        log("")
        log(" Combining: Rule Engine + Benchmarks + Brand + Best Practices...")

        # Extract data
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        # Summarize up to 5 AA failures that have a concrete suggested fix.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join([
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:5] if a.suggested_fix
        ])

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes="; ".join([c.get("change", "") for c in benchmark_advice.alignment_changes[:3]]),
            brand_primary=brand_identification.brand_primary.get("color", "Unknown"),
            brand_secondary=brand_identification.brand_secondary.get("color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes="; ".join([f.get("issue", "") for f in best_practices.priority_fixes[:3]]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt="You are a senior design system architect specializing in synthesis and recommendations.",
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            log("")
            log(f" ✅ HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
            log("")

            return result

        except Exception as e:
            # Deliberate broad catch: network/model errors degrade to an
            # empty result instead of crashing the whole analysis run.
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return HeadSynthesis()

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse the LLM's JSON reply into HeadSynthesis.

        Returns empty defaults when the reply is missing, not JSON, or not
        shaped as expected.
        """
        try:
            json_match = self._JSON_RE.search(response)
            if json_match:
                data = json.loads(json_match.group())
                return HeadSynthesis(
                    executive_summary=data.get("executive_summary", ""),
                    scores=data.get("scores", {}),
                    benchmark_fit=data.get("benchmark_fit", {}),
                    brand_analysis=data.get("brand_analysis", {}),
                    top_3_actions=data.get("top_3_actions", []),
                    color_recommendations=data.get("color_recommendations", []),
                    type_scale_recommendation=data.get("type_scale_recommendation", {}),
                    spacing_recommendation=data.get("spacing_recommendation", {}),
                )
        except (json.JSONDecodeError, TypeError, AttributeError):
            # Narrowed from a bare `except Exception` so genuine programming
            # errors are no longer silently swallowed; malformed LLM output
            # still falls through to the empty default.
            pass

        return HeadSynthesis()