riazmo commited on
Commit
d2da694
·
verified ·
1 Parent(s): 7f20fd9

Upload 2 files

Browse files
Files changed (2) hide show
  1. agents/extractor.py +1229 -0
  2. agents/llm_agents.py +905 -0
agents/extractor.py ADDED
@@ -0,0 +1,1229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 1: Token Extractor
3
+ Design System Extractor v2
4
+
5
+ Persona: Meticulous Design Archaeologist
6
+
7
+ Responsibilities:
8
+ - Crawl pages at specified viewport
9
+ - Extract computed styles from all elements
10
+ - Parse CSS files for variables and rules
11
+ - Extract colors from SVGs
12
+ - Collect colors, typography, spacing, radius, shadows
13
+ - Track frequency and context for each token
14
+ """
15
+
16
+ import asyncio
17
+ import re
18
+ from typing import Optional, Callable
19
+ from datetime import datetime
20
+ from collections import defaultdict
21
+
22
+ from playwright.async_api import async_playwright, Browser, Page, BrowserContext
23
+
24
+ from core.token_schema import (
25
+ Viewport,
26
+ ExtractedTokens,
27
+ ColorToken,
28
+ TypographyToken,
29
+ SpacingToken,
30
+ RadiusToken,
31
+ ShadowToken,
32
+ FontFamily,
33
+ TokenSource,
34
+ Confidence,
35
+ )
36
+ from core.color_utils import (
37
+ normalize_hex,
38
+ parse_color,
39
+ get_contrast_with_white,
40
+ get_contrast_with_black,
41
+ check_wcag_compliance,
42
+ )
43
+ from config.settings import get_settings
44
+
45
+
46
+ class TokenExtractor:
47
+ """
48
+ Extracts design tokens from web pages.
49
+
50
+ This is the second part of Agent 1's job — after pages are confirmed,
51
+ we crawl and extract all CSS values.
52
+
53
+ Enhanced with:
54
+ - CSS file parsing for variables and rules
55
+ - SVG color extraction
56
+ - Inline style extraction
57
+ """
58
+
59
def __init__(self, viewport: Viewport = Viewport.DESKTOP):
    """Set up an empty extractor for one viewport; no browser is started here.

    Args:
        viewport: Viewport the pages will be rendered at. Defaults to
            ``Viewport.DESKTOP``.
    """
    # Run statistics and the problems collected along the way.
    self.total_elements = 0
    self.warnings: list[str] = []
    self.errors: list[str] = []

    # Fonts seen so far, keyed by the primary family name.
    self.font_families: dict[str, FontFamily] = {}

    # CSS custom properties (name -> value).
    self.css_variables: dict[str, str] = {}

    # Foreground/background pairs taken from real DOM elements.
    self.fg_bg_pairs: list[dict] = []

    # Aggregated tokens, one dict per category.
    self.shadows: dict[str, ShadowToken] = {}
    self.radius: dict[str, RadiusToken] = {}
    self.spacing: dict[str, SpacingToken] = {}
    self.typography: dict[str, TypographyToken] = {}
    self.colors: dict[str, ColorToken] = {}

    # Browser handles stay unset until _init_browser() runs.
    self.context: Optional[BrowserContext] = None
    self.browser: Optional[Browser] = None

    self.viewport = viewport
    self.settings = get_settings()
85
+
86
+ async def __aenter__(self):
87
+ """Async context manager entry."""
88
+ await self._init_browser()
89
+ return self
90
+
91
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
92
+ """Async context manager exit."""
93
+ await self._close_browser()
94
+
95
+ async def _init_browser(self):
96
+ """Initialize Playwright browser."""
97
+ playwright = await async_playwright().start()
98
+ self.browser = await playwright.chromium.launch(
99
+ headless=self.settings.browser.headless
100
+ )
101
+
102
+ # Set viewport based on extraction mode
103
+ if self.viewport == Viewport.DESKTOP:
104
+ width = self.settings.viewport.desktop_width
105
+ height = self.settings.viewport.desktop_height
106
+ else:
107
+ width = self.settings.viewport.mobile_width
108
+ height = self.settings.viewport.mobile_height
109
+
110
+ self.context = await self.browser.new_context(
111
+ viewport={"width": width, "height": height},
112
+ user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
113
+ )
114
+
115
+ async def _close_browser(self):
116
+ """Close browser and cleanup."""
117
+ if self.context:
118
+ await self.context.close()
119
+ if self.browser:
120
+ await self.browser.close()
121
+
122
async def _scroll_page(self, page: Page):
    """Scroll through the page so lazily loaded content gets requested.

    The in-page script moves down one window height at a time, pausing
    100 ms per step, up to the body height measured when it starts, then
    returns to the top. Afterwards this waits for the ``networkidle`` load
    state using the configured ``browser.network_idle_timeout``.
    """
    scroll_script = """
        async () => {
            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
            const height = document.body.scrollHeight;
            const step = window.innerHeight;

            for (let y = 0; y < height; y += step) {
                window.scrollTo(0, y);
                await delay(100);
            }

            // Scroll back to top
            window.scrollTo(0, 0);
        }
    """
    await page.evaluate(scroll_script)

    # Give requests triggered by the scrolling a chance to finish.
    idle_timeout = self.settings.browser.network_idle_timeout
    await page.wait_for_load_state("networkidle", timeout=idle_timeout)
142
+
143
async def _extract_styles_from_page(self, page: Page) -> dict:
    """Collect raw computed-style values from every visible element.

    The in-page script calls ``getComputedStyle`` on each element matched
    by ``*``, skips those with ``display: none`` or ``visibility: hidden``,
    and records colors, typography, spacing, border radius and box shadows.

    Returns:
        The script's result object with the keys ``colors``, ``typography``,
        ``spacing``, ``radius``, ``shadows`` and ``elements_count``.
    """
    extraction_script = """
        () => {
            const elements = document.querySelectorAll('*');
            const results = {
                colors: [],
                typography: [],
                spacing: [],
                radius: [],
                shadows: [],
                elements_count: elements.length,
            };

            const colorProperties = [
                'color', 'background-color', 'border-color',
                'border-top-color', 'border-right-color',
                'border-bottom-color', 'border-left-color',
                'outline-color', 'text-decoration-color',
            ];

            const spacingProperties = [
                'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
                'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
                'gap', 'row-gap', 'column-gap',
            ];

            elements.forEach(el => {
                const tag = el.tagName.toLowerCase();
                const styles = window.getComputedStyle(el);

                // Skip invisible elements
                if (styles.display === 'none' || styles.visibility === 'hidden') {
                    return;
                }

                // --- COLORS ---
                colorProperties.forEach(prop => {
                    const value = styles.getPropertyValue(prop);
                    if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') {
                        results.colors.push({
                            value: value,
                            property: prop,
                            element: tag,
                            context: prop.includes('background') ? 'background' :
                                     prop.includes('border') ? 'border' : 'text',
                        });
                    }
                });

                // --- TYPOGRAPHY ---
                const fontFamily = styles.getPropertyValue('font-family');
                const fontSize = styles.getPropertyValue('font-size');
                const fontWeight = styles.getPropertyValue('font-weight');
                const lineHeight = styles.getPropertyValue('line-height');
                const letterSpacing = styles.getPropertyValue('letter-spacing');

                if (fontSize && fontFamily) {
                    results.typography.push({
                        fontFamily: fontFamily,
                        fontSize: fontSize,
                        fontWeight: fontWeight,
                        lineHeight: lineHeight,
                        letterSpacing: letterSpacing,
                        element: tag,
                    });
                }

                // --- SPACING ---
                spacingProperties.forEach(prop => {
                    const value = styles.getPropertyValue(prop);
                    if (value && value !== '0px' && value !== 'auto' && value !== 'normal') {
                        const px = parseFloat(value);
                        if (!isNaN(px) && px > 0 && px < 500) {
                            results.spacing.push({
                                value: value,
                                valuePx: Math.round(px),
                                property: prop,
                                context: prop.includes('margin') ? 'margin' :
                                         prop.includes('padding') ? 'padding' : 'gap',
                            });
                        }
                    }
                });

                // --- BORDER RADIUS ---
                const radiusProps = [
                    'border-radius', 'border-top-left-radius',
                    'border-top-right-radius', 'border-bottom-left-radius',
                    'border-bottom-right-radius',
                ];

                radiusProps.forEach(prop => {
                    const value = styles.getPropertyValue(prop);
                    if (value && value !== '0px') {
                        results.radius.push({
                            value: value,
                            element: tag,
                        });
                    }
                });

                // --- BOX SHADOW ---
                const shadow = styles.getPropertyValue('box-shadow');
                if (shadow && shadow !== 'none') {
                    results.shadows.push({
                        value: shadow,
                        element: tag,
                    });
                }
            });

            return results;
        }
    """
    return await page.evaluate(extraction_script)
264
+
265
async def _extract_fg_bg_pairs(self, page: Page) -> list[dict]:
    """Extract real foreground/background color pairs from visible elements.

    For every element that is not ``display: none`` / ``visibility: hidden``
    and has a usable text color, the in-page script walks up the ancestor
    chain to the first ancestor that paints a background. The resulting
    pairs allow contrast checks against the actual background instead of
    only white/black.

    The ancestor walk includes ``<html>`` itself, so a background set on the
    root element is honoured. Any color whose alpha channel is 0 is treated
    as transparent (not only the literal ``rgba(0, 0, 0, 0)``), and a
    background value that cannot be parsed is skipped in favour of the next
    ancestor rather than discarding the pair. If no ancestor paints a
    background, ``#ffffff`` is assumed.

    Returns:
        A list of ``{"foreground", "background", "element"}`` dicts with
        hex colors, one per distinct foreground/background combination;
        an empty list when the script returns nothing.
    """
    pairs = await page.evaluate("""
        () => {
            const pairs = [];
            const seen = new Set();

            function rgbToHex(rgb) {
                if (!rgb || rgb === 'transparent') return null;
                const match = rgb.match(/rgba?\\((\\d+),\\s*(\\d+),\\s*(\\d+)(?:,\\s*([\\d.]+))?/);
                if (!match) return null;
                // A zero alpha channel paints nothing, whatever the RGB part says.
                if (match[4] !== undefined && parseFloat(match[4]) === 0) return null;
                return '#' + match.slice(1, 4)
                    .map(c => parseInt(c, 10).toString(16).padStart(2, '0'))
                    .join('');
            }

            function getEffectiveBackground(el) {
                // parentElement of <html> is null, so the loop also inspects
                // the root element before giving up.
                for (let current = el; current; current = current.parentElement) {
                    const hex = rgbToHex(window.getComputedStyle(current).backgroundColor);
                    if (hex) return hex;
                }
                return '#ffffff'; // default page background
            }

            const elements = document.querySelectorAll('*');
            elements.forEach(el => {
                const styles = window.getComputedStyle(el);
                if (styles.display === 'none' || styles.visibility === 'hidden') return;

                const fg = rgbToHex(styles.color);
                if (!fg) return;

                const bg = getEffectiveBackground(el);

                const key = fg + '|' + bg;
                if (seen.has(key)) return;
                seen.add(key);

                pairs.push({
                    foreground: fg,
                    background: bg,
                    element: el.tagName.toLowerCase(),
                });
            });

            return pairs;
        }
    """)
    return pairs or []
327
+
328
async def _extract_css_variables(self, page: Page) -> dict:
    """Extract CSS custom properties (``--name``) declared in the page's stylesheets.

    This catches values defined as, for example:
    - :root { --primary-color: #3860be; }
    - :root { --brand-cyan: #00c4cc; }

    Every accessible stylesheet is walked recursively, so declarations
    nested inside grouping rules (``@media``, ``@supports``, ...) are found
    at any depth. Stylesheets whose rules cannot be read (for example
    because of CORS) are skipped and logged to the browser console. When
    the same property is declared more than once, the last declaration
    visited wins.

    Returns:
        ``{"raw": {...}, "computed": {...}}`` where ``raw`` holds the
        declared values and ``computed`` holds, for the same property
        names, the non-empty values resolved on the document root element.
    """
    css_vars = await page.evaluate("""
        () => {
            const variables = {};

            // Record the custom properties a rule declares, then descend
            // into any nested rules it contains.
            const collect = (rules) => {
                for (const rule of rules) {
                    if (rule.style) {
                        for (let i = 0; i < rule.style.length; i++) {
                            const prop = rule.style[i];
                            if (prop.startsWith('--')) {
                                const value = rule.style.getPropertyValue(prop).trim();
                                if (value) {
                                    variables[prop] = value;
                                }
                            }
                        }
                    }
                    if (rule.cssRules) {
                        collect(rule.cssRules);
                    }
                }
            };

            // 1. Parse all stylesheets for CSS variables
            for (const sheet of document.styleSheets) {
                try {
                    collect(sheet.cssRules || sheet.rules);
                } catch (e) {
                    // CORS may block access to external stylesheets
                    console.log('Could not access stylesheet:', e);
                }
            }

            // 2. Get computed CSS variable values from :root
            const rootStyles = getComputedStyle(document.documentElement);
            const computedVars = {};
            for (const prop of Object.keys(variables)) {
                const computed = rootStyles.getPropertyValue(prop).trim();
                if (computed) {
                    computedVars[prop] = computed;
                }
            }

            return { raw: variables, computed: computedVars };
        }
    """)

    return css_vars
397
+
398
async def _extract_svg_colors(self, page: Page) -> list[dict]:
    """Collect fill and stroke colors from ``<svg>`` elements and their descendants.

    Both the ``fill`` / ``stroke`` attributes and the computed ``fill`` /
    ``stroke`` styles are recorded, e.g. from:
    - <svg fill="#00c4cc">
    - <path stroke="#3860be">
    - <circle fill="rgb(188, 212, 50)">

    ``none`` and ``url(...)`` paint values are ignored; attribute values of
    ``currentColor`` are ignored as well.

    Returns:
        The list of ``{"value", "property", "element", "context"}`` records
        produced by the in-page script, with ``context`` set to ``"svg"``.
    """
    svg_script = """
        () => {
            const colors = [];

            // Find all SVG elements
            const svgs = document.querySelectorAll('svg, svg *');

            svgs.forEach(el => {
                // Check fill attribute
                const fill = el.getAttribute('fill');
                if (fill && fill !== 'none' && fill !== 'currentColor' && !fill.startsWith('url(')) {
                    colors.push({
                        value: fill,
                        property: 'svg-fill',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }

                // Check stroke attribute
                const stroke = el.getAttribute('stroke');
                if (stroke && stroke !== 'none' && stroke !== 'currentColor' && !stroke.startsWith('url(')) {
                    colors.push({
                        value: stroke,
                        property: 'svg-stroke',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }

                // Check computed styles for SVG elements
                const styles = getComputedStyle(el);
                const computedFill = styles.fill;
                const computedStroke = styles.stroke;

                if (computedFill && computedFill !== 'none' && !computedFill.startsWith('url(')) {
                    colors.push({
                        value: computedFill,
                        property: 'svg-fill-computed',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }

                if (computedStroke && computedStroke !== 'none' && !computedStroke.startsWith('url(')) {
                    colors.push({
                        value: computedStroke,
                        property: 'svg-stroke-computed',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }
            });

            return colors;
        }
    """
    return await page.evaluate(svg_script)
466
+
467
async def _extract_inline_styles(self, page: Page) -> list[dict]:
    """Extract color literals from inline ``style`` attributes.

    This catches colors in:
    - <div style="background-color: #bcd432;">
    - <span style="color: rgb(0, 196, 204);">

    Returns:
        A list of ``{"value", "property", "element", "context"}`` records,
        one per hex / rgb() / rgba() / hsl() / hsla() literal found, with
        ``property`` set to ``"inline-style"`` and ``context`` to
        ``"inline"``. (The in-page script returns an array, hence the list
        return annotation.)
    """
    inline_colors = await page.evaluate("""
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // Find all elements with inline styles
            const elements = document.querySelectorAll('[style]');

            elements.forEach(el => {
                const styleAttr = el.getAttribute('style');
                if (styleAttr) {
                    const matches = styleAttr.match(colorRegex);
                    if (matches) {
                        matches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'inline-style',
                                element: el.tagName.toLowerCase(),
                                context: 'inline',
                            });
                        });
                    }
                }
            });

            return colors;
        }
    """)

    return inline_colors
505
+
506
async def _extract_stylesheet_colors(self, page: Page) -> list[dict]:
    """Scan the page's accessible stylesheets for color values.

    This catches colors defined in CSS rules that may not be currently
    applied to visible elements. Rules are walked recursively, so
    declarations nested inside grouping rules (``@media``, ``@supports``,
    ...) are included.

    Stylesheets whose rules cannot be read from the page (for example
    cross-origin sheets blocked by CORS) are silently skipped here; this
    method does not fetch them itself.

    Returns:
        A list of ``{"value", "property", "element", "context", "selector"}``
        records, with ``element`` set to ``"css-rule"`` and ``context`` to
        ``"stylesheet"``.
    """
    css_colors = await page.evaluate("""
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // Color-related CSS properties
            const colorProps = [
                'color', 'background-color', 'background', 'border-color',
                'border-top-color', 'border-right-color', 'border-bottom-color', 'border-left-color',
                'outline-color', 'box-shadow', 'text-shadow', 'fill', 'stroke',
                'caret-color', 'column-rule-color', 'text-decoration-color',
            ];

            // Pull color literals out of one rule, then descend into any
            // nested rules it contains.
            const collect = (rules) => {
                for (const rule of rules) {
                    if (rule.style) {
                        colorProps.forEach(prop => {
                            const value = rule.style.getPropertyValue(prop);
                            if (value) {
                                const matches = value.match(colorRegex);
                                if (matches) {
                                    matches.forEach(color => {
                                        colors.push({
                                            value: color,
                                            property: prop,
                                            element: 'css-rule',
                                            context: 'stylesheet',
                                            selector: rule.selectorText || '',
                                        });
                                    });
                                }
                            }
                        });
                    }
                    if (rule.cssRules) {
                        collect(rule.cssRules);
                    }
                }
            };

            // Parse all stylesheets
            for (const sheet of document.styleSheets) {
                try {
                    collect(sheet.cssRules || sheet.rules);
                } catch (e) {
                    // CORS may block access to external stylesheets
                }
            }

            return colors;
        }
    """)

    return css_colors
563
+
564
+ async def _fetch_external_css_colors(self, page: Page) -> list[dict]:
565
+ """
566
+ Fetch and parse external CSS files directly to bypass CORS.
567
+
568
+ This catches colors in external stylesheets that are blocked by CORS.
569
+ """
570
+ colors = []
571
+
572
+ try:
573
+ # Get all stylesheet URLs
574
+ css_urls = await page.evaluate("""
575
+ () => {
576
+ const urls = [];
577
+ const links = document.querySelectorAll('link[rel="stylesheet"]');
578
+ links.forEach(link => {
579
+ if (link.href) {
580
+ urls.push(link.href);
581
+ }
582
+ });
583
+ return urls;
584
+ }
585
+ """)
586
+
587
+ # Color regex pattern
588
+ color_regex = re.compile(r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)', re.IGNORECASE)
589
+
590
+ # Fetch each CSS file
591
+ for css_url in css_urls[:10]: # Limit to 10 files
592
+ try:
593
+ response = await page.request.get(css_url, timeout=5000)
594
+ if response.ok:
595
+ css_text = await response.text()
596
+
597
+ # Find all color values in CSS text
598
+ matches = color_regex.findall(css_text)
599
+ for match in matches:
600
+ colors.append({
601
+ "value": match,
602
+ "property": "external-css",
603
+ "element": "css-file",
604
+ "context": "external-stylesheet",
605
+ })
606
+ except Exception as e:
607
+ # Skip if fetch fails
608
+ pass
609
+
610
+ except Exception as e:
611
+ self.warnings.append(f"External CSS fetch failed: {str(e)}")
612
+
613
+ return colors
614
+
615
async def _extract_all_page_colors(self, page: Page) -> list[dict]:
    """Brute-force scan of the document for anything that looks like a color.

    The in-page script applies one color regex (hex, rgb(), rgba(), hsl(),
    hsla()) to:
    1. the text of every ``<style>`` tag,
    2. every ``data-*`` and ``style`` attribute, plus string class names,
    3. the text of every inline ``<script>`` tag (those without ``src``).

    Returns:
        The list of ``{"value", "property", "element", "context"}`` records
        produced by the script.
    """
    scan_script = """
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // 1. Scan all <style> tags
            const styleTags = document.querySelectorAll('style');
            styleTags.forEach(style => {
                const matches = style.textContent.match(colorRegex);
                if (matches) {
                    matches.forEach(color => {
                        colors.push({
                            value: color,
                            property: 'style-tag',
                            element: 'style',
                            context: 'style-block',
                        });
                    });
                }
            });

            // 2. Scan data attributes that might contain colors
            const allElements = document.querySelectorAll('*');
            allElements.forEach(el => {
                // Check data attributes
                for (const attr of el.attributes) {
                    if (attr.name.startsWith('data-') || attr.name === 'style') {
                        const matches = attr.value.match(colorRegex);
                        if (matches) {
                            matches.forEach(color => {
                                colors.push({
                                    value: color,
                                    property: attr.name,
                                    element: el.tagName.toLowerCase(),
                                    context: 'attribute',
                                });
                            });
                        }
                    }
                }

                // Check for color in class names (some frameworks use color classes)
                const classList = el.className;
                if (typeof classList === 'string') {
                    const colorMatches = classList.match(colorRegex);
                    if (colorMatches) {
                        colorMatches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'class',
                                element: el.tagName.toLowerCase(),
                                context: 'class-name',
                            });
                        });
                    }
                }
            });

            // 3. Look for colors in script tags (config objects)
            const scriptTags = document.querySelectorAll('script');
            scriptTags.forEach(script => {
                if (script.textContent && !script.src) {
                    const matches = script.textContent.match(colorRegex);
                    if (matches) {
                        matches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'script',
                                element: 'script',
                                context: 'javascript',
                            });
                        });
                    }
                }
            });

            return colors;
        }
    """
    return await page.evaluate(scan_script)
703
+
704
+ def _process_css_variables(self, css_vars: dict):
705
+ """Process CSS variables and extract color tokens from them."""
706
+ computed = css_vars.get("computed", {})
707
+ raw = css_vars.get("raw", {})
708
+
709
+ # Store CSS variables
710
+ self.css_variables = {**raw, **computed}
711
+
712
+ # Extract colors from CSS variables
713
+ color_regex = re.compile(r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)', re.IGNORECASE)
714
+
715
+ for var_name, value in computed.items():
716
+ if color_regex.match(value.strip()):
717
+ # This is a color variable
718
+ color_data = {
719
+ "value": value.strip(),
720
+ "property": var_name,
721
+ "element": ":root",
722
+ "context": "css-variable",
723
+ }
724
+
725
+ hex_value = self._process_color(color_data)
726
+ if hex_value and hex_value not in self.colors:
727
+ contrast_white = get_contrast_with_white(hex_value)
728
+ contrast_black = get_contrast_with_black(hex_value)
729
+ compliance = check_wcag_compliance(hex_value, "#ffffff")
730
+
731
+ self.colors[hex_value] = ColorToken(
732
+ value=hex_value,
733
+ frequency=1,
734
+ contexts=["css-variable"],
735
+ elements=[":root"],
736
+ css_properties=[var_name],
737
+ contrast_white=round(contrast_white, 2),
738
+ contrast_black=round(contrast_black, 2),
739
+ wcag_aa_large_text=compliance["aa_large_text"],
740
+ wcag_aa_small_text=compliance["aa_normal_text"],
741
+ source=TokenSource.DETECTED, # CSS variable is still "detected"
742
+ confidence=Confidence.HIGH,
743
+ )
744
+ elif hex_value and hex_value in self.colors:
745
+ # Update existing token
746
+ token = self.colors[hex_value]
747
+ token.frequency += 1
748
+ if "css-variable" not in token.contexts:
749
+ token.contexts.append("css-variable")
750
+ if var_name not in token.css_properties:
751
+ token.css_properties.append(var_name)
752
+
753
def _process_color(self, color_data: dict) -> Optional[str]:
    """Return the hex form of a raw color record, or ``None`` if it cannot be parsed.

    Args:
        color_data: Record whose ``"value"`` entry holds the color string;
            a missing entry is treated as an empty string.
    """
    parsed = parse_color(color_data.get("value", ""))
    return parsed.hex if parsed else None
763
+
764
+ def _aggregate_colors(self, raw_colors: list[dict]):
765
+ """Aggregate color data from extraction."""
766
+ for color_data in raw_colors:
767
+ hex_value = self._process_color(color_data)
768
+ if not hex_value:
769
+ continue
770
+
771
+ if hex_value not in self.colors:
772
+ # Calculate contrast ratios
773
+ contrast_white = get_contrast_with_white(hex_value)
774
+ contrast_black = get_contrast_with_black(hex_value)
775
+ compliance = check_wcag_compliance(hex_value, "#ffffff")
776
+
777
+ self.colors[hex_value] = ColorToken(
778
+ value=hex_value,
779
+ frequency=0,
780
+ contexts=[],
781
+ elements=[],
782
+ css_properties=[],
783
+ contrast_white=round(contrast_white, 2),
784
+ contrast_black=round(contrast_black, 2),
785
+ wcag_aa_large_text=compliance["aa_large_text"],
786
+ wcag_aa_small_text=compliance["aa_normal_text"],
787
+ )
788
+
789
+ # Update frequency and context
790
+ token = self.colors[hex_value]
791
+ token.frequency += 1
792
+
793
+ context = color_data.get("context", "")
794
+ if context and context not in token.contexts:
795
+ token.contexts.append(context)
796
+
797
+ element = color_data.get("element", "")
798
+ if element and element not in token.elements:
799
+ token.elements.append(element)
800
+
801
+ prop = color_data.get("property", "")
802
+ if prop and prop not in token.css_properties:
803
+ token.css_properties.append(prop)
804
+
805
+ def _aggregate_typography(self, raw_typography: list[dict]):
806
+ """Aggregate typography data from extraction."""
807
+ for typo_data in raw_typography:
808
+ # Create unique key
809
+ font_family = typo_data.get("fontFamily", "")
810
+ font_size = typo_data.get("fontSize", "")
811
+ font_weight = typo_data.get("fontWeight", "400")
812
+ line_height = typo_data.get("lineHeight", "normal")
813
+
814
+ key = f"{font_size}|{font_weight}|{font_family[:50]}"
815
+
816
+ if key not in self.typography:
817
+ # Parse font size to px
818
+ font_size_px = None
819
+ if font_size.endswith("px"):
820
+ try:
821
+ font_size_px = float(font_size.replace("px", ""))
822
+ except ValueError:
823
+ pass
824
+
825
+ # Parse line height
826
+ line_height_computed = None
827
+ if line_height and line_height != "normal":
828
+ if line_height.endswith("px") and font_size_px:
829
+ try:
830
+ lh_px = float(line_height.replace("px", ""))
831
+ line_height_computed = round(lh_px / font_size_px, 2)
832
+ except ValueError:
833
+ pass
834
+ else:
835
+ try:
836
+ line_height_computed = float(line_height)
837
+ except ValueError:
838
+ pass
839
+
840
+ self.typography[key] = TypographyToken(
841
+ font_family=font_family.split(",")[0].strip().strip('"\''),
842
+ font_size=font_size,
843
+ font_size_px=font_size_px,
844
+ font_weight=int(font_weight) if font_weight.isdigit() else 400,
845
+ line_height=line_height,
846
+ line_height_computed=line_height_computed,
847
+ letter_spacing=typo_data.get("letterSpacing"),
848
+ frequency=0,
849
+ elements=[],
850
+ )
851
+
852
+ # Update
853
+ token = self.typography[key]
854
+ token.frequency += 1
855
+
856
+ element = typo_data.get("element", "")
857
+ if element and element not in token.elements:
858
+ token.elements.append(element)
859
+
860
+ # Track font families
861
+ primary_font = token.font_family
862
+ if primary_font not in self.font_families:
863
+ self.font_families[primary_font] = FontFamily(
864
+ name=primary_font,
865
+ fallbacks=[f.strip().strip('"\'') for f in font_family.split(",")[1:]],
866
+ frequency=0,
867
+ )
868
+ self.font_families[primary_font].frequency += 1
869
+
870
+ def _aggregate_spacing(self, raw_spacing: list[dict]):
871
+ """Aggregate spacing data from extraction."""
872
+ for space_data in raw_spacing:
873
+ value = space_data.get("value", "")
874
+ value_px = space_data.get("valuePx", 0)
875
+
876
+ key = str(value_px)
877
+
878
+ if key not in self.spacing:
879
+ self.spacing[key] = SpacingToken(
880
+ value=f"{value_px}px",
881
+ value_px=value_px,
882
+ frequency=0,
883
+ contexts=[],
884
+ properties=[],
885
+ fits_base_4=value_px % 4 == 0,
886
+ fits_base_8=value_px % 8 == 0,
887
+ )
888
+
889
+ token = self.spacing[key]
890
+ token.frequency += 1
891
+
892
+ context = space_data.get("context", "")
893
+ if context and context not in token.contexts:
894
+ token.contexts.append(context)
895
+
896
+ prop = space_data.get("property", "")
897
+ if prop and prop not in token.properties:
898
+ token.properties.append(prop)
899
+
900
+ def _aggregate_radius(self, raw_radius: list[dict]):
901
+ """Aggregate border radius data."""
902
+ for radius_data in raw_radius:
903
+ value = radius_data.get("value", "")
904
+
905
+ # Normalize to simple format
906
+ # "8px 8px 8px 8px" -> "8px"
907
+ parts = value.split()
908
+ if len(set(parts)) == 1:
909
+ value = parts[0]
910
+
911
+ if value not in self.radius:
912
+ value_px = None
913
+ if value.endswith("px"):
914
+ try:
915
+ value_px = int(float(value.replace("px", "")))
916
+ except ValueError:
917
+ pass
918
+
919
+ self.radius[value] = RadiusToken(
920
+ value=value,
921
+ value_px=value_px,
922
+ frequency=0,
923
+ elements=[],
924
+ fits_base_4=value_px % 4 == 0 if value_px else False,
925
+ fits_base_8=value_px % 8 == 0 if value_px else False,
926
+ )
927
+
928
+ token = self.radius[value]
929
+ token.frequency += 1
930
+
931
+ element = radius_data.get("element", "")
932
+ if element and element not in token.elements:
933
+ token.elements.append(element)
934
+
935
+ def _aggregate_shadows(self, raw_shadows: list[dict]):
936
+ """Aggregate box shadow data."""
937
+ for shadow_data in raw_shadows:
938
+ value = shadow_data.get("value", "")
939
+
940
+ if value not in self.shadows:
941
+ self.shadows[value] = ShadowToken(
942
+ value=value,
943
+ frequency=0,
944
+ elements=[],
945
+ )
946
+
947
+ token = self.shadows[value]
948
+ token.frequency += 1
949
+
950
+ element = shadow_data.get("element", "")
951
+ if element and element not in token.elements:
952
+ token.elements.append(element)
953
+
954
def _calculate_confidence(self, frequency: int) -> Confidence:
    """Map an occurrence count to a confidence level.

    Returns ``Confidence.HIGH`` for 10 or more occurrences,
    ``Confidence.MEDIUM`` for 3 to 9, and ``Confidence.LOW`` otherwise.
    """
    level = Confidence.LOW
    if frequency >= 3:
        level = Confidence.MEDIUM
    if frequency >= 10:
        level = Confidence.HIGH
    return level
961
+
962
+ def _detect_spacing_base(self) -> Optional[int]:
963
+ """Detect the base spacing unit (4 or 8)."""
964
+ fits_4 = sum(1 for s in self.spacing.values() if s.fits_base_4)
965
+ fits_8 = sum(1 for s in self.spacing.values() if s.fits_base_8)
966
+
967
+ total = len(self.spacing)
968
+ if total == 0:
969
+ return None
970
+
971
+ # If 80%+ values fit base 8, use 8
972
+ if fits_8 / total >= 0.8:
973
+ return 8
974
+ # If 80%+ values fit base 4, use 4
975
+ elif fits_4 / total >= 0.8:
976
+ return 4
977
+
978
+ return None
979
+
980
async def extract(
    self,
    pages: list[str],
    progress_callback: Optional[Callable[[float], None]] = None
) -> ExtractedTokens:
    """
    Extract tokens from a list of pages.

    Enhanced extraction includes:
    - DOM computed styles
    - CSS variables from :root
    - SVG fill/stroke colors
    - Inline style colors
    - Stylesheet color rules
    - External CSS files and a full-page color scan
    - Foreground/background color pairs

    A failure in any supplementary source is recorded in ``self.warnings``
    and the page continues. A failure that aborts a page is recorded in
    ``self.errors`` and the crawl moves on to the next URL. The browser
    page is always closed, whether or not extraction succeeded.

    Args:
        pages: List of URLs to crawl
        progress_callback: Optional callback for progress updates; called
            with the completed fraction after each page that was
            extracted without an aborting error

    Returns:
        ExtractedTokens with all discovered tokens
    """
    start_time = datetime.now()
    pages_crawled = []

    async def collect_colors(page, extractor, failure_label: str) -> int:
        """Aggregate one supplementary color source; return how many colors it yielded."""
        found_count = 0
        try:
            found = await extractor(page)
            found_count = len(found)
            self._aggregate_colors(found)
        except Exception as e:
            # Best-effort source: note the failure and keep extracting.
            self.warnings.append(f"{failure_label}: {str(e)}")
        return found_count

    async with self:
        for i, url in enumerate(pages):
            try:
                page = await self.context.new_page()
                try:
                    # Navigate with fallback strategy
                    try:
                        await page.goto(
                            url,
                            wait_until="domcontentloaded",
                            timeout=60000  # 60 seconds
                        )
                        # Wait for JS to render
                        await page.wait_for_timeout(2000)
                    except Exception:
                        # Fallback to load event
                        try:
                            await page.goto(
                                url,
                                wait_until="load",
                                timeout=60000
                            )
                            await page.wait_for_timeout(3000)
                        except Exception:
                            self.warnings.append(f"Slow load for {url}, extracting partial content")

                    # Scroll to load lazy content
                    await self._scroll_page(page)

                    # Track counts before extraction for this page
                    colors_before = len(self.colors)
                    typo_before = len(self.typography)
                    spacing_before = len(self.spacing)
                    radius_before = len(self.radius)
                    shadows_before = len(self.shadows)

                    # 1. Extract DOM computed styles (original method)
                    styles = await self._extract_styles_from_page(page)
                    dom_colors = len(styles.get("colors", []))
                    self._aggregate_colors(styles.get("colors", []))
                    self._aggregate_typography(styles.get("typography", []))
                    self._aggregate_spacing(styles.get("spacing", []))
                    self._aggregate_radius(styles.get("radius", []))
                    self._aggregate_shadows(styles.get("shadows", []))

                    # 2. Extract CSS variables (--primary-color, etc.)
                    css_var_count = 0
                    try:
                        css_vars = await self._extract_css_variables(page)
                        css_var_count = len(css_vars.get("computed", {}))
                        self._process_css_variables(css_vars)
                    except Exception as e:
                        self.warnings.append(f"CSS variables extraction failed: {str(e)}")

                    # 3. Extract SVG colors (fill, stroke)
                    svg_color_count = await collect_colors(
                        page, self._extract_svg_colors, "SVG color extraction failed"
                    )

                    # 4. Extract inline style colors
                    inline_color_count = await collect_colors(
                        page, self._extract_inline_styles, "Inline style extraction failed"
                    )

                    # 5. Extract stylesheet colors (CSS rules)
                    stylesheet_color_count = await collect_colors(
                        page, self._extract_stylesheet_colors, "Stylesheet color extraction failed"
                    )

                    # 6. Fetch external CSS files (bypass CORS)
                    external_css_count = await collect_colors(
                        page, self._fetch_external_css_colors, "External CSS fetch failed"
                    )

                    # 7. Brute-force scan all page content for colors
                    page_scan_count = await collect_colors(
                        page, self._extract_all_page_colors, "Page scan failed"
                    )

                    # 8. Extract foreground-background color pairs for real AA checks
                    try:
                        fg_bg = await self._extract_fg_bg_pairs(page)
                        self.fg_bg_pairs.extend(fg_bg)
                    except Exception as e:
                        self.warnings.append(f"FG/BG pair extraction failed: {str(e)}")

                    # Store extraction stats for logging (only the most
                    # recent page is kept; each page overwrites the last)
                    self._last_extraction_stats = {
                        "url": url,
                        "dom_colors": dom_colors,
                        "css_variables": css_var_count,
                        "svg_colors": svg_color_count,
                        "inline_colors": inline_color_count,
                        "stylesheet_colors": stylesheet_color_count,
                        "external_css_colors": external_css_count,
                        "page_scan_colors": page_scan_count,
                        "new_colors": len(self.colors) - colors_before,
                        "new_typography": len(self.typography) - typo_before,
                        "new_spacing": len(self.spacing) - spacing_before,
                        "new_radius": len(self.radius) - radius_before,
                        "new_shadows": len(self.shadows) - shadows_before,
                    }

                    self.total_elements += styles.get("elements_count", 0)
                    pages_crawled.append(url)
                finally:
                    # Release the page even when extraction raised, so a
                    # failing URL does not leave a page open for the rest
                    # of the crawl.
                    await page.close()

                # Progress callback
                if progress_callback:
                    progress_callback((i + 1) / len(pages))

                # Rate limiting
                await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000)

            except Exception as e:
                self.errors.append(f"Error extracting {url}: {str(e)}")

    # Calculate confidence for all tokens
    for token in self.colors.values():
        token.confidence = self._calculate_confidence(token.frequency)
    for token in self.typography.values():
        token.confidence = self._calculate_confidence(token.frequency)
    for token in self.spacing.values():
        token.confidence = self._calculate_confidence(token.frequency)

    # Detect spacing base
    spacing_base = self._detect_spacing_base()

    # Mark outliers in spacing: tokens that do not sit on the detected grid
    if spacing_base:
        for token in self.spacing.values():
            if spacing_base == 8 and not token.fits_base_8:
                token.is_outlier = True
            elif spacing_base == 4 and not token.fits_base_4:
                token.is_outlier = True

    # Determine primary font: the most frequently seen family
    if self.font_families:
        primary_font = max(self.font_families.values(), key=lambda f: f.frequency)
        primary_font.usage = "primary"

    # Build result
    end_time = datetime.now()
    duration_ms = int((end_time - start_time).total_seconds() * 1000)

    return ExtractedTokens(
        viewport=self.viewport,
        source_url=pages[0] if pages else "",
        pages_crawled=pages_crawled,
        colors=list(self.colors.values()),
        typography=list(self.typography.values()),
        spacing=list(self.spacing.values()),
        radius=list(self.radius.values()),
        shadows=list(self.shadows.values()),
        font_families=list(self.font_families.values()),
        spacing_base=spacing_base,
        extraction_timestamp=start_time,
        extraction_duration_ms=duration_ms,
        total_elements_analyzed=self.total_elements,
        unique_colors=len(self.colors),
        unique_font_sizes=len(set(t.font_size for t in self.typography.values())),
        unique_spacing_values=len(self.spacing),
        errors=self.errors,
        warnings=self.warnings,
    )
1206
+
1207
+
1208
+ # =============================================================================
1209
+ # CONVENIENCE FUNCTIONS
1210
+ # =============================================================================
1211
+
1212
async def extract_from_pages(
    pages: list[str],
    viewport: Viewport = Viewport.DESKTOP
) -> ExtractedTokens:
    """Run a one-off token extraction over ``pages`` at the given viewport.

    Builds a fresh ``TokenExtractor`` for ``viewport`` and returns the
    result of its ``extract`` call.
    """
    return await TokenExtractor(viewport=viewport).extract(pages)
1219
+
1220
+
1221
async def extract_both_viewports(pages: list[str]) -> tuple[ExtractedTokens, ExtractedTokens]:
    """Extract tokens for ``pages`` at desktop and then mobile viewport.

    Both extractors are constructed up front; the two extractions then run
    one after the other. Returns ``(desktop_result, mobile_result)``.
    """
    extractors = [
        TokenExtractor(viewport=target)
        for target in (Viewport.DESKTOP, Viewport.MOBILE)
    ]

    desktop_tokens = await extractors[0].extract(pages)
    mobile_tokens = await extractors[1].extract(pages)

    return desktop_tokens, mobile_tokens
agents/llm_agents.py ADDED
@@ -0,0 +1,905 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 2 LLM Agents — Specialized Analysis Tasks
3
+ =================================================
4
+
5
+ These agents handle tasks that REQUIRE LLM reasoning:
6
+ - Brand Identifier: Identify brand colors from usage context
7
+ - Benchmark Advisor: Recommend best-fit design system
8
+ - Best Practices Validator: Prioritize fixes by business impact
9
+ - HEAD Synthesizer: Combine all outputs into final recommendations
10
+
11
+ Each agent has a focused prompt for its specific task.
12
+ """
13
+
14
+ import json
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Optional, Callable, Any
18
+ from datetime import datetime
19
+
20
+
21
+ # =============================================================================
22
+ # DATA CLASSES
23
+ # =============================================================================
24
+
25
@dataclass
class BrandIdentification:
    """Structured output of the Brand Identifier agent."""
    # brand_primary shape: {color, confidence, reasoning, usage_count}
    brand_primary: dict = field(default_factory=dict)

    brand_secondary: dict = field(default_factory=dict)
    brand_accent: dict = field(default_factory=dict)

    # One of: complementary, analogous, triadic, monochromatic, random
    palette_strategy: str = ""
    # Scale of 1-10
    cohesion_score: int = 5
    cohesion_notes: str = ""

    # Mapping of {hex_color: suggested_name}
    semantic_names: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return every field as a plain dict keyed by the field's own name."""
        exported = (
            "brand_primary",
            "brand_secondary",
            "brand_accent",
            "palette_strategy",
            "cohesion_score",
            "cohesion_notes",
            "semantic_names",
        )
        return {attr: getattr(self, attr) for attr in exported}
51
+
52
+
53
@dataclass
class BenchmarkAdvice:
    """Structured output of the Benchmark Advisor agent."""
    recommended_benchmark: str = ""
    recommended_benchmark_name: str = ""
    reasoning: str = ""

    # List of {change, from, to, effort}
    alignment_changes: list = field(default_factory=list)

    pros_of_alignment: list = field(default_factory=list)
    cons_of_alignment: list = field(default_factory=list)

    # List of {name, reason}
    alternative_benchmarks: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the advice as a plain dict.

        The pros/cons/alternatives fields are exported under the shorter
        keys ``"pros"``, ``"cons"`` and ``"alternatives"``.
        """
        key_to_attr = (
            ("recommended_benchmark", "recommended_benchmark"),
            ("recommended_benchmark_name", "recommended_benchmark_name"),
            ("reasoning", "reasoning"),
            ("alignment_changes", "alignment_changes"),
            ("pros", "pros_of_alignment"),
            ("cons", "cons_of_alignment"),
            ("alternatives", "alternative_benchmarks"),
        )
        return {key: getattr(self, attr) for key, attr in key_to_attr}
79
+
80
+
81
@dataclass
class BestPracticesResult:
    """Structured output of the Best Practices Validator agent."""
    # Scale of 0-100
    overall_score: int = 50

    # Mapping of {check_name: {status: pass/warn/fail, note: str}}
    checks: dict = field(default_factory=dict)

    # List of {rank, issue, impact, effort, action}
    priority_fixes: list = field(default_factory=list)

    passing_practices: list = field(default_factory=list)
    failing_practices: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the result as a plain dict.

        The practice lists are exported under the shorter keys
        ``"passing"`` and ``"failing"``.
        """
        key_to_attr = (
            ("overall_score", "overall_score"),
            ("checks", "checks"),
            ("priority_fixes", "priority_fixes"),
            ("passing", "passing_practices"),
            ("failing", "failing_practices"),
        )
        return {key: getattr(self, attr) for key, attr in key_to_attr}
103
+
104
+
105
@dataclass
class HeadSynthesis:
    """Final combined output produced by the HEAD agent."""
    executive_summary: str = ""

    # Mapping of {overall, accessibility, consistency, organization}
    scores: dict = field(default_factory=dict)

    # Mapping of {closest, similarity, recommendation}
    benchmark_fit: dict = field(default_factory=dict)

    # Mapping of {primary, secondary, cohesion}
    brand_analysis: dict = field(default_factory=dict)

    # List of {action, impact, effort, details}
    top_3_actions: list = field(default_factory=list)

    # List of {role, current, suggested, reason, accept}
    color_recommendations: list = field(default_factory=list)

    type_scale_recommendation: dict = field(default_factory=dict)
    spacing_recommendation: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return every field as a plain dict keyed by the field's own name."""
        exported = (
            "executive_summary",
            "scores",
            "benchmark_fit",
            "brand_analysis",
            "top_3_actions",
            "color_recommendations",
            "type_scale_recommendation",
            "spacing_recommendation",
        )
        return {attr: getattr(self, attr) for attr in exported}
139
+
140
+
141
+ # =============================================================================
142
+ # BRAND IDENTIFIER AGENT
143
+ # =============================================================================
144
+
145
class BrandIdentifierAgent:
    """
    Identifies brand colors from usage context.

    WHY LLM: Requires understanding context (33 buttons = likely brand primary),
    not just color math.
    """

    PROMPT_TEMPLATE = """You are a senior design system analyst. Identify the brand colors from this color usage data.

## COLOR DATA WITH USAGE CONTEXT

{color_data}

## SEMANTIC ANALYSIS (from CSS properties)

{semantic_analysis}

## YOUR TASK

1. **Identify Brand Colors**:
- Brand Primary: The main action/CTA color (highest visibility)
- Brand Secondary: Supporting brand color
- Brand Accent: Highlight color for emphasis

2. **Assess Palette Strategy**:
- Is it complementary, analogous, triadic, monochromatic, or random?

3. **Rate Cohesion** (1-10):
- Do the colors work together?
- Is there a clear color story?

4. **Suggest Semantic Names** for top 10 most-used colors

## OUTPUT FORMAT (JSON only)

{{
"brand_primary": {{
"color": "#hex",
"confidence": "high|medium|low",
"reasoning": "Why this is brand primary",
"usage_count": <number>
}},
"brand_secondary": {{
"color": "#hex",
"confidence": "high|medium|low",
"reasoning": "..."
}},
"brand_accent": {{
"color": "#hex or null",
"confidence": "...",
"reasoning": "..."
}},
"palette_strategy": "complementary|analogous|triadic|monochromatic|random",
"cohesion_score": <1-10>,
"cohesion_notes": "Assessment of how well colors work together",
"semantic_names": {{
"#hex1": "brand.primary",
"#hex2": "text.primary",
"#hex3": "background.primary"
}}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client object used for completions; analyze() awaits its
        # complete_async(...) method.
        self.hf_client = hf_client

    async def analyze(
        self,
        color_tokens: dict,
        semantic_analysis: dict,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> BrandIdentification:
        """
        Identify brand colors from usage context.

        Any exception raised while calling the client or handling its
        response is logged through ``log_callback`` and an empty
        ``BrandIdentification`` is returned instead of propagating.

        Args:
            color_tokens: Dict of color tokens with usage data
            semantic_analysis: Semantic categorization from Stage 1
            log_callback: Progress logging function

        Returns:
            BrandIdentification with identified colors
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log(" 🎨 Brand Identifier (Llama 70B)")
        log(" └─ Analyzing color context and usage patterns...")

        # Format color data
        color_data = self._format_color_data(color_tokens)
        semantic_str = self._format_semantic_analysis(semantic_analysis)

        prompt = self.PROMPT_TEMPLATE.format(
            color_data=color_data,
            semantic_analysis=semantic_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="brand_identifier",
                system_prompt="You are a senior design system analyst specializing in brand color identification.",
                user_message=prompt,
                max_tokens=800,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            # Parse response
            result = self._parse_response(response)

            log(" ────────────────────────────────────────────────")
            log(f" 🎨 Brand Identifier: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')} confidence)")
            log(f" ├─ Brand Secondary: {result.brand_secondary.get('color', '?')}")
            log(f" ├─ Palette Strategy: {result.palette_strategy}")
            log(f" └─ Cohesion Score: {result.cohesion_score}/10")

            return result

        except Exception as e:
            error_msg = str(e)
            # Parse common HF errors
            if "Rate limit" in error_msg or "429" in error_msg:
                log(" ⚠️ Rate limited - HF free tier exhausted")
            elif "Request ID:" in error_msg:
                log(" ⚠️ HF API error (check token/model)")
            else:
                log(f" ⚠️ Error: {error_msg[:60]}")
            return BrandIdentification()

    def _format_color_data(self, color_tokens: dict) -> str:
        """Format up to the first 30 color tokens as prompt bullet lines.

        Tokens may be dicts or attribute-style objects. The usage count is
        read from ``usage_count``, then ``count`` (dicts only), then
        ``frequency``, defaulting to 1. Tokens without a color value are
        skipped.
        """
        lines = []
        for token in list(color_tokens.values())[:30]:
            if isinstance(token, dict):
                hex_val = token.get("value", token.get("hex", ""))
                usage = token.get(
                    "usage_count",
                    token.get("count", token.get("frequency", 1)),
                )
                context = token.get("context", token.get("css_property", ""))
            else:
                hex_val = getattr(token, "value", "")
                usage = getattr(token, "usage_count", None)
                if usage is None:
                    # NOTE(review): extractor tokens appear to count
                    # occurrences in `frequency` rather than `usage_count`;
                    # confirm against the caller.
                    usage = getattr(token, "frequency", 1)
                context = getattr(token, "context", "")

            if hex_val:
                lines.append(f"- {hex_val}: used {usage}x, context: {context or 'unknown'}")

        return "\n".join(lines) if lines else "No color data available"

    def _format_semantic_analysis(self, semantic: dict) -> str:
        """Format the semantic analysis as prompt bullet lines.

        Empty categories are skipped. Lists and nested dicts are truncated
        to their first five entries. Any error while formatting is returned
        as a short message string rather than raised.
        """
        if not semantic:
            return "No semantic analysis available"

        lines = []
        try:
            for category, value in semantic.items():
                if not value:
                    continue

                if isinstance(value, list):
                    # List of colors
                    color_list = []
                    for c in value[:5]:
                        if isinstance(c, dict):
                            color_list.append(c.get("hex", c.get("value", str(c))))
                        else:
                            color_list.append(str(c))
                    lines.append(f"- {category}: {', '.join(color_list)}")

                elif isinstance(value, dict):
                    # Could be a nested dict of sub-roles → color dicts
                    # e.g. {"primary": {"hex": "#007bff", ...}, "secondary": {...}}
                    # or a flat color dict {"hex": "#...", "confidence": "..."}
                    # or a summary dict {"total_colors_analyzed": 50, ...}
                    if "hex" in value:
                        # Flat color dict
                        lines.append(f"- {category}: {value['hex']}")
                    else:
                        # Nested dict — iterate sub-roles
                        sub_items = []
                        for sub_role, sub_val in list(value.items())[:5]:
                            if isinstance(sub_val, dict) and "hex" in sub_val:
                                sub_items.append(f"{sub_role}={sub_val['hex']}")
                            elif isinstance(sub_val, (str, int, float, bool)):
                                sub_items.append(f"{sub_role}={sub_val}")
                        if sub_items:
                            lines.append(f"- {category}: {', '.join(sub_items)}")
                else:
                    lines.append(f"- {category}: {value}")
        except Exception as e:
            return f"Error formatting semantic analysis: {str(e)[:50]}"

        return "\n".join(lines) if lines else "No semantic analysis available"

    def _parse_response(self, response: str) -> BrandIdentification:
        """Parse LLM response into BrandIdentification.

        The outermost ``{...}`` span of ``response`` is decoded as JSON. If
        no span is found, or it does not decode to an object, an empty
        ``BrandIdentification`` is returned. Fields that are missing,
        ``null`` or of the wrong type fall back to their defaults so that
        one bad field does not discard the rest of the answer.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if not json_match:
                return BrandIdentification()
            data = json.loads(json_match.group())
        except (TypeError, ValueError, RecursionError):
            # Non-text response or malformed JSON: best-effort default.
            return BrandIdentification()

        if not isinstance(data, dict):
            return BrandIdentification()

        def dict_field(key: str) -> dict:
            # JSON null decodes to None; callers expect a dict they can .get() on.
            value = data.get(key)
            return value if isinstance(value, dict) else {}

        def text_field(key: str, default: str) -> str:
            value = data.get(key, default)
            return value if isinstance(value, str) else default

        try:
            cohesion_score = int(data.get("cohesion_score", 5))
        except (TypeError, ValueError):
            cohesion_score = 5

        return BrandIdentification(
            brand_primary=dict_field("brand_primary"),
            brand_secondary=dict_field("brand_secondary"),
            brand_accent=dict_field("brand_accent"),
            palette_strategy=text_field("palette_strategy", "unknown"),
            cohesion_score=cohesion_score,
            cohesion_notes=text_field("cohesion_notes", ""),
            semantic_names=dict_field("semantic_names"),
        )
366
+
367
+
368
+ # =============================================================================
369
+ # BENCHMARK ADVISOR AGENT
370
+ # =============================================================================
371
+
372
class BenchmarkAdvisorAgent:
    """
    Recommends best-fit design system based on comparison data.

    WHY LLM: Requires reasoning about trade-offs and use-case fit,
    not just similarity scores.
    """

    PROMPT_TEMPLATE = """You are a senior design system consultant. Recommend the best design system alignment.

## USER'S CURRENT VALUES

- Type Scale Ratio: {user_ratio}
- Base Font Size: {user_base}px
- Spacing Grid: {user_spacing}px

## BENCHMARK COMPARISON

{benchmark_comparison}

## YOUR TASK

1. **Recommend Best Fit**: Which design system should they align with?
2. **Explain Why**: Consider similarity scores AND use-case fit
3. **List Changes Needed**: What would they need to change to align?
4. **Pros/Cons**: Benefits and drawbacks of alignment

## OUTPUT FORMAT (JSON only)

{{
"recommended_benchmark": "<system_key>",
"recommended_benchmark_name": "<full name>",
"reasoning": "Why this is the best fit for their use case",
"alignment_changes": [
{{"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}},
{{"change": "Spacing grid", "from": "mixed", "to": "4px", "effort": "high"}}
],
"pros_of_alignment": [
"Familiar patterns for users",
"Well-tested accessibility"
],
"cons_of_alignment": [
"May lose brand uniqueness"
],
"alternative_benchmarks": [
{{"name": "Material Design 3", "reason": "Good for Android-first products"}}
]
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        # Client object used for completions; analyze() awaits its
        # complete_async(...) method.
        self.hf_client = hf_client

    async def analyze(
        self,
        user_ratio: float,
        user_base: int,
        user_spacing: int,
        benchmark_comparisons: list,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> BenchmarkAdvice:
        """
        Recommend best-fit design system.

        Any exception raised while calling the client or handling its
        response is logged through ``log_callback`` and an empty
        ``BenchmarkAdvice`` is returned instead of propagating.

        Args:
            user_ratio: User's detected type scale ratio
            user_base: User's base font size
            user_spacing: User's spacing grid base
            benchmark_comparisons: List of BenchmarkComparison objects
            log_callback: Progress logging function

        Returns:
            BenchmarkAdvice with recommendations
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log(" 🏢 Benchmark Advisor (Qwen 72B)")
        log(" └─ Evaluating benchmark fit for your use case...")

        # Format comparison data
        comparison_str = self._format_comparisons(benchmark_comparisons)

        prompt = self.PROMPT_TEMPLATE.format(
            user_ratio=user_ratio,
            user_base=user_base,
            user_spacing=user_spacing,
            benchmark_comparison=comparison_str,
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="benchmark_advisor",
                system_prompt="You are a senior design system consultant specializing in design system architecture.",
                user_message=prompt,
                max_tokens=700,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)

            # Describe the first requested change without assuming the
            # model returned well-formed {change, ...} objects.
            if not result.alignment_changes:
                key_change = 'None'
            elif isinstance(result.alignment_changes[0], dict):
                key_change = result.alignment_changes[0].get('change', 'N/A')
            else:
                key_change = str(result.alignment_changes[0])

            log(" ────────────────────────────────────────────────")
            log(f" 🏢 Benchmark Advisor: COMPLETE ({duration:.1f}s)")
            log(f" ├─ Recommended: {result.recommended_benchmark_name}")
            log(f" ├─ Changes Needed: {len(result.alignment_changes)}")
            log(f" └─ Key Change: {key_change}")

            return result

        except Exception as e:
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return BenchmarkAdvice()

    def _format_comparisons(self, comparisons: list) -> str:
        """Format up to the first five benchmark comparisons for the prompt.

        Each comparison is read through its ``benchmark`` attribute plus the
        similarity/diff attributes interpolated below. An empty input
        yields an explicit placeholder line rather than an empty section.
        """
        lines = []
        for rank, c in enumerate(comparisons[:5], start=1):
            b = c.benchmark
            lines.append(f"""
{rank}. {b.icon} {b.name}
- Similarity Score: {c.similarity_score:.2f} (lower = better)
- Match: {c.overall_match_pct:.0f}%
- Type Ratio: {b.typography.get('scale_ratio', '?')} (diff: {c.type_ratio_diff:.3f})
- Base Size: {b.typography.get('base_size', '?')}px (diff: {c.base_size_diff})
- Spacing: {b.spacing.get('base', '?')}px (diff: {c.spacing_grid_diff})
- Best For: {', '.join(b.best_for)}""")

        return "\n".join(lines) if lines else "No benchmark comparison data available"

    def _parse_response(self, response: str) -> BenchmarkAdvice:
        """Parse LLM response into BenchmarkAdvice.

        The outermost ``{...}`` span of ``response`` is decoded as JSON. If
        no span is found, or it does not decode to an object, an empty
        ``BenchmarkAdvice`` is returned. Fields that are missing, ``null``
        or of the wrong type fall back to their defaults so that one bad
        field does not discard the rest of the answer.
        """
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if not json_match:
                return BenchmarkAdvice()
            data = json.loads(json_match.group())
        except (TypeError, ValueError, RecursionError):
            # Non-text response or malformed JSON: best-effort default.
            return BenchmarkAdvice()

        if not isinstance(data, dict):
            return BenchmarkAdvice()

        def text_field(key: str) -> str:
            value = data.get(key, "")
            return value if isinstance(value, str) else ""

        def list_field(key: str) -> list:
            # JSON null decodes to None; callers take len() of these fields.
            value = data.get(key)
            return value if isinstance(value, list) else []

        return BenchmarkAdvice(
            recommended_benchmark=text_field("recommended_benchmark"),
            recommended_benchmark_name=text_field("recommended_benchmark_name"),
            reasoning=text_field("reasoning"),
            alignment_changes=list_field("alignment_changes"),
            pros_of_alignment=list_field("pros_of_alignment"),
            cons_of_alignment=list_field("cons_of_alignment"),
            alternative_benchmarks=list_field("alternative_benchmarks"),
        )
527
+
528
+
529
+ # =============================================================================
530
+ # BEST PRACTICES VALIDATOR AGENT
531
+ # =============================================================================
532
+
533
+ class BestPracticesValidatorAgent:
534
+ """
535
+ Validates against design system best practices and prioritizes fixes.
536
+
537
+ WHY LLM: Prioritization requires judgment about business impact,
538
+ not just checking boxes.
539
+ """
540
+
541
# Prompt sent to the LLM by `analyze`. It is rendered with `str.format`, so
# `{name}` fields are placeholders filled from the rule-engine results and
# doubled braces (`{{` / `}}`) are literal braces of the JSON example.
#
# Placeholders: type_ratio, type_consistent, base_size, type_recommendation,
# total_colors, aa_pass, aa_fail, failing_colors, spacing_base,
# spacing_aligned, spacing_recommendation, unique_colors, duplicates,
# near_duplicates.
#
# NOTE(review): the checklist below lists 8 practices, but the "checks" schema
# in OUTPUT FORMAT only has 6 keys (no entry for line height or for
# near-duplicate colors), and no line-height data is supplied in the prompt.
# Confirm whether that is intentional.
PROMPT_TEMPLATE = """You are a design system auditor. Validate these tokens against best practices.

## RULE ENGINE ANALYSIS RESULTS

### Typography
- Detected Ratio: {type_ratio} ({type_consistent})
- Base Size: {base_size}px
- Recommendation: {type_recommendation}

### Accessibility
- Total Colors: {total_colors}
- AA Pass: {aa_pass}
- AA Fail: {aa_fail}
- Failing Colors: {failing_colors}

### Spacing
- Detected Base: {spacing_base}px
- Grid Aligned: {spacing_aligned}%
- Recommendation: {spacing_recommendation}px

### Color Statistics
- Unique Colors: {unique_colors}
- Duplicates: {duplicates}
- Near-Duplicates: {near_duplicates}

## BEST PRACTICES CHECKLIST

1. Type scale uses standard ratio (1.2, 1.25, 1.333, 1.5, 1.618)
2. Type scale is consistent (variance < 0.15)
3. Base font size >= 16px (accessibility)
4. Line height >= 1.5 for body text
5. All interactive colors pass AA (4.5:1)
6. Spacing uses consistent grid (4px or 8px)
7. Limited color palette (< 20 unique semantic colors)
8. No near-duplicate colors

## YOUR TASK

1. Score each practice: pass/warn/fail
2. Calculate overall score (0-100)
3. Identify TOP 3 priority fixes with impact assessment

## OUTPUT FORMAT (JSON only)

{{
"overall_score": <0-100>,
"checks": {{
"type_scale_standard": {{"status": "pass|warn|fail", "note": "..."}},
"type_scale_consistent": {{"status": "...", "note": "..."}},
"base_size_accessible": {{"status": "...", "note": "..."}},
"aa_compliance": {{"status": "...", "note": "..."}},
"spacing_grid": {{"status": "...", "note": "..."}},
"color_count": {{"status": "...", "note": "..."}}
}},
"priority_fixes": [
{{
"rank": 1,
"issue": "Brand primary fails AA",
"impact": "high|medium|low",
"effort": "low|medium|high",
"action": "Change #06b2c4 → #0891a8"
}}
],
"passing_practices": ["Base font size", "..."],
"failing_practices": ["AA compliance", "..."]
}}

Return ONLY valid JSON."""
609
+
610
def __init__(self, hf_client):
    """Store the LLM client used by this agent.

    Args:
        hf_client: Client object whose ``complete_async(...)`` coroutine is
            awaited by ``analyze`` to obtain the model response.
    """
    self.hf_client = hf_client
612
+
613
async def analyze(
    self,
    rule_engine_results: Any,
    log_callback: Callable = None,
) -> BestPracticesResult:
    """
    Validate against best practices.

    Renders ``PROMPT_TEMPLATE`` from the rule-engine results, sends it to the
    LLM client and parses the JSON answer. A failure of the LLM call or of
    the parsing step is logged and answered with a default
    ``BestPracticesResult`` (best effort); a successfully parsed result is
    never discarded because of a problem while logging its summary.

    Args:
        rule_engine_results: Results from rule engine; must expose the
            ``typography``, ``spacing``, ``color_stats`` and
            ``accessibility`` attributes read below.
        log_callback: Progress logging function (optional).

    Returns:
        BestPracticesResult with validation
    """
    def log(msg: str):
        if log_callback:
            log_callback(msg)

    def count(value) -> int:
        # The lists come from LLM output and may be null or malformed.
        return len(value) if isinstance(value, (list, tuple)) else 0

    log("")
    log(" ✅ Best Practices Validator (Qwen 72B)")
    log(" └─ Checking against design system standards...")

    # Extract data from rule engine
    typo = rule_engine_results.typography
    spacing = rule_engine_results.spacing
    color_stats = rule_engine_results.color_stats
    accessibility = rule_engine_results.accessibility

    failures = [a for a in accessibility if not a.passes_aa_normal]
    # Only the first five failing colors are listed in the prompt.
    failing_colors_str = ", ".join(
        f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:5]
    )

    prompt = self.PROMPT_TEMPLATE.format(
        type_ratio=f"{typo.detected_ratio:.3f}",
        type_consistent="consistent" if typo.is_consistent else f"inconsistent, variance={typo.variance:.2f}",
        base_size=typo.sizes_px[0] if typo.sizes_px else 16,
        type_recommendation=f"{typo.recommendation} ({typo.recommendation_name})",
        total_colors=len(accessibility),
        aa_pass=len(accessibility) - len(failures),
        aa_fail=len(failures),
        failing_colors=failing_colors_str or "None",
        spacing_base=spacing.detected_base,
        spacing_aligned=f"{spacing.alignment_percentage:.0f}",
        spacing_recommendation=spacing.recommendation,
        unique_colors=color_stats.unique_count,
        duplicates=color_stats.duplicate_count,
        near_duplicates=len(color_stats.near_duplicates),
    )

    # Keep the try body limited to the steps that are expected to fail
    # (network call, parsing) so a logging hiccup cannot throw away a
    # valid result.
    try:
        start_time = datetime.now()

        response = await self.hf_client.complete_async(
            agent_name="best_practices_validator",
            system_prompt="You are a design system auditor specializing in best practices validation.",
            user_message=prompt,
            max_tokens=800,
            json_mode=True,
        )

        duration = (datetime.now() - start_time).total_seconds()

        result = self._parse_response(response)
    except Exception as e:
        log(f" ├─ ⚠️ Error: {str(e)[:50]}")
        return BestPracticesResult()

    log(" ────────────────────────────────────────────────")
    log(f" ✅ Best Practices: COMPLETE ({duration:.1f}s)")
    log(f" ├─ Overall Score: {result.overall_score}/100")
    log(f" ├─ Passing: {count(result.passing_practices)} | Failing: {count(result.failing_practices)}")
    if count(result.priority_fixes):
        top_fix = result.priority_fixes[0]
        # The model occasionally returns plain strings instead of objects.
        issue = top_fix.get("issue", "N/A") if isinstance(top_fix, dict) else str(top_fix)
        log(f" └─ Top Fix: {issue}")

    return result
689
+
690
def _parse_response(self, response: str) -> BestPracticesResult:
    """Parse LLM response into BestPracticesResult.

    The first JSON object found in *response* is decoded; text before it
    (e.g. a Markdown fence) and anything after it is ignored. Fields that
    are missing, ``null`` or of an unexpected type fall back to the same
    defaults used for missing keys, so callers can rely on dict/list types.

    Args:
        response: Raw text returned by the LLM client.

    Returns:
        The parsed result, or a default ``BestPracticesResult`` when
        *response* is not a string or contains no decodable JSON object.
    """
    if not isinstance(response, str):
        return BestPracticesResult()

    start = response.find("{")
    if start == -1:
        return BestPracticesResult()

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing prose or a stray "}" after the object does not break parsing.
        data, _end = json.JSONDecoder().raw_decode(response[start:])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return BestPracticesResult()

    def as_list(value):
        return value if isinstance(value, list) else []

    score = data.get("overall_score", 50)
    if not isinstance(score, (int, float)):
        # Accept numeric strings such as "85"; anything else gets the default.
        try:
            score = int(float(score))
        except (TypeError, ValueError):
            score = 50

    checks = data.get("checks")
    if not isinstance(checks, dict):
        checks = {}

    # Consumers call .get() on each fix, so keep dict entries and wrap bare
    # strings as {"issue": ...}; other shapes are dropped.
    priority_fixes = []
    for fix in as_list(data.get("priority_fixes")):
        if isinstance(fix, dict):
            priority_fixes.append(fix)
        elif isinstance(fix, str):
            priority_fixes.append({"issue": fix})

    return BestPracticesResult(
        overall_score=score,
        checks=checks,
        priority_fixes=priority_fixes,
        passing_practices=as_list(data.get("passing_practices")),
        failing_practices=as_list(data.get("failing_practices")),
    )
707
+
708
+
709
+ # =============================================================================
710
+ # HEAD SYNTHESIZER AGENT
711
+ # =============================================================================
712
+
713
class HeadSynthesizerAgent:
    """
    Combines all agent outputs into final recommendations.

    This is the final step that produces actionable output for the user.
    """

    # Rendered with str.format in `synthesize`: `{name}` fields are
    # placeholders, doubled braces are literal braces of the JSON example.
    PROMPT_TEMPLATE = """You are a senior design system architect. Synthesize these analysis results into final recommendations.

## RULE ENGINE FACTS

- Type Scale: {type_ratio} ({type_status})
- Base Size: {base_size}px
- AA Failures: {aa_failures}
- Spacing Grid: {spacing_status}
- Unique Colors: {unique_colors}
- Consistency Score: {consistency_score}/100

## BENCHMARK COMPARISON

Closest Match: {closest_benchmark}
Match Percentage: {match_pct}%
Recommended Changes: {benchmark_changes}

## BRAND IDENTIFICATION

- Brand Primary: {brand_primary}
- Brand Secondary: {brand_secondary}
- Palette Cohesion: {cohesion_score}/10

## BEST PRACTICES VALIDATION

Overall Score: {best_practices_score}/100
Priority Fixes: {priority_fixes}

## ACCESSIBILITY FIXES NEEDED

{accessibility_fixes}

## YOUR TASK

Synthesize ALL the above into:
1. Executive Summary (2-3 sentences)
2. Overall Scores
3. Top 3 Priority Actions (with effort estimates)
4. Specific Color Recommendations (with accept/reject defaults)
5. Type Scale Recommendation
6. Spacing Recommendation

## OUTPUT FORMAT (JSON only)

{{
"executive_summary": "Your design system scores X/100. Key issues are Y. Priority action is Z.",
"scores": {{
"overall": <0-100>,
"accessibility": <0-100>,
"consistency": <0-100>,
"organization": <0-100>
}},
"benchmark_fit": {{
"closest": "<name>",
"similarity": "<X%>",
"recommendation": "Align type scale to 1.25"
}},
"brand_analysis": {{
"primary": "#hex",
"secondary": "#hex",
"cohesion": <1-10>
}},
"top_3_actions": [
{{"action": "Fix brand color AA", "impact": "high", "effort": "5 min", "details": "Change #X to #Y"}}
],
"color_recommendations": [
{{"role": "brand.primary", "current": "#06b2c4", "suggested": "#0891a8", "reason": "AA compliance", "accept": true}}
],
"type_scale_recommendation": {{
"current_ratio": 1.18,
"recommended_ratio": 1.25,
"reason": "Align with industry standard"
}},
"spacing_recommendation": {{
"current": "mixed",
"recommended": "8px",
"reason": "Consistent grid improves maintainability"
}}
}}

Return ONLY valid JSON."""

    def __init__(self, hf_client):
        """Store the LLM client whose ``complete_async`` is awaited by ``synthesize``."""
        self.hf_client = hf_client

    @staticmethod
    def _text(mapping: Any, key: str, default: str = "") -> str:
        """Return ``mapping[key]`` as text, or *default* if it cannot be read.

        The inputs originate from earlier LLM agents, so *mapping* may not be
        a dict and the value may be missing or ``None``.
        """
        if not isinstance(mapping, dict):
            return default
        value = mapping.get(key)
        return default if value is None else str(value)

    @staticmethod
    def _extract_json_object(response: Any) -> Any:
        """Decode the first JSON object contained in *response*.

        Text before the first ``{`` and anything after the decoded object is
        ignored. Returns the decoded ``dict``, or ``None`` when *response* is
        not a string or holds no decodable JSON object.
        """
        if not isinstance(response, str):
            return None
        start = response.find("{")
        if start == -1:
            return None
        try:
            # raw_decode stops after the first complete value, so trailing
            # prose containing braces does not invalidate the object.
            parsed, _end = json.JSONDecoder().raw_decode(response[start:])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None
        return parsed if isinstance(parsed, dict) else None

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        """
        Synthesize all results into final recommendations.

        Args:
            rule_engine_results: Object exposing ``typography``, ``spacing``,
                ``color_stats``, ``accessibility`` and ``consistency_score``.
            benchmark_comparisons: Comparisons; only the first entry is used
                as the closest match.
            brand_identification: Brand agent output (``brand_primary``,
                ``brand_secondary``, ``cohesion_score`` are read).
            benchmark_advice: Benchmark agent output (``alignment_changes``
                is read).
            best_practices: Validator output (``overall_score`` and
                ``priority_fixes`` are read).
            log_callback: Progress logging function (optional).

        Returns:
            HeadSynthesis parsed from the LLM answer, or a default instance
            when the LLM call or parsing fails.
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 LAYER 4: HEAD SYNTHESIZER")
        log("═" * 60)
        log("")
        log(" Combining: Rule Engine + Benchmarks + Brand + Best Practices...")

        # Extract data
        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility

        failures = [a for a in accessibility if not a.passes_aa_normal]
        # Of the first five failures, list those that have a suggested fix.
        aa_fixes_str = "\n".join(
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:5] if a.suggested_fix
        )

        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        # Upstream agent outputs are LLM-derived; read them defensively so a
        # malformed entry cannot crash the final synthesis step.
        alignment_changes = (benchmark_advice.alignment_changes or [])[:3]
        top_fixes = (best_practices.priority_fixes or [])[:3]

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            closest_benchmark=closest.benchmark.name if closest else "Unknown",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            benchmark_changes="; ".join(self._text(c, "change") for c in alignment_changes),
            brand_primary=self._text(brand_identification.brand_primary, "color", "Unknown"),
            brand_secondary=self._text(brand_identification.brand_secondary, "color", "Unknown"),
            cohesion_score=brand_identification.cohesion_score,
            best_practices_score=best_practices.overall_score,
            priority_fixes="; ".join(self._text(f, "issue") for f in top_fixes),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start_time = datetime.now()

            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt="You are a senior design system architect specializing in synthesis and recommendations.",
                user_message=prompt,
                max_tokens=1000,
                json_mode=True,
            )

            duration = (datetime.now() - start_time).total_seconds()

            result = self._parse_response(response)
        except Exception as e:
            log(f" ├─ ⚠️ Error: {str(e)[:50]}")
            return HeadSynthesis()

        log("")
        log(f" ✅ HEAD Synthesizer: COMPLETE ({duration:.1f}s)")
        log("")

        return result

    def _parse_response(self, response: str) -> HeadSynthesis:
        """Parse LLM response into HeadSynthesis.

        Missing, ``null`` or wrongly typed fields fall back to the empty
        default of the expected type. A response without a decodable JSON
        object yields a default ``HeadSynthesis``.
        """
        data = self._extract_json_object(response)
        if data is None:
            return HeadSynthesis()

        def as_dict(value):
            return value if isinstance(value, dict) else {}

        def as_list(value):
            return value if isinstance(value, list) else []

        summary = data.get("executive_summary")

        return HeadSynthesis(
            executive_summary=summary if isinstance(summary, str) else "",
            scores=as_dict(data.get("scores")),
            benchmark_fit=as_dict(data.get("benchmark_fit")),
            brand_analysis=as_dict(data.get("brand_analysis")),
            top_3_actions=as_list(data.get("top_3_actions")),
            color_recommendations=as_list(data.get("color_recommendations")),
            type_scale_recommendation=as_dict(data.get("type_scale_recommendation")),
            spacing_recommendation=as_dict(data.get("spacing_recommendation")),
        )