riazmo committed on
Commit
e3a838d
·
verified ·
1 Parent(s): e486cfa

Upload extractor.py

Browse files
Files changed (1) hide show
  1. agents/extractor.py +994 -0
agents/extractor.py ADDED
@@ -0,0 +1,994 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 1: Token Extractor
3
+ Design System Extractor v2
4
+
5
+ Persona: Meticulous Design Archaeologist
6
+
7
+ Responsibilities:
8
+ - Crawl pages at specified viewport
9
+ - Extract computed styles from all elements
10
+ - Parse CSS files for variables and rules
11
+ - Extract colors from SVGs
12
+ - Collect colors, typography, spacing, radius, shadows
13
+ - Track frequency and context for each token
14
+ """
15
+
16
+ import asyncio
17
+ import re
18
+ from typing import Optional, Callable
19
+ from datetime import datetime
20
+ from collections import defaultdict
21
+
22
+ from playwright.async_api import async_playwright, Browser, Page, BrowserContext
23
+
24
+ from core.token_schema import (
25
+ Viewport,
26
+ ExtractedTokens,
27
+ ColorToken,
28
+ TypographyToken,
29
+ SpacingToken,
30
+ RadiusToken,
31
+ ShadowToken,
32
+ FontFamily,
33
+ TokenSource,
34
+ Confidence,
35
+ )
36
+ from core.color_utils import (
37
+ normalize_hex,
38
+ parse_color,
39
+ get_contrast_with_white,
40
+ get_contrast_with_black,
41
+ check_wcag_compliance,
42
+ )
43
+ from config.settings import get_settings
44
+
45
+
46
class TokenExtractor:
    """
    Extracts design tokens from web pages.

    This is the second part of Agent 1's job — after pages are confirmed,
    we crawl and extract all CSS values.

    Sources combined for every page:
    - Computed styles of every visible DOM element
    - CSS custom properties found in accessible stylesheets
    - SVG fill/stroke colors
    - Colors inside inline ``style`` attributes
    - Colors declared in stylesheet rules

    Token state lives on the instance and is never reset, so calling
    ``extract`` more than once on the same instance accumulates results.
    """

    # Matches a hex / rgb() / rgba() / hsl() / hsla() color literal. It is
    # applied with ``match`` below, i.e. anchored at the start of the value.
    _COLOR_VALUE_RE = re.compile(
        r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)',
        re.IGNORECASE,
    )

    def __init__(self, viewport: Viewport = Viewport.DESKTOP):
        """
        Create an extractor for one viewport.

        Args:
            viewport: Viewport whose configured width/height is used for the
                browser context. Anything other than ``Viewport.DESKTOP``
                uses the mobile dimensions from settings.
        """
        self.settings = get_settings()
        self.viewport = viewport

        # Browser handles; populated by _init_browser, cleared by _close_browser.
        self._playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None

        # Token collection, keyed by the de-duplication key of each token type.
        self.colors: dict[str, ColorToken] = {}
        self.typography: dict[str, TypographyToken] = {}
        self.spacing: dict[str, SpacingToken] = {}
        self.radius: dict[str, RadiusToken] = {}
        self.shadows: dict[str, ShadowToken] = {}

        # CSS Variables collection (merged across all crawled pages)
        self.css_variables: dict[str, str] = {}

        # Font tracking
        self.font_families: dict[str, FontFamily] = {}

        # Statistics
        self.total_elements = 0
        self.errors: list[str] = []
        self.warnings: list[str] = []

        # Per-source counts for the most recently extracted page.
        self._last_extraction_stats: dict = {}

    async def __aenter__(self):
        """Async context manager entry: start the browser and return self."""
        await self._init_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: release all browser resources."""
        await self._close_browser()

    async def _init_browser(self):
        """
        Start Playwright, launch Chromium and open a browser context.

        The Playwright handle is kept on the instance so that
        ``_close_browser`` can stop it. If launching fails part-way,
        everything started so far is torn down before the error is re-raised
        (``__aexit__`` does not run when ``__aenter__`` fails).
        """
        self._playwright = await async_playwright().start()
        try:
            self.browser = await self._playwright.chromium.launch(
                headless=self.settings.browser.headless
            )

            # Set viewport based on extraction mode
            viewport_settings = self.settings.viewport
            if self.viewport == Viewport.DESKTOP:
                width = viewport_settings.desktop_width
                height = viewport_settings.desktop_height
            else:
                width = viewport_settings.mobile_width
                height = viewport_settings.mobile_height

            self.context = await self.browser.new_context(
                viewport={"width": width, "height": height},
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
            )
        except Exception:
            await self._close_browser()
            raise

    async def _close_browser(self):
        """
        Close the context, the browser and the Playwright driver.

        Each step runs even if the previous one raised, and the handles are
        reset to ``None`` so a later ``_init_browser`` starts from scratch.
        """
        try:
            if self.context:
                await self.context.close()
        finally:
            self.context = None
            try:
                if self.browser:
                    await self.browser.close()
            finally:
                self.browser = None
                if self._playwright:
                    playwright, self._playwright = self._playwright, None
                    await playwright.stop()

    async def _scroll_page(self, page: Page):
        """
        Scroll the page one viewport at a time to load lazy content.

        After scrolling back to the top, waits for the ``networkidle`` load
        state using the configured timeout; a timeout there propagates to the
        caller.
        """
        await page.evaluate("""
            async () => {
                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                const height = document.body.scrollHeight;
                const step = window.innerHeight;

                for (let y = 0; y < height; y += step) {
                    window.scrollTo(0, y);
                    await delay(100);
                }

                // Scroll back to top
                window.scrollTo(0, 0);
            }
        """)

        # Wait for network idle after scrolling
        await page.wait_for_load_state(
            "networkidle",
            timeout=self.settings.browser.network_idle_timeout,
        )

    async def _extract_styles_from_page(self, page: Page) -> dict:
        """
        Extract computed styles from all elements on the page.

        This is the core extraction logic — we call getComputedStyle for every
        element and skip those with ``display: none`` or ``visibility: hidden``.

        Returns:
            A dict with the lists ``colors``, ``typography``, ``spacing``,
            ``radius`` and ``shadows`` plus ``elements_count`` (the number of
            elements matched by ``*``, including skipped ones).
        """
        styles_data = await page.evaluate("""
            () => {
                const elements = document.querySelectorAll('*');
                const results = {
                    colors: [],
                    typography: [],
                    spacing: [],
                    radius: [],
                    shadows: [],
                    elements_count: elements.length,
                };

                const colorProperties = [
                    'color', 'background-color', 'border-color',
                    'border-top-color', 'border-right-color',
                    'border-bottom-color', 'border-left-color',
                    'outline-color', 'text-decoration-color',
                ];

                const spacingProperties = [
                    'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
                    'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
                    'gap', 'row-gap', 'column-gap',
                ];

                elements.forEach(el => {
                    const tag = el.tagName.toLowerCase();
                    const styles = window.getComputedStyle(el);

                    // Skip invisible elements
                    if (styles.display === 'none' || styles.visibility === 'hidden') {
                        return;
                    }

                    // --- COLORS ---
                    colorProperties.forEach(prop => {
                        const value = styles.getPropertyValue(prop);
                        if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') {
                            results.colors.push({
                                value: value,
                                property: prop,
                                element: tag,
                                context: prop.includes('background') ? 'background' :
                                         prop.includes('border') ? 'border' : 'text',
                            });
                        }
                    });

                    // --- TYPOGRAPHY ---
                    const fontFamily = styles.getPropertyValue('font-family');
                    const fontSize = styles.getPropertyValue('font-size');
                    const fontWeight = styles.getPropertyValue('font-weight');
                    const lineHeight = styles.getPropertyValue('line-height');
                    const letterSpacing = styles.getPropertyValue('letter-spacing');

                    if (fontSize && fontFamily) {
                        results.typography.push({
                            fontFamily: fontFamily,
                            fontSize: fontSize,
                            fontWeight: fontWeight,
                            lineHeight: lineHeight,
                            letterSpacing: letterSpacing,
                            element: tag,
                        });
                    }

                    // --- SPACING ---
                    spacingProperties.forEach(prop => {
                        const value = styles.getPropertyValue(prop);
                        if (value && value !== '0px' && value !== 'auto' && value !== 'normal') {
                            const px = parseFloat(value);
                            if (!isNaN(px) && px > 0 && px < 500) {
                                results.spacing.push({
                                    value: value,
                                    valuePx: Math.round(px),
                                    property: prop,
                                    context: prop.includes('margin') ? 'margin' :
                                             prop.includes('padding') ? 'padding' : 'gap',
                                });
                            }
                        }
                    });

                    // --- BORDER RADIUS ---
                    const radiusProps = [
                        'border-radius', 'border-top-left-radius',
                        'border-top-right-radius', 'border-bottom-left-radius',
                        'border-bottom-right-radius',
                    ];

                    radiusProps.forEach(prop => {
                        const value = styles.getPropertyValue(prop);
                        if (value && value !== '0px') {
                            results.radius.push({
                                value: value,
                                element: tag,
                            });
                        }
                    });

                    // --- BOX SHADOW ---
                    const shadow = styles.getPropertyValue('box-shadow');
                    if (shadow && shadow !== 'none') {
                        results.shadows.push({
                            value: shadow,
                            element: tag,
                        });
                    }
                });

                return results;
            }
        """)

        return styles_data

    async def _extract_css_variables(self, page: Page) -> dict:
        """
        Extract CSS custom properties (variables) from the page's stylesheets.

        This catches colors defined as:
        - :root { --primary-color: #3860be; }
        - :root { --brand-cyan: #00c4cc; }

        Stylesheets whose rules cannot be read (the access throws, e.g. for
        cross-origin sheets) are skipped. Nested rules are inspected one level
        deep only.

        Returns:
            ``{"raw": {...}, "computed": {...}}`` where ``raw`` holds the
            declared value of every ``--*`` property found and ``computed``
            holds the value the document element resolves for those same
            names (names with an empty computed value are omitted).
        """
        css_vars = await page.evaluate("""
            () => {
                const variables = {};

                // 1. Get CSS variables from :root computed styles
                const rootStyles = getComputedStyle(document.documentElement);
                const rootCss = document.documentElement.style.cssText;

                // 2. Parse all stylesheets for CSS variables
                for (const sheet of document.styleSheets) {
                    try {
                        const rules = sheet.cssRules || sheet.rules;
                        for (const rule of rules) {
                            if (rule.style) {
                                for (let i = 0; i < rule.style.length; i++) {
                                    const prop = rule.style[i];
                                    if (prop.startsWith('--')) {
                                        const value = rule.style.getPropertyValue(prop).trim();
                                        if (value) {
                                            variables[prop] = value;
                                        }
                                    }
                                }
                            }
                            // Also check @media rules
                            if (rule.cssRules) {
                                for (const innerRule of rule.cssRules) {
                                    if (innerRule.style) {
                                        for (let i = 0; i < innerRule.style.length; i++) {
                                            const prop = innerRule.style[i];
                                            if (prop.startsWith('--')) {
                                                const value = innerRule.style.getPropertyValue(prop).trim();
                                                if (value) {
                                                    variables[prop] = value;
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    } catch (e) {
                        // CORS may block access to external stylesheets
                        console.log('Could not access stylesheet:', e);
                    }
                }

                // 3. Get computed CSS variable values from :root
                const computedVars = {};
                for (const prop of Object.keys(variables)) {
                    const computed = rootStyles.getPropertyValue(prop).trim();
                    if (computed) {
                        computedVars[prop] = computed;
                    }
                }

                return { raw: variables, computed: computedVars };
            }
        """)

        return css_vars

    async def _extract_svg_colors(self, page: Page) -> list[dict]:
        """
        Extract colors from SVG elements (fill, stroke).

        This catches colors in:
        - <svg fill="#00c4cc">
        - <path stroke="#3860be">
        - <circle fill="rgb(188, 212, 50)">

        Both the ``fill``/``stroke`` attributes and the computed ``fill``/
        ``stroke`` styles are reported, so one element can contribute up to
        four entries.
        """
        svg_colors = await page.evaluate("""
            () => {
                const colors = [];

                // Find all SVG elements
                const svgs = document.querySelectorAll('svg, svg *');

                svgs.forEach(el => {
                    // Check fill attribute
                    const fill = el.getAttribute('fill');
                    if (fill && fill !== 'none' && fill !== 'currentColor' && !fill.startsWith('url(')) {
                        colors.push({
                            value: fill,
                            property: 'svg-fill',
                            element: el.tagName.toLowerCase(),
                            context: 'svg',
                        });
                    }

                    // Check stroke attribute
                    const stroke = el.getAttribute('stroke');
                    if (stroke && stroke !== 'none' && stroke !== 'currentColor' && !stroke.startsWith('url(')) {
                        colors.push({
                            value: stroke,
                            property: 'svg-stroke',
                            element: el.tagName.toLowerCase(),
                            context: 'svg',
                        });
                    }

                    // Check computed styles for SVG elements
                    const styles = getComputedStyle(el);
                    const computedFill = styles.fill;
                    const computedStroke = styles.stroke;

                    if (computedFill && computedFill !== 'none' && !computedFill.startsWith('url(')) {
                        colors.push({
                            value: computedFill,
                            property: 'svg-fill-computed',
                            element: el.tagName.toLowerCase(),
                            context: 'svg',
                        });
                    }

                    if (computedStroke && computedStroke !== 'none' && !computedStroke.startsWith('url(')) {
                        colors.push({
                            value: computedStroke,
                            property: 'svg-stroke-computed',
                            element: el.tagName.toLowerCase(),
                            context: 'svg',
                        });
                    }
                });

                return colors;
            }
        """)

        return svg_colors

    async def _extract_inline_styles(self, page: Page) -> list[dict]:
        """
        Extract colors from inline style attributes.

        This catches colors in:
        - <div style="background-color: #bcd432;">
        - <span style="color: rgb(0, 196, 204);">

        Returns:
            One entry per color literal found in any ``style`` attribute.
        """
        inline_data = await page.evaluate("""
            () => {
                const colors = [];
                const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

                // Find all elements with inline styles
                const elements = document.querySelectorAll('[style]');

                elements.forEach(el => {
                    const styleAttr = el.getAttribute('style');
                    if (styleAttr) {
                        const matches = styleAttr.match(colorRegex);
                        if (matches) {
                            matches.forEach(color => {
                                colors.push({
                                    value: color,
                                    property: 'inline-style',
                                    element: el.tagName.toLowerCase(),
                                    context: 'inline',
                                });
                            });
                        }
                    }
                });

                return colors;
            }
        """)

        return inline_data

    async def _extract_stylesheet_colors(self, page: Page) -> list[dict]:
        """
        Parse CSS stylesheets for color values.

        This catches colors defined in CSS rules that may not be
        currently applied to visible elements. Only top-level rules are
        inspected; stylesheets whose rules cannot be read are skipped.
        """
        css_colors = await page.evaluate("""
            () => {
                const colors = [];
                const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

                // Color-related CSS properties
                const colorProps = [
                    'color', 'background-color', 'background', 'border-color',
                    'border-top-color', 'border-right-color', 'border-bottom-color', 'border-left-color',
                    'outline-color', 'box-shadow', 'text-shadow', 'fill', 'stroke',
                    'caret-color', 'column-rule-color', 'text-decoration-color',
                ];

                // Parse all stylesheets
                for (const sheet of document.styleSheets) {
                    try {
                        const rules = sheet.cssRules || sheet.rules;
                        for (const rule of rules) {
                            if (rule.style) {
                                colorProps.forEach(prop => {
                                    const value = rule.style.getPropertyValue(prop);
                                    if (value) {
                                        const matches = value.match(colorRegex);
                                        if (matches) {
                                            matches.forEach(color => {
                                                colors.push({
                                                    value: color,
                                                    property: prop,
                                                    element: 'css-rule',
                                                    context: 'stylesheet',
                                                    selector: rule.selectorText || '',
                                                });
                                            });
                                        }
                                    }
                                });
                            }
                        }
                    } catch (e) {
                        // CORS may block access to external stylesheets
                    }
                }

                return colors;
            }
        """)

        return css_colors

    def _new_color_token(self, hex_value: str, **fields) -> ColorToken:
        """Build a ColorToken for ``hex_value`` with contrast/WCAG data filled in.

        ``fields`` are passed through to ``ColorToken`` unchanged (frequency,
        contexts, elements, css_properties and any optional extras).
        """
        compliance = check_wcag_compliance(hex_value, "#ffffff")
        return ColorToken(
            value=hex_value,
            contrast_white=round(get_contrast_with_white(hex_value), 2),
            contrast_black=round(get_contrast_with_black(hex_value), 2),
            wcag_aa_large_text=compliance["aa_large_text"],
            wcag_aa_small_text=compliance["aa_normal_text"],
            **fields,
        )

    def _process_css_variables(self, css_vars: dict):
        """
        Record CSS variables and turn color-valued ones into color tokens.

        Args:
            css_vars: The ``{"raw": ..., "computed": ...}`` dict returned by
                ``_extract_css_variables``.

        Variables are merged into ``self.css_variables`` (computed values win
        over raw ones) instead of replacing it, so variables seen on earlier
        pages are kept. Only computed values that start with a color literal
        are considered for tokens.
        """
        computed = css_vars.get("computed", {})
        raw = css_vars.get("raw", {})

        # Merge rather than overwrite: this method runs once per crawled page.
        self.css_variables.update(raw)
        self.css_variables.update(computed)

        for var_name, value in computed.items():
            candidate = value.strip()
            if not self._COLOR_VALUE_RE.match(candidate):
                continue

            hex_value = self._process_color({
                "value": candidate,
                "property": var_name,
                "element": ":root",
                "context": "css-variable",
            })
            if not hex_value:
                continue

            token = self.colors.get(hex_value)
            if token is None:
                self.colors[hex_value] = self._new_color_token(
                    hex_value,
                    frequency=1,
                    contexts=["css-variable"],
                    elements=[":root"],
                    css_properties=[var_name],
                    source=TokenSource.CSS_VARIABLE,
                    confidence=Confidence.HIGH,
                )
                continue

            # Update existing token
            token.frequency += 1
            if "css-variable" not in token.contexts:
                token.contexts.append("css-variable")
            if var_name not in token.css_properties:
                token.css_properties.append(var_name)

    def _process_color(self, color_data: dict) -> Optional[str]:
        """Return the ``hex`` of the parsed ``color_data["value"]``, or None if unparseable."""
        parsed = parse_color(color_data.get("value", ""))
        if not parsed:
            return None
        return parsed.hex

    def _aggregate_colors(self, raw_colors: list[dict]):
        """
        Fold raw color observations into ``self.colors``.

        Each parseable entry increments the frequency of its hex value and
        adds its context, element and property to the token if not yet listed.
        Entries whose value cannot be parsed are ignored.
        """
        for color_data in raw_colors:
            hex_value = self._process_color(color_data)
            if not hex_value:
                continue

            token = self.colors.get(hex_value)
            if token is None:
                token = self._new_color_token(
                    hex_value,
                    frequency=0,
                    contexts=[],
                    elements=[],
                    css_properties=[],
                )
                self.colors[hex_value] = token

            # Update frequency and context
            token.frequency += 1

            context = color_data.get("context", "")
            if context and context not in token.contexts:
                token.contexts.append(context)

            element = color_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

            prop = color_data.get("property", "")
            if prop and prop not in token.css_properties:
                token.css_properties.append(prop)

    def _aggregate_typography(self, raw_typography: list[dict]):
        """
        Fold raw typography observations into ``self.typography``.

        Tokens are keyed by font size, font weight and the first 50 characters
        of the font-family list; line height is taken from the first
        observation of each key. Also counts usage per primary font family in
        ``self.font_families``.
        """
        for typo_data in raw_typography:
            font_family = typo_data.get("fontFamily", "")
            font_size = typo_data.get("fontSize", "")
            font_weight = typo_data.get("fontWeight", "400")
            line_height = typo_data.get("lineHeight", "normal")

            # Create unique key
            key = f"{font_size}|{font_weight}|{font_family[:50]}"

            if key not in self.typography:
                # Parse font size to px
                font_size_px = None
                if font_size.endswith("px"):
                    try:
                        font_size_px = float(font_size.replace("px", ""))
                    except ValueError:
                        pass

                # Parse line height into a unitless ratio where possible
                line_height_computed = None
                if line_height and line_height != "normal":
                    if line_height.endswith("px") and font_size_px:
                        try:
                            lh_px = float(line_height.replace("px", ""))
                            line_height_computed = round(lh_px / font_size_px, 2)
                        except ValueError:
                            pass
                    else:
                        try:
                            line_height_computed = float(line_height)
                        except ValueError:
                            pass

                self.typography[key] = TypographyToken(
                    font_family=font_family.split(",")[0].strip().strip('"\''),
                    font_size=font_size,
                    font_size_px=font_size_px,
                    font_weight=int(font_weight) if font_weight.isdigit() else 400,
                    line_height=line_height,
                    line_height_computed=line_height_computed,
                    letter_spacing=typo_data.get("letterSpacing"),
                    frequency=0,
                    elements=[],
                )

            # Update
            token = self.typography[key]
            token.frequency += 1

            element = typo_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

            # Track font families
            primary_font = token.font_family
            if primary_font not in self.font_families:
                self.font_families[primary_font] = FontFamily(
                    name=primary_font,
                    fallbacks=[f.strip().strip('"\'') for f in font_family.split(",")[1:]],
                    frequency=0,
                )
            self.font_families[primary_font].frequency += 1

    def _aggregate_spacing(self, raw_spacing: list[dict]):
        """
        Fold raw spacing observations into ``self.spacing``, keyed by rounded px.

        Entries whose rounded pixel value is missing or not positive are
        skipped: the browser-side filter drops ``0px``, but values below half
        a pixel still round to 0 and would otherwise create a ``0px`` token
        that trivially "fits" both base 4 and base 8.
        """
        for space_data in raw_spacing:
            value_px = space_data.get("valuePx", 0)
            if not value_px or value_px <= 0:
                continue

            key = str(value_px)

            if key not in self.spacing:
                self.spacing[key] = SpacingToken(
                    value=f"{value_px}px",
                    value_px=value_px,
                    frequency=0,
                    contexts=[],
                    properties=[],
                    fits_base_4=value_px % 4 == 0,
                    fits_base_8=value_px % 8 == 0,
                )

            token = self.spacing[key]
            token.frequency += 1

            context = space_data.get("context", "")
            if context and context not in token.contexts:
                token.contexts.append(context)

            prop = space_data.get("property", "")
            if prop and prop not in token.properties:
                token.properties.append(prop)

    def _aggregate_radius(self, raw_radius: list[dict]):
        """
        Fold raw border-radius observations into ``self.radius``.

        A value whose space-separated parts are all equal is collapsed to a
        single part. ``value_px`` is only set for values that parse as a
        single ``<number>px``.
        """
        for radius_data in raw_radius:
            value = radius_data.get("value", "")

            # Normalize to simple format
            # "8px 8px 8px 8px" -> "8px"
            parts = value.split()
            if len(set(parts)) == 1:
                value = parts[0]

            if value not in self.radius:
                value_px = None
                if value.endswith("px"):
                    try:
                        value_px = int(float(value.replace("px", "")))
                    except ValueError:
                        pass

                self.radius[value] = RadiusToken(
                    value=value,
                    value_px=value_px,
                    frequency=0,
                    elements=[],
                    fits_base_4=bool(value_px) and value_px % 4 == 0,
                    fits_base_8=bool(value_px) and value_px % 8 == 0,
                )

            token = self.radius[value]
            token.frequency += 1

            element = radius_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

    def _aggregate_shadows(self, raw_shadows: list[dict]):
        """Fold raw box-shadow observations into ``self.shadows``, keyed by the raw value."""
        for shadow_data in raw_shadows:
            value = shadow_data.get("value", "")

            if value not in self.shadows:
                self.shadows[value] = ShadowToken(
                    value=value,
                    frequency=0,
                    elements=[],
                )

            token = self.shadows[value]
            token.frequency += 1

            element = shadow_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

    def _calculate_confidence(self, frequency: int) -> Confidence:
        """Map a frequency to a confidence level: >=10 HIGH, >=3 MEDIUM, else LOW."""
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        return Confidence.LOW

    def _detect_spacing_base(self) -> Optional[int]:
        """
        Detect the base spacing unit (4 or 8).

        Returns:
            8 if at least 80% of the distinct spacing values fit base 8,
            otherwise 4 if at least 80% fit base 4, otherwise None (also None
            when no spacing values were collected).
        """
        total = len(self.spacing)
        if total == 0:
            return None

        fits_4 = sum(1 for s in self.spacing.values() if s.fits_base_4)
        fits_8 = sum(1 for s in self.spacing.values() if s.fits_base_8)

        # Prefer the coarser grid when both qualify.
        if fits_8 / total >= 0.8:
            return 8
        if fits_4 / total >= 0.8:
            return 4

        return None

    async def _navigate(self, page: Page, url: str) -> None:
        """
        Load ``url`` with a fallback strategy.

        First waits for ``domcontentloaded``; on any failure retries waiting
        for ``load``. If that fails too, a warning is recorded and the caller
        continues with whatever content the page has.
        """
        try:
            await page.goto(
                url,
                wait_until="domcontentloaded",
                timeout=60000  # 60 seconds
            )
            # Wait for JS to render
            await page.wait_for_timeout(2000)
        except Exception:
            # Fallback to load event
            try:
                await page.goto(
                    url,
                    wait_until="load",
                    timeout=60000
                )
                await page.wait_for_timeout(3000)
            except Exception:
                self.warnings.append(f"Slow load for {url}, extracting partial content")

    async def _extract_page(self, url: str) -> None:
        """
        Open ``url`` in a new page, run every extraction source and aggregate.

        DOM computed-style extraction is mandatory (its errors propagate);
        the CSS-variable, SVG, inline-style and stylesheet sources are
        best-effort and only add a warning when they fail. The page is always
        closed, including when extraction raises.
        """
        page = await self.context.new_page()
        try:
            await self._navigate(page, url)

            # Scroll to load lazy content
            await self._scroll_page(page)

            # Track counts before extraction for this page
            colors_before = len(self.colors)
            typo_before = len(self.typography)
            spacing_before = len(self.spacing)
            radius_before = len(self.radius)
            shadows_before = len(self.shadows)

            # 1. Extract DOM computed styles (original method)
            styles = await self._extract_styles_from_page(page)
            dom_colors = len(styles.get("colors", []))
            self._aggregate_colors(styles.get("colors", []))
            self._aggregate_typography(styles.get("typography", []))
            self._aggregate_spacing(styles.get("spacing", []))
            self._aggregate_radius(styles.get("radius", []))
            self._aggregate_shadows(styles.get("shadows", []))

            # 2. Extract CSS variables (--primary-color, etc.)
            css_var_count = 0
            try:
                css_vars = await self._extract_css_variables(page)
                css_var_count = len(css_vars.get("computed", {}))
                self._process_css_variables(css_vars)
            except Exception as e:
                self.warnings.append(f"CSS variables extraction failed: {str(e)}")

            # 3-5. Extra color sources: SVG fill/stroke, inline styles,
            # stylesheet rules. Each is (stats key, warning label, extractor).
            extra_sources = (
                ("svg_colors", "SVG color", self._extract_svg_colors),
                ("inline_colors", "Inline style", self._extract_inline_styles),
                ("stylesheet_colors", "Stylesheet color", self._extract_stylesheet_colors),
            )
            extra_counts = {}
            for stat_key, label, extractor in extra_sources:
                extra_counts[stat_key] = 0
                try:
                    found = await extractor(page)
                    extra_counts[stat_key] = len(found)
                    self._aggregate_colors(found)
                except Exception as e:
                    self.warnings.append(f"{label} extraction failed: {str(e)}")

            # Store extraction stats for logging
            self._last_extraction_stats = {
                "url": url,
                "dom_colors": dom_colors,
                "css_variables": css_var_count,
                **extra_counts,
                "new_colors": len(self.colors) - colors_before,
                "new_typography": len(self.typography) - typo_before,
                "new_spacing": len(self.spacing) - spacing_before,
                "new_radius": len(self.radius) - radius_before,
                "new_shadows": len(self.shadows) - shadows_before,
            }

            self.total_elements += styles.get("elements_count", 0)
        finally:
            # Always release the page; a close failure must not mask the
            # extraction outcome, so it is downgraded to a warning.
            try:
                await page.close()
            except Exception as close_error:
                self.warnings.append(f"Failed to close page for {url}: {close_error}")

    def _finalize_tokens(self) -> Optional[int]:
        """
        Post-process collected tokens once all pages are crawled.

        Assigns frequency-based confidence to color, typography and spacing
        tokens, flags spacing values that do not fit the detected base as
        outliers, and marks the most frequent font family as primary.

        Returns:
            The detected spacing base (4, 8 or None).
        """
        # Calculate confidence for all tokens
        for collection in (self.colors, self.typography, self.spacing):
            for token in collection.values():
                token.confidence = self._calculate_confidence(token.frequency)

        # Detect spacing base
        spacing_base = self._detect_spacing_base()

        # Mark outliers in spacing
        if spacing_base:
            for token in self.spacing.values():
                if spacing_base == 8 and not token.fits_base_8:
                    token.is_outlier = True
                elif spacing_base == 4 and not token.fits_base_4:
                    token.is_outlier = True

        # Determine primary font
        if self.font_families:
            primary_font = max(self.font_families.values(), key=lambda f: f.frequency)
            primary_font.usage = "primary"

        return spacing_base

    async def extract(
        self,
        pages: list[str],
        progress_callback: Optional[Callable[[float], None]] = None
    ) -> ExtractedTokens:
        """
        Extract tokens from a list of pages.

        Enhanced extraction includes:
        - DOM computed styles
        - CSS variables from :root
        - SVG fill/stroke colors
        - Inline style colors
        - Stylesheet color rules

        A failure on one page is recorded in ``errors`` and the crawl moves on
        to the next page; the progress callback and the crawl delay only run
        after a page succeeds.

        Args:
            pages: List of URLs to crawl
            progress_callback: Optional callback for progress updates; it is
                called with the fraction of pages processed so far.

        Returns:
            ExtractedTokens with all discovered tokens
        """
        start_time = datetime.now()
        pages_crawled = []

        async with self:
            for i, url in enumerate(pages):
                try:
                    await self._extract_page(url)
                    pages_crawled.append(url)

                    # Progress callback
                    if progress_callback:
                        progress_callback((i + 1) / len(pages))

                    # Rate limiting
                    await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000)

                except Exception as e:
                    self.errors.append(f"Error extracting {url}: {str(e)}")

        spacing_base = self._finalize_tokens()

        # Build result
        end_time = datetime.now()
        duration_ms = int((end_time - start_time).total_seconds() * 1000)

        return ExtractedTokens(
            viewport=self.viewport,
            source_url=pages[0] if pages else "",
            pages_crawled=pages_crawled,
            colors=list(self.colors.values()),
            typography=list(self.typography.values()),
            spacing=list(self.spacing.values()),
            radius=list(self.radius.values()),
            shadows=list(self.shadows.values()),
            font_families=list(self.font_families.values()),
            spacing_base=spacing_base,
            extraction_timestamp=start_time,
            extraction_duration_ms=duration_ms,
            total_elements_analyzed=self.total_elements,
            unique_colors=len(self.colors),
            unique_font_sizes=len(set(t.font_size for t in self.typography.values())),
            unique_spacing_values=len(self.spacing),
            errors=self.errors,
            warnings=self.warnings,
        )
971
+
972
+
973
+ # =============================================================================
974
+ # CONVENIENCE FUNCTIONS
975
+ # =============================================================================
976
+
977
async def extract_from_pages(
    pages: list[str],
    viewport: Viewport = Viewport.DESKTOP
) -> ExtractedTokens:
    """Run a fresh TokenExtractor for ``viewport`` over ``pages`` and return its tokens."""
    return await TokenExtractor(viewport=viewport).extract(pages)
984
+
985
+
986
async def extract_both_viewports(pages: list[str]) -> tuple[ExtractedTokens, ExtractedTokens]:
    """
    Extract tokens for the desktop viewport, then for the mobile viewport.

    Both extractors are created up front and run one after the other, each
    on its own TokenExtractor instance.

    Returns:
        ``(desktop_result, mobile_result)``.
    """
    extractors = (
        TokenExtractor(viewport=Viewport.DESKTOP),
        TokenExtractor(viewport=Viewport.MOBILE),
    )

    collected = []
    for viewport_extractor in extractors:
        # Sequential on purpose: the desktop crawl finishes before mobile starts.
        collected.append(await viewport_extractor.extract(pages))

    desktop_tokens, mobile_tokens = collected
    return desktop_tokens, mobile_tokens