riazmo committed on
Commit
7f20fd9
·
verified ·
1 Parent(s): d2da502

Delete agents/extractor.py

Browse files
Files changed (1) hide show
  1. agents/extractor.py +0 -1229
agents/extractor.py DELETED
@@ -1,1229 +0,0 @@
1
- """
2
- Agent 1: Token Extractor
3
- Design System Extractor v2
4
-
5
- Persona: Meticulous Design Archaeologist
6
-
7
- Responsibilities:
8
- - Crawl pages at specified viewport
9
- - Extract computed styles from all elements
10
- - Parse CSS files for variables and rules
11
- - Extract colors from SVGs
12
- - Collect colors, typography, spacing, radius, shadows
13
- - Track frequency and context for each token
14
- """
15
-
16
- import asyncio
17
- import re
18
- from typing import Optional, Callable
19
- from datetime import datetime
20
- from collections import defaultdict
21
-
22
- from playwright.async_api import async_playwright, Browser, Page, BrowserContext
23
-
24
- from core.token_schema import (
25
- Viewport,
26
- ExtractedTokens,
27
- ColorToken,
28
- TypographyToken,
29
- SpacingToken,
30
- RadiusToken,
31
- ShadowToken,
32
- FontFamily,
33
- TokenSource,
34
- Confidence,
35
- )
36
- from core.color_utils import (
37
- normalize_hex,
38
- parse_color,
39
- get_contrast_with_white,
40
- get_contrast_with_black,
41
- check_wcag_compliance,
42
- )
43
- from config.settings import get_settings
44
-
45
-
46
- class TokenExtractor:
47
- """
48
- Extracts design tokens from web pages.
49
-
50
- This is the second part of Agent 1's job — after pages are confirmed,
51
- we crawl and extract all CSS values.
52
-
53
- Enhanced with:
54
- - CSS file parsing for variables and rules
55
- - SVG color extraction
56
- - Inline style extraction
57
- """
58
-
59
def __init__(self, viewport: Viewport = Viewport.DESKTOP):
    """Create an extractor for one viewport with empty token collections.

    Args:
        viewport: Viewport mode used when the browser context is created;
            defaults to ``Viewport.DESKTOP``.
    """
    # Configuration and browser handles (the handles stay None until
    # _init_browser() fills them in).
    self.viewport = viewport
    self.settings = get_settings()
    self.browser: Optional[Browser] = None
    self.context: Optional[BrowserContext] = None

    # Aggregated tokens: colors are keyed by hex value, typography by a
    # "size|weight|family" key, spacing by pixel value, radius and
    # shadows by their CSS value.
    self.colors: dict[str, ColorToken] = {}
    self.typography: dict[str, TypographyToken] = {}
    self.spacing: dict[str, SpacingToken] = {}
    self.radius: dict[str, RadiusToken] = {}
    self.shadows: dict[str, ShadowToken] = {}

    # Foreground/background color pairs observed on real DOM elements
    self.fg_bg_pairs: list[dict] = []

    # CSS custom properties (name -> value)
    self.css_variables: dict[str, str] = {}

    # Primary font name -> FontFamily record
    self.font_families: dict[str, FontFamily] = {}

    # Run statistics and diagnostics
    self.total_elements = 0
    self.errors: list[str] = []
    self.warnings: list[str] = []
85
-
86
async def __aenter__(self):
    """Start the browser on entering ``async with`` and return this extractor."""
    await self._init_browser()
    return self
90
-
91
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Shut the browser down on leaving ``async with``.

    The exception details are ignored and nothing is returned, so an
    exception raised inside the ``async with`` body keeps propagating.
    """
    await self._close_browser()
94
-
95
async def _init_browser(self):
    """Start Playwright, launch Chromium and open a viewport-sized context.

    The Playwright driver handle is kept on ``self._playwright`` so that
    ``_close_browser`` can stop it. If launching the browser or creating
    the context fails, whatever was already started is torn down before
    the exception propagates.
    """
    self._playwright = await async_playwright().start()
    try:
        self.browser = await self._playwright.chromium.launch(
            headless=self.settings.browser.headless
        )

        # Set viewport based on extraction mode
        if self.viewport == Viewport.DESKTOP:
            width = self.settings.viewport.desktop_width
            height = self.settings.viewport.desktop_height
        else:
            width = self.settings.viewport.mobile_width
            height = self.settings.viewport.mobile_height

        self.context = await self.browser.new_context(
            viewport={"width": width, "height": height},
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        )
    except BaseException:
        # BaseException so that a cancelled task does not leak the driver
        # process either; the original error is always re-raised.
        await self._close_browser()
        raise


async def _close_browser(self):
    """Close the context, the browser and the Playwright driver, in that order.

    Each handle is cleared before it is closed, so calling this twice (or
    after a partially failed ``_init_browser``) is harmless. Later handles
    are still released when closing an earlier one raises.
    """
    context, self.context = self.context, None
    browser, self.browser = self.browser, None
    # getattr: the attribute only exists once _init_browser() has run.
    playwright = getattr(self, "_playwright", None)
    self._playwright = None

    try:
        if context:
            await context.close()
    finally:
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()
121
-
122
async def _scroll_page(self, page: Page):
    """Scroll through the page to trigger lazy-loaded content, then settle.

    The page is scrolled one viewport height at a time (100 ms pause per
    step) down to the document height measured at the start, then back to
    the top. Afterwards we wait for the network to go idle, but only on a
    best-effort basis: pages with long-polling or analytics beacons may
    never become idle, and that should not abort extraction. A failed wait
    is recorded in ``self.warnings`` instead of being raised.

    Args:
        page: The Playwright page to scroll.
    """
    await page.evaluate("""
        async () => {
            const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
            const height = document.body.scrollHeight;
            const step = window.innerHeight;

            for (let y = 0; y < height; y += step) {
                window.scrollTo(0, y);
                await delay(100);
            }

            // Scroll back to top
            window.scrollTo(0, 0);
        }
    """)

    # Wait for network idle after scrolling
    try:
        await page.wait_for_load_state(
            "networkidle",
            timeout=self.settings.browser.network_idle_timeout,
        )
    except Exception as e:
        self.warnings.append(f"Network did not go idle after scrolling: {str(e)}")
142
-
143
async def _extract_styles_from_page(self, page: Page) -> dict:
    """
    Collect computed-style values from every visible element on the page.

    The in-page script calls getComputedStyle on each element, skips those
    with display 'none' or visibility 'hidden', and records colors,
    typography, spacing (0 < px < 500), border radius and box-shadow values.

    Returns:
        The object produced by the script, with the keys ``colors``,
        ``typography``, ``spacing``, ``radius``, ``shadows`` and
        ``elements_count``.
    """
    collect_styles_js = """
        () => {
            const elements = document.querySelectorAll('*');
            const results = {
                colors: [],
                typography: [],
                spacing: [],
                radius: [],
                shadows: [],
                elements_count: elements.length,
            };

            const colorProperties = [
                'color', 'background-color', 'border-color',
                'border-top-color', 'border-right-color',
                'border-bottom-color', 'border-left-color',
                'outline-color', 'text-decoration-color',
            ];

            const spacingProperties = [
                'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
                'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
                'gap', 'row-gap', 'column-gap',
            ];

            elements.forEach(el => {
                const tag = el.tagName.toLowerCase();
                const styles = window.getComputedStyle(el);

                // Skip invisible elements
                if (styles.display === 'none' || styles.visibility === 'hidden') {
                    return;
                }

                // --- COLORS ---
                colorProperties.forEach(prop => {
                    const value = styles.getPropertyValue(prop);
                    if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') {
                        results.colors.push({
                            value: value,
                            property: prop,
                            element: tag,
                            context: prop.includes('background') ? 'background' :
                                     prop.includes('border') ? 'border' : 'text',
                        });
                    }
                });

                // --- TYPOGRAPHY ---
                const fontFamily = styles.getPropertyValue('font-family');
                const fontSize = styles.getPropertyValue('font-size');
                const fontWeight = styles.getPropertyValue('font-weight');
                const lineHeight = styles.getPropertyValue('line-height');
                const letterSpacing = styles.getPropertyValue('letter-spacing');

                if (fontSize && fontFamily) {
                    results.typography.push({
                        fontFamily: fontFamily,
                        fontSize: fontSize,
                        fontWeight: fontWeight,
                        lineHeight: lineHeight,
                        letterSpacing: letterSpacing,
                        element: tag,
                    });
                }

                // --- SPACING ---
                spacingProperties.forEach(prop => {
                    const value = styles.getPropertyValue(prop);
                    if (value && value !== '0px' && value !== 'auto' && value !== 'normal') {
                        const px = parseFloat(value);
                        if (!isNaN(px) && px > 0 && px < 500) {
                            results.spacing.push({
                                value: value,
                                valuePx: Math.round(px),
                                property: prop,
                                context: prop.includes('margin') ? 'margin' :
                                         prop.includes('padding') ? 'padding' : 'gap',
                            });
                        }
                    }
                });

                // --- BORDER RADIUS ---
                const radiusProps = [
                    'border-radius', 'border-top-left-radius',
                    'border-top-right-radius', 'border-bottom-left-radius',
                    'border-bottom-right-radius',
                ];

                radiusProps.forEach(prop => {
                    const value = styles.getPropertyValue(prop);
                    if (value && value !== '0px') {
                        results.radius.push({
                            value: value,
                            element: tag,
                        });
                    }
                });

                // --- BOX SHADOW ---
                const shadow = styles.getPropertyValue('box-shadow');
                if (shadow && shadow !== 'none') {
                    results.shadows.push({
                        value: shadow,
                        element: tag,
                    });
                }
            });

            return results;
        }
    """
    return await page.evaluate(collect_styles_js)
264
-
265
async def _extract_fg_bg_pairs(self, page: Page) -> list[dict]:
    """
    Extract actual foreground-background color pairs from visible DOM elements.

    For each visible element with a parseable text color, walk up the
    ancestor chain (including the root ``<html>`` element) to find the first
    non-transparent background color; fall back to white when none is set.
    This gives real foreground/background pairs so WCAG checks are not
    limited to comparing every color against white/black.

    Colors are reduced to ``#rrggbb``; any alpha component is ignored.
    Each distinct foreground/background combination is reported once.

    Returns:
        A list of ``{"foreground", "background", "element"}`` dicts, or an
        empty list when the script yields nothing.
    """
    pairs = await page.evaluate("""
        () => {
            const pairs = [];
            const seen = new Set();

            function rgbToHex(rgb) {
                if (!rgb || rgb === 'transparent' || rgb === 'rgba(0, 0, 0, 0)') return null;
                const match = rgb.match(/rgba?\\((\\d+),\\s*(\\d+),\\s*(\\d+)/);
                if (!match) return null;
                const r = parseInt(match[1]);
                const g = parseInt(match[2]);
                const b = parseInt(match[3]);
                return '#' + [r, g, b].map(c => c.toString(16).padStart(2, '0')).join('');
            }

            function getEffectiveBackground(el) {
                // Walk all the way up: parentElement of <html> is null, so
                // the root element's own background is checked as well.
                let current = el;
                while (current) {
                    const bg = window.getComputedStyle(current).backgroundColor;
                    if (bg && bg !== 'rgba(0, 0, 0, 0)' && bg !== 'transparent') {
                        return rgbToHex(bg);
                    }
                    current = current.parentElement;
                }
                return '#ffffff'; // default page background
            }

            const elements = document.querySelectorAll('*');
            elements.forEach(el => {
                const styles = window.getComputedStyle(el);
                if (styles.display === 'none' || styles.visibility === 'hidden') return;

                const fg = rgbToHex(styles.color);
                if (!fg) return;

                const bg = getEffectiveBackground(el);
                if (!bg) return;

                const key = fg + '|' + bg;
                if (seen.has(key)) return;
                seen.add(key);

                pairs.push({
                    foreground: fg,
                    background: bg,
                    element: el.tagName.toLowerCase(),
                });
            });

            return pairs;
        }
    """)
    return pairs or []
327
-
328
async def _extract_css_variables(self, page: Page) -> dict:
    """
    Extract CSS custom properties (variables) declared in the page's stylesheets.

    This catches colors defined as:
    - :root { --primary-color: #3860be; }
    - :root { --brand-cyan: #00c4cc; }

    Every accessible stylesheet is walked, descending into nested rules
    (e.g. ``@media`` / ``@supports`` blocks) at any depth. Stylesheets that
    the browser refuses to expose (cross-origin) are skipped.

    Returns:
        ``{"raw": {...}, "computed": {...}}`` where ``raw`` maps each
        declared variable name to its declared value (last declaration
        wins) and ``computed`` maps the same names to the value resolved
        on the root element, omitting names that resolve to nothing there.
    """
    css_vars = await page.evaluate("""
        () => {
            const variables = {};
            const rootStyles = getComputedStyle(document.documentElement);

            // 1. Record custom properties declared by a list of rules,
            //    recursing into grouping rules that carry their own cssRules.
            const collect = (rules) => {
                for (const rule of rules) {
                    if (rule.style) {
                        for (let i = 0; i < rule.style.length; i++) {
                            const prop = rule.style[i];
                            if (prop.startsWith('--')) {
                                const value = rule.style.getPropertyValue(prop).trim();
                                if (value) {
                                    variables[prop] = value;
                                }
                            }
                        }
                    }
                    if (rule.cssRules) {
                        collect(rule.cssRules);
                    }
                }
            };

            // 2. Parse all stylesheets for CSS variables
            for (const sheet of document.styleSheets) {
                try {
                    collect(sheet.cssRules || sheet.rules);
                } catch (e) {
                    // CORS may block access to external stylesheets
                    console.log('Could not access stylesheet:', e);
                }
            }

            // 3. Get computed CSS variable values from :root
            const computedVars = {};
            for (const prop of Object.keys(variables)) {
                const computed = rootStyles.getPropertyValue(prop).trim();
                if (computed) {
                    computedVars[prop] = computed;
                }
            }

            return { raw: variables, computed: computedVars };
        }
    """)

    return css_vars
397
-
398
async def _extract_svg_colors(self, page: Page) -> list[dict]:
    """
    Collect fill and stroke colors from SVG elements and their descendants.

    Both the ``fill``/``stroke`` attributes, e.g.

    - <svg fill="#00c4cc">
    - <path stroke="#3860be">
    - <circle fill="rgb(188, 212, 50)">

    and the computed ``fill``/``stroke`` styles are reported. ``none`` and
    ``url(...)`` values are skipped; ``currentColor`` is skipped for the
    attribute form only.

    Returns:
        The list of ``{value, property, element, context}`` records built
        by the in-page script (``context`` is always ``'svg'``).
    """
    svg_scan_js = """
        () => {
            const colors = [];

            // Find all SVG elements
            const svgs = document.querySelectorAll('svg, svg *');

            svgs.forEach(el => {
                // Check fill attribute
                const fill = el.getAttribute('fill');
                if (fill && fill !== 'none' && fill !== 'currentColor' && !fill.startsWith('url(')) {
                    colors.push({
                        value: fill,
                        property: 'svg-fill',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }

                // Check stroke attribute
                const stroke = el.getAttribute('stroke');
                if (stroke && stroke !== 'none' && stroke !== 'currentColor' && !stroke.startsWith('url(')) {
                    colors.push({
                        value: stroke,
                        property: 'svg-stroke',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }

                // Check computed styles for SVG elements
                const styles = getComputedStyle(el);
                const computedFill = styles.fill;
                const computedStroke = styles.stroke;

                if (computedFill && computedFill !== 'none' && !computedFill.startsWith('url(')) {
                    colors.push({
                        value: computedFill,
                        property: 'svg-fill-computed',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }

                if (computedStroke && computedStroke !== 'none' && !computedStroke.startsWith('url(')) {
                    colors.push({
                        value: computedStroke,
                        property: 'svg-stroke-computed',
                        element: el.tagName.toLowerCase(),
                        context: 'svg',
                    });
                }
            });

            return colors;
        }
    """
    return await page.evaluate(svg_scan_js)
466
-
467
async def _extract_inline_styles(self, page: Page) -> list[dict]:
    """
    Extract colors from inline style attributes.

    This catches colors in:
    - <div style="background-color: #bcd432;">
    - <span style="color: rgb(0, 196, 204);">

    Every hex / rgb() / rgba() / hsl() / hsla() literal found in a
    ``style`` attribute yields one record.

    Returns:
        A list of ``{value, property, element, context}`` records, with
        ``property`` set to ``'inline-style'`` and ``context`` to
        ``'inline'``.
    """
    inline_data = await page.evaluate("""
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // Find all elements with inline styles
            const elements = document.querySelectorAll('[style]');

            elements.forEach(el => {
                const styleAttr = el.getAttribute('style');
                if (styleAttr) {
                    const matches = styleAttr.match(colorRegex);
                    if (matches) {
                        matches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'inline-style',
                                element: el.tagName.toLowerCase(),
                                context: 'inline',
                            });
                        });
                    }
                }
            });

            return colors;
        }
    """)

    return inline_data
505
-
506
async def _extract_stylesheet_colors(self, page: Page) -> list[dict]:
    """
    Parse the page's accessible CSS stylesheets for color values.

    This catches colors defined in CSS rules that may not be
    currently applied to visible elements, including rules nested inside
    grouping rules such as ``@media`` or ``@supports``.

    Stylesheets the browser will not expose (cross-origin) are skipped
    here; this method performs no fetching of its own.

    Returns:
        A list of ``{value, property, element, context, selector}``
        records with ``element`` set to ``'css-rule'`` and ``context`` to
        ``'stylesheet'``.
    """
    css_colors = await page.evaluate("""
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // Color-related CSS properties
            const colorProps = [
                'color', 'background-color', 'background', 'border-color',
                'border-top-color', 'border-right-color', 'border-bottom-color', 'border-left-color',
                'outline-color', 'box-shadow', 'text-shadow', 'fill', 'stroke',
                'caret-color', 'column-rule-color', 'text-decoration-color',
            ];

            // Record the colors declared by a list of rules, recursing into
            // grouping rules that carry their own cssRules.
            const scan = (rules) => {
                for (const rule of rules) {
                    if (rule.style) {
                        colorProps.forEach(prop => {
                            const value = rule.style.getPropertyValue(prop);
                            if (value) {
                                const matches = value.match(colorRegex);
                                if (matches) {
                                    matches.forEach(color => {
                                        colors.push({
                                            value: color,
                                            property: prop,
                                            element: 'css-rule',
                                            context: 'stylesheet',
                                            selector: rule.selectorText || '',
                                        });
                                    });
                                }
                            }
                        });
                    }
                    if (rule.cssRules) {
                        scan(rule.cssRules);
                    }
                }
            };

            // Parse all stylesheets
            for (const sheet of document.styleSheets) {
                try {
                    scan(sheet.cssRules || sheet.rules);
                } catch (e) {
                    // CORS may block access to external stylesheets
                }
            }

            return colors;
        }
    """)

    return css_colors
563
-
564
async def _fetch_external_css_colors(self, page: Page, max_files: int = 10) -> list[dict]:
    """
    Fetch and parse external CSS files directly to bypass CORS.

    This catches colors in external stylesheets that are blocked by CORS.
    The URLs come from the page's ``<link rel="stylesheet">`` elements and
    each file is downloaded through ``page.request``.

    The step is best-effort: a stylesheet that cannot be fetched is skipped
    and noted in ``self.warnings``; a non-OK response is skipped silently.

    Args:
        page: The Playwright page whose linked stylesheets are fetched.
        max_files: Upper bound on the number of stylesheets downloaded.

    Returns:
        One ``{"value", "property", "element", "context"}`` record per
        color literal found, in file order.
    """
    colors: list[dict] = []

    try:
        # Get all stylesheet URLs
        css_urls = await page.evaluate("""
            () => {
                const urls = [];
                const links = document.querySelectorAll('link[rel="stylesheet"]');
                links.forEach(link => {
                    if (link.href) {
                        urls.push(link.href);
                    }
                });
                return urls;
            }
        """) or []
    except Exception as e:
        self.warnings.append(f"External CSS fetch failed: {str(e)}")
        return colors

    # Color regex pattern
    color_regex = re.compile(r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)', re.IGNORECASE)

    # Fetch each CSS file
    for css_url in css_urls[:max_files]:
        try:
            response = await page.request.get(css_url, timeout=5000)
            if not response.ok:
                continue
            css_text = await response.text()
        except Exception as e:
            # Keep going with the remaining files, but leave a trace.
            self.warnings.append(f"Could not fetch stylesheet {css_url}: {str(e)}")
            continue

        # Find all color values in CSS text
        colors.extend(
            {
                "value": match,
                "property": "external-css",
                "element": "css-file",
                "context": "external-stylesheet",
            }
            for match in color_regex.findall(css_text)
        )

    return colors
614
-
615
async def _extract_all_page_colors(self, page: Page) -> list[dict]:
    """
    Brute-force scan of the page for anything that looks like a color.

    The in-page script applies one color regex (hex, rgb(), rgba(), hsl(),
    hsla()) to three sources: the text of every <style> tag, the
    ``data-*`` / ``style`` attributes and class-name strings of every
    element, and the text of inline (non-``src``) <script> tags.

    Returns:
        The list of ``{value, property, element, context}`` records built
        by the script.
    """
    brute_force_js = """
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // 1. Scan all <style> tags
            const styleTags = document.querySelectorAll('style');
            styleTags.forEach(style => {
                const matches = style.textContent.match(colorRegex);
                if (matches) {
                    matches.forEach(color => {
                        colors.push({
                            value: color,
                            property: 'style-tag',
                            element: 'style',
                            context: 'style-block',
                        });
                    });
                }
            });

            // 2. Scan data attributes that might contain colors
            const allElements = document.querySelectorAll('*');
            allElements.forEach(el => {
                // Check data attributes
                for (const attr of el.attributes) {
                    if (attr.name.startsWith('data-') || attr.name === 'style') {
                        const matches = attr.value.match(colorRegex);
                        if (matches) {
                            matches.forEach(color => {
                                colors.push({
                                    value: color,
                                    property: attr.name,
                                    element: el.tagName.toLowerCase(),
                                    context: 'attribute',
                                });
                            });
                        }
                    }
                }

                // Check for color in class names (some frameworks use color classes)
                const classList = el.className;
                if (typeof classList === 'string') {
                    const colorMatches = classList.match(colorRegex);
                    if (colorMatches) {
                        colorMatches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'class',
                                element: el.tagName.toLowerCase(),
                                context: 'class-name',
                            });
                        });
                    }
                }
            });

            // 3. Look for colors in script tags (config objects)
            const scriptTags = document.querySelectorAll('script');
            scriptTags.forEach(script => {
                if (script.textContent && !script.src) {
                    const matches = script.textContent.match(colorRegex);
                    if (matches) {
                        matches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'script',
                                element: 'script',
                                context: 'javascript',
                            });
                        });
                    }
                }
            });

            return colors;
        }
    """
    return await page.evaluate(brute_force_js)
703
-
704
def _process_css_variables(self, css_vars: dict):
    """Store extracted CSS variables and turn color-valued ones into tokens.

    Args:
        css_vars: Mapping with optional ``"raw"`` and ``"computed"`` dicts
            (variable name -> value), as returned by
            ``_extract_css_variables``.

    Variables are merged into ``self.css_variables`` (computed values take
    precedence over raw ones) rather than replacing it, so a multi-page
    crawl keeps the variables seen on earlier pages. Each computed value
    that starts with a color literal is normalised via ``_process_color``
    and either creates a new entry in ``self.colors`` or bumps the
    frequency of the existing one.
    """
    computed = css_vars.get("computed", {})
    raw = css_vars.get("raw", {})

    # Store CSS variables (accumulating across pages)
    self.css_variables = {**self.css_variables, **raw, **computed}

    # Extract colors from CSS variables
    color_regex = re.compile(r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)', re.IGNORECASE)

    for var_name, value in computed.items():
        value = value.strip()
        if not color_regex.match(value):
            continue

        # This is a color variable
        color_data = {
            "value": value,
            "property": var_name,
            "element": ":root",
            "context": "css-variable",
        }

        hex_value = self._process_color(color_data)
        if not hex_value:
            continue

        token = self.colors.get(hex_value)
        if token is None:
            contrast_white = get_contrast_with_white(hex_value)
            contrast_black = get_contrast_with_black(hex_value)
            compliance = check_wcag_compliance(hex_value, "#ffffff")

            self.colors[hex_value] = ColorToken(
                value=hex_value,
                frequency=1,
                contexts=["css-variable"],
                elements=[":root"],
                css_properties=[var_name],
                contrast_white=round(contrast_white, 2),
                contrast_black=round(contrast_black, 2),
                wcag_aa_large_text=compliance["aa_large_text"],
                wcag_aa_small_text=compliance["aa_normal_text"],
                source=TokenSource.DETECTED,  # CSS variable is still "detected"
                confidence=Confidence.HIGH,
            )
        else:
            # Update existing token
            token.frequency += 1
            if "css-variable" not in token.contexts:
                token.contexts.append("css-variable")
            if var_name not in token.css_properties:
                token.css_properties.append(var_name)
752
-
753
def _process_color(self, color_data: dict) -> Optional[str]:
    """Return the hex form of ``color_data["value"]``, or None if unparseable.

    A missing ``"value"`` key is treated as an empty string. Parsing is
    delegated to ``parse_color``; its ``hex`` attribute is the result.
    """
    parsed = parse_color(color_data.get("value", ""))
    return parsed.hex if parsed else None
763
-
764
def _aggregate_colors(self, raw_colors: list[dict]):
    """Fold raw color records into ``self.colors``, keyed by hex value.

    Records whose value cannot be normalised by ``_process_color`` are
    skipped. The first sighting of a hex value creates a ``ColorToken``
    carrying its contrast figures against white and black; every sighting
    increments the frequency and records any new context, element and CSS
    property.
    """
    for entry in raw_colors:
        hex_value = self._process_color(entry)
        if not hex_value:
            continue

        if hex_value not in self.colors:
            # Contrast ratios are computed once, when the color first appears
            vs_white = get_contrast_with_white(hex_value)
            vs_black = get_contrast_with_black(hex_value)
            wcag = check_wcag_compliance(hex_value, "#ffffff")

            self.colors[hex_value] = ColorToken(
                value=hex_value,
                frequency=0,
                contexts=[],
                elements=[],
                css_properties=[],
                contrast_white=round(vs_white, 2),
                contrast_black=round(vs_black, 2),
                wcag_aa_large_text=wcag["aa_large_text"],
                wcag_aa_small_text=wcag["aa_normal_text"],
            )

        token = self.colors[hex_value]
        token.frequency += 1

        # (record key, token list) pairs that collect distinct sightings
        for field, seen in (
            ("context", token.contexts),
            ("element", token.elements),
            ("property", token.css_properties),
        ):
            item = entry.get(field, "")
            if item and item not in seen:
                seen.append(item)
804
-
805
def _aggregate_typography(self, raw_typography: list[dict]):
    """Fold raw typography records into ``self.typography`` and ``self.font_families``.

    Records are grouped under a "font-size|font-weight|font-family" key
    (family truncated to 50 characters). The first record of a group
    creates the ``TypographyToken``; every record bumps the token's
    frequency, notes a new element tag, and bumps the frequency of the
    token's primary font family (creating its ``FontFamily`` entry, with
    fallbacks taken from the current record, on first sight).
    """

    def px_number(text):
        # "16px" -> 16.0; None when the remainder is not a number
        try:
            return float(text.replace("px", ""))
        except ValueError:
            return None

    def clean(name):
        # Trim whitespace and surrounding quotes from one family name
        return name.strip().strip('"\'')

    for record in raw_typography:
        font_family = record.get("fontFamily", "")
        font_size = record.get("fontSize", "")
        font_weight = record.get("fontWeight", "400")
        line_height = record.get("lineHeight", "normal")

        key = f"{font_size}|{font_weight}|{font_family[:50]}"

        if key not in self.typography:
            # Font size in px, when expressed that way
            font_size_px = px_number(font_size) if font_size.endswith("px") else None

            # Line height as a unitless ratio where it can be derived
            line_height_computed = None
            if line_height and line_height != "normal":
                if line_height.endswith("px") and font_size_px:
                    lh_px = px_number(line_height)
                    if lh_px is not None:
                        line_height_computed = round(lh_px / font_size_px, 2)
                else:
                    try:
                        line_height_computed = float(line_height)
                    except ValueError:
                        pass

            self.typography[key] = TypographyToken(
                font_family=clean(font_family.split(",")[0]),
                font_size=font_size,
                font_size_px=font_size_px,
                font_weight=int(font_weight) if font_weight.isdigit() else 400,
                line_height=line_height,
                line_height_computed=line_height_computed,
                letter_spacing=record.get("letterSpacing"),
                frequency=0,
                elements=[],
            )

        token = self.typography[key]
        token.frequency += 1

        tag = record.get("element", "")
        if tag and tag not in token.elements:
            token.elements.append(tag)

        # Track font families
        primary = token.font_family
        if primary not in self.font_families:
            self.font_families[primary] = FontFamily(
                name=primary,
                fallbacks=[clean(part) for part in font_family.split(",")[1:]],
                frequency=0,
            )
        self.font_families[primary].frequency += 1
869
-
870
def _aggregate_spacing(self, raw_spacing: list[dict]):
    """Fold raw spacing records into ``self.spacing``, keyed by pixel value.

    The key is ``str(valuePx)`` (0 when the record has no ``valuePx``).
    A new ``SpacingToken`` notes whether the value is a multiple of 4 and
    of 8; every record increments the frequency and records any new
    context and CSS property.
    """
    for record in raw_spacing:
        px = record.get("valuePx", 0)
        key = str(px)

        token = self.spacing.get(key)
        if token is None:
            token = SpacingToken(
                value=f"{px}px",
                value_px=px,
                frequency=0,
                contexts=[],
                properties=[],
                fits_base_4=px % 4 == 0,
                fits_base_8=px % 8 == 0,
            )
            self.spacing[key] = token

        token.frequency += 1

        ctx = record.get("context", "")
        if ctx and ctx not in token.contexts:
            token.contexts.append(ctx)

        css_prop = record.get("property", "")
        if css_prop and css_prop not in token.properties:
            token.properties.append(css_prop)
899
-
900
def _aggregate_radius(self, raw_radius: list[dict]):
    """Fold raw border-radius records into ``self.radius``, keyed by value.

    A value whose whitespace-separated parts are all identical is collapsed
    to a single part ("8px 8px 8px 8px" -> "8px"). A new ``RadiusToken``
    gets an integer pixel value when the text is a plain ``<number>px``,
    plus base-4 / base-8 flags (False when the pixel value is missing or
    zero). Every record increments the frequency and records any new
    element tag.
    """
    for record in raw_radius:
        value = record.get("value", "")

        # Collapse repeated identical corners into one value
        pieces = value.split()
        if len(set(pieces)) == 1:
            value = pieces[0]

        if value not in self.radius:
            px = None
            if value.endswith("px"):
                try:
                    px = int(float(value.replace("px", "")))
                except ValueError:
                    pass

            if px:
                on_base_4 = px % 4 == 0
                on_base_8 = px % 8 == 0
            else:
                on_base_4 = False
                on_base_8 = False

            self.radius[value] = RadiusToken(
                value=value,
                value_px=px,
                frequency=0,
                elements=[],
                fits_base_4=on_base_4,
                fits_base_8=on_base_8,
            )

        token = self.radius[value]
        token.frequency += 1

        tag = record.get("element", "")
        if tag and tag not in token.elements:
            token.elements.append(tag)
934
-
935
def _aggregate_shadows(self, raw_shadows: list[dict]):
    """Fold raw box-shadow records into ``self.shadows``, keyed by CSS value.

    The first record for a value creates its ``ShadowToken``; every record
    increments the frequency and records any new element tag.
    """
    for record in raw_shadows:
        css_value = record.get("value", "")

        token = self.shadows.get(css_value)
        if token is None:
            token = ShadowToken(
                value=css_value,
                frequency=0,
                elements=[],
            )
            self.shadows[css_value] = token

        token.frequency += 1

        tag = record.get("element", "")
        if tag and tag not in token.elements:
            token.elements.append(tag)
953
-
954
def _calculate_confidence(self, frequency: int) -> Confidence:
    """Map an occurrence count to a confidence level.

    10 or more occurrences give HIGH, 3 to 9 give MEDIUM, anything lower
    gives LOW.
    """
    # Thresholds are checked from the highest down; first match wins
    for threshold, level in ((10, Confidence.HIGH), (3, Confidence.MEDIUM)):
        if frequency >= threshold:
            return level
    return Confidence.LOW
961
-
962
def _detect_spacing_base(self) -> Optional[int]:
    """Guess the spacing grid (8 or 4) from the collected spacing tokens.

    Returns 8 when at least 80% of the distinct spacing tokens are flagged
    as fitting base 8, otherwise 4 when at least 80% are flagged as fitting
    base 4, otherwise None (also None when no spacing was collected).
    """
    total = len(self.spacing)
    if total == 0:
        return None

    # Prefer the coarser grid: test base 8 before base 4
    for base, flag in ((8, "fits_base_8"), (4, "fits_base_4")):
        hits = sum(1 for tok in self.spacing.values() if getattr(tok, flag))
        if hits / total >= 0.8:
            return base

    return None
979
-
980
- async def extract(
981
- self,
982
- pages: list[str],
983
- progress_callback: Optional[Callable[[float], None]] = None
984
- ) -> ExtractedTokens:
985
- """
986
- Extract tokens from a list of pages.
987
-
988
- Enhanced extraction includes:
989
- - DOM computed styles
990
- - CSS variables from :root
991
- - SVG fill/stroke colors
992
- - Inline style colors
993
- - Stylesheet color rules
994
-
995
- Args:
996
- pages: List of URLs to crawl
997
- progress_callback: Optional callback for progress updates
998
-
999
- Returns:
1000
- ExtractedTokens with all discovered tokens
1001
- """
1002
- start_time = datetime.now()
1003
- pages_crawled = []
1004
-
1005
- async with self:
1006
- for i, url in enumerate(pages):
1007
- try:
1008
- page = await self.context.new_page()
1009
-
1010
- # Navigate with fallback strategy
1011
- try:
1012
- await page.goto(
1013
- url,
1014
- wait_until="domcontentloaded",
1015
- timeout=60000 # 60 seconds
1016
- )
1017
- # Wait for JS to render
1018
- await page.wait_for_timeout(2000)
1019
- except Exception as nav_error:
1020
- # Fallback to load event
1021
- try:
1022
- await page.goto(
1023
- url,
1024
- wait_until="load",
1025
- timeout=60000
1026
- )
1027
- await page.wait_for_timeout(3000)
1028
- except Exception:
1029
- self.warnings.append(f"Slow load for {url}, extracting partial content")
1030
-
1031
- # Scroll to load lazy content
1032
- await self._scroll_page(page)
1033
-
1034
- # =========================================================
1035
- # ENHANCED EXTRACTION: Multiple sources
1036
- # =========================================================
1037
-
1038
- # Track counts before extraction for this page
1039
- colors_before = len(self.colors)
1040
- typo_before = len(self.typography)
1041
- spacing_before = len(self.spacing)
1042
- radius_before = len(self.radius)
1043
- shadows_before = len(self.shadows)
1044
-
1045
- # 1. Extract DOM computed styles (original method)
1046
- styles = await self._extract_styles_from_page(page)
1047
- dom_colors = len(styles.get("colors", []))
1048
- self._aggregate_colors(styles.get("colors", []))
1049
- self._aggregate_typography(styles.get("typography", []))
1050
- self._aggregate_spacing(styles.get("spacing", []))
1051
- self._aggregate_radius(styles.get("radius", []))
1052
- self._aggregate_shadows(styles.get("shadows", []))
1053
-
1054
- # 2. Extract CSS variables (--primary-color, etc.)
1055
- css_var_count = 0
1056
- try:
1057
- css_vars = await self._extract_css_variables(page)
1058
- css_var_count = len(css_vars.get("computed", {}))
1059
- self._process_css_variables(css_vars)
1060
- except Exception as e:
1061
- self.warnings.append(f"CSS variables extraction failed: {str(e)}")
1062
-
1063
- # 3. Extract SVG colors (fill, stroke)
1064
- svg_color_count = 0
1065
- try:
1066
- svg_colors = await self._extract_svg_colors(page)
1067
- svg_color_count = len(svg_colors)
1068
- self._aggregate_colors(svg_colors)
1069
- except Exception as e:
1070
- self.warnings.append(f"SVG color extraction failed: {str(e)}")
1071
-
1072
- # 4. Extract inline style colors
1073
- inline_color_count = 0
1074
- try:
1075
- inline_colors = await self._extract_inline_styles(page)
1076
- inline_color_count = len(inline_colors)
1077
- self._aggregate_colors(inline_colors)
1078
- except Exception as e:
1079
- self.warnings.append(f"Inline style extraction failed: {str(e)}")
1080
-
1081
- # 5. Extract stylesheet colors (CSS rules)
1082
- stylesheet_color_count = 0
1083
- try:
1084
- stylesheet_colors = await self._extract_stylesheet_colors(page)
1085
- stylesheet_color_count = len(stylesheet_colors)
1086
- self._aggregate_colors(stylesheet_colors)
1087
- except Exception as e:
1088
- self.warnings.append(f"Stylesheet color extraction failed: {str(e)}")
1089
-
1090
- # 6. Fetch external CSS files (bypass CORS)
1091
- external_css_count = 0
1092
- try:
1093
- external_colors = await self._fetch_external_css_colors(page)
1094
- external_css_count = len(external_colors)
1095
- self._aggregate_colors(external_colors)
1096
- except Exception as e:
1097
- self.warnings.append(f"External CSS fetch failed: {str(e)}")
1098
-
1099
- # 7. Brute-force scan all page content for colors
1100
- page_scan_count = 0
1101
- try:
1102
- page_colors = await self._extract_all_page_colors(page)
1103
- page_scan_count = len(page_colors)
1104
- self._aggregate_colors(page_colors)
1105
- except Exception as e:
1106
- self.warnings.append(f"Page scan failed: {str(e)}")
1107
-
1108
- # 8. Extract foreground-background color pairs for real AA checks
1109
- try:
1110
- fg_bg = await self._extract_fg_bg_pairs(page)
1111
- self.fg_bg_pairs.extend(fg_bg)
1112
- except Exception as e:
1113
- self.warnings.append(f"FG/BG pair extraction failed: {str(e)}")
1114
-
1115
- # =========================================================
1116
- # Log extraction results for this page
1117
- # =========================================================
1118
- colors_new = len(self.colors) - colors_before
1119
- typo_new = len(self.typography) - typo_before
1120
- spacing_new = len(self.spacing) - spacing_before
1121
- radius_new = len(self.radius) - radius_before
1122
- shadows_new = len(self.shadows) - shadows_before
1123
-
1124
- # Store extraction stats for logging
1125
- self._last_extraction_stats = {
1126
- "url": url,
1127
- "dom_colors": dom_colors,
1128
- "css_variables": css_var_count,
1129
- "svg_colors": svg_color_count,
1130
- "inline_colors": inline_color_count,
1131
- "stylesheet_colors": stylesheet_color_count,
1132
- "external_css_colors": external_css_count,
1133
- "page_scan_colors": page_scan_count,
1134
- "new_colors": colors_new,
1135
- "new_typography": typo_new,
1136
- "new_spacing": spacing_new,
1137
- "new_radius": radius_new,
1138
- "new_shadows": shadows_new,
1139
- }
1140
-
1141
- # =========================================================
1142
-
1143
- self.total_elements += styles.get("elements_count", 0)
1144
- pages_crawled.append(url)
1145
-
1146
- await page.close()
1147
-
1148
- # Progress callback
1149
- if progress_callback:
1150
- progress_callback((i + 1) / len(pages))
1151
-
1152
- # Rate limiting
1153
- await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000)
1154
-
1155
- except Exception as e:
1156
- self.errors.append(f"Error extracting {url}: {str(e)}")
1157
-
1158
- # Calculate confidence for all tokens
1159
- for token in self.colors.values():
1160
- token.confidence = self._calculate_confidence(token.frequency)
1161
- for token in self.typography.values():
1162
- token.confidence = self._calculate_confidence(token.frequency)
1163
- for token in self.spacing.values():
1164
- token.confidence = self._calculate_confidence(token.frequency)
1165
-
1166
- # Detect spacing base
1167
- spacing_base = self._detect_spacing_base()
1168
-
1169
- # Mark outliers in spacing
1170
- if spacing_base:
1171
- for token in self.spacing.values():
1172
- if spacing_base == 8 and not token.fits_base_8:
1173
- token.is_outlier = True
1174
- elif spacing_base == 4 and not token.fits_base_4:
1175
- token.is_outlier = True
1176
-
1177
- # Determine primary font
1178
- if self.font_families:
1179
- primary_font = max(self.font_families.values(), key=lambda f: f.frequency)
1180
- primary_font.usage = "primary"
1181
-
1182
- # Build result
1183
- end_time = datetime.now()
1184
- duration_ms = int((end_time - start_time).total_seconds() * 1000)
1185
-
1186
- return ExtractedTokens(
1187
- viewport=self.viewport,
1188
- source_url=pages[0] if pages else "",
1189
- pages_crawled=pages_crawled,
1190
- colors=list(self.colors.values()),
1191
- typography=list(self.typography.values()),
1192
- spacing=list(self.spacing.values()),
1193
- radius=list(self.radius.values()),
1194
- shadows=list(self.shadows.values()),
1195
- font_families=list(self.font_families.values()),
1196
- spacing_base=spacing_base,
1197
- extraction_timestamp=start_time,
1198
- extraction_duration_ms=duration_ms,
1199
- total_elements_analyzed=self.total_elements,
1200
- unique_colors=len(self.colors),
1201
- unique_font_sizes=len(set(t.font_size for t in self.typography.values())),
1202
- unique_spacing_values=len(self.spacing),
1203
- errors=self.errors,
1204
- warnings=self.warnings,
1205
- )
1206
-
1207
-
1208
- # =============================================================================
1209
- # CONVENIENCE FUNCTIONS
1210
- # =============================================================================
1211
-
1212
async def extract_from_pages(
    pages: list[str],
    viewport: Viewport = Viewport.DESKTOP
) -> ExtractedTokens:
    """Run a one-off token extraction over the given pages.

    Builds a fresh ``TokenExtractor`` for the requested viewport and awaits
    its ``extract`` coroutine, so callers do not have to manage an extractor
    instance themselves.

    Args:
        pages: URLs to crawl.
        viewport: Viewport the extractor is constructed with.

    Returns:
        The ``ExtractedTokens`` produced by ``TokenExtractor.extract``.
    """
    return await TokenExtractor(viewport=viewport).extract(pages)
1219
-
1220
-
1221
async def extract_both_viewports(pages: list[str]) -> tuple[ExtractedTokens, ExtractedTokens]:
    """Extract tokens for the same pages at desktop and mobile viewports.

    Two independent ``TokenExtractor`` instances are created up front, one
    per viewport. The extractions are then awaited one after the other:
    desktop first, mobile second.

    Args:
        pages: URLs to crawl with each extractor.

    Returns:
        A ``(desktop_tokens, mobile_tokens)`` tuple.
    """
    # Construct both extractors before running either extraction.
    desktop, mobile = [
        TokenExtractor(viewport=target)
        for target in (Viewport.DESKTOP, Viewport.MOBILE)
    ]

    desktop_tokens = await desktop.extract(pages)
    mobile_tokens = await mobile.extract(pages)
    return desktop_tokens, mobile_tokens