""" Agent 1: Token Extractor Design System Extractor v2 Persona: Meticulous Design Archaeologist Responsibilities: - Crawl pages at specified viewport - Extract computed styles from all elements - Parse CSS files for variables and rules - Extract colors from SVGs - Collect colors, typography, spacing, radius, shadows - Track frequency and context for each token """ import asyncio import re from typing import Optional, Callable from datetime import datetime from collections import defaultdict from playwright.async_api import async_playwright, Browser, Page, BrowserContext from core.token_schema import ( Viewport, ExtractedTokens, ColorToken, TypographyToken, SpacingToken, RadiusToken, ShadowToken, FontFamily, TokenSource, Confidence, ) from core.color_utils import ( normalize_hex, parse_color, get_contrast_with_white, get_contrast_with_black, check_wcag_compliance, ) from config.settings import get_settings class TokenExtractor: """ Extracts design tokens from web pages. This is the second part of Agent 1's job — after pages are confirmed, we crawl and extract all CSS values. Enhanced with: - CSS file parsing for variables and rules - SVG color extraction - Inline style extraction """ def __init__(self, viewport: Viewport = Viewport.DESKTOP): self.settings = get_settings() self.viewport = viewport self.browser: Optional[Browser] = None self.context: Optional[BrowserContext] = None # Token collection self.colors: dict[str, ColorToken] = {} self.typography: dict[str, TypographyToken] = {} self.spacing: dict[str, SpacingToken] = {} self.radius: dict[str, RadiusToken] = {} self.shadows: dict[str, ShadowToken] = {} # CSS Variables collection self.css_variables: dict[str, str] = {} # Font tracking self.font_families: dict[str, FontFamily] = {} # Statistics self.total_elements = 0 self.errors: list[str] = [] self.warnings: list[str] = [] async def __aenter__(self): """Async context manager entry.""" await self._init_browser() return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit.""" await self._close_browser() async def _init_browser(self): """Initialize Playwright browser.""" playwright = await async_playwright().start() self.browser = await playwright.chromium.launch( headless=self.settings.browser.headless ) # Set viewport based on extraction mode if self.viewport == Viewport.DESKTOP: width = self.settings.viewport.desktop_width height = self.settings.viewport.desktop_height else: width = self.settings.viewport.mobile_width height = self.settings.viewport.mobile_height self.context = await self.browser.new_context( viewport={"width": width, "height": height}, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ) async def _close_browser(self): """Close browser and cleanup.""" if self.context: await self.context.close() if self.browser: await self.browser.close() async def _scroll_page(self, page: Page): """Scroll page to load lazy content.""" await page.evaluate(""" async () => { const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); const height = document.body.scrollHeight; const step = window.innerHeight; for (let y = 0; y < height; y += step) { window.scrollTo(0, y); await delay(100); } // Scroll back to top window.scrollTo(0, 0); } """) # Wait for network idle after scrolling await page.wait_for_load_state("networkidle", timeout=self.settings.browser.network_idle_timeout) async def _extract_styles_from_page(self, page: Page) -> dict: """ Extract computed styles from all elements on the page. This is the core extraction logic — we get getComputedStyle for every element. """ styles_data = await page.evaluate(""" () => { const elements = document.querySelectorAll('*'); const results = { colors: [], typography: [], spacing: [], radius: [], shadows: [], elements_count: elements.length, }; const colorProperties = [ 'color', 'background-color', 'border-color', 'border-top-color', 'border-right-color', 'border-bottom-color', 'border-left-color', 'outline-color', 'text-decoration-color', ]; const spacingProperties = [ 'margin-top', 'margin-right', 'margin-bottom', 'margin-left', 'padding-top', 'padding-right', 'padding-bottom', 'padding-left', 'gap', 'row-gap', 'column-gap', ]; elements.forEach(el => { const tag = el.tagName.toLowerCase(); const styles = window.getComputedStyle(el); // Skip invisible elements if (styles.display === 'none' || styles.visibility === 'hidden') { return; } // --- COLORS --- colorProperties.forEach(prop => { const value = styles.getPropertyValue(prop); if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') { results.colors.push({ value: value, property: prop, element: tag, context: prop.includes('background') ? 'background' : prop.includes('border') ? 'border' : 'text', }); } }); // --- TYPOGRAPHY --- const fontFamily = styles.getPropertyValue('font-family'); const fontSize = styles.getPropertyValue('font-size'); const fontWeight = styles.getPropertyValue('font-weight'); const lineHeight = styles.getPropertyValue('line-height'); const letterSpacing = styles.getPropertyValue('letter-spacing'); if (fontSize && fontFamily) { results.typography.push({ fontFamily: fontFamily, fontSize: fontSize, fontWeight: fontWeight, lineHeight: lineHeight, letterSpacing: letterSpacing, element: tag, }); } // --- SPACING --- spacingProperties.forEach(prop => { const value = styles.getPropertyValue(prop); if (value && value !== '0px' && value !== 'auto' && value !== 'normal') { const px = parseFloat(value); if (!isNaN(px) && px > 0 && px < 500) { results.spacing.push({ value: value, valuePx: Math.round(px), property: prop, context: prop.includes('margin') ? 'margin' : prop.includes('padding') ? 'padding' : 'gap', }); } } }); // --- BORDER RADIUS --- const radiusProps = [ 'border-radius', 'border-top-left-radius', 'border-top-right-radius', 'border-bottom-left-radius', 'border-bottom-right-radius', ]; radiusProps.forEach(prop => { const value = styles.getPropertyValue(prop); if (value && value !== '0px') { results.radius.push({ value: value, element: tag, }); } }); // --- BOX SHADOW --- const shadow = styles.getPropertyValue('box-shadow'); if (shadow && shadow !== 'none') { results.shadows.push({ value: shadow, element: tag, }); } }); return results; } """) return styles_data async def _extract_css_variables(self, page: Page) -> dict: """ Extract CSS custom properties (variables) from :root and stylesheets. This catches colors defined as: - :root { --primary-color: #3860be; } - :root { --brand-cyan: #00c4cc; } """ css_vars = await page.evaluate(""" () => { const variables = {}; // 1. Get CSS variables from :root computed styles const rootStyles = getComputedStyle(document.documentElement); const rootCss = document.documentElement.style.cssText; // 2. Parse all stylesheets for CSS variables for (const sheet of document.styleSheets) { try { const rules = sheet.cssRules || sheet.rules; for (const rule of rules) { if (rule.style) { for (let i = 0; i < rule.style.length; i++) { const prop = rule.style[i]; if (prop.startsWith('--')) { const value = rule.style.getPropertyValue(prop).trim(); if (value) { variables[prop] = value; } } } } // Also check @media rules if (rule.cssRules) { for (const innerRule of rule.cssRules) { if (innerRule.style) { for (let i = 0; i < innerRule.style.length; i++) { const prop = innerRule.style[i]; if (prop.startsWith('--')) { const value = innerRule.style.getPropertyValue(prop).trim(); if (value) { variables[prop] = value; } } } } } } } } catch (e) { // CORS may block access to external stylesheets console.log('Could not access stylesheet:', e); } } // 3. Get computed CSS variable values from :root const computedVars = {}; for (const prop of Object.keys(variables)) { const computed = rootStyles.getPropertyValue(prop).trim(); if (computed) { computedVars[prop] = computed; } } return { raw: variables, computed: computedVars }; } """) return css_vars async def _extract_svg_colors(self, page: Page) -> list[dict]: """ Extract colors from SVG elements (fill, stroke). This catches colors in: - - - """ svg_colors = await page.evaluate(""" () => { const colors = []; // Find all SVG elements const svgs = document.querySelectorAll('svg, svg *'); svgs.forEach(el => { // Check fill attribute const fill = el.getAttribute('fill'); if (fill && fill !== 'none' && fill !== 'currentColor' && !fill.startsWith('url(')) { colors.push({ value: fill, property: 'svg-fill', element: el.tagName.toLowerCase(), context: 'svg', }); } // Check stroke attribute const stroke = el.getAttribute('stroke'); if (stroke && stroke !== 'none' && stroke !== 'currentColor' && !stroke.startsWith('url(')) { colors.push({ value: stroke, property: 'svg-stroke', element: el.tagName.toLowerCase(), context: 'svg', }); } // Check computed styles for SVG elements const styles = getComputedStyle(el); const computedFill = styles.fill; const computedStroke = styles.stroke; if (computedFill && computedFill !== 'none' && !computedFill.startsWith('url(')) { colors.push({ value: computedFill, property: 'svg-fill-computed', element: el.tagName.toLowerCase(), context: 'svg', }); } if (computedStroke && computedStroke !== 'none' && !computedStroke.startsWith('url(')) { colors.push({ value: computedStroke, property: 'svg-stroke-computed', element: el.tagName.toLowerCase(), context: 'svg', }); } }); return colors; } """) return svg_colors async def _extract_inline_styles(self, page: Page) -> dict: """ Extract colors from inline style attributes. This catches colors in: -
- """ inline_data = await page.evaluate(""" () => { const colors = []; const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi; // Find all elements with inline styles const elements = document.querySelectorAll('[style]'); elements.forEach(el => { const styleAttr = el.getAttribute('style'); if (styleAttr) { const matches = styleAttr.match(colorRegex); if (matches) { matches.forEach(color => { colors.push({ value: color, property: 'inline-style', element: el.tagName.toLowerCase(), context: 'inline', }); }); } } }); return colors; } """) return inline_data async def _extract_stylesheet_colors(self, page: Page) -> list[dict]: """ Parse CSS stylesheets for color values. This catches colors defined in CSS rules that may not be currently applied to visible elements. Also fetches external stylesheets that may be CORS-blocked. """ css_colors = await page.evaluate(""" () => { const colors = []; const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi; // Color-related CSS properties const colorProps = [ 'color', 'background-color', 'background', 'border-color', 'border-top-color', 'border-right-color', 'border-bottom-color', 'border-left-color', 'outline-color', 'box-shadow', 'text-shadow', 'fill', 'stroke', 'caret-color', 'column-rule-color', 'text-decoration-color', ]; // Parse all stylesheets for (const sheet of document.styleSheets) { try { const rules = sheet.cssRules || sheet.rules; for (const rule of rules) { if (rule.style) { colorProps.forEach(prop => { const value = rule.style.getPropertyValue(prop); if (value) { const matches = value.match(colorRegex); if (matches) { matches.forEach(color => { colors.push({ value: color, property: prop, element: 'css-rule', context: 'stylesheet', selector: rule.selectorText || '', }); }); } } }); } } } catch (e) { // CORS may block access to external stylesheets } } return colors; } """) return css_colors async def _fetch_external_css_colors(self, page: Page) -> list[dict]: """ Fetch and parse external CSS files directly to bypass CORS. This catches colors in external stylesheets that are blocked by CORS. """ colors = [] try: # Get all stylesheet URLs css_urls = await page.evaluate(""" () => { const urls = []; const links = document.querySelectorAll('link[rel="stylesheet"]'); links.forEach(link => { if (link.href) { urls.push(link.href); } }); return urls; } """) # Color regex pattern color_regex = re.compile(r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)', re.IGNORECASE) # Fetch each CSS file for css_url in css_urls[:10]: # Limit to 10 files try: response = await page.request.get(css_url, timeout=5000) if response.ok: css_text = await response.text() # Find all color values in CSS text matches = color_regex.findall(css_text) for match in matches: colors.append({ "value": match, "property": "external-css", "element": "css-file", "context": "external-stylesheet", }) except Exception as e: # Skip if fetch fails pass except Exception as e: self.warnings.append(f"External CSS fetch failed: {str(e)}") return colors async def _extract_all_page_colors(self, page: Page) -> list[dict]: """ Extract ALL color values from the page source and styles. This is a brute-force approach that scans the entire page HTML and all style blocks for any color values. """ colors = await page.evaluate(""" () => { const colors = []; const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi; // 1. Scan all