riazmo committed
Commit 7d94ddd · verified · 1 Parent(s): 38d9cec

Upload 2 files

Files changed (2)
  1. agents/crawler.py +366 -0
  2. agents/extractor.py +622 -0
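Together the two modules form Agent 1's discover → confirm → extract pipeline: the crawler proposes and classifies pages, a human confirms the selection, and the extractor collects computed styles from the confirmed URLs. Usage sketches follow each file's diff below.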
agents/crawler.py ADDED
@@ -0,0 +1,366 @@
"""
Agent 1: Website Crawler
Design System Extractor v2

Persona: Meticulous Design Archaeologist

Responsibilities:
- Auto-discover pages from a base URL
- Classify page types (homepage, listing, detail, etc.)
- Prepare the page list for user confirmation
"""

import asyncio
import re
from collections import deque
from urllib.parse import urljoin, urlparse
from typing import Optional, Callable

from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Playwright

from core.token_schema import DiscoveredPage, PageType
from config.settings import get_settings


class PageDiscoverer:
    """
    Discovers pages from a website for design system extraction.

    This is the first part of Agent 1's job — finding pages before
    the human confirms which ones to crawl.
    """

    def __init__(self):
        self.settings = get_settings()
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.visited_urls: set[str] = set()
        self.discovered_pages: list[DiscoveredPage] = []

    async def __aenter__(self):
        """Async context manager entry."""
        await self._init_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self._close_browser()

    async def _init_browser(self):
        """Initialize Playwright browser."""
        # Keep a handle to the Playwright driver so it can be stopped on exit.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=self.settings.browser.headless
        )
        self.context = await self.browser.new_context(
            viewport={
                "width": self.settings.viewport.desktop_width,
                "height": self.settings.viewport.desktop_height,
            },
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        )

    async def _close_browser(self):
        """Close browser and clean up."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    def _normalize_url(self, url: str, base_url: str) -> Optional[str]:
        """Normalize and validate a URL (same origin, no fragment or query)."""
        # Handle relative URLs
        if not url.startswith(('http://', 'https://')):
            url = urljoin(base_url, url)

        parsed = urlparse(url)
        base_parsed = urlparse(base_url)

        # Only allow the same domain
        if parsed.netloc != base_parsed.netloc:
            return None

        # Drop fragments and query strings; keep scheme, host, and path
        normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

        # Remove trailing slash for consistency
        if normalized.endswith('/') and len(normalized) > len(f"{parsed.scheme}://{parsed.netloc}/"):
            normalized = normalized.rstrip('/')

        return normalized

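    # For illustration (hypothetical inputs):
    #   _normalize_url('/about?ref=nav#team', 'https://example.com')
    #   -> 'https://example.com/about'   (same-origin check passes; fragment and query dropped)
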
    def _classify_page_type(self, url: str, title: str = "") -> PageType:
        """
        Classify page type based on URL patterns and title.

        This is a heuristic — not perfect, but good enough for discovery.
        """
        url_lower = url.lower()
        title_lower = title.lower() if title else ""

        # Check URL patterns. Order matters: the first matching type wins,
        # so e.g. '/contact' is claimed by FORM before CONTACT is reached.
        patterns = {
            PageType.HOMEPAGE: [r'/$', r'/home$', r'/index'],
            PageType.LISTING: [r'/products', r'/catalog', r'/list', r'/category', r'/collection', r'/search'],
            PageType.DETAIL: [r'/product/', r'/item/', r'/detail/', r'/p/', r'/[a-z-]+/\d+'],
            PageType.FORM: [r'/contact', r'/form', r'/apply', r'/submit', r'/register'],
            PageType.AUTH: [r'/login', r'/signin', r'/signup', r'/auth', r'/account'],
            PageType.CHECKOUT: [r'/cart', r'/checkout', r'/basket', r'/payment'],
            PageType.MARKETING: [r'/landing', r'/promo', r'/campaign', r'/offer'],
            PageType.ABOUT: [r'/about', r'/team', r'/company', r'/story'],
            PageType.CONTACT: [r'/contact', r'/support', r'/help'],
        }

        for page_type, url_patterns in patterns.items():
            for pattern in url_patterns:
                if re.search(pattern, url_lower):
                    return page_type

        # Fall back to title patterns
        title_patterns = {
            PageType.HOMEPAGE: ['home', 'welcome'],
            PageType.LISTING: ['products', 'catalog', 'collection', 'browse'],
            PageType.DETAIL: ['product', 'item'],
            PageType.AUTH: ['login', 'sign in', 'sign up', 'register'],
            PageType.ABOUT: ['about', 'our story', 'team'],
            PageType.CONTACT: ['contact', 'get in touch', 'support'],
        }

        for page_type, keywords in title_patterns.items():
            for keyword in keywords:
                if keyword in title_lower:
                    return page_type

        return PageType.OTHER

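    # For illustration (hypothetical URLs):
    #   'https://example.com/products/shoes' matches r'/products' -> PageType.LISTING
    #   'https://example.com/login'          matches r'/login'    -> PageType.AUTH
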
    async def _extract_links(self, page: Page, base_url: str) -> list[str]:
        """Extract all internal links from a page."""
        links = await page.evaluate("""
            () => {
                const anchors = Array.from(document.querySelectorAll('a[href]'));
                return anchors
                    .map(a => a.href.split('#')[0])  // strip fragments instead of discarding the URL
                    .filter(href =>
                        href &&
                        !href.startsWith('javascript:') &&
                        !href.startsWith('mailto:') &&
                        !href.startsWith('tel:')
                    );
            }
        """)

        # Normalize and filter
        valid_links = []
        for link in links:
            normalized = self._normalize_url(link, base_url)
            if normalized and normalized not in self.visited_urls:
                valid_links.append(normalized)

        return list(set(valid_links))

    async def _get_page_title(self, page: Page) -> str:
        """Get page title."""
        try:
            return await page.title()
        except Exception:
            return ""

    async def discover(
        self,
        base_url: str,
        max_pages: Optional[int] = None,
        progress_callback: Optional[Callable[[float], None]] = None
    ) -> list[DiscoveredPage]:
        """
        Discover pages from a website.

        Args:
            base_url: The starting URL
            max_pages: Maximum pages to discover (default from settings)
            progress_callback: Optional callback for progress updates

        Returns:
            List of discovered pages
        """
        max_pages = max_pages or self.settings.crawl.max_pages

        async with self:
            # Start with the homepage
            normalized_base = self._normalize_url(base_url, base_url)
            if not normalized_base:
                raise ValueError(f"Invalid base URL: {base_url}")

            queue = deque([normalized_base])
            self.visited_urls = set()
            self.discovered_pages = []

            while queue and len(self.discovered_pages) < max_pages:
                current_url = queue.popleft()

                if current_url in self.visited_urls:
                    continue

                self.visited_urls.add(current_url)

                try:
                    page = await self.context.new_page()
                    try:
                        # Navigate with lenient settings: 'domcontentloaded' is
                        # faster and more reliable than 'networkidle'.
                        try:
                            await page.goto(
                                current_url,
                                wait_until="domcontentloaded",
                                timeout=60000  # 60 seconds
                            )
                            # Give client-side JS a moment to render
                            await page.wait_for_timeout(2000)
                        except Exception:
                            # Fall back to the 'load' event
                            try:
                                await page.goto(
                                    current_url,
                                    wait_until="load",
                                    timeout=60000
                                )
                                await page.wait_for_timeout(3000)
                            except Exception:
                                # Last resort — work with whatever loaded
                                pass

                        # Get page info
                        title = await self._get_page_title(page)
                        page_type = self._classify_page_type(current_url, title)
                        depth = len(urlparse(current_url).path.split('/')) - 1

                        # Record the discovered page
                        discovered = DiscoveredPage(
                            url=current_url,
                            title=title,
                            page_type=page_type,
                            depth=depth,
                            selected=True,
                        )
                        self.discovered_pages.append(discovered)

                        # Extract links for further crawling
                        new_links = await self._extract_links(page, base_url)

                        # Prioritize key template types
                        priority_patterns = ['/product', '/listing', '/category', '/about', '/contact']
                        priority_links = [l for l in new_links if any(p in l.lower() for p in priority_patterns)]
                        other_links = [l for l in new_links if l not in priority_links]

                        # Add to queue (priority first)
                        for link in priority_links + other_links:
                            if link not in self.visited_urls and link not in queue:
                                queue.append(link)
                    finally:
                        # Always release the page, even if extraction failed
                        await page.close()

                    # Progress callback
                    if progress_callback:
                        progress = len(self.discovered_pages) / max_pages
                        progress_callback(min(progress, 1.0))

                    # Rate limiting
                    await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000)

                except Exception as e:
                    # Log the error but keep crawling
                    discovered = DiscoveredPage(
                        url=current_url,
                        title="",
                        page_type=PageType.OTHER,
                        depth=0,
                        selected=False,
                        error=str(e),
                    )
                    self.discovered_pages.append(discovered)

            return self.discovered_pages

    def get_pages_by_type(self) -> dict[PageType, list[DiscoveredPage]]:
        """Group discovered pages by type."""
        grouped: dict[PageType, list[DiscoveredPage]] = {}
        for page in self.discovered_pages:
            if page.page_type not in grouped:
                grouped[page.page_type] = []
            grouped[page.page_type].append(page)
        return grouped

    def get_suggested_pages(self, min_pages: Optional[int] = None) -> list[DiscoveredPage]:
        """
        Get suggested pages for extraction.

        Ensures diversity of page types and prioritizes key templates.
        """
        min_pages = min_pages or self.settings.crawl.min_pages

        # Priority order for page types
        priority_types = [
            PageType.HOMEPAGE,
            PageType.LISTING,
            PageType.DETAIL,
            PageType.FORM,
            PageType.MARKETING,
            PageType.AUTH,
            PageType.ABOUT,
            PageType.CONTACT,
            PageType.OTHER,
        ]

        selected = []
        grouped = self.get_pages_by_type()

        # First pass: take at least one of each priority type
        for page_type in priority_types:
            if page_type in grouped and grouped[page_type]:
                # Take the shallowest page of this type
                page = sorted(grouped[page_type], key=lambda p: p.depth)[0]
                if page not in selected:
                    selected.append(page)

        # Second pass: fill up to min_pages with the remaining pages
        remaining = [p for p in self.discovered_pages if p not in selected and not p.error]
        remaining.sort(key=lambda p: p.depth)

        while len(selected) < min_pages and remaining:
            selected.append(remaining.pop(0))

        # Mark as selected
        for page in selected:
            page.selected = True

        return selected

# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================

async def discover_pages(base_url: str, max_pages: int = 20) -> list[DiscoveredPage]:
    """Convenience function to discover pages."""
    discoverer = PageDiscoverer()
    return await discoverer.discover(base_url, max_pages)


async def quick_discover(base_url: str) -> dict:
    """Quick discovery returning a summary dict."""
    pages = await discover_pages(base_url)

    return {
        "total_found": len(pages),
        "by_type": {
            pt.value: len([p for p in pages if p.page_type == pt])
            for pt in PageType
        },
        "pages": [
            {
                "url": p.url,
                "title": p.title,
                "type": p.page_type.value,
                "selected": p.selected,
            }
            for p in pages
        ],
    }
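
For reference, a minimal driver for the crawler might look like the sketch below. The URL and the `asyncio.run` entry point are illustrative, not part of the commit:

    import asyncio
    from agents.crawler import PageDiscoverer

    async def main():
        discoverer = PageDiscoverer()
        # discover() manages its own browser lifecycle via `async with self`
        await discoverer.discover("https://example.com", max_pages=10)
        for page in discoverer.get_suggested_pages():
            print(page.page_type.value, page.url)

    asyncio.run(main())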
agents/extractor.py ADDED
@@ -0,0 +1,622 @@
"""
Agent 1: Token Extractor
Design System Extractor v2

Persona: Meticulous Design Archaeologist

Responsibilities:
- Crawl pages at the specified viewport
- Extract computed styles from all elements
- Collect colors, typography, spacing, radius, shadows
- Track frequency and context for each token
"""

import asyncio
from typing import Optional, Callable
from datetime import datetime

from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Playwright

from core.token_schema import (
    Viewport,
    ExtractedTokens,
    ColorToken,
    TypographyToken,
    SpacingToken,
    RadiusToken,
    ShadowToken,
    FontFamily,
    Confidence,
)
from core.color_utils import (
    parse_color,
    get_contrast_with_white,
    get_contrast_with_black,
    check_wcag_compliance,
)
from config.settings import get_settings

class TokenExtractor:
    """
    Extracts design tokens from web pages.

    This is the second part of Agent 1's job — after pages are confirmed,
    we crawl them and extract all CSS values.
    """

    def __init__(self, viewport: Viewport = Viewport.DESKTOP):
        self.settings = get_settings()
        self.viewport = viewport
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None

        # Token collection
        self.colors: dict[str, ColorToken] = {}
        self.typography: dict[str, TypographyToken] = {}
        self.spacing: dict[str, SpacingToken] = {}
        self.radius: dict[str, RadiusToken] = {}
        self.shadows: dict[str, ShadowToken] = {}

        # Font tracking
        self.font_families: dict[str, FontFamily] = {}

        # Statistics
        self.total_elements = 0
        self.errors: list[str] = []
        self.warnings: list[str] = []

    async def __aenter__(self):
        """Async context manager entry."""
        await self._init_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self._close_browser()

    async def _init_browser(self):
        """Initialize Playwright browser."""
        # Keep a handle to the Playwright driver so it can be stopped on exit.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=self.settings.browser.headless
        )

        # Set viewport based on extraction mode
        if self.viewport == Viewport.DESKTOP:
            width = self.settings.viewport.desktop_width
            height = self.settings.viewport.desktop_height
        else:
            width = self.settings.viewport.mobile_width
            height = self.settings.viewport.mobile_height

        self.context = await self.browser.new_context(
            viewport={"width": width, "height": height},
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        )

    async def _close_browser(self):
        """Close browser and clean up."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def _scroll_page(self, page: Page):
        """Scroll the page to trigger lazy-loaded content."""
        await page.evaluate("""
            async () => {
                const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                const height = document.body.scrollHeight;
                const step = window.innerHeight;

                for (let y = 0; y < height; y += step) {
                    window.scrollTo(0, y);
                    await delay(100);
                }

                // Scroll back to top
                window.scrollTo(0, 0);
            }
        """)

        # Wait for network idle after scrolling; pages that never settle
        # shouldn't abort extraction, so a timeout here is non-fatal.
        try:
            await page.wait_for_load_state("networkidle", timeout=self.settings.browser.network_idle_timeout)
        except Exception:
            self.warnings.append("Network did not go idle after scrolling; continuing")

    async def _extract_styles_from_page(self, page: Page) -> dict:
        """
        Extract computed styles from all elements on the page.

        This is the core extraction logic — getComputedStyle for every element.
        """
        styles_data = await page.evaluate("""
            () => {
                const elements = document.querySelectorAll('*');
                const results = {
                    colors: [],
                    typography: [],
                    spacing: [],
                    radius: [],
                    shadows: [],
                    elements_count: elements.length,
                };

                const colorProperties = [
                    'color', 'background-color', 'border-color',
                    'border-top-color', 'border-right-color',
                    'border-bottom-color', 'border-left-color',
                    'outline-color', 'text-decoration-color',
                ];

                const spacingProperties = [
                    'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
                    'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
                    'gap', 'row-gap', 'column-gap',
                ];

                elements.forEach(el => {
                    const tag = el.tagName.toLowerCase();
                    const styles = window.getComputedStyle(el);

                    // Skip invisible elements
                    if (styles.display === 'none' || styles.visibility === 'hidden') {
                        return;
                    }

                    // --- COLORS ---
                    colorProperties.forEach(prop => {
                        const value = styles.getPropertyValue(prop);
                        if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') {
                            results.colors.push({
                                value: value,
                                property: prop,
                                element: tag,
                                context: prop.includes('background') ? 'background' :
                                         prop.includes('border') ? 'border' : 'text',
                            });
                        }
                    });

                    // --- TYPOGRAPHY ---
                    const fontFamily = styles.getPropertyValue('font-family');
                    const fontSize = styles.getPropertyValue('font-size');
                    const fontWeight = styles.getPropertyValue('font-weight');
                    const lineHeight = styles.getPropertyValue('line-height');
                    const letterSpacing = styles.getPropertyValue('letter-spacing');

                    if (fontSize && fontFamily) {
                        results.typography.push({
                            fontFamily: fontFamily,
                            fontSize: fontSize,
                            fontWeight: fontWeight,
                            lineHeight: lineHeight,
                            letterSpacing: letterSpacing,
                            element: tag,
                        });
                    }

                    // --- SPACING ---
                    spacingProperties.forEach(prop => {
                        const value = styles.getPropertyValue(prop);
                        if (value && value !== '0px' && value !== 'auto' && value !== 'normal') {
                            const px = parseFloat(value);
                            if (!isNaN(px) && px > 0 && px < 500) {
                                results.spacing.push({
                                    value: value,
                                    valuePx: Math.round(px),
                                    property: prop,
                                    context: prop.includes('margin') ? 'margin' :
                                             prop.includes('padding') ? 'padding' : 'gap',
                                });
                            }
                        }
                    });

                    // --- BORDER RADIUS ---
                    const radiusProps = [
                        'border-radius', 'border-top-left-radius',
                        'border-top-right-radius', 'border-bottom-left-radius',
                        'border-bottom-right-radius',
                    ];

                    radiusProps.forEach(prop => {
                        const value = styles.getPropertyValue(prop);
                        if (value && value !== '0px') {
                            results.radius.push({
                                value: value,
                                element: tag,
                            });
                        }
                    });

                    // --- BOX SHADOW ---
                    const shadow = styles.getPropertyValue('box-shadow');
                    if (shadow && shadow !== 'none') {
                        results.shadows.push({
                            value: shadow,
                            element: tag,
                        });
                    }
                });

                return results;
            }
        """)

        return styles_data

    def _process_color(self, color_data: dict) -> Optional[str]:
        """Process and normalize a color value."""
        value = color_data.get("value", "")

        # Parse and normalize
        parsed = parse_color(value)
        if not parsed:
            return None

        return parsed.hex

    def _aggregate_colors(self, raw_colors: list[dict]):
        """Aggregate color data from extraction."""
        for color_data in raw_colors:
            hex_value = self._process_color(color_data)
            if not hex_value:
                continue

            if hex_value not in self.colors:
                # Calculate contrast ratios
                contrast_white = get_contrast_with_white(hex_value)
                contrast_black = get_contrast_with_black(hex_value)
                compliance = check_wcag_compliance(hex_value, "#ffffff")

                self.colors[hex_value] = ColorToken(
                    value=hex_value,
                    frequency=0,
                    contexts=[],
                    elements=[],
                    css_properties=[],
                    contrast_white=round(contrast_white, 2),
                    contrast_black=round(contrast_black, 2),
                    wcag_aa_large_text=compliance["aa_large_text"],
                    wcag_aa_small_text=compliance["aa_normal_text"],
                )

            # Update frequency and context
            token = self.colors[hex_value]
            token.frequency += 1

            context = color_data.get("context", "")
            if context and context not in token.contexts:
                token.contexts.append(context)

            element = color_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

            prop = color_data.get("property", "")
            if prop and prop not in token.css_properties:
                token.css_properties.append(prop)

    def _aggregate_typography(self, raw_typography: list[dict]):
        """Aggregate typography data from extraction."""
        for typo_data in raw_typography:
            # Create a unique key per size/weight/family combination
            font_family = typo_data.get("fontFamily", "")
            font_size = typo_data.get("fontSize", "")
            font_weight = typo_data.get("fontWeight", "400")
            line_height = typo_data.get("lineHeight", "normal")

            key = f"{font_size}|{font_weight}|{font_family[:50]}"

            if key not in self.typography:
                # Parse font size to px
                font_size_px = None
                if font_size.endswith("px"):
                    try:
                        font_size_px = float(font_size.replace("px", ""))
                    except ValueError:
                        pass

                # Parse line height (px values become a ratio of font size)
                line_height_computed = None
                if line_height and line_height != "normal":
                    if line_height.endswith("px") and font_size_px:
                        try:
                            lh_px = float(line_height.replace("px", ""))
                            line_height_computed = round(lh_px / font_size_px, 2)
                        except ValueError:
                            pass
                    else:
                        try:
                            line_height_computed = float(line_height)
                        except ValueError:
                            pass

                self.typography[key] = TypographyToken(
                    font_family=font_family.split(",")[0].strip().strip('"\''),
                    font_size=font_size,
                    font_size_px=font_size_px,
                    font_weight=int(font_weight) if font_weight.isdigit() else 400,
                    line_height=line_height,
                    line_height_computed=line_height_computed,
                    letter_spacing=typo_data.get("letterSpacing"),
                    frequency=0,
                    elements=[],
                )

            # Update frequency and elements
            token = self.typography[key]
            token.frequency += 1

            element = typo_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

            # Track font families
            primary_font = token.font_family
            if primary_font not in self.font_families:
                self.font_families[primary_font] = FontFamily(
                    name=primary_font,
                    fallbacks=[f.strip().strip('"\'') for f in font_family.split(",")[1:]],
                    frequency=0,
                )
            self.font_families[primary_font].frequency += 1

    def _aggregate_spacing(self, raw_spacing: list[dict]):
        """Aggregate spacing data from extraction."""
        for space_data in raw_spacing:
            value_px = space_data.get("valuePx", 0)

            key = str(value_px)

            if key not in self.spacing:
                self.spacing[key] = SpacingToken(
                    value=f"{value_px}px",
                    value_px=value_px,
                    frequency=0,
                    contexts=[],
                    properties=[],
                    fits_base_4=value_px % 4 == 0,
                    fits_base_8=value_px % 8 == 0,
                )

            token = self.spacing[key]
            token.frequency += 1

            context = space_data.get("context", "")
            if context and context not in token.contexts:
                token.contexts.append(context)

            prop = space_data.get("property", "")
            if prop and prop not in token.properties:
                token.properties.append(prop)

    def _aggregate_radius(self, raw_radius: list[dict]):
        """Aggregate border radius data."""
        for radius_data in raw_radius:
            value = radius_data.get("value", "")

            # Normalize uniform shorthand: "8px 8px 8px 8px" -> "8px"
            parts = value.split()
            if len(set(parts)) == 1:
                value = parts[0]

            if value not in self.radius:
                value_px = None
                if value.endswith("px"):
                    try:
                        value_px = int(float(value.replace("px", "")))
                    except ValueError:
                        pass

                self.radius[value] = RadiusToken(
                    value=value,
                    value_px=value_px,
                    frequency=0,
                    elements=[],
                    fits_base_4=value_px % 4 == 0 if value_px else False,
                    fits_base_8=value_px % 8 == 0 if value_px else False,
                )

            token = self.radius[value]
            token.frequency += 1

            element = radius_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

    def _aggregate_shadows(self, raw_shadows: list[dict]):
        """Aggregate box shadow data."""
        for shadow_data in raw_shadows:
            value = shadow_data.get("value", "")

            if value not in self.shadows:
                self.shadows[value] = ShadowToken(
                    value=value,
                    frequency=0,
                    elements=[],
                )

            token = self.shadows[value]
            token.frequency += 1

            element = shadow_data.get("element", "")
            if element and element not in token.elements:
                token.elements.append(element)

    def _calculate_confidence(self, frequency: int) -> Confidence:
        """Calculate confidence level based on frequency."""
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        return Confidence.LOW

    def _detect_spacing_base(self) -> Optional[int]:
        """Detect the base spacing unit (4 or 8)."""
        fits_4 = sum(1 for s in self.spacing.values() if s.fits_base_4)
        fits_8 = sum(1 for s in self.spacing.values() if s.fits_base_8)

        total = len(self.spacing)
        if total == 0:
            return None

        # Check base 8 first: every multiple of 8 also fits base 4,
        # so the stricter grid must win when both pass the threshold.
        if fits_8 / total >= 0.8:
            return 8
        # Otherwise fall back to base 4 at the same 80% threshold
        elif fits_4 / total >= 0.8:
            return 4

        return None

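    # Worked example with hypothetical spacing values {4, 8, 12, 16, 24}:
    #   all 5 fit base 4 (5/5 = 1.0 >= 0.8), but only 8, 16, 24 fit base 8
    #   (3/5 = 0.6 < 0.8), so the detected base is 4.
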
    async def extract(
        self,
        pages: list[str],
        progress_callback: Optional[Callable[[float], None]] = None
    ) -> ExtractedTokens:
        """
        Extract tokens from a list of pages.

        Args:
            pages: List of URLs to crawl
            progress_callback: Optional callback for progress updates

        Returns:
            ExtractedTokens with all discovered tokens
        """
        start_time = datetime.now()
        pages_crawled = []

        async with self:
            for i, url in enumerate(pages):
                try:
                    page = await self.context.new_page()
                    try:
                        # Navigate with fallback strategy
                        try:
                            await page.goto(
                                url,
                                wait_until="domcontentloaded",
                                timeout=60000  # 60 seconds
                            )
                            # Give client-side JS a moment to render
                            await page.wait_for_timeout(2000)
                        except Exception:
                            # Fall back to the 'load' event
                            try:
                                await page.goto(
                                    url,
                                    wait_until="load",
                                    timeout=60000
                                )
                                await page.wait_for_timeout(3000)
                            except Exception:
                                self.warnings.append(f"Slow load for {url}, extracting partial content")

                        # Scroll to load lazy content
                        await self._scroll_page(page)

                        # Extract styles
                        styles = await self._extract_styles_from_page(page)

                        # Aggregate
                        self._aggregate_colors(styles.get("colors", []))
                        self._aggregate_typography(styles.get("typography", []))
                        self._aggregate_spacing(styles.get("spacing", []))
                        self._aggregate_radius(styles.get("radius", []))
                        self._aggregate_shadows(styles.get("shadows", []))

                        self.total_elements += styles.get("elements_count", 0)
                        pages_crawled.append(url)
                    finally:
                        # Always release the page, even if extraction failed
                        await page.close()

                    # Progress callback
                    if progress_callback:
                        progress_callback((i + 1) / len(pages))

                    # Rate limiting
                    await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000)

                except Exception as e:
                    self.errors.append(f"Error extracting {url}: {str(e)}")

        # Calculate confidence for all tokens
        for token in self.colors.values():
            token.confidence = self._calculate_confidence(token.frequency)
        for token in self.typography.values():
            token.confidence = self._calculate_confidence(token.frequency)
        for token in self.spacing.values():
            token.confidence = self._calculate_confidence(token.frequency)

        # Detect spacing base
        spacing_base = self._detect_spacing_base()

        # Mark outliers in spacing
        if spacing_base:
            for token in self.spacing.values():
                if spacing_base == 8 and not token.fits_base_8:
                    token.is_outlier = True
                elif spacing_base == 4 and not token.fits_base_4:
                    token.is_outlier = True

        # Determine primary font
        if self.font_families:
            primary_font = max(self.font_families.values(), key=lambda f: f.frequency)
            primary_font.usage = "primary"

        # Build result
        end_time = datetime.now()
        duration_ms = int((end_time - start_time).total_seconds() * 1000)

        return ExtractedTokens(
            viewport=self.viewport,
            source_url=pages[0] if pages else "",
            pages_crawled=pages_crawled,
            colors=list(self.colors.values()),
            typography=list(self.typography.values()),
            spacing=list(self.spacing.values()),
            radius=list(self.radius.values()),
            shadows=list(self.shadows.values()),
            font_families=list(self.font_families.values()),
            spacing_base=spacing_base,
            extraction_timestamp=start_time,
            extraction_duration_ms=duration_ms,
            total_elements_analyzed=self.total_elements,
            unique_colors=len(self.colors),
            unique_font_sizes=len(set(t.font_size for t in self.typography.values())),
            unique_spacing_values=len(self.spacing),
            errors=self.errors,
            warnings=self.warnings,
        )

# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================

async def extract_from_pages(
    pages: list[str],
    viewport: Viewport = Viewport.DESKTOP
) -> ExtractedTokens:
    """Convenience function to extract tokens from pages."""
    extractor = TokenExtractor(viewport=viewport)
    return await extractor.extract(pages)


async def extract_both_viewports(pages: list[str]) -> tuple[ExtractedTokens, ExtractedTokens]:
    """Extract tokens from both desktop and mobile viewports."""
    desktop_extractor = TokenExtractor(viewport=Viewport.DESKTOP)
    mobile_extractor = TokenExtractor(viewport=Viewport.MOBILE)

    desktop_result = await desktop_extractor.extract(pages)
    mobile_result = await mobile_extractor.extract(pages)

    return desktop_result, mobile_result
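
End to end, the two agents are presumably composed like the sketch below; the URL, the filtering on `selected`/`error`, and the `asyncio.run` wrapper are illustrative assumptions, not part of the commit:

    import asyncio
    from agents.crawler import discover_pages
    from agents.extractor import extract_from_pages

    async def main():
        # Discover candidate pages, then extract tokens from the kept ones
        pages = await discover_pages("https://example.com", max_pages=10)
        urls = [p.url for p in pages if p.selected and not p.error]
        tokens = await extract_from_pages(urls)
        print(tokens.unique_colors, "unique colors; spacing base:", tokens.spacing_base)

    asyncio.run(main())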