riazmo committed on
Commit
e486cfa
·
verified ·
1 Parent(s): 10a2a39

Delete agents/extractor.py

Browse files
Files changed (1) hide show
  1. agents/extractor.py +0 -622
agents/extractor.py DELETED
@@ -1,622 +0,0 @@
1
- """
2
- Agent 1: Token Extractor
3
- Design System Extractor v2
4
-
5
- Persona: Meticulous Design Archaeologist
6
-
7
- Responsibilities:
8
- - Crawl pages at specified viewport
9
- - Extract computed styles from all elements
10
- - Collect colors, typography, spacing, radius, shadows
11
- - Track frequency and context for each token
12
- """
13
-
14
- import asyncio
15
- import re
16
- from typing import Optional, Callable
17
- from datetime import datetime
18
- from collections import defaultdict
19
-
20
- from playwright.async_api import async_playwright, Browser, Page, BrowserContext
21
-
22
- from core.token_schema import (
23
- Viewport,
24
- ExtractedTokens,
25
- ColorToken,
26
- TypographyToken,
27
- SpacingToken,
28
- RadiusToken,
29
- ShadowToken,
30
- FontFamily,
31
- TokenSource,
32
- Confidence,
33
- )
34
- from core.color_utils import (
35
- normalize_hex,
36
- parse_color,
37
- get_contrast_with_white,
38
- get_contrast_with_black,
39
- check_wcag_compliance,
40
- )
41
- from config.settings import get_settings
42
-
43
-
44
class TokenExtractor:
    """
    Extracts design tokens from web pages.

    This is the second part of Agent 1's job — after pages are confirmed,
    we crawl and extract all CSS values.
    """

    def __init__(self, viewport: Viewport = Viewport.DESKTOP):
        """Set up an extractor for one viewport with empty token stores."""
        self.settings = get_settings()
        self.viewport = viewport

        # Browser handles; created lazily by the async context manager.
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None

        # Aggregated tokens, each keyed by a normalized value string.
        self.colors: dict[str, ColorToken] = {}
        self.typography: dict[str, TypographyToken] = {}
        self.spacing: dict[str, SpacingToken] = {}
        self.radius: dict[str, RadiusToken] = {}
        self.shadows: dict[str, ShadowToken] = {}

        # Usage counts per primary font family.
        self.font_families: dict[str, FontFamily] = {}

        # Run statistics and diagnostics collected during extraction.
        self.total_elements = 0
        self.errors: list[str] = []
        self.warnings: list[str] = []
73
- async def __aenter__(self):
74
- """Async context manager entry."""
75
- await self._init_browser()
76
- return self
77
-
78
- async def __aexit__(self, exc_type, exc_val, exc_tb):
79
- """Async context manager exit."""
80
- await self._close_browser()
81
-
82
- async def _init_browser(self):
83
- """Initialize Playwright browser."""
84
- playwright = await async_playwright().start()
85
- self.browser = await playwright.chromium.launch(
86
- headless=self.settings.browser.headless
87
- )
88
-
89
- # Set viewport based on extraction mode
90
- if self.viewport == Viewport.DESKTOP:
91
- width = self.settings.viewport.desktop_width
92
- height = self.settings.viewport.desktop_height
93
- else:
94
- width = self.settings.viewport.mobile_width
95
- height = self.settings.viewport.mobile_height
96
-
97
- self.context = await self.browser.new_context(
98
- viewport={"width": width, "height": height},
99
- user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
100
- )
101
-
102
- async def _close_browser(self):
103
- """Close browser and cleanup."""
104
- if self.context:
105
- await self.context.close()
106
- if self.browser:
107
- await self.browser.close()
108
-
109
- async def _scroll_page(self, page: Page):
110
- """Scroll page to load lazy content."""
111
- await page.evaluate("""
112
- async () => {
113
- const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
114
- const height = document.body.scrollHeight;
115
- const step = window.innerHeight;
116
-
117
- for (let y = 0; y < height; y += step) {
118
- window.scrollTo(0, y);
119
- await delay(100);
120
- }
121
-
122
- // Scroll back to top
123
- window.scrollTo(0, 0);
124
- }
125
- """)
126
-
127
- # Wait for network idle after scrolling
128
- await page.wait_for_load_state("networkidle", timeout=self.settings.browser.network_idle_timeout)
129
-
130
- async def _extract_styles_from_page(self, page: Page) -> dict:
131
- """
132
- Extract computed styles from all elements on the page.
133
-
134
- This is the core extraction logic — we get getComputedStyle for every element.
135
- """
136
- styles_data = await page.evaluate("""
137
- () => {
138
- const elements = document.querySelectorAll('*');
139
- const results = {
140
- colors: [],
141
- typography: [],
142
- spacing: [],
143
- radius: [],
144
- shadows: [],
145
- elements_count: elements.length,
146
- };
147
-
148
- const colorProperties = [
149
- 'color', 'background-color', 'border-color',
150
- 'border-top-color', 'border-right-color',
151
- 'border-bottom-color', 'border-left-color',
152
- 'outline-color', 'text-decoration-color',
153
- ];
154
-
155
- const spacingProperties = [
156
- 'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
157
- 'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
158
- 'gap', 'row-gap', 'column-gap',
159
- ];
160
-
161
- elements.forEach(el => {
162
- const tag = el.tagName.toLowerCase();
163
- const styles = window.getComputedStyle(el);
164
-
165
- // Skip invisible elements
166
- if (styles.display === 'none' || styles.visibility === 'hidden') {
167
- return;
168
- }
169
-
170
- // --- COLORS ---
171
- colorProperties.forEach(prop => {
172
- const value = styles.getPropertyValue(prop);
173
- if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') {
174
- results.colors.push({
175
- value: value,
176
- property: prop,
177
- element: tag,
178
- context: prop.includes('background') ? 'background' :
179
- prop.includes('border') ? 'border' : 'text',
180
- });
181
- }
182
- });
183
-
184
- // --- TYPOGRAPHY ---
185
- const fontFamily = styles.getPropertyValue('font-family');
186
- const fontSize = styles.getPropertyValue('font-size');
187
- const fontWeight = styles.getPropertyValue('font-weight');
188
- const lineHeight = styles.getPropertyValue('line-height');
189
- const letterSpacing = styles.getPropertyValue('letter-spacing');
190
-
191
- if (fontSize && fontFamily) {
192
- results.typography.push({
193
- fontFamily: fontFamily,
194
- fontSize: fontSize,
195
- fontWeight: fontWeight,
196
- lineHeight: lineHeight,
197
- letterSpacing: letterSpacing,
198
- element: tag,
199
- });
200
- }
201
-
202
- // --- SPACING ---
203
- spacingProperties.forEach(prop => {
204
- const value = styles.getPropertyValue(prop);
205
- if (value && value !== '0px' && value !== 'auto' && value !== 'normal') {
206
- const px = parseFloat(value);
207
- if (!isNaN(px) && px > 0 && px < 500) {
208
- results.spacing.push({
209
- value: value,
210
- valuePx: Math.round(px),
211
- property: prop,
212
- context: prop.includes('margin') ? 'margin' :
213
- prop.includes('padding') ? 'padding' : 'gap',
214
- });
215
- }
216
- }
217
- });
218
-
219
- // --- BORDER RADIUS ---
220
- const radiusProps = [
221
- 'border-radius', 'border-top-left-radius',
222
- 'border-top-right-radius', 'border-bottom-left-radius',
223
- 'border-bottom-right-radius',
224
- ];
225
-
226
- radiusProps.forEach(prop => {
227
- const value = styles.getPropertyValue(prop);
228
- if (value && value !== '0px') {
229
- results.radius.push({
230
- value: value,
231
- element: tag,
232
- });
233
- }
234
- });
235
-
236
- // --- BOX SHADOW ---
237
- const shadow = styles.getPropertyValue('box-shadow');
238
- if (shadow && shadow !== 'none') {
239
- results.shadows.push({
240
- value: shadow,
241
- element: tag,
242
- });
243
- }
244
- });
245
-
246
- return results;
247
- }
248
- """)
249
-
250
- return styles_data
251
-
252
- def _process_color(self, color_data: dict) -> Optional[str]:
253
- """Process and normalize a color value."""
254
- value = color_data.get("value", "")
255
-
256
- # Parse and normalize
257
- parsed = parse_color(value)
258
- if not parsed:
259
- return None
260
-
261
- return parsed.hex
262
-
263
- def _aggregate_colors(self, raw_colors: list[dict]):
264
- """Aggregate color data from extraction."""
265
- for color_data in raw_colors:
266
- hex_value = self._process_color(color_data)
267
- if not hex_value:
268
- continue
269
-
270
- if hex_value not in self.colors:
271
- # Calculate contrast ratios
272
- contrast_white = get_contrast_with_white(hex_value)
273
- contrast_black = get_contrast_with_black(hex_value)
274
- compliance = check_wcag_compliance(hex_value, "#ffffff")
275
-
276
- self.colors[hex_value] = ColorToken(
277
- value=hex_value,
278
- frequency=0,
279
- contexts=[],
280
- elements=[],
281
- css_properties=[],
282
- contrast_white=round(contrast_white, 2),
283
- contrast_black=round(contrast_black, 2),
284
- wcag_aa_large_text=compliance["aa_large_text"],
285
- wcag_aa_small_text=compliance["aa_normal_text"],
286
- )
287
-
288
- # Update frequency and context
289
- token = self.colors[hex_value]
290
- token.frequency += 1
291
-
292
- context = color_data.get("context", "")
293
- if context and context not in token.contexts:
294
- token.contexts.append(context)
295
-
296
- element = color_data.get("element", "")
297
- if element and element not in token.elements:
298
- token.elements.append(element)
299
-
300
- prop = color_data.get("property", "")
301
- if prop and prop not in token.css_properties:
302
- token.css_properties.append(prop)
303
-
304
- def _aggregate_typography(self, raw_typography: list[dict]):
305
- """Aggregate typography data from extraction."""
306
- for typo_data in raw_typography:
307
- # Create unique key
308
- font_family = typo_data.get("fontFamily", "")
309
- font_size = typo_data.get("fontSize", "")
310
- font_weight = typo_data.get("fontWeight", "400")
311
- line_height = typo_data.get("lineHeight", "normal")
312
-
313
- key = f"{font_size}|{font_weight}|{font_family[:50]}"
314
-
315
- if key not in self.typography:
316
- # Parse font size to px
317
- font_size_px = None
318
- if font_size.endswith("px"):
319
- try:
320
- font_size_px = float(font_size.replace("px", ""))
321
- except ValueError:
322
- pass
323
-
324
- # Parse line height
325
- line_height_computed = None
326
- if line_height and line_height != "normal":
327
- if line_height.endswith("px") and font_size_px:
328
- try:
329
- lh_px = float(line_height.replace("px", ""))
330
- line_height_computed = round(lh_px / font_size_px, 2)
331
- except ValueError:
332
- pass
333
- else:
334
- try:
335
- line_height_computed = float(line_height)
336
- except ValueError:
337
- pass
338
-
339
- self.typography[key] = TypographyToken(
340
- font_family=font_family.split(",")[0].strip().strip('"\''),
341
- font_size=font_size,
342
- font_size_px=font_size_px,
343
- font_weight=int(font_weight) if font_weight.isdigit() else 400,
344
- line_height=line_height,
345
- line_height_computed=line_height_computed,
346
- letter_spacing=typo_data.get("letterSpacing"),
347
- frequency=0,
348
- elements=[],
349
- )
350
-
351
- # Update
352
- token = self.typography[key]
353
- token.frequency += 1
354
-
355
- element = typo_data.get("element", "")
356
- if element and element not in token.elements:
357
- token.elements.append(element)
358
-
359
- # Track font families
360
- primary_font = token.font_family
361
- if primary_font not in self.font_families:
362
- self.font_families[primary_font] = FontFamily(
363
- name=primary_font,
364
- fallbacks=[f.strip().strip('"\'') for f in font_family.split(",")[1:]],
365
- frequency=0,
366
- )
367
- self.font_families[primary_font].frequency += 1
368
-
369
- def _aggregate_spacing(self, raw_spacing: list[dict]):
370
- """Aggregate spacing data from extraction."""
371
- for space_data in raw_spacing:
372
- value = space_data.get("value", "")
373
- value_px = space_data.get("valuePx", 0)
374
-
375
- key = str(value_px)
376
-
377
- if key not in self.spacing:
378
- self.spacing[key] = SpacingToken(
379
- value=f"{value_px}px",
380
- value_px=value_px,
381
- frequency=0,
382
- contexts=[],
383
- properties=[],
384
- fits_base_4=value_px % 4 == 0,
385
- fits_base_8=value_px % 8 == 0,
386
- )
387
-
388
- token = self.spacing[key]
389
- token.frequency += 1
390
-
391
- context = space_data.get("context", "")
392
- if context and context not in token.contexts:
393
- token.contexts.append(context)
394
-
395
- prop = space_data.get("property", "")
396
- if prop and prop not in token.properties:
397
- token.properties.append(prop)
398
-
399
- def _aggregate_radius(self, raw_radius: list[dict]):
400
- """Aggregate border radius data."""
401
- for radius_data in raw_radius:
402
- value = radius_data.get("value", "")
403
-
404
- # Normalize to simple format
405
- # "8px 8px 8px 8px" -> "8px"
406
- parts = value.split()
407
- if len(set(parts)) == 1:
408
- value = parts[0]
409
-
410
- if value not in self.radius:
411
- value_px = None
412
- if value.endswith("px"):
413
- try:
414
- value_px = int(float(value.replace("px", "")))
415
- except ValueError:
416
- pass
417
-
418
- self.radius[value] = RadiusToken(
419
- value=value,
420
- value_px=value_px,
421
- frequency=0,
422
- elements=[],
423
- fits_base_4=value_px % 4 == 0 if value_px else False,
424
- fits_base_8=value_px % 8 == 0 if value_px else False,
425
- )
426
-
427
- token = self.radius[value]
428
- token.frequency += 1
429
-
430
- element = radius_data.get("element", "")
431
- if element and element not in token.elements:
432
- token.elements.append(element)
433
-
434
- def _aggregate_shadows(self, raw_shadows: list[dict]):
435
- """Aggregate box shadow data."""
436
- for shadow_data in raw_shadows:
437
- value = shadow_data.get("value", "")
438
-
439
- if value not in self.shadows:
440
- self.shadows[value] = ShadowToken(
441
- value=value,
442
- frequency=0,
443
- elements=[],
444
- )
445
-
446
- token = self.shadows[value]
447
- token.frequency += 1
448
-
449
- element = shadow_data.get("element", "")
450
- if element and element not in token.elements:
451
- token.elements.append(element)
452
-
453
- def _calculate_confidence(self, frequency: int) -> Confidence:
454
- """Calculate confidence level based on frequency."""
455
- if frequency >= 10:
456
- return Confidence.HIGH
457
- elif frequency >= 3:
458
- return Confidence.MEDIUM
459
- return Confidence.LOW
460
-
461
- def _detect_spacing_base(self) -> Optional[int]:
462
- """Detect the base spacing unit (4 or 8)."""
463
- fits_4 = sum(1 for s in self.spacing.values() if s.fits_base_4)
464
- fits_8 = sum(1 for s in self.spacing.values() if s.fits_base_8)
465
-
466
- total = len(self.spacing)
467
- if total == 0:
468
- return None
469
-
470
- # If 80%+ values fit base 8, use 8
471
- if fits_8 / total >= 0.8:
472
- return 8
473
- # If 80%+ values fit base 4, use 4
474
- elif fits_4 / total >= 0.8:
475
- return 4
476
-
477
- return None
478
-
479
- async def extract(
480
- self,
481
- pages: list[str],
482
- progress_callback: Optional[Callable[[float], None]] = None
483
- ) -> ExtractedTokens:
484
- """
485
- Extract tokens from a list of pages.
486
-
487
- Args:
488
- pages: List of URLs to crawl
489
- progress_callback: Optional callback for progress updates
490
-
491
- Returns:
492
- ExtractedTokens with all discovered tokens
493
- """
494
- start_time = datetime.now()
495
- pages_crawled = []
496
-
497
- async with self:
498
- for i, url in enumerate(pages):
499
- try:
500
- page = await self.context.new_page()
501
-
502
- # Navigate with fallback strategy
503
- try:
504
- await page.goto(
505
- url,
506
- wait_until="domcontentloaded",
507
- timeout=60000 # 60 seconds
508
- )
509
- # Wait for JS to render
510
- await page.wait_for_timeout(2000)
511
- except Exception as nav_error:
512
- # Fallback to load event
513
- try:
514
- await page.goto(
515
- url,
516
- wait_until="load",
517
- timeout=60000
518
- )
519
- await page.wait_for_timeout(3000)
520
- except Exception:
521
- self.warnings.append(f"Slow load for {url}, extracting partial content")
522
-
523
- # Scroll to load lazy content
524
- await self._scroll_page(page)
525
-
526
- # Extract styles
527
- styles = await self._extract_styles_from_page(page)
528
-
529
- # Aggregate
530
- self._aggregate_colors(styles.get("colors", []))
531
- self._aggregate_typography(styles.get("typography", []))
532
- self._aggregate_spacing(styles.get("spacing", []))
533
- self._aggregate_radius(styles.get("radius", []))
534
- self._aggregate_shadows(styles.get("shadows", []))
535
-
536
- self.total_elements += styles.get("elements_count", 0)
537
- pages_crawled.append(url)
538
-
539
- await page.close()
540
-
541
- # Progress callback
542
- if progress_callback:
543
- progress_callback((i + 1) / len(pages))
544
-
545
- # Rate limiting
546
- await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000)
547
-
548
- except Exception as e:
549
- self.errors.append(f"Error extracting {url}: {str(e)}")
550
-
551
- # Calculate confidence for all tokens
552
- for token in self.colors.values():
553
- token.confidence = self._calculate_confidence(token.frequency)
554
- for token in self.typography.values():
555
- token.confidence = self._calculate_confidence(token.frequency)
556
- for token in self.spacing.values():
557
- token.confidence = self._calculate_confidence(token.frequency)
558
-
559
- # Detect spacing base
560
- spacing_base = self._detect_spacing_base()
561
-
562
- # Mark outliers in spacing
563
- if spacing_base:
564
- for token in self.spacing.values():
565
- if spacing_base == 8 and not token.fits_base_8:
566
- token.is_outlier = True
567
- elif spacing_base == 4 and not token.fits_base_4:
568
- token.is_outlier = True
569
-
570
- # Determine primary font
571
- if self.font_families:
572
- primary_font = max(self.font_families.values(), key=lambda f: f.frequency)
573
- primary_font.usage = "primary"
574
-
575
- # Build result
576
- end_time = datetime.now()
577
- duration_ms = int((end_time - start_time).total_seconds() * 1000)
578
-
579
- return ExtractedTokens(
580
- viewport=self.viewport,
581
- source_url=pages[0] if pages else "",
582
- pages_crawled=pages_crawled,
583
- colors=list(self.colors.values()),
584
- typography=list(self.typography.values()),
585
- spacing=list(self.spacing.values()),
586
- radius=list(self.radius.values()),
587
- shadows=list(self.shadows.values()),
588
- font_families=list(self.font_families.values()),
589
- spacing_base=spacing_base,
590
- extraction_timestamp=start_time,
591
- extraction_duration_ms=duration_ms,
592
- total_elements_analyzed=self.total_elements,
593
- unique_colors=len(self.colors),
594
- unique_font_sizes=len(set(t.font_size for t in self.typography.values())),
595
- unique_spacing_values=len(self.spacing),
596
- errors=self.errors,
597
- warnings=self.warnings,
598
- )
599
-
600
-
601
- # =============================================================================
602
- # CONVENIENCE FUNCTIONS
603
- # =============================================================================
604
-
605
async def extract_from_pages(
    pages: list[str],
    viewport: Viewport = Viewport.DESKTOP
) -> ExtractedTokens:
    """Convenience wrapper: run a one-shot token extraction over *pages*."""
    return await TokenExtractor(viewport=viewport).extract(pages)
612
-
613
-
614
async def extract_both_viewports(pages: list[str]) -> tuple[ExtractedTokens, ExtractedTokens]:
    """Extract tokens from both desktop and mobile viewports (sequentially)."""
    results = []
    for vp in (Viewport.DESKTOP, Viewport.MOBILE):
        extractor = TokenExtractor(viewport=vp)
        results.append(await extractor.extract(pages))
    return results[0], results[1]