riazmo commited on
Commit
b40e625
·
verified ·
1 Parent(s): c4b9fba

Upload extractor.py

Browse files
Files changed (1) hide show
  1. agents/extractor.py +163 -1
agents/extractor.py CHANGED
@@ -443,6 +443,8 @@ class TokenExtractor:
443
 
444
  This catches colors defined in CSS rules that may not be
445
  currently applied to visible elements.
 
 
446
  """
447
  css_colors = await page.evaluate("""
448
  () => {
@@ -493,6 +495,146 @@ class TokenExtractor:
493
 
494
  return css_colors
495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  def _process_css_variables(self, css_vars: dict):
497
  """Process CSS variables and extract color tokens from them."""
498
  computed = css_vars.get("computed", {})
@@ -530,7 +672,7 @@ class TokenExtractor:
530
  contrast_black=round(contrast_black, 2),
531
  wcag_aa_large_text=compliance["aa_large_text"],
532
  wcag_aa_small_text=compliance["aa_normal_text"],
533
- source=TokenSource.CSS_VARIABLE,
534
  confidence=Confidence.HIGH,
535
  )
536
  elif hex_value and hex_value in self.colors:
@@ -879,6 +1021,24 @@ class TokenExtractor:
879
  except Exception as e:
880
  self.warnings.append(f"Stylesheet color extraction failed: {str(e)}")
881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
882
  # =========================================================
883
  # Log extraction results for this page
884
  # =========================================================
@@ -896,6 +1056,8 @@ class TokenExtractor:
896
  "svg_colors": svg_color_count,
897
  "inline_colors": inline_color_count,
898
  "stylesheet_colors": stylesheet_color_count,
 
 
899
  "new_colors": colors_new,
900
  "new_typography": typo_new,
901
  "new_spacing": spacing_new,
 
443
 
444
  This catches colors defined in CSS rules that may not be
445
  currently applied to visible elements.
446
+
447
+ Also fetches external stylesheets that may be CORS-blocked.
448
  """
449
  css_colors = await page.evaluate("""
450
  () => {
 
495
 
496
  return css_colors
497
 
498
async def _fetch_external_css_colors(
    self,
    page: Page,
    *,
    max_files: int = 10,
    timeout_ms: int = 5000,
) -> list[dict]:
    """
    Fetch linked stylesheets directly and scan their text for color values.

    Stylesheets referenced by ``<link rel="stylesheet">`` are downloaded
    through ``page.request`` rather than read via the in-page CSSOM, so
    their contents are available even when the browser refuses to expose
    cross-origin rules to page scripts.

    Args:
        page: The page whose linked stylesheets should be scanned.
        max_files: Maximum number of stylesheet URLs to download
            (the first ``max_files`` in document order).
        timeout_ms: Per-request timeout in milliseconds.

    Returns:
        One dict per color occurrence, with the keys ``value``,
        ``property``, ``element`` and ``context``. Occurrences are not
        de-duplicated.

    Failures never propagate: a stylesheet that cannot be fetched is
    skipped and noted in ``self.warnings``; a failure while collecting
    the URLs is noted there as well and yields an empty list.
    """
    colors: list[dict] = []

    # Hex colors are only valid with 3, 4, 6 or 8 digits. The trailing \b
    # rejects 5/7-digit runs and prefixes of longer identifiers (for
    # example "#feedbac" inside the selector "#feedback").
    color_regex = re.compile(
        r'#(?:[0-9a-f]{8}|[0-9a-f]{6}|[0-9a-f]{3,4})\b'
        r'|(?:rgb|hsl)a?\([^)]+\)',
        re.IGNORECASE,
    )

    try:
        # Collect the absolute URL of every linked stylesheet.
        css_urls = await page.evaluate("""
            () => {
                const links = document.querySelectorAll('link[rel="stylesheet"]');
                return Array.from(links, link => link.href).filter(Boolean);
            }
        """)

        for css_url in css_urls[:max_files]:
            try:
                response = await page.request.get(css_url, timeout=timeout_ms)
                if not response.ok:
                    continue
                css_text = await response.text()
            except Exception as e:
                # Best effort: one unreachable stylesheet must not abort
                # the scan, but the failure should be visible afterwards.
                self.warnings.append(
                    f"External CSS fetch failed for {css_url}: {str(e)}"
                )
                continue

            colors.extend(
                {
                    "value": match,
                    "property": "external-css",
                    "element": "css-file",
                    "context": "external-stylesheet",
                }
                for match in color_regex.findall(css_text)
            )

    except Exception as e:
        self.warnings.append(f"External CSS fetch failed: {str(e)}")

    return colors
548
+
549
async def _extract_all_page_colors(self, page: Page) -> list[dict]:
    """
    Brute-force scan of the live DOM for literal color values.

    The scan runs inside the page and looks at, in this order:

    1. the text of every ``<style>`` element,
    2. for every element, its ``data-*`` and ``style`` attributes,
       followed by its class string,
    3. the text of every inline ``<script>`` (scripts with a ``src``
       are skipped).

    Returns:
        One dict per color occurrence, with the keys ``value``,
        ``property``, ``element`` and ``context``. Occurrences are not
        de-duplicated.
    """
    # NOTE: this is a regular (non-raw) Python string, so every backslash
    # meant for the JavaScript regex is doubled.
    return await page.evaluate("""
        () => {
            const colors = [];
            // Hex colors are only valid with 3, 4, 6 or 8 digits; the
            // trailing \\b rejects 5/7-digit runs and prefixes of longer
            // identifiers.
            const colorRegex = /#(?:[0-9a-f]{8}|[0-9a-f]{6}|[0-9a-f]{3,4})\\b|(?:rgb|hsl)a?\\([^)]+\\)/gi;

            // Record every color literal found in `text` under the given labels.
            const collect = (text, property, element, context) => {
                if (!text) {
                    return;
                }
                const matches = text.match(colorRegex);
                if (!matches) {
                    return;
                }
                matches.forEach(color => {
                    colors.push({
                        value: color,
                        property: property,
                        element: element,
                        context: context,
                    });
                });
            };

            // 1. Scan all <style> tags
            document.querySelectorAll('style').forEach(style => {
                collect(style.textContent, 'style-tag', 'style', 'style-block');
            });

            // 2. Scan attributes and class names of every element
            document.querySelectorAll('*').forEach(el => {
                const tag = el.tagName.toLowerCase();

                for (const attr of el.attributes) {
                    if (attr.name.startsWith('data-') || attr.name === 'style') {
                        collect(attr.value, attr.name, tag, 'attribute');
                    }
                }

                // className is not a plain string on SVG elements, hence the type check
                if (typeof el.className === 'string') {
                    collect(el.className, 'class', tag, 'class-name');
                }
            });

            // 3. Look for colors in inline script tags (config objects)
            document.querySelectorAll('script').forEach(script => {
                if (script.textContent && !script.src) {
                    collect(script.textContent, 'script', 'script', 'javascript');
                }
            });

            return colors;
        }
    """)
637
+
638
  def _process_css_variables(self, css_vars: dict):
639
  """Process CSS variables and extract color tokens from them."""
640
  computed = css_vars.get("computed", {})
 
672
  contrast_black=round(contrast_black, 2),
673
  wcag_aa_large_text=compliance["aa_large_text"],
674
  wcag_aa_small_text=compliance["aa_normal_text"],
675
+ source=TokenSource.DETECTED, # CSS variable is still "detected"
676
  confidence=Confidence.HIGH,
677
  )
678
  elif hex_value and hex_value in self.colors:
 
1021
  except Exception as e:
1022
  self.warnings.append(f"Stylesheet color extraction failed: {str(e)}")
1023
 
1024
+ # 6. Fetch external CSS files (bypass CORS)
1025
+ external_css_count = 0
1026
+ try:
1027
+ external_colors = await self._fetch_external_css_colors(page)
1028
+ external_css_count = len(external_colors)
1029
+ self._aggregate_colors(external_colors)
1030
+ except Exception as e:
1031
+ self.warnings.append(f"External CSS fetch failed: {str(e)}")
1032
+
1033
+ # 7. Brute-force scan all page content for colors
1034
+ page_scan_count = 0
1035
+ try:
1036
+ page_colors = await self._extract_all_page_colors(page)
1037
+ page_scan_count = len(page_colors)
1038
+ self._aggregate_colors(page_colors)
1039
+ except Exception as e:
1040
+ self.warnings.append(f"Page scan failed: {str(e)}")
1041
+
1042
  # =========================================================
1043
  # Log extraction results for this page
1044
  # =========================================================
 
1056
  "svg_colors": svg_color_count,
1057
  "inline_colors": inline_color_count,
1058
  "stylesheet_colors": stylesheet_color_count,
1059
+ "external_css_colors": external_css_count,
1060
+ "page_scan_colors": page_scan_count,
1061
  "new_colors": colors_new,
1062
  "new_typography": typo_new,
1063
  "new_spacing": spacing_new,