Upload extractor.py
Browse files- agents/extractor.py +163 -1
agents/extractor.py
CHANGED
|
@@ -443,6 +443,8 @@ class TokenExtractor:
|
|
| 443 |
|
| 444 |
This catches colors defined in CSS rules that may not be
|
| 445 |
currently applied to visible elements.
|
|
|
|
|
|
|
| 446 |
"""
|
| 447 |
css_colors = await page.evaluate("""
|
| 448 |
() => {
|
|
@@ -493,6 +495,146 @@ class TokenExtractor:
|
|
| 493 |
|
| 494 |
return css_colors
|
| 495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
def _process_css_variables(self, css_vars: dict):
|
| 497 |
"""Process CSS variables and extract color tokens from them."""
|
| 498 |
computed = css_vars.get("computed", {})
|
|
@@ -530,7 +672,7 @@ class TokenExtractor:
|
|
| 530 |
contrast_black=round(contrast_black, 2),
|
| 531 |
wcag_aa_large_text=compliance["aa_large_text"],
|
| 532 |
wcag_aa_small_text=compliance["aa_normal_text"],
|
| 533 |
-
source=TokenSource.
|
| 534 |
confidence=Confidence.HIGH,
|
| 535 |
)
|
| 536 |
elif hex_value and hex_value in self.colors:
|
|
@@ -879,6 +1021,24 @@ class TokenExtractor:
|
|
| 879 |
except Exception as e:
|
| 880 |
self.warnings.append(f"Stylesheet color extraction failed: {str(e)}")
|
| 881 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 882 |
# =========================================================
|
| 883 |
# Log extraction results for this page
|
| 884 |
# =========================================================
|
|
@@ -896,6 +1056,8 @@ class TokenExtractor:
|
|
| 896 |
"svg_colors": svg_color_count,
|
| 897 |
"inline_colors": inline_color_count,
|
| 898 |
"stylesheet_colors": stylesheet_color_count,
|
|
|
|
|
|
|
| 899 |
"new_colors": colors_new,
|
| 900 |
"new_typography": typo_new,
|
| 901 |
"new_spacing": spacing_new,
|
|
|
|
| 443 |
|
| 444 |
This catches colors defined in CSS rules that may not be
|
| 445 |
currently applied to visible elements.
|
| 446 |
+
|
| 447 |
+
Also fetches external stylesheets that may be CORS-blocked.
|
| 448 |
"""
|
| 449 |
css_colors = await page.evaluate("""
|
| 450 |
() => {
|
|
|
|
| 495 |
|
| 496 |
return css_colors
|
| 497 |
|
| 498 |
+
async def _fetch_external_css_colors(
    self,
    page: Page,
    max_files: int = 10,
    timeout_ms: int = 5000,
) -> list[dict]:
    """
    Fetch linked CSS files directly and scan their text for color values.

    The stylesheet URLs are collected from the page's
    ``<link rel="stylesheet">`` elements, then each file is requested
    through ``page.request`` and its raw text is searched with a regex.
    The intent is to pick up colors from external stylesheets whose rules
    cannot be read in the page because of CORS restrictions.

    Args:
        page: Page object exposing ``evaluate`` and ``request.get``
            (looks like a Playwright ``Page`` - confirm against imports).
        max_files: Maximum number of distinct stylesheet URLs to fetch.
        timeout_ms: Per-request timeout passed to ``page.request.get``.

    Returns:
        One dict per regex match (duplicates are kept), each with the keys
        ``value``, ``property``, ``element`` and ``context``.

    This method is best-effort: failures are appended to ``self.warnings``
    and never raised to the caller.
    """
    colors: list[dict] = []

    # Keep the try body minimal: only the in-page URL collection can raise here.
    try:
        css_urls = await page.evaluate("""
            () => {
                const urls = [];
                const links = document.querySelectorAll('link[rel="stylesheet"]');
                links.forEach(link => {
                    if (link.href) {
                        urls.push(link.href);
                    }
                });
                return urls;
            }
        """)
    except Exception as e:
        self.warnings.append(f"External CSS fetch failed: {str(e)}")
        return colors

    # Hex, rgb(), rgba(), hsl() and hsla() literals.
    color_regex = re.compile(
        r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)',
        re.IGNORECASE,
    )

    # A page may link the same stylesheet more than once; fetching it again
    # would only waste a request slot and double-count its colors.
    unique_urls = list(dict.fromkeys(css_urls or []))

    for css_url in unique_urls[:max(max_files, 0)]:
        try:
            response = await page.request.get(css_url, timeout=timeout_ms)
            if not response.ok:
                # Non-2xx responses carry no usable CSS; skip quietly.
                continue
            css_text = await response.text()
        except Exception as e:
            # One unreachable file must not abort the remaining ones, but the
            # failure is recorded instead of being silently discarded.
            self.warnings.append(
                f"External CSS fetch failed for {css_url}: {str(e)}"
            )
            continue

        colors.extend(
            {
                "value": match,
                "property": "external-css",
                "element": "css-file",
                "context": "external-stylesheet",
            }
            for match in color_regex.findall(css_text)
        )

    return colors
|
| 548 |
+
|
| 549 |
+
async def _extract_all_page_colors(self, page: Page) -> list[dict]:
    """
    Brute-force scan of the live document for literal color values.

    The in-page script regex-matches hex, rgb(), rgba(), hsl() and hsla()
    literals in three places:

    1. the text of every <style> tag,
    2. every element's ``data-*`` and ``style`` attributes, plus its
       class string,
    3. the text of inline <script> tags (those without a ``src``).

    Returns one dict per match with the keys ``value``, ``property``,
    ``element`` and ``context``; duplicates are not removed.
    """
    # Runs inside the page; the doubled backslashes reach JavaScript as
    # single-backslash regex escapes.
    scan_script = """
        () => {
            const colors = [];
            const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi;

            // 1. Scan all <style> tags
            const styleTags = document.querySelectorAll('style');
            styleTags.forEach(style => {
                const matches = style.textContent.match(colorRegex);
                if (matches) {
                    matches.forEach(color => {
                        colors.push({
                            value: color,
                            property: 'style-tag',
                            element: 'style',
                            context: 'style-block',
                        });
                    });
                }
            });

            // 2. Scan data attributes that might contain colors
            const allElements = document.querySelectorAll('*');
            allElements.forEach(el => {
                // Check data attributes
                for (const attr of el.attributes) {
                    if (attr.name.startsWith('data-') || attr.name === 'style') {
                        const matches = attr.value.match(colorRegex);
                        if (matches) {
                            matches.forEach(color => {
                                colors.push({
                                    value: color,
                                    property: attr.name,
                                    element: el.tagName.toLowerCase(),
                                    context: 'attribute',
                                });
                            });
                        }
                    }
                }

                // Check for color in class names (some frameworks use color classes)
                const classList = el.className;
                if (typeof classList === 'string') {
                    const colorMatches = classList.match(colorRegex);
                    if (colorMatches) {
                        colorMatches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'class',
                                element: el.tagName.toLowerCase(),
                                context: 'class-name',
                            });
                        });
                    }
                }
            });

            // 3. Look for colors in script tags (config objects)
            const scriptTags = document.querySelectorAll('script');
            scriptTags.forEach(script => {
                if (script.textContent && !script.src) {
                    const matches = script.textContent.match(colorRegex);
                    if (matches) {
                        matches.forEach(color => {
                            colors.push({
                                value: color,
                                property: 'script',
                                element: 'script',
                                context: 'javascript',
                            });
                        });
                    }
                }
            });

            return colors;
        }
    """

    return await page.evaluate(scan_script)
|
| 637 |
+
|
| 638 |
def _process_css_variables(self, css_vars: dict):
|
| 639 |
"""Process CSS variables and extract color tokens from them."""
|
| 640 |
computed = css_vars.get("computed", {})
|
|
|
|
| 672 |
contrast_black=round(contrast_black, 2),
|
| 673 |
wcag_aa_large_text=compliance["aa_large_text"],
|
| 674 |
wcag_aa_small_text=compliance["aa_normal_text"],
|
| 675 |
+
source=TokenSource.DETECTED, # CSS variable is still "detected"
|
| 676 |
confidence=Confidence.HIGH,
|
| 677 |
)
|
| 678 |
elif hex_value and hex_value in self.colors:
|
|
|
|
| 1021 |
except Exception as e:
|
| 1022 |
self.warnings.append(f"Stylesheet color extraction failed: {str(e)}")
|
| 1023 |
|
| 1024 |
+
# 6. Fetch external CSS files (bypass CORS)
|
| 1025 |
+
external_css_count = 0
|
| 1026 |
+
try:
|
| 1027 |
+
external_colors = await self._fetch_external_css_colors(page)
|
| 1028 |
+
external_css_count = len(external_colors)
|
| 1029 |
+
self._aggregate_colors(external_colors)
|
| 1030 |
+
except Exception as e:
|
| 1031 |
+
self.warnings.append(f"External CSS fetch failed: {str(e)}")
|
| 1032 |
+
|
| 1033 |
+
# 7. Brute-force scan all page content for colors
|
| 1034 |
+
page_scan_count = 0
|
| 1035 |
+
try:
|
| 1036 |
+
page_colors = await self._extract_all_page_colors(page)
|
| 1037 |
+
page_scan_count = len(page_colors)
|
| 1038 |
+
self._aggregate_colors(page_colors)
|
| 1039 |
+
except Exception as e:
|
| 1040 |
+
self.warnings.append(f"Page scan failed: {str(e)}")
|
| 1041 |
+
|
| 1042 |
# =========================================================
|
| 1043 |
# Log extraction results for this page
|
| 1044 |
# =========================================================
|
|
|
|
| 1056 |
"svg_colors": svg_color_count,
|
| 1057 |
"inline_colors": inline_color_count,
|
| 1058 |
"stylesheet_colors": stylesheet_color_count,
|
| 1059 |
+
"external_css_colors": external_css_count,
|
| 1060 |
+
"page_scan_colors": page_scan_count,
|
| 1061 |
"new_colors": colors_new,
|
| 1062 |
"new_typography": typo_new,
|
| 1063 |
"new_spacing": spacing_new,
|