|
|
""" |
|
|
Agent 2: Token Normalizer & Structurer |
|
|
Design System Extractor v2 |
|
|
|
|
|
Persona: Design System Librarian |
|
|
|
|
|
Responsibilities: |
|
|
- Clean noisy extraction data |
|
|
- Deduplicate similar tokens (colors within threshold, similar spacing) |
|
|
- Infer naming patterns from class names and contexts |
|
|
- Tag tokens as: detected | inferred | low-confidence |
|
|
- Group colors by role (primary, secondary, neutral, etc.) |
|
|
""" |
|
|
|
|
|
import re |
|
|
from typing import Optional |
|
|
from collections import defaultdict |
|
|
|
|
|
from core.token_schema import ( |
|
|
ColorToken, |
|
|
TypographyToken, |
|
|
SpacingToken, |
|
|
ExtractedTokens, |
|
|
NormalizedTokens, |
|
|
Confidence, |
|
|
TokenSource, |
|
|
) |
|
|
from core.color_utils import ( |
|
|
parse_color, |
|
|
normalize_hex, |
|
|
categorize_color, |
|
|
) |
|
|
|
|
|
|
|
|
class TokenNormalizer:
    """
    Normalizes and structures extracted tokens.

    This is Agent 2's job — taking raw extraction data and
    organizing it into a clean, deduplicated structure.
    """

    def __init__(self):
        # Maximum Euclidean RGB distance below which two colors are
        # considered visually identical and merged into one token.
        self.color_similarity_threshold = 10
        # Maximum pixel difference below which two spacing values are merged.
        self.spacing_merge_threshold = 2

        # Keyword hints mapping usage contexts / element names to a semantic
        # color role. Dict order matters: earlier roles win on a tie.
        self.color_role_keywords = {
            "primary": ["primary", "brand", "main", "accent"],
            "secondary": ["secondary", "alt", "alternate"],
            "success": ["success", "green", "positive", "valid"],
            "warning": ["warning", "yellow", "caution", "alert"],
            "error": ["error", "red", "danger", "invalid", "negative"],
            "info": ["info", "blue", "notice"],
            "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
            "background": ["bg", "background", "surface"],
            "text": ["text", "foreground", "content", "body"],
            "border": ["border", "divider", "separator", "line"],
        }

    def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
        """
        Normalize extracted tokens.

        Args:
            extracted: Raw extraction results from Agent 1

        Returns:
            NormalizedTokens with cleaned, deduplicated data
        """
        colors_list = self._normalize_colors(extracted.colors)
        typography_list = self._normalize_typography(extracted.typography)
        spacing_list = self._normalize_spacing(extracted.spacing)

        # Key tokens by suggested name, falling back to the raw value.
        # _unique_key appends a numeric suffix on collision so two distinct
        # tokens that got the same name are both kept (previously the later
        # one silently overwrote the earlier one).
        colors_dict = {}
        for color in colors_list:
            key = self._unique_key(color.suggested_name or color.value, colors_dict)
            colors_dict[key] = color

        typography_dict = {}
        for typo in typography_list:
            base = typo.suggested_name or f"{typo.font_family}-{typo.font_size}"
            typography_dict[self._unique_key(base, typography_dict)] = typo

        spacing_dict = {}
        for space in spacing_list:
            key = self._unique_key(space.suggested_name or space.value, spacing_dict)
            spacing_dict[key] = space

        radius_dict = {}
        for radius in extracted.radius:
            key = self._unique_key(f"radius-{radius.value}", radius_dict)
            radius_dict[key] = radius

        # Shadow values are long CSS strings, so key them positionally.
        # The previous key used builtin hash(), which is randomized per
        # process (PYTHONHASHSEED), making keys unreproducible across runs.
        shadows_dict = {}
        for index, shadow in enumerate(extracted.shadows, start=1):
            shadows_dict[f"shadow-{index}"] = shadow

        return NormalizedTokens(
            viewport=extracted.viewport,
            source_url=extracted.source_url,
            colors=colors_dict,
            typography=typography_dict,
            spacing=spacing_dict,
            radius=radius_dict,
            shadows=shadows_dict,
            font_families=extracted.font_families,
            detected_spacing_base=extracted.spacing_base,
            detected_naming_convention=extracted.naming_convention,
        )

    @staticmethod
    def _unique_key(base: str, existing: dict) -> str:
        """Return *base*, or ``base-2``/``base-3``/... if already taken in *existing*."""
        if base not in existing:
            return base
        suffix = 2
        while f"{base}-{suffix}" in existing:
            suffix += 1
        return f"{base}-{suffix}"

    def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
        """
        Normalize color tokens:
        - Deduplicate similar colors
        - Infer color roles
        - Assign suggested names
        - Calculate confidence
        """
        if not colors:
            return []

        # First pass: exact dedup on the normalized hex value, accumulating
        # frequency and usage metadata onto the first occurrence.
        unique_colors = {}
        for color in colors:
            hex_val = normalize_hex(color.value)
            if hex_val in unique_colors:
                existing = unique_colors[hex_val]
                existing.frequency += color.frequency
                existing.contexts = list(set(existing.contexts + color.contexts))
                existing.elements = list(set(existing.elements + color.elements))
                existing.css_properties = list(set(existing.css_properties + color.css_properties))
            else:
                color.value = hex_val
                unique_colors[hex_val] = color

        # Second pass: merge colors that are visually near-identical.
        merged_colors = self._merge_similar_colors(list(unique_colors.values()))

        # Name each surviving color: semantic role if inferable from its
        # usage contexts, otherwise a name derived from the value itself.
        for color in merged_colors:
            role = self._infer_color_role(color)
            if role:
                color.suggested_name = self._generate_color_name(color, role)
            else:
                color.suggested_name = self._generate_color_name_from_value(color)

            color.confidence = self._calculate_confidence(color.frequency)

        # Most frequently used colors first.
        merged_colors.sort(key=lambda c: -c.frequency)

        return merged_colors

    def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
        """Merge colors that are visually very similar."""
        if len(colors) <= 1:
            return colors

        merged = []
        used = set()

        for i, color1 in enumerate(colors):
            if i in used:
                continue

            # Gather everything similar to color1 that hasn't been claimed.
            similar_group = [color1]
            for j, color2 in enumerate(colors[i + 1:], i + 1):
                if j in used:
                    continue
                if self._colors_are_similar(color1.value, color2.value):
                    similar_group.append(color2)
                    used.add(j)

            # The most frequent color in the group becomes the canonical one.
            similar_group.sort(key=lambda c: -c.frequency)
            primary = similar_group[0]

            # Fold the rest of the group's usage data into the canonical
            # token — including css_properties, for consistency with the
            # exact-duplicate merge above (previously dropped here).
            for other in similar_group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
                primary.elements = list(set(primary.elements + other.elements))
                primary.css_properties = list(set(primary.css_properties + other.css_properties))

            merged.append(primary)
            used.add(i)

        return merged

    def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
        """Check if two colors are visually similar (Euclidean RGB distance)."""
        try:
            parsed1 = parse_color(hex1)
            parsed2 = parse_color(hex2)
            if parsed1 is None or parsed2 is None:
                return False
            if parsed1.rgb is None or parsed2.rgb is None:
                return False

            rgb1 = parsed1.rgb
            rgb2 = parsed2.rgb

            distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
            return distance < self.color_similarity_threshold
        except Exception:
            # Best-effort: unparseable colors are simply treated as distinct.
            return False

    def _infer_color_role(self, color: ColorToken) -> Optional[str]:
        """Infer the semantic role of a color from its contexts, or None."""
        all_context = " ".join(color.contexts + color.elements).lower()

        # Keyword match against usage contexts / element names wins first.
        for role, keywords in self.color_role_keywords.items():
            for keyword in keywords:
                if keyword in all_context:
                    return role

        # Fallback: achromatic colors default to the neutral role.
        category = categorize_color(color.value)
        if category in ["gray", "white", "black"]:
            return "neutral"

        return None

    def _generate_color_name(self, color: ColorToken, role: str) -> str:
        """Generate a semantic name (``color.<role>.<shade>``) for a color."""
        # Map perceived luminance (ITU-R BT.601 weights) to a numeric shade,
        # lightest = 50, darkest = 900.
        parsed = parse_color(color.value)
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.8:
                shade = "50"
            elif luminance > 0.6:
                shade = "200"
            elif luminance > 0.4:
                shade = "500"
            elif luminance > 0.2:
                shade = "700"
            else:
                shade = "900"
        else:
            shade = "500"

        return f"color.{role}.{shade}"

    def _generate_color_name_from_value(self, color: ColorToken) -> str:
        """Generate a name (``color.<category>.<shade>``) from the value itself."""
        category = categorize_color(color.value)
        parsed = parse_color(color.value)

        # Coarser shade scale than the role-based namer: light/base/dark.
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.6:
                shade = "light"
            elif luminance > 0.3:
                shade = "base"
            else:
                shade = "dark"
        else:
            shade = "base"

        return f"color.{category}.{shade}"

    def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
        """
        Normalize typography tokens:
        - Deduplicate identical styles
        - Infer type scale categories
        - Assign suggested names
        """
        if not typography:
            return []

        # Dedup on the full style tuple (family, size, weight, line-height).
        unique_typo = {}
        for typo in typography:
            key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
            if key in unique_typo:
                existing = unique_typo[key]
                existing.frequency += typo.frequency
                existing.elements = list(set(existing.elements + typo.elements))
            else:
                unique_typo[key] = typo

        result = list(unique_typo.values())

        # Name and score each unique style.
        for typo in result:
            typo.suggested_name = self._generate_typography_name(typo)
            typo.confidence = self._calculate_confidence(typo.frequency)

        # Largest type first (display → body → captions).
        result.sort(key=lambda t: -self._parse_font_size(t.font_size))

        return result

    def _generate_typography_name(self, typo: TypographyToken) -> str:
        """Generate a semantic name (``font.<category>.<tier>``) for typography."""
        size_px = self._parse_font_size(typo.font_size)
        elements = " ".join(typo.elements).lower()

        # Category from the elements the style appears on; order matters —
        # the most specific signals (hero/display) are checked first.
        if any(h in elements for h in ["h1", "hero", "display"]):
            category = "display"
        elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
            category = "heading"
        elif any(h in elements for h in ["label", "caption", "small", "meta"]):
            category = "label"
        elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
            category = "body"
        else:
            category = "text"

        # Size tier from the resolved pixel size.
        if size_px >= 32:
            size_tier = "xl"
        elif size_px >= 24:
            size_tier = "lg"
        elif size_px >= 18:
            size_tier = "md"
        elif size_px >= 14:
            size_tier = "sm"
        else:
            size_tier = "xs"

        return f"font.{category}.{size_tier}"

    def _parse_font_size(self, size: str) -> float:
        """
        Parse a CSS font-size string to pixels.

        Supports px, rem and em (rem/em assume a 16px reference size) and
        bare numbers; anything unparsable falls back to 16.
        """
        if not size:
            return 16

        size = size.lower().strip()

        # "rem" must be checked before "em" since it ends with "em".
        # endswith/removesuffix ensure the unit is only stripped from the
        # end of the string, not from anywhere inside it.
        for unit, scale in (("px", 1.0), ("rem", 16.0), ("em", 16.0)):
            if size.endswith(unit):
                try:
                    return float(size.removesuffix(unit)) * scale
                except ValueError:
                    return 16

        # Unitless value: treat as pixels.
        try:
            return float(size)
        except ValueError:
            return 16

    def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
        """
        Normalize spacing tokens:
        - Merge similar values
        - Align to base-8 grid if close
        - Assign suggested names
        """
        if not spacing:
            return []

        # Exact dedup on the raw value.
        unique_spacing = {}
        for space in spacing:
            key = space.value
            if key in unique_spacing:
                existing = unique_spacing[key]
                existing.frequency += space.frequency
                existing.contexts = list(set(existing.contexts + space.contexts))
            else:
                unique_spacing[key] = space

        result = list(unique_spacing.values())

        # Merge values that are within the pixel threshold of each other.
        result = self._merge_similar_spacing(result)

        # Name and score the survivors.
        for space in result:
            space.suggested_name = self._generate_spacing_name(space)
            space.confidence = self._calculate_confidence(space.frequency)

        # Smallest spacing first.
        result.sort(key=lambda s: s.value_px)

        return result

    def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
        """Merge spacing values that are very close (within the merge threshold)."""
        if len(spacing) <= 1:
            return spacing

        # Work on a sorted copy so the caller's list is not mutated.
        spacing = sorted(spacing, key=lambda s: s.value_px)

        merged = []
        i = 0

        while i < len(spacing):
            current = spacing[i]
            group = [current]

            # Consume consecutive values within the threshold of the anchor.
            j = i + 1
            while j < len(spacing):
                if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
                    group.append(spacing[j])
                    j += 1
                else:
                    break

            # Prefer a value on the base-8 grid, then the most frequent.
            # NOTE(review): assumes fits_base_8 is bool/int so unary minus
            # sorts True/larger first — confirm against SpacingToken.
            group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
            primary = group[0]

            for other in group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))

            merged.append(primary)
            i = j

        return merged

    def _generate_spacing_name(self, space: SpacingToken) -> str:
        """Generate a semantic name (``space.<step>``) for spacing."""
        px = space.value_px

        # Tailwind-style 4px scale: step ≈ px / 4, with named buckets for
        # the common small values.
        if px <= 2:
            size = "px"
        elif px <= 4:
            size = "0.5"
        elif px <= 8:
            size = "1"
        elif px <= 12:
            size = "1.5"
        elif px <= 16:
            size = "2"
        elif px <= 20:
            size = "2.5"
        elif px <= 24:
            size = "3"
        elif px <= 32:
            size = "4"
        elif px <= 40:
            size = "5"
        elif px <= 48:
            size = "6"
        elif px <= 64:
            size = "8"
        elif px <= 80:
            size = "10"
        elif px <= 96:
            size = "12"
        else:
            size = str(int(px / 4))

        return f"space.{size}"

    def _calculate_confidence(self, frequency: int) -> Confidence:
        """Calculate confidence based on how often the token was observed."""
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        else:
            return Confidence.LOW
|
|
|
|
|
|
|
|
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Convenience function to normalize tokens."""
    # Thin wrapper: build a normalizer with default thresholds and run it.
    return TokenNormalizer().normalize(extracted)
|
|
|