# normalizer.py — uploaded by riazmo (commit 8a330ac, verified)
"""
Agent 2: Token Normalizer & Structurer
Design System Extractor v2
Persona: Design System Librarian
Responsibilities:
- Clean noisy extraction data
- Deduplicate similar tokens (colors within threshold, similar spacing)
- Infer naming patterns from class names and contexts
- Tag tokens as: detected | inferred | low-confidence
- Group colors by role (primary, secondary, neutral, etc.)
"""
import re
from typing import Optional
from collections import defaultdict
from core.token_schema import (
ColorToken,
TypographyToken,
SpacingToken,
ExtractedTokens,
NormalizedTokens,
Confidence,
TokenSource,
)
from core.color_utils import (
parse_color,
normalize_hex,
categorize_color,
)
class TokenNormalizer:
    """
    Normalizes and structures extracted tokens.

    This is Agent 2's job — taking raw extraction data and
    organizing it into a clean, deduplicated structure.
    """

    def __init__(self):
        # Thresholds for duplicate detection
        self.color_similarity_threshold = 10  # Euclidean distance in RGB space
        self.spacing_merge_threshold = 2      # px difference to merge

        # Keyword lists used to map class-name/context text to a semantic
        # color role. Checked in insertion order; first matching keyword wins.
        self.color_role_keywords = {
            "primary": ["primary", "brand", "main", "accent"],
            "secondary": ["secondary", "alt", "alternate"],
            "success": ["success", "green", "positive", "valid"],
            "warning": ["warning", "yellow", "caution", "alert"],
            "error": ["error", "red", "danger", "invalid", "negative"],
            "info": ["info", "blue", "notice"],
            "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
            "background": ["bg", "background", "surface"],
            "text": ["text", "foreground", "content", "body"],
            "border": ["border", "divider", "separator", "line"],
        }

    @staticmethod
    def _unique_key(base: str, existing: dict) -> str:
        """
        Return *base* if it is not already a key of *existing*, otherwise the
        first free "base-2", "base-3", ... variant.

        Prevents two distinct tokens that happen to share a suggested name
        (e.g. two colors both named "color.primary.500") from silently
        overwriting each other in the output dicts.
        """
        if base not in existing:
            return base
        n = 2
        while f"{base}-{n}" in existing:
            n += 1
        return f"{base}-{n}"

    def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
        """
        Normalize extracted tokens.

        Args:
            extracted: Raw extraction results from Agent 1

        Returns:
            NormalizedTokens with cleaned, deduplicated data
        """
        # Process each token type (returns sorted lists)
        colors_list = self._normalize_colors(extracted.colors)
        typography_list = self._normalize_typography(extracted.typography)
        spacing_list = self._normalize_spacing(extracted.spacing)

        # Convert to dicts keyed by suggested_name.  Name collisions are
        # disambiguated with a numeric suffix instead of dropping tokens.
        colors_dict = {}
        for c in colors_list:
            key = self._unique_key(c.suggested_name or c.value, colors_dict)
            colors_dict[key] = c

        typography_dict = {}
        for t in typography_list:
            base = t.suggested_name or f"{t.font_family}-{t.font_size}"
            typography_dict[self._unique_key(base, typography_dict)] = t

        spacing_dict = {}
        for s in spacing_list:
            base = s.suggested_name or s.value
            spacing_dict[self._unique_key(base, spacing_dict)] = s

        # Radius and shadow tokens pass through untouched; they only need
        # stable dict keys.  Shadows were previously keyed on
        # hash(value) % 1000, which is nondeterministic across interpreter
        # runs (string hash randomization) and collision-prone — a 1-based
        # index is deterministic and unique.
        radius_dict = {}
        for r in extracted.radius:
            key = self._unique_key(f"radius-{r.value}", radius_dict)
            radius_dict[key] = r

        shadows_dict = {}
        for idx, shadow in enumerate(extracted.shadows, start=1):
            shadows_dict[f"shadow-{idx}"] = shadow

        return NormalizedTokens(
            viewport=extracted.viewport,
            source_url=extracted.source_url,
            colors=colors_dict,
            typography=typography_dict,
            spacing=spacing_dict,
            radius=radius_dict,
            shadows=shadows_dict,
            font_families=extracted.font_families,
            detected_spacing_base=extracted.spacing_base,
            detected_naming_convention=extracted.naming_convention,
        )

    def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
        """
        Normalize color tokens:
        - Deduplicate similar colors
        - Infer color roles
        - Assign suggested names
        - Calculate confidence
        """
        if not colors:
            return []

        # Step 1: Deduplicate by exact (normalized) hex value, merging
        # frequency and context metadata into the first occurrence.
        unique_colors: dict[str, ColorToken] = {}
        for color in colors:
            hex_val = normalize_hex(color.value)
            if hex_val in unique_colors:
                existing = unique_colors[hex_val]
                existing.frequency += color.frequency
                existing.contexts = list(set(existing.contexts + color.contexts))
                existing.elements = list(set(existing.elements + color.elements))
                existing.css_properties = list(set(existing.css_properties + color.css_properties))
            else:
                color.value = hex_val
                unique_colors[hex_val] = color

        # Step 2: Merge visually similar colors
        merged_colors = self._merge_similar_colors(list(unique_colors.values()))

        # Step 3: Infer roles and names
        for color in merged_colors:
            role = self._infer_color_role(color)
            if role:
                color.suggested_name = self._generate_color_name(color, role)
            else:
                color.suggested_name = self._generate_color_name_from_value(color)
            # Update confidence based on frequency
            color.confidence = self._calculate_confidence(color.frequency)

        # Sort by frequency (most used first)
        merged_colors.sort(key=lambda c: -c.frequency)
        return merged_colors

    def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
        """
        Merge colors that are visually very similar (within the RGB-distance
        threshold of a group leader).  The most frequent member of each group
        survives and absorbs the others' frequency and metadata.
        """
        if len(colors) <= 1:
            return colors

        merged: list[ColorToken] = []
        used: set[int] = set()
        for i, color1 in enumerate(colors):
            if i in used:
                continue
            # Collect colors similar to this group leader
            similar_group = [color1]
            for j, color2 in enumerate(colors[i + 1:], i + 1):
                if j in used:
                    continue
                if self._colors_are_similar(color1.value, color2.value):
                    similar_group.append(color2)
                    used.add(j)

            # Merge the group — keep the most frequent member
            similar_group.sort(key=lambda c: -c.frequency)
            primary = similar_group[0]
            for other in similar_group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
                primary.elements = list(set(primary.elements + other.elements))
                # Keep css_properties too, consistent with the exact-duplicate
                # merge in _normalize_colors (previously dropped here).
                primary.css_properties = list(set(primary.css_properties + other.css_properties))
            merged.append(primary)
            used.add(i)
        return merged

    def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
        """
        Check if two colors are visually similar, i.e. their Euclidean
        distance in RGB space is below the configured threshold.
        Unparseable colors are never considered similar.
        """
        try:
            parsed1 = parse_color(hex1)
            parsed2 = parse_color(hex2)
            if parsed1 is None or parsed2 is None:
                return False
            if parsed1.rgb is None or parsed2.rgb is None:
                return False
            rgb1 = parsed1.rgb
            rgb2 = parsed2.rgb
            distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
            return distance < self.color_similarity_threshold
        except Exception:
            # Defensive: parse_color behavior on exotic inputs is unknown —
            # treat any failure as "not similar" rather than aborting.
            return False

    def _infer_color_role(self, color: ColorToken) -> Optional[str]:
        """
        Infer the semantic role of a color from its contexts/elements text,
        falling back to "neutral" for achromatic colors.  Returns None when
        no role can be inferred.
        """
        all_context = " ".join(color.contexts + color.elements).lower()
        for role, keywords in self.color_role_keywords.items():
            for keyword in keywords:
                if keyword in all_context:
                    return role
        # No keyword match — try the color category itself
        category = categorize_color(color.value)
        if category in ["gray", "white", "black"]:
            return "neutral"
        return None

    def _generate_color_name(self, color: ColorToken, role: str) -> str:
        """
        Generate a semantic name like "color.primary.500" for a color,
        choosing the shade step from its Rec.601 relative luminance.
        """
        parsed = parse_color(color.value)
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            # Rec.601 luma coefficients, normalized to 0..1
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.8:
                shade = "50"
            elif luminance > 0.6:
                shade = "200"
            elif luminance > 0.4:
                shade = "500"
            elif luminance > 0.2:
                shade = "700"
            else:
                shade = "900"
        else:
            shade = "500"  # mid shade when the color can't be parsed
        return f"color.{role}.{shade}"

    def _generate_color_name_from_value(self, color: ColorToken) -> str:
        """
        Generate a name like "color.blue.light" from the color value itself,
        used when no semantic role could be inferred.
        """
        category = categorize_color(color.value)
        parsed = parse_color(color.value)
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.6:
                shade = "light"
            elif luminance > 0.3:
                shade = "base"
            else:
                shade = "dark"
        else:
            shade = "base"
        return f"color.{category}.{shade}"

    def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
        """
        Normalize typography tokens:
        - Deduplicate identical styles
        - Infer type scale categories
        - Assign suggested names
        """
        if not typography:
            return []

        # Deduplicate by the unique family/size/weight/line-height combination
        unique_typo: dict[str, TypographyToken] = {}
        for typo in typography:
            key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
            if key in unique_typo:
                existing = unique_typo[key]
                existing.frequency += typo.frequency
                existing.elements = list(set(existing.elements + typo.elements))
            else:
                unique_typo[key] = typo

        result = list(unique_typo.values())
        for typo in result:
            typo.suggested_name = self._generate_typography_name(typo)
            typo.confidence = self._calculate_confidence(typo.frequency)

        # Sort by font size (largest first)
        result.sort(key=lambda t: -self._parse_font_size(t.font_size))
        return result

    def _generate_typography_name(self, typo: TypographyToken) -> str:
        """
        Generate a semantic name like "font.heading.lg" from the elements
        the style appears on (category) and its pixel size (tier).
        """
        size_px = self._parse_font_size(typo.font_size)
        elements = " ".join(typo.elements).lower()

        # Category from element hints; checked from most to least specific
        if any(h in elements for h in ["h1", "hero", "display"]):
            category = "display"
        elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
            category = "heading"
        elif any(h in elements for h in ["label", "caption", "small", "meta"]):
            category = "label"
        elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
            category = "body"
        else:
            category = "text"

        # Size tier thresholds in px
        if size_px >= 32:
            size_tier = "xl"
        elif size_px >= 24:
            size_tier = "lg"
        elif size_px >= 18:
            size_tier = "md"
        elif size_px >= 14:
            size_tier = "sm"
        else:
            size_tier = "xs"
        return f"font.{category}.{size_tier}"

    def _parse_font_size(self, size: str) -> float:
        """
        Parse a CSS font-size string to pixels.

        Supports "px", "rem"/"em" (assuming the conventional 16px root),
        and bare numbers; anything unparseable falls back to 16.
        Note: "rem" must be checked before "em" since "rem" contains "em".
        """
        if not size:
            return 16
        size = size.lower().strip()
        if "px" in size:
            try:
                return float(size.replace("px", ""))
            except ValueError:
                return 16
        if "rem" in size:
            try:
                return float(size.replace("rem", "")) * 16
            except ValueError:
                return 16
        if "em" in size:
            try:
                return float(size.replace("em", "")) * 16
            except ValueError:
                return 16
        try:
            return float(size)
        except ValueError:
            return 16

    def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
        """
        Normalize spacing tokens:
        - Merge similar values
        - Align to base-8 grid if close
        - Assign suggested names
        """
        if not spacing:
            return []

        # Deduplicate by exact value string
        unique_spacing: dict = {}
        for space in spacing:
            key = space.value
            if key in unique_spacing:
                existing = unique_spacing[key]
                existing.frequency += space.frequency
                existing.contexts = list(set(existing.contexts + space.contexts))
            else:
                unique_spacing[key] = space

        result = list(unique_spacing.values())
        # Merge very similar pixel values into one token each
        result = self._merge_similar_spacing(result)

        for space in result:
            space.suggested_name = self._generate_spacing_name(space)
            space.confidence = self._calculate_confidence(space.frequency)

        # Sort by pixel value, smallest first
        result.sort(key=lambda s: s.value_px)
        return result

    def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
        """
        Merge spacing values that are within the merge threshold of each
        other.  Within a group, the base-8-aligned (then most frequent)
        member survives and absorbs the rest.  Sorts the input in place.
        """
        if len(spacing) <= 1:
            return spacing

        spacing.sort(key=lambda s: s.value_px)
        merged: list[SpacingToken] = []
        i = 0
        while i < len(spacing):
            current = spacing[i]
            group = [current]
            # Gather consecutive values close to the group anchor
            j = i + 1
            while j < len(spacing):
                if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
                    group.append(spacing[j])
                    j += 1
                else:
                    break
            # Prefer a base-8-aligned value, then the most frequent one
            group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
            primary = group[0]
            for other in group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
            merged.append(primary)
            i = j
        return merged

    def _generate_spacing_name(self, space: SpacingToken) -> str:
        """
        Generate a name like "space.4" on a Tailwind-style scale, where the
        step roughly equals px / 4; values above 96px fall through to the
        computed int(px / 4) step.
        """
        px = space.value_px
        if px <= 2:
            size = "px"
        elif px <= 4:
            size = "0.5"
        elif px <= 8:
            size = "1"
        elif px <= 12:
            size = "1.5"
        elif px <= 16:
            size = "2"
        elif px <= 20:
            size = "2.5"
        elif px <= 24:
            size = "3"
        elif px <= 32:
            size = "4"
        elif px <= 40:
            size = "5"
        elif px <= 48:
            size = "6"
        elif px <= 64:
            size = "8"
        elif px <= 80:
            size = "10"
        elif px <= 96:
            size = "12"
        else:
            size = str(int(px / 4))
        return f"space.{size}"

    def _calculate_confidence(self, frequency: int) -> Confidence:
        """
        Map usage frequency to a confidence level:
        >= 10 occurrences → HIGH, >= 3 → MEDIUM, otherwise LOW.
        """
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        else:
            return Confidence.LOW
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Convenience wrapper: run *extracted* through a fresh TokenNormalizer."""
    return TokenNormalizer().normalize(extracted)