File size: 16,805 Bytes
8a330ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
"""
Agent 2: Token Normalizer & Structurer
Design System Extractor v2

Persona: Design System Librarian

Responsibilities:
- Clean noisy extraction data
- Deduplicate similar tokens (colors within threshold, similar spacing)
- Infer naming patterns from class names and contexts
- Tag tokens as: detected | inferred | low-confidence
- Group colors by role (primary, secondary, neutral, etc.)
"""

import hashlib
import re
from collections import defaultdict
from typing import Optional

from core.token_schema import (
    ColorToken,
    TypographyToken,
    SpacingToken,
    ExtractedTokens,
    NormalizedTokens,
    Confidence,
    TokenSource,
)
from core.color_utils import (
    parse_color,
    normalize_hex,
    categorize_color,
)


class TokenNormalizer:
    """
    Normalizes and structures extracted tokens.

    This is Agent 2's job — taking raw extraction data and
    organizing it into a clean, deduplicated structure:

    - Deduplicate exact and visually-similar colors.
    - Collapse identical typography styles and near-identical spacing values.
    - Infer semantic roles and suggested names from class-name/element contexts.
    - Tag every token with a frequency-based confidence level.
    """
    
    def __init__(self):
        # Thresholds for duplicate detection.
        self.color_similarity_threshold = 10  # Euclidean delta in RGB space
        self.spacing_merge_threshold = 2  # px difference to merge
        
        # Keywords that map a token's context/element strings to a semantic
        # color role; the first matching role wins (dict order matters).
        self.color_role_keywords = {
            "primary": ["primary", "brand", "main", "accent"],
            "secondary": ["secondary", "alt", "alternate"],
            "success": ["success", "green", "positive", "valid"],
            "warning": ["warning", "yellow", "caution", "alert"],
            "error": ["error", "red", "danger", "invalid", "negative"],
            "info": ["info", "blue", "notice"],
            "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
            "background": ["bg", "background", "surface"],
            "text": ["text", "foreground", "content", "body"],
            "border": ["border", "divider", "separator", "line"],
        }
    
    def normalize(self, extracted: "ExtractedTokens") -> "NormalizedTokens":
        """
        Normalize extracted tokens.
        
        Args:
            extracted: Raw extraction results from Agent 1
            
        Returns:
            NormalizedTokens with cleaned, deduplicated data, keyed by
            suggested token names.
        """
        # Process each token type (returns lists).
        colors_list = self._normalize_colors(extracted.colors)
        typography_list = self._normalize_typography(extracted.typography)
        spacing_list = self._normalize_spacing(extracted.spacing)
        
        # Key tokens by suggested name.  Generated names are not guaranteed
        # unique (e.g. 5px and 8px spacing can both map to "space.1"), so
        # disambiguate collisions instead of silently dropping tokens.
        colors_dict = {}
        for c in colors_list:
            self._insert_unique(colors_dict, c.suggested_name or c.value, c)
        
        typography_dict = {}
        for t in typography_list:
            key = t.suggested_name or f"{t.font_family}-{t.font_size}"
            self._insert_unique(typography_dict, key, t)
        
        spacing_dict = {}
        for s in spacing_list:
            self._insert_unique(spacing_dict, s.suggested_name or s.value, s)
        
        # Radius keys embed the value itself, so a collision means an exact
        # duplicate and overwriting is the intended dedup.
        radius_dict = {}
        for r in extracted.radius:
            radius_dict[f"radius-{r.value}"] = r
        
        # Shadow names use a stable digest of the value.  Built-in hash() is
        # randomized per process for strings (PYTHONHASHSEED), which made
        # shadow names non-reproducible across runs.
        shadows_dict = {}
        for s in extracted.shadows:
            digest = hashlib.md5(s.value.encode("utf-8")).hexdigest()[:8]
            shadows_dict[f"shadow-{digest}"] = s
        
        # Create normalized result.
        normalized = NormalizedTokens(
            viewport=extracted.viewport,
            source_url=extracted.source_url,
            colors=colors_dict,
            typography=typography_dict,
            spacing=spacing_dict,
            radius=radius_dict,
            shadows=shadows_dict,
            font_families=extracted.font_families,
            detected_spacing_base=extracted.spacing_base,
            detected_naming_convention=extracted.naming_convention,
        )
        
        return normalized
    
    @staticmethod
    def _insert_unique(bucket: dict, base_key: str, token) -> None:
        """Insert token under base_key, suffixing "-2", "-3", ... on collision."""
        key = base_key
        n = 2
        while key in bucket:
            key = f"{base_key}-{n}"
            n += 1
        bucket[key] = token
    
    def _normalize_colors(self, colors: "list[ColorToken]") -> "list[ColorToken]":
        """
        Normalize color tokens:
        - Deduplicate similar colors
        - Infer color roles
        - Assign suggested names
        - Calculate confidence
        """
        if not colors:
            return []
        
        # Step 1: Deduplicate by exact (normalized) hex value.
        unique_colors = {}
        for color in colors:
            hex_val = normalize_hex(color.value)
            if hex_val in unique_colors:
                # Merge frequency and usage metadata into the first-seen token.
                existing = unique_colors[hex_val]
                existing.frequency += color.frequency
                existing.contexts = list(set(existing.contexts + color.contexts))
                existing.elements = list(set(existing.elements + color.elements))
                existing.css_properties = list(set(existing.css_properties + color.css_properties))
            else:
                color.value = hex_val
                unique_colors[hex_val] = color
        
        # Step 2: Merge visually similar colors.
        merged_colors = self._merge_similar_colors(list(unique_colors.values()))
        
        # Step 3: Infer roles and names.
        for color in merged_colors:
            role = self._infer_color_role(color)
            if role:
                color.suggested_name = self._generate_color_name(color, role)
            else:
                color.suggested_name = self._generate_color_name_from_value(color)
            
            # Update confidence based on frequency.
            color.confidence = self._calculate_confidence(color.frequency)
        
        # Sort by frequency (most used first).
        merged_colors.sort(key=lambda c: -c.frequency)
        
        return merged_colors
    
    def _merge_similar_colors(self, colors: "list[ColorToken]") -> "list[ColorToken]":
        """Merge colors that are visually very similar (single-link to each group seed)."""
        if len(colors) <= 1:
            return colors
        
        merged = []
        used = set()
        
        for i, color1 in enumerate(colors):
            if i in used:
                continue
            
            # Collect not-yet-consumed colors similar to this seed.
            similar_group = [color1]
            for j, color2 in enumerate(colors[i+1:], i+1):
                if j in used:
                    continue
                if self._colors_are_similar(color1.value, color2.value):
                    similar_group.append(color2)
                    used.add(j)
            
            # Merge the group - keep the most frequent as the canonical token.
            similar_group.sort(key=lambda c: -c.frequency)
            primary = similar_group[0]
            
            # Aggregate usage data from the rest of the group into the
            # canonical token (css_properties included, matching the
            # exact-value dedup pass in _normalize_colors).
            for other in similar_group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
                primary.elements = list(set(primary.elements + other.elements))
                primary.css_properties = list(set(primary.css_properties + other.css_properties))
            
            merged.append(primary)
            used.add(i)
        
        return merged
    
    def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
        """Check if two colors are visually similar (RGB distance under threshold)."""
        try:
            parsed1 = parse_color(hex1)
            parsed2 = parse_color(hex2)
            if parsed1 is None or parsed2 is None:
                return False
            if parsed1.rgb is None or parsed2.rgb is None:
                return False
            
            rgb1 = parsed1.rgb
            rgb2 = parsed2.rgb
            
            # Calculate Euclidean distance in RGB space.
            distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
            return distance < self.color_similarity_threshold
        except Exception:
            # Defensive: an unparseable value simply never merges.
            return False
    
    def _infer_color_role(self, color: "ColorToken") -> Optional[str]:
        """Infer the semantic role of a color from its contexts, or None."""
        all_context = " ".join(color.contexts + color.elements).lower()
        
        for role, keywords in self.color_role_keywords.items():
            for keyword in keywords:
                if keyword in all_context:
                    return role
        
        # No keyword hit — try to infer from the color category itself.
        category = categorize_color(color.value)
        if category in ["gray", "white", "black"]:
            return "neutral"
        
        return None
    
    def _generate_color_name(self, color: "ColorToken", role: str) -> str:
        """Generate a semantic name like "color.primary.500" for a color."""
        # Determine shade level from Rec. 601 luma of the parsed RGB.
        parsed = parse_color(color.value)
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.8:
                shade = "50"
            elif luminance > 0.6:
                shade = "200"
            elif luminance > 0.4:
                shade = "500"
            elif luminance > 0.2:
                shade = "700"
            else:
                shade = "900"
        else:
            shade = "500"
        
        return f"color.{role}.{shade}"
    
    def _generate_color_name_from_value(self, color: "ColorToken") -> str:
        """Generate a name based on the color value itself (no role found)."""
        category = categorize_color(color.value)
        parsed = parse_color(color.value)
        
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.6:
                shade = "light"
            elif luminance > 0.3:
                shade = "base"
            else:
                shade = "dark"
        else:
            shade = "base"
        
        return f"color.{category}.{shade}"
    
    def _normalize_typography(self, typography: "list[TypographyToken]") -> "list[TypographyToken]":
        """
        Normalize typography tokens:
        - Deduplicate identical styles
        - Infer type scale categories
        - Assign suggested names
        """
        if not typography:
            return []
        
        # Deduplicate by the unique style combination.
        unique_typo = {}
        for typo in typography:
            key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
            if key in unique_typo:
                existing = unique_typo[key]
                existing.frequency += typo.frequency
                existing.elements = list(set(existing.elements + typo.elements))
            else:
                unique_typo[key] = typo
        
        result = list(unique_typo.values())
        
        # Infer names based on size and elements.
        for typo in result:
            typo.suggested_name = self._generate_typography_name(typo)
            typo.confidence = self._calculate_confidence(typo.frequency)
        
        # Sort by font size (largest first).
        result.sort(key=lambda t: -self._parse_font_size(t.font_size))
        
        return result
    
    def _generate_typography_name(self, typo: "TypographyToken") -> str:
        """Generate a semantic name like "font.heading.lg" for typography."""
        size_px = self._parse_font_size(typo.font_size)
        elements = " ".join(typo.elements).lower()
        
        # Determine category from the elements the style was seen on.
        if any(h in elements for h in ["h1", "hero", "display"]):
            category = "display"
        elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
            category = "heading"
        elif any(h in elements for h in ["label", "caption", "small", "meta"]):
            category = "label"
        elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
            category = "body"
        else:
            category = "text"
        
        # Determine size tier from the pixel size.
        if size_px >= 32:
            size_tier = "xl"
        elif size_px >= 24:
            size_tier = "lg"
        elif size_px >= 18:
            size_tier = "md"
        elif size_px >= 14:
            size_tier = "sm"
        else:
            size_tier = "xs"
        
        return f"font.{category}.{size_tier}"
    
    def _parse_font_size(self, size: str) -> float:
        """Parse a CSS font-size string to pixels; falls back to 16 on failure."""
        if not size:
            return 16
        
        size = size.lower().strip()
        
        # Handle px.
        if "px" in size:
            try:
                return float(size.replace("px", ""))
            except ValueError:
                return 16
        
        # Handle rem (assume 16px base).  Checked before "em", which is a
        # substring of "rem".
        if "rem" in size:
            try:
                return float(size.replace("rem", "")) * 16
            except ValueError:
                return 16
        
        # Handle em (assume 16px base).
        if "em" in size:
            try:
                return float(size.replace("em", "")) * 16
            except ValueError:
                return 16
        
        # Try plain number.
        try:
            return float(size)
        except ValueError:
            return 16
    
    def _normalize_spacing(self, spacing: "list[SpacingToken]") -> "list[SpacingToken]":
        """
        Normalize spacing tokens:
        - Merge similar values
        - Align to base-8 grid if close
        - Assign suggested names
        """
        if not spacing:
            return []
        
        # Deduplicate by exact value string.
        unique_spacing = {}
        for space in spacing:
            key = space.value
            if key in unique_spacing:
                existing = unique_spacing[key]
                existing.frequency += space.frequency
                existing.contexts = list(set(existing.contexts + space.contexts))
            else:
                unique_spacing[key] = space
        
        result = list(unique_spacing.values())
        
        # Merge very similar pixel values.
        result = self._merge_similar_spacing(result)
        
        # Assign names and confidence.
        for space in result:
            space.suggested_name = self._generate_spacing_name(space)
            space.confidence = self._calculate_confidence(space.frequency)
        
        # Sort by pixel value, ascending.
        result.sort(key=lambda s: s.value_px)
        
        return result
    
    def _merge_similar_spacing(self, spacing: "list[SpacingToken]") -> "list[SpacingToken]":
        """Merge spacing values whose pixel sizes are within the merge threshold."""
        if len(spacing) <= 1:
            return spacing
        
        # Sort by pixel value so similar values are adjacent.
        spacing.sort(key=lambda s: s.value_px)
        
        merged = []
        i = 0
        
        while i < len(spacing):
            current = spacing[i]
            group = [current]
            
            # Extend the group with adjacent values close to the group seed.
            j = i + 1
            while j < len(spacing):
                if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
                    group.append(spacing[j])
                    j += 1
                else:
                    break
            
            # Merge group - prefer a base-8 aligned value, then most frequent.
            group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
            primary = group[0]
            
            for other in group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
            
            merged.append(primary)
            i = j
        
        return merged
    
    def _generate_spacing_name(self, space: "SpacingToken") -> str:
        """Generate a Tailwind-style scale name like "space.2" for a spacing value."""
        px = space.value_px
        
        # Map pixel buckets onto the 4px-based scale.
        if px <= 2:
            size = "px"
        elif px <= 4:
            size = "0.5"
        elif px <= 8:
            size = "1"
        elif px <= 12:
            size = "1.5"
        elif px <= 16:
            size = "2"
        elif px <= 20:
            size = "2.5"
        elif px <= 24:
            size = "3"
        elif px <= 32:
            size = "4"
        elif px <= 40:
            size = "5"
        elif px <= 48:
            size = "6"
        elif px <= 64:
            size = "8"
        elif px <= 80:
            size = "10"
        elif px <= 96:
            size = "12"
        else:
            # Beyond the named buckets, fall back to value / 4.
            size = str(int(px / 4))
        
        return f"space.{size}"
    
    def _calculate_confidence(self, frequency: int) -> "Confidence":
        """Map an occurrence count to a confidence level (10+ high, 3+ medium)."""
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        else:
            return Confidence.LOW


def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Convenience wrapper: build a TokenNormalizer and run it on *extracted*."""
    return TokenNormalizer().normalize(extracted)