riazmo commited on
Commit
f1c7a18
·
verified ·
1 Parent(s): 7e4e20b

Upload normalizer.py

Browse files
Files changed (1) hide show
  1. agents/normalizer.py +462 -0
agents/normalizer.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 2: Token Normalizer & Structurer
3
+ Design System Extractor v2
4
+
5
+ Persona: Design System Librarian
6
+
7
+ Responsibilities:
8
+ - Clean noisy extraction data
9
+ - Deduplicate similar tokens (colors within threshold, similar spacing)
10
+ - Infer naming patterns from class names and contexts
11
+ - Tag tokens as: detected | inferred | low-confidence
12
+ - Group colors by role (primary, secondary, neutral, etc.)
13
+ """
14
+
15
+ import re
16
+ from typing import Optional
17
+ from collections import defaultdict
18
+
19
+ from core.token_schema import (
20
+ ColorToken,
21
+ TypographyToken,
22
+ SpacingToken,
23
+ ExtractedTokens,
24
+ NormalizedTokens,
25
+ Confidence,
26
+ TokenSource,
27
+ )
28
+ from core.color_utils import (
29
+ parse_color,
30
+ normalize_hex,
31
+ categorize_color,
32
+ )
33
+
34
+
35
+ class TokenNormalizer:
36
+ """
37
+ Normalizes and structures extracted tokens.
38
+
39
+ This is Agent 2's job — taking raw extraction data and
40
+ organizing it into a clean, deduplicated structure.
41
+ """
42
+
43
+ def __init__(self):
44
+ # Thresholds for duplicate detection
45
+ self.color_similarity_threshold = 10 # Delta in RGB space
46
+ self.spacing_merge_threshold = 2 # px difference to merge
47
+
48
+ # Naming patterns
49
+ self.color_role_keywords = {
50
+ "primary": ["primary", "brand", "main", "accent"],
51
+ "secondary": ["secondary", "alt", "alternate"],
52
+ "success": ["success", "green", "positive", "valid"],
53
+ "warning": ["warning", "yellow", "caution", "alert"],
54
+ "error": ["error", "red", "danger", "invalid", "negative"],
55
+ "info": ["info", "blue", "notice"],
56
+ "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
57
+ "background": ["bg", "background", "surface"],
58
+ "text": ["text", "foreground", "content", "body"],
59
+ "border": ["border", "divider", "separator", "line"],
60
+ }
61
+
62
+ def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
63
+ """
64
+ Normalize extracted tokens.
65
+
66
+ Args:
67
+ extracted: Raw extraction results from Agent 1
68
+
69
+ Returns:
70
+ NormalizedTokens with cleaned, deduplicated data
71
+ """
72
+ # Process each token type
73
+ colors = self._normalize_colors(extracted.colors)
74
+ typography = self._normalize_typography(extracted.typography)
75
+ spacing = self._normalize_spacing(extracted.spacing)
76
+
77
+ # Create normalized result
78
+ normalized = NormalizedTokens(
79
+ viewport=extracted.viewport,
80
+ colors=colors,
81
+ typography=typography,
82
+ spacing=spacing,
83
+ radius=extracted.radius, # Pass through for now
84
+ shadows=extracted.shadows, # Pass through for now
85
+ font_families=extracted.font_families,
86
+ pages_crawled=extracted.pages_crawled,
87
+ total_elements=extracted.total_elements,
88
+ )
89
+
90
+ return normalized
91
+
92
+ def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
93
+ """
94
+ Normalize color tokens:
95
+ - Deduplicate similar colors
96
+ - Infer color roles
97
+ - Assign suggested names
98
+ - Calculate confidence
99
+ """
100
+ if not colors:
101
+ return []
102
+
103
+ # Step 1: Deduplicate by exact hex value
104
+ unique_colors = {}
105
+ for color in colors:
106
+ hex_val = normalize_hex(color.value)
107
+ if hex_val in unique_colors:
108
+ # Merge frequency and contexts
109
+ existing = unique_colors[hex_val]
110
+ existing.frequency += color.frequency
111
+ existing.contexts = list(set(existing.contexts + color.contexts))
112
+ existing.elements = list(set(existing.elements + color.elements))
113
+ existing.css_properties = list(set(existing.css_properties + color.css_properties))
114
+ else:
115
+ color.value = hex_val
116
+ unique_colors[hex_val] = color
117
+
118
+ # Step 2: Merge visually similar colors
119
+ merged_colors = self._merge_similar_colors(list(unique_colors.values()))
120
+
121
+ # Step 3: Infer roles and names
122
+ for color in merged_colors:
123
+ role = self._infer_color_role(color)
124
+ if role:
125
+ color.suggested_name = self._generate_color_name(color, role)
126
+ else:
127
+ color.suggested_name = self._generate_color_name_from_value(color)
128
+
129
+ # Update confidence based on frequency
130
+ color.confidence = self._calculate_confidence(color.frequency)
131
+
132
+ # Sort by frequency (most used first)
133
+ merged_colors.sort(key=lambda c: -c.frequency)
134
+
135
+ return merged_colors
136
+
137
+ def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
138
+ """Merge colors that are visually very similar."""
139
+ if len(colors) <= 1:
140
+ return colors
141
+
142
+ merged = []
143
+ used = set()
144
+
145
+ for i, color1 in enumerate(colors):
146
+ if i in used:
147
+ continue
148
+
149
+ # Find similar colors
150
+ similar_group = [color1]
151
+ for j, color2 in enumerate(colors[i+1:], i+1):
152
+ if j in used:
153
+ continue
154
+ if self._colors_are_similar(color1.value, color2.value):
155
+ similar_group.append(color2)
156
+ used.add(j)
157
+
158
+ # Merge the group - keep the most frequent
159
+ similar_group.sort(key=lambda c: -c.frequency)
160
+ primary = similar_group[0]
161
+
162
+ # Aggregate data from similar colors
163
+ for other in similar_group[1:]:
164
+ primary.frequency += other.frequency
165
+ primary.contexts = list(set(primary.contexts + other.contexts))
166
+ primary.elements = list(set(primary.elements + other.elements))
167
+
168
+ merged.append(primary)
169
+ used.add(i)
170
+
171
+ return merged
172
+
173
+ def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
174
+ """Check if two colors are visually similar."""
175
+ try:
176
+ rgb1 = parse_color(hex1)
177
+ rgb2 = parse_color(hex2)
178
+ if rgb1 is None or rgb2 is None:
179
+ return False
180
+
181
+ # Calculate Euclidean distance in RGB space
182
+ distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
183
+ return distance < self.color_similarity_threshold
184
+ except Exception:
185
+ return False
186
+
187
+ def _infer_color_role(self, color: ColorToken) -> Optional[str]:
188
+ """Infer the semantic role of a color from its contexts."""
189
+ all_context = " ".join(color.contexts + color.elements).lower()
190
+
191
+ for role, keywords in self.color_role_keywords.items():
192
+ for keyword in keywords:
193
+ if keyword in all_context:
194
+ return role
195
+
196
+ # Try to infer from color category
197
+ category = categorize_color(color.value)
198
+ if category in ["gray", "white", "black"]:
199
+ return "neutral"
200
+
201
+ return None
202
+
203
+ def _generate_color_name(self, color: ColorToken, role: str) -> str:
204
+ """Generate a semantic name for a color."""
205
+ # Determine shade level based on luminance
206
+ rgb = parse_color(color.value)
207
+ if rgb:
208
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
209
+ if luminance > 0.8:
210
+ shade = "50"
211
+ elif luminance > 0.6:
212
+ shade = "200"
213
+ elif luminance > 0.4:
214
+ shade = "500"
215
+ elif luminance > 0.2:
216
+ shade = "700"
217
+ else:
218
+ shade = "900"
219
+ else:
220
+ shade = "500"
221
+
222
+ return f"color.{role}.{shade}"
223
+
224
+ def _generate_color_name_from_value(self, color: ColorToken) -> str:
225
+ """Generate a name based on the color value itself."""
226
+ category = categorize_color(color.value)
227
+ rgb = parse_color(color.value)
228
+
229
+ if rgb:
230
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
231
+ if luminance > 0.6:
232
+ shade = "light"
233
+ elif luminance > 0.3:
234
+ shade = "base"
235
+ else:
236
+ shade = "dark"
237
+ else:
238
+ shade = "base"
239
+
240
+ return f"color.{category}.{shade}"
241
+
242
+ def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
243
+ """
244
+ Normalize typography tokens:
245
+ - Deduplicate identical styles
246
+ - Infer type scale categories
247
+ - Assign suggested names
248
+ """
249
+ if not typography:
250
+ return []
251
+
252
+ # Deduplicate by unique style combination
253
+ unique_typo = {}
254
+ for typo in typography:
255
+ key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
256
+ if key in unique_typo:
257
+ existing = unique_typo[key]
258
+ existing.frequency += typo.frequency
259
+ existing.elements = list(set(existing.elements + typo.elements))
260
+ else:
261
+ unique_typo[key] = typo
262
+
263
+ result = list(unique_typo.values())
264
+
265
+ # Infer names based on size and elements
266
+ for typo in result:
267
+ typo.suggested_name = self._generate_typography_name(typo)
268
+ typo.confidence = self._calculate_confidence(typo.frequency)
269
+
270
+ # Sort by font size (largest first)
271
+ result.sort(key=lambda t: -self._parse_font_size(t.font_size))
272
+
273
+ return result
274
+
275
+ def _generate_typography_name(self, typo: TypographyToken) -> str:
276
+ """Generate a semantic name for typography."""
277
+ size_px = self._parse_font_size(typo.font_size)
278
+ elements = " ".join(typo.elements).lower()
279
+
280
+ # Determine category from elements
281
+ if any(h in elements for h in ["h1", "hero", "display"]):
282
+ category = "display"
283
+ elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
284
+ category = "heading"
285
+ elif any(h in elements for h in ["label", "caption", "small", "meta"]):
286
+ category = "label"
287
+ elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
288
+ category = "body"
289
+ else:
290
+ category = "text"
291
+
292
+ # Determine size tier
293
+ if size_px >= 32:
294
+ size_tier = "xl"
295
+ elif size_px >= 24:
296
+ size_tier = "lg"
297
+ elif size_px >= 18:
298
+ size_tier = "md"
299
+ elif size_px >= 14:
300
+ size_tier = "sm"
301
+ else:
302
+ size_tier = "xs"
303
+
304
+ return f"font.{category}.{size_tier}"
305
+
306
+ def _parse_font_size(self, size: str) -> float:
307
+ """Parse font size string to pixels."""
308
+ if not size:
309
+ return 16
310
+
311
+ size = size.lower().strip()
312
+
313
+ # Handle px
314
+ if "px" in size:
315
+ try:
316
+ return float(size.replace("px", ""))
317
+ except ValueError:
318
+ return 16
319
+
320
+ # Handle rem (assume 16px base)
321
+ if "rem" in size:
322
+ try:
323
+ return float(size.replace("rem", "")) * 16
324
+ except ValueError:
325
+ return 16
326
+
327
+ # Handle em (assume 16px base)
328
+ if "em" in size:
329
+ try:
330
+ return float(size.replace("em", "")) * 16
331
+ except ValueError:
332
+ return 16
333
+
334
+ # Try plain number
335
+ try:
336
+ return float(size)
337
+ except ValueError:
338
+ return 16
339
+
340
+ def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
341
+ """
342
+ Normalize spacing tokens:
343
+ - Merge similar values
344
+ - Align to base-8 grid if close
345
+ - Assign suggested names
346
+ """
347
+ if not spacing:
348
+ return []
349
+
350
+ # Deduplicate by value
351
+ unique_spacing = {}
352
+ for space in spacing:
353
+ key = space.value
354
+ if key in unique_spacing:
355
+ existing = unique_spacing[key]
356
+ existing.frequency += space.frequency
357
+ existing.contexts = list(set(existing.contexts + space.contexts))
358
+ else:
359
+ unique_spacing[key] = space
360
+
361
+ result = list(unique_spacing.values())
362
+
363
+ # Merge very similar values
364
+ result = self._merge_similar_spacing(result)
365
+
366
+ # Assign names
367
+ for space in result:
368
+ space.suggested_name = self._generate_spacing_name(space)
369
+ space.confidence = self._calculate_confidence(space.frequency)
370
+
371
+ # Sort by value
372
+ result.sort(key=lambda s: s.value_px)
373
+
374
+ return result
375
+
376
+ def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
377
+ """Merge spacing values that are very close."""
378
+ if len(spacing) <= 1:
379
+ return spacing
380
+
381
+ # Sort by pixel value
382
+ spacing.sort(key=lambda s: s.value_px)
383
+
384
+ merged = []
385
+ i = 0
386
+
387
+ while i < len(spacing):
388
+ current = spacing[i]
389
+ group = [current]
390
+
391
+ # Find adjacent similar values
392
+ j = i + 1
393
+ while j < len(spacing):
394
+ if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
395
+ group.append(spacing[j])
396
+ j += 1
397
+ else:
398
+ break
399
+
400
+ # Merge group - prefer base-8 aligned value or most frequent
401
+ group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
402
+ primary = group[0]
403
+
404
+ for other in group[1:]:
405
+ primary.frequency += other.frequency
406
+ primary.contexts = list(set(primary.contexts + other.contexts))
407
+
408
+ merged.append(primary)
409
+ i = j
410
+
411
+ return merged
412
+
413
+ def _generate_spacing_name(self, space: SpacingToken) -> str:
414
+ """Generate a semantic name for spacing."""
415
+ px = space.value_px
416
+
417
+ # Map to t-shirt sizes based on value
418
+ if px <= 2:
419
+ size = "px"
420
+ elif px <= 4:
421
+ size = "0.5"
422
+ elif px <= 8:
423
+ size = "1"
424
+ elif px <= 12:
425
+ size = "1.5"
426
+ elif px <= 16:
427
+ size = "2"
428
+ elif px <= 20:
429
+ size = "2.5"
430
+ elif px <= 24:
431
+ size = "3"
432
+ elif px <= 32:
433
+ size = "4"
434
+ elif px <= 40:
435
+ size = "5"
436
+ elif px <= 48:
437
+ size = "6"
438
+ elif px <= 64:
439
+ size = "8"
440
+ elif px <= 80:
441
+ size = "10"
442
+ elif px <= 96:
443
+ size = "12"
444
+ else:
445
+ size = str(int(px / 4))
446
+
447
+ return f"space.{size}"
448
+
449
+ def _calculate_confidence(self, frequency: int) -> Confidence:
450
+ """Calculate confidence based on frequency."""
451
+ if frequency >= 10:
452
+ return Confidence.HIGH
453
+ elif frequency >= 3:
454
+ return Confidence.MEDIUM
455
+ else:
456
+ return Confidence.LOW
457
+
458
+
459
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Normalize *extracted* tokens via a throwaway TokenNormalizer."""
    return TokenNormalizer().normalize(extracted)