File size: 16,805 Bytes
8a330ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
"""
Agent 2: Token Normalizer & Structurer
Design System Extractor v2

Persona: Design System Librarian

Responsibilities:
- Clean noisy extraction data
- Deduplicate similar tokens (colors within threshold, similar spacing)
- Infer naming patterns from class names and contexts
- Tag tokens as: detected | inferred | low-confidence
- Group colors by role (primary, secondary, neutral, etc.)
"""

import hashlib
import re
from collections import defaultdict
from typing import Optional

from core.token_schema import (
    ColorToken,
    TypographyToken,
    SpacingToken,
    ExtractedTokens,
    NormalizedTokens,
    Confidence,
    TokenSource,
)
from core.color_utils import (
    parse_color,
    normalize_hex,
    categorize_color,
)


class TokenNormalizer:
    """
    Normalizes and structures extracted tokens.

    This is Agent 2's job — taking raw extraction data and
    organizing it into a clean, deduplicated structure:

    - Deduplicate exact and visually-similar colors.
    - Collapse identical typography styles and near-identical spacing values.
    - Infer semantic roles and suggested names from class-name/element contexts.
    - Tag every token with a frequency-based confidence level.
    """
    
    def __init__(self):
        # Thresholds for duplicate detection.
        self.color_similarity_threshold = 10  # Euclidean delta in RGB space
        self.spacing_merge_threshold = 2  # px difference to merge
        
        # Keywords that map a token's context/element strings to a semantic
        # color role; the first matching role wins (dict order matters).
        self.color_role_keywords = {
            "primary": ["primary", "brand", "main", "accent"],
            "secondary": ["secondary", "alt", "alternate"],
            "success": ["success", "green", "positive", "valid"],
            "warning": ["warning", "yellow", "caution", "alert"],
            "error": ["error", "red", "danger", "invalid", "negative"],
            "info": ["info", "blue", "notice"],
            "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
            "background": ["bg", "background", "surface"],
            "text": ["text", "foreground", "content", "body"],
            "border": ["border", "divider", "separator", "line"],
        }
    
    def normalize(self, extracted: "ExtractedTokens") -> "NormalizedTokens":
        """
        Normalize extracted tokens.
        
        Args:
            extracted: Raw extraction results from Agent 1
            
        Returns:
            NormalizedTokens with cleaned, deduplicated data, keyed by
            suggested token names.
        """
        # Process each token type (returns lists).
        colors_list = self._normalize_colors(extracted.colors)
        typography_list = self._normalize_typography(extracted.typography)
        spacing_list = self._normalize_spacing(extracted.spacing)
        
        # Key tokens by suggested name.  Generated names are not guaranteed
        # unique (e.g. 5px and 8px spacing can both map to "space.1"), so
        # disambiguate collisions instead of silently dropping tokens.
        colors_dict = {}
        for c in colors_list:
            self._insert_unique(colors_dict, c.suggested_name or c.value, c)
        
        typography_dict = {}
        for t in typography_list:
            key = t.suggested_name or f"{t.font_family}-{t.font_size}"
            self._insert_unique(typography_dict, key, t)
        
        spacing_dict = {}
        for s in spacing_list:
            self._insert_unique(spacing_dict, s.suggested_name or s.value, s)
        
        # Radius keys embed the value itself, so a collision means an exact
        # duplicate and overwriting is the intended dedup.
        radius_dict = {}
        for r in extracted.radius:
            radius_dict[f"radius-{r.value}"] = r
        
        # Shadow names use a stable digest of the value.  Built-in hash() is
        # randomized per process for strings (PYTHONHASHSEED), which made
        # shadow names non-reproducible across runs.
        shadows_dict = {}
        for s in extracted.shadows:
            digest = hashlib.md5(s.value.encode("utf-8")).hexdigest()[:8]
            shadows_dict[f"shadow-{digest}"] = s
        
        # Create normalized result.
        normalized = NormalizedTokens(
            viewport=extracted.viewport,
            source_url=extracted.source_url,
            colors=colors_dict,
            typography=typography_dict,
            spacing=spacing_dict,
            radius=radius_dict,
            shadows=shadows_dict,
            font_families=extracted.font_families,
            detected_spacing_base=extracted.spacing_base,
            detected_naming_convention=extracted.naming_convention,
        )
        
        return normalized
    
    @staticmethod
    def _insert_unique(bucket: dict, base_key: str, token) -> None:
        """Insert token under base_key, suffixing "-2", "-3", ... on collision."""
        key = base_key
        n = 2
        while key in bucket:
            key = f"{base_key}-{n}"
            n += 1
        bucket[key] = token
    
    def _normalize_colors(self, colors: "list[ColorToken]") -> "list[ColorToken]":
        """
        Normalize color tokens:
        - Deduplicate similar colors
        - Infer color roles
        - Assign suggested names
        - Calculate confidence
        """
        if not colors:
            return []
        
        # Step 1: Deduplicate by exact (normalized) hex value.
        unique_colors = {}
        for color in colors:
            hex_val = normalize_hex(color.value)
            if hex_val in unique_colors:
                # Merge frequency and usage metadata into the first-seen token.
                existing = unique_colors[hex_val]
                existing.frequency += color.frequency
                existing.contexts = list(set(existing.contexts + color.contexts))
                existing.elements = list(set(existing.elements + color.elements))
                existing.css_properties = list(set(existing.css_properties + color.css_properties))
            else:
                color.value = hex_val
                unique_colors[hex_val] = color
        
        # Step 2: Merge visually similar colors.
        merged_colors = self._merge_similar_colors(list(unique_colors.values()))
        
        # Step 3: Infer roles and names.
        for color in merged_colors:
            role = self._infer_color_role(color)
            if role:
                color.suggested_name = self._generate_color_name(color, role)
            else:
                color.suggested_name = self._generate_color_name_from_value(color)
            
            # Update confidence based on frequency.
            color.confidence = self._calculate_confidence(color.frequency)
        
        # Sort by frequency (most used first).
        merged_colors.sort(key=lambda c: -c.frequency)
        
        return merged_colors
    
    def _merge_similar_colors(self, colors: "list[ColorToken]") -> "list[ColorToken]":
        """Merge colors that are visually very similar (single-link to each group seed)."""
        if len(colors) <= 1:
            return colors
        
        merged = []
        used = set()
        
        for i, color1 in enumerate(colors):
            if i in used:
                continue
            
            # Collect not-yet-consumed colors similar to this seed.
            similar_group = [color1]
            for j, color2 in enumerate(colors[i+1:], i+1):
                if j in used:
                    continue
                if self._colors_are_similar(color1.value, color2.value):
                    similar_group.append(color2)
                    used.add(j)
            
            # Merge the group - keep the most frequent as the canonical token.
            similar_group.sort(key=lambda c: -c.frequency)
            primary = similar_group[0]
            
            # Aggregate usage data from the rest of the group into the
            # canonical token (css_properties included, matching the
            # exact-value dedup pass in _normalize_colors).
            for other in similar_group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
                primary.elements = list(set(primary.elements + other.elements))
                primary.css_properties = list(set(primary.css_properties + other.css_properties))
            
            merged.append(primary)
            used.add(i)
        
        return merged
    
    def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
        """Check if two colors are visually similar (RGB distance under threshold)."""
        try:
            parsed1 = parse_color(hex1)
            parsed2 = parse_color(hex2)
            if parsed1 is None or parsed2 is None:
                return False
            if parsed1.rgb is None or parsed2.rgb is None:
                return False
            
            rgb1 = parsed1.rgb
            rgb2 = parsed2.rgb
            
            # Calculate Euclidean distance in RGB space.
            distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
            return distance < self.color_similarity_threshold
        except Exception:
            # Defensive: an unparseable value simply never merges.
            return False
    
    def _infer_color_role(self, color: "ColorToken") -> Optional[str]:
        """Infer the semantic role of a color from its contexts, or None."""
        all_context = " ".join(color.contexts + color.elements).lower()
        
        for role, keywords in self.color_role_keywords.items():
            for keyword in keywords:
                if keyword in all_context:
                    return role
        
        # No keyword hit — try to infer from the color category itself.
        category = categorize_color(color.value)
        if category in ["gray", "white", "black"]:
            return "neutral"
        
        return None
    
    def _generate_color_name(self, color: "ColorToken", role: str) -> str:
        """Generate a semantic name like "color.primary.500" for a color."""
        # Determine shade level from Rec. 601 luma of the parsed RGB.
        parsed = parse_color(color.value)
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.8:
                shade = "50"
            elif luminance > 0.6:
                shade = "200"
            elif luminance > 0.4:
                shade = "500"
            elif luminance > 0.2:
                shade = "700"
            else:
                shade = "900"
        else:
            shade = "500"
        
        return f"color.{role}.{shade}"
    
    def _generate_color_name_from_value(self, color: "ColorToken") -> str:
        """Generate a name based on the color value itself (no role found)."""
        category = categorize_color(color.value)
        parsed = parse_color(color.value)
        
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.6:
                shade = "light"
            elif luminance > 0.3:
                shade = "base"
            else:
                shade = "dark"
        else:
            shade = "base"
        
        return f"color.{category}.{shade}"
    
    def _normalize_typography(self, typography: "list[TypographyToken]") -> "list[TypographyToken]":
        """
        Normalize typography tokens:
        - Deduplicate identical styles
        - Infer type scale categories
        - Assign suggested names
        """
        if not typography:
            return []
        
        # Deduplicate by the unique style combination.
        unique_typo = {}
        for typo in typography:
            key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
            if key in unique_typo:
                existing = unique_typo[key]
                existing.frequency += typo.frequency
                existing.elements = list(set(existing.elements + typo.elements))
            else:
                unique_typo[key] = typo
        
        result = list(unique_typo.values())
        
        # Infer names based on size and elements.
        for typo in result:
            typo.suggested_name = self._generate_typography_name(typo)
            typo.confidence = self._calculate_confidence(typo.frequency)
        
        # Sort by font size (largest first).
        result.sort(key=lambda t: -self._parse_font_size(t.font_size))
        
        return result
    
    def _generate_typography_name(self, typo: "TypographyToken") -> str:
        """Generate a semantic name like "font.heading.lg" for typography."""
        size_px = self._parse_font_size(typo.font_size)
        elements = " ".join(typo.elements).lower()
        
        # Determine category from the elements the style was seen on.
        if any(h in elements for h in ["h1", "hero", "display"]):
            category = "display"
        elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
            category = "heading"
        elif any(h in elements for h in ["label", "caption", "small", "meta"]):
            category = "label"
        elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
            category = "body"
        else:
            category = "text"
        
        # Determine size tier from the pixel size.
        if size_px >= 32:
            size_tier = "xl"
        elif size_px >= 24:
            size_tier = "lg"
        elif size_px >= 18:
            size_tier = "md"
        elif size_px >= 14:
            size_tier = "sm"
        else:
            size_tier = "xs"
        
        return f"font.{category}.{size_tier}"
    
    def _parse_font_size(self, size: str) -> float:
        """Parse a CSS font-size string to pixels; falls back to 16 on failure."""
        if not size:
            return 16
        
        size = size.lower().strip()
        
        # Handle px.
        if "px" in size:
            try:
                return float(size.replace("px", ""))
            except ValueError:
                return 16
        
        # Handle rem (assume 16px base).  Checked before "em", which is a
        # substring of "rem".
        if "rem" in size:
            try:
                return float(size.replace("rem", "")) * 16
            except ValueError:
                return 16
        
        # Handle em (assume 16px base).
        if "em" in size:
            try:
                return float(size.replace("em", "")) * 16
            except ValueError:
                return 16
        
        # Try plain number.
        try:
            return float(size)
        except ValueError:
            return 16
    
    def _normalize_spacing(self, spacing: "list[SpacingToken]") -> "list[SpacingToken]":
        """
        Normalize spacing tokens:
        - Merge similar values
        - Align to base-8 grid if close
        - Assign suggested names
        """
        if not spacing:
            return []
        
        # Deduplicate by exact value string.
        unique_spacing = {}
        for space in spacing:
            key = space.value
            if key in unique_spacing:
                existing = unique_spacing[key]
                existing.frequency += space.frequency
                existing.contexts = list(set(existing.contexts + space.contexts))
            else:
                unique_spacing[key] = space
        
        result = list(unique_spacing.values())
        
        # Merge very similar pixel values.
        result = self._merge_similar_spacing(result)
        
        # Assign names and confidence.
        for space in result:
            space.suggested_name = self._generate_spacing_name(space)
            space.confidence = self._calculate_confidence(space.frequency)
        
        # Sort by pixel value, ascending.
        result.sort(key=lambda s: s.value_px)
        
        return result
    
    def _merge_similar_spacing(self, spacing: "list[SpacingToken]") -> "list[SpacingToken]":
        """Merge spacing values whose pixel sizes are within the merge threshold."""
        if len(spacing) <= 1:
            return spacing
        
        # Sort by pixel value so similar values are adjacent.
        spacing.sort(key=lambda s: s.value_px)
        
        merged = []
        i = 0
        
        while i < len(spacing):
            current = spacing[i]
            group = [current]
            
            # Extend the group with adjacent values close to the group seed.
            j = i + 1
            while j < len(spacing):
                if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
                    group.append(spacing[j])
                    j += 1
                else:
                    break
            
            # Merge group - prefer a base-8 aligned value, then most frequent.
            group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
            primary = group[0]
            
            for other in group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
            
            merged.append(primary)
            i = j
        
        return merged
    
    def _generate_spacing_name(self, space: "SpacingToken") -> str:
        """Generate a Tailwind-style scale name like "space.2" for a spacing value."""
        px = space.value_px
        
        # Map pixel buckets onto the 4px-based scale.
        if px <= 2:
            size = "px"
        elif px <= 4:
            size = "0.5"
        elif px <= 8:
            size = "1"
        elif px <= 12:
            size = "1.5"
        elif px <= 16:
            size = "2"
        elif px <= 20:
            size = "2.5"
        elif px <= 24:
            size = "3"
        elif px <= 32:
            size = "4"
        elif px <= 40:
            size = "5"
        elif px <= 48:
            size = "6"
        elif px <= 64:
            size = "8"
        elif px <= 80:
            size = "10"
        elif px <= 96:
            size = "12"
        else:
            # Beyond the named buckets, fall back to value / 4.
            size = str(int(px / 4))
        
        return f"space.{size}"
    
    def _calculate_confidence(self, frequency: int) -> "Confidence":
        """Map an occurrence count to a confidence level (10+ high, 3+ medium)."""
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        else:
            return Confidence.LOW


def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Convenience wrapper: build a TokenNormalizer and run it on *extracted*."""
    return TokenNormalizer().normalize(extracted)