riazmo commited on
Commit
8a330ac
·
verified ·
1 Parent(s): 98da421

Upload normalizer.py

Browse files
Files changed (1) hide show
  1. agents/normalizer.py +497 -0
agents/normalizer.py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 2: Token Normalizer & Structurer
3
+ Design System Extractor v2
4
+
5
+ Persona: Design System Librarian
6
+
7
+ Responsibilities:
8
+ - Clean noisy extraction data
9
+ - Deduplicate similar tokens (colors within threshold, similar spacing)
10
+ - Infer naming patterns from class names and contexts
11
+ - Tag tokens as: detected | inferred | low-confidence
12
+ - Group colors by role (primary, secondary, neutral, etc.)
13
+ """
14
+
15
+ import re
16
+ from typing import Optional
17
+ from collections import defaultdict
18
+
19
+ from core.token_schema import (
20
+ ColorToken,
21
+ TypographyToken,
22
+ SpacingToken,
23
+ ExtractedTokens,
24
+ NormalizedTokens,
25
+ Confidence,
26
+ TokenSource,
27
+ )
28
+ from core.color_utils import (
29
+ parse_color,
30
+ normalize_hex,
31
+ categorize_color,
32
+ )
33
+
34
+
35
class TokenNormalizer:
    """
    Agent 2 ("Design System Librarian"): normalizes extracted tokens.

    Takes the raw, noisy extraction output and organizes it into a clean
    structure: deduplicates colors/typography/spacing, merges visually or
    numerically near-identical values, infers semantic names (e.g.
    ``color.primary.500``, ``font.heading.lg``, ``space.2``), and tags
    each token with a frequency-based confidence level.
    """
42
+
43
    def __init__(self):
        """Set up merge thresholds and the role-keyword lookup table."""
        # Thresholds for duplicate detection.
        self.color_similarity_threshold = 10  # Euclidean delta in RGB space
        self.spacing_merge_threshold = 2  # px difference to merge

        # Naming patterns: role -> keywords suggesting that role when found
        # in class names / element contexts. Iterated in declaration order,
        # so earlier roles win when several keywords match.
        self.color_role_keywords = {
            "primary": ["primary", "brand", "main", "accent"],
            "secondary": ["secondary", "alt", "alternate"],
            "success": ["success", "green", "positive", "valid"],
            "warning": ["warning", "yellow", "caution", "alert"],
            "error": ["error", "red", "danger", "invalid", "negative"],
            "info": ["info", "blue", "notice"],
            "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
            "background": ["bg", "background", "surface"],
            "text": ["text", "foreground", "content", "body"],
            "border": ["border", "divider", "separator", "line"],
        }
61
+
62
+ def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
63
+ """
64
+ Normalize extracted tokens.
65
+
66
+ Args:
67
+ extracted: Raw extraction results from Agent 1
68
+
69
+ Returns:
70
+ NormalizedTokens with cleaned, deduplicated data
71
+ """
72
+ # Process each token type (returns lists)
73
+ colors_list = self._normalize_colors(extracted.colors)
74
+ typography_list = self._normalize_typography(extracted.typography)
75
+ spacing_list = self._normalize_spacing(extracted.spacing)
76
+
77
+ # Convert to dicts keyed by suggested_name
78
+ colors_dict = {}
79
+ for c in colors_list:
80
+ key = c.suggested_name or c.value
81
+ colors_dict[key] = c
82
+
83
+ typography_dict = {}
84
+ for t in typography_list:
85
+ key = t.suggested_name or f"{t.font_family}-{t.font_size}"
86
+ typography_dict[key] = t
87
+
88
+ spacing_dict = {}
89
+ for s in spacing_list:
90
+ key = s.suggested_name or s.value
91
+ spacing_dict[key] = s
92
+
93
+ # Convert radius and shadows to dicts
94
+ radius_dict = {}
95
+ for r in extracted.radius:
96
+ key = f"radius-{r.value}"
97
+ radius_dict[key] = r
98
+
99
+ shadows_dict = {}
100
+ for s in extracted.shadows:
101
+ key = f"shadow-{hash(s.value) % 1000}"
102
+ shadows_dict[key] = s
103
+
104
+ # Create normalized result
105
+ normalized = NormalizedTokens(
106
+ viewport=extracted.viewport,
107
+ source_url=extracted.source_url,
108
+ colors=colors_dict,
109
+ typography=typography_dict,
110
+ spacing=spacing_dict,
111
+ radius=radius_dict,
112
+ shadows=shadows_dict,
113
+ font_families=extracted.font_families,
114
+ detected_spacing_base=extracted.spacing_base,
115
+ detected_naming_convention=extracted.naming_convention,
116
+ )
117
+
118
+ return normalized
119
+
120
+ def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
121
+ """
122
+ Normalize color tokens:
123
+ - Deduplicate similar colors
124
+ - Infer color roles
125
+ - Assign suggested names
126
+ - Calculate confidence
127
+ """
128
+ if not colors:
129
+ return []
130
+
131
+ # Step 1: Deduplicate by exact hex value
132
+ unique_colors = {}
133
+ for color in colors:
134
+ hex_val = normalize_hex(color.value)
135
+ if hex_val in unique_colors:
136
+ # Merge frequency and contexts
137
+ existing = unique_colors[hex_val]
138
+ existing.frequency += color.frequency
139
+ existing.contexts = list(set(existing.contexts + color.contexts))
140
+ existing.elements = list(set(existing.elements + color.elements))
141
+ existing.css_properties = list(set(existing.css_properties + color.css_properties))
142
+ else:
143
+ color.value = hex_val
144
+ unique_colors[hex_val] = color
145
+
146
+ # Step 2: Merge visually similar colors
147
+ merged_colors = self._merge_similar_colors(list(unique_colors.values()))
148
+
149
+ # Step 3: Infer roles and names
150
+ for color in merged_colors:
151
+ role = self._infer_color_role(color)
152
+ if role:
153
+ color.suggested_name = self._generate_color_name(color, role)
154
+ else:
155
+ color.suggested_name = self._generate_color_name_from_value(color)
156
+
157
+ # Update confidence based on frequency
158
+ color.confidence = self._calculate_confidence(color.frequency)
159
+
160
+ # Sort by frequency (most used first)
161
+ merged_colors.sort(key=lambda c: -c.frequency)
162
+
163
+ return merged_colors
164
+
165
+ def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
166
+ """Merge colors that are visually very similar."""
167
+ if len(colors) <= 1:
168
+ return colors
169
+
170
+ merged = []
171
+ used = set()
172
+
173
+ for i, color1 in enumerate(colors):
174
+ if i in used:
175
+ continue
176
+
177
+ # Find similar colors
178
+ similar_group = [color1]
179
+ for j, color2 in enumerate(colors[i+1:], i+1):
180
+ if j in used:
181
+ continue
182
+ if self._colors_are_similar(color1.value, color2.value):
183
+ similar_group.append(color2)
184
+ used.add(j)
185
+
186
+ # Merge the group - keep the most frequent
187
+ similar_group.sort(key=lambda c: -c.frequency)
188
+ primary = similar_group[0]
189
+
190
+ # Aggregate data from similar colors
191
+ for other in similar_group[1:]:
192
+ primary.frequency += other.frequency
193
+ primary.contexts = list(set(primary.contexts + other.contexts))
194
+ primary.elements = list(set(primary.elements + other.elements))
195
+
196
+ merged.append(primary)
197
+ used.add(i)
198
+
199
+ return merged
200
+
201
+ def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
202
+ """Check if two colors are visually similar."""
203
+ try:
204
+ parsed1 = parse_color(hex1)
205
+ parsed2 = parse_color(hex2)
206
+ if parsed1 is None or parsed2 is None:
207
+ return False
208
+ if parsed1.rgb is None or parsed2.rgb is None:
209
+ return False
210
+
211
+ rgb1 = parsed1.rgb
212
+ rgb2 = parsed2.rgb
213
+
214
+ # Calculate Euclidean distance in RGB space
215
+ distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
216
+ return distance < self.color_similarity_threshold
217
+ except Exception:
218
+ return False
219
+
220
+ def _infer_color_role(self, color: ColorToken) -> Optional[str]:
221
+ """Infer the semantic role of a color from its contexts."""
222
+ all_context = " ".join(color.contexts + color.elements).lower()
223
+
224
+ for role, keywords in self.color_role_keywords.items():
225
+ for keyword in keywords:
226
+ if keyword in all_context:
227
+ return role
228
+
229
+ # Try to infer from color category
230
+ category = categorize_color(color.value)
231
+ if category in ["gray", "white", "black"]:
232
+ return "neutral"
233
+
234
+ return None
235
+
236
+ def _generate_color_name(self, color: ColorToken, role: str) -> str:
237
+ """Generate a semantic name for a color."""
238
+ # Determine shade level based on luminance
239
+ parsed = parse_color(color.value)
240
+ if parsed and parsed.rgb:
241
+ rgb = parsed.rgb
242
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
243
+ if luminance > 0.8:
244
+ shade = "50"
245
+ elif luminance > 0.6:
246
+ shade = "200"
247
+ elif luminance > 0.4:
248
+ shade = "500"
249
+ elif luminance > 0.2:
250
+ shade = "700"
251
+ else:
252
+ shade = "900"
253
+ else:
254
+ shade = "500"
255
+
256
+ return f"color.{role}.{shade}"
257
+
258
+ def _generate_color_name_from_value(self, color: ColorToken) -> str:
259
+ """Generate a name based on the color value itself."""
260
+ category = categorize_color(color.value)
261
+ parsed = parse_color(color.value)
262
+
263
+ if parsed and parsed.rgb:
264
+ rgb = parsed.rgb
265
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
266
+ if luminance > 0.6:
267
+ shade = "light"
268
+ elif luminance > 0.3:
269
+ shade = "base"
270
+ else:
271
+ shade = "dark"
272
+ else:
273
+ shade = "base"
274
+
275
+ return f"color.{category}.{shade}"
276
+
277
+ def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
278
+ """
279
+ Normalize typography tokens:
280
+ - Deduplicate identical styles
281
+ - Infer type scale categories
282
+ - Assign suggested names
283
+ """
284
+ if not typography:
285
+ return []
286
+
287
+ # Deduplicate by unique style combination
288
+ unique_typo = {}
289
+ for typo in typography:
290
+ key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
291
+ if key in unique_typo:
292
+ existing = unique_typo[key]
293
+ existing.frequency += typo.frequency
294
+ existing.elements = list(set(existing.elements + typo.elements))
295
+ else:
296
+ unique_typo[key] = typo
297
+
298
+ result = list(unique_typo.values())
299
+
300
+ # Infer names based on size and elements
301
+ for typo in result:
302
+ typo.suggested_name = self._generate_typography_name(typo)
303
+ typo.confidence = self._calculate_confidence(typo.frequency)
304
+
305
+ # Sort by font size (largest first)
306
+ result.sort(key=lambda t: -self._parse_font_size(t.font_size))
307
+
308
+ return result
309
+
310
+ def _generate_typography_name(self, typo: TypographyToken) -> str:
311
+ """Generate a semantic name for typography."""
312
+ size_px = self._parse_font_size(typo.font_size)
313
+ elements = " ".join(typo.elements).lower()
314
+
315
+ # Determine category from elements
316
+ if any(h in elements for h in ["h1", "hero", "display"]):
317
+ category = "display"
318
+ elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
319
+ category = "heading"
320
+ elif any(h in elements for h in ["label", "caption", "small", "meta"]):
321
+ category = "label"
322
+ elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
323
+ category = "body"
324
+ else:
325
+ category = "text"
326
+
327
+ # Determine size tier
328
+ if size_px >= 32:
329
+ size_tier = "xl"
330
+ elif size_px >= 24:
331
+ size_tier = "lg"
332
+ elif size_px >= 18:
333
+ size_tier = "md"
334
+ elif size_px >= 14:
335
+ size_tier = "sm"
336
+ else:
337
+ size_tier = "xs"
338
+
339
+ return f"font.{category}.{size_tier}"
340
+
341
+ def _parse_font_size(self, size: str) -> float:
342
+ """Parse font size string to pixels."""
343
+ if not size:
344
+ return 16
345
+
346
+ size = size.lower().strip()
347
+
348
+ # Handle px
349
+ if "px" in size:
350
+ try:
351
+ return float(size.replace("px", ""))
352
+ except ValueError:
353
+ return 16
354
+
355
+ # Handle rem (assume 16px base)
356
+ if "rem" in size:
357
+ try:
358
+ return float(size.replace("rem", "")) * 16
359
+ except ValueError:
360
+ return 16
361
+
362
+ # Handle em (assume 16px base)
363
+ if "em" in size:
364
+ try:
365
+ return float(size.replace("em", "")) * 16
366
+ except ValueError:
367
+ return 16
368
+
369
+ # Try plain number
370
+ try:
371
+ return float(size)
372
+ except ValueError:
373
+ return 16
374
+
375
+ def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
376
+ """
377
+ Normalize spacing tokens:
378
+ - Merge similar values
379
+ - Align to base-8 grid if close
380
+ - Assign suggested names
381
+ """
382
+ if not spacing:
383
+ return []
384
+
385
+ # Deduplicate by value
386
+ unique_spacing = {}
387
+ for space in spacing:
388
+ key = space.value
389
+ if key in unique_spacing:
390
+ existing = unique_spacing[key]
391
+ existing.frequency += space.frequency
392
+ existing.contexts = list(set(existing.contexts + space.contexts))
393
+ else:
394
+ unique_spacing[key] = space
395
+
396
+ result = list(unique_spacing.values())
397
+
398
+ # Merge very similar values
399
+ result = self._merge_similar_spacing(result)
400
+
401
+ # Assign names
402
+ for space in result:
403
+ space.suggested_name = self._generate_spacing_name(space)
404
+ space.confidence = self._calculate_confidence(space.frequency)
405
+
406
+ # Sort by value
407
+ result.sort(key=lambda s: s.value_px)
408
+
409
+ return result
410
+
411
+ def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
412
+ """Merge spacing values that are very close."""
413
+ if len(spacing) <= 1:
414
+ return spacing
415
+
416
+ # Sort by pixel value
417
+ spacing.sort(key=lambda s: s.value_px)
418
+
419
+ merged = []
420
+ i = 0
421
+
422
+ while i < len(spacing):
423
+ current = spacing[i]
424
+ group = [current]
425
+
426
+ # Find adjacent similar values
427
+ j = i + 1
428
+ while j < len(spacing):
429
+ if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
430
+ group.append(spacing[j])
431
+ j += 1
432
+ else:
433
+ break
434
+
435
+ # Merge group - prefer base-8 aligned value or most frequent
436
+ group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
437
+ primary = group[0]
438
+
439
+ for other in group[1:]:
440
+ primary.frequency += other.frequency
441
+ primary.contexts = list(set(primary.contexts + other.contexts))
442
+
443
+ merged.append(primary)
444
+ i = j
445
+
446
+ return merged
447
+
448
+ def _generate_spacing_name(self, space: SpacingToken) -> str:
449
+ """Generate a semantic name for spacing."""
450
+ px = space.value_px
451
+
452
+ # Map to t-shirt sizes based on value
453
+ if px <= 2:
454
+ size = "px"
455
+ elif px <= 4:
456
+ size = "0.5"
457
+ elif px <= 8:
458
+ size = "1"
459
+ elif px <= 12:
460
+ size = "1.5"
461
+ elif px <= 16:
462
+ size = "2"
463
+ elif px <= 20:
464
+ size = "2.5"
465
+ elif px <= 24:
466
+ size = "3"
467
+ elif px <= 32:
468
+ size = "4"
469
+ elif px <= 40:
470
+ size = "5"
471
+ elif px <= 48:
472
+ size = "6"
473
+ elif px <= 64:
474
+ size = "8"
475
+ elif px <= 80:
476
+ size = "10"
477
+ elif px <= 96:
478
+ size = "12"
479
+ else:
480
+ size = str(int(px / 4))
481
+
482
+ return f"space.{size}"
483
+
484
+ def _calculate_confidence(self, frequency: int) -> Confidence:
485
+ """Calculate confidence based on frequency."""
486
+ if frequency >= 10:
487
+ return Confidence.HIGH
488
+ elif frequency >= 3:
489
+ return Confidence.MEDIUM
490
+ else:
491
+ return Confidence.LOW
492
+
493
+
494
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Module-level shortcut: normalize *extracted* with a fresh TokenNormalizer."""
    return TokenNormalizer().normalize(extracted)