riazmo commited on
Commit
e36dc73
·
verified ·
1 Parent(s): 92ac926

Upload normalizer.py

Browse files
Files changed (1) hide show
  1. agents/normalizer.py +469 -0
agents/normalizer.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 2: Token Normalizer & Structurer
3
+ Design System Extractor v2
4
+
5
+ Persona: Design System Librarian
6
+
7
+ Responsibilities:
8
+ - Clean noisy extraction data
9
+ - Deduplicate similar tokens (colors within threshold, similar spacing)
10
+ - Infer naming patterns from class names and contexts
11
+ - Tag tokens as: detected | inferred | low-confidence
12
+ - Group colors by role (primary, secondary, neutral, etc.)
13
+ """
14
+
15
+ import re
16
+ from typing import Optional
17
+ from collections import defaultdict
18
+
19
+ from core.token_schema import (
20
+ ColorToken,
21
+ TypographyToken,
22
+ SpacingToken,
23
+ ExtractedTokens,
24
+ NormalizedTokens,
25
+ Confidence,
26
+ TokenSource,
27
+ )
28
+ from core.color_utils import (
29
+ parse_color,
30
+ normalize_hex,
31
+ categorize_color,
32
+ )
33
+
34
+
35
class TokenNormalizer:
    """
    Normalizes and structures extracted tokens.

    This is Agent 2's job — taking raw extraction data and
    organizing it into a clean, deduplicated structure: exact and
    near-duplicate merging, semantic name inference, and a
    frequency-based confidence tag on every token.
    """
42
+
43
+ def __init__(self):
44
+ # Thresholds for duplicate detection
45
+ self.color_similarity_threshold = 10 # Delta in RGB space
46
+ self.spacing_merge_threshold = 2 # px difference to merge
47
+
48
+ # Naming patterns
49
+ self.color_role_keywords = {
50
+ "primary": ["primary", "brand", "main", "accent"],
51
+ "secondary": ["secondary", "alt", "alternate"],
52
+ "success": ["success", "green", "positive", "valid"],
53
+ "warning": ["warning", "yellow", "caution", "alert"],
54
+ "error": ["error", "red", "danger", "invalid", "negative"],
55
+ "info": ["info", "blue", "notice"],
56
+ "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
57
+ "background": ["bg", "background", "surface"],
58
+ "text": ["text", "foreground", "content", "body"],
59
+ "border": ["border", "divider", "separator", "line"],
60
+ }
61
+
62
+ def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
63
+ """
64
+ Normalize extracted tokens.
65
+
66
+ Args:
67
+ extracted: Raw extraction results from Agent 1
68
+
69
+ Returns:
70
+ NormalizedTokens with cleaned, deduplicated data
71
+ """
72
+ # Process each token type
73
+ colors = self._normalize_colors(extracted.colors)
74
+ typography = self._normalize_typography(extracted.typography)
75
+ spacing = self._normalize_spacing(extracted.spacing)
76
+
77
+ # Create normalized result
78
+ normalized = NormalizedTokens(
79
+ viewport=extracted.viewport,
80
+ colors=colors,
81
+ typography=typography,
82
+ spacing=spacing,
83
+ radius=extracted.radius, # Pass through for now
84
+ shadows=extracted.shadows, # Pass through for now
85
+ font_families=extracted.font_families,
86
+ pages_crawled=extracted.pages_crawled,
87
+ total_elements=extracted.total_elements,
88
+ )
89
+
90
+ return normalized
91
+
92
+ def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
93
+ """
94
+ Normalize color tokens:
95
+ - Deduplicate similar colors
96
+ - Infer color roles
97
+ - Assign suggested names
98
+ - Calculate confidence
99
+ """
100
+ if not colors:
101
+ return []
102
+
103
+ # Step 1: Deduplicate by exact hex value
104
+ unique_colors = {}
105
+ for color in colors:
106
+ hex_val = normalize_hex(color.value)
107
+ if hex_val in unique_colors:
108
+ # Merge frequency and contexts
109
+ existing = unique_colors[hex_val]
110
+ existing.frequency += color.frequency
111
+ existing.contexts = list(set(existing.contexts + color.contexts))
112
+ existing.elements = list(set(existing.elements + color.elements))
113
+ existing.css_properties = list(set(existing.css_properties + color.css_properties))
114
+ else:
115
+ color.value = hex_val
116
+ unique_colors[hex_val] = color
117
+
118
+ # Step 2: Merge visually similar colors
119
+ merged_colors = self._merge_similar_colors(list(unique_colors.values()))
120
+
121
+ # Step 3: Infer roles and names
122
+ for color in merged_colors:
123
+ role = self._infer_color_role(color)
124
+ if role:
125
+ color.suggested_name = self._generate_color_name(color, role)
126
+ else:
127
+ color.suggested_name = self._generate_color_name_from_value(color)
128
+
129
+ # Update confidence based on frequency
130
+ color.confidence = self._calculate_confidence(color.frequency)
131
+
132
+ # Sort by frequency (most used first)
133
+ merged_colors.sort(key=lambda c: -c.frequency)
134
+
135
+ return merged_colors
136
+
137
+ def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
138
+ """Merge colors that are visually very similar."""
139
+ if len(colors) <= 1:
140
+ return colors
141
+
142
+ merged = []
143
+ used = set()
144
+
145
+ for i, color1 in enumerate(colors):
146
+ if i in used:
147
+ continue
148
+
149
+ # Find similar colors
150
+ similar_group = [color1]
151
+ for j, color2 in enumerate(colors[i+1:], i+1):
152
+ if j in used:
153
+ continue
154
+ if self._colors_are_similar(color1.value, color2.value):
155
+ similar_group.append(color2)
156
+ used.add(j)
157
+
158
+ # Merge the group - keep the most frequent
159
+ similar_group.sort(key=lambda c: -c.frequency)
160
+ primary = similar_group[0]
161
+
162
+ # Aggregate data from similar colors
163
+ for other in similar_group[1:]:
164
+ primary.frequency += other.frequency
165
+ primary.contexts = list(set(primary.contexts + other.contexts))
166
+ primary.elements = list(set(primary.elements + other.elements))
167
+
168
+ merged.append(primary)
169
+ used.add(i)
170
+
171
+ return merged
172
+
173
+ def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
174
+ """Check if two colors are visually similar."""
175
+ try:
176
+ parsed1 = parse_color(hex1)
177
+ parsed2 = parse_color(hex2)
178
+ if parsed1 is None or parsed2 is None:
179
+ return False
180
+ if parsed1.rgb is None or parsed2.rgb is None:
181
+ return False
182
+
183
+ rgb1 = parsed1.rgb
184
+ rgb2 = parsed2.rgb
185
+
186
+ # Calculate Euclidean distance in RGB space
187
+ distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
188
+ return distance < self.color_similarity_threshold
189
+ except Exception:
190
+ return False
191
+
192
+ def _infer_color_role(self, color: ColorToken) -> Optional[str]:
193
+ """Infer the semantic role of a color from its contexts."""
194
+ all_context = " ".join(color.contexts + color.elements).lower()
195
+
196
+ for role, keywords in self.color_role_keywords.items():
197
+ for keyword in keywords:
198
+ if keyword in all_context:
199
+ return role
200
+
201
+ # Try to infer from color category
202
+ category = categorize_color(color.value)
203
+ if category in ["gray", "white", "black"]:
204
+ return "neutral"
205
+
206
+ return None
207
+
208
+ def _generate_color_name(self, color: ColorToken, role: str) -> str:
209
+ """Generate a semantic name for a color."""
210
+ # Determine shade level based on luminance
211
+ parsed = parse_color(color.value)
212
+ if parsed and parsed.rgb:
213
+ rgb = parsed.rgb
214
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
215
+ if luminance > 0.8:
216
+ shade = "50"
217
+ elif luminance > 0.6:
218
+ shade = "200"
219
+ elif luminance > 0.4:
220
+ shade = "500"
221
+ elif luminance > 0.2:
222
+ shade = "700"
223
+ else:
224
+ shade = "900"
225
+ else:
226
+ shade = "500"
227
+
228
+ return f"color.{role}.{shade}"
229
+
230
+ def _generate_color_name_from_value(self, color: ColorToken) -> str:
231
+ """Generate a name based on the color value itself."""
232
+ category = categorize_color(color.value)
233
+ parsed = parse_color(color.value)
234
+
235
+ if parsed and parsed.rgb:
236
+ rgb = parsed.rgb
237
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
238
+ if luminance > 0.6:
239
+ shade = "light"
240
+ elif luminance > 0.3:
241
+ shade = "base"
242
+ else:
243
+ shade = "dark"
244
+ else:
245
+ shade = "base"
246
+
247
+ return f"color.{category}.{shade}"
248
+
249
+ def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
250
+ """
251
+ Normalize typography tokens:
252
+ - Deduplicate identical styles
253
+ - Infer type scale categories
254
+ - Assign suggested names
255
+ """
256
+ if not typography:
257
+ return []
258
+
259
+ # Deduplicate by unique style combination
260
+ unique_typo = {}
261
+ for typo in typography:
262
+ key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
263
+ if key in unique_typo:
264
+ existing = unique_typo[key]
265
+ existing.frequency += typo.frequency
266
+ existing.elements = list(set(existing.elements + typo.elements))
267
+ else:
268
+ unique_typo[key] = typo
269
+
270
+ result = list(unique_typo.values())
271
+
272
+ # Infer names based on size and elements
273
+ for typo in result:
274
+ typo.suggested_name = self._generate_typography_name(typo)
275
+ typo.confidence = self._calculate_confidence(typo.frequency)
276
+
277
+ # Sort by font size (largest first)
278
+ result.sort(key=lambda t: -self._parse_font_size(t.font_size))
279
+
280
+ return result
281
+
282
+ def _generate_typography_name(self, typo: TypographyToken) -> str:
283
+ """Generate a semantic name for typography."""
284
+ size_px = self._parse_font_size(typo.font_size)
285
+ elements = " ".join(typo.elements).lower()
286
+
287
+ # Determine category from elements
288
+ if any(h in elements for h in ["h1", "hero", "display"]):
289
+ category = "display"
290
+ elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
291
+ category = "heading"
292
+ elif any(h in elements for h in ["label", "caption", "small", "meta"]):
293
+ category = "label"
294
+ elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
295
+ category = "body"
296
+ else:
297
+ category = "text"
298
+
299
+ # Determine size tier
300
+ if size_px >= 32:
301
+ size_tier = "xl"
302
+ elif size_px >= 24:
303
+ size_tier = "lg"
304
+ elif size_px >= 18:
305
+ size_tier = "md"
306
+ elif size_px >= 14:
307
+ size_tier = "sm"
308
+ else:
309
+ size_tier = "xs"
310
+
311
+ return f"font.{category}.{size_tier}"
312
+
313
+ def _parse_font_size(self, size: str) -> float:
314
+ """Parse font size string to pixels."""
315
+ if not size:
316
+ return 16
317
+
318
+ size = size.lower().strip()
319
+
320
+ # Handle px
321
+ if "px" in size:
322
+ try:
323
+ return float(size.replace("px", ""))
324
+ except ValueError:
325
+ return 16
326
+
327
+ # Handle rem (assume 16px base)
328
+ if "rem" in size:
329
+ try:
330
+ return float(size.replace("rem", "")) * 16
331
+ except ValueError:
332
+ return 16
333
+
334
+ # Handle em (assume 16px base)
335
+ if "em" in size:
336
+ try:
337
+ return float(size.replace("em", "")) * 16
338
+ except ValueError:
339
+ return 16
340
+
341
+ # Try plain number
342
+ try:
343
+ return float(size)
344
+ except ValueError:
345
+ return 16
346
+
347
+ def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
348
+ """
349
+ Normalize spacing tokens:
350
+ - Merge similar values
351
+ - Align to base-8 grid if close
352
+ - Assign suggested names
353
+ """
354
+ if not spacing:
355
+ return []
356
+
357
+ # Deduplicate by value
358
+ unique_spacing = {}
359
+ for space in spacing:
360
+ key = space.value
361
+ if key in unique_spacing:
362
+ existing = unique_spacing[key]
363
+ existing.frequency += space.frequency
364
+ existing.contexts = list(set(existing.contexts + space.contexts))
365
+ else:
366
+ unique_spacing[key] = space
367
+
368
+ result = list(unique_spacing.values())
369
+
370
+ # Merge very similar values
371
+ result = self._merge_similar_spacing(result)
372
+
373
+ # Assign names
374
+ for space in result:
375
+ space.suggested_name = self._generate_spacing_name(space)
376
+ space.confidence = self._calculate_confidence(space.frequency)
377
+
378
+ # Sort by value
379
+ result.sort(key=lambda s: s.value_px)
380
+
381
+ return result
382
+
383
+ def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
384
+ """Merge spacing values that are very close."""
385
+ if len(spacing) <= 1:
386
+ return spacing
387
+
388
+ # Sort by pixel value
389
+ spacing.sort(key=lambda s: s.value_px)
390
+
391
+ merged = []
392
+ i = 0
393
+
394
+ while i < len(spacing):
395
+ current = spacing[i]
396
+ group = [current]
397
+
398
+ # Find adjacent similar values
399
+ j = i + 1
400
+ while j < len(spacing):
401
+ if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
402
+ group.append(spacing[j])
403
+ j += 1
404
+ else:
405
+ break
406
+
407
+ # Merge group - prefer base-8 aligned value or most frequent
408
+ group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
409
+ primary = group[0]
410
+
411
+ for other in group[1:]:
412
+ primary.frequency += other.frequency
413
+ primary.contexts = list(set(primary.contexts + other.contexts))
414
+
415
+ merged.append(primary)
416
+ i = j
417
+
418
+ return merged
419
+
420
+ def _generate_spacing_name(self, space: SpacingToken) -> str:
421
+ """Generate a semantic name for spacing."""
422
+ px = space.value_px
423
+
424
+ # Map to t-shirt sizes based on value
425
+ if px <= 2:
426
+ size = "px"
427
+ elif px <= 4:
428
+ size = "0.5"
429
+ elif px <= 8:
430
+ size = "1"
431
+ elif px <= 12:
432
+ size = "1.5"
433
+ elif px <= 16:
434
+ size = "2"
435
+ elif px <= 20:
436
+ size = "2.5"
437
+ elif px <= 24:
438
+ size = "3"
439
+ elif px <= 32:
440
+ size = "4"
441
+ elif px <= 40:
442
+ size = "5"
443
+ elif px <= 48:
444
+ size = "6"
445
+ elif px <= 64:
446
+ size = "8"
447
+ elif px <= 80:
448
+ size = "10"
449
+ elif px <= 96:
450
+ size = "12"
451
+ else:
452
+ size = str(int(px / 4))
453
+
454
+ return f"space.{size}"
455
+
456
+ def _calculate_confidence(self, frequency: int) -> Confidence:
457
+ """Calculate confidence based on frequency."""
458
+ if frequency >= 10:
459
+ return Confidence.HIGH
460
+ elif frequency >= 3:
461
+ return Confidence.MEDIUM
462
+ else:
463
+ return Confidence.LOW
464
+
465
+
466
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Module-level shortcut: run a fresh TokenNormalizer over *extracted*."""
    return TokenNormalizer().normalize(extracted)