riazmo commited on
Commit
f1c7a18
·
verified ·
1 Parent(s): 7e4e20b

Upload normalizer.py

Browse files
Files changed (1) hide show
  1. agents/normalizer.py +462 -0
agents/normalizer.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 2: Token Normalizer & Structurer
3
+ Design System Extractor v2
4
+
5
+ Persona: Design System Librarian
6
+
7
+ Responsibilities:
8
+ - Clean noisy extraction data
9
+ - Deduplicate similar tokens (colors within threshold, similar spacing)
10
+ - Infer naming patterns from class names and contexts
11
+ - Tag tokens as: detected | inferred | low-confidence
12
+ - Group colors by role (primary, secondary, neutral, etc.)
13
+ """
14
+
15
+ import re
16
+ from typing import Optional
17
+ from collections import defaultdict
18
+
19
+ from core.token_schema import (
20
+ ColorToken,
21
+ TypographyToken,
22
+ SpacingToken,
23
+ ExtractedTokens,
24
+ NormalizedTokens,
25
+ Confidence,
26
+ TokenSource,
27
+ )
28
+ from core.color_utils import (
29
+ parse_color,
30
+ normalize_hex,
31
+ categorize_color,
32
+ )
33
+
34
+
35
+ class TokenNormalizer:
36
+ """
37
+ Normalizes and structures extracted tokens.
38
+
39
+ This is Agent 2's job — taking raw extraction data and
40
+ organizing it into a clean, deduplicated structure.
41
+ """
42
+
43
+ def __init__(self):
44
+ # Thresholds for duplicate detection
45
+ self.color_similarity_threshold = 10 # Delta in RGB space
46
+ self.spacing_merge_threshold = 2 # px difference to merge
47
+
48
+ # Naming patterns
49
+ self.color_role_keywords = {
50
+ "primary": ["primary", "brand", "main", "accent"],
51
+ "secondary": ["secondary", "alt", "alternate"],
52
+ "success": ["success", "green", "positive", "valid"],
53
+ "warning": ["warning", "yellow", "caution", "alert"],
54
+ "error": ["error", "red", "danger", "invalid", "negative"],
55
+ "info": ["info", "blue", "notice"],
56
+ "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
57
+ "background": ["bg", "background", "surface"],
58
+ "text": ["text", "foreground", "content", "body"],
59
+ "border": ["border", "divider", "separator", "line"],
60
+ }
61
+
62
+ def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
63
+ """
64
+ Normalize extracted tokens.
65
+
66
+ Args:
67
+ extracted: Raw extraction results from Agent 1
68
+
69
+ Returns:
70
+ NormalizedTokens with cleaned, deduplicated data
71
+ """
72
+ # Process each token type
73
+ colors = self._normalize_colors(extracted.colors)
74
+ typography = self._normalize_typography(extracted.typography)
75
+ spacing = self._normalize_spacing(extracted.spacing)
76
+
77
+ # Create normalized result
78
+ normalized = NormalizedTokens(
79
+ viewport=extracted.viewport,
80
+ colors=colors,
81
+ typography=typography,
82
+ spacing=spacing,
83
+ radius=extracted.radius, # Pass through for now
84
+ shadows=extracted.shadows, # Pass through for now
85
+ font_families=extracted.font_families,
86
+ pages_crawled=extracted.pages_crawled,
87
+ total_elements=extracted.total_elements,
88
+ )
89
+
90
+ return normalized
91
+
92
+ def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
93
+ """
94
+ Normalize color tokens:
95
+ - Deduplicate similar colors
96
+ - Infer color roles
97
+ - Assign suggested names
98
+ - Calculate confidence
99
+ """
100
+ if not colors:
101
+ return []
102
+
103
+ # Step 1: Deduplicate by exact hex value
104
+ unique_colors = {}
105
+ for color in colors:
106
+ hex_val = normalize_hex(color.value)
107
+ if hex_val in unique_colors:
108
+ # Merge frequency and contexts
109
+ existing = unique_colors[hex_val]
110
+ existing.frequency += color.frequency
111
+ existing.contexts = list(set(existing.contexts + color.contexts))
112
+ existing.elements = list(set(existing.elements + color.elements))
113
+ existing.css_properties = list(set(existing.css_properties + color.css_properties))
114
+ else:
115
+ color.value = hex_val
116
+ unique_colors[hex_val] = color
117
+
118
+ # Step 2: Merge visually similar colors
119
+ merged_colors = self._merge_similar_colors(list(unique_colors.values()))
120
+
121
+ # Step 3: Infer roles and names
122
+ for color in merged_colors:
123
+ role = self._infer_color_role(color)
124
+ if role:
125
+ color.suggested_name = self._generate_color_name(color, role)
126
+ else:
127
+ color.suggested_name = self._generate_color_name_from_value(color)
128
+
129
+ # Update confidence based on frequency
130
+ color.confidence = self._calculate_confidence(color.frequency)
131
+
132
+ # Sort by frequency (most used first)
133
+ merged_colors.sort(key=lambda c: -c.frequency)
134
+
135
+ return merged_colors
136
+
137
+ def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
138
+ """Merge colors that are visually very similar."""
139
+ if len(colors) <= 1:
140
+ return colors
141
+
142
+ merged = []
143
+ used = set()
144
+
145
+ for i, color1 in enumerate(colors):
146
+ if i in used:
147
+ continue
148
+
149
+ # Find similar colors
150
+ similar_group = [color1]
151
+ for j, color2 in enumerate(colors[i+1:], i+1):
152
+ if j in used:
153
+ continue
154
+ if self._colors_are_similar(color1.value, color2.value):
155
+ similar_group.append(color2)
156
+ used.add(j)
157
+
158
+ # Merge the group - keep the most frequent
159
+ similar_group.sort(key=lambda c: -c.frequency)
160
+ primary = similar_group[0]
161
+
162
+ # Aggregate data from similar colors
163
+ for other in similar_group[1:]:
164
+ primary.frequency += other.frequency
165
+ primary.contexts = list(set(primary.contexts + other.contexts))
166
+ primary.elements = list(set(primary.elements + other.elements))
167
+
168
+ merged.append(primary)
169
+ used.add(i)
170
+
171
+ return merged
172
+
173
+ def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
174
+ """Check if two colors are visually similar."""
175
+ try:
176
+ rgb1 = parse_color(hex1)
177
+ rgb2 = parse_color(hex2)
178
+ if rgb1 is None or rgb2 is None:
179
+ return False
180
+
181
+ # Calculate Euclidean distance in RGB space
182
+ distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
183
+ return distance < self.color_similarity_threshold
184
+ except Exception:
185
+ return False
186
+
187
+ def _infer_color_role(self, color: ColorToken) -> Optional[str]:
188
+ """Infer the semantic role of a color from its contexts."""
189
+ all_context = " ".join(color.contexts + color.elements).lower()
190
+
191
+ for role, keywords in self.color_role_keywords.items():
192
+ for keyword in keywords:
193
+ if keyword in all_context:
194
+ return role
195
+
196
+ # Try to infer from color category
197
+ category = categorize_color(color.value)
198
+ if category in ["gray", "white", "black"]:
199
+ return "neutral"
200
+
201
+ return None
202
+
203
+ def _generate_color_name(self, color: ColorToken, role: str) -> str:
204
+ """Generate a semantic name for a color."""
205
+ # Determine shade level based on luminance
206
+ rgb = parse_color(color.value)
207
+ if rgb:
208
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
209
+ if luminance > 0.8:
210
+ shade = "50"
211
+ elif luminance > 0.6:
212
+ shade = "200"
213
+ elif luminance > 0.4:
214
+ shade = "500"
215
+ elif luminance > 0.2:
216
+ shade = "700"
217
+ else:
218
+ shade = "900"
219
+ else:
220
+ shade = "500"
221
+
222
+ return f"color.{role}.{shade}"
223
+
224
+ def _generate_color_name_from_value(self, color: ColorToken) -> str:
225
+ """Generate a name based on the color value itself."""
226
+ category = categorize_color(color.value)
227
+ rgb = parse_color(color.value)
228
+
229
+ if rgb:
230
+ luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
231
+ if luminance > 0.6:
232
+ shade = "light"
233
+ elif luminance > 0.3:
234
+ shade = "base"
235
+ else:
236
+ shade = "dark"
237
+ else:
238
+ shade = "base"
239
+
240
+ return f"color.{category}.{shade}"
241
+
242
+ def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
243
+ """
244
+ Normalize typography tokens:
245
+ - Deduplicate identical styles
246
+ - Infer type scale categories
247
+ - Assign suggested names
248
+ """
249
+ if not typography:
250
+ return []
251
+
252
+ # Deduplicate by unique style combination
253
+ unique_typo = {}
254
+ for typo in typography:
255
+ key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
256
+ if key in unique_typo:
257
+ existing = unique_typo[key]
258
+ existing.frequency += typo.frequency
259
+ existing.elements = list(set(existing.elements + typo.elements))
260
+ else:
261
+ unique_typo[key] = typo
262
+
263
+ result = list(unique_typo.values())
264
+
265
+ # Infer names based on size and elements
266
+ for typo in result:
267
+ typo.suggested_name = self._generate_typography_name(typo)
268
+ typo.confidence = self._calculate_confidence(typo.frequency)
269
+
270
+ # Sort by font size (largest first)
271
+ result.sort(key=lambda t: -self._parse_font_size(t.font_size))
272
+
273
+ return result
274
+
275
+ def _generate_typography_name(self, typo: TypographyToken) -> str:
276
+ """Generate a semantic name for typography."""
277
+ size_px = self._parse_font_size(typo.font_size)
278
+ elements = " ".join(typo.elements).lower()
279
+
280
+ # Determine category from elements
281
+ if any(h in elements for h in ["h1", "hero", "display"]):
282
+ category = "display"
283
+ elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
284
+ category = "heading"
285
+ elif any(h in elements for h in ["label", "caption", "small", "meta"]):
286
+ category = "label"
287
+ elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
288
+ category = "body"
289
+ else:
290
+ category = "text"
291
+
292
+ # Determine size tier
293
+ if size_px >= 32:
294
+ size_tier = "xl"
295
+ elif size_px >= 24:
296
+ size_tier = "lg"
297
+ elif size_px >= 18:
298
+ size_tier = "md"
299
+ elif size_px >= 14:
300
+ size_tier = "sm"
301
+ else:
302
+ size_tier = "xs"
303
+
304
+ return f"font.{category}.{size_tier}"
305
+
306
+ def _parse_font_size(self, size: str) -> float:
307
+ """Parse font size string to pixels."""
308
+ if not size:
309
+ return 16
310
+
311
+ size = size.lower().strip()
312
+
313
+ # Handle px
314
+ if "px" in size:
315
+ try:
316
+ return float(size.replace("px", ""))
317
+ except ValueError:
318
+ return 16
319
+
320
+ # Handle rem (assume 16px base)
321
+ if "rem" in size:
322
+ try:
323
+ return float(size.replace("rem", "")) * 16
324
+ except ValueError:
325
+ return 16
326
+
327
+ # Handle em (assume 16px base)
328
+ if "em" in size:
329
+ try:
330
+ return float(size.replace("em", "")) * 16
331
+ except ValueError:
332
+ return 16
333
+
334
+ # Try plain number
335
+ try:
336
+ return float(size)
337
+ except ValueError:
338
+ return 16
339
+
340
+ def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
341
+ """
342
+ Normalize spacing tokens:
343
+ - Merge similar values
344
+ - Align to base-8 grid if close
345
+ - Assign suggested names
346
+ """
347
+ if not spacing:
348
+ return []
349
+
350
+ # Deduplicate by value
351
+ unique_spacing = {}
352
+ for space in spacing:
353
+ key = space.value
354
+ if key in unique_spacing:
355
+ existing = unique_spacing[key]
356
+ existing.frequency += space.frequency
357
+ existing.contexts = list(set(existing.contexts + space.contexts))
358
+ else:
359
+ unique_spacing[key] = space
360
+
361
+ result = list(unique_spacing.values())
362
+
363
+ # Merge very similar values
364
+ result = self._merge_similar_spacing(result)
365
+
366
+ # Assign names
367
+ for space in result:
368
+ space.suggested_name = self._generate_spacing_name(space)
369
+ space.confidence = self._calculate_confidence(space.frequency)
370
+
371
+ # Sort by value
372
+ result.sort(key=lambda s: s.value_px)
373
+
374
+ return result
375
+
376
+ def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
377
+ """Merge spacing values that are very close."""
378
+ if len(spacing) <= 1:
379
+ return spacing
380
+
381
+ # Sort by pixel value
382
+ spacing.sort(key=lambda s: s.value_px)
383
+
384
+ merged = []
385
+ i = 0
386
+
387
+ while i < len(spacing):
388
+ current = spacing[i]
389
+ group = [current]
390
+
391
+ # Find adjacent similar values
392
+ j = i + 1
393
+ while j < len(spacing):
394
+ if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
395
+ group.append(spacing[j])
396
+ j += 1
397
+ else:
398
+ break
399
+
400
+ # Merge group - prefer base-8 aligned value or most frequent
401
+ group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
402
+ primary = group[0]
403
+
404
+ for other in group[1:]:
405
+ primary.frequency += other.frequency
406
+ primary.contexts = list(set(primary.contexts + other.contexts))
407
+
408
+ merged.append(primary)
409
+ i = j
410
+
411
+ return merged
412
+
413
+ def _generate_spacing_name(self, space: SpacingToken) -> str:
414
+ """Generate a semantic name for spacing."""
415
+ px = space.value_px
416
+
417
+ # Map to t-shirt sizes based on value
418
+ if px <= 2:
419
+ size = "px"
420
+ elif px <= 4:
421
+ size = "0.5"
422
+ elif px <= 8:
423
+ size = "1"
424
+ elif px <= 12:
425
+ size = "1.5"
426
+ elif px <= 16:
427
+ size = "2"
428
+ elif px <= 20:
429
+ size = "2.5"
430
+ elif px <= 24:
431
+ size = "3"
432
+ elif px <= 32:
433
+ size = "4"
434
+ elif px <= 40:
435
+ size = "5"
436
+ elif px <= 48:
437
+ size = "6"
438
+ elif px <= 64:
439
+ size = "8"
440
+ elif px <= 80:
441
+ size = "10"
442
+ elif px <= 96:
443
+ size = "12"
444
+ else:
445
+ size = str(int(px / 4))
446
+
447
+ return f"space.{size}"
448
+
449
+ def _calculate_confidence(self, frequency: int) -> Confidence:
450
+ """Calculate confidence based on frequency."""
451
+ if frequency >= 10:
452
+ return Confidence.HIGH
453
+ elif frequency >= 3:
454
+ return Confidence.MEDIUM
455
+ else:
456
+ return Confidence.LOW
457
+
458
+
459
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Normalize *extracted* tokens via a throwaway TokenNormalizer."""
    return TokenNormalizer().normalize(extracted)