riazmo committed on
Commit
98da421
·
verified ·
1 Parent(s): 7ca38c3

Delete agents/normalizer.py

Browse files
Files changed (1) hide show
  1. agents/normalizer.py +0 -469
agents/normalizer.py DELETED
@@ -1,469 +0,0 @@
1
- """
2
- Agent 2: Token Normalizer & Structurer
3
- Design System Extractor v2
4
-
5
- Persona: Design System Librarian
6
-
7
- Responsibilities:
8
- - Clean noisy extraction data
9
- - Deduplicate similar tokens (colors within threshold, similar spacing)
10
- - Infer naming patterns from class names and contexts
11
- - Tag tokens as: detected | inferred | low-confidence
12
- - Group colors by role (primary, secondary, neutral, etc.)
13
- """
14
-
15
- import re
16
- from typing import Optional
17
- from collections import defaultdict
18
-
19
- from core.token_schema import (
20
- ColorToken,
21
- TypographyToken,
22
- SpacingToken,
23
- ExtractedTokens,
24
- NormalizedTokens,
25
- Confidence,
26
- TokenSource,
27
- )
28
- from core.color_utils import (
29
- parse_color,
30
- normalize_hex,
31
- categorize_color,
32
- )
33
-
34
-
35
class TokenNormalizer:
    """
    Normalizes and structures extracted tokens.

    This is Agent 2's job — taking raw extraction data and
    organizing it into a clean, deduplicated structure.

    NOTE(review): normalization mutates the incoming token objects in
    place (frequencies are summed, context lists merged, color values
    rewritten to canonical hex) rather than copying them — callers
    should not reuse the raw extraction afterwards.
    """

    def __init__(self) -> None:
        # Thresholds for duplicate detection
        self.color_similarity_threshold = 10  # Euclidean delta in RGB space
        self.spacing_merge_threshold = 2  # px difference to merge

        # Naming patterns: role -> keyword substrings searched for in a
        # color's contexts/elements. Matching is plain substring search
        # (see _infer_color_role), not word-boundary matching.
        self.color_role_keywords = {
            "primary": ["primary", "brand", "main", "accent"],
            "secondary": ["secondary", "alt", "alternate"],
            "success": ["success", "green", "positive", "valid"],
            "warning": ["warning", "yellow", "caution", "alert"],
            "error": ["error", "red", "danger", "invalid", "negative"],
            "info": ["info", "blue", "notice"],
            "neutral": ["gray", "grey", "neutral", "muted", "subtle"],
            "background": ["bg", "background", "surface"],
            "text": ["text", "foreground", "content", "body"],
            "border": ["border", "divider", "separator", "line"],
        }

    def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens:
        """
        Normalize extracted tokens.

        Args:
            extracted: Raw extraction results from Agent 1

        Returns:
            NormalizedTokens with cleaned, deduplicated data
        """
        # Process each token type
        colors = self._normalize_colors(extracted.colors)
        typography = self._normalize_typography(extracted.typography)
        spacing = self._normalize_spacing(extracted.spacing)

        # Create normalized result. Radius/shadow tokens are not yet
        # normalized — they are copied through unchanged.
        normalized = NormalizedTokens(
            viewport=extracted.viewport,
            colors=colors,
            typography=typography,
            spacing=spacing,
            radius=extracted.radius,  # Pass through for now
            shadows=extracted.shadows,  # Pass through for now
            font_families=extracted.font_families,
            pages_crawled=extracted.pages_crawled,
            total_elements=extracted.total_elements,
        )

        return normalized

    def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
        """
        Normalize color tokens:
        - Deduplicate similar colors
        - Infer color roles
        - Assign suggested names
        - Calculate confidence

        Returns the surviving tokens sorted by descending frequency.
        """
        if not colors:
            return []

        # Step 1: Deduplicate by exact hex value.
        # normalize_hex presumably canonicalizes the color string (case,
        # shorthand expansion) — TODO confirm against core.color_utils.
        unique_colors = {}
        for color in colors:
            hex_val = normalize_hex(color.value)
            if hex_val in unique_colors:
                # Merge frequency and contexts into the first-seen token.
                # set() dedup means merged list order is arbitrary.
                existing = unique_colors[hex_val]
                existing.frequency += color.frequency
                existing.contexts = list(set(existing.contexts + color.contexts))
                existing.elements = list(set(existing.elements + color.elements))
                existing.css_properties = list(set(existing.css_properties + color.css_properties))
            else:
                # Rewrite the token's value to its canonical form in place.
                color.value = hex_val
                unique_colors[hex_val] = color

        # Step 2: Merge visually similar colors
        merged_colors = self._merge_similar_colors(list(unique_colors.values()))

        # Step 3: Infer roles and names
        for color in merged_colors:
            role = self._infer_color_role(color)
            if role:
                color.suggested_name = self._generate_color_name(color, role)
            else:
                color.suggested_name = self._generate_color_name_from_value(color)

            # Update confidence based on frequency
            color.confidence = self._calculate_confidence(color.frequency)

        # Sort by frequency (most used first)
        merged_colors.sort(key=lambda c: -c.frequency)

        return merged_colors

    def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]:
        """Merge colors that are visually very similar.

        O(n^2) pairwise pass: each unmerged color anchors a group of
        colors within the similarity threshold of *it* (similarity is
        tested against the anchor only, not transitively). The most
        frequent member of each group survives and absorbs the rest.
        """
        if len(colors) <= 1:
            return colors

        merged = []
        used = set()  # indices already absorbed into an earlier group

        for i, color1 in enumerate(colors):
            if i in used:
                continue

            # Find similar colors (compared against the anchor color1)
            similar_group = [color1]
            for j, color2 in enumerate(colors[i+1:], i+1):
                if j in used:
                    continue
                if self._colors_are_similar(color1.value, color2.value):
                    similar_group.append(color2)
                    used.add(j)

            # Merge the group - keep the most frequent
            similar_group.sort(key=lambda c: -c.frequency)
            primary = similar_group[0]

            # Aggregate data from similar colors into the survivor
            for other in similar_group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))
                primary.elements = list(set(primary.elements + other.elements))

            merged.append(primary)
            used.add(i)

        return merged

    def _colors_are_similar(self, hex1: str, hex2: str) -> bool:
        """Check if two colors are visually similar.

        Returns False (rather than raising) for any unparseable input.
        NOTE(review): the broad `except Exception` also swallows
        unexpected errors — consider narrowing.
        """
        try:
            parsed1 = parse_color(hex1)
            parsed2 = parse_color(hex2)
            if parsed1 is None or parsed2 is None:
                return False
            if parsed1.rgb is None or parsed2.rgb is None:
                return False

            rgb1 = parsed1.rgb
            rgb2 = parsed2.rgb

            # Calculate Euclidean distance in RGB space
            distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5
            return distance < self.color_similarity_threshold
        except Exception:
            return False

    def _infer_color_role(self, color: ColorToken) -> Optional[str]:
        """Infer the semantic role of a color from its contexts.

        Keyword matching is substring-based, so e.g. "alt" also matches
        inside longer words; first matching role (dict order) wins.
        Falls back to the color's own category for neutrals; returns
        None if no role can be inferred.
        """
        all_context = " ".join(color.contexts + color.elements).lower()

        for role, keywords in self.color_role_keywords.items():
            for keyword in keywords:
                if keyword in all_context:
                    return role

        # Try to infer from color category
        category = categorize_color(color.value)
        if category in ["gray", "white", "black"]:
            return "neutral"

        return None

    def _generate_color_name(self, color: ColorToken, role: str) -> str:
        """Generate a semantic name for a color, e.g. "color.primary.500"."""
        # Determine shade level based on luminance (Rec. 601 luma weights);
        # higher luminance maps to lighter (lower-numbered) shades.
        parsed = parse_color(color.value)
        if parsed and parsed.rgb:
            rgb = parsed.rgb
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.8:
                shade = "50"
            elif luminance > 0.6:
                shade = "200"
            elif luminance > 0.4:
                shade = "500"
            elif luminance > 0.2:
                shade = "700"
            else:
                shade = "900"
        else:
            # Unparseable color: default to the mid shade.
            shade = "500"

        return f"color.{role}.{shade}"

    def _generate_color_name_from_value(self, color: ColorToken) -> str:
        """Generate a name based on the color value itself,
        e.g. "color.blue.light", when no semantic role was inferred."""
        category = categorize_color(color.value)
        parsed = parse_color(color.value)

        if parsed and parsed.rgb:
            rgb = parsed.rgb
            # Same Rec. 601 luma formula as _generate_color_name,
            # but mapped to coarse light/base/dark tiers.
            luminance = (0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]) / 255
            if luminance > 0.6:
                shade = "light"
            elif luminance > 0.3:
                shade = "base"
            else:
                shade = "dark"
        else:
            shade = "base"

        return f"color.{category}.{shade}"

    def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]:
        """
        Normalize typography tokens:
        - Deduplicate identical styles
        - Infer type scale categories
        - Assign suggested names

        Returns tokens sorted by descending font size.
        """
        if not typography:
            return []

        # Deduplicate by unique style combination
        # (family | size | weight | line-height).
        unique_typo = {}
        for typo in typography:
            key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}"
            if key in unique_typo:
                existing = unique_typo[key]
                existing.frequency += typo.frequency
                existing.elements = list(set(existing.elements + typo.elements))
            else:
                unique_typo[key] = typo

        result = list(unique_typo.values())

        # Infer names based on size and elements
        for typo in result:
            typo.suggested_name = self._generate_typography_name(typo)
            typo.confidence = self._calculate_confidence(typo.frequency)

        # Sort by font size (largest first)
        result.sort(key=lambda t: -self._parse_font_size(t.font_size))

        return result

    def _generate_typography_name(self, typo: TypographyToken) -> str:
        """Generate a semantic name for typography, e.g. "font.heading.lg".

        Category comes from the token's source elements (substring
        match), size tier from the parsed pixel size.
        """
        size_px = self._parse_font_size(typo.font_size)
        elements = " ".join(typo.elements).lower()

        # Determine category from elements; first matching bucket wins.
        if any(h in elements for h in ["h1", "hero", "display"]):
            category = "display"
        elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]):
            category = "heading"
        elif any(h in elements for h in ["label", "caption", "small", "meta"]):
            category = "label"
        elif any(h in elements for h in ["body", "p", "paragraph", "text"]):
            category = "body"
        else:
            category = "text"

        # Determine size tier
        if size_px >= 32:
            size_tier = "xl"
        elif size_px >= 24:
            size_tier = "lg"
        elif size_px >= 18:
            size_tier = "md"
        elif size_px >= 14:
            size_tier = "sm"
        else:
            size_tier = "xs"

        return f"font.{category}.{size_tier}"

    def _parse_font_size(self, size: str) -> float:
        """Parse font size string to pixels.

        Supports px, rem, em (both assuming a 16px base) and bare
        numbers; any unparseable or empty input falls back to 16.
        """
        if not size:
            return 16

        size = size.lower().strip()

        # Handle px
        if "px" in size:
            try:
                return float(size.replace("px", ""))
            except ValueError:
                return 16

        # Handle rem (assume 16px base).
        # Must be checked before "em" since "rem" contains "em".
        if "rem" in size:
            try:
                return float(size.replace("rem", "")) * 16
            except ValueError:
                return 16

        # Handle em (assume 16px base)
        if "em" in size:
            try:
                return float(size.replace("em", "")) * 16
            except ValueError:
                return 16

        # Try plain number
        try:
            return float(size)
        except ValueError:
            return 16

    def _normalize_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
        """
        Normalize spacing tokens:
        - Merge similar values
        - Align to base-8 grid if close
        - Assign suggested names

        Returns tokens sorted by ascending pixel value.
        """
        if not spacing:
            return []

        # Deduplicate by value
        unique_spacing = {}
        for space in spacing:
            key = space.value
            if key in unique_spacing:
                existing = unique_spacing[key]
                existing.frequency += space.frequency
                existing.contexts = list(set(existing.contexts + space.contexts))
            else:
                unique_spacing[key] = space

        result = list(unique_spacing.values())

        # Merge very similar values
        result = self._merge_similar_spacing(result)

        # Assign names
        for space in result:
            space.suggested_name = self._generate_spacing_name(space)
            space.confidence = self._calculate_confidence(space.frequency)

        # Sort by value
        result.sort(key=lambda s: s.value_px)

        return result

    def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]:
        """Merge spacing values that are very close.

        Sorts by pixel value, then groups consecutive values within the
        merge threshold of the group's *first* member (the anchor), so a
        long run of closely spaced values does not chain indefinitely.
        """
        if len(spacing) <= 1:
            return spacing

        # Sort by pixel value (in-place on the caller-local list)
        spacing.sort(key=lambda s: s.value_px)

        merged = []
        i = 0

        while i < len(spacing):
            current = spacing[i]
            group = [current]

            # Find adjacent similar values (compared to the anchor only)
            j = i + 1
            while j < len(spacing):
                if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold:
                    group.append(spacing[j])
                    j += 1
                else:
                    break

            # Merge group - prefer base-8 aligned value or most frequent.
            # fits_base_8 looks like a bool flag (True sorts first via
            # negation) — TODO confirm against SpacingToken's schema.
            group.sort(key=lambda s: (-s.fits_base_8, -s.frequency))
            primary = group[0]

            for other in group[1:]:
                primary.frequency += other.frequency
                primary.contexts = list(set(primary.contexts + other.contexts))

            merged.append(primary)
            i = j

        return merged

    def _generate_spacing_name(self, space: SpacingToken) -> str:
        """Generate a semantic name for spacing, e.g. "space.4".

        Buckets follow a Tailwind-style scale (unit ≈ 4px); values
        above 96px fall through to px/4 rounded down.
        """
        px = space.value_px

        # Map to t-shirt sizes based on value
        if px <= 2:
            size = "px"
        elif px <= 4:
            size = "0.5"
        elif px <= 8:
            size = "1"
        elif px <= 12:
            size = "1.5"
        elif px <= 16:
            size = "2"
        elif px <= 20:
            size = "2.5"
        elif px <= 24:
            size = "3"
        elif px <= 32:
            size = "4"
        elif px <= 40:
            size = "5"
        elif px <= 48:
            size = "6"
        elif px <= 64:
            size = "8"
        elif px <= 80:
            size = "10"
        elif px <= 96:
            size = "12"
        else:
            size = str(int(px / 4))

        return f"space.{size}"

    def _calculate_confidence(self, frequency: int) -> Confidence:
        """Calculate confidence based on frequency:
        >= 10 occurrences -> HIGH, >= 3 -> MEDIUM, otherwise LOW."""
        if frequency >= 10:
            return Confidence.HIGH
        elif frequency >= 3:
            return Confidence.MEDIUM
        else:
            return Confidence.LOW
464
-
465
-
466
def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens:
    """Normalize *extracted* tokens with a throwaway TokenNormalizer.

    Convenience wrapper for callers that don't need to configure or
    reuse the normalizer instance.
    """
    return TokenNormalizer().normalize(extracted)