riazmo committed on
Commit
90cf820
·
verified ·
1 Parent(s): a74eef5

Upload dom_element_extractor.py

Browse files
Files changed (1) hide show
  1. utils/dom_element_extractor.py +513 -0
utils/dom_element_extractor.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DOM Element Extractor
3
+ Extracts UI elements from websites using Playwright.
4
+ Gets computed styles and element properties for comparison with Figma.
5
+ """
6
+
7
+ import asyncio
8
+ from typing import Dict, List, Optional, Any, Tuple
9
+ from .element_schema import (
10
+ UIElement, ElementType, ElementBounds, ElementStyles,
11
+ DOM_TYPE_MAP
12
+ )
13
+
14
+
15
# JavaScript arrow function, passed to ``page.evaluate``, that collects
# candidate UI elements and their computed styles. It returns an array of plain
# objects with the keys: index, tagName, id, className, name, textContent,
# placeholder, inputType, bounds, styles and isInteractive.
EXTRACTION_SCRIPT = """
() => {
    const results = [];

    // Selectors for elements we care about (checkout-focused)
    const selectors = [
        'button',
        'input',
        'textarea',
        'select',
        'a',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'p',
        'label',
        'span',
        'img',
        'svg',
        '[role="button"]',
        '[role="link"]',
        '[role="textbox"]',
        '[role="checkbox"]',
        '[role="radio"]',
        // Common checkout classes
        '.btn', '.button',
        '.input', '.field',
        '.price', '.total', '.amount',
        '.card', '.form-group',
        // Text classes (Tailwind)
        '.text-muted-foreground',
        '.text-gray-500', '.text-gray-600', '.text-gray-700',
        // Data attributes
        '[data-testid]',
        '[data-cy]'
    ];

    // Tags whose text is captured, and tags that are interactive by nature
    const textTags = ['BUTTON', 'A', 'LABEL', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'P', 'SPAN'];
    const interactiveTags = ['BUTTON', 'A', 'INPUT', 'TEXTAREA', 'SELECT'];

    // Collect matching elements; the Set de-duplicates elements that match
    // more than one selector
    const elements = new Set();
    selectors.forEach(selector => {
        try {
            document.querySelectorAll(selector).forEach(el => elements.add(el));
        } catch (e) {
            // Ignore a selector the page rejects and keep going with the rest
        }
    });

    // Also get important containers (forms, main sections)
    document.querySelectorAll('form, main, [role="main"], .checkout, .cart, .payment').forEach(el => elements.add(el));

    // Process each element
    let index = 0;
    elements.forEach(el => {
        // Skip hidden elements
        const style = window.getComputedStyle(el);
        if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') {
            return;
        }

        // Get bounding rect
        const rect = el.getBoundingClientRect();

        // Skip elements that are too small to matter
        if (rect.width < 5 || rect.height < 5) {
            return;
        }

        // Skip elements way off screen
        if (rect.top > 10000 || rect.left > 5000) {
            return;
        }

        // Get element info
        const tagName = el.tagName.toLowerCase();
        const id = el.id || '';
        // Read the class attribute instead of el.className: on SVG elements
        // className is an SVGAnimatedString, not a plain string
        const classStr = el.getAttribute('class') || '';
        // classList skips leading whitespace that would make split(' ')[0] empty
        const firstClass = (el.classList && el.classList[0]) || '';

        // Determine name (prefer aria-label, then data-testid, placeholder,
        // id, first class, and finally the tag name)
        let name = el.getAttribute('aria-label') ||
            el.getAttribute('data-testid') ||
            el.getAttribute('placeholder') ||
            id ||
            firstClass ||
            tagName;

        // Get text content (direct text only, not nested)
        let textContent = '';
        if (textTags.includes(el.tagName)) {
            // Get direct text content
            textContent = Array.from(el.childNodes)
                .filter(node => node.nodeType === Node.TEXT_NODE)
                .map(node => node.textContent.trim())
                .join(' ')
                .trim();

            // If no direct text, get innerText but truncate
            if (!textContent && el.innerText) {
                textContent = el.innerText.substring(0, 100).trim();
            }
        }

        // Use text content as name if element has meaningful text
        if (textContent && textContent.length > 2 && textContent.length < 80) {
            name = textContent;
        }

        // Get placeholder for inputs
        const placeholder = el.getAttribute('placeholder') || '';

        // Get input type
        const inputType = el.getAttribute('type') || '';

        // Extract computed styles. The declaration returned by
        // getComputedStyle is live, so the object fetched above is reused.
        const computedStyles = {
            backgroundColor: style.backgroundColor,
            color: style.color,
            borderColor: style.borderColor,
            borderWidth: style.borderWidth,
            borderRadius: style.borderRadius,
            borderStyle: style.borderStyle,
            fontFamily: style.fontFamily,
            fontSize: style.fontSize,
            fontWeight: style.fontWeight,
            lineHeight: style.lineHeight,
            textAlign: style.textAlign,
            letterSpacing: style.letterSpacing,
            paddingTop: style.paddingTop,
            paddingRight: style.paddingRight,
            paddingBottom: style.paddingBottom,
            paddingLeft: style.paddingLeft,
            opacity: style.opacity,
            boxShadow: style.boxShadow
        };

        results.push({
            index: index++,
            tagName: tagName,
            id: id,
            className: classStr,
            name: name,
            textContent: textContent,
            placeholder: placeholder,
            inputType: inputType,
            // Page coordinates: viewport rect plus current scroll offset
            bounds: {
                x: rect.left + window.scrollX,
                y: rect.top + window.scrollY,
                width: rect.width,
                height: rect.height
            },
            styles: computedStyles,
            isInteractive: interactiveTags.includes(el.tagName) ||
                el.getAttribute('role') === 'button' ||
                el.onclick !== null
        });
    });

    return results;
}
"""
184
+
185
+
186
class DOMElementExtractor:
    """
    Extracts UI elements from a webpage using Playwright.
    Gets computed styles that match what's actually rendered.

    The extractor evaluates ``EXTRACTION_SCRIPT`` in the page, converts each
    raw record into a ``UIElement`` and keeps the result of the most recent
    extraction in ``self.elements``.

    Project types in method signatures are written as string forward
    references, so they are not evaluated when the class is defined.
    """

    def __init__(self):
        # Elements produced by the most recent extract_from_page_async() call
        self.elements: List[UIElement] = []

    async def extract_from_page_async(
        self,
        page,  # Playwright page object
        viewport: str = "desktop"
    ) -> "List[UIElement]":
        """
        Extract elements from a Playwright page.

        Args:
            page: Playwright page object (already navigated to URL)
            viewport: "desktop" or "mobile"

        Returns:
            List of UIElement objects (the same list stored in self.elements)
        """
        self.elements = []

        # Execute extraction script
        raw_elements = await page.evaluate(EXTRACTION_SCRIPT)

        # Convert to UIElement objects; tolerate a null result from the page
        for raw in raw_elements or []:
            element = self._create_element(raw, viewport)
            if element:
                self.elements.append(element)

        print(f" 📊 Extracted {len(self.elements)} elements from {viewport} page")
        return self.elements

    def _create_element(self, raw: Dict, viewport: str) -> "Optional[UIElement]":
        """
        Create a UIElement from raw extracted data.

        Args:
            raw: One record produced by EXTRACTION_SCRIPT
            viewport: Viewport label; accepted for call-site symmetry and not
                used when building the element

        Returns:
            The constructed UIElement
        """
        # Create bounds ("or {}" also covers an explicit null from the page)
        bounds_data = raw.get("bounds") or {}
        bounds = ElementBounds(
            x=bounds_data.get("x", 0),
            y=bounds_data.get("y", 0),
            width=bounds_data.get("width", 0),
            height=bounds_data.get("height", 0)
        )

        # Determine element type
        element_type = self._determine_element_type(raw)

        # Extract styles
        styles = self._extract_styles(raw.get("styles") or {})

        # Create element; empty strings are normalised to None
        return UIElement(
            id=f"dom_{raw.get('index', 0)}_{raw.get('tagName', 'unknown')}",
            element_type=element_type,
            name=raw.get("name", ""),
            bounds=bounds,
            styles=styles,
            text_content=raw.get("textContent", "") or None,
            placeholder=raw.get("placeholder", "") or None,
            source="website",
            original_type=raw.get("tagName", ""),
            is_interactive=raw.get("isInteractive", False),
            input_type=raw.get("inputType", "") or None
        )

    def _determine_element_type(self, raw: Dict) -> "ElementType":
        """
        Determine semantic element type from DOM element.

        The tag mapping (DOM_TYPE_MAP) wins; otherwise class-name keywords are
        checked in a fixed order, then currency symbols in the text, and
        finally ElementType.UNKNOWN is returned.
        """
        # "or ''" keeps .lower() safe when a field is missing or null
        tag = (raw.get("tagName") or "").lower()
        class_name = (raw.get("className") or "").lower()
        text = (raw.get("textContent") or "").lower()
        input_type = (raw.get("inputType") or "").lower()

        # Check tag mapping first
        if tag in DOM_TYPE_MAP:
            base_type = DOM_TYPE_MAP[tag]

            # Refine input types
            if tag == "input":
                if input_type == "checkbox":
                    return ElementType.CHECKBOX
                elif input_type == "radio":
                    return ElementType.RADIO
                elif input_type == "submit" or input_type == "button":
                    return ElementType.BUTTON
                return ElementType.INPUT

            return base_type

        # Check class names for hints (substring match, first hit wins)
        if any(kw in class_name for kw in ["btn", "button"]):
            return ElementType.BUTTON
        if any(kw in class_name for kw in ["input", "field", "textbox"]):
            return ElementType.INPUT
        if any(kw in class_name for kw in ["price", "amount", "total", "cost"]):
            return ElementType.PRICE
        if any(kw in class_name for kw in ["card"]):
            return ElementType.CARD
        if any(kw in class_name for kw in ["icon", "ico"]):
            return ElementType.ICON
        if any(kw in class_name for kw in ["badge", "tag", "chip"]):
            return ElementType.BADGE

        # Check text content for price detection
        if "$" in text or "€" in text or "£" in text:
            return ElementType.PRICE

        return ElementType.UNKNOWN

    def _extract_styles(self, raw_styles: Dict) -> "ElementStyles":
        """
        Convert computed CSS styles to ElementStyles.

        Args:
            raw_styles: The ``styles`` mapping of one extracted record
                (camelCase CSS property names to computed value strings)

        Returns:
            ElementStyles with the fields that could be parsed filled in
        """
        styles = ElementStyles()

        # Background color (fully transparent backgrounds are left unset)
        bg = raw_styles.get("backgroundColor", "")
        if bg and bg != "rgba(0, 0, 0, 0)" and bg != "transparent":
            styles.background_color = self._css_color_to_hex(bg)

        # Text color
        color = raw_styles.get("color", "")
        if color:
            styles.text_color = self._css_color_to_hex(color)

        # Border color
        border_color = raw_styles.get("borderColor", "")
        if border_color:
            styles.border_color = self._css_color_to_hex(border_color)

        # Border width (parse "1px" -> 1.0)
        border_width = raw_styles.get("borderWidth", "")
        if border_width:
            styles.border_width = self._parse_px_value(border_width)

        # Border radius
        border_radius = raw_styles.get("borderRadius", "")
        if border_radius:
            styles.border_radius = self._parse_px_value(border_radius)

        # Border style
        border_style = raw_styles.get("borderStyle", "")
        if border_style and border_style != "none":
            styles.border_style = border_style

        # Font family (clean up quotes)
        font_family = raw_styles.get("fontFamily", "")
        if font_family:
            # Take first font in the stack
            styles.font_family = font_family.split(",")[0].strip().strip('"\'')

        # Font size
        font_size = raw_styles.get("fontSize", "")
        if font_size:
            styles.font_size = self._parse_px_value(font_size)

        # Font weight
        font_weight = raw_styles.get("fontWeight", "")
        if font_weight:
            try:
                # Go through float() so a fractional weight such as "450.5"
                # is truncated instead of falling back to 400
                styles.font_weight = int(float(font_weight))
            except (TypeError, ValueError, OverflowError):
                # Handle named weights
                weight_map = {"normal": 400, "bold": 700, "lighter": 300, "bolder": 700}
                styles.font_weight = weight_map.get(font_weight, 400)

        # Line height
        line_height = raw_styles.get("lineHeight", "")
        if line_height and line_height != "normal":
            styles.line_height = self._parse_px_value(line_height)

        # Text align
        text_align = raw_styles.get("textAlign", "")
        if text_align:
            styles.text_align = text_align

        # Letter spacing
        letter_spacing = raw_styles.get("letterSpacing", "")
        if letter_spacing and letter_spacing != "normal":
            styles.letter_spacing = self._parse_px_value(letter_spacing)

        # Padding
        styles.padding_top = self._parse_px_value(raw_styles.get("paddingTop", ""))
        styles.padding_right = self._parse_px_value(raw_styles.get("paddingRight", ""))
        styles.padding_bottom = self._parse_px_value(raw_styles.get("paddingBottom", ""))
        styles.padding_left = self._parse_px_value(raw_styles.get("paddingLeft", ""))

        # Opacity
        opacity = raw_styles.get("opacity", "")
        if opacity:
            try:
                styles.opacity = float(opacity)
            except (TypeError, ValueError):
                # Leave opacity unset when the value is not numeric
                pass

        # Box shadow
        box_shadow = raw_styles.get("boxShadow", "")
        if box_shadow and box_shadow != "none":
            styles.box_shadow = box_shadow

        return styles

    def _css_color_to_hex(self, css_color: str) -> Optional[str]:
        """
        Convert CSS color (rgb, rgba, hex) to hex string.

        Args:
            css_color: A CSS color such as "rgb(255, 0, 128)",
                "rgba(0, 0, 0, 0.5)" or "#abc"

        Returns:
            Upper-case hex string ("#RRGGBB" for rgb()/rgba() and 3-digit hex
            input; other hex input is upper-cased as given), or None when the
            value is empty, not a string, or in an unsupported notation. Any
            alpha component of rgba() is ignored.
        """
        import re

        if not css_color or not isinstance(css_color, str):
            return None

        css_color = css_color.strip()

        # Already hex
        if css_color.startswith("#"):
            digits = css_color[1:]
            if len(digits) == 3:
                # Expand shorthand (#abc -> #AABBCC) so results are comparable
                digits = "".join(ch * 2 for ch in digits)
            return "#" + digits.upper()

        # rgb() or rgba()
        if css_color.startswith("rgb"):
            try:
                # Extract numbers; only the first three (r, g, b) are used
                numbers = re.findall(r'[\d.]+', css_color)
                if len(numbers) >= 3:
                    # Clamp so an out-of-range channel cannot produce a
                    # three-digit hex component
                    r, g, b = (min(255, int(float(n))) for n in numbers[:3])
                    return f"#{r:02X}{g:02X}{b:02X}"
            except ValueError as e:
                print(f" ⚠️ Color parse error: {css_color} - {e}")

        return None

    def _parse_px_value(self, value: str) -> Optional[float]:
        """
        Parse CSS pixel value (e.g., "16px") to float.

        For a multi-value shorthand such as "1px 2px 1px 2px" the first
        component is returned.

        Returns:
            The numeric value, or None when the input is empty, not a string,
            or its first component is not a plain/px number (e.g. "normal").
        """
        if not value or not isinstance(value, str):
            return None

        components = value.split()
        if not components:
            return None

        try:
            # Remove 'px' and convert
            return float(components[0].replace("px", "").strip())
        except ValueError:
            return None

    def get_interactive_elements(self) -> "List[UIElement]":
        """Get only interactive elements."""
        return [e for e in self.elements if e.is_interactive]

    def get_elements_by_type(self, element_type: "ElementType") -> "List[UIElement]":
        """Get elements of a specific type."""
        return [e for e in self.elements if e.element_type == element_type]

    def summarize(self) -> Dict[str, Any]:
        """
        Get a summary of extracted elements.

        Returns:
            Dict with "total_elements", "interactive_elements" and "by_type"
            (element type value -> count).
        """
        type_counts: Dict[str, int] = {}
        for element in self.elements:
            type_name = element.element_type.value
            type_counts[type_name] = type_counts.get(type_name, 0) + 1

        return {
            "total_elements": len(self.elements),
            "interactive_elements": len(self.get_interactive_elements()),
            "by_type": type_counts
        }
457
+
458
+
459
async def extract_dom_elements_async(
    url: str,
    viewport_width: int = 1440,
    viewport_height: int = 900,
    viewport_name: str = "desktop"
) -> Tuple[List[UIElement], Dict]:
    """
    Extract elements from a URL.

    Launches headless Chromium, opens the URL, runs DOMElementExtractor on the
    loaded page and closes the browser again, even when extraction fails.

    Args:
        url: Website URL to extract from
        viewport_width: Viewport width in pixels
        viewport_height: Viewport height in pixels
        viewport_name: "desktop" or "mobile"; "mobile" additionally applies
            an iPhone user agent to the page

    Returns:
        Tuple of (list of UIElements, summary dict)
    """
    from playwright.async_api import async_playwright

    page_options: Dict[str, Any] = {
        "viewport": {"width": viewport_width, "height": viewport_height},
    }

    # Set mobile user agent if needed. It is passed as a page option rather
    # than as an extra HTTP header so that navigator.userAgent seen by page
    # scripts changes too, not only the request header.
    if viewport_name == "mobile":
        page_options["user_agent"] = (
            "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15"
        )

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        try:
            page = await browser.new_page(**page_options)

            await page.goto(url, wait_until="networkidle", timeout=60000)
            await page.wait_for_timeout(2000)  # Wait for any animations

            extractor = DOMElementExtractor()
            elements = await extractor.extract_from_page_async(page, viewport_name)
            summary = extractor.summarize()

            return elements, summary

        finally:
            # Always release the browser process
            await browser.close()
502
+
503
+
504
def extract_dom_elements(
    url: str,
    viewport_width: int = 1440,
    viewport_height: int = 900,
    viewport_name: str = "desktop"
) -> Tuple[List[UIElement], Dict]:
    """
    Blocking entry point for DOM element extraction.

    Builds the ``extract_dom_elements_async`` coroutine with the given
    arguments and drives it to completion with ``asyncio.run``.

    Returns:
        Whatever ``extract_dom_elements_async`` returns: a tuple of
        (list of UIElements, summary dict).
    """
    extraction = extract_dom_elements_async(
        url,
        viewport_width,
        viewport_height,
        viewport_name,
    )
    return asyncio.run(extraction)