riazmo committed on
Commit
d13fef9
·
verified ·
1 Parent(s): b40e625

Upload firecrawl_extractor.py

Browse files
Files changed (1) hide show
  1. agents/firecrawl_extractor.py +446 -0
agents/firecrawl_extractor.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent 1B: Firecrawl CSS Extractor
3
+ Design System Extractor v2
4
+
5
+ Persona: CSS Deep Diver
6
+
7
+ Responsibilities:
8
+ - Fetch and parse all CSS files from a website
9
+ - Extract colors from CSS rules, variables, and values
10
+ - Bypass CORS restrictions by fetching CSS directly
11
+ - Complement Playwright extraction with deeper CSS analysis
12
+ """
13
+
14
+ import re
15
+ import asyncio
16
+ from typing import Optional, Callable
17
+ from datetime import datetime
18
+
19
+ # Firecrawl for web scraping
20
+ try:
21
+ from firecrawl import FirecrawlApp
22
+ FIRECRAWL_AVAILABLE = True
23
+ except ImportError:
24
+ FIRECRAWL_AVAILABLE = False
25
+
26
+ from core.color_utils import (
27
+ parse_color,
28
+ get_contrast_with_white,
29
+ get_contrast_with_black,
30
+ )
31
+
32
+
33
class FirecrawlExtractor:
    """
    Extracts colors from CSS files using Firecrawl.

    This complements the Playwright extraction by:
    1. Fetching all linked CSS files
    2. Parsing inline <style> blocks
    3. Extracting CSS variables
    4. Finding all color values in CSS rules

    Results accumulate on the instance:
        colors: hex value -> aggregated record (frequency, contexts, sources,
            contrast ratios).
        css_variables: custom property name -> raw declared value.
        errors / warnings: human-readable messages for fatal / per-file failures.
        stats: counters for parsed files, style blocks, variables and colors.
    """

    # Upper bound on linked stylesheets fetched for a single page.
    MAX_CSS_FILES = 15

    # Inline <style> blocks in an HTML document.
    _STYLE_BLOCK_RE = re.compile(
        r'<style[^>]*>(.*?)</style>', re.DOTALL | re.IGNORECASE
    )

    # Any href attribute whose value mentions ".css".
    _CSS_HREF_RE = re.compile(
        r'href=["\']([^"\']*\.css[^"\']*)["\']', re.IGNORECASE
    )

    # A custom property declaration. The lookbehind rejects BEM-style selectors
    # such as ".btn--primary:hover"; the value stops at ';', '{' or '}' so the
    # last declaration of a rule is captured even without a trailing semicolon
    # and never bleeds into the following rule.
    _VAR_DECL_RE = re.compile(r'(?<![\w-])(--[\w-]+)\s*:\s*([^;{}]+)')

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize Firecrawl extractor.

        Args:
            api_key: Firecrawl API key. When it is missing, extraction goes
                straight to the httpx-based fallback.
        """
        self.api_key = api_key
        self.colors: dict[str, dict] = {}
        self.css_variables: dict[str, str] = {}
        self.errors: list[str] = []
        self.warnings: list[str] = []
        self.stats = {
            "css_files_parsed": 0,
            "style_blocks_parsed": 0,
            "colors_found": 0,
            "css_variables_found": 0,
        }

        # Color regex pattern. Hex colors are restricted to the valid lengths
        # (3, 4, 6 or 8 digits) and must not be followed by an identifier
        # character, so "#add-item" or "#feedback" selectors are not mistaken
        # for "#add" / "#feed".
        self.color_regex = re.compile(
            r'#(?:[0-9a-fA-F]{8}|[0-9a-fA-F]{6}|[0-9a-fA-F]{3,4})(?![\w-])|'
            r'rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|'
            r'rgba\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*[\d.]+\s*\)|'
            r'hsl\(\s*\d+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*\)|'
            r'hsla\(\s*\d+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*,\s*[\d.]+\s*\)',
            re.IGNORECASE
        )

        # CSS variable pattern (kept unchanged as a public attribute; the
        # extractor itself uses the stricter _VAR_DECL_RE).
        self.css_var_regex = re.compile(
            r'--[\w-]+\s*:\s*([^;]+)',
            re.IGNORECASE
        )

    # ------------------------------------------------------------------
    # Pure parsing helpers
    # ------------------------------------------------------------------

    def _extract_colors_from_css(self, css_text: str, source: str = "css") -> list[dict]:
        """
        Extract all color values from CSS text.

        Args:
            css_text: Raw CSS to scan.
            source: Label recorded on every hit (file name or style block id).

        Returns:
            One dict per occurrence with "value", "source" and "context" keys.
        """
        return [
            {
                "value": match.strip(),
                "source": source,
                "context": "firecrawl-css",
            }
            for match in self.color_regex.findall(css_text)
        ]

    def _extract_css_variables(self, css_text: str) -> dict[str, str]:
        """
        Extract CSS custom property declarations from CSS text.

        Args:
            css_text: Raw CSS to scan.

        Returns:
            Mapping of variable name (including the leading "--") to its
            stripped value. A later declaration of the same name wins.
        """
        # Single linear pass; each match carries both name and value.
        return {
            found.group(1): found.group(2).strip()
            for found in self._VAR_DECL_RE.finditer(css_text)
        }

    def _find_css_urls(self, html_content: str, base_url: str) -> list[str]:
        """
        Return absolute, de-duplicated stylesheet URLs referenced by the page.

        Args:
            html_content: HTML of the page.
            base_url: URL the page was loaded from, used to resolve links.

        Returns:
            Absolute URLs in first-seen order.
        """
        from html import unescape
        from urllib.parse import urljoin

        # urljoin covers absolute, protocol-relative, root-relative and
        # relative hrefs; unescape turns "&amp;" in query strings back to "&".
        resolved = (
            urljoin(base_url, unescape(href.strip()))
            for href in self._CSS_HREF_RE.findall(html_content)
        )
        # dict.fromkeys drops repeats (e.g. preload + stylesheet links to the
        # same file) while preserving order.
        return list(dict.fromkeys(resolved))

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------

    def _process_color(self, color_value: str) -> Optional[str]:
        """Process and normalize a color value to hex, or None if unparseable."""
        parsed = parse_color(color_value)
        if parsed:
            return parsed.hex
        return None

    def _aggregate_color(self, color_data: dict):
        """
        Aggregate a color into the collection.

        Args:
            color_data: Dict with "value" and optional "context" / "source".
                Values that cannot be normalized to hex are ignored.
        """
        hex_value = self._process_color(color_data.get("value", ""))
        if not hex_value:
            return

        if hex_value not in self.colors:
            contrast_white = get_contrast_with_white(hex_value)
            contrast_black = get_contrast_with_black(hex_value)

            self.colors[hex_value] = {
                "value": hex_value,
                "frequency": 0,
                "contexts": [],
                "sources": [],
                "contrast_white": round(contrast_white, 2),
                "contrast_black": round(contrast_black, 2),
            }

        entry = self.colors[hex_value]
        entry["frequency"] += 1

        # Record each distinct context / source once.
        context = color_data.get("context", "")
        if context and context not in entry["contexts"]:
            entry["contexts"].append(context)

        source = color_data.get("source", "")
        if source and source not in entry["sources"]:
            entry["sources"].append(source)

    def _ingest_css(self, css_text: str, source: str) -> int:
        """Aggregate colors and variables from one CSS text; return the color hit count."""
        colors = self._extract_colors_from_css(css_text, source)
        for color in colors:
            self._aggregate_color(color)
        self.css_variables.update(self._extract_css_variables(css_text))
        return len(colors)

    def _reset_results(self):
        """Clear aggregated colors, variables and counters in place (errors/warnings are kept)."""
        self.colors.clear()
        self.css_variables.clear()
        for key in self.stats:
            self.stats[key] = 0

    def _empty_result(self) -> dict:
        """Result returned when nothing could be extracted."""
        return {"colors": {}, "css_variables": {}, "stats": self.stats}

    def _result(self) -> dict:
        """Result dict exposing the aggregated state."""
        return {
            "colors": self.colors,
            "css_variables": self.css_variables,
            "stats": self.stats,
        }

    # ------------------------------------------------------------------
    # Shared pipeline used by both the Firecrawl and the httpx path
    # ------------------------------------------------------------------

    @staticmethod
    def _make_logger(log_callback: Optional[Callable[[str], None]]) -> Callable[[str], None]:
        """Return a callable that forwards to log_callback, or does nothing without one."""
        if log_callback:
            return log_callback
        return lambda _msg: None

    async def _process_page(
        self,
        page_url: str,
        html_content: str,
        fetch_css: Callable,
        log: Callable[[str], None],
    ):
        """
        Parse inline styles and linked stylesheets of a page, then its variables.

        Args:
            page_url: URL used to resolve stylesheet links.
            html_content: HTML of the page.
            fetch_css: Coroutine function taking a stylesheet URL and returning
                its text.
            log: Progress logger.
        """
        # Extract <style> blocks
        log(" πŸ“ Parsing <style> blocks...")
        style_blocks = self._STYLE_BLOCK_RE.findall(html_content)
        for index, block in enumerate(style_blocks):
            self._ingest_css(block, f"style-block-{index}")
            self.stats["style_blocks_parsed"] += 1
        log(f" Found {len(style_blocks)} style blocks")

        # Extract CSS file URLs
        log(" πŸ”— Finding linked CSS files...")
        css_urls = self._find_css_urls(html_content, page_url)
        log(f" Found {len(css_urls)} CSS files")

        # Fetch and parse each CSS file; one bad file must not stop the rest.
        for css_url in css_urls[:self.MAX_CSS_FILES]:
            try:
                log(f" πŸ“„ Fetching: {css_url[:60]}...")
                css_content = await fetch_css(css_url)
                if css_content:
                    hits = self._ingest_css(css_content, css_url.split('/')[-1])
                    self.stats["css_files_parsed"] += 1
                    log(f" βœ… Parsed ({hits} colors)")
            except Exception as e:
                log(f" ⚠️ Failed: {str(e)[:50]}")
                self.warnings.append(f"Failed to fetch {css_url}: {str(e)}")

        # Process CSS variables whose value starts with a color
        log(" 🎨 Processing CSS variables...")
        for var_name, var_value in self.css_variables.items():
            color_match = self.color_regex.match(var_value.strip())
            if color_match:
                # Pass only the matched color so trailing tokens such as
                # "!important" do not defeat the color parser.
                self._aggregate_color({
                    "value": color_match.group(0),
                    "source": f"css-var:{var_name}",
                    "context": "css-variable",
                })
                self.stats["css_variables_found"] += 1

        self.stats["colors_found"] = len(self.colors)

    def _log_summary(self, title: str, log: Callable[[str], None]):
        """Log the counters and the ten most frequent colors under the given title."""
        log("")
        log(title)
        log(f" CSS files parsed: {self.stats['css_files_parsed']}")
        log(f" Style blocks parsed: {self.stats['style_blocks_parsed']}")
        log(f" CSS variables found: {self.stats['css_variables_found']}")
        log(f" Unique colors found: {self.stats['colors_found']}")
        log("")

        if self.colors:
            sorted_colors = sorted(self.colors.items(), key=lambda x: -x[1]['frequency'])[:10]
            log(" 🎨 Top colors found:")
            for hex_val, data in sorted_colors:
                log(f" {hex_val} (used {data['frequency']}x)")

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------

    async def extract_with_firecrawl(
        self,
        url: str,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> dict:
        """
        Extract colors using Firecrawl API.

        Falls back to the httpx-based extraction when no API key is set or
        when the Firecrawl path raises.

        Args:
            url: Website URL to analyze
            log_callback: Optional callback for logging progress

        Returns:
            Dict with "colors", "css_variables" and "stats" keys
        """
        log = self._make_logger(log_callback)

        if not FIRECRAWL_AVAILABLE:
            log("⚠️ Firecrawl not available, skipping...")
            return self._empty_result()

        log("")
        log("=" * 60)
        log("πŸ”₯ FIRECRAWL CSS EXTRACTION")
        log("=" * 60)
        log("")

        if not self.api_key:
            log(" ⚠️ No Firecrawl API key - using fallback method")
            return await self._fallback_css_extraction(url, log_callback)

        try:
            app = FirecrawlApp(api_key=self.api_key)

            log(f" 🌐 Scraping: {url}")

            # scrape_url is a blocking call; run it in a worker thread so the
            # event loop stays responsive.
            result = await asyncio.to_thread(
                app.scrape_url,
                url,
                params={
                    'formats': ['html'],
                    'includeTags': ['style', 'link'],
                },
            )

            if not result:
                log(" ❌ Firecrawl returned no results")
                return self._empty_result()

            html_content = result.get('html', '') or result.get('content', '') or ''

            log(f" βœ… Page scraped ({len(html_content)} chars)")

            async def fetch_css(css_url: str) -> str:
                css_result = await asyncio.to_thread(
                    app.scrape_url, css_url, params={'formats': ['rawHtml']}
                )
                css_result = css_result or {}
                return css_result.get('rawHtml', '') or css_result.get('content', '') or ''

            await self._process_page(url, html_content, fetch_css, log)
            self._log_summary("πŸ“Š FIRECRAWL RESULTS:", log)
            return self._result()

        except Exception as e:
            log(f" ❌ Firecrawl error: {str(e)}")
            self.errors.append(f"Firecrawl error: {str(e)}")
            # Drop partial results so the fallback does not count them twice.
            self._reset_results()
            return await self._fallback_css_extraction(url, log_callback)

    async def _fallback_css_extraction(
        self,
        url: str,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> dict:
        """
        Fallback CSS extraction using httpx (no Firecrawl API key needed).

        Args:
            url: Website URL to analyze
            log_callback: Optional callback for logging progress

        Returns:
            Dict with "colors", "css_variables" and "stats" keys; the first two
            are empty when the extraction fails.
        """
        log = self._make_logger(log_callback)

        log("")
        log("πŸ”„ Using fallback CSS extraction (httpx)...")

        try:
            import httpx

            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                # Fetch main page
                log(f" 🌐 Fetching: {url}")
                response = await client.get(url)
                # An HTTP error page must not be analyzed as if it were the site.
                response.raise_for_status()
                html_content = response.text

                log(f" βœ… Page fetched ({len(html_content)} chars)")

                async def fetch_css(css_url: str) -> str:
                    css_response = await client.get(css_url)
                    css_response.raise_for_status()
                    return css_response.text

                # Resolve links against the final URL in case of redirects.
                await self._process_page(str(response.url), html_content, fetch_css, log)

            self._log_summary("πŸ“Š FALLBACK EXTRACTION RESULTS:", log)
            return self._result()

        except Exception as e:
            log(f" ❌ Fallback extraction failed: {str(e)}")
            self.errors.append(f"Fallback extraction failed: {str(e)}")
            return self._empty_result()
427
+
428
+
429
async def extract_css_colors(
    url: str,
    api_key: Optional[str] = None,
    log_callback: Optional[Callable[[str], None]] = None
) -> dict:
    """
    Run a one-off CSS color extraction for a single URL.

    Builds a fresh FirecrawlExtractor with the given key and awaits its
    extract_with_firecrawl coroutine.

    Args:
        url: Website URL
        api_key: Optional Firecrawl API key
        log_callback: Optional logging callback

    Returns:
        Whatever extract_with_firecrawl returns: a dict with colors,
        css_variables, and stats
    """
    return await FirecrawlExtractor(api_key=api_key).extract_with_firecrawl(
        url, log_callback
    )