"""
Agent 1B: Firecrawl CSS Extractor
Design System Extractor v2

Persona: CSS Deep Diver

Responsibilities:
- Fetch and parse all CSS files from a website
- Extract colors from CSS rules, variables, and values
- Bypass CORS restrictions by fetching CSS directly
- Complement Playwright extraction with deeper CSS analysis
"""
| |
|
| | import re |
| | import asyncio |
| | from typing import Optional, Callable |
| | from datetime import datetime |
| |
|
| | |
| | try: |
| | from firecrawl import FirecrawlApp |
| | FIRECRAWL_AVAILABLE = True |
| | except ImportError: |
| | FIRECRAWL_AVAILABLE = False |
| |
|
| | from core.color_utils import ( |
| | parse_color, |
| | get_contrast_with_white, |
| | get_contrast_with_black, |
| | ) |
| |
|
| |
|
class FirecrawlExtractor:
    """
    Extracts colors from CSS files using Firecrawl.

    This complements the Playwright extraction by:
    1. Fetching all linked CSS files
    2. Parsing inline <style> blocks
    3. Extracting CSS variables
    4. Finding all color values in CSS rules
    """

    # Cap on linked stylesheets fetched per page (keeps runs bounded).
    MAX_CSS_FILES = 15

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize Firecrawl extractor.

        Args:
            api_key: Firecrawl API key (optional for free tier)
        """
        self.api_key = api_key
        # hex color -> aggregated metadata (frequency, contexts, sources, contrast)
        self.colors: dict[str, dict] = {}
        # CSS custom property name (--foo) -> raw declared value
        self.css_variables: dict[str, str] = {}
        self.errors: list[str] = []
        self.warnings: list[str] = []
        self.stats = {
            "css_files_parsed": 0,
            "style_blocks_parsed": 0,
            "colors_found": 0,
            "css_variables_found": 0,
        }

        # Matches hex, rgb(), rgba(), hsl(), hsla() literals in CSS text.
        self.color_regex = re.compile(
            r'#[0-9a-fA-F]{3,8}|'
            r'rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|'
            r'rgba\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*[\d.]+\s*\)|'
            r'hsl\(\s*\d+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*\)|'
            r'hsla\(\s*\d+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*,\s*[\d.]+\s*\)',
            re.IGNORECASE
        )

        # Matches a custom-property value (kept for backward compatibility;
        # _extract_css_variables uses its own two-group pattern).
        self.css_var_regex = re.compile(
            r'--[\w-]+\s*:\s*([^;]+)',
            re.IGNORECASE
        )

    def _extract_colors_from_css(self, css_text: str, source: str = "css") -> list[dict]:
        """Extract all color values from CSS text.

        Returns one dict per occurrence (duplicates included) with
        ``value``, ``source`` and ``context`` keys.
        """
        return [
            {
                "value": match.strip(),
                "source": source,
                "context": "firecrawl-css",
            }
            for match in self.color_regex.findall(css_text)
        ]

    def _extract_css_variables(self, css_text: str) -> dict[str, str]:
        """Extract CSS custom-property declarations from CSS text.

        Fix: the previous implementation re-searched the whole text for
        every match (always re-finding the first variable) and required a
        trailing ``;``, which dropped the last declaration in a block.
        This version handles both ``;``- and ``}``-terminated declarations.
        """
        variables: dict[str, str] = {}
        for match in re.finditer(r'(--[\w-]+)\s*:\s*([^;}]+)', css_text):
            variables[match.group(1)] = match.group(2).strip()
        return variables

    def _process_color(self, color_value: str) -> Optional[str]:
        """Process and normalize a color value to hex; None if unparseable."""
        parsed = parse_color(color_value)
        if parsed:
            return parsed.hex
        return None

    def _aggregate_color(self, color_data: dict):
        """Aggregate one color occurrence into ``self.colors``.

        Creates the entry (with contrast ratios) on first sight, then
        bumps frequency and records new contexts/sources.
        """
        hex_value = self._process_color(color_data.get("value", ""))
        if not hex_value:
            return

        if hex_value not in self.colors:
            self.colors[hex_value] = {
                "value": hex_value,
                "frequency": 0,
                "contexts": [],
                "sources": [],
                "contrast_white": round(get_contrast_with_white(hex_value), 2),
                "contrast_black": round(get_contrast_with_black(hex_value), 2),
            }

        entry = self.colors[hex_value]
        entry["frequency"] += 1

        context = color_data.get("context", "")
        if context and context not in entry["contexts"]:
            entry["contexts"].append(context)

        source = color_data.get("source", "")
        if source and source not in entry["sources"]:
            entry["sources"].append(source)

    def _ingest_css(self, css_text: str, source: str) -> int:
        """Aggregate colors and variables from one CSS payload; return color count."""
        colors = self._extract_colors_from_css(css_text, source)
        for color in colors:
            self._aggregate_color(color)
        self.css_variables.update(self._extract_css_variables(css_text))
        return len(colors)

    @staticmethod
    def _resolve_css_url(page_url: str, css_url: str) -> str:
        """Resolve a stylesheet href (protocol-relative, root-relative,
        relative, or absolute) against the page URL."""
        from urllib.parse import urljoin
        if css_url.startswith('//'):
            # Protocol-relative: force https, matching prior behavior.
            return 'https:' + css_url
        return urljoin(page_url, css_url)

    async def _harvest_html(
        self,
        page_url: str,
        html_content: str,
        fetch_css: Callable,
        log: Callable[[str], None],
    ):
        """Shared extraction pipeline for both Firecrawl and fallback paths.

        Parses inline <style> blocks, fetches linked stylesheets via the
        supplied async ``fetch_css(url) -> str`` callable, then promotes
        color-valued CSS variables into the color collection.
        """
        log("  🔍 Parsing <style> blocks...")
        style_blocks = re.findall(
            r'<style[^>]*>(.*?)</style>', html_content, re.DOTALL | re.IGNORECASE
        )
        for i, block in enumerate(style_blocks):
            self._ingest_css(block, f"style-block-{i}")
            self.stats["style_blocks_parsed"] += 1
        log(f"     Found {len(style_blocks)} style blocks")

        log("  🔍 Finding linked CSS files...")
        css_urls = re.findall(
            r'href=["\']([^"\']*\.css[^"\']*)["\']', html_content, re.IGNORECASE
        )
        log(f"     Found {len(css_urls)} CSS files")

        for css_url in css_urls[:self.MAX_CSS_FILES]:
            try:
                css_url = self._resolve_css_url(page_url, css_url)
                log(f"     📥 Fetching: {css_url[:60]}...")
                css_content = await fetch_css(css_url)
                if css_content:
                    n_colors = self._ingest_css(css_content, css_url.split('/')[-1])
                    self.stats["css_files_parsed"] += 1
                    log(f"     ✅ Parsed ({n_colors} colors)")
            except Exception as e:
                # Best-effort: one broken stylesheet must not abort the run.
                log(f"     ⚠️ Failed: {str(e)[:50]}")
                self.warnings.append(f"Failed to fetch {css_url}: {str(e)}")

        # Promote variables whose value is itself a color literal.
        log("  🎨 Processing CSS variables...")
        for var_name, var_value in self.css_variables.items():
            value = var_value.strip()
            if self.color_regex.match(value):
                self._aggregate_color({
                    "value": value,
                    "source": f"css-var:{var_name}",
                    "context": "css-variable",
                })
                self.stats["css_variables_found"] += 1

        self.stats["colors_found"] = len(self.colors)

    def _log_summary(self, log: Callable[[str], None], title: str):
        """Log stats and the top-10 most frequent colors."""
        log("")
        log(f"📊 {title}:")
        log(f"   CSS files parsed: {self.stats['css_files_parsed']}")
        log(f"   Style blocks parsed: {self.stats['style_blocks_parsed']}")
        log(f"   CSS variables found: {self.stats['css_variables_found']}")
        log(f"   Unique colors found: {self.stats['colors_found']}")
        log("")

        if self.colors:
            top = sorted(self.colors.items(), key=lambda kv: -kv[1]['frequency'])[:10]
            log("  🎨 Top colors found:")
            for hex_val, data in top:
                log(f"     {hex_val} (used {data['frequency']}x)")

    def _result(self) -> dict:
        """Build the standard result payload."""
        return {
            "colors": self.colors,
            "css_variables": self.css_variables,
            "stats": self.stats,
        }

    async def extract_with_firecrawl(
        self,
        url: str,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> dict:
        """
        Extract colors using Firecrawl API.

        Args:
            url: Website URL to analyze
            log_callback: Optional callback for logging progress

        Returns:
            Dict with extracted colors and stats
        """

        def log(msg: str):
            if log_callback:
                log_callback(msg)

        if not FIRECRAWL_AVAILABLE:
            log("⚠️ Firecrawl not available, skipping...")
            return self._result()

        log("")
        log("=" * 60)
        log("🔥 FIRECRAWL CSS EXTRACTION")
        log("=" * 60)
        log("")

        try:
            if not self.api_key:
                # Free-tier path: plain HTTP fetching, no API needed.
                log("  ⚠️ No Firecrawl API key - using fallback method")
                return await self._fallback_css_extraction(url, log_callback)

            app = FirecrawlApp(api_key=self.api_key)
            log(f"  🌐 Scraping: {url}")

            result = app.scrape_url(
                url,
                params={
                    'formats': ['html'],
                    'includeTags': ['style', 'link'],
                }
            )
            if not result:
                log("  ❌ Firecrawl returned no results")
                return self._result()

            html_content = result.get('html', '') or result.get('content', '')
            log(f"  ✅ Page scraped ({len(html_content)} chars)")

            async def fetch_css(css_url: str) -> str:
                # Firecrawl fetch of a raw stylesheet (synchronous SDK call).
                css_result = app.scrape_url(css_url, params={'formats': ['rawHtml']})
                return css_result.get('rawHtml', '') or css_result.get('content', '')

            await self._harvest_html(url, html_content, fetch_css, log)
            self._log_summary(log, "FIRECRAWL RESULTS")
            return self._result()

        except Exception as e:
            # Any Firecrawl failure degrades to the httpx fallback.
            log(f"  ❌ Firecrawl error: {str(e)}")
            self.errors.append(f"Firecrawl error: {str(e)}")
            return await self._fallback_css_extraction(url, log_callback)

    async def _fallback_css_extraction(
        self,
        url: str,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> dict:
        """
        Fallback CSS extraction using httpx (no Firecrawl API key needed).
        """

        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("🔄 Using fallback CSS extraction (httpx)...")

        try:
            import httpx

            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                log(f"  🌐 Fetching: {url}")
                response = await client.get(url)
                html_content = response.text
                log(f"  ✅ Page fetched ({len(html_content)} chars)")

                async def fetch_css(css_url: str) -> str:
                    css_response = await client.get(css_url)
                    return css_response.text

                await self._harvest_html(url, html_content, fetch_css, log)

            self._log_summary(log, "FALLBACK EXTRACTION RESULTS")
            return self._result()

        except Exception as e:
            log(f"  ❌ Fallback extraction failed: {str(e)}")
            self.errors.append(f"Fallback extraction failed: {str(e)}")
            return self._result()
| |
|
| |
|
async def extract_css_colors(
    url: str,
    api_key: Optional[str] = None,
    log_callback: Optional[Callable[[str], None]] = None
) -> dict:
    """
    Convenience function to extract CSS colors.

    Args:
        url: Website URL
        api_key: Optional Firecrawl API key
        log_callback: Optional logging callback

    Returns:
        Dict with colors, css_variables, and stats
    """
    # One-shot helper: build a fresh extractor and run the full pipeline.
    return await FirecrawlExtractor(api_key=api_key).extract_with_firecrawl(url, log_callback)
| |
|