"""
Agent 1B: Firecrawl CSS Extractor
Design System Extractor v2
Persona: CSS Deep Diver
Responsibilities:
- Fetch and parse all CSS files from a website
- Extract colors from CSS rules, variables, and values
- Bypass CORS restrictions by fetching CSS directly
- Complement Playwright extraction with deeper CSS analysis
"""
import re
import asyncio
from typing import Optional, Callable
from datetime import datetime
# Firecrawl for web scraping
try:
from firecrawl import FirecrawlApp
FIRECRAWL_AVAILABLE = True
except ImportError:
FIRECRAWL_AVAILABLE = False
from core.color_utils import (
parse_color,
get_contrast_with_white,
get_contrast_with_black,
)
class FirecrawlExtractor:
# NOTE(review): this region is corrupted. The class docstring below is fused
# (at the line beginning "2. Parsing inline") with a statement from the middle
# of a method body — everything between them, including the closing quotes of
# the docstring, __init__, the helper methods (_extract_colors_from_css,
# _aggregate_color, _extract_css_variables) and the head of the Firecrawl
# extraction coroutine, appears to have been stripped by an HTML-unaware tool
# that ate a literal `<style ...>` tag. Code is left byte-identical pending
# recovery of the original file; compare the intact sibling
# _fallback_css_extraction for the intended shape of this method body.
"""
Extracts colors from CSS files using Firecrawl.
This complements the Playwright extraction by:
1. Fetching all linked CSS files
2. Parsing inline ', html_content, re.DOTALL | re.IGNORECASE)
# NOTE(review): the fused line above was presumably the docstring line
# "2. Parsing inline <style> blocks" plus, much later in the file,
# `style_blocks = re.findall(r'<style[^>]*>(.*?)</style>', html_content,
# re.DOTALL | re.IGNORECASE)` — see the parallel code in
# _fallback_css_extraction. TODO confirm against the original source.
# From here down this mirrors _fallback_css_extraction, but using the
# Firecrawl client (`app`, defined in the missing method head) instead of
# httpx for the per-stylesheet fetches.
for i, block in enumerate(style_blocks):
colors = self._extract_colors_from_css(block, f"style-block-{i}")
for color in colors:
self._aggregate_color(color)
variables = self._extract_css_variables(block)
self.css_variables.update(variables)
self.stats["style_blocks_parsed"] += 1
log(f" Found {len(style_blocks)} style blocks")
# Extract CSS file URLs
log(" 🔗 Finding linked CSS files...")
css_urls = re.findall(r'href=["\']([^"\']*\.css[^"\']*)["\']', html_content, re.IGNORECASE)
log(f" Found {len(css_urls)} CSS files")
# Fetch and parse each CSS file
for css_url in css_urls[:15]: # Limit to 15 files
try:
# Make URL absolute
if css_url.startswith('//'):
css_url = 'https:' + css_url
elif css_url.startswith('/'):
from urllib.parse import urlparse
parsed = urlparse(url)
css_url = f"{parsed.scheme}://{parsed.netloc}{css_url}"
elif not css_url.startswith('http'):
from urllib.parse import urljoin
css_url = urljoin(url, css_url)
log(f" 📄 Fetching: {css_url[:60]}...")
# Fetch CSS file
css_result = app.scrape_url(css_url, params={'formats': ['rawHtml']})
css_content = css_result.get('rawHtml', '') or css_result.get('content', '')
if css_content:
colors = self._extract_colors_from_css(css_content, css_url.split('/')[-1])
for color in colors:
self._aggregate_color(color)
variables = self._extract_css_variables(css_content)
self.css_variables.update(variables)
self.stats["css_files_parsed"] += 1
log(f" ✅ Parsed ({len(colors)} colors)")
except Exception as e:
log(f" ⚠️ Failed: {str(e)[:50]}")
self.warnings.append(f"Failed to fetch {css_url}: {str(e)}")
# Process CSS variables that contain colors
log(" 🎨 Processing CSS variables...")
for var_name, var_value in self.css_variables.items():
if self.color_regex.match(var_value.strip()):
self._aggregate_color({
"value": var_value.strip(),
"source": f"css-var:{var_name}",
"context": "css-variable",
})
self.stats["css_variables_found"] += 1
self.stats["colors_found"] = len(self.colors)
# Log summary
log("")
log("📊 FIRECRAWL RESULTS:")
log(f" CSS files parsed: {self.stats['css_files_parsed']}")
log(f" Style blocks parsed: {self.stats['style_blocks_parsed']}")
log(f" CSS variables found: {self.stats['css_variables_found']}")
log(f" Unique colors found: {self.stats['colors_found']}")
log("")
# Show top colors found
if self.colors:
sorted_colors = sorted(self.colors.items(), key=lambda x: -x[1]['frequency'])[:10]
log(" 🎨 Top colors found:")
for hex_val, data in sorted_colors:
log(f" {hex_val} (used {data['frequency']}x)")
return {
"colors": self.colors,
"css_variables": self.css_variables,
"stats": self.stats,
}
except Exception as e:
log(f" ❌ Firecrawl error: {str(e)}")
self.errors.append(f"Firecrawl error: {str(e)}")
return await self._fallback_css_extraction(url, log_callback)
async def _fallback_css_extraction(
    self,
    url: str,
    log_callback: Optional[Callable[[str], None]] = None
) -> dict:
    """
    Fallback CSS extraction using httpx (no Firecrawl API key needed).

    Fetches the page HTML directly, parses inline <style> blocks and up to
    15 linked stylesheets, and folds every color and CSS custom property
    found into ``self.colors`` / ``self.css_variables`` via the extractor's
    helper methods.

    Args:
        url: Absolute URL of the page to analyze.
        log_callback: Optional callable that receives progress log lines.

    Returns:
        Dict with "colors", "css_variables", and "stats". On a top-level
        failure the color/variable payloads are empty and the error is
        recorded in ``self.errors``; per-stylesheet failures only append
        to ``self.warnings``.
    """
    def log(msg: str):
        if log_callback:
            log_callback(msg)

    log("")
    log("🔄 Using fallback CSS extraction (httpx)...")
    try:
        import httpx
        # Hoisted out of the per-file loop (was imported inside it).
        from urllib.parse import urljoin

        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            # Fetch main page
            log(f" 🌐 Fetching: {url}")
            response = await client.get(url)
            html_content = response.text
            log(f" ✅ Page fetched ({len(html_content)} chars)")

            # Extract <style> blocks.
            # FIX: this statement was garbled in the original source (the
            # literal <style> markup had been stripped, fusing a comment
            # with the findall call); reconstructed from surrounding usage.
            style_blocks = re.findall(
                r'<style[^>]*>(.*?)</style>',
                html_content,
                re.DOTALL | re.IGNORECASE,
            )
            for i, block in enumerate(style_blocks):
                colors = self._extract_colors_from_css(block, f"style-block-{i}")
                for color in colors:
                    self._aggregate_color(color)
                variables = self._extract_css_variables(block)
                self.css_variables.update(variables)
                self.stats["style_blocks_parsed"] += 1
            log(f" Found {len(style_blocks)} style blocks")

            # Extract CSS file URLs
            log(" 🔗 Finding linked CSS files...")
            css_urls = re.findall(r'href=["\']([^"\']*\.css[^"\']*)["\']', html_content, re.IGNORECASE)
            log(f" Found {len(css_urls)} CSS files")

            # Fetch and parse each CSS file (capped at 15 files)
            for css_url in css_urls[:15]:
                try:
                    # Make URL absolute. Protocol-relative URLs keep the
                    # original behavior of forcing https; root-relative and
                    # relative paths both resolve against the page URL
                    # (urljoin covers what the old urlparse branch rebuilt
                    # by hand).
                    if css_url.startswith('//'):
                        css_url = 'https:' + css_url
                    elif not css_url.startswith('http'):
                        css_url = urljoin(url, css_url)
                    log(f" 📄 Fetching: {css_url[:60]}...")
                    css_response = await client.get(css_url)
                    css_content = css_response.text
                    if css_content:
                        colors = self._extract_colors_from_css(css_content, css_url.split('/')[-1])
                        for color in colors:
                            self._aggregate_color(color)
                        variables = self._extract_css_variables(css_content)
                        self.css_variables.update(variables)
                        self.stats["css_files_parsed"] += 1
                        log(f" ✅ Parsed ({len(colors)} colors)")
                except Exception as e:
                    # Best-effort: one bad stylesheet shouldn't abort the run.
                    log(f" ⚠️ Failed: {str(e)[:50]}")
                    self.warnings.append(f"Failed to fetch {css_url}: {str(e)}")

            # Process CSS variables whose values are themselves colors
            log(" 🎨 Processing CSS variables...")
            for var_name, var_value in self.css_variables.items():
                if self.color_regex.match(var_value.strip()):
                    self._aggregate_color({
                        "value": var_value.strip(),
                        "source": f"css-var:{var_name}",
                        "context": "css-variable",
                    })
                    self.stats["css_variables_found"] += 1
            self.stats["colors_found"] = len(self.colors)

            # Log summary
            log("")
            log("📊 FALLBACK EXTRACTION RESULTS:")
            log(f" CSS files parsed: {self.stats['css_files_parsed']}")
            log(f" Style blocks parsed: {self.stats['style_blocks_parsed']}")
            log(f" CSS variables found: {self.stats['css_variables_found']}")
            log(f" Unique colors found: {self.stats['colors_found']}")
            log("")

            # Show top colors
            if self.colors:
                sorted_colors = sorted(self.colors.items(), key=lambda x: -x[1]['frequency'])[:10]
                log(" 🎨 Top colors found:")
                for hex_val, data in sorted_colors:
                    log(f" {hex_val} (used {data['frequency']}x)")

            return {
                "colors": self.colors,
                "css_variables": self.css_variables,
                "stats": self.stats,
            }
    except Exception as e:
        log(f" ❌ Fallback extraction failed: {str(e)}")
        self.errors.append(f"Fallback extraction failed: {str(e)}")
        return {"colors": {}, "css_variables": {}, "stats": self.stats}
async def extract_css_colors(
    url: str,
    api_key: Optional[str] = None,
    log_callback: Optional[Callable[[str], None]] = None
) -> dict:
    """
    Run a one-shot CSS color extraction against *url*.

    Thin convenience wrapper: builds a :class:`FirecrawlExtractor` and
    delegates to its ``extract_with_firecrawl`` entry point.

    Args:
        url: Website URL.
        api_key: Optional Firecrawl API key.
        log_callback: Optional logging callback.

    Returns:
        Dict with colors, css_variables, and stats.
    """
    worker = FirecrawlExtractor(api_key=api_key)
    result = await worker.extract_with_firecrawl(url, log_callback)
    return result