# Design-System-Extractor-2 / agents / firecrawl_extractor.py
# riazmo's picture
# Upload firecrawl_extractor.py
# d13fef9 verified
# raw
# history blame
# 17.6 kB
"""
Agent 1B: Firecrawl CSS Extractor
Design System Extractor v2
Persona: CSS Deep Diver
Responsibilities:
- Fetch and parse all CSS files from a website
- Extract colors from CSS rules, variables, and values
- Bypass CORS restrictions by fetching CSS directly
- Complement Playwright extraction with deeper CSS analysis
"""
import re
import asyncio
from typing import Optional, Callable
from datetime import datetime
# Firecrawl for web scraping
try:
from firecrawl import FirecrawlApp
FIRECRAWL_AVAILABLE = True
except ImportError:
FIRECRAWL_AVAILABLE = False
from core.color_utils import (
parse_color,
get_contrast_with_white,
get_contrast_with_black,
)
class FirecrawlExtractor:
    """
    Extracts colors from CSS files using Firecrawl.
    This complements the Playwright extraction by:
    1. Fetching all linked CSS files
    2. Parsing inline <style> blocks
    3. Extracting CSS variables
    4. Finding all color values in CSS rules
    """

    # Maximum number of linked stylesheets fetched per page (both the
    # Firecrawl path and the httpx fallback honor this limit).
    MAX_CSS_FILES = 15

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize Firecrawl extractor.
        Args:
            api_key: Firecrawl API key (optional for free tier)
        """
        self.api_key = api_key
        # hex color -> aggregated metadata (frequency, contexts, sources, contrast)
        self.colors: dict[str, dict] = {}
        # CSS custom-property name (e.g. "--primary") -> raw declared value
        self.css_variables: dict[str, str] = {}
        self.errors: list[str] = []
        self.warnings: list[str] = []
        self.stats = {
            "css_files_parsed": 0,
            "style_blocks_parsed": 0,
            "colors_found": 0,
            "css_variables_found": 0,
        }
        # Matches hex, rgb(), rgba(), hsl() and hsla() color literals.
        self.color_regex = re.compile(
            r'#[0-9a-fA-F]{3,8}|'
            r'rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|'
            r'rgba\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*[\d.]+\s*\)|'
            r'hsl\(\s*\d+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*\)|'
            r'hsla\(\s*\d+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*,\s*[\d.]+\s*\)',
            re.IGNORECASE
        )
        # CSS variable pattern (kept for backward compatibility; value only).
        self.css_var_regex = re.compile(
            r'--[\w-]+\s*:\s*([^;]+)',
            re.IGNORECASE
        )
        # Full declaration pattern used internally: captures name AND value,
        # and stops the value at ';' OR '}' so the last declaration of a rule
        # block (which may have no trailing ';') is captured correctly.
        self._css_var_decl_regex = re.compile(
            r'(--[\w-]+)\s*:\s*([^;}]+)',
            re.IGNORECASE
        )

    def _extract_colors_from_css(self, css_text: str, source: str = "css") -> list[dict]:
        """Extract all color values from CSS text.

        Args:
            css_text: Raw CSS to scan.
            source: Label recorded on each color (file name or style-block id).
        Returns:
            List of dicts with "value", "source" and "context" keys.
        """
        return [
            {
                "value": match.strip(),
                "source": source,
                "context": "firecrawl-css",
            }
            for match in self.color_regex.findall(css_text)
        ]

    def _extract_css_variables(self, css_text: str) -> dict[str, str]:
        """Extract CSS custom properties (``--name: value``) from CSS text.

        Single left-to-right scan. A value ends at ';' or at the '}' closing
        the rule, so block-final declarations without a trailing semicolon
        are captured as well. Later declarations of the same name win.
        """
        variables: dict[str, str] = {}
        for match in self._css_var_decl_regex.finditer(css_text):
            variables[match.group(1)] = match.group(2).strip()
        return variables

    def _process_color(self, color_value: str) -> Optional[str]:
        """Process and normalize a color value to hex; None if unparseable."""
        parsed = parse_color(color_value)
        return parsed.hex if parsed else None

    def _aggregate_color(self, color_data: dict):
        """Fold one extracted color into ``self.colors``.

        Creates the entry (with contrast ratios against white/black) on first
        sight, then bumps frequency and records new contexts/sources.
        """
        hex_value = self._process_color(color_data.get("value", ""))
        if not hex_value:
            return
        entry = self.colors.get(hex_value)
        if entry is None:
            entry = self.colors[hex_value] = {
                "value": hex_value,
                "frequency": 0,
                "contexts": [],
                "sources": [],
                "contrast_white": round(get_contrast_with_white(hex_value), 2),
                "contrast_black": round(get_contrast_with_black(hex_value), 2),
            }
        # Update frequency and context
        entry["frequency"] += 1
        context = color_data.get("context", "")
        if context and context not in entry["contexts"]:
            entry["contexts"].append(context)
        source = color_data.get("source", "")
        if source and source not in entry["sources"]:
            entry["sources"].append(source)

    def _make_absolute_url(self, css_url: str, page_url: str) -> str:
        """Resolve a stylesheet href against the page URL it was found on.

        Handles protocol-relative (``//cdn...``), root-relative (``/x.css``)
        and document-relative hrefs; absolute http(s) URLs pass through.
        """
        from urllib.parse import urljoin, urlparse
        if css_url.startswith('//'):
            return 'https:' + css_url
        if css_url.startswith('/'):
            parsed = urlparse(page_url)
            return f"{parsed.scheme}://{parsed.netloc}{css_url}"
        if not css_url.startswith('http'):
            return urljoin(page_url, css_url)
        return css_url

    def _ingest_css(self, css_text: str, source: str) -> int:
        """Collect colors and CSS variables from one chunk of CSS.

        Returns the number of color literals found (used for logging).
        """
        colors = self._extract_colors_from_css(css_text, source)
        for color in colors:
            self._aggregate_color(color)
        self.css_variables.update(self._extract_css_variables(css_text))
        return len(colors)

    def _parse_style_blocks(self, html_content: str, log: Callable[[str], None]) -> None:
        """Parse every inline <style> block found in the HTML."""
        log(" πŸ“ Parsing <style> blocks...")
        style_blocks = re.findall(
            r'<style[^>]*>(.*?)</style>', html_content, re.DOTALL | re.IGNORECASE
        )
        for i, block in enumerate(style_blocks):
            self._ingest_css(block, f"style-block-{i}")
            self.stats["style_blocks_parsed"] += 1
        log(f" Found {len(style_blocks)} style blocks")

    def _find_css_urls(self, html_content: str, log: Callable[[str], None]) -> list[str]:
        """Return every href in the HTML that points at a .css resource."""
        log(" πŸ”— Finding linked CSS files...")
        css_urls = re.findall(
            r'href=["\']([^"\']*\.css[^"\']*)["\']', html_content, re.IGNORECASE
        )
        log(f" Found {len(css_urls)} CSS files")
        return css_urls

    def _finalize_results(self, log: Callable[[str], None], heading: str) -> dict:
        """Promote color-valued CSS variables to colors, log a summary under
        ``heading`` and build the result payload."""
        log(" 🎨 Processing CSS variables...")
        for var_name, var_value in self.css_variables.items():
            # Only variables whose value *starts* with a color literal count.
            if self.color_regex.match(var_value.strip()):
                self._aggregate_color({
                    "value": var_value.strip(),
                    "source": f"css-var:{var_name}",
                    "context": "css-variable",
                })
                self.stats["css_variables_found"] += 1
        self.stats["colors_found"] = len(self.colors)
        # Log summary
        log("")
        log(f"πŸ“Š {heading}:")
        log(f" CSS files parsed: {self.stats['css_files_parsed']}")
        log(f" Style blocks parsed: {self.stats['style_blocks_parsed']}")
        log(f" CSS variables found: {self.stats['css_variables_found']}")
        log(f" Unique colors found: {self.stats['colors_found']}")
        log("")
        # Show top colors found
        if self.colors:
            sorted_colors = sorted(self.colors.items(), key=lambda x: -x[1]['frequency'])[:10]
            log(" 🎨 Top colors found:")
            for hex_val, data in sorted_colors:
                log(f" {hex_val} (used {data['frequency']}x)")
        return {
            "colors": self.colors,
            "css_variables": self.css_variables,
            "stats": self.stats,
        }

    async def extract_with_firecrawl(
        self,
        url: str,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> dict:
        """
        Extract colors using Firecrawl API.

        Falls back to :meth:`_fallback_css_extraction` when no API key was
        supplied or when the Firecrawl call raises.

        Args:
            url: Website URL to analyze
            log_callback: Optional callback for logging progress
        Returns:
            Dict with extracted colors and stats
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        if not FIRECRAWL_AVAILABLE:
            log("⚠️ Firecrawl not available, skipping...")
            return {"colors": {}, "css_variables": {}, "stats": self.stats}
        log("")
        log("=" * 60)
        log("πŸ”₯ FIRECRAWL CSS EXTRACTION")
        log("=" * 60)
        log("")
        try:
            # Initialize Firecrawl
            if self.api_key:
                app = FirecrawlApp(api_key=self.api_key)
            else:
                # Try without API key (limited functionality)
                log(" ⚠️ No Firecrawl API key - using fallback method")
                return await self._fallback_css_extraction(url, log_callback)
            log(f" 🌐 Scraping: {url}")
            # Scrape the page
            result = app.scrape_url(
                url,
                params={
                    'formats': ['html'],
                    'includeTags': ['style', 'link'],
                }
            )
            if not result:
                log(" ❌ Firecrawl returned no results")
                return {"colors": {}, "css_variables": {}, "stats": self.stats}
            html_content = result.get('html', '') or result.get('content', '')
            log(f" βœ… Page scraped ({len(html_content)} chars)")
            self._parse_style_blocks(html_content, log)
            css_urls = self._find_css_urls(html_content, log)
            # Fetch and parse each CSS file (bounded by MAX_CSS_FILES)
            for css_url in css_urls[:self.MAX_CSS_FILES]:
                try:
                    css_url = self._make_absolute_url(css_url, url)
                    log(f" πŸ“„ Fetching: {css_url[:60]}...")
                    # Fetch CSS file via Firecrawl (raw, no HTML processing)
                    css_result = app.scrape_url(css_url, params={'formats': ['rawHtml']})
                    css_content = css_result.get('rawHtml', '') or css_result.get('content', '')
                    if css_content:
                        count = self._ingest_css(css_content, css_url.split('/')[-1])
                        self.stats["css_files_parsed"] += 1
                        log(f" βœ… Parsed ({count} colors)")
                except Exception as e:
                    # Best-effort: one bad stylesheet must not abort the run.
                    log(f" ⚠️ Failed: {str(e)[:50]}")
                    self.warnings.append(f"Failed to fetch {css_url}: {str(e)}")
            return self._finalize_results(log, "FIRECRAWL RESULTS")
        except Exception as e:
            log(f" ❌ Firecrawl error: {str(e)}")
            self.errors.append(f"Firecrawl error: {str(e)}")
            return await self._fallback_css_extraction(url, log_callback)

    async def _fallback_css_extraction(
        self,
        url: str,
        log_callback: Optional[Callable[[str], None]] = None
    ) -> dict:
        """
        Fallback CSS extraction using httpx (no Firecrawl API key needed).
        """
        def log(msg: str):
            if log_callback:
                log_callback(msg)

        log("")
        log("πŸ”„ Using fallback CSS extraction (httpx)...")
        try:
            import httpx
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                # Fetch main page
                log(f" 🌐 Fetching: {url}")
                response = await client.get(url)
                html_content = response.text
                log(f" βœ… Page fetched ({len(html_content)} chars)")
                self._parse_style_blocks(html_content, log)
                css_urls = self._find_css_urls(html_content, log)
                # Fetch and parse each CSS file (bounded by MAX_CSS_FILES)
                for css_url in css_urls[:self.MAX_CSS_FILES]:
                    try:
                        css_url = self._make_absolute_url(css_url, url)
                        log(f" πŸ“„ Fetching: {css_url[:60]}...")
                        css_response = await client.get(css_url)
                        css_content = css_response.text
                        if css_content:
                            count = self._ingest_css(css_content, css_url.split('/')[-1])
                            self.stats["css_files_parsed"] += 1
                            log(f" βœ… Parsed ({count} colors)")
                    except Exception as e:
                        # Best-effort: one bad stylesheet must not abort the run.
                        log(f" ⚠️ Failed: {str(e)[:50]}")
                        self.warnings.append(f"Failed to fetch {css_url}: {str(e)}")
                return self._finalize_results(log, "FALLBACK EXTRACTION RESULTS")
        except Exception as e:
            log(f" ❌ Fallback extraction failed: {str(e)}")
            self.errors.append(f"Fallback extraction failed: {str(e)}")
            return {"colors": {}, "css_variables": {}, "stats": self.stats}
async def extract_css_colors(
    url: str,
    api_key: Optional[str] = None,
    log_callback: Optional[Callable[[str], None]] = None
) -> dict:
    """
    Convenience wrapper: run a one-shot CSS color extraction for *url*.

    Builds a fresh :class:`FirecrawlExtractor` (so no state leaks between
    calls) and awaits its Firecrawl-based extraction.

    Args:
        url: Website URL
        api_key: Optional Firecrawl API key
        log_callback: Optional logging callback
    Returns:
        Dict with colors, css_variables, and stats
    """
    return await FirecrawlExtractor(api_key=api_key).extract_with_firecrawl(
        url, log_callback
    )