diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..32013115bc27285ffb851f4d4b60758e96550cb9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +__pycache__/ +*.pyc +*.pyo +.env +.env.* +!.env.example +*.egg-info/ +dist/ +build/ +.pytest_cache/ +.mypy_cache/ +node_modules/ +*.log +.DS_Store +storage/cache/ +storage/exports/ +__MACOSX/ +.claude/ diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..dbe002de570467d355a5d12d7ffd2ea96dd04723 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,466 @@ +# Design System Extractor v2 — Complete Architecture + +## Overview + +A **2-stage pipeline** that extracts, analyzes, and recommends improvements to any website's design system. Combines **deterministic rule-based analysis** (free, fast, reliable) with **4 specialized LLM agents** (context-aware reasoning) — each agent does one thing well. + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ STAGE 1: EXTRACTION │ +│ (No LLM — $0.00) │ +│ │ +│ URL → Crawler → Extractor → Normalizer → Semantic Analyzer │ +│ ↓ │ +│ [HUMAN REVIEW CHECKPOINT] │ +│ Accept/reject tokens, Desktop ↔ Mobile toggle │ +├─────────────────────────────────────────────────────────────────┤ +│ STAGE 2: ANALYSIS │ +│ │ +│ Layer 1: Rule Engine ──────────────── FREE ($0.00) │ +│ ├─ WCAG Contrast (AA/AAA) │ +│ ├─ Type Scale Detection │ +│ ├─ Spacing Grid Alignment │ +│ └─ Color Statistics │ +│ │ +│ Layer 2: Benchmark Research ──────── Semi-Free │ +│ └─ Compare to Material 3, Polaris, Atlassian, etc. 
│ +│ │ +│ Layer 3: LLM Agents ─────────────── ~$0.003/run │ +│ ├─ AURORA → Brand color identification │ +│ ├─ ATLAS → Benchmark recommendation │ +│ └─ SENTINEL → Best practices validation │ +│ │ +│ Layer 4: HEAD Synthesizer ────────── Final output │ +│ └─ NEXUS → Combines everything → User-facing results │ +│ │ +│ [GRACEFUL DEGRADATION: Each layer has fallbacks] │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Stage 1: Extraction & Normalization (No LLM) + +### 1A. PageDiscoverer (Crawler) + +| | | +|---|---| +| **File** | `agents/crawler.py` | +| **Model** | None | +| **Input** | Base URL | +| **Output** | List of discovered pages (title, URL, page type) | +| **How** | Playwright browser crawling + heuristic page type detection | +| **Why no LLM** | Pure URL discovery — deterministic crawling | + +### 1B. TokenExtractor + +| | | +|---|---| +| **File** | `agents/extractor.py` + `agents/firecrawl_extractor.py` | +| **Model** | None | +| **Input** | Confirmed page URLs + Viewport (1440px desktop / 375px mobile) | +| **Output** | `ExtractedTokens` — colors, typography, spacing, radius, shadows, FG/BG pairs, CSS variables | +| **How** | 7-source extraction via Playwright | +| **Why no LLM** | DOM parsing + regex — no reasoning needed | + +**7 Extraction Sources:** +1. DOM computed styles (`getComputedStyle`) +2. CSS variables (`:root { --color: }`) +3. SVG colors (fill, stroke) +4. Inline styles (`style='color:'`) +5. Stylesheet rules (CSS files) +6. External CSS files (fetched via Firecrawl) +7. Page content scan (brute-force token search) + +### 1C. 
TokenNormalizer + +| | | +|---|---| +| **File** | `agents/normalizer.py` | +| **Model** | None | +| **Input** | Raw `ExtractedTokens` | +| **Output** | `NormalizedTokens` — deduplicated, named, confidence-tagged | +| **How** | Deduplication (exact hex + Delta-E merge), role inference from frequency, semantic naming | +| **Why no LLM** | Algorithmic deduplication — pure math | + +### 1D. SemanticColorAnalyzer + +| | | +|---|---| +| **File** | `agents/semantic_analyzer.py` | +| **Model** | None | +| **Input** | Extracted colors with usage/frequency data | +| **Output** | Semantic mapping: `{brand, text, background, border, feedback}` | +| **How** | Rule-based: buttons → brand, `color` property → text, `background-color` → background, red → error, green → success | +| **Why no LLM** | CSS property analysis — pattern matching on property names | + +### Human Review Checkpoint + +After Stage 1, the user sees: +- Desktop vs Mobile token comparison (side-by-side) +- Accept/reject individual colors, typography, spacing tokens +- Viewport toggle to switch views +- All accepted tokens flow into Stage 2 + +--- + +## Stage 2: Analysis (Hybrid — Rule Engine + LLM) + +### Layer 1: Rule Engine (FREE — No LLM) + +**File:** `core/rule_engine.py` +**Cost:** $0.00 +**Speed:** < 1 second + +The rule engine handles everything that can be computed with math. No LLM reasoning needed. + +#### What It Calculates: + +**1. Typography Analysis (TypeScaleAnalysis)** +``` +Input: [11, 12, 14, 16, 18, 22, 24, 32] (extracted font sizes) +Output: + ├─ Detected Ratio: 1.167 + ├─ Closest Standard: Minor Third (1.2) + ├─ Consistent: No (variance: 0.24) + └─ Recommendation: 1.25 (Major Third) +``` +- Compares to standard ratios: 1.067, 1.125, 1.2, 1.25, 1.333, 1.414, 1.5 +- Calculates variance to determine consistency +- 100% deterministic math + +**2. 
Color Accessibility (WCAG AA/AAA)** +``` +Input: 210 colors + 220 FG/BG pairs +Output: + ├─ AA Pass: 143 + ├─ AA Fail (real pairs): 67 + └─ Fix suggestions: #06b2c4 → #048391 (4.5:1) +``` +- WCAG 2.1 contrast ratio formula +- Tests actual FG/BG pairs found on page (not just color vs white) +- Algorithmically generates AA-compliant alternatives +- Pure math — no LLM + +**3. Spacing Grid Detection** +``` +Input: [3, 8, 10, 16, 20, 24, 32, 40] (spacing values) +Output: + ├─ Detected Base: 1px (GCD) + ├─ Grid Aligned: 0% + └─ Recommendation: 8px grid +``` +- GCD math + alignment percentage calculation + +**4. Color Statistics** +``` +Input: 143 extracted colors +Output: + ├─ Unique: 143 + ├─ Near-Duplicates: 351 + ├─ Grays: 68 | Saturated: 69 + └─ Hue Distribution: {gray: 68, blue: 14, red: 11, ...} +``` + +**5. Overall Consistency Score (0–100)** +``` +Weights: + ├─ AA Compliance: 25 pts + ├─ Type Scale Consistent: 15 pts + ├─ Base Size (≥16px): 15 pts + ├─ Spacing Grid Aligned: 15 pts + ├─ Color Count (< 20): 10 pts + └─ No Near-Duplicates: 10 pts +``` + +--- + +### Layer 2: Benchmark Research + +**File:** `agents/benchmark_researcher.py` +**Cost:** Near-free (optional HF LLM for doc extraction, mostly cached) + +**Available Benchmarks:** +| System | Short Name | +|--------|-----------| +| Material Design 3 | Material 3 | +| Apple HIG | Apple | +| Shopify Polaris | Polaris | +| Atlassian Design | Atlassian | +| IBM Carbon | Carbon | +| Tailwind CSS | Tailwind | +| Ant Design | Ant | +| Chakra UI | Chakra | + +**Process:** +1. Check 24-hour cache per benchmark +2. If expired: Fetch docs via Firecrawl → Extract specs → Cache +3. Compare user's tokens to each benchmark: + - Type ratio diff, base size diff, spacing grid diff + - Weighted similarity score +4. 
Sort by similarity (closest match first) + +**Fallback:** Hardcoded `FALLBACK_BENCHMARKS` dict — no external fetch needed + +--- + +### Layer 3: LLM Agents (4 Specialized Agents) + +**File:** `agents/llm_agents.py` + +Each agent has a single responsibility. They run after the rule engine — they reason about patterns the rule engine can't detect. + +--- + +#### Agent 1: AURORA — Brand Color Identifier + +| | | +|---|---| +| **Persona** | Senior Brand Color Analyst | +| **Model** | Qwen 72B | +| **Temperature** | 0.4 (allows creative interpretation) | +| **Input** | Color tokens with usage counts + semantic CSS analysis | +| **Output** | `BrandIdentification` | + +**Why LLM:** Requires context understanding — "33 button instances using #06b2c4 = likely brand primary." A rule engine can count colors, but can't reason about which one is the *brand* color based on where and how it's used. + +**Sample Output:** +``` +AURORA's Analysis: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Brand Primary: #06b2c4 (confidence: HIGH) + └─ 33 buttons, 12 CTAs, dominant accent + +Brand Secondary: #373737 (confidence: HIGH) + └─ 89 text elements, consistent dark tone + +Palette Strategy: Complementary +Cohesion Score: 7/10 + └─ "Clear primary-secondary hierarchy, + accent colors well-differentiated" + +Self-Evaluation: + ├─ Confidence: 8/10 + ├─ Data Quality: good + └─ Flags: [] +``` + +--- + +#### Agent 2: ATLAS — Benchmark Advisor + +| | | +|---|---| +| **Persona** | Senior Design System Benchmark Analyst | +| **Model** | Llama 3.3 70B (128K context) | +| **Temperature** | 0.25 (analytical, data-driven) | +| **Input** | User's type ratio, base size, spacing + benchmark comparison data | +| **Output** | `BenchmarkAdvice` | + +**Why LLM:** Requires trade-off reasoning. The closest mathematical match (85%) might not be the best fit if alignment effort is high. ATLAS reasons about effort vs. value — "Polaris is 87% match and your spacing already aligns. 
Material 3 is 77% but would require restructuring your grid." + +**Sample Output:** +``` +ATLAS's Recommendation: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Recommended: Shopify Polaris (87% match) + +Alignment Changes: + ├─ Type scale: 1.17 → 1.25 (effort: medium) + ├─ Spacing grid: mixed → 4px (effort: high) + └─ Base size: 16px → 16px (already aligned!) + +Pros: + ├─ Closest match to existing system + ├─ E-commerce proven at scale + └─ Well-documented, community supported + +Cons: + ├─ Spacing migration is significant effort + └─ Type scale shift affects all components + +Alternative: Material 3 (77% match) + └─ "Stronger mobile patterns, 8px grid" +``` + +--- + +#### Agent 3: SENTINEL — Best Practices Validator + +| | | +|---|---| +| **Persona** | Design System Best Practices Auditor | +| **Model** | Qwen 72B | +| **Temperature** | 0.2 (strict, consistent evaluation) | +| **Input** | Rule Engine results (typography, accessibility, spacing, color stats) | +| **Output** | `BestPracticesResult` | + +**Why LLM:** Requires impact assessment and prioritization. The rule engine says "67 colors fail AA." SENTINEL says "Brand primary failing AA affects 40% of interactive elements — fix this FIRST, it's 5 minutes of work with high impact." 
+ +**Sample Output:** +``` +SENTINEL's Audit: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Overall Score: 68/100 + +Checks: + ├─ ✅ Type Scale Standard (1.25 ratio) + ├─ ⚠️ Type Scale Consistency (variance 0.18) + ├─ ✅ Base Size Accessible (16px) + ├─ ❌ AA Compliance (67 failures) + ├─ ⚠️ Spacing Grid (0% aligned) + ├─ ⚠️ Color Count (143 unique — too many) + └─ ❌ Near-Duplicates (351 pairs) + +Priority Fixes: + #1 Fix brand color AA compliance + Impact: HIGH | Effort: 5 min + Action: #06b2c4 → #048391 + + #2 Consolidate near-duplicate colors + Impact: MEDIUM | Effort: 2 hours + Action: Merge 351 near-duplicate pairs + + #3 Align spacing to 8px grid + Impact: MEDIUM | Effort: 1 hour + Action: Snap values to [8, 16, 24, 32, 40] +``` + +--- + +#### Agent 4: NEXUS — HEAD Synthesizer (Final Agent) + +| | | +|---|---| +| **Persona** | Senior Design System Architect & Synthesizer | +| **Model** | Llama 3.3 70B (128K context) | +| **Temperature** | 0.3 (balanced synthesis) | +| **Input** | ALL Rule Engine results + AURORA + ATLAS + SENTINEL outputs | +| **Output** | `HeadSynthesis` — the final user-facing result | + +**Why LLM:** Synthesis and contradiction resolution. If ATLAS says "close to Polaris" but SENTINEL says "spacing misaligned," NEXUS reconciles: "Align to Polaris type scale now (low effort) but defer spacing migration (high effort)." + +**Sample Output:** +``` +NEXUS Final Synthesis: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Executive Summary: +"Your design system scores 68/100. Critical issue: +67 color pairs fail AA compliance. Top action: +fix brand primary contrast (5 min, high impact)." + +Scores: + ├─ Overall: 68/100 + ├─ Accessibility: 45/100 + ├─ Consistency: 75/100 + └─ Organization: 70/100 + +Benchmark Fit: + ├─ Closest: Shopify Polaris (87%) + └─ Recommendation: Adopt Polaris type scale + +Top 3 Actions: + 1. Fix brand color AA → #06b2c4 → #048391 + Impact: HIGH | Effort: 5 min + 2. Align type scale to 1.25 + Impact: MEDIUM | Effort: 1 hour + 3. 
Consolidate 143 → ~20 semantic colors + Impact: MEDIUM | Effort: 2 hours + +Color Recommendations: + ├─ ✅ brand.primary: #06b2c4 → #048391 (AA fix — auto-accept) + ├─ ✅ text.secondary: #999999 → #757575 (AA fix — auto-accept) + └─ ❌ brand.accent: #FF6B35 → #E65100 (aesthetic — user decides) + +Self-Evaluation: + ├─ Confidence: 7/10 + ├─ Data Quality: good + └─ Flags: ["high near-duplicate count may indicate extraction noise"] +``` + +--- + +## Cost Model + +| Component | LLM? | Cost per Run | +|-----------|-------|-------------| +| Stage 1 (Crawl + Extract + Normalize) | No | $0.00 | +| Rule Engine | No | $0.00 | +| Benchmark Research | Optional | ~$0.0005 | +| AURORA (Qwen 72B) | Yes | ~$0.0005 | +| ATLAS (Llama 3.3 70B) | Yes | ~$0.0005 | +| SENTINEL (Qwen 72B) | Yes | ~$0.0005 | +| NEXUS (Llama 3.3 70B) | Yes | ~$0.001 | +| **Total** | | **~$0.003** | + +All LLM inference via HuggingFace Inference API (PRO subscription at $9/month includes generous free tier for these models). + +--- + +## Graceful Degradation + +The system is designed to **always produce output**, even when components fail: + +| If This Fails... 
| Fallback | +|-------------------|----------| +| Firecrawl (CSS fetch) | Use DOM-only extraction | +| Benchmark fetch | Use hardcoded `FALLBACK_BENCHMARKS` | +| AURORA (brand ID) | Skip brand analysis, use defaults | +| ATLAS (benchmark advice) | Skip recommendation, show raw comparisons | +| SENTINEL (practices) | Use rule engine score directly | +| NEXUS (synthesis) | `create_fallback_synthesis()` from rule engine data | +| Entire LLM layer | Full rule-engine-only analysis still works | + +--- + +## Key Data Structures + +``` +ExtractedTokens (Stage 1 raw) +├─ colors: dict[ColorToken] +├─ typography: dict[TypographyToken] +├─ spacing: dict[SpacingToken] +├─ radius: dict[RadiusToken] +├─ shadows: dict[ShadowToken] +├─ fg_bg_pairs: list[dict] ← for real AA checking +└─ css_variables: dict[str, str] ← CSS var mappings + +NormalizedTokens (Stage 1 clean) +├─ colors, typography, spacing, radius, shadows (deduplicated) +├─ font_families: dict[FontFamily] +├─ detected_spacing_base: int (4 or 8) +└─ detected_naming_convention: str + +RuleEngineResults (Layer 1) +├─ typography: TypeScaleAnalysis +├─ accessibility: list[ColorAccessibility] +├─ spacing: SpacingGridAnalysis +├─ color_stats: ColorStatistics +├─ aa_failures: int +└─ consistency_score: int (0-100) + +HeadSynthesis (Final output) +├─ executive_summary: str +├─ scores: {overall, accessibility, consistency, organization} +├─ benchmark_fit: {closest, similarity, recommendation} +├─ brand_analysis: {primary, secondary, cohesion} +├─ top_3_actions: [{action, impact, effort, details}] +├─ color_recommendations: [{role, current, suggested, reason, accept}] +├─ type_scale_recommendation: dict +├─ spacing_recommendation: dict +└─ self_evaluation: {confidence, reasoning, data_quality, flags} +``` + +--- + +## Tech Stack + +| Component | Technology | +|-----------|-----------| +| Frontend | Gradio 4.x | +| Browser Automation | Playwright (Chromium) | +| Web Scraping | Firecrawl | +| LLM Inference | HuggingFace Inference 
API | +| Models | Qwen 72B, Llama 3.3 70B | +| Color Math | Custom WCAG implementation | +| Deployment | Docker → HuggingFace Spaces | diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000000000000000000000000000000000..3376053d7e952c10427f14f41424d0a2dea32ad4 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,1305 @@ +# Design System Extractor v2 — Project Context + +## Overview + +A multi-agent system that extracts, analyzes, and recommends improvements for design systems from websites. The system operates in two stages: + +1. **Stage 1 (Deterministic)**: Extract CSS values → Normalize → Rule Engine analysis (free, no LLM) +2. **Stage 2 (LLM-powered)**: Brand identification → Benchmark comparison → Best practices → Final synthesis + +--- + +## CURRENT STATUS: BROKEN — NEEDS RETHINK + +### What's Wrong (observed from real site tests) + +**Tested sites**: sixflagsqiddiyacity.com, others + +#### Problem 1: Color Naming is Inconsistent (CRITICAL) +Three competing naming systems produce mixed output: + +| Source | Convention | Example | +|--------|-----------|---------| +| `normalizer.py` (line 266-275) | Word-based: light/dark/base | `color.blue.light` | +| `app.py _generate_color_name_from_hex()` | Numeric: 50-900 | `color.blue.500` | +| AURORA LLM agent | Anything it wants | `brand.primary` | + +**Result in Figma**: `blue.300`, `blue.dark`, `blue.light`, `blue.base` — ALL IN THE SAME EXPORT. Unusable. 
+ +#### Problem 2: Border Radius is Broken (CRITICAL) +- `md = 1616` (concatenated garbage) +- `full = 50` (should be 9999px) +- Nested structures: `radius.full.9999` and `radius.full.100` incorrectly inside `radius.full` +- Multi-value radii like `"0px 0px 16px 16px"` passed as-is — Figma can't use these +- **Root cause**: Normalizer doesn't process radius at all (line 94-97 just stores raw values) + +#### Problem 3: LLM Agents Are Single-Shot, No Reasoning (CRITICAL) +- AURORA does one LLM call → returns whatever it returns → no verification +- SENTINEL does one LLM call → scores and checks not validated against actual data +- NEXUS does one LLM call → synthesizes without checking if inputs make sense +- No ReAct/ToT/reflection loop. No self-correction. No critic. +- Models (Qwen 72B, Llama 3.3 70B via HF Inference) may not follow structured output reliably + +#### Problem 4: AURORA Only Names ~10 Colors +- Prompt says "Suggest Semantic Names for top 10 most-used colors" +- Remaining 20+ colors keep their normalizer names (word-based) +- AURORA doesn't see existing names — only receives hex + usage count +- No cleanup pass exists to unify naming after AURORA + +#### Problem 5: Shadow Ordering Wrong +- xs has blur=25px, sm has blur=30px, md has blur=80px — non-progressive +- Shadow naming (xs/sm/md/lg/xl) doesn't match actual elevation hierarchy +- No validation that shadow progression makes physical sense + +#### Problem 6: Font Family Detection +- All fonts showing as "sans-serif" (the fallback) instead of actual font name +- Extraction gets computed style which resolves to generic family + +--- + +## ARCHITECTURE RETHINK PLAN + +### Phase 1: Fix Stage 2 (LLM Agents) — ADD AGENTIC REASONING + +Current Stage 2 is just 4 single-shot LLM calls. Needs proper agentic framework. 
+ +#### Current (Broken): +``` +Color Data ──→ [Single LLM Call] ──→ Output (hope for the best) +``` + +#### Target (With Reasoning): +``` +Color Data ──→ [THINK] ──→ [ACT] ──→ [OBSERVE] ──→ [REFLECT] ──→ [VERIFY] ──→ Output + │ │ │ │ │ + │ │ │ │ Does it pass + │ │ │ Is this validation? + │ │ Check against consistent? If no, loop + │ Generate real data + │ initial + │ analysis + Plan approach +``` + +#### Option A: ReAct Framework (Recommended for AURORA + SENTINEL) +``` +Thought: I need to identify brand colors from 30 extracted colors +Action: Analyze usage frequency — #005aa3 used 47x in buttons/CTAs +Observation: #005aa3 is clearly the primary CTA color +Thought: Now check if secondary color exists — look for headers/nav +Action: #ff0000 used 23x in headers → likely brand secondary +Observation: Red + Blue = complementary strategy +Thought: Now I need to name ALL colors consistently using numeric shades +Action: Generate full naming map using Tailwind convention (50-900) +Observation: 28 colors named, all using numeric shades +Thought: Let me verify — any naming conflicts? Any mixed conventions? +Action: Self-check naming consistency +Final Answer: {complete consistent output} +``` + +#### Option B: Tree of Thought (For NEXUS synthesis) +``` +Branch 1: Weight accessibility heavily → overall score 45 +Branch 2: Weight consistency heavily → overall score 68 +Branch 3: Balanced weighting → overall score 55 +Evaluate: Which scoring best reflects reality? +Select: Branch 3 with adjustments +``` + +#### Option C: Critic/Verifier Pattern (For ALL agents) +``` +Agent Output ──→ [CRITIC LLM] ──→ Pass? ──→ Final Output + │ │ + │ No: feedback + │ │ + │ ▼ + │ [RETRY with feedback] + │ + Checks: + - Naming convention consistent? + - Scores match actual data? + - All required fields present? + - Values in valid ranges? 
+``` + +### Proposed New Stage 2 Architecture: + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ STAGE 2: AGENTIC ANALYSIS │ +│ │ +│ ┌───────────────────────────────────────────────────┐ │ +│ │ STEP 1: AURORA (ReAct, 2-3 reasoning steps) │ │ +│ │ Think → Identify brand → Name ALL colors │ │ +│ │ → Self-verify naming consistency │ │ +│ │ → Critic check → Retry if needed │ │ +│ └───────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────────┼───────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ ATLAS │ │ SENTINEL │ │ VALIDATOR │ │ +│ │ Benchmark │ │ Best Prac │ │ (Critic) │ │ +│ │ (ReAct) │ │ (ReAct) │ │ Checks ALL │ │ +│ │ │ │ │ │ outputs │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ │ +│ │ │ │ │ +│ └────────────────┼─────────────────┘ │ +│ ▼ │ +│ ┌─────────────┐ │ +│ │ NEXUS │ │ +│ │ (ToT) │ │ +│ │ + Critic │ │ +│ └─────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Model Selection Rethink + +Current models via HuggingFace Inference API: +| Agent | Current Model | Problem | +|-------|--------------|---------| +| AURORA | Qwen 72B | Doesn't follow structured output reliably | +| ATLAS | Llama 3.3 70B | Adequate for comparison | +| SENTINEL | Qwen 72B | Doesn't validate against actual data | +| NEXUS | Llama 3.3 70B | Single-shot synthesis, no verification | + +**Models to evaluate:** +- **Qwen 2.5 72B Instruct** — Better instruction following than Qwen 72B +- **Mixtral 8x22B** — Good at structured JSON output +- **DeepSeek V3** — Strong at reasoning chains +- **Llama 3.1 405B** — Largest open model, best reasoning (but slow/expensive) +- **Command R+** — Designed for tool use and structured output + +**Key question**: Should we use ONE model for all agents (consistency) or specialized models per task? + +### Phase 2: Fix Stage 1 (After Stage 2 is stable) + +#### Normalizer Fixes Needed: +1. 
**Unify color shade convention** — Pick ONE system (numeric 50-900 recommended) +2. **Add radius normalization** — Currently just stores raw values +3. **Handle multi-value radius** — `"0px 0px 16px 16px"` needs decomposition +4. **Deduplicate radius values** — Multiple entries for same visual radius + +#### Rule Engine Fixes Needed: +1. **Base font size filter** — DONE (>= 10px filter applied) +2. **Shadow progression validation** — Check blur/offset increase with elevation +3. **Radius grid alignment** — Check if radii follow base-4/base-8 + +#### Export Fixes Needed: +1. **Validation layer before export** — Catch mixed conventions, nested garbage +2. **Radius structure flattening** — Never nest tokens inside tokens +3. **Unit consistency** — All radius values must have `px` units + +--- + +## FILE STRUCTURE + +``` +design-system-extractor-v2-hf-fix/ +├── app.py # Main Gradio app, orchestrates everything +├── CLAUDE.md # THIS FILE — project context and plan +│ +├── agents/ +│ ├── crawler.py # Page discovery (finds links on site) +│ ├── extractor.py # Playwright-based CSS extraction +│ ├── firecrawl_extractor.py # Firecrawl CSS deep extraction +│ ├── normalizer.py # Token deduplication and naming +│ ├── llm_agents.py # AURORA, ATLAS, SENTINEL, NEXUS agents +│ ├── stage2_graph.py # LangGraph orchestration for Stage 2 +│ ├── advisor.py # Upgrade advisor +│ ├── benchmark_researcher.py # Benchmark data collection +│ └── semantic_analyzer.py # Semantic CSS analysis +│ +├── core/ +│ ├── token_schema.py # Pydantic models for all token types +│ ├── color_utils.py # Color parsing, contrast, ramp generation +│ ├── rule_engine.py # Deterministic analysis (type scale, WCAG, spacing) +│ ├── hf_inference.py # HuggingFace Inference API client +│ ├── preview_generator.py # HTML preview generation +│ ├── validation.py # Output validation +│ └── logging.py # Logging utilities +│ +├── config/ +│ └── settings.py # Configuration (viewports, timeouts, thresholds) +│ +├── tests/ +│ ├── 
test_stage1_extraction.py # 82 deterministic tests +│ ├── test_agent_evals.py # 27 LLM agent schema/behavior tests +│ └── test_stage2_pipeline.py # Pipeline integration tests +│ +└── output_json/ + ├── file (16).json # Latest extraction output (sixflags) + └── figma-plugin-extracted/ # Figma plugin source + └── figma-design-token-creator 5/ + └── src/code.js # Figma plugin main code +``` + +--- + +## DATA FLOW (Current vs Target) + +### Current Flow (Broken): +``` +Extraction → Normalizer (word shades) → Rule Engine → LLM (single-shot) + ↓ ↓ ↓ ↓ + Raw CSS color.blue.light Stats only Unverified output + values color.neutral.dark No radius Mixed naming + No radius processing validation No self-correction + ↓ + Export (merges 3 naming conventions → chaos) +``` + +### Target Flow: +``` +Extraction → Normalizer (numeric shades, radius too) → Rule Engine + ↓ ↓ ↓ + Raw CSS color.blue.500 Stats + validation + values color.neutral.200 Shadow progression + radius.md = 8px Radius grid check + ↓ ↓ + LLM Agents (ReAct framework) │ + ↓ │ + AURORA: Think → Act → Observe → Verify │ + SENTINEL: Think → Check data → Score │ + NEXUS: ToT → Select best synthesis │ + ↓ │ + CRITIC/VALIDATOR ←────────────────────────────┘ + ↓ (validates against Stage 1 data) + Pass? → Export + Fail? → Retry with feedback +``` + +--- + +## WHAT EACH AGENT SHOULD ACTUALLY DO + +### AURORA (Brand Identifier) — Needs ReAct +**Current**: Single-shot, names 10 colors, no verification +**Target**: +- Step 1 (Think): Plan approach based on color count and usage patterns +- Step 2 (Act): Identify brand primary/secondary/accent from usage evidence +- Step 3 (Observe): Check if identification makes sense (is primary really the most-used CTA color?) +- Step 4 (Act): Name ALL colors using consistent numeric convention (50-900) +- Step 5 (Verify): Self-check — are all names consistent? Any mixed conventions? +- Step 6 (Critic): External validation — does output match schema? Names all `color.{family}.{shade}`? 
+ +### SENTINEL (Best Practices) — Needs ReAct + Data Grounding +**Current**: Single-shot, scores without verifying against actual data +**Target**: +- Step 1 (Think): What checks apply given the data? +- Step 2 (Act): Score each check CITING SPECIFIC DATA from rule engine +- Step 3 (Observe): Does my score match what the data shows? +- Step 4 (Verify): If rule engine says 5 AA failures, my AA check MUST be "fail" not "pass" +- Step 5 (Critic): Cross-check scores against rule engine numbers + +### NEXUS (Synthesizer) — Needs ToT +**Current**: Single-shot synthesis, no evaluation of alternatives +**Target**: +- Branch 1: Accessibility-focused scoring (weight AA failures heavily) +- Branch 2: Consistency-focused scoring (weight naming/grid alignment) +- Branch 3: Balanced approach +- Evaluate: Which branch best reflects reality? +- Critic: Does final score contradict any agent's findings? + +--- + +## KNOWN FIXES ALREADY APPLIED + +### 1. Base Font Size Detection (FIXED in rule_engine.py) +Filters out sizes < 10px before detecting base size. + +### 2. Garbage Color Names (PARTIALLY FIXED in app.py) +Detects `firecrawl.N` names and regenerates — but the replacement still creates mixed conventions. + +### 3. Visual Spec Error Handling (FIXED in code.js) +Defensive error handling for undefined errors. 
+ +--- + +## IDEAL OUTPUT REFERENCE + +What the exported JSON SHOULD look like (for Figma): + +```json +{ + "color": { + "brand": { + "primary": { "$type": "color", "$value": "#005aa3" }, + "secondary": { "$type": "color", "$value": "#ff0000" } + }, + "text": { + "primary": { "$type": "color", "$value": "#000000" }, + "secondary": { "$type": "color", "$value": "#999999" }, + "muted": { "$type": "color", "$value": "#cccccc" } + }, + "background": { + "primary": { "$type": "color", "$value": "#ebedef" }, + "secondary": { "$type": "color", "$value": "#bfbfbf" } + }, + "blue": { + "50": { "$type": "color", "$value": "#b9daff" }, + "300": { "$type": "color", "$value": "#7fdbff" }, + "500": { "$type": "color", "$value": "#6f7597" }, + "800": { "$type": "color", "$value": "#2c3e50" } + }, + "neutral": { + "200": { "$type": "color", "$value": "#b2b8bf" }, + "700": { "$type": "color", "$value": "#333333" } + } + }, + "radius": { + "none": { "$type": "dimension", "$value": "0px" }, + "sm": { "$type": "dimension", "$value": "2px" }, + "md": { "$type": "dimension", "$value": "4px" }, + "lg": { "$type": "dimension", "$value": "8px" }, + "xl": { "$type": "dimension", "$value": "16px" }, + "2xl": { "$type": "dimension", "$value": "24px" }, + "full": { "$type": "dimension", "$value": "9999px" } + } +} +``` + +**Key rules**: +- Palette colors ALWAYS use numeric shades (50-900) +- Role colors use semantic names (primary, secondary, muted) +- Radius is FLAT — never nested, always single px values +- No mixed conventions in the same category + +--- + +## FILES TO UPDATE ON HUGGINGFACE + +When making changes, these files need updating: +1. `app.py` — Main application logic +2. `core/rule_engine.py` — Deterministic analysis +3. `agents/llm_agents.py` — LLM agent prompts and reasoning +4. `agents/normalizer.py` — Token naming and dedup +5. `agents/extractor.py` — CSS extraction +6. 
`output_json/figma-plugin-extracted/figma-design-token-creator 5/src/code.js` — Figma plugin + +--- + +## CRITICAL DISCOVERY: TWO COMPETING STAGE 2 ARCHITECTURES + +The codebase has **two parallel Stage 2 systems** that partially overlap: + +### System A: `llm_agents.py` (4 Specialized Agents) +``` +AURORA (brand ID) → ATLAS (benchmark) → SENTINEL (best practices) → NEXUS (synthesis) +``` +- Each agent has a focused prompt + dedicated data class +- Called from `app.py` directly via `hf_client.complete_async()` +- Uses `Qwen/Qwen2.5-72B-Instruct` and `Llama-3.3-70B-Instruct` +- **Problem**: Single-shot calls, no reasoning, no verification + +### System B: `stage2_graph.py` (LangGraph Parallel) +``` +LLM1 (Qwen) ──┐ + ├──→ HEAD ──→ Final +LLM2 (Llama) ─┘ +Rule Engine ───┘ +``` +- Two generic "analyst" LLMs run in parallel + rule engine +- Uses LangGraph `StateGraph` with `asyncio.gather()` +- HEAD compiler merges results +- **Problem**: Generic prompts, no specialization, same analysis duplicated + +### Decision: Merge into ONE system with ReAct reasoning + +Keep System A's **specialized agents** (AURORA, SENTINEL, NEXUS) but add System B's **parallel execution** and **LangGraph state management**. Drop the duplicate generic analysts (LLM1/LLM2). + +--- + +## DETAILED AGENTIC ARCHITECTURE FOR STAGE 2 + +### Design Principles +1. **ReAct (Reasoning + Acting)**: Each agent THINKS before it acts, OBSERVES the result, REFLECTS on quality +2. **Critic/Verifier**: A lightweight validation pass after each agent output +3. **Grounded Reasoning**: LLMs must cite specific data from Stage 1, not hallucinate +4. **Fail-Safe Defaults**: If LLM fails or produces garbage, fall back to rule-engine defaults +5. 
**Single Convention**: ALL naming uses numeric shades (50-900), enforced post-LLM + +### New Stage 2 Flow + +``` +Stage 1 Output (NormalizedTokens + RuleEngineResults) + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ PRE-PROCESSING (Deterministic, no LLM) │ +│ • Unify all color names to numeric shades (50-900) │ +│ • Normalize radius values (flatten, deduplicate) │ +│ • Validate shadow progression (sort by blur) │ +│ • Build structured data packets for each agent │ +└──────────────────────────────────────────────────────────────┘ + │ + ┌───────────┼───────────┐ + ▼ ▼ ▼ +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ AURORA │ │ ATLAS │ │ SENTINEL │ +│ (ReAct) │ │ (Single) │ │ (ReAct) │ +│ 2 steps │ │ 1 step │ │ 2 steps │ +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ + ▼ ▼ ▼ +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ CRITIC 1 │ │ (no critic │ │ CRITIC 2 │ +│ Validate │ │ needed) │ │ Cross-ref │ +│ naming │ │ │ │ with data │ +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ + └───────────────┼───────────────┘ + ▼ + ┌─────────────────┐ + │ NEXUS │ + │ (ToT: 2 branches, pick best) │ + └────────┬────────┘ + ▼ + ┌─────────────────┐ + │ POST-VALIDATION│ + │ (Deterministic)│ + │ • Names consistent? │ + │ • Scores in range? │ + │ • All fields present?│ + └─────────────────┘ +``` + +### AURORA — Brand Identifier (ReAct, 2 LLM Calls) + +**Why ReAct**: Brand identification requires reasoning about CONTEXT (why a color is used 47x on buttons) not just statistics. The model needs to think step-by-step. + +**Step 1: Identify + Name (Main Call)** +``` +System: You are AURORA. You will receive color data with usage context. + +TASK (do these in order, show your reasoning): + +THINK: Look at the color usage data. Which colors appear most in + interactive elements (buttons, links, CTAs)? +ACT: Identify brand primary, secondary, accent. +THINK: Now look at ALL colors. Group them by hue family. 
+ACT: Assign EVERY color a name using this EXACT convention: + - Role colors: color.{role}.{shade} where role=brand/text/background/border/feedback + - Palette colors: color.{hue}.{shade} where hue=red/orange/yellow/green/teal/blue/purple/pink/neutral + - Shade MUST be numeric: 50/100/200/300/400/500/600/700/800/900 + - NEVER use words like "light", "dark", "base" for shades +OBSERVE: Check your naming. Are ALL names using numeric shades? + Any duplicates? Any conflicts? + +Output JSON with brand_colors + complete naming_map for ALL colors. +``` + +**Step 2: Critic Check (Lightweight Call or Rule-Based)** +```python +# Can be done WITHOUT an LLM call — just Python validation: +def validate_aurora_output(output: dict, input_colors: list[str]) -> tuple[bool, list[str]]: + errors = [] + naming_map = output.get("naming_map", {}) + + # Check 1: All input colors have names + for hex_val in input_colors: + if hex_val not in naming_map: + errors.append(f"Missing name for {hex_val}") + + # Check 2: No word-based shades + for hex_val, name in naming_map.items(): + parts = name.split(".") + last = parts[-1] + if last in ("light", "dark", "base", "muted", "deep"): + errors.append(f"Word shade '{last}' in {name} — must be numeric") + + # Check 3: No duplicate names + names = list(naming_map.values()) + dupes = [n for n in names if names.count(n) > 1] + if dupes: + errors.append(f"Duplicate names: {set(dupes)}") + + return len(errors) == 0, errors +``` + +If validation fails → retry ONCE with error feedback appended to prompt. If still fails → fall back to deterministic HSL-based naming (already in `color_utils.py`). + +### SENTINEL — Best Practices (ReAct, 2 LLM Calls) + +**Why ReAct**: Scoring must be GROUNDED in actual data. The model needs to cite specific numbers, not make up scores. + +**Step 1: Score + Prioritize (Main Call)** +``` +System: You are SENTINEL. You MUST cite specific data for every score. 
+ +INPUT DATA (from Rule Engine — these are FACTS, not opinions): +- AA Pass: 18 of 25 colors (72%) +- AA Fail: 7 colors (list: #ff0000 3.2:1, #ffdc00 1.8:1, ...) +- Type Scale Ratio: 1.18 (variance: 0.22) +- Base Font: 14px +- Spacing: 8px grid, 85% aligned +- Shadows: 5 defined, blur progression: 25→30→80→80→90 (non-monotonic) +- Near-duplicates: 3 pairs + +TASK (cite data for EVERY check): + +CHECK 1 - AA Compliance: + THINK: Rule Engine says 7 of 25 fail. That's 28% failure rate. + SCORE: "fail" — cite "7 colors fail AA, including brand primary #ff0000 (3.2:1)" + +CHECK 2 - Type Scale: + THINK: Ratio 1.18 is not standard (nearest: 1.2 Minor Third). Variance 0.22 > 0.15. + SCORE: "warn" — cite "1.18 is close to Minor Third but inconsistent (variance 0.22)" + +... (continue for all 8 checks) + +THEN calculate overall_score using the weighting: + AA: 25pts × (pass%/100) = 25 × 0.72 = 18 + Type Scale Consistent: ... + ... total = sum + +Output JSON with checks, overall_score, priority_fixes. 
+``` + +**Step 2: Cross-Reference Critic (Rule-Based)** +```python +def validate_sentinel_output(output: dict, rule_engine: RuleEngineResults) -> tuple[bool, list[str]]: + errors = [] + checks = output.get("checks", {}) + + # If rule engine found AA failures, sentinel MUST mark aa_compliance as fail/warn + aa_failures = len([a for a in rule_engine.accessibility if not a.passes_aa_normal]) + if aa_failures > 0 and checks.get("aa_compliance", {}).get("status") == "pass": + errors.append(f"Sentinel says AA passes but rule engine found {aa_failures} failures") + + # Score must be 0-100 + score = output.get("overall_score", -1) + if not (0 <= score <= 100): + errors.append(f"Score {score} out of range") + + # If many failures, score can't be high + fail_count = sum(1 for c in checks.values() if isinstance(c, dict) and c.get("status") == "fail") + if fail_count >= 3 and score > 70: + errors.append(f"Score {score} too high with {fail_count} failures") + + return len(errors) == 0, errors +``` + +### ATLAS — Benchmark Advisor (Single Call, No ReAct Needed) + +**Why single call**: This agent receives well-structured benchmark comparison data and just needs to pick the best fit. The reasoning is straightforward comparison. + +Keep current implementation but improve prompt to: +1. Explicitly output the top 3 benchmarks ranked +2. Include specific numeric diffs for each +3. Cap alignment changes at 4 + +### NEXUS — HEAD Synthesizer (ToT: 2 Branches) + +**Why Tree of Thought**: The synthesizer needs to weigh competing priorities. Should it emphasize accessibility (SENTINEL's input) or brand fidelity (AURORA's input)? ToT lets it explore both and pick the best. + +**Branch 1: Accessibility-First Scoring** +``` +Weight accessibility at 40%, consistency at 30%, organization at 30%. +If SENTINEL found 7 AA failures → accessibility score tanks → overall score lower. 
+Result: overall ~55 +``` + +**Branch 2: Balanced Scoring** +``` +Weight accessibility at 30%, consistency at 35%, organization at 35%. +Same data but organization counts more. +Result: overall ~65 +``` + +**Selection**: Pick the branch that: +1. Doesn't contradict any agent's hard failures (if SENTINEL says AA fails, score CAN'T say accessibility is "good") +2. Produces actionable top-3 actions (not generic) +3. Has color recommendations with specific hex values + +**Implementation**: This can be done as a SINGLE LLM call with explicit instruction: + +``` +TASK: You will synthesize from two perspectives. + +PERSPECTIVE A (Accessibility-First): Weight AA compliance heavily. +Calculate scores with accessibility=40%, consistency=30%, org=30%. + +PERSPECTIVE B (Balanced): Equal weights. +Calculate scores with accessibility=33%, consistency=33%, org=33%. + +THEN: Compare both perspectives. Choose the one that: +1. Better reflects the ACTUAL data (don't ignore failures) +2. Produces the most actionable top-3 list +3. Is internally consistent + +Output your CHOSEN perspective's scores + explain WHY you chose it. +``` + +### Model Selection (Final Decision) + +After reviewing all agents' needs: + +| Agent | Model | Reasoning | +|-------|-------|-----------| +| AURORA | `Qwen/Qwen2.5-72B-Instruct` | Best at structured JSON, good reasoning | +| ATLAS | `meta-llama/Llama-3.3-70B-Instruct` | 128K context for benchmark data | +| SENTINEL | `Qwen/Qwen2.5-72B-Instruct` | Methodical, follows rubrics well | +| NEXUS | `meta-llama/Llama-3.3-70B-Instruct` | Good synthesis, large context | + +**Keep current models** — the problem isn't the models, it's the prompting strategy (single-shot vs ReAct) and lack of validation. + +### Cost Budget Per Extraction + +| Step | LLM Calls | Est. Tokens | Est. 
Cost | +|------|-----------|-------------|-----------| +| AURORA main | 1 | ~2K in, ~1K out | $0.001 | +| AURORA retry (10% of time) | 0.1 | ~2K in, ~1K out | $0.0001 | +| ATLAS | 1 | ~1.5K in, ~0.8K out | $0.001 | +| SENTINEL main | 1 | ~2K in, ~1K out | $0.001 | +| SENTINEL retry (10% of time) | 0.1 | ~2K in, ~1K out | $0.0001 | +| NEXUS | 1 | ~3K in, ~1.2K out | $0.002 | +| **Total** | **~4.2** | **~14K** | **~$0.005** | + +Well within HF free tier ($0.10/mo). + +--- + +## IMPLEMENTATION PLAN + +### Step 1: Consolidate Stage 2 into ONE system +- Keep `llm_agents.py` as the agent definitions (AURORA, SENTINEL, NEXUS) +- Use `stage2_graph.py` for orchestration (parallel AURORA+ATLAS+SENTINEL, then NEXUS) +- Delete the duplicate generic LLM1/LLM2 analyst nodes +- Single entry point: `run_stage2_analysis()` + +### Step 2: Add Pre-Processing Layer +- Before any LLM call, run deterministic cleanup: + - Unify ALL color names to numeric shades (50-900) + - Flatten and deduplicate radius values + - Sort shadows by blur radius + - Build structured data packets for each agent + +### Step 3: Rewrite AURORA with ReAct Prompt +- New prompt: Think → Identify brand → Name ALL colors → Self-verify +- Add `validate_aurora_output()` rule-based critic +- Retry once on validation failure +- Fallback to `_generate_color_name_from_hex()` if LLM fails + +### Step 4: Rewrite SENTINEL with Grounded Scoring +- New prompt: Must cite rule-engine data for every check +- Add `validate_sentinel_output()` cross-reference critic +- Ensure scores match actual data (no inflated pass when data says fail) + +### Step 5: Rewrite NEXUS with ToT +- Two-perspective evaluation in single prompt +- Must choose perspective and explain why +- Post-validation: scores internally consistent, actions are specific + +### Step 6: Add Post-Validation Layer +- After all agents complete, run deterministic checks: + - All color names follow `color.{family}.{shade}` pattern + - All scores are in valid ranges + - No 
contradictions between agents + - All required fields present +- If post-validation fails, apply rule-based fixes (not another LLM call) + +### Step 7: Fix Normalizer (Stage 1) +- Unify `_generate_color_name_from_value()` to use numeric shades only +- Add radius normalization (flatten, single-value, deduplicate) +- Handle multi-value radius (`"0px 0px 16px 16px"` → individual values or skip) + +### Step 8: Fix Export Layer +- Validation before JSON export +- Ensure DTCG format (`$type`, `$value`) +- Flat radius (never nested tokens inside tokens) +- Consistent units (all px for dimensions) + +--- + +## STAGE 1 AUDIT: WHAT IS VALID vs WHAT NEEDS RETHINKING + +Stage 1 feeds Stage 2 — if Stage 1 produces garbage, no amount of agentic reasoning in Stage 2 can fix it. Let's audit every rule-based component honestly. + +### OVERALL VERDICT: Stage 1 is ~60% correct, 40% broken/missing + +The extraction (Playwright CSS scraping) is solid. The normalizer and rule engine have real problems that corrupt data BEFORE any LLM ever sees it. + +--- + +### Component 1: Extractor (`agents/extractor.py`) — ✅ MOSTLY VALID + +**What it does**: Playwright visits pages, extracts computed CSS styles for every element. +**What it produces**: `ExtractedTokens` — lists of `ColorToken`, `TypographyToken`, `SpacingToken`, `RadiusToken`, `ShadowToken`. + +**What's working**: +- Color extraction: Gets hex values, usage frequency, CSS property context (background-color, color, border-color), element types (button, h1, p). This is exactly what Stage 2 needs. +- Typography extraction: Gets font-family, font-size, font-weight, line-height, element context. Solid. +- Spacing extraction: Gets margin/padding/gap values with px conversion. Solid. + +**What's broken**: +- **Font family**: Returns `"sans-serif"` (the computed fallback) instead of `"Inter"` (the actual font). This is a browser behavior issue — `getComputedStyle()` resolves the font stack to the generic family. 
**Fix needed**: Use `document.fonts.check()` or extract from CSS `font-family` declarations before resolution. +- **Radius**: Extracts raw CSS values including multi-value shorthand like `"0px 0px 16px 16px"` and percentage values like `"50%"`. The RadiusToken has `value: str` and `value_px: Optional[int]` but the extractor doesn't parse multi-value or percentage. **Fix needed**: Parse in extractor or normalizer. +- **Shadows**: Extracts full CSS shadow string but parsing into components (offset_x, offset_y, blur, spread, color) is unreliable. Some shadows have `None` for all parsed fields. **Fix needed**: Better CSS shadow parser. + +**Verdict**: Extraction is the least broken part. Font family is the biggest issue but it's a well-known Playwright limitation with known workarounds. + +--- + +### Component 2: Normalizer (`agents/normalizer.py`) — ❌ NEEDS MAJOR RETHINK + +**What it does**: Takes raw `ExtractedTokens` lists → deduplicates → names → outputs `NormalizedTokens` dicts. + +**What's working**: +- Color deduplication by exact hex: Correct. Merges frequency/contexts. +- Similar color merging (RGB Euclidean distance < 10): Reasonable threshold, works. +- Typography dedup by unique `family|size|weight|lineHeight`: Correct. +- Spacing dedup and base-8 alignment preference: Correct. +- Confidence scoring by frequency (10+=high, 3-9=medium, 1-2=low): Reasonable. 
+ +**What's BROKEN**: + +#### Problem 2A: Color Naming — TWO COMPETING FUNCTIONS + +``` +_generate_color_name(color, role) → line 236-256 + Input: color + inferred role (from CSS context keywords) + Output: "color.{role}.{shade}" where shade = 50/200/500/700/900 + Uses: NUMERIC shades based on luminance buckets ✅ + +_generate_color_name_from_value(color) → line 258-275 + Input: color (no role found) + Output: "color.{category}.{shade}" where shade = light/base/dark + Uses: WORD shades ❌ ← THIS IS THE ROOT OF THE NAMING PROBLEM +``` + +**The irony**: The first function (with role) already uses numeric shades! But only colors where `_infer_color_role()` finds a keyword match get numeric names. All other colors fall through to the word-based function. + +**`_infer_color_role()` (line 220-234)**: Searches color.contexts + color.elements for keywords like "primary", "button", "background". **Problem**: Most extracted colors don't have semantic class names — they come from computed styles on generic elements. A `
` with `background-color: #005aa3` has no "primary" keyword anywhere. So MOST colors fall through to word-based naming. + +**How often does role inference work?** Rough estimate: +- Sites with BEM/utility classes (Tailwind, Bootstrap): ~40% of colors get roles +- Sites with generic/minified classes: ~5-10% of colors get roles +- Remaining get word-based names → causes mixed convention chaos + +**Fix needed**: Remove `_generate_color_name_from_value()` entirely. Make `_generate_color_name()` the only path, and if no role is inferred, use hue-family + numeric shade (which `_generate_color_name_from_hex()` in app.py already does correctly). + +#### Problem 2B: Radius — NO PROCESSING AT ALL + +```python +# Line 93-97: Just stores raw values +radius_dict = {} +for r in extracted.radius: + key = f"radius-{r.value}" # Raw CSS value as dict key! + radius_dict[key] = r +``` + +**What this produces**: +- `"radius-8px"` → ok +- `"radius-0px 0px 16px 16px"` → garbage key, multi-value +- `"radius-50%"` → percentage, Figma can't use +- `"radius-16px"` AND `"radius-1rem"` → duplicates (both = 16px) + +**What's missing**: +1. No value parsing (multi-value → skip or take max) +2. No unit normalization (%, rem, em → px) +3. No deduplication by resolved px value +4. No semantic naming (none/sm/md/lg/xl/full) +5. No sorting by size + +#### Problem 2C: Shadows — NO PROCESSING AT ALL + +```python +# Line 99-102: Hash-based key, no analysis +shadows_dict = {} +for s in extracted.shadows: + key = f"shadow-{hash(s.value) % 1000}" # Meaningless key! + shadows_dict[key] = s +``` + +**What's missing**: +1. No deduplication by visual similarity +2. No sorting by elevation (blur radius) +3. No semantic naming (xs/sm/md/lg/xl) +4. No validation of shadow progression (blur should increase with elevation level) +5. No filtering of garbage shadows (blur=0, identical to another, etc.) 
+ +#### Problem 2D: Typography Naming — COLLISION RISK + +```python +# Line 310-339: Size-tier names can collide +"font.{category}.{size_tier}" +# Two different h2 styles (24px/700 and 24px/400) both become "font.heading.lg" +``` + +The dedup key at line 86 is `suggested_name or f"{font_family}-{font_size}"`, so if two styles get the SAME suggested name, the second overwrites the first silently. + +--- + +### Component 3: Rule Engine (`core/rule_engine.py`) — ✅ MOSTLY VALID + +**What it does**: Deterministic analysis — type scale ratios, WCAG contrast, spacing grid detection, color statistics. + +**What's working**: +- **Type scale analysis**: Detects ratio between consecutive font sizes, identifies closest standard scale, measures consistency (variance). Correctly filters sizes < 10px. ✅ +- **WCAG contrast checking**: Correct `get_relative_luminance()` per WCAG 2.1 spec. Correct 4.5:1 threshold for AA normal text, 3.0:1 for large text. ✅ +- **AA fix suggestions**: `find_aa_compliant_color()` iterates darken/lighten in 1% steps until 4.5:1 is reached. Brute-force but correct. ✅ +- **Spacing grid detection**: GCD-based base detection, alignment % calculation. Correct. ✅ +- **Color statistics**: Near-duplicate detection, hue distribution, gray/saturated counts. Correct. ✅ +- **Consistency score**: Weighted formula combining all checks. Reasonable. ✅ + +**What's broken/questionable**: + +#### Problem 3A: Accessibility Only Tests Against White/Black + +```python +# Line 545-550 +contrast_white = get_contrast_ratio(hex_color, "#ffffff") +contrast_black = get_contrast_ratio(hex_color, "#000000") +passes_aa_normal = contrast_white >= 4.5 or contrast_black >= 4.5 +``` + +This tests every color against pure white AND pure black. If it passes against EITHER, it's marked as passing. 
But: +- A brand blue (#005aa3) that passes on white (7.2:1) might be used on a dark navy background (#1a1a2e) where it fails (1.8:1) +- A light gray (#cccccc) passes on black but is used as text on white (#ffffff) where it fails (1.6:1) + +The `fg_bg_pairs` logic (line 577-610) partially addresses this — it checks actual foreground-background combinations from the DOM. **But**: it only adds FAILURES to the results, doesn't correct the per-color assessment above. So a color could show as "passes AA" in the per-color check but "fails AA" in the pair check. **Contradictory data sent to SENTINEL**. + +**Fix needed**: Two modes — (1) per-color against white/black for palette overview, (2) per-pair for actual accessibility score. SENTINEL should see BOTH clearly labeled. + +#### Problem 3B: No Radius Analysis + +The rule engine receives `radius_tokens` (line 1034) but does NOTHING with them. No grid alignment check, no progression validation, no statistics. It's just passed through. + +#### Problem 3C: Shadow Analysis Is Minimal + +The rule engine receives `shadow_tokens` but only passes them to SENTINEL's prompt as raw strings. No programmatic analysis of: +- Blur progression (should increase with elevation) +- Y-offset progression (should increase with elevation) +- Color consistency (should all use same base color/alpha) +- Whether shadows form a coherent elevation system + +This means SENTINEL gets raw shadow CSS strings and has to evaluate them purely from text — no pre-computed metrics to ground its scoring. + +--- + +### Component 4: Semantic Analyzer (`agents/semantic_analyzer.py`) — ⚠️ USEFUL BUT UNDERTRUSTED + +**What it does**: Rule-based categorization of colors by CSS property usage. If a color is used in `background-color` on buttons → it's likely brand primary. If used in `color` property on `
<p>
` → it's likely text color. + +**What's working**: The logic is sound — CSS property + element type is a strong signal for color role. This is actually one of the best parts of Stage 1. + +**What's broken**: AURORA receives this as `semantic_analysis` parameter but the data is passed as a secondary input, not the primary. AURORA's prompt says "Suggest Semantic Names for top 10 most-used colors" — it ignores the semantic analysis for the OTHER 20 colors. The semantic analyzer's work is wasted for most colors. + +--- + +### Component 5: Color Utils (`core/color_utils.py`) — ✅ VALID + +**What it does**: Hex/RGB/HSL parsing, contrast calculation, color categorization by hue, color ramp generation. + +**What's working**: All the pure color math is correct. `categorize_color()` returns the right hue family. `generate_color_ramp()` produces reasonable 50-900 shade ramps using OKLCH. + +**No issues found.** This is the most solid component. + +--- + +### Component 6: Export Layer (`app.py` export functions) — ❌ NEEDS RETHINK + +Already documented above in the AS-IS flow. The 3-way naming merge is the killer. 
+ +--- + +## WHAT STAGE 1 SHOULD ACTUALLY PRODUCE (for Stage 2 to work) + +### Current: What Stage 2 receives +``` +NormalizedTokens: + colors: { + "color.blue.light": ColorToken(value="#7fdbff", freq=5, contexts=["background"]), + "color.blue.dark": ColorToken(value="#2c3e50", freq=12, contexts=["text", "button"]), + "color.blue.base": ColorToken(value="#005aa3", freq=47, contexts=["button", "link"]), + "color.neutral.dark": ColorToken(value="#333333", freq=89, contexts=["text"]), + // ← word-based shades, no consistent convention + } + radius: { + "radius-8px": RadiusToken(value="8px"), + "radius-0px 0px 16px 16px": RadiusToken(value="0px 0px 16px 16px"), // ← garbage + "radius-50%": RadiusToken(value="50%"), // ← Figma can't use + } + shadows: { + "shadow-234": ShadowToken(value="0px 4px 25px rgba(0,0,0,0.1)"), // ← meaningless key + "shadow-891": ShadowToken(value="0px 2px 30px rgba(0,0,0,0.15)"), // ← unsorted + } +``` + +### Target: What Stage 2 SHOULD receive +``` +NormalizedTokens: + colors: { + "color.blue.300": ColorToken(value="#7fdbff", freq=5, contexts=["background"], + role="palette", hue="blue", shade=300), + "color.blue.800": ColorToken(value="#2c3e50", freq=12, contexts=["text", "button"], + role="palette", hue="blue", shade=800), + "color.blue.500": ColorToken(value="#005aa3", freq=47, contexts=["button", "link"], + role="brand_candidate", hue="blue", shade=500), + "color.neutral.700": ColorToken(value="#333333", freq=89, contexts=["text"], + role="text_candidate", hue="neutral", shade=700), + // ← ALL numeric shades, with role hints for AURORA + } + radius: { + "radius.sm": RadiusToken(value="4px", value_px=4), + "radius.md": RadiusToken(value="8px", value_px=8), + "radius.xl": RadiusToken(value="16px", value_px=16), + "radius.full": RadiusToken(value="9999px", value_px=9999), + // ← flat, single-value, deduped, sorted, named + } + shadows: { + "shadow.xs": ShadowToken(value="...", blur_px=4, y_offset_px=2), + "shadow.sm": 
ShadowToken(value="...", blur_px=8, y_offset_px=4), + "shadow.md": ShadowToken(value="...", blur_px=16, y_offset_px=8), + // ← sorted by elevation, named progressively + } +``` + +### What changes are needed in Stage 1: + +| Component | Current State | What's Wrong | Fix | +|-----------|--------------|-------------|-----| +| **Normalizer: color naming** | Two functions, word vs numeric | Mixed conventions | Remove word-based function, use numeric for ALL | +| **Normalizer: color role hints** | Keyword-based inference (5-40% hit rate) | Most colors get no role | Add `role_hint` field: "brand_candidate", "text_candidate", "bg_candidate" based on CSS property (from semantic analyzer) | +| **Normalizer: radius** | Raw values stored, no processing | Multi-value, %, no dedup | Parse → single px value → deduplicate → sort → name (none/sm/md/lg/xl/full) | +| **Normalizer: shadows** | Hash-based keys, no processing | Unsorted, unnamed, no metrics | Parse components → sort by blur → deduplicate → name (xs/sm/md/lg/xl) | +| **Normalizer: typography** | Collision-prone naming | Same name for different styles | Add weight suffix: `font.heading.lg.700` vs `font.heading.lg.400` | +| **Rule engine: accessibility** | Tests against white/black only | Doesn't match real usage | Add separate per-pair analysis, label both modes clearly | +| **Rule engine: radius** | Not analyzed | No grid check, no stats | Add radius grid analysis (base-4/base-8), dedup stats | +| **Rule engine: shadows** | Not analyzed | No progression check | Add shadow elevation analysis (blur/offset progression) | +| **Extractor: font family** | Returns fallback generic | Browser resolves to "sans-serif" | Extract from CSS declaration before computed resolution | + +--- + +## REVISED EXECUTION ORDER (Stage 1 fixes interleaved, not deferred) + +The original plan was "fix Stage 2 first, Stage 1 later." 
But the audit reveals: +**If normalizer sends word-based shade names to AURORA, AURORA's ReAct naming will STILL conflict with normalizer names in the export merge.** + +The pre-processing layer (Step 2 in the old plan) was supposed to fix this. But that's a bandaid — it re-normalizes what the normalizer already normalized. It's cleaner to fix the normalizer itself so it produces correct output from the start. + +### New Execution Order: + +``` +PHASE 1: FIX NORMALIZER (makes Stage 1 output clean) + 1a. Unify color naming → numeric shades only + 1b. Add radius normalization (parse, deduplicate, sort, name) + 1c. Add shadow normalization (parse, sort by blur, name) + 1d. Feed semantic_analyzer role hints into normalizer + +PHASE 2: FIX STAGE 2 (agents can now trust their input) + 2a. Consolidate two Stage 2 systems into one + 2b. Rewrite AURORA with ReAct + critic (names ALL colors, not 10) + 2c. Rewrite SENTINEL with grounded scoring + critic + 2d. Rewrite NEXUS with ToT + 2e. Add post-validation layer + +PHASE 3: FIX EXPORT (single naming authority) + 3a. AURORA naming_map is THE authority (not 3-way merge) + 3b. Radius/shadow export uses normalizer output directly + 3c. Validation before JSON write + +PHASE 4: FIX EXTRACTION (nice-to-have, not blocking) + 4a. Font family detection improvement + 4b. Rule engine: radius grid analysis + 4c. Rule engine: shadow elevation analysis +``` + +### Why this order is better: + +1. **Phase 1 first** because AURORA can't name colors well if the input names are garbage. The ReAct prompt says "observe your naming" but if the LLM sees `color.blue.light` in its input AND is asked to output `color.blue.300`, it gets confused. + +2. **Phase 2 after Phase 1** because now the LLM agents receive clean, consistently-named input. AURORA's job becomes "confirm or improve these names" rather than "fix the mess from normalizer." + +3. 
**Phase 3 after Phase 2** because the export layer just needs to respect one naming authority (AURORA), not reconcile three. + +4. **Phase 4 last** because font family and enhanced rule engine analysis are improvements, not blockers. + +### Deploy Plan: +- **Deploy 1**: After Phase 1 (normalizer fixes) — even without Stage 2 improvements, the export will be cleaner +- **Deploy 2**: After Phase 2 + 3 (full Stage 2 rework + export) — the big quality jump +- **Deploy 3**: After Phase 4 (font family, enhanced analysis) — polish + +--- + +## CRITIC REVIEW: SHOULD EACH COMPONENT STAY RULE-BASED OR USE LLM? + +Every rule-based component needs to justify itself. Rules are free and fast, but if they produce garbage that LLMs then have to fix, the "free" part is an illusion — you pay in bad output quality instead. + +### Decision Framework + +| Use Rules When... | Use LLM When... | +|---|---| +| Math with right answers (contrast ratio) | Judgment with context (is this the brand color?) | +| Deterministic transforms (hex→RGB) | Ambiguous signals (is this a button or just a styled div?) | +| Simple pattern matching (is 16 divisible by 8?) | Weighing competing evidence (high freq but wrong context) | +| Zero tolerance for hallucination (export format) | Understanding intent (why is this color used here?) | +| Must be 100% reproducible | Acceptable to vary slightly between runs | + +--- + +### 1. Color Naming (Normalizer) — ❌ RULES FAILING, NEEDS RETHINK + +**Current**: Rule-based. Two functions: keyword-match for role → numeric shade, fallback → word shade. + +**Critic's Question**: Can rules correctly name 30 colors with just CSS property + element context? + +**Honest Answer**: No. Here's why: + +The normalizer's `_infer_color_role()` searches for keywords like "primary", "button", "background" in the element/context strings. 
But: + +``` +Extracted color: #005aa3, freq=47 + css_properties: ["background-color"] + elements: ["div", "a"] + contexts: ["background"] +``` + +No keyword "primary" or "button" anywhere. Rules classify this as "unknown role" → falls to word-based naming → `color.blue.base`. But this is CLEARLY the brand primary (used 47 times on links and divs with background-color). + +An LLM can reason: "47 uses on `` elements with `background-color` = this is a CTA color = brand primary." Rules can't make that inference. + +**But**: An LLM to name 30 colors costs ~$0.001 and adds 2-3 seconds. For something that happens once per extraction, this is acceptable. + +**Verdict**: +- **Keep rules for**: Hue family detection (HSL math), shade number assignment (luminance → 50-900), deduplication (exact hex + RGB distance) +- **Move to LLM (AURORA)**: Semantic role assignment (brand.primary vs text.secondary vs background.primary). This is already AURORA's job — but currently AURORA only does it for 10 colors. Expand AURORA to name ALL colors. +- **ELIMINATE from normalizer**: The `_generate_color_name_from_value()` function and the `_infer_color_role()` function. Replace with a simpler `_generate_preliminary_name()` that just uses hue + numeric shade. Let AURORA do the semantic naming. + +**New flow**: +``` +Normalizer: "color.blue.500" (hue + shade, no role) + ↓ +AURORA: "color.brand.primary" (semantic role from context reasoning) + ↓ +Export: Uses AURORA name, falls back to normalizer name +``` + +--- + +### 2. Radius Processing — ✅ RULES ARE CORRECT APPROACH, JUST MISSING + +**Current**: No processing at all (raw values stored). + +**Critic's Question**: Does radius naming need LLM intelligence? + +**Honest Answer**: No. 
Radius is pure math: +- Parse CSS value → px number +- Skip multi-value shorthand (or take max) +- Convert 50% → 9999px (full circle) +- Sort by px value +- Name by size tier: 0=none, 1-3=sm, 4-8=md, 9-16=lg, 17-24=xl, 25+=2xl, 9999=full + +No ambiguity, no judgment needed. An LLM would add nothing here. + +**Verdict**: Keep rule-based. Just implement the processing that's currently missing. + +--- + +### 3. Shadow Processing — ⚠️ MOSTLY RULES, BUT LLM COULD HELP WITH EDGE CASES + +**Current**: No processing at all (hash-based keys). + +**Critic's Question**: Can rules correctly name and sort shadows? + +**Mostly yes**: +- Parse CSS shadow string → {x, y, blur, spread, color} — regex, no LLM needed +- Sort by blur radius — math +- Name by elevation tier (xs/sm/md/lg/xl) — math +- Detect non-monotonic progression — math + +**But**: Some edge cases are hard for rules: +- `0px 0px 0px 4px rgba(0,0,0,0.2)` — is this a shadow or a border simulation? (spread-only, no blur) +- Multiple shadows on same element — which is the "primary" shadow? +- `inset` shadows — different semantic meaning (inner glow vs elevation) + +These edge cases affect maybe 10% of shadows. Rules can handle 90% correctly. + +**Verdict**: Keep rule-based for parsing, sorting, naming. Add simple heuristic rules for edge cases (spread-only → treat as border, inset → separate category). NOT worth an LLM call. + +--- + +### 4. Accessibility Checking (Rule Engine) — ✅ RULES ARE THE ONLY CORRECT APPROACH + +**Current**: WCAG contrast math + fix suggestions. + +**Critic's Question**: Could an LLM improve accessibility checking? + +**Absolutely not.** WCAG is a mathematical standard. 4.5:1 is 4.5:1. An LLM cannot calculate contrast ratios — it would hallucinate them. The rule engine's `get_relative_luminance()` implementation follows the exact WCAG 2.1 spec. This MUST stay rule-based. + +**What rules CAN'T do** (and LLM CAN): Prioritize which failures matter most. 
"Brand primary fails AA" is more critical than "a decorative border color fails AA." This is judgment → belongs in SENTINEL. + +**Verdict**: Keep accessibility math 100% rule-based. Use SENTINEL to prioritize/contextualize the results. + +--- + +### 5. Type Scale Detection (Rule Engine) — ✅ RULES ARE CORRECT + +**Current**: Ratio calculation between consecutive font sizes, variance check, standard scale matching. + +**Critic's Question**: Could an LLM detect type scales better? + +**No.** Type scale detection is pure math: sizes → ratios → average → closest standard. An LLM would be slower and less accurate at arithmetic. + +**What rules CAN'T do**: Recommend which scale to adopt. "Your ratio is 1.18, should you round to 1.2 (Minor Third) or 1.25 (Major Third)?" — this depends on the site's purpose (content-heavy = 1.2, marketing = 1.333). This is judgment → belongs in ATLAS/NEXUS. + +**Verdict**: Keep rule-based. Already working correctly after the 10px filter fix. + +--- + +### 6. Spacing Grid Detection (Rule Engine) — ✅ RULES ARE CORRECT + +**Current**: GCD-based detection, alignment percentage, base-4/base-8 check. + +**Verdict**: Pure math, working correctly. Keep rule-based. + +--- + +### 7. Semantic Color Analysis (`semantic_analyzer.py`) — ⚠️ OVERLAPS WITH AURORA, CONSOLIDATE + +**Current**: Rule-based fallback + optional LLM call. Categorizes colors into brand/text/background/border/feedback. + +**Critic's Question**: This does THE SAME JOB as AURORA. Why do we have both? 
+
+**The overlap**:
+- Semantic Analyzer: "This color is brand.primary because it's on buttons" (rule-based + optional LLM)
+- AURORA: "This color is brand.primary because it's used 47x on CTAs" (LLM)
+- Both produce semantic names for colors
+- Both feed into export
+
+**The problem**: They run at DIFFERENT STAGES:
+- Semantic Analyzer runs in Stage 1 (during extraction)
+- AURORA runs in Stage 2 (during analysis)
+- Their outputs can conflict
+- Export tries to merge both → more naming chaos
+
+**Verdict**: ELIMINATE the semantic analyzer as a separate component. Move its rule-based heuristics INTO the normalizer as `role_hint` field (e.g., "brand_candidate", "text_candidate"). These hints become INPUT to AURORA, not a competing output.
+
+```
+BEFORE:
+  Semantic Analyzer → state.semantic_analysis → AURORA (partially uses it)
+                                              → Export (also uses it, conflicts)
+
+AFTER:
+  Normalizer adds role_hints → AURORA uses hints as evidence → AURORA names → Export
+  (no separate semantic analyzer)
+```
+
+---
+
+### 8. Color Deduplication (Normalizer) — ⚠️ RULES ARE CORRECT BUT THRESHOLD IS QUESTIONABLE
+
+**Current**: RGB Euclidean distance < 10 → merge.
+
+**Critic's Question**: Is RGB distance the right metric?
+
+**Not really.** RGB Euclidean distance is NOT perceptually uniform. Two colors that look identical to humans can have large RGB distance, and two that look different can have small RGB distance. The industry standard for perceptual color difference is Delta-E (CIEDE2000).
+
+However: For the purpose of "should we keep both #1a1a1a and #1b1b1b in the design system?" — RGB distance < 10 is a reasonable approximation. These truly are near-identical grays.
+
+The color_utils.py `color_distance()` function also uses RGB Euclidean. It's used in the rule engine for near-duplicate detection.
+
+**Verdict**: Keep rule-based, but consider switching to Delta-E (CIEDE2000) for better perceptual accuracy. Low priority — the current approach works for most cases. 
+ +--- + +### 9. Color Statistics (Rule Engine) — ✅ RULES ARE CORRECT + +Counting uniques, duplicates, hue distribution — pure counting. Keep rule-based. + +--- + +### 10. Pre-Processing Layer (NEW — proposed in architecture) — SHOULD THIS BE AN LLM? + +**Current plan**: Deterministic pre-processing before Stage 2 agents. + +**Critic's Question**: The pre-processing unifies names, flattens radius, sorts shadows. Should this use an LLM? + +**No.** Everything pre-processing does is deterministic: +- Rename color.blue.light → color.blue.300 (luminance lookup table) +- Flatten "0px 0px 16px 16px" → skip or max(16) +- Sort shadows by blur px + +No judgment needed, no ambiguity. Keep deterministic. + +--- + +## SUMMARY: WHAT STAYS RULE-BASED, WHAT MOVES TO LLM + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ KEEP RULE-BASED (correct, no LLM needed) │ +│ │ +│ ✅ WCAG contrast calculation │ +│ ✅ Type scale ratio detection │ +│ ✅ Spacing grid detection (GCD) │ +│ ✅ Color deduplication (RGB/Delta-E distance) │ +│ ✅ Color statistics (counts, hue distribution) │ +│ ✅ Radius processing (parse, sort, name) — needs implementing │ +│ ✅ Shadow processing (parse, sort, name) — needs implementing │ +│ ✅ Color hue family detection (HSL math) │ +│ ✅ Color shade number assignment (luminance → 50-900) │ +│ ✅ Pre-processing layer (rename, flatten, sort) │ +│ ✅ Post-validation layer (check conventions, ranges) │ +│ ✅ AA fix suggestions (darken/lighten iteration) │ +│ ✅ Export format (DTCG structure) │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ MOVE TO LLM (requires judgment, context, ambiguity) │ +│ │ +│ 🤖 Color semantic naming (brand.primary vs text.secondary) │ +│ Currently: normalizer (bad) + semantic analyzer (conflicts) │ +│ Move to: AURORA (ReAct, names ALL colors) │ +│ │ +│ 🤖 Prioritizing which AA failures matter most │ +│ Currently: all treated equally │ 
+│ Move to: SENTINEL (cites data, ranks by impact) │ +│ │ +│ 🤖 Scoring cohesion/consistency holistically │ +│ Currently: simple weighted formula │ +│ Move to: NEXUS (weighs competing dimensions) │ +│ │ +│ 🤖 Recommending which design system to align with │ +│ Currently: ATLAS (already LLM) — keep as is │ +│ │ +│ 🤖 Recommending scale/spacing changes │ +│ Currently: defaults to "1.25 Major Third" │ +│ Move to: NEXUS (considers site purpose and brand) │ +└─────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────┐ +│ ELIMINATE (redundant or actively harmful) │ +│ │ +│ ❌ normalizer._generate_color_name_from_value() │ +│ Word-based shades (light/dark/base) — root cause of chaos │ +│ │ +│ ❌ normalizer._infer_color_role() │ +│ Keyword matching for role — too low hit rate (5-40%) │ +│ Replace with: role_hint from CSS property + element type │ +│ │ +│ ❌ semantic_analyzer.py as separate component │ +│ Overlaps with AURORA, creates competing names │ +│ Replace with: role_hints embedded in normalizer output │ +│ │ +│ ❌ app.py _generate_color_name_from_hex() │ +│ Third naming system (numeric), conflicts with other two │ +│ Replace with: normalizer's single naming path │ +│ │ +│ ❌ app.py _get_semantic_color_overrides() 3-way merge │ +│ Merges semantic + AURORA + NEXUS names → chaos │ +│ Replace with: AURORA naming_map as single authority │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### New LLM Budget After Critic Review + +No new LLM calls needed. We're just: +1. Expanding AURORA from "name 10 colors" to "name ALL colors" (same 1 call, slightly larger output) +2. Eliminating the semantic analyzer's optional LLM call (saves $0.001) +3. All other changes are rule-based fixes + +Net LLM cost: Same or slightly less than today (~$0.005 per extraction). 
diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6a24c65e091f0cb0377a19802844899e39458b79 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,68 @@ +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies for Playwright +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + libasound2 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + libatspi2.0-0 \ + libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libnspr4 \ + libnss3 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxkbcommon0 \ + libxrandr2 \ + xdg-utils \ + libu2f-udev \ + libvulkan1 \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Install Playwright system dependencies +RUN playwright install-deps chromium + +# Create non-root user +RUN useradd -m -u 1000 user + +# Copy application code +COPY . . 
+ +# Change ownership to user +RUN chown -R user:user /app + +# Switch to user +USER user + +# Set environment variables +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH \ + GRADIO_SERVER_NAME=0.0.0.0 \ + GRADIO_SERVER_PORT=7860 \ + PLAYWRIGHT_BROWSERS_PATH=/home/user/.cache/ms-playwright + +# Install Playwright browsers as user +RUN playwright install chromium + +# Expose port +EXPOSE 7860 + +# Run the application +CMD ["python", "app.py"] diff --git a/PLAN_W3C_DTCG_UPDATE.md b/PLAN_W3C_DTCG_UPDATE.md new file mode 100644 index 0000000000000000000000000000000000000000..47aff735429165b9813c184cbad6314b6b77ca9e --- /dev/null +++ b/PLAN_W3C_DTCG_UPDATE.md @@ -0,0 +1,318 @@ +# PLAN: Update to W3C DTCG Design Token Format + +## Overview + +Update both the **Design System Extractor export** and the **Figma plugin** to use the official **W3C DTCG (Design Tokens Community Group)** format - the industry standard as of October 2025. + +--- + +## Current vs Target Format + +### CURRENT (Custom/Legacy) +```json +{ + "global": { + "colors": { + "color.brand.primary": { + "value": "#540b79", + "type": "color" + } + }, + "typography": { + "font.heading.xl.desktop": { + "value": { + "fontFamily": "Open Sans", + "fontSize": "32px", + "fontWeight": "700", + "lineHeight": "1.3" + }, + "type": "typography" + } + }, + "spacing": { + "space.1.desktop": { + "value": "8px", + "type": "dimension" + } + }, + "borderRadius": { + "radius.md": { + "value": "8px", + "type": "borderRadius" + } + }, + "shadows": { + "shadow.sm": { + "value": { "x": "0", "y": "2", "blur": "4", ... 
}, + "type": "boxShadow" + } + } + } +} +``` + +### TARGET (W3C DTCG Standard) +```json +{ + "color": { + "brand": { + "primary": { + "$type": "color", + "$value": "#540b79", + "$description": "Main brand color" + } + } + }, + "font": { + "heading": { + "xl": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "Open Sans", + "fontSize": "32px", + "fontWeight": "700", + "lineHeight": "1.3" + } + } + } + } + }, + "spacing": { + "1": { + "desktop": { + "$type": "dimension", + "$value": "8px" + } + } + }, + "borderRadius": { + "md": { + "$type": "dimension", + "$value": "8px" + } + }, + "shadow": { + "sm": { + "$type": "shadow", + "$value": { + "color": "#00000026", + "offsetX": "0px", + "offsetY": "2px", + "blur": "4px", + "spread": "0px" + } + } + } +} +``` + +--- + +## Key Changes Summary + +| Aspect | Current | DTCG Target | +|--------|---------|-------------| +| Property prefix | `value`, `type` | `$value`, `$type` | +| Root wrapper | `global` | None (flat root) | +| Token nesting | Flat keys (`color.brand.primary`) | Nested objects (`color.brand.primary`) | +| Color type | `"type": "color"` | `"$type": "color"` | +| Typography type | `"type": "typography"` | `"$type": "typography"` | +| Spacing type | `"type": "dimension"` | `"$type": "dimension"` | +| Radius type | `"type": "borderRadius"` | `"$type": "dimension"` | +| Shadow type | `"type": "boxShadow"` | `"$type": "shadow"` | + +--- + +## Files to Update + +### 1. Export Functions (`app.py`) + +**File:** `/Users/yahya/design-system-extractor-v2-hf-fix/app.py` + +**Functions to modify:** +- `export_stage1_json()` (~line 3095) +- `export_tokens_json()` (~line 3248) + +**Changes:** +1. Remove `global` wrapper - tokens at root level +2. Change `value` → `$value`, `type` → `$type` +3. Convert flat keys to nested structure: + - `color.brand.primary` → `{ color: { brand: { primary: {...} } } }` + - `font.heading.xl.desktop` → `{ font: { heading: { xl: { desktop: {...} } } } }` +4. 
Add helper function to convert flat key to nested object +5. Update shadow format to DTCG spec +6. Keep `$description` for semantic tokens + +### 2. Figma Plugin (`code.js`) + +**File:** `/Users/yahya/design-system-extractor-v2-hf-fix/output_json/figma-plugin-extracted/figma-design-token-creator 5/src/code.js` + +**Changes:** +1. Update `normalizeTokens()` to detect DTCG format (look for `$value`, `$type`) +2. Update `extractColors()` to handle: + - `$value` instead of `value` + - Nested structure traversal +3. Update `extractTypography()` to handle DTCG composite format +4. Update `extractSpacing()` for dimension tokens +5. Add shadow extraction (currently not implemented) +6. Support both legacy AND DTCG formats for backwards compatibility + +### 3. Plugin UI (`ui.html`) + +**File:** `/Users/yahya/design-system-extractor-v2-hf-fix/output_json/figma-plugin-extracted/figma-design-token-creator 5/ui/ui.html` + +**Changes:** +1. Update `extractColorsForPreview()` to handle `$value` +2. Update `extractSpacingForPreview()` to handle `$value` +3. Update `buildTypographyPreview()` for nested + DTCG format +4. Add format detection message for DTCG +5. 
Add shadow preview section + +--- + +## Detailed Implementation Steps + +### Step 1: Create DTCG Export Helper Functions (app.py) + +```python +def _key_to_nested_path(flat_key: str) -> list: + """Convert 'color.brand.primary' to ['color', 'brand', 'primary']""" + return flat_key.split('.') + +def _set_nested_value(obj: dict, path: list, value: dict): + """Set a value at a nested path in a dictionary""" + for key in path[:-1]: + if key not in obj: + obj[key] = {} + obj = obj[key] + obj[path[-1]] = value + +def _to_dtcg_token(value, token_type: str, description: str = None) -> dict: + """Convert to DTCG format with $value, $type, $description""" + token = { + "$type": token_type, + "$value": value + } + if description: + token["$description"] = description + return token +``` + +### Step 2: Update Export Functions (app.py) + +Rewrite `export_stage1_json()` and `export_tokens_json()` to: +1. Build nested structure instead of flat +2. Use `$value`, `$type`, `$description` +3. Map token types correctly: + - `borderRadius` → `dimension` (DTCG uses dimension for radii) + - `boxShadow` → `shadow` + - Keep `color`, `typography`, `dimension` + +### Step 3: Update Plugin Token Extraction (code.js) + +Add DTCG detection and extraction: + +```javascript +// Detect if DTCG format +function isDTCGFormat(obj) { + if (!obj || typeof obj !== 'object') return false; + var keys = Object.keys(obj); + for (var i = 0; i < keys.length; i++) { + var val = obj[keys[i]]; + if (val && typeof val === 'object') { + if (val['$value'] !== undefined || val['$type'] !== undefined) { + return true; + } + } + } + return false; +} + +// Extract from DTCG format +function extractColorsDTCG(obj, prefix, results) { + // Handle $value, $type + // Recursively traverse nested structure +} +``` + +### Step 4: Update Plugin UI (ui.html) + +Update preview functions to handle both formats. + +### Step 5: Add Shadow Support to Plugin + +Currently the plugin doesn't create Effect Styles for shadows. 
Add:

```javascript
+// CREATE EFFECT STYLES (Shadows)
+for (var si = 0; si < tokens.shadows.length; si++) {
+  var shadowToken = tokens.shadows[si];
+  var effectStyle = figma.createEffectStyle();
+  effectStyle.name = 'shadows/' + shadowToken.name;
+  effectStyle.effects = [{
+    type: 'DROP_SHADOW',
+    color: { r: 0, g: 0, b: 0, a: 0.25 }, // TODO: parse shadowToken.value.color (e.g. "#00000026") instead of hardcoding black @ 25%
+    offset: { x: parseFloat(shadowToken.value.offsetX), y: parseFloat(shadowToken.value.offsetY) },
+    radius: parseFloat(shadowToken.value.blur),
+    spread: parseFloat(shadowToken.value.spread),
+    visible: true,
+    blendMode: 'NORMAL'
+  }];
+}
+```
+
+---
+
+## Testing Checklist
+
+After implementation, verify:
+
+- [ ] Export Stage 1 JSON produces valid DTCG format
+- [ ] Export Final JSON produces valid DTCG format
+- [ ] Token names are properly nested (`color.brand.primary` → nested object)
+- [ ] All `$value`, `$type` prefixes present
+- [ ] Figma plugin successfully imports DTCG JSON
+- [ ] Colors → Paint Styles created correctly
+- [ ] Typography → Text Styles created correctly
+- [ ] Spacing → Variables created correctly
+- [ ] Border Radius → Variables created correctly
+- [ ] Shadows → Effect Styles created correctly
+- [ ] Plugin still works with legacy format (backwards compatible)
+
+---
+
+## Benefits After Implementation
+
+1. **Interoperability** - Works with Figma, Sketch, Framer, Style Dictionary, Tokens Studio
+2. **Future-proof** - Official W3C standard, adopted by industry
+3. **Tool ecosystem** - Compatible with 10+ design tools
+4. **Code generation** - Works with Style Dictionary for CSS/iOS/Android
+5. 
**No vendor lock-in** - Standard format, portable + +--- + +## Estimated Effort + +| Task | Complexity | Time | +|------|------------|------| +| Export helper functions | Low | 15 min | +| Update export_stage1_json | Medium | 30 min | +| Update export_tokens_json | Medium | 30 min | +| Update plugin code.js | Medium | 45 min | +| Update plugin ui.html | Low | 20 min | +| Add shadow support to plugin | Medium | 30 min | +| Testing & fixes | Medium | 30 min | +| **Total** | | **~3 hours** | + +--- + +## Awaiting Confirmation + +Please confirm: +1. ✅ Proceed with W3C DTCG format update? +2. ✅ Update both app.py export AND Figma plugin? +3. ✅ Add shadow Effect Style support to plugin? +4. ✅ Maintain backwards compatibility for legacy format in plugin? + +**Reply "approved" or provide feedback to proceed.** diff --git a/PROJECT_CONTEXT.md b/PROJECT_CONTEXT.md new file mode 100644 index 0000000000000000000000000000000000000000..929ea85f434494c648f11f6072722d4367978f19 --- /dev/null +++ b/PROJECT_CONTEXT.md @@ -0,0 +1,170 @@ +# Design System Extractor v2 — Project Context + +## Architecture Overview + +``` +Stage 0: Configuration Stage 1: Discovery & Extraction Stage 2: AI Analysis Stage 3: Export + ┌──────────────────┐ ┌──────────────────────────┐ ┌──────────────────────────┐ ┌──────────────┐ + │ HF Token Setup │ ──────> │ URL Discovery (sitemap/ │ ──────> │ Layer 1: Rule Engine │ ──> │ Figma Tokens │ + │ Benchmark Select │ │ crawl) + Token Extraction │ │ Layer 2: Benchmarks │ │ JSON Export │ + └──────────────────┘ │ (Desktop + Mobile CSS) │ │ Layer 3: LLM Agents (x3) │ └──────────────┘ + └──────────────────────────┘ │ Layer 4: HEAD Synthesizer│ + └──────────────────────────┘ +``` + +### Stage 1: Discovery & Extraction (Rule-Based, Free) +- **Discover Pages**: Fetches sitemap.xml or crawls site to find pages +- **Extract Tokens**: Playwright visits each page at 2 viewports (Desktop 1440px, Mobile 375px), extracts computed CSS for colors, typography, spacing, radius, 
shadows +- **User Review**: Interactive tables with Accept/Reject checkboxes + visual previews + +### Stage 2: AI-Powered Analysis (4 Layers) + +| Layer | Type | What It Does | Cost | +|-------|------|--------------|------| +| **Layer 1** | Rule Engine | Type scale detection, AA contrast checking, spacing grid analysis, color statistics | FREE | +| **Layer 2** | Benchmark Research | Compare against Material Design 3, Apple HIG, Tailwind, etc. | ~$0.001 | +| **Layer 3** | LLM Agents (x3) | AURORA (Brand ID) + ATLAS (Benchmark) + SENTINEL (Best Practices) | ~$0.002 | +| **Layer 4** | HEAD Synthesizer | NEXUS combines all outputs into final recommendations | ~$0.001 | + +### Stage 3: Export +- Apply/reject individual color, typography, spacing recommendations +- Export Figma Tokens Studio-compatible JSON + +--- + +## Agent Roster + +| Agent | Codename | Model | Temp | Input | Output | Specialty | +|-------|----------|-------|------|-------|--------|-----------| +| Brand Identifier | **AURORA** | Qwen/Qwen2.5-72B-Instruct | 0.4 | Color tokens + semantic CSS analysis | Brand primary/secondary/accent, palette strategy, cohesion score, semantic names | Creative/visual reasoning, color harmony assessment | +| Benchmark Advisor | **ATLAS** | meta-llama/Llama-3.3-70B-Instruct | 0.25 | User's type scale, spacing, font sizes + benchmark comparison data | Recommended benchmark, alignment changes, pros/cons | 128K context for large benchmark data, comparative reasoning | +| Best Practices Validator | **SENTINEL** | Qwen/Qwen2.5-72B-Instruct | 0.2 | Rule Engine results (typography, accessibility, spacing, color stats) | Overall score (0-100), check results, prioritized fix list | Methodical rule-following, precise judgment | +| HEAD Synthesizer | **NEXUS** | meta-llama/Llama-3.3-70B-Instruct | 0.3 | All 3 agent outputs + Rule Engine facts | Executive summary, scores, top 3 actions, color/type/spacing recs | 128K context for combined inputs, synthesis capability | + +### Why These 
Models + +- **Qwen 72B** (AURORA, SENTINEL): Strong creative reasoning for brand analysis; methodical structured output for best practices. Available on HF serverless without gated access. +- **Llama 3.3 70B** (ATLAS, NEXUS): 128K context window handles large combined inputs from multiple agents. Excellent comparative and synthesis reasoning. +- **Fallback**: Qwen/Qwen2.5-7B-Instruct (free tier, available when primary models fail) + +### Temperature Rationale + +- **0.4** (AURORA): Allows creative interpretation of color stories and palette harmony +- **0.25** (ATLAS): Analytical comparison needs consistency but some flexibility for trade-off reasoning +- **0.2** (SENTINEL): Strict rule evaluation — consistency is critical for compliance scoring +- **0.3** (NEXUS): Balanced — needs to synthesize creatively but stay grounded in agent data + +--- + +## Evaluation & Scoring + +### Self-Evaluation (All Agents) +Each agent includes a `self_evaluation` block in its JSON output: +```json +{ + "confidence": 8, // 1-10: How confident the agent is + "reasoning": "Clear usage patterns with 20+ colors", + "data_quality": "good", // good | fair | poor + "flags": [] // e.g., ["insufficient_context", "ambiguous_data"] +} +``` + +### AURORA Scoring Rubric (Cohesion 1-10) +- **9-10**: Clear harmony rule, distinct brand colors, consistent palette +- **7-8**: Mostly harmonious, clear brand identity +- **5-6**: Some relationships visible but not systematic +- **3-4**: Random palette, no clear strategy +- **1-2**: Conflicting colors, no brand identity + +### SENTINEL Scoring Rubric (Overall 0-100) +Weighted checks: +- AA Compliance: 25 points +- Type Scale Consistency: 15 points +- Base Size Accessible: 15 points +- Spacing Grid: 15 points +- Type Scale Standard Ratio: 10 points +- Color Count: 10 points +- No Near-Duplicates: 10 points + +### NEXUS Scoring Rubric (Overall 0-100) +- **90-100**: Production-ready, minor polishing only +- **75-89**: Solid foundation, 2-3 targeted 
improvements +- **60-74**: Functional but needs focused attention +- **40-59**: Significant gaps requiring systematic improvement +- **20-39**: Major rework needed +- **0-19**: Fundamental redesign recommended + +### Evaluation Summary (Logged After Analysis) +``` +═══════════════════════════════════════════════════ +🔍 AGENT EVALUATION SUMMARY +═══════════════════════════════════════════════════ + 🎨 AURORA (Brand ID): confidence=8/10, data=good + 🏢 ATLAS (Benchmark): confidence=7/10, data=good + ✅ SENTINEL (Practices): confidence=9/10, data=good, score=72/100 + 🧠 NEXUS (Synthesis): confidence=8/10, data=good, overall=65/100 +═══════════════════════════════════════════════════ +``` + +--- + +## User Journey + +1. **Enter HF Token** — Required for LLM inference (free tier works) +2. **Enter Website URL** — The site to extract design tokens from +3. **Discover Pages** — Auto-finds pages via sitemap or crawling +4. **Select Pages** — Check/uncheck pages to include (max 10) +5. **Extract Tokens** — Scans selected pages at Desktop + Mobile viewports +6. **Review Stage 1** — Interactive tables: Colors, Typography, Spacing, Radius, Shadows, Semantic Colors. Each tab has a data table + visual preview accordion. Accept/reject individual tokens. +7. **Proceed to Stage 2** — Select benchmarks to compare against +8. **Run AI Analysis** — 4-layer pipeline executes (Rule Engine -> Benchmarks -> LLM Agents -> Synthesis) +9. **Review Analysis** — Dashboard with scores, recommendations, benchmark comparison, color recs +10. **Apply Upgrades** — Accept/reject individual recommendations +11. 
**Export JSON** — Download Figma Tokens Studio-compatible JSON + +--- + +## File Structure + +| File | Responsibility | +|------|----------------| +| `app.py` | Main Gradio UI — all stages, CSS, event bindings, formatting functions | +| `agents/llm_agents.py` | 4 LLM agent classes (AURORA, ATLAS, SENTINEL, NEXUS) + dataclasses | +| `agents/semantic_analyzer.py` | Semantic color categorization (brand, text, background, etc.) | +| `config/settings.py` | Model routing, env var loading, agent-to-model mapping | +| `core/hf_inference.py` | HF Inference API client, model registry, temperature mapping | +| `core/preview_generator.py` | HTML preview generators for Stage 1 visual previews | +| `core/rule_engine.py` | Layer 1: Type scale, AA contrast, spacing grid, color stats | +| `core/benchmarks.py` | Benchmark definitions (Material Design 3, Apple HIG, etc.) | +| `core/extractor.py` | Playwright-based CSS token extraction | +| `core/discovery.py` | Page discovery via sitemap.xml / crawling | + +--- + +## Configuration + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `HF_TOKEN` | (required) | HuggingFace API token | +| `BRAND_IDENTIFIER_MODEL` | `Qwen/Qwen2.5-72B-Instruct` | Model for AURORA | +| `BENCHMARK_ADVISOR_MODEL` | `meta-llama/Llama-3.3-70B-Instruct` | Model for ATLAS | +| `BEST_PRACTICES_MODEL` | `Qwen/Qwen2.5-72B-Instruct` | Model for SENTINEL | +| `HEAD_SYNTHESIZER_MODEL` | `meta-llama/Llama-3.3-70B-Instruct` | Model for NEXUS | +| `FALLBACK_MODEL` | `Qwen/Qwen2.5-7B-Instruct` | Fallback when primary fails | +| `HF_MAX_NEW_TOKENS` | `2048` | Max tokens per LLM response | +| `HF_TEMPERATURE` | `0.3` | Global default temperature | +| `MAX_PAGES` | `20` | Max pages to discover | +| `BROWSER_TIMEOUT` | `30000` | Playwright timeout (ms) | + +### Model Override Examples +```bash +# Use Llama for all agents +export BRAND_IDENTIFIER_MODEL="meta-llama/Llama-3.3-70B-Instruct" +export 
BEST_PRACTICES_MODEL="meta-llama/Llama-3.3-70B-Instruct" + +# Use budget models +export BRAND_IDENTIFIER_MODEL="Qwen/Qwen2.5-7B-Instruct" +export BENCHMARK_ADVISOR_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1" +``` diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0a67d289e1914b1fd2fb7b0440f445e3e69aa9cf --- /dev/null +++ b/README.md @@ -0,0 +1,233 @@ +--- +title: Design System Extractor v3 +emoji: 🎨 +colorFrom: purple +colorTo: blue +sdk: docker +pinned: false +license: mit +--- + +# Design System Extractor v3 + +> 🎨 A semi-automated, human-in-the-loop agentic system that reverse-engineers design systems from live websites. + +## 🎯 What It Does + +When you have a website but no design system documentation (common when the original Sketch/Figma files are lost), this tool helps you: + +1. **Crawl** your website to discover pages +2. **Extract** design tokens (colors, typography, spacing, shadows) +3. **Review** and validate extracted tokens with visual previews +4. **Upgrade** your system with modern best practices (optional) +5. **Export** production-ready JSON tokens for Figma/code + +## 🧠 Philosophy + +This is **not a magic button** — it's a design-aware co-pilot. 
+ +- **Agents propose → Humans decide** +- **Every action is visible, reversible, and previewed** +- **No irreversible automation** + +## 🏗️ Architecture + +``` +┌──────────────────────────────────────────────────────────────┐ +│ TECH STACK │ +├──────────────────────────────────────────────────────────────┤ +│ Frontend: Gradio (interactive UI with live preview) │ +│ Orchestration: LangGraph (agent workflow management) │ +│ Models: Claude API (reasoning) + Rule-based │ +│ Browser: Playwright (crawling & extraction) │ +│ Hosting: Hugging Face Spaces │ +└──────────────────────────────────────────────────────────────┘ +``` + +### Agent Personas + +| Agent | Persona | Job | +|-------|---------|-----| +| **Agent 1** | Design Archaeologist | Discover pages, extract raw tokens | +| **Agent 2** | Design System Librarian | Normalize, dedupe, structure tokens | +| **Agent 3** | Senior DS Architect | Recommend upgrades (type scales, spacing, a11y) | +| **Agent 4** | Automation Engineer | Generate final JSON for Figma/code | + +## 🚀 Quick Start + +### Prerequisites + +- Python 3.11+ +- Node.js (for some dependencies) + +### Installation + +```bash +# Clone the repository +git clone +cd design-system-extractor + +# Create virtual environment +python -m venv venv +source venv/bin/activate # or `venv\Scripts\activate` on Windows + +# Install dependencies +pip install -r requirements.txt + +# Install Playwright browsers +playwright install chromium + +# Copy environment file +cp config/.env.example config/.env +# Edit .env and add your ANTHROPIC_API_KEY +``` + +### Running + +```bash +python app.py +``` + +Open `http://localhost:7860` in your browser. + +## 📖 Usage Guide + +### Stage 1: Discovery + +1. Enter your website URL (e.g., `https://example.com`) +2. Click "Discover Pages" +3. Review discovered pages and select which to extract from +4. Ensure you have a mix of page types (homepage, listing, detail, etc.) + +### Stage 2: Extraction + +1. 
Choose viewport (Desktop 1440px or Mobile 375px) +2. Click "Extract Tokens" +3. Review extracted: + - **Colors**: With frequency, context, and AA compliance + - **Typography**: Font families, sizes, weights + - **Spacing**: Values with 8px grid fit indicators +4. Accept or reject individual tokens + +### Stage 3: Export + +1. Review final token set +2. Export as JSON +3. Import into Figma via Tokens Studio or your plugin + +## 📁 Project Structure + +``` +design-system-extractor/ +├── app.py # Main Gradio application +├── requirements.txt +├── README.md +│ +├── config/ +│ ├── .env.example # Environment template +│ ├── agents.yaml # Agent personas & settings +│ └── settings.py # Configuration loader +│ +├── agents/ +│ ├── state.py # LangGraph state definitions +│ ├── graph.py # Workflow orchestration +│ ├── crawler.py # Agent 1: Page discovery +│ ├── extractor.py # Agent 1: Token extraction +│ ├── normalizer.py # Agent 2: Normalization +│ ├── advisor.py # Agent 3: Best practices +│ └── generator.py # Agent 4: JSON generation +│ +├── core/ +│ ├── token_schema.py # Pydantic data models +│ └── color_utils.py # Color analysis utilities +│ +├── ui/ +│ └── (Gradio components) +│ +└── docs/ + └── CONTEXT.md # Context file for AI assistance +``` + +## 🔧 Configuration + +### Environment Variables + +```env +# Required +ANTHROPIC_API_KEY=your_key_here + +# Optional +DEBUG=false +LOG_LEVEL=INFO +BROWSER_HEADLESS=true +``` + +### Agent Configuration + +Agent personas and behavior are defined in `config/agents.yaml`. This includes: + +- Extraction targets (colors, typography, spacing) +- Naming conventions +- Confidence thresholds +- Upgrade options + +## 🛠️ Development + +### Running Tests + +```bash +pytest tests/ +``` + +### Adding New Features + +1. Update token schema in `core/token_schema.py` +2. Add agent logic in `agents/` +3. Update UI in `app.py` +4. 
Update `docs/CONTEXT.md` for AI assistance + +## 📦 Output Format + +Tokens are exported in a platform-agnostic JSON format: + +```json +{ + "metadata": { + "source_url": "https://example.com", + "version": "v1-recovered", + "viewport": "desktop" + }, + "colors": { + "primary-500": { + "value": "#007bff", + "source": "detected", + "contrast_white": 4.5 + } + }, + "typography": { + "heading-lg": { + "fontFamily": "Inter", + "fontSize": "24px", + "fontWeight": 700 + } + }, + "spacing": { + "md": { + "value": "16px", + "source": "detected" + } + } +} +``` + +## 🤝 Contributing + +Contributions are welcome! Please read the contribution guidelines first. + +## 📄 License + +MIT + +--- + +Built with ❤️ for designers who've lost their source files. diff --git a/agents/__init__.py b/agents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fd59657046087316d92371d66e69a4348919211a --- /dev/null +++ b/agents/__init__.py @@ -0,0 +1,76 @@ +""" +Agents for Design System Extractor v2. 
+ +This package contains: +- Stage 1 Agents: Crawler, Extractor, Normalizer, Semantic Analyzer +- Stage 2 Agents: Rule Engine, Benchmark Researcher, LLM Analysis Agents +- Workflow Graphs: LangGraph orchestration +""" + +# Stage 2 components (no langgraph dependency) +from agents.benchmark_researcher import ( + BenchmarkResearcher, + BenchmarkCache, + DESIGN_SYSTEM_SOURCES, + FALLBACK_BENCHMARKS, + get_available_benchmarks, + get_benchmark_choices, + BenchmarkData, + BenchmarkComparison, +) + +try: + from agents.llm_agents import ( + BrandIdentifierAgent, + BenchmarkAdvisorAgent, + BestPracticesValidatorAgent, + HeadSynthesizerAgent, + BrandIdentification, + BenchmarkAdvice, + BestPracticesResult, + HeadSynthesis, + ) +except ImportError: + BrandIdentifierAgent = None + BenchmarkAdvisorAgent = None + BestPracticesValidatorAgent = None + HeadSynthesizerAgent = None + BrandIdentification = None + BenchmarkAdvice = None + BestPracticesResult = None + HeadSynthesis = None + +# Lazy imports for langgraph-dependent modules +def get_state_module(): + """Lazy import for state module (requires langgraph).""" + from agents import state as state_module + return state_module + +def get_graph_module(): + """Lazy import for graph module (requires langgraph).""" + from agents import graph as graph_module + return graph_module + +__all__ = [ + # Benchmark Research + "BenchmarkResearcher", + "BenchmarkCache", + "DESIGN_SYSTEM_SOURCES", + "FALLBACK_BENCHMARKS", + "get_available_benchmarks", + "get_benchmark_choices", + "BenchmarkData", + "BenchmarkComparison", + # LLM Agents + "BrandIdentifierAgent", + "BenchmarkAdvisorAgent", + "BestPracticesValidatorAgent", + "HeadSynthesizerAgent", + "BrandIdentification", + "BenchmarkAdvice", + "BestPracticesResult", + "HeadSynthesis", + # Lazy loaders + "get_state_module", + "get_graph_module", +] diff --git a/agents/advisor.py b/agents/advisor.py new file mode 100644 index 
0000000000000000000000000000000000000000..4f2e4f50063db5b89a3abcc09af2c8d03e124f01 --- /dev/null +++ b/agents/advisor.py @@ -0,0 +1,697 @@ +""" +Agent 3: Design System Best Practices Advisor +Design System Extractor v2 + +Persona: Senior Staff Design Systems Architect + +Responsibilities: +- Analyze extracted tokens against best practices (Material, Polaris, Carbon) +- Propose upgrade OPTIONS with rationale (LLM-powered reasoning) +- Generate type scales, color ramps, spacing grids (Rule-based calculation) +- Never change: font families, primary/secondary base colors + +Hybrid Approach: +- LLM: Analyzes patterns, recommends options, explains rationale +- Rules: Calculates actual values (math-based) +""" + +import os +import json +from typing import Optional, Callable +from dataclasses import dataclass, field +from enum import Enum + +from core.token_schema import ( + NormalizedTokens, + ColorToken, + TypographyToken, + SpacingToken, + UpgradeOption, + UpgradeRecommendations, +) +from core.color_utils import ( + parse_color, + generate_color_ramp, + get_contrast_ratio, +) + + +# ============================================================================= +# TYPE SCALE CALCULATIONS (Rule-Based) +# ============================================================================= + +class TypeScaleRatio(Enum): + """Common type scale ratios.""" + MINOR_SECOND = 1.067 + MAJOR_SECOND = 1.125 + MINOR_THIRD = 1.200 + MAJOR_THIRD = 1.250 + PERFECT_FOURTH = 1.333 + AUGMENTED_FOURTH = 1.414 + PERFECT_FIFTH = 1.500 + + +def generate_type_scale(base_size: float, ratio: float, steps_up: int = 5, steps_down: int = 2) -> dict: + """ + Generate a type scale from a base size. 
+ + Args: + base_size: Base font size in pixels (e.g., 16) + ratio: Scale ratio (e.g., 1.25) + steps_up: Number of sizes larger than base + steps_down: Number of sizes smaller than base + + Returns: + Dict with size names and values + """ + scale = {} + + # Generate sizes below base + for i in range(steps_down, 0, -1): + size = base_size / (ratio ** i) + name = f"text.{['xs', 'sm'][steps_down - i] if i <= 2 else f'xs-{i}'}" + scale[name] = round(size) + + # Base size + scale["text.base"] = round(base_size) + + # Generate sizes above base + size_names = ["text.lg", "text.xl", "heading.sm", "heading.md", "heading.lg", "heading.xl", "heading.2xl", "display"] + for i in range(1, steps_up + 1): + size = base_size * (ratio ** i) + name = size_names[i - 1] if i <= len(size_names) else f"heading.{i}xl" + scale[name] = round(size) + + return scale + + +# ============================================================================= +# SPACING GRID CALCULATIONS (Rule-Based) +# ============================================================================= + +def snap_to_grid(value: float, base: int = 8) -> int: + """Snap a value to the nearest grid unit.""" + return round(value / base) * base + + +def generate_spacing_scale(base: int = 8, max_value: int = 96) -> dict: + """ + Generate a spacing scale based on a base unit. + + Args: + base: Base unit (4 or 8) + max_value: Maximum spacing value + + Returns: + Dict with spacing names and values + """ + scale = {} + multipliers = [0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24] + names = ["0.5", "1", "1.5", "2", "2.5", "3", "4", "5", "6", "8", "10", "12", "16", "20", "24"] + + for mult, name in zip(multipliers, names): + value = int(base * mult) + if value <= max_value: + scale[f"space.{name}"] = f"{value}px" + + return scale + + +def analyze_spacing_fit(detected_values: list[int], base: int = 8) -> dict: + """ + Analyze how well detected spacing values fit a grid. 
+ + Returns: + Dict with fit percentage and adjustments needed + """ + fits = 0 + adjustments = [] + + for value in detected_values: + snapped = snap_to_grid(value, base) + if value == snapped: + fits += 1 + else: + adjustments.append({ + "original": value, + "snapped": snapped, + "delta": snapped - value + }) + + return { + "base": base, + "fit_percentage": (fits / len(detected_values) * 100) if detected_values else 0, + "adjustments": adjustments, + "already_aligned": fits, + "needs_adjustment": len(adjustments) + } + + +# ============================================================================= +# COLOR RAMP GENERATION (Rule-Based) +# ============================================================================= + +def generate_semantic_color_ramp(base_color: str, role: str = "primary") -> dict: + """ + Generate a full color ramp from a base color. + + Args: + base_color: Hex color (e.g., "#373737") + role: Semantic role (primary, secondary, neutral, etc.) + + Returns: + Dict with shade names (50-900) and hex values + """ + ramp = generate_color_ramp(base_color) + + result = {} + shades = ["50", "100", "200", "300", "400", "500", "600", "700", "800", "900"] + + for shade, color in zip(shades, ramp): + result[f"{role}.{shade}"] = color + + return result + + +# ============================================================================= +# LLM-POWERED ANALYSIS (Agent 3 Brain) +# ============================================================================= + +class DesignSystemAdvisor: + """ + Agent 3: Analyzes tokens and proposes upgrades. + + Uses LLM for reasoning and recommendations. + Uses rules for calculating actual values. 
+ """ + + def __init__(self, log_callback: Optional[Callable[[str], None]] = None): + self.log = log_callback or print + self.hf_token = os.getenv("HF_TOKEN", "") + self.model = os.getenv("AGENT3_MODEL", "meta-llama/Llama-3.1-70B-Instruct") + + async def analyze( + self, + desktop_tokens: NormalizedTokens, + mobile_tokens: NormalizedTokens, + ) -> UpgradeRecommendations: + """ + Analyze tokens and generate upgrade recommendations. + + Args: + desktop_tokens: Normalized desktop tokens + mobile_tokens: Normalized mobile tokens + + Returns: + UpgradeRecommendations with options for each category + """ + self.log("🤖 Agent 3: Starting design system analysis...") + + # Gather token statistics + stats = self._gather_statistics(desktop_tokens, mobile_tokens) + self.log(f"📊 Gathered statistics: {len(stats['colors'])} colors, {len(stats['typography'])} typography, {len(stats['spacing'])} spacing") + + # Generate rule-based options first + self.log("🔧 Generating rule-based options...") + type_scale_options = self._generate_type_scale_options(stats) + spacing_options = self._generate_spacing_options(stats) + color_ramp_options = self._generate_color_ramp_options(stats) + + # Get LLM analysis and recommendations + self.log(f"🤖 Calling LLM ({self.model}) for analysis...") + llm_analysis = await self._get_llm_analysis(stats, type_scale_options, spacing_options) + + # Apply LLM recommendations to options + self._apply_llm_recommendations(type_scale_options, spacing_options, color_ramp_options, llm_analysis) + + self.log("✅ Analysis complete!") + + return UpgradeRecommendations( + typography_scales=type_scale_options, + spacing_systems=spacing_options, + color_ramps=color_ramp_options, + naming_conventions=[], + llm_rationale=llm_analysis.get("rationale", ""), + detected_patterns=llm_analysis.get("patterns", []), + brand_analysis=llm_analysis.get("brand_analysis", []), + color_observations=llm_analysis.get("color_observations", ""), + 
accessibility_issues=llm_analysis.get("accessibility_issues", []), + ) + + def _gather_statistics(self, desktop: NormalizedTokens, mobile: NormalizedTokens) -> dict: + """Gather statistics from tokens for analysis.""" + + # Combine colors (colors are viewport-agnostic) + colors = {} + for name, token in desktop.colors.items(): + colors[token.value] = { + "value": token.value, + "frequency": token.frequency, + "contexts": token.contexts, + "suggested_name": token.suggested_name, + } + + # Typography (viewport-specific) + typography = { + "desktop": [], + "mobile": [], + } + for name, token in desktop.typography.items(): + typography["desktop"].append({ + "font_family": token.font_family, + "font_size": token.font_size, + "font_weight": token.font_weight, + "frequency": token.frequency, + }) + for name, token in mobile.typography.items(): + typography["mobile"].append({ + "font_family": token.font_family, + "font_size": token.font_size, + "font_weight": token.font_weight, + "frequency": token.frequency, + }) + + # Spacing + spacing = { + "desktop": [], + "mobile": [], + } + for name, token in desktop.spacing.items(): + spacing["desktop"].append(token.value_px) + for name, token in mobile.spacing.items(): + spacing["mobile"].append(token.value_px) + + # Find most used font family + font_families = {} + for t in typography["desktop"]: + family = t["font_family"] + font_families[family] = font_families.get(family, 0) + t["frequency"] + + primary_font = max(font_families.items(), key=lambda x: x[1])[0] if font_families else "sans-serif" + + # Find base font size (most frequent in body context) + font_sizes = [self._parse_size(t["font_size"]) for t in typography["desktop"]] + base_font_size = 16 # Default + if font_sizes: + # Find most common size between 14-18px (typical body text) + body_sizes = [s for s in font_sizes if 14 <= s <= 18] + if body_sizes: + base_font_size = max(set(body_sizes), key=body_sizes.count) + + return { + "colors": colors, + "typography": 
typography, + "spacing": spacing, + "primary_font": primary_font, + "base_font_size": base_font_size, + "all_font_sizes": list(set(font_sizes)), + } + + def _parse_size(self, size_str: str) -> float: + """Parse a size string to pixels.""" + if not size_str: + return 16 + size_str = str(size_str).lower().strip() + if "px" in size_str: + return float(size_str.replace("px", "")) + if "rem" in size_str: + return float(size_str.replace("rem", "")) * 16 + if "em" in size_str: + return float(size_str.replace("em", "")) * 16 + try: + return float(size_str) + except: + return 16 + + def _generate_type_scale_options(self, stats: dict) -> list[UpgradeOption]: + """Generate type scale options.""" + base = stats["base_font_size"] + options = [] + + ratios = [ + ("minor_third", 1.200, "Conservative — subtle size differences"), + ("major_third", 1.250, "Balanced — clear hierarchy without extremes"), + ("perfect_fourth", 1.333, "Bold — strong visual hierarchy"), + ] + + for id_name, ratio, desc in ratios: + scale = generate_type_scale(base, ratio) + options.append(UpgradeOption( + id=f"type_scale_{id_name}", + name=f"Type Scale {ratio}", + description=desc, + category="typography", + values={ + "ratio": ratio, + "base": base, + "scale": scale, + }, + pros=[ + f"Based on {base}px base (detected)", + f"Ratio {ratio} is industry standard", + ], + cons=[], + effort="low", + recommended=False, + )) + + # Add "keep original" option + options.append(UpgradeOption( + id="type_scale_keep", + name="Keep Original", + description="Preserve detected font sizes without scaling", + category="typography", + values={ + "ratio": None, + "base": base, + "scale": {f"size_{i}": s for i, s in enumerate(stats["all_font_sizes"])}, + }, + pros=["No changes needed", "Preserves original design"], + cons=["May have inconsistent scale"], + effort="none", + recommended=False, + )) + + return options + + def _generate_spacing_options(self, stats: dict) -> list[UpgradeOption]: + """Generate spacing system 
options.""" + desktop_spacing = stats["spacing"]["desktop"] + + options = [] + + for base in [8, 4]: + fit_analysis = analyze_spacing_fit(desktop_spacing, base) + scale = generate_spacing_scale(base) + + options.append(UpgradeOption( + id=f"spacing_{base}px", + name=f"{base}px Base Grid", + description=f"{'Modern standard' if base == 8 else 'Finer control'} — {fit_analysis['fit_percentage']:.0f}% of your values already fit", + category="spacing", + values={ + "base": base, + "scale": scale, + "fit_analysis": fit_analysis, + }, + pros=[ + f"{fit_analysis['already_aligned']} values already aligned", + "Consistent visual rhythm" if base == 8 else "More granular control", + ], + cons=[ + f"{fit_analysis['needs_adjustment']} values need adjustment" if fit_analysis['needs_adjustment'] > 0 else None, + ], + effort="low" if fit_analysis['fit_percentage'] > 70 else "medium", + recommended=False, + )) + + # Add "keep original" option + options.append(UpgradeOption( + id="spacing_keep", + name="Keep Original", + description="Preserve detected spacing values", + category="spacing", + values={ + "base": None, + "scale": {f"space_{v}": f"{v}px" for v in desktop_spacing}, + }, + pros=["No changes needed"], + cons=["May have irregular spacing"], + effort="none", + recommended=False, + )) + + return options + + def _generate_color_ramp_options(self, stats: dict) -> list[UpgradeOption]: + """Generate color ramp options.""" + options = [] + + # Find primary colors (high frequency, used in text/background) + primary_candidates = [] + for hex_val, data in stats["colors"].items(): + if data["frequency"] > 10: + primary_candidates.append((hex_val, data)) + + # Sort by frequency + primary_candidates.sort(key=lambda x: -x[1]["frequency"]) + + # Generate ramps for top colors + for hex_val, data in primary_candidates[:5]: + role = self._infer_color_role(data) + ramp = generate_semantic_color_ramp(hex_val, role) + + options.append(UpgradeOption( + id=f"color_ramp_{role}", + 
name=f"{role.title()} Ramp", + description=f"Generate 50-900 shades from {hex_val}", + category="colors", + values={ + "base_color": hex_val, + "role": role, + "ramp": ramp, + "preserve_base": True, + }, + pros=[ + f"Base color {hex_val} preserved", + "Full shade range for UI states", + "AA contrast compliant", + ], + cons=[], + effort="low", + recommended=True, + )) + + return options + + def _infer_color_role(self, color_data: dict) -> str: + """Infer semantic role from color context.""" + contexts = " ".join(color_data.get("contexts", [])).lower() + + if "primary" in contexts or "brand" in contexts: + return "primary" + if "secondary" in contexts or "accent" in contexts: + return "secondary" + if "background" in contexts or "surface" in contexts: + return "surface" + if "text" in contexts or "foreground" in contexts: + return "text" + if "border" in contexts or "divider" in contexts: + return "border" + if "success" in contexts or "green" in contexts: + return "success" + if "error" in contexts or "red" in contexts: + return "error" + if "warning" in contexts or "yellow" in contexts: + return "warning" + + return "neutral" + + async def _get_llm_analysis(self, stats: dict, type_options: list, spacing_options: list) -> dict: + """Get LLM analysis and recommendations.""" + + if not self.hf_token: + self.log("⚠️ No HF token, using default recommendations") + return self._get_default_recommendations(stats, type_options, spacing_options) + + try: + from core.hf_inference import HFInferenceClient + + # HFInferenceClient gets token from settings/env + client = HFInferenceClient() + + # Build prompt + prompt = self._build_analysis_prompt(stats, type_options, spacing_options) + + self.log("📤 Sending analysis request to LLM...") + + # Use the agent-specific complete method + response = await client.complete_async( + agent_name="advisor", + system_prompt="You are a Senior Design Systems Architect analyzing design tokens.", + user_message=prompt, + max_tokens=1500, + ) + + 
self.log("📥 Received LLM response") + + # Parse LLM response + return self._parse_llm_response(response) + + except Exception as e: + self.log(f"⚠️ LLM error: {str(e)}, using default recommendations") + return self._get_default_recommendations(stats, type_options, spacing_options) + + def _build_analysis_prompt(self, stats: dict, type_options: list, spacing_options: list) -> str: + """Build the prompt for LLM analysis.""" + + # Format colors + colors_str = "\n".join([ + f" - {data['value']}: frequency={data['frequency']}, contexts={data['contexts'][:3]}" + for hex_val, data in list(stats['colors'].items())[:10] + ]) + + # Format typography + typo_str = "\n".join([ + f" - {t['font_family']} {t['font_size']} (weight: {t['font_weight']}, freq: {t['frequency']})" + for t in stats['typography']['desktop'][:10] + ]) + + # Format spacing + spacing_str = f"Desktop: {sorted(stats['spacing']['desktop'])[:15]}" + + return f"""You are a Senior Design Systems Architect. Analyze these extracted design tokens and provide recommendations based on industry best practices. + +## EXTRACTED TOKENS + +### Colors (top 10 by frequency): +{colors_str} + +### Typography: +Primary font: {stats['primary_font']} +Base size: {stats['base_font_size']}px +{typo_str} + +### Spacing: +{spacing_str} + +## YOUR TASK + +Research and compare against these top design systems: +1. **Material Design 3** (Google) - Type scale, spacing grid, color system +2. **Apple Human Interface Guidelines** - Typography scale, spacing +3. **Shopify Polaris** - Type scale ratios, spacing system +4. **IBM Carbon** - Type tokens, spacing tokens +5. **Atlassian Design System** - Typography, spacing patterns + +For each, note: +- Type scale ratio used +- Base font size +- Spacing grid (4px or 8px) +- Key observations + +Then recommend: +1. Which TYPE SCALE ratio (1.2, 1.25, or 1.333) best matches this site's existing design? +2. Which SPACING BASE (4px or 8px) fits better? +3. 
Any ACCESSIBILITY concerns with the detected colors? + +Respond in this JSON format: +{{ + "brand_analysis": [ + {{"brand": "Material Design 3", "ratio": 1.2, "base": 16, "spacing": "8px", "notes": "..."}}, + {{"brand": "Apple HIG", "ratio": 1.19, "base": 17, "spacing": "4px", "notes": "..."}}, + {{"brand": "Shopify Polaris", "ratio": 1.25, "base": 16, "spacing": "4px", "notes": "..."}}, + {{"brand": "IBM Carbon", "ratio": 1.25, "base": 14, "spacing": "8px", "notes": "..."}}, + {{"brand": "Atlassian", "ratio": 1.14, "base": 14, "spacing": "8px", "notes": "..."}} + ], + "recommended_type_scale": "minor_third|major_third|perfect_fourth|keep", + "recommended_spacing": "8px|4px|keep", + "rationale": "Detailed explanation comparing the extracted tokens to the brand analysis...", + "color_observations": "Analysis of the color palette compared to industry standards...", + "accessibility_issues": ["issue 1", "issue 2"] +}}""" + + def _parse_llm_response(self, response: str) -> dict: + """Parse LLM response into structured recommendations.""" + try: + # Try to extract JSON from response + import re + json_match = re.search(r'\{[\s\S]*\}', response) + if json_match: + parsed = json.loads(json_match.group()) + # Ensure all expected fields exist + parsed.setdefault("brand_analysis", []) + parsed.setdefault("recommended_type_scale", "major_third") + parsed.setdefault("recommended_spacing", "8px") + parsed.setdefault("rationale", "") + parsed.setdefault("color_observations", "") + parsed.setdefault("accessibility_issues", []) + return parsed + except Exception as e: + self.log(f" JSON parse error: {str(e)}") + + # Default if parsing fails + return self._get_default_recommendations({}, [], []) + + def _get_default_recommendations(self, stats: dict, type_options: list, spacing_options: list) -> dict: + """Get default recommendations without LLM.""" + + # Default brand analysis (rule-based knowledge) + brand_analysis = [ + {"brand": "Material Design 3", "ratio": 1.2, "base": 16, 
"spacing": "8px", + "notes": "Google's design system uses Major Second (1.125) to Minor Third (1.2) scales"}, + {"brand": "Apple HIG", "ratio": 1.19, "base": 17, "spacing": "4px", + "notes": "Apple uses SF Pro with dynamic type scaling, 4pt grid"}, + {"brand": "Shopify Polaris", "ratio": 1.25, "base": 16, "spacing": "4px", + "notes": "Polaris uses Major Third (1.25) with 4px spacing unit"}, + {"brand": "IBM Carbon", "ratio": 1.25, "base": 14, "spacing": "8px", + "notes": "Carbon uses productive (14px) and expressive (16px) type sets"}, + {"brand": "Atlassian", "ratio": 1.14, "base": 14, "spacing": "8px", + "notes": "Atlassian uses a compact scale for dense interfaces"}, + ] + + # Recommend based on fit analysis if available + spacing_8_fit = 0 + spacing_4_fit = 0 + for opt in spacing_options: + if opt and hasattr(opt, 'id'): + if opt.id == "spacing_8px": + spacing_8_fit = opt.values.get("fit_analysis", {}).get("fit_percentage", 0) + elif opt.id == "spacing_4px": + spacing_4_fit = opt.values.get("fit_analysis", {}).get("fit_percentage", 0) + + return { + "brand_analysis": brand_analysis, + "recommended_type_scale": "major_third", + "recommended_spacing": "8px" if spacing_8_fit >= spacing_4_fit else "4px", + "rationale": "Based on industry analysis: Major Third (1.25) type scale is the most commonly used ratio across modern design systems including Shopify Polaris and IBM Carbon. The 8px spacing grid is the modern standard used by Material Design and most enterprise design systems, providing a good balance between flexibility and consistency.", + "color_observations": "The detected color palette shows a neutral-heavy design with good contrast potential. 
Consider generating full color ramps for better UI state coverage (hover, active, disabled states).", + "accessibility_issues": [], + } + + def _apply_llm_recommendations( + self, + type_options: list[UpgradeOption], + spacing_options: list[UpgradeOption], + color_options: list[UpgradeOption], + llm_analysis: dict + ): + """Apply LLM recommendations to options.""" + + # Mark recommended type scale + rec_type = llm_analysis.get("recommended_type_scale", "major_third") + for opt in type_options: + if rec_type in opt.id: + opt.recommended = True + opt.description += " ⭐ LLM Recommended" + + # Mark recommended spacing + rec_spacing = llm_analysis.get("recommended_spacing", "8px") + for opt in spacing_options: + if rec_spacing.replace("px", "") in opt.id: + opt.recommended = True + opt.description += " ⭐ LLM Recommended" + + +# ============================================================================= +# CONVENIENCE FUNCTIONS +# ============================================================================= + +async def analyze_design_system( + desktop_tokens: NormalizedTokens, + mobile_tokens: NormalizedTokens, + log_callback: Optional[Callable[[str], None]] = None +) -> UpgradeRecommendations: + """ + Convenience function to analyze a design system. + + Args: + desktop_tokens: Normalized desktop tokens + mobile_tokens: Normalized mobile tokens + log_callback: Optional callback for logging + + Returns: + UpgradeRecommendations + """ + advisor = DesignSystemAdvisor(log_callback=log_callback) + return await advisor.analyze(desktop_tokens, mobile_tokens) diff --git a/agents/benchmark_researcher.py b/agents/benchmark_researcher.py new file mode 100644 index 0000000000000000000000000000000000000000..7aa9b772f11806b0903651f256142f74fe0f174c --- /dev/null +++ b/agents/benchmark_researcher.py @@ -0,0 +1,717 @@ +""" +Benchmark Research Agent +========================= +Fetches LIVE data from design system documentation sites +using Firecrawl, with 24-hour caching. 
+ +This agent: +1. Fetches official documentation from design system sites +2. Extracts typography, spacing, color specifications using LLM +3. Caches results for 24 hours +4. Compares user's tokens to researched benchmarks +""" + +import asyncio +import json +import os +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Optional, Callable +import hashlib + + +# ============================================================================= +# DESIGN SYSTEM SOURCES (Official Documentation URLs) +# ============================================================================= + +DESIGN_SYSTEM_SOURCES = { + "material_design_3": { + "name": "Material Design 3", + "short_name": "Material 3", + "vendor": "Google", + "urls": { + "typography": "https://m3.material.io/styles/typography/type-scale-tokens", + "spacing": "https://m3.material.io/foundations/layout/understanding-layout/spacing", + "colors": "https://m3.material.io/styles/color/the-color-system/key-colors-tones", + }, + "best_for": ["Android apps", "Web apps", "Enterprise software"], + "icon": "🟢", + }, + "apple_hig": { + "name": "Apple Human Interface Guidelines", + "short_name": "Apple HIG", + "vendor": "Apple", + "urls": { + "typography": "https://developer.apple.com/design/human-interface-guidelines/typography", + "spacing": "https://developer.apple.com/design/human-interface-guidelines/layout", + }, + "best_for": ["iOS apps", "macOS apps", "Premium consumer products"], + "icon": "🍎", + }, + "shopify_polaris": { + "name": "Shopify Polaris", + "short_name": "Polaris", + "vendor": "Shopify", + "urls": { + "typography": "https://polaris.shopify.com/design/typography", + "spacing": "https://polaris.shopify.com/design/spacing", + "colors": "https://polaris.shopify.com/design/colors", + }, + "best_for": ["E-commerce", "Admin dashboards", "Merchant tools"], + "icon": "🛒", + }, + "atlassian_design": { + "name": "Atlassian Design System", + "short_name": "Atlassian", + 
"vendor": "Atlassian", + "urls": { + "typography": "https://atlassian.design/foundations/typography", + "spacing": "https://atlassian.design/foundations/spacing", + "colors": "https://atlassian.design/foundations/color", + }, + "best_for": ["Productivity tools", "Dense interfaces", "Enterprise B2B"], + "icon": "🔵", + }, + "ibm_carbon": { + "name": "IBM Carbon Design System", + "short_name": "Carbon", + "vendor": "IBM", + "urls": { + "typography": "https://carbondesignsystem.com/guidelines/typography/overview", + "spacing": "https://carbondesignsystem.com/guidelines/spacing/overview", + "colors": "https://carbondesignsystem.com/guidelines/color/overview", + }, + "best_for": ["Enterprise software", "Data-heavy applications", "IBM products"], + "icon": "🔷", + }, + "tailwind_css": { + "name": "Tailwind CSS", + "short_name": "Tailwind", + "vendor": "Tailwind Labs", + "urls": { + "typography": "https://tailwindcss.com/docs/font-size", + "spacing": "https://tailwindcss.com/docs/customizing-spacing", + "colors": "https://tailwindcss.com/docs/customizing-colors", + }, + "best_for": ["Web applications", "Startups", "Rapid prototyping"], + "icon": "🌊", + }, + "ant_design": { + "name": "Ant Design", + "short_name": "Ant Design", + "vendor": "Ant Group", + "urls": { + "typography": "https://ant.design/docs/spec/font", + "spacing": "https://ant.design/docs/spec/layout", + "colors": "https://ant.design/docs/spec/colors", + }, + "best_for": ["Enterprise B2B", "Admin panels", "Chinese market"], + "icon": "🐜", + }, + "chakra_ui": { + "name": "Chakra UI", + "short_name": "Chakra", + "vendor": "Chakra UI", + "urls": { + "typography": "https://chakra-ui.com/docs/styled-system/theme#typography", + "spacing": "https://chakra-ui.com/docs/styled-system/theme#spacing", + "colors": "https://chakra-ui.com/docs/styled-system/theme#colors", + }, + "best_for": ["React applications", "Startups", "Accessible products"], + "icon": "⚡", + }, +} + + +# 
============================================================================= +# DATA CLASSES +# ============================================================================= + +@dataclass +class BenchmarkData: + """Researched benchmark data from a design system.""" + key: str + name: str + short_name: str + vendor: str + icon: str + + # Extracted specifications + typography: dict = field(default_factory=dict) + # Expected: {scale_ratio, base_size, sizes[], font_family, line_height_body} + + spacing: dict = field(default_factory=dict) + # Expected: {base, scale[], grid} + + colors: dict = field(default_factory=dict) + # Expected: {palette_size, uses_ramps, ramp_steps} + + # Metadata + fetched_at: str = "" + confidence: str = "low" # high, medium, low + source_urls: list = field(default_factory=list) + best_for: list = field(default_factory=list) + + def to_dict(self) -> dict: + return { + "key": self.key, + "name": self.name, + "short_name": self.short_name, + "vendor": self.vendor, + "icon": self.icon, + "typography": self.typography, + "spacing": self.spacing, + "colors": self.colors, + "fetched_at": self.fetched_at, + "confidence": self.confidence, + "best_for": self.best_for, + } + + +@dataclass +class BenchmarkComparison: + """Comparison result between user's tokens and a benchmark.""" + benchmark: BenchmarkData + similarity_score: float # Lower = more similar + + # Individual comparisons + type_ratio_diff: float + base_size_diff: int + spacing_grid_diff: int + + # Match percentages + type_match_pct: float + spacing_match_pct: float + overall_match_pct: float + + def to_dict(self) -> dict: + return { + "name": self.benchmark.name, + "short_name": self.benchmark.short_name, + "icon": self.benchmark.icon, + "similarity_score": round(self.similarity_score, 2), + "overall_match_pct": round(self.overall_match_pct, 1), + "comparison": { + "type_ratio": { + "diff": round(self.type_ratio_diff, 3), + "match_pct": round(self.type_match_pct, 1), + }, + "base_size": { + 
"diff": self.base_size_diff, + }, + "spacing_grid": { + "diff": self.spacing_grid_diff, + "match_pct": round(self.spacing_match_pct, 1), + }, + }, + "benchmark_values": { + "type_ratio": self.benchmark.typography.get("scale_ratio"), + "base_size": self.benchmark.typography.get("base_size"), + "spacing_grid": self.benchmark.spacing.get("base"), + }, + "best_for": self.benchmark.best_for, + "confidence": self.benchmark.confidence, + } + + +# ============================================================================= +# CACHE MANAGER +# ============================================================================= + +class BenchmarkCache: + """Manages 24-hour caching of benchmark research results.""" + + def __init__(self, cache_dir: str = None): + if cache_dir is None: + cache_dir = os.path.join(os.path.dirname(__file__), "..", "storage") + self.cache_file = os.path.join(cache_dir, "benchmark_cache.json") + self._ensure_cache_dir() + + def _ensure_cache_dir(self): + """Ensure cache directory exists.""" + os.makedirs(os.path.dirname(self.cache_file), exist_ok=True) + + def _load_cache(self) -> dict: + """Load cache from file.""" + if os.path.exists(self.cache_file): + try: + with open(self.cache_file, 'r') as f: + return json.load(f) + except Exception: + return {} + return {} + + def _save_cache(self, cache: dict): + """Save cache to file.""" + try: + with open(self.cache_file, 'w') as f: + json.dump(cache, f, indent=2) + except Exception: + pass + + def get(self, key: str) -> Optional[BenchmarkData]: + """Get cached benchmark if valid (< 24 hours old).""" + cache = self._load_cache() + + if key not in cache: + return None + + entry = cache[key] + fetched_at = datetime.fromisoformat(entry.get("fetched_at", "2000-01-01")) + + # Check if expired (24 hours) + if datetime.now() - fetched_at > timedelta(hours=24): + return None + + # Reconstruct BenchmarkData + source = DESIGN_SYSTEM_SOURCES.get(key, {}) + return BenchmarkData( + key=key, + name=entry.get("name", 
source.get("name", key)), + short_name=entry.get("short_name", source.get("short_name", key)), + vendor=entry.get("vendor", source.get("vendor", "")), + icon=entry.get("icon", source.get("icon", "📦")), + typography=entry.get("typography", {}), + spacing=entry.get("spacing", {}), + colors=entry.get("colors", {}), + fetched_at=entry.get("fetched_at", ""), + confidence=entry.get("confidence", "low"), + source_urls=entry.get("source_urls", []), + best_for=entry.get("best_for", source.get("best_for", [])), + ) + + def set(self, key: str, data: BenchmarkData): + """Cache benchmark data.""" + cache = self._load_cache() + cache[key] = data.to_dict() + self._save_cache(cache) + + def get_cache_status(self) -> dict: + """Get status of all cached items.""" + cache = self._load_cache() + status = {} + + for key in DESIGN_SYSTEM_SOURCES.keys(): + if key in cache: + fetched_at = datetime.fromisoformat(cache[key].get("fetched_at", "2000-01-01")) + age_hours = (datetime.now() - fetched_at).total_seconds() / 3600 + is_valid = age_hours < 24 + status[key] = { + "cached": True, + "valid": is_valid, + "age_hours": round(age_hours, 1), + } + else: + status[key] = {"cached": False, "valid": False} + + return status + + +# ============================================================================= +# FALLBACK DATA (Used when research fails) +# ============================================================================= + +FALLBACK_BENCHMARKS = { + "material_design_3": { + "typography": {"scale_ratio": 1.2, "base_size": 16, "font_family": "Roboto", "line_height_body": 1.5}, + "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 48, 64], "grid": "8px"}, + "colors": {"palette_size": 13, "uses_ramps": True}, + }, + "apple_hig": { + "typography": {"scale_ratio": 1.19, "base_size": 17, "font_family": "SF Pro", "line_height_body": 1.47}, + "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40], "grid": "4px"}, + "colors": {"palette_size": 9, "uses_ramps": True}, + }, + 
"shopify_polaris": { + "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "Inter", "line_height_body": 1.5}, + "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64], "grid": "4px"}, + "colors": {"palette_size": 11, "uses_ramps": True}, + }, + "atlassian_design": { + "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "Inter", "line_height_body": 1.43}, + "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"}, + "colors": {"palette_size": 15, "uses_ramps": True}, + }, + "ibm_carbon": { + "typography": {"scale_ratio": 1.25, "base_size": 14, "font_family": "IBM Plex Sans", "line_height_body": 1.5}, + "spacing": {"base": 8, "scale": [0, 2, 4, 8, 12, 16, 24, 32, 40, 48], "grid": "8px"}, + "colors": {"palette_size": 12, "uses_ramps": True}, + }, + "tailwind_css": { + "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5}, + "spacing": {"base": 4, "scale": [0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32], "grid": "4px"}, + "colors": {"palette_size": 22, "uses_ramps": True}, + }, + "ant_design": { + "typography": {"scale_ratio": 1.14, "base_size": 14, "font_family": "system-ui", "line_height_body": 1.57}, + "spacing": {"base": 8, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48], "grid": "8px"}, + "colors": {"palette_size": 13, "uses_ramps": True}, + }, + "chakra_ui": { + "typography": {"scale_ratio": 1.25, "base_size": 16, "font_family": "system-ui", "line_height_body": 1.5}, + "spacing": {"base": 4, "scale": [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 56, 64], "grid": "4px"}, + "colors": {"palette_size": 15, "uses_ramps": True}, + }, +} + + +# ============================================================================= +# BENCHMARK RESEARCHER +# ============================================================================= + +class BenchmarkResearcher: + """ + Research agent that fetches live design system specifications. 
+ + Uses Firecrawl to fetch documentation and LLM to extract specs. + Results are cached for 24 hours. + """ + + def __init__(self, firecrawl_client=None, hf_client=None): + """ + Initialize researcher. + + Args: + firecrawl_client: Firecrawl API client for fetching docs + hf_client: HuggingFace client for LLM extraction + """ + self.firecrawl = firecrawl_client + self.hf_client = hf_client + self.cache = BenchmarkCache() + + async def research_benchmark( + self, + system_key: str, + log_callback: Callable = None, + force_refresh: bool = False, + ) -> BenchmarkData: + """ + Research a specific design system. + + Args: + system_key: Key from DESIGN_SYSTEM_SOURCES + log_callback: Function to log progress + force_refresh: Bypass cache and fetch fresh + + Returns: + BenchmarkData with extracted specifications + """ + def log(msg: str): + if log_callback: + log_callback(msg) + + if system_key not in DESIGN_SYSTEM_SOURCES: + raise ValueError(f"Unknown design system: {system_key}") + + source = DESIGN_SYSTEM_SOURCES[system_key] + + # Check cache first (unless force refresh) + if not force_refresh: + cached = self.cache.get(system_key) + if cached: + log(f" ├─ {source['icon']} {source['short_name']}: Using cached data ✅") + return cached + + log(f" ├─ {source['icon']} {source['short_name']}: Fetching documentation...") + + # Try to fetch and extract + raw_content = "" + confidence = "low" + + if self.firecrawl: + try: + # Fetch typography docs + typo_url = source["urls"].get("typography") + if typo_url: + log(f" │ ├─ Fetching {typo_url[:50]}...") + typo_content = await self._fetch_url(typo_url) + if typo_content: + raw_content += f"\n\n=== TYPOGRAPHY ===\n{typo_content[:4000]}" + confidence = "medium" + + # Fetch spacing docs + spacing_url = source["urls"].get("spacing") + if spacing_url: + log(f" │ ├─ Fetching spacing docs...") + spacing_content = await self._fetch_url(spacing_url) + if spacing_content: + raw_content += f"\n\n=== SPACING ===\n{spacing_content[:3000]}" + 
if confidence == "medium": + confidence = "high" + + except Exception as e: + log(f" │ ├─ ⚠️ Fetch error: {str(e)[:50]}") + + # Extract specs with LLM (or use fallback) + if raw_content and self.hf_client: + log(f" │ ├─ Extracting specifications...") + extracted = await self._extract_specs_with_llm(source["name"], raw_content) + else: + log(f" │ ├─ Using fallback data (fetch unavailable)") + extracted = FALLBACK_BENCHMARKS.get(system_key, {}) + confidence = "fallback" + + # Build result + result = BenchmarkData( + key=system_key, + name=source["name"], + short_name=source["short_name"], + vendor=source["vendor"], + icon=source["icon"], + typography=extracted.get("typography", FALLBACK_BENCHMARKS.get(system_key, {}).get("typography", {})), + spacing=extracted.get("spacing", FALLBACK_BENCHMARKS.get(system_key, {}).get("spacing", {})), + colors=extracted.get("colors", FALLBACK_BENCHMARKS.get(system_key, {}).get("colors", {})), + fetched_at=datetime.now().isoformat(), + confidence=confidence, + source_urls=list(source["urls"].values()), + best_for=source["best_for"], + ) + + # Cache result + self.cache.set(system_key, result) + + ratio = result.typography.get("scale_ratio", "?") + base = result.typography.get("base_size", "?") + grid = result.spacing.get("base", "?") + log(f" │ └─ ✅ ratio={ratio}, base={base}px, grid={grid}px [{confidence}]") + + return result + + async def _fetch_url(self, url: str) -> Optional[str]: + """Fetch URL content using Firecrawl.""" + if not self.firecrawl: + return None + + try: + # Firecrawl scrape + result = self.firecrawl.scrape_url( + url, + params={"formats": ["markdown"]} + ) + + if result and result.get("markdown"): + return result["markdown"] + elif result and result.get("content"): + return result["content"] + + except Exception as e: + pass + + return None + + async def _extract_specs_with_llm(self, system_name: str, raw_content: str) -> dict: + """Extract structured specs from documentation using LLM.""" + if not self.hf_client: 
+ return {} + + prompt = f"""Extract the design system specifications from this documentation. + +DESIGN SYSTEM: {system_name} + +DOCUMENTATION: +{raw_content[:6000]} + +Return ONLY a JSON object with these exact fields (use null if not found): +{{ + "typography": {{ + "scale_ratio": , + "base_size": , + "font_family": "", + "sizes": [], + "line_height_body": + }}, + "spacing": {{ + "base": , + "scale": [], + "grid": "" + }}, + "colors": {{ + "palette_size": , + "uses_ramps": + }} +}} + +Return ONLY valid JSON, no explanation.""" + + try: + response = await self.hf_client.complete_async( + agent_name="benchmark_extractor", + system_prompt="You are a design system specification extractor. Extract only the factual specifications.", + user_message=prompt, + max_tokens=600, + json_mode=True, + ) + + # Parse JSON from response + import re + json_match = re.search(r'\{[\s\S]*\}', response) + if json_match: + return json.loads(json_match.group()) + + except Exception as e: + pass + + return {} + + async def research_selected_benchmarks( + self, + selected_keys: list[str], + log_callback: Callable = None, + ) -> list[BenchmarkData]: + """ + Research multiple selected design systems. 
+ + Args: + selected_keys: List of system keys to research + log_callback: Function to log progress + + Returns: + List of BenchmarkData + """ + def log(msg: str): + if log_callback: + log_callback(msg) + + log("") + log("═" * 60) + log("🔬 LAYER 2: BENCHMARK RESEARCH (Firecrawl + Cache)") + log("═" * 60) + log("") + log(f" Selected systems: {', '.join(selected_keys)}") + log("") + + results = [] + + for key in selected_keys: + if key in DESIGN_SYSTEM_SOURCES: + try: + result = await self.research_benchmark(key, log_callback) + results.append(result) + except Exception as e: + log(f" ├─ ⚠️ Error researching {key}: {e}") + # Use fallback + source = DESIGN_SYSTEM_SOURCES[key] + fallback = FALLBACK_BENCHMARKS.get(key, {}) + results.append(BenchmarkData( + key=key, + name=source["name"], + short_name=source["short_name"], + vendor=source["vendor"], + icon=source["icon"], + typography=fallback.get("typography", {}), + spacing=fallback.get("spacing", {}), + colors=fallback.get("colors", {}), + fetched_at=datetime.now().isoformat(), + confidence="fallback", + best_for=source["best_for"], + )) + + log("") + log(f" ✅ Researched {len(results)}/{len(selected_keys)} design systems") + + return results + + def compare_to_benchmarks( + self, + your_ratio: float, + your_base_size: int, + your_spacing_grid: int, + benchmarks: list[BenchmarkData], + log_callback: Callable = None, + ) -> list[BenchmarkComparison]: + """ + Compare user's tokens to researched benchmarks. 
+ + Args: + your_ratio: Detected type scale ratio + your_base_size: Detected base font size + your_spacing_grid: Detected spacing grid base + benchmarks: List of researched BenchmarkData + log_callback: Function to log progress + + Returns: + List of BenchmarkComparison sorted by similarity + """ + def log(msg: str): + if log_callback: + log_callback(msg) + + log("") + log(" 📊 BENCHMARK COMPARISON") + log(" " + "─" * 40) + log(f" Your values: ratio={your_ratio:.2f}, base={your_base_size}px, grid={your_spacing_grid}px") + log("") + + comparisons = [] + + for b in benchmarks: + b_ratio = b.typography.get("scale_ratio", 1.25) + b_base = b.typography.get("base_size", 16) + b_grid = b.spacing.get("base", 8) + + # Calculate differences + ratio_diff = abs(your_ratio - b_ratio) + base_diff = abs(your_base_size - b_base) + grid_diff = abs(your_spacing_grid - b_grid) + + # Calculate match percentages + type_match = max(0, 100 - (ratio_diff * 100)) # 0.1 diff = 90% match + spacing_match = max(0, 100 - (grid_diff * 10)) # 4px diff = 60% match + + # Weighted similarity score (lower = more similar) + similarity = (ratio_diff * 10) + (base_diff * 0.5) + (grid_diff * 0.3) + + # Overall match percentage + overall_match = (type_match * 0.5) + (spacing_match * 0.3) + (100 - base_diff * 5) * 0.2 + overall_match = max(0, min(100, overall_match)) + + comparisons.append(BenchmarkComparison( + benchmark=b, + similarity_score=similarity, + type_ratio_diff=ratio_diff, + base_size_diff=base_diff, + spacing_grid_diff=grid_diff, + type_match_pct=type_match, + spacing_match_pct=spacing_match, + overall_match_pct=overall_match, + )) + + # Sort by similarity (lower = better) + comparisons.sort(key=lambda x: x.similarity_score) + + # Log results + medals = ["🥇", "🥈", "🥉"] + for i, c in enumerate(comparisons[:5]): + medal = medals[i] if i < 3 else " " + b = c.benchmark + log(f" {medal} {b.icon} {b.short_name}: {c.overall_match_pct:.0f}% match (score: {c.similarity_score:.2f})") + log(f" └─ 
ratio={b.typography.get('scale_ratio')}, base={b.typography.get('base_size')}px, grid={b.spacing.get('base')}px") + + return comparisons + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + +def get_available_benchmarks() -> list[dict]: + """Get list of available design systems for UI dropdown.""" + return [ + { + "key": key, + "name": source["name"], + "short_name": source["short_name"], + "icon": source["icon"], + "vendor": source["vendor"], + "best_for": source["best_for"], + } + for key, source in DESIGN_SYSTEM_SOURCES.items() + ] + + +def get_benchmark_choices() -> list[tuple[str, str]]: + """Get choices for Gradio dropdown.""" + return [ + (f"{source['icon']} {source['short_name']} ({source['vendor']})", key) + for key, source in DESIGN_SYSTEM_SOURCES.items() + ] diff --git a/agents/crawler.py b/agents/crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..6dd8e4351d2db3013ffbfc8b2b7bf11325dca34d --- /dev/null +++ b/agents/crawler.py @@ -0,0 +1,366 @@ +""" +Agent 1: Website Crawler +Design System Extractor v2 + +Persona: Meticulous Design Archaeologist + +Responsibilities: +- Auto-discover pages from base URL +- Classify page types (homepage, listing, detail, etc.) +- Prepare page list for user confirmation +""" + +import asyncio +import re +from urllib.parse import urljoin, urlparse +from typing import Optional, Callable +from datetime import datetime + +from playwright.async_api import async_playwright, Browser, Page, BrowserContext + +from core.token_schema import DiscoveredPage, PageType, Viewport +from config.settings import get_settings + + +class PageDiscoverer: + """ + Discovers pages from a website for design system extraction. + + This is the first part of Agent 1's job — finding pages before + the human confirms which ones to crawl. 
+ """ + + def __init__(self): + self.settings = get_settings() + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.visited_urls: set[str] = set() + self.discovered_pages: list[DiscoveredPage] = [] + + async def __aenter__(self): + """Async context manager entry.""" + await self._init_browser() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + await self._close_browser() + + async def _init_browser(self): + """Initialize Playwright browser.""" + playwright = await async_playwright().start() + self.browser = await playwright.chromium.launch( + headless=self.settings.browser.headless + ) + self.context = await self.browser.new_context( + viewport={ + "width": self.settings.viewport.desktop_width, + "height": self.settings.viewport.desktop_height, + }, + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" + ) + + async def _close_browser(self): + """Close browser and cleanup.""" + if self.context: + await self.context.close() + if self.browser: + await self.browser.close() + + def _normalize_url(self, url: str, base_url: str) -> Optional[str]: + """Normalize and validate URL.""" + # Handle relative URLs + if not url.startswith(('http://', 'https://')): + url = urljoin(base_url, url) + + parsed = urlparse(url) + base_parsed = urlparse(base_url) + + # Only allow same domain + if parsed.netloc != base_parsed.netloc: + return None + + # Remove fragments and normalize + normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}" + + # Remove trailing slash for consistency + if normalized.endswith('/') and len(normalized) > len(f"{parsed.scheme}://{parsed.netloc}/"): + normalized = normalized.rstrip('/') + + return normalized + + def _classify_page_type(self, url: str, title: str = "") -> PageType: + """ + Classify page type based on URL patterns and title. + + This is a heuristic — not perfect, but good enough for discovery. 
+ """ + url_lower = url.lower() + title_lower = title.lower() if title else "" + + # Check URL patterns + patterns = { + PageType.HOMEPAGE: [r'/$', r'/home$', r'/index'], + PageType.LISTING: [r'/products', r'/catalog', r'/list', r'/category', r'/collection', r'/search'], + PageType.DETAIL: [r'/product/', r'/item/', r'/detail/', r'/p/', r'/[a-z-]+/\d+'], + PageType.FORM: [r'/contact', r'/form', r'/apply', r'/submit', r'/register'], + PageType.AUTH: [r'/login', r'/signin', r'/signup', r'/auth', r'/account'], + PageType.CHECKOUT: [r'/cart', r'/checkout', r'/basket', r'/payment'], + PageType.MARKETING: [r'/landing', r'/promo', r'/campaign', r'/offer'], + PageType.ABOUT: [r'/about', r'/team', r'/company', r'/story'], + PageType.CONTACT: [r'/contact', r'/support', r'/help'], + } + + for page_type, url_patterns in patterns.items(): + for pattern in url_patterns: + if re.search(pattern, url_lower): + return page_type + + # Check title patterns + title_patterns = { + PageType.HOMEPAGE: ['home', 'welcome'], + PageType.LISTING: ['products', 'catalog', 'collection', 'browse'], + PageType.DETAIL: ['product', 'item'], + PageType.AUTH: ['login', 'sign in', 'sign up', 'register'], + PageType.ABOUT: ['about', 'our story', 'team'], + PageType.CONTACT: ['contact', 'get in touch', 'support'], + } + + for page_type, keywords in title_patterns.items(): + for keyword in keywords: + if keyword in title_lower: + return page_type + + return PageType.OTHER + + async def _extract_links(self, page: Page, base_url: str) -> list[str]: + """Extract all internal links from a page.""" + links = await page.evaluate(""" + () => { + const links = Array.from(document.querySelectorAll('a[href]')); + return links.map(a => a.href).filter(href => + href && + !href.startsWith('javascript:') && + !href.startsWith('mailto:') && + !href.startsWith('tel:') && + !href.includes('#') + ); + } + """) + + # Normalize and filter + valid_links = [] + for link in links: + normalized = self._normalize_url(link, 
base_url) + if normalized and normalized not in self.visited_urls: + valid_links.append(normalized) + + return list(set(valid_links)) + + async def _get_page_title(self, page: Page) -> str: + """Get page title.""" + try: + return await page.title() + except Exception: + return "" + + async def discover( + self, + base_url: str, + max_pages: int = None, + progress_callback: Optional[Callable[[float], None]] = None + ) -> list[DiscoveredPage]: + """ + Discover pages from a website. + + Args: + base_url: The starting URL + max_pages: Maximum pages to discover (default from settings) + progress_callback: Optional callback for progress updates + + Returns: + List of discovered pages + """ + max_pages = max_pages or self.settings.crawl.max_pages + + async with self: + # Start with homepage + normalized_base = self._normalize_url(base_url, base_url) + if not normalized_base: + raise ValueError(f"Invalid base URL: {base_url}") + + queue = [normalized_base] + self.visited_urls = set() + self.discovered_pages = [] + + while queue and len(self.discovered_pages) < max_pages: + current_url = queue.pop(0) + + if current_url in self.visited_urls: + continue + + self.visited_urls.add(current_url) + + try: + page = await self.context.new_page() + + # Navigate to page with more lenient settings + # Use 'domcontentloaded' instead of 'networkidle' for faster/more reliable loading + try: + await page.goto( + current_url, + wait_until="domcontentloaded", + timeout=60000 # 60 seconds + ) + # Wait a bit more for JS to render + await page.wait_for_timeout(2000) + except Exception as nav_error: + # Try with 'load' event as fallback + try: + await page.goto( + current_url, + wait_until="load", + timeout=60000 + ) + await page.wait_for_timeout(3000) + except Exception: + # Last resort - just try to get whatever loaded + pass + + # Get page info + title = await self._get_page_title(page) + page_type = self._classify_page_type(current_url, title) + depth = 
len(urlparse(current_url).path.split('/')) - 1 + + # Create discovered page + discovered = DiscoveredPage( + url=current_url, + title=title, + page_type=page_type, + depth=depth, + selected=True, + ) + self.discovered_pages.append(discovered) + + # Extract links for further crawling + new_links = await self._extract_links(page, base_url) + + # Prioritize certain page types + priority_patterns = ['/product', '/listing', '/category', '/about', '/contact'] + priority_links = [l for l in new_links if any(p in l.lower() for p in priority_patterns)] + other_links = [l for l in new_links if l not in priority_links] + + # Add to queue (priority first) + for link in priority_links + other_links: + if link not in self.visited_urls and link not in queue: + queue.append(link) + + await page.close() + + # Progress callback + if progress_callback: + progress = len(self.discovered_pages) / max_pages + progress_callback(min(progress, 1.0)) + + # Rate limiting + await asyncio.sleep(self.settings.crawl.crawl_delay_ms / 1000) + + except Exception as e: + # Log error but continue + discovered = DiscoveredPage( + url=current_url, + title="", + page_type=PageType.OTHER, + depth=0, + selected=False, + error=str(e), + ) + self.discovered_pages.append(discovered) + + return self.discovered_pages + + def get_pages_by_type(self) -> dict[PageType, list[DiscoveredPage]]: + """Group discovered pages by type.""" + grouped: dict[PageType, list[DiscoveredPage]] = {} + for page in self.discovered_pages: + if page.page_type not in grouped: + grouped[page.page_type] = [] + grouped[page.page_type].append(page) + return grouped + + def get_suggested_pages(self, min_pages: int = None) -> list[DiscoveredPage]: + """ + Get suggested pages for extraction. + + Ensures diversity of page types and prioritizes key templates. 
+ """ + min_pages = min_pages or self.settings.crawl.min_pages + + # Priority order for page types + priority_types = [ + PageType.HOMEPAGE, + PageType.LISTING, + PageType.DETAIL, + PageType.FORM, + PageType.MARKETING, + PageType.AUTH, + PageType.ABOUT, + PageType.CONTACT, + PageType.OTHER, + ] + + selected = [] + grouped = self.get_pages_by_type() + + # First pass: get at least one of each priority type + for page_type in priority_types: + if page_type in grouped and grouped[page_type]: + # Take the first (usually shallowest) page of this type + page = sorted(grouped[page_type], key=lambda p: p.depth)[0] + if page not in selected: + selected.append(page) + + # Second pass: fill up to min_pages with remaining pages + remaining = [p for p in self.discovered_pages if p not in selected and not p.error] + remaining.sort(key=lambda p: p.depth) + + while len(selected) < min_pages and remaining: + selected.append(remaining.pop(0)) + + # Mark as selected + for page in selected: + page.selected = True + + return selected + + +# ============================================================================= +# CONVENIENCE FUNCTIONS +# ============================================================================= + +async def discover_pages(base_url: str, max_pages: int = 20) -> list[DiscoveredPage]: + """Convenience function to discover pages.""" + discoverer = PageDiscoverer() + return await discoverer.discover(base_url, max_pages) + + +async def quick_discover(base_url: str) -> dict: + """Quick discovery returning summary dict.""" + pages = await discover_pages(base_url) + + return { + "total_found": len(pages), + "by_type": { + pt.value: len([p for p in pages if p.page_type == pt]) + for pt in PageType + }, + "pages": [ + { + "url": p.url, + "title": p.title, + "type": p.page_type.value, + "selected": p.selected, + } + for p in pages + ], + } diff --git a/agents/extractor.py b/agents/extractor.py new file mode 100644 index 
0000000000000000000000000000000000000000..26476c46f768ab104ce44feb47ccd4c648e14b61 --- /dev/null +++ b/agents/extractor.py @@ -0,0 +1,1294 @@ +""" +Agent 1: Token Extractor +Design System Extractor v2 + +Persona: Meticulous Design Archaeologist + +Responsibilities: +- Crawl pages at specified viewport +- Extract computed styles from all elements +- Parse CSS files for variables and rules +- Extract colors from SVGs +- Collect colors, typography, spacing, radius, shadows +- Track frequency and context for each token +""" + +import asyncio +import re +from typing import Optional, Callable +from datetime import datetime +from collections import defaultdict + +from playwright.async_api import async_playwright, Browser, Page, BrowserContext + +from core.token_schema import ( + Viewport, + ExtractedTokens, + ColorToken, + TypographyToken, + SpacingToken, + RadiusToken, + ShadowToken, + FontFamily, + TokenSource, + Confidence, +) +from core.color_utils import ( + normalize_hex, + parse_color, + get_contrast_with_white, + get_contrast_with_black, + check_wcag_compliance, +) +from config.settings import get_settings + + +class TokenExtractor: + """ + Extracts design tokens from web pages. + + This is the second part of Agent 1's job — after pages are confirmed, + we crawl and extract all CSS values. 
+ + Enhanced with: + - CSS file parsing for variables and rules + - SVG color extraction + - Inline style extraction + """ + + def __init__(self, viewport: Viewport = Viewport.DESKTOP): + self.settings = get_settings() + self.viewport = viewport + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + + # Token collection + self.colors: dict[str, ColorToken] = {} + self.typography: dict[str, TypographyToken] = {} + self.spacing: dict[str, SpacingToken] = {} + self.radius: dict[str, RadiusToken] = {} + self.shadows: dict[str, ShadowToken] = {} + + # Foreground-background pairs extracted from actual DOM elements + self.fg_bg_pairs: list[dict] = [] + + # CSS Variables collection + self.css_variables: dict[str, str] = {} + + # Font tracking + self.font_families: dict[str, FontFamily] = {} + + # Statistics + self.total_elements = 0 + self.errors: list[str] = [] + self.warnings: list[str] = [] + + async def __aenter__(self): + """Async context manager entry.""" + await self._init_browser() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + await self._close_browser() + + async def _init_browser(self): + """Initialize Playwright browser.""" + playwright = await async_playwright().start() + self.browser = await playwright.chromium.launch( + headless=self.settings.browser.headless + ) + + # Set viewport based on extraction mode + if self.viewport == Viewport.DESKTOP: + width = self.settings.viewport.desktop_width + height = self.settings.viewport.desktop_height + else: + width = self.settings.viewport.mobile_width + height = self.settings.viewport.mobile_height + + self.context = await self.browser.new_context( + viewport={"width": width, "height": height}, + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" + ) + + async def _close_browser(self): + """Close browser and cleanup.""" + if self.context: + await self.context.close() + if self.browser: + 
await self.browser.close() + + async def _scroll_page(self, page: Page): + """Scroll page to load lazy content.""" + await page.evaluate(""" + async () => { + const delay = ms => new Promise(resolve => setTimeout(resolve, ms)); + const height = document.body.scrollHeight; + const step = window.innerHeight; + + for (let y = 0; y < height; y += step) { + window.scrollTo(0, y); + await delay(100); + } + + // Scroll back to top + window.scrollTo(0, 0); + } + """) + + # Wait for network idle after scrolling + await page.wait_for_load_state("networkidle", timeout=self.settings.browser.network_idle_timeout) + + async def _extract_styles_from_page(self, page: Page) -> dict: + """ + Extract computed styles from all elements on the page. + + This is the core extraction logic — we get getComputedStyle for every element. + """ + styles_data = await page.evaluate(""" + () => { + const elements = document.querySelectorAll('*'); + const results = { + colors: [], + typography: [], + spacing: [], + radius: [], + shadows: [], + elements_count: elements.length, + loaded_fonts: [], + }; + + // v3: Collect actually loaded font names via document.fonts API + // This gives us the REAL font names, not generic fallbacks + const genericFamilies = new Set([ + 'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', + 'system-ui', 'ui-serif', 'ui-sans-serif', 'ui-monospace', 'ui-rounded', + 'math', 'emoji', 'fangsong', + ]); + try { + document.fonts.forEach(function(font) { + const name = font.family.replace(/['"]/g, '').trim(); + if (name && !genericFamilies.has(name.toLowerCase())) { + results.loaded_fonts.push(name); + } + }); + } catch(e) {} + // Deduplicate + results.loaded_fonts = [...new Set(results.loaded_fonts)]; + + const colorProperties = [ + 'color', 'background-color', 'border-color', + 'border-top-color', 'border-right-color', + 'border-bottom-color', 'border-left-color', + 'outline-color', 'text-decoration-color', + ]; + + const spacingProperties = [ + 'margin-top', 
'margin-right', 'margin-bottom', 'margin-left', + 'padding-top', 'padding-right', 'padding-bottom', 'padding-left', + 'gap', 'row-gap', 'column-gap', + ]; + + elements.forEach(el => { + const tag = el.tagName.toLowerCase(); + const styles = window.getComputedStyle(el); + + // Skip invisible elements + if (styles.display === 'none' || styles.visibility === 'hidden') { + return; + } + + // --- COLORS --- + colorProperties.forEach(prop => { + const value = styles.getPropertyValue(prop); + if (value && value !== 'rgba(0, 0, 0, 0)' && value !== 'transparent') { + results.colors.push({ + value: value, + property: prop, + element: tag, + context: prop.includes('background') ? 'background' : + prop.includes('border') ? 'border' : 'text', + }); + } + }); + + // --- TYPOGRAPHY --- + // v3: Get full font-family stack (not just computed generic) + const fontFamily = styles.getPropertyValue('font-family'); + const fontSize = styles.getPropertyValue('font-size'); + const fontWeight = styles.getPropertyValue('font-weight'); + const lineHeight = styles.getPropertyValue('line-height'); + const letterSpacing = styles.getPropertyValue('letter-spacing'); + + if (fontSize && fontFamily) { + results.typography.push({ + fontFamily: fontFamily, + fontSize: fontSize, + fontWeight: fontWeight, + lineHeight: lineHeight, + letterSpacing: letterSpacing, + element: tag, + }); + } + + // --- SPACING --- + spacingProperties.forEach(prop => { + const value = styles.getPropertyValue(prop); + if (value && value !== '0px' && value !== 'auto' && value !== 'normal') { + const px = parseFloat(value); + if (!isNaN(px) && px > 0 && px < 500) { + results.spacing.push({ + value: value, + valuePx: Math.round(px), + property: prop, + context: prop.includes('margin') ? 'margin' : + prop.includes('padding') ? 
'padding' : 'gap', + }); + } + } + }); + + // --- BORDER RADIUS --- + const radiusProps = [ + 'border-radius', 'border-top-left-radius', + 'border-top-right-radius', 'border-bottom-left-radius', + 'border-bottom-right-radius', + ]; + + radiusProps.forEach(prop => { + const value = styles.getPropertyValue(prop); + if (value && value !== '0px') { + results.radius.push({ + value: value, + element: tag, + }); + } + }); + + // --- BOX SHADOW --- + const shadow = styles.getPropertyValue('box-shadow'); + if (shadow && shadow !== 'none') { + results.shadows.push({ + value: shadow, + element: tag, + }); + } + }); + + return results; + } + """) + + return styles_data + + async def _extract_fg_bg_pairs(self, page: Page) -> list[dict]: + """ + Extract actual foreground-background color pairs from visible DOM elements. + + For each visible element that has a non-transparent text color, walk up the + ancestor chain to find the effective background color. This gives us real + foreground/background pairs so we can do accurate WCAG AA checks instead of + only comparing every color against white/black. 
+ """ + pairs = await page.evaluate(""" + () => { + const pairs = []; + const seen = new Set(); + + function rgbToHex(rgb) { + if (!rgb || rgb === 'transparent' || rgb === 'rgba(0, 0, 0, 0)') return null; + const match = rgb.match(/rgba?\\((\\d+),\\s*(\\d+),\\s*(\\d+)/); + if (!match) return null; + const r = parseInt(match[1]); + const g = parseInt(match[2]); + const b = parseInt(match[3]); + return '#' + [r, g, b].map(c => c.toString(16).padStart(2, '0')).join(''); + } + + function getEffectiveBackground(el) { + let current = el; + while (current && current !== document.documentElement) { + const bg = window.getComputedStyle(current).backgroundColor; + if (bg && bg !== 'rgba(0, 0, 0, 0)' && bg !== 'transparent') { + return rgbToHex(bg); + } + current = current.parentElement; + } + return '#ffffff'; // default page background + } + + const elements = document.querySelectorAll('*'); + elements.forEach(el => { + const styles = window.getComputedStyle(el); + if (styles.display === 'none' || styles.visibility === 'hidden') return; + + const fg = rgbToHex(styles.color); + if (!fg) return; + + const bg = getEffectiveBackground(el); + if (!bg) return; + + const key = fg + '|' + bg; + if (seen.has(key)) return; + seen.add(key); + + pairs.push({ + foreground: fg, + background: bg, + element: el.tagName.toLowerCase(), + }); + }); + + return pairs; + } + """) + return pairs or [] + + async def _extract_css_variables(self, page: Page) -> dict: + """ + Extract CSS custom properties (variables) from :root and stylesheets. + + This catches colors defined as: + - :root { --primary-color: #3860be; } + - :root { --brand-cyan: #00c4cc; } + """ + css_vars = await page.evaluate(""" + () => { + const variables = {}; + + // 1. Get CSS variables from :root computed styles + const rootStyles = getComputedStyle(document.documentElement); + const rootCss = document.documentElement.style.cssText; + + // 2. 
Parse all stylesheets for CSS variables + for (const sheet of document.styleSheets) { + try { + const rules = sheet.cssRules || sheet.rules; + for (const rule of rules) { + if (rule.style) { + for (let i = 0; i < rule.style.length; i++) { + const prop = rule.style[i]; + if (prop.startsWith('--')) { + const value = rule.style.getPropertyValue(prop).trim(); + if (value) { + variables[prop] = value; + } + } + } + } + // Also check @media rules + if (rule.cssRules) { + for (const innerRule of rule.cssRules) { + if (innerRule.style) { + for (let i = 0; i < innerRule.style.length; i++) { + const prop = innerRule.style[i]; + if (prop.startsWith('--')) { + const value = innerRule.style.getPropertyValue(prop).trim(); + if (value) { + variables[prop] = value; + } + } + } + } + } + } + } + } catch (e) { + // CORS may block access to external stylesheets + console.log('Could not access stylesheet:', e); + } + } + + // 3. Get computed CSS variable values from :root + const computedVars = {}; + for (const prop of Object.keys(variables)) { + const computed = rootStyles.getPropertyValue(prop).trim(); + if (computed) { + computedVars[prop] = computed; + } + } + + return { raw: variables, computed: computedVars }; + } + """) + + return css_vars + + async def _extract_svg_colors(self, page: Page) -> list[dict]: + """ + Extract colors from SVG elements (fill, stroke). 
+ + This catches colors in: + - + - + - + """ + svg_colors = await page.evaluate(""" + () => { + const colors = []; + + // Find all SVG elements + const svgs = document.querySelectorAll('svg, svg *'); + + svgs.forEach(el => { + // Check fill attribute + const fill = el.getAttribute('fill'); + if (fill && fill !== 'none' && fill !== 'currentColor' && !fill.startsWith('url(')) { + colors.push({ + value: fill, + property: 'svg-fill', + element: el.tagName.toLowerCase(), + context: 'svg', + }); + } + + // Check stroke attribute + const stroke = el.getAttribute('stroke'); + if (stroke && stroke !== 'none' && stroke !== 'currentColor' && !stroke.startsWith('url(')) { + colors.push({ + value: stroke, + property: 'svg-stroke', + element: el.tagName.toLowerCase(), + context: 'svg', + }); + } + + // Check computed styles for SVG elements + const styles = getComputedStyle(el); + const computedFill = styles.fill; + const computedStroke = styles.stroke; + + if (computedFill && computedFill !== 'none' && !computedFill.startsWith('url(')) { + colors.push({ + value: computedFill, + property: 'svg-fill-computed', + element: el.tagName.toLowerCase(), + context: 'svg', + }); + } + + if (computedStroke && computedStroke !== 'none' && !computedStroke.startsWith('url(')) { + colors.push({ + value: computedStroke, + property: 'svg-stroke-computed', + element: el.tagName.toLowerCase(), + context: 'svg', + }); + } + }); + + return colors; + } + """) + + return svg_colors + + async def _extract_inline_styles(self, page: Page) -> dict: + """ + Extract colors from inline style attributes. + + This catches colors in: + -

+ - + """ + inline_data = await page.evaluate(""" + () => { + const colors = []; + const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi; + + // Find all elements with inline styles + const elements = document.querySelectorAll('[style]'); + + elements.forEach(el => { + const styleAttr = el.getAttribute('style'); + if (styleAttr) { + const matches = styleAttr.match(colorRegex); + if (matches) { + matches.forEach(color => { + colors.push({ + value: color, + property: 'inline-style', + element: el.tagName.toLowerCase(), + context: 'inline', + }); + }); + } + } + }); + + return colors; + } + """) + + return inline_data + + async def _extract_stylesheet_colors(self, page: Page) -> list[dict]: + """ + Parse CSS stylesheets for color values. + + This catches colors defined in CSS rules that may not be + currently applied to visible elements. + + Also fetches external stylesheets that may be CORS-blocked. + """ + css_colors = await page.evaluate(""" + () => { + const colors = []; + const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi; + + // Color-related CSS properties + const colorProps = [ + 'color', 'background-color', 'background', 'border-color', + 'border-top-color', 'border-right-color', 'border-bottom-color', 'border-left-color', + 'outline-color', 'box-shadow', 'text-shadow', 'fill', 'stroke', + 'caret-color', 'column-rule-color', 'text-decoration-color', + ]; + + // Parse all stylesheets + for (const sheet of document.styleSheets) { + try { + const rules = sheet.cssRules || sheet.rules; + for (const rule of rules) { + if (rule.style) { + colorProps.forEach(prop => { + const value = rule.style.getPropertyValue(prop); + if (value) { + const matches = value.match(colorRegex); + if (matches) { + matches.forEach(color => { + colors.push({ + value: color, + property: prop, + element: 'css-rule', + context: 'stylesheet', + selector: rule.selectorText || '', + }); + }); + 
} + } + }); + } + } + } catch (e) { + // CORS may block access to external stylesheets + } + } + + return colors; + } + """) + + return css_colors + + async def _fetch_external_css_colors(self, page: Page) -> list[dict]: + """ + Fetch and parse external CSS files directly to bypass CORS. + + This catches colors in external stylesheets that are blocked by CORS. + """ + colors = [] + + try: + # Get all stylesheet URLs + css_urls = await page.evaluate(""" + () => { + const urls = []; + const links = document.querySelectorAll('link[rel="stylesheet"]'); + links.forEach(link => { + if (link.href) { + urls.push(link.href); + } + }); + return urls; + } + """) + + # Color regex pattern + color_regex = re.compile(r'#[0-9a-fA-F]{3,8}|rgb\([^)]+\)|rgba\([^)]+\)|hsl\([^)]+\)|hsla\([^)]+\)', re.IGNORECASE) + + # Fetch each CSS file + for css_url in css_urls[:10]: # Limit to 10 files + try: + response = await page.request.get(css_url, timeout=5000) + if response.ok: + css_text = await response.text() + + # Find all color values in CSS text + matches = color_regex.findall(css_text) + for match in matches: + colors.append({ + "value": match, + "property": "external-css", + "element": "css-file", + "context": "external-stylesheet", + }) + except Exception as e: + # Skip if fetch fails + pass + + except Exception as e: + self.warnings.append(f"External CSS fetch failed: {str(e)}") + + return colors + + async def _extract_all_page_colors(self, page: Page) -> list[dict]: + """ + Extract ALL color values from the page source and styles. + + This is a brute-force approach that scans the entire page HTML + and all style blocks for any color values. + """ + colors = await page.evaluate(""" + () => { + const colors = []; + const colorRegex = /#[0-9a-fA-F]{3,8}|rgb\\([^)]+\\)|rgba\\([^)]+\\)|hsl\\([^)]+\\)|hsla\\([^)]+\\)/gi; + + // 1. 
Scan all ', html_content, re.DOTALL | re.IGNORECASE) + + for i, block in enumerate(style_blocks): + colors = self._extract_colors_from_css(block, f"style-block-{i}") + for color in colors: + self._aggregate_color(color) + + variables = self._extract_css_variables(block) + self.css_variables.update(variables) + self.stats["style_blocks_parsed"] += 1 + + log(f" Found {len(style_blocks)} style blocks") + + # Extract CSS file URLs + log(" 🔗 Finding linked CSS files...") + css_urls = re.findall(r'href=["\']([^"\']*\.css[^"\']*)["\']', html_content, re.IGNORECASE) + + log(f" Found {len(css_urls)} CSS files") + + # Fetch and parse each CSS file + for css_url in css_urls[:15]: # Limit to 15 files + try: + # Make URL absolute + if css_url.startswith('//'): + css_url = 'https:' + css_url + elif css_url.startswith('/'): + from urllib.parse import urlparse + parsed = urlparse(url) + css_url = f"{parsed.scheme}://{parsed.netloc}{css_url}" + elif not css_url.startswith('http'): + from urllib.parse import urljoin + css_url = urljoin(url, css_url) + + log(f" 📄 Fetching: {css_url[:60]}...") + + # Fetch CSS file + css_result = app.scrape_url(css_url, params={'formats': ['rawHtml']}) + css_content = css_result.get('rawHtml', '') or css_result.get('content', '') + + if css_content: + colors = self._extract_colors_from_css(css_content, css_url.split('/')[-1]) + for color in colors: + self._aggregate_color(color) + + variables = self._extract_css_variables(css_content) + self.css_variables.update(variables) + self.stats["css_files_parsed"] += 1 + + log(f" ✅ Parsed ({len(colors)} colors)") + + except Exception as e: + log(f" ⚠️ Failed: {str(e)[:50]}") + self.warnings.append(f"Failed to fetch {css_url}: {str(e)}") + + # Process CSS variables that contain colors + log(" 🎨 Processing CSS variables...") + for var_name, var_value in self.css_variables.items(): + if self.color_regex.match(var_value.strip()): + self._aggregate_color({ + "value": var_value.strip(), + "source": 
f"css-var:{var_name}", + "context": "css-variable", + }) + self.stats["css_variables_found"] += 1 + + self.stats["colors_found"] = len(self.colors) + + # Log summary + log("") + log("📊 FIRECRAWL RESULTS:") + log(f" CSS files parsed: {self.stats['css_files_parsed']}") + log(f" Style blocks parsed: {self.stats['style_blocks_parsed']}") + log(f" CSS variables found: {self.stats['css_variables_found']}") + log(f" Unique colors found: {self.stats['colors_found']}") + log("") + + # Show top colors found + if self.colors: + sorted_colors = sorted(self.colors.items(), key=lambda x: -x[1]['frequency'])[:10] + log(" 🎨 Top colors found:") + for hex_val, data in sorted_colors: + log(f" {hex_val} (used {data['frequency']}x)") + + return { + "colors": self.colors, + "css_variables": self.css_variables, + "stats": self.stats, + } + + except Exception as e: + log(f" ❌ Firecrawl error: {str(e)}") + self.errors.append(f"Firecrawl error: {str(e)}") + return await self._fallback_css_extraction(url, log_callback) + + async def _fallback_css_extraction( + self, + url: str, + log_callback: Optional[Callable[[str], None]] = None + ) -> dict: + """ + Fallback CSS extraction using httpx (no Firecrawl API key needed). 
+ """ + + def log(msg: str): + if log_callback: + log_callback(msg) + + log("") + log("🔄 Using fallback CSS extraction (httpx)...") + + try: + import httpx + + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + # Fetch main page + log(f" 🌐 Fetching: {url}") + response = await client.get(url) + html_content = response.text + + log(f" ✅ Page fetched ({len(html_content)} chars)") + + # Extract ', html_content, re.DOTALL | re.IGNORECASE) + + for i, block in enumerate(style_blocks): + colors = self._extract_colors_from_css(block, f"style-block-{i}") + for color in colors: + self._aggregate_color(color) + + variables = self._extract_css_variables(block) + self.css_variables.update(variables) + self.stats["style_blocks_parsed"] += 1 + + log(f" Found {len(style_blocks)} style blocks") + + # Extract CSS file URLs + log(" 🔗 Finding linked CSS files...") + css_urls = re.findall(r'href=["\']([^"\']*\.css[^"\']*)["\']', html_content, re.IGNORECASE) + + log(f" Found {len(css_urls)} CSS files") + + # Fetch and parse each CSS file + for css_url in css_urls[:15]: + try: + # Make URL absolute + if css_url.startswith('//'): + css_url = 'https:' + css_url + elif css_url.startswith('/'): + from urllib.parse import urlparse + parsed = urlparse(url) + css_url = f"{parsed.scheme}://{parsed.netloc}{css_url}" + elif not css_url.startswith('http'): + from urllib.parse import urljoin + css_url = urljoin(url, css_url) + + log(f" 📄 Fetching: {css_url[:60]}...") + + css_response = await client.get(css_url) + css_content = css_response.text + + if css_content: + colors = self._extract_colors_from_css(css_content, css_url.split('/')[-1]) + for color in colors: + self._aggregate_color(color) + + variables = self._extract_css_variables(css_content) + self.css_variables.update(variables) + self.stats["css_files_parsed"] += 1 + + log(f" ✅ Parsed ({len(colors)} colors)") + + except Exception as e: + log(f" ⚠️ Failed: {str(e)[:50]}") + self.warnings.append(f"Failed to fetch 
{css_url}: {str(e)}") + + # Process CSS variables + log(" 🎨 Processing CSS variables...") + for var_name, var_value in self.css_variables.items(): + if self.color_regex.match(var_value.strip()): + self._aggregate_color({ + "value": var_value.strip(), + "source": f"css-var:{var_name}", + "context": "css-variable", + }) + self.stats["css_variables_found"] += 1 + + self.stats["colors_found"] = len(self.colors) + + # Log summary + log("") + log("📊 FALLBACK EXTRACTION RESULTS:") + log(f" CSS files parsed: {self.stats['css_files_parsed']}") + log(f" Style blocks parsed: {self.stats['style_blocks_parsed']}") + log(f" CSS variables found: {self.stats['css_variables_found']}") + log(f" Unique colors found: {self.stats['colors_found']}") + log("") + + # Show top colors + if self.colors: + sorted_colors = sorted(self.colors.items(), key=lambda x: -x[1]['frequency'])[:10] + log(" 🎨 Top colors found:") + for hex_val, data in sorted_colors: + log(f" {hex_val} (used {data['frequency']}x)") + + return { + "colors": self.colors, + "css_variables": self.css_variables, + "stats": self.stats, + } + + except Exception as e: + log(f" ❌ Fallback extraction failed: {str(e)}") + self.errors.append(f"Fallback extraction failed: {str(e)}") + return {"colors": {}, "css_variables": {}, "stats": self.stats} + + +async def extract_css_colors( + url: str, + api_key: Optional[str] = None, + log_callback: Optional[Callable[[str], None]] = None +) -> dict: + """ + Convenience function to extract CSS colors. 
+ + Args: + url: Website URL + api_key: Optional Firecrawl API key + log_callback: Optional logging callback + + Returns: + Dict with colors, css_variables, and stats + """ + extractor = FirecrawlExtractor(api_key=api_key) + return await extractor.extract_with_firecrawl(url, log_callback) diff --git a/agents/graph.py b/agents/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8022b30650b9d1a1bdc3fe29bdea9ec7195e08 --- /dev/null +++ b/agents/graph.py @@ -0,0 +1,540 @@ +""" +LangGraph Workflow Orchestration +Design System Extractor v2 + +Defines the main workflow graph with agents, checkpoints, and transitions. +""" + +from typing import Literal +from datetime import datetime +from langgraph.graph import StateGraph, END +from langgraph.checkpoint.memory import MemorySaver + +from agents.state import AgentState, create_initial_state, get_stage_progress +from core.token_schema import Viewport + + +# ============================================================================= +# NODE FUNCTIONS (Agent Entry Points) +# ============================================================================= + +async def discover_pages(state: AgentState) -> AgentState: + """ + Agent 1 - Part 1: Discover pages from base URL. + + This node: + 1. Takes the base URL + 2. Crawls to find linked pages + 3. Classifies page types (homepage, listing, detail, etc.) + 4. 
Returns discovered pages for user confirmation + """ + from agents.crawler import PageDiscoverer + + state["current_stage"] = "discover" + state["stage_started_at"] = datetime.now() + + try: + discoverer = PageDiscoverer() + pages = await discoverer.discover(state["base_url"]) + + state["discovered_pages"] = pages + state["awaiting_human_input"] = True + state["checkpoint_name"] = "confirm_pages" + + except Exception as e: + state["errors"].append(f"Discovery failed: {str(e)}") + + return state + + +async def extract_tokens_desktop(state: AgentState) -> AgentState: + """ + Agent 1 - Part 2a: Extract tokens from desktop viewport. + """ + from agents.extractor import TokenExtractor + + state["current_stage"] = "extract" + + try: + extractor = TokenExtractor(viewport=Viewport.DESKTOP) + result = await extractor.extract( + pages=state["pages_to_crawl"], + progress_callback=lambda p: state.update({"desktop_crawl_progress": p}) + ) + + state["desktop_extraction"] = result + + except Exception as e: + state["errors"].append(f"Desktop extraction failed: {str(e)}") + + return state + + +async def extract_tokens_mobile(state: AgentState) -> AgentState: + """ + Agent 1 - Part 2b: Extract tokens from mobile viewport. + """ + from agents.extractor import TokenExtractor + + try: + extractor = TokenExtractor(viewport=Viewport.MOBILE) + result = await extractor.extract( + pages=state["pages_to_crawl"], + progress_callback=lambda p: state.update({"mobile_crawl_progress": p}) + ) + + state["mobile_extraction"] = result + + except Exception as e: + state["errors"].append(f"Mobile extraction failed: {str(e)}") + + return state + + +async def normalize_tokens(state: AgentState) -> AgentState: + """ + Agent 2: Normalize and structure extracted tokens. 
+ """ + from agents.normalizer import TokenNormalizer + + state["current_stage"] = "normalize" + state["stage_started_at"] = datetime.now() + + try: + normalizer = TokenNormalizer() + + if state["desktop_extraction"]: + state["desktop_normalized"] = normalizer.normalize(state["desktop_extraction"]) + + if state["mobile_extraction"]: + state["mobile_normalized"] = normalizer.normalize(state["mobile_extraction"]) + + # After normalization, wait for human review + state["awaiting_human_input"] = True + state["checkpoint_name"] = "review_tokens" + + except Exception as e: + state["errors"].append(f"Normalization failed: {str(e)}") + + return state + + +async def generate_recommendations(state: AgentState) -> AgentState: + """ + Agent 3: Generate upgrade recommendations. + """ + from agents.advisor import DesignSystemAdvisor + + state["current_stage"] = "advise" + state["stage_started_at"] = datetime.now() + + try: + advisor = DesignSystemAdvisor() + recommendations = await advisor.analyze_and_recommend( + desktop=state["desktop_normalized"], + mobile=state["mobile_normalized"], + ) + + state["upgrade_recommendations"] = recommendations + + # Wait for human to select upgrades + state["awaiting_human_input"] = True + state["checkpoint_name"] = "select_upgrades" + + except Exception as e: + state["errors"].append(f"Recommendation generation failed: {str(e)}") + + return state + + +async def generate_final_tokens(state: AgentState) -> AgentState: + """ + Agent 4: Generate final token JSON. 
+ """ + from agents.generator import TokenGenerator + + state["current_stage"] = "generate" + state["stage_started_at"] = datetime.now() + + try: + generator = TokenGenerator() + + # Build selection config from user choices + selections = { + "type_scale": state["selected_type_scale"], + "spacing_system": state["selected_spacing_system"], + "naming_convention": state["selected_naming_convention"], + "color_ramps": state["selected_color_ramps"], + "a11y_fixes": state["selected_a11y_fixes"], + } + + if state["desktop_normalized"]: + state["desktop_final"] = generator.generate( + normalized=state["desktop_normalized"], + selections=selections, + version=state["version_label"], + ) + + if state["mobile_normalized"]: + state["mobile_final"] = generator.generate( + normalized=state["mobile_normalized"], + selections=selections, + version=state["version_label"], + ) + + # Wait for human to approve export + state["awaiting_human_input"] = True + state["checkpoint_name"] = "approve_export" + + except Exception as e: + state["errors"].append(f"Token generation failed: {str(e)}") + + return state + + +async def complete_workflow(state: AgentState) -> AgentState: + """ + Final node: Mark workflow as complete. 
+ """ + state["current_stage"] = "export" + state["awaiting_human_input"] = False + state["checkpoint_name"] = None + + return state + + +# ============================================================================= +# HUMAN CHECKPOINT HANDLERS +# ============================================================================= + +def handle_page_confirmation(state: AgentState, confirmed_pages: list[str]) -> AgentState: + """Handle human confirmation of pages to crawl.""" + state["pages_to_crawl"] = confirmed_pages + state["awaiting_human_input"] = False + state["checkpoint_name"] = None + return state + + +def handle_token_review( + state: AgentState, + color_decisions: dict[str, bool], + typography_decisions: dict[str, bool], + spacing_decisions: dict[str, bool], +) -> AgentState: + """Handle human review of extracted tokens.""" + state["accepted_colors"] = [k for k, v in color_decisions.items() if v] + state["rejected_colors"] = [k for k, v in color_decisions.items() if not v] + state["accepted_typography"] = [k for k, v in typography_decisions.items() if v] + state["rejected_typography"] = [k for k, v in typography_decisions.items() if not v] + state["accepted_spacing"] = [k for k, v in spacing_decisions.items() if v] + state["rejected_spacing"] = [k for k, v in spacing_decisions.items() if not v] + + state["awaiting_human_input"] = False + state["checkpoint_name"] = None + return state + + +def handle_upgrade_selection( + state: AgentState, + type_scale: str | None, + spacing_system: str | None, + naming_convention: str | None, + color_ramps: dict[str, bool], + a11y_fixes: list[str], +) -> AgentState: + """Handle human selection of upgrade options.""" + state["selected_type_scale"] = type_scale + state["selected_spacing_system"] = spacing_system + state["selected_naming_convention"] = naming_convention + state["selected_color_ramps"] = color_ramps + state["selected_a11y_fixes"] = a11y_fixes + + state["awaiting_human_input"] = False + state["checkpoint_name"] = 
None + return state + + +def handle_export_approval(state: AgentState, version_label: str) -> AgentState: + """Handle human approval of final export.""" + state["version_label"] = version_label + state["awaiting_human_input"] = False + state["checkpoint_name"] = None + return state + + +# ============================================================================= +# ROUTING FUNCTIONS +# ============================================================================= + +def route_after_discovery(state: AgentState) -> Literal["wait_for_pages", "extract"]: + """Route after discovery: wait for human or continue.""" + if state["awaiting_human_input"]: + return "wait_for_pages" + return "extract" + + +def route_after_extraction(state: AgentState) -> Literal["normalize", "error"]: + """Route after extraction: normalize or handle error.""" + if state["desktop_extraction"] is None and state["mobile_extraction"] is None: + return "error" + return "normalize" + + +def route_after_normalization(state: AgentState) -> Literal["wait_for_review", "advise"]: + """Route after normalization: wait for review or continue.""" + if state["awaiting_human_input"]: + return "wait_for_review" + return "advise" + + +def route_after_recommendations(state: AgentState) -> Literal["wait_for_selection", "generate"]: + """Route after recommendations: wait for selection or continue.""" + if state["awaiting_human_input"]: + return "wait_for_selection" + return "generate" + + +def route_after_generation(state: AgentState) -> Literal["wait_for_approval", "complete"]: + """Route after generation: wait for approval or complete.""" + if state["awaiting_human_input"]: + return "wait_for_approval" + return "complete" + + +# ============================================================================= +# GRAPH BUILDER +# ============================================================================= + +def build_workflow_graph() -> StateGraph: + """ + Build the main LangGraph workflow. + + Flow: + 1. 
discover_pages -> [human confirms pages] + 2. extract_desktop + extract_mobile (parallel) + 3. normalize_tokens -> [human reviews tokens] + 4. generate_recommendations -> [human selects upgrades] + 5. generate_final_tokens -> [human approves export] + 6. complete + """ + + # Create the graph + workflow = StateGraph(AgentState) + + # ------------------------------------------------------------------------- + # ADD NODES + # ------------------------------------------------------------------------- + + # Discovery + workflow.add_node("discover", discover_pages) + + # Extraction (will be parallel in subgraph) + workflow.add_node("extract_desktop", extract_tokens_desktop) + workflow.add_node("extract_mobile", extract_tokens_mobile) + + # Normalization + workflow.add_node("normalize", normalize_tokens) + + # Advisor + workflow.add_node("advise", generate_recommendations) + + # Generator + workflow.add_node("generate", generate_final_tokens) + + # Completion + workflow.add_node("complete", complete_workflow) + + # Human checkpoint placeholder nodes (these just pass through) + workflow.add_node("wait_for_pages", lambda s: s) + workflow.add_node("wait_for_review", lambda s: s) + workflow.add_node("wait_for_selection", lambda s: s) + workflow.add_node("wait_for_approval", lambda s: s) + + # ------------------------------------------------------------------------- + # ADD EDGES + # ------------------------------------------------------------------------- + + # Entry point + workflow.set_entry_point("discover") + + # Discovery -> (wait or extract) + workflow.add_conditional_edges( + "discover", + route_after_discovery, + { + "wait_for_pages": "wait_for_pages", + "extract": "extract_desktop", + } + ) + + # After human confirms pages -> extract + workflow.add_edge("wait_for_pages", "extract_desktop") + + # Parallel extraction + workflow.add_edge("extract_desktop", "extract_mobile") + + # After extraction -> normalize + workflow.add_conditional_edges( + "extract_mobile", + 
route_after_extraction, + { + "normalize": "normalize", + "error": END, + } + ) + + # Normalization -> (wait or advise) + workflow.add_conditional_edges( + "normalize", + route_after_normalization, + { + "wait_for_review": "wait_for_review", + "advise": "advise", + } + ) + + # After human reviews -> advise + workflow.add_edge("wait_for_review", "advise") + + # Advisor -> (wait or generate) + workflow.add_conditional_edges( + "advise", + route_after_recommendations, + { + "wait_for_selection": "wait_for_selection", + "generate": "generate", + } + ) + + # After human selects upgrades -> generate + workflow.add_edge("wait_for_selection", "generate") + + # Generation -> (wait or complete) + workflow.add_conditional_edges( + "generate", + route_after_generation, + { + "wait_for_approval": "wait_for_approval", + "complete": "complete", + } + ) + + # After human approves -> complete + workflow.add_edge("wait_for_approval", "complete") + + # Complete -> END + workflow.add_edge("complete", END) + + return workflow + + +# ============================================================================= +# WORKFLOW RUNNER +# ============================================================================= + +class WorkflowRunner: + """ + Manages workflow execution with human-in-the-loop support. 
+ """ + + def __init__(self): + self.graph = build_workflow_graph() + self.checkpointer = MemorySaver() + self.app = self.graph.compile(checkpointer=self.checkpointer) + self.current_state: AgentState | None = None + self.thread_id: str | None = None + + async def start(self, base_url: str, thread_id: str | None = None) -> AgentState: + """Start a new workflow.""" + self.thread_id = thread_id or f"workflow_{datetime.now().timestamp()}" + self.current_state = create_initial_state(base_url) + + config = {"configurable": {"thread_id": self.thread_id}} + + # Run until first human checkpoint + async for event in self.app.astream(self.current_state, config): + self.current_state = event + if self.current_state.get("awaiting_human_input"): + break + + return self.current_state + + async def resume(self, human_input: dict) -> AgentState: + """Resume workflow after human input.""" + if not self.current_state or not self.thread_id: + raise ValueError("No active workflow to resume") + + checkpoint = self.current_state.get("checkpoint_name") + + # Apply human input based on checkpoint + if checkpoint == "confirm_pages": + self.current_state = handle_page_confirmation( + self.current_state, + human_input.get("confirmed_pages", []) + ) + elif checkpoint == "review_tokens": + self.current_state = handle_token_review( + self.current_state, + human_input.get("color_decisions", {}), + human_input.get("typography_decisions", {}), + human_input.get("spacing_decisions", {}), + ) + elif checkpoint == "select_upgrades": + self.current_state = handle_upgrade_selection( + self.current_state, + human_input.get("type_scale"), + human_input.get("spacing_system"), + human_input.get("naming_convention"), + human_input.get("color_ramps", {}), + human_input.get("a11y_fixes", []), + ) + elif checkpoint == "approve_export": + self.current_state = handle_export_approval( + self.current_state, + human_input.get("version_label", "v1") + ) + + config = {"configurable": {"thread_id": self.thread_id}} + 
+ # Continue until next checkpoint or completion + async for event in self.app.astream(self.current_state, config): + self.current_state = event + if self.current_state.get("awaiting_human_input"): + break + + return self.current_state + + def get_progress(self) -> dict: + """Get current workflow progress.""" + if not self.current_state: + return {"status": "not_started"} + return get_stage_progress(self.current_state) + + def get_state(self) -> AgentState | None: + """Get current state.""" + return self.current_state + + +# ============================================================================= +# CONVENIENCE FUNCTIONS +# ============================================================================= + +def create_workflow() -> WorkflowRunner: + """Create a new workflow runner instance.""" + return WorkflowRunner() + + +async def run_discovery_only(base_url: str) -> list: + """Run only the discovery phase (for testing).""" + from agents.crawler import PageDiscoverer + + discoverer = PageDiscoverer() + return await discoverer.discover(base_url) + + +async def run_extraction_only(pages: list[str], viewport: Viewport) -> dict: + """Run only the extraction phase (for testing).""" + from agents.extractor import TokenExtractor + + extractor = TokenExtractor(viewport=viewport) + return await extractor.extract(pages) diff --git a/agents/llm_agents.py b/agents/llm_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..b2786fef8bbadd3229054787fee1620a649593ea --- /dev/null +++ b/agents/llm_agents.py @@ -0,0 +1,1232 @@ +""" +Stage 2 LLM Agents — v3 Agentic Architecture +============================================== + +Each agent: +- Researches ALL token types (colors, typography, spacing, radius, shadows) +- Uses ReAct framework: THINK → ACT → OBSERVE → VERIFY +- Returns visible reasoning chain for the UI +- Has a Python-based critic for validation + +Agents run IN PARALLEL (asyncio.gather), then NEXUS compiles. 
+ +Agent Responsibilities: +- AURORA: Brand identity + semantic naming for ALL colors + notes on all token types +- SENTINEL: Best practices audit across ALL token types, grounded in rule-engine data +- ATLAS: Benchmark comparison for ALL token types +- NEXUS (HEAD): Tree-of-Thought synthesis, compiles all agent outputs +""" + +import json +import re +from dataclasses import dataclass, field +from typing import Optional, Callable, Any +from datetime import datetime + + +# ============================================================================= +# DATA CLASSES — v3: includes reasoning_trace + naming_map +# ============================================================================= + +@dataclass +class BrandIdentification: + """Results from AURORA — Brand Identifier (ReAct).""" + brand_primary: dict = field(default_factory=dict) + brand_secondary: dict = field(default_factory=dict) + brand_accent: dict = field(default_factory=dict) + palette_strategy: str = "" + cohesion_score: int = 5 + cohesion_notes: str = "" + + # v3: naming_map covers ALL colors, not just top 10 + naming_map: dict = field(default_factory=dict) + # {hex: "color.brand.primary"} or {hex: "color.blue.500"} + + semantic_names: dict = field(default_factory=dict) # backward compat + self_evaluation: dict = field(default_factory=dict) + + # v3: reasoning trace visible to user + reasoning_trace: list = field(default_factory=list) + validation_passed: bool = False + retry_count: int = 0 + + # v3: per-token-type observations + typography_notes: str = "" + spacing_notes: str = "" + radius_notes: str = "" + shadow_notes: str = "" + + def to_dict(self) -> dict: + return { + "brand_primary": self.brand_primary, + "brand_secondary": self.brand_secondary, + "brand_accent": self.brand_accent, + "palette_strategy": self.palette_strategy, + "cohesion_score": self.cohesion_score, + "cohesion_notes": self.cohesion_notes, + "naming_map": self.naming_map, + "semantic_names": self.semantic_names, + 
"self_evaluation": self.self_evaluation, + "typography_notes": self.typography_notes, + "spacing_notes": self.spacing_notes, + "radius_notes": self.radius_notes, + "shadow_notes": self.shadow_notes, + } + + +@dataclass +class BenchmarkAdvice: + """Results from ATLAS — Benchmark Advisor (ReAct).""" + recommended_benchmark: str = "" + recommended_benchmark_name: str = "" + reasoning: str = "" + alignment_changes: list = field(default_factory=list) + pros_of_alignment: list = field(default_factory=list) + cons_of_alignment: list = field(default_factory=list) + alternative_benchmarks: list = field(default_factory=list) + self_evaluation: dict = field(default_factory=dict) + + # v3: per-token-type benchmark comparison + typography_comparison: dict = field(default_factory=dict) + spacing_comparison: dict = field(default_factory=dict) + color_comparison: dict = field(default_factory=dict) + radius_comparison: dict = field(default_factory=dict) + shadow_comparison: dict = field(default_factory=dict) + + reasoning_trace: list = field(default_factory=list) + + def to_dict(self) -> dict: + return { + "recommended_benchmark": self.recommended_benchmark, + "recommended_benchmark_name": self.recommended_benchmark_name, + "reasoning": self.reasoning, + "alignment_changes": self.alignment_changes, + "pros": self.pros_of_alignment, + "cons": self.cons_of_alignment, + "alternatives": self.alternative_benchmarks, + "self_evaluation": self.self_evaluation, + "typography_comparison": self.typography_comparison, + "spacing_comparison": self.spacing_comparison, + "color_comparison": self.color_comparison, + "radius_comparison": self.radius_comparison, + "shadow_comparison": self.shadow_comparison, + } + + +@dataclass +class BestPracticesResult: + """Results from SENTINEL — Best Practices Auditor (ReAct).""" + overall_score: int = 50 + checks: dict = field(default_factory=dict) + priority_fixes: list = field(default_factory=list) + passing_practices: list = field(default_factory=list) + 
failing_practices: list = field(default_factory=list) + self_evaluation: dict = field(default_factory=dict) + + # v3: per-token-type assessments + color_assessment: dict = field(default_factory=dict) + typography_assessment: dict = field(default_factory=dict) + spacing_assessment: dict = field(default_factory=dict) + radius_assessment: dict = field(default_factory=dict) + shadow_assessment: dict = field(default_factory=dict) + + reasoning_trace: list = field(default_factory=list) + validation_passed: bool = False + + def to_dict(self) -> dict: + return { + "overall_score": self.overall_score, + "checks": self.checks, + "priority_fixes": self.priority_fixes, + "passing": self.passing_practices, + "failing": self.failing_practices, + "self_evaluation": self.self_evaluation, + "color_assessment": self.color_assessment, + "typography_assessment": self.typography_assessment, + "spacing_assessment": self.spacing_assessment, + "radius_assessment": self.radius_assessment, + "shadow_assessment": self.shadow_assessment, + } + + +@dataclass +class HeadSynthesis: + """Results from NEXUS — HEAD Synthesizer (Tree of Thought).""" + executive_summary: str = "" + scores: dict = field(default_factory=dict) + benchmark_fit: dict = field(default_factory=dict) + brand_analysis: dict = field(default_factory=dict) + top_3_actions: list = field(default_factory=list) + color_recommendations: list = field(default_factory=list) + type_scale_recommendation: dict = field(default_factory=dict) + spacing_recommendation: dict = field(default_factory=dict) + radius_recommendation: dict = field(default_factory=dict) + shadow_recommendation: dict = field(default_factory=dict) + self_evaluation: dict = field(default_factory=dict) + + # v3: ToT branches visible to user + perspective_a: dict = field(default_factory=dict) + perspective_b: dict = field(default_factory=dict) + chosen_perspective: str = "" + choice_reasoning: str = "" + + reasoning_trace: list = field(default_factory=list) + + def 
to_dict(self) -> dict: + return { + "executive_summary": self.executive_summary, + "scores": self.scores, + "benchmark_fit": self.benchmark_fit, + "brand_analysis": self.brand_analysis, + "top_3_actions": self.top_3_actions, + "color_recommendations": self.color_recommendations, + "type_scale_recommendation": self.type_scale_recommendation, + "spacing_recommendation": self.spacing_recommendation, + "radius_recommendation": self.radius_recommendation, + "shadow_recommendation": self.shadow_recommendation, + "self_evaluation": self.self_evaluation, + "chosen_perspective": self.chosen_perspective, + "choice_reasoning": self.choice_reasoning, + } + + +# ============================================================================= +# SHARED HELPERS — format token data for prompts +# ============================================================================= + +def _fmt_colors(tokens: dict, limit: int = 40) -> str: + """Format color tokens for any agent prompt.""" + if not tokens: + return "No color data" + lines = [] + for name, t in list(tokens.items())[:limit]: + d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} + hex_val = d.get("value", "") + freq = d.get("frequency", 0) + hint = d.get("role_hint", "") + ctx = ", ".join((d.get("contexts") or [])[:3]) + els = ", ".join((d.get("elements") or [])[:3]) + hint_s = f" [hint:{hint}]" if hint else "" + lines.append(f"- {hex_val}: {freq}x, ctx=[{ctx}], el=[{els}]{hint_s}") + return "\n".join(lines) + + +def _fmt_typography(tokens: dict, limit: int = 15) -> str: + if not tokens: + return "No typography data" + lines = [] + for name, t in list(tokens.items())[:limit]: + d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} + fam = d.get("font_family", "?") + sz = d.get("font_size", "?") + w = d.get("font_weight", 400) + lh = d.get("line_height", "?") + freq = d.get("frequency", 0) + els = ", ".join((d.get("elements") or [])[:3]) + lines.append(f"- {fam} {sz} w{w} lh={lh} 
({freq}x) [{els}]") + return "\n".join(lines) + + +def _fmt_spacing(tokens: dict, limit: int = 15) -> str: + if not tokens: + return "No spacing data" + lines = [] + for name, t in list(tokens.items())[:limit]: + d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} + val = d.get("value", "?") + px = d.get("value_px", "?") + freq = d.get("frequency", 0) + ctx = ", ".join((d.get("contexts") or [])[:3]) + lines.append(f"- {val} ({px}px) {freq}x [{ctx}]") + return "\n".join(lines) + + +def _fmt_radius(tokens: dict, limit: int = 10) -> str: + if not tokens: + return "No radius data" + lines = [] + for name, t in list(tokens.items())[:limit]: + d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} + val = d.get("value", "?") + px = d.get("value_px", "?") + freq = d.get("frequency", 0) + b4 = d.get("fits_base_4", False) + b8 = d.get("fits_base_8", False) + els = ", ".join((d.get("elements") or [])[:3]) + lines.append(f"- {name}: {val} (base4={b4}, base8={b8}, {freq}x) [{els}]") + return "\n".join(lines) + + +def _fmt_shadows(tokens: dict, limit: int = 10) -> str: + if not tokens: + return "No shadow data" + lines = [] + for name, t in list(tokens.items())[:limit]: + d = t if isinstance(t, dict) else t.__dict__ if hasattr(t, '__dict__') else {} + blur = d.get("blur_px", "?") + y = d.get("y_offset_px", "?") + freq = d.get("frequency", 0) + els = ", ".join((d.get("elements") or [])[:3]) + lines.append(f"- {name}: blur={blur}px y={y}px ({freq}x) [{els}]") + return "\n".join(lines) + + +def _log_reasoning(steps: list, log_fn: Callable): + """Log ReAct reasoning steps with icons.""" + icons = {"THINK": "🧠", "ACT": "⚡", "OBSERVE": "👁️", "VERIFY": "✅"} + for step in (steps or []): + if isinstance(step, dict): + st = step.get("step", "?") + area = step.get("area", "") + content = step.get("content", "")[:90] + icon = icons.get(st, "📝") + log_fn(f" {icon} [{area}] {content}") + + +def _extract_hexes(tokens: dict) -> list: + """Get 
list of hex values from color token dict.""" + hexes = [] + for name, t in tokens.items(): + if isinstance(t, dict): + h = t.get("value", "") + else: + h = getattr(t, "value", "") + if h: + hexes.append(h.lower()) + return hexes + + +# ============================================================================= +# AURORA — Brand Identifier (ReAct Framework) +# ============================================================================= + +class BrandIdentifierAgent: + """ + AURORA — Senior Brand & Visual Identity Analyst. + ReAct on ALL token types. Names ALL colors. + Model: Qwen 72B · Temperature: 0.4 + """ + + SYSTEM_PROMPT = """You are AURORA, a Senior Brand & Visual Identity Analyst. + +## REASONING FRAMEWORK (ReAct) +Structure your response with explicit reasoning steps. +For each area: THINK → ACT → OBSERVE → VERIFY. + +## ANALYZE ALL TOKEN TYPES: + +### 1. COLORS (Primary focus) +- Identify brand primary/secondary/accent from usage + role_hints +- Name EVERY color: color.{role}.{sub} or color.{hue}.{shade} +- Shades MUST be numeric (50-900), NEVER words (light/dark/base) +- Role colors: color.brand.primary, color.text.primary, color.bg.primary +- Palette colors: color.blue.500, color.neutral.200 + +### 2. TYPOGRAPHY — Identify heading vs body hierarchy, font pairing +### 3. SPACING — Identify grid system, note consistency +### 4. RADIUS — Identify radius strategy (sharp/rounded/pill) +### 5. SHADOWS — Identify elevation strategy, blur progression + +## QUALITY RULES +- naming_map MUST include EVERY hex color — no orphans +- Brand Primary MUST cite usage evidence (e.g. "47x on buttons") +- Cohesion 1-10: most sites score 5-7. Use the full range. 
+ +## OUTPUT (JSON) + +{ + "reasoning_steps": [ + {"step": "THINK", "area": "colors", "content": "..."}, + {"step": "ACT", "area": "colors", "content": "..."}, + {"step": "OBSERVE", "area": "typography", "content": "..."}, + {"step": "ACT", "area": "typography", "content": "..."}, + {"step": "ACT", "area": "spacing", "content": "..."}, + {"step": "ACT", "area": "radius", "content": "..."}, + {"step": "ACT", "area": "shadows", "content": "..."}, + {"step": "VERIFY", "area": "all", "content": "Cross-checking consistency..."} + ], + "brand_primary": {"color": "#hex", "confidence": "high|medium|low", "reasoning": "cite evidence", "usage_count": N}, + "brand_secondary": {"color": "#hex", "confidence": "...", "reasoning": "..."}, + "brand_accent": {"color": "#hex or null", "confidence": "...", "reasoning": "..."}, + "palette_strategy": "complementary|analogous|triadic|monochromatic|random", + "cohesion_score": N, + "cohesion_notes": "...", + "naming_map": {"#hex1": "color.brand.primary", "#hex2": "color.blue.500", ...}, + "typography_notes": "Heading: Inter 700, Body: Inter 400. Clean hierarchy.", + "spacing_notes": "8px grid, 92% aligned.", + "radius_notes": "Rounded style: 4px inputs, 8px cards.", + "shadow_notes": "3-level elevation: blur 4/8/24px.", + "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "good|fair|poor", "flags": []} +} + +Return ONLY valid JSON.""" + + PROMPT_TEMPLATE = """Analyze the complete design system. + +## COLORS (with role_hints) +{color_data} + +## TYPOGRAPHY +{typography_data} + +## SPACING +{spacing_data} + +## RADIUS +{radius_data} + +## SHADOWS +{shadow_data} + +Use ReAct for each area. 
Name EVERY color in naming_map.""" + + def __init__(self, hf_client): + self.hf_client = hf_client + + async def analyze( + self, + color_tokens: dict, + typography_tokens: dict = None, + spacing_tokens: dict = None, + radius_tokens: dict = None, + shadow_tokens: dict = None, + log_callback: Callable = None, + ) -> BrandIdentification: + def log(msg): + if log_callback: + log_callback(msg) + + log(" 🎨 AURORA — Brand & Visual Identity (Qwen 72B)") + log(" └─ ReAct: Analyzing colors + typography + spacing + radius + shadows...") + + prompt = self.PROMPT_TEMPLATE.format( + color_data=_fmt_colors(color_tokens), + typography_data=_fmt_typography(typography_tokens), + spacing_data=_fmt_spacing(spacing_tokens), + radius_data=_fmt_radius(radius_tokens), + shadow_data=_fmt_shadows(shadow_tokens), + ) + + try: + start = datetime.now() + response = await self.hf_client.complete_async( + agent_name="brand_identifier", + system_prompt=self.SYSTEM_PROMPT, + user_message=prompt, + max_tokens=2000, + json_mode=True, + ) + dur = (datetime.now() - start).total_seconds() + result = self._parse(response) + + # Critic validation + input_hexes = _extract_hexes(color_tokens) + passed, errors = validate_aurora_output(result, input_hexes) + result.validation_passed = passed + + if not passed and result.retry_count == 0: + log(f" ⚠️ Critic: {len(errors)} issues — retrying with feedback...") + for e in errors[:3]: + log(f" └─ {e}") + retry_prompt = prompt + "\n\n## CRITIC FEEDBACK — Fix:\n" + "\n".join(errors[:10]) + resp2 = await self.hf_client.complete_async( + agent_name="brand_identifier", + system_prompt=self.SYSTEM_PROMPT, + user_message=retry_prompt, + max_tokens=2000, + json_mode=True, + ) + result = self._parse(resp2) + result.retry_count = 1 + p2, e2 = validate_aurora_output(result, input_hexes) + result.validation_passed = p2 + if not p2: + log(f" ⚠️ Retry: still {len(e2)} issues — using normalizer fallback names") + + # Log reasoning chain + log(f" 
─────────────────────────────────────────") + log(f" 🎨 AURORA — COMPLETE ({dur:.1f}s)") + _log_reasoning(result.reasoning_trace, log) + log(f" ├─ Brand Primary: {result.brand_primary.get('color', '?')} ({result.brand_primary.get('confidence', '?')})") + log(f" ├─ Palette: {result.palette_strategy} · Cohesion: {result.cohesion_score}/10") + log(f" ├─ Colors Named: {len(result.naming_map)}/{len(input_hexes)}") + log(f" ├─ Typography: {(result.typography_notes or 'N/A')[:60]}") + log(f" ├─ Spacing: {(result.spacing_notes or 'N/A')[:60]}") + log(f" ├─ Radius: {(result.radius_notes or 'N/A')[:60]}") + log(f" ├─ Shadows: {(result.shadow_notes or 'N/A')[:60]}") + log(f" └─ Critic: {'✅ PASSED' if result.validation_passed else '⚠️ FALLBACK'}") + return result + + except Exception as e: + log(f" ⚠️ AURORA failed: {str(e)[:120]}") + return BrandIdentification() + + def _parse(self, response: str) -> BrandIdentification: + try: + m = re.search(r'\{[\s\S]*\}', response) + if m: + d = json.loads(m.group()) + return BrandIdentification( + brand_primary=d.get("brand_primary", {}), + brand_secondary=d.get("brand_secondary", {}), + brand_accent=d.get("brand_accent", {}), + palette_strategy=d.get("palette_strategy", "unknown"), + cohesion_score=d.get("cohesion_score", 5), + cohesion_notes=d.get("cohesion_notes", ""), + naming_map=d.get("naming_map", {}), + semantic_names=d.get("naming_map", {}), + self_evaluation=d.get("self_evaluation", {}), + reasoning_trace=d.get("reasoning_steps", []), + typography_notes=d.get("typography_notes", ""), + spacing_notes=d.get("spacing_notes", ""), + radius_notes=d.get("radius_notes", ""), + shadow_notes=d.get("shadow_notes", ""), + ) + except Exception: + pass + return BrandIdentification() + + +# ============================================================================= +# ATLAS — Benchmark Advisor (ReAct Framework) +# ============================================================================= + +class BenchmarkAdvisorAgent: + """ + ATLAS — 
Senior Design System Benchmark Analyst. + ReAct comparison of ALL token types against industry benchmarks. + Model: Llama 3.3 70B · Temperature: 0.25 + """ + + SYSTEM_PROMPT = """You are ATLAS, a Senior Design System Benchmark Analyst. + +## REASONING FRAMEWORK (ReAct) +For EACH token type: THINK → ACT → OBSERVE → VERIFY. + +Compare the user's values against benchmarks for: +1. TYPOGRAPHY — ratio, base size, scale pattern +2. SPACING — grid base, alignment, scale +3. COLORS — palette size, brand color usage +4. RADIUS — strategy (sharp/rounded/pill), tier count +5. SHADOWS — elevation levels, blur range + +Then pick the BEST OVERALL FIT benchmark. +Max 4 alignment changes. If >85% match, say "already well-aligned". + +## OUTPUT (JSON) + +{ + "reasoning_steps": [ + {"step": "THINK", "area": "typography", "content": "User ratio 1.18 vs Material 1.25..."}, + {"step": "ACT", "area": "typography", "content": "Material closest for type"}, + {"step": "THINK", "area": "spacing", "content": "8px matches Material and Polaris"}, + {"step": "ACT", "area": "spacing", "content": "Both aligned"}, + {"step": "THINK", "area": "colors", "content": "25 colors vs Polaris 18..."}, + {"step": "THINK", "area": "radius", "content": "4/8px tiers..."}, + {"step": "THINK", "area": "shadows", "content": "3 levels vs Material 5..."}, + {"step": "VERIFY", "area": "overall", "content": "Material best: 4/5 areas align"} + ], + "recommended_benchmark": "material_design_3", + "recommended_benchmark_name": "Material Design 3", + "reasoning": "Best fit across all token types — cite data", + "alignment_changes": [ + {"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium", "token_type": "typography"} + ], + "typography_comparison": {"user": "1.18", "benchmark": "1.25", "gap": "minor"}, + "spacing_comparison": {"user": "8px", "benchmark": "8px", "gap": "aligned"}, + "color_comparison": {"user": "25", "benchmark": "18", "gap": "reduce"}, + "radius_comparison": {"user": "2 tiers", 
"benchmark": "3 tiers", "gap": "add xl"}, + "shadow_comparison": {"user": "3 levels", "benchmark": "5 levels", "gap": "add 2"}, + "pros_of_alignment": ["..."], + "cons_of_alignment": ["..."], + "alternative_benchmarks": [{"name": "Polaris", "reason": "..."}], + "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []} +} + +Return ONLY valid JSON.""" + + PROMPT_TEMPLATE = """Compare this design system against benchmarks — ALL token types. + +## CURRENT VALUES +- Type Scale Ratio: {user_ratio} | Base: {user_base}px | Sizes: {user_sizes} +- Spacing Grid: {user_spacing}px | Values: {spacing_values} +- Colors: {color_count} unique | Brand: {brand_info} +- Radius: {radius_data} +- Shadows: {shadow_data} + +## BENCHMARKS +{benchmark_comparison} + +Use ReAct per token type. Pick the best overall fit.""" + + def __init__(self, hf_client): + self.hf_client = hf_client + + async def analyze( + self, + user_ratio: float, user_base: int, user_spacing: int, + benchmark_comparisons: list, + color_count: int = 0, brand_info: str = "", + user_sizes: str = "", spacing_values: str = "", + radius_data: str = "", shadow_data: str = "", + log_callback: Callable = None, + ) -> BenchmarkAdvice: + def log(msg): + if log_callback: + log_callback(msg) + + log("") + log(" 🏢 ATLAS — Benchmark Advisor (Llama 3.3 70B)") + log(" └─ ReAct: Comparing typography + spacing + colors + radius + shadows...") + + prompt = self.PROMPT_TEMPLATE.format( + user_ratio=user_ratio, user_base=user_base, user_spacing=user_spacing, + user_sizes=user_sizes or "N/A", + spacing_values=spacing_values or "N/A", + color_count=color_count, brand_info=brand_info or "N/A", + radius_data=radius_data or "No radius data", + shadow_data=shadow_data or "No shadow data", + benchmark_comparison=self._fmt_benchmarks(benchmark_comparisons), + ) + + try: + start = datetime.now() + response = await self.hf_client.complete_async( + agent_name="benchmark_advisor", + system_prompt=self.SYSTEM_PROMPT, 
+ user_message=prompt, + max_tokens=1500, + json_mode=True, + ) + dur = (datetime.now() - start).total_seconds() + result = self._parse(response) + + log(f" ─────────────────────────────────────────") + log(f" 🏢 ATLAS — COMPLETE ({dur:.1f}s)") + _log_reasoning(result.reasoning_trace, log) + log(f" ├─ Recommended: {result.recommended_benchmark_name}") + log(f" ├─ Changes: {len(result.alignment_changes)}") + log(f" ├─ Typography: {result.typography_comparison}") + log(f" ├─ Spacing: {result.spacing_comparison}") + log(f" ├─ Colors: {result.color_comparison}") + log(f" ├─ Radius: {result.radius_comparison}") + log(f" └─ Shadows: {result.shadow_comparison}") + return result + + except Exception as e: + log(f" ⚠️ ATLAS failed: {str(e)[:120]}") + return BenchmarkAdvice() + + def _fmt_benchmarks(self, comparisons: list) -> str: + lines = [] + for i, c in enumerate(comparisons[:5]): + b = c.benchmark + lines.append(f"{i+1}. {b.icon} {b.name} — Match: {c.overall_match_pct:.0f}%" + f" | Type: {b.typography.get('scale_ratio', '?')}" + f" | Spacing: {b.spacing.get('base', '?')}px" + f" | Best for: {', '.join(b.best_for)}") + return "\n".join(lines) if lines else "No benchmark data" + + def _parse(self, response: str) -> BenchmarkAdvice: + try: + m = re.search(r'\{[\s\S]*\}', response) + if m: + d = json.loads(m.group()) + return BenchmarkAdvice( + recommended_benchmark=d.get("recommended_benchmark", ""), + recommended_benchmark_name=d.get("recommended_benchmark_name", ""), + reasoning=d.get("reasoning", ""), + alignment_changes=d.get("alignment_changes", []), + pros_of_alignment=d.get("pros_of_alignment", []), + cons_of_alignment=d.get("cons_of_alignment", []), + alternative_benchmarks=d.get("alternative_benchmarks", []), + self_evaluation=d.get("self_evaluation", {}), + typography_comparison=d.get("typography_comparison", {}), + spacing_comparison=d.get("spacing_comparison", {}), + color_comparison=d.get("color_comparison", {}), + radius_comparison=d.get("radius_comparison", 
{}), + shadow_comparison=d.get("shadow_comparison", {}), + reasoning_trace=d.get("reasoning_steps", []), + ) + except Exception: + pass + return BenchmarkAdvice() + + +# ============================================================================= +# SENTINEL — Best Practices Auditor (ReAct + Grounded Scoring) +# ============================================================================= + +class BestPracticesValidatorAgent: + """ + SENTINEL — Design System Best Practices Auditor. + ReAct: Grounds EVERY score in actual rule-engine data. Audits ALL token types. + Model: Qwen 72B · Temperature: 0.2 + """ + + SYSTEM_PROMPT = """You are SENTINEL, a Design System Best Practices Auditor. + +## REASONING FRAMEWORK (ReAct + Grounded) +For EACH check: THINK → ACT (cite data) → OBSERVE → VERIFY. +You MUST CITE the exact input data for every score. + +## AUDIT ALL TOKEN TYPES: + +### COLORS (25 pts) +- aa_compliance: CITE AA pass/fail count +- color_count: < 20 semantic colors ideal +- near_duplicates: should be 0 + +### TYPOGRAPHY (25 pts) +- type_scale_standard: nearest standard ratio +- type_scale_consistent: variance check +- base_size_accessible: >= 16px + +### SPACING (20 pts) +- spacing_grid: 4px or 8px consistency +- spacing_alignment: > 80% target + +### RADIUS (15 pts) +- radius_consistency: base-4/8 grid, clear tiers + +### SHADOWS (15 pts) +- shadow_system: elevation hierarchy, blur progression + +## CRITICAL: If data says 7 AA failures, you CANNOT say "pass". 
+ +## OUTPUT (JSON) + +{ + "reasoning_steps": [ + {"step": "THINK", "area": "colors", "content": "7/25 fail AA = 28%"}, + {"step": "ACT", "area": "colors", "content": "aa_compliance = FAIL"}, + {"step": "THINK", "area": "typography", "content": "ratio 1.18, variance 0.22"}, + {"step": "ACT", "area": "typography", "content": "type_scale_consistent = WARN"}, + {"step": "THINK", "area": "spacing", "content": "8px base, 85% aligned"}, + {"step": "ACT", "area": "spacing", "content": "spacing_grid = PASS"}, + {"step": "THINK", "area": "radius", "content": "4px,8px,16px all base-4"}, + {"step": "ACT", "area": "radius", "content": "radius_consistency = PASS"}, + {"step": "THINK", "area": "shadows", "content": "3 levels, blur 4→8→24"}, + {"step": "ACT", "area": "shadows", "content": "shadow_system = WARN"}, + {"step": "VERIFY", "area": "scoring", "content": "3 pass, 2 warn, 1 fail → 62/100"} + ], + "overall_score": N, + "checks": { + "aa_compliance": {"status": "pass|warn|fail", "note": "CITE: 7/25 fail AA"}, + "type_scale_standard": {"status": "...", "note": "CITE: ratio 1.18 nearest 1.2"}, + "type_scale_consistent": {"status": "...", "note": "CITE: variance 0.22 > 0.15"}, + "base_size_accessible": {"status": "...", "note": "CITE: base = Npx"}, + "spacing_grid": {"status": "...", "note": "CITE: N% aligned to Npx"}, + "color_count": {"status": "...", "note": "CITE: N unique colors"}, + "near_duplicates": {"status": "...", "note": "CITE: N pairs"}, + "radius_consistency": {"status": "...", "note": "CITE: tiers and grid"}, + "shadow_system": {"status": "...", "note": "CITE: N levels, progression"} + }, + "color_assessment": {"aa_pass_rate": "72%", "palette_size": 25, "verdict": "needs work"}, + "typography_assessment": {"ratio": 1.18, "consistent": false, "base_ok": true, "verdict": "fair"}, + "spacing_assessment": {"grid": "8px", "alignment": "85%", "verdict": "good"}, + "radius_assessment": {"tiers": 3, "base_aligned": true, "verdict": "good"}, + "shadow_assessment": 
{"levels": 3, "progression": "non-linear", "verdict": "fair"}, + "priority_fixes": [ + {"rank": 1, "issue": "...", "impact": "high", "effort": "low", "action": "Specific fix", "token_type": "color"} + ], + "passing_practices": ["spacing_grid"], + "failing_practices": ["aa_compliance"], + "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []} +} + +Return ONLY valid JSON.""" + + PROMPT_TEMPLATE = """Audit this design system. CITE the data for every score. + +## RULE ENGINE FACTS (verified) + +### Typography +- Ratio: {type_ratio} ({type_consistent}) | Base: {base_size}px | Sizes: {sizes} + +### Accessibility +- Total: {total_colors} | AA Pass: {aa_pass} | AA Fail: {aa_fail} +- Failing: {failing_colors} + +### Spacing +- Base: {spacing_base}px | Aligned: {spacing_aligned}% | Values: {spacing_values} + +### Color Stats +- Unique: {unique_colors} | Near-Duplicates: {near_duplicates} + +### Radius +{radius_data} + +### Shadows +{shadow_data} + +CITE the EXACT numbers above for every check.""" + + def __init__(self, hf_client): + self.hf_client = hf_client + + async def analyze( + self, + rule_engine_results: Any, + radius_tokens: dict = None, + shadow_tokens: dict = None, + log_callback: Callable = None, + ) -> BestPracticesResult: + def log(msg): + if log_callback: + log_callback(msg) + + log("") + log(" ✅ SENTINEL — Best Practices Auditor (Qwen 72B)") + log(" └─ ReAct: Auditing colors + typography + spacing + radius + shadows...") + + typo = rule_engine_results.typography + spacing = rule_engine_results.spacing + color_stats = rule_engine_results.color_stats + accessibility = rule_engine_results.accessibility + failures = [a for a in accessibility if not a.passes_aa_normal] + failing_str = ", ".join([f"{a.hex_color} ({a.contrast_on_white:.1f}:1)" for a in failures[:8]]) + sizes_str = ", ".join([f"{s}px" for s in typo.sizes_px[:8]]) if typo.sizes_px else "N/A" + sp_vals = ", ".join([f"{v}px" for v in spacing.current_values[:10]]) 
if hasattr(spacing, 'current_values') and spacing.current_values else "N/A" + + prompt = self.PROMPT_TEMPLATE.format( + type_ratio=f"{typo.detected_ratio:.3f}", + type_consistent="consistent" if typo.is_consistent else f"inconsistent (var={typo.variance:.2f})", + base_size=typo.sizes_px[0] if typo.sizes_px else 16, + sizes=sizes_str, + total_colors=len(accessibility), + aa_pass=len(accessibility) - len(failures), + aa_fail=len(failures), + failing_colors=failing_str or "None", + spacing_base=spacing.detected_base, + spacing_aligned=f"{spacing.alignment_percentage:.0f}", + spacing_values=sp_vals, + unique_colors=color_stats.unique_count, + near_duplicates=len(color_stats.near_duplicates), + radius_data=_fmt_radius(radius_tokens) if radius_tokens else "No radius data", + shadow_data=_fmt_shadows(shadow_tokens) if shadow_tokens else "No shadow data", + ) + + try: + start = datetime.now() + response = await self.hf_client.complete_async( + agent_name="best_practices_validator", + system_prompt=self.SYSTEM_PROMPT, + user_message=prompt, + max_tokens=2000, + json_mode=True, + ) + dur = (datetime.now() - start).total_seconds() + result = self._parse(response) + + # Critic cross-reference + passed, errors = validate_sentinel_output(result, rule_engine_results) + result.validation_passed = passed + if not passed: + log(f" ⚠️ Critic: {len(errors)} issues — applying fixes...") + for e in errors[:3]: + log(f" └─ {e}") + result = _apply_sentinel_fixes(result, rule_engine_results, errors) + + log(f" ─────────────────────────────────────────") + log(f" ✅ SENTINEL — COMPLETE ({dur:.1f}s)") + _log_reasoning(result.reasoning_trace, log) + log(f" ├─ Overall Score: {result.overall_score}/100") + for cn, cv in (result.checks or {}).items(): + if isinstance(cv, dict): + s = cv.get("status", "?") + si = {"pass": "✅", "warn": "⚠️", "fail": "❌"}.get(s, "?") + log(f" │ {si} {cn}: {s}") + log(f" ├─ Priority Fixes: {len(result.priority_fixes)}") + log(f" └─ Critic: {'✅ PASSED' if 
                           result.validation_passed else '⚠️ FIXED'}")
            return result

        except Exception as e:
            # Best-effort agent: any failure degrades to an empty result
            # instead of propagating (graceful degradation).
            log(f"   ⚠️ SENTINEL failed: {str(e)[:120]}")
            return BestPracticesResult()

    def _parse(self, response: str) -> BestPracticesResult:
        """Parse the LLM's JSON response into a BestPracticesResult.

        Extracts the first {...} span from the raw response (models often
        wrap JSON in prose), maps its fields onto the dataclass, and falls
        back to an empty/default result on any parse failure.
        """
        try:
            # Greedy match grabs the outermost JSON object in the response.
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return BestPracticesResult(
                    overall_score=d.get("overall_score", 50),
                    checks=d.get("checks", {}),
                    priority_fixes=d.get("priority_fixes", []),
                    passing_practices=d.get("passing_practices", []),
                    failing_practices=d.get("failing_practices", []),
                    self_evaluation=d.get("self_evaluation", {}),
                    color_assessment=d.get("color_assessment", {}),
                    typography_assessment=d.get("typography_assessment", {}),
                    spacing_assessment=d.get("spacing_assessment", {}),
                    radius_assessment=d.get("radius_assessment", {}),
                    shadow_assessment=d.get("shadow_assessment", {}),
                    reasoning_trace=d.get("reasoning_steps", []),
                )
        except Exception:
            # Malformed JSON → fall through to default result below.
            pass
        return BestPracticesResult()


# =============================================================================
# NEXUS — HEAD Synthesizer (Tree of Thought)
# =============================================================================

class HeadSynthesizerAgent:
    """
    NEXUS — Senior Design System Architect.
    Tree of Thought: 2 perspectives, picks best, compiles all agent outputs.
    Recommendations for ALL token types.
    Model: Llama 3.3 70B · Temperature: 0.3
    """

    SYSTEM_PROMPT = """You are NEXUS, a Senior Design System Architect — the final synthesizer.

## REASONING FRAMEWORK (Tree of Thought)
Evaluate TWO perspectives:

### PERSPECTIVE A — Accessibility-First
Weights: accessibility=40%, consistency=30%, organization=30%
Penalize heavily for AA failures.

### PERSPECTIVE B — Balanced
Weights: accessibility=30%, consistency=35%, organization=35%
Equal emphasis across areas.

For each: calculate scores, determine top 3 actions.
Then CHOOSE the perspective that better reflects reality.

## SYNTHESIZE ALL TOKEN TYPES:
- Colors: AURORA brand + SENTINEL AA findings → color recommendations
- Typography: ATLAS benchmark match + SENTINEL scale audit → type scale rec
- Spacing: ATLAS grid comparison + SENTINEL alignment → spacing rec
- Radius: SENTINEL consistency + ATLAS benchmark → radius rec
- Shadows: SENTINEL elevation + ATLAS benchmark → shadow rec

## OUTPUT (JSON)

{
  "reasoning_steps": [
    {"step": "THINK", "area": "perspective_a", "content": "Accessibility-first weighting..."},
    {"step": "ACT", "area": "perspective_a", "content": "Score: overall=52..."},
    {"step": "THINK", "area": "perspective_b", "content": "Balanced weighting..."},
    {"step": "ACT", "area": "perspective_b", "content": "Score: overall=63..."},
    {"step": "OBSERVE", "area": "comparison", "content": "A shows severity of AA failures..."},
    {"step": "VERIFY", "area": "decision", "content": "Choosing A — honest about AA issues"}
  ],
  "perspective_a": {"scores": {"overall": 52, "accessibility": 38, "consistency": 72, "organization": 68}, "reasoning": "..."},
  "perspective_b": {"scores": {"overall": 63, "accessibility": 45, "consistency": 72, "organization": 68}, "reasoning": "..."},
  "chosen_perspective": "A",
  "choice_reasoning": "AA failures affect real users — lower score is more honest",
  "executive_summary": "Your design system scores X/100...",
  "scores": {"overall": 52, "accessibility": 38, "consistency": 72, "organization": 68},
  "top_3_actions": [
    {"action": "Fix AA compliance", "impact": "high", "effort": "medium", "details": "#X→#Y", "token_type": "color"}
  ],
  "color_recommendations": [
    {"role": "brand.primary", "current": "#hex", "suggested": "#hex", "reason": "AA", "accept": true}
  ],
  "type_scale_recommendation": {"current_ratio": 1.18, "recommended_ratio": 1.25, "reason": "..."},
  "spacing_recommendation": {"current": "8px", "recommended": "8px", "reason": "Already aligned"},
  "radius_recommendation": {"current": "3 tiers", "recommended": "Add xl tier", "reason": "..."},
  "shadow_recommendation": {"current": "3 levels", "recommended": "Add 2 more", "reason": "..."},
  "benchmark_fit": {"closest": "Material", "similarity": "78%", "recommendation": "..."},
  "brand_analysis": {"primary": "#hex", "secondary": "#hex", "cohesion": 7},
  "self_evaluation": {"confidence": N, "reasoning": "...", "data_quality": "...", "flags": []}
}

Return ONLY valid JSON."""

    PROMPT_TEMPLATE = """Synthesize all analysis into a final report.

## RULE ENGINE FACTS
- Type: {type_ratio} ({type_status}) | Base: {base_size}px
- AA Failures: {aa_failures}/{total_colors}
- Spacing: {spacing_status}
- Colors: {unique_colors} unique | Consistency: {consistency_score}/100
- Radius: {radius_facts}
- Shadows: {shadow_facts}

## AURORA — Brand Analysis
- Primary: {brand_primary} ({brand_confidence}) | Secondary: {brand_secondary}
- Palette: {palette_strategy} | Cohesion: {cohesion_score}/10
- Typography: {aurora_typo}
- Spacing: {aurora_spacing}
- Radius: {aurora_radius}
- Shadows: {aurora_shadows}

## ATLAS — Benchmark
- Closest: {closest_benchmark} ({match_pct}%)
- Typo: {atlas_typo} | Spacing: {atlas_spacing} | Colors: {atlas_colors}
- Radius: {atlas_radius} | Shadows: {atlas_shadows}
- Changes: {benchmark_changes}

## SENTINEL — Audit
- Score: {best_practices_score}/100
- Color: {sentinel_color} | Typo: {sentinel_typo} | Spacing: {sentinel_spacing}
- Radius: {sentinel_radius} | Shadows: {sentinel_shadows}
- Fixes: {priority_fixes}

## AA FIXES NEEDED
{accessibility_fixes}

Evaluate from TWO perspectives (Tree of Thought). Choose one.
Recommend for ALL token types."""

    def __init__(self, hf_client):
        # hf_client: LLM client exposing complete_async(); injected so tests
        # can stub it — NOTE(review): exact client interface defined elsewhere.
        self.hf_client = hf_client

    async def synthesize(
        self,
        rule_engine_results: Any,
        benchmark_comparisons: list,
        brand_identification: BrandIdentification,
        benchmark_advice: BenchmarkAdvice,
        best_practices: BestPracticesResult,
        log_callback: Callable = None,
    ) -> HeadSynthesis:
        """Compile Rule Engine + AURORA + ATLAS + SENTINEL outputs into the
        final user-facing HeadSynthesis via one LLM call.

        Args:
            rule_engine_results: deterministic Layer-1 facts (typography,
                spacing, color_stats, accessibility, consistency_score).
            benchmark_comparisons: ranked benchmark matches (first = closest).
            brand_identification: AURORA output.
            benchmark_advice: ATLAS output.
            best_practices: SENTINEL output.
            log_callback: optional sink for progress lines.

        Returns:
            HeadSynthesis — empty/default instance if the LLM call or JSON
            parse fails (graceful degradation, never raises).
        """
        def log(msg):
            if log_callback:
                log_callback(msg)

        log("")
        log("═" * 60)
        log("🧠 NEXUS — HEAD SYNTHESIZER (Tree of Thought)")
        log("═" * 60)
        log("   Evaluating Perspective A (Accessibility-First) vs B (Balanced)...")
        log("   Compiling: Rule Engine + AURORA + ATLAS + SENTINEL...")

        typo = rule_engine_results.typography
        spacing = rule_engine_results.spacing
        color_stats = rule_engine_results.color_stats
        accessibility = rule_engine_results.accessibility
        # Colors failing WCAG AA for normal text; at most 8 with a concrete
        # suggested replacement are surfaced to the model.
        failures = [a for a in accessibility if not a.passes_aa_normal]
        aa_fixes_str = "\n".join([
            f"- {a.name}: {a.hex_color} ({a.contrast_on_white:.1f}:1) → {a.suggested_fix} ({a.suggested_fix_contrast:.1f}:1)"
            for a in failures[:8] if a.suggested_fix
        ])
        closest = benchmark_comparisons[0] if benchmark_comparisons else None

        def _s(obj):
            """Safely stringify a dict/value for prompt."""
            if isinstance(obj, dict):
                parts = [f"{k}={v}" for k, v in list(obj.items())[:4]]
                return ", ".join(parts) if parts else "N/A"
            return str(obj) if obj else "N/A"

        prompt = self.PROMPT_TEMPLATE.format(
            type_ratio=f"{typo.detected_ratio:.3f}",
            type_status="consistent" if typo.is_consistent else "inconsistent",
            base_size=typo.sizes_px[0] if typo.sizes_px else 16,
            aa_failures=len(failures), total_colors=len(accessibility),
            spacing_status=f"{spacing.detected_base}px, {spacing.alignment_percentage:.0f}% aligned",
            unique_colors=color_stats.unique_count,
            consistency_score=rule_engine_results.consistency_score,
            radius_facts=_s(best_practices.radius_assessment) or "N/A",
            shadow_facts=_s(best_practices.shadow_assessment) or "N/A",
            brand_primary=brand_identification.brand_primary.get("color", "?"),
            brand_confidence=brand_identification.brand_primary.get("confidence", "?"),
            brand_secondary=brand_identification.brand_secondary.get("color", "?"),
            palette_strategy=brand_identification.palette_strategy,
            cohesion_score=brand_identification.cohesion_score,
            aurora_typo=brand_identification.typography_notes or "N/A",
            aurora_spacing=brand_identification.spacing_notes or "N/A",
            aurora_radius=brand_identification.radius_notes or "N/A",
            aurora_shadows=brand_identification.shadow_notes or "N/A",
            closest_benchmark=closest.benchmark.name if closest else "?",
            match_pct=f"{closest.overall_match_pct:.0f}" if closest else "0",
            atlas_typo=_s(benchmark_advice.typography_comparison),
            atlas_spacing=_s(benchmark_advice.spacing_comparison),
            atlas_colors=_s(benchmark_advice.color_comparison),
            atlas_radius=_s(benchmark_advice.radius_comparison),
            atlas_shadows=_s(benchmark_advice.shadow_comparison),
            benchmark_changes="; ".join([c.get("change", "") for c in benchmark_advice.alignment_changes[:4]]),
            best_practices_score=best_practices.overall_score,
            sentinel_color=_s(best_practices.color_assessment),
            sentinel_typo=_s(best_practices.typography_assessment),
            sentinel_spacing=_s(best_practices.spacing_assessment),
            sentinel_radius=_s(best_practices.radius_assessment),
            sentinel_shadows=_s(best_practices.shadow_assessment),
            priority_fixes="; ".join([f.get("issue", "") for f in best_practices.priority_fixes[:5]]),
            accessibility_fixes=aa_fixes_str or "None needed",
        )

        try:
            start = datetime.now()
            response = await self.hf_client.complete_async(
                agent_name="head_synthesizer",
                system_prompt=self.SYSTEM_PROMPT,
                user_message=prompt,
                max_tokens=2500,
                json_mode=True,
            )
            dur = (datetime.now() - start).total_seconds()
            result = self._parse(response)

            log("")
            log(f"   🧠 NEXUS — COMPLETE ({dur:.1f}s)")
            _log_reasoning(result.reasoning_trace, log)
            pa = result.perspective_a.get("scores", {}).get("overall", "?") if result.perspective_a else "?"
            pb = result.perspective_b.get("scores", {}).get("overall", "?") if result.perspective_b else "?"
            log(f"   ├─ Perspective A: {pa}/100")
            log(f"   ├─ Perspective B: {pb}/100")
            log(f"   ├─ Chosen: {result.chosen_perspective}")
            log(f"   ├─ Why: {(result.choice_reasoning or 'N/A')[:80]}")
            log(f"   ├─ Final Score: {result.scores.get('overall', '?')}/100" if result.scores else "   ├─ Scores: N/A")
            log(f"   ├─ Actions: {len(result.top_3_actions)} | Color Recs: {len(result.color_recommendations)}")
            log(f"   ├─ Typography: {_s(result.type_scale_recommendation)}")
            log(f"   ├─ Spacing: {_s(result.spacing_recommendation)}")
            log(f"   ├─ Radius: {_s(result.radius_recommendation)}")
            log(f"   └─ Shadows: {_s(result.shadow_recommendation)}")
            log("")
            return result

        except Exception as e:
            # Same degradation contract as the other agents: log and return
            # an empty synthesis rather than failing the pipeline.
            log(f"   ⚠️ NEXUS failed: {str(e)[:120]}")
            return HeadSynthesis()

    def _parse(self, response: str) -> HeadSynthesis:
        """Parse the LLM's JSON response into a HeadSynthesis; any failure
        returns an empty/default instance."""
        try:
            m = re.search(r'\{[\s\S]*\}', response)
            if m:
                d = json.loads(m.group())
                return HeadSynthesis(
                    executive_summary=d.get("executive_summary", ""),
                    scores=d.get("scores", {}),
                    benchmark_fit=d.get("benchmark_fit", {}),
                    brand_analysis=d.get("brand_analysis", {}),
                    top_3_actions=d.get("top_3_actions", []),
                    color_recommendations=d.get("color_recommendations", []),
                    type_scale_recommendation=d.get("type_scale_recommendation", {}),
                    spacing_recommendation=d.get("spacing_recommendation", {}),
                    radius_recommendation=d.get("radius_recommendation", {}),
                    shadow_recommendation=d.get("shadow_recommendation", {}),
                    self_evaluation=d.get("self_evaluation", {}),
                    perspective_a=d.get("perspective_a", {}),
                    perspective_b=d.get("perspective_b", {}),
                    chosen_perspective=d.get("chosen_perspective", ""),
                    choice_reasoning=d.get("choice_reasoning", ""),
                    reasoning_trace=d.get("reasoning_steps", []),
                )
        except Exception:
            pass
        return HeadSynthesis()


#
============================================================================= +# CRITIC / VALIDATOR FUNCTIONS (Rule-based, no LLM) +# ============================================================================= + +def validate_aurora_output(output: BrandIdentification, input_hexes: list) -> tuple: + """Validate AURORA naming_map. Returns (passed, errors).""" + errors = [] + nm = output.naming_map or {} + + # All input colors must have names + for h in input_hexes: + if h not in nm and h.lower() not in nm: + errors.append(f"Missing name for {h}") + + # No word-based shades + bad_words = {"light", "dark", "base", "muted", "deep", "lighter", "darker"} + for h, name in nm.items(): + for part in name.split("."): + if part.lower() in bad_words: + errors.append(f"Word shade '{part}' in {name}") + + # No duplicates + seen = set() + for n in nm.values(): + if n in seen: + errors.append(f"Duplicate: {n}") + seen.add(n) + + # Convention: color.X.Y + for h, name in nm.items(): + if not name.startswith("color."): + errors.append(f"'{name}' must start with 'color.'") + if len(name.split(".")) < 3: + errors.append(f"'{name}' needs 3+ parts") + + return len(errors) == 0, errors + + +def validate_sentinel_output(output: BestPracticesResult, rule_engine) -> tuple: + """Cross-reference SENTINEL scores against rule engine data.""" + errors = [] + checks = output.checks or {} + accessibility = rule_engine.accessibility + + aa_failures = len([a for a in accessibility if not a.passes_aa_normal]) + aa_check = checks.get("aa_compliance", {}) + if aa_failures > 0 and isinstance(aa_check, dict) and aa_check.get("status") == "pass": + errors.append(f"aa_compliance='pass' but {aa_failures} fail AA") + + score = output.overall_score + if not (0 <= score <= 100): + errors.append(f"Score {score} out of 0-100 range") + + fail_count = sum(1 for c in checks.values() if isinstance(c, dict) and c.get("status") == "fail") + if fail_count >= 3 and score > 70: + errors.append(f"Score {score} too high 
with {fail_count} failures") + + typo = rule_engine.typography + base_size = typo.sizes_px[0] if typo.sizes_px else 16 + base_check = checks.get("base_size_accessible", {}) + if base_size < 16 and isinstance(base_check, dict) and base_check.get("status") == "pass": + errors.append(f"base_size 'pass' but {base_size}px < 16") + + return len(errors) == 0, errors + + +def _apply_sentinel_fixes(result: BestPracticesResult, rule_engine, errors: list) -> BestPracticesResult: + """Deterministic fixes when critic finds issues.""" + accessibility = rule_engine.accessibility + failures = [a for a in accessibility if not a.passes_aa_normal] + + for err in errors: + if "aa_compliance" in err and "pass" in err: + if "aa_compliance" in result.checks: + result.checks["aa_compliance"]["status"] = "fail" + result.checks["aa_compliance"]["note"] = f"CORRECTED: {len(failures)} fail AA" + + if "too high" in err.lower(): + fail_count = sum(1 for c in result.checks.values() if isinstance(c, dict) and c.get("status") == "fail") + max_s = max(30, 100 - fail_count * 15) + if result.overall_score > max_s: + result.overall_score = max_s + + result.overall_score = max(0, min(100, result.overall_score)) + result.validation_passed = True + return result + + +def post_validate_stage2( + aurora: BrandIdentification, + sentinel: BestPracticesResult, + nexus: HeadSynthesis, + rule_engine: Any, +) -> list: + """Final deterministic checks after ALL agents. 
Returns issues list.""" + issues = [] + + for h, name in (aurora.naming_map or {}).items(): + if not re.match(r'^color\.\w+\.[\w]+$', name): + issues.append(f"Bad name: {name}") + + for key, val in (nexus.scores or {}).items(): + if isinstance(val, (int, float)) and not (0 <= val <= 100): + issues.append(f"Score {key}={val} OOB") + + aa_failures = len([a for a in rule_engine.accessibility if not a.passes_aa_normal]) + n_acc = nexus.scores.get("accessibility", 50) if nexus.scores else 50 + if aa_failures > 3 and n_acc > 85: + issues.append(f"Nexus accessibility={n_acc} but {aa_failures} AA failures") + + for rec in (nexus.color_recommendations or []): + for field in ("current", "suggested"): + v = rec.get(field, "") + if v and not v.startswith("#"): + issues.append(f"Color rec {field} missing #: {v}") + + return issues diff --git a/agents/normalizer.py b/agents/normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..f29d4208d51b101eb2d7c7516180126e2a7d9c38 --- /dev/null +++ b/agents/normalizer.py @@ -0,0 +1,949 @@ +""" +Agent 2: Token Normalizer & Structurer +Design System Extractor v3 + +Persona: Design System Librarian + +Responsibilities: +- Clean noisy extraction data +- Deduplicate similar tokens (colors within threshold, similar spacing) +- Assign ALL color names using NUMERIC shades only (50-900) +- Add role_hints based on CSS property/element context (absorbed from semantic_analyzer) +- Normalize radius values (parse, deduplicate, sort, name) +- Normalize shadow values (parse, sort by blur, name) +- Fix typography naming collisions (add weight suffix) +- Tag tokens as: detected | inferred | low-confidence +""" + +import re +from typing import Optional +from collections import defaultdict + +from core.token_schema import ( + ColorToken, + TypographyToken, + SpacingToken, + RadiusToken, + ShadowToken, + ExtractedTokens, + NormalizedTokens, + Confidence, + TokenSource, +) +from core.color_utils import ( + parse_color, + normalize_hex, 
+ categorize_color, +) + + +class TokenNormalizer: + """ + Normalizes and structures extracted tokens. + + This is Agent 2's job — taking raw extraction data and + organizing it into a clean, deduplicated structure. + + v3 changes: + - Color naming: ALWAYS numeric shades (50-900), NEVER words (light/dark/base) + - Role hints: CSS-property-based metadata for AURORA to consume + - Radius: Full normalization (parse, deduplicate, sort, name) + - Shadows: Full normalization (parse, sort by blur, deduplicate, name) + - Typography: Collision-proof naming with weight suffix + """ + + def __init__(self): + # Thresholds for duplicate detection + self.color_similarity_threshold = 10 # Delta in RGB space + self.spacing_merge_threshold = 2 # px difference to merge + + # Radius semantic tiers (px -> name) + self.radius_tiers = [ + (0, "none"), + (2, "sm"), + (4, "md"), + (8, "lg"), + (16, "xl"), + (24, "2xl"), + (9999, "full"), + ] + + # Shadow elevation tiers (by count) + self.shadow_tier_names = ["xs", "sm", "md", "lg", "xl", "2xl"] + + def normalize(self, extracted: ExtractedTokens) -> NormalizedTokens: + """ + Normalize extracted tokens. 
+ + Args: + extracted: Raw extraction results from Agent 1 + + Returns: + NormalizedTokens with cleaned, deduplicated data + """ + # Process each token type (returns lists) + colors_list = self._normalize_colors(extracted.colors) + typography_list = self._normalize_typography(extracted.typography) + spacing_list = self._normalize_spacing(extracted.spacing) + radius_list = self._normalize_radius(extracted.radius) + shadows_list = self._normalize_shadows(extracted.shadows) + + # Convert to dicts keyed by suggested_name + colors_dict = {} + for c in colors_list: + key = c.suggested_name or c.value + # Handle duplicate names by appending a suffix + if key in colors_dict: + suffix = 2 + while f"{key}_{suffix}" in colors_dict: + suffix += 1 + key = f"{key}_{suffix}" + colors_dict[key] = c + + typography_dict = {} + for t in typography_list: + key = t.suggested_name or f"{t.font_family}-{t.font_size}" + if key in typography_dict: + suffix = 2 + while f"{key}_{suffix}" in typography_dict: + suffix += 1 + key = f"{key}_{suffix}" + typography_dict[key] = t + + spacing_dict = {} + for s in spacing_list: + key = s.suggested_name or s.value + if key in spacing_dict: + suffix = 2 + while f"{key}_{suffix}" in spacing_dict: + suffix += 1 + key = f"{key}_{suffix}" + spacing_dict[key] = s + + # Radius and shadows are already properly named + radius_dict = {} + for r in radius_list: + key = r.suggested_name or f"radius-{r.value}" + if key in radius_dict: + suffix = 2 + while f"{key}_{suffix}" in radius_dict: + suffix += 1 + key = f"{key}_{suffix}" + radius_dict[key] = r + + shadows_dict = {} + for s in shadows_list: + key = s.suggested_name or f"shadow-{hash(s.value) % 1000}" + if key in shadows_dict: + suffix = 2 + while f"{key}_{suffix}" in shadows_dict: + suffix += 1 + key = f"{key}_{suffix}" + shadows_dict[key] = s + + # Create normalized result + normalized = NormalizedTokens( + viewport=extracted.viewport, + source_url=extracted.source_url, + colors=colors_dict, + 
typography=typography_dict, + spacing=spacing_dict, + radius=radius_dict, + shadows=shadows_dict, + font_families=extracted.font_families, + detected_spacing_base=extracted.spacing_base, + detected_naming_convention=extracted.naming_convention, + ) + + return normalized + + # ========================================================================= + # COLOR NORMALIZATION + # ========================================================================= + + def _normalize_colors(self, colors: list[ColorToken]) -> list[ColorToken]: + """ + Normalize color tokens: + - Deduplicate similar colors + - Assign role_hints based on CSS context (absorbed from semantic_analyzer) + - Assign suggested names using hue + NUMERIC shade (50-900) + - Calculate confidence + + v3: Removed _infer_color_role() and _generate_color_name_from_value(). + ALL colors now get numeric shades via _generate_preliminary_name(). + Role hints are set for AURORA to consume (not used in naming). + """ + if not colors: + return [] + + # Step 1: Deduplicate by exact hex value + unique_colors = {} + for color in colors: + hex_val = normalize_hex(color.value) + if hex_val in unique_colors: + # Merge frequency and contexts + existing = unique_colors[hex_val] + existing.frequency += color.frequency + existing.contexts = list(set(existing.contexts + color.contexts)) + existing.elements = list(set(existing.elements + color.elements)) + existing.css_properties = list(set(existing.css_properties + color.css_properties)) + else: + color.value = hex_val + unique_colors[hex_val] = color + + # Step 2: Merge visually similar colors + merged_colors = self._merge_similar_colors(list(unique_colors.values())) + + # Step 3: Assign role_hints and preliminary names (ALL numeric) + for color in merged_colors: + # Set role_hint based on CSS property/element context + color.role_hint = self._infer_role_hint(color) + + # Generate name: ALWAYS hue + numeric shade (50-900) + color.suggested_name = 
self._generate_preliminary_name(color) + + # Update confidence based on frequency + color.confidence = self._calculate_confidence(color.frequency) + + # Sort by frequency (most used first) + merged_colors.sort(key=lambda c: -c.frequency) + + return merged_colors + + def _merge_similar_colors(self, colors: list[ColorToken]) -> list[ColorToken]: + """Merge colors that are visually very similar.""" + if len(colors) <= 1: + return colors + + merged = [] + used = set() + + for i, color1 in enumerate(colors): + if i in used: + continue + + # Find similar colors + similar_group = [color1] + for j, color2 in enumerate(colors[i+1:], i+1): + if j in used: + continue + if self._colors_are_similar(color1.value, color2.value): + similar_group.append(color2) + used.add(j) + + # Merge the group - keep the most frequent + similar_group.sort(key=lambda c: -c.frequency) + primary = similar_group[0] + + # Aggregate data from similar colors + for other in similar_group[1:]: + primary.frequency += other.frequency + primary.contexts = list(set(primary.contexts + other.contexts)) + primary.elements = list(set(primary.elements + other.elements)) + primary.css_properties = list(set(primary.css_properties + other.css_properties)) + + merged.append(primary) + used.add(i) + + return merged + + def _colors_are_similar(self, hex1: str, hex2: str) -> bool: + """Check if two colors are visually similar.""" + try: + parsed1 = parse_color(hex1) + parsed2 = parse_color(hex2) + if parsed1 is None or parsed2 is None: + return False + if parsed1.rgb is None or parsed2.rgb is None: + return False + + rgb1 = parsed1.rgb + rgb2 = parsed2.rgb + + # Calculate Euclidean distance in RGB space + distance = sum((a - b) ** 2 for a, b in zip(rgb1, rgb2)) ** 0.5 + return distance < self.color_similarity_threshold + except Exception: + return False + + def _infer_role_hint(self, color: ColorToken) -> Optional[str]: + """ + Infer a role_hint for AURORA based on CSS property and element context. 
+ + This replaces the old _infer_color_role() (which was used for naming) + and absorbs the useful heuristics from semantic_analyzer.py. + + Role hints are metadata for AURORA — they do NOT affect the color name. + """ + css_props = [p.lower() for p in color.css_properties] + elements = [e.lower() for e in color.elements] + contexts = [c.lower() for c in color.contexts] + all_context = " ".join(css_props + elements + contexts) + + # Calculate color properties for additional heuristics + parsed = parse_color(color.value) + if parsed and parsed.rgb: + r, g, b = parsed.rgb + luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255 + max_c = max(r, g, b) + min_c = min(r, g, b) + saturation = (max_c - min_c) / 255 if max_c > 0 else 0 + else: + luminance = 0.5 + saturation = 0 + + # --- BRAND/INTERACTIVE candidate --- + interactive_elements = ["button", "a", "input", "select", "submit", "btn", "cta", "link"] + is_interactive = any(el in all_context for el in interactive_elements) + has_bg_prop = any("background" in p for p in css_props) + + # Interactive elements with background-color + saturated color + if saturation > 0.25 and is_interactive and has_bg_prop: + return "brand_candidate" + # Highly saturated + high frequency + if saturation > 0.35 and color.frequency > 15: + return "brand_candidate" + + # --- TEXT candidate --- + has_color_prop = any( + p == "color" or (p.endswith("-color") and "background" not in p and "border" not in p) + for p in css_props + ) + text_elements = ["p", "span", "h1", "h2", "h3", "h4", "h5", "h6", "label", "text"] + is_text_element = any(el in all_context for el in text_elements) + + if saturation < 0.15 and (has_color_prop or is_text_element): + return "text_candidate" + if saturation < 0.1 and luminance < 0.5 and color.frequency > 30: + return "text_candidate" + + # --- BACKGROUND candidate --- + container_elements = ["div", "section", "main", "body", "article", "header", "footer", "card"] + is_container = any(el in all_context for el in 
container_elements) + + if has_bg_prop and is_container and saturation < 0.15: + return "bg_candidate" + if luminance > 0.9 and saturation < 0.1: + return "bg_candidate" + + # --- BORDER candidate --- + has_border_prop = any("border" in p for p in css_props) + if has_border_prop or "border" in all_context: + return "border_candidate" + + # --- FEEDBACK candidate --- + # Check for error/success/warning keywords in context + feedback_keywords = { + "error": ["error", "danger", "invalid", "negative"], + "success": ["success", "valid", "positive"], + "warning": ["warning", "caution", "alert"], + "info": ["info", "notice"], + } + for fb_type, keywords in feedback_keywords.items(): + if any(kw in all_context for kw in keywords): + return "feedback_candidate" + + # --- Generic palette color (saturated but no clear role) --- + if saturation > 0.2: + return "palette" + + return None + + def _generate_preliminary_name(self, color: ColorToken) -> str: + """ + Generate a preliminary name using hue family + numeric shade. + + This is the SINGLE naming path for ALL colors. + Convention: color.{hue_family}.{shade} + + Shade is ALWAYS numeric (50-900) based on HSL lightness. + NEVER uses words like light/dark/base. + + AURORA may later override these with semantic names (color.brand.primary), + but the normalizer's job is just hue + shade. 
+ """ + category = categorize_color(color.value) + parsed = parse_color(color.value) + + if parsed and parsed.hsl: + h, s, l = parsed.hsl + + # Map lightness to shade number (50-900) + # Uses HSL lightness which is more perceptually accurate than + # the old luminance-based approach + if l >= 95: + shade = "50" + elif l >= 85: + shade = "100" + elif l >= 75: + shade = "200" + elif l >= 65: + shade = "300" + elif l >= 55: + shade = "400" + elif l >= 45: + shade = "500" + elif l >= 35: + shade = "600" + elif l >= 25: + shade = "700" + elif l >= 15: + shade = "800" + else: + shade = "900" + else: + shade = "500" + + return f"color.{category}.{shade}" + + # ========================================================================= + # TYPOGRAPHY NORMALIZATION + # ========================================================================= + + def _normalize_typography(self, typography: list[TypographyToken]) -> list[TypographyToken]: + """ + Normalize typography tokens: + - Deduplicate identical styles + - Infer type scale categories + - Assign suggested names with weight suffix to prevent collisions + """ + if not typography: + return [] + + # Deduplicate by unique style combination + unique_typo = {} + for typo in typography: + key = f"{typo.font_family}|{typo.font_size}|{typo.font_weight}|{typo.line_height}" + if key in unique_typo: + existing = unique_typo[key] + existing.frequency += typo.frequency + existing.elements = list(set(existing.elements + typo.elements)) + else: + unique_typo[key] = typo + + result = list(unique_typo.values()) + + # Infer names based on size, elements, AND weight (v3: collision fix) + for typo in result: + typo.suggested_name = self._generate_typography_name(typo) + typo.confidence = self._calculate_confidence(typo.frequency) + + # Sort by font size (largest first) + result.sort(key=lambda t: -self._parse_font_size(t.font_size)) + + return result + + def _generate_typography_name(self, typo: TypographyToken) -> str: + """ + Generate a 
semantic name for typography. + + v3: Includes font weight in name to prevent collisions. + Two styles at 24px with weight 700 and 400 now produce + font.heading.lg.700 and font.heading.lg.400 instead of both being font.heading.lg. + """ + size_px = self._parse_font_size(typo.font_size) + elements = " ".join(typo.elements).lower() + + # Determine category from elements + if any(h in elements for h in ["h1", "hero", "display"]): + category = "display" + elif any(h in elements for h in ["h2", "h3", "h4", "h5", "h6", "heading", "title"]): + category = "heading" + elif any(h in elements for h in ["label", "caption", "small", "meta"]): + category = "label" + elif any(h in elements for h in ["body", "p", "paragraph", "text"]): + category = "body" + else: + category = "text" + + # Determine size tier + if size_px >= 32: + size_tier = "xl" + elif size_px >= 24: + size_tier = "lg" + elif size_px >= 18: + size_tier = "md" + elif size_px >= 14: + size_tier = "sm" + else: + size_tier = "xs" + + # v3: Include weight to prevent collisions + weight = typo.font_weight + return f"font.{category}.{size_tier}.{weight}" + + def _parse_font_size(self, size: str) -> float: + """Parse font size string to pixels.""" + if not size: + return 16 + + size = size.lower().strip() + + # Handle px + if "px" in size: + try: + return float(size.replace("px", "")) + except ValueError: + return 16 + + # Handle rem (assume 16px base) + if "rem" in size: + try: + return float(size.replace("rem", "")) * 16 + except ValueError: + return 16 + + # Handle em (assume 16px base) + if "em" in size: + try: + return float(size.replace("em", "")) * 16 + except ValueError: + return 16 + + # Try plain number + try: + return float(size) + except ValueError: + return 16 + + # ========================================================================= + # SPACING NORMALIZATION + # ========================================================================= + + def _normalize_spacing(self, spacing: list[SpacingToken]) -> 
list[SpacingToken]: + """ + Normalize spacing tokens: + - Merge similar values + - Align to base-8 grid if close + - Assign suggested names + """ + if not spacing: + return [] + + # Deduplicate by value + unique_spacing = {} + for space in spacing: + key = space.value + if key in unique_spacing: + existing = unique_spacing[key] + existing.frequency += space.frequency + existing.contexts = list(set(existing.contexts + space.contexts)) + else: + unique_spacing[key] = space + + result = list(unique_spacing.values()) + + # Merge very similar values + result = self._merge_similar_spacing(result) + + # Assign names + for space in result: + space.suggested_name = self._generate_spacing_name(space) + space.confidence = self._calculate_confidence(space.frequency) + + # Sort by value + result.sort(key=lambda s: s.value_px) + + return result + + def _merge_similar_spacing(self, spacing: list[SpacingToken]) -> list[SpacingToken]: + """Merge spacing values that are very close.""" + if len(spacing) <= 1: + return spacing + + # Sort by pixel value + spacing.sort(key=lambda s: s.value_px) + + merged = [] + i = 0 + + while i < len(spacing): + current = spacing[i] + group = [current] + + # Find adjacent similar values + j = i + 1 + while j < len(spacing): + if abs(spacing[j].value_px - current.value_px) <= self.spacing_merge_threshold: + group.append(spacing[j]) + j += 1 + else: + break + + # Merge group - prefer base-8 aligned value or most frequent + group.sort(key=lambda s: (-s.fits_base_8, -s.frequency)) + primary = group[0] + + for other in group[1:]: + primary.frequency += other.frequency + primary.contexts = list(set(primary.contexts + other.contexts)) + + merged.append(primary) + i = j + + return merged + + def _generate_spacing_name(self, space: SpacingToken) -> str: + """Generate a semantic name for spacing.""" + px = space.value_px + + # Map to t-shirt sizes based on value + if px <= 2: + size = "px" + elif px <= 4: + size = "0.5" + elif px <= 8: + size = "1" + elif px <= 
12: + size = "1.5" + elif px <= 16: + size = "2" + elif px <= 20: + size = "2.5" + elif px <= 24: + size = "3" + elif px <= 32: + size = "4" + elif px <= 40: + size = "5" + elif px <= 48: + size = "6" + elif px <= 64: + size = "8" + elif px <= 80: + size = "10" + elif px <= 96: + size = "12" + else: + size = str(int(px / 4)) + + return f"space.{size}" + + # ========================================================================= + # RADIUS NORMALIZATION (NEW in v3) + # ========================================================================= + + def _normalize_radius(self, radius_tokens: list[RadiusToken]) -> list[RadiusToken]: + """ + Normalize border radius tokens. + + v3: Full processing instead of just storing raw values. + - Parse multi-value shorthand (take max single value) + - Convert percentage values (50% -> 9999px for "full") + - Convert rem/em to px + - Deduplicate by resolved px value + - Sort by size + - Assign semantic names (none, sm, md, lg, xl, 2xl, full) + """ + if not radius_tokens: + return [] + + # Step 1: Parse each radius to a single px value + parsed_radii = [] + for token in radius_tokens: + px_value = self._parse_radius_value(token.value) + if px_value is not None: + token.value_px = int(px_value) + token.value = f"{int(px_value)}px" + # Set grid alignment flags + token.fits_base_4 = (px_value % 4 == 0) if px_value > 0 else True + token.fits_base_8 = (px_value % 8 == 0) if px_value > 0 else True + parsed_radii.append(token) + + # Step 2: Deduplicate by px value + unique_radii = {} + for token in parsed_radii: + key = token.value_px + if key in unique_radii: + existing = unique_radii[key] + existing.frequency += token.frequency + existing.elements = list(set(existing.elements + token.elements)) + else: + unique_radii[key] = token + + result = list(unique_radii.values()) + + # Step 3: Sort by px value + result.sort(key=lambda r: r.value_px or 0) + + # Step 4: Assign semantic names + for token in result: + token.suggested_name = 
self._generate_radius_name(token) + token.confidence = self._calculate_confidence(token.frequency) + + return result + + def _parse_radius_value(self, value: str) -> Optional[int]: + """ + Parse a CSS border-radius value to a single integer px value. + + Handles: + - Single values: "8px", "0.5rem", "1em" + - Multi-value shorthand: "0px 0px 16px 16px" -> take max (16) + - Percentage: "50%" -> 9999 (treated as "full") + - "none" / "0" -> 0 + """ + if not value: + return None + + value = value.strip().lower() + + # Handle "none" + if value == "none" or value == "0": + return 0 + + # Handle percentage — 50% means fully round, map to 9999 + if "%" in value: + try: + pct = float(value.replace("%", "").strip()) + if pct >= 50: + return 9999 + # For lower percentages, approximate (not exact, but reasonable) + # Most radius percentages in practice are 50% for circles + return int(pct) + except ValueError: + return None + + # Handle multi-value shorthand: "0px 0px 16px 16px" + # Split by spaces and take the max value + parts = value.split() + if len(parts) > 1: + max_px = 0 + for part in parts: + px = self._parse_single_length(part) + if px is not None and px > max_px: + max_px = px + return int(max_px) if max_px > 0 else 0 + + # Single value + px = self._parse_single_length(value) + return int(round(px)) if px is not None else None + + def _parse_single_length(self, value: str) -> Optional[float]: + """Parse a single CSS length value to px.""" + value = value.strip().lower() + + if "px" in value: + try: + return float(value.replace("px", "")) + except ValueError: + return None + + if "rem" in value: + try: + return float(value.replace("rem", "")) * 16 + except ValueError: + return None + + if "em" in value: + try: + return float(value.replace("em", "")) * 16 + except ValueError: + return None + + # Try plain number (treat as px) + try: + return float(value) + except ValueError: + return None + + def _generate_radius_name(self, token: RadiusToken) -> str: + """ + Generate a 
semantic name for a border radius token. + + Maps px values to semantic tiers: + - 0 -> radius.none + - 1-3 -> radius.sm + - 4-7 -> radius.md + - 8-15 -> radius.lg + - 16-23 -> radius.xl + - 24-9998 -> radius.2xl + - 9999 -> radius.full + """ + px = token.value_px or 0 + + if px == 0: + return "radius.none" + elif px >= 9999: + return "radius.full" + elif px <= 3: + return "radius.sm" + elif px <= 7: + return "radius.md" + elif px <= 15: + return "radius.lg" + elif px <= 23: + return "radius.xl" + else: + return "radius.2xl" + + # ========================================================================= + # SHADOW NORMALIZATION (NEW in v3) + # ========================================================================= + + def _normalize_shadows(self, shadow_tokens: list[ShadowToken]) -> list[ShadowToken]: + """ + Normalize box shadow tokens. + + v3: Full processing instead of hash-based keys. + - Parse shadow CSS into components (if not already parsed) + - Compute blur_px and y_offset_px for sorting + - Filter out spread-only shadows (border simulations) + - Separate inset shadows into their own category + - Sort by blur radius (elevation) + - Deduplicate visually similar shadows + - Assign semantic names (xs, sm, md, lg, xl) + """ + if not shadow_tokens: + return [] + + # Step 1: Parse and compute numeric values + parsed_shadows = [] + for token in shadow_tokens: + self._ensure_shadow_parsed(token) + + # Skip spread-only shadows (border simulations) + if (token.blur_px is None or token.blur_px == 0) and token.spread and token.spread != "0px": + continue + + # Skip inset shadows (different semantic — handle separately if needed) + if token.inset: + continue + + # Skip shadows with no meaningful blur + if token.blur_px is not None and token.blur_px <= 0: + continue + + parsed_shadows.append(token) + + if not parsed_shadows: + return [] + + # Step 2: Deduplicate by visual similarity (same blur + y-offset range) + unique_shadows = [] + seen_blur_values = set() + for 
token in parsed_shadows: + blur = token.blur_px or 0 + # Round to nearest 2px for dedup + blur_bucket = round(blur / 2) * 2 + if blur_bucket not in seen_blur_values: + seen_blur_values.add(blur_bucket) + unique_shadows.append(token) + else: + # Merge frequency with existing + for existing in unique_shadows: + existing_blur = round((existing.blur_px or 0) / 2) * 2 + if existing_blur == blur_bucket: + existing.frequency += token.frequency + existing.elements = list(set(existing.elements + token.elements)) + break + + # Step 3: Sort by blur radius (ascending = increasing elevation) + unique_shadows.sort(key=lambda s: s.blur_px or 0) + + # Step 4: Assign semantic names based on sort order + for i, token in enumerate(unique_shadows): + if i < len(self.shadow_tier_names): + tier_name = self.shadow_tier_names[i] + else: + tier_name = f"{i + 1}xl" + token.suggested_name = f"shadow.{tier_name}" + token.confidence = self._calculate_confidence(token.frequency) + + return unique_shadows + + def _ensure_shadow_parsed(self, token: ShadowToken): + """ + Ensure shadow token has parsed components and computed px values. + + If offset_x/offset_y/blur/spread/color are None, attempt to parse + from the raw CSS value string. + """ + # Compute blur_px from blur string + if token.blur is not None and token.blur_px is None: + px = self._parse_single_length(token.blur) + token.blur_px = px if px is not None else 0 + + # Compute y_offset_px from offset_y string + if token.offset_y is not None and token.y_offset_px is None: + px = self._parse_single_length(token.offset_y) + token.y_offset_px = px if px is not None else 0 + + # If components are all None, try to parse from CSS value + if token.blur is None and token.offset_x is None: + self._parse_shadow_css(token) + + def _parse_shadow_css(self, token: ShadowToken): + """ + Parse a CSS box-shadow value into components. 
+ + Format: [inset] [blur] [spread] + Example: "0px 4px 8px 0px rgba(0,0,0,0.1)" + """ + value = token.value.strip() + + # Check for inset + if value.startswith("inset"): + token.inset = True + value = value[5:].strip() + + # Extract color (rgba/rgb/hex at the end or beginning) + color_match = re.search( + r'(rgba?\s*\([^)]+\)|#[0-9a-fA-F]{3,8})\s*$', + value + ) + if color_match: + token.color = color_match.group(1).strip() + value = value[:color_match.start()].strip() + else: + # Try color at the beginning + color_match = re.search( + r'^(rgba?\s*\([^)]+\)|#[0-9a-fA-F]{3,8})\s+', + value + ) + if color_match: + token.color = color_match.group(1).strip() + value = value[color_match.end():].strip() + + # Parse remaining length values + length_pattern = r'(-?\d+(?:\.\d+)?(?:px|rem|em|%)?)' + lengths = re.findall(length_pattern, value) + + if len(lengths) >= 2: + token.offset_x = lengths[0] + token.offset_y = lengths[1] + px = self._parse_single_length(lengths[1]) + token.y_offset_px = px if px is not None else 0 + + if len(lengths) >= 3: + token.blur = lengths[2] + px = self._parse_single_length(lengths[2]) + token.blur_px = px if px is not None else 0 + + if len(lengths) >= 4: + token.spread = lengths[3] + + # Default blur_px to 0 if still None + if token.blur_px is None: + token.blur_px = 0 + if token.y_offset_px is None: + token.y_offset_px = 0 + + # ========================================================================= + # SHARED UTILITIES + # ========================================================================= + + def _calculate_confidence(self, frequency: int) -> Confidence: + """Calculate confidence based on frequency.""" + if frequency >= 10: + return Confidence.HIGH + elif frequency >= 3: + return Confidence.MEDIUM + else: + return Confidence.LOW + + +def normalize_tokens(extracted: ExtractedTokens) -> NormalizedTokens: + """Convenience function to normalize tokens.""" + normalizer = TokenNormalizer() + return normalizer.normalize(extracted) diff 
"""
Agent 1C: Semantic Color Analyzer
Design System Extractor v2

Persona: Design System Semanticist

Responsibilities:
- Analyze colors based on their actual CSS usage
- Categorize into semantic roles (brand, text, background, border, feedback)
- Use LLM to understand color relationships and hierarchy
- Provide structured output for Stage 1 UI and Stage 2 analysis
"""

import json
import re
from typing import Optional, Callable
from datetime import datetime

from core.color_utils import (
    parse_color,
    get_contrast_with_white,
    get_contrast_with_black,
    check_wcag_compliance,
)


class SemanticColorAnalyzer:
    """
    Analyzes extracted colors and categorizes them by semantic role.

    Uses LLM to understand:
    - Which colors are brand/primary colors (used on buttons, CTAs)
    - Which colors are for text (used with 'color' property)
    - Which colors are backgrounds (used with 'background-color')
    - Which colors are borders (used with 'border-color')
    - Which colors are feedback states (error, success, warning)

    Falls back to a deterministic rule-based analysis when no LLM provider
    is configured or the LLM response cannot be parsed.
    """

    def __init__(self, llm_provider=None):
        """
        Initialize the semantic analyzer.

        Args:
            llm_provider: Optional LLM provider for AI analysis.
                          If None, uses rule-based fallback.
        """
        self.llm_provider = llm_provider
        self.analysis_result = {}  # Last analysis output (see analyze_* methods)
        self.logs = []             # Accumulated timestamped log lines

    def log(self, message: str):
        """Add timestamped log message."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        self.logs.append(f"[{timestamp}] {message}")

    def get_logs(self) -> str:
        """Get all logs as a newline-joined string."""
        return "\n".join(self.logs)

    def _extract_token_fields(self, token) -> tuple:
        """
        Pull (hex, frequency, contexts, elements, css_properties) out of a
        color token that may be either an object or a plain dict.
        """
        if hasattr(token, 'value'):
            hex_val = token.value
            frequency = token.frequency
            contexts = token.contexts if hasattr(token, 'contexts') else []
            elements = token.elements if hasattr(token, 'elements') else []
            css_props = token.css_properties if hasattr(token, 'css_properties') else []
        else:
            hex_val = token.get('value', '#000000')
            frequency = token.get('frequency', 0)
            contexts = token.get('contexts', [])
            elements = token.get('elements', [])
            css_props = token.get('css_properties', [])
        return hex_val, frequency, contexts, elements, css_props

    def _prepare_color_data_for_llm(self, colors: dict) -> str:
        """
        Prepare color data in a format optimized for LLM analysis.

        Args:
            colors: Dict of color tokens with metadata

        Returns:
            Formatted JSON string for the LLM prompt (top 50 colors by
            frequency, to stay within token limits).
        """
        color_entries = []

        for name, token in colors.items():
            hex_val, frequency, contexts, elements, css_props = \
                self._extract_token_fields(token)

            # Contrast against white/black for accessibility hints.
            contrast_white = get_contrast_with_white(hex_val)
            contrast_black = get_contrast_with_black(hex_val)

            # Approximate luminance/saturation from the hex channels.
            # FIX: narrowed from a bare `except:` so control-flow exceptions
            # (KeyboardInterrupt etc.) are no longer swallowed.
            try:
                r = int(hex_val[1:3], 16)
                g = int(hex_val[3:5], 16)
                b = int(hex_val[5:7], 16)
                luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255

                max_c = max(r, g, b)
                min_c = min(r, g, b)
                saturation = (max_c - min_c) / 255 if max_c > 0 else 0
            except (TypeError, ValueError):
                # Unparseable hex (e.g. shorthand "#fff") — neutral defaults.
                luminance = 0.5
                saturation = 0

            entry = {
                "hex": hex_val,
                "name": name,
                "frequency": frequency,
                "css_properties": css_props[:5],  # Limit for prompt size
                "elements": elements[:5],
                "contexts": contexts[:3],
                "luminance": round(luminance, 2),
                "saturation": round(saturation, 2),
                "contrast_on_white": round(contrast_white, 2),
                "contrast_on_black": round(contrast_black, 2),
                "aa_compliant_on_white": contrast_white >= 4.5,
            }
            color_entries.append(entry)

        # Sort by frequency so the LLM sees the most important colors first.
        color_entries.sort(key=lambda x: -x['frequency'])

        # Limit to top 50 colors (avoid token limits).
        return json.dumps(color_entries[:50], indent=2)

    def _build_llm_prompt(self, color_data: str) -> str:
        """Build the prompt for LLM semantic analysis."""

        return f"""You are a Design System Analyst specializing in color semantics.

TASK: Analyze these extracted colors and categorize them by their semantic role in the UI.

EXTRACTED COLORS (sorted by frequency):
{color_data}

ANALYSIS RULES:
1. BRAND/PRIMARY colors are typically:
   - Used on buttons, links, CTAs (elements: button, a, input[type=submit])
   - Applied via background-color on interactive elements
   - Saturated (saturation > 0.3) and not gray
   - High frequency on interactive elements

2. TEXT colors are typically:
   - Applied via "color" CSS property (not background-color)
   - Used on text elements (p, span, h1-h6, label)
   - Form a hierarchy: primary (darkest), secondary (medium), muted (lightest)
   - Low saturation (grays)

3. BACKGROUND colors are typically:
   - Applied via "background-color" on containers
   - Used on div, section, main, body, card elements
   - Light colors (luminance > 0.8) for light themes
   - May include dark backgrounds for inverse sections

4. BORDER colors are typically:
   - Applied via border-color properties
   - Often gray/neutral
   - Lower frequency than text/background

5. FEEDBACK colors are:
   - Red variants = error
   - Green variants = success
   - Yellow/orange = warning
   - Blue variants = info
   - Often used with specific class contexts

OUTPUT FORMAT (JSON):
{{
  "brand": {{
    "primary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "secondary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "accent": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}}
  }},
  "text": {{
    "primary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "secondary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "muted": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "inverse": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}}
  }},
  "background": {{
    "primary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "secondary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "tertiary": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "inverse": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}}
  }},
  "border": {{
    "default": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "strong": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}}
  }},
  "feedback": {{
    "error": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "success": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "warning": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}},
    "info": {{"hex": "#xxx", "confidence": "high|medium|low", "reason": "..."}}
  }},
  "summary": {{
    "total_colors_analyzed": 50,
    "brand_colors_found": 2,
    "has_clear_hierarchy": true,
    "accessibility_notes": "..."
  }}
}}

IMPORTANT:
- Only include roles where you found a matching color
- Set confidence based on how certain you are
- Provide brief reasoning for each categorization
- If no color fits a role, omit that key

Return ONLY valid JSON, no other text."""

    def _rule_based_analysis(self, colors: dict) -> dict:
        """
        Fallback rule-based analysis when LLM is not available.

        Uses heuristics based on:
        - CSS properties (color vs background-color vs border-color)
        - Element types (button, a, p, div, etc.)
        - Color characteristics (saturation, luminance)
        - Frequency
        """
        self.log(" Using rule-based analysis (no LLM)")

        result = {
            "brand": {},
            "text": {},
            "background": {},
            "border": {},
            "feedback": {},
            "summary": {
                "total_colors_analyzed": len(colors),
                "brand_colors_found": 0,
                "has_clear_hierarchy": False,
                "accessibility_notes": "",
                "method": "rule-based"
            }
        }

        # Candidate pools per semantic role.
        brand_candidates = []
        text_candidates = []
        background_candidates = []
        border_candidates = []
        feedback_candidates = {"error": [], "success": [], "warning": [], "info": []}

        for name, token in colors.items():
            hex_val, frequency, contexts, elements, css_props = \
                self._extract_token_fields(token)

            # Luminance, saturation and hue from the hex channels.
            # FIX: narrowed from a bare `except:`.
            try:
                r = int(hex_val[1:3], 16)
                g = int(hex_val[3:5], 16)
                b = int(hex_val[5:7], 16)
                luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255
                max_c = max(r, g, b)
                min_c = min(r, g, b)
                saturation = (max_c - min_c) / 255 if max_c > 0 else 0

                # Hue (degrees) — needed for feedback-color detection.
                if max_c == min_c:
                    hue = 0
                elif max_c == r:
                    hue = 60 * ((g - b) / (max_c - min_c) % 6)
                elif max_c == g:
                    hue = 60 * ((b - r) / (max_c - min_c) + 2)
                else:
                    hue = 60 * ((r - g) / (max_c - min_c) + 4)
            except (TypeError, ValueError):
                luminance = 0.5
                saturation = 0
                hue = 0

            color_info = {
                "hex": hex_val,
                "name": name,
                "frequency": frequency,
                "luminance": luminance,
                "saturation": saturation,
                "hue": hue,
                "css_props": css_props,
                "elements": elements,
                "contexts": contexts,
            }

            # --- CATEGORIZATION RULES ---

            # BRAND: Saturated colors - multiple detection methods
            interactive_elements = ['button', 'a', 'input', 'select', 'submit', 'btn', 'cta']
            is_interactive = any(el in str(elements).lower() for el in interactive_elements)
            has_bg_prop = any('background' in str(p).lower() for p in css_props)

            # Method 1: Interactive elements with background
            if saturation > 0.25 and is_interactive and has_bg_prop:
                brand_candidates.append(color_info)
            # Method 2: Highly saturated + high frequency (works for Firecrawl)
            elif saturation > 0.35 and frequency > 15:
                brand_candidates.append(color_info)
            # Method 3: Very saturated colors regardless of frequency
            elif saturation > 0.5 and frequency > 5:
                brand_candidates.append(color_info)
            # Method 4: Cyan/Teal range (common brand colors)
            elif 160 <= hue <= 200 and saturation > 0.4 and frequency > 10:
                brand_candidates.append(color_info)
            # Method 5: Lime/Green-Yellow (secondary brand colors)
            elif 60 <= hue <= 90 and saturation > 0.5 and frequency > 5:
                brand_candidates.append(color_info)

            # TEXT: Low saturation, used with 'color' property
            has_color_prop = any(
                p == 'color' or (p.endswith('-color') and 'background' not in p and 'border' not in p)
                for p in css_props
            )
            text_elements = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', 'div', 'text']
            is_text_element = any(el in str(elements).lower() for el in text_elements)

            # BUG FIX: the original ran two copies of this chain, appending the
            # same color to text_candidates twice (which skewed the secondary/
            # muted picks and the hierarchy count). All conditions are now
            # folded into a single chain so a color is added at most once.
            if saturation < 0.15 and (has_color_prop or 'text' in str(contexts).lower()):
                text_candidates.append(color_info)
            elif saturation < 0.1 and 0.1 < luminance < 0.8:  # Gray range
                text_candidates.append(color_info)
            elif saturation < 0.1 and luminance < 0.5 and frequency > 50:  # Dark grays used a lot
                text_candidates.append(color_info)
            elif saturation < 0.1 and luminance < 0.7 and is_text_element:
                text_candidates.append(color_info)

            # BACKGROUND: Used with background-color on containers
            container_elements = ['div', 'section', 'main', 'body', 'article', 'header', 'footer', 'card']
            is_container = any(el in str(elements).lower() for el in container_elements)

            if has_bg_prop and (is_container or 'background' in str(contexts).lower()):
                if saturation < 0.15:  # Mostly neutral backgrounds
                    background_candidates.append(color_info)

            # BORDER: Used with border-color properties
            has_border_prop = any('border' in str(p).lower() for p in css_props)

            if has_border_prop or 'border' in str(contexts).lower():
                border_candidates.append(color_info)

            # FEEDBACK: Based on hue
            if saturation > 0.3:
                if 0 <= hue <= 30 or 330 <= hue <= 360:  # Red
                    feedback_candidates["error"].append(color_info)
                elif 90 <= hue <= 150:  # Green
                    feedback_candidates["success"].append(color_info)
                elif 30 <= hue <= 60:  # Yellow/Orange
                    feedback_candidates["warning"].append(color_info)
                elif 180 <= hue <= 250:  # Blue
                    feedback_candidates["info"].append(color_info)

        # --- SELECT BEST CANDIDATES ---

        # Brand: Sort by frequency * saturation
        brand_candidates.sort(key=lambda x: -(x['frequency'] * x['saturation']))
        if brand_candidates:
            result["brand"]["primary"] = {
                "hex": brand_candidates[0]["hex"],
                "confidence": "high" if brand_candidates[0]["frequency"] > 20 else "medium",
                "reason": f"Most frequent saturated color on interactive elements (freq: {brand_candidates[0]['frequency']})"
            }
            result["summary"]["brand_colors_found"] += 1
            if len(brand_candidates) > 1:
                result["brand"]["secondary"] = {
                    "hex": brand_candidates[1]["hex"],
                    "confidence": "medium",
                    "reason": f"Second most frequent brand color (freq: {brand_candidates[1]['frequency']})"
                }
                result["summary"]["brand_colors_found"] += 1

        # Text: Sort by luminance (darkest first for primary)
        text_candidates.sort(key=lambda x: x['luminance'])
        if text_candidates:
            result["text"]["primary"] = {
                "hex": text_candidates[0]["hex"],
                "confidence": "high" if text_candidates[0]["luminance"] < 0.3 else "medium",
                "reason": f"Darkest text color (luminance: {text_candidates[0]['luminance']:.2f})"
            }
            if len(text_candidates) > 1:
                # Find secondary (mid-luminance)
                mid_idx = len(text_candidates) // 2
                result["text"]["secondary"] = {
                    "hex": text_candidates[mid_idx]["hex"],
                    "confidence": "medium",
                    "reason": f"Mid-tone text color (luminance: {text_candidates[mid_idx]['luminance']:.2f})"
                }
            if len(text_candidates) > 2:
                result["text"]["muted"] = {
                    "hex": text_candidates[-1]["hex"],
                    "confidence": "medium",
                    "reason": f"Lightest text color (luminance: {text_candidates[-1]['luminance']:.2f})"
                }

            # Check for text hierarchy
            if len(text_candidates) >= 3:
                result["summary"]["has_clear_hierarchy"] = True

        # Background: Sort by luminance (lightest first for primary)
        background_candidates.sort(key=lambda x: -x['luminance'])
        if background_candidates:
            result["background"]["primary"] = {
                "hex": background_candidates[0]["hex"],
                "confidence": "high" if background_candidates[0]["luminance"] > 0.9 else "medium",
                "reason": f"Lightest background (luminance: {background_candidates[0]['luminance']:.2f})"
            }
            if len(background_candidates) > 1:
                result["background"]["secondary"] = {
                    "hex": background_candidates[1]["hex"],
                    "confidence": "medium",
                    "reason": f"Secondary background (luminance: {background_candidates[1]['luminance']:.2f})"
                }
            # Find dark background for inverse
            dark_bgs = [c for c in background_candidates if c['luminance'] < 0.2]
            if dark_bgs:
                result["background"]["inverse"] = {
                    "hex": dark_bgs[0]["hex"],
                    "confidence": "medium",
                    "reason": f"Dark background for inverse sections (luminance: {dark_bgs[0]['luminance']:.2f})"
                }

        # Border: Sort by frequency
        border_candidates.sort(key=lambda x: -x['frequency'])
        if border_candidates:
            result["border"]["default"] = {
                "hex": border_candidates[0]["hex"],
                "confidence": "medium",
                "reason": f"Most common border color (freq: {border_candidates[0]['frequency']})"
            }

        # Feedback: Pick highest frequency for each
        for feedback_type, candidates in feedback_candidates.items():
            if candidates:
                candidates.sort(key=lambda x: -x['frequency'])
                result["feedback"][feedback_type] = {
                    "hex": candidates[0]["hex"],
                    "confidence": "medium",
                    "reason": f"Detected {feedback_type} color by hue analysis"
                }

        return result

    async def analyze_with_llm(self, colors: dict, log_callback: Optional[Callable] = None) -> dict:
        """
        Analyze colors using LLM for semantic categorization.

        Args:
            colors: Dict of color tokens
            log_callback: Optional callback for logging

        Returns:
            Semantic analysis result (falls back to rule-based output when
            the provider is missing, errors out, or returns unparseable JSON).
        """
        def log(msg):
            self.log(msg)
            if log_callback:
                log_callback(msg)

        log("")
        log("=" * 60)
        log("🧠 SEMANTIC COLOR ANALYSIS (LLM)")
        log("=" * 60)
        log("")

        # Prepare data for LLM
        log(" 📊 Preparing color data for analysis...")
        color_data = self._prepare_color_data_for_llm(colors)
        log(f" ✅ Prepared {min(50, len(colors))} colors for analysis")

        # Check if LLM provider is available
        if self.llm_provider is None:
            log(" ⚠️ No LLM provider configured, using rule-based analysis")
            self.analysis_result = self._rule_based_analysis(colors)
        else:
            try:
                log(" 🤖 Calling LLM for semantic analysis...")

                prompt = self._build_llm_prompt(color_data)

                response = await self.llm_provider.generate(
                    prompt=prompt,
                    max_tokens=2000,
                    temperature=0.3,  # Low temperature for consistent categorization
                )

                log(" ✅ LLM response received")

                # Parse JSON response
                try:
                    # Extract the first {...} span — models often wrap JSON in prose.
                    json_match = re.search(r'\{[\s\S]*\}', response)
                    if json_match:
                        self.analysis_result = json.loads(json_match.group())
                        # BUG FIX: the LLM may omit "summary" entirely; create
                        # it instead of raising KeyError.
                        self.analysis_result.setdefault("summary", {})["method"] = "llm"
                        log(" ✅ Successfully parsed LLM analysis")
                    else:
                        raise ValueError("No JSON found in response")

                # BUG FIX: json.JSONDecodeError is a ValueError subclass, but
                # catching only JSONDecodeError let the hand-raised
                # "No JSON found" ValueError escape to the generic handler.
                # Catching ValueError routes both through the intended
                # parse-failure fallback.
                except ValueError as e:
                    log(f" ⚠️ Failed to parse LLM response: {e}")
                    log(" 🔄 Falling back to rule-based analysis")
                    self.analysis_result = self._rule_based_analysis(colors)

            except Exception as e:
                log(f" ❌ LLM analysis failed: {str(e)}")
                log(" 🔄 Falling back to rule-based analysis")
                self.analysis_result = self._rule_based_analysis(colors)

        # Log results
        self._log_analysis_results(log)

        return self.analysis_result

    def analyze_sync(self, colors: dict, log_callback: Optional[Callable] = None) -> dict:
        """
        Synchronous analysis using rule-based approach.

        Args:
            colors: Dict of color tokens
            log_callback: Optional callback for logging

        Returns:
            Semantic analysis result
        """
        def log(msg):
            self.log(msg)
            if log_callback:
                log_callback(msg)

        log("")
        log("=" * 60)
        log("🧠 SEMANTIC COLOR ANALYSIS")
        log("=" * 60)
        log("")

        log(f" 📊 Analyzing {len(colors)} colors...")

        self.analysis_result = self._rule_based_analysis(colors)

        # Log results
        self._log_analysis_results(log)

        return self.analysis_result

    def _log_analysis_results(self, log: Callable):
        """Log the analysis results in a formatted way."""

        result = self.analysis_result

        log("")
        log("📊 SEMANTIC ANALYSIS RESULTS:")
        log("")

        # Brand colors (with reasoning)
        if result.get("brand"):
            log(" 🎨 BRAND COLORS:")
            for role, data in result["brand"].items():
                if data:
                    log(f" {role}: {data['hex']} ({data['confidence']})")
                    # FIX: LLM output may omit "reason"; don't KeyError here.
                    log(f" └─ {data.get('reason', '')}")

        # Text colors
        if result.get("text"):
            log("")
            log(" 📝 TEXT COLORS:")
            for role, data in result["text"].items():
                if data:
                    log(f" {role}: {data['hex']} ({data['confidence']})")

        # Background colors
        if result.get("background"):
            log("")
            log(" 🖼️ BACKGROUND COLORS:")
            for role, data in result["background"].items():
                if data:
                    log(f" {role}: {data['hex']} ({data['confidence']})")

        # Border colors
        if result.get("border"):
            log("")
            log(" 📏 BORDER COLORS:")
            for role, data in result["border"].items():
                if data:
                    log(f" {role}: {data['hex']} ({data['confidence']})")

        # Feedback colors
        if result.get("feedback"):
            log("")
            log(" 🚨 FEEDBACK COLORS:")
            for role, data in result["feedback"].items():
                if data:
                    log(f" {role}: {data['hex']} ({data['confidence']})")

        # Summary
        summary = result.get("summary", {})
        log("")
        log(" 📈 SUMMARY:")
        log(f" Total colors analyzed: {summary.get('total_colors_analyzed', 0)}")
        log(f" Brand colors found: {summary.get('brand_colors_found', 0)}")
        log(f" Clear hierarchy: {'Yes' if summary.get('has_clear_hierarchy') else 'No'}")
        log(f" Analysis method: {summary.get('method', 'unknown')}")
        log("")
def generate_semantic_preview_html(analysis_result: dict) -> str:
    """
    Generate HTML preview showing colors organized by semantic role.

    Args:
        analysis_result: Output from SemanticColorAnalyzer — a dict with
            optional "brand"/"text"/"background"/"border"/"feedback" role
            maps and a "summary" dict.

    Returns:
        HTML string for Gradio HTML component.

    NOTE(review): the original inline CSS was not recoverable from this
    view; the styles below are a clean reconstruction of the same layout.
    FIX: `color_card` previously accepted a `reason` argument but never
    rendered it — it is now exposed as a hover tooltip.
    """
    warning_box = (
        '<div style="padding:16px;border:1px solid #f0c36d;'
        'background:#fff8e1;border-radius:8px;color:#5f4b12;">{msg}</div>'
    )

    # Handle empty or invalid result.
    if not analysis_result:
        return warning_box.format(
            msg="⚠️ Semantic analysis did not produce results. Check the logs for errors."
        )

    def color_card(hex_val: str, role: str, confidence: str, reason: str = "") -> str:
        """Render a single swatch card; `reason` becomes the tooltip."""
        # Pick a readable label color from the swatch's luminance.
        try:
            r = int(hex_val[1:3], 16)
            g = int(hex_val[3:5], 16)
            b = int(hex_val[5:7], 16)
            luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255
            text_color = "#1a1a1a" if luminance > 0.5 else "#ffffff"
        except (TypeError, ValueError):
            text_color = "#1a1a1a"

        badge_style = (
            "font-size:11px;padding:2px 6px;border-radius:10px;"
            "background:rgba(0,0,0,0.08);"
        )
        confidence_badge = {
            "high": f'<span style="{badge_style}">High</span>',
            "medium": f'<span style="{badge_style}">Medium</span>',
            "low": f'<span style="{badge_style}">Low</span>',
        }.get(confidence, "")

        # Escape quotes so the reason is safe inside the title attribute.
        tooltip = reason.replace('"', "&quot;") if reason else ""

        return (
            f'<div title="{tooltip}" style="display:inline-block;width:140px;'
            f'margin:6px;border:1px solid #e0e0e0;border-radius:8px;overflow:hidden;">'
            f'<div style="background:{hex_val};color:{text_color};padding:18px 8px;'
            f'text-align:center;font-family:monospace;">{hex_val}</div>'
            f'<div style="padding:6px 8px;font-size:12px;">'
            f'{role.replace("_", " ").title()}<br>{confidence_badge}</div>'
            f'</div>'
        )

    def category_section(title: str, icon: str, colors: dict) -> str:
        """Render one category block, or "" when it has no usable colors."""
        if not colors:
            return ""

        cards_html = "".join(
            color_card(
                data["hex"],
                role,
                data.get("confidence", "medium"),
                data.get("reason", ""),
            )
            for role, data in colors.items()
            if data and isinstance(data, dict) and "hex" in data
        )
        if not cards_html:
            return ""

        return (
            f'<div style="margin-bottom:20px;">'
            f'<div style="font-weight:600;margin-bottom:6px;">{icon} {title}</div>'
            f'<div style="display:flex;flex-wrap:wrap;">{cards_html}</div>'
            f'</div>'
        )

    # Build sections in a fixed, predictable order.
    sections_html = "".join([
        category_section("Brand Colors", "🎨", analysis_result.get("brand", {})),
        category_section("Text Colors", "📝", analysis_result.get("text", {})),
        category_section("Background Colors", "🖼️", analysis_result.get("background", {})),
        category_section("Border Colors", "📏", analysis_result.get("border", {})),
        category_section("Feedback Colors", "🚨", analysis_result.get("feedback", {})),
    ])

    # Check if any sections were created.
    if not sections_html.strip():
        return warning_box.format(
            msg="⚠️ No semantic color categories were detected. The colors may not "
                "have enough context data (elements, CSS properties) for classification."
        )

    summary = analysis_result.get("summary", {})

    def stat(value, label: str) -> str:
        """Render a single stat tile for the summary strip."""
        return (
            f'<div style="text-align:center;padding:8px 16px;">'
            f'<div style="font-size:20px;font-weight:700;">{value}</div>'
            f'<div style="font-size:12px;color:#666;">{label}</div>'
            f'</div>'
        )

    summary_html = (
        '<div style="margin-top:16px;padding:12px;background:#f7f7f8;border-radius:8px;">'
        '<div style="font-weight:600;margin-bottom:6px;">📈 Analysis Summary</div>'
        '<div style="display:flex;flex-wrap:wrap;">'
        + stat(summary.get("total_colors_analyzed", 0), "Colors Analyzed")
        + stat(summary.get("brand_colors_found", 0), "Brand Colors")
        + stat("✓" if summary.get("has_clear_hierarchy") else "✗", "Clear Hierarchy")
        + stat(summary.get("method", "rule-based").upper(), "Analysis Method")
        + "</div></div>"
    )

    html = (
        '<div style="font-family:system-ui,sans-serif;">'
        + sections_html
        + summary_html
        + "</div>"
    )

    return html
"""
LangGraph State Definitions
Design System Extractor v2

Defines the state schema and type hints for LangGraph workflow.
"""

from typing import TypedDict, Annotated, Sequence, Optional
from datetime import datetime
from langgraph.graph.message import add_messages

from core.token_schema import (
    DiscoveredPage,
    ExtractedTokens,
    NormalizedTokens,
    UpgradeRecommendations,
    FinalTokens,
    Viewport,
)


# =============================================================================
# STATE ANNOTATIONS
# =============================================================================

def merge_lists(left: list, right: list) -> list:
    """Concatenate two lists, keeping only the first occurrence of each item.

    Items exposing a ``url`` attribute are deduplicated by URL; everything
    else by its string representation.
    """
    merged = []
    seen = set()
    for entry in (*left, *right):
        dedup_key = entry.url if hasattr(entry, 'url') else str(entry)
        if dedup_key in seen:
            continue
        seen.add(dedup_key)
        merged.append(entry)
    return merged


def replace_value(left, right):
    """Prefer *right* unless it is None, in which case keep *left*."""
    return left if right is None else right


# =============================================================================
# MAIN WORKFLOW STATE
# =============================================================================

class AgentState(TypedDict):
    """
    Main state for the LangGraph workflow.

    Passed between all agents; accumulates data as the workflow moves
    through its stages.
    """

    # --- INPUT ---
    base_url: str  # The website URL to extract from

    # --- DISCOVERY STAGE (Agent 1 - Part 1) ---
    discovered_pages: Annotated[list[DiscoveredPage], merge_lists]
    pages_to_crawl: list[str]  # User-confirmed pages

    # --- EXTRACTION STAGE (Agent 1 - Part 2) ---
    desktop_extraction: Optional[ExtractedTokens]
    desktop_crawl_progress: float  # 0.0 to 1.0
    mobile_extraction: Optional[ExtractedTokens]
    mobile_crawl_progress: float  # 0.0 to 1.0

    # --- NORMALIZATION STAGE (Agent 2) ---
    desktop_normalized: Optional[NormalizedTokens]
    mobile_normalized: Optional[NormalizedTokens]

    # User decisions from Stage 1 review
    accepted_colors: list[str]  # List of accepted color values
    rejected_colors: list[str]  # List of rejected color values
    accepted_typography: list[str]
    rejected_typography: list[str]
    accepted_spacing: list[str]
    rejected_spacing: list[str]

    # --- ADVISOR STAGE (Agent 3) ---
    upgrade_recommendations: Optional[UpgradeRecommendations]

    # User selections from Stage 2 playground
    selected_type_scale: Optional[str]  # ID of selected scale
    selected_spacing_system: Optional[str]
    selected_naming_convention: Optional[str]
    selected_color_ramps: dict[str, bool]  # {"primary": True, "secondary": False}
    selected_a11y_fixes: list[str]  # IDs of accepted fixes

    # --- GENERATION STAGE (Agent 4) ---
    desktop_final: Optional[FinalTokens]
    mobile_final: Optional[FinalTokens]
    version_label: str  # e.g., "v1-recovered", "v2-upgraded"

    # --- WORKFLOW METADATA ---
    # "discover", "extract", "normalize", "advise", "generate", "export"
    current_stage: str

    # Human checkpoints
    awaiting_human_input: bool
    # "confirm_pages", "review_tokens", "select_upgrades", "approve_export"
    checkpoint_name: Optional[str]

    # Errors and warnings (accumulated across agents)
    errors: Annotated[list[str], merge_lists]
    warnings: Annotated[list[str], merge_lists]

    # Messages for LLM agents (if using chat-based agents)
    messages: Annotated[Sequence[dict], add_messages]

    # Timing
    started_at: Optional[datetime]
    stage_started_at: Optional[datetime]


# =============================================================================
# STAGE-SPECIFIC STATES (for parallel execution)
# =============================================================================

class DiscoveryState(TypedDict):
    """State for page discovery sub-graph."""
    base_url: str
    discovered_pages: list[DiscoveredPage]
    discovery_complete: bool
    error: Optional[str]


class ExtractionState(TypedDict):
    """State for extraction sub-graph (per viewport)."""
    viewport: Viewport
    pages_to_crawl: list[str]
    extraction_result: Optional[ExtractedTokens]
    progress: float
    current_page: Optional[str]
    error: Optional[str]


class NormalizationState(TypedDict):
    """State for normalization sub-graph."""
    raw_tokens: ExtractedTokens
    normalized_tokens: Optional[NormalizedTokens]
    duplicates_found: list[tuple[str, str]]
    error: Optional[str]


class AdvisorState(TypedDict):
    """State for advisor sub-graph."""
    normalized_desktop: NormalizedTokens
    normalized_mobile: Optional[NormalizedTokens]
    recommendations: Optional[UpgradeRecommendations]
    error: Optional[str]


class GenerationState(TypedDict):
    """State for generation sub-graph."""
    normalized_tokens: NormalizedTokens
    selected_upgrades: dict[str, str]
    final_tokens: Optional[FinalTokens]
    error: Optional[str]


# =============================================================================
# CHECKPOINT STATES (Human-in-the-loop)
# =============================================================================

class PageConfirmationState(TypedDict):
    """State for page confirmation checkpoint."""
    discovered_pages: list[DiscoveredPage]
    confirmed_pages: list[str]
    user_confirmed: bool


class TokenReviewState(TypedDict):
    """State for token review checkpoint (Stage 1 UI)."""
    desktop_tokens: NormalizedTokens
    mobile_tokens: Optional[NormalizedTokens]

    # User decisions, keyed by token value -> accepted?
    color_decisions: dict[str, bool]
    typography_decisions: dict[str, bool]
    spacing_decisions: dict[str, bool]

    user_confirmed: bool


class UpgradeSelectionState(TypedDict):
    """State for upgrade selection checkpoint (Stage 2 UI)."""
    recommendations: UpgradeRecommendations
    current_tokens: NormalizedTokens

    # User selections: {category: option_id}
    selected_options: dict[str, str]

    user_confirmed: bool


class ExportApprovalState(TypedDict):
    """State for export approval checkpoint (Stage 3 UI)."""
    desktop_final: FinalTokens
    mobile_final: Optional[FinalTokens]

    version_label: str
    user_confirmed: bool


# =============================================================================
# STATE FACTORY FUNCTIONS
#
============================================================================= + +def create_initial_state(base_url: str) -> AgentState: + """Create initial state for a new workflow.""" + return { + # Input + "base_url": base_url, + + # Discovery + "discovered_pages": [], + "pages_to_crawl": [], + + # Extraction + "desktop_extraction": None, + "desktop_crawl_progress": 0.0, + "mobile_extraction": None, + "mobile_crawl_progress": 0.0, + + # Normalization + "desktop_normalized": None, + "mobile_normalized": None, + "accepted_colors": [], + "rejected_colors": [], + "accepted_typography": [], + "rejected_typography": [], + "accepted_spacing": [], + "rejected_spacing": [], + + # Advisor + "upgrade_recommendations": None, + "selected_type_scale": None, + "selected_spacing_system": None, + "selected_naming_convention": None, + "selected_color_ramps": {}, + "selected_a11y_fixes": [], + + # Generation + "desktop_final": None, + "mobile_final": None, + "version_label": "v1-recovered", + + # Workflow + "current_stage": "discover", + "awaiting_human_input": False, + "checkpoint_name": None, + "errors": [], + "warnings": [], + "messages": [], + + # Timing + "started_at": datetime.now(), + "stage_started_at": datetime.now(), + } + + +def get_stage_progress(state: AgentState) -> dict: + """Get progress information for the current workflow.""" + stages = ["discover", "extract", "normalize", "advise", "generate", "export"] + current_idx = stages.index(state["current_stage"]) if state["current_stage"] in stages else 0 + + return { + "current_stage": state["current_stage"], + "stage_index": current_idx, + "total_stages": len(stages), + "progress_percent": (current_idx / len(stages)) * 100, + "awaiting_human": state["awaiting_human_input"], + "checkpoint": state["checkpoint_name"], + } diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..0bcad212f703f036263a582c424c79047a6a87bd --- /dev/null +++ b/app.py @@ -0,0 +1,4637 @@ +""" +Design 
System Extractor v2 — Main Application +============================================== + +Flow: +1. User enters URL +2. Agent 1 discovers pages → User confirms +3. Agent 1 extracts tokens (Desktop + Mobile) +4. Agent 2 normalizes tokens +5. Stage 1 UI: User reviews tokens (accept/reject, Desktop↔Mobile toggle) +6. Agent 3 proposes upgrades +7. Stage 2 UI: User selects options with live preview +8. Agent 4 generates JSON +9. Stage 3 UI: User exports +""" + +import os +import asyncio +import json +import gradio as gr +from datetime import datetime +from typing import Optional + +# Get HF token from environment +HF_TOKEN_FROM_ENV = os.getenv("HF_TOKEN", "") + +# ============================================================================= +# GLOBAL STATE +# ============================================================================= + +class AppState: + """Global application state.""" + def __init__(self): + self.reset() + + def reset(self): + self.discovered_pages = [] + self.base_url = "" + self.desktop_raw = None # ExtractedTokens + self.mobile_raw = None # ExtractedTokens + self.desktop_normalized = None # NormalizedTokens + self.mobile_normalized = None # NormalizedTokens + self.upgrade_recommendations = None # UpgradeRecommendations + self.selected_upgrades = {} # User selections + self.logs = [] + + def log(self, message: str): + timestamp = datetime.now().strftime("%H:%M:%S") + self.logs.append(f"[{timestamp}] {message}") + if len(self.logs) > 100: + self.logs.pop(0) + + def get_logs(self) -> str: + return "\n".join(self.logs) + +state = AppState() + + +# ============================================================================= +# MESSAGE HELPERS +# ============================================================================= + +def success_message(title: str, details: str, next_step: str) -> str: + """Generate a formatted success message with next-step guidance.""" + return f"## ✅ {title}\n\n{details}\n\n**Next step:** {next_step}" + + +def 
error_message(title: str, details: str, how_to_fix: str) -> str: + """Generate a formatted error message with fix guidance.""" + return f"## ❌ {title}\n\n{details}\n\n**How to fix:** {how_to_fix}" + + +# ============================================================================= +# LAZY IMPORTS +# ============================================================================= + +def get_crawler(): + import agents.crawler + return agents.crawler + +def get_extractor(): + import agents.extractor + return agents.extractor + +def get_normalizer(): + import agents.normalizer + return agents.normalizer + +def get_advisor(): + import agents.advisor + return agents.advisor + +def get_schema(): + import core.token_schema + return core.token_schema + + +# ============================================================================= +# PHASE 1: DISCOVER PAGES +# ============================================================================= + +async def discover_pages(url: str, progress=gr.Progress()): + """Discover pages from URL.""" + state.reset() + + if not url or not url.startswith(("http://", "https://")): + return error_message("Invalid URL", + "The URL must start with `https://` or `http://`.", + "Enter a full URL like `https://example.com` and try again."), "", None + + state.log(f"🚀 Starting discovery for: {url}") + progress(0.1, desc="🔍 Discovering pages...") + + try: + crawler = get_crawler() + discoverer = crawler.PageDiscoverer() + + pages = await discoverer.discover(url) + + state.discovered_pages = pages + state.base_url = url + + state.log(f"✅ Found {len(pages)} pages") + + # Format for display + pages_data = [] + for page in pages: + pages_data.append([ + True, # Selected by default + page.url, + page.title if page.title else "(No title)", + page.page_type.value, + "✓" if not page.error else f"⚠ {page.error}" + ]) + + progress(1.0, desc="✅ Discovery complete!") + + status = success_message( + f"Found {len(pages)} Pages", + f"The crawler discovered 
**{len(pages)} pages** on `{url}`. Review the table below — " + "use the **Select** checkboxes to choose which pages to scan for design tokens.", + "Click **'Extract Tokens (Desktop + Mobile)'** to begin extraction." + ) + + return status, state.get_logs(), pages_data + + except Exception as e: + import traceback + state.log(f"❌ Error: {str(e)}") + error_detail = str(e).lower() + if "timeout" in error_detail: + hint = "The website took too long to respond. Try again, or check if the site is accessible in your browser." + elif "dns" in error_detail or "name resolution" in error_detail: + hint = "Could not find this website. Please check the URL for typos." + elif "ssl" in error_detail or "certificate" in error_detail: + hint = "SSL/certificate error. Try using `http://` instead of `https://`, or check if the site has a valid certificate." + else: + hint = "Check that the URL is correct and the site is publicly accessible. Review the log above for details." + return error_message("Discovery Failed", str(e)[:200], hint), state.get_logs(), None + + +# ============================================================================= +# PHASE 2: EXTRACT TOKENS +# ============================================================================= + +async def extract_tokens(pages_data, progress=gr.Progress()): + """Extract tokens from selected pages (both viewports).""" + + state.log(f"📥 Received pages_data type: {type(pages_data)}") + + if pages_data is None: + return (error_message("No Pages Discovered", + "No pages have been discovered yet.", + "Go to **Step 1** above, enter a URL, and click **'Discover Pages'** first."), + state.get_logs(), None, None) + + # Get selected URLs - handle pandas DataFrame + selected_urls = [] + + try: + # Check if it's a pandas DataFrame + if hasattr(pages_data, 'iterrows'): + state.log(f"📥 DataFrame with {len(pages_data)} rows, columns: {list(pages_data.columns)}") + + for idx, row in pages_data.iterrows(): + # Get values by column name or 
async def extract_tokens(pages_data, progress=gr.Progress()):
    """Extract tokens from selected pages (both viewports).

    Args:
        pages_data: Rows from the page-selection table. Gradio may deliver a
            pandas DataFrame, a dict with a "data" key, or a plain list of
            rows; all three shapes are handled below.
        progress: Gradio progress reporter.

    Returns:
        A 10-tuple: (status_markdown, log_text, desktop_table, mobile_table,
        typography_html, colors_html, semantic_html, spacing_html,
        radius_html, shadows_html). On failure the tables are None and the
        HTML slots are empty strings.
    """

    state.log(f"📥 Received pages_data type: {type(pages_data)}")

    if pages_data is None:
        return (error_message("No Pages Discovered",
                "No pages have been discovered yet.",
                "Go to **Step 1** above, enter a URL, and click **'Discover Pages'** first."),
                state.get_logs(), None, None)

    # Get selected URLs - handle pandas DataFrame
    selected_urls = []

    try:
        # Check if it's a pandas DataFrame
        if hasattr(pages_data, 'iterrows'):
            state.log(f"📥 DataFrame with {len(pages_data)} rows, columns: {list(pages_data.columns)}")

            for idx, row in pages_data.iterrows():
                # Get values by column name or position
                try:
                    # Try column names first
                    is_selected = row.get('Select', row.iloc[0] if len(row) > 0 else False)
                    url = row.get('URL', row.iloc[1] if len(row) > 1 else '')
                except (KeyError, IndexError, TypeError):
                    # Fallback to positional
                    is_selected = row.iloc[0] if len(row) > 0 else False
                    url = row.iloc[1] if len(row) > 1 else ''

                if is_selected and url:
                    selected_urls.append(url)

        # If it's a dict (Gradio sometimes sends this)
        elif isinstance(pages_data, dict):
            state.log(f"📥 Dict with keys: {list(pages_data.keys())}")
            data = pages_data.get('data', [])
            for row in data:
                if isinstance(row, (list, tuple)) and len(row) >= 2 and row[0]:
                    selected_urls.append(row[1])

        # If it's a list
        elif isinstance(pages_data, (list, tuple)):
            state.log(f"📥 List with {len(pages_data)} items")
            for row in pages_data:
                if isinstance(row, (list, tuple)) and len(row) >= 2 and row[0]:
                    selected_urls.append(row[1])

    except Exception as e:
        # Best-effort parsing: log and fall through to the discovered-pages
        # fallback below rather than failing the whole extraction.
        state.log(f"❌ Error parsing pages_data: {str(e)}")
        import traceback
        state.log(traceback.format_exc())

    state.log(f"📋 Found {len(selected_urls)} selected URLs")

    # If still no URLs, try using stored discovered pages
    if not selected_urls and state.discovered_pages:
        state.log("⚠️ No URLs from table, using all discovered pages")
        selected_urls = [p.url for p in state.discovered_pages if not p.error][:10]

    if not selected_urls:
        return (error_message("No Pages Selected",
                "No pages are selected for extraction.",
                "Go back to the pages table above and check the **Select** boxes for the pages you want to extract, then click this button again."),
                state.get_logs(), None, None)

    # Limit to 10 pages for performance
    selected_urls = selected_urls[:10]

    state.log(f"📋 Extracting from {len(selected_urls)} pages:")
    for url in selected_urls[:3]:
        state.log(f" • {url}")
    if len(selected_urls) > 3:
        state.log(f" ... and {len(selected_urls) - 3} more")

    progress(0.05, desc="🚀 Starting extraction...")

    try:
        schema = get_schema()
        extractor_mod = get_extractor()
        normalizer_mod = get_normalizer()

        # === DESKTOP EXTRACTION ===
        state.log("")
        state.log("=" * 60)
        state.log("🖥️ DESKTOP EXTRACTION (1440px)")
        state.log("=" * 60)
        state.log("")
        state.log("📡 Enhanced extraction from 7 sources:")
        state.log(" 1. DOM computed styles (getComputedStyle)")
        state.log(" 2. CSS variables (:root { --color: })")
        state.log(" 3. SVG colors (fill, stroke)")
        state.log(" 4. Inline styles (style='color:')")
        state.log(" 5. Stylesheet rules (CSS files)")
        state.log(" 6. External CSS files (fetch & parse)")
        state.log(" 7. Page content scan (brute-force)")
        state.log("")

        progress(0.1, desc="🖥️ Extracting desktop tokens...")

        desktop_extractor = extractor_mod.TokenExtractor(viewport=schema.Viewport.DESKTOP)

        # Maps extractor progress onto the 0.10–0.45 slice of the overall bar.
        def desktop_progress(p):
            progress(0.1 + (p * 0.35), desc=f"🖥️ Desktop... {int(p*100)}%")

        state.desktop_raw = await desktop_extractor.extract(selected_urls, progress_callback=desktop_progress)

        # Log extraction details
        state.log("📊 EXTRACTION RESULTS:")
        state.log(f" Colors: {len(state.desktop_raw.colors)} unique")
        state.log(f" Typography: {len(state.desktop_raw.typography)} styles")
        state.log(f" Spacing: {len(state.desktop_raw.spacing)} values")
        state.log(f" Radius: {len(state.desktop_raw.radius)} values")
        state.log(f" Shadows: {len(state.desktop_raw.shadows)} values")

        # Store foreground-background pairs for real AA checking in Stage 2
        if hasattr(desktop_extractor, 'fg_bg_pairs') and desktop_extractor.fg_bg_pairs:
            state.fg_bg_pairs = desktop_extractor.fg_bg_pairs
            state.log(f" FG/BG Pairs: {len(state.fg_bg_pairs)} unique pairs for AA checking")
        else:
            state.fg_bg_pairs = []

        # Log CSS variables if found
        if hasattr(desktop_extractor, 'css_variables') and desktop_extractor.css_variables:
            state.log("")
            state.log(f"🎨 CSS Variables found: {len(desktop_extractor.css_variables)}")
            for var_name, var_value in list(desktop_extractor.css_variables.items())[:5]:
                state.log(f" {var_name}: {var_value}")
            if len(desktop_extractor.css_variables) > 5:
                state.log(f" ... and {len(desktop_extractor.css_variables) - 5} more")

        # Log warnings if any
        if desktop_extractor.warnings:
            state.log("")
            state.log("⚠️ Warnings:")
            for w in desktop_extractor.warnings[:3]:
                state.log(f" {w}")

        # Normalize desktop
        state.log("")
        state.log("🔄 Normalizing (deduping, naming)...")
        state.desktop_normalized = normalizer_mod.normalize_tokens(state.desktop_raw)
        state.log(f" ✅ Normalized: {len(state.desktop_normalized.colors)} colors, {len(state.desktop_normalized.typography)} typography, {len(state.desktop_normalized.spacing)} spacing")

        # === MOBILE EXTRACTION ===
        state.log("")
        state.log("=" * 60)
        state.log("📱 MOBILE EXTRACTION (375px)")
        state.log("=" * 60)
        state.log("")

        progress(0.5, desc="📱 Extracting mobile tokens...")

        mobile_extractor = extractor_mod.TokenExtractor(viewport=schema.Viewport.MOBILE)

        # Maps extractor progress onto the 0.50–0.85 slice of the overall bar.
        def mobile_progress(p):
            progress(0.5 + (p * 0.35), desc=f"📱 Mobile... {int(p*100)}%")

        state.mobile_raw = await mobile_extractor.extract(selected_urls, progress_callback=mobile_progress)

        # Log extraction details
        state.log("📊 EXTRACTION RESULTS:")
        state.log(f" Colors: {len(state.mobile_raw.colors)} unique")
        state.log(f" Typography: {len(state.mobile_raw.typography)} styles")
        state.log(f" Spacing: {len(state.mobile_raw.spacing)} values")
        state.log(f" Radius: {len(state.mobile_raw.radius)} values")
        state.log(f" Shadows: {len(state.mobile_raw.shadows)} values")

        # Normalize mobile
        state.log("")
        state.log("🔄 Normalizing...")
        state.mobile_normalized = normalizer_mod.normalize_tokens(state.mobile_raw)
        state.log(f" ✅ Normalized: {len(state.mobile_normalized.colors)} colors, {len(state.mobile_normalized.typography)} typography, {len(state.mobile_normalized.spacing)} spacing")

        # === FIRECRAWL CSS EXTRACTION (Agent 1B) ===
        # Best-effort enrichment: failures are logged and skipped, never fatal.
        progress(0.88, desc="🔥 Firecrawl CSS analysis...")

        try:
            from agents.firecrawl_extractor import extract_css_colors

            # Get base URL for Firecrawl
            base_url = selected_urls[0] if selected_urls else state.base_url

            # Extract CSS colors using Firecrawl
            firecrawl_result = await extract_css_colors(
                url=base_url,
                api_key=None,  # Will use fallback method
                log_callback=state.log
            )

            # Merge Firecrawl colors into desktop normalized
            firecrawl_colors = firecrawl_result.get("colors", {})

            if firecrawl_colors:
                state.log("")
                state.log("🔀 Merging Firecrawl colors with Playwright extraction...")

                # Count new colors
                new_colors_count = 0

                for hex_val, color_data in firecrawl_colors.items():
                    # Check if this color already exists (case-insensitive hex
                    # match against colors found by Playwright)
                    existing = False
                    for name, existing_color in state.desktop_normalized.colors.items():
                        if existing_color.value.lower() == hex_val.lower():
                            existing = True
                            # Update frequency
                            existing_color.frequency += color_data.get("frequency", 1)
                            if "firecrawl" not in existing_color.contexts:
                                existing_color.contexts.append("firecrawl")
                            break

                    if not existing:
                        # Add new color from Firecrawl
                        from core.token_schema import ColorToken, TokenSource, Confidence

                        new_token = ColorToken(
                            value=hex_val,
                            frequency=color_data.get("frequency", 1),
                            contexts=["firecrawl"] + color_data.get("contexts", []),
                            elements=["css-file"],
                            css_properties=color_data.get("sources", []),
                            contrast_white=color_data.get("contrast_white", 0),
                            contrast_black=color_data.get("contrast_black", 0),
                            source=TokenSource.DETECTED,
                            confidence=Confidence.MEDIUM,
                        )

                        # Generate name based on color characteristics (not garbage like firecrawl.34)
                        # This will be a fallback; semantic analysis may override later
                        new_token.suggested_name = None  # Let consolidation generate proper name

                        state.desktop_normalized.colors[hex_val] = new_token
                        new_colors_count += 1

                state.log(f" ✅ Added {new_colors_count} new colors from Firecrawl")
                state.log(f" 📊 Total colors now: {len(state.desktop_normalized.colors)}")

        except Exception as e:
            state.log(f" ⚠️ Firecrawl extraction skipped: {str(e)}")

        # === SEMANTIC COLOR ANALYSIS (Agent 1C) ===
        # Also best-effort: on failure the semantic preview is left empty.
        progress(0.92, desc="🧠 Semantic color analysis...")

        semantic_result = {}
        semantic_preview_html = ""

        try:
            from agents.semantic_analyzer import SemanticColorAnalyzer, generate_semantic_preview_html

            # Create analyzer (using rule-based for now, can add LLM later)
            semantic_analyzer = SemanticColorAnalyzer(llm_provider=None)

            # Run analysis
            semantic_result = semantic_analyzer.analyze_sync(
                colors=state.desktop_normalized.colors,
                log_callback=state.log
            )

            # Store in state for Stage 2
            state.semantic_analysis = semantic_result

            # Generate preview HTML
            semantic_preview_html = generate_semantic_preview_html(semantic_result)

        except Exception as e:
            state.log(f" ⚠️ Semantic analysis skipped: {str(e)}")
            import traceback
            state.log(traceback.format_exc())

        progress(0.95, desc="📊 Preparing results...")

        # Format results for Stage 1 UI
        desktop_data = format_tokens_for_display(state.desktop_normalized)
        mobile_data = format_tokens_for_display(state.mobile_normalized)

        # Generate visual previews - AS-IS for Stage 1 (no ramps, no enhancements)
        state.log("")
        state.log("🎨 Generating AS-IS visual previews...")

        from core.preview_generator import (
            generate_typography_preview_html,
            generate_colors_asis_preview_html,
            generate_spacing_asis_preview_html,
            generate_radius_asis_preview_html,
            generate_shadows_asis_preview_html,
        )

        # Get detected font
        # NOTE(review): get_detected_fonts() is defined elsewhere in this
        # file — presumably it reads the extracted typography; confirm.
        fonts = get_detected_fonts()
        primary_font = fonts.get("primary", "Open Sans")

        # Convert typography tokens to dict format for preview
        typo_dict = {}
        for name, t in state.desktop_normalized.typography.items():
            typo_dict[name] = {
                "font_size": t.font_size,
                "font_weight": t.font_weight,
                "line_height": t.line_height or "1.5",
                "letter_spacing": "0",
            }

        # Convert color tokens to dict format for preview (with full metadata)
        color_dict = {}
        for name, c in state.desktop_normalized.colors.items():
            color_dict[name] = {
                "value": c.value,
                "frequency": c.frequency,
                "contexts": c.contexts[:3] if c.contexts else [],
                "elements": c.elements[:3] if c.elements else [],
                "css_properties": c.css_properties[:3] if c.css_properties else [],
                "contrast_white": c.contrast_white,
                "contrast_black": getattr(c, 'contrast_black', 0),
            }

        # Convert spacing tokens to dict format
        spacing_dict = {}
        for name, s in state.desktop_normalized.spacing.items():
            spacing_dict[name] = {
                "value": s.value,
                "value_px": s.value_px,
            }

        # Convert radius tokens to dict format
        radius_dict = {}
        for name, r in state.desktop_normalized.radius.items():
            radius_dict[name] = {"value": r.value}

        # Convert shadow tokens to dict format
        shadow_dict = {}
        for name, s in state.desktop_normalized.shadows.items():
            shadow_dict[name] = {"value": s.value}

        # Generate AS-IS previews (Stage 1 - raw extracted values)
        typography_preview_html = generate_typography_preview_html(
            typography_tokens=typo_dict,
            font_family=primary_font,
            sample_text="The quick brown fox jumps over the lazy dog",
        )

        # AS-IS color preview (no ramps)
        colors_asis_preview_html = generate_colors_asis_preview_html(
            color_tokens=color_dict,
        )

        # AS-IS spacing preview
        spacing_asis_preview_html = generate_spacing_asis_preview_html(
            spacing_tokens=spacing_dict,
        )

        # AS-IS radius preview
        radius_asis_preview_html = generate_radius_asis_preview_html(
            radius_tokens=radius_dict,
        )

        # AS-IS shadows preview
        shadows_asis_preview_html = generate_shadows_asis_preview_html(
            shadow_tokens=shadow_dict,
        )

        state.log(" ✅ Typography preview generated")
        state.log(" ✅ Colors AS-IS preview generated (no ramps)")
        state.log(" ✅ Semantic color analysis preview generated")
        state.log(" ✅ Spacing AS-IS preview generated")
        state.log(" ✅ Radius AS-IS preview generated")
        state.log(" ✅ Shadows AS-IS preview generated")

        # Get semantic summary for status
        brand_count = len(semantic_result.get("brand", {}))
        text_count = len(semantic_result.get("text", {}))
        bg_count = len(semantic_result.get("background", {}))

        state.log("")
        state.log("=" * 50)
        state.log("✅ EXTRACTION COMPLETE!")
        state.log(f" Enhanced extraction captured:")
        state.log(f" • {len(state.desktop_normalized.colors)} colors (DOM + CSS vars + SVG + inline)")
        state.log(f" • {len(state.desktop_normalized.typography)} typography styles")
        state.log(f" • {len(state.desktop_normalized.spacing)} spacing values")
        state.log(f" • {len(state.desktop_normalized.radius)} radius values")
        state.log(f" • {len(state.desktop_normalized.shadows)} shadow values")
        state.log(f" Semantic Analysis:")
        state.log(f" • {brand_count} brand colors identified")
        state.log(f" • {text_count} text colors identified")
        state.log(f" • {bg_count} background colors identified")
        state.log("=" * 50)

        progress(1.0, desc="✅ Complete!")

        status = f"""## ✅ Extraction Complete!

| Viewport | Colors | Typography | Spacing | Radius | Shadows |
|----------|--------|------------|---------|--------|---------|
| Desktop | {len(state.desktop_normalized.colors)} | {len(state.desktop_normalized.typography)} | {len(state.desktop_normalized.spacing)} | {len(state.desktop_normalized.radius)} | {len(state.desktop_normalized.shadows)} |
| Mobile | {len(state.mobile_normalized.colors)} | {len(state.mobile_normalized.typography)} | {len(state.mobile_normalized.spacing)} | {len(state.mobile_normalized.radius)} | {len(state.mobile_normalized.shadows)} |

**Primary Font:** {primary_font}

**Semantic Analysis:** {brand_count} brand, {text_count} text, {bg_count} background colors

**Enhanced Extraction:** DOM + CSS Variables + SVG + Inline + Stylesheets + Firecrawl

**Next:** Review the tokens below. Accept or reject, then proceed to Stage 2.
"""

        # Return all AS-IS previews including semantic
        return (
            status,
            state.get_logs(),
            desktop_data,
            mobile_data,
            typography_preview_html,
            colors_asis_preview_html,
            semantic_preview_html,
            spacing_asis_preview_html,
            radius_asis_preview_html,
            shadows_asis_preview_html,
        )

    except Exception as e:
        import traceback
        state.log(f"❌ Error: {str(e)}")
        state.log(traceback.format_exc())
        error_detail = str(e).lower()
        if "timeout" in error_detail or "navigation" in error_detail:
            hint = "The page took too long to load. Try selecting fewer pages, or check if the site requires authentication."
        elif "no tokens" in error_detail or "empty" in error_detail:
            hint = "No design tokens could be extracted. The site may use unusual CSS patterns. Try a different page selection."
        else:
            hint = "Check the log above for details. Try selecting fewer pages or a different set of pages."
        return (error_message("Extraction Failed", str(e)[:200], hint),
                state.get_logs(), None, None, "", "", "", "", "", "")
+ return (error_message("Extraction Failed", str(e)[:200], hint), + state.get_logs(), None, None, "", "", "", "", "", "") + + +def format_tokens_for_display(normalized) -> dict: + """Format normalized tokens for Gradio display.""" + if normalized is None: + return {"colors": [], "typography": [], "spacing": []} + + # Colors are now a dict + colors = [] + color_items = list(normalized.colors.values()) if isinstance(normalized.colors, dict) else normalized.colors + for c in sorted(color_items, key=lambda x: -x.frequency)[:50]: + colors.append([ + True, # Accept checkbox + c.value, + c.suggested_name or "", + c.frequency, + c.confidence.value if c.confidence else "medium", + f"{c.contrast_white:.1f}:1" if c.contrast_white else "N/A", + "✓" if c.wcag_aa_small_text else "✗", + ", ".join(c.contexts[:2]) if c.contexts else "", + ]) + + # Typography + typography = [] + typo_items = list(normalized.typography.values()) if isinstance(normalized.typography, dict) else normalized.typography + for t in sorted(typo_items, key=lambda x: -x.frequency)[:30]: + typography.append([ + True, # Accept checkbox + t.font_family, + t.font_size, + str(t.font_weight), + t.line_height or "", + t.suggested_name or "", + t.frequency, + t.confidence.value if t.confidence else "medium", + ]) + + # Spacing + spacing = [] + spacing_items = list(normalized.spacing.values()) if isinstance(normalized.spacing, dict) else normalized.spacing + for s in sorted(spacing_items, key=lambda x: x.value_px)[:20]: + spacing.append([ + True, # Accept checkbox + s.value, + f"{s.value_px}px", + s.suggested_name or "", + s.frequency, + "✓" if s.fits_base_8 else "", + s.confidence.value if s.confidence else "medium", + ]) + + # Radius + radius = [] + radius_items = list(normalized.radius.values()) if isinstance(normalized.radius, dict) else normalized.radius + for r in sorted(radius_items, key=lambda x: -x.frequency)[:20]: + radius.append([ + True, # Accept checkbox + r.value, + r.frequency, + ", 
def switch_viewport(viewport: str):
    """Return the token table rows for the requested viewport."""
    wants_desktop = viewport == "Desktop (1440px)"
    tokens = state.desktop_normalized if wants_desktop else state.mobile_normalized
    tables = format_tokens_for_display(tokens)
    return tables["colors"], tables["typography"], tables["spacing"], tables["radius"]


# Legacy run_stage2_analysis() removed in v3 — use run_stage2_analysis_v2()


def normalized_to_dict(normalized) -> dict:
    """Convert NormalizedTokens to dict for workflow.

    v3: Includes full context (elements, contexts, role_hint, blur_px, etc.)
    so agents can reason about WHY each token is used.
    """
    if not normalized:
        return {}

    # Colors — include contexts, elements, role_hint for AURORA
    colors = {
        name: {
            "value": c.value,
            "frequency": c.frequency,
            "suggested_name": c.suggested_name,
            "contrast_white": c.contrast_white,
            "contrast_black": c.contrast_black,
            "contexts": getattr(c, 'contexts', []),
            "elements": getattr(c, 'elements', []),
            "role_hint": getattr(c, 'role_hint', None),
        }
        for name, c in normalized.colors.items()
    }

    # Typography — include elements for hierarchy analysis
    typography = {
        name: {
            "font_family": t.font_family,
            "font_size": t.font_size,
            "font_weight": t.font_weight,
            "line_height": t.line_height,
            "frequency": t.frequency,
            "elements": getattr(t, 'elements', []),
        }
        for name, t in normalized.typography.items()
    }

    # Spacing — include contexts and grid-alignment flags
    spacing = {
        name: {
            "value": s.value,
            "value_px": s.value_px,
            "frequency": s.frequency,
            "contexts": getattr(s, 'contexts', []),
            "fits_base_4": getattr(s, 'fits_base_4', False),
            "fits_base_8": getattr(s, 'fits_base_8', False),
        }
        for name, s in normalized.spacing.items()
    }

    # Radius — include grid-alignment flags and elements
    radius = {
        name: {
            "value": r.value,
            "value_px": getattr(r, 'value_px', None),
            "frequency": r.frequency,
            "elements": getattr(r, 'elements', []),
            "fits_base_4": getattr(r, 'fits_base_4', False),
            "fits_base_8": getattr(r, 'fits_base_8', False),
        }
        for name, r in normalized.radius.items()
    }

    # Shadows — include parsed components for elevation analysis
    shadows = {
        name: {
            "value": s.value,
            "frequency": s.frequency,
            "blur_px": getattr(s, 'blur_px', None),
            "y_offset_px": getattr(s, 'y_offset_px', None),
            "elements": getattr(s, 'elements', []),
        }
        for name, s in normalized.shadows.items()
    }

    return {
        "colors": colors,
        "typography": typography,
        "spacing": spacing,
        "radius": radius,
        "shadows": shadows,
    }
Stage 1 extraction must be completed before running analysis.", + "Go back to **Step 1**, enter a URL, discover pages, and extract tokens first.") + ) + + # Default benchmarks if none selected + if not selected_benchmarks or len(selected_benchmarks) == 0: + selected_benchmarks = [ + "material_design_3", + "shopify_polaris", + "atlassian_design", + ] + + state.log("") + state.log("═" * 60) + state.log("🚀 STAGE 2: MULTI-AGENT ANALYSIS") + state.log("═" * 60) + state.log(f" Started: {datetime.now().strftime('%H:%M:%S')}") + state.log(f" Benchmarks: {', '.join(selected_benchmarks)}") + state.log("") + + # Import dataclasses early so fallbacks always work + try: + from agents.llm_agents import ( + BrandIdentification, + BenchmarkAdvice, + BestPracticesResult, + ) + except ImportError: + # Minimal v3-compatible fallback dataclasses + from dataclasses import dataclass, field + @dataclass + class BrandIdentification: + brand_primary: dict = field(default_factory=dict) + brand_secondary: dict = field(default_factory=dict) + brand_accent: dict = field(default_factory=dict) + palette_strategy: str = "" + cohesion_score: int = 5 + cohesion_notes: str = "" + naming_map: dict = field(default_factory=dict) + semantic_names: dict = field(default_factory=dict) + self_evaluation: dict = field(default_factory=dict) + reasoning_trace: list = field(default_factory=list) + validation_passed: bool = False + retry_count: int = 0 + typography_notes: str = "" + spacing_notes: str = "" + radius_notes: str = "" + shadow_notes: str = "" + def to_dict(self): + return {k: getattr(self, k) for k in ['brand_primary', 'brand_secondary', 'brand_accent', 'palette_strategy', 'cohesion_score', 'naming_map', 'self_evaluation']} + + @dataclass + class BenchmarkAdvice: + recommended_benchmark: str = "" + recommended_benchmark_name: str = "" + reasoning: str = "" + alignment_changes: list = field(default_factory=list) + pros_of_alignment: list = field(default_factory=list) + cons_of_alignment: list = 
field(default_factory=list) + alternative_benchmarks: list = field(default_factory=list) + self_evaluation: dict = field(default_factory=dict) + reasoning_trace: list = field(default_factory=list) + typography_comparison: dict = field(default_factory=dict) + spacing_comparison: dict = field(default_factory=dict) + color_comparison: dict = field(default_factory=dict) + radius_comparison: dict = field(default_factory=dict) + shadow_comparison: dict = field(default_factory=dict) + def to_dict(self): + return {k: getattr(self, k) for k in ['recommended_benchmark', 'recommended_benchmark_name', 'reasoning', 'alignment_changes']} + + @dataclass + class BestPracticesResult: + overall_score: int = 50 + checks: dict = field(default_factory=dict) + priority_fixes: list = field(default_factory=list) + passing_practices: list = field(default_factory=list) + failing_practices: list = field(default_factory=list) + self_evaluation: dict = field(default_factory=dict) + reasoning_trace: list = field(default_factory=list) + validation_passed: bool = False + color_assessment: dict = field(default_factory=dict) + typography_assessment: dict = field(default_factory=dict) + spacing_assessment: dict = field(default_factory=dict) + radius_assessment: dict = field(default_factory=dict) + shadow_assessment: dict = field(default_factory=dict) + def to_dict(self): + return {k: getattr(self, k) for k in ['overall_score', 'checks', 'priority_fixes', 'passing_practices', 'failing_practices']} + + # Initialize results with defaults (for graceful degradation) + rule_results = None + benchmark_comparisons = [] + brand_result = None + benchmark_advice = None + best_practices = None + final_synthesis = None + + progress(0.05, desc="⚙️ Running Rule Engine...") + + try: + # ================================================================= + # LAYER 1: RULE ENGINE (FREE) - Critical, must succeed + # ================================================================= + try: + from core.rule_engine import 
run_rule_engine + + # Convert tokens to dict + desktop_dict = normalized_to_dict(state.desktop_normalized) + mobile_dict = normalized_to_dict(state.mobile_normalized) + + # Validate we have data + if not desktop_dict.get("colors") and not desktop_dict.get("typography"): + raise ValueError("No tokens extracted from Stage 1") + + # Run rule engine + rule_results = run_rule_engine( + typography_tokens=desktop_dict.get("typography", {}), + color_tokens=desktop_dict.get("colors", {}), + spacing_tokens=desktop_dict.get("spacing", {}), + radius_tokens=desktop_dict.get("radius", {}), + shadow_tokens=desktop_dict.get("shadows", {}), + log_callback=state.log, + fg_bg_pairs=getattr(state, 'fg_bg_pairs', None), + ) + + state.rule_engine_results = rule_results + state.log("") + state.log(" ✅ Rule Engine: SUCCESS") + + except Exception as e: + state.log(f" ❌ Rule Engine FAILED: {str(e)[:100]}") + state.log(" └─ Cannot proceed without rule engine results") + import traceback + state.log(traceback.format_exc()[:500]) + return create_stage2_error_response( + error_message("Rule Engine Failed", + f"The rule engine could not analyze your tokens: {str(e)[:150]}", + "This usually means the extracted tokens are incomplete. 
Try re-running Stage 1 extraction with different pages selected.") + ) + + progress(0.20, desc="🔬 Researching benchmarks...") + + # ================================================================= + # LAYER 2: BENCHMARK RESEARCH - Can use fallback + # ================================================================= + try: + from agents.benchmark_researcher import BenchmarkResearcher, FALLBACK_BENCHMARKS, BenchmarkData + + # Try to get Firecrawl client (optional) + firecrawl_client = None + try: + from agents.firecrawl_extractor import get_firecrawl_client + firecrawl_client = get_firecrawl_client() + state.log(" ├─ Firecrawl client: Available") + except Exception as fc_err: + state.log(f" ├─ Firecrawl client: Not available ({str(fc_err)[:30]})") + state.log(" │ └─ Will use cached/fallback data") + + # Get HF client for LLM extraction (optional) + hf_client = None + try: + from core.hf_inference import get_inference_client + hf_client = get_inference_client() + state.log(" ├─ HF client: Available") + except Exception as hf_err: + state.log(f" ├─ HF client: Not available ({str(hf_err)[:30]})") + + researcher = BenchmarkResearcher( + firecrawl_client=firecrawl_client, + hf_client=hf_client, + ) + + # Research selected benchmarks (with fallback) + try: + benchmarks = await researcher.research_selected_benchmarks( + selected_keys=selected_benchmarks, + log_callback=state.log, + ) + except Exception as research_err: + state.log(f" ⚠️ Research failed, using fallback: {str(research_err)[:50]}") + # Use fallback data + benchmarks = [] + for key in selected_benchmarks: + if key in FALLBACK_BENCHMARKS: + data = FALLBACK_BENCHMARKS[key] + benchmarks.append(BenchmarkData( + key=key, + name=key.replace("_", " ").title(), + short_name=key.split("_")[0].title(), + vendor="", + icon="📦", + typography=data.get("typography", {}), + spacing=data.get("spacing", {}), + colors=data.get("colors", {}), + fetched_at=datetime.now().isoformat(), + confidence="fallback", + best_for=[], + )) 
+ + # Compare to benchmarks + if benchmarks and rule_results: + benchmark_comparisons = researcher.compare_to_benchmarks( + your_ratio=rule_results.typography.detected_ratio, + your_base_size=int(rule_results.typography.base_size) if rule_results.typography.sizes_px else 16, + your_spacing_grid=rule_results.spacing.detected_base, + benchmarks=benchmarks, + log_callback=state.log, + ) + state.benchmark_comparisons = benchmark_comparisons + state.log("") + state.log(f" ✅ Benchmark Research: SUCCESS ({len(benchmarks)} systems)") + else: + state.log(" ⚠️ No benchmarks available for comparison") + + except Exception as e: + state.log(f" ⚠️ Benchmark Research FAILED: {str(e)[:100]}") + state.log(" └─ Continuing without benchmark comparison...") + benchmark_comparisons = [] + + progress(0.40, desc="🤖 Running LLM Agents in parallel...") + + # ================================================================= + # LAYER 3: LLM AGENTS — v3: ALL token types, ReAct reasoning + # ================================================================= + try: + from agents.llm_agents import ( + BrandIdentifierAgent, + BenchmarkAdvisorAgent, + BestPracticesValidatorAgent, + BrandIdentification, + BenchmarkAdvice, + BestPracticesResult, + ) + + state.log("") + state.log("=" * 60) + state.log("LAYER 3: LLM AGENTS (ReAct + Parallel)") + state.log("=" * 60) + state.log(" Each agent researches ALL token types:") + state.log(" Colors + Typography + Spacing + Radius + Shadows") + state.log("") + + # Check if HF client is available + if not hf_client: + try: + from core.hf_inference import get_inference_client + hf_client = get_inference_client() + except Exception: + state.log(" HF client not available - skipping LLM agents") + hf_client = None + + if hf_client: + # Initialize agents + brand_agent = BrandIdentifierAgent(hf_client) + benchmark_agent = BenchmarkAdvisorAgent(hf_client) + best_practices_agent = BestPracticesValidatorAgent(hf_client) + + # Full token dict with all context + 
desktop_dict = normalized_to_dict(state.desktop_normalized) + + # Prepare shared data for agents + typo = rule_results.typography + spacing = rule_results.spacing + sizes_str = ", ".join([f"{s}px" for s in typo.sizes_px[:10]]) if typo.sizes_px else "N/A" + sp_vals = ", ".join([f"{v}px" for v in spacing.current_values[:10]]) if spacing.current_values else "N/A" + color_count = rule_results.color_stats.unique_count + brand_info_str = "" + + # Radius/shadow formatted strings for ATLAS + from agents.llm_agents import _fmt_radius, _fmt_shadows + radius_str = _fmt_radius(desktop_dict.get("radius", {})) + shadow_str = _fmt_shadows(desktop_dict.get("shadows", {})) + + # ─── AURORA: ALL token types ─── + async def _run_aurora(): + try: + return await brand_agent.analyze( + color_tokens=desktop_dict.get("colors", {}), + typography_tokens=desktop_dict.get("typography", {}), + spacing_tokens=desktop_dict.get("spacing", {}), + radius_tokens=desktop_dict.get("radius", {}), + shadow_tokens=desktop_dict.get("shadows", {}), + log_callback=state.log, + ) + except Exception as e: + state.log(f" AURORA failed: {str(e)[:120]}") + return BrandIdentification() + + # ─── ATLAS: ALL token types ─── + async def _run_atlas(): + if not benchmark_comparisons: + state.log(" ATLAS skipped (no benchmarks)") + return BenchmarkAdvice() + try: + return await benchmark_agent.analyze( + user_ratio=typo.detected_ratio, + user_base=int(typo.base_size) if typo.sizes_px else 16, + user_spacing=spacing.detected_base, + benchmark_comparisons=benchmark_comparisons, + color_count=color_count, + brand_info=brand_info_str, + user_sizes=sizes_str, + spacing_values=sp_vals, + radius_data=radius_str, + shadow_data=shadow_str, + log_callback=state.log, + ) + except Exception as e: + state.log(f" ATLAS failed: {str(e)[:120]}") + return BenchmarkAdvice() + + # ─── SENTINEL: ALL token types ─── + async def _run_sentinel(): + try: + return await best_practices_agent.analyze( + rule_engine_results=rule_results, + 
radius_tokens=desktop_dict.get("radius", {}), + shadow_tokens=desktop_dict.get("shadows", {}), + log_callback=state.log, + ) + except Exception as e: + state.log(f" SENTINEL failed: {str(e)[:120]}") + return BestPracticesResult(overall_score=rule_results.consistency_score) + + # Execute AURORA + ATLAS + SENTINEL in parallel + import asyncio + state.log(" Running 3 agents in parallel: AURORA | ATLAS | SENTINEL") + state.log("") + brand_result, benchmark_advice, best_practices = await asyncio.gather( + _run_aurora(), + _run_atlas(), + _run_sentinel(), + ) + else: + # No HF client - use defaults + state.log(" Using default values (no LLM)") + brand_result = BrandIdentification() + benchmark_advice = BenchmarkAdvice() + best_practices = BestPracticesResult(overall_score=rule_results.consistency_score) + + except Exception as e: + state.log(f" LLM Agents FAILED: {str(e)[:100]}") + brand_result = BrandIdentification() if not brand_result else brand_result + benchmark_advice = BenchmarkAdvice() if not benchmark_advice else benchmark_advice + best_practices = BestPracticesResult(overall_score=rule_results.consistency_score if rule_results else 50) + + progress(0.70, desc="Synthesizing results...") + + # ================================================================= + # LAYER 4: NEXUS — Tree of Thought synthesis + # ================================================================= + try: + from agents.llm_agents import HeadSynthesizerAgent, HeadSynthesis, post_validate_stage2 + + if hf_client and brand_result and benchmark_advice and best_practices: + head_agent = HeadSynthesizerAgent(hf_client) + + try: + final_synthesis = await head_agent.synthesize( + rule_engine_results=rule_results, + benchmark_comparisons=benchmark_comparisons, + brand_identification=brand_result, + benchmark_advice=benchmark_advice, + best_practices=best_practices, + log_callback=state.log, + ) + except Exception as e: + state.log(f" NEXUS failed: {str(e)[:120]}") + import traceback + state.log(f" 
{traceback.format_exc()[:200]}") + final_synthesis = None + + # ─── POST-VALIDATION (deterministic) ─── + if final_synthesis: + try: + pv_issues = post_validate_stage2( + aurora=brand_result, + sentinel=best_practices, + nexus=final_synthesis, + rule_engine=rule_results, + ) + if pv_issues: + state.log("") + state.log(" POST-VALIDATION: Issues found") + for issue in pv_issues[:5]: + state.log(f" {issue}") + else: + state.log(" POST-VALIDATION: All checks passed") + except Exception as pv_err: + state.log(f" POST-VALIDATION error: {str(pv_err)[:80]}") + + # Create fallback synthesis if needed + if not final_synthesis: + state.log(" Creating fallback synthesis...") + final_synthesis = create_fallback_synthesis( + rule_results, benchmark_comparisons, brand_result, best_practices + ) + + state.final_synthesis = final_synthesis + state.brand_result = brand_result # Preserve AURORA naming_map for export + + # ─── AGENT EVALUATION SUMMARY ─── + state.log("") + state.log("=" * 60) + state.log("AGENT EVALUATION SUMMARY") + state.log("=" * 60) + + def _eval_line(name, emoji, result_obj): + se = getattr(result_obj, 'self_evaluation', None) or {} + if isinstance(se, dict) and se: + conf = se.get('confidence', '?') + dq = se.get('data_quality', '?') + flags = se.get('flags', []) + flag_str = f", flags={flags}" if flags else "" + return f" {emoji} {name}: confidence={conf}/10, data={dq}{flag_str}" + return f" {emoji} {name}: no self-evaluation returned" + + if brand_result: + named = len(brand_result.naming_map) if brand_result.naming_map else 0 + valid = "PASSED" if brand_result.validation_passed else "FALLBACK" + state.log(_eval_line("AURORA (Brand ID)", "", brand_result) + f", named={named}, critic={valid}") + if benchmark_advice: + state.log(_eval_line("ATLAS (Benchmark)", "", benchmark_advice)) + if best_practices: + bp_score = getattr(best_practices, 'overall_score', '?') + valid = "PASSED" if best_practices.validation_passed else "FIXED" + state.log(_eval_line("SENTINEL 
(Practices)", "", best_practices) + f", score={bp_score}/100, critic={valid}") + if final_synthesis: + synth_overall = final_synthesis.scores.get('overall', '?') if final_synthesis.scores else '?' + chosen = final_synthesis.chosen_perspective or "?" + state.log(_eval_line("NEXUS (Synthesis)", "", final_synthesis) + f", overall={synth_overall}/100, perspective={chosen}") + + state.log("=" * 60) + state.log("") + + except Exception as e: + state.log(f" Synthesis FAILED: {str(e)[:100]}") + final_synthesis = create_fallback_synthesis( + rule_results, benchmark_comparisons, brand_result, best_practices + ) + state.final_synthesis = final_synthesis + + progress(0.85, desc="📊 Formatting results...") + + # ================================================================= + # FORMAT OUTPUTS FOR UI + # ================================================================= + + try: + # Build status markdown + status_md = format_stage2_status_v2( + rule_results=rule_results, + final_synthesis=final_synthesis, + best_practices=best_practices, + ) + + # Build benchmark comparison HTML + benchmark_md = format_benchmark_comparison_v2( + benchmark_comparisons=benchmark_comparisons, + benchmark_advice=benchmark_advice, + ) + + # Build scores dashboard HTML + scores_html = format_scores_dashboard_v2( + rule_results=rule_results, + final_synthesis=final_synthesis, + best_practices=best_practices, + ) + + # Build priority actions HTML + actions_html = format_priority_actions_v2( + rule_results=rule_results, + final_synthesis=final_synthesis, + best_practices=best_practices, + ) + + # Build color recommendations table + color_recs_table = format_color_recommendations_table_v2( + rule_results=rule_results, + brand_result=brand_result, + final_synthesis=final_synthesis, + ) + + # Get fonts and typography data + fonts = get_detected_fonts() + base_size = get_base_font_size() + + typography_desktop_data = format_typography_comparison_viewport( + state.desktop_normalized, base_size, "desktop" + ) 
+ typography_mobile_data = format_typography_comparison_viewport( + state.mobile_normalized, base_size, "mobile" + ) + + # Generate spacing comparison table from rule_results + spacing_data = [] + if rule_results and rule_results.spacing: + sp = rule_results.spacing + current_vals = sp.current_values or [] + suggested_8 = [i * 8 for i in range(1, 11)] + suggested_4 = [i * 4 for i in range(1, 11)] + for i in range(min(10, max(len(current_vals), 10))): + cur = f"{current_vals[i]}px" if i < len(current_vals) else "—" + g8 = f"{suggested_8[i]}px" if i < len(suggested_8) else "—" + g4 = f"{suggested_4[i]}px" if i < len(suggested_4) else "—" + spacing_data.append([cur, g8, g4]) + + # Generate base colors, color ramps, radius, shadows markdown + base_colors_md = format_base_colors() + color_ramps_md = "" # Visual ramps are in color_ramps_preview_html + try: + from core.preview_generator import generate_color_ramp + colors = list(state.desktop_normalized.colors.values()) + colors.sort(key=lambda c: -c.frequency) + ramp_lines = ["### 🌈 Color Ramps (Top Colors)", ""] + for c in colors[:6]: + ramp = generate_color_ramp(c.value) + if ramp: + shades_str = " → ".join(f"`{s['hex']}`" for s in ramp[::2]) # every other shade + ramp_lines.append(f"**{c.value}** ({c.frequency}x): {shades_str}") + ramp_lines.append("") + color_ramps_md = "\n".join(ramp_lines) + except Exception: + color_ramps_md = "*Color ramps shown in visual preview above*" + + radius_md = format_radius_with_tokens() + shadows_md = format_shadows_with_tokens() + + # Generate visual previews + typography_preview_html = "" + color_ramps_preview_html = "" + llm_recs_html = "" + + try: + from core.preview_generator import ( + generate_typography_preview_html, + generate_semantic_color_ramps_html, + generate_color_ramps_preview_html, + ) + + primary_font = fonts.get("primary", "Open Sans") + desktop_typo_dict = { + name: { + "font_size": t.font_size, + "font_weight": t.font_weight, + "line_height": t.line_height, + } + 
for name, t in state.desktop_normalized.typography.items() + } + typography_preview_html = generate_typography_preview_html(desktop_typo_dict, primary_font) + + # Generate color ramps preview (semantic groups) + semantic_analysis = getattr(state, 'semantic_analysis', {}) + desktop_dict_for_colors = normalized_to_dict(state.desktop_normalized) + + if semantic_analysis: + color_ramps_preview_html = generate_semantic_color_ramps_html( + semantic_analysis=semantic_analysis, + color_tokens=desktop_dict_for_colors.get("colors", {}), + ) + else: + color_ramps_preview_html = generate_color_ramps_preview_html( + color_tokens=desktop_dict_for_colors.get("colors", {}), + ) + + state.log(" ✅ Color ramps preview generated") + + except Exception as preview_err: + state.log(f" ⚠️ Preview generation failed: {str(preview_err)[:80]}") + typography_preview_html = typography_preview_html or "
Preview unavailable
" + color_ramps_preview_html = "
Color ramps preview unavailable
" + + # Generate LLM recommendations HTML + try: + # Build recs dict in the format expected by the HTML formatter + synth_recs = {} + if final_synthesis: + # Convert list of color recs to dict keyed by role + # HeadSynthesis uses: {role, current, suggested, reason, accept} + # Formatter expects: {current, suggested, action, rationale} + color_recs_dict = {} + for rec in (final_synthesis.color_recommendations or []): + if isinstance(rec, dict) and rec.get("role"): + current_val = rec.get("current", "?") + suggested_val = rec.get("suggested", current_val) + accept = rec.get("accept", True) + reason = rec.get("reason", "") + # Determine action: if suggested differs from current, it's a change + if suggested_val and suggested_val != current_val and not accept: + action = "change" + elif suggested_val and suggested_val != current_val: + action = "change" + else: + action = "keep" + color_recs_dict[rec["role"]] = { + "current": current_val, + "suggested": suggested_val, + "action": action, + "rationale": reason, + } + synth_recs["color_recommendations"] = color_recs_dict + + # Add AA fixes from rule engine + # Formatter expects: {color, role, issue, fix, current_contrast, fixed_contrast} + aa_fixes = [] + if rule_results and rule_results.accessibility: + for a in rule_results.accessibility: + if not a.passes_aa_normal: + best_contrast = a.contrast_on_white if a.best_text_color == "#FFFFFF" else a.contrast_on_black + aa_fixes.append({ + "color": a.hex_color, + "role": a.name or "unknown", + "issue": f"Fails AA normal ({best_contrast:.1f}:1 < 4.5:1)", + "fix": a.suggested_fix or a.hex_color, + "current_contrast": f"{best_contrast:.1f}", + "fixed_contrast": f"{a.suggested_fix_contrast:.1f}" if a.suggested_fix_contrast else "—", + }) + synth_recs["accessibility_fixes"] = aa_fixes + + llm_recs_html = format_llm_color_recommendations_html( + final_recs=synth_recs, + semantic_analysis=getattr(state, 'semantic_analysis', {}), + ) + except Exception as recs_err: + state.log(f" ⚠️ 
LLM recs HTML failed: {str(recs_err)[:120]}") + import traceback + state.log(f" └─ {traceback.format_exc()[:200]}") + llm_recs_html = "
LLM recommendations unavailable
" + + # Store upgrade_recommendations for Apply Upgrades button + aa_failures_list = [] + if rule_results and rule_results.accessibility: + aa_failures_list = [ + a.to_dict() for a in rule_results.accessibility + if not a.passes_aa_normal + ] + state.upgrade_recommendations = { + "color_recommendations": (final_synthesis.color_recommendations if final_synthesis else []), + "accessibility_fixes": aa_failures_list, + "scores": (final_synthesis.scores if final_synthesis else {}), + "top_3_actions": (final_synthesis.top_3_actions if final_synthesis else []), + } + + except Exception as format_err: + state.log(f" ⚠️ Formatting failed: {str(format_err)[:100]}") + import traceback + state.log(traceback.format_exc()[:500]) + # Return minimal results (must match 16 outputs) + return ( + f"⚠️ Analysis completed with formatting errors: {str(format_err)[:50]}", + state.get_logs(), + "*Benchmark comparison unavailable*", + "
Scores unavailable
", + "
Actions unavailable
", + [], + None, + None, + "
Typography preview unavailable
", + "
Color ramps preview unavailable
", + "
LLM recommendations unavailable
", + [], # spacing_data + "*Formatting error - base colors unavailable*", # base_colors_md + "*Formatting error - color ramps unavailable*", # color_ramps_md + "*Formatting error - radius tokens unavailable*", # radius_md + "*Formatting error - shadow tokens unavailable*", # shadows_md + ) + + progress(0.95, desc="✅ Complete!") + + # Final log summary + state.log("") + state.log("═" * 60) + state.log("📊 FINAL RESULTS") + state.log("═" * 60) + state.log("") + overall_score = final_synthesis.scores.get('overall', rule_results.consistency_score) if final_synthesis else rule_results.consistency_score + state.log(f" 🎯 OVERALL SCORE: {overall_score}/100") + if final_synthesis and final_synthesis.scores: + state.log(f" ├─ Accessibility: {final_synthesis.scores.get('accessibility', '?')}/100") + state.log(f" ├─ Consistency: {final_synthesis.scores.get('consistency', '?')}/100") + state.log(f" └─ Organization: {final_synthesis.scores.get('organization', '?')}/100") + state.log("") + if benchmark_comparisons: + state.log(f" 🏆 Closest Benchmark: {benchmark_comparisons[0].benchmark.name if benchmark_comparisons else 'N/A'}") + state.log("") + state.log(" 🎯 TOP 3 ACTIONS:") + if final_synthesis and final_synthesis.top_3_actions: + for i, action in enumerate(final_synthesis.top_3_actions[:3]): + impact = action.get('impact', 'medium') + icon = "🔴" if impact == "high" else "🟡" if impact == "medium" else "🟢" + state.log(f" │ {i+1}. {icon} {action.get('action', 'N/A')}") + else: + state.log(f" │ 1. 
🔴 Fix {rule_results.aa_failures} AA compliance failures") + state.log("") + state.log("═" * 60) + state.log(f" 💰 TOTAL COST: ~$0.003") + state.log(f" ⏱️ COMPLETED: {datetime.now().strftime('%H:%M:%S')}") + state.log("═" * 60) + + return ( + status_md, + state.get_logs(), + benchmark_md, + scores_html, + actions_html, + color_recs_table, + typography_desktop_data, + typography_mobile_data, + typography_preview_html, + color_ramps_preview_html, + llm_recs_html, + spacing_data, + base_colors_md, + color_ramps_md, + radius_md, + shadows_md, + ) + + except Exception as e: + import traceback + state.log(f"❌ Critical Error: {str(e)}") + state.log(traceback.format_exc()) + error_detail = str(e).lower() + if "token" in error_detail or "auth" in error_detail or "401" in error_detail: + hint = "Your HuggingFace token may be invalid or expired. Go to **Configuration** above and re-enter your token." + elif "rate" in error_detail or "429" in error_detail: + hint = "Rate limit reached. Wait a few minutes and try again." + else: + hint = "Check the analysis log above for details. Try running the analysis again." + return create_stage2_error_response( + error_message("Analysis Failed", str(e)[:200], hint) + ) + + +def create_fallback_synthesis(rule_results, benchmark_comparisons, brand_result, best_practices): + """Create a fallback synthesis when LLM synthesis fails. + + v3: includes radius_recommendation, shadow_recommendation, perspective fields. 
def create_fallback_synthesis(rule_results, benchmark_comparisons, brand_result, best_practices):
    """Create a fallback synthesis when LLM synthesis fails.

    v3: includes radius_recommendation, shadow_recommendation, perspective fields.
    """
    try:
        from agents.llm_agents import HeadSynthesis
    except ImportError:
        # Local stand-in so callers still receive a HeadSynthesis-shaped object.
        from dataclasses import dataclass, field

        @dataclass
        class HeadSynthesis:
            executive_summary: str = ""
            scores: dict = field(default_factory=dict)
            benchmark_fit: dict = field(default_factory=dict)
            brand_analysis: dict = field(default_factory=dict)
            top_3_actions: list = field(default_factory=list)
            color_recommendations: list = field(default_factory=list)
            type_scale_recommendation: dict = field(default_factory=dict)
            spacing_recommendation: dict = field(default_factory=dict)
            radius_recommendation: dict = field(default_factory=dict)
            shadow_recommendation: dict = field(default_factory=dict)
            self_evaluation: dict = field(default_factory=dict)
            perspective_a: dict = field(default_factory=dict)
            perspective_b: dict = field(default_factory=dict)
            chosen_perspective: str = ""
            choice_reasoning: str = ""
            reasoning_trace: list = field(default_factory=list)

            def to_dict(self):
                return {k: getattr(self, k) for k in [
                    'executive_summary', 'scores', 'benchmark_fit', 'brand_analysis',
                    'top_3_actions', 'color_recommendations', 'type_scale_recommendation',
                    'spacing_recommendation', 'radius_recommendation', 'shadow_recommendation',
                    'self_evaluation', 'chosen_perspective', 'choice_reasoning',
                ]}

    # Deterministic scores straight from the rule engine (defaults when absent).
    if rule_results:
        overall = rule_results.consistency_score
        accessibility = max(0, 100 - (rule_results.aa_failures * 10))
    else:
        overall = 50
        accessibility = 50

    # Derive priority actions from rule-engine findings only.
    actions = []
    if rule_results:
        if rule_results.aa_failures > 0:
            actions.append({
                "action": f"Fix {rule_results.aa_failures} colors failing AA compliance",
                "impact": "high",
                "token_type": "color",
            })
        if not rule_results.typography.is_consistent:
            actions.append({
                "action": f"Align type scale to {rule_results.typography.recommendation} ({rule_results.typography.recommendation_name})",
                "impact": "medium",
                "token_type": "typography",
            })
        if rule_results.color_stats.unique_count > 30:
            actions.append({
                "action": f"Consolidate {rule_results.color_stats.unique_count} colors to ~15 semantic colors",
                "impact": "medium",
                "token_type": "color",
            })

    top_comparison = benchmark_comparisons[0] if benchmark_comparisons else None

    return HeadSynthesis(
        executive_summary=f"Your design system scores {overall}/100. Analysis completed with fallback synthesis.",
        scores={
            "overall": overall,
            "accessibility": accessibility,
            "consistency": overall,
            "organization": 50,
        },
        benchmark_fit={
            "closest": top_comparison.benchmark.name if top_comparison else "Unknown",
            "similarity": f"{top_comparison.overall_match_pct:.0f}%" if top_comparison else "N/A",
        },
        brand_analysis={
            "primary": brand_result.brand_primary.get("color", "Unknown") if brand_result else "Unknown",
            "cohesion": brand_result.cohesion_score if brand_result else 5,
        },
        top_3_actions=actions[:3],
        color_recommendations=[],
        type_scale_recommendation={
            "current_ratio": rule_results.typography.detected_ratio if rule_results else 1.0,
            "recommended_ratio": rule_results.typography.recommendation if rule_results else 1.25,
        },
        spacing_recommendation={
            "current": f"{rule_results.spacing.detected_base}px" if rule_results else "Unknown",
            "recommended": f"{rule_results.spacing.recommendation}px" if rule_results else "8px",
        },
        radius_recommendation={},
        shadow_recommendation={},
    )
{error_msg}
def format_stage2_status_v2(rule_results, final_synthesis, best_practices) -> str:
    """Format Stage 2 status with new architecture results."""

    overall = final_synthesis.scores.get('overall', rule_results.consistency_score)

    md = [
        "## ✅ Analysis Complete!",
        "",
        f"### 🎯 Overall Score: {overall}/100",
        "",
    ]

    # Executive summary — only when NEXUS produced one.
    if final_synthesis.executive_summary:
        md += [f"*{final_synthesis.executive_summary}*", ""]

    md += [
        "### 📊 Quick Stats",
        f"- **AA Failures:** {rule_results.aa_failures}",
        f"- **Type Scale:** {rule_results.typography.detected_ratio:.3f} ({rule_results.typography.scale_name})",
        f"- **Spacing Grid:** {rule_results.spacing.detected_base}px ({rule_results.spacing.alignment_percentage:.0f}% aligned)",
        f"- **Unique Colors:** {rule_results.color_stats.unique_count}",
        "",
        "### 💰 Cost",
        "**Total:** ~$0.003 (Rule Engine: $0 + LLM: ~$0.003)",
        "",
        "---",
        "**Next:** Review the analysis results below. Accept or reject color recommendations, "
        "choose your type scale and spacing grid, then click **'Apply Selected Upgrades'** at the bottom.",
    ]

    return "\n".join(md)


def format_benchmark_comparison_v2(benchmark_comparisons, benchmark_advice) -> str:
    """Format benchmark comparison results."""

    if not benchmark_comparisons:
        return "*No benchmark comparison available*"

    md = ["## 📊 Benchmark Comparison", ""]

    # Recommended benchmark (from ATLAS), when available.
    if benchmark_advice and benchmark_advice.recommended_benchmark_name:
        md.append(f"### 🏆 Recommended: {benchmark_advice.recommended_benchmark_name}")
        if benchmark_advice.reasoning:
            md.append(f"*{benchmark_advice.reasoning[:200]}*")
        md.append("")

    md += [
        "### 📈 Similarity Ranking",
        "",
        "| Rank | Design System | Match | Type Ratio | Base | Grid |",
        "|------|---------------|-------|------------|------|------|",
    ]

    # Top five systems; medal icons for the podium, plain rank beyond.
    medals = ["🥇", "🥈", "🥉"]
    for rank, comp in enumerate(benchmark_comparisons[:5]):
        medal = medals[rank] if rank < 3 else str(rank + 1)
        bench = comp.benchmark
        md.append(
            f"| {medal} | {bench.icon} {bench.short_name} | {comp.overall_match_pct:.0f}% | "
            f"{bench.typography.get('scale_ratio', '?')} | {bench.typography.get('base_size', '?')}px | "
            f"{bench.spacing.get('base', '?')}px |"
        )

    md.append("")

    # Concrete changes needed to align with the recommended system.
    if benchmark_advice and benchmark_advice.alignment_changes:
        md.append("### 🔧 Changes to Align")
        for change in benchmark_advice.alignment_changes[:3]:
            md.append(
                f"- **{change.get('change', '?')}**: {change.get('from', '?')} → {change.get('to', '?')} (effort: {change.get('effort', '?')})"
            )

    return "\n".join(md)
final_synthesis.scores.get('accessibility', 100 - (rule_results.aa_failures * 5)) + consistency = final_synthesis.scores.get('consistency', rule_results.consistency_score) + organization = final_synthesis.scores.get('organization', 50) + + def score_color(score): + if score >= 80: + return "#10b981" # Green + elif score >= 60: + return "#f59e0b" # Yellow + else: + return "#ef4444" # Red + + html = f""" + +
+
+
{overall}
+
OVERALL
+
+
+
{accessibility}
+
Accessibility
+
+
+
{consistency}
+
Consistency
+
+
+
{organization}
+
Organization
+
+
+ """ + + return html + + +def format_priority_actions_v2(rule_results, final_synthesis, best_practices) -> str: + """Format priority actions HTML.""" + + actions = final_synthesis.top_3_actions if final_synthesis.top_3_actions else [] + + # If no synthesis actions, build from rule engine + if not actions and best_practices and best_practices.priority_fixes: + actions = best_practices.priority_fixes + + if not actions: + # Default actions from rule engine + actions = [] + if rule_results.aa_failures > 0: + actions.append({ + "action": f"Fix {rule_results.aa_failures} colors failing AA compliance", + "impact": "high", + "effort": "30 min", + }) + if not rule_results.typography.is_consistent: + actions.append({ + "action": f"Align type scale to {rule_results.typography.recommendation} ({rule_results.typography.recommendation_name})", + "impact": "medium", + "effort": "1 hour", + }) + if rule_results.color_stats.unique_count > 30: + actions.append({ + "action": f"Consolidate {rule_results.color_stats.unique_count} colors to ~15 semantic colors", + "impact": "medium", + "effort": "2 hours", + }) + + html_items = [] + for i, action in enumerate(actions[:3]): + impact = action.get('impact', 'medium') + border_color = "#ef4444" if impact == "high" else "#f59e0b" if impact == "medium" else "#10b981" + impact_bg = "#fee2e2" if impact == "high" else "#fef3c7" if impact == "medium" else "#dcfce7" + impact_text = "#991b1b" if impact == "high" else "#92400e" if impact == "medium" else "#166534" + icon = "🔴" if impact == "high" else "🟡" if impact == "medium" else "🟢" + + html_items.append(f""" +
+
+
+
+ {icon} {action.get('action', 'N/A')} +
+
+ {action.get('details', '')} +
+
+
+ + {impact.upper()} + + + {action.get('effort', '?')} + +
+
+
+ """) + + return f""" + +
+

🎯 Priority Actions

+ {''.join(html_items)} +
+ """ + + +def format_color_recommendations_table_v2(rule_results, brand_result, final_synthesis) -> list: + """Format color recommendations as table data.""" + + rows = [] + + # Add AA failures with fixes + for a in rule_results.accessibility: + if not a.passes_aa_normal and a.suggested_fix: + role = "brand.primary" if brand_result and brand_result.brand_primary.get("color") == a.hex_color else a.name + rows.append([ + True, # Accept checkbox + role, + a.hex_color, + f"Fails AA ({a.contrast_on_white:.1f}:1)", + a.suggested_fix, + f"{a.suggested_fix_contrast:.1f}:1", + ]) + + # Add recommendations from synthesis + if final_synthesis and final_synthesis.color_recommendations: + for rec in final_synthesis.color_recommendations: + if rec.get("current") != rec.get("suggested"): + # Check if not already in rows + if not any(r[2] == rec.get("current") for r in rows): + rows.append([ + rec.get("accept", True), + rec.get("role", "unknown"), + rec.get("current", ""), + rec.get("reason", ""), + rec.get("suggested", ""), + "", + ]) + + return rows + + +def build_analysis_status(final_recs: dict, cost_tracking: dict, errors: list) -> str: + """Build status markdown from analysis results.""" + + lines = ["## 🧠 Multi-Agent Analysis Complete!"] + lines.append("") + + # Cost summary + if cost_tracking: + total_cost = cost_tracking.get("total_cost", 0) + lines.append(f"### 💰 Cost Summary") + lines.append(f"**Total estimated cost:** ${total_cost:.4f}") + lines.append(f"*(Free tier: $0.10/mo | Pro: $2.00/mo)*") + lines.append("") + + # Final recommendations + if final_recs and "final_recommendations" in final_recs: + recs = final_recs["final_recommendations"] + lines.append("### 📋 Recommendations") + + if recs.get("type_scale"): + lines.append(f"**Type Scale:** {recs['type_scale']}") + if recs.get("type_scale_rationale"): + lines.append(f" *{recs['type_scale_rationale'][:100]}*") + + if recs.get("spacing_base"): + lines.append(f"**Spacing:** {recs['spacing_base']}") + + 
lines.append("") + + # Summary + if final_recs.get("summary"): + lines.append("### 📝 Summary") + lines.append(final_recs["summary"]) + lines.append("") + + # Confidence + if final_recs.get("overall_confidence"): + lines.append(f"**Confidence:** {final_recs['overall_confidence']}%") + + # Errors + if errors: + lines.append("") + lines.append("### ⚠️ Warnings") + for err in errors[:3]: + lines.append(f"- {err[:100]}") + + return "\n".join(lines) + + +def format_multi_agent_comparison(llm1: dict, llm2: dict, final: dict) -> str: + """Format comparison from multi-agent analysis.""" + + lines = ["### 📊 Multi-Agent Analysis Comparison"] + lines.append("") + + # Agreements + if final.get("agreements"): + lines.append("#### ✅ Agreements (High Confidence)") + for a in final["agreements"][:5]: + topic = a.get("topic", "?") + finding = a.get("finding", "?")[:80] + lines.append(f"- **{topic}**: {finding}") + lines.append("") + + # Disagreements and resolutions + if final.get("disagreements"): + lines.append("#### 🔄 Resolved Disagreements") + for d in final["disagreements"][:3]: + topic = d.get("topic", "?") + resolution = d.get("resolution", "?")[:100] + lines.append(f"- **{topic}**: {resolution}") + lines.append("") + + # Score comparison + lines.append("#### 📈 Score Comparison") + lines.append("") + lines.append("| Category | LLM 1 (Qwen) | LLM 2 (Llama) |") + lines.append("|----------|--------------|---------------|") + + categories = ["typography", "colors", "accessibility", "spacing"] + for cat in categories: + llm1_score = llm1.get(cat, {}).get("score", "?") if isinstance(llm1.get(cat), dict) else "?" + llm2_score = llm2.get(cat, {}).get("score", "?") if isinstance(llm2.get(cat), dict) else "?" 
+ lines.append(f"| {cat.title()} | {llm1_score}/10 | {llm2_score}/10 |") + + return "\n".join(lines) + + +def format_spacing_comparison_from_rules(rule_calculations: dict) -> list: + """Format spacing comparison from rule engine.""" + if not rule_calculations: + return [] + + spacing_options = rule_calculations.get("spacing_options", {}) + + data = [] + for i in range(10): + current = f"{(i+1) * 4}px" if i < 5 else f"{(i+1) * 8}px" + grid_8 = spacing_options.get("8px", []) + grid_4 = spacing_options.get("4px", []) + + val_8 = f"{grid_8[i+1]}px" if i+1 < len(grid_8) else "—" + val_4 = f"{grid_4[i+1]}px" if i+1 < len(grid_4) else "—" + + data.append([current, val_8, val_4]) + + return data + + +def format_color_ramps_from_rules(rule_calculations: dict) -> str: + """Format color ramps from rule engine.""" + if not rule_calculations: + return "*No color ramps generated*" + + ramps = rule_calculations.get("color_ramps", {}) + if not ramps: + return "*No color ramps generated*" + + lines = ["### 🌈 Generated Color Ramps"] + lines.append("") + + for name, ramp in list(ramps.items())[:6]: + lines.append(f"**{name}**") + if isinstance(ramp, list) and len(ramp) >= 10: + lines.append("| 50 | 100 | 200 | 300 | 400 | 500 | 600 | 700 | 800 | 900 |") + lines.append("|---|---|---|---|---|---|---|---|---|---|") + row = "| " + " | ".join([f"`{ramp[i]}`" for i in range(10)]) + " |" + lines.append(row) + lines.append("") + + return "\n".join(lines) + + +def get_detected_fonts() -> dict: + """Get detected font information.""" + if not state.desktop_normalized: + return {"primary": "Unknown", "weights": []} + + fonts = {} + weights = set() + + for t in state.desktop_normalized.typography.values(): + family = t.font_family + weight = t.font_weight + + if family not in fonts: + fonts[family] = 0 + fonts[family] += t.frequency + + if weight: + try: + weights.add(int(weight)) + except (ValueError, TypeError): + pass + + primary = max(fonts.items(), key=lambda x: x[1])[0] if fonts else 
"Unknown" + + return { + "primary": primary, + "weights": sorted(weights) if weights else [400], + "all_fonts": fonts, + } + + +def get_base_font_size() -> int: + """Detect base font size from typography.""" + if not state.desktop_normalized: + return 16 + + # Find most common size in body range (14-18px) + sizes = {} + for t in state.desktop_normalized.typography.values(): + size_str = str(t.font_size).replace('px', '').replace('rem', '').replace('em', '') + try: + size = float(size_str) + if 14 <= size <= 18: + sizes[size] = sizes.get(size, 0) + t.frequency + except (ValueError, TypeError): + pass + + if sizes: + return int(max(sizes.items(), key=lambda x: x[1])[0]) + return 16 + + +def format_brand_comparison(recommendations) -> str: + """Format brand comparison as markdown table.""" + if not recommendations.brand_analysis: + return "*Brand analysis not available*" + + lines = [ + "### 📊 Design System Comparison (5 Top Brands)", + "", + "| Brand | Type Ratio | Base Size | Spacing | Notes |", + "|-------|------------|-----------|---------|-------|", + ] + + for brand in recommendations.brand_analysis[:5]: + name = brand.get("brand", "Unknown") + ratio = brand.get("ratio", "?") + base = brand.get("base", "?") + spacing = brand.get("spacing", "?") + notes = brand.get("notes", "")[:50] + ("..." 
if len(brand.get("notes", "")) > 50 else "") + lines.append(f"| {name} | {ratio} | {base}px | {spacing} | {notes} |") + + return "\n".join(lines) + + +def format_font_families_display(fonts: dict) -> str: + """Format detected font families for display.""" + lines = [] + + primary = fonts.get("primary", "Unknown") + weights = fonts.get("weights", [400]) + all_fonts = fonts.get("all_fonts", {}) + + lines.append(f"### Primary Font: **{primary}**") + lines.append("") + lines.append(f"**Weights detected:** {', '.join(map(str, weights))}") + lines.append("") + + if all_fonts and len(all_fonts) > 1: + lines.append("### All Fonts Detected") + lines.append("") + lines.append("| Font Family | Usage Count |") + lines.append("|-------------|-------------|") + + sorted_fonts = sorted(all_fonts.items(), key=lambda x: -x[1]) + for font, count in sorted_fonts[:5]: + lines.append(f"| {font} | {count:,} |") + + lines.append("") + lines.append("*Note: This analysis focuses on English typography only.*") + + return "\n".join(lines) + + +def format_llm_color_recommendations_html(final_recs: dict, semantic_analysis: dict) -> str: + """Generate HTML showing LLM color recommendations with before/after comparison.""" + + if not final_recs: + return ''' +
+

No LLM recommendations available yet. Run analysis first.

+
+ ''' + + color_recs = final_recs.get("color_recommendations", {}) + aa_fixes = final_recs.get("accessibility_fixes", []) + + if not color_recs and not aa_fixes: + return ''' +
+

✅ No color changes recommended. Your colors look good!

+
+ + ''' + + # Build recommendations HTML + recs_html = "" + + # Process color recommendations + for role, rec in color_recs.items(): + if not isinstance(rec, dict): + continue + if role in ["generate_ramps_for", "changes_made"]: + continue + + current = rec.get("current", "?") + suggested = rec.get("suggested", current) + action = rec.get("action", "keep") + rationale = rec.get("rationale", "") + + if action == "keep" or suggested == current: + # No change needed + recs_html += f''' +
+
+
+ {role} + {current} + ✓ Keep +
+
+ ''' + else: + # Change suggested + recs_html += f''' +
+
+
+
+ Before + {current} +
+ +
+
+ After + {suggested} +
+
+
+ {role} + {rationale[:80]}... +
+
+ ''' + + # Process accessibility fixes + for fix in aa_fixes: + if not isinstance(fix, dict): + continue + + color = fix.get("color", "?") + role = fix.get("role", "unknown") + issue = fix.get("issue", "contrast issue") + fix_color = fix.get("fix", color) + current_contrast = fix.get("current_contrast", "?") + fixed_contrast = fix.get("fixed_contrast", "?") + + if fix_color and fix_color != color: + recs_html += f''' +
+
+
+
+ ⚠️ {current_contrast}:1 + {color} +
+ +
+
+ ✓ {fixed_contrast}:1 + {fix_color} +
+
+
+ {role} + 🔴 {issue} +
+
+ ''' + + if not recs_html: + return ''' +
+

✅ No color changes recommended. Your colors look good!

+
+ + ''' + + html = f''' + + +
+ {recs_html} +
+ ''' + + return html + + +def format_llm_color_recommendations_table(final_recs: dict, semantic_analysis: dict) -> list: + """Generate table data for LLM color recommendations with accept/reject checkboxes.""" + + rows = [] + + if not final_recs: + return rows + + color_recs = final_recs.get("color_recommendations", {}) + aa_fixes = final_recs.get("accessibility_fixes", []) + + # Process color recommendations + for role, rec in color_recs.items(): + if not isinstance(rec, dict): + continue + if role in ["generate_ramps_for", "changes_made"]: + continue + + current = rec.get("current", "?") + suggested = rec.get("suggested", current) + action = rec.get("action", "keep") + rationale = rec.get("rationale", "")[:50] + + if action != "keep" and suggested != current: + # Calculate contrast improvement + try: + from core.color_utils import get_contrast_with_white + old_contrast = get_contrast_with_white(current) + new_contrast = get_contrast_with_white(suggested) + contrast_str = f"{old_contrast:.1f} → {new_contrast:.1f}" + except (ValueError, TypeError, ZeroDivisionError): + contrast_str = "?" 
def format_typography_comparison_viewport(normalized_tokens, base_size: int, viewport: str) -> list:
    """Build the typography comparison table for one viewport.

    Row shape: [token name, current size, 1.2 scale, 1.25 scale,
    1.333 scale, keep (= current)]. Mobile sizes are scaled to 87.5%
    of the desktop base.
    """
    if not normalized_tokens:
        return []

    def px_of(token):
        # Strip any unit suffix; unparseable sizes fall back to 16.
        raw = str(token.font_size).replace('px', '').replace('rem', '').replace('em', '')
        try:
            return float(raw)
        except (ValueError, TypeError):
            return 16

    # Current sizes, largest first.
    detected = sorted((px_of(t) for t in normalized_tokens.typography.values()), reverse=True)

    effective_base = (base_size if base_size else 16) * (0.875 if viewport == "mobile" else 1.0)

    # 13 token levels, largest first.
    names = [
        "display.2xl", "display.xl", "display.lg", "display.md",
        "heading.xl", "heading.lg", "heading.md", "heading.sm",
        "body.lg", "body.md", "body.sm",
        "caption", "overline",
    ]

    def even(value):
        # Round to the nearest even px for cleaner scales.
        return int(round(value / 2) * 2)

    # 13 steps per ratio; step 8 (body.lg) sits at the base size.
    ladders = {
        ratio: [even(effective_base * (float(ratio) ** (8 - step))) for step in range(13)]
        for ratio in ("1.2", "1.25", "1.333")
    }

    table = []
    for step, token_name in enumerate(names):
        detected_px = f"{int(detected[step])}px" if step < len(detected) else "—"
        table.append([
            token_name,
            detected_px,
            f"{ladders['1.2'][step]}px",
            f"{ladders['1.25'][step]}px",
            f"{ladders['1.333'][step]}px",
            detected_px,  # "keep" column mirrors the current value
        ])
    return table
def format_base_colors() -> str:
    """Render detected base colors (top 10 by frequency) as a markdown table."""
    normalized = state.desktop_normalized
    if not normalized:
        return "*No colors detected*"

    by_frequency = sorted(normalized.colors.values(), key=lambda c: -c.frequency)

    md = [
        "### 🎨 Base Colors (Detected)",
        "",
        "These are the primary colors extracted from your website:",
        "",
        "| Color | Hex | Role | Frequency | Contrast |",
        "|-------|-----|------|-----------|----------|",
    ]

    for color in by_frequency[:10]:
        # Derive a coarse role from the suggested semantic name.
        suggested = (color.suggested_name or "").lower()
        role = "Accent"
        for keyword, label in (("primary", "Primary"), ("text", "Text"),
                               ("background", "Background"), ("border", "Border")):
            if keyword in suggested:
                role = label
                break
        contrast = f"{color.contrast_white:.1f}:1" if color.contrast_white else "—"
        # 🟦 is a placeholder swatch; markdown cannot render the real color.
        md.append(f"| 🟦 | `{color.value}` | {role} | {color.frequency:,} | {contrast} |")

    return "\n".join(md)
def format_radius_with_tokens() -> str:
    """Render detected border-radius values with suggested token names."""
    normalized = state.desktop_normalized
    if not normalized or not normalized.radius:
        return "*No border radius values detected.*"

    def as_px(token):
        # Unparseable values sort last (and read as "full" below).
        raw = str(token.value).replace('px', '').replace('%', '')
        try:
            return float(raw)
        except (ValueError, TypeError):
            return 999

    ordered = sorted(normalized.radius.values(), key=as_px)

    md = [
        "### 🔘 Border Radius Tokens",
        "",
        "| Detected | Suggested Token | Usage |",
        "|----------|-----------------|-------|",
    ]

    # px range (inclusive low, exclusive high) -> token + usage hint.
    ranges = (
        (0, 2, "radius.none", "Sharp corners"),
        (2, 4, "radius.xs", "Subtle rounding"),
        (4, 6, "radius.sm", "Small elements"),
        (6, 10, "radius.md", "Buttons, cards"),
        (10, 16, "radius.lg", "Modals, panels"),
        (16, 32, "radius.xl", "Large containers"),
        (32, 100, "radius.2xl", "Pill shapes"),
    )

    for token in ordered[:8]:
        px = as_px(token)
        if "%" in str(token.value) or px >= 50:
            # Percentages and large values read as fully rounded.
            suggestion, usage = "radius.full", "Circles, avatars"
        else:
            suggestion, usage = "radius.md", "General use"
            for low, high, name, hint in ranges:
                if low <= px < high:
                    suggestion, usage = name, hint
                    break
        md.append(f"| {token.value} | `{suggestion}` | {usage} |")

    return "\n".join(md)
def format_shadows_with_tokens() -> str:
    """Render detected shadows with suggested token names and use cases."""
    normalized = state.desktop_normalized
    if not normalized or not normalized.shadows:
        return "*No shadow values detected.*"

    md = [
        "### 🌫️ Shadow Tokens",
        "",
        "| Detected Value | Suggested Token | Use Case |",
        "|----------------|-----------------|----------|",
    ]

    token_names = ["shadow.xs", "shadow.sm", "shadow.md", "shadow.lg", "shadow.xl", "shadow.2xl"]
    # Use-case hints are assigned purely by position in the detected list.
    use_cases = ["Subtle elevation", "Cards, dropdowns", "Modals, dialogs",
                 "Popovers", "Floating elements", "Dramatic effect"]

    for index, shadow in enumerate(list(normalized.shadows.values())[:6]):
        raw = str(shadow.value)
        shown = raw[:40] + ("..." if len(raw) > 40 else "")
        name = token_names[index] if index < len(token_names) else f"shadow.custom-{index}"
        use = use_cases[index] if index < len(use_cases) else "Custom"
        md.append(f"| `{shown}` | `{name}` | {use} |")

    return "\n".join(md)
def snap_to_grid(value: float, base: int) -> int:
    """Snap *value* to the nearest multiple of *base*, rounding ties up.

    The previous implementation used round(), whose banker's rounding
    snaps ties inconsistently (12 -> 16 but 20 -> 16 on an 8px grid).
    Half-up snapping makes ties always go to the larger multiple.

    Args:
        value: The raw pixel value to snap.
        base: The grid step (e.g. 4 or 8); must be non-zero.

    Returns:
        The nearest grid multiple as an int.
    """
    # Floor-division of the half-shifted value == round-half-up.
    return int((value + base / 2) // base) * base
state.log(f" ├─ ✅ ACCEPTED: {role}") + state.log(f" │ └─ {current} → {suggested}") + elif not accept: + rejected_count += 1 + state.log(f" ├─ ❌ REJECTED: {role} (keeping {current})") + + # Store accepted changes + state.selected_upgrades["color_changes"] = accepted_color_changes + + state.log("") + if accepted_color_changes: + state.log(f" 📊 {len(accepted_color_changes)} color change(s) will be applied to export") + if rejected_count: + state.log(f" 📊 {rejected_count} color change(s) rejected (keeping original)") + + state.log("") + state.log("✅ Upgrades applied! Proceed to Stage 3 for export.") + state.log("═" * 50) + + # Build visible feedback summary + summary_parts = [] + summary_parts.append(f"**Type Scale:** {type_choice}") + summary_parts.append(f"**Spacing:** {spacing_choice}") + summary_parts.append(f"**Color Ramps:** {'✅ Enabled' if apply_ramps else '❌ Disabled'}") + if accepted_color_changes: + summary_parts.append(f"**Color Changes:** {len(accepted_color_changes)} accepted") + if rejected_count: + summary_parts.append(f"**Rejected:** {rejected_count} kept as-is") + + status_md = f"""## ✅ Upgrades Applied Successfully! + +{chr(10).join('- ' + p for p in summary_parts)} + +👉 **Proceed to Stage 3** to export your upgraded tokens. 
+""" + return status_md, state.get_logs() + + +# ============================================================================= +# EXPORT HELPERS — Semantic Token Naming +# ============================================================================= + +def _get_radius_token_name(value_str, seen_names: dict = None) -> str: + """Map radius px value to semantic token name (radius.sm, radius.md, etc.).""" + val = str(value_str).replace('px', '').replace('%', '') + try: + px = float(val) + except (ValueError, TypeError): + return "radius.md" + + # Handle percentage values (e.g., "50%" for circular) + if "%" in str(value_str): + base_name = "radius.full" + # "none" is ONLY for exactly 0px + elif px == 0: + base_name = "radius.none" + elif px >= 9999: + # Large values (like 9999px) are essentially "full" + base_name = "radius.full" + else: + # Semantic naming based on pixel ranges (inclusive both ends for clarity) + mapping = [ + (1, 1, "radius.xs"), # 1px = xs + (2, 3, "radius.sm"), # 2-3px = sm + (4, 7, "radius.md"), # 4-7px = md + (8, 11, "radius.lg"), # 8-11px = lg + (12, 19, "radius.xl"), # 12-19px = xl + (20, 31, "radius.2xl"), # 20-31px = 2xl + (32, 99, "radius.3xl"), # 32-99px = 3xl + ] + base_name = "radius.md" + for low, high, name in mapping: + if low <= px <= high: + base_name = name + break + + # Handle duplicates: if two radii map to same name, append px value + if seen_names is not None: + if base_name in seen_names: + return f"{base_name}.{int(px)}" + seen_names[base_name] = True + return base_name + + +def _get_shadow_blur(value_str: str) -> float: + """Extract blur radius from shadow value for sorting.""" + import re + # Shadow format: "Xpx Ypx BLURpx SPREADpx color" + parts = re.findall(r'([\d.]+)px', str(value_str)) + if len(parts) >= 3: + return float(parts[2]) # blur is 3rd px value + elif len(parts) >= 1: + return float(parts[0]) + return 0 + + +def _parse_shadow_to_tokens_studio(value_str: str) -> dict: + """Parse CSS shadow string to Figma Tokens 
Studio boxShadow format. + + Input: "rgba(0, 0, 0, 0.5) 0px 2px 4px 0px" or "0px 2px 4px 0px rgba(0,0,0,0.5)" + Output: {"x": "0", "y": "2", "blur": "4", "spread": "0", "color": "rgba(0,0,0,0.5)", "type": "dropShadow"} + """ + import re + value_str = str(value_str).strip() + + # Extract color (rgba/rgb/hex) + color_match = re.search(r'(rgba?\([^)]+\)|#[0-9a-fA-F]{3,8})', value_str) + color = color_match.group(1) if color_match else "rgba(0,0,0,0.25)" + + # Extract px values + px_values = re.findall(r'(-?[\d.]+)px', value_str) + + # Standard order: x y blur spread + x = px_values[0] if len(px_values) > 0 else "0" + y = px_values[1] if len(px_values) > 1 else "0" + blur = px_values[2] if len(px_values) > 2 else "0" + spread = px_values[3] if len(px_values) > 3 else "0" + + # Determine if inset + shadow_type = "innerShadow" if "inset" in value_str.lower() else "dropShadow" + + return { + "x": x, + "y": y, + "blur": blur, + "spread": spread, + "color": color, + "type": shadow_type, + } + + +# ============================================================================= +# W3C DTCG FORMAT HELPERS +# ============================================================================= + +def _flat_key_to_nested(flat_key: str, value: dict, root: dict): + """Convert 'color.brand.primary' into nested dict structure. + + Example: _flat_key_to_nested('color.brand.primary', token, {}) + Result: {'color': {'brand': {'primary': token}}} + """ + parts = flat_key.split('.') + current = root + for part in parts[:-1]: + if part not in current: + current[part] = {} + current = current[part] + current[parts[-1]] = value + + +def _to_dtcg_token(value, token_type: str, description: str = None, source: str = None) -> dict: + """Wrap value in W3C DTCG format with $value, $type, $description. 
+ + Args: + value: The token value + token_type: W3C DTCG type (color, typography, dimension, shadow) + description: Optional human-readable description + source: Optional source indicator (extracted, recommended, semantic) + """ + token = {"$type": token_type, "$value": value} + if description: + token["$description"] = description + if source: + token["$description"] = f"[{source}] {description or ''}" + return token + + +def _shadow_to_dtcg(shadow_dict: dict) -> dict: + """Convert our internal shadow format to W3C DTCG shadow spec. + + Input: {"x": "0", "y": "2", "blur": "4", "spread": "0", "color": "rgba(...)"} + Output: {"color": "...", "offsetX": "0px", "offsetY": "2px", "blur": "4px", "spread": "0px"} + """ + return { + "color": shadow_dict.get("color", "rgba(0,0,0,0.25)"), + "offsetX": str(shadow_dict.get("x", "0")) + "px", + "offsetY": str(shadow_dict.get("y", "0")) + "px", + "blur": str(shadow_dict.get("blur", "0")) + "px", + "spread": str(shadow_dict.get("spread", "0")) + "px", + } + + +def _get_semantic_color_overrides() -> dict: + """Build color hex->semantic name map. + + v3: AURORA naming_map is the SINGLE naming authority. + Falls back to normalizer suggested_name, then _generate_color_name_from_hex. + """ + overrides = {} # hex -> semantic_name + + # PRIMARY: AURORA's naming_map (covers ALL colors if critic passed) + brand_result = getattr(state, 'brand_result', None) + if brand_result: + naming_map = getattr(brand_result, 'naming_map', None) + if isinstance(naming_map, dict) and naming_map: + for hex_val, name in naming_map.items(): + hex_clean = str(hex_val).strip().lower() + if hex_clean.startswith('#') and name: + # Ensure color. 
prefix + clean_name = name if name.startswith('color.') else f'color.{name}' + overrides[hex_clean] = clean_name + + return overrides + + +def _is_valid_hex_color(value: str) -> bool: + """Validate that a string is a proper hex color (not CSS garbage).""" + import re + if not value or not isinstance(value, str): + return False + # Must be exactly #RGB, #RRGGBB, or #RRGGBBAA + clean = value.strip().lower() + return bool(re.match(r'^#([a-f0-9]{3}|[a-f0-9]{6}|[a-f0-9]{8})$', clean)) + + +def _generate_color_name_from_hex(hex_val: str, used_names: set = None) -> str: + """Generate a semantic color name based on the color's HSL characteristics. + + Returns names like: color.neutral.400, color.blue.500, color.red.300 + Uses standard design system naming conventions. + """ + import colorsys + + used_names = used_names or set() + + # Parse hex to RGB + hex_clean = hex_val.lstrip('#').lower() + if len(hex_clean) == 3: + hex_clean = ''.join([c*2 for c in hex_clean]) + + try: + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = int(hex_clean[4:6], 16) / 255 + except (ValueError, IndexError): + return "color.other.base" + + # Convert to HSL + h, l, s = colorsys.rgb_to_hls(r, g, b) + hue = h * 360 + saturation = s + lightness = l + + # Determine color family based on hue (for saturated colors) + if saturation < 0.1: + # Grayscale / neutral + color_family = "neutral" + else: + # Map hue to color name + if hue < 15 or hue >= 345: + color_family = "red" + elif hue < 45: + color_family = "orange" + elif hue < 75: + color_family = "yellow" + elif hue < 150: + color_family = "green" + elif hue < 195: + color_family = "teal" + elif hue < 255: + color_family = "blue" + elif hue < 285: + color_family = "purple" + elif hue < 345: + color_family = "pink" + else: + color_family = "red" + + # Determine shade based on lightness (100-900 scale) + if lightness >= 0.95: + shade = "50" + elif lightness >= 0.85: + shade = "100" + elif lightness >= 0.75: + shade = "200" + 
    elif lightness >= 0.65:
        shade = "300"
    elif lightness >= 0.50:
        shade = "400"
    elif lightness >= 0.40:
        shade = "500"
    elif lightness >= 0.30:
        shade = "600"
    elif lightness >= 0.20:
        shade = "700"
    elif lightness >= 0.10:
        shade = "800"
    else:
        # Darkest band: lightness < 0.10
        shade = "900"

    # Generate base name
    base_name = f"color.{color_family}.{shade}"

    # Handle conflicts by adding suffix
    # (the first duplicate becomes "<name>_2"; the unsuffixed name is
    # implicitly the first occurrence, so "_1" is never emitted)
    final_name = base_name
    suffix = 1
    while final_name in used_names:
        suffix += 1
        final_name = f"{base_name}_{suffix}"

    return final_name


def _consolidate_colors(colors_dict: dict, overrides: dict, max_colors: int = 30) -> dict:
    """Consolidate colors: semantic first, then top by frequency, capped.

    Semantic overrides (hex -> token name) are always kept in the output;
    every other valid color is ranked by observed frequency and truncated so
    the total stays within ``max_colors``. Entries of ``colors_dict`` may be
    token objects (``.value`` / ``.frequency`` attributes) or plain dicts
    with ``"value"`` / ``"frequency"`` keys; values that are not well-formed
    hex colors are silently dropped.

    Returns a dict mapping dotted token names to
    ``{"value": hex, "type": "color", "source": "semantic"|"detected"}``.
    """
    if not colors_dict:
        return {}

    result = {}
    remaining = []
    used_generated_names = set()  # Track generated names to avoid conflicts

    for name, c in colors_dict.items():
        # Support both attribute-style token objects and plain dicts.
        hex_val = c.value.lower() if hasattr(c, 'value') else str(c.get('value', '')).lower()

        # IMPORTANT: Skip invalid/garbage color values (CSS parsing errors)
        if not _is_valid_hex_color(hex_val):
            continue

        freq = c.frequency if hasattr(c, 'frequency') else c.get('frequency', 0)

        # Check if this color has a semantic override
        semantic_name = overrides.get(hex_val)
        if semantic_name:
            result[semantic_name] = {
                "value": hex_val,
                "type": "color",
                "source": "semantic",
            }
        else:
            # Check for garbage names (firecrawl.N, numeric-only, etc.)
+ base_name = (c.suggested_name if hasattr(c, 'suggested_name') else name) or name + clean_name = base_name.replace(" ", ".").replace("_", ".").lower() + + # Detect garbage names and generate proper color-based names + is_garbage_name = ( + 'firecrawl' in clean_name.lower() or + clean_name.split('.')[-1].isdigit() or # Ends with just a number + len(clean_name.split('.')) == 2 and clean_name.split('.')[-1].isdigit() # color.34 + ) + + if is_garbage_name: + # Generate proper name based on color characteristics + clean_name = _generate_color_name_from_hex(hex_val, used_generated_names) + used_generated_names.add(clean_name) + elif not clean_name.startswith("color."): + clean_name = f"color.{clean_name}" + + remaining.append((clean_name, hex_val, freq)) + + # Sort remaining by frequency (highest first), take up to max + remaining.sort(key=lambda x: -x[2]) + slots_left = max_colors - len(result) + for clean_name, hex_val, freq in remaining[:slots_left]: + if clean_name not in result: + result[clean_name] = { + "value": hex_val, + "type": "color", + "source": "detected", + } + + return result + + +def export_stage1_json(): + """Export Stage 1 tokens (as-is extraction) to W3C DTCG format.""" + if not state.desktop_normalized: + gr.Warning("No tokens extracted yet. 
Complete Stage 1 extraction first.") + return json.dumps({ + "error": "No tokens extracted yet.", + "how_to_fix": "Go to Step 1, enter a URL, discover pages, and extract tokens first.", + "stage": "Stage 1 required" + }, indent=2) + + # W3C DTCG format: nested structure, no wrapper, $value/$type + result = {} + token_count = 0 + + # ========================================================================= + # COLORS — Nested structure with $value, $type, $description + # ========================================================================= + if state.desktop_normalized and state.desktop_normalized.colors: + overrides = _get_semantic_color_overrides() + consolidated = _consolidate_colors( + state.desktop_normalized.colors, overrides, max_colors=30, + ) + for flat_key, entry in consolidated.items(): + # flat_key = "color.brand.primary" + source = entry.get("source", "extracted") + source_label = "LLM Semantic" if source == "semantic" else "Auto-Generated" if source == "detected" else "Extracted" + dtcg_token = _to_dtcg_token(entry["value"], "color", description=source_label) + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # TYPOGRAPHY — Nested structure with viewport suffix + # ========================================================================= + # Desktop typography + if state.desktop_normalized and state.desktop_normalized.typography: + for name, t in state.desktop_normalized.typography.items(): + base_name = t.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("font."): + clean_name = f"font.{clean_name}" + + flat_key = f"{clean_name}.desktop" + typo_value = { + "fontFamily": t.font_family, + "fontSize": t.font_size, + "fontWeight": str(t.font_weight), + "lineHeight": t.line_height or "1.5", + } + dtcg_token = _to_dtcg_token(typo_value, "typography", 
description="Extracted from site") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # Mobile typography + if state.mobile_normalized and state.mobile_normalized.typography: + for name, t in state.mobile_normalized.typography.items(): + base_name = t.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("font."): + clean_name = f"font.{clean_name}" + + flat_key = f"{clean_name}.mobile" + typo_value = { + "fontFamily": t.font_family, + "fontSize": t.font_size, + "fontWeight": str(t.font_weight), + "lineHeight": t.line_height or "1.5", + } + dtcg_token = _to_dtcg_token(typo_value, "typography", description="Extracted from site") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # SPACING — Nested structure with viewport suffix + # ========================================================================= + # Desktop spacing + if state.desktop_normalized and state.desktop_normalized.spacing: + for name, s in state.desktop_normalized.spacing.items(): + base_name = s.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("space."): + clean_name = f"space.{clean_name}" + + flat_key = f"{clean_name}.desktop" + dtcg_token = _to_dtcg_token(s.value, "dimension", description="Extracted from site") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # Mobile spacing + if state.mobile_normalized and state.mobile_normalized.spacing: + for name, s in state.mobile_normalized.spacing.items(): + base_name = s.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("space."): + clean_name = f"space.{clean_name}" + + flat_key = f"{clean_name}.mobile" + dtcg_token = 
_to_dtcg_token(s.value, "dimension", description="Extracted from site") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # BORDER RADIUS — Nested structure (DTCG uses "dimension" type for radii) + # ========================================================================= + if state.desktop_normalized and state.desktop_normalized.radius: + seen_radius = {} + for name, r in state.desktop_normalized.radius.items(): + token_name = _get_radius_token_name(r.value, seen_radius) + # Convert "radius.md" to nested: radius.md (keep as "radius" for consistency) + flat_key = token_name + dtcg_token = _to_dtcg_token(r.value, "dimension", description="Extracted from site") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # SHADOWS — W3C DTCG shadow format + # ========================================================================= + if state.desktop_normalized and state.desktop_normalized.shadows: + shadow_names = ["xs", "sm", "md", "lg", "xl", "2xl"] + sorted_shadows = sorted( + state.desktop_normalized.shadows.items(), + key=lambda x: _get_shadow_blur(x[1].value), + ) + for i, (name, s) in enumerate(sorted_shadows): + size_name = shadow_names[i] if i < len(shadow_names) else str(i + 1) + flat_key = f"shadow.{size_name}" + # Parse CSS shadow and convert to DTCG format + parsed = _parse_shadow_to_tokens_studio(s.value) + dtcg_shadow_value = _shadow_to_dtcg(parsed) + dtcg_token = _to_dtcg_token(dtcg_shadow_value, "shadow", description="Extracted from site") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + json_str = json.dumps(result, indent=2, default=str) + gr.Info(f"Stage 1 exported: {token_count} tokens (W3C DTCG format)") + return json_str + + +def export_tokens_json(): + """Export final tokens with selected upgrades applied - FLAT structure 
for Figma Tokens Studio.""" + if not state.desktop_normalized: + gr.Warning("No tokens extracted yet. Complete Stage 1 extraction first.") + return json.dumps({ + "error": "No tokens extracted yet.", + "how_to_fix": "Complete Stage 1 extraction first, then optionally run Stage 2 analysis before exporting.", + "stage": "Stage 1 required" + }, indent=2) + + # Get selected upgrades + upgrades = getattr(state, 'selected_upgrades', {}) + if not upgrades: + state.log("⚠️ Exporting final JSON without Stage 2 upgrades applied. Consider running Stage 2 analysis first.") + type_scale_choice = upgrades.get('type_scale', 'Keep Current') + spacing_choice = upgrades.get('spacing', 'Keep Current') + apply_ramps = upgrades.get('color_ramps', True) + + # Determine ratio from choice + ratio = None + if "1.2" in type_scale_choice: + ratio = 1.2 + elif "1.25" in type_scale_choice: + ratio = 1.25 + elif "1.333" in type_scale_choice: + ratio = 1.333 + + # Determine spacing base + spacing_base = None + if "8px" in spacing_choice: + spacing_base = 8 + elif "4px" in spacing_choice: + spacing_base = 4 + + # W3C DTCG format: nested structure, no wrapper + result = {} + token_count = 0 + + fonts_info = get_detected_fonts() + primary_font = fonts_info.get("primary", "sans-serif") + + # ========================================================================= + # COLORS — Consolidated with semantic naming + optional ramps + # ========================================================================= + if state.desktop_normalized and state.desktop_normalized.colors: + from core.color_utils import generate_color_ramp + + overrides = _get_semantic_color_overrides() + consolidated = _consolidate_colors( + state.desktop_normalized.colors, overrides, max_colors=30, + ) + + for flat_key, entry in consolidated.items(): + if apply_ramps: + try: + ramp = generate_color_ramp(entry["value"]) + shades = ["50", "100", "200", "300", "400", "500", "600", "700", "800", "900", "950"] + for i, shade in 
enumerate(shades): + if i < len(ramp): + shade_key = f"{flat_key}.{shade}" + color_val = ramp[i] if isinstance(ramp[i], str) else ramp[i].get("hex", entry["value"]) + dtcg_token = _to_dtcg_token(color_val, "color") + _flat_key_to_nested(shade_key, dtcg_token, result) + token_count += 1 + except (ValueError, KeyError, TypeError, IndexError): + dtcg_token = _to_dtcg_token(entry["value"], "color") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + else: + dtcg_token = _to_dtcg_token(entry["value"], "color") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # TYPOGRAPHY - FLAT structure with viewport suffix + # ========================================================================= + base_size = get_base_font_size() + token_names = [ + "font.display.2xl", "font.display.xl", "font.display.lg", "font.display.md", + "font.heading.xl", "font.heading.lg", "font.heading.md", "font.heading.sm", + "font.body.lg", "font.body.md", "font.body.sm", "font.caption", "font.overline" + ] + + # Weight + lineHeight mapping by token tier + _weight_map = { + "display": "700", "heading": "600", + "body": "400", "caption": "400", "overline": "500", + } + _lh_map = { + "display": "1.2", "heading": "1.3", + "body": "1.5", "caption": "1.4", "overline": "1.2", + } + + def _tier_from_token(token_name: str) -> str: + """Extract tier (display/heading/body/caption/overline) from token name.""" + for tier in ("display", "heading", "body", "caption", "overline"): + if tier in token_name: + return tier + return "body" + + # Desktop typography — W3C DTCG format + if ratio: + scales = [int(round(base_size * (ratio ** (8-i)) / 2) * 2) for i in range(13)] + for i, token_name in enumerate(token_names): + tier = _tier_from_token(token_name) + flat_key = f"{token_name}.desktop" + typo_value = { + "fontFamily": primary_font, + "fontSize": f"{scales[i]}px", + "fontWeight": 
_weight_map.get(tier, "400"), + "lineHeight": _lh_map.get(tier, "1.5"), + } + dtcg_token = _to_dtcg_token(typo_value, "typography") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + elif state.desktop_normalized and state.desktop_normalized.typography: + for name, t in state.desktop_normalized.typography.items(): + base_name = t.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("font."): + clean_name = f"font.{clean_name}" + + flat_key = f"{clean_name}.desktop" + typo_value = { + "fontFamily": t.font_family, + "fontSize": t.font_size, + "fontWeight": str(t.font_weight), + "lineHeight": t.line_height or "1.5", + } + dtcg_token = _to_dtcg_token(typo_value, "typography") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # Mobile typography — W3C DTCG format + if ratio: + mobile_factor = 0.875 + scales = [int(round(base_size * mobile_factor * (ratio ** (8-i)) / 2) * 2) for i in range(13)] + for i, token_name in enumerate(token_names): + tier = _tier_from_token(token_name) + flat_key = f"{token_name}.mobile" + typo_value = { + "fontFamily": primary_font, + "fontSize": f"{scales[i]}px", + "fontWeight": _weight_map.get(tier, "400"), + "lineHeight": _lh_map.get(tier, "1.5"), + } + dtcg_token = _to_dtcg_token(typo_value, "typography") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + elif state.mobile_normalized and state.mobile_normalized.typography: + for name, t in state.mobile_normalized.typography.items(): + base_name = t.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("font."): + clean_name = f"font.{clean_name}" + + flat_key = f"{clean_name}.mobile" + typo_value = { + "fontFamily": t.font_family, + "fontSize": t.font_size, + "fontWeight": str(t.font_weight), + "lineHeight": t.line_height or "1.5", + } + dtcg_token = 
_to_dtcg_token(typo_value, "typography") + _flat_key_to_nested(flat_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # SPACING — W3C DTCG format with nested structure + # ========================================================================= + spacing_token_names = [ + "space.1", "space.2", "space.3", "space.4", "space.5", + "space.6", "space.8", "space.10", "space.12", "space.16" + ] + + if spacing_base: + # Generate grid-aligned spacing for both viewports + for i, token_name in enumerate(spacing_token_names): + value = spacing_base * (i + 1) + + # Desktop + desktop_key = f"{token_name}.desktop" + dtcg_token = _to_dtcg_token(f"{value}px", "dimension") + _flat_key_to_nested(desktop_key, dtcg_token, result) + token_count += 1 + + # Mobile (same values) + mobile_key = f"{token_name}.mobile" + dtcg_token = _to_dtcg_token(f"{value}px", "dimension") + _flat_key_to_nested(mobile_key, dtcg_token, result) + token_count += 1 + else: + # Keep original with nested structure + if state.desktop_normalized and state.desktop_normalized.spacing: + for name, s in state.desktop_normalized.spacing.items(): + base_name = s.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("space."): + clean_name = f"space.{clean_name}" + + desktop_key = f"{clean_name}.desktop" + dtcg_token = _to_dtcg_token(s.value, "dimension") + _flat_key_to_nested(desktop_key, dtcg_token, result) + token_count += 1 + + if state.mobile_normalized and state.mobile_normalized.spacing: + for name, s in state.mobile_normalized.spacing.items(): + base_name = s.suggested_name or name + clean_name = base_name.replace(" ", ".").replace("_", ".").replace("-", ".").lower() + if not clean_name.startswith("space."): + clean_name = f"space.{clean_name}" + + mobile_key = f"{clean_name}.mobile" + dtcg_token = _to_dtcg_token(s.value, "dimension") + 
_flat_key_to_nested(mobile_key, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # BORDER RADIUS — W3C DTCG format (uses "dimension" type per spec) + # ========================================================================= + if state.desktop_normalized and state.desktop_normalized.radius: + seen_radius = {} + for name, r in state.desktop_normalized.radius.items(): + token_name = _get_radius_token_name(r.value, seen_radius) + # DTCG uses "dimension" for radii, not "borderRadius" + dtcg_token = _to_dtcg_token(r.value, "dimension") + _flat_key_to_nested(token_name, dtcg_token, result) + token_count += 1 + + # ========================================================================= + # SHADOWS — W3C DTCG format with shadow spec + # ========================================================================= + if state.desktop_normalized and state.desktop_normalized.shadows: + shadow_names = ["shadow.xs", "shadow.sm", "shadow.md", "shadow.lg", "shadow.xl", "shadow.2xl"] + sorted_shadows = sorted( + state.desktop_normalized.shadows.items(), + key=lambda x: _get_shadow_blur(x[1].value), + ) + for i, (name, s) in enumerate(sorted_shadows): + token_name = shadow_names[i] if i < len(shadow_names) else f"shadow.{i + 1}" + # Convert to DTCG shadow format + shadow_value = _shadow_to_dtcg(_parse_shadow_to_tokens_studio(s.value)) + dtcg_token = _to_dtcg_token(shadow_value, "shadow") + _flat_key_to_nested(token_name, dtcg_token, result) + token_count += 1 + + json_str = json.dumps(result, indent=2, default=str) + upgrades_note = " (with upgrades)" if upgrades else " (no upgrades applied)" + gr.Info(f"Final export: {token_count} tokens{upgrades_note}") + return json_str + + +# ============================================================================= +# UI BUILDING +# ============================================================================= + +def create_ui(): + """Create the Gradio interface with 
corporate branding.""" + + # Corporate theme customization + corporate_theme = gr.themes.Base( + primary_hue=gr.themes.colors.blue, + secondary_hue=gr.themes.colors.slate, + neutral_hue=gr.themes.colors.slate, + font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"], + font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "monospace"], + ).set( + # Colors + body_background_fill="#f8fafc", + body_background_fill_dark="#0f172a", + block_background_fill="white", + block_background_fill_dark="#1e293b", + block_border_color="#e2e8f0", + block_border_color_dark="#334155", + block_label_background_fill="#f1f5f9", + block_label_background_fill_dark="#1e293b", + block_title_text_color="#0f172a", + block_title_text_color_dark="#f1f5f9", + + # Primary button + button_primary_background_fill="#2563eb", + button_primary_background_fill_hover="#1d4ed8", + button_primary_text_color="white", + + # Secondary button + button_secondary_background_fill="#f1f5f9", + button_secondary_background_fill_hover="#e2e8f0", + button_secondary_text_color="#1e293b", + + # Input fields + input_background_fill="#ffffff", + input_background_fill_dark="#1e293b", + input_border_color="#cbd5e1", + input_border_color_dark="#475569", + + # Shadows and radius + block_shadow="0 1px 3px rgba(0,0,0,0.1)", + block_shadow_dark="0 1px 3px rgba(0,0,0,0.3)", + block_border_width="1px", + block_radius="8px", + + # Text + body_text_color="#1e293b", + body_text_color_dark="#e2e8f0", + body_text_size="14px", + ) + + # Custom CSS for additional styling + custom_css = """ + /* Global styles */ + .gradio-container { + max-width: 1400px !important; + margin: 0 auto !important; + } + + /* Header branding */ + .app-header { + background: linear-gradient(135deg, #1e40af 0%, #3b82f6 100%); + padding: 24px 32px; + border-radius: 12px; + margin-bottom: 24px; + color: white; + } + .app-header h1 { + margin: 0 0 8px 0; + font-size: 28px; + font-weight: 700; + } + .app-header p { + margin: 
0; + opacity: 0.9; + font-size: 14px; + } + + /* Stage indicators */ + .stage-header { + background: linear-gradient(90deg, #f1f5f9 0%, #ffffff 100%); + padding: 16px 20px; + border-radius: 8px; + border-left: 4px solid #2563eb; + margin-bottom: 16px; + } + .stage-header h2 { + margin: 0; + font-size: 18px; + color: #1e293b; + } + + /* Log styling */ + .log-container textarea { + font-family: 'JetBrains Mono', monospace !important; + font-size: 12px !important; + line-height: 1.6 !important; + background: #0f172a !important; + color: #e2e8f0 !important; + border-radius: 8px !important; + } + + /* Color swatch */ + .color-swatch { + display: inline-block; + width: 24px; + height: 24px; + border-radius: 4px; + margin-right: 8px; + vertical-align: middle; + border: 1px solid rgba(0,0,0,0.1); + } + + /* Score badges */ + .score-badge { + display: inline-block; + padding: 4px 12px; + border-radius: 20px; + font-weight: 600; + font-size: 13px; + } + .score-badge.high { background: #dcfce7; color: #166534; } + .score-badge.medium { background: #fef3c7; color: #92400e; } + .score-badge.low { background: #fee2e2; color: #991b1b; } + + /* Benchmark cards */ + .benchmark-card { + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 16px; + margin-bottom: 12px; + } + .benchmark-card.selected { + border-color: #2563eb; + background: #eff6ff; + } + + /* Action items */ + .action-item { + background: white; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 16px; + margin-bottom: 8px; + } + .action-item.high-priority { + border-left: 4px solid #ef4444; + } + .action-item.medium-priority { + border-left: 4px solid #f59e0b; + } + + /* Progress indicator */ + .progress-bar { + height: 4px; + background: #e2e8f0; + border-radius: 2px; + overflow: hidden; + } + .progress-bar-fill { + height: 100%; + background: linear-gradient(90deg, #2563eb, #3b82f6); + transition: width 0.3s ease; + } + + /* Accordion styling */ + .accordion-header { + 
font-weight: 600 !important; + } + + /* Table styling */ + table { + border-collapse: collapse; + width: 100%; + } + th { + background: #f1f5f9; + color: #1e293b; + padding: 12px; + text-align: left; + font-weight: 600; + border-bottom: 2px solid #e2e8f0; + } + td { + padding: 12px; + color: #1e293b; + border-bottom: 1px solid #e2e8f0; + } + + /* Section descriptions */ + .section-desc p, .section-desc { + font-size: 13px !important; + color: #64748b !important; + line-height: 1.5 !important; + margin-top: -4px !important; + margin-bottom: 12px !important; + } + .dark .section-desc p, .dark .section-desc { + color: #94a3b8 !important; + } + + /* Success messages */ + .success-msg { background: #f0fdf4; border: 1px solid #bbf7d0; border-radius: 8px; padding: 16px; margin: 8px 0; } + .success-msg h2 { color: #166534 !important; } + .dark .success-msg { background: #052e16 !important; border-color: #166534 !important; } + .dark .success-msg h2 { color: #bbf7d0 !important; } + .dark .success-msg p { color: #d1d5db !important; } + + /* Error messages */ + .error-msg { background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; padding: 16px; margin: 8px 0; } + .error-msg h2 { color: #991b1b !important; } + .dark .error-msg { background: #450a0a !important; border-color: #991b1b !important; } + .dark .error-msg h2 { color: #fecaca !important; } + .dark .error-msg p { color: #d1d5db !important; } + + /* Placeholder messages */ + .placeholder-msg { + padding: 20px; + background: #f5f5f5; + border-radius: 8px; + color: #666; + } + .placeholder-msg.placeholder-lg { + padding: 40px; + text-align: center; + } + + /* Progress bar */ + .progress-bar { + background: #e2e8f0; + } + + /* Dark mode adjustments */ + .dark .stage-header { + background: linear-gradient(90deg, #1e293b 0%, #0f172a 100%); + border-left-color: #3b82f6; + } + .dark .stage-header h2 { + color: #f1f5f9; + } + .dark .stage-header-subtitle, + .dark .tip-text { + color: #94a3b8 !important; + } + .dark 
.benchmark-card { + background: #1e293b; + border-color: #334155; + } + .dark .action-item { + background: #1e293b; + border-color: #475569; + color: #e2e8f0; + } + /* Dark mode: Placeholder messages */ + .dark .placeholder-msg { + background: #1e293b !important; + color: #94a3b8 !important; + } + /* Dark mode: Progress bar */ + .dark .progress-bar { + background: #334155 !important; + } + /* Dark mode: Gradio Dataframe tables */ + .dark table th { + background: #1e293b !important; + color: #e2e8f0 !important; + border-bottom-color: #475569 !important; + } + .dark table td { + color: #e2e8f0 !important; + border-bottom-color: #334155 !important; + } + .dark table tr { + background: #0f172a !important; + } + .dark table tr:nth-child(even) { + background: #1e293b !important; + } + /* Dark mode: HTML preview tables (typography, benchmarks) */ + .dark .typography-preview { + background: #1e293b !important; + } + .dark .typography-preview th { + background: #334155 !important; + color: #e2e8f0 !important; + border-bottom-color: #475569 !important; + } + .dark .typography-preview td { + color: #e2e8f0 !important; + } + .dark .typography-preview .meta-row { + background: #1e293b !important; + border-top-color: #334155 !important; + } + .dark .typography-preview .scale-name, + .dark .typography-preview .scale-label { + color: #f1f5f9 !important; + background: #475569 !important; + } + .dark .typography-preview .meta { + color: #cbd5e1 !important; + } + .dark .typography-preview .preview-cell { + background: #0f172a !important; + border-bottom-color: #334155 !important; + } + .dark .typography-preview .preview-text { + color: #f1f5f9 !important; + } + .dark .typography-preview tr:hover .preview-cell { + background: #1e293b !important; + } + + /* Dark mode: Colors AS-IS preview */ + .dark .colors-asis-header { + color: #e2e8f0 !important; + background: #1e293b !important; + } + .dark .colors-asis-preview { + background: #0f172a !important; + } + .dark .color-row-asis { + 
background: #1e293b !important; + border-color: #475569 !important; + } + .dark .color-name-asis { + color: #f1f5f9 !important; + } + .dark .frequency { + color: #cbd5e1 !important; + } + .dark .color-meta-asis .aa-pass { + color: #22c55e !important; + background: #14532d !important; + } + .dark .color-meta-asis .aa-fail { + color: #f87171 !important; + background: #450a0a !important; + } + .dark .context-badge { + background: #334155 !important; + color: #e2e8f0 !important; + } + + /* Dark mode: Color ramps preview */ + .dark .color-ramps-preview { + background: #0f172a !important; + } + .dark .ramps-header-info { + color: #e2e8f0 !important; + background: #1e293b !important; + } + .dark .ramp-header { + background: #1e293b !important; + } + .dark .ramp-header-label { + color: #cbd5e1 !important; + } + .dark .color-row { + background: #1e293b !important; + border-color: #475569 !important; + } + .dark .color-name { + color: #f1f5f9 !important; + background: #475569 !important; + } + .dark .color-hex { + color: #cbd5e1 !important; + } + + /* Dark mode: Spacing preview */ + .dark .spacing-asis-preview { + background: #0f172a !important; + } + .dark .spacing-row-asis { + background: #1e293b !important; + } + .dark .spacing-label { + color: #f1f5f9 !important; + } + + /* Dark mode: Radius preview */ + .dark .radius-asis-preview { + background: #0f172a !important; + } + .dark .radius-item { + background: #1e293b !important; + } + .dark .radius-label { + color: #f1f5f9 !important; + } + + /* Dark mode: Shadows preview */ + .dark .shadows-asis-preview { + background: #0f172a !important; + } + .dark .shadow-item { + background: #1e293b !important; + } + .dark .shadow-box { + background: #334155 !important; + } + .dark .shadow-label { + color: #f1f5f9 !important; + } + .dark .shadow-value { + color: #94a3b8 !important; + } + + /* Dark mode: Semantic color ramps */ + .dark .sem-ramps-preview { + background: #0f172a !important; + } + .dark .sem-category { + background: 
#1e293b !important; + border-color: #475569 !important; + } + .dark .sem-cat-title { + color: #f1f5f9 !important; + border-bottom-color: #475569 !important; + } + .dark .sem-color-row { + background: #0f172a !important; + border-color: #334155 !important; + } + .dark .sem-role { + color: #f1f5f9 !important; + } + .dark .sem-hex { + color: #cbd5e1 !important; + } + .dark .llm-rec { + background: #422006 !important; + border-color: #b45309 !important; + } + .dark .rec-label { + color: #fbbf24 !important; + } + .dark .rec-issue { + color: #fde68a !important; + } + .dark .rec-arrow { + color: #fbbf24 !important; + } + .dark .llm-summary { + background: #1e3a5f !important; + border-color: #3b82f6 !important; + } + .dark .llm-summary h4 { + color: #93c5fd !important; + } + .dark .llm-summary ul, + .dark .llm-summary li { + color: #bfdbfe !important; + } + + /* Dark mode: Score badges */ + .dark .score-badge.high { background: #14532d; color: #86efac; } + .dark .score-badge.medium { background: #422006; color: #fde68a; } + .dark .score-badge.low { background: #450a0a; color: #fca5a5; } + + /* Dark mode: Benchmark & action cards */ + .dark .benchmark-card.selected { + border-color: #3b82f6; + background: #1e3a5f; + } + .dark .action-item.high-priority { + border-left-color: #ef4444; + } + .dark .action-item.medium-priority { + border-left-color: #f59e0b; + } + + /* Dark mode: Gradio markdown rendered tables */ + .dark .prose table th, + .dark .markdown-text table th { + background: #1e293b !important; + color: #e2e8f0 !important; + border-color: #475569 !important; + } + .dark .prose table td, + .dark .markdown-text table td { + color: #e2e8f0 !important; + border-color: #334155 !important; + } + .dark .prose table tr, + .dark .markdown-text table tr { + background: #0f172a !important; + } + .dark .prose table tr:nth-child(even), + .dark .markdown-text table tr:nth-child(even) { + background: #1e293b !important; + } + + /* Dark mode: Generic text in HTML components */ + 
.dark .gradio-html p, + .dark .gradio-html span, + .dark .gradio-html div { + color: #e2e8f0; + } + """ + + with gr.Blocks( + title="Design System Extractor v2", + theme=corporate_theme, + css=custom_css + ) as app: + + # Header with branding + gr.HTML(""" +
+

🎨 Design System Extractor v2

+

Reverse-engineer design systems from live websites • AI-powered analysis • Figma-ready export

+
+ """) + gr.Markdown("This tool works in **3 stages**: (1) Discover & extract design tokens from a live website, " + "(2) Run AI-powered analysis to benchmark and improve your tokens, " + "(3) Export Figma-ready JSON. Start by entering a URL below.", + elem_classes=["section-desc"]) + + # ================================================================= + # CONFIGURATION + # ================================================================= + + with gr.Accordion("⚙️ Configuration", open=not bool(HF_TOKEN_FROM_ENV)): + gr.Markdown("**HuggingFace Token** — Required for Stage 2 AI analysis (LLM agents). " + "Get a free token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). " + "Stage 1 (extraction) works without a token. If set as an environment variable, it loads automatically.", + elem_classes=["section-desc"]) + with gr.Row(): + hf_token_input = gr.Textbox( + label="HF Token", placeholder="hf_xxxx", type="password", + scale=4, value=HF_TOKEN_FROM_ENV, + ) + save_token_btn = gr.Button("💾 Save", scale=1) + token_status = gr.Markdown("✅ Token loaded" if HF_TOKEN_FROM_ENV else "⏳ Enter token") + + def save_token(token): + if token and len(token) > 10: + os.environ["HF_TOKEN"] = token.strip() + return "✅ **Token saved!** You can now use Stage 2 AI analysis. Close this section and enter a URL below to begin." + return "❌ **Invalid token** — please enter a valid HuggingFace token (starts with `hf_`, at least 10 characters). Get one free at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)." + + save_token_btn.click(save_token, [hf_token_input], [token_status]) + + # ================================================================= + # URL INPUT & PAGE DISCOVERY + # ================================================================= + + with gr.Accordion("🔍 Step 1: Discover Pages", open=True): + gr.Markdown("Enter the homepage URL of any website. 
The crawler will find up to 20 internal pages " + "(homepage, about, contact, product pages, etc.). You then select which pages to scan " + "for design tokens (colors, typography, spacing, radius, and shadows).", + elem_classes=["section-desc"]) + + with gr.Row(): + url_input = gr.Textbox(label="Website URL", placeholder="https://example.com", scale=4) + discover_btn = gr.Button("🔍 Discover Pages", variant="primary", scale=1) + gr.Markdown("*Enter the full URL including `https://` — the crawler will follow internal links from this page.*", + elem_classes=["section-desc"]) + + discover_status = gr.Markdown("") + + with gr.Row(): + log_output = gr.Textbox(label="📋 Log", lines=8, interactive=False) + + pages_table = gr.Dataframe( + headers=["Select", "URL", "Title", "Type", "Status"], + datatype=["bool", "str", "str", "str", "str"], + label="Discovered Pages", + interactive=True, + visible=False, + ) + gr.Markdown("*Use the **Select** checkbox to choose pages for extraction. Uncheck pages you want to skip " + "(login pages, error pages, etc.). **Type** shows the detected page category. Up to 10 pages will be processed.*", + elem_classes=["section-desc"]) + + gr.Markdown("*Extraction scans each selected page at two viewport sizes — Desktop (1440px) and Mobile (375px) — " + "pulling colors, typography, spacing, radius, and shadows from computed CSS.*", + elem_classes=["section-desc"]) + extract_btn = gr.Button("🚀 Extract Tokens (Desktop + Mobile)", variant="primary", visible=False) + + # ================================================================= + # STAGE 1: EXTRACTION REVIEW + # ================================================================= + + with gr.Accordion("📊 Stage 1: Review Extracted Tokens", open=False) as stage1_accordion: + + extraction_status = gr.Markdown("") + + gr.Markdown("Review the design tokens extracted from your website. Use the **viewport toggle** to switch between " + "Desktop (1440px) and Mobile (375px) data. 
**Accept or reject** individual tokens using the checkboxes — " + "rejected tokens will be excluded from your design system export.", + elem_classes=["section-desc"]) + + viewport_toggle = gr.Radio( + choices=["Desktop (1440px)", "Mobile (375px)"], + value="Desktop (1440px)", + label="Viewport", + ) + + with gr.Tabs(): + with gr.Tab("🎨 Colors"): + gr.Markdown("*Each row is a unique color found on the site. **Confidence** shows extraction certainty. " + "**AA** indicates WCAG accessibility pass/fail for normal text. **Context** shows where the color was used.*", + elem_classes=["section-desc"]) + colors_table = gr.Dataframe( + headers=["Accept", "Color", "Suggested Name", "Frequency", "Confidence", "Contrast", "AA", "Context"], + datatype=["bool", "str", "str", "number", "str", "str", "str", "str"], + label="Colors", + interactive=True, + ) + with gr.Accordion("👁️ Visual Preview", open=False): + stage1_colors_preview = gr.HTML( + value="
Colors preview will appear after extraction...
", + label="Colors Preview" + ) + + with gr.Tab("📝 Typography"): + gr.Markdown("*Detected font styles sorted by frequency. **Size** is computed font-size, **Weight** is font-weight " + "(400=regular, 700=bold). **Suggested Name** is a semantic token name (e.g., heading.xl). " + "Uncheck rows to exclude from your design system.*", + elem_classes=["section-desc"]) + typography_table = gr.Dataframe( + headers=["Accept", "Font", "Size", "Weight", "Line Height", "Suggested Name", "Frequency", "Confidence"], + datatype=["bool", "str", "str", "str", "str", "str", "number", "str"], + label="Typography", + interactive=True, + ) + with gr.Accordion("👁️ Visual Preview", open=False): + stage1_typography_preview = gr.HTML( + value="
Typography preview will appear after extraction...
", + label="Typography Preview" + ) + + with gr.Tab("📏 Spacing"): + gr.Markdown("*Spacing values (margins, paddings, gaps) extracted from the site. **Base 8** shows whether " + "the value aligns with the 8px grid standard. Values are sorted smallest to largest. " + "Uncheck irregular spacing values you want to exclude.*", + elem_classes=["section-desc"]) + spacing_table = gr.Dataframe( + headers=["Accept", "Value", "Pixels", "Suggested Name", "Frequency", "Base 8", "Confidence"], + datatype=["bool", "str", "str", "str", "number", "str", "str"], + label="Spacing", + interactive=True, + ) + with gr.Accordion("👁️ Visual Preview", open=False): + stage1_spacing_preview = gr.HTML( + value="
Spacing preview will appear after extraction...
", + label="Spacing Preview" + ) + + with gr.Tab("🔘 Radius"): + gr.Markdown("*Border-radius values found across UI elements (buttons, cards, inputs). **Context** shows " + "which elements use each value. A consistent radius scale creates a cohesive UI.*", + elem_classes=["section-desc"]) + radius_table = gr.Dataframe( + headers=["Accept", "Value", "Frequency", "Context"], + datatype=["bool", "str", "number", "str"], + label="Border Radius", + interactive=True, + ) + with gr.Accordion("👁️ Visual Preview", open=False): + stage1_radius_preview = gr.HTML( + value="
Radius preview will appear after extraction...
", + label="Radius Preview" + ) + + with gr.Tab("🌑 Shadows"): + gr.Markdown("*Box shadow values used for elevation and depth across the site. " + "Shows blur radius, spread, and color for each shadow layer.*", + elem_classes=["section-desc"]) + stage1_shadows_preview = gr.HTML( + value="
Shadows preview will appear after extraction...
", + label="Shadows Preview" + ) + + with gr.Tab("🧠 Semantic Colors"): + gr.Markdown("*Colors automatically categorized by their usage role: Brand (primary, secondary, accent), " + "Text (headings, body, muted), Background, Border, and Feedback (success, warning, error).*", + elem_classes=["section-desc"]) + stage1_semantic_preview = gr.HTML( + value="
Semantic color analysis will appear after extraction...
", + label="Semantic Colors Preview" + ) + + gr.Markdown("---") + gr.Markdown("When you are satisfied with the accepted tokens, **proceed to Stage 2** for AI-powered analysis " + "and improvement suggestions. Or **download the raw Stage 1 JSON** for immediate use in Figma Tokens Studio.", + elem_classes=["section-desc"]) + with gr.Row(): + proceed_stage2_btn = gr.Button("➡️ Proceed to Stage 2: AI Upgrades", variant="primary") + download_stage1_btn = gr.Button("📥 Download Stage 1 JSON", variant="secondary") + + # ================================================================= + # STAGE 2: AI UPGRADES + # ================================================================= + + with gr.Accordion("🧠 Stage 2: AI-Powered Analysis", open=False) as stage2_accordion: + + # Stage header + gr.HTML(""" +
+

🧠 Stage 2: Multi-Agent Analysis

+

Rule Engine + Benchmark Research + LLM Agents

+
+ """) + + stage2_status = gr.Markdown("Click **'Run Analysis'** below to start AI-powered design system analysis. " + "This runs a 4-layer pipeline: Rule Engine → Benchmark Research → LLM Agents → Head Synthesizer.") + + # ============================================================= + # NEW ARCHITECTURE CONFIGURATION + # ============================================================= + with gr.Accordion("⚙️ Analysis Configuration", open=True): + + # Architecture explanation + gr.Markdown(""" + ### 🏗️ New Analysis Architecture + + | Layer | Type | What It Does | Cost | + |-------|------|--------------|------| + | **Layer 1** | Rule Engine | Type scale, AA check, spacing grid, color stats | FREE | + | **Layer 2** | Benchmark Research | Fetch live specs via Firecrawl (24h cache) | ~$0.001 | + | **Layer 3** | LLM Agents | Brand ID, Benchmark Advisor, Best Practices | ~$0.002 | + | **Layer 4** | HEAD Synthesizer | Combine all → Final recommendations | ~$0.001 | + + **Total Cost:** ~$0.003-0.004 per analysis + """) + + gr.Markdown("---") + + # Benchmark selection + gr.Markdown("### 📊 Select Design Systems to Compare Against") + gr.Markdown("*Choose which design systems to benchmark your tokens against:*") + + benchmark_checkboxes = gr.CheckboxGroup( + choices=[ + ("🟢 Material Design 3 (Google)", "material_design_3"), + ("🍎 Apple HIG", "apple_hig"), + ("🛒 Shopify Polaris", "shopify_polaris"), + ("🔵 Atlassian Design System", "atlassian_design"), + ("🔷 IBM Carbon", "ibm_carbon"), + ("🌊 Tailwind CSS", "tailwind_css"), + ("🐜 Ant Design", "ant_design"), + ("⚡ Chakra UI", "chakra_ui"), + ], + value=["material_design_3", "shopify_polaris", "atlassian_design"], + label="Benchmarks", + ) + + gr.Markdown(""" + + 💡 Tip: Select 2-4 benchmarks for best results. More benchmarks = longer analysis time. +
+ 📦 Results are cached for 24 hours to speed up subsequent analyses. +
+ """) + + gr.Markdown("**Run Analysis** triggers the 4-layer architecture: Rule Engine (free) " + "then AURORA + ATLAS + SENTINEL in parallel, then NEXUS compiles. " + "Review scores, recommendations, and visual previews below, then apply your chosen upgrades.", + elem_classes=["section-desc"]) + + # Analyze button + with gr.Row(): + analyze_btn_v2 = gr.Button( + "🚀 Run Analysis", + variant="primary", + size="lg", + scale=2 + ) + + # ============================================================= + # ANALYSIS LOG + # ============================================================= + with gr.Accordion("📋 Analysis Log", open=True): + gr.Markdown("*Real-time log of the analysis pipeline. Each layer reports its progress, results, and any errors. " + "Scroll through to see detailed statistics and individual agent outputs.*", + elem_classes=["section-desc"]) + stage2_log = gr.Textbox( + label="Log", + lines=20, + interactive=False, + elem_classes=["log-container"] + ) + + # ============================================================= + # SCORES DASHBOARD + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 📊 Analysis Results") + gr.Markdown("*Overall scores for your design system across accessibility, consistency, brand alignment, and best practices. " + "Each score is out of 100 — aim for 70+ in all categories. Priority actions below show the highest-impact fixes.*", + elem_classes=["section-desc"]) + + scores_dashboard = gr.HTML( + value="
Scores will appear after analysis...
", + label="Scores" + ) + + # ============================================================= + # PRIORITY ACTIONS + # ============================================================= + priority_actions_html = gr.HTML( + value="
Priority actions will appear after analysis...
", + label="Priority Actions" + ) + + # ============================================================= + # BENCHMARK COMPARISON + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 📊 Benchmark Comparison") + gr.Markdown("*Your design tokens compared against industry-leading design systems (Material Design 3, Shopify Polaris, etc.). " + "Shows how closely your type scale, spacing grid, and color palette align with each benchmark. " + "Helps you decide which system to adopt or draw inspiration from.*", + elem_classes=["section-desc"]) + benchmark_comparison_md = gr.Markdown("*Benchmark comparison will appear after analysis*") + + # ============================================================= + # COLOR RECOMMENDATIONS + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 🎨 Color Recommendations") + gr.Markdown("*AI-suggested color changes based on WCAG AA compliance, brand consistency, and industry best practices. " + "Each recommendation shows the current color, the issue found, and a suggested replacement. " + "Use the checkboxes to accept or reject individual changes before exporting.*", + elem_classes=["section-desc"]) + + # ============================================================= + # TYPOGRAPHY SECTION + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 📐 Typography") + gr.Markdown("*Your detected type scale compared against standard ratios (Minor Third 1.2, Major Third 1.25, Perfect Fourth 1.333). " + "The visual preview shows how text will look at each scale. Desktop and mobile sizes are shown separately — " + "choose a scale below to apply to your exported tokens.*", + elem_classes=["section-desc"]) + + with gr.Accordion("👁️ Typography Visual Preview", open=True): + stage2_typography_preview = gr.HTML( + value="
Typography preview will appear after analysis...
", + label="Typography Preview" + ) + + with gr.Row(): + with gr.Column(scale=2): + gr.Markdown("### 🖥️ Desktop (1440px)") + typography_desktop = gr.Dataframe( + headers=["Token", "Current", "Scale 1.2", "Scale 1.25 ⭐", "Scale 1.333", "Keep"], + datatype=["str", "str", "str", "str", "str", "str"], + label="Desktop Typography", + interactive=False, + ) + + with gr.Column(scale=2): + gr.Markdown("### 📱 Mobile (375px)") + typography_mobile = gr.Dataframe( + headers=["Token", "Current", "Scale 1.2", "Scale 1.25 ⭐", "Scale 1.333", "Keep"], + datatype=["str", "str", "str", "str", "str", "str"], + label="Mobile Typography", + interactive=False, + ) + + with gr.Row(): + with gr.Column(): + gr.Markdown("### Select Type Scale Option") + type_scale_radio = gr.Radio( + choices=["Keep Current", "Scale 1.2 (Minor Third)", "Scale 1.25 (Major Third) ⭐", "Scale 1.333 (Perfect Fourth)"], + value="Scale 1.25 (Major Third) ⭐", + label="Type Scale", + interactive=True, + ) + gr.Markdown("*Font family will be preserved. Sizes rounded to even numbers.*") + + # ============================================================= + # COLORS SECTION - Base Colors + Ramps + LLM Recommendations + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 🎨 Colors") + gr.Markdown("*Complete color analysis: base colors extracted from your site, AI-generated semantic color ramps (50–950 shades), " + "and LLM-powered recommendations for accessibility fixes. 
The visual preview groups colors by semantic role " + "(brand, text, background, border, feedback).*", + elem_classes=["section-desc"]) + + # LLM Recommendations Section (NEW) + with gr.Accordion("🤖 LLM Color Recommendations", open=True): + gr.Markdown("*Four AI agents analyzed your colors: **Brand Identifier** (detects primary/secondary brand colors), " + "**Benchmark Advisor** (compares to design system standards), **Best Practices Auditor** (WCAG, contrast, naming), " + "and **Head Synthesizer** (combines all findings into actionable suggestions). Use the table to accept or reject each change.*", + elem_classes=["section-desc"]) + + llm_color_recommendations = gr.HTML( + value="
LLM recommendations will appear after analysis...
", + label="LLM Recommendations" + ) + + # Accept/Reject table for color recommendations + color_recommendations_table = gr.Dataframe( + headers=["Accept", "Role", "Current", "Issue", "Suggested", "Contrast"], + datatype=["bool", "str", "str", "str", "str", "str"], + label="Color Recommendations", + interactive=True, + col_count=(6, "fixed"), + ) + + # Visual Preview + with gr.Accordion("👁️ Color Ramps Visual Preview (Semantic Groups)", open=True): + gr.Markdown("*AI-generated color ramps expanding each base color into a 50–950 shade scale (similar to Tailwind CSS). " + "Colors are grouped by semantic role. These ramps will be included in your final export if the checkbox below is enabled.*", + elem_classes=["section-desc"]) + stage2_color_ramps_preview = gr.HTML( + value="
Color ramps preview will appear after analysis...
", + label="Color Ramps Preview" + ) + + gr.Markdown("**Base Colors** — Primary colors extracted from your site, organized by frequency and semantic role:", + elem_classes=["section-desc"]) + base_colors_display = gr.Markdown("*Base colors will appear after analysis*") + + gr.Markdown("---") + + gr.Markdown("**Color Ramps** — Full shade tables (50–950) generated from each base color:", + elem_classes=["section-desc"]) + color_ramps_display = gr.Markdown("*Color ramps will appear after analysis*") + + color_ramps_checkbox = gr.Checkbox( + label="✓ Generate color ramps (keeps base colors, adds 50-950 shades)", + value=True, + ) + + # ============================================================= + # SPACING SECTION + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 📏 Spacing (Rule-Based)") + gr.Markdown("*Your detected spacing values compared against standard 8px and 4px grid systems. " + "Consistent spacing creates visual rhythm and alignment. The 8px grid (8, 16, 24, 32...) is the industry standard — " + "select your preferred system below to normalize spacing in the export.*", + elem_classes=["section-desc"]) + + with gr.Row(): + with gr.Column(scale=2): + spacing_comparison = gr.Dataframe( + headers=["Current", "8px Grid", "4px Grid"], + datatype=["str", "str", "str"], + label="Spacing Comparison", + interactive=False, + ) + + with gr.Column(scale=1): + spacing_radio = gr.Radio( + choices=["Keep Current", "8px Base Grid ⭐", "4px Base Grid"], + value="8px Base Grid ⭐", + label="Spacing System", + interactive=True, + ) + + # ============================================================= + # RADIUS SECTION + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 🔘 Border Radius (Rule-Based)") + gr.Markdown("*Border radius values detected from your site, mapped to standard design tokens (radius.none → radius.full). 
" + "Consistent radius tokens ensure buttons, cards, and modals share a cohesive visual language. " + "Values are sorted from sharp corners to fully rounded.*", + elem_classes=["section-desc"]) + + radius_display = gr.Markdown("*Radius tokens will appear after analysis*") + + # ============================================================= + # SHADOWS SECTION + # ============================================================= + gr.Markdown("---") + gr.Markdown("## 🌫️ Shadows (Rule-Based)") + gr.Markdown("*Box shadow values detected from your site, organized into elevation tokens (shadow.xs → shadow.2xl). " + "A well-defined shadow scale creates depth hierarchy — subtle shadows for cards, deeper shadows for modals and popovers. " + "Exported tokens are ready for Figma elevation styles.*", + elem_classes=["section-desc"]) + + shadows_display = gr.Markdown("*Shadow tokens will appear after analysis*") + + # ============================================================= + # APPLY SECTION + # ============================================================= + gr.Markdown("---") + gr.Markdown("**Apply** saves your chosen type scale, spacing grid, color ramp, and LLM recommendation selections. " + "These choices will be baked into your Stage 3 export. 
**Reset** reverts all selections back to the original extracted values.", + elem_classes=["section-desc"]) + + with gr.Row(): + apply_upgrades_btn = gr.Button("✨ Apply Selected Upgrades", variant="primary", scale=2) + reset_btn = gr.Button("↩️ Reset to Original", variant="secondary", scale=1) + + apply_status = gr.Markdown("", elem_classes=["apply-status-box"]) + + # ================================================================= + # STAGE 3: EXPORT + # ================================================================= + + with gr.Accordion("📦 Stage 3: Export", open=False) as stage3_accordion: + gr.Markdown("Export your finalized design tokens as JSON, compatible with **Figma Tokens Studio**.", + elem_classes=["section-desc"]) + gr.Markdown(""" +- **Stage 1 JSON (As-Is):** Raw extracted tokens with no modifications — useful for archival or baseline comparison. Includes desktop and mobile viewport variants. +- **Final JSON (Upgraded):** Tokens with your selected improvements applied (type scale, spacing grid, color ramps, and accepted LLM recommendations). **This is the recommended export.** + +Copy the JSON output below or save it as a `.json` file for import into Figma. + """, elem_classes=["section-desc"]) + + with gr.Row(): + export_stage1_btn = gr.Button("📥 Export Stage 1 (As-Is)", variant="secondary") + export_final_btn = gr.Button("📥 Export Final (Upgraded)", variant="primary") + + gr.Markdown("*The generated JSON uses a flat token structure compatible with Figma Tokens Studio. 
" + "Copy the contents or save as a `.json` file.*", + elem_classes=["section-desc"]) + export_output = gr.Code(label="Tokens JSON", language="json", lines=25) + + export_stage1_btn.click(export_stage1_json, outputs=[export_output]) + export_final_btn.click(export_tokens_json, outputs=[export_output]) + + # ================================================================= + # EVENT HANDLERS + # ================================================================= + + # Store data for viewport toggle + desktop_data = gr.State({}) + mobile_data = gr.State({}) + + # Discover pages + discover_btn.click( + fn=discover_pages, + inputs=[url_input], + outputs=[discover_status, log_output, pages_table], + ).then( + fn=lambda: (gr.update(visible=True), gr.update(visible=True)), + outputs=[pages_table, extract_btn], + ) + + # Extract tokens + extract_btn.click( + fn=extract_tokens, + inputs=[pages_table], + outputs=[extraction_status, log_output, desktop_data, mobile_data, + stage1_typography_preview, stage1_colors_preview, + stage1_semantic_preview, + stage1_spacing_preview, stage1_radius_preview, stage1_shadows_preview], + ).then( + fn=lambda d: (d.get("colors", []), d.get("typography", []), d.get("spacing", []), d.get("radius", [])), + inputs=[desktop_data], + outputs=[colors_table, typography_table, spacing_table, radius_table], + ).then( + fn=lambda: gr.update(open=True), + outputs=[stage1_accordion], + ) + + # Viewport toggle + viewport_toggle.change( + fn=switch_viewport, + inputs=[viewport_toggle], + outputs=[colors_table, typography_table, spacing_table, radius_table], + ) + + # Stage 2: NEW Architecture Analyze + analyze_btn_v2.click( + fn=run_stage2_analysis_v2, + inputs=[benchmark_checkboxes], + outputs=[ + stage2_status, + stage2_log, + benchmark_comparison_md, + scores_dashboard, + priority_actions_html, + color_recommendations_table, + typography_desktop, + typography_mobile, + stage2_typography_preview, + stage2_color_ramps_preview, + llm_color_recommendations, + 
spacing_comparison, + base_colors_display, + color_ramps_display, + radius_display, + shadows_display, + ], + ) + + # Stage 2: Apply upgrades + apply_upgrades_btn.click( + fn=apply_selected_upgrades, + inputs=[type_scale_radio, spacing_radio, color_ramps_checkbox, color_recommendations_table], + outputs=[apply_status, stage2_log], + ).then( + fn=lambda: gr.update(open=True), + outputs=[stage3_accordion], + ) + + # Stage 2: Reset to original + reset_btn.click( + fn=reset_to_original, + outputs=[type_scale_radio, spacing_radio, color_ramps_checkbox, apply_status, stage2_log], + ) + + # Stage 1: Download JSON + download_stage1_btn.click( + fn=export_stage1_json, + outputs=[export_output], + ) + + # Proceed to Stage 2 button + proceed_stage2_btn.click( + fn=lambda: gr.update(open=True), + outputs=[stage2_accordion], + ) + + # ================================================================= + # FOOTER + # ================================================================= + + gr.Markdown(""" + --- + **Design System Extractor v3** | Built with Playwright + Firecrawl + HuggingFace + + *A multi-agent co-pilot for design system recovery and modernization.* + + **Architecture:** Rule Engine (FREE) + Benchmark Research + ReAct LLM Agents (AURORA | ATLAS | SENTINEL | NEXUS) + """) + + return app + + +# ============================================================================= +# MAIN +# ============================================================================= + +if __name__ == "__main__": + app = create_ui() + app.launch(server_name="0.0.0.0", server_port=7860) diff --git a/config/.env.example b/config/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..c84a74e43edb71009bcc760008e39f69228bcf4b --- /dev/null +++ b/config/.env.example @@ -0,0 +1,143 @@ +# ============================================================================= +# Design System Extractor v2 — Environment Variables +# 
============================================================================= +# Copy this file to .env and fill in your values +# NEVER commit .env to version control +# ============================================================================= + +# ----------------------------------------------------------------------------- +# REQUIRED: Hugging Face Token (Pro recommended for best models) +# ----------------------------------------------------------------------------- + +# HuggingFace Token (for Spaces deployment and model access) +# Get yours at: https://huggingface.co/settings/tokens +# Pro subscription unlocks: Llama 3.1 405B, Qwen 72B, Command R+, etc. +HF_TOKEN=your_huggingface_token_here + +# HuggingFace Space name (for deployment) +HF_SPACE_NAME=your-username/design-system-extractor + +# ----------------------------------------------------------------------------- +# MODEL CONFIGURATION — Diverse Models for Different Tasks +# ----------------------------------------------------------------------------- + +# === Agent 1 (Crawler/Extractor): NO LLM NEEDED === +# Pure rule-based extraction using Playwright + CSS parsing + +# === Agent 2 (Normalizer): FAST STRUCTURED OUTPUT === +# Task: Token naming, duplicate detection, pattern inference +# Needs: Good instruction following, JSON output, SPEED +# +# Options (pick one): +# - microsoft/Phi-3.5-mini-instruct (Fast, great for structured tasks) +# - mistralai/Mistral-7B-Instruct-v0.3 (Fast, good JSON) +# - google/gemma-2-9b-it (Balanced speed/quality) +# - Qwen/Qwen2.5-7B-Instruct (Good all-rounder) +AGENT2_MODEL=microsoft/Phi-3.5-mini-instruct + +# === Agent 3 (Advisor): STRONG REASONING — Most Important! 
=== +# Task: Design system analysis, best practice recommendations, trade-off analysis +# Needs: Deep reasoning, design knowledge, creative suggestions +# +# Options (pick one - Pro tier recommended): +# - meta-llama/Llama-3.1-70B-Instruct (Excellent reasoning, long context) +# - CohereForAI/c4ai-command-r-plus (Great for analysis & recommendations) +# - Qwen/Qwen2.5-72B-Instruct (Strong reasoning, good design knowledge) +# - mistralai/Mixtral-8x22B-Instruct-v0.1 (Large MoE, good balance) +# - meta-llama/Llama-3.1-405B-Instruct (BEST - if you have Pro++) +AGENT3_MODEL=meta-llama/Llama-3.1-70B-Instruct + +# === Agent 4 (Generator): CODE/JSON SPECIALIST === +# Task: Generate Tokens Studio JSON, CSS variables, structured output +# Needs: Precise formatting, code generation, schema adherence +# +# Options (pick one): +# - codellama/CodeLlama-34b-Instruct-hf (Code specialist) +# - bigcode/starcoder2-15b-instruct-v0.1 (Code generation) +# - mistralai/Codestral-22B-v0.1 (Mistral's code model) +# - deepseek-ai/deepseek-coder-33b-instruct (Strong code model) +AGENT4_MODEL=mistralai/Codestral-22B-v0.1 + +# === Fallback Model (if primary fails) === +FALLBACK_MODEL=mistralai/Mistral-7B-Instruct-v0.3 + +# ----------------------------------------------------------------------------- +# PRESET CONFIGURATIONS +# ----------------------------------------------------------------------------- + +# Uncomment ONE preset below, or configure individually above + +# --- PRESET: BUDGET (Free tier compatible) --- +# AGENT2_MODEL=microsoft/Phi-3.5-mini-instruct +# AGENT3_MODEL=mistralai/Mixtral-8x7B-Instruct-v0.1 +# AGENT4_MODEL=mistralai/Mistral-7B-Instruct-v0.3 + +# --- PRESET: BALANCED (Pro tier) --- +# AGENT2_MODEL=google/gemma-2-9b-it +# AGENT3_MODEL=meta-llama/Llama-3.1-70B-Instruct +# AGENT4_MODEL=mistralai/Codestral-22B-v0.1 + +# --- PRESET: MAXIMUM QUALITY (Pro tier) --- +# AGENT2_MODEL=google/gemma-2-27b-it +# AGENT3_MODEL=meta-llama/Llama-3.1-405B-Instruct +# 
AGENT4_MODEL=deepseek-ai/deepseek-coder-33b-instruct + +# ----------------------------------------------------------------------------- +# OPTIONAL: Application Settings +# ----------------------------------------------------------------------------- + +DEBUG=false +LOG_LEVEL=INFO +MAX_PAGES=20 +MIN_PAGES=10 + +# ----------------------------------------------------------------------------- +# OPTIONAL: Browser Settings (Playwright) +# ----------------------------------------------------------------------------- + +BROWSER_TYPE=chromium +BROWSER_HEADLESS=true +BROWSER_TIMEOUT=30000 +NETWORK_IDLE_TIMEOUT=5000 + +# ----------------------------------------------------------------------------- +# OPTIONAL: Storage Settings +# ----------------------------------------------------------------------------- + +STORAGE_PATH=/data +ENABLE_PERSISTENCE=true +MAX_VERSIONS=10 + +# ----------------------------------------------------------------------------- +# OPTIONAL: Rate Limiting +# ----------------------------------------------------------------------------- + +CRAWL_DELAY_MS=1000 +MAX_CONCURRENT_CRAWLS=3 +RESPECT_ROBOTS_TXT=true + +# ----------------------------------------------------------------------------- +# OPTIONAL: HuggingFace Inference Settings +# ----------------------------------------------------------------------------- + +USE_HF_INFERENCE_API=true +HF_INFERENCE_TIMEOUT=120 +HF_MAX_NEW_TOKENS=2048 +HF_TEMPERATURE=0.3 + +# ----------------------------------------------------------------------------- +# OPTIONAL: UI Settings +# ----------------------------------------------------------------------------- + +SERVER_PORT=7860 +SHARE=false +UI_THEME=soft + +# ----------------------------------------------------------------------------- +# OPTIONAL: Feature Flags +# ----------------------------------------------------------------------------- + +FEATURE_COLOR_RAMPS=true +FEATURE_TYPE_SCALES=true +FEATURE_A11Y_CHECKS=true +FEATURE_PARALLEL_EXTRACTION=true diff --git 
a/config/agents.yaml b/config/agents.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8e1d064233a34dfe2b6cbe5b715502fd16ccd72 --- /dev/null +++ b/config/agents.yaml @@ -0,0 +1,435 @@ +# ============================================================================= +# DESIGN SYSTEM EXTRACTOR v2 - AGENT CONFIGURATIONS +# ============================================================================= +# +# This file defines the personas and configurations for each agent in the +# design system extraction pipeline. +# +# Model Provider Options (via HuggingFace Inference Providers): +# - novita (default, good balance of cost/quality) +# - groq (fastest, good for Llama/Qwen) +# - cerebras (ultra-fast) +# - sambanova (good for Llama) +# - together (wide model selection) +# - fireworks-ai (good for code models) +# +# ============================================================================= + +# ============================================================================= +# INFERENCE PROVIDER CONFIGURATION +# ============================================================================= +inference: + # Default provider for all LLM calls + default_provider: "novita" + + # Fallback chain if primary fails + fallback_providers: + - "together" + - "sambanova" + - "groq" + + # HuggingFace account tiers + billing: + free_tier_credits: 0.10 # USD per month + pro_tier_credits: 2.00 # USD per month ($9/mo subscription) + warning_threshold: 0.80 # Warn at 80% usage + + # Show cost tracking in logs + show_cost_tracking: true + +# ============================================================================= +# STAGE 1: EXTRACTION (No LLM needed) +# ============================================================================= + +agent1_crawler: + name: "Website Crawler" + persona: "Meticulous Design Archaeologist" + description: | + Discovers and crawls website pages to extract design tokens. + Uses Playwright for browser automation. 
+ model: null # Rule-based, no LLM needed + +agent1_extractor: + name: "Token Extractor" + persona: "Meticulous Design Archaeologist" + description: | + Extracts colors, typography, spacing, radius, and shadows from pages. + Pure CSS/computed style extraction. + model: null # Rule-based, no LLM needed + +# ============================================================================= +# STAGE 1: NORMALIZATION +# ============================================================================= + +agent2_normalizer: + name: "Token Normalizer" + persona: "Design System Librarian" + description: | + Cleans, deduplicates, and structures extracted tokens. + Infers naming patterns and semantic roles. + model: "microsoft/Phi-3.5-mini-instruct" + provider: "novita" + max_tokens: 1000 + temperature: 0.3 + tasks: + - Clean noisy extraction data + - Deduplicate similar tokens + - Infer semantic naming (primary, secondary, etc.) + - Tag confidence levels + +# ============================================================================= +# STAGE 2: MULTI-AGENT ANALYSIS (LangGraph Parallel) +# ============================================================================= +# +# Architecture: +# LLM 1 ─┐ +# ├──> HEAD ──> Final Recommendations +# LLM 2 ─┘ +# +# LLM 1 and LLM 2 run in PARALLEL via LangGraph +# HEAD compiles results and resolves conflicts +# +# ============================================================================= + +stage2_llm1: + name: "Design Analyst 1" + persona: "Senior Design Systems Architect (Global Perspective)" + description: | + Analyzes design tokens against industry best practices. + Focuses on global/Asian design systems (Ant Design, etc.) 
+ model: "Qwen/Qwen2.5-72B-Instruct" + provider: "novita" + max_tokens: 1500 + temperature: 0.4 + # Cost tracking (per million tokens) + cost_per_million_input: 0.29 + cost_per_million_output: 0.59 + tasks: + - Analyze typography patterns and scale consistency + - Evaluate color system and semantic usage + - Check AA/WCAG accessibility compliance + - Assess spacing consistency and grid alignment + - Compare against competitor design systems + expertise: + - Typography best practices and type scales + - Color theory and accessibility standards + - Spacing systems (4px, 8px grids) + - International design standards + - Ant Design, Chakra UI patterns + +stage2_llm2: + name: "Design Analyst 2" + persona: "Senior Design Systems Architect (Western Perspective)" + description: | + Analyzes design tokens against industry best practices. + Focuses on Western/US design systems (Material, Apple, etc.) + model: "meta-llama/Llama-3.3-70B-Instruct" + provider: "novita" + max_tokens: 1500 + temperature: 0.4 + # Cost tracking (per million tokens) + cost_per_million_input: 0.59 + cost_per_million_output: 0.79 + tasks: + - Analyze typography patterns and scale consistency + - Evaluate color system and semantic usage + - Check AA/WCAG accessibility compliance + - Assess spacing consistency and grid alignment + - Compare against competitor design systems + expertise: + - Material Design 3 patterns + - Apple Human Interface Guidelines + - Shopify Polaris conventions + - IBM Carbon design standards + - Atlassian Design System + +stage2_head: + name: "Head Compiler" + persona: "Principal Design Systems Architect" + description: | + Compiles analyses from both LLM analysts. + Resolves conflicts and synthesizes final recommendations. 
+ model: "meta-llama/Llama-3.3-70B-Instruct" + provider: "novita" + max_tokens: 2000 + temperature: 0.3 + # Cost tracking (per million tokens) + cost_per_million_input: 0.59 + cost_per_million_output: 0.79 + tasks: + - Compare both analyst perspectives + - Identify agreements and disagreements + - Resolve conflicts with clear reasoning + - Synthesize final recommendations + - Provide confidence scores + expertise: + - Cross-cultural design synthesis + - Conflict resolution and arbitration + - Best practice prioritization + +stage2_rules: + name: "Rule Engine" + persona: "Calculation Engine" + description: | + Performs deterministic calculations. + No LLM needed - pure math/logic. + model: null # No LLM - FREE + tasks: + - Generate type scale options (1.2, 1.25, 1.333) + - Calculate spacing grid alignments (4px, 8px) + - Generate color ramps (50-950 shades) + - Compute contrast ratios for accessibility + +# ============================================================================= +# STAGE 3: GENERATION +# ============================================================================= + +agent4_generator: + name: "JSON Generator" + persona: "Automation Engineer" + description: | + Converts finalized tokens to production-ready JSON. + Generates Figma-compatible token files. 
+ model: "mistralai/Codestral-22B-v0.1" + provider: "novita" + max_tokens: 2000 + temperature: 0.2 + tasks: + - Generate final JSON structure + - Apply selected upgrades + - Format for Figma Tokens Studio + - Include metadata and versioning + +# ============================================================================= +# COMPETITOR DESIGN SYSTEMS +# ============================================================================= + +competitors: + # Default list (shown in UI, user can edit) + default: + - "Material Design 3" + - "Apple Human Interface Guidelines" + - "Shopify Polaris" + - "IBM Carbon" + - "Atlassian Design System" + + # Suggestions for user to add + suggestions: + - "Ant Design" + - "Chakra UI" + - "Tailwind CSS" + - "Bootstrap" + - "Salesforce Lightning" + - "Adobe Spectrum" + - "GitHub Primer" + - "Microsoft Fluent" + - "Radix UI" + - "MUI (Material UI)" + +# ============================================================================= +# LLM PROMPT TEMPLATES +# ============================================================================= + +prompts: + # Prompt for LLM 1 and LLM 2 (same structure, different perspective) + analyst: | + You are a {persona}. + + ## YOUR TASK + Analyze these design tokens extracted from a website and compare against industry best practices. + + ## EXTRACTED TOKENS + {tokens_summary} + + ## COMPETITOR DESIGN SYSTEMS TO RESEARCH + {competitors} + + ## ANALYZE THE FOLLOWING: + + ### 1. Typography + - Is the type scale consistent? Does it follow a mathematical ratio? + - What is the detected base size? + - Compare to competitors: what ratios do they use? + - Score (1-10) and specific recommendations + + ### 2. Colors + - Is the color palette cohesive? + - Are semantic colors properly defined (primary, secondary, etc.)? + - Compare to competitors: how do they structure colors? + - Score (1-10) and specific recommendations + + ### 3. Accessibility (AA Compliance) + - What contrast issues exist? 
+ - Which color combinations fail WCAG AA (4.5:1 for text)? + - Specific fixes needed + - Score (1-10) + + ### 4. Spacing + - Is spacing consistent? Does it follow a grid (4px, 8px)? + - What base unit is detected? + - Compare to competitors: what spacing systems do they use? + - Score (1-10) and specific recommendations + + ### 5. Overall Assessment + - Top 3 priorities for improvement + - What works well (keep these) + - What needs immediate attention + + ## RESPOND IN JSON FORMAT: + ```json + {{ + "typography": {{ + "analysis": "...", + "detected_ratio": 1.2, + "detected_base": 16, + "score": 7, + "recommendations": ["...", "..."] + }}, + "colors": {{ + "analysis": "...", + "score": 6, + "recommendations": ["...", "..."] + }}, + "accessibility": {{ + "issues": ["...", "..."], + "score": 5, + "fixes": ["...", "..."] + }}, + "spacing": {{ + "analysis": "...", + "detected_base": 8, + "score": 7, + "recommendations": ["...", "..."] + }}, + "overall_assessment": "...", + "top_3_priorities": ["...", "...", "..."], + "confidence": 85 + }} + ``` + + # Prompt for HEAD compiler + head_compiler: | + You are a Principal Design Systems Architect compiling analyses from two expert analysts. + + ## ANALYST 1 (Global/Asian Perspective) FINDINGS: + {llm1_analysis} + + ## ANALYST 2 (Western/US Perspective) FINDINGS: + {llm2_analysis} + + ## RULE-BASED CALCULATIONS: + {rule_calculations} + + ## YOUR TASK: + 1. Compare both analyst perspectives + 2. Identify where they AGREE (high confidence) + 3. Identify where they DISAGREE (need resolution) + 4. Resolve disagreements with clear reasoning + 5. Synthesize final recommendations + + ## RESPOND IN JSON FORMAT: + ```json + {{ + "agreements": [ + {{"topic": "typography", "finding": "...", "confidence": 95}}, + ... 
+ ], + "disagreements": [ + {{"topic": "spacing", "llm1_view": "...", "llm2_view": "...", "resolution": "...", "reasoning": "..."}} + ], + "final_recommendations": {{ + "type_scale": "1.25", + "type_scale_rationale": "Both analysts agree 1.25 (Major Third) balances readability and hierarchy", + "spacing_base": "8px", + "spacing_rationale": "8px grid provides better alignment with industry standards", + "color_improvements": ["Generate full ramps", "Fix contrast on secondary"], + "accessibility_fixes": ["Increase body text contrast to 4.5:1"] + }}, + "overall_confidence": 88, + "summary": "The extracted design system shows strong typography foundations but needs spacing normalization and color accessibility improvements." + }} + ``` + +# ============================================================================= +# EXTRACTION SETTINGS +# ============================================================================= + +extraction: + viewports: + desktop: + width: 1440 + height: 900 + name: "Desktop" + mobile: + width: 375 + height: 812 + name: "Mobile" + + crawling: + max_pages: 20 + min_pages: 10 + scroll_behavior: "smooth" + wait_for_network_idle: true + network_idle_timeout_ms: 5000 + skip_infinite_scroll: true + +# ============================================================================= +# TYPE SCALE OPTIONS +# ============================================================================= + +type_scales: + minor_second: + ratio: 1.067 + name: "Minor Second" + description: "Very subtle, good for dense data UIs" + major_second: + ratio: 1.125 + name: "Major Second" + description: "Subtle progression" + minor_third: + ratio: 1.2 + name: "Minor Third" + description: "Balanced, widely used" + major_third: + ratio: 1.25 + name: "Major Third" + description: "Classic, recommended for most UIs" + recommended: true + perfect_fourth: + ratio: 1.333 + name: "Perfect Fourth" + description: "Strong hierarchy, good for marketing" + golden_ratio: + ratio: 1.618 + name: 
"Golden Ratio" + description: "Dramatic, use sparingly" + +# ============================================================================= +# SPACING SYSTEMS +# ============================================================================= + +spacing_systems: + 4px_grid: + base: 4 + name: "4px Grid" + description: "Fine-grained control, good for dense UIs" + scale: [0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64] + 8px_grid: + base: 8 + name: "8px Grid" + description: "Industry standard, recommended" + recommended: true + scale: [0, 8, 16, 24, 32, 48, 64, 80, 96, 128] + +# ============================================================================= +# COLOR RAMP GENERATION +# ============================================================================= + +color_ramps: + shades: [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 950] + method: "oklch" # Modern perceptually uniform + + accessibility: + aa_minimum: 4.5 + aaa_minimum: 7.0 + large_text_aa: 3.0 diff --git a/config/settings.py b/config/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..607b7f453838b5ce74d5443a8cc1b489fe1ae7a4 --- /dev/null +++ b/config/settings.py @@ -0,0 +1,230 @@ +""" +Application Settings +Design System Extractor v2 + +Loads configuration from environment variables and YAML files. 
+""" + +import os +from pathlib import Path +from typing import Optional +from dataclasses import dataclass, field +from dotenv import load_dotenv +import yaml + +# Load environment variables from .env file +env_path = Path(__file__).parent / ".env" +if env_path.exists(): + load_dotenv(env_path) +else: + # Try loading from parent directory (for development) + load_dotenv(Path(__file__).parent.parent / ".env") + + +@dataclass +class HFSettings: + """Hugging Face configuration.""" + hf_token: str = field(default_factory=lambda: os.getenv("HF_TOKEN", "")) + hf_space_name: str = field(default_factory=lambda: os.getenv("HF_SPACE_NAME", "")) + use_inference_api: bool = field(default_factory=lambda: os.getenv("USE_HF_INFERENCE_API", "true").lower() == "true") + inference_timeout: int = field(default_factory=lambda: int(os.getenv("HF_INFERENCE_TIMEOUT", "120"))) + max_new_tokens: int = field(default_factory=lambda: int(os.getenv("HF_MAX_NEW_TOKENS", "2048"))) + temperature: float = field(default_factory=lambda: float(os.getenv("HF_TEMPERATURE", "0.3"))) + + +@dataclass +class ModelSettings: + """Model configuration for each agent — Diverse providers.""" + # Agent 1: Rule-based, no LLM needed + + # Agent 2 (Normalizer): Fast structured output + # Default: Microsoft Phi (fast, great structured output) + agent2_model: str = field(default_factory=lambda: os.getenv("AGENT2_MODEL", "microsoft/Phi-3.5-mini-instruct")) + + # Agent 3 (Advisor): Strong reasoning - MOST IMPORTANT + # Default: Qwen 2.5 72B (freely available on HF serverless, no gated access needed) + # Alternative: meta-llama/Llama-3.1-70B-Instruct (requires Meta license acceptance) + agent3_model: str = field(default_factory=lambda: os.getenv("AGENT3_MODEL", "Qwen/Qwen2.5-72B-Instruct")) + + # Agent 4 (Generator): Code/JSON specialist + # Default: Mistral Codestral (code specialist) + agent4_model: str = field(default_factory=lambda: os.getenv("AGENT4_MODEL", "mistralai/Codestral-22B-v0.1")) + + # Fallback (must be 
freely available on HF serverless inference) + fallback_model: str = field(default_factory=lambda: os.getenv("FALLBACK_MODEL", "Qwen/Qwen2.5-7B-Instruct")) + + +@dataclass +class APISettings: + """API key configuration (optional alternatives).""" + anthropic_api_key: str = field(default_factory=lambda: os.getenv("ANTHROPIC_API_KEY", "")) + openai_api_key: str = field(default_factory=lambda: os.getenv("OPENAI_API_KEY", "")) + + +@dataclass +class BrowserSettings: + """Playwright browser configuration.""" + browser_type: str = field(default_factory=lambda: os.getenv("BROWSER_TYPE", "chromium")) + headless: bool = field(default_factory=lambda: os.getenv("BROWSER_HEADLESS", "true").lower() == "true") + timeout: int = field(default_factory=lambda: int(os.getenv("BROWSER_TIMEOUT", "30000"))) + network_idle_timeout: int = field(default_factory=lambda: int(os.getenv("NETWORK_IDLE_TIMEOUT", "5000"))) + + +@dataclass +class CrawlSettings: + """Website crawling configuration.""" + max_pages: int = field(default_factory=lambda: int(os.getenv("MAX_PAGES", "20"))) + min_pages: int = field(default_factory=lambda: int(os.getenv("MIN_PAGES", "10"))) + crawl_delay_ms: int = field(default_factory=lambda: int(os.getenv("CRAWL_DELAY_MS", "1000"))) + max_concurrent: int = field(default_factory=lambda: int(os.getenv("MAX_CONCURRENT_CRAWLS", "3"))) + respect_robots_txt: bool = field(default_factory=lambda: os.getenv("RESPECT_ROBOTS_TXT", "true").lower() == "true") + + +@dataclass +class ViewportSettings: + """Viewport configuration for extraction.""" + desktop_width: int = 1440 + desktop_height: int = 900 + mobile_width: int = 375 + mobile_height: int = 812 + + +@dataclass +class StorageSettings: + """Persistent storage configuration.""" + storage_path: str = field(default_factory=lambda: os.getenv("STORAGE_PATH", "/data")) + enable_persistence: bool = field(default_factory=lambda: os.getenv("ENABLE_PERSISTENCE", "true").lower() == "true") + max_versions: int = 
field(default_factory=lambda: int(os.getenv("MAX_VERSIONS", "10"))) + + +@dataclass +class UISettings: + """UI configuration.""" + server_port: int = field(default_factory=lambda: int(os.getenv("SERVER_PORT", "7860"))) + share: bool = field(default_factory=lambda: os.getenv("SHARE", "false").lower() == "true") + theme: str = field(default_factory=lambda: os.getenv("UI_THEME", "soft")) + + +@dataclass +class FeatureFlags: + """Feature toggles.""" + color_ramps: bool = field(default_factory=lambda: os.getenv("FEATURE_COLOR_RAMPS", "true").lower() == "true") + type_scales: bool = field(default_factory=lambda: os.getenv("FEATURE_TYPE_SCALES", "true").lower() == "true") + a11y_checks: bool = field(default_factory=lambda: os.getenv("FEATURE_A11Y_CHECKS", "true").lower() == "true") + parallel_extraction: bool = field(default_factory=lambda: os.getenv("FEATURE_PARALLEL_EXTRACTION", "true").lower() == "true") + + +@dataclass +class Settings: + """Main settings container.""" + debug: bool = field(default_factory=lambda: os.getenv("DEBUG", "false").lower() == "true") + log_level: str = field(default_factory=lambda: os.getenv("LOG_LEVEL", "INFO")) + + hf: HFSettings = field(default_factory=HFSettings) + models: ModelSettings = field(default_factory=ModelSettings) + api: APISettings = field(default_factory=APISettings) + browser: BrowserSettings = field(default_factory=BrowserSettings) + crawl: CrawlSettings = field(default_factory=CrawlSettings) + viewport: ViewportSettings = field(default_factory=ViewportSettings) + storage: StorageSettings = field(default_factory=StorageSettings) + ui: UISettings = field(default_factory=UISettings) + features: FeatureFlags = field(default_factory=FeatureFlags) + + # Agent configuration loaded from YAML + agents_config: dict = field(default_factory=dict) + + def __post_init__(self): + """Load agent configuration from YAML after initialization.""" + self.load_agents_config() + + def load_agents_config(self): + """Load agent personas and 
settings from YAML file.""" + yaml_path = Path(__file__).parent / "agents.yaml" + if yaml_path.exists(): + with open(yaml_path, "r") as f: + self.agents_config = yaml.safe_load(f) + else: + print(f"Warning: agents.yaml not found at {yaml_path}") + self.agents_config = {} + + def get_agent_persona(self, agent_name: str) -> str: + """Get persona string for an agent.""" + agent_key = f"agent_{agent_name}" + if agent_key in self.agents_config: + return self.agents_config[agent_key].get("persona", "") + return "" + + def get_agent_config(self, agent_name: str) -> dict: + """Get full configuration for an agent.""" + agent_key = f"agent_{agent_name}" + return self.agents_config.get(agent_key, {}) + + def get_model_for_agent(self, agent_name: str) -> str: + """Get the model ID for a specific agent.""" + model_map = { + # Legacy agents + "normalizer": self.models.agent2_model, + "advisor": self.models.agent3_model, + "generator": self.models.agent4_model, + + # Stage 2 New Architecture agents — optimized model per role + # AURORA: Creative/visual reasoning for brand color analysis + "brand_identifier": os.getenv("BRAND_IDENTIFIER_MODEL", "Qwen/Qwen2.5-72B-Instruct"), + # ATLAS: 128K context for large benchmark data, strong comparative reasoning + "benchmark_advisor": os.getenv("BENCHMARK_ADVISOR_MODEL", "meta-llama/Llama-3.3-70B-Instruct"), + # SENTINEL: Methodical rule-following, precise judgment + "best_practices_validator": os.getenv("BEST_PRACTICES_MODEL", "Qwen/Qwen2.5-72B-Instruct"), + # NEXUS: 128K context for combined inputs, strong synthesis + "head_synthesizer": os.getenv("HEAD_SYNTHESIZER_MODEL", "meta-llama/Llama-3.3-70B-Instruct"), + "benchmark_extractor": self.models.agent2_model, # Phi-3.5 - structured extraction + } + return model_map.get(agent_name, self.models.fallback_model) + + def validate(self) -> list[str]: + """Validate settings and return list of errors.""" + errors = [] + + if not self.hf.hf_token: + errors.append("HF_TOKEN is required for model 
inference") + + if self.crawl.max_pages < self.crawl.min_pages: + errors.append("MAX_PAGES must be >= MIN_PAGES") + + return errors + + +# Global settings instance +settings = Settings() + + +def get_settings() -> Settings: + """Get the global settings instance.""" + return settings + + +def reload_settings() -> Settings: + """Reload settings from environment and config files.""" + global settings + settings = Settings() + return settings + + +# Convenience functions +def is_debug() -> bool: + """Check if debug mode is enabled.""" + return settings.debug + + +def get_hf_token() -> str: + """Get HuggingFace token.""" + return settings.hf.hf_token + + +def get_agent_persona(agent_name: str) -> str: + """Get persona for an agent.""" + return settings.get_agent_persona(agent_name) + + +def get_model_for_agent(agent_name: str) -> str: + """Get model ID for an agent.""" + return settings.get_model_for_agent(agent_name) diff --git a/content/LINKEDIN_POST.md b/content/LINKEDIN_POST.md new file mode 100644 index 0000000000000000000000000000000000000000..65a1ed064e0caf072d17c1a2a2ad0c6a15ac53be --- /dev/null +++ b/content/LINKEDIN_POST.md @@ -0,0 +1,40 @@ +# LinkedIn Post + +--- + +I built a system that audits any website's design system — automatically. + +Point it at a URL. It extracts every color, font, spacing value from the DOM. Then 4 AI agents analyze it like a senior design team. + +The secret? Not everything needs AI. + +Layer 1 (free, <1 second): +- WCAG contrast checker (pure math) +- Type scale detection +- Spacing grid analysis +- Color deduplication + +Layer 2 (~$0.003): +- AURORA: identifies brand colors from usage context +- ATLAS: recommends which design system to align with +- SENTINEL: prioritizes fixes by business impact +- NEXUS: synthesizes everything into a final report + +My V1 used LLMs for everything. +Cost: ~$1/run. Accuracy: mediocre (LLMs hallucinate math). + +V2 flipped the approach: +Deterministic code handles certainty. LLMs handle ambiguity. 
+ +Result: 100-300x cheaper. More accurate. Always produces output even when LLMs fail. + +The rule engine does 80% of the work for $0. +The agents handle the 20% that requires judgment. + +Built with: Playwright + HuggingFace Inference API (Qwen 72B, Llama 3.3 70B) + Gradio + Docker + +Full write-up on Medium (link in comments). + +What design workflows are you automating? Would love to hear. + +#UXDesign #AIEngineering #DesignSystems #HuggingFace #LLM #Accessibility #WCAG #MultiAgent #Gradio #BuildInPublic diff --git a/content/MEDIUM_ARTICLE.md b/content/MEDIUM_ARTICLE.md new file mode 100644 index 0000000000000000000000000000000000000000..fbf6137f2bb05ffa57ceff457ecfa0c01cf89e6a --- /dev/null +++ b/content/MEDIUM_ARTICLE.md @@ -0,0 +1,406 @@ +# 🚅 AI in My Daily Work — Episode [X]: Building a Design System Analyzer with 4 AI Agents + a Free Rule Engine + +*How I built a system that extracts any website's design tokens and audits them like a senior design team — for ~$0.003 per run.* + +[IMAGE: Hero banner — Gradio UI showing the pipeline output] + +--- + +## The Problem + +Every week, the same story. + +A designer opens a website and squints: "Is that our brand blue? Why does this button look different on mobile? How many shades of gray are we actually using?" + +Design systems are supposed to prevent this. But **auditing** one? That's a different problem entirely. + +- Open DevTools on every page +- Manually extract colors, fonts, spacing +- Cross-reference against WCAG accessibility guidelines +- Compare to industry benchmarks like Material Design or Polaris +- Write a report with prioritized recommendations + +For a 20-page website, this takes **2–3 days of manual work**. And by the time you're done, the codebase has already changed. 
+ +I wanted a system that could think like a design team: + +- a **crawler** discovering every page +- an **extractor** pulling every token from the DOM +- a **rule engine** checking accessibility and consistency — for free +- and **specialized AI agents** interpreting what the numbers actually mean + +So I built one. + +--- + +## The Solution (In One Sentence) + +I built a 4-agent system backed by a free rule engine that acts like an entire design audit team: data extraction + WCAG compliance + benchmark comparison + brand analysis + prioritized recommendations. It runs on HuggingFace Spaces, costs ~$0.003 per analysis, and delivers actionable output automatically. + +--- + +## Architecture Overview: Two Layers, Four Agents + +My first attempt (V1) made a classic mistake: +**I used a large language model for everything.** + +### Why Two Layers? + +My V1 mistake: Used GPT-4 for everything +❌ Cost: $0.50–1.00 per run +❌ Speed: 15+ seconds for basic math +❌ Accuracy: LLMs hallucinate contrast ratios + +The fix: **Not every task needs AI. Some need good engineering.** + +V2 flipped the approach. + +> **Deterministic code handles certainty. LLMs handle ambiguity.** + +This led to a two-layer architecture. 
+ +[IMAGE: Architecture diagram — Layer 1 (Deterministic) → Layer 2 (AI Agents)] + +``` +┌─────────────────────────────────────────────────┐ +│ LAYER 1: DETERMINISTIC (Free — $0.00) │ +│ ├─ Crawler + Extractor + Normalizer │ +│ ├─ WCAG Contrast Checker (math) │ +│ ├─ Type Scale Detection (ratio math) │ +│ ├─ Spacing Grid Analysis (GCD math) │ +│ └─ Color Statistics (deduplication) │ +├─────────────────────────────────────────────────┤ +│ LAYER 2: AI AGENTS (~$0.003) │ +│ ├─ AURORA — Brand Color Analyst │ +│ ├─ ATLAS — Benchmark Advisor │ +│ ├─ SENTINEL — Best Practices Auditor │ +│ └─ NEXUS — Head Synthesizer │ +└─────────────────────────────────────────────────┘ +``` + +--- + +## Layer 1: Deterministic Intelligence (No LLM) + +These agents do the heavy lifting — no LLMs involved. + +### What This Layer Does + +- Crawls every page with Playwright (desktop 1440px + mobile 375px) +- Extracts tokens from **7 sources**: DOM computed styles, CSS variables, SVG colors, inline styles, stylesheet rules, external CSS files (Firecrawl), brute-force page scan +- Deduplicates colors (exact hex + Delta-E distance) +- Checks **actual FG/BG pairs** against WCAG — not just "color vs white" +- Detects type scale ratio and spacing grid +- Scores overall consistency (0–100) + +### Rule Engine Output: + +``` +📐 TYPE SCALE ANALYSIS +├─ Detected Ratio: 1.167 +├─ Closest Standard: Minor Third (1.2) +├─ Consistent: ⚠️ No (variance: 0.24) +└─ 💡 Recommendation: 1.25 (Major Third) + +♿ ACCESSIBILITY CHECK (WCAG AA/AAA) +├─ Colors Analyzed: 210 +├─ FG/BG Pairs Checked: 220 +├─ AA Pass: 143 ✅ +├─ AA Fail (real FG/BG pairs): 67 ❌ +│ ├─ fg:#06b2c4 on bg:#ffffff → 💡 Fix: #048391 (4.5:1) +│ ├─ fg:#999999 on bg:#ffffff → 💡 Fix: #757575 (4.6:1) +│ └─ ... 
and 62 more + +📏 SPACING GRID +├─ Detected Base: 1px (GCD) +├─ Grid Aligned: ⚠️ 0% +└─ 💡 Recommendation: 8px grid + +📊 CONSISTENCY SCORE: 52/100 +``` + +This entire layer runs **in under 1 second** and costs nothing beyond compute — the single biggest cost optimization in the system. + +--- + +## Layer 2: AI Analysis & Interpretation (4 Agents) + +This is where language models actually add value — tasks that require **context, reasoning, and judgment**. + +[IMAGE: Agent pipeline diagram — AURORA → ATLAS → SENTINEL → NEXUS] + +--- + +### Agent 1: AURORA — Brand Color Analyst +**Model:** Qwen 72B (HuggingFace PRO) +**Cost:** Free within PRO subscription ($9/month) +**Temperature:** 0.4 + +**The Challenge:** The rule engine found 143 colors. Which one is the *brand* primary? + +A rule engine can count that `#06b2c4` appears in 33 buttons. But it can't reason: "33 buttons + 12 CTAs + dominant accent positioning = this is almost certainly the brand primary." That requires **context understanding**. + +**Sample Output:** + +``` +AURORA's Analysis: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +🎨 Brand Primary: #06b2c4 (confidence: HIGH) + └─ 33 buttons, 12 CTAs, dominant accent + +🎨 Brand Secondary: #373737 (confidence: HIGH) + └─ 89 text elements, consistent dark tone + +Palette Strategy: Complementary +Cohesion Score: 7/10 + └─ "Clear hierarchy, accent colors differentiated" + +Self-Evaluation: confidence=8/10, data=good +``` + +--- + +### Agent 2: ATLAS — Benchmark Advisor +**Model:** Llama 3.3 70B (128K context) +**Cost:** Free within PRO subscription +**Temperature:** 0.25 + +**Unique Capability:** Industry benchmarking against 8 design systems (Material 3, Polaris, Atlassian, Carbon, Apple HIG, Tailwind, Ant, Chakra). + +[IMAGE: Benchmark comparison table from the UI] + +This agent doesn't just pick the closest match — it reasons about **effort vs. 
value**: + +``` +ATLAS's Recommendation: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Recommended: Shopify Polaris (87% match) + +Alignment Changes: + ├─ Type scale: 1.17 → 1.25 (effort: medium) + ├─ Spacing grid: mixed → 4px (effort: high) + └─ Base size: 16px → 16px (already aligned ✅) + +Pros: Closest match, e-commerce proven, well-documented +Cons: Spacing migration is significant effort + +Alternative: Material 3 (77% match) + └─ "Stronger mobile patterns, but 8px grid + requires more restructuring" +``` + +ATLAS's Value Add: + +> "You're 87% aligned to Polaris already. Closing the gap on type scale takes ~1 hour and makes your system industry-standard. **Priority: MEDIUM.**" + +--- + +### Agent 3: SENTINEL — Best Practices Auditor +**Model:** Qwen 72B +**Cost:** Free within PRO subscription +**Temperature:** 0.2 (strict, consistent) + +**The Challenge:** The rule engine says "67 AA failures." But which ones matter most? + +SENTINEL prioritizes by **business impact** — not just severity: + +``` +SENTINEL's Priority Fixes: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Overall Score: 68/100 + +Checks: + ├─ ✅ Type Scale Standard (1.25 ratio) + ├─ ⚠️ Type Scale Consistency (variance 0.18) + ├─ ✅ Base Size Accessible (16px) + ├─ ❌ AA Compliance (67 failures) + ├─ ⚠️ Spacing Grid (0% aligned) + └─ ❌ Near-Duplicates (351 pairs) + +Priority Fixes: + #1 Fix brand color AA compliance + Impact: HIGH | Effort: 5 min + → "Affects 40% of interactive elements" + + #2 Consolidate near-duplicate colors + Impact: MEDIUM | Effort: 2 hours + + #3 Align spacing to 8px grid + Impact: MEDIUM | Effort: 1 hour +``` + +--- + +### Agent 4: NEXUS — Head Synthesizer (Final Output) +**Model:** Llama 3.3 70B (128K context) +**Cost:** ~$0.001 +**Temperature:** 0.3 + +**No AI for Agents 1–3 can replace this.** NEXUS takes outputs from ALL three agents + the rule engine and synthesizes a final recommendation — **resolving contradictions**, weighting scores, and producing the executive summary the user actually 
sees. + +If ATLAS says "close to Polaris" but SENTINEL says "spacing misaligned," NEXUS reconciles: *"Align to Polaris type scale now (low effort) but defer spacing migration (high effort)."* + +``` +NEXUS Final Synthesis: +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +📝 Executive Summary: +"Your design system scores 68/100. Critical: +67 color pairs fail AA. Top action: fix brand +primary contrast (5 min, high impact)." + +📊 Scores: + ├─ Overall: 68/100 + ├─ Accessibility: 45/100 + ├─ Consistency: 75/100 + └─ Organization: 70/100 + +🎯 Top 3 Actions: + 1. Fix brand color AA (#06b2c4 → #048391) + Impact: HIGH | Effort: 5 min + 2. Align type scale to 1.25 + Impact: MEDIUM | Effort: 1 hour + 3. Consolidate 143 → ~20 semantic colors + Impact: MEDIUM | Effort: 2 hours + +🎨 Color Recommendations: + ├─ ✅ brand.primary: #06b2c4 → #048391 (auto-accept) + ├─ ✅ text.secondary: #999999 → #757575 (auto-accept) + └─ ❌ brand.accent: #FF6B35 → #E65100 (user decides) +``` + +--- + +## Real Analysis: Two Websites + +### Website A: The Clean System + +``` +Landing → Product → Cart → Checkout +``` + +**Consistency Score:** 78/100 +**AA Failures:** 3 (all minor text colors) +**Type Scale:** 1.25 ratio, consistent across pages +**Agent Insight:** "Well-structured system. Minor AA fixes on secondary text. Already 92% aligned to Material 3." + +### Website B: The Messy System + +``` +Landing → Features → Pricing → ⚠️ Contact → Signup +``` + +**Consistency Score:** 34/100 +**AA Failures:** 67 +**Colors:** 143 unique (351 near-duplicates) +**Agent Insight:** "No clear type scale. Brand primary fails AA on every interactive element. 143 colors suggests no design system is actually enforced." + +**NEXUS's Diagnosis:** +> "This isn't a broken design system — it's the absence of one. Start with AA compliance (5 min fix), then consolidate to ~20 semantic colors (2 hours). Align to Polaris as your foundation." + +That last line is the difference between a report and an **action plan**. 
+ +--- + +## Cost & Model Strategy + +Different agents use different models — intentionally. + +[IMAGE: Cost comparison table] + +| Agent | Model | Why This Model | Cost | +|-------|-------|---------------|------| +| Rule Engine | None | Math doesn't need AI | $0.00 | +| AURORA | Qwen 72B | Creative color reasoning | ~Free (HF PRO) | +| ATLAS | Llama 3.3 70B | 128K context for benchmarks | ~Free (HF PRO) | +| SENTINEL | Qwen 72B | Strict, consistent evaluation | ~Free (HF PRO) | +| NEXUS | Llama 3.3 70B | 128K context for synthesis | ~$0.001 | +| **Total** | | | **~$0.003** | + +For designer-scale usage (weekly runs), inference costs are effectively negligible, with HuggingFace PRO ($9/month) covering most models. + +Compared to V1, this architecture delivers: +- **~100–300x cost reduction** +- **Faster execution** (rule engine: <1s vs LLM: 15s for the same math) +- **Better accuracy** (LLMs hallucinate math; rule engines don't) +- **Graceful degradation** (always produces output, even when LLMs fail) + +--- + +## Graceful Degradation + +The system **always produces output**, even when components fail: + +| If This Fails... | What Happens | +|-------------------|-------------| +| LLM agents down | Rule engine analysis still works (free) | +| Firecrawl unavailable | DOM-only extraction (slightly fewer tokens) | +| Benchmark fetch fails | Hardcoded fallback data from 8 systems | +| NEXUS synthesis fails | `create_fallback_synthesis()` from rule engine | +| **Entire AI layer** | **Full rule-engine-only report — still useful** | + +--- + +## What I Learned + +**1. Overusing LLMs is a design failure.** +If rules can do it faster and cheaper — use rules. My WCAG checker is 100% accurate. An LLM's contrast ratio calculation? Maybe 85% accurate, and 100x slower. + +**2. Industry benchmarks are gold.** +Without benchmarks: "Your type scale is inconsistent" → *PM nods* +With benchmarks: "You're 87% aligned to Shopify Polaris. 
Closing the gap takes 1 hour and makes your system industry-standard." → *PM schedules meeting* + +Time to build benchmark database: 1 day +Value: Transforms analysis into prioritized action + +**3. Specialized agents > one big prompt.** +One mega-prompt doing brand analysis + benchmark comparison + accessibility audit + synthesis = confused, unfocused output. Four agents, each with a single responsibility = sharp, reliable analysis. + +The same principle as microservices: do one thing well. + +**4. UX skills transfer directly to AI systems.** +Agent design feels a lot like service design: +- flows +- handoffs +- failure modes +- human interpretation + +The best AI architectures are the ones designed like good products. + +--- + +## A Note on the Tech Stack + +**On HuggingFace Spaces:** I'm using HF Spaces as the hosting platform with a Gradio frontend running in Docker. The LLM models (Qwen 72B, Llama 3.3 70B) are called via HuggingFace Inference API. Browser automation (Playwright + Chromium) runs inside the container. + +**On the Data:** This system works on **live websites** — point it at any URL and it extracts real design tokens from the actual DOM. No synthetic data. The architecture, LLM integrations, and rule engine are production-ready. + +🔗 **HuggingFace Space** (Live Demo): [link] + +[IMAGE: Screenshot of the Gradio UI showing full analysis results] + +--- + +## Closing Thought + +AI engineering isn't about fancy models or complex architecture. It's about knowing which problems need AI vs good engineering. + +It's **compression** — compressing days of manual audit, multiple expert perspectives, and industry benchmarking into something a team can act on Monday morning. + +Instead of 2–3 days reviewing DevTools, your team gets: +> "Top 3 issues, ranked by impact, with specific fixes, benchmark alignment, and brand color identification" + +That's AI amplifying design systems impact. 
+ +🔗 Full code on GitHub: [link] + +--- + +*This is Episode [X] of "AI in My Daily Work."* + +*If you missed the previous episodes:* +- *Episode 5: Building a 7-Agent UX Friction Analysis System in Databricks* +- *Episode 4: Automating UI Regression Testing with AI Agents (Part-1)* +- *Episode 3: Building a Multi-Agent Review Intelligence System* +- *Episode 2: How I Use a Team of AI Agents to Automate Secondary Research* + +*What problems are you automating with AI? Drop a comment — I'd love to discuss what you're building.* diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4474914eec4d2b31a00d6a8b0a84346c3a88502e --- /dev/null +++ b/core/__init__.py @@ -0,0 +1,85 @@ +""" +Core utilities for Design System Extractor v2. +""" + +from core.token_schema import ( + TokenSource, + Confidence, + Viewport, + PageType, + ColorToken, + TypographyToken, + SpacingToken, + RadiusToken, + ShadowToken, + ExtractedTokens, + NormalizedTokens, + FinalTokens, + WorkflowState, +) + +from core.color_utils import ( + parse_color, + normalize_hex, + get_contrast_ratio, + check_wcag_compliance, + generate_color_ramp, + generate_accessible_ramp, + categorize_color, + suggest_color_name, +) + +from core.rule_engine import ( + run_rule_engine, + analyze_type_scale, + analyze_accessibility, + analyze_spacing_grid, + analyze_color_statistics, + TypeScaleAnalysis, + ColorAccessibility, + SpacingGridAnalysis, + ColorStatistics, + RuleEngineResults, +) + +# HF Inference is imported lazily to avoid circular imports +# Use: from core.hf_inference import get_inference_client + +__all__ = [ + # Enums + "TokenSource", + "Confidence", + "Viewport", + "PageType", + # Token models + "ColorToken", + "TypographyToken", + "SpacingToken", + "RadiusToken", + "ShadowToken", + # Result models + "ExtractedTokens", + "NormalizedTokens", + "FinalTokens", + "WorkflowState", + # Color utilities + "parse_color", + "normalize_hex", + 
"get_contrast_ratio", + "check_wcag_compliance", + "generate_color_ramp", + "generate_accessible_ramp", + "categorize_color", + "suggest_color_name", + # Rule Engine + "run_rule_engine", + "analyze_type_scale", + "analyze_accessibility", + "analyze_spacing_grid", + "analyze_color_statistics", + "TypeScaleAnalysis", + "ColorAccessibility", + "SpacingGridAnalysis", + "ColorStatistics", + "RuleEngineResults", +] diff --git a/core/color_utils.py b/core/color_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..583f852ac9f7a90e18bf1647d7662efbcfb1d18e --- /dev/null +++ b/core/color_utils.py @@ -0,0 +1,462 @@ +""" +Color Utilities +Design System Extractor v2 + +Functions for color analysis, contrast calculation, and ramp generation. +""" + +import re +import colorsys +from typing import Optional +from dataclasses import dataclass + + +# ============================================================================= +# COLOR PARSING +# ============================================================================= + +@dataclass +class ParsedColor: + """Parsed color with multiple representations.""" + hex: str + rgb: tuple[int, int, int] + hsl: tuple[float, float, float] + oklch: Optional[tuple[float, float, float]] = None + + +def hex_to_rgb(hex_color: str) -> tuple[int, int, int]: + """Convert hex color to RGB tuple.""" + hex_color = hex_color.lstrip("#") + if len(hex_color) == 3: + hex_color = "".join([c * 2 for c in hex_color]) + return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) + + +def rgb_to_hex(r: int, g: int, b: int) -> str: + """Convert RGB to hex color.""" + return f"#{r:02x}{g:02x}{b:02x}" + + +def rgb_to_hsl(r: int, g: int, b: int) -> tuple[float, float, float]: + """Convert RGB to HSL.""" + r_norm, g_norm, b_norm = r / 255, g / 255, b / 255 + h, l, s = colorsys.rgb_to_hls(r_norm, g_norm, b_norm) + return (h * 360, s * 100, l * 100) + + +def hsl_to_rgb(h: float, s: float, l: float) -> tuple[int, int, int]: + """Convert HSL to 
RGB.""" + h_norm, s_norm, l_norm = h / 360, s / 100, l / 100 + r, g, b = colorsys.hls_to_rgb(h_norm, l_norm, s_norm) + return (int(r * 255), int(g * 255), int(b * 255)) + + +def parse_color(color_string: str) -> Optional[ParsedColor]: + """ + Parse any CSS color format to ParsedColor. + + Supports: + - Hex: #fff, #ffffff + - RGB: rgb(255, 255, 255) + - RGBA: rgba(255, 255, 255, 0.5) + - HSL: hsl(0, 100%, 50%) + """ + color_string = color_string.strip().lower() + + # Hex format + if color_string.startswith("#"): + hex_color = color_string + if len(hex_color) == 4: + hex_color = f"#{hex_color[1]*2}{hex_color[2]*2}{hex_color[3]*2}" + try: + rgb = hex_to_rgb(hex_color) + hsl = rgb_to_hsl(*rgb) + return ParsedColor(hex=hex_color, rgb=rgb, hsl=hsl) + except ValueError: + return None + + # RGB/RGBA format + rgb_match = re.match(r"rgba?\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)", color_string) + if rgb_match: + r, g, b = int(rgb_match.group(1)), int(rgb_match.group(2)), int(rgb_match.group(3)) + hex_color = rgb_to_hex(r, g, b) + hsl = rgb_to_hsl(r, g, b) + return ParsedColor(hex=hex_color, rgb=(r, g, b), hsl=hsl) + + # HSL format + hsl_match = re.match(r"hsl\s*\(\s*(\d+)\s*,\s*(\d+)%?\s*,\s*(\d+)%?", color_string) + if hsl_match: + h, s, l = float(hsl_match.group(1)), float(hsl_match.group(2)), float(hsl_match.group(3)) + rgb = hsl_to_rgb(h, s, l) + hex_color = rgb_to_hex(*rgb) + return ParsedColor(hex=hex_color, rgb=rgb, hsl=(h, s, l)) + + return None + + +def normalize_hex(color: str) -> str: + """Normalize hex color to lowercase 6-digit format.""" + parsed = parse_color(color) + return parsed.hex if parsed else color + + +# ============================================================================= +# CONTRAST CALCULATIONS (WCAG) +# ============================================================================= + +def get_luminance(r: int, g: int, b: int) -> float: + """ + Calculate relative luminance according to WCAG 2.1. 
+ + Formula: L = 0.2126 * R + 0.7152 * G + 0.0722 * B + where R, G, B are linearized values. + """ + def linearize(c: int) -> float: + c_norm = c / 255 + if c_norm <= 0.04045: + return c_norm / 12.92 + return ((c_norm + 0.055) / 1.055) ** 2.4 + + return 0.2126 * linearize(r) + 0.7152 * linearize(g) + 0.0722 * linearize(b) + + +def get_contrast_ratio(color1: str, color2: str) -> float: + """ + Calculate WCAG contrast ratio between two colors. + + Returns ratio from 1:1 to 21:1 + """ + parsed1 = parse_color(color1) + parsed2 = parse_color(color2) + + if not parsed1 or not parsed2: + return 1.0 + + l1 = get_luminance(*parsed1.rgb) + l2 = get_luminance(*parsed2.rgb) + + lighter = max(l1, l2) + darker = min(l1, l2) + + return (lighter + 0.05) / (darker + 0.05) + + +def check_wcag_compliance(foreground: str, background: str) -> dict: + """ + Check WCAG compliance for a color pair. + + Returns dict with AA and AAA compliance for normal and large text. + """ + ratio = get_contrast_ratio(foreground, background) + + return { + "contrast_ratio": round(ratio, 2), + "aa_normal_text": ratio >= 4.5, # AA for normal text + "aa_large_text": ratio >= 3.0, # AA for large text (18pt+ or 14pt+ bold) + "aaa_normal_text": ratio >= 7.0, # AAA for normal text + "aaa_large_text": ratio >= 4.5, # AAA for large text + } + + +def get_contrast_with_white(color: str) -> float: + """Get contrast ratio against white.""" + return get_contrast_ratio(color, "#ffffff") + + +def get_contrast_with_black(color: str) -> float: + """Get contrast ratio against black.""" + return get_contrast_ratio(color, "#000000") + + +def get_best_text_color(background: str) -> str: + """Determine whether white or black text works better on a background.""" + white_contrast = get_contrast_with_white(background) + black_contrast = get_contrast_with_black(background) + return "#ffffff" if white_contrast > black_contrast else "#000000" + + +# ============================================================================= +# 
COLOR SIMILARITY & DEDUPLICATION +# ============================================================================= + +def color_distance(color1: str, color2: str) -> float: + """ + Calculate perceptual distance between two colors. + + Uses simple Euclidean distance in RGB space. + For more accuracy, consider using CIE Lab Delta E. + """ + parsed1 = parse_color(color1) + parsed2 = parse_color(color2) + + if not parsed1 or not parsed2: + return float("inf") + + r1, g1, b1 = parsed1.rgb + r2, g2, b2 = parsed2.rgb + + return ((r1 - r2) ** 2 + (g1 - g2) ** 2 + (b1 - b2) ** 2) ** 0.5 + + +def are_colors_similar(color1: str, color2: str, threshold: float = 10.0) -> bool: + """Check if two colors are perceptually similar.""" + return color_distance(color1, color2) < threshold + + +def find_duplicate_colors(colors: list[str], threshold: float = 10.0) -> list[tuple[str, str]]: + """ + Find pairs of colors that are potentially duplicates. + + Returns list of (color1, color2) tuples. + """ + duplicates = [] + normalized = [normalize_hex(c) for c in colors] + + for i, color1 in enumerate(normalized): + for color2 in normalized[i + 1:]: + if are_colors_similar(color1, color2, threshold): + duplicates.append((color1, color2)) + + return duplicates + + +def deduplicate_colors(colors: list[str], threshold: float = 10.0) -> list[str]: + """ + Remove duplicate colors, keeping the first occurrence. 
+ """ + result = [] + normalized = [normalize_hex(c) for c in colors] + + for color in normalized: + is_duplicate = False + for existing in result: + if are_colors_similar(color, existing, threshold): + is_duplicate = True + break + if not is_duplicate: + result.append(color) + + return result + + +# ============================================================================= +# COLOR RAMP GENERATION +# ============================================================================= + +def generate_color_ramp( + base_color: str, + shades: list[int] = None, + method: str = "hsl" +) -> dict[str, str]: + """ + Generate a color ramp from a base color. + + Args: + base_color: The base color (typically becomes 500) + shades: List of shade values (default: [50, 100, 200, 300, 400, 500, 600, 700, 800, 900]) + method: "hsl" (simple) or "oklch" (perceptually uniform, not implemented) + + Returns: + Dict mapping shade to hex color. + """ + if shades is None: + shades = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900] + + parsed = parse_color(base_color) + if not parsed: + return {} + + h, s, l = parsed.hsl + ramp = {} + + # Base color is 500 + base_shade = 500 + + for shade in shades: + if shade == base_shade: + ramp[str(shade)] = parsed.hex + continue + + # Calculate lightness adjustment + # Lighter shades (50-400): increase lightness + # Darker shades (600-900): decrease lightness + if shade < base_shade: + # Lighter: interpolate toward white + factor = (base_shade - shade) / base_shade + new_l = l + (100 - l) * factor * 0.85 + # Slightly reduce saturation for very light shades + new_s = s * (1 - factor * 0.3) + else: + # Darker: interpolate toward black + factor = (shade - base_shade) / (900 - base_shade) + new_l = l * (1 - factor * 0.85) + # Increase saturation slightly for dark shades + new_s = min(100, s * (1 + factor * 0.2)) + + new_rgb = hsl_to_rgb(h, new_s, new_l) + ramp[str(shade)] = rgb_to_hex(*new_rgb) + + return ramp + + +def generate_accessible_ramp( + 
base_color: str, + background: str = "#ffffff" +) -> dict[str, dict]: + """ + Generate a color ramp with accessibility information. + + Returns dict with color values and their contrast ratios. + """ + ramp = generate_color_ramp(base_color) + result = {} + + for shade, color in ramp.items(): + compliance = check_wcag_compliance(color, background) + result[shade] = { + "value": color, + "contrast_ratio": compliance["contrast_ratio"], + "aa_text": compliance["aa_normal_text"], + "best_text_color": get_best_text_color(color), + } + + return result + + +# ============================================================================= +# COLOR CATEGORIZATION +# ============================================================================= + +def categorize_color(color: str) -> str: + """ + Categorize a color by its general hue. + + Returns: "red", "orange", "yellow", "green", "cyan", "blue", "purple", "pink", "neutral" + """ + parsed = parse_color(color) + if not parsed: + return "unknown" + + h, s, l = parsed.hsl + + # Neutrals (low saturation or extreme lightness) + if s < 10 or l < 5 or l > 95: + return "neutral" + + # Categorize by hue + if h < 15 or h >= 345: + return "red" + elif h < 45: + return "orange" + elif h < 70: + return "yellow" + elif h < 150: + return "green" + elif h < 190: + return "cyan" + elif h < 260: + return "blue" + elif h < 290: + return "purple" + else: + return "pink" + + +def suggest_color_name(color: str, role: str = None) -> str: + """ + Suggest a semantic name for a color. + + Args: + color: The color value + role: Optional role hint ("primary", "background", "text", etc.) 
+ + Returns suggested name like "blue-500" or "neutral-100" + """ + parsed = parse_color(color) + if not parsed: + return "unknown" + + category = categorize_color(color) + h, s, l = parsed.hsl + + # Determine shade level based on lightness + if l >= 95: + shade = "50" + elif l >= 85: + shade = "100" + elif l >= 75: + shade = "200" + elif l >= 65: + shade = "300" + elif l >= 55: + shade = "400" + elif l >= 45: + shade = "500" + elif l >= 35: + shade = "600" + elif l >= 25: + shade = "700" + elif l >= 15: + shade = "800" + else: + shade = "900" + + return f"{category}-{shade}" + + +def group_colors_by_category(colors: list[str]) -> dict[str, list[str]]: + """ + Group colors by their category. + + Returns dict mapping category to list of colors. + """ + groups: dict[str, list[str]] = {} + + for color in colors: + category = categorize_color(color) + if category not in groups: + groups[category] = [] + groups[category].append(normalize_hex(color)) + + return groups + + +# ============================================================================= +# UTILITY FUNCTIONS +# ============================================================================= + +def sort_colors_by_hue(colors: list[str]) -> list[str]: + """Sort colors by hue, then by lightness.""" + def sort_key(color: str): + parsed = parse_color(color) + if not parsed: + return (0, 0, 0) + h, s, l = parsed.hsl + # Neutrals (low saturation) go at the end + if s < 10: + return (360 + l, s, l) + return (h, s, l) + + return sorted([normalize_hex(c) for c in colors], key=sort_key) + + +def sort_colors_by_lightness(colors: list[str]) -> list[str]: + """Sort colors from light to dark.""" + def sort_key(color: str): + parsed = parse_color(color) + return -parsed.hsl[2] if parsed else 0 + + return sorted([normalize_hex(c) for c in colors], key=sort_key) + + +def is_dark_color(color: str) -> bool: + """Check if a color is considered dark (for text on backgrounds).""" + parsed = parse_color(color) + if not parsed: + 
return False + return parsed.hsl[2] < 50 + + +def is_light_color(color: str) -> bool: + """Check if a color is considered light.""" + return not is_dark_color(color) diff --git a/core/hf_inference.py b/core/hf_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..348e5adc86b6973cae0e5dd24a797f15dd2b1444 --- /dev/null +++ b/core/hf_inference.py @@ -0,0 +1,608 @@ +""" +HuggingFace Inference Client +Design System Extractor v2 + +Handles all LLM inference calls using HuggingFace Inference API. +Supports diverse models from different providers for specialized tasks. +""" + +import os +from typing import Optional, AsyncGenerator +from dataclasses import dataclass +from huggingface_hub import InferenceClient, AsyncInferenceClient + +from config.settings import get_settings + + +@dataclass +class ModelInfo: + """Information about a model.""" + model_id: str + provider: str + context_length: int + strengths: list[str] + best_for: str + tier: str # "free", "pro", "pro+" + + +# ============================================================================= +# COMPREHENSIVE MODEL REGISTRY — Organized by Provider +# ============================================================================= + +AVAILABLE_MODELS = { + # ========================================================================= + # META — Llama Family (Best for reasoning) + # ========================================================================= + "meta-llama/Llama-3.1-405B-Instruct": ModelInfo( + model_id="meta-llama/Llama-3.1-405B-Instruct", + provider="Meta", + context_length=128000, + strengths=["Best reasoning", "Massive knowledge", "Complex analysis"], + best_for="Agent 3 (Advisor) — PREMIUM CHOICE", + tier="pro+" + ), + "meta-llama/Llama-3.1-70B-Instruct": ModelInfo( + model_id="meta-llama/Llama-3.1-70B-Instruct", + provider="Meta", + context_length=128000, + strengths=["Excellent reasoning", "Long context", "Design knowledge"], + best_for="Agent 3 (Advisor) — RECOMMENDED", + 
tier="pro" + ), + "meta-llama/Llama-3.1-8B-Instruct": ModelInfo( + model_id="meta-llama/Llama-3.1-8B-Instruct", + provider="Meta", + context_length=128000, + strengths=["Fast", "Good reasoning for size", "Long context"], + best_for="Budget Agent 3 fallback", + tier="free" + ), + + # ========================================================================= + # MISTRAL — European Excellence + # ========================================================================= + "mistralai/Mixtral-8x22B-Instruct-v0.1": ModelInfo( + model_id="mistralai/Mixtral-8x22B-Instruct-v0.1", + provider="Mistral", + context_length=65536, + strengths=["Large MoE", "Strong reasoning", "Efficient"], + best_for="Agent 3 (Advisor) — Pro alternative", + tier="pro" + ), + "mistralai/Mixtral-8x7B-Instruct-v0.1": ModelInfo( + model_id="mistralai/Mixtral-8x7B-Instruct-v0.1", + provider="Mistral", + context_length=32768, + strengths=["Good MoE efficiency", "Solid reasoning"], + best_for="Agent 3 (Advisor) — Free tier option", + tier="free" + ), + "mistralai/Mistral-7B-Instruct-v0.3": ModelInfo( + model_id="mistralai/Mistral-7B-Instruct-v0.3", + provider="Mistral", + context_length=32768, + strengths=["Fast", "Good instruction following"], + best_for="General fallback", + tier="free" + ), + "mistralai/Codestral-22B-v0.1": ModelInfo( + model_id="mistralai/Codestral-22B-v0.1", + provider="Mistral", + context_length=32768, + strengths=["Code specialist", "JSON generation", "Structured output"], + best_for="Agent 4 (Generator) — RECOMMENDED", + tier="pro" + ), + + # ========================================================================= + # COHERE — Command R Family (Analysis & Retrieval) + # ========================================================================= + "CohereForAI/c4ai-command-r-plus": ModelInfo( + model_id="CohereForAI/c4ai-command-r-plus", + provider="Cohere", + context_length=128000, + strengths=["Excellent analysis", "RAG optimized", "Long context"], + best_for="Agent 3 (Advisor) — 
Great for research tasks", + tier="pro" + ), + "CohereForAI/c4ai-command-r-v01": ModelInfo( + model_id="CohereForAI/c4ai-command-r-v01", + provider="Cohere", + context_length=128000, + strengths=["Good analysis", "Efficient"], + best_for="Agent 3 budget option", + tier="free" + ), + + # ========================================================================= + # GOOGLE — Gemma Family + # ========================================================================= + "google/gemma-2-27b-it": ModelInfo( + model_id="google/gemma-2-27b-it", + provider="Google", + context_length=8192, + strengths=["Strong instruction following", "Good balance"], + best_for="Agent 2 (Normalizer) — Quality option", + tier="pro" + ), + "google/gemma-2-9b-it": ModelInfo( + model_id="google/gemma-2-9b-it", + provider="Google", + context_length=8192, + strengths=["Fast", "Good instruction following"], + best_for="Agent 2 (Normalizer) — Balanced", + tier="free" + ), + + # ========================================================================= + # MICROSOFT — Phi Family (Small but Mighty) + # ========================================================================= + "microsoft/Phi-3.5-mini-instruct": ModelInfo( + model_id="microsoft/Phi-3.5-mini-instruct", + provider="Microsoft", + context_length=128000, + strengths=["Very fast", "Great structured output", "Long context"], + best_for="Agent 2 (Normalizer) — RECOMMENDED", + tier="free" + ), + "microsoft/Phi-3-medium-4k-instruct": ModelInfo( + model_id="microsoft/Phi-3-medium-4k-instruct", + provider="Microsoft", + context_length=4096, + strengths=["Fast", "Good for simple tasks"], + best_for="Simple naming tasks", + tier="free" + ), + + # ========================================================================= + # QWEN — Alibaba Family + # ========================================================================= + "Qwen/Qwen2.5-72B-Instruct": ModelInfo( + model_id="Qwen/Qwen2.5-72B-Instruct", + provider="Alibaba", + context_length=32768, + 
strengths=["Strong reasoning", "Multilingual", "Good design knowledge"], + best_for="Agent 3 (Advisor) — Alternative", + tier="pro" + ), + "Qwen/Qwen2.5-32B-Instruct": ModelInfo( + model_id="Qwen/Qwen2.5-32B-Instruct", + provider="Alibaba", + context_length=32768, + strengths=["Good balance", "Multilingual"], + best_for="Medium-tier option", + tier="pro" + ), + "Qwen/Qwen2.5-Coder-32B-Instruct": ModelInfo( + model_id="Qwen/Qwen2.5-Coder-32B-Instruct", + provider="Alibaba", + context_length=32768, + strengths=["Code specialist", "JSON/structured output"], + best_for="Agent 4 (Generator) — Alternative", + tier="pro" + ), + "Qwen/Qwen2.5-7B-Instruct": ModelInfo( + model_id="Qwen/Qwen2.5-7B-Instruct", + provider="Alibaba", + context_length=32768, + strengths=["Fast", "Good all-rounder"], + best_for="General fallback", + tier="free" + ), + + # ========================================================================= + # DEEPSEEK — Code Specialists + # ========================================================================= + "deepseek-ai/deepseek-coder-33b-instruct": ModelInfo( + model_id="deepseek-ai/deepseek-coder-33b-instruct", + provider="DeepSeek", + context_length=16384, + strengths=["Excellent code generation", "JSON specialist"], + best_for="Agent 4 (Generator) — Code focused", + tier="pro" + ), + "deepseek-ai/DeepSeek-V2.5": ModelInfo( + model_id="deepseek-ai/DeepSeek-V2.5", + provider="DeepSeek", + context_length=32768, + strengths=["Strong reasoning", "Good code"], + best_for="Multi-purpose", + tier="pro" + ), + + # ========================================================================= + # BIGCODE — StarCoder Family + # ========================================================================= + "bigcode/starcoder2-15b-instruct-v0.1": ModelInfo( + model_id="bigcode/starcoder2-15b-instruct-v0.1", + provider="BigCode", + context_length=16384, + strengths=["Code generation", "Multiple languages"], + best_for="Agent 4 (Generator) — Open source code model", + 
tier="free" + ), +} + + +# ============================================================================= +# RECOMMENDED CONFIGURATIONS BY TIER +# ============================================================================= + +MODEL_PRESETS = { + "budget": { + "name": "Budget (Free Tier)", + "description": "Best free models for each task", + "agent2": "microsoft/Phi-3.5-mini-instruct", + "agent3": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "agent4": "bigcode/starcoder2-15b-instruct-v0.1", + "fallback": "mistralai/Mistral-7B-Instruct-v0.3", + }, + "balanced": { + "name": "Balanced (Pro Tier)", + "description": "Good quality/cost balance", + "agent2": "google/gemma-2-9b-it", + "agent3": "meta-llama/Llama-3.1-70B-Instruct", + "agent4": "mistralai/Codestral-22B-v0.1", + "fallback": "Qwen/Qwen2.5-7B-Instruct", + }, + "quality": { + "name": "Maximum Quality (Pro+)", + "description": "Best models regardless of cost", + "agent2": "google/gemma-2-27b-it", + "agent3": "meta-llama/Llama-3.1-405B-Instruct", + "agent4": "deepseek-ai/deepseek-coder-33b-instruct", + "fallback": "meta-llama/Llama-3.1-8B-Instruct", + }, + "diverse": { + "name": "Diverse Providers", + "description": "One model from each major provider", + "agent2": "microsoft/Phi-3.5-mini-instruct", # Microsoft + "agent3": "CohereForAI/c4ai-command-r-plus", # Cohere + "agent4": "mistralai/Codestral-22B-v0.1", # Mistral + "fallback": "meta-llama/Llama-3.1-8B-Instruct", # Meta + }, +} + + +# ============================================================================= +# AGENT-SPECIFIC RECOMMENDATIONS +# ============================================================================= + +AGENT_MODEL_RECOMMENDATIONS = { + "crawler": { + "requires_llm": False, + "notes": "Pure rule-based extraction using Playwright + CSS parsing" + }, + "extractor": { + "requires_llm": False, + "notes": "Pure rule-based extraction using Playwright + CSS parsing" + }, + "normalizer": { + "requires_llm": True, + "task": "Token naming, 
duplicate detection, pattern inference", + "needs": ["Fast inference", "Good instruction following", "Structured output"], + "recommended": [ + ("microsoft/Phi-3.5-mini-instruct", "BEST — Fast, great structured output"), + ("google/gemma-2-9b-it", "Good balance of speed and quality"), + ("Qwen/Qwen2.5-7B-Instruct", "Reliable all-rounder"), + ], + "temperature": 0.2, + }, + "advisor": { + "requires_llm": True, + "task": "Design system analysis, best practice recommendations", + "needs": ["Strong reasoning", "Design knowledge", "Creative suggestions"], + "recommended": [ + ("meta-llama/Llama-3.1-70B-Instruct", "BEST — Excellent reasoning"), + ("CohereForAI/c4ai-command-r-plus", "Great for analysis tasks"), + ("Qwen/Qwen2.5-72B-Instruct", "Strong alternative"), + ("mistralai/Mixtral-8x7B-Instruct-v0.1", "Best free option"), + ], + "temperature": 0.4, + }, + "generator": { + "requires_llm": True, + "task": "Generate JSON tokens, CSS variables, structured output", + "needs": ["Code generation", "JSON formatting", "Schema adherence"], + "recommended": [ + ("mistralai/Codestral-22B-v0.1", "BEST — Mistral's code model"), + ("deepseek-ai/deepseek-coder-33b-instruct", "Excellent code specialist"), + ("Qwen/Qwen2.5-Coder-32B-Instruct", "Strong code model"), + ("bigcode/starcoder2-15b-instruct-v0.1", "Best free option"), + ], + "temperature": 0.1, + }, +} + + +# ============================================================================= +# INFERENCE CLIENT +# ============================================================================= + +class HFInferenceClient: + """ + Wrapper around HuggingFace Inference API. + + Handles model selection, retries, and fallbacks. + """ + + def __init__(self): + self.settings = get_settings() + # Read token fresh from env — the Settings singleton may have been + # created before the user entered their token via the Gradio UI. 
+ self.token = os.getenv("HF_TOKEN", "") or self.settings.hf.hf_token + + if not self.token: + raise ValueError("HF_TOKEN is required for inference") + + # Let huggingface_hub route to the best available provider automatically. + # Do NOT set base_url (overrides per-model routing) or + # provider="hf-inference" (that provider no longer hosts most models). + # The default provider="auto" picks the first available third-party + # provider (novita, together, cerebras, etc.) for each model. + self.sync_client = InferenceClient(token=self.token) + self.async_client = AsyncInferenceClient(token=self.token) + + def get_model_for_agent(self, agent_name: str) -> str: + """Get the appropriate model for an agent.""" + return self.settings.get_model_for_agent(agent_name) + + def get_temperature_for_agent(self, agent_name: str) -> float: + """Get recommended temperature for an agent.""" + temps = { + # Legacy agents + "normalizer": 0.2, # Consistent naming + "advisor": 0.4, # Creative recommendations + "generator": 0.1, # Precise formatting + # Stage 2 agents — tuned per persona + "brand_identifier": 0.4, # AURORA — creative color reasoning + "benchmark_advisor": 0.25, # ATLAS — analytical comparison + "best_practices_validator": 0.2, # SENTINEL — precise rule-checking + "head_synthesizer": 0.3, # NEXUS — balanced synthesis + } + return temps.get(agent_name, 0.3) + + def _build_messages( + self, + system_prompt: str, + user_message: str, + examples: list[dict] = None + ) -> list[dict]: + """Build message list for chat completion.""" + messages = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + if examples: + for example in examples: + messages.append({"role": "user", "content": example["user"]}) + messages.append({"role": "assistant", "content": example["assistant"]}) + + messages.append({"role": "user", "content": user_message}) + + return messages + + def complete( + self, + agent_name: str, + system_prompt: str, + user_message: 
str, + examples: list[dict] = None, + max_tokens: int = None, + temperature: float = None, + json_mode: bool = False, + ) -> str: + """ + Synchronous completion. + + Args: + agent_name: Which agent is making the call (for model selection) + system_prompt: System instructions + user_message: User input + examples: Optional few-shot examples + max_tokens: Max tokens to generate + temperature: Sampling temperature (uses agent default if not specified) + json_mode: If True, instruct model to output JSON + + Returns: + Generated text + """ + model = self.get_model_for_agent(agent_name) + max_tokens = max_tokens or self.settings.hf.max_new_tokens + temperature = temperature or self.get_temperature_for_agent(agent_name) + + # Build messages + if json_mode: + system_prompt = f"{system_prompt}\n\nYou must respond with valid JSON only. No markdown, no explanation, just JSON." + + messages = self._build_messages(system_prompt, user_message, examples) + + try: + response = self.sync_client.chat_completion( + model=model, + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + ) + return response.choices[0].message.content + + except Exception as e: + error_msg = str(e) + print(f"[HF] Primary model {model} failed: {error_msg[:120]}") + fallback = self.settings.models.fallback_model + if fallback and fallback != model: + print(f"[HF] Trying fallback: {fallback}") + try: + response = self.sync_client.chat_completion( + model=fallback, + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + ) + return response.choices[0].message.content + except Exception as fallback_err: + print(f"[HF] Fallback {fallback} also failed: {str(fallback_err)[:120]}") + raise fallback_err + raise e + + async def complete_async( + self, + agent_name: str, + system_prompt: str, + user_message: str, + examples: list[dict] = None, + max_tokens: int = None, + temperature: float = None, + json_mode: bool = False, + ) -> str: + """ + Asynchronous completion. 
+ + Same parameters as complete(). + """ + model = self.get_model_for_agent(agent_name) + max_tokens = max_tokens or self.settings.hf.max_new_tokens + temperature = temperature or self.get_temperature_for_agent(agent_name) + + if json_mode: + system_prompt = f"{system_prompt}\n\nYou must respond with valid JSON only. No markdown, no explanation, just JSON." + + messages = self._build_messages(system_prompt, user_message, examples) + + try: + response = await self.async_client.chat_completion( + model=model, + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + ) + return response.choices[0].message.content + + except Exception as e: + error_msg = str(e) + print(f"[HF] Primary model {model} failed: {error_msg[:120]}") + fallback = self.settings.models.fallback_model + if fallback and fallback != model: + print(f"[HF] Trying fallback: {fallback}") + try: + response = await self.async_client.chat_completion( + model=fallback, + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + ) + return response.choices[0].message.content + except Exception as fallback_err: + print(f"[HF] Fallback {fallback} also failed: {str(fallback_err)[:120]}") + raise fallback_err + raise e + + async def stream_async( + self, + agent_name: str, + system_prompt: str, + user_message: str, + max_tokens: int = None, + temperature: float = None, + ) -> AsyncGenerator[str, None]: + """ + Async streaming completion. + + Yields tokens as they are generated. 
+ """ + model = self.get_model_for_agent(agent_name) + max_tokens = max_tokens or self.settings.hf.max_new_tokens + temperature = temperature or self.get_temperature_for_agent(agent_name) + + messages = self._build_messages(system_prompt, user_message) + + async for chunk in await self.async_client.chat_completion( + model=model, + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + stream=True, + ): + if chunk.choices[0].delta.content: + yield chunk.choices[0].delta.content + + +# ============================================================================= +# SINGLETON & CONVENIENCE FUNCTIONS +# ============================================================================= + +_client: Optional[HFInferenceClient] = None + + +def get_inference_client() -> HFInferenceClient: + """Get or create the inference client singleton. + + Re-creates the client if the token has changed (e.g. user entered it + via the Gradio UI after initial startup). + """ + global _client + current_token = os.getenv("HF_TOKEN", "") + if _client is None or (_client.token != current_token and current_token): + _client = HFInferenceClient() + return _client + + +def complete( + agent_name: str, + system_prompt: str, + user_message: str, + **kwargs +) -> str: + """Convenience function for sync completion.""" + client = get_inference_client() + return client.complete(agent_name, system_prompt, user_message, **kwargs) + + +async def complete_async( + agent_name: str, + system_prompt: str, + user_message: str, + **kwargs +) -> str: + """Convenience function for async completion.""" + client = get_inference_client() + return await client.complete_async(agent_name, system_prompt, user_message, **kwargs) + + +def get_model_info(model_id: str) -> dict: + """Get information about a specific model.""" + if model_id in AVAILABLE_MODELS: + info = AVAILABLE_MODELS[model_id] + return { + "model_id": info.model_id, + "provider": info.provider, + "context_length": info.context_length, + 
"strengths": info.strengths, + "best_for": info.best_for, + "tier": info.tier, + } + return {"model_id": model_id, "provider": "unknown"} + + +def get_models_by_provider() -> dict[str, list[str]]: + """Get all models grouped by provider.""" + by_provider = {} + for model_id, info in AVAILABLE_MODELS.items(): + if info.provider not in by_provider: + by_provider[info.provider] = [] + by_provider[info.provider].append(model_id) + return by_provider + + +def get_models_by_tier(tier: str) -> list[str]: + """Get all models for a specific tier (free, pro, pro+).""" + return [ + model_id for model_id, info in AVAILABLE_MODELS.items() + if info.tier == tier + ] + + +def get_preset_config(preset_name: str) -> dict: + """Get a preset model configuration.""" + return MODEL_PRESETS.get(preset_name, MODEL_PRESETS["balanced"]) diff --git a/core/logging.py b/core/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..8d580889199c8bf4dfb73faecbac247f9b0c671b --- /dev/null +++ b/core/logging.py @@ -0,0 +1,70 @@ +""" +Structured Logging for Design System Extractor +================================================ + +Provides consistent logging across the application using loguru. +Falls back to standard logging if loguru is not available. 
+""" + +import sys +from typing import Optional + +try: + from loguru import logger as _loguru_logger + + # Remove default handler + _loguru_logger.remove() + + # Add structured console handler + _loguru_logger.add( + sys.stderr, + format="{time:HH:mm:ss} | {level: <8} | {extra[module]} | {message}", + level="INFO", + colorize=True, + ) + + # Add file handler for debugging (rotated) + _loguru_logger.add( + "logs/extractor_{time:YYYY-MM-DD}.log", + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {extra[module]} | {message}", + level="DEBUG", + rotation="10 MB", + retention="7 days", + compression="gz", + catch=True, # Don't crash on log errors + ) + + HAS_LOGURU = True + +except ImportError: + import logging + + HAS_LOGURU = False + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s", + datefmt="%H:%M:%S", + ) + + +def get_logger(module_name: str = "app"): + """ + Get a logger instance for a specific module. + + Args: + module_name: Name of the module (e.g., 'rule_engine', 'aurora', 'app') + + Returns: + Logger instance with module context + """ + if HAS_LOGURU: + return _loguru_logger.bind(module=module_name) + else: + return logging.getLogger(module_name) + + +# Pre-configured loggers for common modules +app_logger = get_logger("app") +rule_engine_logger = get_logger("rule_engine") +agent_logger = get_logger("agents") +extraction_logger = get_logger("extraction") diff --git a/core/preview_generator.py b/core/preview_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..912d4027b4722f2820ef81ac82605dd1fa7ce0f1 --- /dev/null +++ b/core/preview_generator.py @@ -0,0 +1,1534 @@ +""" +Preview Generator for Typography and Color Previews + +Generates HTML previews for: +1. Typography - Actual font rendering with detected styles +2. Colors AS-IS - Simple swatches showing extracted colors (Stage 1) +3. Color Ramps - 11 shades (50-950) with AA compliance (Stage 2) +4. 
Spacing AS-IS - Visual spacing blocks +5. Radius AS-IS - Rounded corner examples +6. Shadows AS-IS - Shadow examples +""" + +from typing import Optional +import colorsys +import re + + +# ============================================================================= +# STAGE 1: AS-IS PREVIEWS (No enhancements, just raw extracted values) +# ============================================================================= + +def generate_colors_asis_preview_html( + color_tokens: dict, + background: str = "#FAFAFA", + max_colors: int = 50 +) -> str: + """ + Generate HTML preview for AS-IS colors (Stage 1). + + Shows simple color swatches without generated ramps. + Sorted by frequency (most used first). + + Args: + color_tokens: Dict of colors {name: {value: "#hex", ...}} + background: Background color + max_colors: Maximum colors to display (default 50) + + Returns: + HTML string for Gradio HTML component + """ + + # Sort by frequency (highest first) + sorted_tokens = [] + for name, token in color_tokens.items(): + if isinstance(token, dict): + freq = token.get("frequency", 0) + else: + freq = 0 + sorted_tokens.append((name, token, freq)) + + sorted_tokens.sort(key=lambda x: -x[2]) # Descending by frequency + + rows_html = "" + + for name, token, freq in sorted_tokens[:max_colors]: + # Get hex value + if isinstance(token, dict): + hex_val = token.get("value", "#888888") + frequency = token.get("frequency", 0) + contexts = token.get("contexts", []) + contrast_white = token.get("contrast_white", 0) + contrast_black = token.get("contrast_black", 0) + else: + hex_val = str(token) + frequency = 0 + contexts = [] + contrast_white = 0 + contrast_black = 0 + + # Clean up hex + if not hex_val.startswith("#"): + hex_val = f"#{hex_val}" + + # Determine text color based on background luminance + # Use contrast ratios to pick best text color + text_color = "#1a1a1a" if contrast_white and contrast_white < 4.5 else "#ffffff" + if not contrast_white: + # Fallback: calculate from hex + 
try: + r = int(hex_val[1:3], 16) + g = int(hex_val[3:5], 16) + b = int(hex_val[5:7], 16) + luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255 + text_color = "#1a1a1a" if luminance > 0.5 else "#ffffff" + except: + text_color = "#1a1a1a" + + # Clean name + display_name = name.replace("_", " ").replace("-", " ").replace(".", " ").title() + if len(display_name) > 25: + display_name = display_name[:22] + "..." + + # AA compliance check + aa_status = "✓ AA" if contrast_white and contrast_white >= 4.5 else "✗ AA" if contrast_white else "" + aa_class = "aa-pass" if contrast_white and contrast_white >= 4.5 else "aa-fail" + + # Context badges (limit to 3) + context_html = "" + for ctx in contexts[:3]: + ctx_display = ctx[:12] + "..." if len(ctx) > 12 else ctx + context_html += f'{ctx_display}' + + rows_html += f''' +
+
+ {hex_val} +
+
+
{display_name}
+
+ Used {frequency}x + {aa_status} +
+
+ {context_html} +
+
+
+ ''' + + # Show count info + total_colors = len(color_tokens) + showing = min(max_colors, total_colors) + count_info = f"Showing {showing} of {total_colors} colors (sorted by frequency)" + + html = f''' + + +
{count_info}
+
+ {rows_html} +
+ ''' + + return html + + +def generate_spacing_asis_preview_html( + spacing_tokens: dict, + background: str = "#FAFAFA" +) -> str: + """ + Generate HTML preview for AS-IS spacing (Stage 1). + + Shows visual blocks representing each spacing value. + """ + + rows_html = "" + + # Sort by pixel value + sorted_tokens = [] + for name, token in spacing_tokens.items(): + if isinstance(token, dict): + value_px = token.get("value_px", 0) + value = token.get("value", "0px") + else: + value = str(token) + value_px = float(re.sub(r'[^0-9.]', '', value) or 0) + sorted_tokens.append((name, token, value_px, value)) + + sorted_tokens.sort(key=lambda x: x[2]) + + for name, token, value_px, value in sorted_tokens[:15]: + # Cap visual width at 200px + visual_width = min(value_px, 200) + + rows_html += f''' +
+
{value}
+
+
+ ''' + + html = f''' + + +
+ {rows_html} +
+ ''' + + return html + + +def generate_radius_asis_preview_html( + radius_tokens: dict, + background: str = "#FAFAFA" +) -> str: + """ + Generate HTML preview for AS-IS border radius (Stage 1). + + Shows boxes with each radius value applied. + """ + + rows_html = "" + + for name, token in list(radius_tokens.items())[:12]: + if isinstance(token, dict): + value = token.get("value", "0px") + else: + value = str(token) + + rows_html += f''' +
+
+
{value}
+
+ ''' + + html = f''' + + +
+ {rows_html} +
+ ''' + + return html + + +def generate_shadows_asis_preview_html( + shadow_tokens: dict, + background: str = "#FAFAFA" +) -> str: + """ + Generate HTML preview for AS-IS shadows (Stage 1). + + Shows cards with each shadow value applied. + """ + + rows_html = "" + + for name, token in list(shadow_tokens.items())[:8]: + if isinstance(token, dict): + value = token.get("value", "none") + else: + value = str(token) + + # Clean name for display + display_name = name.replace("_", " ").replace("-", " ").title() + if len(display_name) > 15: + display_name = display_name[:12] + "..." + + rows_html += f''' +
+
+
{display_name}
+
{value[:40]}...
+
+ ''' + + html = f''' + + +
+ {rows_html} +
+ ''' + + return html + + +# ============================================================================= +# STAGE 2: TYPOGRAPHY PREVIEW (with rendered font) +# ============================================================================= + +def generate_typography_preview_html( + typography_tokens: dict, + font_family: str = "Open Sans", + background: str = "#FAFAFA", + sample_text: str = "The quick brown fox jumps over the lazy dog" +) -> str: + """ + Generate HTML preview for typography tokens. + + Args: + typography_tokens: Dict of typography styles {name: {font_size, font_weight, line_height, letter_spacing}} + font_family: Primary font family detected + background: Background color (neutral) + sample_text: Text to render for preview + + Returns: + HTML string for Gradio HTML component + """ + + # Sort tokens by font size (largest first) + sorted_tokens = [] + for name, token in typography_tokens.items(): + size_str = str(token.get("font_size", "16px")) + size_num = float(re.sub(r'[^0-9.]', '', size_str) or 16) + sorted_tokens.append((name, token, size_num)) + + sorted_tokens.sort(key=lambda x: -x[2]) # Descending by size + + # Generate rows + rows_html = "" + for name, token, size_num in sorted_tokens[:15]: # Limit to 15 styles + font_size = token.get("font_size", "16px") + font_weight = token.get("font_weight", "400") + line_height = token.get("line_height", "1.5") + letter_spacing = token.get("letter_spacing", "0") + + # Convert weight names to numbers + weight_map = { + "thin": 100, "extralight": 200, "light": 300, "regular": 400, + "medium": 500, "semibold": 600, "bold": 700, "extrabold": 800, "black": 900 + } + if isinstance(font_weight, str) and font_weight.lower() in weight_map: + font_weight = weight_map[font_weight.lower()] + + # Weight label + weight_labels = { + 100: "Thin", 200: "ExtraLight", 300: "Light", 400: "Regular", + 500: "Medium", 600: "SemiBold", 700: "Bold", 800: "ExtraBold", 900: "Black" + } + weight_label = 
weight_labels.get(int(font_weight) if str(font_weight).isdigit() else 400, "Regular") + + # Clean up name for display + display_name = name.replace("_", " ").replace("-", " ").title() + if len(display_name) > 15: + display_name = display_name[:15] + "..." + + # Truncate sample text for large sizes + display_text = sample_text + if size_num > 48: + display_text = sample_text[:30] + "..." + elif size_num > 32: + display_text = sample_text[:40] + "..." + + rows_html += f''' + + +
{display_name}
+ + {font_family} + {weight_label} + {int(size_num)} + Sentence + {letter_spacing} + + + +
{display_text}
+ + + ''' + + html = f''' + + +
+ + + + + + + + + + + + + {rows_html} + +
Scale CategoryTypefaceWeightSizeCaseLetter Spacing
+
+ ''' + + return html + + +# ============================================================================= +# COLOR RAMP PREVIEW +# ============================================================================= + +def hex_to_rgb(hex_color: str) -> tuple: + """Convert hex color to RGB tuple.""" + hex_color = hex_color.lstrip('#') + if len(hex_color) == 3: + hex_color = ''.join([c*2 for c in hex_color]) + return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) + + +def rgb_to_hex(rgb: tuple) -> str: + """Convert RGB tuple to hex string.""" + return '#{:02x}{:02x}{:02x}'.format(int(rgb[0]), int(rgb[1]), int(rgb[2])) + + +def get_luminance(rgb: tuple) -> float: + """Calculate relative luminance for contrast ratio.""" + def adjust(c): + c = c / 255 + return c / 12.92 if c <= 0.03928 else ((c + 0.055) / 1.055) ** 2.4 + + r, g, b = rgb + return 0.2126 * adjust(r) + 0.7152 * adjust(g) + 0.0722 * adjust(b) + + +def get_contrast_ratio(color1: tuple, color2: tuple) -> float: + """Calculate contrast ratio between two colors.""" + l1 = get_luminance(color1) + l2 = get_luminance(color2) + lighter = max(l1, l2) + darker = min(l1, l2) + return (lighter + 0.05) / (darker + 0.05) + + +def generate_color_ramp(base_hex: str) -> list[dict]: + """ + Generate 11 shades (50-950) from a base color. + + Uses OKLCH-like approach for perceptually uniform steps. 
+ """ + try: + rgb = hex_to_rgb(base_hex) + except: + return [] + + # Convert to HLS for easier manipulation + r, g, b = [x / 255 for x in rgb] + h, l, s = colorsys.rgb_to_hls(r, g, b) + + # Define lightness levels for each shade + # 50 = very light (0.95), 500 = base, 950 = very dark (0.05) + shade_lightness = { + 50: 0.95, + 100: 0.90, + 200: 0.80, + 300: 0.70, + 400: 0.60, + 500: l, # Keep original lightness for 500 + 600: 0.45, + 700: 0.35, + 800: 0.25, + 900: 0.15, + 950: 0.08, + } + + # Adjust saturation for light/dark shades + ramp = [] + for shade, target_l in shade_lightness.items(): + # Reduce saturation for very light colors + if target_l > 0.8: + adjusted_s = s * 0.6 + elif target_l < 0.2: + adjusted_s = s * 0.8 + else: + adjusted_s = s + + # Generate new RGB + new_r, new_g, new_b = colorsys.hls_to_rgb(h, target_l, adjusted_s) + new_rgb = (int(new_r * 255), int(new_g * 255), int(new_b * 255)) + new_hex = rgb_to_hex(new_rgb) + + # Check AA compliance + white = (255, 255, 255) + black = (0, 0, 0) + contrast_white = get_contrast_ratio(new_rgb, white) + contrast_black = get_contrast_ratio(new_rgb, black) + + # AA requires 4.5:1 for normal text + aa_on_white = contrast_white >= 4.5 + aa_on_black = contrast_black >= 4.5 + + ramp.append({ + "shade": shade, + "hex": new_hex, + "rgb": new_rgb, + "contrast_white": round(contrast_white, 2), + "contrast_black": round(contrast_black, 2), + "aa_on_white": aa_on_white, + "aa_on_black": aa_on_black, + }) + + return ramp + + +def generate_color_ramps_preview_html( + color_tokens: dict, + background: str = "#FAFAFA", + max_colors: int = 20 +) -> str: + """ + Generate HTML preview for color ramps. + + Sorts colors by frequency and filters out near-white/near-black + to prioritize showing actual brand colors. 
+ + Args: + color_tokens: Dict of colors {name: {value: "#hex", ...}} + background: Background color + max_colors: Maximum colors to show ramps for + + Returns: + HTML string for Gradio HTML component + """ + + def get_color_priority(name, token): + """Calculate priority score for a color (higher = more important).""" + if isinstance(token, dict): + hex_val = token.get("value", "#888888") + frequency = token.get("frequency", 0) + else: + hex_val = str(token) + frequency = 0 + + # Clean hex + if not hex_val.startswith("#"): + hex_val = f"#{hex_val}" + + # Calculate luminance + try: + r = int(hex_val[1:3], 16) + g = int(hex_val[3:5], 16) + b = int(hex_val[5:7], 16) + luminance = (0.299 * r + 0.587 * g + 0.114 * b) / 255 + + # Calculate saturation (simplified) + max_c = max(r, g, b) + min_c = min(r, g, b) + saturation = (max_c - min_c) / 255 if max_c > 0 else 0 + except: + luminance = 0.5 + saturation = 0 + + # Priority scoring: + # - Penalize near-white (luminance > 0.9) + # - Penalize near-black (luminance < 0.1) + # - Penalize low saturation (grays) + # - Reward high frequency + # - Reward colors with "primary", "brand", "accent" in name + + score = frequency * 10 # Base score from frequency + + # Penalize extremes + if luminance > 0.9: + score -= 500 # Near white + if luminance < 0.1: + score -= 300 # Near black + + # Reward saturated colors (actual brand colors) + score += saturation * 200 + + # Reward named brand colors + name_lower = name.lower() + if any(kw in name_lower for kw in ['primary', 'brand', 'accent', 'cyan', 'blue', 'green', 'red', 'orange', 'purple']): + score += 100 + + # Penalize "background", "border", "text" colors + if any(kw in name_lower for kw in ['background', 'border', 'neutral', 'gray', 'grey']): + score -= 50 + + return score + + # Sort colors by priority + sorted_colors = [] + for name, token in color_tokens.items(): + priority = get_color_priority(name, token) + sorted_colors.append((name, token, priority)) + + 
sorted_colors.sort(key=lambda x: -x[2]) # Descending by priority + + rows_html = "" + shown_count = 0 + + for name, token, priority in sorted_colors: + if shown_count >= max_colors: + break + + # Get hex value + if isinstance(token, dict): + hex_val = token.get("value", "#888888") + else: + hex_val = str(token) + + # Clean up hex + if not hex_val.startswith("#"): + hex_val = f"#{hex_val}" + + # Skip invalid hex + if len(hex_val) < 7: + continue + + # Generate ramp + ramp = generate_color_ramp(hex_val) + if not ramp: + continue + + # Clean name + display_name = name.replace("_", " ").replace("-", " ").replace(".", " ").title() + if len(display_name) > 18: + display_name = display_name[:15] + "..." + + # Generate shade cells + shades_html = "" + for shade_info in ramp: + shade = shade_info["shade"] + hex_color = shade_info["hex"] + aa_white = shade_info["aa_on_white"] + aa_black = shade_info["aa_on_black"] + + # Determine text color for label + text_color = "#000" if shade < 500 else "#FFF" + + # AA indicator + if aa_white or aa_black: + aa_indicator = "✓" + aa_class = "aa-pass" + else: + aa_indicator = "" + aa_class = "" + + shades_html += f''' +
+ {shade} + {aa_indicator} +
+ ''' + + rows_html += f''' +
+
+
+
+
{display_name}
+
{hex_val}
+
+
+
+ {shades_html} +
+
+ ''' + shown_count += 1 + + # Count info + total_colors = len(color_tokens) + count_info = f"Showing {shown_count} of {total_colors} colors (sorted by brand priority)" + + html = f''' + + +
+
{count_info}
+
+ 50 + 100 + 200 + 300 + 400 + 500 + 600 + 700 + 800 + 900 + 950 +
+ {rows_html} +
+ ''' + + return html + + +# ============================================================================= +# SEMANTIC COLOR RAMPS WITH LLM RECOMMENDATIONS (Stage 2) +# ============================================================================= + +def generate_semantic_color_ramps_html( + semantic_analysis: dict, + color_tokens: dict, + llm_recommendations: dict = None, + background: str = "#F5F5F5" +) -> str: + """ + Generate HTML preview for colors organized by semantic role with LLM recommendations. + + Args: + semantic_analysis: Output from SemanticColorAnalyzer + color_tokens: Dict of all color tokens + llm_recommendations: LLM suggestions for color improvements + background: Background color + + Returns: + HTML string for Gradio HTML component + """ + + def generate_single_ramp(hex_val: str) -> str: + """Generate a single color ramp HTML.""" + ramp = generate_color_ramp(hex_val) + if not ramp: + return "" + + shades_html = "" + for shade_info in ramp: + shade = shade_info["shade"] + hex_color = shade_info["hex"] + aa_white = shade_info["aa_on_white"] + aa_black = shade_info["aa_on_black"] + + text_color = "#000" if shade < 500 else "#FFF" + aa_indicator = "✓" if aa_white or aa_black else "" + + shades_html += f''' +
+ {shade} + {aa_indicator} +
+ ''' + return shades_html + + def color_row_with_recommendation(hex_val: str, role: str, role_display: str, recommendation: dict = None) -> str: + """Generate a color row with optional LLM recommendation.""" + ramp_html = generate_single_ramp(hex_val) + + # Calculate contrast + try: + from core.color_utils import get_contrast_with_white + contrast = get_contrast_with_white(hex_val) + aa_status = "✓ AA" if contrast >= 4.5 else f"⚠️ {contrast:.1f}:1" + aa_class = "aa-ok" if contrast >= 4.5 else "aa-warn" + except: + aa_status = "" + aa_class = "" + + # LLM recommendation display + rec_html = "" + if recommendation: + suggested = recommendation.get("suggested", "") + issue = recommendation.get("issue", "") + if suggested and suggested != hex_val: + rec_html = f''' +
+ 💡 LLM: + {issue} + + {suggested} +
+ ''' + + return f''' +
+
+
+
+
{role_display}
+
{hex_val} {aa_status}
+
+
+
{ramp_html}
+ {rec_html} +
+ ''' + + def category_section(title: str, icon: str, colors: dict, category_key: str) -> str: + """Generate a category section with color rows.""" + if not colors: + return "" + + rows_html = "" + for role, data in colors.items(): + if data and isinstance(data, dict) and "hex" in data: + # Get LLM recommendation for this role + rec = None + if llm_recommendations: + color_recs = llm_recommendations.get("color_recommendations", {}) + rec = color_recs.get(f"{category_key}.{role}", {}) + + role_display = role.replace("_", " ").title() + rows_html += color_row_with_recommendation( + data["hex"], + f"{category_key}.{role}", + role_display, + rec + ) + + if not rows_html: + return "" + + return f''' +
+

{icon} {title}

+ {rows_html} +
+ ''' + + # Handle empty analysis + if not semantic_analysis: + return ''' +
+

⚠️ No semantic analysis available.

+
+ + ''' + + # Build sections + sections_html = "" + sections_html += category_section("Brand Colors", "🎨", semantic_analysis.get("brand", {}), "brand") + sections_html += category_section("Text Colors", "📝", semantic_analysis.get("text", {}), "text") + sections_html += category_section("Background Colors", "🖼️", semantic_analysis.get("background", {}), "background") + sections_html += category_section("Border Colors", "📏", semantic_analysis.get("border", {}), "border") + sections_html += category_section("Feedback Colors", "🚨", semantic_analysis.get("feedback", {}), "feedback") + + # LLM Impact Summary + llm_summary = "" + if llm_recommendations: + changes = llm_recommendations.get("changes_made", []) + if changes: + changes_html = "".join([f"
  • {c}
  • " for c in changes[:5]]) + llm_summary = f''' +
    +

    🤖 LLM Recommendations Applied:

    +
      {changes_html}
    +
    + ''' + + html = f''' + + +
    + {sections_html} + {llm_summary} +
    + ''' + + return html + + +# ============================================================================= +# COMBINED PREVIEW +# ============================================================================= + +def generate_design_system_preview_html( + typography_tokens: dict, + color_tokens: dict, + font_family: str = "Open Sans", + sample_text: str = "The quick brown fox jumps over the lazy dog" +) -> tuple[str, str]: + """ + Generate both typography and color ramp previews. + + Returns: + Tuple of (typography_html, color_ramps_html) + """ + typography_html = generate_typography_preview_html( + typography_tokens=typography_tokens, + font_family=font_family, + sample_text=sample_text, + ) + + color_ramps_html = generate_color_ramps_preview_html( + color_tokens=color_tokens, + ) + + return typography_html, color_ramps_html diff --git a/core/rule_engine.py b/core/rule_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..74d9bae0bc2a9daca14d48812fb5ac31ae5833b3 --- /dev/null +++ b/core/rule_engine.py @@ -0,0 +1,1137 @@ +""" +Rule Engine — Deterministic Design System Analysis +=================================================== + +This module handles ALL calculations that don't need LLM reasoning: +- Type scale detection +- AA/AAA contrast checking +- Algorithmic color fixes +- Spacing grid detection +- Color statistics and deduplication + +LLMs should ONLY be used for: +- Brand color identification (requires context understanding) +- Palette cohesion (subjective assessment) +- Design maturity scoring (holistic evaluation) +- Prioritized recommendations (business reasoning) +""" + +import colorsys +import re +from dataclasses import dataclass, field +from functools import reduce +from math import gcd +from typing import Optional + + +# ============================================================================= +# DATA CLASSES +# ============================================================================= + +@dataclass +class 
TypeScaleAnalysis: + """Results of type scale analysis.""" + detected_ratio: float + closest_standard_ratio: float + scale_name: str + is_consistent: bool + variance: float + sizes_px: list[float] + ratios_between_sizes: list[float] + recommendation: float + recommendation_name: str + base_size: float = 16.0 # Detected base/body font size + + def to_dict(self) -> dict: + return { + "detected_ratio": round(self.detected_ratio, 3), + "closest_standard_ratio": self.closest_standard_ratio, + "scale_name": self.scale_name, + "is_consistent": self.is_consistent, + "variance": round(self.variance, 3), + "sizes_px": self.sizes_px, + "base_size": self.base_size, + "recommendation": self.recommendation, + "recommendation_name": self.recommendation_name, + } + + +@dataclass +class ColorAccessibility: + """Accessibility analysis for a single color.""" + hex_color: str + name: str + contrast_on_white: float + contrast_on_black: float + passes_aa_normal: bool # 4.5:1 + passes_aa_large: bool # 3.0:1 + passes_aaa_normal: bool # 7.0:1 + best_text_color: str # White or black + suggested_fix: Optional[str] = None + suggested_fix_contrast: Optional[float] = None + + def to_dict(self) -> dict: + return { + "color": self.hex_color, + "name": self.name, + "contrast_white": round(self.contrast_on_white, 2), + "contrast_black": round(self.contrast_on_black, 2), + "aa_normal": self.passes_aa_normal, + "aa_large": self.passes_aa_large, + "aaa_normal": self.passes_aaa_normal, + "best_text": self.best_text_color, + "suggested_fix": self.suggested_fix, + "suggested_fix_contrast": round(self.suggested_fix_contrast, 2) if self.suggested_fix_contrast else None, + } + + +@dataclass +class SpacingGridAnalysis: + """Results of spacing grid analysis.""" + detected_base: int + is_aligned: bool + alignment_percentage: float + misaligned_values: list[int] + recommendation: int + recommendation_reason: str + current_values: list[int] + suggested_scale: list[int] + + def to_dict(self) -> dict: + return { + 
"detected_base": self.detected_base, + "is_aligned": self.is_aligned, + "alignment_percentage": round(self.alignment_percentage, 1), + "misaligned_values": self.misaligned_values, + "recommendation": self.recommendation, + "recommendation_reason": self.recommendation_reason, + "current_values": self.current_values, + "suggested_scale": self.suggested_scale, + } + + +@dataclass +class ColorStatistics: + """Statistical analysis of color palette.""" + total_count: int + unique_count: int + duplicate_count: int + gray_count: int + saturated_count: int + near_duplicates: list[tuple[str, str, float]] # (color1, color2, similarity) + hue_distribution: dict[str, int] # {"red": 5, "blue": 3, ...} + + def to_dict(self) -> dict: + return { + "total": self.total_count, + "unique": self.unique_count, + "duplicates": self.duplicate_count, + "grays": self.gray_count, + "saturated": self.saturated_count, + "near_duplicates_count": len(self.near_duplicates), + "hue_distribution": self.hue_distribution, + } + + +@dataclass +class RadiusAnalysis: + """v3: Analysis of border radius tokens.""" + tier_count: int = 0 # How many distinct radius tiers + values_px: list = field(default_factory=list) # Sorted px values + base_4_aligned: int = 0 # Count aligned to base-4 grid + base_8_aligned: int = 0 # Count aligned to base-8 grid + alignment_pct: float = 0.0 # % aligned to best grid + grid_base: int = 4 # Detected grid base (4 or 8) + has_full: bool = False # Has a "full" / 9999px value + strategy: str = "mixed" # "sharp" (<= 4px), "rounded" (4-16), "pill" (>= 24), "mixed" + + def to_dict(self) -> dict: + return { + "tier_count": self.tier_count, + "values_px": self.values_px, + "base_4_aligned": self.base_4_aligned, + "base_8_aligned": self.base_8_aligned, + "alignment_pct": round(self.alignment_pct, 1), + "grid_base": self.grid_base, + "has_full": self.has_full, + "strategy": self.strategy, + } + + +@dataclass +class ShadowAnalysis: + """v3: Analysis of shadow / elevation tokens.""" + 
level_count: int = 0 # Number of distinct shadows + blur_values: list = field(default_factory=list) # Sorted blur px + is_monotonic: bool = True # Blur increases with each level + y_offset_monotonic: bool = True # Y-offset increases with each level + color_consistent: bool = True # All shadows use same base color + elevation_verdict: str = "none" # "good" / "inconsistent" / "insufficient" / "none" + + def to_dict(self) -> dict: + return { + "level_count": self.level_count, + "blur_values": self.blur_values, + "is_monotonic": self.is_monotonic, + "y_offset_monotonic": self.y_offset_monotonic, + "color_consistent": self.color_consistent, + "elevation_verdict": self.elevation_verdict, + } + + +@dataclass +class RuleEngineResults: + """Complete rule engine analysis results.""" + typography: TypeScaleAnalysis + accessibility: list[ColorAccessibility] + spacing: SpacingGridAnalysis + color_stats: ColorStatistics + + # v3: radius and shadow analysis + radius: RadiusAnalysis = field(default_factory=RadiusAnalysis) + shadows: ShadowAnalysis = field(default_factory=ShadowAnalysis) + + # Summary + aa_failures: int = 0 + consistency_score: int = 50 # 0-100 + + def to_dict(self) -> dict: + return { + "typography": self.typography.to_dict(), + "accessibility": [a.to_dict() for a in self.accessibility if not a.passes_aa_normal], + "accessibility_all": [a.to_dict() for a in self.accessibility], + "spacing": self.spacing.to_dict(), + "color_stats": self.color_stats.to_dict(), + "radius": self.radius.to_dict(), + "shadows": self.shadows.to_dict(), + "summary": { + "aa_failures": self.aa_failures, + "consistency_score": self.consistency_score, + } + } + + +# ============================================================================= +# COLOR UTILITIES +# ============================================================================= + +def hex_to_rgb(hex_color: str) -> tuple[int, int, int]: + """Convert hex to RGB tuple.""" + hex_color = hex_color.lstrip('#') + if len(hex_color) == 
def hex_to_rgb(hex_color: str) -> tuple[int, int, int]:
    """Convert a hex color ('#abc', 'abc', '#aabbcc') to an (r, g, b) tuple.

    Raises ValueError if the string contains non-hex characters.
    """
    hex_color = hex_color.lstrip('#')
    if len(hex_color) == 3:
        # Expand shorthand: "abc" -> "aabbcc"
        hex_color = ''.join([c*2 for c in hex_color])
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))


def rgb_to_hex(r: int, g: int, b: int) -> str:
    """Convert RGB components to a '#rrggbb' string, clamping channels to 0-255."""
    r = max(0, min(255, r))
    g = max(0, min(255, g))
    b = max(0, min(255, b))
    return f"#{r:02x}{g:02x}{b:02x}"


def get_relative_luminance(hex_color: str) -> float:
    """Calculate relative luminance per WCAG 2.1 (sRGB linearization + Rec. 709 weights)."""
    r, g, b = hex_to_rgb(hex_color)

    def channel_luminance(c):
        c = c / 255
        # 0.03928 is the sRGB linear-segment threshold used by WCAG 2.x
        return c / 12.92 if c <= 0.03928 else ((c + 0.055) / 1.055) ** 2.4

    return 0.2126 * channel_luminance(r) + 0.7152 * channel_luminance(g) + 0.0722 * channel_luminance(b)


def get_contrast_ratio(color1: str, color2: str) -> float:
    """Calculate the WCAG contrast ratio (1.0 .. 21.0) between two hex colors."""
    l1 = get_relative_luminance(color1)
    l2 = get_relative_luminance(color2)
    lighter = max(l1, l2)
    darker = min(l1, l2)
    return (lighter + 0.05) / (darker + 0.05)


def is_gray(hex_color: str, threshold: float = 0.1) -> bool:
    """Check if color is a gray (HSV saturation below *threshold*)."""
    # Delegates to get_saturation instead of duplicating the HSV conversion.
    return get_saturation(hex_color) < threshold


def get_saturation(hex_color: str) -> float:
    """Get HSV saturation value (0-1)."""
    r, g, b = hex_to_rgb(hex_color)
    _, s, _ = colorsys.rgb_to_hsv(r/255, g/255, b/255)
    return s


def get_hue_name(hex_color: str) -> str:
    """Get human-readable hue name ("red", "orange", ..., or "gray" for low saturation)."""
    r, g, b = hex_to_rgb(hex_color)
    h, s, v = colorsys.rgb_to_hsv(r/255, g/255, b/255)

    if s < 0.1:
        return "gray"

    hue_deg = h * 360

    # Bucket boundaries chosen for design-palette grouping, not colorimetric exactness.
    if hue_deg < 15 or hue_deg >= 345:
        return "red"
    elif hue_deg < 45:
        return "orange"
    elif hue_deg < 75:
        return "yellow"
    elif hue_deg < 150:
        return "green"
    elif hue_deg < 210:
        return "cyan"
    elif hue_deg < 270:
        return "blue"
    elif hue_deg < 315:
        return "purple"
    else:
        return "pink"


def color_distance(hex1: str, hex2: str) -> float:
    """Calculate perceptual color distance (0-1, lower = more similar).

    Simple normalized Euclidean distance in RGB space — not a true perceptual
    metric (no Lab/Delta-E), but adequate for near-duplicate detection.
    """
    r1, g1, b1 = hex_to_rgb(hex1)
    r2, g2, b2 = hex_to_rgb(hex2)

    dr = (r1 - r2) / 255
    dg = (g1 - g2) / 255
    db = (b1 - b2) / 255

    return (dr**2 + dg**2 + db**2) ** 0.5 / (3 ** 0.5)


def darken_color(hex_color: str, factor: float) -> str:
    """Darken a color by a factor (0-1); factor 1.0 yields black."""
    r, g, b = hex_to_rgb(hex_color)
    return rgb_to_hex(int(r * (1 - factor)), int(g * (1 - factor)), int(b * (1 - factor)))


def lighten_color(hex_color: str, factor: float) -> str:
    """Lighten a color by a factor (0-1); factor 1.0 yields white."""
    r, g, b = hex_to_rgb(hex_color)
    r = int(r + (255 - r) * factor)
    g = int(g + (255 - g) * factor)
    b = int(b + (255 - b) * factor)
    return rgb_to_hex(r, g, b)


def _search_contrast_direction(hex_color: str, background: str, target_contrast: float,
                               darken: bool, best_color: str, best_contrast: float):
    """Step the adjustment factor 1%..100% in one direction.

    Returns (passing_color_or_None, best_color, best_contrast), where the
    best-so-far pair tracks the highest contrast seen even if the target
    was never reached.
    """
    for i in range(1, 101):
        factor = i / 100
        candidate = darken_color(hex_color, factor) if darken else lighten_color(hex_color, factor)
        contrast = get_contrast_ratio(candidate, background)
        if contrast >= target_contrast:
            return candidate, candidate, contrast
        if contrast > best_contrast:
            best_color, best_contrast = candidate, contrast
    return None, best_color, best_contrast


def find_aa_compliant_color(hex_color: str, background: str = "#ffffff", target_contrast: float = 4.5) -> str:
    """
    Algorithmically adjust a color until it meets AA contrast requirements.

    Returns the original color if it already passes, otherwise returns
    a darkened/lightened version that passes. If no adjustment reaches the
    target in either direction, returns the best-contrast candidate found.
    """
    current_contrast = get_contrast_ratio(hex_color, background)

    if current_contrast >= target_contrast:
        return hex_color

    # Determine direction: move fg *away* from bg in luminance.
    # If fg is lighter than bg → darken fg to increase gap.
    # If fg is darker than bg → lighten fg to increase gap.
    should_darken = get_relative_luminance(hex_color) >= get_relative_luminance(background)

    best_color, best_contrast = hex_color, current_contrast

    found, best_color, best_contrast = _search_contrast_direction(
        hex_color, background, target_contrast, should_darken, best_color, best_contrast)
    if found:
        return found

    # First direction didn't reach target — try the opposite direction
    # (e.g., very similar luminances where either direction could work).
    found, best_color, best_contrast = _search_contrast_direction(
        hex_color, background, target_contrast, not should_darken, best_color, best_contrast)
    if found:
        return found

    return best_color


# =============================================================================
# TYPE SCALE ANALYSIS
# =============================================================================

# Standard type scale ratios
STANDARD_SCALES = {
    1.067: "Minor Second",
    1.125: "Major Second",
    1.200: "Minor Third",
    1.250: "Major Third",  # ⭐ Recommended
    1.333: "Perfect Fourth",
    1.414: "Augmented Fourth",
    1.500: "Perfect Fifth",
    1.618: "Golden Ratio",
    2.000: "Octave",
}


def parse_size_to_px(size) -> Optional[float]:
    """Convert any size string ('16px', '1rem', '1.5em', '150%', bare number) to pixels.

    rem/em assume a 16px root font size; percentages are resolved against the
    same 16px base (fix: '150%' now yields 24.0 instead of being read as
    150 absolute pixels). Returns None when no number can be extracted.
    """
    if isinstance(size, (int, float)):
        return float(size)

    size = str(size).strip().lower()

    # Extract the first numeric run (sign is ignored; sizes are magnitudes here)
    match = re.search(r'([\d.]+)', size)
    if not match:
        return None

    value = float(match.group(1))

    # 'rem' must be tested before 'em' — the substring 'em' is inside 'rem'.
    if 'rem' in size:
        return value * 16  # Assume 16px base
    if 'em' in size:
        return value * 16  # Approximate: treat em like rem
    if '%' in size:
        # Percent is relative to the inherited font size; approximate with the
        # 16px base so '100%' == 16px.
        return value / 100 * 16

    # 'px' or a bare number: already pixels.
    return value
def _fallback_base_size(sizes_px: list) -> float:
    """Pick a body-text base size from a sorted list of px sizes.

    Preference order: sizes in the typical 14-18px body range (closest to
    16px wins — an exact 16px is automatically preferred by the distance
    key), then any size >= 10px closest to 16px, then the largest of the
    remaining tiny sizes, then the 16px default. Sizes < 10px are excluded
    early because they are likely captions/icons.
    """
    valid_body_sizes = [s for s in sizes_px if s >= 10]
    base_candidates = [s for s in valid_body_sizes if 14 <= s <= 18]
    if base_candidates:
        return min(base_candidates, key=lambda x: abs(x - 16))
    if valid_body_sizes:
        return min(valid_body_sizes, key=lambda x: abs(x - 16))
    if sizes_px:
        return max(sizes_px)  # Last resort: largest of tiny sizes
    return 16.0


def _unknown_type_scale(sizes_px: list, base_size: float) -> TypeScaleAnalysis:
    """Degenerate result used when no ratio can be detected (shared by both fallbacks)."""
    return TypeScaleAnalysis(
        detected_ratio=1.0,
        closest_standard_ratio=1.25,
        scale_name="Unknown",
        is_consistent=False,
        variance=0,
        sizes_px=sizes_px,
        ratios_between_sizes=[],
        recommendation=1.25,
        recommendation_name="Major Third",
        base_size=base_size,
    )


def analyze_type_scale(typography_tokens: dict) -> TypeScaleAnalysis:
    """
    Analyze typography tokens to detect type scale ratio.

    Args:
        typography_tokens: Dict of typography tokens with font_size

    Returns:
        TypeScaleAnalysis with detected ratio and recommendations
    """
    # Extract and parse sizes (accepts dicts or objects with a font_size attr)
    sizes = []
    for name, token in typography_tokens.items():
        if isinstance(token, dict):
            size = token.get("font_size") or token.get("fontSize") or token.get("size")
        else:
            size = getattr(token, "font_size", None)

        if size:
            px = parse_size_to_px(size)
            if px and px > 0:
                sizes.append(px)

    # Sort and dedupe
    sizes_px = sorted(set(sizes))

    if len(sizes_px) < 2:
        # Single size: use it if plausible (>= 10px), otherwise default to 16px
        if sizes_px and sizes_px[0] >= 10:
            base_size = sizes_px[0]
        else:
            base_size = 16.0
        return _unknown_type_scale(sizes_px, base_size)

    # Calculate ratios between consecutive sizes
    ratios = []
    for i in range(len(sizes_px) - 1):
        if sizes_px[i] > 0:
            ratio = sizes_px[i + 1] / sizes_px[i]
            if 1.0 < ratio < 3.0:  # Reasonable range
                ratios.append(ratio)

    if not ratios:
        # Detect base size even if no valid ratios
        return _unknown_type_scale(sizes_px, _fallback_base_size(sizes_px))

    # Average ratio
    avg_ratio = sum(ratios) / len(ratios)

    # Variance (consistency check)
    variance = max(ratios) - min(ratios)
    is_consistent = variance < 0.15  # Within 15% variance is "consistent"

    # Find closest standard scale
    closest_scale = min(STANDARD_SCALES.keys(), key=lambda x: abs(x - avg_ratio))
    scale_name = STANDARD_SCALES[closest_scale]

    # Base size: the most common body text size, closest to 16px.
    # The shared helper subsumes the old "prefer 16 if present" special case —
    # min-by-distance-to-16 already returns an exact 16 when present.
    base_size = _fallback_base_size(sizes_px)

    # Recommendation
    if is_consistent and abs(avg_ratio - closest_scale) < 0.05:
        # Already using a standard scale
        recommendation = closest_scale
        recommendation_name = scale_name
    else:
        # Recommend Major Third (1.25) as default
        recommendation = 1.25
        recommendation_name = "Major Third"

    return TypeScaleAnalysis(
        detected_ratio=avg_ratio,
        closest_standard_ratio=closest_scale,
        scale_name=scale_name,
        is_consistent=is_consistent,
        variance=variance,
        sizes_px=sizes_px,
        ratios_between_sizes=ratios,
        recommendation=recommendation,
        recommendation_name=recommendation_name,
        base_size=base_size,
    )


# =============================================================================
# ACCESSIBILITY ANALYSIS
# =============================================================================

def analyze_accessibility(color_tokens: dict, fg_bg_pairs: list[dict] = None) -> list[ColorAccessibility]:
    """
    Analyze all colors for WCAG accessibility compliance.

    Args:
        color_tokens: Dict of color tokens with value/hex
        fg_bg_pairs: Optional list of actual foreground/background pairs
                     extracted from the DOM (each dict has 'foreground',
                     'background', 'element' keys).

    Returns:
        List of ColorAccessibility results. Pair failures are appended with
        a "fg:" name prefix so callers can tell them apart.
    """
    results = []

    for name, token in color_tokens.items():
        if isinstance(token, dict):
            hex_color = token.get("value") or token.get("hex") or token.get("color")
        else:
            hex_color = getattr(token, "value", None)

        # Robustness: token values may be non-strings (e.g. numbers) — skip them
        # instead of crashing on .startswith.
        if not isinstance(hex_color, str) or not hex_color.startswith("#"):
            continue

        try:
            contrast_white = get_contrast_ratio(hex_color, "#ffffff")
            contrast_black = get_contrast_ratio(hex_color, "#000000")

            # "Passes" means the color works as text on at least one of
            # white or black backgrounds.
            passes_aa_normal = contrast_white >= 4.5 or contrast_black >= 4.5
            passes_aa_large = contrast_white >= 3.0 or contrast_black >= 3.0
            passes_aaa_normal = contrast_white >= 7.0 or contrast_black >= 7.0

            best_text = "#ffffff" if contrast_white > contrast_black else "#000000"

            # Generate fix suggestion if needed
            suggested_fix = None
            suggested_fix_contrast = None

            if not passes_aa_normal:
                suggested_fix = find_aa_compliant_color(hex_color, "#ffffff", 4.5)
                suggested_fix_contrast = get_contrast_ratio(suggested_fix, "#ffffff")

            results.append(ColorAccessibility(
                hex_color=hex_color,
                name=name,
                contrast_on_white=contrast_white,
                contrast_on_black=contrast_black,
                passes_aa_normal=passes_aa_normal,
                passes_aa_large=passes_aa_large,
                passes_aaa_normal=passes_aaa_normal,
                best_text_color=best_text,
                suggested_fix=suggested_fix,
                suggested_fix_contrast=suggested_fix_contrast,
            ))
        except Exception:
            # Best-effort: malformed hex values are skipped, not fatal
            continue

    # --- Real foreground-background pair checks ---
    if fg_bg_pairs:
        for pair in fg_bg_pairs:
            # Robustness: keys may be present but None — coalesce before .lower()
            fg = (pair.get("foreground") or "").lower()
            bg = (pair.get("background") or "").lower()
            element = pair.get("element") or ""
            if not (fg.startswith("#") and bg.startswith("#")):
                continue
            # Skip same-color pairs (invisible/placeholder text — not real failures)
            if fg == bg:
                continue
            try:
                ratio = get_contrast_ratio(fg, bg)
                # Skip near-identical pairs (ratio < 1.1) — likely decorative/hidden
                if ratio < 1.1:
                    continue
                if ratio < 4.5:
                    # This pair fails AA — record it
                    fix = find_aa_compliant_color(fg, bg, 4.5)
                    fix_contrast = get_contrast_ratio(fix, bg)
                    results.append(ColorAccessibility(
                        hex_color=fg,
                        name=f"fg:{fg} on bg:{bg} ({element}) [{ratio:.1f}:1]",
                        contrast_on_white=get_contrast_ratio(fg, "#ffffff"),
                        contrast_on_black=get_contrast_ratio(fg, "#000000"),
                        passes_aa_normal=False,
                        passes_aa_large=ratio >= 3.0,
                        passes_aaa_normal=False,
                        best_text_color="#ffffff" if get_contrast_ratio(fg, "#ffffff") > get_contrast_ratio(fg, "#000000") else "#000000",
                        suggested_fix=fix,
                        suggested_fix_contrast=fix_contrast,
                    ))
            except Exception:
                continue

    return results
# =============================================================================
# SPACING GRID ANALYSIS
# =============================================================================

def analyze_spacing_grid(spacing_tokens: dict) -> SpacingGridAnalysis:
    """
    Analyze spacing tokens to detect grid alignment.

    Args:
        spacing_tokens: Dict of spacing tokens with value_px or value

    Returns:
        SpacingGridAnalysis with detected grid and recommendations.
        misaligned_values / alignment_percentage are measured against the
        *recommended* grid (4px or 8px), not the GCD — the GCD divides every
        value by definition, so measuring against it always reported 100%
        alignment (the old bug).
    """
    values = []

    for name, token in spacing_tokens.items():
        if isinstance(token, dict):
            px = token.get("value_px") or token.get("value")
        else:
            px = getattr(token, "value_px", None) or getattr(token, "value", None)

        if px:
            try:
                px_val = int(float(str(px).replace('px', '')))
                if px_val > 0:
                    values.append(px_val)
            except (ValueError, TypeError):
                continue

    if not values:
        return SpacingGridAnalysis(
            detected_base=8,
            is_aligned=False,
            alignment_percentage=0,
            misaligned_values=[],
            recommendation=8,
            recommendation_reason="No spacing values detected, defaulting to 8px grid",
            current_values=[],
            suggested_scale=[0, 4, 8, 12, 16, 20, 24, 32, 40, 48, 64],
        )

    values = sorted(set(values))

    # GCD of all values — reported as the detected base, but NOT used for the
    # alignment measurement (see docstring).
    detected_base = reduce(gcd, values)

    # Alignment percentages against the two standard grids
    pct_4 = sum(1 for v in values if v % 4 == 0) / len(values) * 100
    pct_8 = sum(1 for v in values if v % 8 == 0) / len(values) * 100

    # Determine recommendation (full alignment first, then >= 80% partial,
    # which replaces the old unreachable `detected_base in [4, 8]` branch)
    if pct_8 == 100:
        recommendation = 8
        recommendation_reason = "All values already align to 8px grid"
        is_aligned = True
    elif pct_4 == 100:
        recommendation = 4
        recommendation_reason = "Values align to 4px grid (consider 8px for simpler system)"
        is_aligned = True
    elif pct_8 >= 80:
        recommendation = 8
        recommendation_reason = f"Detected 8px base with {pct_8:.0f}% alignment"
        is_aligned = True
    elif pct_4 >= 80:
        recommendation = 4
        recommendation_reason = f"Detected 4px base with {pct_4:.0f}% alignment"
        is_aligned = True
    else:
        recommendation = 8
        recommendation_reason = f"Inconsistent spacing detected (GCD={detected_base}), recommend 8px grid"
        is_aligned = False

    # Misalignment measured against the recommended grid
    misaligned = [v for v in values if v % recommendation != 0]
    alignment_percentage = (len(values) - len(misaligned)) / len(values) * 100

    # Generate suggested scale (keep only whole-pixel steps)
    base = recommendation
    suggested_scale = [0] + [base * i for i in [0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 6, 8, 10, 12, 16] if base * i == int(base * i)]
    suggested_scale = sorted(set([int(v) for v in suggested_scale]))

    return SpacingGridAnalysis(
        detected_base=detected_base,
        is_aligned=is_aligned,
        alignment_percentage=alignment_percentage,
        misaligned_values=misaligned,
        recommendation=recommendation,
        recommendation_reason=recommendation_reason,
        current_values=values,
        suggested_scale=suggested_scale,
    )


# =============================================================================
# COLOR STATISTICS
# =============================================================================

def analyze_color_statistics(color_tokens: dict, similarity_threshold: float = 0.05) -> ColorStatistics:
    """
    Analyze color palette statistics.

    Args:
        color_tokens: Dict of color tokens
        similarity_threshold: Distance threshold for "near duplicate" (0-1)

    Returns:
        ColorStatistics with palette analysis. Note: the third element of each
        near_duplicates tuple is the color *distance* (lower = more similar).
    """
    colors = []

    for name, token in color_tokens.items():
        if isinstance(token, dict):
            hex_color = token.get("value") or token.get("hex")
        else:
            hex_color = getattr(token, "value", None)

        # Robustness: skip non-string values instead of crashing on .startswith
        if isinstance(hex_color, str) and hex_color.startswith("#"):
            colors.append(hex_color.lower())

    unique_colors = list(set(colors))

    # Count grays and saturated
    grays = [c for c in unique_colors if is_gray(c)]
    saturated = [c for c in unique_colors if get_saturation(c) > 0.3]

    # Find near duplicates — O(n²) pairwise scan; acceptable for palette-sized inputs
    near_duplicates = []
    for i, c1 in enumerate(unique_colors):
        for c2 in unique_colors[i+1:]:
            dist = color_distance(c1, c2)
            if dist < similarity_threshold and dist > 0:
                near_duplicates.append((c1, c2, round(dist, 4)))

    # Hue distribution
    hue_dist = {}
    for c in unique_colors:
        hue = get_hue_name(c)
        hue_dist[hue] = hue_dist.get(hue, 0) + 1

    return ColorStatistics(
        total_count=len(colors),
        unique_count=len(unique_colors),
        duplicate_count=len(colors) - len(unique_colors),
        gray_count=len(grays),
        saturated_count=len(saturated),
        near_duplicates=near_duplicates,
        hue_distribution=hue_dist,
    )
# =============================================================================
# v3: RADIUS GRID ANALYSIS
# =============================================================================

def analyze_radius_grid(radius_tokens: dict) -> RadiusAnalysis:
    """Analyze border radius tokens for grid alignment and strategy.

    Accepts dicts or objects carrying a numeric ``value_px``. Returns a
    default RadiusAnalysis when nothing usable is found. Note: strategy can
    also be "none" when only 0 / 9999px radii are present.
    """
    if not radius_tokens:
        return RadiusAnalysis()

    values_px = []
    for name, t in radius_tokens.items():
        if isinstance(t, dict):
            px = t.get("value_px")
        else:
            px = getattr(t, "value_px", None)
        if px is not None and isinstance(px, (int, float)):
            values_px.append(int(px))

    if not values_px:
        return RadiusAnalysis()

    values_px = sorted(set(values_px))
    has_full = 9999 in values_px
    # For grid analysis, exclude 0 and the 9999 "full/pill" sentinel
    grid_candidates = [v for v in values_px if 0 < v < 9999]

    base_4 = sum(1 for v in grid_candidates if v % 4 == 0)
    base_8 = sum(1 for v in grid_candidates if v % 8 == 0)
    total = len(grid_candidates) if grid_candidates else 1  # avoid div-by-zero

    # Prefer the coarser 8px grid when at least 70% of values fit it
    if base_8 / total >= 0.7:
        grid_base = 8
        alignment_pct = (base_8 / total) * 100
    else:
        grid_base = 4
        alignment_pct = (base_4 / total) * 100

    # Determine strategy from the largest real radius:
    # <= 4px sharp, <= 16px rounded, >= 24px pill, 17-23px mixed
    if grid_candidates:
        max_val = max(grid_candidates)
        if max_val <= 4:
            strategy = "sharp"
        elif max_val <= 16:
            strategy = "rounded"
        elif max_val >= 24:
            strategy = "pill"
        else:
            strategy = "mixed"
    else:
        strategy = "none"

    return RadiusAnalysis(
        tier_count=len(values_px),
        values_px=values_px,
        base_4_aligned=base_4,
        base_8_aligned=base_8,
        alignment_pct=alignment_pct,
        grid_base=grid_base,
        has_full=has_full,
        strategy=strategy,
    )


# =============================================================================
# v3: SHADOW ELEVATION ANALYSIS
# =============================================================================

def analyze_shadow_elevation(shadow_tokens: dict) -> ShadowAnalysis:
    """Analyze shadow tokens for elevation hierarchy and consistency.

    Monotonicity is evaluated over the tokens' insertion order — assumes the
    extractor emits shadows from lowest to highest elevation (TODO confirm).
    Bug fix: the old code checked the *sorted* blur/y-offset lists, which are
    monotonic by construction, so the check could never fail.
    """
    if not shadow_tokens:
        return ShadowAnalysis()

    blur_values = []
    y_offsets = []
    colors_seen = set()

    for name, t in shadow_tokens.items():
        if isinstance(t, dict):
            blur = t.get("blur_px")
            y_off = t.get("y_offset_px")
            color = t.get("color") or t.get("value", "")[:30]
        else:
            blur = getattr(t, "blur_px", None)
            y_off = getattr(t, "y_offset_px", None)
            color = getattr(t, "color", None) or str(getattr(t, "value", ""))[:30]

        if blur is not None:
            blur_values.append(float(blur))
        if y_off is not None:
            y_offsets.append(float(y_off))
        # Normalize color for consistency check (strip alpha variations).
        # NOTE(review): any "rgba(...)" collapses to the bare token "rgba", so
        # all rgb-function shadows look identical here — verify intent.
        if color:
            base_color = color.split("(")[0].strip() if "(" in color else color[:7]
            colors_seen.add(base_color)

    if not blur_values:
        return ShadowAnalysis()

    sorted_blur = sorted(blur_values)

    # Monotonic progression, checked in token (elevation) order
    is_mono_blur = all(blur_values[i] <= blur_values[i + 1] for i in range(len(blur_values) - 1))
    is_mono_y = all(y_offsets[i] <= y_offsets[i + 1] for i in range(len(y_offsets) - 1)) if len(y_offsets) > 1 else True

    # Color consistency: all shadows should use the same base color
    # (two distinct bases tolerated, e.g. a key + ambient shadow pair)
    color_consistent = len(colors_seen) <= 2

    # Verdict
    level_count = len(sorted_blur)
    if level_count == 0:
        verdict = "none"
    elif level_count < 3:
        verdict = "insufficient"
    elif is_mono_blur and color_consistent:
        verdict = "good"
    else:
        verdict = "inconsistent"

    return ShadowAnalysis(
        level_count=level_count,
        blur_values=[round(b, 1) for b in sorted_blur],
        is_monotonic=is_mono_blur,
        y_offset_monotonic=is_mono_y,
        color_consistent=color_consistent,
        elevation_verdict=verdict,
    )
# =============================================================================
# MAIN ANALYSIS FUNCTION
# =============================================================================

def run_rule_engine(
    typography_tokens: dict,
    color_tokens: dict,
    spacing_tokens: dict,
    radius_tokens: dict = None,
    shadow_tokens: dict = None,
    log_callback: Optional[callable] = None,
    fg_bg_pairs: list[dict] = None,
) -> RuleEngineResults:
    """
    Run complete rule-based analysis on design tokens.

    This is FREE (no LLM costs) and handles all deterministic calculations.

    Args:
        typography_tokens: Dict of typography tokens
        color_tokens: Dict of color tokens
        spacing_tokens: Dict of spacing tokens
        radius_tokens: Dict of border radius tokens (optional)
        shadow_tokens: Dict of shadow tokens (optional)
        log_callback: Function to log messages
        fg_bg_pairs: Optional DOM foreground/background pairs forwarded to
            analyze_accessibility for real on-page contrast checks

    Returns:
        RuleEngineResults with all analysis data
    """

    # Logging is a no-op when no callback was provided
    def log(msg: str):
        if log_callback:
            log_callback(msg)

    log("")
    log("═" * 60)
    log("⚙️ LAYER 1: RULE ENGINE (FREE - $0.00)")
    log("═" * 60)
    log("")

    # ─────────────────────────────────────────────────────────────
    # Typography Analysis
    # ─────────────────────────────────────────────────────────────
    log(" 📐 TYPE SCALE ANALYSIS")
    log(" " + "─" * 40)
    typography = analyze_type_scale(typography_tokens)

    consistency_icon = "✅" if typography.is_consistent else "⚠️"
    log(f" ├─ Detected Ratio: {typography.detected_ratio:.3f}")
    log(f" ├─ Closest Standard: {typography.scale_name} ({typography.closest_standard_ratio})")
    log(f" ├─ Consistent: {consistency_icon} {'Yes' if typography.is_consistent else f'No (variance: {typography.variance:.2f})'}")
    log(f" ├─ Sizes Found: {typography.sizes_px}")
    log(f" └─ 💡 Recommendation: {typography.recommendation} ({typography.recommendation_name})")
    log("")

    # ─────────────────────────────────────────────────────────────
    # Accessibility Analysis
    # ─────────────────────────────────────────────────────────────
    log(" ♿ ACCESSIBILITY CHECK (WCAG AA/AAA)")
    log(" " + "─" * 40)
    accessibility = analyze_accessibility(color_tokens, fg_bg_pairs=fg_bg_pairs)

    # Separate individual-color failures from real FG/BG pair failures
    # (pair results are identifiable by their "fg:" name prefix)
    pair_failures = [a for a in accessibility if not a.passes_aa_normal and a.name.startswith("fg:")]
    color_only_failures = [a for a in accessibility if not a.passes_aa_normal and not a.name.startswith("fg:")]
    failures = [a for a in accessibility if not a.passes_aa_normal]
    passes = len(accessibility) - len(failures)

    pair_count = len(fg_bg_pairs) if fg_bg_pairs else 0
    log(f" ├─ Colors Analyzed: {len(accessibility)}")
    log(f" ├─ FG/BG Pairs Checked: {pair_count}")
    log(f" ├─ AA Pass: {passes} ✅")
    log(f" ├─ AA Fail (color vs white/black): {len(color_only_failures)} {'❌' if color_only_failures else '✅'}")
    log(f" ├─ AA Fail (real FG/BG pairs): {len(pair_failures)} {'❌' if pair_failures else '✅'}")

    if color_only_failures:
        log(" │")
        log(" │ ⚠️ FAILING COLORS (vs white/black):")
        # Only the first 5 failures are itemized to keep the log compact
        for i, f in enumerate(color_only_failures[:5]):
            fix_info = f" → 💡 Fix: {f.suggested_fix} ({f.suggested_fix_contrast:.1f}:1)" if f.suggested_fix else ""
            log(f" │ ├─ {f.name}: {f.hex_color} ({f.contrast_on_white:.1f}:1 on white){fix_info}")
        if len(color_only_failures) > 5:
            log(f" │ └─ ... and {len(color_only_failures) - 5} more")

    if pair_failures:
        log(" │")
        log(" │ ❌ FAILING FG/BG PAIRS (actual on-page combinations):")
        for i, f in enumerate(pair_failures[:5]):
            fix_info = f" → 💡 Fix: {f.suggested_fix} ({f.suggested_fix_contrast:.1f}:1)" if f.suggested_fix else ""
            log(f" │ ├─ {f.name}{fix_info}")
        if len(pair_failures) > 5:
            log(f" │ └─ ... and {len(pair_failures) - 5} more")

    log("")

    # ─────────────────────────────────────────────────────────────
    # Spacing Grid Analysis
    # ─────────────────────────────────────────────────────────────
    log(" 📏 SPACING GRID ANALYSIS")
    log(" " + "─" * 40)
    spacing = analyze_spacing_grid(spacing_tokens)

    alignment_icon = "✅" if spacing.is_aligned else "⚠️"
    log(f" ├─ Detected Base: {spacing.detected_base}px")
    log(f" ├─ Grid Aligned: {alignment_icon} {spacing.alignment_percentage:.0f}%")

    if spacing.misaligned_values:
        log(f" ├─ Misaligned Values: {spacing.misaligned_values[:8]}{'...' if len(spacing.misaligned_values) > 8 else ''}")

    log(f" ├─ Suggested Scale: {spacing.suggested_scale[:10]}...")
    log(f" └─ 💡 Recommendation: {spacing.recommendation}px ({spacing.recommendation_reason})")
    log("")

    # ─────────────────────────────────────────────────────────────
    # Color Statistics
    # ─────────────────────────────────────────────────────────────
    log(" 🎨 COLOR PALETTE STATISTICS")
    log(" " + "─" * 40)
    color_stats = analyze_color_statistics(color_tokens)

    # Warning icons past heuristic thresholds: >10 duplicates, >30 unique colors
    dup_icon = "⚠️" if color_stats.duplicate_count > 10 else "✅"
    unique_icon = "⚠️" if color_stats.unique_count > 30 else "✅"

    log(f" ├─ Total Colors: {color_stats.total_count}")
    log(f" ├─ Unique Colors: {color_stats.unique_count} {unique_icon}")
    log(f" ├─ Exact Duplicates: {color_stats.duplicate_count} {dup_icon}")
    log(f" ├─ Near-Duplicates: {len(color_stats.near_duplicates)}")
    log(f" ├─ Grays: {color_stats.gray_count} | Saturated: {color_stats.saturated_count}")
    log(f" └─ Hue Distribution: {dict(list(color_stats.hue_distribution.items())[:5])}...")
    log("")

    # ─────────────────────────────────────────────────────────────
    # v3: Radius Grid Analysis
    # ─────────────────────────────────────────────────────────────
    radius_result = analyze_radius_grid(radius_tokens or {})
    if radius_result.tier_count > 0:
        log(" 🔘 RADIUS GRID ANALYSIS")
        log(" " + "─" * 40)
        align_icon = "✅" if radius_result.alignment_pct >= 80 else "⚠️"
        log(f" ├─ Tiers: {radius_result.tier_count} | Values: {radius_result.values_px[:8]}")
        log(f" ├─ Grid: base-{radius_result.grid_base} | Aligned: {align_icon} {radius_result.alignment_pct:.0f}%")
        log(f" ├─ Strategy: {radius_result.strategy} | Has full: {radius_result.has_full}")
        log(f" └─ Base-4: {radius_result.base_4_aligned}/{radius_result.tier_count} | Base-8: {radius_result.base_8_aligned}/{radius_result.tier_count}")
        log("")

    # ─────────────────────────────────────────────────────────────
    # v3: Shadow Elevation Analysis
    # ─────────────────────────────────────────────────────────────
    shadow_result = analyze_shadow_elevation(shadow_tokens or {})
    if shadow_result.level_count > 0:
        log(" 🌗 SHADOW ELEVATION ANALYSIS")
        log(" " + "─" * 40)
        mono_icon = "✅" if shadow_result.is_monotonic else "⚠️"
        color_icon = "✅" if shadow_result.color_consistent else "⚠️"
        log(f" ├─ Levels: {shadow_result.level_count} | Blur: {shadow_result.blur_values}")
        log(f" ├─ Monotonic Blur: {mono_icon} {'Yes' if shadow_result.is_monotonic else 'No — progression is non-linear'}")
        log(f" ├─ Color Consistent: {color_icon} {'Yes' if shadow_result.color_consistent else 'No — mixed shadow colors'}")
        log(f" └─ Verdict: {shadow_result.elevation_verdict}")
        log("")

    # ─────────────────────────────────────────────────────────────
    # Calculate Summary Scores
    # ─────────────────────────────────────────────────────────────

    # Consistency score (0-100) — v3: includes radius + shadow
    # Weights: typography 20, accessibility 20, spacing 20, color 20,
    # radius 10, shadows 10. Blocks with no tokens score a neutral middle value.
    type_score = 20 if typography.is_consistent else 8
    aa_score = 20 * (passes / max(len(accessibility), 1))
    spacing_score = 20 * (spacing.alignment_percentage / 100)
    color_score = 20 * (1 - min(color_stats.duplicate_count / max(color_stats.total_count, 1), 1))
    radius_score = 10 * (radius_result.alignment_pct / 100) if radius_result.tier_count > 0 else 5
    shadow_score = 10 if shadow_result.elevation_verdict == "good" else 5 if shadow_result.level_count >= 3 else 2

    consistency_score = int(type_score + aa_score + spacing_score + color_score + radius_score + shadow_score)

    log(" " + "─" * 40)
    log(f" RULE ENGINE SUMMARY")
    log(f" ├─ Consistency Score: {consistency_score}/100")
    log(f" ├─ AA Failures: {len(failures)}")
    log(f" ├─ Radius: {radius_result.tier_count} tiers ({radius_result.strategy})")
    log(f" ├─ Shadows: {shadow_result.level_count} levels ({shadow_result.elevation_verdict})")
    log(f" └─ Cost: $0.00 (free)")
    log("")

    return RuleEngineResults(
        typography=typography,
        accessibility=accessibility,
        spacing=spacing,
        color_stats=color_stats,
        radius=radius_result,
        shadows=shadow_result,
        aa_failures=len(failures),
        consistency_score=consistency_score,
    )
+""" + +from datetime import datetime +from enum import Enum +from typing import Optional, Any +from pydantic import BaseModel, Field, field_validator + + +# ============================================================================= +# ENUMS +# ============================================================================= + +class TokenSource(str, Enum): + """Origin of a token value.""" + DETECTED = "detected" # Directly found in CSS + INFERRED = "inferred" # Derived from patterns + UPGRADED = "upgraded" # User-selected improvement + MANUAL = "manual" # User manually added + + +class Confidence(str, Enum): + """Confidence level for extracted tokens.""" + HIGH = "high" # 10+ occurrences, consistent usage + MEDIUM = "medium" # 3-9 occurrences + LOW = "low" # 1-2 occurrences or conflicting + + +class Viewport(str, Enum): + """Viewport type.""" + DESKTOP = "desktop" # 1440px width + MOBILE = "mobile" # 375px width + + +class PageType(str, Enum): + """Type of page template.""" + HOMEPAGE = "homepage" + LISTING = "listing" + DETAIL = "detail" + FORM = "form" + MARKETING = "marketing" + AUTH = "auth" + CHECKOUT = "checkout" + ABOUT = "about" + CONTACT = "contact" + OTHER = "other" + + +# ============================================================================= +# BASE TOKEN MODEL +# ============================================================================= + +class BaseToken(BaseModel): + """Base class for all tokens.""" + source: TokenSource = TokenSource.DETECTED + confidence: Confidence = Confidence.MEDIUM + frequency: int = 0 + suggested_name: Optional[str] = None + + # For tracking user decisions + accepted: bool = True + flagged: bool = False + notes: Optional[str] = None + + +# ============================================================================= +# COLOR TOKENS +# ============================================================================= + +class ColorToken(BaseToken): + """Extracted color token.""" + value: str # hex value (e.g., "#007bff") + 
value_rgb: Optional[str] = None # "rgb(0, 123, 255)" + value_hsl: Optional[str] = None # "hsl(211, 100%, 50%)" + + # Context information + contexts: list[str] = Field(default_factory=list) # ["background", "text", "border"] + elements: list[str] = Field(default_factory=list) # ["button", "header", "link"] + css_properties: list[str] = Field(default_factory=list) # ["background-color", "color"] + + # Role hint from normalizer (for AURORA to consume) + # Values: "brand_candidate", "text_candidate", "bg_candidate", "border_candidate", + # "feedback_candidate", "palette" (generic colored), None (unclassified) + role_hint: Optional[str] = None + + # Accessibility + contrast_white: Optional[float] = None # Contrast ratio against white + contrast_black: Optional[float] = None # Contrast ratio against black + wcag_aa_large_text: bool = False + wcag_aa_small_text: bool = False + wcag_aaa_large_text: bool = False + wcag_aaa_small_text: bool = False + + @field_validator("value") + @classmethod + def validate_hex(cls, v: str) -> str: + """Ensure hex color is properly formatted.""" + v = v.strip().lower() + if not v.startswith("#"): + v = f"#{v}" + # Convert 3-digit hex to 6-digit + if len(v) == 4: + v = f"#{v[1]}{v[1]}{v[2]}{v[2]}{v[3]}{v[3]}" + return v + + +class ColorRamp(BaseModel): + """Generated color ramp with shades.""" + base_color: str # Original extracted color + name: str # e.g., "primary", "neutral" + shades: dict[str, str] = Field(default_factory=dict) # {"50": "#e6f2ff", "500": "#007bff", ...} + source: TokenSource = TokenSource.UPGRADED + + +# ============================================================================= +# TYPOGRAPHY TOKENS +# ============================================================================= + +class TypographyToken(BaseToken): + """Extracted typography token.""" + font_family: str + font_size: str # "16px" or "1rem" + font_size_px: Optional[float] = None # Computed px value + font_weight: int = 400 + line_height: str = "1.5" # 
"1.5" or "24px" + line_height_computed: Optional[float] = None # Computed ratio + letter_spacing: Optional[str] = None + text_transform: Optional[str] = None # "uppercase", "lowercase", etc. + + # Context + elements: list[str] = Field(default_factory=list) # ["h1", "p", "button"] + css_selectors: list[str] = Field(default_factory=list) # [".heading", ".body-text"] + + +class TypeScale(BaseModel): + """Typography scale configuration.""" + name: str # "Major Third", "Perfect Fourth" + ratio: float # 1.25, 1.333 + base_size: int = 16 # px + sizes: dict[str, str] = Field(default_factory=dict) # {"xs": "12px", "sm": "14px", ...} + source: TokenSource = TokenSource.UPGRADED + + +class FontFamily(BaseModel): + """Font family information.""" + name: str # "Inter" + fallbacks: list[str] = Field(default_factory=list) # ["system-ui", "sans-serif"] + category: str = "sans-serif" # "serif", "sans-serif", "monospace" + frequency: int = 0 + usage: str = "primary" # "primary", "secondary", "accent", "monospace" + + +# ============================================================================= +# SPACING TOKENS +# ============================================================================= + +class SpacingToken(BaseToken): + """Extracted spacing token.""" + value: str # "16px" + value_px: int # 16 + + # Context + contexts: list[str] = Field(default_factory=list) # ["margin", "padding", "gap"] + properties: list[str] = Field(default_factory=list) # ["margin-top", "padding-left"] + + # Analysis + fits_base_4: bool = False # Divisible by 4 + fits_base_8: bool = False # Divisible by 8 + is_outlier: bool = False # Doesn't fit common patterns + + +class SpacingScale(BaseModel): + """Spacing scale configuration.""" + name: str # "8px base" + base: int # 8 + scale: list[int] = Field(default_factory=list) # [4, 8, 16, 24, 32, 48, 64] + names: dict[int, str] = Field(default_factory=dict) # {4: "xs", 8: "sm", 16: "md"} + source: TokenSource = TokenSource.UPGRADED + + +# 
# =============================================================================
# BORDER RADIUS TOKENS
# =============================================================================

class RadiusToken(BaseToken):
    """Extracted border radius token."""
    value: str  # "8px" or "50%"
    value_px: Optional[int] = None  # If px value (None for percentage radii)

    # Context
    elements: list[str] = Field(default_factory=list)  # ["button", "card", "input"]

    # Analysis
    fits_base_4: bool = False  # px value divisible by 4
    fits_base_8: bool = False  # px value divisible by 8


# =============================================================================
# SHADOW TOKENS
# =============================================================================

class ShadowToken(BaseToken):
    """Extracted box shadow token."""
    value: str  # Full CSS shadow value

    # Parsed components of the shorthand value (all optional — depend on parser)
    offset_x: Optional[str] = None
    offset_y: Optional[str] = None
    blur: Optional[str] = None
    spread: Optional[str] = None
    color: Optional[str] = None
    inset: bool = False

    # Computed numeric values for sorting/analysis
    blur_px: Optional[float] = None
    y_offset_px: Optional[float] = None

    # Context
    elements: list[str] = Field(default_factory=list)


# =============================================================================
# PAGE & CRAWL MODELS
# =============================================================================

class DiscoveredPage(BaseModel):
    """A page discovered during crawling."""
    url: str
    title: Optional[str] = None
    page_type: PageType = PageType.OTHER
    depth: int = 0          # Distance from homepage
    selected: bool = True   # User can deselect pages

    # Crawl status
    crawled: bool = False
    error: Optional[str] = None


class CrawlResult(BaseModel):
    """Result of crawling a single page."""
    url: str
    viewport: Viewport
    success: bool

    # Timing
    started_at: datetime
    completed_at: Optional[datetime] = None
    duration_ms: Optional[int] = None

    # Results (counts of tokens found on this page)
    colors_found: int = 0
    typography_found: int = 0
    spacing_found: int = 0

    # Errors
    error: Optional[str] = None
    warnings: list[str] = Field(default_factory=list)


# =============================================================================
# EXTRACTION RESULT
# =============================================================================

class ExtractedTokens(BaseModel):
    """Complete extraction result for one viewport."""
    viewport: Viewport
    source_url: str
    pages_crawled: list[str] = Field(default_factory=list)

    # Extracted tokens
    colors: list[ColorToken] = Field(default_factory=list)
    typography: list[TypographyToken] = Field(default_factory=list)
    spacing: list[SpacingToken] = Field(default_factory=list)
    radius: list[RadiusToken] = Field(default_factory=list)
    shadows: list[ShadowToken] = Field(default_factory=list)

    # Detected patterns
    font_families: list[FontFamily] = Field(default_factory=list)
    base_font_size: Optional[str] = None
    spacing_base: Optional[int] = None       # Detected: 4 or 8
    naming_convention: Optional[str] = None  # "bem", "utility", "none"

    # Metadata
    extraction_timestamp: datetime = Field(default_factory=datetime.now)
    extraction_duration_ms: Optional[int] = None

    # Quality indicators
    total_elements_analyzed: int = 0
    unique_colors: int = 0
    unique_font_sizes: int = 0
    unique_spacing_values: int = 0

    # Issues
    errors: list[str] = Field(default_factory=list)
    warnings: list[str] = Field(default_factory=list)

    def summary(self) -> dict:
        """Get extraction summary: per-category counts for logging/UI."""
        return {
            "viewport": self.viewport.value,
            "pages_crawled": len(self.pages_crawled),
            "colors": len(self.colors),
            "typography": len(self.typography),
            "spacing": len(self.spacing),
            "radius": len(self.radius),
            "shadows": len(self.shadows),
            "font_families": len(self.font_families),
            "errors": len(self.errors),
            "warnings": len(self.warnings),
        }


# =============================================================================
# NORMALIZED TOKENS
(Agent 2 Output) +# ============================================================================= + +class NormalizedTokens(BaseModel): + """Normalized and structured tokens from Agent 2.""" + viewport: Viewport + source_url: str + + # Normalized tokens with suggested names + colors: dict[str, ColorToken] = Field(default_factory=dict) # {"primary-500": ColorToken, ...} + typography: dict[str, TypographyToken] = Field(default_factory=dict) + spacing: dict[str, SpacingToken] = Field(default_factory=dict) + radius: dict[str, RadiusToken] = Field(default_factory=dict) + shadows: dict[str, ShadowToken] = Field(default_factory=dict) + + # Detected info + font_families: list[FontFamily] = Field(default_factory=list) + detected_spacing_base: Optional[int] = None + detected_naming_convention: Optional[str] = None + + # Duplicates & conflicts + duplicate_colors: list[tuple[str, str]] = Field(default_factory=list) # [("#1a1a1a", "#1b1b1b"), ...] + conflicting_tokens: list[str] = Field(default_factory=list) + + # Metadata + normalized_at: datetime = Field(default_factory=datetime.now) + + +# ============================================================================= +# UPGRADE OPTIONS (Agent 3 Output) +# ============================================================================= + +class UpgradeOption(BaseModel): + """A single upgrade option.""" + id: str + name: str + description: str + category: str # "typography", "spacing", "colors", "naming" + + # The actual values + values: dict[str, Any] = Field(default_factory=dict) + + # Metadata + pros: list[str] = Field(default_factory=list) + cons: list[str] = Field(default_factory=list) + effort: str = "low" # "low", "medium", "high" + recommended: bool = False + + # Selection state + selected: bool = False + + +class UpgradeRecommendations(BaseModel): + """All upgrade recommendations from Agent 3.""" + + # Options by category + typography_scales: list[UpgradeOption] = Field(default_factory=list) + spacing_systems: 
list[UpgradeOption] = Field(default_factory=list) + color_ramps: list[UpgradeOption] = Field(default_factory=list) + naming_conventions: list[UpgradeOption] = Field(default_factory=list) + + # LLM analysis results + llm_rationale: str = "" + detected_patterns: list[str] = Field(default_factory=list) + brand_analysis: list[dict] = Field(default_factory=list) # From LLM research + color_observations: str = "" + + # Accessibility + accessibility_issues: list[str] = Field(default_factory=list) + accessibility_fixes: list[UpgradeOption] = Field(default_factory=list) + + # Metadata + generated_at: datetime = Field(default_factory=datetime.now) + + +# ============================================================================= +# FINAL OUTPUT (Agent 4 Output) +# ============================================================================= + +class TokenMetadata(BaseModel): + """Metadata for exported tokens.""" + source_url: str + extracted_at: datetime + version: str + viewport: Viewport + generator: str = "Design System Extractor v3" + + +class FinalTokens(BaseModel): + """Final exported token set.""" + metadata: TokenMetadata + + # Token collections + colors: dict[str, dict] = Field(default_factory=dict) + typography: dict[str, dict] = Field(default_factory=dict) + spacing: dict[str, dict] = Field(default_factory=dict) + radius: dict[str, dict] = Field(default_factory=dict) + shadows: dict[str, dict] = Field(default_factory=dict) + + def to_tokens_studio_format(self) -> dict: + """Convert to Tokens Studio compatible format.""" + return { + "$metadata": { + "source": self.metadata.source_url, + "version": self.metadata.version, + }, + "color": self.colors, + "typography": self.typography, + "spacing": self.spacing, + "borderRadius": self.radius, + "boxShadow": self.shadows, + } + + def to_css_variables(self) -> str: + """Convert to CSS custom properties.""" + lines = [":root {"] + + for name, data in self.colors.items(): + value = data.get("value", data) if 
isinstance(data, dict) else data + lines.append(f" --color-{name}: {value};") + + for name, data in self.spacing.items(): + value = data.get("value", data) if isinstance(data, dict) else data + lines.append(f" --space-{name}: {value};") + + lines.append("}") + return "\n".join(lines) + + +# ============================================================================= +# LANGGRAPH STATE +# ============================================================================= + +class WorkflowState(BaseModel): + """LangGraph workflow state.""" + + # Input + base_url: str + + # Discovery phase + discovered_pages: list[DiscoveredPage] = Field(default_factory=list) + confirmed_pages: list[str] = Field(default_factory=list) + + # Extraction phase + desktop_tokens: Optional[ExtractedTokens] = None + mobile_tokens: Optional[ExtractedTokens] = None + + # Normalization phase + desktop_normalized: Optional[NormalizedTokens] = None + mobile_normalized: Optional[NormalizedTokens] = None + + # Upgrade phase + upgrade_recommendations: Optional[UpgradeRecommendations] = None + selected_upgrades: dict[str, str] = Field(default_factory=dict) # {"typography_scale": "major_third", ...} + + # Generation phase + desktop_final: Optional[FinalTokens] = None + mobile_final: Optional[FinalTokens] = None + + # Workflow status + current_stage: str = "init" # "init", "discover", "confirm", "extract", "normalize", "review", "upgrade", "generate", "export" + errors: list[str] = Field(default_factory=list) + warnings: list[str] = Field(default_factory=list) + + # Timestamps + started_at: Optional[datetime] = None + completed_at: Optional[datetime] = None + + class Config: + arbitrary_types_allowed = True diff --git a/core/validation.py b/core/validation.py new file mode 100644 index 0000000000000000000000000000000000000000..1ccbdb8564e861b0db9c4c70bb3d9e3fbb709aa3 --- /dev/null +++ b/core/validation.py @@ -0,0 +1,172 @@ +""" +Agent Output Validation +======================== + +JSON schemas for validating 
LLM agent outputs. +Ensures data integrity between pipeline stages. +""" + +from typing import Any, Optional + +try: + from jsonschema import validate, ValidationError + + HAS_JSONSCHEMA = True +except ImportError: + HAS_JSONSCHEMA = False + +from core.logging import get_logger + +logger = get_logger("validation") + + +# ============================================================================= +# SCHEMAS +# ============================================================================= + +BRAND_IDENTIFICATION_SCHEMA = { + "type": "object", + "properties": { + "brand_primary": {"type": ["string", "null"]}, + "brand_secondary": {"type": ["string", "null"]}, + "brand_accent": {"type": ["string", "null"]}, + "palette_strategy": {"type": "string"}, + "cohesion_score": {"type": ["number", "integer"]}, + "cohesion_notes": {"type": "string"}, + "semantic_names": {"type": "object"}, + "self_evaluation": {"type": "object"}, + }, + "required": ["brand_primary", "palette_strategy"], +} + +BENCHMARK_ADVICE_SCHEMA = { + "type": "object", + "properties": { + "recommended_benchmark": {"type": "string"}, + "recommended_benchmark_name": {"type": "string"}, + "reasoning": {"type": "string"}, + "alignment_changes": {"type": "array"}, + "pros_of_alignment": {"type": "array"}, + "cons_of_alignment": {"type": "array"}, + "alternative_benchmarks": {"type": "array"}, + "self_evaluation": {"type": "object"}, + }, + "required": ["recommended_benchmark", "reasoning"], +} + +BEST_PRACTICES_SCHEMA = { + "type": "object", + "properties": { + "overall_score": {"type": ["number", "integer"]}, + "checks": {"type": "array"}, + "priority_fixes": {"type": "array"}, + "passing_practices": {"type": "array"}, + "failing_practices": {"type": "array"}, + "self_evaluation": {"type": "object"}, + }, + "required": ["overall_score", "priority_fixes"], +} + +HEAD_SYNTHESIS_SCHEMA = { + "type": "object", + "properties": { + "executive_summary": {"type": "string"}, + "scores": {"type": "object"}, + 
"benchmark_fit": {"type": "object"}, + "brand_analysis": {"type": "object"}, + "top_3_actions": {"type": "array"}, + "color_recommendations": {"type": "array"}, + "type_scale_recommendation": {"type": "object"}, + "spacing_recommendation": {"type": "object"}, + "self_evaluation": {"type": "object"}, + }, + "required": ["executive_summary", "top_3_actions"], +} + +# Map agent names to schemas +AGENT_SCHEMAS = { + "aurora": BRAND_IDENTIFICATION_SCHEMA, + "brand_identifier": BRAND_IDENTIFICATION_SCHEMA, + "atlas": BENCHMARK_ADVICE_SCHEMA, + "benchmark_advisor": BENCHMARK_ADVICE_SCHEMA, + "sentinel": BEST_PRACTICES_SCHEMA, + "best_practices": BEST_PRACTICES_SCHEMA, + "nexus": HEAD_SYNTHESIS_SCHEMA, + "head_synthesizer": HEAD_SYNTHESIS_SCHEMA, +} + + +# ============================================================================= +# VALIDATION FUNCTIONS +# ============================================================================= + +def validate_agent_output(data: Any, agent_name: str) -> tuple[bool, Optional[str]]: + """ + Validate an agent's output against its expected schema. 
+ + Args: + data: The output data (dict or dataclass with to_dict()) + agent_name: Name of the agent (e.g., 'aurora', 'nexus') + + Returns: + (is_valid, error_message) tuple + """ + agent_key = agent_name.lower().strip() + schema = AGENT_SCHEMAS.get(agent_key) + + if not schema: + logger.warning(f"No schema found for agent: {agent_name}") + return True, None # No schema = pass (don't block) + + # Convert dataclass to dict if needed + if hasattr(data, "to_dict"): + data_dict = data.to_dict() + elif hasattr(data, "__dataclass_fields__"): + from dataclasses import asdict + data_dict = asdict(data) + elif isinstance(data, dict): + data_dict = data + else: + return False, f"Cannot validate: unexpected type {type(data)}" + + if not HAS_JSONSCHEMA: + # Fallback: manual required-field check + return _manual_validate(data_dict, schema, agent_name) + + try: + validate(instance=data_dict, schema=schema) + logger.debug(f"Validation passed for {agent_name}") + return True, None + except ValidationError as e: + error_msg = f"Validation failed for {agent_name}: {e.message}" + logger.warning(error_msg) + return False, error_msg + + +def _manual_validate(data: dict, schema: dict, agent_name: str) -> tuple[bool, Optional[str]]: + """Fallback validation without jsonschema library.""" + required = schema.get("required", []) + missing = [field for field in required if field not in data] + + if missing: + error_msg = f"{agent_name} output missing required fields: {missing}" + logger.warning(error_msg) + return False, error_msg + + return True, None + + +def validate_all_agents(outputs: dict) -> dict[str, tuple[bool, Optional[str]]]: + """ + Validate all agent outputs at once. 
+ + Args: + outputs: Dict mapping agent_name → output data + + Returns: + Dict mapping agent_name → (is_valid, error_message) + """ + results = {} + for agent_name, data in outputs.items(): + results[agent_name] = validate_agent_output(data, agent_name) + return results diff --git a/docs/CONTEXT.md b/docs/CONTEXT.md new file mode 100644 index 0000000000000000000000000000000000000000..33a5c22397002ba53f0e67e437854e1851650c87 --- /dev/null +++ b/docs/CONTEXT.md @@ -0,0 +1,797 @@ +# Design System Extractor v2 — Master Context File + +> **Upload this file to refresh Claude's context when continuing work on this project.** + +**Last Updated:** January 2026 + +--- + +## 📁 Files Changed in Latest Session + +| File | What Changed | +|------|--------------| +| `agents/extractor.py` | Enhanced 7-source extraction (DOM, CSS vars, SVG, inline, stylesheets, external CSS, page scan) | +| `agents/firecrawl_extractor.py` | **NEW** Agent 1B for deep CSS parsing | +| `agents/semantic_analyzer.py` | **NEW** Agent 1C for semantic color categorization (brand/text/bg/border) | +| `core/preview_generator.py` | AS-IS previews + Color Ramps sorted by brand priority | +| `app.py` | Stage 1 UI now has 6 preview tabs including Semantic Colors | +| `docs/CONTEXT.md` | Updated with semantic analyzer, full architecture diagrams | + +--- + +## 🎯 Project Goal + +Build a **semi-automated, human-in-the-loop agentic system** that: +1. Reverse-engineers a design system from a live website +2. Reconstructs and upgrades it into a modern, scalable design system +3. Outputs production-ready JSON tokens (Figma Tokens Studio compatible) + +**Philosophy:** This is a design-aware co-pilot, NOT a magic button. Humans decide, agents propose. + +--- + +## 🤔 Why This Project? 
(Market Differentiation) + +### The Problem We Solve + +| Pain Point | Who Has It | Current Solutions | Why They Fail | +|------------|------------|-------------------|---------------| +| Legacy websites with no design system | Enterprise teams | Manual audit (weeks) | Time-consuming, error-prone | +| Inconsistent design tokens scattered in CSS | Agencies inheriting projects | Figma plugins (style extractors) | Only extract from Figma, not live sites | +| Need to modernize without breaking existing | Product teams | Design system generators | Generate new, don't reverse-engineer existing | +| AA compliance gaps unknown | Accessibility teams | Contrast checkers | Check one color at a time, no system view | + +### Existing Tools & Their Gaps + +| Tool | What It Does | Gap We Fill | +|------|--------------|-------------| +| **Figma Tokens Studio** | Manages tokens in Figma | Doesn't extract from websites | +| **Style Dictionary** | Transforms tokens to code | Needs tokens first (we create them) | +| **Polypane/VisBug** | Inspect live sites | No systematic extraction or upgrade | +| **AI Design Tools** (Galileo, Uizard) | Generate new designs | Don't reverse-engineer existing | +| **CSS Stats** | Analyze CSS files | Statistics only, no actionable tokens | +| **Chromatic/Percy** | Visual regression | Compare, don't extract or upgrade | + +### Our Unique Value Proposition + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ WHAT MAKES US DIFFERENT │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ 1. REVERSE-ENGINEERING (not generation) │ +│ • Extracts from LIVE websites, not design files │ +│ • Preserves what's working, upgrades what's broken │ +│ • Respects existing brand decisions │ +│ │ +│ 2. MULTI-AGENT REASONING (not single LLM) │ +│ • Two analysts with different perspectives │ +│ • HEAD compiler resolves conflicts │ +│ • Shows reasoning, not just results │ +│ │ +│ 3. 
HUMAN-IN-THE-LOOP (not magic button) │ +│ • Designer reviews every stage │ +│ • Accept/reject individual tokens │ +│ • Choose from upgrade OPTIONS, not forced decisions │ +│ │ +│ 4. VISUAL PREVIEWS (not just data tables) │ +│ • Typography rendered in actual detected font │ +│ • Color ramps with AA compliance per shade │ +│ • See before you export │ +│ │ +│ 5. COST-TRANSPARENT (not black box) │ +│ • Shows token usage and cost per analysis │ +│ • Uses HF free tier ($0.10/mo) or Pro ($2/mo) │ +│ • ~$0.05 per full analysis │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Target Users + +| User | Use Case | Value | +|------|----------|-------| +| **UX Managers** (like you!) | Modernize legacy booking platforms | Weeks → Hours | +| **Design System Teams** | Audit and standardize existing properties | Systematic, not ad-hoc | +| **Agencies** | Onboard client projects with no documentation | Instant design inventory | +| **Accessibility Consultants** | AA compliance audit with fixes | Full palette view | +| **Developers** | Get production-ready tokens from designer's website | No manual translation | + +### Why Not Just Use [X]? + +**"Why not just inspect the CSS manually?"** +→ You could, but it takes weeks for a complex site. We do it in minutes with systematic coverage. + +**"Why not use Figma's native styles?"** +→ Many legacy sites were never in Figma. We extract from the source of truth: the live website. + +**"Why do you need AI? Can't rules handle this?"** +→ Rules extract tokens. AI understands *design intent* — why is this color used here? What scale was intended? Where does it deviate from best practices? + +**"Isn't this just CSS Stats with AI?"** +→ CSS Stats tells you what exists. We tell you what it *should* be and give you actionable upgrade paths. 
+ +--- + +## 🏗️ Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ TECH STACK │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ Frontend: Gradio (long-scroll, sectioned UI with live preview) │ +│ Orchestration: LangGraph (agent state management & workflow) │ +│ Models: HuggingFace Inference Providers (Novita, Groq, etc.) │ +│ Hosting: Hugging Face Spaces │ +│ Storage: HF Spaces persistent storage │ +│ Output: Platform-agnostic JSON tokens (Figma Tokens Studio) │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 🧠 Model Assignments + +### Stage 2: Multi-Agent Analysis (4 Named Agents + Rule Engine) + +| Agent | Persona | Model | Temperature | Cost | +|-------|---------|-------|-------------|------| +| **Rule Engine** | — (deterministic) | None | — | FREE | +| **AURORA** | Brand Color Analyst | `Qwen/Qwen2.5-72B-Instruct` | 0.4 | ~Free (HF PRO) | +| **ATLAS** | Benchmark Advisor | `meta-llama/Llama-3.3-70B-Instruct` | 0.25 | ~Free (HF PRO) | +| **SENTINEL** | Best Practices Auditor | `Qwen/Qwen2.5-72B-Instruct` | 0.2 | ~Free (HF PRO) | +| **NEXUS** | Head Synthesizer | `meta-llama/Llama-3.3-70B-Instruct` | 0.3 | ~$0.001 | + +**Architecture:** +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ LAYER 1: DETERMINISTIC (Free — $0.00) │ +│ ├─ WCAG Contrast Checker (actual FG/BG pairs, not just vs white) │ +│ ├─ Type Scale Detection (ratio math, variance, standard comparison) │ +│ ├─ Spacing Grid Analysis (GCD math, alignment %) │ +│ └─ Color Statistics (unique, near-duplicates, hue distribution) │ +│ │ +│ LAYER 2: 4 AI AGENTS (~$0.003 total) │ +│ │ +│ Rule Engine Results │ +│ │ │ +│ ┌────┼────────────────┐ │ +│ ↓ ↓ ↓ │ +│ ┌────────┐ ┌────────┐ ┌──────────┐ │ +│ │ AURORA │ │ ATLAS │ │ SENTINEL │ (analyze in parallel) │ +│ │ Brand │ │ Bench- │ │ Best │ │ +│ │ Colors │ │ marks │ │ Practices│ │ 
+│ │Qwen 72B│ │Llama70B│ │ Qwen 72B │ │ +│ └───┬────┘ └───┬────┘ └────┬─────┘ │ +│ └───────────┼────────────┘ │ +│ ↓ │ +│ ┌───────────┐ │ +│ │ NEXUS │ (final synthesis) │ +│ │ Llama 70B │ │ +│ │ • Resolve │ │ +│ │ • Score │ │ +│ │ • Top 3 │ │ +│ └───────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Other Agents + +| Agent | Role | Model | Provider | Why | +|-------|------|-------|----------|-----| +| **Agent 1** | Crawler & Extractor | None (Rule-based) | — | Pure CSS extraction, no LLM needed | +| **Agent 2** | Normalizer | `microsoft/Phi-3.5-mini-instruct` | Novita | Fast, great structured output | +| **Agent 4** | Generator | `mistralai/Codestral-22B-v0.1` | Novita | Code specialist, JSON formatting | + +### Provider Configuration + +Default provider: **Novita** (configurable in `config/agents.yaml`) + +Available providers (via HuggingFace Inference Providers): +- **novita** - Default, good balance +- **groq** - Fastest +- **cerebras** - Ultra-fast +- **sambanova** - Good for Llama +- **together** - Wide model selection + +### Cost Tracking + +Estimated cost per Stage 2 analysis: **~$0.003** +- Rule Engine: $0.00 (free — pure math) +- AURORA + ATLAS + SENTINEL: ~Free within HF PRO ($9/mo subscription) +- NEXUS: ~$0.001 +- HuggingFace PRO tier: $9/month (covers inference for all models) + +--- + +## 👁️ Visual Previews + +### Stage 1: AS-IS Previews (No Enhancements) + +Shows raw extracted values exactly as found on the website: + +| Preview | What It Shows | +|---------|---------------| +| **Typography** | Actual font rendered with detected styles | +| **Colors** | Simple swatches with hex, frequency, context, AA status | +| **Spacing** | Visual bars representing each spacing value | +| **Radius** | Boxes with each border-radius applied | +| **Shadows** | Cards with each box-shadow applied | + +### Stage 2: Enhanced Previews (Upgraded) + +Shows proposed upgrades and improvements: + +| Preview | What It Shows 
| +|---------|---------------| +| **Typography** | Type scale comparison (1.2, 1.25, 1.333 ratios) | +| **Color Ramps** | 11 shades (50-950) with AA compliance per shade | + +--- + +## 🔍 Enhanced Extraction (Agent 1) + +Agent 1 now extracts from **5 sources** to capture ALL colors: + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ ENHANCED EXTRACTION SOURCES │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ 1. DOM Computed Styles │ +│ • window.getComputedStyle(element) │ +│ • Captures: color, background-color, border-color, etc. │ +│ │ +│ 2. CSS Variables │ +│ • :root { --primary-color: #3860be; } │ +│ • Parses all stylesheets for CSS custom properties │ +│ │ +│ 3. SVG Colors │ +│ • │ +│ • │ +│ │ +│ 4. Inline Styles │ +│ •
    │ +│ • Parses style attributes for color values │ +│ │ +│ 5. Stylesheet Rules │ +│ • Parses CSS rules that may not be applied to visible elements │ +│ • Catches hover states, pseudo-elements, etc. │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 📋 Enhanced Logging + +### Stage 1 Extraction Logs + +Shows detailed extraction progress: +``` +============================================================ +🖥️ DESKTOP EXTRACTION (1440px) +============================================================ + +📡 Enhanced extraction from 5 sources: + 1. DOM computed styles (getComputedStyle) + 2. CSS variables (:root { --color: }) + 3. SVG colors (fill, stroke) + 4. Inline styles (style='color:') + 5. Stylesheet rules (CSS files) + 6. External CSS files (fetch & parse) + 7. Page content scan (brute-force) + +📊 EXTRACTION RESULTS: + Colors: 45 unique + Typography: 12 styles + Spacing: 28 values + Radius: 8 values + Shadows: 4 values + +🎨 CSS Variables found: 15 + --primary-color: #3860be + --accent-color: #00c4cc + --brand-lime: #bcd432 + ... and 12 more + +🔄 Normalizing (deduping, naming)... + ✅ Normalized: 32 colors, 10 typography, 18 spacing + +============================================================ +🔥 FIRECRAWL CSS EXTRACTION +============================================================ + + 🌐 Scraping: https://example.com + ✅ Page scraped (125000 chars) + 📝 Parsing + + +
    +
    +

    🎨 Design Token Creator

    +

    Upload design tokens JSON → Create Figma styles & variables

    +
    + +
    + +
    + + + +
    +
    No file selected
    +
    + + + +
    + +
    Upload a JSON file to see tokens
    +
    + + + +
    + +
    +
    0 / 0
    +
    +
    + +
    + +
    + + + +
    +
    + + + + diff --git a/output_json/file (16).json b/output_json/file (16).json new file mode 100644 index 0000000000000000000000000000000000000000..15f75ae40912efb606101085f4d775609fa50f74 --- /dev/null +++ b/output_json/file (16).json @@ -0,0 +1,584 @@ +{ + "color": { + "background": { + "primary": { + "$type": "color", + "$value": "#ebedef" + }, + "secondary": { + "$type": "color", + "$value": "#bfbfbf" + } + }, + "border": { + "default": { + "$type": "color", + "$value": "#122f44" + } + }, + "text": { + "primary": { + "$type": "color", + "$value": "#000000" + }, + "secondary": { + "$type": "color", + "$value": "#999999" + }, + "muted": { + "$type": "color", + "$value": "#cccccc" + } + }, + "brand": { + "primary": { + "$type": "color", + "$value": "#005aa3" + }, + "secondary": { + "$type": "color", + "$value": "#ff0000" + } + }, + "feedback": { + "success": { + "$type": "color", + "$value": "#3c7312" + }, + "warning": { + "$type": "color", + "$value": "#ffdc00" + } + }, + "button": { + "$type": "color", + "$value": "#ffffff" + }, + "purple": { + "500": { + "$type": "color", + "$value": "#885b9a" + } + }, + "neutral": { + "dark": { + "$type": "color", + "$value": "#333333" + }, + "light": { + "$type": "color", + "$value": "#b2b8bf" + } + }, + "blue": { + "dark": { + "$type": "color", + "$value": "#2c3e50" + }, + "light": { + "$type": "color", + "$value": "#b9daff" + }, + "300": { + "$type": "color", + "$value": "#7fdbff" + }, + "base": { + "$type": "color", + "$value": "#6f7597" + } + }, + "yellow": { + "light": { + "$type": "color", + "$value": "#fff6db" + } + }, + "orange": { + "light": { + "$type": "color", + "$value": "#d0bfa4" + }, + "base": { + "$type": "color", + "$value": "#a85410" + }, + "100": { + "$type": "color", + "$value": "#fdebdd" + } + }, + "green": { + "500": { + "$type": "color", + "$value": "#2ecc40" + } + }, + "red": { + "base": { + "$type": "color", + "$value": "#ff2d55" + } + } + }, + "font": { + "display": { + "2xl": { + "desktop": { + 
"$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "68px", + "fontWeight": "700", + "lineHeight": "1.2" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "60px", + "fontWeight": "700", + "lineHeight": "1.2" + } + } + }, + "xl": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "58px", + "fontWeight": "700", + "lineHeight": "1.2" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "50px", + "fontWeight": "700", + "lineHeight": "1.2" + } + } + }, + "lg": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "48px", + "fontWeight": "700", + "lineHeight": "1.2" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "42px", + "fontWeight": "700", + "lineHeight": "1.2" + } + } + }, + "md": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "40px", + "fontWeight": "700", + "lineHeight": "1.2" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "34px", + "fontWeight": "700", + "lineHeight": "1.2" + } + } + } + }, + "heading": { + "xl": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "34px", + "fontWeight": "600", + "lineHeight": "1.3" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "30px", + "fontWeight": "600", + "lineHeight": "1.3" + } + } + }, + "lg": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "28px", + "fontWeight": "600", + "lineHeight": "1.3" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "24px", + "fontWeight": "600", + "lineHeight": "1.3" + } + } + }, + "md": { 
+ "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "24px", + "fontWeight": "600", + "lineHeight": "1.3" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "20px", + "fontWeight": "600", + "lineHeight": "1.3" + } + } + }, + "sm": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "20px", + "fontWeight": "600", + "lineHeight": "1.3" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "16px", + "fontWeight": "600", + "lineHeight": "1.3" + } + } + } + }, + "body": { + "lg": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "16px", + "fontWeight": "400", + "lineHeight": "1.5" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "14px", + "fontWeight": "400", + "lineHeight": "1.5" + } + } + }, + "md": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "14px", + "fontWeight": "400", + "lineHeight": "1.5" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "12px", + "fontWeight": "400", + "lineHeight": "1.5" + } + } + }, + "sm": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "12px", + "fontWeight": "400", + "lineHeight": "1.5" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "10px", + "fontWeight": "400", + "lineHeight": "1.5" + } + } + } + }, + "caption": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "10px", + "fontWeight": "400", + "lineHeight": "1.4" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "8px", + "fontWeight": "400", + "lineHeight": "1.4" 
+ } + } + }, + "overline": { + "desktop": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "8px", + "fontWeight": "500", + "lineHeight": "1.2" + } + }, + "mobile": { + "$type": "typography", + "$value": { + "fontFamily": "sans-serif", + "fontSize": "6px", + "fontWeight": "500", + "lineHeight": "1.2" + } + } + } + }, + "space": { + "1": { + "desktop": { + "$type": "dimension", + "$value": "8px" + }, + "mobile": { + "$type": "dimension", + "$value": "8px" + } + }, + "2": { + "desktop": { + "$type": "dimension", + "$value": "16px" + }, + "mobile": { + "$type": "dimension", + "$value": "16px" + } + }, + "3": { + "desktop": { + "$type": "dimension", + "$value": "24px" + }, + "mobile": { + "$type": "dimension", + "$value": "24px" + } + }, + "4": { + "desktop": { + "$type": "dimension", + "$value": "32px" + }, + "mobile": { + "$type": "dimension", + "$value": "32px" + } + }, + "5": { + "desktop": { + "$type": "dimension", + "$value": "40px" + }, + "mobile": { + "$type": "dimension", + "$value": "40px" + } + }, + "6": { + "desktop": { + "$type": "dimension", + "$value": "48px" + }, + "mobile": { + "$type": "dimension", + "$value": "48px" + } + }, + "8": { + "desktop": { + "$type": "dimension", + "$value": "56px" + }, + "mobile": { + "$type": "dimension", + "$value": "56px" + } + }, + "10": { + "desktop": { + "$type": "dimension", + "$value": "64px" + }, + "mobile": { + "$type": "dimension", + "$value": "64px" + } + }, + "12": { + "desktop": { + "$type": "dimension", + "$value": "72px" + }, + "mobile": { + "$type": "dimension", + "$value": "72px" + } + }, + "16": { + "desktop": { + "$type": "dimension", + "$value": "80px" + }, + "mobile": { + "$type": "dimension", + "$value": "80px" + } + } + }, + "radius": { + "xl": { + "$type": "dimension", + "$value": "16px" + }, + "3xl": { + "$type": "dimension", + "$value": "50px" + }, + "full": { + "$type": "dimension", + "$value": "50%", + "9999": { + "$type": "dimension", + "$value": "9999px" + }, 
+ "100": { + "$type": "dimension", + "$value": "100%" + } + }, + "2xl": { + "$type": "dimension", + "$value": "24px" + }, + "md": { + "$type": "dimension", + "$value": "0px 0px 16px 16px", + "4": { + "$type": "dimension", + "$value": "4px" + } + }, + "lg": { + "$type": "dimension", + "$value": "8px" + } + }, + "shadow": { + "xs": { + "$type": "shadow", + "$value": { + "color": "rgba(0, 0, 0, 0.2)", + "offsetX": "0px", + "offsetY": "10px", + "blur": "25px", + "spread": "0px" + } + }, + "sm": { + "$type": "shadow", + "$value": { + "color": "rgba(0, 0, 0, 0.2)", + "offsetX": "0px", + "offsetY": "2px", + "blur": "30px", + "spread": "0px" + } + }, + "md": { + "$type": "shadow", + "$value": { + "color": "rgba(0, 0, 0, 0.04)", + "offsetX": "0px", + "offsetY": "0px", + "blur": "80px", + "spread": "0px" + } + }, + "lg": { + "$type": "shadow", + "$value": { + "color": "rgba(0, 0, 0, 0.06)", + "offsetX": "0px", + "offsetY": "0px", + "blur": "80px", + "spread": "0px" + } + }, + "xl": { + "$type": "shadow", + "$value": { + "color": "rgba(0, 0, 0, 0.3)", + "offsetX": "0px", + "offsetY": "16px", + "blur": "90px", + "spread": "0px" + } + } + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..418d3a4bff8c360035da0591f04b5b3ed8b6055c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,76 @@ +# ============================================================================= +# Design System Extractor v2 — Dependencies +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Core Framework +# ----------------------------------------------------------------------------- +gradio>=4.44.0 +langgraph>=0.2.0 +langchain>=0.3.0 + +# ----------------------------------------------------------------------------- +# HuggingFace (Primary LLM Provider) +# 
----------------------------------------------------------------------------- +huggingface-hub>=0.27.0 # Updated for new router.huggingface.co endpoint +transformers>=4.40.0 + +# ----------------------------------------------------------------------------- +# Data Validation & Configuration +# ----------------------------------------------------------------------------- +pydantic>=2.0.0 +pydantic-settings>=2.0.0 +python-dotenv>=1.0.0 +PyYAML>=6.0.0 + +# ----------------------------------------------------------------------------- +# Web Crawling & Browser Automation +# ----------------------------------------------------------------------------- +playwright>=1.40.0 +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +httpx>=0.25.0 +firecrawl-py>=4.0.0 + +# ----------------------------------------------------------------------------- +# CSS & Color Processing +# ----------------------------------------------------------------------------- +cssutils>=2.9.0 +colormath>=3.0.0 +colour>=0.1.5 + +# ----------------------------------------------------------------------------- +# Data Processing +# ----------------------------------------------------------------------------- +numpy>=1.24.0 +pandas>=2.0.0 + +# ----------------------------------------------------------------------------- +# Async Support +# ----------------------------------------------------------------------------- +aiofiles>=23.0.0 + +# ----------------------------------------------------------------------------- +# Utilities +# ----------------------------------------------------------------------------- +rich>=13.0.0 +tqdm>=4.66.0 +python-slugify>=8.0.0 +loguru>=0.7.0 +jsonschema>=4.20.0 + +# ----------------------------------------------------------------------------- +# Testing (development only) +# ----------------------------------------------------------------------------- +pytest>=7.4.0 +pytest-asyncio>=0.21.0 +pytest-cov>=4.1.0 +pytest-timeout>=2.2.0 +deepeval>=1.0.0 + +# 
----------------------------------------------------------------------------- +# Type Checking (development only) +# ----------------------------------------------------------------------------- +mypy>=1.5.0 +types-PyYAML>=6.0.0 +types-beautifulsoup4>=4.12.0 diff --git a/storage/__init__.py b/storage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/storage/benchmark_cache.json b/storage/benchmark_cache.json new file mode 100644 index 0000000000000000000000000000000000000000..2103f1cc719ae3696912e37ce32890c85764de2a --- /dev/null +++ b/storage/benchmark_cache.json @@ -0,0 +1,20 @@ +{ + "test_system": { + "key": "test_system", + "name": "Test System", + "short_name": "Test", + "vendor": "Test Vendor", + "icon": "\ud83e\uddea", + "typography": { + "scale_ratio": 1.25, + "base_size": 16 + }, + "spacing": { + "base": 8 + }, + "colors": {}, + "fetched_at": "2026-02-15T12:12:38.917158", + "confidence": "high", + "best_for": [] + } +} \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/test_agent_evals.py b/tests/test_agent_evals.py new file mode 100644 index 0000000000000000000000000000000000000000..10f77bfc46e242777d8af583657e32177bcf8eb3 --- /dev/null +++ b/tests/test_agent_evals.py @@ -0,0 +1,726 @@ +#!/usr/bin/env python3 +""" +LLM Agent Evaluation Tests +============================ + +Evaluates the 4 named AI agents using mock HF client responses. +Tests schema compliance, output correctness, and consistency. + +Uses DeepEval when available, falls back to manual assertions. 
+ +Run: pytest tests/test_agent_evals.py -v +""" + +import asyncio +import json +import os +import sys +from dataclasses import asdict +from typing import Optional + +import pytest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from agents.llm_agents import ( + BrandIdentifierAgent, + BenchmarkAdvisorAgent, + BestPracticesValidatorAgent, + HeadSynthesizerAgent, + BrandIdentification, + BenchmarkAdvice, + BestPracticesResult, + HeadSynthesis, +) + +# Try importing DeepEval +try: + from deepeval import assert_test + from deepeval.test_case import LLMTestCase + from deepeval.metrics import JsonSchemaMetric + + HAS_DEEPEVAL = True +except ImportError: + HAS_DEEPEVAL = False + + +# ============================================================================= +# MOCK HF CLIENT +# ============================================================================= + +# Canned JSON responses that each agent would return +AURORA_RESPONSE = json.dumps({ + "brand_primary": { + "color": "#06b2c4", + "confidence": "high", + "reasoning": "Used in 33 buttons and 12 CTAs — dominant interactive color", + "usage_count": 45, + }, + "brand_secondary": { + "color": "#c1df1f", + "confidence": "medium", + "reasoning": "Used in highlights and badges", + "usage_count": 23, + }, + "brand_accent": None, + "palette_strategy": "complementary", + "cohesion_score": 6, + "cohesion_notes": "Primary and secondary are near-complementary on the color wheel. Reasonable coherence but accent is missing.", + "semantic_names": { + "#06b2c4": "brand.primary", + "#c1df1f": "brand.secondary", + "#1a1a1a": "text.primary", + "#666666": "text.secondary", + }, + "self_evaluation": { + "confidence": 8, + "reasoning": "Clear dominant primary from button usage. 
Secondary less certain.", + "data_quality": "good", + "flags": [], + }, +}) + +ATLAS_RESPONSE = json.dumps({ + "recommended_benchmark": "shopify_polaris", + "recommended_benchmark_name": "Shopify Polaris", + "reasoning": "87% structural match. Polaris uses similar type scale and spacing grid approach.", + "alignment_changes": [ + {"change": "Adopt 1.25 Major Third type scale", "from": "1.18 random", "to": "1.25", "effort": "low"}, + {"change": "Standardize to 4px spacing grid", "from": "mixed", "to": "4px", "effort": "medium"}, + ], + "pros_of_alignment": [ + "Industry-standard component patterns", + "Strong accessibility built-in", + ], + "cons_of_alignment": [ + "May feel generic without customization", + ], + "alternative_benchmarks": [ + {"name": "Material Design 3", "reason": "77% match, stronger theming support"}, + {"name": "Atlassian Design System", "reason": "76% match, similar enterprise focus"}, + ], + "self_evaluation": { + "confidence": 7, + "reasoning": "Good structural match but benchmark comparison limited to 8 systems", + "data_quality": "good", + "flags": [], + }, +}) + +SENTINEL_RESPONSE = json.dumps({ + "overall_score": 62, + "checks": { + "color_contrast": {"status": "fail", "note": "67 AA failures including brand primary"}, + "type_scale": {"status": "warn", "note": "Near-consistent but not standard ratio"}, + "spacing_grid": {"status": "pass", "note": "4px grid detected with 85% alignment"}, + "color_count": {"status": "warn", "note": "143 unique colors — recommend consolidation to ~20"}, + "shadow_system": {"status": "pass", "note": "4 elevation levels (xs, sm, md, lg) with consistent blur progression"}, + }, + "priority_fixes": [ + {"rank": 1, "issue": "Brand primary fails AA contrast", "impact": "high", "effort": "low", "action": "Darken #06b2c4 to #048391"}, + {"rank": 2, "issue": "143 colors too many", "impact": "medium", "effort": "medium", "action": "Consolidate to semantic palette"}, + {"rank": 3, "issue": "Type scale inconsistent", 
"impact": "medium", "effort": "low", "action": "Adopt 1.25 Major Third"}, + ], + "passing_practices": ["spacing_grid", "font_family_consistency", "shadow_system"], + "failing_practices": ["color_contrast", "color_count"], + "self_evaluation": { + "confidence": 8, + "reasoning": "Rule engine data is clear. Priority ordering based on impact analysis.", + "data_quality": "good", + "flags": [], + }, +}) + +NEXUS_RESPONSE = json.dumps({ + "executive_summary": "Design system shows strong structural foundation (4px grid, consistent typography) but needs critical accessibility fixes. Brand primary #06b2c4 fails AA — recommend darkened variant. 87% aligned to Polaris.", + "scores": { + "overall": 62, + "accessibility": 45, + "consistency": 72, + "organization": 68, + }, + "benchmark_fit": { + "closest": "Shopify Polaris", + "similarity": 87, + "recommendation": "Align type scale and consolidate colors for 95%+ match", + }, + "brand_analysis": { + "primary": "#06b2c4", + "secondary": "#c1df1f", + "cohesion": 6, + }, + "top_3_actions": [ + {"action": "Fix brand primary contrast", "impact": "high", "effort": "low", "details": "Darken to #048391 for AA 4.5:1"}, + {"action": "Consolidate color palette", "impact": "medium", "effort": "medium", "details": "Reduce 143 → ~20 semantic colors"}, + {"action": "Standardize type scale", "impact": "medium", "effort": "low", "details": "Adopt 1.25 Major Third ratio"}, + ], + "color_recommendations": [ + {"role": "brand-primary", "current": "#06b2c4", "suggested": "#048391", "reason": "AA compliance", "accept": True}, + ], + "type_scale_recommendation": { + "current_ratio": 1.18, + "recommended_ratio": 1.25, + "name": "Major Third", + }, + "spacing_recommendation": { + "current_base": 4, + "recommended_base": 8, + "reason": "Simpler system with fewer decisions", + }, + "self_evaluation": { + "confidence": 8, + "reasoning": "Strong data from rule engine and all 3 agents. 
Minor disagreement on spacing resolved by averaging.", + "data_quality": "good", + "flags": [], + }, +}) + + +class MockHFClient: + """Mock HF Inference client that returns canned responses per agent.""" + + AGENT_RESPONSES = { + "brand_identifier": AURORA_RESPONSE, + "benchmark_advisor": ATLAS_RESPONSE, + "best_practices": SENTINEL_RESPONSE, + "best_practices_validator": SENTINEL_RESPONSE, + "head_synthesizer": NEXUS_RESPONSE, + } + + async def complete_async( + self, + agent_name: str, + system_prompt: str, + user_message: str, + max_tokens: int = 2000, + json_mode: bool = True, + ) -> str: + """Return canned response for the agent.""" + return self.AGENT_RESPONSES.get(agent_name, "{}") + + +# ============================================================================= +# TEST DATA +# ============================================================================= + +MOCK_COLOR_TOKENS = { + "brand-primary": {"value": "#06b2c4", "frequency": 45, "context": "buttons, links"}, + "brand-secondary": {"value": "#c1df1f", "frequency": 23, "context": "highlights"}, + "text-primary": {"value": "#1a1a1a", "frequency": 120, "context": "headings, body"}, + "text-secondary": {"value": "#666666", "frequency": 80, "context": "captions"}, + "background": {"value": "#ffffff", "frequency": 200, "context": "page background"}, +} + +MOCK_SEMANTIC_ANALYSIS = { + "brand": [{"hex": "#06b2c4", "name": "brand-primary"}], + "text": [{"hex": "#1a1a1a", "name": "text-primary"}], +} + +MOCK_SHADOW_TOKENS = { + "shadow-xs": {"value": "rgba(0,0,0,0.05) 0px 1px 2px 0px"}, + "shadow-sm": {"value": "rgba(0,0,0,0.1) 0px 2px 4px 0px"}, + "shadow-md": {"value": "rgba(0,0,0,0.15) 0px 4px 8px 0px"}, + "shadow-lg": {"value": "rgba(0,0,0,0.2) 0px 8px 16px 0px"}, +} + +MOCK_SHADOW_TOKENS_POOR = { + # Only 2 levels - not enough for proper elevation hierarchy + "shadow-1": {"value": "rgba(0,0,0,0.5) 0px 2px 0px 0px"}, # No blur, harsh + "shadow-2": {"value": "rgba(0,0,0,0.5) 0px 4px 2px 0px"}, # High opacity 
+} + +class MockBenchmarkSystem: + """Mock benchmark system object (what c.benchmark returns).""" + def __init__(self, name, icon, scale_ratio, base_size, spacing_base, best_for): + self.name = name + self.icon = icon + self.typography = {"scale_ratio": scale_ratio, "base_size": base_size} + self.spacing = {"base": spacing_base} + self.best_for = best_for + + +class MockBenchmarkComparison: + """Mock benchmark comparison object (what ATLAS._format_comparisons expects).""" + def __init__(self, benchmark, similarity_score, overall_match_pct, type_ratio_diff, base_size_diff, spacing_grid_diff): + self.benchmark = benchmark + self.similarity_score = similarity_score + self.overall_match_pct = overall_match_pct + self.type_ratio_diff = type_ratio_diff + self.base_size_diff = base_size_diff + self.spacing_grid_diff = spacing_grid_diff + + +MOCK_BENCHMARK_COMPARISONS = [ + MockBenchmarkComparison( + benchmark=MockBenchmarkSystem("Shopify Polaris", "🟢", 1.25, 16, 4, ["e-commerce", "admin"]), + similarity_score=0.13, overall_match_pct=87, type_ratio_diff=0.07, base_size_diff=0, spacing_grid_diff=0, + ), + MockBenchmarkComparison( + benchmark=MockBenchmarkSystem("Material Design 3", "🔵", 1.25, 16, 8, ["mobile", "web"]), + similarity_score=0.23, overall_match_pct=77, type_ratio_diff=0.07, base_size_diff=0, spacing_grid_diff=4, + ), + MockBenchmarkComparison( + benchmark=MockBenchmarkSystem("Atlassian", "🔷", 1.2, 14, 8, ["enterprise", "tools"]), + similarity_score=0.24, overall_match_pct=76, type_ratio_diff=0.02, base_size_diff=2, spacing_grid_diff=4, + ), +] + + +# Mock RuleEngineResults for SENTINEL and NEXUS +class MockTypography: + detected_ratio = 1.18 + base_size = 16.0 + sizes_px = [12, 14, 16, 18, 22, 28, 36, 48] + is_consistent = False + variance = 0.22 + scale_name = "Minor Third" + closest_standard_ratio = 1.2 + recommendation = 1.25 + recommendation_name = "Major Third" + + def to_dict(self): + return {"detected_ratio": self.detected_ratio, "base_size": 
self.base_size} + + +class MockSpacing: + detected_base = 4 + is_aligned = True + alignment_percentage = 85.0 + misaligned_values = [5, 10] + recommendation = 8 + recommendation_reason = "Simpler grid" + current_values = [4, 8, 12, 16, 24, 32] + suggested_scale = [0, 4, 8, 12, 16, 24, 32, 48] + + def to_dict(self): + return {"detected_base": self.detected_base, "alignment_percentage": self.alignment_percentage} + + +class MockColorStats: + total_count = 160 + unique_count = 143 + duplicate_count = 17 + gray_count = 22 + saturated_count = 45 + near_duplicates = [("#06b2c4", "#07b3c5", 0.01)] + hue_distribution = {"cyan": 5, "gray": 22, "green": 3} + + def to_dict(self): + return {"total": self.total_count, "unique": self.unique_count} + + +class MockAccessibility: + def __init__(self): + self.hex_color = "#06b2c4" + self.name = "brand-primary" + self.passes_aa_normal = False + self.contrast_on_white = 2.57 + self.contrast_on_black = 8.18 + self.suggested_fix = "#048391" + self.suggested_fix_contrast = 4.5 + + def to_dict(self): + return {"color": self.hex_color, "aa_normal": self.passes_aa_normal} + + +class MockRuleEngineResults: + typography = MockTypography() + spacing = MockSpacing() + color_stats = MockColorStats() + accessibility = [MockAccessibility()] + aa_failures = 67 + consistency_score = 52 + + def to_dict(self): + return { + "typography": self.typography.to_dict(), + "spacing": self.spacing.to_dict(), + "color_stats": self.color_stats.to_dict(), + "summary": {"aa_failures": self.aa_failures, "consistency_score": self.consistency_score}, + } + + +# ============================================================================= +# SCHEMA COMPLIANCE TESTS +# ============================================================================= + +class TestAuroraSchemaCompliance: + """AURORA (Brand Identifier) output schema validation.""" + + @pytest.fixture + def agent(self): + return BrandIdentifierAgent(MockHFClient()) + + @pytest.mark.asyncio + async def 
test_schema_compliance(self, agent): + """AURORA output has all required BrandIdentification fields.""" + result = await agent.analyze( + color_tokens=MOCK_COLOR_TOKENS, + typography_tokens={}, + ) + assert isinstance(result, BrandIdentification) + # Required fields present + assert hasattr(result, "brand_primary") + assert hasattr(result, "palette_strategy") + assert hasattr(result, "cohesion_score") + assert hasattr(result, "self_evaluation") + + @pytest.mark.asyncio + async def test_brand_primary_detected(self, agent): + """AURORA correctly identifies brand primary from high-usage color.""" + result = await agent.analyze( + color_tokens=MOCK_COLOR_TOKENS, + typography_tokens={}, + ) + bp = result.brand_primary + assert isinstance(bp, dict) + assert bp.get("color") == "#06b2c4" + assert bp.get("confidence") in ("high", "medium", "low") + + @pytest.mark.asyncio + async def test_palette_strategy_valid(self, agent): + """Palette strategy is a recognized value.""" + result = await agent.analyze( + color_tokens=MOCK_COLOR_TOKENS, + typography_tokens={}, + ) + valid_strategies = ["complementary", "analogous", "triadic", "monochromatic", "split-complementary", "random", ""] + assert result.palette_strategy in valid_strategies + + @pytest.mark.asyncio + async def test_to_dict_serializable(self, agent): + """Output is JSON-serializable.""" + result = await agent.analyze( + color_tokens=MOCK_COLOR_TOKENS, + typography_tokens={}, + ) + d = result.to_dict() + json_str = json.dumps(d) + assert len(json_str) > 10 + + +class TestAtlasSchemaCompliance: + """ATLAS (Benchmark Advisor) output schema validation.""" + + @pytest.fixture + def agent(self): + return BenchmarkAdvisorAgent(MockHFClient()) + + @pytest.mark.asyncio + async def test_schema_compliance(self, agent): + """ATLAS output has all required BenchmarkAdvice fields.""" + result = await agent.analyze( + user_ratio=1.18, + user_base=16, + user_spacing=4, + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + ) + assert 
isinstance(result, BenchmarkAdvice) + assert hasattr(result, "recommended_benchmark") + assert hasattr(result, "reasoning") + assert hasattr(result, "alignment_changes") + assert hasattr(result, "self_evaluation") + + @pytest.mark.asyncio + async def test_benchmark_recommended(self, agent): + """ATLAS recommends a valid benchmark.""" + result = await agent.analyze( + user_ratio=1.18, + user_base=16, + user_spacing=4, + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + ) + assert result.recommended_benchmark != "" + assert result.reasoning != "" + + @pytest.mark.asyncio + async def test_alignment_changes_structured(self, agent): + """Alignment changes are structured dicts.""" + result = await agent.analyze( + user_ratio=1.18, + user_base=16, + user_spacing=4, + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + ) + assert isinstance(result.alignment_changes, list) + if result.alignment_changes: + change = result.alignment_changes[0] + assert isinstance(change, dict) + assert "change" in change + + +class TestSentinelSchemaCompliance: + """SENTINEL (Best Practices Validator) output schema validation.""" + + @pytest.fixture + def agent(self): + return BestPracticesValidatorAgent(MockHFClient()) + + @pytest.mark.asyncio + async def test_schema_compliance(self, agent): + """SENTINEL output has all required BestPracticesResult fields.""" + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + ) + assert isinstance(result, BestPracticesResult) + assert hasattr(result, "overall_score") + assert hasattr(result, "priority_fixes") + assert hasattr(result, "self_evaluation") + + @pytest.mark.asyncio + async def test_score_in_range(self, agent): + """Overall score is between 0-100.""" + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + ) + assert 0 <= result.overall_score <= 100 + + @pytest.mark.asyncio + async def test_priority_fixes_ranked(self, agent): + """Priority fixes are a list with high-impact items first.""" + 
result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + ) + assert isinstance(result.priority_fixes, list) + if len(result.priority_fixes) >= 2: + # First fix should be highest priority + first = result.priority_fixes[0] + if isinstance(first, dict) and "rank" in first: + assert first["rank"] == 1 + + +class TestSentinelShadowAnalysis: + """SENTINEL shadow system evaluation tests.""" + + @pytest.fixture + def agent(self): + return BestPracticesValidatorAgent(MockHFClient()) + + @pytest.mark.asyncio + async def test_shadow_check_in_output(self, agent): + """SENTINEL includes shadow_system check in output.""" + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + shadow_tokens=MOCK_SHADOW_TOKENS, + ) + assert "shadow_system" in result.checks + shadow_check = result.checks["shadow_system"] + assert isinstance(shadow_check, dict) + assert "status" in shadow_check + assert shadow_check["status"] in ("pass", "warn", "fail") + + @pytest.mark.asyncio + async def test_shadow_tokens_passed_to_prompt(self, agent): + """Shadow tokens are included in SENTINEL prompt.""" + # The mock response includes shadow check, verifying the prompt includes shadow data + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + shadow_tokens=MOCK_SHADOW_TOKENS, + ) + # If shadow_system is passing, we know the shadows were evaluated + assert result.checks.get("shadow_system", {}).get("status") == "pass" + + @pytest.mark.asyncio + async def test_shadow_in_passing_practices(self, agent): + """Well-structured shadow system appears in passing_practices.""" + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + shadow_tokens=MOCK_SHADOW_TOKENS, + ) + # Mock response has shadow_system in passing_practices + assert "shadow_system" in result.passing_practices + + @pytest.mark.asyncio + async def test_no_shadow_tokens_handled(self, agent): + """SENTINEL handles missing shadow tokens gracefully.""" + result 
= await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + shadow_tokens=None, + ) + # Should still return valid result + assert isinstance(result, BestPracticesResult) + assert result.overall_score >= 0 + + @pytest.mark.asyncio + async def test_empty_shadow_tokens_handled(self, agent): + """SENTINEL handles empty shadow tokens gracefully.""" + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + shadow_tokens={}, + ) + assert isinstance(result, BestPracticesResult) + + +class TestNexusSchemaCompliance: + """NEXUS (Head Synthesizer) output schema validation.""" + + @pytest.fixture + def agent(self): + return HeadSynthesizerAgent(MockHFClient()) + + @pytest.mark.asyncio + async def test_schema_compliance(self, agent): + """NEXUS output has all required HeadSynthesis fields.""" + result = await agent.synthesize( + rule_engine_results=MockRuleEngineResults(), + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + brand_identification=BrandIdentification( + brand_primary={"color": "#06b2c4", "confidence": "high"}, + palette_strategy="complementary", + cohesion_score=6, + ), + benchmark_advice=BenchmarkAdvice( + recommended_benchmark="shopify_polaris", + reasoning="87% structural match", + ), + best_practices=BestPracticesResult( + overall_score=62, + priority_fixes=[{"issue": "AA contrast", "impact": "high"}], + ), + ) + assert isinstance(result, HeadSynthesis) + assert hasattr(result, "executive_summary") + assert hasattr(result, "top_3_actions") + assert hasattr(result, "scores") + assert hasattr(result, "self_evaluation") + + @pytest.mark.asyncio + async def test_executive_summary_non_empty(self, agent): + """NEXUS produces a non-empty executive summary.""" + result = await agent.synthesize( + rule_engine_results=MockRuleEngineResults(), + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + brand_identification=BrandIdentification(), + benchmark_advice=BenchmarkAdvice(), + best_practices=BestPracticesResult(), + ) + assert 
result.executive_summary != "" + + @pytest.mark.asyncio + async def test_top_3_actions_present(self, agent): + """NEXUS provides top 3 action items.""" + result = await agent.synthesize( + rule_engine_results=MockRuleEngineResults(), + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + brand_identification=BrandIdentification(), + benchmark_advice=BenchmarkAdvice(), + best_practices=BestPracticesResult(), + ) + assert isinstance(result.top_3_actions, list) + assert len(result.top_3_actions) >= 1 + + +# ============================================================================= +# SELF-EVALUATION TESTS +# ============================================================================= + +class TestSelfEvaluation: + """All agents should include self_evaluation with confidence scoring.""" + + @pytest.mark.asyncio + async def test_aurora_self_evaluation(self): + agent = BrandIdentifierAgent(MockHFClient()) + result = await agent.analyze( + color_tokens=MOCK_COLOR_TOKENS, + typography_tokens={}, + ) + se = result.self_evaluation + assert isinstance(se, dict) + assert "confidence" in se + assert "data_quality" in se + + @pytest.mark.asyncio + async def test_atlas_self_evaluation(self): + agent = BenchmarkAdvisorAgent(MockHFClient()) + result = await agent.analyze( + user_ratio=1.18, + user_base=16, + user_spacing=4, + benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + ) + se = result.self_evaluation + assert isinstance(se, dict) + assert "confidence" in se + + @pytest.mark.asyncio + async def test_sentinel_self_evaluation(self): + agent = BestPracticesValidatorAgent(MockHFClient()) + result = await agent.analyze( + rule_engine_results=MockRuleEngineResults(), + ) + se = result.self_evaluation + assert isinstance(se, dict) + assert "confidence" in se + + @pytest.mark.asyncio + async def test_nexus_self_evaluation(self): + agent = HeadSynthesizerAgent(MockHFClient()) + result = await agent.synthesize( + rule_engine_results=MockRuleEngineResults(), + 
benchmark_comparisons=MOCK_BENCHMARK_COMPARISONS, + brand_identification=BrandIdentification(), + benchmark_advice=BenchmarkAdvice(), + best_practices=BestPracticesResult(), + ) + se = result.self_evaluation + assert isinstance(se, dict) + assert "confidence" in se + + +# ============================================================================= +# VALIDATION MODULE TESTS +# ============================================================================= + +class TestValidationModule: + """Test the core/validation.py module.""" + + def test_validate_aurora_output(self): + from core.validation import validate_agent_output + + data = { + "brand_primary": {"color": "#06b2c4"}, + "palette_strategy": "complementary", + "cohesion_score": 6, + } + is_valid, error = validate_agent_output(data, "aurora") + assert is_valid + + def test_validate_aurora_missing_required(self): + from core.validation import validate_agent_output + + data = {"cohesion_score": 6} # Missing brand_primary and palette_strategy + is_valid, error = validate_agent_output(data, "aurora") + assert not is_valid + assert error is not None + + def test_validate_nexus_output(self): + from core.validation import validate_agent_output + + data = { + "executive_summary": "Test summary", + "top_3_actions": [{"action": "Fix contrast"}], + "scores": {"overall": 62}, + } + is_valid, error = validate_agent_output(data, "nexus") + assert is_valid + + def test_validate_unknown_agent_passes(self): + from core.validation import validate_agent_output + + is_valid, error = validate_agent_output({"anything": True}, "unknown_agent") + assert is_valid # No schema = pass + + def test_validate_dataclass(self): + from core.validation import validate_agent_output + + brand = BrandIdentification( + brand_primary={"color": "#06b2c4"}, + palette_strategy="complementary", + ) + is_valid, error = validate_agent_output(brand, "aurora") + assert is_valid + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git 
a/tests/test_agent_evals_live.py b/tests/test_agent_evals_live.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f67eb632516cb37ae4bb488a2d33ec52e3bd7b --- /dev/null +++ b/tests/test_agent_evals_live.py @@ -0,0 +1,652 @@ +#!/usr/bin/env python3 +""" +Live LLM Agent Evaluations with DeepEval +========================================== + +Tests the 4 AI agents with REAL HuggingFace API calls + DeepEval metrics. +Unlike test_agent_evals.py (mock), this hits live LLMs and evaluates output quality. + +WHAT THIS TESTS: + - Does the LLM return valid JSON? (not just our parser) + - Is the brand identification sensible for known colors? + - Does the benchmark advisor pick a relevant system? + - Are priority fixes ranked by actual impact? + - Does NEXUS reference all 3 upstream agents? + - Are self-evaluation confidence scores honest? + +REQUIRES: + - HF_TOKEN env var set (HuggingFace Pro $9/month) + - pip install deepeval (optional — falls back to manual assertions) + +RUN: + # With DeepEval dashboard: + deepeval test run tests/test_agent_evals_live.py -v + + # With plain pytest: + pytest tests/test_agent_evals_live.py -v -s --timeout=120 + + # Skip if no HF_TOKEN: + pytest tests/test_agent_evals_live.py -v -k "not live" + +COST: ~$0.003 per full run (4 agent calls) +TIME: ~30s sequential, ~10s with parallelized agents +""" + +import asyncio +import json +import os +import sys +from typing import Optional + +import pytest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Skip all tests if no HF_TOKEN +HF_TOKEN = os.getenv("HF_TOKEN", "") +SKIP_REASON = "HF_TOKEN not set — skipping live LLM evals (set HF_TOKEN to run)" +pytestmark = pytest.mark.skipif(not HF_TOKEN, reason=SKIP_REASON) + +from agents.llm_agents import ( + BrandIdentifierAgent, + BenchmarkAdvisorAgent, + BestPracticesValidatorAgent, + HeadSynthesizerAgent, + BrandIdentification, + BenchmarkAdvice, + 
BestPracticesResult, + HeadSynthesis, +) + +# Try importing DeepEval +try: + from deepeval import assert_test + from deepeval.test_case import LLMTestCase + from deepeval.metrics import GEval + from deepeval.metrics.g_eval import GEvalParameter + + HAS_DEEPEVAL = True +except ImportError: + HAS_DEEPEVAL = False + + +# ============================================================================= +# LIVE HF CLIENT +# ============================================================================= + +def get_live_client(): + """Get the real HF inference client.""" + from core.hf_inference import get_inference_client + return get_inference_client() + + +# ============================================================================= +# REALISTIC TEST DATA (simulates a real website extraction) +# ============================================================================= + +# Simulates tokens extracted from a SaaS dashboard website +LIVE_COLOR_TOKENS = { + "primary-button": {"value": "#2563eb", "frequency": 45, "context": "buttons, links, CTAs"}, + "secondary-button": {"value": "#7c3aed", "frequency": 18, "context": "secondary actions"}, + "success": {"value": "#16a34a", "frequency": 12, "context": "success states, badges"}, + "warning": {"value": "#eab308", "frequency": 8, "context": "warnings, alerts"}, + "error": {"value": "#dc2626", "frequency": 6, "context": "error states"}, + "text-primary": {"value": "#111827", "frequency": 200, "context": "headings, body text"}, + "text-secondary": {"value": "#6b7280", "frequency": 150, "context": "secondary text, labels"}, + "text-muted": {"value": "#9ca3af", "frequency": 80, "context": "placeholders, disabled"}, + "bg-white": {"value": "#ffffff", "frequency": 300, "context": "page background"}, + "bg-gray-50": {"value": "#f9fafb", "frequency": 100, "context": "card backgrounds"}, + "bg-gray-100": {"value": "#f3f4f6", "frequency": 60, "context": "section backgrounds"}, + "border": {"value": "#e5e7eb", "frequency": 90, "context": 
"borders, dividers"}, + "light-accent": {"value": "#bfdbfe", "frequency": 15, "context": "highlights, selected"}, +} + +LIVE_SEMANTIC_ANALYSIS = { + "brand": [ + {"hex": "#2563eb", "name": "primary-button", "context": "buttons, links, CTAs"}, + {"hex": "#7c3aed", "name": "secondary-button", "context": "secondary actions"}, + ], + "text": [ + {"hex": "#111827", "name": "text-primary"}, + {"hex": "#6b7280", "name": "text-secondary"}, + ], + "status": [ + {"hex": "#16a34a", "name": "success"}, + {"hex": "#dc2626", "name": "error"}, + ], +} + + +# Mock benchmark comparison objects (same structure as real pipeline) +class _BenchmarkSystem: + def __init__(self, name, icon, scale_ratio, base_size, spacing_base, best_for): + self.name = name + self.icon = icon + self.typography = {"scale_ratio": scale_ratio, "base_size": base_size} + self.spacing = {"base": spacing_base} + self.best_for = best_for + + +class _BenchmarkComparison: + def __init__(self, benchmark, similarity_score, overall_match_pct, type_ratio_diff, base_size_diff, spacing_grid_diff): + self.benchmark = benchmark + self.similarity_score = similarity_score + self.overall_match_pct = overall_match_pct + self.type_ratio_diff = type_ratio_diff + self.base_size_diff = base_size_diff + self.spacing_grid_diff = spacing_grid_diff + + +LIVE_BENCHMARK_COMPARISONS = [ + _BenchmarkComparison( + benchmark=_BenchmarkSystem("Shopify Polaris", "🟢", 1.2, 16, 4, ["e-commerce", "admin"]), + similarity_score=0.15, overall_match_pct=85, type_ratio_diff=0.05, base_size_diff=0, spacing_grid_diff=0, + ), + _BenchmarkComparison( + benchmark=_BenchmarkSystem("Material Design 3", "🔵", 1.25, 16, 8, ["mobile", "web"]), + similarity_score=0.20, overall_match_pct=80, type_ratio_diff=0.1, base_size_diff=0, spacing_grid_diff=4, + ), + _BenchmarkComparison( + benchmark=_BenchmarkSystem("Atlassian Design System", "🔷", 1.143, 14, 8, ["enterprise", "tools"]), + similarity_score=0.25, overall_match_pct=75, type_ratio_diff=0.007, 
base_size_diff=2, spacing_grid_diff=4, + ), +] + + +# Mock RuleEngineResults (realistic values) +class _MockTypography: + detected_ratio = 1.15 + base_size = 16.0 + sizes_px = [12, 14, 16, 18, 20, 24, 30, 36, 48] + is_consistent = False + variance = 0.18 + scale_name = "Major Second" + closest_standard_ratio = 1.125 + recommendation = 1.25 + recommendation_name = "Major Third" + + def to_dict(self): + return {"detected_ratio": self.detected_ratio, "base_size": self.base_size, "sizes_px": self.sizes_px} + + +class _MockSpacing: + detected_base = 4 + is_aligned = True + alignment_percentage = 92.0 + misaligned_values = [6, 10] + recommendation = 4 + recommendation_reason = "4px grid with 92% alignment" + current_values = [4, 8, 12, 16, 20, 24, 32, 48, 64] + suggested_scale = [0, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64] + + def to_dict(self): + return {"detected_base": self.detected_base, "alignment_percentage": self.alignment_percentage} + + +class _MockColorStats: + total_count = 42 + unique_count = 13 + duplicate_count = 29 + gray_count = 5 + saturated_count = 5 + near_duplicates = [("#f3f4f6", "#f9fafb", 0.02)] + hue_distribution = {"blue": 3, "purple": 1, "green": 1, "red": 1, "yellow": 1, "gray": 6} + + def to_dict(self): + return {"total": self.total_count, "unique": self.unique_count} + + +class _MockAccessibility: + def __init__(self, hex_color, name, passes, contrast_white, fix=None, fix_contrast=None): + self.hex_color = hex_color + self.name = name + self.passes_aa_normal = passes + self.contrast_on_white = contrast_white + self.contrast_on_black = 21.0 - contrast_white # approximate + self.suggested_fix = fix + self.suggested_fix_contrast = fix_contrast + + def to_dict(self): + return {"color": self.hex_color, "aa_normal": self.passes_aa_normal} + + +LIVE_ACCESSIBILITY = [ + _MockAccessibility("#2563eb", "primary-button", True, 4.68), + _MockAccessibility("#7c3aed", "secondary-button", True, 5.32), + _MockAccessibility("#9ca3af", "text-muted", False, 2.85, 
"#6b7280", 4.56), + _MockAccessibility("#eab308", "warning", False, 2.09, "#a16207", 4.52), + _MockAccessibility("#bfdbfe", "light-accent", False, 1.51, "#3b82f6", 4.68), +] + + +class MockRuleEngineResults: + typography = _MockTypography() + spacing = _MockSpacing() + color_stats = _MockColorStats() + accessibility = LIVE_ACCESSIBILITY + aa_failures = 3 + consistency_score = 68 + + def to_dict(self): + return { + "typography": self.typography.to_dict(), + "spacing": self.spacing.to_dict(), + "color_stats": self.color_stats.to_dict(), + "accessibility": [a.to_dict() for a in self.accessibility], + "summary": {"aa_failures": self.aa_failures, "consistency_score": self.consistency_score}, + } + + +# ============================================================================= +# HELPER: Run async in pytest +# ============================================================================= + +def run_async(coro): + """Run async function in sync context.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +# ============================================================================= +# LIVE TESTS: AURORA (Brand Identifier) +# ============================================================================= + +class TestAuroraLive: + """Live evaluation of AURORA — Brand Identifier agent.""" + + @pytest.fixture(scope="class") + def aurora_result(self): + client = get_live_client() + agent = BrandIdentifierAgent(client) + return run_async(agent.analyze( + color_tokens=LIVE_COLOR_TOKENS, + semantic_analysis=LIVE_SEMANTIC_ANALYSIS, + )) + + def test_returns_brand_identification(self, aurora_result): + """AURORA returns a BrandIdentification dataclass.""" + assert isinstance(aurora_result, BrandIdentification) + + def test_identifies_primary_as_blue(self, aurora_result): + """AURORA should identify #2563eb (blue) as brand primary — it has highest frequency in buttons.""" + bp = aurora_result.brand_primary + assert 
isinstance(bp, dict), f"Expected dict, got {type(bp)}" + color = bp.get("color", "").lower() + # Should be blue (#2563eb) — the dominant CTA color + assert color == "#2563eb", f"Expected #2563eb as primary, got {color}" + + def test_confidence_is_high(self, aurora_result): + """With 45 button usages, confidence should be high.""" + bp = aurora_result.brand_primary + confidence = bp.get("confidence", "").lower() + assert confidence in ("high", "very high"), f"Expected high confidence, got '{confidence}'" + + def test_palette_strategy_identified(self, aurora_result): + """Palette strategy should be identified (blue + purple = near-analogous).""" + assert aurora_result.palette_strategy != "" + assert aurora_result.palette_strategy in ( + "analogous", "complementary", "triadic", "monochromatic", + "split-complementary", "near-analogous", "random", + ) + + def test_cohesion_score_reasonable(self, aurora_result): + """Cohesion score 1-10, this palette is decent so expect 5+.""" + score = aurora_result.cohesion_score + assert 1 <= score <= 10, f"Cohesion score out of range: {score}" + assert score >= 4, f"Expected 4+ for a decent SaaS palette, got {score}" + + def test_self_evaluation_present(self, aurora_result): + """Self-evaluation includes confidence and data_quality.""" + se = aurora_result.self_evaluation + assert isinstance(se, dict) + assert "confidence" in se, f"Missing confidence in self_evaluation: {se}" + + def test_json_serializable(self, aurora_result): + """Output is fully JSON-serializable.""" + d = aurora_result.to_dict() + json_str = json.dumps(d) + assert len(json_str) > 50 + + def test_deepeval_quality(self, aurora_result): + """DeepEval G-Eval: Is the brand analysis coherent and useful?""" + if not HAS_DEEPEVAL: + pytest.skip("DeepEval not installed — run: pip install deepeval") + + test_case = LLMTestCase( + input=f"Analyze brand colors: primary-button=#2563eb (45 uses), secondary=#7c3aed (18 uses), 13 total colors", + 
actual_output=json.dumps(aurora_result.to_dict(), indent=2), + ) + + coherence_metric = GEval( + name="Brand Analysis Coherence", + criteria="The brand analysis should correctly identify the most-used button color as primary, provide a valid palette strategy, and include reasoning that references usage frequency.", + evaluation_params=[GEvalParameter.ACTUAL_OUTPUT], + threshold=0.6, + ) + + assert_test(test_case, [coherence_metric]) + + +# ============================================================================= +# LIVE TESTS: ATLAS (Benchmark Advisor) +# ============================================================================= + +class TestAtlasLive: + """Live evaluation of ATLAS — Benchmark Advisor agent.""" + + @pytest.fixture(scope="class") + def atlas_result(self): + client = get_live_client() + agent = BenchmarkAdvisorAgent(client) + return run_async(agent.analyze( + user_ratio=1.15, + user_base=16, + user_spacing=4, + benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS, + )) + + def test_returns_benchmark_advice(self, atlas_result): + assert isinstance(atlas_result, BenchmarkAdvice) + + def test_recommends_known_benchmark(self, atlas_result): + """Should recommend one of the provided benchmarks.""" + rec = atlas_result.recommended_benchmark.lower() + assert any(name in rec for name in ["polaris", "material", "atlassian"]), \ + f"Unexpected benchmark: {atlas_result.recommended_benchmark}" + + def test_reasoning_non_empty(self, atlas_result): + """Reasoning explains WHY this benchmark fits.""" + assert len(atlas_result.reasoning) > 20, \ + f"Reasoning too short: '{atlas_result.reasoning}'" + + def test_alignment_changes_actionable(self, atlas_result): + """Alignment changes should be a list of specific steps.""" + changes = atlas_result.alignment_changes + assert isinstance(changes, list) + assert len(changes) >= 1, "Expected at least 1 alignment change" + + def test_pros_and_cons_present(self, atlas_result): + """Both pros and cons should be listed.""" + 
assert isinstance(atlas_result.pros_of_alignment, list) + assert len(atlas_result.pros_of_alignment) >= 1 + + def test_self_evaluation_present(self, atlas_result): + se = atlas_result.self_evaluation + assert isinstance(se, dict) + assert "confidence" in se + + def test_deepeval_quality(self, atlas_result): + """DeepEval G-Eval: Is the benchmark recommendation well-reasoned?""" + if not HAS_DEEPEVAL: + pytest.skip("DeepEval not installed") + + test_case = LLMTestCase( + input="Compare against: Polaris (85%), Material 3 (80%), Atlassian (75%)", + actual_output=json.dumps(atlas_result.to_dict(), indent=2), + ) + + relevance_metric = GEval( + name="Benchmark Recommendation Relevance", + criteria="The recommendation should pick the highest-matching benchmark, explain why structurally, and list concrete alignment changes needed.", + evaluation_params=[GEvalParameter.ACTUAL_OUTPUT], + threshold=0.6, + ) + + assert_test(test_case, [relevance_metric]) + + +# ============================================================================= +# LIVE TESTS: SENTINEL (Best Practices Validator) +# ============================================================================= + +class TestSentinelLive: + """Live evaluation of SENTINEL — Best Practices Validator agent.""" + + @pytest.fixture(scope="class") + def sentinel_result(self): + client = get_live_client() + agent = BestPracticesValidatorAgent(client) + return run_async(agent.analyze( + rule_engine_results=MockRuleEngineResults(), + )) + + def test_returns_best_practices_result(self, sentinel_result): + assert isinstance(sentinel_result, BestPracticesResult) + + def test_score_in_range(self, sentinel_result): + """Score should be 0-100.""" + assert 0 <= sentinel_result.overall_score <= 100 + + def test_score_reflects_failures(self, sentinel_result): + """With 3 AA failures and inconsistent type scale, score should be < 80.""" + assert sentinel_result.overall_score < 85, \ + f"Score {sentinel_result.overall_score} seems too high 
for 3 AA failures + inconsistent type" + + def test_priority_fixes_ranked(self, sentinel_result): + """Priority fixes should exist and be ranked.""" + fixes = sentinel_result.priority_fixes + assert isinstance(fixes, list) + assert len(fixes) >= 1, "Expected at least 1 priority fix" + # First fix should address accessibility (most impactful) + if isinstance(fixes[0], dict): + first_issue = str(fixes[0].get("issue", "")).lower() + # Should mention contrast/accessibility/AA in top fixes + assert any(kw in first_issue for kw in ("contrast", "aa", "accessib", "color")), \ + f"Top fix doesn't address accessibility: '{first_issue}'" + + def test_checks_cover_key_areas(self, sentinel_result): + """Checks should cover contrast, type scale, spacing.""" + if sentinel_result.checks: + check_keys = " ".join(str(k).lower() for k in sentinel_result.checks.keys()) + # At least 2 of these should appear + areas_found = sum(1 for area in ["contrast", "type", "spacing", "color"] + if area in check_keys) + assert areas_found >= 2, f"Only {areas_found} key areas in checks: {list(sentinel_result.checks.keys())}" + + def test_self_evaluation_present(self, sentinel_result): + se = sentinel_result.self_evaluation + assert isinstance(se, dict) + + def test_deepeval_quality(self, sentinel_result): + """DeepEval G-Eval: Are priority fixes correctly ordered by impact?""" + if not HAS_DEEPEVAL: + pytest.skip("DeepEval not installed") + + test_case = LLMTestCase( + input="Rule engine: 3 AA failures, inconsistent type scale (variance=0.18), 4px grid 92% aligned, 13 unique colors", + actual_output=json.dumps(sentinel_result.to_dict(), indent=2), + ) + + impact_metric = GEval( + name="Priority Fix Impact Ordering", + criteria="Accessibility failures should be ranked highest priority since they affect legal compliance and usability. 
Type scale inconsistency and color consolidation should follow.", + evaluation_params=[GEvalParameter.ACTUAL_OUTPUT], + threshold=0.6, + ) + + assert_test(test_case, [impact_metric]) + + +# ============================================================================= +# LIVE TESTS: NEXUS (Head Synthesizer) +# ============================================================================= + +class TestNexusLive: + """Live evaluation of NEXUS — Head Synthesizer agent.""" + + @pytest.fixture(scope="class") + def nexus_result(self): + client = get_live_client() + + # First run the 3 upstream agents + aurora_agent = BrandIdentifierAgent(client) + atlas_agent = BenchmarkAdvisorAgent(client) + sentinel_agent = BestPracticesValidatorAgent(client) + + aurora_result = run_async(aurora_agent.analyze( + color_tokens=LIVE_COLOR_TOKENS, + semantic_analysis=LIVE_SEMANTIC_ANALYSIS, + )) + atlas_result = run_async(atlas_agent.analyze( + user_ratio=1.15, + user_base=16, + user_spacing=4, + benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS, + )) + sentinel_result = run_async(sentinel_agent.analyze( + rule_engine_results=MockRuleEngineResults(), + )) + + # Now run NEXUS with real upstream outputs + nexus_agent = HeadSynthesizerAgent(client) + return run_async(nexus_agent.synthesize( + rule_engine_results=MockRuleEngineResults(), + benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS, + brand_identification=aurora_result, + benchmark_advice=atlas_result, + best_practices=sentinel_result, + )) + + def test_returns_head_synthesis(self, nexus_result): + assert isinstance(nexus_result, HeadSynthesis) + + def test_executive_summary_substantial(self, nexus_result): + """Executive summary should be a meaningful paragraph.""" + assert len(nexus_result.executive_summary) > 50, \ + f"Summary too short ({len(nexus_result.executive_summary)} chars): '{nexus_result.executive_summary}'" + + def test_top_3_actions_present(self, nexus_result): + """Should provide 3 action items.""" + assert 
isinstance(nexus_result.top_3_actions, list) + assert len(nexus_result.top_3_actions) >= 2, \ + f"Expected 2+ actions, got {len(nexus_result.top_3_actions)}" + + def test_scores_present(self, nexus_result): + """Overall scores dict should have key metrics.""" + scores = nexus_result.scores + assert isinstance(scores, dict) + assert len(scores) >= 1, "Expected at least 1 score dimension" + + def test_color_recommendations_present(self, nexus_result): + """Should include color-specific recommendations.""" + recs = nexus_result.color_recommendations + assert isinstance(recs, list) + # With 3 AA failures, should have some color recs + # (may be empty if NEXUS consolidates into actions instead) + + def test_references_all_agents(self, nexus_result): + """Executive summary should reference brand + benchmark + practices.""" + summary_lower = nexus_result.executive_summary.lower() + to_dict = json.dumps(nexus_result.to_dict()).lower() + # NEXUS should incorporate insights from all 3 agents + # Check in full output since summary might be concise + has_brand = any(kw in to_dict for kw in ("brand", "primary", "color")) + has_benchmark = any(kw in to_dict for kw in ("benchmark", "polaris", "material", "system")) + has_practices = any(kw in to_dict for kw in ("accessibility", "contrast", "score", "fix")) + assert has_brand, "NEXUS output missing brand analysis references" + assert has_practices, "NEXUS output missing best practices references" + + def test_self_evaluation_present(self, nexus_result): + se = nexus_result.self_evaluation + assert isinstance(se, dict) + + def test_json_serializable(self, nexus_result): + d = nexus_result.to_dict() + json_str = json.dumps(d) + assert len(json_str) > 100 + + def test_deepeval_synthesis_quality(self, nexus_result): + """DeepEval G-Eval: Does NEXUS produce a coherent synthesis?""" + if not HAS_DEEPEVAL: + pytest.skip("DeepEval not installed") + + test_case = LLMTestCase( + input="Synthesize: AURORA found blue primary (#2563eb), ATLAS 
recommends Polaris (85% match), SENTINEL found 3 AA failures, score 68/100", + actual_output=json.dumps(nexus_result.to_dict(), indent=2), + ) + + synthesis_metric = GEval( + name="Synthesis Quality", + criteria="The synthesis should: (1) reference findings from all 3 upstream agents, (2) prioritize actionable recommendations, (3) include an executive summary that a non-technical stakeholder could understand, (4) not contradict upstream agent findings.", + evaluation_params=[GEvalParameter.ACTUAL_OUTPUT], + threshold=0.6, + ) + + assert_test(test_case, [synthesis_metric]) + + +# ============================================================================= +# CROSS-AGENT CONSISTENCY TEST +# ============================================================================= + +class TestCrossAgentConsistency: + """Tests that verify consistency across all 4 agents.""" + + @pytest.fixture(scope="class") + def all_results(self): + """Run all 4 agents and return results.""" + client = get_live_client() + + aurora = run_async(BrandIdentifierAgent(client).analyze( + color_tokens=LIVE_COLOR_TOKENS, + semantic_analysis=LIVE_SEMANTIC_ANALYSIS, + )) + atlas = run_async(BenchmarkAdvisorAgent(client).analyze( + user_ratio=1.15, user_base=16, user_spacing=4, + benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS, + )) + sentinel = run_async(BestPracticesValidatorAgent(client).analyze( + rule_engine_results=MockRuleEngineResults(), + )) + nexus = run_async(HeadSynthesizerAgent(client).synthesize( + rule_engine_results=MockRuleEngineResults(), + benchmark_comparisons=LIVE_BENCHMARK_COMPARISONS, + brand_identification=aurora, + benchmark_advice=atlas, + best_practices=sentinel, + )) + + return {"aurora": aurora, "atlas": atlas, "sentinel": sentinel, "nexus": nexus} + + def test_all_agents_return_results(self, all_results): + """All 4 agents should return non-None results.""" + for name, result in all_results.items(): + assert result is not None, f"{name} returned None" + + def 
test_all_have_self_evaluation(self, all_results): + """Every agent should include self-evaluation.""" + for name, result in all_results.items(): + se = result.self_evaluation + assert isinstance(se, dict), f"{name} self_evaluation is not dict: {type(se)}" + + def test_validation_passes(self, all_results): + """All agent outputs pass schema validation.""" + from core.validation import validate_agent_output + + validations = { + "aurora": all_results["aurora"], + "atlas": all_results["atlas"], + "sentinel": all_results["sentinel"], + "nexus": all_results["nexus"], + } + for agent_name, result in validations.items(): + is_valid, error = validate_agent_output(result, agent_name) + assert is_valid, f"{agent_name} validation failed: {error}" + + def test_nexus_score_near_sentinel(self, all_results): + """NEXUS overall score should be within 20 points of SENTINEL score.""" + sentinel_score = all_results["sentinel"].overall_score + nexus_scores = all_results["nexus"].scores + if "overall" in nexus_scores: + nexus_score = nexus_scores["overall"] + diff = abs(nexus_score - sentinel_score) + assert diff <= 25, \ + f"NEXUS ({nexus_score}) and SENTINEL ({sentinel_score}) scores differ by {diff} — should be within 25" + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s", "--timeout=120"]) diff --git a/tests/test_stage1_extraction.py b/tests/test_stage1_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..49602363879fe30b011a3a228e7372f848549760 --- /dev/null +++ b/tests/test_stage1_extraction.py @@ -0,0 +1,799 @@ +#!/usr/bin/env python3 +""" +Stage 1 Test Suite — Extraction, Normalization & Rule Engine +============================================================= + +Tests the deterministic (free) layer: +- Color utilities: hex normalization, deduplication, categorization +- Rule Engine: WCAG contrast, type scale detection, spacing grid, consistency score +- Edge cases and boundary conditions + +Run: pytest 
tests/test_stage1_extraction.py -v +""" + +import os +import sys +import pytest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from core.color_utils import ( + normalize_hex, + parse_color, + deduplicate_colors, + are_colors_similar, + color_distance, + categorize_color, + get_contrast_ratio, + check_wcag_compliance, + generate_color_ramp, + hex_to_rgb, + rgb_to_hex, +) +from core.rule_engine import ( + analyze_type_scale, + analyze_accessibility, + analyze_spacing_grid, + analyze_color_statistics, + run_rule_engine, + get_contrast_ratio as re_get_contrast_ratio, + get_relative_luminance, + hex_to_rgb as re_hex_to_rgb, + is_gray, + color_distance as re_color_distance, + find_aa_compliant_color, + parse_size_to_px, + STANDARD_SCALES, +) + + +# ============================================================================= +# TEST DATA +# ============================================================================= + +MOCK_TYPOGRAPHY_TOKENS = { + "heading-1": {"font_size": "48px", "font_weight": "700"}, + "heading-2": {"font_size": "36px", "font_weight": "600"}, + "heading-3": {"font_size": "28px", "font_weight": "600"}, + "heading-4": {"font_size": "22px", "font_weight": "500"}, + "body-large": {"font_size": "18px", "font_weight": "400"}, + "body": {"font_size": "16px", "font_weight": "400"}, + "body-small": {"font_size": "14px", "font_weight": "400"}, + "caption": {"font_size": "12px", "font_weight": "400"}, +} + +MOCK_COLOR_TOKENS = { + "brand-primary": {"value": "#06b2c4"}, + "brand-secondary": {"value": "#c1df1f"}, + "text-primary": {"value": "#1a1a1a"}, + "text-secondary": {"value": "#666666"}, + "background": {"value": "#ffffff"}, + "light-cyan": {"value": "#7dd3fc"}, # Fails AA on white + "light-lime": {"value": "#d9f99d"}, # Fails AA on white +} + +MOCK_SPACING_TOKENS_ALIGNED = { + "space-1": {"value_px": 4}, + "space-2": {"value_px": 8}, + "space-3": {"value_px": 16}, + "space-4": 
{"value_px": 24}, + "space-5": {"value_px": 32}, + "space-6": {"value_px": 48}, +} + +MOCK_SPACING_TOKENS_MISALIGNED = { + "space-1": {"value_px": 5}, + "space-2": {"value_px": 10}, + "space-3": {"value_px": 15}, + "space-4": {"value_px": 22}, + "space-5": {"value_px": 33}, +} + + +# ============================================================================= +# TEST CLASS: Color Utilities — Normalization & Deduplication +# ============================================================================= + +class TestColorNormalization: + """Test color parsing, normalization and deduplication.""" + + def test_normalize_hex_6digit(self): + """6-digit hex stays lowercase.""" + assert normalize_hex("#FF0000") == "#ff0000" + assert normalize_hex("#ffffff") == "#ffffff" + + def test_normalize_hex_3digit(self): + """3-digit hex expands to 6-digit.""" + assert normalize_hex("#fff") == "#ffffff" + assert normalize_hex("#000") == "#000000" + assert normalize_hex("#f00") == "#ff0000" + + def test_parse_color_hex(self): + """Parse hex color to ParsedColor.""" + parsed = parse_color("#ff0000") + assert parsed is not None + assert parsed.hex == "#ff0000" + assert parsed.rgb == (255, 0, 0) + + def test_parse_color_rgb(self): + """Parse rgb() string.""" + parsed = parse_color("rgb(0, 128, 255)") + assert parsed is not None + assert parsed.rgb == (0, 128, 255) + + def test_parse_color_invalid(self): + """Invalid color returns None.""" + assert parse_color("not-a-color") is None + assert parse_color("") is None + + def test_hex_to_rgb_and_back(self): + """Round-trip hex → RGB → hex.""" + r, g, b = hex_to_rgb("#1a2b3c") + result = rgb_to_hex(r, g, b) + assert result == "#1a2b3c" + + def test_deduplicate_exact_duplicates(self): + """Exact same colors are deduplicated.""" + colors = ["#ff0000", "#ff0000", "#00ff00", "#00ff00", "#0000ff"] + result = deduplicate_colors(colors, threshold=1.0) + assert len(result) == 3 + + def test_deduplicate_near_duplicates(self): + """Near-duplicate 
colors (within threshold) are deduplicated.""" + colors = ["#ff0000", "#fe0101", "#00ff00"] + result = deduplicate_colors(colors, threshold=10.0) + assert len(result) == 2 # #ff0000 and #fe0101 are near-dupes + + def test_deduplicate_preserves_distinct(self): + """Distinct colors are preserved.""" + colors = ["#ff0000", "#00ff00", "#0000ff"] + result = deduplicate_colors(colors, threshold=10.0) + assert len(result) == 3 + + def test_are_colors_similar_identical(self): + """Same color is similar.""" + assert are_colors_similar("#ff0000", "#ff0000") + + def test_are_colors_similar_different(self): + """Very different colors are not similar.""" + assert not are_colors_similar("#ff0000", "#0000ff", threshold=10.0) + + def test_color_distance_identical(self): + """Same color has distance 0.""" + assert color_distance("#ff0000", "#ff0000") == 0.0 + + def test_color_distance_symmetric(self): + """Distance is symmetric.""" + d1 = color_distance("#ff0000", "#00ff00") + d2 = color_distance("#00ff00", "#ff0000") + assert d1 == d2 + + +# ============================================================================= +# TEST CLASS: Color Categorization +# ============================================================================= + +class TestColorCategorization: + """Test semantic color classification.""" + + def test_categorize_red(self): + assert categorize_color("#ff0000") == "red" + + def test_categorize_blue(self): + assert categorize_color("#0000ff") == "blue" + + def test_categorize_green(self): + assert categorize_color("#00ff00") == "green" + + def test_categorize_neutral_white(self): + assert categorize_color("#ffffff") == "neutral" + + def test_categorize_neutral_black(self): + assert categorize_color("#000000") == "neutral" + + def test_categorize_neutral_gray(self): + assert categorize_color("#808080") == "neutral" + + def test_categorize_cyan(self): + """Brand color #06b2c4 should be cyan.""" + assert categorize_color("#06b2c4") == "cyan" + + +# 
# =============================================================================
# TEST CLASS: WCAG Contrast (Rule Engine)
# =============================================================================

class TestWCAGContrast:
    """Core WCAG contrast-ratio math in the rule engine."""

    def test_black_on_white_is_21(self):
        """Black on white hits the 21:1 maximum."""
        cr = re_get_contrast_ratio("#000000", "#ffffff")
        assert abs(cr - 21.0) < 0.1

    def test_white_on_black_is_21(self):
        """The reversed pair gives the same 21:1 maximum."""
        cr = re_get_contrast_ratio("#ffffff", "#000000")
        assert abs(cr - 21.0) < 0.1

    def test_same_color_is_1(self):
        """Identical foreground/background yields the 1:1 minimum."""
        cr = re_get_contrast_ratio("#ff0000", "#ff0000")
        assert abs(cr - 1.0) < 0.01

    def test_contrast_ratio_symmetric(self):
        """Swapping the two colors never changes the ratio."""
        a_to_b = re_get_contrast_ratio("#06b2c4", "#ffffff")
        b_to_a = re_get_contrast_ratio("#ffffff", "#06b2c4")
        assert abs(a_to_b - b_to_a) < 0.01

    def test_brand_primary_fails_aa_on_white(self):
        """Brand color #06b2c4 on white (contrast ~2.6) fails AA."""
        cr = re_get_contrast_ratio("#06b2c4", "#ffffff")
        assert cr < 4.5  # below the AA normal-text threshold
        assert cr > 2.0  # but not completely without contrast

    def test_dark_text_passes_aa(self):
        """Near-black text (#1a1a1a) on white clears the 4.5:1 AA bar."""
        assert re_get_contrast_ratio("#1a1a1a", "#ffffff") >= 4.5

    def test_luminance_black_is_zero(self):
        """Relative luminance of black is ~0."""
        assert abs(get_relative_luminance("#000000")) < 0.001

    def test_luminance_white_is_one(self):
        """Relative luminance of white is ~1."""
        assert abs(get_relative_luminance("#ffffff") - 1.0) < 0.001

    def test_find_aa_compliant_preserves_passing(self):
        """A color that already passes AA comes back untouched."""
        assert find_aa_compliant_color("#1a1a1a", "#ffffff", 4.5) == "#1a1a1a"

    def test_find_aa_compliant_fixes_failing(self):
        """A failing color is adjusted until it clears the target ratio."""
        fixed = find_aa_compliant_color("#06b2c4", "#ffffff", 4.5)
        assert re_get_contrast_ratio(fixed, "#ffffff") >= 4.5

    def test_analyze_accessibility_finds_failures(self):
        """analyze_accessibility flags colors failing AA on BOTH white and black."""
        results = analyze_accessibility(MOCK_COLOR_TOKENS)
        # passes_aa_normal is True if contrast >= 4.5 on white OR black, so
        # light colors pass via black while mid-contrast colors such as
        # #06b2c4 or #666666 may fail on both. At minimum every token with a
        # valid hex should have been analyzed.
        assert len(results) >= 5
        # The brand primary must be reported with poor contrast on white.
        brand_rows = [r for r in results if r.hex_color == "#06b2c4"]
        assert len(brand_rows) == 1
        assert brand_rows[0].contrast_on_white < 4.5

    def test_analyze_accessibility_suggests_fixes(self):
        """Every AA failure carries a suggested fix that itself passes AA."""
        results = analyze_accessibility(MOCK_COLOR_TOKENS)
        for row in results:
            if row.passes_aa_normal:
                continue
            assert row.suggested_fix is not None
            assert row.suggested_fix_contrast is not None
            assert row.suggested_fix_contrast >= 4.5

    def test_fg_bg_pair_check(self):
        """Explicit FG/BG pairs are contrast-checked."""
        pairs = [
            {"foreground": "#06b2c4", "background": "#ffffff", "element": "button"},
        ]
        results = analyze_accessibility({}, fg_bg_pairs=pairs)
        # #06b2c4 on white fails AA (contrast ~2.6)
        pair_rows = [r for r in results if r.name.startswith("fg:")]
        assert len(pair_rows) == 1

    def test_fg_bg_same_color_skipped(self):
        """Identical FG/BG (invisible text) is skipped entirely."""
        pairs = [
            {"foreground": "#ffffff", "background": "#ffffff", "element": "hidden"},
        ]
        assert len(analyze_accessibility({}, fg_bg_pairs=pairs)) == 0
# =============================================================================
# TEST CLASS: Type Scale Detection
# =============================================================================

class TestTypeScaleDetection:
    """Type-scale ratio detection and recommendations."""

    def test_detect_ratio_from_tokens(self):
        """Mock typography (12–48px, varying ratios) yields a ratio in (1, 2)."""
        outcome = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)
        assert 1.0 < outcome.detected_ratio < 2.0

    def test_consistent_scale(self):
        """A clean geometric scale is flagged consistent with ratio ≈ 1.25."""
        # Major Third (1.25): 12, 15, 18.75, 23.4, 29.3
        tokens = {
            f"size-{i}": {"font_size": f"{12 * (1.25 ** i):.1f}px"}
            for i in range(5)
        }
        outcome = analyze_type_scale(tokens)
        assert outcome.is_consistent
        assert abs(outcome.detected_ratio - 1.25) < 0.05

    def test_inconsistent_scale(self):
        """Arbitrary sizes show large variance or get flagged inconsistent."""
        tokens = {
            "a": {"font_size": "10px"},
            "b": {"font_size": "17px"},
            "c": {"font_size": "31px"},
            "d": {"font_size": "42px"},
        }
        outcome = analyze_type_scale(tokens)
        assert outcome.variance > 0.15 or not outcome.is_consistent

    def test_single_size(self):
        """One size is not a scale: name is Unknown, default rec is 1.25."""
        outcome = analyze_type_scale({"body": {"font_size": "16px"}})
        assert outcome.scale_name == "Unknown"
        assert outcome.recommendation == 1.25  # Default: Major Third

    def test_no_sizes(self):
        """No tokens at all also yields the Unknown scale."""
        assert analyze_type_scale({}).scale_name == "Unknown"

    def test_rem_conversion(self):
        """rem sizes are converted at 1rem = 16px."""
        outcome = analyze_type_scale({
            "body": {"font_size": "1rem"},
            "heading": {"font_size": "2rem"},
        })
        assert 16.0 in outcome.sizes_px
        assert 32.0 in outcome.sizes_px

    def test_base_size_detection(self):
        """The detected base size sits near 16px."""
        outcome = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)
        assert 14 <= outcome.base_size <= 18

    def test_standard_scales_defined(self):
        """Major Third, Perfect Fourth and Golden Ratio are all registered."""
        for ratio in (1.25, 1.333, 1.618):
            assert ratio in STANDARD_SCALES

    def test_parse_size_to_px(self):
        """px/rem/em strings and bare numbers all parse; junk returns None."""
        assert parse_size_to_px("16px") == 16.0
        assert parse_size_to_px("1rem") == 16.0
        assert parse_size_to_px("1.5em") == 24.0
        assert parse_size_to_px(16) == 16.0
        assert parse_size_to_px("abc") is None


# =============================================================================
# TEST CLASS: Spacing Grid Analysis
# =============================================================================

class TestSpacingGrid:
    """Spacing-grid detection and the underlying GCD math."""

    def test_aligned_to_4px(self):
        """Values divisible by 4 are reported as grid-aligned."""
        outcome = analyze_spacing_grid(MOCK_SPACING_TOKENS_ALIGNED)
        assert outcome.is_aligned
        # All values (4, 8, 16, 24, 32, 48) divide evenly by both 4 and 8.
        assert outcome.recommendation in [4, 8]

    def test_8px_grid_detected(self):
        """Pure multiples of 8 detect an 8px base at 100% alignment."""
        tokens = {
            "s1": {"value_px": 8},
            "s2": {"value_px": 16},
            "s3": {"value_px": 24},
            "s4": {"value_px": 32},
        }
        outcome = analyze_spacing_grid(tokens)
        assert outcome.detected_base == 8
        assert outcome.is_aligned
        assert outcome.alignment_percentage == 100.0

    def test_misaligned_detected(self):
        """Values whose GCD is 1 are flagged as not aligned."""
        outcome = analyze_spacing_grid(MOCK_SPACING_TOKENS_MISALIGNED)
        # GCD of 5, 10, 15, 22, 33 = 1 — not aligned
        assert outcome.detected_base == 1
        assert not outcome.is_aligned

    def test_empty_spacing(self):
        """No tokens: fall back to an 8px recommendation, not aligned."""
        outcome = analyze_spacing_grid({})
        assert outcome.recommendation == 8
        assert not outcome.is_aligned

    def test_single_value(self):
        """A lone value becomes its own base."""
        outcome = analyze_spacing_grid({"s1": {"value_px": 8}})
        assert outcome.detected_base == 8

    def test_gcd_calculation(self):
        """The base is the GCD of all values (12 for 12/24/36)."""
        tokens = {
            "s1": {"value_px": 12},
            "s2": {"value_px": 24},
            "s3": {"value_px": 36},
        }
        assert analyze_spacing_grid(tokens).detected_base == 12

    def test_suggested_scale_generated(self):
        """A non-empty suggested scale starting at 0 is produced."""
        outcome = analyze_spacing_grid(MOCK_SPACING_TOKENS_ALIGNED)
        assert len(outcome.suggested_scale) > 0
        assert 0 in outcome.suggested_scale

    def test_string_values_parsed(self):
        """String values such as '16px' are parsed into pixel integers."""
        tokens = {
            "s1": {"value": "8px"},
            "s2": {"value": "16px"},
        }
        assert analyze_spacing_grid(tokens).current_values == [8, 16]
# (continuation of TestSpacingGrid elided — its remaining methods are
#  reconstructed alongside the class definition earlier in this file)


# =============================================================================
# TEST CLASS: Color Statistics
# =============================================================================

class TestColorStatistics:
    """Palette-level statistics: counts, grays, near-dupes, hue buckets."""

    def test_counts_correct(self):
        """Total, unique and duplicate counts line up."""
        tokens = {
            "a": {"value": "#ff0000"},
            "b": {"value": "#ff0000"},  # duplicate
            "c": {"value": "#00ff00"},
        }
        stats = analyze_color_statistics(tokens)
        assert stats.total_count == 3
        assert stats.unique_count == 2
        assert stats.duplicate_count == 1

    def test_gray_detection(self):
        """White, gray and black all count as grays (low saturation)."""
        tokens = {
            "white": {"value": "#ffffff"},
            "gray": {"value": "#808080"},
            "black": {"value": "#000000"},
            "red": {"value": "#ff0000"},
        }
        stats = analyze_color_statistics(tokens)
        assert stats.gray_count >= 3  # white, gray, black are all low saturation

    def test_near_duplicates_found(self):
        """Colors within the similarity threshold show up as near-dupes."""
        tokens = {
            "red1": {"value": "#ff0000"},
            "red2": {"value": "#fe0101"},  # Very close to red1
            "blue": {"value": "#0000ff"},
        }
        stats = analyze_color_statistics(tokens, similarity_threshold=0.05)
        assert len(stats.near_duplicates) >= 1

    def test_hue_distribution(self):
        """Primary colors land in their own hue buckets."""
        tokens = {
            "red": {"value": "#ff0000"},
            "blue": {"value": "#0000ff"},
            "green": {"value": "#00ff00"},
        }
        stats = analyze_color_statistics(tokens)
        for bucket in ("red", "blue", "green"):
            assert bucket in stats.hue_distribution

    def test_empty_tokens(self):
        """No tokens means zero counts across the board."""
        stats = analyze_color_statistics({})
        assert stats.total_count == 0
        assert stats.unique_count == 0


# =============================================================================
# TEST CLASS: Rule Engine Integration
# =============================================================================

class TestRuleEngineIntegration:
    """End-to-end behaviour of the full run_rule_engine() function."""

    @staticmethod
    def _run_default():
        """Run the engine against the standard mock token set."""
        return run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
        )

    def test_returns_all_components(self):
        """All four analysis components are populated."""
        outcome = self._run_default()
        assert outcome.typography is not None
        assert outcome.accessibility is not None
        assert outcome.spacing is not None
        assert outcome.color_stats is not None

    def test_consistency_score_bounds(self):
        """The consistency score stays inside 0..100."""
        assert 0 <= self._run_default().consistency_score <= 100

    def test_aa_failures_counted(self):
        """The AA-failure summary count is never negative."""
        assert self._run_default().aa_failures >= 0

    def test_to_dict_serializable(self):
        """to_dict() produces JSON-serializable data."""
        import json
        payload = self._run_default().to_dict()
        assert len(json.dumps(payload)) > 0

    def test_log_callback_called(self):
        """The log callback receives messages, including the engine header."""
        messages = []
        run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
            log_callback=messages.append,
        )
        assert len(messages) > 0
        # Should contain rule engine header
        assert any("RULE ENGINE" in line for line in messages)

    def test_with_fg_bg_pairs(self):
        """FG/BG pairs feed into the accessibility results."""
        pairs = [
            {"foreground": "#06b2c4", "background": "#ffffff", "element": "button"},
            {"foreground": "#1a1a1a", "background": "#ffffff", "element": "heading"},
        ]
        outcome = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS_ALIGNED,
            fg_bg_pairs=pairs,
        )
        # Accessibility results must include the pair checks.
        assert len(outcome.accessibility) > 0

    def test_empty_tokens_no_crash(self):
        """Entirely empty inputs must not crash the engine."""
        outcome = run_rule_engine(
            typography_tokens={},
            color_tokens={},
            spacing_tokens={},
        )
        assert outcome.consistency_score >= 0

    def test_perfect_score_possible(self):
        """A tidy design system scores reasonably high."""
        spacing = {f"s{i}": {"value_px": i * 8} for i in range(1, 7)}  # all 8px-aligned
        typo = {  # consistent Major Third (1.25) scale
            f"t{i}": {"font_size": f"{16 * (1.25 ** i):.0f}px"}
            for i in range(5)
        }
        colors = {  # AA-passing colors only
            "dark": {"value": "#1a1a1a"},
            "medium": {"value": "#333333"},
        }
        outcome = run_rule_engine(
            typography_tokens=typo,
            color_tokens=colors,
            spacing_tokens=spacing,
        )
        assert outcome.consistency_score >= 50  # Should be reasonably high
spacing_tokens=spacing, + ) + assert result.consistency_score >= 50 # Should be reasonably high + + +# ============================================================================= +# TEST CLASS: Color Ramp Generation +# ============================================================================= + +class TestColorRampGeneration: + """Test color ramp generation from base color.""" + + def test_ramp_has_all_shades(self): + """Ramp generates all standard shades.""" + ramp = generate_color_ramp("#06b2c4") + assert "50" in ramp + assert "500" in ramp + assert "900" in ramp + assert len(ramp) == 10 + + def test_ramp_500_is_base(self): + """Shade 500 is the base color.""" + ramp = generate_color_ramp("#06b2c4") + assert ramp["500"] == "#06b2c4" + + def test_ramp_lightness_order(self): + """Lighter shades are lighter than darker shades.""" + ramp = generate_color_ramp("#06b2c4") + shade_50 = parse_color(ramp["50"]) + shade_900 = parse_color(ramp["900"]) + assert shade_50.hsl[2] > shade_900.hsl[2] # 50 is lighter + + def test_ramp_empty_on_invalid(self): + """Invalid color returns empty ramp.""" + ramp = generate_color_ramp("not-a-color") + assert ramp == {} + + +# ============================================================================= +# TEST CLASS: Edge Cases +# ============================================================================= + +class TestEdgeCases: + """Edge cases and boundary conditions.""" + + def test_is_gray_pure_white(self): + """White is gray (low saturation).""" + assert is_gray("#ffffff") + + def test_is_gray_pure_black(self): + """Black is gray (low saturation).""" + assert is_gray("#000000") + + def test_is_gray_red_is_not(self): + """Pure red is not gray.""" + assert not is_gray("#ff0000") + + def test_color_distance_black_white(self): + """Black to white is maximum distance.""" + dist = re_color_distance("#000000", "#ffffff") + assert dist > 0.9 # Close to maximum (~1.0) + + def test_very_large_spacing(self): + """Large spacing values 
don't crash.""" + tokens = {"huge": {"value_px": 10000}} + result = analyze_spacing_grid(tokens) + assert result.detected_base == 10000 + + def test_typography_mixed_units(self): + """Mixed px/rem/em units are handled.""" + tokens = { + "a": {"font_size": "16px"}, + "b": {"font_size": "1.5rem"}, + "c": {"font_size": "2em"}, + } + result = analyze_type_scale(tokens) + assert len(result.sizes_px) == 3 + assert 16.0 in result.sizes_px + assert 24.0 in result.sizes_px + assert 32.0 in result.sizes_px + + def test_duplicate_sizes_deduped(self): + """Duplicate font sizes are deduplicated.""" + tokens = { + "a": {"font_size": "16px"}, + "b": {"font_size": "16px"}, + "c": {"font_size": "24px"}, + } + result = analyze_type_scale(tokens) + assert len(result.sizes_px) == 2 # 16 and 24 + + def test_hex_to_rgb_shorthand(self): + """3-digit hex expands correctly.""" + assert re_hex_to_rgb("#fff") == (255, 255, 255) + assert re_hex_to_rgb("#000") == (0, 0, 0) + assert re_hex_to_rgb("#f00") == (255, 0, 0) + + +# ============================================================================= +# TEST CLASS: Color Name Generation Logic +# ============================================================================= + +class TestColorNameGeneration: + """Test color name generation logic (matching app.py's _generate_color_name_from_hex).""" + + def test_pure_red_hue(self): + """Pure red (#ff0000) has hue in red range.""" + import colorsys + hex_val = "#ff0000" + hex_clean = hex_val.lstrip('#').lower() + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = int(hex_clean[4:6], 16) / 255 + h, l, s = colorsys.rgb_to_hls(r, g, b) + hue = h * 360 + # Red should be in range 0-15 or 345-360 + assert hue < 15 or hue >= 345 + + def test_pure_blue_hue(self): + """Pure blue (#0000ff) has hue in blue range.""" + import colorsys + hex_val = "#0000ff" + hex_clean = hex_val.lstrip('#').lower() + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = 
int(hex_clean[4:6], 16) / 255 + h, l, s = colorsys.rgb_to_hls(r, g, b) + hue = h * 360 + # Blue should be in range 195-255 + assert 195 <= hue < 255 + + def test_gray_low_saturation(self): + """Gray colors (#888888) have low saturation.""" + import colorsys + hex_val = "#888888" + hex_clean = hex_val.lstrip('#').lower() + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = int(hex_clean[4:6], 16) / 255 + h, l, s = colorsys.rgb_to_hls(r, g, b) + # Gray should have very low saturation + assert s < 0.1 + + def test_white_high_lightness(self): + """White (#ffffff) has high lightness for shade 50.""" + import colorsys + hex_val = "#ffffff" + hex_clean = hex_val.lstrip('#').lower() + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = int(hex_clean[4:6], 16) / 255 + h, l, s = colorsys.rgb_to_hls(r, g, b) + # White should have lightness >= 0.95 -> shade 50 + assert l >= 0.95 + + def test_black_low_lightness(self): + """Black (#000000) has low lightness for shade 900.""" + import colorsys + hex_val = "#000000" + hex_clean = hex_val.lstrip('#').lower() + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = int(hex_clean[4:6], 16) / 255 + h, l, s = colorsys.rgb_to_hls(r, g, b) + # Black should have lightness < 0.10 -> shade 900 + assert l < 0.10 + + def test_teal_hue(self): + """Teal color (#06b2c4) has hue in teal range.""" + import colorsys + hex_val = "#06b2c4" + hex_clean = hex_val.lstrip('#').lower() + r = int(hex_clean[0:2], 16) / 255 + g = int(hex_clean[2:4], 16) / 255 + b = int(hex_clean[4:6], 16) / 255 + h, l, s = colorsys.rgb_to_hls(r, g, b) + hue = h * 360 + # Teal should be in range 150-195 + assert 150 <= hue < 195 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_stage2_pipeline.py b/tests/test_stage2_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..1bde7f002fffa5dd3f6c98a198bf5570343d86e5 --- /dev/null +++ 
# (patch hunk header: tests/test_stage2_pipeline.py, new file, +665 lines)
#!/usr/bin/env python3
"""
Stage 2 Pipeline Test Script
============================

Tests the new Stage 2 architecture:
- Layer 1: Rule Engine
- Layer 2: Benchmark Research
- Layer 3: LLM Agents
- Layer 4: HEAD Synthesizer

Run: python tests/test_stage2_pipeline.py
"""

import asyncio
import json
import os
import sys
from datetime import datetime
from typing import Optional

import pytest

# Add parent directory to path so `core.*` / `agents.*` resolve when this file
# is run directly as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


# =============================================================================
# TEST DATA - Mock extracted tokens
# =============================================================================

MOCK_TYPOGRAPHY_TOKENS = {
    "heading-1": {"font_size": "48px", "font_weight": "700", "line_height": "1.2", "font_family": "Inter"},
    "heading-2": {"font_size": "36px", "font_weight": "600", "line_height": "1.25", "font_family": "Inter"},
    "heading-3": {"font_size": "28px", "font_weight": "600", "line_height": "1.3", "font_family": "Inter"},
    "heading-4": {"font_size": "22px", "font_weight": "500", "line_height": "1.35", "font_family": "Inter"},
    "body-large": {"font_size": "18px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "body": {"font_size": "16px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "body-small": {"font_size": "14px", "font_weight": "400", "line_height": "1.5", "font_family": "Inter"},
    "caption": {"font_size": "12px", "font_weight": "400", "line_height": "1.4", "font_family": "Inter"},
}

MOCK_COLOR_TOKENS = {
    "brand-primary": {"value": "#06b2c4", "frequency": 45, "context": "buttons, links"},
    "brand-secondary": {"value": "#c1df1f", "frequency": 23, "context": "highlights, badges"},
    "text-primary": {"value": "#1a1a1a", "frequency": 120, "context": "headings, body"},
    "text-secondary": {"value": "#666666", "frequency": 80, "context": "captions, muted"},
    "text-tertiary": {"value": "#999999", "frequency": 40, "context": "placeholders"},
    "background-primary": {"value": "#ffffff", "frequency": 200, "context": "page background"},
    "background-secondary": {"value": "#f5f5f5", "frequency": 60, "context": "cards, sections"},
    "background-tertiary": {"value": "#e8e8e8", "frequency": 30, "context": "dividers"},
    "border-default": {"value": "#dddddd", "frequency": 50, "context": "borders"},
    "border-focus": {"value": "#06b2c4", "frequency": 15, "context": "focus rings"},
    "success": {"value": "#22c55e", "frequency": 10, "context": "success states"},
    "warning": {"value": "#f59e0b", "frequency": 8, "context": "warning states"},
    "error": {"value": "#ef4444", "frequency": 12, "context": "error states"},
    "info": {"value": "#3b82f6", "frequency": 6, "context": "info states"},
    # Some problematic colors for testing
    "light-cyan": {"value": "#7dd3fc", "frequency": 5, "context": "light accent"},  # Fails AA
    "light-lime": {"value": "#d9f99d", "frequency": 3, "context": "light highlight"},  # Fails AA
}

MOCK_SPACING_TOKENS = {
    "space-1": {"value": "4px", "value_px": 4, "frequency": 30},
    "space-2": {"value": "8px", "value_px": 8, "frequency": 80},
    "space-3": {"value": "12px", "value_px": 12, "frequency": 45},
    "space-4": {"value": "16px", "value_px": 16, "frequency": 60},
    "space-5": {"value": "20px", "value_px": 20, "frequency": 25},
    "space-6": {"value": "24px", "value_px": 24, "frequency": 40},
    "space-8": {"value": "32px", "value_px": 32, "frequency": 20},
    "space-10": {"value": "40px", "value_px": 40, "frequency": 15},
    "space-12": {"value": "48px", "value_px": 48, "frequency": 10},
    # Some misaligned values for testing
    "space-odd-1": {"value": "5px", "value_px": 5, "frequency": 3},
    "space-odd-2": {"value": "10px", "value_px": 10, "frequency": 5},
}

MOCK_SEMANTIC_ANALYSIS = {
    "brand": [{"hex": "#06b2c4", "name": "brand-primary"}, {"hex": "#c1df1f", "name": "brand-secondary"}],
    "text": [{"hex": "#1a1a1a", "name": "text-primary"}, {"hex": "#666666", "name": "text-secondary"}],
    "background": [{"hex": "#ffffff", "name": "background-primary"}, {"hex": "#f5f5f5", "name": "background-secondary"}],
    "border": [{"hex": "#dddddd", "name": "border-default"}],
    "feedback": [{"hex": "#22c55e", "name": "success"}, {"hex": "#ef4444", "name": "error"}],
}


# =============================================================================
# TEST HELPERS
# =============================================================================

class TestLogger:
    """Simple logger for tests.

    Collects every logged message in ``self.logs`` and optionally echoes it
    to stdout when ``verbose`` is True.
    """

    # FIX: the name starts with "Test" and the class has an __init__, so
    # pytest would otherwise try to collect it and emit a
    # PytestCollectionWarning on every run. This opts it out of collection.
    __test__ = False

    def __init__(self, verbose: bool = True):
        self.verbose = verbose
        self.logs = []

    def log(self, msg: str):
        """Record *msg*, echoing it when verbose."""
        self.logs.append(msg)
        if self.verbose:
            print(msg)

    def get_logs(self) -> str:
        """Return all recorded messages joined by newlines."""
        return "\n".join(self.logs)


def print_section(title: str):
    """Print a section header."""
    print("\n" + "=" * 60)
    print(f" {title}")
    print("=" * 60 + "\n")


def print_result(name: str, passed: bool, details: str = ""):
    """Print a test result."""
    icon = "✅" if passed else "❌"
    print(f" {icon} {name}")
    if details:
        print(f"    {details}")


# =============================================================================
# LAYER 1: RULE ENGINE TESTS
# =============================================================================

def test_rule_engine():
    """Test the Rule Engine layer.

    Returns True when every sub-check passed, False otherwise; each failure
    is reported via print_result rather than aborting the run.
    """
    print_section("LAYER 1: RULE ENGINE TESTS")

    all_passed = True

    try:
        from core.rule_engine import (
            run_rule_engine,
            analyze_type_scale,
            analyze_accessibility,
            analyze_spacing_grid,
            analyze_color_statistics,
        )
        print_result("Import rule_engine", True)
    except Exception as e:
        print_result("Import rule_engine", False, str(e))
        return False

    # Test Type Scale Analysis
    try:
        typo_result = analyze_type_scale(MOCK_TYPOGRAPHY_TOKENS)

        assert typo_result.detected_ratio > 0, "Ratio should be positive"
        assert typo_result.closest_standard_ratio > 0, "Standard ratio should be positive"
        assert typo_result.scale_name != "", "Scale name should not be empty"
        assert len(typo_result.sizes_px) > 0, "Should detect sizes"

        print_result(
            "Type Scale Analysis",
            True,
            f"ratio={typo_result.detected_ratio:.3f}, consistent={typo_result.is_consistent}"
        )
    except Exception as e:
        print_result("Type Scale Analysis", False, str(e))
        all_passed = False

    # Test Accessibility Analysis
    try:
        access_result = analyze_accessibility(MOCK_COLOR_TOKENS)

        assert len(access_result) > 0, "Should analyze colors"

        failures = [a for a in access_result if not a.passes_aa_normal]
        passes = len(access_result) - len(failures)

        # Check that fixes are generated for failures
        fixes_generated = sum(1 for a in failures if a.suggested_fix)

        print_result(
            "Accessibility Analysis",
            True,
            f"total={len(access_result)}, pass={passes}, fail={len(failures)}, fixes={fixes_generated}"
        )
    except Exception as e:
        print_result("Accessibility Analysis", False, str(e))
        all_passed = False

    # Test Spacing Grid Analysis
    try:
        spacing_result = analyze_spacing_grid(MOCK_SPACING_TOKENS)

        assert spacing_result.detected_base > 0, "Base should be positive"
        assert len(spacing_result.current_values) > 0, "Should detect values"
        assert len(spacing_result.suggested_scale) > 0, "Should suggest scale"

        print_result(
            "Spacing Grid Analysis",
            True,
            f"base={spacing_result.detected_base}px, aligned={spacing_result.alignment_percentage:.0f}%"
        )
    except Exception as e:
        print_result("Spacing Grid Analysis", False, str(e))
        all_passed = False

    # Test Color Statistics
    try:
        color_stats = analyze_color_statistics(MOCK_COLOR_TOKENS)

        assert color_stats.total_count > 0, "Should count colors"
        assert color_stats.unique_count > 0, "Should count unique"

        print_result(
            "Color Statistics",
            True,
            f"total={color_stats.total_count}, unique={color_stats.unique_count}, grays={color_stats.gray_count}"
        )
    except Exception as e:
        print_result("Color Statistics", False, str(e))
        all_passed = False

    # Test Full Rule Engine
    try:
        logger = TestLogger(verbose=False)

        full_result = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS,
            log_callback=logger.log,
        )

        assert full_result.typography is not None
        assert full_result.accessibility is not None
        assert full_result.spacing is not None
        assert full_result.color_stats is not None
        assert 0 <= full_result.consistency_score <= 100

        print_result(
            "Full Rule Engine",
            True,
            f"consistency_score={full_result.consistency_score}, aa_failures={full_result.aa_failures}"
        )

        # Check logs were generated
        log_lines = len(logger.logs)
        print_result("Log Generation", log_lines > 10, f"{log_lines} log lines")

    except Exception as e:
        print_result("Full Rule Engine", False, str(e))
        all_passed = False

    return all_passed
# (middle and tail of test_rule_engine elided — they are reconstructed
#  alongside that function's definition earlier in this file)


# =============================================================================
# LAYER 2: BENCHMARK RESEARCH TESTS
# =============================================================================

def test_benchmark_research():
    """Test the Benchmark Research layer.

    Returns True when every sub-check passed, False otherwise; each failure
    is reported via print_result rather than aborting the run.
    """
    print_section("LAYER 2: BENCHMARK RESEARCH TESTS")

    all_passed = True

    try:
        from agents.benchmark_researcher import (
            BenchmarkResearcher,
            BenchmarkCache,
            DESIGN_SYSTEM_SOURCES,
            FALLBACK_BENCHMARKS,
            get_available_benchmarks,
            get_benchmark_choices,
        )
        print_result("Import benchmark_researcher", True)
    except Exception as e:
        print_result("Import benchmark_researcher", False, str(e))
        return False

    # Test Design System Sources
    try:
        assert len(DESIGN_SYSTEM_SOURCES) >= 6, "Should have at least 6 design systems"

        required_systems = ["material_design_3", "shopify_polaris", "atlassian_design"]
        # FIX: loop variable renamed from `sys` — the original shadowed the
        # `sys` module imported at the top of this file.
        for system_key in required_systems:
            assert system_key in DESIGN_SYSTEM_SOURCES, f"Missing {system_key}"
            assert "urls" in DESIGN_SYSTEM_SOURCES[system_key], f"Missing URLs for {system_key}"
            assert "best_for" in DESIGN_SYSTEM_SOURCES[system_key], f"Missing best_for for {system_key}"

        print_result("Design System Sources", True, f"{len(DESIGN_SYSTEM_SOURCES)} systems defined")
    except Exception as e:
        print_result("Design System Sources", False, str(e))
        all_passed = False

    # Test Fallback Benchmarks
    try:
        assert len(FALLBACK_BENCHMARKS) >= 6, "Should have fallbacks"

        for key, fallback in FALLBACK_BENCHMARKS.items():
            assert "typography" in fallback, f"Missing typography for {key}"
            assert "spacing" in fallback, f"Missing spacing for {key}"
            assert fallback["typography"].get("scale_ratio"), f"Missing scale_ratio for {key}"

        print_result("Fallback Benchmarks", True, f"{len(FALLBACK_BENCHMARKS)} fallbacks defined")
    except Exception as e:
        print_result("Fallback Benchmarks", False, str(e))
        all_passed = False

    # Test Cache
    try:
        cache = BenchmarkCache()

        # Test set/get
        from agents.benchmark_researcher import BenchmarkData
        test_data = BenchmarkData(
            key="test_system",
            name="Test System",
            short_name="Test",
            vendor="Test Vendor",
            icon="🧪",
            typography={"scale_ratio": 1.25, "base_size": 16},
            spacing={"base": 8},
            fetched_at=datetime.now().isoformat(),
            confidence="high",
        )

        cache.set("test_system", test_data)
        retrieved = cache.get("test_system")

        assert retrieved is not None, "Should retrieve cached data"
        assert retrieved.typography.get("scale_ratio") == 1.25, "Data should match"

        print_result("Benchmark Cache", True, "set/get working")
    except Exception as e:
        print_result("Benchmark Cache", False, str(e))
        all_passed = False

    # Test Helper Functions
    try:
        benchmarks = get_available_benchmarks()
        assert len(benchmarks) >= 6, "Should list benchmarks"
        assert all("key" in b and "name" in b for b in benchmarks)

        choices = get_benchmark_choices()
        assert len(choices) >= 6, "Should have choices"
        assert all(isinstance(c, tuple) and len(c) == 2 for c in choices)

        print_result("Helper Functions", True, f"{len(benchmarks)} benchmarks available")
    except Exception as e:
        print_result("Helper Functions", False, str(e))
        all_passed = False

    # Test Researcher Initialization
    try:
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)
        assert researcher.cache is not None

        print_result("Researcher Initialization", True, "initialized without clients")
    except Exception as e:
        print_result("Researcher Initialization", False, str(e))
        all_passed = False

    # Test Comparison Logic (with fallback data)
    try:
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)

        # Create mock benchmark data
        from agents.benchmark_researcher import BenchmarkData
        mock_benchmarks = []
        for key in ["material_design_3", "shopify_polaris", "atlassian_design"]:
            source = DESIGN_SYSTEM_SOURCES[key]
            fallback = FALLBACK_BENCHMARKS[key]
            mock_benchmarks.append(BenchmarkData(
                key=key,
                name=source["name"],
                short_name=source["short_name"],
                vendor=source["vendor"],
                icon=source["icon"],
                typography=fallback["typography"],
                spacing=fallback["spacing"],
                fetched_at=datetime.now().isoformat(),
                confidence="fallback",
                best_for=source["best_for"],
            ))

        comparisons = researcher.compare_to_benchmarks(
            your_ratio=1.18,
            your_base_size=16,
            your_spacing_grid=8,
            benchmarks=mock_benchmarks,
            log_callback=lambda x: None,
        )

        assert len(comparisons) == 3, "Should have 3 comparisons"
        assert comparisons[0].similarity_score <= comparisons[1].similarity_score, "Should be sorted"

        print_result(
            "Comparison Logic",
            True,
            f"closest={comparisons[0].benchmark.short_name}, score={comparisons[0].similarity_score:.2f}"
        )
    except Exception as e:
        print_result("Comparison Logic", False, str(e))
        all_passed = False

    return all_passed
# =============================================================================
# LAYER 3: LLM AGENTS TESTS
# =============================================================================

def test_llm_agents():
    """Smoke-test the LLM Agents layer without making any LLM calls.

    Verifies that the agent module imports, that its result data classes
    round-trip through ``to_dict()``, that each agent can be constructed
    without an HF client, and that every agent declares a ``PROMPT_TEMPLATE``
    containing the placeholders it substitutes at runtime.

    Returns:
        bool: True when every check passed, False otherwise.
    """
    print_section("LAYER 3: LLM AGENTS TESTS")

    all_passed = True

    # An import failure is fatal for this layer: nothing below can run.
    try:
        from agents.llm_agents import (
            BrandIdentifierAgent,
            BenchmarkAdvisorAgent,
            BestPracticesValidatorAgent,
            HeadSynthesizerAgent,
            BrandIdentification,
            BenchmarkAdvice,
            BestPracticesResult,
            HeadSynthesis,
        )
        print_result("Import llm_agents", True)
    except Exception as e:
        print_result("Import llm_agents", False, str(e))
        return False

    # Test Data Classes — each result type must serialize via to_dict().
    try:
        brand = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high"},
            cohesion_score=7,
        )
        assert brand.to_dict()["brand_primary"]["color"] == "#06b2c4"

        advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            reasoning="Best fit for e-commerce",
        )
        assert advice.to_dict()["recommended_benchmark"] == "shopify_polaris"

        practices = BestPracticesResult(
            overall_score=65,
            priority_fixes=[{"issue": "AA compliance", "impact": "high"}],
        )
        assert practices.to_dict()["overall_score"] == 65

        synthesis = HeadSynthesis(
            executive_summary="Test summary",
            scores={"overall": 60},
        )
        assert synthesis.to_dict()["scores"]["overall"] == 60

        print_result("Data Classes", True, "all serializable")
    except Exception as e:
        print_result("Data Classes", False, str(e))
        all_passed = False

    # Test Agent Initialization (without HF client)
    # FIX: the constructed agents were previously bound to unused locals and
    # never inspected; assert on the instances so the check has teeth.
    try:
        agents = (
            BrandIdentifierAgent(hf_client=None),
            BenchmarkAdvisorAgent(hf_client=None),
            BestPracticesValidatorAgent(hf_client=None),
            HeadSynthesizerAgent(hf_client=None),
        )
        assert all(agent is not None for agent in agents)

        print_result("Agent Initialization", True, "all agents created")
    except Exception as e:
        print_result("Agent Initialization", False, str(e))
        all_passed = False

    # Test Prompt Templates exist
    try:
        assert hasattr(BrandIdentifierAgent, 'PROMPT_TEMPLATE')
        assert hasattr(BenchmarkAdvisorAgent, 'PROMPT_TEMPLATE')
        assert hasattr(BestPracticesValidatorAgent, 'PROMPT_TEMPLATE')
        assert hasattr(HeadSynthesizerAgent, 'PROMPT_TEMPLATE')

        # Check templates have the placeholders each agent fills in.
        assert "{color_data}" in BrandIdentifierAgent.PROMPT_TEMPLATE
        assert "{user_ratio}" in BenchmarkAdvisorAgent.PROMPT_TEMPLATE
        assert "{type_ratio}" in BestPracticesValidatorAgent.PROMPT_TEMPLATE
        assert "{type_ratio}" in HeadSynthesizerAgent.PROMPT_TEMPLATE

        print_result("Prompt Templates", True, "all templates defined with placeholders")
    except Exception as e:
        print_result("Prompt Templates", False, str(e))
        all_passed = False

    return all_passed


# =============================================================================
# INTEGRATION TEST
# =============================================================================

@pytest.mark.asyncio
async def test_integration():
    """Test the full pipeline integration (without actual LLM calls).

    Chains the deterministic layers end to end — rule engine on mock tokens,
    benchmark comparison against fallback data, mocked LLM agent results —
    and confirms the combined output is JSON-serializable.

    Returns:
        bool: True when the whole flow ran without raising, False otherwise.
    """
    print_section("INTEGRATION TEST")

    all_passed = True

    # Test full Rule Engine + Benchmark comparison flow
    try:
        from core.rule_engine import run_rule_engine
        from agents.benchmark_researcher import (
            BenchmarkResearcher,
            BenchmarkData,
            DESIGN_SYSTEM_SOURCES,
            FALLBACK_BENCHMARKS
        )
        from agents.llm_agents import (
            BrandIdentification,
            BenchmarkAdvice,
            BestPracticesResult,
            HeadSynthesis,  # imported alongside the others to verify availability
        )

        logger = TestLogger(verbose=False)

        # Step 1: Run Rule Engine on the mock token fixtures.
        rule_results = run_rule_engine(
            typography_tokens=MOCK_TYPOGRAPHY_TOKENS,
            color_tokens=MOCK_COLOR_TOKENS,
            spacing_tokens=MOCK_SPACING_TOKENS,
            log_callback=logger.log,
        )

        print_result("Step 1: Rule Engine", True, f"score={rule_results.consistency_score}")

        # Step 2: Benchmark Research (using fallbacks) — no network clients.
        researcher = BenchmarkResearcher(firecrawl_client=None, hf_client=None)

        mock_benchmarks = []
        for key in ["material_design_3", "shopify_polaris", "atlassian_design"]:
            source = DESIGN_SYSTEM_SOURCES[key]
            fallback = FALLBACK_BENCHMARKS[key]
            mock_benchmarks.append(BenchmarkData(
                key=key,
                name=source["name"],
                short_name=source["short_name"],
                vendor=source["vendor"],
                icon=source["icon"],
                typography=fallback["typography"],
                spacing=fallback["spacing"],
                fetched_at=datetime.now().isoformat(),
                confidence="fallback",
                best_for=source["best_for"],
            ))

        comparisons = researcher.compare_to_benchmarks(
            your_ratio=rule_results.typography.detected_ratio,
            # Fall back to a 16px base when no sizes were detected.
            your_base_size=int(rule_results.typography.sizes_px[0]) if rule_results.typography.sizes_px else 16,
            your_spacing_grid=rule_results.spacing.detected_base,
            benchmarks=mock_benchmarks,
            log_callback=logger.log,
        )

        print_result("Step 2: Benchmark Comparison", True, f"closest={comparisons[0].benchmark.short_name}")

        # Step 3: Mock LLM results (simulating what agents would return)
        brand_result = BrandIdentification(
            brand_primary={"color": "#06b2c4", "confidence": "high", "reasoning": "Most used on CTAs"},
            brand_secondary={"color": "#c1df1f", "confidence": "medium"},
            palette_strategy="complementary",
            cohesion_score=7,
        )

        benchmark_advice = BenchmarkAdvice(
            recommended_benchmark="shopify_polaris",
            recommended_benchmark_name="Shopify Polaris",
            reasoning="Best match for e-commerce UX",
            alignment_changes=[
                {"change": "Type scale", "from": "1.18", "to": "1.25", "effort": "medium"}
            ],
        )

        best_practices = BestPracticesResult(
            overall_score=58,
            checks={
                "type_scale_standard": {"status": "warn", "note": "1.18 close to Minor Third"},
                "aa_compliance": {"status": "fail", "note": "2 colors fail AA"},
            },
            priority_fixes=[
                {"rank": 1, "issue": "Brand primary fails AA", "impact": "high", "effort": "low"},
            ],
        )

        print_result("Step 3: Mock LLM Results", True, "all results created")

        # Step 4: Verify data can be serialized
        output = {
            "rule_engine": rule_results.to_dict(),
            "benchmarks": [c.to_dict() for c in comparisons],
            "brand": brand_result.to_dict(),
            "advice": benchmark_advice.to_dict(),
            "practices": best_practices.to_dict(),
        }

        json_str = json.dumps(output, indent=2)
        assert len(json_str) > 100, "Should produce substantial output"

        print_result("Step 4: Serialization", True, f"{len(json_str)} bytes")

        # Final summary
        print("\n 📊 Integration Summary:")
        print(f"   - Rule Engine Score: {rule_results.consistency_score}/100")
        print(f"   - AA Failures: {rule_results.aa_failures}")
        print(f"   - Closest Benchmark: {comparisons[0].benchmark.name}")
        print(f"   - Match: {comparisons[0].overall_match_pct:.0f}%")

        # FIX: removed a redundant ``all_passed = True`` reassignment here —
        # the flag is already True unless the except branch below runs.

    except Exception as e:
        import traceback
        print_result("Integration Test", False, str(e))
        traceback.print_exc()
        all_passed = False

    return all_passed


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Run all Stage 2 test layers and print a pass/fail summary.

    Returns:
        int: process exit code — 0 when every layer passed, 1 otherwise.
    """
    print("\n" + "█" * 60)
    print("  STAGE 2 PIPELINE TEST SUITE")
    print("█" * 60)
    print(f"\n Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    results = {}

    # Run tests — each returns a bool; the async integration test is driven
    # with asyncio.run() so this entry point stays synchronous.
    results["Rule Engine"] = test_rule_engine()
    results["Benchmark Research"] = test_benchmark_research()
    results["LLM Agents"] = test_llm_agents()
    results["Integration"] = asyncio.run(test_integration())

    # Summary
    print_section("TEST SUMMARY")

    total = len(results)
    # FIX: bools are ints — sum them directly instead of a filtered generator.
    passed = sum(results.values())

    for name, result in results.items():
        icon = "✅" if result else "❌"
        print(f"  {icon} {name}")

    print(f"\n Total: {passed}/{total} passed")

    if passed == total:
        print("\n 🎉 All tests passed!")
        return 0
    else:
        print("\n ⚠️ Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())
a/ui/__init__.py b/ui/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391