riazmo committed on
Commit
38d9cec
·
verified ·
1 Parent(s): bcbb324

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +482 -0
app.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Design System Extractor v2 β€” Main Application
3
+ ==============================================
4
+
5
+ A semi-automated, human-in-the-loop agentic system that reverse-engineers
6
+ design systems from live websites.
7
+
8
+ Usage:
9
+ python app.py
10
+ """
11
+
12
+ import os
13
+ import asyncio
14
+ import gradio as gr
15
+ from datetime import datetime
16
+
17
# Get HF token from environment if available; used to pre-fill the UI token
# field and decide whether the Configuration accordion starts open.
HF_TOKEN_FROM_ENV = os.getenv("HF_TOKEN", "")
19
+
20
+ # =============================================================================
21
+ # GLOBAL STATE
22
+ # =============================================================================
23
+
24
# Session-wide extraction state shared by the Gradio handlers. Known keys:
# "discovered_pages", "base_url", and "<viewport>_tokens"
# (e.g. "desktop_tokens" / "mobile_tokens").
current_extraction: dict = {}
# Token entered via the UI; mirrored into os.environ["HF_TOKEN"] by set_hf_token.
user_hf_token: str = ""
26
+
27
+
28
+ # =============================================================================
29
+ # HF TOKEN MANAGEMENT
30
+ # =============================================================================
31
+
32
def set_hf_token(token: str) -> str:
    """Persist the user's HuggingFace token for later AI-agent calls.

    The token is stored both in module-level state and in the process
    environment (``HF_TOKEN``) so downstream libraries can pick it up.
    Returns a status message for display in the UI.
    """
    global user_hf_token

    # Reject empty or implausibly short tokens before touching any state.
    looks_valid = bool(token) and len(token) >= 10
    if not looks_valid:
        return "❌ Please enter a valid HuggingFace token"

    user_hf_token = token.strip()
    os.environ["HF_TOKEN"] = user_hf_token
    return "βœ… Token saved! You can now use the extractor."
43
+
44
+
45
+ # =============================================================================
46
+ # LAZY IMPORTS (avoid circular imports at startup)
47
+ # =============================================================================
48
+
49
# Cached module handles; populated on first use to avoid circular imports
# at application startup.
_crawler_module = None
_extractor_module = None
_schema_module = None

def get_crawler():
    """Return the ``agents.crawler`` module, importing it on first call."""
    global _crawler_module
    if _crawler_module is not None:
        return _crawler_module
    import agents.crawler
    _crawler_module = agents.crawler
    return _crawler_module
59
+
60
def get_extractor():
    """Return the ``agents.extractor`` module, importing it on first call."""
    global _extractor_module
    if _extractor_module is not None:
        return _extractor_module
    import agents.extractor
    _extractor_module = agents.extractor
    return _extractor_module
66
+
67
def get_schema():
    """Return the ``core.token_schema`` module, importing it on first call."""
    global _schema_module
    if _schema_module is not None:
        return _schema_module
    import core.token_schema
    _schema_module = core.token_schema
    return _schema_module
73
+
74
+
75
+ # =============================================================================
76
+ # STAGE 1: URL INPUT & PAGE DISCOVERY
77
+ # =============================================================================
78
+
79
async def discover_site_pages(url: str, progress=gr.Progress()) -> tuple:
    """
    Discover pages from a website URL.

    Args:
        url: Fully-qualified site URL; must start with http:// or https://.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple of (status_message, pages_rows) where ``pages_rows`` is a list
        of [Select, URL, Title, Type, Status] rows for the pages Dataframe,
        or ``None`` on failure.
    """
    if not url or not url.startswith(("http://", "https://")):
        # BUG FIX: this branch previously returned three values while the
        # click handler has only two outputs (status + table) and every other
        # branch returns two; the mismatch broke the validation path.
        return "❌ Please enter a valid URL starting with http:// or https://", None

    progress(0, desc="πŸš€ Initializing browser...")

    try:
        crawler = get_crawler()
        discoverer = crawler.PageDiscoverer()

        def update_progress(p):
            progress(p, desc=f"πŸ” Discovering pages... ({int(p*100)}%)")

        pages = await discoverer.discover(url, progress_callback=update_progress)

        progress(1.0, desc="βœ… Discovery complete!")

        # Flatten page objects into plain values; Gradio Dataframes cannot
        # render arbitrary objects.
        pages_data = []
        for page in pages:
            pages_data.append([
                page.selected,                                   # Select (bool)
                page.url,                                        # URL (str)
                page.title if page.title else "(No title)",      # Title (str)
                page.page_type.value,                            # Type (str)
                "βœ“" if not page.error else f"⚠ {page.error}"  # Status (str)
            ])

        # Stash results for the extraction stage.
        current_extraction["discovered_pages"] = pages
        current_extraction["base_url"] = url

        status = f"βœ… Found {len(pages)} pages. Select the pages you want to extract tokens from."

        return status, pages_data

    except Exception as e:
        import traceback
        return f"❌ Error: {str(e)}\n\n{traceback.format_exc()}", None
123
+
124
+
125
async def start_extraction(pages_selection, viewport_choice: str, progress=gr.Progress()) -> tuple:
    """
    Start token extraction from selected pages.

    Args:
        pages_selection: Rows from the discovery Dataframe; either a list of
            [Select, URL, Title, Type, Status] lists or a list of dicts with
            "Select"/"URL" keys (Gradio can deliver either shape).
        viewport_choice: Radio label, "Desktop (1440px)" or "Mobile (375px)".
        progress: Gradio progress tracker (injected by the UI).

    Returns tuple of (status, colors_data, typography_data, spacing_data);
    the three data items are list-of-lists rows for the result Dataframes,
    or None on early exit/failure.
    """
    if pages_selection is None or len(pages_selection) == 0:
        return "❌ Please discover pages first", None, None, None

    progress(0, desc="πŸ”„ Preparing extraction...")

    # Get selected URLs from the dataframe
    selected_urls = []

    # Handle both list of lists and list of dicts formats
    for row in pages_selection:
        if isinstance(row, (list, tuple)):
            # Format: [Select, URL, Title, Type, Status]
            if len(row) >= 2 and row[0]:  # row[0] is the Select checkbox
                selected_urls.append(row[1])  # row[1] is the URL
        elif isinstance(row, dict):
            if row.get("Select", False):
                selected_urls.append(row.get("URL", ""))

    if not selected_urls:
        return "❌ Please select at least one page using the checkboxes", None, None, None

    progress(0.05, desc=f"πŸ“‹ Selected {len(selected_urls)} pages for extraction...")

    # Determine viewport from the radio label (exact string match).
    schema = get_schema()
    viewport = schema.Viewport.DESKTOP if viewport_choice == "Desktop (1440px)" else schema.Viewport.MOBILE

    try:
        extractor_mod = get_extractor()
        extractor = extractor_mod.TokenExtractor(viewport=viewport)

        def update_progress(p):
            # Scale the extractor's 0..1 progress into the 0.1..0.9 band so
            # pre/post-processing steps own the remaining range.
            scaled = 0.1 + (p * 0.8)
            progress(scaled, desc=f"πŸ”¬ Extracting tokens... ({int(p*100)}%)")

        progress(0.1, desc=f"🌐 Starting {viewport.value} extraction...")

        result = await extractor.extract(selected_urls, progress_callback=update_progress)

        progress(0.9, desc="πŸ“Š Processing results...")

        # Store result keyed by viewport so desktop/mobile runs coexist.
        current_extraction[f"{viewport.value}_tokens"] = result

        # Format colors for display - top 50 by frequency, as list of lists
        # (Gradio Dataframes cannot render arbitrary objects).
        colors_data = []
        for color in sorted(result.colors, key=lambda c: -c.frequency)[:50]:
            colors_data.append([
                True,  # Accept
                color.value,  # Color
                color.frequency,  # Frequency
                ", ".join(color.contexts[:3]) if color.contexts else "",  # Context
                f"{color.contrast_white:.1f}:1",  # Contrast vs white
                "βœ“" if color.wcag_aa_small_text else "βœ—",  # AA Text
                color.confidence.value if color.confidence else "low"  # Confidence
            ])

        progress(0.93, desc="πŸ“ Processing typography...")

        # Format typography for display - top 30 by frequency.
        typography_data = []
        for typo in sorted(result.typography, key=lambda t: -t.frequency)[:30]:
            typography_data.append([
                True,  # Accept
                typo.font_family,  # Font
                typo.font_size,  # Size
                typo.font_weight,  # Weight
                typo.line_height if typo.line_height else "",  # Line Height
                ", ".join(typo.elements[:3]) if typo.elements else "",  # Elements
                typo.frequency  # Frequency
            ])

        progress(0.96, desc="πŸ“ Processing spacing...")

        # Format spacing for display - smallest 20 values, ascending.
        spacing_data = []
        for space in sorted(result.spacing, key=lambda s: s.value_px)[:20]:
            spacing_data.append([
                True,  # Accept
                space.value,  # Value
                space.frequency,  # Frequency
                ", ".join(space.contexts[:2]) if space.contexts else "",  # Context
                "βœ“" if space.fits_base_8 else "",  # Fits 8px grid
                "⚠" if space.is_outlier else ""  # Outlier
            ])

        progress(1.0, desc="βœ… Extraction complete!")

        # Markdown summary rendered into the status panel.
        status = f"""βœ… **Extraction Complete** ({viewport.value})

### πŸ“Š Summary
| Metric | Value |
|--------|-------|
| Pages crawled | {len(result.pages_crawled)} |
| Colors found | {len(result.colors)} |
| Typography styles | {len(result.typography)} |
| Spacing values | {len(result.spacing)} |
| Font families | {len(result.font_families)} |
| Spacing base | {result.spacing_base or 'Unknown'}px |
| Duration | {result.extraction_duration_ms}ms |
"""

        if result.warnings:
            status += f"\n⚠️ **Warnings:** {len(result.warnings)}"
        if result.errors:
            status += f"\n❌ **Errors:** {len(result.errors)}"
            # Show at most the first three errors to keep the panel readable.
            for err in result.errors[:3]:
                status += f"\n- {err}"

        return status, colors_data, typography_data, spacing_data

    except Exception as e:
        import traceback
        return f"❌ Extraction failed: {str(e)}\n\n```\n{traceback.format_exc()}\n```", None, None, None
247
+
248
+
249
def export_tokens_json():
    """Serialize the extracted tokens to a pretty-printed JSON string.

    Includes a top-level section per viewport ("desktop"/"mobile") that has
    completed extraction; returns an error JSON if nothing was extracted yet.
    """
    import json

    result = {}

    # Both viewports share the same export shape, so build them in one loop.
    for viewport_name in ("desktop", "mobile"):
        store_key = f"{viewport_name}_tokens"
        if store_key not in current_extraction:
            continue
        tokens = current_extraction[store_key]
        result[viewport_name] = {
            "colors": [c.model_dump() for c in tokens.colors],
            "typography": [t.model_dump() for t in tokens.typography],
            "spacing": [s.model_dump() for s in tokens.spacing],
            "metadata": tokens.summary(),
        }

    if not result:
        return '{"error": "No tokens extracted yet. Please run extraction first."}'

    # default=str keeps non-JSON-native values (enums, datetimes) exportable.
    return json.dumps(result, indent=2, default=str)
277
+
278
+
279
+ # =============================================================================
280
+ # UI BUILDING
281
+ # =============================================================================
282
+
283
def create_ui():
    """Create the Gradio interface.

    Builds a Blocks app with a configuration accordion plus three workflow
    stages (discovery, extraction, export) and wires their event handlers.
    Returns the Blocks instance; the caller launches it.
    """

    with gr.Blocks(
        title="Design System Extractor v2",
        theme=gr.themes.Soft(),
    ) as app:

        # Header
        gr.Markdown("""
        # 🎨 Design System Extractor v2

        **Reverse-engineer design systems from live websites.**

        Extract colors, typography, and spacing tokens from any website and export to Figma-compatible JSON.

        ---
        """)

        # =================================================================
        # CONFIGURATION SECTION
        # =================================================================

        # Accordion starts collapsed when a token was already found in env.
        with gr.Accordion("βš™οΈ Configuration", open=not bool(HF_TOKEN_FROM_ENV)):

            gr.Markdown("""
            **HuggingFace Token** is required for AI-powered features (Agent 2-4).
            Get your token at: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)

            *Note: Basic extraction (Agent 1) works without a token.*
            """)

            with gr.Row():
                hf_token_input = gr.Textbox(
                    label="HuggingFace Token",
                    placeholder="hf_xxxxxxxxxxxxxxxxxxxx",
                    type="password",
                    scale=4,
                    value=HF_TOKEN_FROM_ENV if HF_TOKEN_FROM_ENV else "",
                )
                save_token_btn = gr.Button("πŸ’Ύ Save Token", scale=1)

            token_status = gr.Markdown(
                "βœ… Token loaded from environment" if HF_TOKEN_FROM_ENV else "⏳ Enter your HF token to enable all features"
            )

            save_token_btn.click(
                fn=set_hf_token,
                inputs=[hf_token_input],
                outputs=[token_status],
            )

        # =================================================================
        # STAGE 1: URL Input & Discovery
        # =================================================================

        with gr.Accordion("πŸ“ Stage 1: Website Discovery", open=True):

            gr.Markdown("""
            **Step 1:** Enter your website URL and discover pages.
            The system will automatically find and classify pages for extraction.
            """)

            with gr.Row():
                url_input = gr.Textbox(
                    label="Website URL",
                    placeholder="https://example.com",
                    scale=4,
                )
                discover_btn = gr.Button("πŸ” Discover Pages", variant="primary", scale=1)

            discovery_status = gr.Markdown("")

            # Hidden until discovery succeeds (see the .then() handler below).
            pages_table = gr.Dataframe(
                headers=["Select", "URL", "Title", "Type", "Status"],
                datatype=["bool", "str", "str", "str", "str"],
                interactive=True,
                label="Discovered Pages",
                visible=False,
                col_count=(5, "fixed"),
            )

        # =================================================================
        # STAGE 2: Extraction
        # =================================================================

        with gr.Accordion("πŸ”¬ Stage 2: Token Extraction", open=False):

            gr.Markdown("""
            **Step 2:** Select pages and viewport, then extract design tokens.
            """)

            with gr.Row():
                # Labels must match the exact strings checked in start_extraction.
                viewport_radio = gr.Radio(
                    choices=["Desktop (1440px)", "Mobile (375px)"],
                    value="Desktop (1440px)",
                    label="Viewport",
                )
                extract_btn = gr.Button("πŸš€ Extract Tokens", variant="primary")

            extraction_status = gr.Markdown("")

            with gr.Tabs():
                with gr.Tab("🎨 Colors"):
                    colors_table = gr.Dataframe(
                        headers=["Accept", "Color", "Frequency", "Context", "Contrast (White)", "AA Text", "Confidence"],
                        datatype=["bool", "str", "number", "str", "str", "str", "str"],
                        interactive=True,
                        label="Extracted Colors",
                    )

                with gr.Tab("πŸ“ Typography"):
                    typography_table = gr.Dataframe(
                        headers=["Accept", "Font", "Size", "Weight", "Line Height", "Elements", "Frequency"],
                        datatype=["bool", "str", "str", "number", "str", "str", "number"],
                        interactive=True,
                        label="Extracted Typography",
                    )

                with gr.Tab("πŸ“ Spacing"):
                    spacing_table = gr.Dataframe(
                        headers=["Accept", "Value", "Frequency", "Context", "Fits 8px", "Outlier"],
                        datatype=["bool", "str", "number", "str", "str", "str"],
                        interactive=True,
                        label="Extracted Spacing",
                    )

        # =================================================================
        # STAGE 3: Export
        # =================================================================

        with gr.Accordion("πŸ“¦ Stage 3: Export", open=False):

            gr.Markdown("""
            **Step 3:** Review and export your design tokens.
            """)

            with gr.Row():
                export_btn = gr.Button("πŸ“₯ Export JSON", variant="secondary")

            export_output = gr.Code(
                label="Exported Tokens (JSON)",
                language="json",
                lines=20,
            )

        # =================================================================
        # EVENT HANDLERS
        # =================================================================

        # Discovery: populate the table, then reveal it.
        discover_btn.click(
            fn=discover_site_pages,
            inputs=[url_input],
            outputs=[discovery_status, pages_table],
        ).then(
            fn=lambda: gr.update(visible=True),
            outputs=[pages_table],
        )

        # Extraction
        extract_btn.click(
            fn=start_extraction,
            inputs=[pages_table, viewport_radio],
            outputs=[extraction_status, colors_table, typography_table, spacing_table],
        )

        # Export
        export_btn.click(
            fn=export_tokens_json,
            outputs=[export_output],
        )

        # =================================================================
        # FOOTER
        # =================================================================

        gr.Markdown("""
        ---

        **Design System Extractor v2** | Built with LangGraph + Gradio + HuggingFace

        *A semi-automated co-pilot for design system recovery and modernization.*

        **Models:** Microsoft Phi (Normalizer) β€’ Meta Llama (Advisor) β€’ Mistral Codestral (Generator)
        """)

    return app
471
+
472
+
473
+ # =============================================================================
474
+ # MAIN
475
+ # =============================================================================
476
+
477
if __name__ == "__main__":
    app = create_ui()
    # Bind to all interfaces on port 7860 (the conventional Gradio /
    # HuggingFace Spaces port).
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )