Spaces:

Zeyad-Alaa
/

UI-analysis

Runtime error

File size: 11,108 Bytes

04c8445

import gradio as gr
import torch
import json
import re
import spaces
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel

# ── Config ───────────────────────────────────────────────────────────────────
# Hub id of the base vision-language checkpoint.
BASE_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
# Directory holding adapter_config.json + adapter_model.safetensors.
ADAPTER_PATH = "./"

# ── Processor: built a single time when the module is imported ───────────────
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# ── Model: base checkpoint first, LoRA adapter layered on top ────────────────
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Wrap the base weights with the adapter and switch to inference mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).eval()

# ── System prompt ─────────────────────────────────────────────────────────────
# Passed as the "system" message on every analyze_ui() call. The JSON keys named
# in this prompt (ui_elements, layout_structure, hierarchy, style, summary) are
# the same keys analyze_ui() reads when it builds the Markdown report, so the
# prompt and the report code must be kept in sync.
# The text is sent to the model verbatim — edit with care.
SYSTEM_PROMPT = """You are an expert UI/UX design analyst. When given a UI screenshot, analyze it thoroughly and return a structured JSON report with exactly these keys:

{
  "ui_elements": [
    {"type": "element type", "label": "text/label if any", "position_hint": "top-left / center / etc."}
  ],
  "layout_structure": "description of overall layout pattern (e.g. sidebar + main, top-nav + grid)",
  "hierarchy": {
    "primary":   ["most prominent / CTA elements"],
    "secondary": ["supporting elements"],
    "tertiary":  ["decorative or minor elements"]
  },
  "style": {
    "color_palette":  ["dominant colors as hex or descriptive names"],
    "typography":     "font style observations",
    "spacing":        "tight / balanced / airy",
    "visual_theme":   "overall aesthetic feel"
  },
  "summary": "one paragraph plain-English summary of the UI"
}

Respond ONLY with valid JSON. No markdown fences, no extra commentary."""


# ── Inference ─────────────────────────────────────────────────────────────────
def _strip_code_fences(text: str) -> str:
    """Drop Markdown ``` / ```json fence markers that open or close a line."""
    without_open = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    return re.sub(r"\s*```$", "", without_open, flags=re.MULTILINE).strip()


def _load_model_json(text: str) -> object:
    """Parse model output as JSON.

    The whole text is tried first. If that fails and the text contains a
    ``{ ... }`` span (JSON surrounded by commentary), the outermost span is
    tried as well.

    Raises:
        json.JSONDecodeError: if neither attempt yields valid JSON.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if not 0 <= start < end:
            raise
        return json.loads(text[start : end + 1])


def _as_text(value: object) -> str:
    """Render a JSON value inline: lists become comma-separated items."""
    if isinstance(value, list):
        # str() each item so numbers / nested values cannot break the join.
        return ", ".join(str(item) for item in value)
    return str(value)


def _render_report(report: dict) -> str:
    """Build the Markdown report from the parsed JSON object.

    Sections whose key is missing or empty are skipped. Values that do not
    have the shape requested in the system prompt (e.g. a string where an
    object was expected) are rendered as plain text instead of raising.
    """
    lines = ["## 📊 UI Analysis Report\n"]

    # Elements
    elements = report.get("ui_elements", [])
    if elements:
        if not isinstance(elements, list):
            elements = [elements]
        lines.append("### 🧩 UI Elements\n")
        for el in elements:
            if isinstance(el, dict):
                etype = el.get("type", "element")
                # Accept alternative key spellings the model may produce.
                label = el.get("label") or el.get("text") or ""
                pos = el.get("position_hint") or el.get("position") or ""
                lines.append(
                    f"- **{etype}**"
                    + (f": {label}" if label else "")
                    + (f" — {pos}" if pos else "")
                )
            else:
                lines.append(f"- {el}")
        lines.append("")

    # Layout
    layout = report.get("layout_structure", "")
    if layout:
        lines.append(f"### 🏗️ Layout\n{layout}\n")

    # Hierarchy
    hierarchy = report.get("hierarchy", {})
    if hierarchy:
        lines.append("### 📐 Hierarchy")
        if isinstance(hierarchy, dict):
            for level, items in hierarchy.items():
                if items:
                    lines.append(f"- **{level.capitalize()}**: {_as_text(items)}")
        else:
            lines.append(f"- {_as_text(hierarchy)}")
        lines.append("")

    # Style
    style = report.get("style", {})
    if style:
        lines.append("### 🎨 Style")
        if isinstance(style, dict):
            for key, val in style.items():
                display_key = key.replace("_", " ").title()
                lines.append(f"- **{display_key}**: {_as_text(val)}")
        else:
            lines.append(f"- {_as_text(style)}")
        lines.append("")

    # Summary
    summary = report.get("summary", "")
    if summary:
        lines.append(f"### 💬 Summary\n{summary}\n")

    return "\n".join(lines)


@spaces.GPU
def analyze_ui(image: Image.Image, extra_prompt: str) -> tuple[str, str]:
    """Analyze a UI screenshot with the model and format the result.

    Args:
        image: The uploaded screenshot, or None when nothing was uploaded.
        extra_prompt: Optional user instruction; a default instruction is
            used when it is empty or only whitespace.

    Returns:
        A ``(markdown_report, json_text)`` pair. When the model output is not
        valid JSON, or is valid JSON but not an object, the report shows the
        raw model text. ``json_text`` then holds either
        ``{"raw_output": ...}`` or the parsed value respectively.
    """
    if image is None:
        return "⚠️ Please upload a UI screenshot first.", "{}"

    user_text = (
        (extra_prompt or "").strip()
        or "Analyze this UI design screenshot in full detail."
    )

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": user_text},
            ],
        },
    ]

    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[text_input],
        images=[image],
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            temperature=None,
            top_p=None,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + completion; keep only the new tokens.
    input_len = inputs["input_ids"].shape[1]
    generated = output_ids[:, input_len:]
    raw_text = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()

    # ── Parse JSON ────────────────────────────────────────────────────────────
    raw_report = f"## 📊 UI Analysis\n\n{raw_text}"
    try:
        parsed = _load_model_json(_strip_code_fences(raw_text))
    except json.JSONDecodeError:
        fallback = json.dumps({"raw_output": raw_text}, indent=2, ensure_ascii=False)
        return raw_report, fallback

    json_text = json.dumps(parsed, indent=2, ensure_ascii=False)
    if not isinstance(parsed, dict):
        # Valid JSON, but not the object the prompt asked for: show it as-is
        # rather than failing on dict-only operations.
        return raw_report, json_text

    return _render_report(parsed), json_text


# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Stylesheet handed to gr.Blocks(css=CSS) below. It defines a dark palette via
# CSS variables in :root and styles the hooks used by the layout: the #header
# block, and the "panel", "run-btn", "markdown-body" and "footer" classes that
# the components set through elem_classes / raw HTML.
# The string is passed through verbatim, so any comment inside it must use CSS
# syntax (/* ... */), not Python syntax.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500&family=Syne:wght@700;800&family=DM+Sans:wght@300;400;500&display=swap');

:root {
    --bg:       #0c0c10;
    --surface:  #14141a;
    --surface2: #1c1c26;
    --border:   #2a2a38;
    --accent:   #7c6aff;
    --accent2:  #ff6a9b;
    --text:     #e8e8f0;
    --muted:    #6b6b80;
    --radius:   12px;
}

body, .gradio-container {
    background: var(--bg) !important;
    font-family: 'DM Sans', sans-serif;
    color: var(--text);
}

#header { text-align: center; padding: 2.5rem 1rem 1.5rem; }
#header h1 {
    font-family: 'Syne', sans-serif; font-size: 2.6rem; font-weight: 800;
    background: linear-gradient(135deg, #7c6aff 0%, #ff6a9b 100%);
    -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0;
}
#header p { color: var(--muted); font-size: 1rem; margin: 0.5rem 0 0; }

.panel {
    background: var(--surface) !important;
    border: 1px solid var(--border) !important;
    border-radius: var(--radius) !important;
}

label {
    font-family: 'DM Mono', monospace !important; font-size: 0.75rem !important;
    color: var(--muted) !important; letter-spacing: 0.05em; text-transform: uppercase;
}

textarea, input[type=text] {
    background: var(--surface2) !important; border: 1px solid var(--border) !important;
    color: var(--text) !important; border-radius: 8px !important;
    font-family: 'DM Sans', sans-serif !important;
}

.run-btn {
    background: linear-gradient(135deg, #7c6aff, #ff6a9b) !important;
    border: none !important; border-radius: 8px !important; color: white !important;
    font-family: 'Syne', sans-serif !important; font-weight: 700 !important;
    font-size: 1rem !important; padding: 0.75rem 2rem !important; cursor: pointer;
}

.tab-nav button {
    font-family: 'DM Mono', monospace !important; font-size: 0.8rem !important;
    color: var(--muted) !important; border-radius: 6px !important;
}
.tab-nav button.selected { color: var(--accent) !important; border-color: var(--accent) !important; }

.markdown-body { font-family: 'DM Sans', sans-serif !important; color: var(--text) !important; }
.markdown-body h2 { font-family: 'Syne', sans-serif; color: var(--accent); }
.markdown-body h3 { font-family: 'Syne', sans-serif; color: var(--accent2); font-size: 1rem; }
.markdown-body code {
    font-family: 'DM Mono', monospace; background: var(--surface2);
    border-radius: 4px; padding: 0.1em 0.4em;
}

.footer {
    text-align: center; color: var(--muted); font-family: 'DM Mono', monospace;
    font-size: 0.72rem; padding: 1.5rem; letter-spacing: 0.04em;
}
"""

# Page layout: header, a two-column row (inputs on the left, results on the
# right) and a footer. `demo` stays a module-level name so it can be imported.
with gr.Blocks(css=CSS, title="UI Design Analyzer · Qwen2.5-VL + LoRA") as demo:

    gr.HTML("""
    <div id="header">
      <h1>UI Design Analyzer</h1>
      <p>Qwen2.5-VL-3B-Instruct + LoRA Adapter &nbsp;·&nbsp; Drop any UI screenshot for instant deep analysis</p>
    </div>
    """)

    with gr.Row():
        # Left column: screenshot upload, optional prompt, run button.
        with gr.Column(scale=1, elem_classes="panel"):
            image_input = gr.Image(
                type="pil",  # analyze_ui expects a PIL image
                label="Upload UI Screenshot",
                height=340,
            )
            prompt_input = gr.Textbox(
                lines=3,
                placeholder="Optional: focus the analysis — e.g. 'Focus on navigation and CTA hierarchy'",
                label="Custom Prompt (optional)",
            )
            run_btn = gr.Button("⚡ Analyze UI", elem_classes="run-btn")

        # Right column: the two views of the same analysis result.
        with gr.Column(scale=1, elem_classes="panel"):
            with gr.Tabs():
                with gr.Tab("📋 Readable Report"):
                    report_out = gr.Markdown(
                        value="*Upload a screenshot and click Analyze UI to begin.*",
                        elem_classes="markdown-body",
                    )
                with gr.Tab("{ } Raw JSON"):
                    json_out = gr.Code(language="json", label="Structured JSON Output")

    # analyze_ui returns (markdown, json_text), matching the outputs order.
    run_btn.click(
        fn=analyze_ui,
        inputs=[image_input, prompt_input],
        outputs=[report_out, json_out],
    )

    gr.HTML(
        '<div class="footer">'
        'BASE: Qwen/Qwen2.5-VL-3B-Instruct &nbsp;·&nbsp; '
        'ADAPTER: LoRA (adapter_model.safetensors) &nbsp;·&nbsp; '
        'HF ZeroGPU'
        '</div>'
    )

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse `demo` or `analyze_ui`) does not launch the app.
if __name__ == "__main__":
    demo.launch()