"""Gradio app: UI-screenshot analysis with Qwen2.5-VL and a LoRA adapter."""

import json
import re
from typing import Optional

import gradio as gr
import spaces
import torch
from peft import PeftModel
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# ── Config ───────────────────────────────────────────────────────────────────
BASE_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
ADAPTER_PATH = "./"  # adapter_config.json + adapter_model.safetensors are here

# ── Load processor once at startup ───────────────────────────────────────────
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# ── Load base model, then attach LoRA adapter ────────────────────────────────
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

# ── System prompt ─────────────────────────────────────────────────────────────
# Adjacent literals concatenate into one string. Whitespace inside the literals
# (including the single embedded newline) is part of the prompt text.
SYSTEM_PROMPT = (
    "You are an expert UI/UX design analyst. "
    "When given a UI screenshot, analyze it thoroughly and return a structured "
    "JSON report with exactly these keys: "
    '{ "ui_elements": [ '
    '{"type": "element type", "label": "text/label if any", '
    '"position_hint": "top-left / center / etc."} ], '
    '"layout_structure": "description of overall layout pattern '
    '(e.g. sidebar + main, top-nav + grid)", '
    '"hierarchy": { '
    '"primary": ["most prominent / CTA elements"], '
    '"secondary": ["supporting elements"], '
    '"tertiary": ["decorative or minor elements"] }, '
    '"style": { '
    '"color_palette": ["dominant colors as hex or descriptive names"], '
    '"typography": "font style observations", '
    '"spacing": "tight / balanced / airy", '
    '"visual_theme": "overall aesthetic feel" }, '
    '"summary": "one paragraph plain-English summary of the UI" } '
    "Respond ONLY with valid JSON. \n"
    "No markdown fences, no extra commentary."
)

# Prompt used when the caller supplies no extra instructions.
_DEFAULT_USER_TEXT = "Analyze this UI design screenshot in full detail."

# Markdown code-fence markers, compiled once instead of on every request.
_FENCE_OPEN_RE = re.compile(r"^```(?:json)?\s*", re.MULTILINE)
_FENCE_CLOSE_RE = re.compile(r"\s*```$", re.MULTILINE)


# ── Output parsing / rendering helpers ────────────────────────────────────────
def _strip_code_fences(text: str) -> str:
    """Return *text* with ``` / ```json fence markers removed and trimmed."""
    without_open = _FENCE_OPEN_RE.sub("", text)
    return _FENCE_CLOSE_RE.sub("", without_open).strip()


def _parse_report(raw_text: str) -> Optional[dict]:
    """Return the JSON object found in *raw_text*, or ``None`` if there is none.

    Only a JSON *object* counts: a reply that parses to a list, string or
    number is rejected so the caller can fall back to showing the raw text
    instead of crashing on ``.get``.
    """
    cleaned = _strip_code_fences(raw_text)
    candidates = [cleaned]

    # Best effort: if the object is surrounded by extra prose, retry on the
    # span between the first "{" and the last "}".
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        candidates.append(cleaned[start : end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _as_list(value: object) -> list:
    """Return *value* as a list: lists pass through, falsy -> [], else [value]."""
    if isinstance(value, list):
        return value
    return [value] if value else []


def _join_items(value: object) -> str:
    """Render a list as a comma-separated string; anything else via ``str``."""
    if isinstance(value, list):
        # str() each item: the model may emit numbers or nested objects.
        return ", ".join(str(item) for item in value)
    return str(value)


def _render_report(report: dict) -> str:
    """Build the Markdown report for a parsed analysis object.

    Sections whose key is missing or empty are omitted. Values with an
    unexpected type are rendered as plain text rather than raising.
    """
    lines = ["## 📊 UI Analysis Report\n"]

    # Elements
    elements = _as_list(report.get("ui_elements"))
    if elements:
        lines.append("### 🧩 UI Elements\n")
        for element in elements:
            if not isinstance(element, dict):
                lines.append(f"- {element}")
                continue
            kind = element.get("type", "element")
            # Accept the alternative key names "text" / "position" as well.
            label = element.get("label") or element.get("text") or ""
            position = element.get("position_hint") or element.get("position") or ""
            bullet = f"- **{kind}**"
            if label:
                bullet += f": {label}"
            if position:
                bullet += f" — {position}"
            lines.append(bullet)
        lines.append("")

    # Layout
    layout = report.get("layout_structure", "")
    if layout:
        lines.append(f"### 🏗️ Layout\n{layout}\n")

    # Hierarchy
    hierarchy = report.get("hierarchy", {})
    if hierarchy:
        lines.append("### 📐 Hierarchy")
        if isinstance(hierarchy, dict):
            for level, items in hierarchy.items():
                if items:
                    lines.append(f"- **{level.capitalize()}**: {_join_items(items)}")
        else:
            lines.append(f"- {_join_items(hierarchy)}")
        lines.append("")

    # Style
    style = report.get("style", {})
    if style:
        lines.append("### 🎨 Style")
        if isinstance(style, dict):
            for key, value in style.items():
                display_key = key.replace("_", " ").title()
                lines.append(f"- **{display_key}**: {_join_items(value)}")
        else:
            lines.append(f"- {_join_items(style)}")
        lines.append("")

    # Summary
    summary = report.get("summary", "")
    if summary:
        lines.append(f"### 💬 Summary\n{summary}\n")

    return "\n".join(lines)


# ── Inference ─────────────────────────────────────────────────────────────────
@spaces.GPU
def analyze_ui(image: Image.Image, extra_prompt: str) -> tuple[str, str]:
    """Analyze a UI screenshot and return a Markdown report plus JSON text.

    Args:
        image: Screenshot to analyze. ``None`` yields a warning message
            instead of running the model.
        extra_prompt: Optional user instructions. When empty, blank or
            ``None``, a default "analyze in full detail" prompt is used.

    Returns:
        ``(markdown, json_text)``. If the model reply contains a JSON object,
        ``markdown`` is a sectioned report and ``json_text`` is that object
        pretty-printed. Otherwise ``markdown`` shows the raw reply and
        ``json_text`` is ``{"raw_output": <reply>}``.

    Errors raised by the processor or by ``model.generate`` are not caught
    here and propagate to the caller.
    """
    if image is None:
        return "⚠️ Please upload a UI screenshot first.", "{}"

    user_text = (extra_prompt or "").strip() or _DEFAULT_USER_TEXT

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": user_text},
            ],
        },
    ]

    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text_input],
        images=[image],
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            # Sampling knobs are explicitly unset alongside do_sample=False
            # (presumably to avoid warnings about unused sampling settings).
            temperature=None,
            top_p=None,
            repetition_penalty=1.1,
        )

    # Slice off the prompt tokens so only the newly generated text is decoded.
    input_len = inputs["input_ids"].shape[1]
    generated = output_ids[:, input_len:]
    raw_text = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()

    # ── Parse JSON ────────────────────────────────────────────────────────────
    report = _parse_report(raw_text)
    if report is None:
        # ensure_ascii=False matches the success path below, so non-ASCII text
        # is shown as-is in both cases.
        fallback_json = json.dumps(
            {"raw_output": raw_text}, indent=2, ensure_ascii=False
        )
        return f"## 📊 UI Analysis\n\n{raw_text}", fallback_json

    json_text = json.dumps(report, indent=2, ensure_ascii=False)
    return _render_report(report), json_text


# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Stylesheet string for the Gradio UI. Adjacent literals concatenate into one
# string; the whitespace inside them is kept exactly.
CSS = (
    " @import url('https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500"
    "&family=Syne:wght@700;800&family=DM+Sans:wght@300;400;500&display=swap'); "
    ":root { --bg: #0c0c10; --surface: #14141a; --surface2: #1c1c26; "
    "--border: #2a2a38; --accent: #7c6aff; --accent2: #ff6a9b; "
    "--text: #e8e8f0; --muted: #6b6b80; --radius: 12px; } "
    "body, .gradio-container { background: var(--bg) !important; "
    "font-family: 'DM Sans', sans-serif; color: var(--text); } "
    "#header { text-align: center; padding: 2.5rem 1rem 1.5rem; } "
    "#header h1 { font-family: 'Syne', sans-serif; font-size: 2.6rem; "
    "font-weight: 800; "
    "background: linear-gradient(135deg, #7c6aff 0%, #ff6a9b 100%); "
    "-webkit-background-clip: text; -webkit-text-fill-color: transparent; "
    "margin: 0; } "
    "#header p { color: var(--muted); font-size: 1rem; margin: 0.5rem 0 0; } "
    ".panel { background: var(--surface) !important; "
    "border: 1px solid var(--border) !important; "
    "border-radius: var(--radius) !important; } "
    "label \n"
    "{ font-family: 'DM Mono', monospace !important; "
    "font-size: 0.75rem !important; color: var(--muted) !important; "
    "letter-spacing: 0.05em; text-transform: uppercase; } "
    "textarea, input[type=text] { background: var(--surface2) !important; "
    "border: 1px solid var(--border) !important; "
    "color: var(--text) !important; border-radius: 8px !important; "
    "font-family: 'DM Sans', sans-serif !important; } "
    ".run-btn { background: linear-gradient(135deg, #7c6aff, #ff6a9b) !important; "
    "border: none !important; border-radius: 8px !important; "
    "color: white !important; font-family: 'Syne', sans-serif !important; "
    "font-weight: 700 !important; font-size: 1rem !important; "
    "padding: 0.75rem 2rem !important; cursor: pointer; } "
    ".tab-nav button { font-family: 'DM Mono', monospace !important; "
    "font-size: 0.8rem !important; color: var(--muted) !important; "
    "border-radius: 6px !important; } "
    ".tab-nav button.selected { color: var(--accent) !important; "
    "border-color: var(--accent) !important; } "
    ".markdown-body { font-family: 'DM Sans', sans-serif !important; "
    "color: var(--text) !important; } "
    ".markdown-body h2 { font-family: 'Syne', sans-serif; color: var(--accent); } "
    ".markdown-body h3 { font-family: 'Syne', sans-serif; "
    "color: var(--accent2); font-size: 1rem; } "
    ".markdown-body code { font-family: 'DM Mono', monospace; "
    "background: var(--surface2); border-radius: 4px; padding: 0.1em 0.4em; } "
    ".footer { text-align: center; color: var(--muted); "
    "font-family: 'DM Mono', monospace; font-size: 0.72rem; padding: 1.5rem; "
    "letter-spacing: 0.04em; } "
)
Qwen2.5-VL-3B-Instruct + LoRA Adapter · Drop any UI screenshot for instant deep analysis