# UI-analysis / app.py
# Zeyad-Alaa's picture
# Create app.py
# 04c8445 verified
import gradio as gr
import torch
import json
import re
import spaces
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel
# ── Config ───────────────────────────────────────────────────────────────────
# Identifier of the base checkpoint; it is used for both the processor and the
# base model weights below.
BASE_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
ADAPTER_PATH = "./" # adapter_config.json + adapter_model.safetensors are here
# ── Load processor once at startup ───────────────────────────────────────────
# The processor is built from the base checkpoint id, not from ADAPTER_PATH.
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
# ── Load base model, then attach LoRA adapter ────────────────────────────────
# Order matters: the adapter is attached to an already-constructed base model.
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
# `model` (base + adapter) is the object analyze_ui calls generate() on.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
# Evaluation mode: the visible code only ever runs generation with this model.
model.eval()
# ── System prompt ─────────────────────────────────────────────────────────────
# Sent as the "system" message of every request built in analyze_ui. The keys
# named here (ui_elements, layout_structure, hierarchy, style, summary) are the
# same keys analyze_ui reads back when it formats the report, so the two must
# be kept in sync.
SYSTEM_PROMPT = """You are an expert UI/UX design analyst. When given a UI screenshot, analyze it thoroughly and return a structured JSON report with exactly these keys:
{
"ui_elements": [
{"type": "element type", "label": "text/label if any", "position_hint": "top-left / center / etc."}
],
"layout_structure": "description of overall layout pattern (e.g. sidebar + main, top-nav + grid)",
"hierarchy": {
"primary": ["most prominent / CTA elements"],
"secondary": ["supporting elements"],
"tertiary": ["decorative or minor elements"]
},
"style": {
"color_palette": ["dominant colors as hex or descriptive names"],
"typography": "font style observations",
"spacing": "tight / balanced / airy",
"visual_theme": "overall aesthetic feel"
},
"summary": "one paragraph plain-English summary of the UI"
}
Respond ONLY with valid JSON. No markdown fences, no extra commentary."""
# ── Inference ─────────────────────────────────────────────────────────────────
def _parse_model_json(raw_text: str) -> "dict | None":
    """Return the JSON object contained in *raw_text*, or ``None`` if none parses.

    Markdown code fences are stripped first. If the remaining text is not
    itself a JSON object, the span from the first ``{`` to the last ``}`` is
    tried as well, so a reply wrapped in extra commentary is still recovered.
    Valid JSON that is not an object (list, string, number) is rejected,
    because the report renderer needs key lookups.
    """
    cleaned = re.sub(r"^```(?:json)?\s*", "", raw_text, flags=re.MULTILINE)
    cleaned = re.sub(r"\s*```$", "", cleaned, flags=re.MULTILINE).strip()

    candidates = [cleaned]
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        embedded = cleaned[start : end + 1]
        if embedded != cleaned:
            candidates.append(embedded)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _as_text(value) -> str:
    """Render a JSON value inline; list items are stringified and comma-joined."""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    return str(value)


def _render_report(parsed: dict) -> str:
    """Format the parsed analysis object as the Markdown report shown in the UI.

    Every section is optional and tolerant of the model returning a different
    JSON type than the system prompt asked for (e.g. a string where an object
    was expected): such values are printed as plain text instead of raising.
    """
    lines = ["## 📊 UI Analysis Report\n"]

    # Elements
    elements = parsed.get("ui_elements") or []
    if not isinstance(elements, list):
        # A lone string/object would otherwise be iterated char-by-char / key-by-key.
        elements = [elements]
    if elements:
        lines.append("### 🧩 UI Elements\n")
        for el in elements:
            if isinstance(el, dict):
                etype = el.get("type", "element")
                label = el.get("label") or el.get("text") or ""
                pos = el.get("position_hint") or el.get("position") or ""
                bullet = f"- **{etype}**"
                if label:
                    bullet += f": {label}"
                if pos:
                    bullet += f" — {pos}"
                lines.append(bullet)
            else:
                lines.append(f"- {el}")
        lines.append("")

    # Layout
    layout = parsed.get("layout_structure", "")
    if layout:
        lines.append(f"### 🏗️ Layout\n{layout}\n")

    # Hierarchy
    hier = parsed.get("hierarchy")
    if hier:
        lines.append("### 📐 Hierarchy")
        if isinstance(hier, dict):
            for level, items in hier.items():
                if items:  # empty levels are omitted from the report
                    lines.append(f"- **{level.capitalize()}**: {_as_text(items)}")
        else:
            lines.append(_as_text(hier))
        lines.append("")

    # Style
    style = parsed.get("style")
    if style:
        lines.append("### 🎨 Style")
        if isinstance(style, dict):
            for key, val in style.items():
                display_key = key.replace("_", " ").title()
                lines.append(f"- **{display_key}**: {_as_text(val)}")
        else:
            lines.append(_as_text(style))
        lines.append("")

    # Summary
    summary = parsed.get("summary", "")
    if summary:
        lines.append(f"### 💬 Summary\n{summary}\n")

    return "\n".join(lines)


@spaces.GPU
def analyze_ui(image: Image.Image, extra_prompt: str) -> tuple[str, str]:
    """Analyze a UI screenshot and return ``(markdown_report, json_text)``.

    Args:
        image: The uploaded screenshot, or ``None`` when nothing was uploaded.
        extra_prompt: Optional user instruction; when empty or whitespace-only
            a default "analyze in full detail" instruction is used instead.

    Returns:
        A pair of strings. The first is a Markdown report for display; the
        second is pretty-printed JSON. If the model reply does not contain a
        JSON object, the report is the raw reply under a heading and the JSON
        is ``{"raw_output": <reply>}``. If *image* is ``None`` a warning
        message and ``"{}"`` are returned without running the model.
    """
    if image is None:
        return "⚠️ Please upload a UI screenshot first.", "{}"

    user_text = (extra_prompt or "").strip() or (
        "Analyze this UI design screenshot in full detail."
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": user_text},
            ],
        },
    ]
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text_input],
        images=[image],
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            # Sampling knobs are explicitly cleared because decoding is greedy.
            temperature=None,
            top_p=None,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    input_len = inputs["input_ids"].shape[1]
    generated = output_ids[:, input_len:]
    raw_text = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()

    # ── Parse JSON ────────────────────────────────────────────────────────────
    parsed = _parse_model_json(raw_text)
    if parsed is None:
        json_text = json.dumps({"raw_output": raw_text}, indent=2, ensure_ascii=False)
        return f"## 📊 UI Analysis\n\n{raw_text}", json_text

    json_text = json.dumps(parsed, indent=2, ensure_ascii=False)
    return _render_report(parsed), json_text
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Stylesheet handed to gr.Blocks(css=CSS). The :root variables define the dark
# palette reused by every rule. Selectors that pair with names used in the
# layout code: #header and .footer (the two gr.HTML snippets), .panel (both
# columns), .run-btn (the analyze button) and .markdown-body (the report pane).
# NOTE(review): .tab-nav is presumably a class emitted by Gradio's own Tabs
# markup — confirm against the installed Gradio version.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500&family=Syne:wght@700;800&family=DM+Sans:wght@300;400;500&display=swap');
:root {
--bg: #0c0c10;
--surface: #14141a;
--surface2: #1c1c26;
--border: #2a2a38;
--accent: #7c6aff;
--accent2: #ff6a9b;
--text: #e8e8f0;
--muted: #6b6b80;
--radius: 12px;
}
body, .gradio-container {
background: var(--bg) !important;
font-family: 'DM Sans', sans-serif;
color: var(--text);
}
#header { text-align: center; padding: 2.5rem 1rem 1.5rem; }
#header h1 {
font-family: 'Syne', sans-serif; font-size: 2.6rem; font-weight: 800;
background: linear-gradient(135deg, #7c6aff 0%, #ff6a9b 100%);
-webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0;
}
#header p { color: var(--muted); font-size: 1rem; margin: 0.5rem 0 0; }
.panel {
background: var(--surface) !important;
border: 1px solid var(--border) !important;
border-radius: var(--radius) !important;
}
label {
font-family: 'DM Mono', monospace !important; font-size: 0.75rem !important;
color: var(--muted) !important; letter-spacing: 0.05em; text-transform: uppercase;
}
textarea, input[type=text] {
background: var(--surface2) !important; border: 1px solid var(--border) !important;
color: var(--text) !important; border-radius: 8px !important;
font-family: 'DM Sans', sans-serif !important;
}
.run-btn {
background: linear-gradient(135deg, #7c6aff, #ff6a9b) !important;
border: none !important; border-radius: 8px !important; color: white !important;
font-family: 'Syne', sans-serif !important; font-weight: 700 !important;
font-size: 1rem !important; padding: 0.75rem 2rem !important; cursor: pointer;
}
.tab-nav button {
font-family: 'DM Mono', monospace !important; font-size: 0.8rem !important;
color: var(--muted) !important; border-radius: 6px !important;
}
.tab-nav button.selected { color: var(--accent) !important; border-color: var(--accent) !important; }
.markdown-body { font-family: 'DM Sans', sans-serif !important; color: var(--text) !important; }
.markdown-body h2 { font-family: 'Syne', sans-serif; color: var(--accent); }
.markdown-body h3 { font-family: 'Syne', sans-serif; color: var(--accent2); font-size: 1rem; }
.markdown-body code {
font-family: 'DM Mono', monospace; background: var(--surface2);
border-radius: 4px; padding: 0.1em 0.4em;
}
.footer {
text-align: center; color: var(--muted); font-family: 'DM Mono', monospace;
font-size: 0.72rem; padding: 1.5rem; letter-spacing: 0.04em;
}
"""
# Page layout: header, a two-column row (inputs on the left, tabbed outputs on
# the right), the button wiring, and a footer. User-visible strings use the
# real "·", "—" and emoji characters rather than their mis-decoded byte forms.
with gr.Blocks(css=CSS, title="UI Design Analyzer · Qwen2.5-VL + LoRA") as demo:
    gr.HTML("""
<div id="header">
<h1>UI Design Analyzer</h1>
<p>Qwen2.5-VL-3B-Instruct + LoRA Adapter &nbsp;·&nbsp; Drop any UI screenshot for instant deep analysis</p>
</div>
""")
    with gr.Row():
        # Left column: screenshot upload, optional instruction, run button.
        with gr.Column(scale=1, elem_classes="panel"):
            image_input = gr.Image(
                type="pil",  # analyze_ui receives a PIL image (or None)
                label="Upload UI Screenshot",
                height=340,
            )
            prompt_input = gr.Textbox(
                lines=3,
                placeholder="Optional: focus the analysis — e.g. 'Focus on navigation and CTA hierarchy'",
                label="Custom Prompt (optional)",
            )
            run_btn = gr.Button("⚑ Analyze UI", elem_classes="run-btn")
        # Right column: the two outputs of analyze_ui, one per tab.
        with gr.Column(scale=1, elem_classes="panel"):
            with gr.Tabs():
                with gr.Tab("📋 Readable Report"):
                    report_out = gr.Markdown(
                        value="*Upload a screenshot and click Analyze UI to begin.*",
                        elem_classes="markdown-body",
                    )
                with gr.Tab("{ } Raw JSON"):
                    json_out = gr.Code(language="json", label="Structured JSON Output")
    # Output order matches analyze_ui's (markdown_report, json_text) return pair.
    run_btn.click(
        fn=analyze_ui,
        inputs=[image_input, prompt_input],
        outputs=[report_out, json_out],
    )
    gr.HTML(
        '<div class="footer">'
        'BASE: Qwen/Qwen2.5-VL-3B-Instruct &nbsp;·&nbsp; '
        'ADAPTER: LoRA (adapter_model.safetensors) &nbsp;·&nbsp; '
        'HF ZeroGPU'
        '</div>'
    )
demo.launch()