Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import json | |
| import re | |
| import spaces | |
| from PIL import Image | |
| from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration | |
| from peft import PeftModel | |
# -- Config ------------------------------------------------------------------
# Hub id of the base vision-language checkpoint.
BASE_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
# Directory that holds adapter_config.json and adapter_model.safetensors.
ADAPTER_PATH = "./"

# -- Processor: created once at import time and shared by every request ------
processor = AutoProcessor.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
)

# -- Base model first; the LoRA adapter is then attached on top of it --------
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
# The app only runs inference, so switch the wrapped model to evaluation mode.
model.eval()
# -- System prompt -----------------------------------------------------------
# Sent as the "system" turn of every request built in analyze_ui. It spells
# out the JSON schema (ui_elements, layout_structure, hierarchy, style,
# summary) whose keys analyze_ui later reads back out of the model's reply,
# so the key names here and the lookups there must stay in sync.
SYSTEM_PROMPT = """You are an expert UI/UX design analyst. When given a UI screenshot, analyze it thoroughly and return a structured JSON report with exactly these keys:
{
"ui_elements": [
{"type": "element type", "label": "text/label if any", "position_hint": "top-left / center / etc."}
],
"layout_structure": "description of overall layout pattern (e.g. sidebar + main, top-nav + grid)",
"hierarchy": {
"primary": ["most prominent / CTA elements"],
"secondary": ["supporting elements"],
"tertiary": ["decorative or minor elements"]
},
"style": {
"color_palette": ["dominant colors as hex or descriptive names"],
"typography": "font style observations",
"spacing": "tight / balanced / airy",
"visual_theme": "overall aesthetic feel"
},
"summary": "one paragraph plain-English summary of the UI"
}
Respond ONLY with valid JSON. No markdown fences, no extra commentary."""
# -- Inference ---------------------------------------------------------------
def _strip_code_fences(text: str) -> str:
    """Remove Markdown code-fence lines (``` or ```json) from model output."""
    without_open = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
    return re.sub(r"\s*```$", "", without_open, flags=re.MULTILINE).strip()


def _parse_json_object(text: str) -> dict:
    """Decode *text* into a JSON object, tolerating prose around the object.

    Raises:
        ValueError: if nothing decodable is found, or if the decoded value is
            valid JSON but not an object (``json.JSONDecodeError`` is a
            subclass of ``ValueError``, so one ``except`` covers both).
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The model sometimes adds commentary despite the prompt; retry on the
        # outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start : end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("model output is valid JSON but not a JSON object")
    return parsed


def _join_items(value) -> str:
    """Render a JSON value as text; lists become a comma-separated string."""
    if isinstance(value, list):
        # str() each item: the model may emit numbers or nested objects.
        return ", ".join(str(item) for item in value)
    return str(value)


def _render_report(parsed: dict) -> str:
    """Build the Markdown report from the decoded analysis object.

    Every section is optional and every value is type-checked before use,
    because the shape of the model's JSON is not guaranteed.
    """
    # NOTE(review): the emoji prefixes in the headings below look mis-encoded
    # in this copy of the file; the literals are kept verbatim. Confirm them
    # against the original source.
    lines = ["## π UI Analysis Report\n"]

    # Elements
    elements = parsed.get("ui_elements") or []
    if not isinstance(elements, list):
        elements = [elements]
    if elements:
        lines.append("### π§© UI Elements\n")
        for el in elements:
            if isinstance(el, dict):
                etype = el.get("type", "element")
                label = el.get("label") or el.get("text") or ""
                pos = el.get("position_hint") or el.get("position") or ""
                lines.append(
                    f"- **{etype}**"
                    + (f": {label}" if label else "")
                    + (f" β {pos}" if pos else "")
                )
            else:
                lines.append(f"- {el}")
        lines.append("")

    # Layout
    layout = parsed.get("layout_structure", "")
    if layout:
        lines.append(f"### ποΈ Layout\n{layout}\n")

    # Hierarchy
    hierarchy = parsed.get("hierarchy")
    if hierarchy:
        lines.append("### π Hierarchy")
        if isinstance(hierarchy, dict):
            for level, items in hierarchy.items():
                if items:
                    lines.append(f"- **{level.capitalize()}**: {_join_items(items)}")
        else:
            lines.append(f"- {_join_items(hierarchy)}")
        lines.append("")

    # Style
    style = parsed.get("style")
    if style:
        lines.append("### π¨ Style")
        if isinstance(style, dict):
            for key, val in style.items():
                display_key = key.replace("_", " ").title()
                lines.append(f"- **{display_key}**: {_join_items(val)}")
        else:
            lines.append(f"- {_join_items(style)}")
        lines.append("")

    # Summary
    summary = parsed.get("summary", "")
    if summary:
        lines.append(f"### π¬ Summary\n{summary}\n")

    return "\n".join(lines)


# ZeroGPU Spaces attach a GPU only while a function decorated with
# @spaces.GPU is running; outside ZeroGPU the decorator has no effect.
@spaces.GPU
def analyze_ui(image: Image.Image, extra_prompt: str) -> tuple[str, str]:
    """Analyze a UI screenshot with the LoRA-adapted model.

    Args:
        image: Screenshot to analyze; ``None`` when nothing was uploaded.
        extra_prompt: Optional user instruction. When blank, a default
            "analyze in full detail" request is used instead.

    Returns:
        A ``(markdown_report, json_text)`` pair. If the model's reply cannot
        be decoded as a JSON object, the report contains the raw reply and
        ``json_text`` wraps it under a ``"raw_output"`` key.
    """
    if image is None:
        return "β οΈ Please upload a UI screenshot first.", "{}"

    user_text = (
        (extra_prompt or "").strip()
        or "Analyze this UI design screenshot in full detail."
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": user_text},
            ],
        },
    ]
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[text_input],
        images=[image],
        return_tensors="pt",
        padding=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,  # greedy decoding
            temperature=None,
            top_p=None,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    input_len = inputs["input_ids"].shape[1]
    generated = output_ids[:, input_len:]
    raw_text = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()

    # -- Parse JSON ----------------------------------------------------------
    try:
        parsed = _parse_json_object(_strip_code_fences(raw_text))
    except ValueError:
        # Not a usable JSON object: surface the raw reply instead of failing.
        json_text = json.dumps({"raw_output": raw_text}, indent=2, ensure_ascii=False)
        return f"## π UI Analysis\n\n{raw_text}", json_text

    json_text = json.dumps(parsed, indent=2, ensure_ascii=False)
    return _render_report(parsed), json_text
# -- Gradio UI ---------------------------------------------------------------
# Custom stylesheet handed to gr.Blocks(css=CSS). It defines a dark palette
# through CSS variables in :root and styles the hooks the layout attaches to
# its components: the #header banner, and the .panel, .run-btn,
# .markdown-body and .footer classes.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Mono:wght@400;500&family=Syne:wght@700;800&family=DM+Sans:wght@300;400;500&display=swap');
:root {
--bg: #0c0c10;
--surface: #14141a;
--surface2: #1c1c26;
--border: #2a2a38;
--accent: #7c6aff;
--accent2: #ff6a9b;
--text: #e8e8f0;
--muted: #6b6b80;
--radius: 12px;
}
body, .gradio-container {
background: var(--bg) !important;
font-family: 'DM Sans', sans-serif;
color: var(--text);
}
#header { text-align: center; padding: 2.5rem 1rem 1.5rem; }
#header h1 {
font-family: 'Syne', sans-serif; font-size: 2.6rem; font-weight: 800;
background: linear-gradient(135deg, #7c6aff 0%, #ff6a9b 100%);
-webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0;
}
#header p { color: var(--muted); font-size: 1rem; margin: 0.5rem 0 0; }
.panel {
background: var(--surface) !important;
border: 1px solid var(--border) !important;
border-radius: var(--radius) !important;
}
label {
font-family: 'DM Mono', monospace !important; font-size: 0.75rem !important;
color: var(--muted) !important; letter-spacing: 0.05em; text-transform: uppercase;
}
textarea, input[type=text] {
background: var(--surface2) !important; border: 1px solid var(--border) !important;
color: var(--text) !important; border-radius: 8px !important;
font-family: 'DM Sans', sans-serif !important;
}
.run-btn {
background: linear-gradient(135deg, #7c6aff, #ff6a9b) !important;
border: none !important; border-radius: 8px !important; color: white !important;
font-family: 'Syne', sans-serif !important; font-weight: 700 !important;
font-size: 1rem !important; padding: 0.75rem 2rem !important; cursor: pointer;
}
.tab-nav button {
font-family: 'DM Mono', monospace !important; font-size: 0.8rem !important;
color: var(--muted) !important; border-radius: 6px !important;
}
.tab-nav button.selected { color: var(--accent) !important; border-color: var(--accent) !important; }
.markdown-body { font-family: 'DM Sans', sans-serif !important; color: var(--text) !important; }
.markdown-body h2 { font-family: 'Syne', sans-serif; color: var(--accent); }
.markdown-body h3 { font-family: 'Syne', sans-serif; color: var(--accent2); font-size: 1rem; }
.markdown-body code {
font-family: 'DM Mono', monospace; background: var(--surface2);
border-radius: 4px; padding: 0.1em 0.4em;
}
.footer {
text-align: center; color: var(--muted); font-family: 'DM Mono', monospace;
font-size: 0.72rem; padding: 1.5rem; letter-spacing: 0.04em;
}
"""
# Page assembly: header banner, a two-column row (inputs on the left, results
# on the right), the button wiring, and a footer. Components are laid out in
# the order they are created inside the Blocks context.
with gr.Blocks(title="UI Design Analyzer Β· Qwen2.5-VL + LoRA", css=CSS) as demo:
    gr.HTML("""
<div id="header">
<h1>UI Design Analyzer</h1>
<p>Qwen2.5-VL-3B-Instruct + LoRA Adapter Β· Drop any UI screenshot for instant deep analysis</p>
</div>
""")

    with gr.Row():
        # Left panel: screenshot upload, optional steering prompt, run button.
        with gr.Column(scale=1, elem_classes="panel"):
            image_input = gr.Image(type="pil", label="Upload UI Screenshot", height=340)
            prompt_input = gr.Textbox(
                label="Custom Prompt (optional)",
                placeholder="Optional: focus the analysis β e.g. 'Focus on navigation and CTA hierarchy'",
                lines=3,
            )
            run_btn = gr.Button("β‘ Analyze UI", elem_classes="run-btn")

        # Right panel: the same result shown as Markdown and as raw JSON.
        with gr.Column(scale=1, elem_classes="panel"), gr.Tabs():
            with gr.Tab("π Readable Report"):
                report_out = gr.Markdown(
                    value="*Upload a screenshot and click Analyze UI to begin.*",
                    elem_classes="markdown-body",
                )
            with gr.Tab("{ } Raw JSON"):
                json_out = gr.Code(label="Structured JSON Output", language="json")

    # analyze_ui returns (markdown, json_text), matching the two outputs.
    run_btn.click(
        fn=analyze_ui,
        inputs=[image_input, prompt_input],
        outputs=[report_out, json_out],
    )

    gr.HTML(
        '<div class="footer">BASE: Qwen/Qwen2.5-VL-3B-Instruct Β· ADAPTER: LoRA (adapter_model.safetensors) Β· HF ZeroGPU</div>'
    )

demo.launch()