# Hugging Face Space: SciX2050/visionquery (status: Sleeping) — file size: 12,373 bytes

"""
VisionQuery — Zero-Shot Image Understanding with SigLIP
Built with Taipy GUI | Deployed on Hugging Face Spaces
"""

import os
import torch
import numpy as np
import pandas as pd
from PIL import Image
import plotly.graph_objects as go
from taipy.gui import Gui, notify, State

# ═══════════════════════════════════════════════════════════════════════════════
# MODEL  (loaded lazily on first inference)
# ═══════════════════════════════════════════════════════════════════════════════
# Module-wide cache for the SigLIP processor and model; both stay None until
# _load_siglip() has been called once.
_processor = None
_model = None


def _load_siglip():
    """Return the cached ``(processor, model)`` pair, creating it on first use.

    The first call imports ``transformers``, builds the processor and model
    for ``google/siglip-base-patch16-224`` with ``from_pretrained`` and puts
    the model in eval mode. Later calls return the objects stored in the
    module-level ``_processor`` / ``_model`` globals.
    """
    global _processor, _model

    # Only the model global is used as the "already loaded" marker.
    if _model is not None:
        return _processor, _model

    # Deferred import: transformers is only imported once a model is needed.
    from transformers import AutoModel, AutoProcessor

    checkpoint = "google/siglip-base-patch16-224"
    _processor = AutoProcessor.from_pretrained(checkpoint)
    _model = AutoModel.from_pretrained(checkpoint)
    _model.eval()
    return _processor, _model


# ═══════════════════════════════════════════════════════════════════════════════
# HELPERS
# ═══════════════════════════════════════════════════════════════════════════════
def _empty_chart(msg="Upload an image and click Analyze to see results"):
    """Build a placeholder Plotly figure that shows ``msg`` and nothing else.

    Args:
        msg: Text of the single annotation placed at the centre of the figure.

    Returns:
        A ``go.Figure`` with both axes hidden, transparent backgrounds, a
        height of 300 and one arrow-less annotation.
    """
    transparent = "rgba(0,0,0,0)"
    hidden_axis = dict(visible=False)
    caption_font = dict(size=14, color="#94a3b8")

    placeholder = go.Figure()

    # (0.5, 0.5) in "paper" coordinates is the middle of the figure area.
    placeholder.add_annotation(
        text=msg,
        showarrow=False,
        font=caption_font,
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
    )
    placeholder.update_layout(
        height=300,
        margin=dict(l=10, r=10, t=10, b=10),
        xaxis=hidden_axis,
        yaxis=hidden_axis,
        paper_bgcolor=transparent,
        plot_bgcolor=transparent,
    )
    return placeholder


def _make_bar_chart(labels, scores):
    """Build a horizontal Plotly bar chart of per-label scores.

    Args:
        labels: Label strings, one per bar, in display order (top to bottom).
        scores: Numeric scores aligned with ``labels``. They are divided by
            100 to derive bar opacity and rendered with a ``%`` suffix, so
            they are presumably percentages in the 0–100 range.

    Returns:
        A ``go.Figure`` containing a single horizontal ``go.Bar`` trace. An
        empty ``scores`` sequence yields an empty chart instead of raising.
    """
    n = len(labels)

    # Bar opacity follows the score, clamped to [0.20, 1.0] so weak matches
    # stay visible and the alpha channel never exceeds its valid maximum.
    alphas = [min(1.0, max(0.20, s / 100)) for s in scores]
    colors = [f"rgba(99,102,241,{a:.2f})" for a in alphas]

    # Leave 35 % headroom for the outside text labels, capped at 100.
    # `default=0` keeps empty input from raising; when the best score is 0
    # the scaled bound would collapse the axis to [0, 0], so fall back to the
    # full 0–100 scale in that case.
    peak = max(scores, default=0)
    x_upper = min(100, peak * 1.35) if peak > 0 else 100

    fig = go.Figure(go.Bar(
        x=scores,
        y=labels,
        orientation="h",
        marker=dict(color=colors, line=dict(width=0)),
        text=[f"  {s:.1f}%" for s in scores],
        textposition="outside",
    ))
    fig.update_layout(
        title=dict(
            text="SigLIP Similarity Scores",
            font=dict(size=18, color="#312e81"),
            x=0.02,
        ),
        xaxis=dict(
            title="Score (%)",
            range=[0, x_upper],
            gridcolor="#e2e8f0",
        ),
        # Reversed so the first label ends up at the top of the chart.
        yaxis=dict(autorange="reversed", gridcolor="#e2e8f0"),
        # Grow with the number of bars, but never shrink below 320 px.
        height=max(320, n * 52 + 100),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(248,250,252,1)",
        font=dict(color="#1e293b", size=13),
        margin=dict(l=10, r=100, t=60, b=40),
        hoverlabel=dict(bgcolor="#312e81", font_color="white"),
    )
    return fig


# ═══════════════════════════════════════════════════════════════════════════════
# GLOBAL STATE
# ═══════════════════════════════════════════════════════════════════════════════
# Initial values of the variables the Taipy pages bind to.
uploaded_image = None        # bound to file_selector
display_image  = None        # bound to <image>
text_input     = (
    "a cat, a dog, a car, a person walking, "
    "a sunset, a building, a flower, an animal"
)
# Bound to the chart control, which reads the "Score (%)" and "Label"
# columns; the empty frame must use the same column names that analyze()
# and reset() write, otherwise the chart's x column is missing on first load.
chart_data     = pd.DataFrame({"Label": [], "Score (%)": []})
chart_empty    = True        # flipped to False by analyze(), back to True by reset()
score_df       = pd.DataFrame(columns=["Rank", "Label", "Score (%)"])
status_msg     = "Upload an image and click **Analyze** to begin."
top_label      = ""
top_score      = 0.0
has_results    = False       # gates rendering of the winner card and score table
is_analyzing   = False       # disables the Analyze button while inference runs
model_status   = "⏳ Model loads on first inference (~15-30 s)"


# ═══════════════════════════════════════════════════════════════════════════════
# CALLBACKS
# ═══════════════════════════════════════════════════════════════════════════════
def on_file_upload(state: State, var_name: str, value):
    """Show the freshly uploaded image and clear any previous results.

    Does nothing when ``state.uploaded_image`` is empty. Otherwise the upload
    is copied to ``display_image`` and the chart data, score table, headline
    label/score and result flag are reset so that output from an earlier
    image is not shown next to the new one.

    Args:
        state: Taipy session state of the client that triggered the action.
        var_name: Unused; kept for the callback signature.
        value: Unused; kept for the callback signature.
    """
    if not state.uploaded_image:
        return

    state.display_image = state.uploaded_image
    state.status_msg    = "✅ Image ready — click **Analyze** to run SigLIP."
    state.has_results   = False
    # Clear the variables the page actually binds to (`chart_data`), using
    # the same empty values as reset(); there is no `chart_fig` state variable.
    state.chart_data    = pd.DataFrame({"Label": [], "Score (%)": []})
    state.chart_empty   = True
    state.score_df      = pd.DataFrame(columns=["Rank", "Label", "Score (%)"])
    state.top_label     = ""
    state.top_score     = 0.0
    notify(state, "success", "Image uploaded successfully!")


def analyze(state: State):
    """Score the displayed image against the user's comma-separated labels.

    Warns and returns early when no image is displayed or no non-blank label
    was entered. Otherwise it loads SigLIP via ``_load_siglip()``, runs one
    forward pass, converts the per-label logits to sigmoid percentages and
    writes the ranked results to the chart data, score table, headline
    label/score and status message. Any exception raised during loading or
    inference is reported through ``status_msg`` and an error notification
    rather than propagated; ``is_analyzing`` is always cleared at the end.

    Args:
        state: Taipy session state of the client that triggered the action.
    """
    if not state.display_image:
        notify(state, "warning", "Please upload an image first.")
        return

    label_list = [part.strip() for part in state.text_input.split(",") if part.strip()]
    if not label_list:
        notify(state, "warning", "Enter at least one comma-separated label.")
        return

    state.is_analyzing = True
    state.status_msg   = "🔄 Loading SigLIP model & running inference…"

    try:
        proc, mdl          = _load_siglip()
        state.model_status = "✅ google/siglip-base-patch16-224 — ready"

        # convert() returns an independent image, so the file handle opened
        # by Image.open can be closed as soon as the conversion is done.
        with Image.open(state.display_image) as source:
            img = source.convert("RGB")

        with torch.no_grad():
            inputs = proc(
                text=label_list,
                images=img,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
            )
            logits = mdl(**inputs).logits_per_image   # shape: (1, N)
            # Row 0 is the single input image; indexing keeps a 1-D result
            # even when there is only one label.
            probs  = torch.sigmoid(logits)[0].tolist()

        # Rank labels from highest to lowest probability.
        pairs  = sorted(zip(label_list, probs), key=lambda x: x[1], reverse=True)
        labels = [p[0] for p in pairs]
        scores = [round(p[1] * 100, 2) for p in pairs]

        state.top_label  = labels[0]
        state.top_score  = scores[0]
        state.chart_data = pd.DataFrame({"Label": labels, "Score (%)": scores})
        state.chart_empty = False
        state.score_df = pd.DataFrame({
            "Rank":      list(range(1, len(labels) + 1)),
            "Label":     labels,
            "Score (%)": [f"{s:.2f}" for s in scores],  # ← string, never blank
        })
        state.has_results = True
        state.status_msg  = f"✅ Top match: **{labels[0]}** ({scores[0]:.1f}%)"
        notify(state, "success", "Analysis complete!")

    except Exception as exc:
        # Callback boundary: surface the failure in the UI instead of raising.
        state.status_msg = f"❌ Error: {exc}"
        notify(state, "error", str(exc))
    finally:
        state.is_analyzing = False


def reset(state: State):
    """Put the upload, chart, table and headline fields back to blank values.

    Args:
        state: Taipy session state of the client that triggered the action.
    """
    # Insertion order of this dict is the order the fields are assigned in.
    blank_values = {
        "uploaded_image": None,
        "display_image": None,
        "chart_data": pd.DataFrame({"Label": [], "Score (%)": []}),
        "chart_empty": True,
        "score_df": pd.DataFrame(columns=["Rank", "Label", "Score (%)"]),
        "top_label": "",
        "top_score": 0.0,
        "has_results": False,
        "status_msg": "Upload a new image and click Analyze.",
    }
    for field, blank in blank_values.items():
        setattr(state, field, blank)

# ═══════════════════════════════════════════════════════════════════════════════
# PAGE — DEMO
# ═══════════════════════════════════════════════════════════════════════════════
# Taipy Markdown for the main page: a header followed by a two-column layout.
# Left panel: file selector, image preview, label input, Analyze / Reset
# buttons and the status / model-status texts. Right panel: the winner card
# and score table (both rendered only while `has_results` is true) around the
# bar chart bound to `chart_data`.
demo_md = """
<|part|class_name=page-header|
# 🔍 VisionQuery 
### Zero-Shot Image Classification powered by Google SigLIP + Taipy
|>

<|layout|columns=5 7|gap=2.5rem|class_name=main-layout|

<|part|class_name=panel card|

#### Step 1 — Upload Image

<|{uploaded_image}|file_selector|label=📂 Choose Image…|extensions=.jpg,.jpeg,.png,.webp|drop_message=Drop image here|on_action=on_file_upload|class_name=upload-btn|>

<|{display_image}|image|width=100%|class_name=preview-img|>

---

#### Step 2 — Enter Text Labels
*Comma-separated concepts to test against the image:*

<|{text_input}|input|multiline|rows=5|class_name=fullwidth label-input|>

<|🚀 Analyze Image|button|on_action=analyze|active={not is_analyzing}|class_name=plain analyze-btn|>
<| ↺ Reset|button|on_action=reset|class_name=reset-btn|>

---

<|{status_msg}|text|class_name=status-text|>

<|{model_status}|text|class_name=model-tag|>
|>

<|part|class_name=panel card|

#### Results

<|part|render={has_results}|class_name=winner-card|
<|layout|columns=1 1|gap=1rem|
<|part|
🏆 **Best Match**

<|{top_label}|text|class_name=winner-label|>
|>
<|part|
📊 **Confidence**

<|{top_score:.1f}|text|class_name=winner-score|>%
|>
|>
|>

<|{chart_data}|chart|type=bar|x=Score (%)|y=Label|orientation=h|title=SigLIP Similarity Scores|height=350px|>

<|part|render={has_results}|class_name=score-table|
**Detailed Scores:**
<|{score_df}|table|width=100%|page_size=10|>
|>

|>

|>
"""


# ═══════════════════════════════════════════════════════════════════════════════
# PAGE — ABOUT
# ═══════════════════════════════════════════════════════════════════════════════
# Taipy Markdown for the About page: a header, a two-column problem/solution
# layout, the tech-stack summary and the SigLIP citation. Every `<|...|`
# block opener must be matched by exactly one `|>` closer.
about_md = """
<|part|class_name=page-header|
# 🧠 About VisionQuery 
### Problem · Solution · Technology Stack
|>

<|layout|columns=1 1|gap=2rem|

<|part|class_name=card problem-card|
## 🔴 The Problem

Traditional image classification requires:

- **Thousands of labeled images** per category
- **Expensive GPU training** pipelines
- **Re-training** whenever you add a new category
- **Domain expertise** to build & maintain

This makes vision AI **slow, costly, and inflexible** for real-world deployment.
|>

<|part|class_name=card solution-card|
## 🟢 The Solution

**VisionQuery AI** uses **SigLIP** by Google DeepMind for **zero-shot classification**:

- Describe what you're looking for in **plain English**
- No training data required — ever
- Add **unlimited new categories** instantly
- Works in **100+ languages** (multilingual SigLIP)
|>

|>

---

### 🛠️ Tech Stack

**Model Layer**
🤗 `google/siglip-base-patch16-224`
PyTorch + Transformers

**GUI Layer**
Taipy — Python-native reactive GUI
Plotly interactive charts

**Deployment**
Hugging Face Spaces (Docker)

---

## 📄 Citation

> Zhai, X. et al. (2023). *Sigmoid Loss for Language Image Pre-Training.*
> Google DeepMind. arXiv:2303.15343
"""


# ═══════════════════════════════════════════════════════════════════════════════
# RUN
# ═══════════════════════════════════════════════════════════════════════════════
# Page name -> Markdown definition handed to Taipy.
# NOTE(review): Taipy appears to treat "/" as the application's root page,
# whose content may be rendered on every page — confirm that showing the demo
# above "About" is intended, and that navigation between the two exists.
pages = {"/": demo_md, "About": about_md}

gui = Gui(pages=pages, css_file="style.css")

if __name__ == "__main__":
    # Listen on the port given by the PORT environment variable, or 7860.
    port = int(os.environ.get("PORT", 7860))

    run_options = dict(
        host="0.0.0.0",
        port=port,
        title="VisionQuery AI — SigLIP",
        favicon="🔍",
        use_reloader=False,
        dark_mode=False,
    )
    gui.run(**run_options)