Update app.py

app.py CHANGED
@@ -1,162 +1,154 @@
+import os
 import torch
 import torch.nn as nn
+import numpy as np
 import clip
 import gradio as gr
-import numpy as np
 from PIL import Image
 from huggingface_hub import hf_hub_download
-import os
-
-# ── Labels ─────────────────────────────────────────────────────────────────────
-# Pony V7 captioning uses 9 aesthetic buckets (worst → best)
-LABELS = [
-    "worst quality",
-    "very bad quality",
-    "bad quality",
-    "low quality",
-    "normal quality",
-    "good quality",
-    "high quality",
-    "best quality",
-    "masterpiece",
-]
-
-# Colour gradient: red → yellow → green
-COLOURS = [
-    "#e74c3c", "#e67e22", "#f39c12",
-    "#d4ac0d", "#a9cce3", "#27ae60",
-    "#1e8449", "#148f77", "#0e6655",
-]
 
-# ── Model ──────────────────────────────────────────────────────────────────────
-class AestheticHead(nn.Module):
-
-    def __init__(self, in_features: int = 768, num_classes: int = 9):
+# ── Model - exactly as in the Pony V7 Captioner notebook ───────────────────────
+class AestheticScorer(nn.Module):
+    def __init__(self, input_size: int = 768):
         super().__init__()
-        self.layers = nn.Sequential(
-            nn.Linear(in_features, 1024),
+        self.model = nn.Sequential(
+            nn.Linear(input_size, 1024),
             nn.ReLU(),
-            nn.Dropout(0.
-            nn.Linear(1024,
+            nn.Dropout(0.5),
+            nn.Linear(1024, 512),
             nn.ReLU(),
-            nn.Dropout(0.
-            nn.Linear(
+            nn.Dropout(0.3),
+            nn.Linear(512, 1),
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.layers(x)
+        return self.model(x)
 
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[info] device: {DEVICE}")
 
-
-print("[info] Loading CLIP ViT-L/14 …")
+print("[info] Loading CLIP ViT-L/14 ...")
 clip_model, preprocess = clip.load("ViT-L/14", device=DEVICE)
 clip_model.eval()
 
-
-print("[info] Downloading aesthetic-classifier checkpoint …")
+print("[info] Downloading aesthetic-classifier checkpoint ...")
 ckpt_path = hf_hub_download(
     repo_id="purplesmartai/aesthetic-classifier",
     filename="v2.ckpt",
 )
-
-
-#
-
-# If keys start with 'layers.' it's our AestheticHead; otherwise try to load directly
-if isinstance(state_dict, dict) and not any(k.startswith("layers") for k in state_dict):
-    # Flat state dict → try wrapping in 'layers'
-    new_sd = {"layers." + k if not k.startswith("layers") else k: v for k, v in state_dict.items()}
-    state_dict = new_sd
-
-# Detect input size from first weight tensor
-in_feat = 768  # default ViT-L/14
-for k, v in state_dict.items():
-    if "weight" in k and v.dim() == 2:
-        in_feat = v.shape[1]
-        break
-
-num_classes = len(LABELS)
-model = AestheticHead(in_features=in_feat, num_classes=num_classes).to(DEVICE)
-try:
-    model.load_state_dict(state_dict, strict=True)
-    print("[info] Checkpoint loaded (strict).")
-except RuntimeError:
-    model.load_state_dict(state_dict, strict=False)
-    print("[warn] Checkpoint loaded (non-strict).")
-model.eval()
-
-
-# ── Inference ──────────────────────────────────────────────────────────────────
-@torch.no_grad()
-def classify(image: Image.Image):
-    if image is None:
-        return {}
+checkpoint_data = torch.load(ckpt_path, map_location=DEVICE)
+state_dict = checkpoint_data["state_dict"]
+# Strip the "model." prefix from keys (same as notebook)
+state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}
 
-
-
-
-
+aesthetic_model = AestheticScorer(input_size=768).to(DEVICE)
+aesthetic_model.load_state_dict(state_dict)
+aesthetic_model.eval()
+print("[info] Model ready.")
 
-    # Run head
-    logits = model(features)
-    probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
 
-
-
+# ── Scoring - identical to notebook ────────────────────────────────────────────
+@torch.no_grad()
+def get_score(image: Image.Image) -> float:
+    """Returns raw float score (typically 0-1 range)."""
+    image_tensor = preprocess(image.convert("RGB")).unsqueeze(0).to(DEVICE)
+    features = clip_model.encode_image(image_tensor).cpu().numpy()
+    norm = np.linalg.norm(features, axis=1, keepdims=True)
+    norm[norm == 0] = 1
+    features = features / norm
+    features_t = torch.tensor(features, dtype=torch.float32, device=DEVICE)
+    raw = aesthetic_model(features_t).item()
+    return raw
+
+
+def raw_to_pony(raw: float) -> int:
+    """Convert raw score to pony score_0...score_9 (same formula as notebook)."""
+    return int(max(0.0, min(0.99, raw)) * 10)
+
+
+# ── Colour palette ──────────────────────────────────────────────────────────────
+SCORE_COLOURS = [
+    "#c0392b", "#e74c3c", "#e67e22", "#f39c12", "#d4ac0d",
+    "#27ae60", "#1e8449", "#148f77", "#0e6655", "#0a4f42",
+]
 
-    result = {label: float(prob) for label, prob in zip(LABELS, probs)}
-    return result
 
+def build_html(raw: float) -> str:
+    pony = raw_to_pony(raw)
+    colour = SCORE_COLOURS[pony]
+
+    tiles_html = ""
+    for i in range(10):
+        active = i == pony
+        bg = SCORE_COLOURS[i] if active else "rgba(255,255,255,0.06)"
+        border = f"2px solid {SCORE_COLOURS[i]}" if active else "2px solid transparent"
+        weight = "700" if active else "400"
+        scale = "scale(1.12)" if active else "scale(1)"
+        opac = "1" if active else "0.45"
+        tiles_html += f"""<div style="background:{bg};border:{border};border-radius:8px;
+            padding:10px 0;text-align:center;font-size:.82rem;font-weight:{weight};color:#fff;
+            transform:{scale};opacity:{opac};transition:all .2s;user-select:none;">score_{i}</div>"""
+
+    bar_w = min(raw, 1.0) * 100
+
+    return f"""
+    <div style="font-family:'Inter',sans-serif;padding:8px 0;">
+      <div style="text-align:center;margin-bottom:20px;">
+        <div style="display:inline-block;background:{colour};color:#fff;border-radius:12px;
+            padding:14px 36px;font-size:2rem;font-weight:800;letter-spacing:.04em;
+            box-shadow:0 4px 20px {colour}66;">score_{pony}</div>
+        <div style="color:#aaa;font-size:.85rem;margin-top:8px;">
+          raw score: <code style="color:#ddd">{raw:.4f}</code>
+        </div>
+      </div>
+      <div style="display:grid;grid-template-columns:repeat(10,1fr);gap:6px;margin-bottom:16px;">
+        {tiles_html}
+      </div>
+      <div style="background:rgba(255,255,255,.1);border-radius:6px;height:8px;overflow:hidden;">
+        <div style="width:{bar_w:.1f}%;height:100%;
+            background:linear-gradient(90deg,#c0392b,#f39c12,#27ae60);
+            border-radius:6px;transition:width .4s;"></div>
+      </div>
+      <div style="display:flex;justify-content:space-between;font-size:.72rem;color:#777;margin-top:4px;">
+        <span>score_0</span><span>score_9</span>
+      </div>
+    </div>"""
+
+
+def classify(image):
+    if image is None:
+        return "<p style='color:#888;text-align:center'>Upload an image to score it.</p>"
+    raw = get_score(image)
+    return build_html(raw)
 
-# ── Gradio UI ───────────────────────────────────────────────────────────────────
-EXAMPLES = []
-examples_dir = "examples"
-if os.path.isdir(examples_dir):
-    EXAMPLES = [[os.path.join(examples_dir, f)] for f in os.listdir(examples_dir)
-                if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))]
 
+# ── Gradio UI ───────────────────────────────────────────────────────────────────
 with gr.Blocks(
-    title="Aesthetic Classifier
+    title="Aesthetic Classifier - PurpleSmartAI",
    theme=gr.themes.Soft(primary_hue="purple"),
-    css="""
-    .gradio-container { max-width: 900px !important; margin: auto; }
-    #title { text-align: center; margin-bottom: 0.5rem; }
-    #subtitle { text-align: center; color: #888; margin-bottom: 1.5rem; font-size: 0.95rem; }
-    """,
+    css=".gradio-container{max-width:860px!important;margin:auto} #title{text-align:center} #sub{text-align:center;color:#888;font-size:.9rem;margin-bottom:1.5rem}",
 ) as demo:
-    gr.Markdown("#
+    gr.Markdown("# Aesthetic Classifier", elem_id="title")
     gr.Markdown(
-        "CLIP-
-        "
-        "
-        elem_id="subtitle",
+        "CLIP ViT-L/14 regression model by **PurpleSmartAI** for Pony V7 captioning. "
+        "Outputs a **score_0...score_9** tag used directly in training captions.",
+        elem_id="sub",
     )
-
     with gr.Row():
         with gr.Column(scale=1):
             img_input = gr.Image(type="pil", label="Input Image", height=340)
-            run_btn
-
+            run_btn = gr.Button("Score image", variant="primary", size="lg")
         with gr.Column(scale=1):
-
-            label_output = gr.Label(
-                label="Aesthetic Score Distribution",
+            out_html = gr.HTML(
+                value="<p style='color:#888;text-align:center;padding:40px 0'>Upload an image to see its score.</p>",
             )
-
-    if EXAMPLES:
-        gr.Examples(examples=EXAMPLES, inputs=img_input, label="Example images")
-
     gr.Markdown(
-        "---\n"
-        "**
-        "**Backbone:** OpenAI CLIP ViT-L/14"
+        "---\n**Model:** [`purplesmartai/aesthetic-classifier`](https://huggingface.co/purplesmartai/aesthetic-classifier)"
+        " · **Backbone:** OpenAI CLIP ViT-L/14"
    )
-
-
-    img_input.change(fn=classify, inputs=img_input, outputs=label_output)
+    run_btn.click(fn=classify, inputs=img_input, outputs=out_html)
+    img_input.change(fn=classify, inputs=img_input, outputs=out_html)
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
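For reference, the new `raw_to_pony` bucketing clamps the regressor's raw output to [0.0, 0.99] and scales by 10, so each 0.1-wide slice of the raw score maps to one `score_N` caption tag. A minimal standalone sketch with worked values (the function body is copied from the diff above; the sample inputs are illustrative, not model outputs):

def raw_to_pony(raw: float) -> int:
    # Clamp to [0.0, 0.99] so the result always lands in 0..9,
    # then scale: each 0.1-wide slice of the raw score is one bucket.
    return int(max(0.0, min(0.99, raw)) * 10)

assert raw_to_pony(-0.25) == 0   # below range, clamped up to 0.0   -> score_0
assert raw_to_pony(0.37) == 3    # 0.37 * 10 = 3.7, truncated       -> score_3
assert raw_to_pony(1.80) == 9    # above range, clamped to 0.99     -> score_9
print(f"score_{raw_to_pony(0.37)}")  # score_3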