Upload 2 files

- app.py (+162 -0)
- requirements.txt (+10 -0)

app.py (ADDED)
import torch
import torch.nn as nn
import clip
import gradio as gr
from PIL import Image
from huggingface_hub import hf_hub_download
import os

# ── Labels ──────────────────────────────────────────────────────────────────
# Pony V7 captioning uses 9 aesthetic buckets (worst → best)
LABELS = [
    "worst quality",
    "very bad quality",
    "bad quality",
    "low quality",
    "normal quality",
    "good quality",
    "high quality",
    "best quality",
    "masterpiece",
]

# Colour gradient: red → yellow → green (currently unused by the UI)
COLOURS = [
    "#e74c3c", "#e67e22", "#f39c12",
    "#d4ac0d", "#a9cce3", "#27ae60",
    "#1e8449", "#148f77", "#0e6655",
]

# ── Model ───────────────────────────────────────────────────────────────────
class AestheticHead(nn.Module):
    """Small MLP head that sits on top of frozen CLIP image features."""

    def __init__(self, in_features: int = 768, num_classes: int = 9):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[info] device: {DEVICE}")

# Load CLIP backbone
print("[info] Loading CLIP ViT-L/14 …")
clip_model, preprocess = clip.load("ViT-L/14", device=DEVICE)
clip_model.eval()

# Load aesthetic head
print("[info] Downloading aesthetic-classifier checkpoint …")
ckpt_path = hf_hub_download(
    repo_id="purplesmartai/aesthetic-classifier",
    filename="v2.ckpt",
)
state_dict = torch.load(ckpt_path, map_location=DEVICE)

# Auto-detect architecture from checkpoint keys.
# AestheticHead expects keys prefixed with 'layers.'; a flat state dict
# (e.g. '0.weight', '2.weight') is re-keyed to match.
if isinstance(state_dict, dict) and not any(k.startswith("layers") for k in state_dict):
    state_dict = {f"layers.{k}": v for k, v in state_dict.items()}

# Detect input size from the first 2-D weight tensor
in_feat = 768  # default for ViT-L/14
for k, v in state_dict.items():
    if "weight" in k and v.dim() == 2:
        in_feat = v.shape[1]
        break

num_classes = len(LABELS)
model = AestheticHead(in_features=in_feat, num_classes=num_classes).to(DEVICE)
try:
    model.load_state_dict(state_dict, strict=True)
    print("[info] Checkpoint loaded (strict).")
except RuntimeError:
    model.load_state_dict(state_dict, strict=False)
    print("[warn] Checkpoint loaded (non-strict).")
model.eval()


# ── Inference ────────────────────────────────────────────────────────────────
@torch.no_grad()
def classify(image: Image.Image):
    if image is None:
        return {}

    # Preprocess & encode with CLIP, then L2-normalise the embedding
    tensor = preprocess(image).unsqueeze(0).to(DEVICE)
    features = clip_model.encode_image(tensor).float()
    features = features / features.norm(dim=-1, keepdim=True)

    # Run head and map probabilities onto the 9 labels
    logits = model(features)
    probs = torch.softmax(logits, dim=-1)[0].cpu().tolist()
    return {label: float(prob) for label, prob in zip(LABELS, probs)}


# ── Gradio UI ────────────────────────────────────────────────────────────────
EXAMPLES = []
examples_dir = "examples"
if os.path.isdir(examples_dir):
    EXAMPLES = [[os.path.join(examples_dir, f)] for f in os.listdir(examples_dir)
                if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))]

with gr.Blocks(
    title="Aesthetic Classifier – PurpleSmartAI",
    theme=gr.themes.Soft(primary_hue="purple"),
    css="""
    .gradio-container { max-width: 900px !important; margin: auto; }
    #title { text-align: center; margin-bottom: 0.5rem; }
    #subtitle { text-align: center; color: #888; margin-bottom: 1.5rem; font-size: 0.95rem; }
    """,
) as demo:
    gr.Markdown("# 🎨 Aesthetic Classifier", elem_id="title")
    gr.Markdown(
        "CLIP-based aesthetic quality classifier by **PurpleSmartAI** – "
        "originally developed for [Pony V7](https://huggingface.co/purplesmartai/aesthetic-classifier) captioning.\n\n"
        "Upload an image and get a probability distribution across 9 quality tiers.",
        elem_id="subtitle",
    )

    with gr.Row():
        with gr.Column(scale=1):
            img_input = gr.Image(type="pil", label="Input Image", height=340)
            run_btn = gr.Button("✨ Classify", variant="primary", size="lg")

        with gr.Column(scale=1):
            label_output = gr.Label(
                num_top_classes=9,
                label="Aesthetic Score Distribution",
            )

    if EXAMPLES:
        gr.Examples(examples=EXAMPLES, inputs=img_input, label="Example images")

    gr.Markdown(
        "---\n"
        "**Model:** [`purplesmartai/aesthetic-classifier`](https://huggingface.co/purplesmartai/aesthetic-classifier) · "
        "**Backbone:** OpenAI CLIP ViT-L/14"
    )

    run_btn.click(fn=classify, inputs=img_input, outputs=label_output)
    img_input.change(fn=classify, inputs=img_input, outputs=label_output)

if __name__ == "__main__":
    demo.launch()
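
For reference, a minimal sketch of scoring an image without the Gradio UI – assuming the file above is saved as app.py and `sample.jpg` is a placeholder path (importing app triggers the CLIP and checkpoint downloads; the Blocks UI is built but not launched):

from PIL import Image
import app  # loads CLIP and the aesthetic head at import time

scores = app.classify(Image.open("sample.jpg"))
print(max(scores, key=scores.get))  # top bucket, e.g. "high quality"
print(scores)                       # full 9-bucket distribution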
requirements.txt (ADDED)
gradio>=4.0.0
torch>=2.0.0
torchvision>=0.15.0
ftfy
regex
tqdm
git+https://github.com/openai/CLIP.git
huggingface_hub>=0.20.0
Pillow>=9.0.0
numpy>=1.24.0
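
To run the Space locally: `pip install -r requirements.txt`, then `python app.py`. Note that OpenAI's CLIP is not published on PyPI, so it is installed straight from the GitHub repository; `ftfy`, `regex` and `tqdm` are its runtime dependencies, listed explicitly here.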