Spaces:

wueesnin
/

image_comparison

Sleeping

App Files Files Community

wueesnin committed on Apr 14

Commit

4011ff2

verified ·

1 Parent(s): 80987aa

Updated app to anime images

Browse files

Files changed (1) hide show

app.py +119 -70

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict, Tuple
 import gradio as gr
 import torch
@@ -11,7 +11,6 @@ from transformers import (
     CLIPProcessor,
 )
-# Optional OpenAI client. The app still works without it.
 try:
     from openai import OpenAI
 except Exception:
@@ -21,35 +20,53 @@ except Exception:
 # =========================================================
 # Configuration
 # =========================================================
-# Replace these labels with your final dataset classes.
 CLASS_LABELS: List[str] = [
-    "sphynx",
-    "russian blue",
-    "maine coon",
-    "ragdoll",
-    "bengal",
-    "singapura",
-    "calico cat"
 ]
 # Your fine-tuned Hugging Face image classification model.
-# Example: "your-username/cat-vs-wild-animal-vit"
-CUSTOM_MODEL_ID = os.getenv("CUSTOM_MODEL_ID", "your-username/your-model-name")
-# Open-source comparison model.
 CLIP_MODEL_ID = os.getenv("CLIP_MODEL_ID", "openai/clip-vit-base-patch32")
-# Example images shown in Gradio. Add real files before deployment.
 EXAMPLE_IMAGES = [
-    ["example_images/sphynx.jpg"],
-    ["example_images/russian-blue.jpg"],
-    ["example_images/maine-coon.jpg"],
-    ["example_images/ragdoll.jpg"],
-    ["example_images/bengal.jpg"],
-    ["example_images/singapura.jpg"],
-    ["example_images/calico.jpg"],
 ]
 # =========================================================
 # Model loading
@@ -97,27 +114,44 @@ load_clip_model()
 # =========================================================
 def ensure_rgb(image: Image.Image) -> Image.Image:
     if image.mode != "RGB":
-        image = image.convert("RGB")
     return image
 def format_topk(predictions: List[Tuple[str, float]]) -> str:
-    lines = []
-    for rank, (label, score) in enumerate(predictions, start=1):
-        lines.append(f"{rank}. {label} ({score:.4f})")
-    return "\n".join(lines)
 def predict_custom_model(image: Image.Image, top_k: int = 3) -> Tuple[str, Dict[str, float]]:
     if custom_model is None or custom_processor is None:
-        message = (
-            "Custom model could not be loaded.\n\n"
-            f"Model ID: {CUSTOM_MODEL_ID}\n"
-            f"Error: {custom_model_error}"
         )
-        return message, {}
     image = ensure_rgb(image)
     inputs = custom_processor(images=image, return_tensors="pt")
@@ -127,33 +161,36 @@ def predict_custom_model(image: Image.Image, top_k: int = 3) -> Tuple[str, Dict[
         outputs = custom_model(**inputs)
         probs = torch.softmax(outputs.logits, dim=-1)[0]
-    id2label = custom_model.config.id2label
     top_indices = torch.topk(probs, k=min(top_k, probs.shape[0])).indices.tolist()
-    top_preds = []
-    label_scores = {}
     for idx in top_indices:
-        label = id2label.get(idx, str(idx))
-        score = probs[idx].item()
         top_preds.append((label, score))
-        label_scores[label] = score
-    return format_topk(top_preds), label_scores
 def predict_clip(image: Image.Image, class_labels: List[str], top_k: int = 3) -> Tuple[str, Dict[str, float]]:
     if clip_model is None or clip_processor is None:
-        message = (
-            "CLIP model could not be loaded.\n\n"
-            f"Model ID: {CLIP_MODEL_ID}\n"
-            f"Error: {clip_model_error}"
         )
-        return message, {}
     image = ensure_rgb(image)
-    prompts = [f"a photo of a {label}" for label in class_labels]
     inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True)
     inputs = {k: v.to(device) for k, v in inputs.items()}
@@ -162,12 +199,12 @@ def predict_clip(image: Image.Image, class_labels: List[str], top_k: int = 3) ->
         logits = outputs.logits_per_image[0]
         probs = torch.softmax(logits, dim=-1)
-    pairs = [(label, probs[i].item()) for i, label in enumerate(class_labels)]
     pairs.sort(key=lambda x: x[1], reverse=True)
     top_preds = pairs[:top_k]
-    label_scores = {label: score for label, score in pairs}
-    return format_topk(top_preds), label_scores
@@ -177,36 +214,51 @@ def predict_openai(image: Image.Image, class_labels: List[str]) -> str:
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
-        return "OPENAI_API_KEY is not set. The app can still run without the OpenAI comparison."
     try:
-        client = OpenAI(api_key=api_key)
-        # Convert image to bytes for upload.
         import io
         buffer = io.BytesIO()
         ensure_rgb(image).save(buffer, format="JPEG")
-        buffer.seek(0)
-        uploaded = client.files.create(file=("image.jpg", buffer.getvalue(), "image/jpeg"), purpose="vision")
         prompt = (
-            "You are an image classifier. "
-            "Choose exactly one label from this label set: "
-            f"{', '.join(class_labels)}. "
-            "Return a short answer with this structure only: "
-            "label: <chosen label>\\nreason: <very short reason>."
         )
         response = client.responses.create(
-            model="gpt-4.1-mini",
             input=[
                 {
                     "role": "user",
                     "content": [
                         {"type": "input_text", "text": prompt},
-                        {"type": "input_image", "file_id": uploaded.id},
                     ],
                 }
             ],
@@ -217,31 +269,28 @@ def predict_openai(image: Image.Image, class_labels: List[str]) -> str:
-def compare_models(image: Image.Image) -> Tuple[str, Dict[str, float], str, Dict[str, float], str]:
     if image is None:
-        return "Please upload an image.", {}, "Please upload an image.", {}, "Please upload an image."
     custom_text, custom_scores = predict_custom_model(image)
     clip_text, clip_scores = predict_clip(image, CLASS_LABELS)
     openai_text = predict_openai(image, CLASS_LABELS)
     return custom_text, custom_scores, clip_text, clip_scores, openai_text
-# =========================================================
-# UI
-# =========================================================
 DESCRIPTION = """
-Upload an image and compare three approaches:
 1. Fine-tuned transfer learning model
 2. Zero-shot CLIP
 3. OpenAI vision model
-This version focuses only on cat breed classification.
 """
 with gr.Blocks() as demo:
-    gr.Markdown("# Cat Breed Classifier")
     gr.Markdown(DESCRIPTION)
     with gr.Row():

 import os
+from typing import Dict, List, Tuple
 import gradio as gr
 import torch
     CLIPProcessor,
 )
 try:
     from openai import OpenAI
 except Exception:
 # =========================================================
 # Configuration
 # =========================================================
 CLASS_LABELS: List[str] = [
+    "cherry",
+    "sakura",
+    "naruto",
+    "eren",
+    "kirito",
+    "doraemon",
+    "asuna",
+    "totoro",
+    "chihiro",
 ]
 # Your fine-tuned Hugging Face image classification model.
+CUSTOM_MODEL_ID = os.getenv("CUSTOM_MODEL_ID", "wueesnin/image_comparison")
+# Open-source comparison model (openai)
 CLIP_MODEL_ID = os.getenv("CLIP_MODEL_ID", "openai/clip-vit-base-patch32")
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")
+# Example anime images :3
 EXAMPLE_IMAGES = [
+    ["example_images/eren.JPG"],
+    ["example_images/mikasa.JPG"],
+    ["example_images/naruto.webp"],
+    ["example_images/sakura.webp"],
+    ["example_images/cherry.webp"],
+    ["example_images/kirito.webp"],
+    ["example_images/doraemon.webp"],
+    ["example_images/luffy.webp"],
+    ["example_images/asuna.webp"],
+    ["example_images/totoro.webp"],
+    ["example_images/chihiro.webp"],
 ]
+# Better prompt wording for CLIP / OpenAI.
+LABEL_DESCRIPTIONS: Dict[str, str] = {
+    "eren": "Eren Yeager from Attack on Titan",
+    "mikasa": "Mikasa Ackerman from Attack on Titan",
+    "totoro": "Totoro from My Neighbor Totoro",
+    "sakura": "Sakura Haruno from Naruto",
+    "naruto": "Naruto Uzumaki from Naruto",
+    "cherry": "Cherry Magic",
+    "kirito": "Kirito from Sword Art Online",
+    "doraemon": "Doraemon",
+    "asuna": "Asuna Yuuki from Sword Art Online",
+    "chihiro": "Chihiro Ogino from Spirited Away",
+}
 # =========================================================
 # Model loading
 # =========================================================
 def ensure_rgb(image: Image.Image) -> Image.Image:
     if image.mode != "RGB":
+        return image.convert("RGB")
     return image
 def format_topk(predictions: List[Tuple[str, float]]) -> str:
+    return "\n".join(
+        f"{rank}. {label} ({score:.4f})"
+        for rank, (label, score) in enumerate(predictions, start=1)
+    )
+def normalize_model_label(label: str) -> str:
+    return str(label).strip().lower().replace("_", " ")
+def build_clip_prompts(class_labels: List[str]) -> List[str]:
+    return [
+        f"anime character, {LABEL_DESCRIPTIONS.get(label, label)}"
+        for label in class_labels
+    ]
 def predict_custom_model(image: Image.Image, top_k: int = 3) -> Tuple[str, Dict[str, float]]:
     if custom_model is None or custom_processor is None:
+        return (
+            "Custom model could not be loaded.\n"
+            f"Model ID: {CUSTOM_MODEL_ID}\n"
+            f"Error: {custom_model_error}",
+            {},
         )
     image = ensure_rgb(image)
     inputs = custom_processor(images=image, return_tensors="pt")
         outputs = custom_model(**inputs)
         probs = torch.softmax(outputs.logits, dim=-1)[0]
+    id2label = getattr(custom_model.config, "id2label", {})
     top_indices = torch.topk(probs, k=min(top_k, probs.shape[0])).indices.tolist()
+    top_preds: List[Tuple[str, float]] = []
+    score_map: Dict[str, float] = {}
     for idx in top_indices:
+        raw_label = id2label.get(idx, str(idx))
+        label = normalize_model_label(raw_label)
+        score = float(probs[idx].item())
         top_preds.append((label, score))
+        score_map[label] = score
+    return format_topk(top_preds), score_map
 def predict_clip(image: Image.Image, class_labels: List[str], top_k: int = 3) -> Tuple[str, Dict[str, float]]:
     if clip_model is None or clip_processor is None:
+        return (
+            "CLIP model could not be loaded.\n"
+            f"Model ID: {CLIP_MODEL_ID}\n"
+            f"Error: {clip_model_error}",
+            {},
         )
     image = ensure_rgb(image)
+    prompts = build_clip_prompts(class_labels)
     inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True)
     inputs = {k: v.to(device) for k, v in inputs.items()}
         logits = outputs.logits_per_image[0]
         probs = torch.softmax(logits, dim=-1)
+    pairs = [(class_labels[i], float(probs[i].item())) for i in range(len(class_labels))]
     pairs.sort(key=lambda x: x[1], reverse=True)
     top_preds = pairs[:top_k]
+    score_map = {label: score for label, score in pairs}
+    return format_topk(top_preds), score_map
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
+        return "OPENAI_API_KEY is not set. The app still works for the custom model and CLIP."
     try:
+        import base64
         import io
         buffer = io.BytesIO()
         ensure_rgb(image).save(buffer, format="JPEG")
+        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
+        client = OpenAI(api_key=api_key)
+        allowed_labels = ", ".join(class_labels)
+        descriptions = "\n".join(
+            f"- {label}: {LABEL_DESCRIPTIONS.get(label, label)}" for label in class_labels
+        )
         prompt = (
+            "Classify this anime image. Choose exactly one label from this list: "
+            f"{allowed_labels}.\n"
+            "Label meanings:\n"
+            f"{descriptions}\n"
+            "Return exactly this format:\n"
+            "label: <one label from the list>\n"
+            "reason: <short reason>"
         )
         response = client.responses.create(
+            model=OPENAI_MODEL,
             input=[
                 {
                     "role": "user",
                     "content": [
                         {"type": "input_text", "text": prompt},
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/jpeg;base64,{encoded}",
+                        },
                     ],
                 }
             ],
+def compare_models(image: Image.Image):
     if image is None:
+        msg = "Please upload or select an example image."
+        return msg, {}, msg, {}, msg
     custom_text, custom_scores = predict_custom_model(image)
     clip_text, clip_scores = predict_clip(image, CLASS_LABELS)
     openai_text = predict_openai(image, CLASS_LABELS)
     return custom_text, custom_scores, clip_text, clip_scores, openai_text
 DESCRIPTION = """
+Upload an anime image and compare three approaches:
 1. Fine-tuned transfer learning model
 2. Zero-shot CLIP
 3. OpenAI vision model
+This version uses 9 fixed character labels.
 """
 with gr.Blocks() as demo:
+    gr.Markdown("# Anime Character Classifier")
     gr.Markdown(DESCRIPTION)
     with gr.Row():