TangYiJay committed on
Commit 0fe4fae · verified · 1 Parent(s): ee14787
Files changed (1)
  1. app.py +36 -37
app.py CHANGED
@@ -1,63 +1,62 @@
-from transformers import CLIPProcessor, CLIPModel
+import cv2
+import numpy as np
 from PIL import Image
 import gradio as gr
+from transformers import CLIPProcessor, CLIPModel
 import torch
-import numpy as np

+# Load CLIP model for material classification
 MODEL_ID = "openai/clip-vit-base-patch32"
-
-# Load model & processor
 model = CLIPModel.from_pretrained(MODEL_ID)
 processor = CLIPProcessor.from_pretrained(MODEL_ID)
-
-# Candidate material labels
 LABELS = ["plastic", "metal", "paper", "cardboard", "glass", "trash"]

-def get_image_embedding(image):
-    inputs = processor(images=image, return_tensors="pt")
+def get_clip_prediction(crop_img):
+    inputs = processor(text=LABELS, images=crop_img, return_tensors="pt", padding=True)
     with torch.no_grad():
-        embedding = model.get_image_features(**inputs)
-    embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
-    return embedding.cpu().numpy()
+        outputs = model(**inputs)
+    logits_per_image = outputs.logits_per_image
+    probs = logits_per_image.softmax(dim=1).cpu().numpy()
+    best_idx = np.argmax(probs)
+    return LABELS[best_idx], float(probs[0][best_idx])

-def classify_material(base_img, target_img):
+def detect_diff_and_classify(base_img, target_img):
     if base_img is None or target_img is None:
-        return "Please upload both base and target images."
+        return "Please upload both images.", None

-    # Compute embeddings
-    base_emb = get_image_embedding(base_img)
-    target_emb = get_image_embedding(target_img)
+    base_np = np.array(base_img.convert("RGB"))
+    target_np = np.array(target_img.convert("RGB"))

-    # Difference score
-    diff = np.linalg.norm(target_emb - base_emb)
+    diff = cv2.absdiff(base_np, target_np)
+    gray = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
+    _, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY)
+    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

-    # Text embeddings for all labels
-    text_inputs = processor(text=LABELS, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        text_emb = model.get_text_features(**text_inputs)
-    text_emb = text_emb / text_emb.norm(p=2, dim=-1, keepdim=True)
+    if not contours:
+        return "No significant difference detected.", None

-    # Compute similarity with target image
-    img_inputs = processor(images=target_img, return_tensors="pt")
-    with torch.no_grad():
-        img_feat = model.get_image_features(**img_inputs)
-    img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
+    # Largest contour likely new object
+    c = max(contours, key=cv2.contourArea)
+    x, y, w, h = cv2.boundingRect(c)
+    crop = target_np[y:y+h, x:x+w]
+    crop_img = Image.fromarray(crop)

-    sims = torch.matmul(img_feat, text_emb.T).squeeze(0)
-    best_idx = torch.argmax(sims).item()
-    best_label = LABELS[best_idx]
-
-    return f"Detected material: {best_label}\nDifference from base: {diff:.4f}"
+    # Run CLIP classification
+    label, prob = get_clip_prediction(crop_img)
+    return f"Detected material: {label} (confidence {prob:.2f})", crop_img

 demo = gr.Interface(
-    fn=classify_material,
+    fn=detect_diff_and_classify,
     inputs=[
         gr.Image(type="pil", label="Base Image"),
         gr.Image(type="pil", label="Target Image")
     ],
-    outputs=gr.Textbox(label="Detection Result"),
-    title="Material Classification (CLIP, CPU Mode)",
-    description="Upload a base image (background) and a target image (with object). The model detects what new material appears: plastic, metal, paper, cardboard, glass, or trash."
+    outputs=[
+        gr.Textbox(label="Result"),
+        gr.Image(type="pil", label="Detected Object Region")
+    ],
+    title="Automatic Object Detection + Material Classification",
+    description="Detect differences between base and target images, crop the changed region, and classify the material (plastic, metal, paper, cardboard, glass, trash)."
 )

 if __name__ == "__main__":
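
One caveat with the new pipeline: cv2.absdiff expects both arrays to share the same shape, so the app will raise an error if the base and target uploads differ in resolution. Below is a minimal sketch of a guard that could sit at the top of detect_diff_and_classify; the helper name align_for_diff and the cv2.INTER_AREA interpolation choice are assumptions for illustration, not part of this commit.

import cv2
import numpy as np
from PIL import Image

def align_for_diff(base_img, target_img):
    # Convert both uploads to RGB arrays, as the committed code does.
    base_np = np.array(base_img.convert("RGB"))
    target_np = np.array(target_img.convert("RGB"))
    # cv2.absdiff needs identical shapes, so resize the target to match the base.
    if target_np.shape[:2] != base_np.shape[:2]:
        h, w = base_np.shape[:2]
        target_np = cv2.resize(target_np, (w, h), interpolation=cv2.INTER_AREA)
    return base_np, target_np

With such a helper, the two np.array(...) conversions in detect_diff_and_classify could be replaced by base_np, target_np = align_for_diff(base_img, target_img) before the cv2.absdiff call.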