TangYiJay committed on
Commit 0fe4fae · verified · 1 Parent(s): ee14787
Files changed (1)
  1. app.py +36 -37
app.py CHANGED
@@ -1,63 +1,62 @@
-from transformers import CLIPProcessor, CLIPModel
+import cv2
+import numpy as np
 from PIL import Image
 import gradio as gr
+from transformers import CLIPProcessor, CLIPModel
 import torch
-import numpy as np

+# Load CLIP model for material classification
 MODEL_ID = "openai/clip-vit-base-patch32"
-
-# Load model & processor
 model = CLIPModel.from_pretrained(MODEL_ID)
 processor = CLIPProcessor.from_pretrained(MODEL_ID)
-
-# Candidate material labels
 LABELS = ["plastic", "metal", "paper", "cardboard", "glass", "trash"]

-def get_image_embedding(image):
-    inputs = processor(images=image, return_tensors="pt")
+def get_clip_prediction(crop_img):
+    inputs = processor(text=LABELS, images=crop_img, return_tensors="pt", padding=True)
     with torch.no_grad():
-        embedding = model.get_image_features(**inputs)
-    embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
-    return embedding.cpu().numpy()
+        outputs = model(**inputs)
+    logits_per_image = outputs.logits_per_image
+    probs = logits_per_image.softmax(dim=1).cpu().numpy()
+    best_idx = np.argmax(probs)
+    return LABELS[best_idx], float(probs[0][best_idx])

-def classify_material(base_img, target_img):
+def detect_diff_and_classify(base_img, target_img):
     if base_img is None or target_img is None:
-        return "Please upload both base and target images."
+        return "Please upload both images.", None

-    # Compute embeddings
-    base_emb = get_image_embedding(base_img)
-    target_emb = get_image_embedding(target_img)
+    base_np = np.array(base_img.convert("RGB"))
+    target_np = np.array(target_img.convert("RGB"))

-    # Difference score
-    diff = np.linalg.norm(target_emb - base_emb)
+    diff = cv2.absdiff(base_np, target_np)
+    gray = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
+    _, thresh = cv2.threshold(gray, 30, 255, cv2.THRESH_BINARY)
+    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

-    # Text embeddings for all labels
-    text_inputs = processor(text=LABELS, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        text_emb = model.get_text_features(**text_inputs)
-    text_emb = text_emb / text_emb.norm(p=2, dim=-1, keepdim=True)
+    if not contours:
+        return "No significant difference detected.", None

-    # Compute similarity with target image
-    img_inputs = processor(images=target_img, return_tensors="pt")
-    with torch.no_grad():
-        img_feat = model.get_image_features(**img_inputs)
-    img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
+    # Largest contour likely new object
+    c = max(contours, key=cv2.contourArea)
+    x, y, w, h = cv2.boundingRect(c)
+    crop = target_np[y:y+h, x:x+w]
+    crop_img = Image.fromarray(crop)

-    sims = torch.matmul(img_feat, text_emb.T).squeeze(0)
-    best_idx = torch.argmax(sims).item()
-    best_label = LABELS[best_idx]
-
-    return f"Detected material: {best_label}\nDifference from base: {diff:.4f}"
+    # Run CLIP classification
+    label, prob = get_clip_prediction(crop_img)
+    return f"Detected material: {label} (confidence {prob:.2f})", crop_img

 demo = gr.Interface(
-    fn=classify_material,
+    fn=detect_diff_and_classify,
     inputs=[
         gr.Image(type="pil", label="Base Image"),
         gr.Image(type="pil", label="Target Image")
     ],
-    outputs=gr.Textbox(label="Detection Result"),
-    title="Material Classification (CLIP, CPU Mode)",
-    description="Upload a base image (background) and a target image (with object). The model detects what new material appears: plastic, metal, paper, cardboard, glass, or trash."
+    outputs=[
+        gr.Textbox(label="Result"),
+        gr.Image(type="pil", label="Detected Object Region")
+    ],
+    title="Automatic Object Detection + Material Classification",
+    description="Detect differences between base and target images, crop the changed region, and classify the material (plastic, metal, paper, cardboard, glass, trash)."
 )

 if __name__ == "__main__":
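
One caveat with the new pipeline: cv2.absdiff expects both arrays to share the same shape, so the app will raise an error if the base and target uploads differ in resolution. Below is a minimal sketch of a guard that could sit at the top of detect_diff_and_classify; the helper name align_for_diff and the cv2.INTER_AREA interpolation choice are assumptions for illustration, not part of this commit.

import cv2
import numpy as np
from PIL import Image

def align_for_diff(base_img, target_img):
    # Convert both uploads to RGB arrays, as the committed code does.
    base_np = np.array(base_img.convert("RGB"))
    target_np = np.array(target_img.convert("RGB"))
    # cv2.absdiff needs identical shapes, so resize the target to match the base.
    if target_np.shape[:2] != base_np.shape[:2]:
        h, w = base_np.shape[:2]
        target_np = cv2.resize(target_np, (w, h), interpolation=cv2.INTER_AREA)
    return base_np, target_np

With such a helper, the two np.array(...) conversions in detect_diff_and_classify could be replaced by base_np, target_np = align_for_diff(base_img, target_img) before the cv2.absdiff call.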