from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import gradio as gr
import torch
import numpy as np

MODEL_ID = "openai/clip-vit-base-patch32"

# Load model & processor
model = CLIPModel.from_pretrained(MODEL_ID)
processor = CLIPProcessor.from_pretrained(MODEL_ID)

# Candidate material labels
LABELS = ["plastic", "metal", "paper", "cardboard", "glass", "trash"]

def get_image_embedding(image):
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
    return embedding.cpu().numpy()
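
# Shape note: for openai/clip-vit-base-patch32, get_image_embedding returns a
# (1, 512) L2-normalized array. Illustrative check only (hypothetical input):
#   get_image_embedding(Image.new("RGB", (224, 224))).shape  # -> (1, 512)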

def classify_material(base_img, target_img):
    if base_img is None or target_img is None:
        return "Please upload both base and target images."

    # Compute embeddings
    base_emb = get_image_embedding(base_img)
    target_emb = get_image_embedding(target_img)

    # Difference score
    diff = np.linalg.norm(target_emb - base_emb)

    # Text embeddings for all labels
    text_inputs = processor(text=LABELS, return_tensors="pt", padding=True)
    with torch.no_grad():
        text_emb = model.get_text_features(**text_inputs)
        text_emb = text_emb / text_emb.norm(p=2, dim=-1, keepdim=True)

    # Compute similarity with the target image, reusing the normalized
    # embedding computed above instead of re-running the image encoder
    sims = torch.matmul(torch.from_numpy(target_emb), text_emb.T).squeeze(0)
    best_idx = torch.argmax(sims).item()
    best_label = LABELS[best_idx]

    return f"Detected material: {best_label}\nDifference from base: {diff:.4f}"

demo = gr.Interface(
    fn=classify_material,
    inputs=[
        gr.Image(type="pil", label="Base Image"),
        gr.Image(type="pil", label="Target Image")
    ],
    outputs=gr.Textbox(label="Detection Result"),
    title="Material Classification (CLIP, CPU Mode)",
    description="Upload a base image (background) and a target image (with object). The model detects what new material appears: plastic, metal, paper, cardboard, glass, or trash."
)

if __name__ == "__main__":
    demo.launch()
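
# Optional: programmatic usage sketch (not part of the app). Assuming the app is
# running locally on the default port and a recent `gradio_client` is installed,
# the endpoint can be called roughly like this (file paths are placeholders):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       handle_file("base.jpg"),     # Base Image
#       handle_file("target.jpg"),   # Target Image
#       api_name="/predict",
#   )
#   print(result)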