TangYiJay committed on
Commit 366963e · verified · 1 Parent(s): 04ec2fd
Files changed (1)
  1. app.py +47 -24
app.py CHANGED
@@ -1,40 +1,63 @@
- from transformers import AutoProcessor, AutoModelForVision2Seq
  from PIL import Image
  import gradio as gr
  import torch

- MODEL_ID = "HuggingFaceM4/idefics2-8b"

- # Force CPU mode
- device = "cpu"

- # Load the model and processor (disable float16 to avoid CPU errors)
- processor = AutoProcessor.from_pretrained(MODEL_ID)
- model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=torch.float32, device_map=None)
- model.to(device)

- def analyze_images(base_img, target_img, user_prompt):
      if base_img is None or target_img is None:
-         return "Please upload both a base image and a target image."
-
-     images = [base_img, target_img]
-     prompt = f"Ignore the first image (base image). Analyze the second image: {user_prompt}"
-
-     inputs = processor(images=images, text=prompt, return_tensors="pt").to(device)
-     output = model.generate(**inputs, max_new_tokens=200)
-     result = processor.decode(output[0], skip_special_tokens=True)
-     return result

  demo = gr.Interface(
-     fn=analyze_images,
      inputs=[
          gr.Image(type="pil", label="Base Image"),
-         gr.Image(type="pil", label="Target Image"),
-         gr.Textbox(label="Prompt", placeholder="Describe what to analyze...")
      ],
-     outputs=gr.Textbox(label="Model Output"),
-     title="Image Comparison (IDEFICS2-8B, CPU Mode)",
-     description="Upload two images. The model will ignore the base image and analyze the second according to your prompt."
  )

  if __name__ == "__main__":
+ from transformers import CLIPProcessor, CLIPModel
  from PIL import Image
  import gradio as gr
  import torch
+ import numpy as np

+ MODEL_ID = "openai/clip-vit-base-patch32"

+ # Load model & processor
+ model = CLIPModel.from_pretrained(MODEL_ID)
+ processor = CLIPProcessor.from_pretrained(MODEL_ID)

+ # Candidate material labels
+ LABELS = ["plastic", "metal", "paper", "cardboard", "glass", "trash"]

+ def get_image_embedding(image):
+     inputs = processor(images=image, return_tensors="pt")
+     with torch.no_grad():
+         embedding = model.get_image_features(**inputs)
+     embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
+     return embedding.cpu().numpy()
+
+ def classify_material(base_img, target_img):
      if base_img is None or target_img is None:
+         return "Please upload both base and target images."
+
+     # Compute embeddings
+     base_emb = get_image_embedding(base_img)
+     target_emb = get_image_embedding(target_img)
+
+     # Difference score
+     diff = np.linalg.norm(target_emb - base_emb)
+
+     # Text embeddings for all labels
+     text_inputs = processor(text=LABELS, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         text_emb = model.get_text_features(**text_inputs)
+     text_emb = text_emb / text_emb.norm(p=2, dim=-1, keepdim=True)
+
+     # Compute similarity with target image
+     img_inputs = processor(images=target_img, return_tensors="pt")
+     with torch.no_grad():
+         img_feat = model.get_image_features(**img_inputs)
+     img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
+
+     sims = torch.matmul(img_feat, text_emb.T).squeeze(0)
+     best_idx = torch.argmax(sims).item()
+     best_label = LABELS[best_idx]
+
+     return f"Detected material: {best_label}\nDifference from base: {diff:.4f}"

  demo = gr.Interface(
+     fn=classify_material,
      inputs=[
          gr.Image(type="pil", label="Base Image"),
+         gr.Image(type="pil", label="Target Image")
      ],
+     outputs=gr.Textbox(label="Detection Result"),
+     title="Material Classification (CLIP, CPU Mode)",
+     description="Upload a base image (background) and a target image (with object). The model detects what new material appears: plastic, metal, paper, cardboard, glass, or trash."
  )

  if __name__ == "__main__":
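
The added classify_material follows CLIP's standard zero-shot matching pattern: L2-normalize the image and text embeddings, take their dot products, and pick the highest-scoring label. A minimal sketch of the same pattern using the model's combined forward pass is below; the image path is a placeholder for illustration and is not part of the commit.

from PIL import Image
from transformers import CLIPModel, CLIPProcessor
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
labels = ["plastic", "metal", "paper", "cardboard", "glass", "trash"]

image = Image.open("target.jpg")  # placeholder path; any RGB image works
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)
# logits_per_image holds image-text similarity scores; softmax turns them into label probabilities
probs = outputs.logits_per_image.softmax(dim=-1).squeeze(0)
print({label: round(p.item(), 3) for label, p in zip(labels, probs)})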