# Hayloo9838/uno-recognizer
import math

import cv2
import numpy as np
import torch
from PIL import Image
from torch import nn
from transformers import CLIPProcessor, CLIPVisionModel
# Checkpoint file read by load_model(). The path is relative, so it is
# resolved against the current working directory.
MODEL_PATH = "clip_large.pth"
class CLIPVisionClassifier(nn.Module):
    """CLIP vision encoder followed by a bias-free linear classification head.

    Args:
        num_labels: Number of output classes, i.e. the width of the logits.
    """

    def __init__(self, num_labels: int):
        super().__init__()
        # The "eager" attention implementation is requested for the heat-map
        # display (the original note on this argument reads "shows heat").
        self.vision_model = CLIPVisionModel.from_pretrained(
            'openai/clip-vit-large-patch14',
            attn_implementation="eager",
        )
        self.classifier = nn.Linear(
            self.vision_model.config.hidden_size, num_labels, bias=False
        )
        # NOTE(review): this dropout layer is constructed but never applied in
        # forward(). It is kept so the attribute stays available; confirm
        # against the training code whether it should be used.
        self.dropout = nn.Dropout(0.1)

    def forward(self, pixel_values: torch.Tensor, output_attentions: bool = False):
        """Classify a batch of preprocessed images.

        Args:
            pixel_values: Image tensor forwarded unchanged to the vision model.
            output_attentions: When true, also return the attention weights
                reported by the vision model.

        Returns:
            ``logits`` alone, or the tuple ``(logits, attentions)`` when
            ``output_attentions`` is true.
        """
        encoded = self.vision_model(pixel_values, output_attentions=output_attentions)
        logits = self.classifier(encoded.pooler_output)
        if output_attentions:
            return logits, encoded.attentions
        return logits
def get_attention_map(attentions, image_size=(224, 224)):
    """Convert the last layer's attention weights into a 2-D map in [0, 1].

    Args:
        attentions: Sequence of per-layer attention tensors; only the last
            entry is used. Assumed to be laid out as
            (batch, heads, tokens, tokens) -- TODO confirm against the model.
        image_size: Target size handed to ``cv2.resize`` as ``dsize``, which
            OpenCV interprets as (width, height).

    Returns:
        A float32 ``numpy`` array of the requested size, min-max scaled to
        [0, 1]. If the map is constant (or contains NaN) an all-zero map is
        returned instead of dividing by zero.
    """
    # Average over dim 1 (presumably the heads), then, for the first batch
    # item, keep the row of token 0 (presumably the class token) restricted
    # to the remaining tokens.
    averaged = attentions[-1].mean(dim=1)
    patch_scores = averaged[0, 0, 1:]

    # The patch tokens are assumed to form a square grid; reshape() raises if
    # the count is not a perfect square.
    side = math.isqrt(patch_scores.shape[0])
    grid = patch_scores.reshape(side, side)

    # cv2.resize cannot handle half-precision input, so detach and cast to
    # float32 before leaving torch.
    grid = grid.detach().float().cpu().numpy()
    grid = cv2.resize(grid, image_size, interpolation=cv2.INTER_LINEAR)

    lowest = grid.min()
    span = grid.max() - lowest
    if not span > 0:  # constant map, or NaN somewhere in the input
        return np.zeros_like(grid)
    return (grid - lowest) / span
def apply_heatmap(image, attention_map, new_size=(640, 480)):
    """Blend a JET-coloured attention heat map over an image.

    Args:
        image: Either a ``numpy`` image array (used as is) or a ``PIL`` image,
            which is converted to an array and from RGB to BGR channel order.
        attention_map: 2-D float array of attention scores.
        new_size: Output size handed to ``cv2.resize`` as ``dsize``, which
            OpenCV interprets as (width, height).

    Returns:
        The resized image blended with the colour-mapped attention scores,
        weighted 0.7 (image) to 0.3 (heat map).
    """
    if isinstance(image, Image.Image):
        image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    frame = cv2.resize(image, new_size)

    # Rescale after resizing so the colour map always spans its full range.
    scores = cv2.resize(attention_map, new_size, interpolation=cv2.INTER_LINEAR)
    lowest = scores.min()
    span = scores.max() - lowest
    if span > 0:
        scores = (scores - lowest) / span
    else:
        # Constant (or NaN) map: avoid dividing by zero and show a flat overlay.
        scores = np.zeros_like(scores)

    coloured = cv2.applyColorMap(np.uint8(255 * scores), cv2.COLORMAP_JET)
    return cv2.addWeighted(frame, 0.7, coloured, 0.3, 0)
def webcam_card_detection():
    """Classify webcam frames live and display them with an attention overlay.

    Loads the model via ``load_model()``, opens capture device 0 and, for each
    frame, shows the predicted card name and its softmax probability on top of
    the attention heat map. The loop ends when a frame cannot be read or when
    'q' is pressed. The camera and all OpenCV windows are released even if an
    error occurs while processing a frame.
    """
    model, processor, reverse_mapping, device = load_model()
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("Could not open webcam. Exiting...")
            return

        print("Press 'q' to quit.")
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to capture image. Exiting...")
                break

            # OpenCV delivers BGR frames; the processor is fed an RGB PIL image.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = processor(images=Image.fromarray(frame_rgb), return_tensors="pt")
            pixel_values = inputs.pixel_values.to(device)

            with torch.no_grad():
                logits, attentions = model(pixel_values, output_attentions=True)
                probs = torch.nn.functional.softmax(logits, dim=-1)

            # One frame is processed at a time, so row 0 holds its scores.
            prediction = probs[0].argmax().item()
            confidence = probs[0][prediction].item()
            card_name = reverse_mapping[prediction]

            # Generate attention map
            attention_map = get_attention_map(attentions)
            visualization = apply_heatmap(frame, attention_map, new_size=(640, 480))
            cv2.putText(visualization, f"{card_name} ({confidence:.2%})", (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (1, 255, 255), 2, cv2.LINE_AA)
            cv2.imshow("UNO Card Detection", visualization)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Exiting...")
                break
    finally:
        # Always free the capture device and close windows, including on errors.
        cap.release()
        cv2.destroyAllWindows()
def load_model():
    """Load the classifier checkpoint, the CLIP processor and the label lookup.

    Reads ``MODEL_PATH``, which must contain the keys ``'label_mapping'`` and
    ``'model_state_dict'``.

    Returns:
        A tuple ``(model, processor, reverse_mapping, device)``:
        the classifier in eval mode on ``device``; the pretrained
        ``CLIPProcessor``; the inverse of the checkpoint's label mapping
        (presumably class index -> card name); and the selected torch device
        (CUDA when available, otherwise CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(security): torch.load is pickle-based and, depending on the
    # installed version's default for ``weights_only``, may execute arbitrary
    # code from the file. Only load checkpoints from a trusted source.
    checkpoint = torch.load(MODEL_PATH, map_location=device)

    label_mapping = checkpoint['label_mapping']
    reverse_mapping = {value: key for key, value in label_mapping.items()}

    model = CLIPVisionClassifier(len(label_mapping))
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)
    model.eval()

    processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
    return model, processor, reverse_mapping, device
if __name__ == "__main__":
    # Start the live demo only when run as a script, not when imported.
    webcam_card_detection()