Update model.py

model.py CHANGED

@@ -1,21 +1,24 @@
-# model usage + heatmap of attention (what the model is focusing on)
-
 import cv2
 import torch
 import numpy as np
 from transformers import CLIPProcessor, CLIPVisionModel
 from PIL import Image
 from torch import nn
+import requests
+import matplotlib.pyplot as plt
+from huggingface_hub import hf_hub_download

 MODEL_PATH = "pytorch_model.bin"
+REPO_ID = "Hayloo9838/uno-recognizer"
+MAPANDSTUFF = "mapandstuff.pth"

 class CLIPVisionClassifier(nn.Module):
     def __init__(self, num_labels):
         super().__init__()
         self.vision_model = CLIPVisionModel.from_pretrained('openai/clip-vit-large-patch14',
                                                             attn_implementation="eager")
         self.classifier = nn.Linear(self.vision_model.config.hidden_size, num_labels, bias=False)
         self.dropout = nn.Dropout(0.1)

     def forward(self, pixel_values, output_attentions=False):
         outputs = self.vision_model(pixel_values, output_attentions=output_attentions)

@@ -26,7 +29,7 @@ class CLIPVisionClassifier(nn.Module):
             return logits, outputs.attentions
         return logits

-def get_attention_map(attentions, image_size=(224, 224)):
+def get_attention_map(attentions):
     attention = attentions[-1]
     attention = attention.mean(dim=1)
     attention = attention[0, 0, 1:]

@@ -36,82 +39,72 @@ def get_attention_map(attentions, image_size=(224, 224)):
     attention_map = attention.reshape(num_patches, num_patches)

     attention_map = attention_map.cpu().numpy()
-    attention_map = cv2.resize(attention_map, image_size, interpolation=cv2.INTER_LINEAR)

     attention_map = (attention_map - attention_map.min()) / (attention_map.max() - attention_map.min())
     return attention_map

-def apply_heatmap(image, attention_map, new_size=
+def apply_heatmap(image, attention_map, new_size=None):
     heatmap = cv2.applyColorMap(np.uint8(255 * attention_map), cv2.COLORMAP_JET)

     if isinstance(image, Image.Image):
         image = np.array(image)
         image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+    if new_size is not None:
+        image_resized = cv2.resize(image, new_size)
+        attention_map_resized = cv2.resize(attention_map, image_resized.shape[:2][::-1], interpolation=cv2.INTER_LINEAR)
+        attention_map_resized = (attention_map_resized - attention_map_resized.min()) / (attention_map_resized.max() - attention_map_resized.min())
+        heatmap_resized = cv2.applyColorMap(np.uint8(255 * attention_map_resized), cv2.COLORMAP_JET)
+        output = cv2.addWeighted(image_resized, 0.7, heatmap_resized, 0.3, 0)
+    else:
+        attention_map_resized = cv2.resize(attention_map, image.shape[:2][::-1], interpolation=cv2.INTER_LINEAR)
+        attention_map_resized = (attention_map_resized - attention_map_resized.min()) / (attention_map_resized.max() - attention_map_resized.min())
+        heatmap_resized = cv2.applyColorMap(np.uint8(255 * attention_map_resized), cv2.COLORMAP_JET)
+        output = cv2.addWeighted(image, 0.7, heatmap_resized, 0.3, 0)
+
+    return output
+
-def webcam_card_detection():
-    model, processor, reverse_mapping, device = load_model()
-
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            print("Failed to capture image. Exiting...")
-            break
-
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        image = Image.fromarray(frame_rgb)
-
-        inputs = processor(images=image, return_tensors="pt")
-        pixel_values = inputs.pixel_values.to(device)
-
-        with torch.no_grad():
-            logits, attentions = model(pixel_values, output_attentions=True)
-            probs = torch.nn.functional.softmax(logits, dim=-1)
-            prediction = torch.argmax(probs).item()
-
-        # Generate attention map
-        attention_map = get_attention_map(attentions)
-
-        visualization = apply_heatmap(frame, attention_map, new_size=(640, 480))
-
-        card_name = reverse_mapping[prediction]
-        confidence = probs[0][prediction].item()
-
-        cv2.putText(visualization, f"{card_name} ({confidence:.2%})", (10, 50),
-                    cv2.FONT_HERSHEY_SIMPLEX, 1, (1, 255, 255), 2, cv2.LINE_AA)
-
-        cv2.imshow("UNO Card Detection", visualization)
-
-        if cv2.waitKey(1) & 0xFF == ord('q'):
-            print("Exiting...")
-            break
-
-    cv2.
+def process_image_classification(image_url):
+    model, processor, reverse_mapping, device = load_model()
+
+    image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+
+    inputs = processor(images=image, return_tensors="pt")
+    pixel_values = inputs.pixel_values.to(device)
+
+    with torch.no_grad():
+        logits, attentions = model(pixel_values, output_attentions=True)
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+        prediction = torch.argmax(probs).item()
+
+    # Generate attention map
+    attention_map = get_attention_map(attentions)
+
+    visualization = apply_heatmap(image, attention_map)
+
+    card_name = reverse_mapping[prediction]
+    confidence = probs[0][prediction].item()
+
+    # Convert back to RGB for matplotlib display
+    visualization_rgb = cv2.cvtColor(visualization, cv2.COLOR_BGR2RGB)
+
+    return visualization_rgb, card_name, confidence

 def load_model():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Download model weights and label mapping from Hugging Face Hub
+    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_PATH)
+    #mapandstuff_path = hf_hub_download(repo_id=REPO_ID, filename=MAPANDSTUFF)
+    checkpoint = torch.load(model_path, map_location=device)
     label_mapping = checkpoint['label_mapping']
     reverse_mapping = {v: k for k, v in label_mapping.items()}
     model = CLIPVisionClassifier(len(label_mapping))
+
+    model_state_dict = checkpoint["model_state_dict"]
+    model.load_state_dict(model_state_dict)
+
     model = model.to(device)
     model.eval()

@@ -120,4 +113,18 @@ def load_model():
     return model, processor, reverse_mapping, device

 if __name__ == "__main__":
+    image_url = "https://www.shutterstock.com/image-vector/hand-hold-reverse-card-symbol-600w-2360073097.jpg"
+    visualization, card_name, confidence = process_image_classification(image_url)
+
+    plt.figure(figsize=(10, 5))
+
+    plt.subplot(1, 2, 1)
+    plt.imshow(visualization)
+    plt.title(f"Heatmap on Image")
+    plt.axis('off')
+
+    plt.subplot(1, 2, 2)
+    plt.text(0.5, 0.5, f"Predicted Card: {card_name}\nConfidence: {confidence:.2%}",
+             fontsize=12, ha='center', va='center')
+    plt.axis('off')
+    plt.show()
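
For reference, get_attention_map() averages the heads of the last attention layer, takes the CLS row, drops the CLS column, and reshapes the remaining weights into a square patch grid. A minimal shape sanity check, assuming the default openai/clip-vit-large-patch14 preprocessing (224x224 input, 14x14 patches); these numbers come from the model configuration, not from this commit:

# Sketch only: shape arithmetic behind the reshape in get_attention_map().
# clip-vit-large-patch14 at 224x224 -> 224 // 14 = 16 patches per side,
# 16 * 16 = 256 patch tokens plus 1 CLS token = 257 positions per attention row.
image_size, patch_size = 224, 14
num_patches_per_side = image_size // patch_size      # 16
seq_len = num_patches_per_side ** 2 + 1              # 257
assert seq_len - 1 == num_patches_per_side ** 2      # the [0, 0, 1:] slice removes the CLS column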
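
The committed __main__ block fetches a sample image from a hard-coded Shutterstock URL. A minimal sketch of how the same functions could be pointed at a local file instead; the module name "model" and the path "my_card.jpg" are assumptions for illustration, not part of the commit:

# Sketch only: run the classifier and attention overlay on a local image.
import cv2
import torch
import matplotlib.pyplot as plt
from PIL import Image

import model as uno  # the committed model.py, importable from the working directory

# load_model() downloads the checkpoint from the Hub and returns the classifier,
# processor, label mapping, and device exactly as in the commit.
classifier, processor, reverse_mapping, device = uno.load_model()

image = Image.open("my_card.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits, attentions = classifier(inputs.pixel_values.to(device), output_attentions=True)
probs = torch.nn.functional.softmax(logits, dim=-1)
prediction = torch.argmax(probs).item()

# apply_heatmap returns a BGR overlay; convert to RGB for matplotlib, as the script does.
overlay = uno.apply_heatmap(image, uno.get_attention_map(attentions))
print(f"{reverse_mapping[prediction]} ({probs[0][prediction].item():.2%})")
plt.imshow(cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.show()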