import gradio as gr
import cv2
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPVisionModel
from PIL import Image
from torch import nn
from huggingface_hub import hf_hub_download

MODEL_PATH = "pytorch_model.bin"
REPO_ID = "Hayloo9838/uno-recognizer"


class CLIPVisionClassifier(nn.Module):
    """CLIP ViT-L/14 vision encoder with a linear classification head."""

    def __init__(self, num_labels):
        super().__init__()
        self.vision_model = CLIPVisionModel.from_pretrained('openai/clip-vit-large-patch14')
        self.classifier = nn.Linear(self.vision_model.config.hidden_size, num_labels, bias=False)
        self.dropout = nn.Dropout(0.1)

    def forward(self, pixel_values, output_attentions=False):
        outputs = self.vision_model(pixel_values, output_attentions=output_attentions)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        if output_attentions:
            return logits, outputs.attentions
        return logits


def get_attention_map(attentions):
    # Last encoder layer: (batch, heads, seq, seq); average over heads.
    attention = attentions[-1].mean(dim=1)
    # Take the CLS-token row, excluding the CLS column itself: how much the
    # CLS token attends to each image patch token.
    attention = attention[0, 0, 1:]
    # For ViT-L/14 at 224x224 input that is 256 patch tokens -> a 16x16 grid.
    num_patches = int(np.sqrt(attention.shape[0]))
    attention_map = attention.reshape(num_patches, num_patches)
    # Normalize to [0, 1] for visualization.
    attention_map = (attention_map - attention_map.min()) / (attention_map.max() - attention_map.min())
    return attention_map.cpu().numpy()


def apply_heatmap(image, attention_map):
    if isinstance(image, Image.Image):
        image = np.array(image)
    # OpenCV works in BGR, so convert before blending.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    # Upsample the patch-level map to full image resolution (cv2.resize takes
    # (width, height)), renormalize, and colorize it.
    attention_map_resized = cv2.resize(attention_map, image.shape[:2][::-1], interpolation=cv2.INTER_LINEAR)
    attention_map_resized = (attention_map_resized - attention_map_resized.min()) / (attention_map_resized.max() - attention_map_resized.min())
    heatmap_resized = cv2.applyColorMap(np.uint8(255 * attention_map_resized), cv2.COLORMAP_JET)
    output = cv2.addWeighted(image, 0.7, heatmap_resized, 0.3, 0)
    # Convert back to RGB so Gradio displays the overlay with correct colors.
    return cv2.cvtColor(output, cv2.COLOR_BGR2RGB)


def process_image_classification(image):
    model, processor, reverse_mapping, device = load_model()
    image = Image.fromarray(image)
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values.to(device)
    with torch.no_grad():
        logits, attentions = model(pixel_values, output_attentions=True)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    prediction = torch.argmax(probs).item()
    attention_map = get_attention_map(attentions)
    visualization = apply_heatmap(image, attention_map)
    card_name = reverse_mapping[prediction]
    confidence = probs[0][prediction].item()
    # Report confidence as a percentage string for the Textbox output.
    return visualization, card_name, f"{confidence:.2%}"

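# Quick local check without the UI (a sketch; the file name is hypothetical):
#   img = np.array(Image.open("uno_card.jpg").convert("RGB"))
#   heatmap, card, conf = process_image_classification(img)
#   print(card, conf)

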
| def load_model(): | |
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_PATH) | |
| checkpoint = torch.load(model_path, map_location=device) | |
| label_mapping = checkpoint['label_mapping'] | |
| reverse_mapping = {v: k for k, v in label_mapping.items()} | |
| model = CLIPVisionClassifier(len(label_mapping)) | |
| model.load_state_dict(checkpoint["model_state_dict"]) | |
| model.to(device).eval() | |
| processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14') | |
| return model, processor, reverse_mapping, device | |
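# load_model() is invoked on every request above. A minimal caching sketch
# (an addition, not part of the original app): wrapping it with lru_cache
# loads the checkpoint and processor once per process instead of per request.
from functools import lru_cache

load_model = lru_cache(maxsize=1)(load_model)

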
def gradio_interface():
    gr.Interface(
        fn=process_image_classification,
        inputs=gr.Image(type="numpy"),
        outputs=[
            gr.Image(label="Heatmap Plot"),
            gr.Textbox(label="Predicted Card"),
            gr.Textbox(label="Confidence")
        ],
        title="Uno Card Recognizer",
        description="Upload an image or use your webcam to recognize an Uno card."
    ).launch()


if __name__ == "__main__":
    gradio_interface()