MuhammedKsee
/

custom-clip-vit-b-coco

+import os
+import torch
+import matplotlib.pyplot as plt
+from PIL import Image
+from torchvision import transforms
+from transformers import CLIPTokenizer
+from config import HParams
+from model import CLIP
# Shared tokenizer for all text inputs; it has to be the same tokenizer the
# model was trained with.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")

# Per-channel (R, G, B) statistics used to normalize image tensors.
_NORM_MEAN = (0.481, 0.457, 0.408)
_NORM_STD = (0.268, 0.261, 0.275)

# Inference-time preprocessing, applied in order: resize, center-crop to
# HParams.IMAGE_SIZE, convert the PIL image to a tensor, then normalize.
_eval_steps = [
    transforms.Resize(HParams.IMAGE_SIZE),
    transforms.CenterCrop(HParams.IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
test_transform = transforms.Compose(_eval_steps)
def load_model(model_path=HParams.MODEL_PATH, device=HParams.DEVICE):
    """Build a CLIP model and load trained weights from a checkpoint file.

    The checkpoint may be either a raw state dict or a dict that stores the
    state dict under the ``'model'`` key. Any ``_orig_mod.`` fragment in the
    parameter names (left behind when the model was compiled during
    training) is stripped before loading.

    Args:
        model_path: Path of the checkpoint file to read.
        device: Device the model is moved to and the checkpoint is mapped to.

    Returns:
        The model in eval mode (converted to half precision when ``device``
        equals ``"cuda"``), or ``None`` when the checkpoint file is missing
        or its weights cannot be read or applied.
    """
    print(f"⚙️  Device: {device}")
    model = CLIP(len(tokenizer)).to(device)

    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        print("   -> Please download 'best_model.pt' from Hugging Face and place it in the root directory.")
        return None

    print(f"📂 Loading model weights from: {model_path}")
    try:
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only load checkpoints from a trusted source
        # (consider weights_only=True if the installed torch supports it).
        ckpt = torch.load(model_path, map_location=device)
        state_dict = ckpt['model'] if 'model' in ckpt else ckpt
        # Clean '_orig_mod.' prefix if the model was compiled during training
        cleaned_state = {
            name.replace("_orig_mod.", ""): tensor
            for name, tensor in state_dict.items()
        }
        # strict=False tolerates key mismatches; the returned report is
        # inspected below so that mismatches are not silently ignored.
        load_report = model.load_state_dict(cleaned_state, strict=False)
        model.eval()
        # OPTIMIZATION: Enable FP16 (Half Precision) if on CUDA.
        # The comparison is intentionally the same one predict() relies on,
        # so model and input precision stay in agreement.
        if device == "cuda":
            model = model.half()
    except Exception as e:
        # Reading a checkpoint can fail in many ways (I/O, unpickling,
        # shape mismatch); every failure is reported the same way.
        print(f"❌ Error loading weights: {e}")
        return None

    if load_report.missing_keys:
        print(
            f"⚠️  {len(load_report.missing_keys)} model parameter(s) were not found "
            f"in the checkpoint and keep their initial values: "
            f"{load_report.missing_keys[:5]}"
        )
    if load_report.unexpected_keys:
        print(
            f"⚠️  {len(load_report.unexpected_keys)} checkpoint entry(ies) did not "
            f"match any model parameter and were ignored: "
            f"{load_report.unexpected_keys[:5]}"
        )

    print("✅ Model loaded successfully!")
    return model
def predict(model, image_path, text_options):
    """Score an image against candidate texts and plot the result.

    The image and the texts are encoded with ``model.visual`` and
    ``model.text``, the embeddings are L2-normalized, and a softmax over the
    scaled image-text similarities gives one confidence per text. The image
    and a horizontal bar chart of the confidences (highest first) are shown
    with matplotlib.

    Args:
        model: Model returned by ``load_model``. ``None`` (a failed load) is
            reported and skipped instead of raising.
        image_path: Path of the image file to classify.
        text_options: Candidate texts. A single string is treated as a
            one-element list.

    Returns:
        None. Results are only displayed; on any input problem a message is
        printed and the function returns early.
    """
    if model is None:
        print("❌ No model available; load_model() must succeed before predicting.")
        return

    # A bare string would otherwise be counted character by character below.
    if isinstance(text_options, str):
        text_options = [text_options]
    else:
        text_options = list(text_options)
    if not text_options:
        print("❌ No text options provided.")
        return

    if not os.path.exists(image_path):
        print(f"❌ Image file not found: {image_path}")
        return

    # 1. Prepare Image
    try:
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy that stays valid afterwards.
        with Image.open(image_path) as img_file:
            img_pil = img_file.convert("RGB")
    except (OSError, ValueError):
        print("❌ Failed to open image file.")
        return

    # Follow the model's own device and precision so the inputs match it even
    # when load_model() was called with a non-default device.
    ref_param = next(model.parameters(), None)
    if ref_param is not None:
        device, dtype = ref_param.device, ref_param.dtype
    else:
        device = HParams.DEVICE
        dtype = torch.float16 if HParams.DEVICE == "cuda" else torch.float32

    img_tensor = test_transform(img_pil).unsqueeze(0).to(device=device, dtype=dtype)

    # 2. Prepare Text
    text_inputs = tokenizer(
        text_options,
        padding="max_length",
        max_length=HParams.MAX_TOKENS,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # 3. Inference
    with torch.no_grad():
        img_features = model.visual(img_tensor)
        text_features = model.text(text_inputs["input_ids"], text_inputs["attention_mask"])

        # Normalization
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Calculate Similarity
        # NOTE(review): the scale of 100.0 is hard-coded here; confirm it
        # matches the temperature the model was trained with.
        similarity = (100.0 * img_features @ text_features.T).softmax(dim=-1)
        # topk over all options simply sorts them by descending confidence.
        values, indices = similarity[0].topk(len(text_options))

    # 4. Visualize Results
    plt.figure(figsize=(12, 6))

    # Show Image
    plt.subplot(1, 2, 1)
    plt.imshow(img_pil)
    plt.axis("off")
    plt.title("Input Image")

    # Show Chart
    plt.subplot(1, 2, 2)
    scores = values.cpu().float().numpy() * 100
    labels = [text_options[idx] for idx in indices.cpu().tolist()]

    # Color logic: Green for >50%, Blue for others
    colors = ['#4CAF50' if s > 50 else '#2196F3' for s in scores]

    positions = range(len(labels))
    plt.barh(positions, scores, color=colors)
    plt.yticks(positions, labels)
    plt.xlabel('Confidence Score (%)')
    plt.xlim(0, 100)
    plt.gca().invert_yaxis()  # Display highest score at top

    # Add labels to bars
    for i, v in enumerate(scores):
        plt.text(v + 1, i, f"{v:.1f}%", va='center', fontweight='bold')

    plt.tight_layout()
    plt.show()