Upload utils.py with huggingface_hub
Browse files
utils.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from torchvision import transforms
|
| 6 |
+
from transformers import CLIPTokenizer
|
| 7 |
+
from config import HParams
|
| 8 |
+
from model import CLIP
|
| 9 |
+
|
| 10 |
+
# Text tokenizer. It has to be the same tokenizer the model was trained with,
# otherwise the token ids fed to the text encoder will not line up.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")

# Evaluation-time image pipeline, applied in this order:
#   1. resize using HParams.IMAGE_SIZE
#   2. center-crop to HParams.IMAGE_SIZE
#   3. convert the PIL image to a tensor
#   4. normalize each RGB channel with the fixed (mean, std) pairs below
# NOTE(review): the constants look like the OpenAI CLIP statistics rounded to
# three decimals - confirm they match what was used in training.
test_transform = transforms.Compose(
    [
        transforms.Resize(HParams.IMAGE_SIZE),
        transforms.CenterCrop(HParams.IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.481, 0.457, 0.408),
            (0.268, 0.261, 0.275),
        ),
    ]
)
|
| 20 |
+
|
| 21 |
+
def load_model(model_path=HParams.MODEL_PATH, device=HParams.DEVICE):
    """Build a CLIP model and load trained weights into it for inference.

    Args:
        model_path: Path to the checkpoint file. The checkpoint may be either
            a raw state dict or a dict holding the state dict under the
            ``'model'`` key.
        device: Device the model is moved to and the checkpoint is mapped
            onto. When it equals the string ``"cuda"`` the model is converted
            to half precision after loading.

    Returns:
        The model in eval mode, or ``None`` when the checkpoint file does not
        exist or the weights could not be applied to the model.
    """
    print(f"⚙️ Device: {device}")

    # Check the file first so we do not build (and move to the device) a model
    # that is going to be thrown away.
    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        print(" -> Please download 'best_model.pt' from Hugging Face and place it in the root directory.")
        return None

    model = CLIP(len(tokenizer)).to(device)

    print(f"📂 Loading model weights from: {model_path}")
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load checkpoints from a trusted source (consider
    # weights_only=True if the installed torch version supports it).
    ckpt = torch.load(model_path, map_location=device)

    # Clean '_orig_mod.' prefix if the model was compiled during training
    state_dict = ckpt['model'] if 'model' in ckpt else ckpt
    new_state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}

    try:
        load_result = model.load_state_dict(new_state_dict, strict=False)
        model.eval()

        # OPTIMIZATION: Enable FP16 (Half Precision) if on CUDA
        if device == "cuda":
            model = model.half()
    except Exception as e:
        print(f"❌ Error loading weights: {e}")
        return None

    # strict=False silently skips keys that do not line up; surface them so a
    # mismatched checkpoint is not mistaken for a clean load.
    missing_keys = list(getattr(load_result, "missing_keys", ()))
    unexpected_keys = list(getattr(load_result, "unexpected_keys", ()))
    if missing_keys:
        print(
            f"⚠️ {len(missing_keys)} model parameter(s) not found in checkpoint "
            f"(left at initial values), e.g. {missing_keys[:5]}"
        )
    if unexpected_keys:
        print(
            f"⚠️ {len(unexpected_keys)} checkpoint key(s) ignored by the model, "
            f"e.g. {unexpected_keys[:5]}"
        )

    print("✅ Model loaded successfully!")
    return model
|
| 50 |
+
|
| 51 |
+
def _inference_device_and_dtype(model):
    """Return the (device, dtype) that inputs must have to be fed to *model*."""
    try:
        reference = next(model.parameters())
    except (AttributeError, StopIteration):
        # No parameters to inspect: fall back to the configured device and the
        # historical "half precision on cuda" rule.
        fallback_dtype = torch.float16 if HParams.DEVICE == "cuda" else torch.float32
        return HParams.DEVICE, fallback_dtype
    dtype = reference.dtype if reference.is_floating_point() else torch.float32
    return reference.device, dtype


def _rank_text_options(model, img_pil, text_options):
    """Score every text option against the image; return (labels, scores) best-first."""
    device, dtype = _inference_device_and_dtype(model)

    # 1. Prepare Image: add a batch dimension and match the model's device/dtype
    img_tensor = test_transform(img_pil).unsqueeze(0).to(device=device, dtype=dtype)

    # 2. Prepare Text
    text_inputs = tokenizer(
        text_options,
        padding="max_length",
        max_length=HParams.MAX_TOKENS,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # 3. Inference
    with torch.no_grad():
        img_features = model.visual(img_tensor)
        text_features = model.text(text_inputs["input_ids"], text_inputs["attention_mask"])

        # Normalization (unit length, so the dot product is a cosine similarity)
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Calculate Similarity: scaled cosine similarities -> softmax over the options
        similarity = (100.0 * img_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(len(text_options))

    scores = values.cpu().float().numpy() * 100
    labels = [text_options[idx] for idx in indices.cpu().tolist()]
    return labels, scores


def _plot_ranking(img_pil, labels, scores):
    """Show the input image next to a horizontal bar chart of the scores."""
    plt.figure(figsize=(12, 6))

    # Show Image
    plt.subplot(1, 2, 1)
    plt.imshow(img_pil)
    plt.axis("off")
    plt.title("Input Image")

    # Show Chart
    plt.subplot(1, 2, 2)
    positions = range(len(labels))

    # Color logic: Green for >50%, Blue for others
    colors = ['#4CAF50' if s > 50 else '#2196F3' for s in scores]
    plt.barh(positions, scores, color=colors)
    plt.yticks(positions, labels)
    plt.xlabel('Confidence Score (%)')
    plt.xlim(0, 100)
    plt.gca().invert_yaxis()  # Display highest score at top

    # Add labels to bars
    for i, v in enumerate(scores):
        plt.text(v + 1, i, f"{v:.1f}%", va='center', fontweight='bold')

    plt.tight_layout()
    plt.show()


def predict(model, image_path, text_options):
    """Rank candidate texts against an image and plot the result.

    The image is preprocessed with ``test_transform``, the texts are tokenized
    with the module-level ``tokenizer``, both are encoded by *model*, and the
    softmax over the scaled cosine similarities is shown as a bar chart next
    to the image.

    Args:
        model: Object exposing ``visual(image)`` and
            ``text(input_ids, attention_mask)`` encoders, e.g. the value
            returned by ``load_model``. Inputs are moved to the device and
            dtype of the model's parameters.
        image_path: Path of the image file to classify.
        text_options: Candidate texts. A single string is treated as one
            option; any other iterable is converted to a list.

    Returns:
        None. Problems (no model, missing or unreadable image, no text
        options) are reported on stdout and the function returns early.
    """
    if model is None:
        print("❌ No model provided (did load_model() fail?).")
        return

    if not os.path.exists(image_path):
        print(f"❌ Image file not found: {image_path}")
        return

    # A bare string would otherwise be counted character by character below.
    if isinstance(text_options, str):
        text_options = [text_options]
    elif text_options is None:
        text_options = []
    else:
        text_options = list(text_options)
    if not text_options:
        print("❌ No text options provided.")
        return

    # Open inside a context manager so the file handle is released promptly;
    # convert() returns an independent, fully loaded image.
    try:
        with Image.open(image_path) as raw_image:
            img_pil = raw_image.convert("RGB")
    except Exception:
        print("❌ Failed to open image file.")
        return

    labels, scores = _rank_text_options(model, img_pil, text_options)

    # 4. Visualize Results
    _plot_ranking(img_pil, labels, scores)
|