Upload trained bird captioning model, tokenizer, image processor, species mapping, and captions
Browse files
README.md
CHANGED
|
@@ -46,8 +46,11 @@ from huggingface_hub import PyTorchModelHubMixin
|
|
| 46 |
import torch
|
| 47 |
from model import BirdCaptioningModel # Save model.py locally
|
| 48 |
|
|
|
|
|
|
|
|
|
|
| 49 |
# Load model
|
| 50 |
-
model = BirdCaptioningModel.from_pretrained("INVERTO/bird-captioning-cub200")
|
| 51 |
image_processor = ViTImageProcessor.from_pretrained("INVERTO/bird-captioning-cub200")
|
| 52 |
tokenizer = AutoTokenizer.from_pretrained("INVERTO/bird-captioning-cub200")
|
| 53 |
model.eval()
|
|
@@ -66,13 +69,13 @@ from PIL import Image
|
|
| 66 |
|
| 67 |
def predict_bird_image(image_path):
|
| 68 |
image = Image.open(image_path).convert("RGB")
|
| 69 |
-
pixel_values = image_processor(image, return_tensors="pt").pixel_values
|
| 70 |
with torch.no_grad():
|
| 71 |
output_ids = model.base_model.generate(pixel_values, max_length=75, num_beams=4)
|
| 72 |
_, class_logits = model(pixel_values)
|
| 73 |
predicted_class_idx = torch.argmax(class_logits, dim=1).item()
|
| 74 |
confidence = torch.nn.functional.softmax(class_logits, dim=1)[0, predicted_class_idx].item() * 100
|
| 75 |
-
caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 76 |
species = species_mapping.get(predicted_class_idx, "Unknown")
|
| 77 |
return caption, species, confidence
|
| 78 |
|
|
|
|
import torch
from model import BirdCaptioningModel  # Save model.py locally

# Pick the compute device: prefer a CUDA GPU when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned captioning model plus its preprocessing components
# from the Hub, move the model to the chosen device, and switch it to
# inference mode.
repo_id = "INVERTO/bird-captioning-cub200"
model = BirdCaptioningModel.from_pretrained(repo_id).to(device)
image_processor = ViTImageProcessor.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model.eval()
|
|
|
def predict_bird_image(image_path):
    """Generate a caption and predict the species for a bird image.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        Tuple ``(caption, species, confidence)`` where
        ``caption`` is the generated description (str),
        ``species`` is the predicted species name, or ``"Unknown"`` when
        the class index is absent from ``species_mapping`` (str), and
        ``confidence`` is the softmax probability of the predicted class
        expressed as a percentage in [0, 100] (float).
    """
    image = Image.open(image_path).convert("RGB")
    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)

    # Pure inference: run both caption generation and the classification
    # forward pass without tracking gradients.
    with torch.no_grad():
        output_ids = model.base_model.generate(pixel_values, max_length=75, num_beams=4)
        _, class_logits = model(pixel_values)

    # Normalize the logits once; derive both the predicted index and its
    # confidence from the same probability tensor (softmax is monotonic,
    # so the argmax is identical to argmax over the raw logits).
    probs = torch.nn.functional.softmax(class_logits, dim=1)
    predicted_class_idx = torch.argmax(probs, dim=1).item()
    confidence = probs[0, predicted_class_idx].item() * 100

    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    species = species_mapping.get(predicted_class_idx, "Unknown")
    return caption, species, confidence