Update app/caption_model.py

app/caption_model.py  +32 -32  CHANGED

@@ -1,43 +1,43 @@
-from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import pipeline
 from PIL import Image
-import torch
 
-
-
-
-
-# Load model and processor only once at startup
-processor = BlipProcessor.from_pretrained(MODEL_NAME)
-model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
-model.eval()
+# Load object detection model
+MODEL_NAME = "facebook/detr-resnet-50"
+detector = pipeline("object-detection", model=MODEL_NAME)
 
 def caption_image(image: Image.Image):
     # Validate input
     if not isinstance(image, Image.Image) or image.mode not in ('RGB', 'L'):
         raise ValueError("Input must be a valid PIL Image in RGB or grayscale format")
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Run object detection
+    results = detector(image)
+
+    # Track highest score per object
+    objects_dict = {}
+    for result in results:
+        label = result['label']
+        score = result['score']
+        if label in objects_dict:
+            objects_dict[label] = max(objects_dict[label], score)
+        else:
+            objects_dict[label] = score
+
+    # Build structured list of objects
+    objects_list = [
+        {"label": label, "score": round(score, 2)}
+        for label, score in sorted(objects_dict.items(), key=lambda x: x[1], reverse=True)
+    ]
+
+    # Create readable caption
+    detected_objects = [f"{obj['label']} ({obj['score']:.2f})" for obj in objects_list]
+    caption = "Detected objects: " + ", ".join(detected_objects) if detected_objects else "No objects detected."
+
+    # Highest confidence score
+    max_confidence = max(objects_dict.values()) if objects_dict else 0.0
+
     return {
         "caption": caption,
-        "
+        "objects": objects_list,
+        "confidence": round(max_confidence, 2)
     }
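
For reference, each entry in results from the "object-detection" pipeline is a dict with score, label, and box keys; the aggregation loop added above reads only label and score and discards the bounding box. An illustrative entry (the numbers are invented):

    # One element of `results`; values are invented for illustration.
    example_entry = {
        "score": 0.9987,
        "label": "cat",
        "box": {"xmin": 40, "ymin": 70, "xmax": 175, "ymax": 117},
    }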
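
The pipeline call also accepts a confidence threshold, so low-scoring detections could be dropped before aggregation instead of cluttering the caption. A sketch; the 0.7 cutoff is an arbitrary example, not something this commit sets:

    # Keep only detections scoring above 0.7 (arbitrary example value).
    results = detector(image, threshold=0.7)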
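
One readability note on the new caption line: it relies on conditional-expression precedence, parsing as "concatenate only when detected_objects is non-empty", which is the intended behavior here. The equivalent parenthesized form is easier to scan:

    caption = ("Detected objects: " + ", ".join(detected_objects)) if detected_objects else "No objects detected."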
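
A minimal way to exercise the new function outside the Space; the image path and the import path are assumptions for illustration, not part of this commit:

    from PIL import Image
    from app.caption_model import caption_image

    # The validation in caption_image requires an RGB or grayscale PIL image.
    image = Image.open("example.jpg").convert("RGB")
    result = caption_image(image)
    print(result["caption"])     # e.g. "Detected objects: cat (1.00), remote (0.98)"
    print(result["objects"])     # e.g. [{"label": "cat", "score": 1.0}, ...]
    print(result["confidence"])  # highest per-label score, rounded to 2 decimals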