Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Running

App Files Files Community

SaniaE committed 19 days ago

Commit

05b6fbd

verified ·

1 Parent(s): 83aed07

updated scoring

Browse files

Files changed (1) hide show

app.py +30 -19

app.py CHANGED Viewed

@@ -105,27 +105,38 @@ async def ui_tester(file: UploadFile = File(...), description: str = Query(...))
     image = Image.open(file.file).convert("RGB")
     blip_data = MODELS["blip"]
-    inputs = blip_data["processor"](images=image, text=description, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
-        outputs = blip_data["model"](**inputs, labels=inputs["input_ids"])
-        loss = outputs.loss.item()
-        # 1. Temperature Scaling (Softens the penalty for minor mismatches)
-        T = 2.0
-        # 2. Logistic Calibration (Sigmoid)
-        # We center the curve around a loss of 3.5 (the "uncertainty zone")
-        # This makes the jump from 4.0 to 3.0 much more significant in the % score
-        steepness = 1.5
-        midpoint = 3.5
-        calibrated_score = 1 / (1 + torch.exp(torch.tensor(steepness * (loss - midpoint) / T))).item()
-    percentage_score = round(calibrated_score * 100, 2)
     return {
-        "confidence_score": f"{percentage_score}%",
-        "raw_loss": round(loss, 4),
-        "status": "Match Found" if percentage_score > 50 else "Weak Match" if percentage_score > 25 else "No Match",
-        "is_valid": percentage_score > 50
     }

     image = Image.open(file.file).convert("RGB")
     blip_data = MODELS["blip"]
+    # 1. Get the Baseline (The model's "Perfect" loss for this image)
+    # We generate a caption to see what the model thinks is a 100% match
+    inputs_gen = blip_data["processor"](images=image, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
+        generated_ids = blip_data["model"].generate(**inputs_gen, max_length=50)
+        baseline_caption = blip_data["processor"].decode(generated_ids[0], skip_special_tokens=True)
+        # Calculate loss for the model's own generated caption
+        baseline_inputs = blip_data["processor"](images=image, text=baseline_caption, return_tensors="pt").to(DEVICE)
+        baseline_outputs = blip_data["model"](**baseline_inputs, labels=baseline_inputs["input_ids"])
+        baseline_loss = baseline_outputs.loss.item()
+    # 2. Calculate User Loss
+    user_inputs = blip_data["processor"](images=image, text=description, return_tensors="pt").to(DEVICE)
+    with torch.no_grad():
+        user_outputs = blip_data["model"](**user_inputs, labels=user_inputs["input_ids"])
+        user_loss = user_outputs.loss.item()
+    # 3. Relative Scoring (Intuition-based)
+    # If user_loss is equal to or better than baseline_loss, it's a 95-100% match.
+    # This accounts for the fact that UI images naturally have higher raw loss.
+    relative_ratio = baseline_loss / user_loss
+    # Apply a curve to make it feel "right"
+    # A ratio of 1.0 (perfect match to model) -> ~95%
+    # A ratio of 0.7 -> ~47% (0.7 ** 2 * 95), i.e. below the 60% match threshold
+    confidence_score = min(100.0, round((relative_ratio ** 2) * 95, 2))
     return {
+        "confidence_score": f"{confidence_score}%",
+        "model_best_guess": baseline_caption,
+        "raw_user_loss": round(user_loss, 4),
+        "status": "Match Found" if confidence_score > 60 else "No Match",
+        "is_valid": confidence_score > 60
     }