Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Community

SaniaE committed 18 days ago

Commit

edccc41

verified ·

1 Parent(s): 05b6fbd

relative scoring

Browse files

Files changed (1) hide show

app.py +23 -15

app.py CHANGED Viewed

@@ -105,11 +105,16 @@ async def ui_tester(file: UploadFile = File(...), description: str = Query(...))
     image = Image.open(file.file).convert("RGB")
     blip_data = MODELS["blip"]
-    # 1. Get the Baseline (The model's "Perfect" loss for this image)
-    # We generate a caption to see what the model thinks is a 100% match
     inputs_gen = blip_data["processor"](images=image, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
-        generated_ids = blip_data["model"].generate(**inputs_gen, max_length=50)
         baseline_caption = blip_data["processor"].decode(generated_ids[0], skip_special_tokens=True)
         # Calculate loss for the model's own generated caption
@@ -117,26 +122,29 @@ async def ui_tester(file: UploadFile = File(...), description: str = Query(...))
         baseline_outputs = blip_data["model"](**baseline_inputs, labels=baseline_inputs["input_ids"])
         baseline_loss = baseline_outputs.loss.item()
-    # 2. Calculate User Loss
     user_inputs = blip_data["processor"](images=image, text=description, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         user_outputs = blip_data["model"](**user_inputs, labels=user_inputs["input_ids"])
         user_loss = user_outputs.loss.item()
-    # 3. Relative Scoring (Intuition-based)
-    # If user_loss is equal to or better than baseline_loss, it's a 90-100% match.
-    # This accounts for the fact that UI images naturally have higher raw loss.
     relative_ratio = baseline_loss / user_loss
-    # Apply a curve to make it feel "right"
-    # A ratio of 1.0 (perfect match to model) -> ~95%
-    # A ratio of 0.7 (pretty good) -> ~75%
-    confidence_score = min(100.0, round((relative_ratio ** 2) * 95, 2))
     return {
         "confidence_score": f"{confidence_score}%",
-        "model_best_guess": baseline_caption,
-        "raw_user_loss": round(user_loss, 4),
-        "status": "Match Found" if confidence_score > 60 else "No Match",
-        "is_valid": confidence_score > 60
     }

     image = Image.open(file.file).convert("RGB")
     blip_data = MODELS["blip"]
+    # 1. GET THE BASELINE (The model's "Perfect" loss for its own perception)
+    # We generate a caption using high-precision parameters to see its "truth"
     inputs_gen = blip_data["processor"](images=image, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
+        generated_ids = blip_data["model"].generate(
+            **inputs_gen,
+            max_length=50,
+            num_beams=5, # Higher beams for a more stable "best guess"
+            temperature=1.0
+        )
         baseline_caption = blip_data["processor"].decode(generated_ids[0], skip_special_tokens=True)
         # Calculate loss for the model's own generated caption
         baseline_outputs = blip_data["model"](**baseline_inputs, labels=baseline_inputs["input_ids"])
         baseline_loss = baseline_outputs.loss.item()
+    # 2. CALCULATE USER LOSS
     user_inputs = blip_data["processor"](images=image, text=description, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         user_outputs = blip_data["model"](**user_inputs, labels=user_inputs["input_ids"])
         user_loss = user_outputs.loss.item()
+    # 3. RELATIVE SCORING (The "Intuition" Fix)
+    # This ratio tells us how close the user is to the model's internal maximum confidence
     relative_ratio = baseline_loss / user_loss
+    # Scaling: If the user matches the model's perception (ratio ~1.0), they get ~100% (capped at 100).
+    # If they are significantly off (like Orange vs Yellow), they land in the 60s.
+    # This prevents the 0% "confusion ceiling" you saw earlier.
+    confidence_score = min(100.0, round((relative_ratio ** 1.5) * 100, 2))
     return {
         "confidence_score": f"{confidence_score}%",
+        "model_perceived_caption": baseline_caption,
+        "raw_metrics": {
+            "user_loss": round(user_loss, 4),
+            "baseline_loss": round(baseline_loss, 4),
+            "delta": round(user_loss - baseline_loss, 4)
+        },
+        "status": "Match Found" if confidence_score > 55 else "Partial Match" if confidence_score > 30 else "No Match",
+        "is_valid": confidence_score > 55
     }