Spaces:
Runtime error
Runtime error
| """ | |
| Grading utilities for the Annotation QA Environment. | |
| Provides deterministic scoring (0.0-1.0) based on: | |
| - IoU (Intersection over Union) of bounding boxes | |
| - Class label accuracy | |
| - Precision (penalizes spurious annotations) | |
| - Recall (penalizes missed annotations) | |
| Uses Hungarian matching to optimally pair predicted vs gold annotations. | |
| """ | |
| from typing import Dict, List, Tuple | |
| def compute_iou(box_a: List[float], box_b: List[float]) -> float: | |
| """ | |
| Compute Intersection over Union between two boxes. | |
| Boxes are [x, y, w, h] with values in 0.0–1.0. | |
| """ | |
| ax, ay, aw, ah = box_a | |
| bx, by, bw, bh = box_b | |
| # Convert to (x1, y1, x2, y2) | |
| a_x1, a_y1, a_x2, a_y2 = ax, ay, ax + aw, ay + ah | |
| b_x1, b_y1, b_x2, b_y2 = bx, by, bx + bw, by + bh | |
| # Intersection | |
| inter_x1 = max(a_x1, b_x1) | |
| inter_y1 = max(a_y1, b_y1) | |
| inter_x2 = min(a_x2, b_x2) | |
| inter_y2 = min(a_y2, b_y2) | |
| inter_w = max(0, inter_x2 - inter_x1) | |
| inter_h = max(0, inter_y2 - inter_y1) | |
| inter_area = inter_w * inter_h | |
| # Union | |
| area_a = aw * ah | |
| area_b = bw * bh | |
| union_area = area_a + area_b - inter_area | |
| if union_area < 1e-8: | |
| return 0.0 | |
| return inter_area / union_area | |
| def compute_annotation_quality( | |
| annotations: List[Dict], | |
| gold_annotations: List[Dict], | |
| ) -> float: | |
| """ | |
| Compute specific Semantic VLM visual QA testing metrics (0.0-1.0). | |
| Graded on: | |
| - Spurious Precision (35%): Did you remove fake boxes without destroying real ones? | |
| - Class Match Accuracy (35%): For existing valid boxes, did you change to the correct Gold label? | |
| - Missing Flag Recall (30%): Did you successfully use FLAG_MISSING for objects removed from the image? | |
| """ | |
| from collections import Counter | |
| if not gold_annotations: | |
| return 1.0 if not annotations else 0.5 | |
| # 1. Spurious Precision | |
| gold_map = {a["id"]: a for a in gold_annotations} | |
| predictions_valid = [a for a in annotations if not a.get("class_label", "").startswith("missing_")] | |
| if not predictions_valid: | |
| precision = 0.0 | |
| else: | |
| precision = sum(1 for a in predictions_valid if a["id"] in gold_map) / len(predictions_valid) | |
| # 2. Class Match Accuracy for valid boxes | |
| matched = [a for a in predictions_valid if a["id"] in gold_map] | |
| if not matched: | |
| class_acc = 0.0 | |
| else: | |
| class_acc = sum(1 for a in matched if a.get("class_label", "") == gold_map[a["id"]].get("class_label", "")) / len(matched) | |
| # 3. Missing Object Flag Recall | |
| expected_classes = [g.get("class_label", "") for g in gold_annotations] | |
| present_classes = [a.get("class_label", "") for a in annotations if a["id"] in gold_map and not a.get("class_label", "").startswith("missing_")] | |
| # Calculate exact missing instances mathematically | |
| exp_counts = Counter(expected_classes) | |
| pres_counts = Counter(present_classes) | |
| actual_missing_classes = [] | |
| for cls, count in exp_counts.items(): | |
| if count > pres_counts.get(cls, 0): | |
| for _ in range(count - pres_counts.get(cls, 0)): | |
| actual_missing_classes.append(cls) | |
| if not actual_missing_classes: | |
| missing_acc = 1.0 | |
| else: | |
| flagged_classes = [a.get("class_label", "").replace("missing_", "", 1) for a in annotations if a.get("class_label", "").startswith("missing_")] | |
| flagged_counts = Counter(flagged_classes) | |
| caught = 0 | |
| for cls in actual_missing_classes: | |
| if flagged_counts.get(cls, 0) > 0: | |
| caught += 1 | |
| flagged_counts[cls] -= 1 | |
| missing_acc = caught / len(actual_missing_classes) | |
| quality = 0.35 * class_acc + 0.35 * precision + 0.30 * missing_acc | |
| return max(0.0, min(1.0, quality)) | |
| def grade_episode( | |
| initial_annotations: List[Dict], | |
| final_annotations: List[Dict], | |
| gold_annotations: List[Dict], | |
| ) -> float: | |
| """ | |
| Compute the episode grade (0.0–1.0). | |
| """ | |
| initial_quality = compute_annotation_quality(initial_annotations, gold_annotations) | |
| final_quality = compute_annotation_quality(final_annotations, gold_annotations) | |
| max_improvement = 1.0 - initial_quality | |
| if max_improvement < 0.01: | |
| return 1.0 if final_quality >= initial_quality - 0.01 else 0.5 | |
| improvement = final_quality - initial_quality | |
| score = improvement / max_improvement | |
| return max(0.0, min(1.0, score)) | |
| def compute_step_reward( | |
| old_annotations: List[Dict], | |
| new_annotations: List[Dict], | |
| gold_annotations: List[Dict], | |
| action_type: str, | |
| ) -> float: | |
| """ | |
| Compute dense per-step reward based on quality delta. | |
| """ | |
| old_quality = compute_annotation_quality(old_annotations, gold_annotations) | |
| new_quality = compute_annotation_quality(new_annotations, gold_annotations) | |
| delta = new_quality - old_quality | |
| reward = delta * 2.0 # quality improvement → reward | |
| reward -= 0.01 # step penalty | |
| if action_type == "submit": | |
| reward += 0.05 | |
| return round(reward, 4) | |