Spaces:
Sleeping
Sleeping
| """ | |
| Grading utilities for the Annotation QA Environment. | |
| Provides deterministic scoring (0.0β1.0) based on: | |
| - IoU (Intersection over Union) of bounding boxes | |
| - Class label accuracy | |
| - Precision (penalizes spurious annotations) | |
| - Recall (penalizes missed annotations) | |
| Uses Hungarian matching to optimally pair predicted vs gold annotations. | |
| """ | |
| from typing import Dict, List, Tuple | |
def compute_iou(box_a: List[float], box_b: List[float]) -> float:
    """
    Compute the Intersection over Union of two bounding boxes.

    Each box is given as [x, y, w, h] with values normalized to 0.0–1.0.
    Returns 0.0 when the union is numerically empty (degenerate boxes).
    """
    ax, ay, aw, ah = box_a
    bx, by, bw, bh = box_b
    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(ax + aw, bx + bw) - max(ax, bx))
    overlap_h = max(0, min(ay + ah, by + bh) - max(ay, by))
    intersection = overlap_w * overlap_h
    union = aw * ah + bw * bh - intersection
    # Guard against division by a (near-)zero union area.
    if union < 1e-8:
        return 0.0
    return intersection / union
def hungarian_match(
    pred_annotations: List[Dict],
    gold_annotations: List[Dict],
    iou_threshold: float = 0.3,
) -> List[Tuple[int, int, float]]:
    """
    Pair predicted annotations with gold annotations by bounding-box IoU.

    Despite the name, this is a greedy highest-IoU-first matcher rather
    than true Hungarian assignment; at our scale the results are good
    enough and it avoids a scipy dependency.

    Returns a list of (pred_idx, gold_idx, iou) tuples. Each predicted and
    each gold index appears at most once, and pairs whose IoU falls below
    ``iou_threshold`` are never matched.
    """
    if not pred_annotations or not gold_annotations:
        return []
    # All candidate pairs at or above the threshold, scored by IoU.
    candidates = [
        (i, j, iou)
        for i, pred in enumerate(pred_annotations)
        for j, gold in enumerate(gold_annotations)
        if (iou := compute_iou(pred["bbox"], gold["bbox"])) >= iou_threshold
    ]
    # Greedily consume the best remaining pair; skip already-used indices.
    candidates.sort(key=lambda pair: pair[2], reverse=True)
    taken_pred: set = set()
    taken_gold: set = set()
    matches: List[Tuple[int, int, float]] = []
    for i, j, iou in candidates:
        if i in taken_pred or j in taken_gold:
            continue
        matches.append((i, j, iou))
        taken_pred.add(i)
        taken_gold.add(j)
    return matches
def compute_annotation_quality(
    annotations: List[Dict],
    gold_annotations: List[Dict],
    iou_threshold: float = 0.3,
) -> float:
    """
    Score annotation quality on a 0.0–1.0 scale.

    Weighted blend of four signals:
      * mean IoU over matched pairs            (weight 0.40)
      * class-label accuracy on matched pairs  (weight 0.30)
      * precision = matched / predicted        (weight 0.15)
      * recall    = matched / gold             (weight 0.15)
    """
    if not gold_annotations:
        # Nothing to find: perfect if nothing was predicted, otherwise
        # spurious predictions earn a flat 0.5.
        return 1.0 if not annotations else 0.5

    matches = hungarian_match(annotations, gold_annotations, iou_threshold)
    n_matched = len(matches)

    mean_iou = 0.0
    class_acc = 0.0
    if n_matched:
        mean_iou = sum(m[2] for m in matches) / n_matched
        agreements = [
            annotations[p].get("class_label", "")
            == gold_annotations[g].get("class_label", "")
            for p, g, _ in matches
        ]
        class_acc = sum(agreements) / n_matched

    precision = n_matched / len(annotations) if annotations else 0.0
    recall = n_matched / len(gold_annotations)  # gold is non-empty here

    score = 0.40 * mean_iou + 0.30 * class_acc + 0.15 * precision + 0.15 * recall
    return max(0.0, min(1.0, score))
def grade_episode(
    initial_annotations: List[Dict],
    final_annotations: List[Dict],
    gold_annotations: List[Dict],
) -> float:
    """
    Grade an episode on 0.0–1.0 as the fraction of achievable quality
    improvement that was actually realized:

        score = (final_quality - initial_quality) / (1 - initial_quality)

    clamped to [0, 1]. When the starting state is already near-perfect
    (headroom < 0.01), the grade is pass/fail: 1.0 if quality held,
    0.5 if it degraded.
    """
    before = compute_annotation_quality(initial_annotations, gold_annotations)
    after = compute_annotation_quality(final_annotations, gold_annotations)

    headroom = 1.0 - before
    if headroom < 0.01:
        # Nearly perfect to begin with: full credit unless quality dropped.
        return 1.0 if after >= before - 0.01 else 0.5

    normalized = (after - before) / headroom
    return max(0.0, min(1.0, normalized))
def compute_step_reward(
    old_annotations: List[Dict],
    new_annotations: List[Dict],
    gold_annotations: List[Dict],
    action_type: str,
) -> float:
    """
    Compute a dense per-step reward from the quality delta.

    reward = 2 * (new_quality - old_quality) - 0.01 step cost,
    plus a 0.05 completion bonus when ``action_type`` is "submit".
    Rounded to 4 decimal places.
    """
    delta = (
        compute_annotation_quality(new_annotations, gold_annotations)
        - compute_annotation_quality(old_annotations, gold_annotations)
    )
    # Scale the improvement and charge a flat per-step efficiency penalty.
    reward = 2.0 * delta - 0.01
    if action_type == "submit":
        reward += 0.05  # small bonus for actually submitting
    return round(reward, 4)