Spaces:

Aswini-Kumar
/

data-wrangler-env

Sleeping

App Files Files Community

data-wrangler-env / server /grader.py

Aswini-Kumar

Sync server/grader.py

c77741a verified about 2 months ago

raw

history blame contribute delete

12.5 kB

	"""
	Grader for DataWranglerEnv.

	Multi-dimensional scoring system that compares the agent's cleaned dataset
	against the ground truth. Produces deterministic scores in (0.001, 0.999).

	Dimensions:
	- Missing values fixed (20%)
	- Duplicates removed (15%)
	- Type correctness (15%)
	- Value accuracy (20%)
	- Data preservation (10%)
	- Constraint compliance (10%) — NEW: business rule satisfaction
	- Step efficiency (5%) — NEW: fewer steps = better
	- Golden row integrity (5%) — NEW: anti-exploit mechanism
	"""

	from typing import Any, Dict, List, Tuple

	import numpy as np
	import pandas as pd


	def compute_score(
	dirty_df: pd.DataFrame,
	current_df: pd.DataFrame,
	clean_df: pd.DataFrame,
	original_dirty_df: pd.DataFrame,
	issue_manifest: Dict[str, Any],
	step_count: int = 0,
	max_steps: int = 30,
	golden_indices: List[int] = None,
	) -> Tuple[float, Dict[str, float]]:
	"""Compute the composite data quality score.

	Args:
	dirty_df: The original dirty dataset (snapshot at reset)
	current_df: The agent's current working dataset
	clean_df: The ground truth clean dataset
	original_dirty_df: The very first dirty state (for reference)
	issue_manifest: Manifest of all injected issues
	step_count: Current step number
	max_steps: Maximum allowed steps
	golden_indices: Indices of golden rows in clean_df

	Returns:
	Tuple of (composite_score, dimension_scores_dict)
	"""
	scores = {}
	if golden_indices is None:
	golden_indices = issue_manifest.get("golden_indices", [])

	# ── Dimension 1: Missing Values Fixed (20%) ──────────────────────────
	original_missing = original_dirty_df.isnull().sum().sum()
	current_missing = current_df.isnull().sum().sum()
	target_missing = clean_df.isnull().sum().sum()

	if original_missing > target_missing:
	fixable = original_missing - target_missing
	fixed = max(0, original_missing - current_missing)
	scores["missing_fixed"] = min(1.0, fixed / fixable) if fixable > 0 else 1.0
	else:
	scores["missing_fixed"] = 1.0

	# ── Dimension 2: Duplicates Removed (15%) ─────────────────────────────
	original_dupes = original_dirty_df.duplicated().sum()
	current_dupes = current_df.duplicated().sum()
	target_dupes = clean_df.duplicated().sum()

	if original_dupes > target_dupes:
	fixable_dupes = original_dupes - target_dupes
	removed = max(0, original_dupes - current_dupes)
	scores["duplicates_removed"] = min(1.0, removed / fixable_dupes) if fixable_dupes > 0 else 1.0
	else:
	scores["duplicates_removed"] = 1.0

	# ── Dimension 3: Type Correctness (15%) ───────────────────────────────
	type_matches = 0
	type_total = len(clean_df.columns)

	for col in clean_df.columns:
	if col not in current_df.columns:
	continue
	clean_dtype = clean_df[col].dtype
	current_dtype = current_df[col].dtype

	if clean_dtype == current_dtype:
	type_matches += 1
	elif pd.api.types.is_numeric_dtype(clean_dtype) and pd.api.types.is_numeric_dtype(current_dtype):
	type_matches += 1
	elif pd.api.types.is_string_dtype(clean_dtype) and pd.api.types.is_string_dtype(current_dtype):
	type_matches += 1
	elif str(clean_dtype) == "object" and str(current_dtype) == "object":
	type_matches += 1

	scores["type_correctness"] = type_matches / type_total if type_total > 0 else 1.0

	# ── Dimension 4: Value Accuracy (20%) ─────────────────────────────────
	try:
	min_rows = min(len(current_df), len(clean_df))
	common_cols = [c for c in clean_df.columns if c in current_df.columns]
	if common_cols and min_rows > 0:
	clean_subset = clean_df[common_cols].head(min_rows).reset_index(drop=True)
	current_subset = current_df[common_cols].head(min_rows).reset_index(drop=True)

	total_cells = 0
	matching_cells = 0

	for col in common_cols:
	for i in range(min_rows):
	total_cells += 1
	clean_val = clean_subset.at[i, col]
	try:
	current_val = current_subset.at[i, col]
	except (KeyError, IndexError):
	continue

	if pd.isna(clean_val) and pd.isna(current_val):
	matching_cells += 1
	elif pd.isna(clean_val) or pd.isna(current_val):
	continue
	elif str(clean_val).strip().lower() == str(current_val).strip().lower():
	matching_cells += 1
	else:
	try:
	cv = float(clean_val)
	av = float(current_val)
	if cv != 0 and abs(cv - av) / abs(cv) < 0.01:
	matching_cells += 0.8
	except (ValueError, TypeError):
	pass

	scores["value_accuracy"] = matching_cells / total_cells if total_cells > 0 else 0.0
	else:
	scores["value_accuracy"] = 0.0
	except Exception:
	scores["value_accuracy"] = 0.0

	# ── Dimension 5: Data Preservation (10%) ──────────────────────────────
	expected_rows = len(clean_df)
	current_rows = len(current_df)

	if current_rows >= expected_rows:
	scores["data_preservation"] = 1.0
	elif current_rows == 0:
	scores["data_preservation"] = 0.0
	else:
	scores["data_preservation"] = max(0.0, current_rows / expected_rows)

	# ── Dimension 6: Constraint Compliance (10%) — NEW ────────────────────
	business_rules = issue_manifest.get("business_rules", [])
	if business_rules:
	rules_satisfied = 0
	rules_total = len(business_rules)
	for rule in business_rules:
	rule_type = rule.get("type", "")
	col = rule.get("column", "")

	if rule_type == "range" and col in current_df.columns:
	lo, hi = rule.get("min", float("-inf")), rule.get("max", float("inf"))
	numeric = pd.to_numeric(current_df[col], errors="coerce")
	valid = numeric.dropna()
	if len(valid) > 0:
	pct_ok = ((valid >= lo) & (valid <= hi)).mean()
	rules_satisfied += pct_ok

	elif rule_type == "not_null" and col in current_df.columns:
	pct_ok = 1.0 - (current_df[col].isna().sum() / max(1, len(current_df)))
	rules_satisfied += pct_ok

	elif rule_type == "cross_column":
	col_a = rule.get("column_a", "")
	col_b = rule.get("column_b", "")
	if col_a in current_df.columns and col_b in current_df.columns:
	a = pd.to_numeric(current_df[col_a], errors="coerce")
	b = pd.to_numeric(current_df[col_b], errors="coerce")
	valid_mask = a.notna() & b.notna()
	if valid_mask.sum() > 0:
	pct_ok = (a[valid_mask] > b[valid_mask]).mean()
	rules_satisfied += pct_ok

	elif rule_type == "categorical" and col in current_df.columns:
	allowed = set(rule.get("allowed_values", []))
	if allowed:
	non_null = current_df[col].dropna()
	if len(non_null) > 0:
	pct_ok = non_null.isin(allowed).mean()
	rules_satisfied += pct_ok

	elif rule_type == "pattern" and col in current_df.columns:
	pat = rule.get("pattern", "")
	if pat:
	non_null = current_df[col].dropna().astype(str)
	if len(non_null) > 0:
	pct_ok = non_null.str.match(pat).mean()
	rules_satisfied += pct_ok

	scores["constraint_compliance"] = rules_satisfied / rules_total if rules_total > 0 else 0.8
	else:
	scores["constraint_compliance"] = 0.8 # Default for tasks without rules

	# ── Dimension 7: Step Efficiency (5%) — NEW ───────────────────────────
	if step_count > 0 and max_steps > 0:
	efficiency = 1.0 - (step_count / max_steps)
	scores["step_efficiency"] = max(0.1, min(1.0, efficiency * 1.5))
	else:
	scores["step_efficiency"] = 0.5

	# ── Dimension 8: Golden Row Integrity (5%) — NEW ──────────────────────
	if golden_indices and len(current_df) > 0:
	golden_ok = 0
	golden_total = len(golden_indices)
	for gi in golden_indices:
	if gi >= len(clean_df):
	continue
	golden_row = clean_df.iloc[gi]
	# Check if this golden row still exists in current_df (approximately)
	found = False
	for _, row in current_df.iterrows():
	match_count = 0
	total_check = 0
	for col in clean_df.columns:
	if col not in current_df.columns:
	continue
	total_check += 1
	gv = golden_row[col]
	cv = row[col]
	if pd.isna(gv) and pd.isna(cv):
	match_count += 1
	elif not pd.isna(gv) and not pd.isna(cv):
	if str(gv).strip().lower() == str(cv).strip().lower():
	match_count += 1
	if total_check > 0 and match_count / total_check > 0.8:
	found = True
	break
	if found:
	golden_ok += 1
	scores["golden_row_integrity"] = golden_ok / golden_total if golden_total > 0 else 1.0
	else:
	scores["golden_row_integrity"] = 1.0

	# ── Composite Score ──────────────────────────────────────────────────
	weights = {
	"missing_fixed": 0.20,
	"duplicates_removed": 0.15,
	"type_correctness": 0.15,
	"value_accuracy": 0.20,
	"data_preservation": 0.10,
	"constraint_compliance": 0.10,
	"step_efficiency": 0.05,
	"golden_row_integrity": 0.05,
	}

	composite = sum(scores[dim] * weights[dim] for dim in weights)
	# Clamp to open interval (0, 1)
	composite = max(0.001, min(0.999, composite))

	return composite, scores


	def compute_step_reward(
	before_df: pd.DataFrame,
	after_df: pd.DataFrame,
	clean_df: pd.DataFrame,
	original_dirty_df: pd.DataFrame,
	issue_manifest: Dict[str, Any],
	command: str,
	data_modified: bool,
	) -> float:
	"""Compute the reward for a single step."""
	cmd = command.strip().split()[0].lower() if command.strip() else ""

	# Diagnostic commands get small positive reward
	diagnostic_cmds = {"help", "view", "profile", "profile_column", "find_missing",
	"find_duplicates", "find_outliers", "check_rules", "history"}
	if cmd in diagnostic_cmds:
	return 0.02

	# Validate and undo get small reward
	if cmd in ("validate", "undo"):
	return 0.01

	# Submit doesn't give step reward (final score is computed separately)
	if cmd == "submit":
	return 0.001

	if not data_modified:
	return -0.01

	# For data-modifying commands, compare improvement
	before_score, _ = compute_score(
	original_dirty_df, before_df, clean_df, original_dirty_df, issue_manifest
	)
	after_score, _ = compute_score(
	original_dirty_df, after_df, clean_df, original_dirty_df, issue_manifest
	)

	improvement = after_score - before_score

	if improvement > 0:
	return min(0.15, improvement * 2.0 + 0.03)
	elif improvement < -0.01:
	return max(-0.20, improvement * 2.0)
	else:
	return 0.001