Spaces:

Anuj424614
/

dataclean-env

Sleeping

App Files Files Community

dataclean-env / server /environment.py

Anuj424614

Upload folder using huggingface_hub

8345e43 verified about 2 months ago

raw

history blame contribute delete

44.5 kB

	"""DataClean Environment: core logic for the data cleaning RL environment.

	Implements reset(), step(), state property following OpenEnv spec.
	All 10 action handlers fully implemented. Delta reward system.
	"""

	from __future__ import annotations

	import copy
	import re
	from collections import Counter
	from datetime import datetime
	from typing import Any, Dict, List, Optional
	from uuid import uuid4

	import logging

	from openenv.core.env_server import Environment

	logger = logging.getLogger(__name__)

	from dataclean_env.models import (
	ActionResult,
	DataCleanAction,
	DataCleanObservation,
	DataCleanState,
	DataSummary,
	IssueGroup,
	QualityIssue,
	)
	from dataclean_env.server.grader import DataCleanGrader
	from dataclean_env.server.tasks import get_task, list_tasks


	# US state name -> abbreviation mapping
	US_STATES: Dict[str, str] = {
	"alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
	"california": "CA", "colorado": "CO", "connecticut": "CT", "delaware": "DE",
	"florida": "FL", "georgia": "GA", "hawaii": "HI", "idaho": "ID",
	"illinois": "IL", "indiana": "IN", "iowa": "IA", "kansas": "KS",
	"kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
	"massachusetts": "MA", "michigan": "MI", "minnesota": "MN",
	"mississippi": "MS", "missouri": "MO", "montana": "MT", "nebraska": "NE",
	"nevada": "NV", "new hampshire": "NH", "new jersey": "NJ",
	"new mexico": "NM", "new york": "NY", "north carolina": "NC",
	"north dakota": "ND", "ohio": "OH", "oklahoma": "OK", "oregon": "OR",
	"pennsylvania": "PA", "rhode island": "RI", "south carolina": "SC",
	"south dakota": "SD", "tennessee": "TN", "texas": "TX", "utah": "UT",
	"vermont": "VT", "virginia": "VA", "washington": "WA",
	"west virginia": "WV", "wisconsin": "WI", "wyoming": "WY",
	}

	# Date parsing formats (most specific first)
	DATE_PARSE_FORMATS = [
	"%Y-%m-%d", # 2023-01-15
	"%m/%d/%Y", # 01/15/2023
	"%d-%m-%Y", # 15-01-2023
	"%B %d, %Y", # January 15, 2023
	"%b %d, %Y", # Jan 15, 2023
	"%d %B %Y", # 15 January 2023
	"%d-%b-%Y", # 15-Jan-2023
	"%m-%d-%Y", # 01-15-2023
	"%B %d %Y", # January 15 2023
	"%d/%m/%Y", # 15/01/2023
	"%Y/%m/%d", # 2023/01/15
	]


	# Per-action costs for the intervention budget system
	ACTION_COSTS: Dict[str, float] = {
	"fix_value": 1.0,
	"delete_row": 6.0,
	"fill_missing": 1.0,
	"standardize_format": 2.0,
	"merge_duplicates": 4.0,
	"flag_anomaly": 0.5,
	"split_column": 3.0,
	"rename_column": 0.5,
	"cast_type": 2.0,
	"escalate_to_human": 0.5,
	"mark_complete": 0.0,
	}

	# Budget allocation per difficulty level
	DIFFICULTY_BUDGETS: Dict[str, float] = {
	"easy": 50.0,
	"medium": 100.0,
	"hard": 150.0,
	}

	# Per-step penalty in delta reward computation
	STEP_COST: float = 0.005

	# Default seed when none provided (deterministic fallback)
	DEFAULT_SEED: int = 42


	class DataCleanEnvironment(
	Environment[DataCleanAction, DataCleanObservation, DataCleanState]
	):
	"""Data Cleaning environment for training AI agents."""

	SUPPORTS_CONCURRENT_SESSIONS = False

	def __init__(self) -> None:
	super().__init__()
	self._state = DataCleanState()
	self._grader = DataCleanGrader()
	self._utility_probes: list = []
	self._ambiguous_cells: list = []
	self._task_name: str = ""
	self._last_grade_result = None
	self._next_row_id: int = 0

	def reset(
	self,
	seed: Optional[int] = None,
	episode_id: Optional[str] = None,
	**kwargs: Any,
	) -> DataCleanObservation:
	"""Initialize a new data cleaning episode."""
	task_id = kwargs.get("task_id", "easy_contacts")
	task = get_task(task_id) # raises KeyError/ValueError on unknown task_id

	actual_seed = seed if seed is not None else DEFAULT_SEED

	from dataclean_env.server.data_generator import generate_dirty_data

	dirty_data = generate_dirty_data(
	clean_data=task.ground_truth,
	corruptions=task.corruptions,
	seed=actual_seed,
	)

	# Assign stable row_ids (persist through delete/merge within episode)
	self._next_row_id = 0
	for row in dirty_data:
	row["_row_id"] = self._next_row_id
	self._next_row_id += 1

	# Compute initial score (dirty data vs ground truth)
	initial_score = self._grader.grade(
	final_data=dirty_data,
	ground_truth=task.ground_truth,
	original_data=dirty_data,
	action_history=[],
	schema=task.schema,
	flagged_cells=[],
	escalated_cells=[],
	ambiguous_cells=list(getattr(task, "ambiguous_cells", [])),
	utility_probes=list(getattr(task, "utility_probes", [])),
	).score

	budget = DIFFICULTY_BUDGETS.get(task.difficulty, 100.0)
	self._state = DataCleanState(
	episode_id=episode_id or str(uuid4()),
	step_count=0,
	task_id=task_id,
	difficulty=task.difficulty,
	current_data=copy.deepcopy(dirty_data),
	ground_truth=copy.deepcopy(task.ground_truth),
	original_dirty=copy.deepcopy(dirty_data),
	schema_def=task.schema,
	action_log=[],
	flagged_cells=[],
	escalated_cells=[],
	max_steps=task.max_steps,
	is_complete=False,
	previous_score=initial_score,
	initial_raw_score=initial_score,
	action_budget=budget,
	budget_spent=0.0,
	budget_remaining=budget,
	)
	self._task_name = task.name
	self._ambiguous_cells: List[tuple[str, str]] = list(
	getattr(task, "ambiguous_cells", [])
	)
	self._utility_probes = list(getattr(task, "utility_probes", []))

	return self._build_observation(reward=None, done=False)

	def step(
	self,
	action: DataCleanAction,
	timeout_s: Optional[float] = None,
	**kwargs: Any,
	) -> DataCleanObservation:
	"""Process one cleaning action. Returns observation with delta reward."""
	# Guard: episode already ended
	if self._state.is_complete:
	return self._build_observation(
	reward=0.0, done=True,
	)

	self._state.step_count += 1

	# Enforce budget: reject actions (except mark_complete) when exhausted
	cost = ACTION_COSTS.get(action.action_type, 1.0)
	if cost > 0 and self._state.budget_remaining < cost:
	result = {
	"action": action.action_type,
	"status": "error",
	"message": f"Budget exhausted ({self._state.budget_remaining:.1f} remaining, "
	f"action costs {cost:.1f})",
	"cells_modified": 0,
	}
	self._state.action_log.append(result)
	return self._build_observation(
	reward=-0.01, done=False,
	)

	# Execute the action
	result = self._execute_action(action)
	self._state.action_log.append(result)

	# Deduct action cost from budget
	self._state.budget_spent += cost
	self._state.budget_remaining -= cost

	# Check termination
	is_done = (
	action.action_type == "mark_complete"
	or self._state.step_count >= self._state.max_steps
	)
	self._state.is_complete = is_done

	# Compute reward
	if is_done:
	# Terminal: return absolute final score
	grade_result = self._grader.grade(
	final_data=self._state.current_data,
	ground_truth=self._state.ground_truth,
	original_data=self._state.original_dirty,
	action_history=self._state.action_log,
	schema=self._state.schema_def,
	flagged_cells=self._state.flagged_cells,
	budget_spent=self._state.budget_spent,
	action_budget=self._state.action_budget,
	escalated_cells=self._state.escalated_cells,
	ambiguous_cells=self._ambiguous_cells,
	utility_probes=self._utility_probes,
	)
	reward = grade_result.score
	self._last_grade_result = grade_result
	else:
	# Non-terminal: delta reward
	reward = self._compute_delta_reward(result)

	return self._build_observation(reward=reward, done=is_done)

	@property
	def state(self) -> DataCleanState:
	return self._state

	# ------------------------------------------------------------------
	# Delta Reward System
	# ------------------------------------------------------------------

	def _compute_delta_reward(self, action_result: Dict[str, Any]) -> float:
	"""Compute reward = current_score - previous_score - step_cost.

	Penalizes no-ops and errors explicitly.
	"""
	# Explicit penalties for bad actions
	if action_result.get("status") == "error":
	return -0.02
	if action_result.get("status") == "no_effect":
	return -0.01
	if action_result.get("cells_modified", 0) == 0 and action_result.get("action") not in ("flag_anomaly", "escalate_to_human"):
	return -0.01

	# Compute current score (raw, without normalization — delta is relative
	# so the baseline cancels out; normalization only at terminal grading)
	current_score = self._grader.grade(
	final_data=self._state.current_data,
	ground_truth=self._state.ground_truth,
	original_data=self._state.original_dirty,
	action_history=self._state.action_log,
	schema=self._state.schema_def,
	flagged_cells=self._state.flagged_cells,
	budget_spent=self._state.budget_spent,
	action_budget=self._state.action_budget,
	escalated_cells=self._state.escalated_cells,
	ambiguous_cells=self._ambiguous_cells,
	utility_probes=self._utility_probes,
	).score

	delta = current_score - self._state.previous_score - STEP_COST
	self._state.previous_score = current_score
	return round(delta, 4)

	# ------------------------------------------------------------------
	# Action Dispatch
	# ------------------------------------------------------------------

	@staticmethod
	def _normalize_action_params(action_type: str, params: Dict[str, Any]) -> Dict[str, Any]:
	"""Normalize common LLM param aliases to canonical names."""
	p = dict(params)
	# Universal aliases
	if "row" in p and "row_id" not in p:
	p["row_id"] = p.pop("row")
	if "col" in p and "column" not in p:
	p["column"] = p.pop("col")
	# Action-specific aliases
	if action_type == "fix_value" and "value" in p and "new_value" not in p:
	p["new_value"] = p.pop("value")
	if action_type == "merge_duplicates":
	if "row_id_1" in p and "row_id1" not in p:
	p["row_id1"] = p.pop("row_id_1")
	if "row_id_2" in p and "row_id2" not in p:
	p["row_id2"] = p.pop("row_id_2")
	if "row1" in p and "row_id1" not in p:
	p["row_id1"] = p.pop("row1")
	if "row2" in p and "row_id2" not in p:
	p["row_id2"] = p.pop("row2")
	return p

	def _execute_action(self, action: DataCleanAction) -> Dict[str, Any]:
	"""Dispatch action to the appropriate handler."""
	handler = getattr(self, f"_action_{action.action_type}", None)
	if handler is None:
	return {
	"action": action.action_type,
	"status": "error",
	"message": f"Unknown action type: {action.action_type}",
	"cells_modified": 0,
	}
	# Normalize param aliases before dispatching
	normalized_params = self._normalize_action_params(action.action_type, action.params)
	try:
	return handler(normalized_params)
	except (KeyError, TypeError, IndexError) as exc:
	return {
	"action": action.action_type,
	"status": "error",
	"message": f"Invalid params: {exc}",
	"cells_modified": 0,
	}
	except Exception as exc:
	logger.exception("Unexpected error in action handler %s", action.action_type)
	return {
	"action": action.action_type,
	"status": "error",
	"message": str(exc),
	"cells_modified": 0,
	}

	# ------------------------------------------------------------------
	# Row Lookup by Stable row_id
	# ------------------------------------------------------------------

	def _find_row_by_id(self, row_id: int) -> tuple[int, Dict[str, Any] \| None]:
	"""Find the list index and row dict for a given stable row_id.

	Returns (index, row_dict) or (-1, None) if not found.
	"""
	for i, row in enumerate(self._state.current_data):
	if row.get("_row_id") == row_id:
	return i, row
	return -1, None

	# ------------------------------------------------------------------
	# Action Handlers (10 total) — all use stable row_id
	# ------------------------------------------------------------------

	def _action_fix_value(self, params: Dict[str, Any]) -> Dict[str, Any]:
	row_id = int(params["row_id"])
	column = str(params["column"])
	new_value = params["new_value"]

	idx, row = self._find_row_by_id(row_id)
	if row is None:
	return {"action": "fix_value", "status": "error",
	"message": f"row_id {row_id} not found", "cells_modified": 0}
	if column not in row or column.startswith("_"):
	return {"action": "fix_value", "status": "error",
	"message": f"Column '{column}' not found", "cells_modified": 0}

	old_value = row[column]
	if str(old_value) == str(new_value):
	return {"action": "fix_value", "status": "no_effect",
	"message": f"Value unchanged at (row_id={row_id}, '{column}')", "cells_modified": 0}

	row[column] = new_value
	return {"action": "fix_value", "status": "success",
	"message": f"(row_id={row_id}, '{column}'): '{old_value}' -> '{new_value}'",
	"cells_modified": 1, "old_value": old_value, "new_value": new_value,
	"row_id": row_id, "column": column}

	def _action_delete_row(self, params: Dict[str, Any]) -> Dict[str, Any]:
	row_id = int(params["row_id"])
	idx, row = self._find_row_by_id(row_id)

	if row is None:
	return {"action": "delete_row", "status": "error",
	"message": f"row_id {row_id} not found", "cells_modified": 0}

	deleted = self._state.current_data.pop(idx)
	return {"action": "delete_row", "status": "success",
	"message": f"row_id={row_id} deleted",
	"cells_modified": len(deleted), "deleted_data": deleted,
	"row_id": row_id, "deleted_entity_id": deleted.get("_entity_id")}

	def _action_fill_missing(self, params: Dict[str, Any]) -> Dict[str, Any]:
	row_id = int(params["row_id"])
	column = str(params["column"])
	value = params["value"]

	idx, row = self._find_row_by_id(row_id)
	if row is None:
	return {"action": "fill_missing", "status": "error",
	"message": f"row_id {row_id} not found", "cells_modified": 0}
	if column not in row or column.startswith("_"):
	return {"action": "fill_missing", "status": "error",
	"message": f"Column '{column}' not found", "cells_modified": 0}

	current = row.get(column)
	if current is not None and str(current).strip() != "":
	return {"action": "fill_missing", "status": "error",
	"message": f"Cell (row_id={row_id}, '{column}') is not empty: '{current}'",
	"cells_modified": 0}

	row[column] = value
	return {"action": "fill_missing", "status": "success",
	"message": f"(row_id={row_id}, '{column}'): NULL -> '{value}'",
	"cells_modified": 1, "row_id": row_id, "column": column, "new_value": value}

	def _action_standardize_format(self, params: Dict[str, Any]) -> Dict[str, Any]:
	column = str(params["column"])
	format_type = str(params["format_type"])
	data = self._state.current_data
	modified = 0
	errors: List[str] = []

	for row in data:
	if column not in row or row[column] is None:
	continue
	try:
	new_val = self._apply_format(row[column], format_type)
	if str(new_val) != str(row[column]):
	row[column] = new_val
	modified += 1
	except (ValueError, TypeError) as exc:
	errors.append(f"row_id={row.get('_row_id', '?')}: {exc}")

	if modified == 0 and not errors:
	return {"action": "standardize_format", "status": "no_effect",
	"message": f"No changes needed in '{column}' for {format_type}",
	"cells_modified": 0}

	msg = f"Formatted {modified} cell(s) in '{column}' to {format_type}"
	if errors:
	msg += f". {len(errors)} parse failure(s)."
	return {"action": "standardize_format", "status": "success",
	"message": msg, "cells_modified": modified}

	def _action_merge_duplicates(self, params: Dict[str, Any]) -> Dict[str, Any]:
	row_id1 = int(params["row_id1"])
	row_id2 = int(params["row_id2"])
	strategy = str(params.get("strategy", "merge_prefer_nonnull"))

	if row_id1 == row_id2:
	return {"action": "merge_duplicates", "status": "error",
	"message": "Cannot merge a row with itself", "cells_modified": 0}

	idx1, r1 = self._find_row_by_id(row_id1)
	idx2, r2 = self._find_row_by_id(row_id2)

	if r1 is None or r2 is None:
	missing = row_id1 if r1 is None else row_id2
	return {"action": "merge_duplicates", "status": "error",
	"message": f"row_id {missing} not found", "cells_modified": 0}

	# Track entity IDs for penalty checking
	eid1 = r1.get("_entity_id", "")
	eid2 = r2.get("_entity_id", "")

	merged = self._merge_rows(r1, r2, strategy)
	# Merged row keeps the first row's entity_id and row_id
	merged["_entity_id"] = eid1
	merged["_row_id"] = r1["_row_id"]

	# Remove both, insert merged at first position
	data = self._state.current_data
	lo_idx = min(idx1, idx2)
	hi_idx = max(idx1, idx2)
	data.pop(hi_idx)
	data.pop(lo_idx)
	data.insert(lo_idx, merged)

	return {"action": "merge_duplicates", "status": "success",
	"message": f"Merged row_id={row_id1} and row_id={row_id2} using '{strategy}'",
	"cells_modified": len(merged),
	"row_id1": row_id1, "row_id2": row_id2,
	"entity_id1": eid1, "entity_id2": eid2,
	"deleted_entity_id": eid2,
	"strategy": strategy}

	def _action_flag_anomaly(self, params: Dict[str, Any]) -> Dict[str, Any]:
	row_id = int(params["row_id"])
	column = str(params["column"])
	reason = str(params.get("reason", ""))

	idx, row = self._find_row_by_id(row_id)
	if row is None:
	return {"action": "flag_anomaly", "status": "error",
	"message": f"row_id {row_id} not found", "cells_modified": 0}

	self._state.flagged_cells.append(
	{"row_id": row_id, "column": column, "reason": reason}
	)
	return {"action": "flag_anomaly", "status": "success",
	"message": f"Flagged (row_id={row_id}, '{column}'): {reason}",
	"cells_modified": 0}

	def _action_escalate_to_human(self, params: Dict[str, Any]) -> Dict[str, Any]:
	"""Escalate a cell to human review -- agent signals it is uncertain."""
	row_id = int(params["row_id"])
	column = str(params["column"])
	confidence = float(params.get("confidence", 0.5))
	reason = str(params.get("reason", ""))

	idx, row = self._find_row_by_id(row_id)
	if row is None:
	return {"action": "escalate_to_human", "status": "error",
	"message": f"row_id {row_id} not found", "cells_modified": 0}

	self._state.escalated_cells.append({
	"row_id": row_id, "column": column,
	"confidence": confidence, "reason": reason,
	})
	return {"action": "escalate_to_human", "status": "success",
	"message": f"Escalated (row_id={row_id}, '{column}'): {reason} (confidence={confidence})",
	"cells_modified": 0}

	def _action_split_column(self, params: Dict[str, Any]) -> Dict[str, Any]:
	column = str(params["column"])
	delimiter = str(params["delimiter"])
	new_names = list(params["new_names"])
	data = self._state.current_data
	modified = 0

	for row in data:
	if column not in row or row[column] is None:
	continue
	parts = str(row[column]).split(delimiter, maxsplit=len(new_names) - 1)
	for i, name in enumerate(new_names):
	row[name] = parts[i].strip() if i < len(parts) else None
	del row[column]
	modified += 1

	if modified == 0:
	return {"action": "split_column", "status": "no_effect",
	"message": f"Column '{column}' not found or all null", "cells_modified": 0}
	return {"action": "split_column", "status": "success",
	"message": f"Split '{column}' into {new_names} ({modified} rows)",
	"cells_modified": modified}

	def _action_rename_column(self, params: Dict[str, Any]) -> Dict[str, Any]:
	old_name = str(params["old_name"])
	new_name = str(params["new_name"])
	data = self._state.current_data

	if not data or old_name not in data[0]:
	return {"action": "rename_column", "status": "error",
	"message": f"Column '{old_name}' not found", "cells_modified": 0}
	if data and new_name in data[0]:
	return {"action": "rename_column", "status": "error",
	"message": f"Column '{new_name}' already exists", "cells_modified": 0}
	if new_name.startswith("_"):
	return {"action": "rename_column", "status": "error",
	"message": f"Column names starting with '_' are reserved", "cells_modified": 0}

	for row in data:
	if old_name in row:
	row[new_name] = row.pop(old_name)
	return {"action": "rename_column", "status": "success",
	"message": f"Renamed '{old_name}' -> '{new_name}'",
	"cells_modified": len(data)}

	def _action_cast_type(self, params: Dict[str, Any]) -> Dict[str, Any]:
	column = str(params["column"])
	target_type = str(params["target_type"])
	valid_types = {"int", "float", "str", "bool", "date"}
	if target_type not in valid_types:
	return {"action": "cast_type", "status": "error",
	"message": f"Unknown type '{target_type}'. Valid: {sorted(valid_types)}",
	"cells_modified": 0}
	data = self._state.current_data
	modified = 0
	nullified = 0

	for row in data:
	if column not in row or row[column] is None:
	continue
	try:
	row[column] = self._cast_value(row[column], target_type)
	modified += 1
	except (ValueError, TypeError):
	row[column] = None
	nullified += 1

	msg = f"Cast {modified} cell(s) in '{column}' to {target_type}"
	if nullified:
	msg += f" ({nullified} failed -> null)"
	status = "success" if modified > 0 else ("error" if nullified > 0 else "no_effect")
	return {"action": "cast_type", "status": status,
	"message": msg, "cells_modified": modified + nullified}

	def _action_mark_complete(self, params: Dict[str, Any]) -> Dict[str, Any]:
	return {"action": "mark_complete", "status": "success",
	"message": "Agent signaled completion", "cells_modified": 0}

	# ------------------------------------------------------------------
	# Format Standardization (8 types, fully implemented)
	# ------------------------------------------------------------------

	def _apply_format(self, value: Any, format_type: str) -> Any:
	"""Apply format transformation to a single value."""
	val_str = str(value).strip()
	if not val_str:
	return value

	if format_type == "date:YYYY-MM-DD":
	return self._format_date_iso(val_str)
	elif format_type == "phone:US":
	return self._format_phone_us(val_str)
	elif format_type == "phone:E164":
	return self._format_phone_e164(val_str)
	elif format_type == "name:title_case":
	return val_str.title()
	elif format_type == "email:lowercase":
	return val_str.lower()
	elif format_type == "zip:5digit":
	return self._format_zip_5digit(val_str)
	elif format_type == "currency:float":
	return self._format_currency_float(val_str)
	elif format_type == "state:abbreviation":
	return self._format_state_abbrev(val_str)
	else:
	raise ValueError(f"Unknown format type: {format_type}")

	def _format_date_iso(self, val: str) -> str:
	"""Parse various date formats and return YYYY-MM-DD."""
	for fmt in DATE_PARSE_FORMATS:
	try:
	dt = datetime.strptime(val.strip(), fmt)
	return dt.strftime("%Y-%m-%d")
	except ValueError:
	continue
	raise ValueError(f"Cannot parse date: '{val}'")

	def _format_phone_us(self, val: str) -> str:
	"""Normalize phone to (XXX) XXX-XXXX format."""
	digits = re.sub(r"\D", "", val)
	if digits.startswith("1") and len(digits) == 11:
	digits = digits[1:]
	if len(digits) != 10:
	raise ValueError(f"Phone must have 10 digits, got {len(digits)}: '{val}'")
	return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"

	def _format_phone_e164(self, val: str) -> str:
	"""Normalize phone to +1XXXXXXXXXX format."""
	digits = re.sub(r"\D", "", val)
	if digits.startswith("1") and len(digits) == 11:
	digits = digits[1:]
	if len(digits) != 10:
	raise ValueError(f"Phone must have 10 digits, got {len(digits)}: '{val}'")
	return f"+1{digits}"

	def _format_zip_5digit(self, val: str) -> str:
	"""Normalize ZIP to 5 digits (pad or truncate)."""
	digits = re.sub(r"\D", "", val.split("-")[0])
	if not digits:
	raise ValueError(f"No digits in ZIP: '{val}'")
	return digits[:5].zfill(5)

	def _format_currency_float(self, val: str) -> float:
	"""Parse currency string to float. '$1,234.56' -> 1234.56"""
	cleaned = val.replace("$", "").replace(",", "").strip()
	if cleaned.lower().endswith("k"):
	return float(cleaned[:-1]) * 1000
	return float(cleaned)

	def _format_state_abbrev(self, val: str) -> str:
	"""Convert full state name to 2-letter abbreviation."""
	if len(val) == 2 and val.upper() in US_STATES.values():
	return val.upper()
	lower = val.strip().lower()
	if lower in US_STATES:
	return US_STATES[lower]
	raise ValueError(f"Unknown state: '{val}'")

	# ------------------------------------------------------------------
	# Row Merging (all strategies)
	# ------------------------------------------------------------------

	def _merge_rows(self, r1: Dict, r2: Dict, strategy: str) -> Dict:
	"""Merge two rows according to the given strategy."""
	if strategy == "keep_first":
	return copy.deepcopy(r1)
	elif strategy == "keep_second":
	return copy.deepcopy(r2)
	elif strategy == "merge_prefer_nonnull":
	merged: Dict[str, Any] = {}
	for key in dict.fromkeys(list(r1.keys()) + list(r2.keys())):
	v1 = r1.get(key)
	v2 = r2.get(key)
	if v1 is not None and str(v1).strip():
	merged[key] = v1
	elif v2 is not None and str(v2).strip():
	merged[key] = v2
	else:
	merged[key] = v1
	return merged
	elif strategy == "merge_prefer_row1":
	merged = copy.deepcopy(r2)
	for key, val in r1.items():
	if val is not None and str(val).strip():
	merged[key] = val
	return merged
	elif strategy == "merge_prefer_row2":
	merged = copy.deepcopy(r1)
	for key, val in r2.items():
	if val is not None and str(val).strip():
	merged[key] = val
	return merged
	else:
	raise ValueError(f"Unknown merge strategy: '{strategy}'")

	# ------------------------------------------------------------------
	# Type Casting
	# ------------------------------------------------------------------

	def _cast_value(self, value: Any, target_type: str) -> Any:
	"""Cast a value to the target type."""
	val_str = str(value).strip()
	if target_type == "int":
	return int(float(val_str.replace(",", "").replace("$", "")))
	elif target_type == "float":
	return float(val_str.replace(",", "").replace("$", ""))
	elif target_type == "str":
	return val_str
	elif target_type == "bool":
	return val_str.lower() in ("true", "1", "yes", "y")
	elif target_type == "date":
	return self._format_date_iso(val_str)
	else:
	raise ValueError(f"Unknown target type: '{target_type}'")

	# ------------------------------------------------------------------
	# Observation Builder
	# ------------------------------------------------------------------

	def _build_observation(
	self, reward: float \| None, done: bool
	) -> DataCleanObservation:
	"""Build issue-first observation from current state."""
	data = self._state.current_data
	columns = list(data[0].keys()) if data else []

	# Filter out internal fields EXCEPT _row_id (renamed to row_id for agent)
	hidden = {"_entity_id"}
	visible_columns = ["row_id"] + [c for c in columns if c not in hidden and c != "_row_id"]
	rows = [
	[row.get("_row_id")] + [row.get(col) for col in columns if col not in hidden and col != "_row_id"]
	for row in data
	]

	# Quality analysis
	quality_issues = self._analyze_quality()
	issue_groups = self._group_issues(quality_issues)

	# Data summary — count nulls using INTERNAL column names (not aliases)
	internal_cols = [c for c in columns if c not in hidden and c != "_row_id"]
	null_count = sum(
	1 for row in data for col in internal_cols
	if row.get(col) is None
	)
	data_summary = DataSummary(
	row_count=len(data),
	column_count=len(visible_columns),
	total_cells=len(visible_columns) * len(data),
	null_count=null_count,
	issue_count=len(quality_issues),
	columns=visible_columns,
	dtypes={
	col: self._state.schema_def.get("expected_types", {}).get(col, "str")
	for col in visible_columns
	},
	)

	# Recent actions (last 5)
	recent = [
	ActionResult(
	action=a.get("action", ""),
	status=a.get("status", ""),
	message=a.get("message", ""),
	cells_modified=a.get("cells_modified", 0),
	)
	for a in self._state.action_log[-5:]
	]

	# Build metadata with grade breakdown when episode ends
	metadata: Dict[str, Any] = {}
	grade = getattr(self, "_last_grade_result", None)
	if done and grade is not None:
	metadata = {
	"grade_breakdown": {
	"accuracy": grade.accuracy,
	"completeness": grade.completeness,
	"format_consistency": grade.format_consistency,
	"row_correctness": grade.row_correctness,
	"efficiency": grade.efficiency,
	"utility_score": grade.utility_score,
	"penalties": grade.penalties,
	"bonuses": grade.bonuses,
	},
	"utility_details": grade.utility_details,
	}

	return DataCleanObservation(
	done=done,
	reward=reward,
	metadata=metadata,
	data_summary=data_summary,
	quality_issues=quality_issues[:20], # Cap at 20 for readability
	issue_groups=issue_groups,
	issues_remaining=len(quality_issues),
	columns=visible_columns,
	rows=rows,
	row_count=len(data),
	schema_info=self._state.schema_def,
	step_number=self._state.step_count,
	max_steps=self._state.max_steps,
	steps_remaining=self._state.max_steps - self._state.step_count,
	budget_spent=self._state.budget_spent,
	budget_remaining=self._state.budget_remaining,
	action_costs=ACTION_COSTS,
	last_action_result=recent[-1] if recent else None,
	recent_actions=recent,
	task_id=self._state.task_id,
	task_name=getattr(self, "_task_name", self._state.task_id),
	difficulty=self._state.difficulty,
	)

	# ------------------------------------------------------------------
	# Quality Analysis
	# ------------------------------------------------------------------

	def _analyze_quality(self) -> List[QualityIssue]:
	"""Analyze current data and return detected quality issues."""
	issues: List[QualityIssue] = []
	schema = self._state.schema_def
	data = self._state.current_data
	constraints = schema.get("constraints", {})

	for row in data:
	rid = row.get("_row_id", 0)
	for col in [c for c in row if not c.startswith("_")]:
	val = row.get(col)
	col_constraints = constraints.get(col, {})

	# Null check
	if val is None and col_constraints.get("not_null"):
	issues.append(QualityIssue(
	row_id=rid, column=col, issue_type="null",
	description="Required field is null",
	))

	if val is None:
	continue

	# Format check
	fmt = col_constraints.get("format")
	if fmt and not self._matches_format(val, fmt):
	issues.append(QualityIssue(
	row_id=rid, column=col, issue_type="format",
	description=f"Does not match format: {fmt}",
	suggestion=f"Use standardize_format('{col}', '{self._suggest_format_type(fmt)}')",
	))

	# Allowed values
	allowed = col_constraints.get("allowed_values")
	if allowed and str(val) not in allowed:
	issues.append(QualityIssue(
	row_id=rid, column=col, issue_type="type_violation",
	description=f"Value '{val}' not in allowed values",
	))

	# Duplicate detection
	issues.extend(self._detect_potential_duplicates())

	# Cross-field validation (for hard mode)
	issues.extend(self._detect_cross_field_issues())

	return issues

	def _detect_cross_field_issues(self) -> List[QualityIssue]:
	"""Detect cross-field inconsistencies: zip/city, date relationships, insurance ID prefixes."""
	issues: List[QualityIssue] = []
	data = self._state.current_data
	schema = self._state.schema_def
	cross_field_rules = schema.get("cross_field_rules", {})

	# Rule: zip_city_match — zip code should correspond to the city
	zip_city_map = cross_field_rules.get("zip_city_map", {})
	if zip_city_map:
	for row in data:
	rid = row.get("_row_id", 0)
	zip_val = str(row.get("zip", row.get("office_zip", ""))).strip()
	city_val = str(row.get("city", row.get("office_city", ""))).strip().lower()
	if zip_val in zip_city_map:
	expected_city = zip_city_map[zip_val].lower()
	if city_val and city_val != expected_city:
	issues.append(QualityIssue(
	row_id=rid, column="zip",
	issue_type="cross_field",
	description=f"ZIP '{zip_val}' should map to '{zip_city_map[zip_val]}', got '{row.get('city', row.get('office_city', ''))}'",
	suggestion=f"fix_value(row_id={rid}, column='city', new_value='{zip_city_map[zip_val]}')",
	))

	# Rule: date_order — dob must be before last_visit_date
	if "dob" in schema.get("expected_types", {}) and "last_visit_date" in schema.get("expected_types", {}):
	for row in data:
	rid = row.get("_row_id", 0)
	dob = row.get("dob")
	visit = row.get("last_visit_date")
	if dob and visit:
	try:
	dob_dt = datetime.strptime(str(dob), "%Y-%m-%d")
	visit_dt = datetime.strptime(str(visit), "%Y-%m-%d")
	if dob_dt > visit_dt:
	issues.append(QualityIssue(
	row_id=rid, column="dob",
	issue_type="cross_field",
	description=f"DOB '{dob}' is after last_visit_date '{visit}'",
	))
	if dob_dt > datetime.now():
	issues.append(QualityIssue(
	row_id=rid, column="dob",
	issue_type="cross_field",
	description=f"DOB '{dob}' is in the future",
	))
	except ValueError:
	pass

	# Rule: insurance_prefix — insurance_id prefix must match provider
	prefix_map = cross_field_rules.get("insurance_prefix_map", {})
	if prefix_map:
	for row in data:
	rid = row.get("_row_id", 0)
	provider = str(row.get("insurance_provider", "")).strip()
	ins_id = str(row.get("insurance_id", "")).strip()
	if provider and ins_id and provider in prefix_map:
	expected_prefix = prefix_map[provider]
	if not ins_id.startswith(expected_prefix):
	issues.append(QualityIssue(
	row_id=rid, column="insurance_id",
	issue_type="cross_field",
	description=f"Insurance ID '{ins_id}' should start with '{expected_prefix}' for provider '{provider}'",
	))

	return issues

	def _detect_potential_duplicates(self) -> List[QualityIssue]:
	"""Detect potential duplicate rows by email, phone, or name similarity."""
	issues: List[QualityIssue] = []
	data = self._state.current_data

	# Check by email
	email_index: Dict[str, List[int]] = {}
	for row in data:
	rid = row.get("_row_id", 0)
	email = row.get("email")
	if email and str(email).strip():
	key = str(email).strip().lower()
	email_index.setdefault(key, []).append(rid)
	for email, row_ids in email_index.items():
	if len(row_ids) > 1:
	issues.append(QualityIssue(
	row_id=row_ids[0], column="email", issue_type="duplicate",
	description=f"Rows {row_ids} share email '{email}'",
	suggestion=f"Consider merge_duplicates(row_id1={row_ids[0]}, row_id2={row_ids[1]}, strategy='merge_prefer_nonnull')",
	))

	# Check by phone (digit-only comparison)
	phone_index: Dict[str, List[int]] = {}
	for row in data:
	rid = row.get("_row_id", 0)
	phone = row.get("phone")
	if phone and str(phone).strip():
	digits = re.sub(r"\D", "", str(phone))
	if digits.startswith("1") and len(digits) == 11:
	digits = digits[1:]
	if len(digits) == 10:
	phone_index.setdefault(digits, []).append(rid)
	for digits, row_ids in phone_index.items():
	if len(row_ids) > 1:
	# Avoid duplicate issues if already flagged by email
	issues.append(QualityIssue(
	row_id=row_ids[0], column="phone", issue_type="duplicate",
	description=f"Rows {row_ids} share phone digits '{digits}'",
	))

	return issues

	def _group_issues(self, issues: List[QualityIssue]) -> List[IssueGroup]:
	"""Group issues by type for compact display."""
	type_counter: Dict[str, List[QualityIssue]] = {}
	for issue in issues:
	type_counter.setdefault(issue.issue_type, []).append(issue)

	return [
	IssueGroup(
	issue_type=itype,
	count=len(items),
	examples=items[:3], # Show max 3 examples per type
	)
	for itype, items in sorted(type_counter.items())
	]

	def _matches_format(self, value: Any, format_spec: str) -> bool:
	"""Check if a value matches the expected format."""
	val_str = str(value)
	format_patterns: Dict[str, str] = {
	"YYYY-MM-DD": r"^\d{4}-\d{2}-\d{2}$",
	"(XXX) XXX-XXXX": r"^$\d{3}$ \d{3}-\d{4}$",
	"email": r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$",
	"5_digit": r"^\d{5}$",
	"+1XXXXXXXXXX": r"^\+1\d{10}$",
	}
	pattern = format_patterns.get(format_spec)
	if pattern:
	return bool(re.match(pattern, val_str))
	return True

	def _suggest_format_type(self, format_spec: str) -> str:
	"""Suggest the standardize_format type for a given format spec."""
	mapping = {
	"YYYY-MM-DD": "date:YYYY-MM-DD",
	"(XXX) XXX-XXXX": "phone:US",
	"email": "email:lowercase",
	"5_digit": "zip:5digit",
	"+1XXXXXXXXXX": "phone:E164",
	}
	return mapping.get(format_spec, format_spec)

	# ------------------------------------------------------------------
	# Metadata
	# ------------------------------------------------------------------

	def get_metadata(self): # type: ignore[override]
	from openenv.core.env_server.types import EnvironmentMetadata

	return EnvironmentMetadata(
	name="dataclean_env",
	description=(
	"Data Cleaning environment for training AI agents to clean "
	"messy tabular data. Supports 3 difficulty levels (easy, medium, hard) "
	"with deterministic grading via cell-by-cell comparison against ground truth."
	),
	version="0.1.0",
	)