Spaces:
Sleeping
Sleeping
"""
Dataset Generator for Data-Centric RL Environment.

Generates corrupted sklearn classification datasets with known ground truth.
Each task has deterministic corruptions via a seeded random.Random.

CRITICAL: Always produces TWO copies:
    ground_truth - frozen, only read by the grader
    working_copy - the only thing the agent can mutate
"""
| import random | |
| from copy import deepcopy | |
| from typing import Any, Dict, Tuple | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.datasets import make_classification | |
# -- Column metadata schema ---------------------------------------------------
def _make_col_meta(expected_dtype: str, valid_range=None,
                   valid_categories=None, is_nullable: bool = False) -> Dict:
    """Bundle the validation schema for a single column into a plain dict.

    Args:
        expected_dtype: dtype name the column is expected to have
            (e.g. "float64").
        valid_range: optional range descriptor, stored unchanged.
        valid_categories: optional collection of allowed values, stored
            unchanged.
        is_nullable: flag stored unchanged under "is_nullable".

    Returns:
        A dict with exactly the keys "expected_dtype", "valid_range",
        "valid_categories" and "is_nullable", in that order.
    """
    return dict(
        expected_dtype=expected_dtype,
        valid_range=valid_range,
        valid_categories=valid_categories,
        is_nullable=is_nullable,
    )
# -- Task configurations ------------------------------------------------------
# Per-task settings, keyed by task name.
#   n_samples / n_features / n_classes / n_informative: shape of the synthetic
#       classification problem.
#   budget / target_accuracy / baseline_accuracy: task parameters stored
#       alongside the shape settings.
#   description: human-readable summary of the injected issues.
TASK_CONFIGS = {
    "task_0_tutorial": dict(
        n_samples=100,
        n_features=4,
        n_classes=2,
        n_informative=3,
        budget=30,
        target_accuracy=0.73,
        baseline_accuracy=0.62,
        description="Single-issue tutorial. Fix missing values in 'age' to win.",
    ),
    "task_1_easy": dict(
        n_samples=200,
        n_features=5,
        n_classes=2,
        n_informative=4,
        budget=25,
        target_accuracy=0.79,
        baseline_accuracy=0.63,
        description="Missing values + mild class imbalance.",
    ),
    "task_2_medium": dict(
        n_samples=500,
        n_features=7,
        n_classes=3,
        n_informative=5,
        budget=40,
        target_accuracy=0.74,
        baseline_accuracy=0.58,
        description="Missing values, duplicates, class imbalance, type error.",
    ),
    "task_3_hard": dict(
        n_samples=900,
        n_features=10,
        n_classes=4,
        n_informative=7,
        budget=60,
        target_accuracy=0.71,
        baseline_accuracy=0.54,
        description="Missing values, duplicates, imbalance, type errors, outliers, cross-column errors.",
    ),
}
# -- Generic feature names ----------------------------------------------------
# Pool of generic column names; a task with k features uses the first k.
FEATURE_NAMES = [
    "age",
    "income",
    "score",
    "tenure",
    "balance",
    "transactions",
    "risk_level",
    "credit",
    "spend",
    "savings",
]
def _build_column_meta(feature_cols: list, task: str) -> Dict[str, Dict]:
    """Build the per-column schema for the feature columns plus "target".

    Args:
        feature_cols: names of the feature columns.
        task: task name; currently not consulted.

    Returns:
        Mapping of column name to its metadata dict. Every feature is a
        float64 column with valid range (-10.0, 10.0), except "age", which
        uses (0.0, 100.0). "target" is appended last as an int64 column.
    """
    default_range = (-10.0, 10.0)
    age_range = (0.0, 100.0)  # 'age' is the one column with its own range

    schema = {
        name: _make_col_meta(
            "float64",
            valid_range=age_range if name == "age" else default_range,
        )
        for name in feature_cols
    }
    schema["target"] = _make_col_meta("int64", valid_categories=None)
    return schema
# -- Core generator -----------------------------------------------------------
def generate_dataset(task: str, seed: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any]]:
    """
    Build the clean / corrupted DataFrame pair for one task.

    Args:
        task: key into TASK_CONFIGS (an unknown key raises KeyError).
        seed: seeds make_classification and both RNG objects, so the same
            (task, seed) pair reproduces the same output.

    Returns:
        ground_truth - clean DataFrame (frozen)
        working_copy - corrupted DataFrame (agent mutates this)
        metadata     - task config + column metadata + original_length
                       (row count of working_copy after corruption)
    """
    task_cfg = TASK_CONFIGS[task]
    n_features = task_cfg["n_features"]
    n_informative = task_cfg["n_informative"]

    py_rng = random.Random(seed)
    numpy_rng = np.random.RandomState(seed)

    # -- Generate clean classification data --
    # All but one of the non-informative features are generated as redundant.
    features, labels = make_classification(
        n_samples=task_cfg["n_samples"],
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=max(0, n_features - n_informative - 1),
        n_classes=task_cfg["n_classes"],
        n_clusters_per_class=1,
        weights=None,
        random_state=seed,
    )

    feature_cols = FEATURE_NAMES[:n_features]
    clean = pd.DataFrame(features, columns=feature_cols)
    clean["target"] = labels

    # Min-max rescale 'age' onto roughly [18, 80] for plausibility; the
    # 1e-9 term guards against a zero-width range.
    if "age" in clean.columns:
        age = clean["age"]
        lo, hi = age.min(), age.max()
        clean["age"] = ((age - lo) / (hi - lo + 1e-9)) * 62 + 18

    # Two independent copies: only working_copy is corrupted below.
    ground_truth = deepcopy(clean)
    working_copy = deepcopy(clean)

    # -- Inject corruptions into working_copy only --
    _inject_corruptions(working_copy, task, task_cfg, py_rng, numpy_rng, seed)

    metadata = {
        **task_cfg,
        "task": task,
        "seed": seed,
        "feature_cols": feature_cols,
        "col_meta": _build_column_meta(feature_cols, task),
        "original_length": len(working_copy),
        "class_names": [str(label) for label in sorted(working_copy["target"].unique())],
    }
    return ground_truth, working_copy, metadata
def _inject_corruptions(df: pd.DataFrame, task: str, cfg: dict,
                        rng: random.Random, np_rng: np.random.RandomState,
                        seed: int):
    """Apply the corruption recipe for `task` to `df` in place.

    Args:
        df: working copy to corrupt; its last column is treated as the
            target and every column before it as a feature.
        task: task name selecting the recipe; an unrecognised name leaves
            `df` untouched.
        cfg: task configuration (not consulted here).
        rng: seeded random.Random shared by all injectors.
        np_rng: NumPy RNG (not consulted here).
        seed: forwarded to the class-imbalance injector.
    """
    # Feature column names are captured once, before any injector runs.
    feature_cols = df.columns[:-1].tolist()

    if task == "task_0_tutorial":
        # Single issue: 20% missing in age only
        _inject_missing(df, ["age"], frac=0.20, rng=rng)
        return

    if task == "task_1_easy":
        # Missing values 15% + mild class imbalance
        _inject_missing(df, feature_cols[:2], frac=0.15, rng=rng)
        _inject_class_imbalance(df, ratio=0.60, rng=rng, seed=seed)
        return

    if task == "task_2_medium":
        _inject_missing(df, feature_cols[:3], frac=0.12, rng=rng)
        _inject_duplicates(df, frac=0.05, rng=rng)
        _inject_class_imbalance(df, ratio=0.55, rng=rng, seed=seed)
        _inject_type_error(df, feature_cols[0], rng=rng, frac=0.04)
        return

    if task == "task_3_hard":
        _inject_missing(df, feature_cols[:4], frac=0.10, rng=rng)
        _inject_duplicates(df, frac=0.05, rng=rng)
        _inject_class_imbalance(df, ratio=0.50, rng=rng, seed=seed)
        _inject_type_error(df, feature_cols[0], rng=rng, frac=0.03)
        _inject_outliers(df, feature_cols[1], rng=rng, frac=0.03)
        _inject_cross_column_errors(df, feature_cols[2], feature_cols[3], rng=rng, frac=0.02)
def _inject_missing(df: pd.DataFrame, cols: list, frac: float, rng: random.Random):
    """Set a random `frac` of the rows to NaN in each listed column, in place.

    Rows are drawn independently per column. Names in `cols` that are not
    columns of `df` are skipped without consuming any randomness.

    Args:
        df: DataFrame to mutate.
        cols: column names to corrupt.
        frac: fraction of rows to blank per column; int(len(df) * frac) rows
            are chosen.
        rng: seeded random.Random used to pick the rows.

    Raises:
        ValueError: from rng.sample if `frac` asks for more rows than exist
            (or a negative count).
    """
    n_rows = len(df)
    n_missing = int(n_rows * frac)
    if n_missing == 0:
        return

    for col in cols:
        if col not in df.columns:
            continue
        # The sample is drawn over row *positions*, so write positionally
        # (iloc). Label-based .loc only matches on a default RangeIndex.
        positions = rng.sample(range(n_rows), n_missing)
        df.iloc[positions, df.columns.get_loc(col)] = np.nan
def _inject_duplicates(df: pd.DataFrame, frac: float, rng: random.Random):
    """Append randomly chosen duplicate rows to `df`, in place.

    max(1, int(len(df) * frac)) rows are drawn with replacement and appended
    after the existing rows. The index is rebuilt as 0..n-1.

    Args:
        df: DataFrame to mutate; an empty frame is left untouched.
        frac: fraction of the current row count to add as duplicates
            (at least one row is always added to a non-empty frame).
        rng: seeded random.Random used to pick the rows.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to duplicate; rng.choices on an empty range would raise.
        return

    n_dups = max(1, int(n_rows * frac))
    dup_positions = rng.choices(range(n_rows), k=n_dups)
    combined = pd.concat([df, df.iloc[dup_positions]], ignore_index=True)

    # Callers hold a reference to `df`, so it cannot simply be rebound.
    # Empty the existing object and refill it column by column.
    df.drop(df.index, inplace=True)
    df.drop(df.columns, axis=1, inplace=True)
    for col in combined.columns:
        df[col] = combined[col].values
    df.reset_index(drop=True, inplace=True)
def _inject_class_imbalance(df: pd.DataFrame, ratio: float,
                            rng: random.Random, seed: int):
    """Down-sample the other classes so the lowest label makes up `ratio` of rows.

    The lowest value in the "target" column (class 0 for 0-based labels) is
    treated as the majority class and kept whole. Rows of all other classes
    are dropped at random until the majority share is approximately `ratio`.
    If the majority share is already at or above `ratio`, `df` is left
    untouched. When rows are dropped, the index is rebuilt as 0..n-1.

    NOTE(review): the earlier logic picked the first-seen label and only
    trimmed it when it already exceeded `ratio`, which never fires on
    balanced input. Accuracy thresholds calibrated against that behaviour
    (see TASK_CONFIGS) may need re-checking.

    Args:
        df: DataFrame to mutate; must have a "target" column. Index labels
            are assumed to be unique.
        ratio: desired majority-class share, strictly between 0 and 1.
        rng: seeded random.Random used to pick the rows to drop.
        seed: unused; kept for interface compatibility.

    Raises:
        ValueError: if `ratio` is not strictly between 0 and 1.
    """
    if not 0.0 < ratio < 1.0:
        raise ValueError(f"ratio must be strictly between 0 and 1, got {ratio!r}")

    target_col = "target"
    # Sort so the choice of majority class does not depend on row order.
    classes = sorted(df[target_col].unique())
    if len(classes) < 2:
        return

    is_major = df[target_col] == classes[0]
    n_major = int(is_major.sum())
    minority_idx = df.index[~is_major].tolist()

    # n_major / (n_major + keep) == ratio  =>  keep = n_major * (1 - ratio) / ratio
    n_minority_keep = round(n_major * (1.0 - ratio) / ratio)
    n_drop = len(minority_idx) - n_minority_keep
    if n_drop <= 0:
        return

    df.drop(rng.sample(minority_idx, n_drop), inplace=True)
    df.reset_index(drop=True, inplace=True)
def _inject_type_error(df: pd.DataFrame, col: str, rng: random.Random, frac: float):
    """Replace some values in `col` with the string 'ERR' to simulate type errors.

    The column is converted to object dtype so it can hold the marker.
    max(1, int(len(df) * frac)) rows are affected, capped at the row count.

    Args:
        df: DataFrame to mutate in place; an empty frame is left untouched.
        col: column to corrupt; ignored if not present.
        rng: seeded random.Random used to pick the rows.
        frac: fraction of rows to corrupt.
    """
    n_rows = len(df)
    if col not in df.columns or n_rows == 0:
        return

    n_bad = min(n_rows, max(1, int(n_rows * frac)))
    positions = rng.sample(range(n_rows), n_bad)

    df[col] = df[col].astype(object)
    # Positions come from range(len(df)), so write positionally. A
    # label-based write would append rows when the index is not 0..n-1.
    df.iloc[positions, df.columns.get_loc(col)] = "ERR"
def _inject_outliers(df: pd.DataFrame, col: str, rng: random.Random, frac: float):
    """Overwrite some values in `col` with extreme outliers (+/-999.0), in place.

    max(1, int(len(df) * frac)) rows are affected, capped at the row count.
    The sign of each outlier is drawn independently.

    Args:
        df: DataFrame to mutate; an empty frame is left untouched.
        col: column to corrupt; ignored if not present.
        rng: seeded random.Random used to pick rows and signs.
        frac: fraction of rows to corrupt.
    """
    n_rows = len(df)
    if col not in df.columns or n_rows == 0:
        return

    n_outliers = min(n_rows, max(1, int(n_rows * frac)))
    col_pos = df.columns.get_loc(col)
    # All rows are sampled first, then one sign is drawn per row, which keeps
    # the RNG stream stable. Writes are positional (iat) because the sample
    # is over range(len(df)), not over index labels.
    for row_pos in rng.sample(range(n_rows), n_outliers):
        df.iat[row_pos, col_pos] = rng.choice([999.0, -999.0])
def _inject_cross_column_errors(df: pd.DataFrame, col_a: str, col_b: str,
                                rng: random.Random, frac: float):
    """Force col_a < col_b on some rows where col_a >= col_b held, in place.

    max(1, int(len(df) * frac)) rows are sampled (capped at the row count).
    On each sampled row with a >= b, the cells become
    col_a = b - 1.0 and col_b = a + 1.0. Sampled rows where a < b, where
    either value is NaN, or where a value cannot be read as a float are
    left unchanged.

    Args:
        df: DataFrame to mutate; an empty frame is left untouched.
        col_a: first column; ignored (with col_b) if either is absent.
        col_b: second column.
        rng: seeded random.Random used to pick the rows.
        frac: fraction of rows to sample.
    """
    n_rows = len(df)
    if col_a not in df.columns or col_b not in df.columns or n_rows == 0:
        return

    n_rows_hit = min(n_rows, max(1, int(n_rows * frac)))
    pos_a = df.columns.get_loc(col_a)
    pos_b = df.columns.get_loc(col_b)

    # Rows are sampled by position, so cells are read and written with iat.
    for row_pos in rng.sample(range(n_rows), n_rows_hit):
        try:
            a = float(df.iat[row_pos, pos_a])
            b = float(df.iat[row_pos, pos_b])
            if a >= b:
                df.iat[row_pos, pos_a] = b - 1.0
                df.iat[row_pos, pos_b] = a + 1.0
        except (ValueError, TypeError):
            # Best effort: a non-numeric cell (e.g. an injected "ERR"
            # marker) means this row is skipped.
            continue