Spaces:
Sleeping
Sleeping
Sync server/dataset_generator.py
Browse files- server/dataset_generator.py +78 -1
server/dataset_generator.py
CHANGED
|
@@ -93,13 +93,20 @@ COMMANDS_HELP = """Available commands:
|
|
| 93 |
find_missing - Show missing value counts per column
|
| 94 |
find_duplicates [COL1,COL2] - Find duplicate rows (optional subset of columns)
|
| 95 |
find_outliers COL - Statistical outlier detection for a numeric column
|
|
|
|
|
|
|
| 96 |
fill_missing COL STRATEGY [VALUE] - Fill nulls (mean/median/mode/constant VALUE)
|
| 97 |
remove_duplicates [COL1,COL2] [KEEP] - Drop duplicates (keep: first/last/none)
|
| 98 |
fix_dtype COL TYPE - Cast column to type (int/float/str/datetime)
|
| 99 |
replace COL OLD NEW - Replace specific values in a column
|
|
|
|
| 100 |
standardize COL METHOD - Normalize formatting (lowercase/uppercase/titlecase/strip)
|
| 101 |
remove_rows COL CONDITION VALUE - Remove rows (CONDITION: equals/not_equals/less_than/greater_than/contains)
|
| 102 |
clip COL LOWER UPPER - Clip numeric values to [LOWER, UPPER]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
validate - Check current quality score without submitting
|
| 104 |
submit - Finalize and grade the cleaned dataset (ends episode)
|
| 105 |
"""
|
|
@@ -128,6 +135,15 @@ def generate_task_1_easy(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
|
|
| 128 |
"city": city, "signup_date": signup_date,
|
| 129 |
})
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
clean_df = pd.DataFrame(rows)
|
| 132 |
|
| 133 |
# Create dirty copy
|
|
@@ -149,7 +165,6 @@ def generate_task_1_easy(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
|
|
| 149 |
issues["duplicates"] = [{"source_row": idx} for idx in dupe_indices]
|
| 150 |
|
| 151 |
# Inject city name typos (3 random rows)
|
| 152 |
-
available_cities = [c for c in CITIES if c in CITY_TYPOS]
|
| 153 |
typo_indices = rng.sample(range(n_rows), min(3, n_rows))
|
| 154 |
for idx in typo_indices:
|
| 155 |
original_city = dirty_df.at[idx, "city"]
|
|
@@ -161,6 +176,20 @@ def generate_task_1_easy(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
|
|
| 161 |
# Convert age to float (because of NaN)
|
| 162 |
dirty_df["age"] = pd.to_numeric(dirty_df["age"], errors="coerce")
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
return dirty_df, clean_df, issues
|
| 165 |
|
| 166 |
|
|
@@ -190,6 +219,14 @@ def generate_task_2_medium(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFra
|
|
| 190 |
"customer_id": customer_id, "region": region,
|
| 191 |
})
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
clean_df = pd.DataFrame(rows)
|
| 194 |
dirty_df = clean_df.copy()
|
| 195 |
|
|
@@ -245,6 +282,23 @@ def generate_task_2_medium(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFra
|
|
| 245 |
dirty_df.at[idx, "price"] = round(rng.uniform(50000, 99999), 2)
|
| 246 |
issues["outliers"].append({"row": idx, "column": "price"})
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
return dirty_df, clean_df, issues
|
| 249 |
|
| 250 |
|
|
@@ -414,6 +468,29 @@ def generate_task_3_hard(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
|
|
| 414 |
except (ValueError, TypeError):
|
| 415 |
pass
|
| 416 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
return dirty_df, clean_df, issues
|
| 418 |
|
| 419 |
|
|
|
|
| 93 |
find_missing - Show missing value counts per column
|
| 94 |
find_duplicates [COL1,COL2] - Find duplicate rows (optional subset of columns)
|
| 95 |
find_outliers COL - Statistical outlier detection for a numeric column
|
| 96 |
+
check_rules - Check business rule violations
|
| 97 |
+
history - Show operation history (data lineage)
|
| 98 |
fill_missing COL STRATEGY [VALUE] - Fill nulls (mean/median/mode/constant VALUE)
|
| 99 |
remove_duplicates [COL1,COL2] [KEEP] - Drop duplicates (keep: first/last/none)
|
| 100 |
fix_dtype COL TYPE - Cast column to type (int/float/str/datetime)
|
| 101 |
replace COL OLD NEW - Replace specific values in a column
|
| 102 |
+
regex_replace COL PATTERN REPLACEMENT - Regex-based replacement
|
| 103 |
standardize COL METHOD - Normalize formatting (lowercase/uppercase/titlecase/strip)
|
| 104 |
remove_rows COL CONDITION VALUE - Remove rows (CONDITION: equals/not_equals/less_than/greater_than/contains)
|
| 105 |
clip COL LOWER UPPER - Clip numeric values to [LOWER, UPPER]
|
| 106 |
+
rename_column OLD_NAME NEW_NAME - Rename a column
|
| 107 |
+
drop_column COL - Remove a column
|
| 108 |
+
sort COL [asc|desc] - Sort data by column
|
| 109 |
+
undo - Undo the last data-modifying operation
|
| 110 |
validate - Check current quality score without submitting
|
| 111 |
submit - Finalize and grade the cleaned dataset (ends episode)
|
| 112 |
"""
|
|
|
|
| 135 |
"city": city, "signup_date": signup_date,
|
| 136 |
})
|
| 137 |
|
| 138 |
+
# ── Red herring rows: valid-but-suspicious data ──
|
| 139 |
+
# These SHOULD NOT be cleaned — tests that agents don't over-clean
|
| 140 |
+
rows.append({"name": "Null Fisher", "email": "null.fisher42@example.com",
|
| 141 |
+
"age": 45, "city": "New York", "signup_date": "2023-06-15"})
|
| 142 |
+
rows.append({"name": "None Yamada", "email": "none.yamada7@example.com",
|
| 143 |
+
"age": 28, "city": "Los Angeles", "signup_date": "2022-12-01"})
|
| 144 |
+
rows.append({"name": "Na Lee", "email": "na.lee99@example.com",
|
| 145 |
+
"age": 0, "city": "Chicago", "signup_date": "2024-01-01"}) # Baby
|
| 146 |
+
|
| 147 |
clean_df = pd.DataFrame(rows)
|
| 148 |
|
| 149 |
# Create dirty copy
|
|
|
|
| 165 |
issues["duplicates"] = [{"source_row": idx} for idx in dupe_indices]
|
| 166 |
|
| 167 |
# Inject city name typos (3 random rows)
|
|
|
|
| 168 |
typo_indices = rng.sample(range(n_rows), min(3, n_rows))
|
| 169 |
for idx in typo_indices:
|
| 170 |
original_city = dirty_df.at[idx, "city"]
|
|
|
|
| 176 |
# Convert age to float (because of NaN)
|
| 177 |
dirty_df["age"] = pd.to_numeric(dirty_df["age"], errors="coerce")
|
| 178 |
|
| 179 |
+
# ── Golden rows: select rows that must not be damaged (anti-exploit) ──
|
| 180 |
+
golden_indices = rng.sample(range(len(clean_df)), min(5, len(clean_df)))
|
| 181 |
+
issues["golden_indices"] = golden_indices
|
| 182 |
+
|
| 183 |
+
# ── Business rules ──
|
| 184 |
+
issues["business_rules"] = [
|
| 185 |
+
{"type": "range", "column": "age", "min": 0, "max": 120,
|
| 186 |
+
"description": "Age must be between 0 and 120"},
|
| 187 |
+
{"type": "not_null", "column": "email",
|
| 188 |
+
"description": "Email address is required"},
|
| 189 |
+
{"type": "pattern", "column": "email", "pattern": r".*@.*\..*",
|
| 190 |
+
"description": "Email must contain @ and domain"},
|
| 191 |
+
]
|
| 192 |
+
|
| 193 |
return dirty_df, clean_df, issues
|
| 194 |
|
| 195 |
|
|
|
|
| 219 |
"customer_id": customer_id, "region": region,
|
| 220 |
})
|
| 221 |
|
| 222 |
+
# ── Red herring rows: legitimate edge cases that should NOT be cleaned ──
|
| 223 |
+
rows.append({"transaction_id": "TXN-FREE01", "product": "Promotional Sticker",
|
| 224 |
+
"category": "Accessories", "price": 0.00, "quantity": 1,
|
| 225 |
+
"date": "2023-07-04", "customer_id": "CUST-1500", "region": "North"})
|
| 226 |
+
rows.append({"transaction_id": "TXN-BULK01", "product": "Ethernet Cable",
|
| 227 |
+
"category": "Networking", "price": 2.99, "quantity": 500,
|
| 228 |
+
"date": "2023-11-24", "customer_id": "CUST-1001", "region": "Central"})
|
| 229 |
+
|
| 230 |
clean_df = pd.DataFrame(rows)
|
| 231 |
dirty_df = clean_df.copy()
|
| 232 |
|
|
|
|
| 282 |
dirty_df.at[idx, "price"] = round(rng.uniform(50000, 99999), 2)
|
| 283 |
issues["outliers"].append({"row": idx, "column": "price"})
|
| 284 |
|
| 285 |
+
# ── Golden rows ──
|
| 286 |
+
golden_indices = rng.sample(range(len(clean_df)), min(8, len(clean_df)))
|
| 287 |
+
issues["golden_indices"] = golden_indices
|
| 288 |
+
|
| 289 |
+
# ── Business rules ──
|
| 290 |
+
issues["business_rules"] = [
|
| 291 |
+
{"type": "range", "column": "price", "min": 0, "max": 10000,
|
| 292 |
+
"description": "Price must be between $0 and $10,000"},
|
| 293 |
+
{"type": "range", "column": "quantity", "min": 1, "max": 1000,
|
| 294 |
+
"description": "Quantity must be between 1 and 1000"},
|
| 295 |
+
{"type": "not_null", "column": "transaction_id",
|
| 296 |
+
"description": "Transaction ID is required"},
|
| 297 |
+
{"type": "categorical", "column": "region",
|
| 298 |
+
"allowed_values": ["North", "South", "East", "West", "Central"],
|
| 299 |
+
"description": "Region must be a valid US region"},
|
| 300 |
+
]
|
| 301 |
+
|
| 302 |
return dirty_df, clean_df, issues
|
| 303 |
|
| 304 |
|
|
|
|
| 468 |
except (ValueError, TypeError):
|
| 469 |
pass
|
| 470 |
|
| 471 |
+
# ── Golden rows: important rows for anti-exploit ──
|
| 472 |
+
golden_indices = rng.sample(range(len(clean_df)), min(15, len(clean_df)))
|
| 473 |
+
issues["golden_indices"] = golden_indices
|
| 474 |
+
|
| 475 |
+
# ── Business rules (healthcare domain constraints) ──
|
| 476 |
+
issues["business_rules"] = [
|
| 477 |
+
{"type": "range", "column": "height_cm", "min": 30, "max": 250,
|
| 478 |
+
"description": "Height must be between 30cm and 250cm"},
|
| 479 |
+
{"type": "range", "column": "weight_kg", "min": 1, "max": 300,
|
| 480 |
+
"description": "Weight must be between 1kg and 300kg"},
|
| 481 |
+
{"type": "cross_column", "column_a": "bp_systolic", "column_b": "bp_diastolic",
|
| 482 |
+
"relation": "greater_than",
|
| 483 |
+
"description": "Systolic BP must be greater than Diastolic BP"},
|
| 484 |
+
{"type": "categorical", "column": "gender",
|
| 485 |
+
"allowed_values": ["Male", "Female"],
|
| 486 |
+
"description": "Gender must be 'Male' or 'Female'"},
|
| 487 |
+
{"type": "categorical", "column": "blood_type",
|
| 488 |
+
"allowed_values": ["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"],
|
| 489 |
+
"description": "Blood type must be a valid ABO+Rh type"},
|
| 490 |
+
{"type": "not_null", "column": "patient_id",
|
| 491 |
+
"description": "Patient ID is required"},
|
| 492 |
+
]
|
| 493 |
+
|
| 494 |
return dirty_df, clean_df, issues
|
| 495 |
|
| 496 |
|