Aswini-Kumar committed on
Commit
6451040
·
verified ·
1 Parent(s): 10647f4

Sync server/dataset_generator.py

Browse files
Files changed (1) hide show
  1. server/dataset_generator.py +78 -1
server/dataset_generator.py CHANGED
@@ -93,13 +93,20 @@ COMMANDS_HELP = """Available commands:
93
  find_missing - Show missing value counts per column
94
  find_duplicates [COL1,COL2] - Find duplicate rows (optional subset of columns)
95
  find_outliers COL - Statistical outlier detection for a numeric column
 
 
96
  fill_missing COL STRATEGY [VALUE] - Fill nulls (mean/median/mode/constant VALUE)
97
  remove_duplicates [COL1,COL2] [KEEP] - Drop duplicates (keep: first/last/none)
98
  fix_dtype COL TYPE - Cast column to type (int/float/str/datetime)
99
  replace COL OLD NEW - Replace specific values in a column
 
100
  standardize COL METHOD - Normalize formatting (lowercase/uppercase/titlecase/strip)
101
  remove_rows COL CONDITION VALUE - Remove rows (CONDITION: equals/not_equals/less_than/greater_than/contains)
102
  clip COL LOWER UPPER - Clip numeric values to [LOWER, UPPER]
 
 
 
 
103
  validate - Check current quality score without submitting
104
  submit - Finalize and grade the cleaned dataset (ends episode)
105
  """
@@ -128,6 +135,15 @@ def generate_task_1_easy(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
128
  "city": city, "signup_date": signup_date,
129
  })
130
 
 
 
 
 
 
 
 
 
 
131
  clean_df = pd.DataFrame(rows)
132
 
133
  # Create dirty copy
@@ -149,7 +165,6 @@ def generate_task_1_easy(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
149
  issues["duplicates"] = [{"source_row": idx} for idx in dupe_indices]
150
 
151
  # Inject city name typos (3 random rows)
152
- available_cities = [c for c in CITIES if c in CITY_TYPOS]
153
  typo_indices = rng.sample(range(n_rows), min(3, n_rows))
154
  for idx in typo_indices:
155
  original_city = dirty_df.at[idx, "city"]
@@ -161,6 +176,20 @@ def generate_task_1_easy(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
161
  # Convert age to float (because of NaN)
162
  dirty_df["age"] = pd.to_numeric(dirty_df["age"], errors="coerce")
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  return dirty_df, clean_df, issues
165
 
166
 
@@ -190,6 +219,14 @@ def generate_task_2_medium(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFra
190
  "customer_id": customer_id, "region": region,
191
  })
192
 
 
 
 
 
 
 
 
 
193
  clean_df = pd.DataFrame(rows)
194
  dirty_df = clean_df.copy()
195
 
@@ -245,6 +282,23 @@ def generate_task_2_medium(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFra
245
  dirty_df.at[idx, "price"] = round(rng.uniform(50000, 99999), 2)
246
  issues["outliers"].append({"row": idx, "column": "price"})
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  return dirty_df, clean_df, issues
249
 
250
 
@@ -414,6 +468,29 @@ def generate_task_3_hard(rng: random.Random) -> Tuple[pd.DataFrame, pd.DataFrame
414
  except (ValueError, TypeError):
415
  pass
416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  return dirty_df, clean_df, issues
418
 
419
 
 
93
  find_missing - Show missing value counts per column
94
  find_duplicates [COL1,COL2] - Find duplicate rows (optional subset of columns)
95
  find_outliers COL - Statistical outlier detection for a numeric column
96
+ check_rules - Check business rule violations
97
+ history - Show operation history (data lineage)
98
  fill_missing COL STRATEGY [VALUE] - Fill nulls (mean/median/mode/constant VALUE)
99
  remove_duplicates [COL1,COL2] [KEEP] - Drop duplicates (keep: first/last/none)
100
  fix_dtype COL TYPE - Cast column to type (int/float/str/datetime)
101
  replace COL OLD NEW - Replace specific values in a column
102
+ regex_replace COL PATTERN REPLACEMENT - Regex-based replacement
103
  standardize COL METHOD - Normalize formatting (lowercase/uppercase/titlecase/strip)
104
  remove_rows COL CONDITION VALUE - Remove rows (CONDITION: equals/not_equals/less_than/greater_than/contains)
105
  clip COL LOWER UPPER - Clip numeric values to [LOWER, UPPER]
106
+ rename_column OLD_NAME NEW_NAME - Rename a column
107
+ drop_column COL - Remove a column
108
+ sort COL [asc|desc] - Sort data by column
109
+ undo - Undo the last data-modifying operation
110
  validate - Check current quality score without submitting
111
  submit - Finalize and grade the cleaned dataset (ends episode)
112
  """
 
135
  "city": city, "signup_date": signup_date,
136
  })
137
 
138
+ # ── Red herring rows: valid-but-suspicious data ──
139
+ # These SHOULD NOT be cleaned — they test that agents don't over-clean
140
+ rows.append({"name": "Null Fisher", "email": "null.fisher42@example.com",
141
+ "age": 45, "city": "New York", "signup_date": "2023-06-15"})
142
+ rows.append({"name": "None Yamada", "email": "none.yamada7@example.com",
143
+ "age": 28, "city": "Los Angeles", "signup_date": "2022-12-01"})
144
+ rows.append({"name": "Na Lee", "email": "na.lee99@example.com",
145
+ "age": 0, "city": "Chicago", "signup_date": "2024-01-01"}) # Baby
146
+
147
  clean_df = pd.DataFrame(rows)
148
 
149
  # Create dirty copy
 
165
  issues["duplicates"] = [{"source_row": idx} for idx in dupe_indices]
166
 
167
  # Inject city name typos (3 random rows)
 
168
  typo_indices = rng.sample(range(n_rows), min(3, n_rows))
169
  for idx in typo_indices:
170
  original_city = dirty_df.at[idx, "city"]
 
176
  # Convert age to float (because of NaN)
177
  dirty_df["age"] = pd.to_numeric(dirty_df["age"], errors="coerce")
178
 
179
+ # ── Golden rows: select rows that must not be damaged (anti-exploit) ──
180
+ golden_indices = rng.sample(range(len(clean_df)), min(5, len(clean_df)))
181
+ issues["golden_indices"] = golden_indices
182
+
183
+ # ── Business rules ──
184
+ issues["business_rules"] = [
185
+ {"type": "range", "column": "age", "min": 0, "max": 120,
186
+ "description": "Age must be between 0 and 120"},
187
+ {"type": "not_null", "column": "email",
188
+ "description": "Email address is required"},
189
+ {"type": "pattern", "column": "email", "pattern": r".*@.*\..*",
190
+ "description": "Email must contain @ and domain"},
191
+ ]
192
+
193
  return dirty_df, clean_df, issues
194
 
195
 
 
219
  "customer_id": customer_id, "region": region,
220
  })
221
 
222
+ # ── Red herring rows: legitimate edge cases that should NOT be cleaned ──
223
+ rows.append({"transaction_id": "TXN-FREE01", "product": "Promotional Sticker",
224
+ "category": "Accessories", "price": 0.00, "quantity": 1,
225
+ "date": "2023-07-04", "customer_id": "CUST-1500", "region": "North"})
226
+ rows.append({"transaction_id": "TXN-BULK01", "product": "Ethernet Cable",
227
+ "category": "Networking", "price": 2.99, "quantity": 500,
228
+ "date": "2023-11-24", "customer_id": "CUST-1001", "region": "Central"})
229
+
230
  clean_df = pd.DataFrame(rows)
231
  dirty_df = clean_df.copy()
232
 
 
282
  dirty_df.at[idx, "price"] = round(rng.uniform(50000, 99999), 2)
283
  issues["outliers"].append({"row": idx, "column": "price"})
284
 
285
+ # ── Golden rows ──
286
+ golden_indices = rng.sample(range(len(clean_df)), min(8, len(clean_df)))
287
+ issues["golden_indices"] = golden_indices
288
+
289
+ # ── Business rules ──
290
+ issues["business_rules"] = [
291
+ {"type": "range", "column": "price", "min": 0, "max": 10000,
292
+ "description": "Price must be between $0 and $10,000"},
293
+ {"type": "range", "column": "quantity", "min": 1, "max": 1000,
294
+ "description": "Quantity must be between 1 and 1000"},
295
+ {"type": "not_null", "column": "transaction_id",
296
+ "description": "Transaction ID is required"},
297
+ {"type": "categorical", "column": "region",
298
+ "allowed_values": ["North", "South", "East", "West", "Central"],
299
+ "description": "Region must be a valid US region"},
300
+ ]
301
+
302
  return dirty_df, clean_df, issues
303
 
304
 
 
468
  except (ValueError, TypeError):
469
  pass
470
 
471
+ # ── Golden rows: important rows for anti-exploit ──
472
+ golden_indices = rng.sample(range(len(clean_df)), min(15, len(clean_df)))
473
+ issues["golden_indices"] = golden_indices
474
+
475
+ # ── Business rules (healthcare domain constraints) ──
476
+ issues["business_rules"] = [
477
+ {"type": "range", "column": "height_cm", "min": 30, "max": 250,
478
+ "description": "Height must be between 30cm and 250cm"},
479
+ {"type": "range", "column": "weight_kg", "min": 1, "max": 300,
480
+ "description": "Weight must be between 1kg and 300kg"},
481
+ {"type": "cross_column", "column_a": "bp_systolic", "column_b": "bp_diastolic",
482
+ "relation": "greater_than",
483
+ "description": "Systolic BP must be greater than Diastolic BP"},
484
+ {"type": "categorical", "column": "gender",
485
+ "allowed_values": ["Male", "Female"],
486
+ "description": "Gender must be 'Male' or 'Female'"},
487
+ {"type": "categorical", "column": "blood_type",
488
+ "allowed_values": ["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"],
489
+ "description": "Blood type must be a valid ABO+Rh type"},
490
+ {"type": "not_null", "column": "patient_id",
491
+ "description": "Patient ID is required"},
492
+ ]
493
+
494
  return dirty_df, clean_df, issues
495
 
496