from __future__ import annotations import numpy as np import pandas as pd NULL_DISGUISES = ["NULL", "N/A", "UNKNOWN", "-", "", "0", "none"] def generate_dataset(task_id: int, seed: int) -> tuple[dict[str, pd.DataFrame], dict]: """ Returns: tables_dict: {table_name: DataFrame} gold_faults: dict """ rng = np.random.default_rng(seed) if task_id == 1: return _task1(rng, seed) if task_id == 2: return _task2(rng) if task_id == 3: return _task3(rng) if task_id == 4: return _task4(rng) raise ValueError(f"Unknown task_id {task_id}") def _task1(rng: np.random.Generator, seed: int) -> tuple[dict[str, pd.DataFrame], dict]: n = 200 df = pd.DataFrame( { "customer_id": range(1001, 1001 + n), "email": [f"user{i}@example.com" for i in range(n)], "name": [f"Name {i}" for i in range(n)], "signup_date": pd.date_range("2023-01-01", periods=n, freq="D").astype(str), "country": rng.choice(["US", "UK", "IN", "DE", "FR"], n).tolist(), } ) real_null_cid = int(rng.integers(3, 7)) null_cid_idx = rng.choice(n, real_null_cid, replace=False) df.loc[null_cid_idx, "customer_id"] = None real_null_email = int(rng.integers(8, 15)) null_email_idx = rng.choice(n, real_null_email, replace=False) df.loc[null_email_idx, "email"] = None disguised_null_email = int(rng.integers(4, 9)) avail = [i for i in range(n) if i not in set(null_email_idx.tolist())] dis_idx = rng.choice(avail, disguised_null_email, replace=False) df.loc[dis_idx, "email"] = rng.choice(NULL_DISGUISES, disguised_null_email).tolist() dup_count = int(rng.integers(10, 19)) dup_src = rng.choice(n, dup_count, replace=True) dups = df.iloc[dup_src].copy() df = pd.concat([df, dups], ignore_index=True) near_dup_count = int(rng.integers(5, 9)) near_src = rng.choice(n, near_dup_count, replace=False) near_dups = df.iloc[near_src].copy() near_dups["country"] = rng.choice(["US", "UK", "IN", "DE", "FR"], near_dup_count).tolist() df = pd.concat([df, near_dups], ignore_index=True) df = df.sample(frac=1, random_state=seed).reset_index(drop=True) gold = { "null_customer_id": real_null_cid, "null_email_real": real_null_email, "null_email_disguised": disguised_null_email, "null_email_total": real_null_email + disguised_null_email, "exact_duplicate_rows": dup_count, "near_duplicate_rows": near_dup_count, } return {"customers": df}, gold def _task2(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]: n = 300 amounts_float = (rng.random(n) * 500 + 5).round(2) dates = pd.date_range("2023-01-01", periods=n, freq="h")[:n] df = pd.DataFrame( { "order_id": range(5001, 5001 + n), "customer_id": rng.integers(1001, 1201, n).tolist(), "amount": [f"${a}" for a in amounts_float], "order_date": [d.strftime("%b %d %Y") for d in dates], "status": rng.choice(["pending", "shipped", "delivered", "cancelled"], n).tolist(), "quantity": rng.integers(1, 20, n).tolist(), } ) neg_qty = int(rng.integers(5, 11)) neg_idx = rng.choice(n, neg_qty, replace=False) df.loc[neg_idx, "quantity"] = rng.integers(-10, 0, neg_qty).tolist() bad_amt = int(rng.integers(3, 8)) bad_idx = rng.choice([i for i in range(n) if i not in set(neg_idx.tolist())], bad_amt, replace=False) df.loc[bad_idx, "amount"] = rng.choice(["N/A", "#ERR", "TBD", "--"], bad_amt).tolist() gold = { "amount_type_violation": True, "date_format_violation": True, "negative_quantity_rows": neg_qty, "unparseable_amount_rows": bad_amt, } return {"orders": df}, gold def _task3(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]: def make_txn(n: int, rg: np.random.Generator, mean_amt: float, cats: list[str], id_start: int) -> pd.DataFrame: return pd.DataFrame( { "txn_id": range(id_start, id_start + n), "user_id": rg.integers(2001, 2501, n).tolist(), "amount": rg.normal(mean_amt, 15, n).round(2).tolist(), "category": rg.choice(cats, n).tolist(), "ts": pd.date_range("2024-01-01", periods=n, freq="h")[:n].astype(str).tolist(), } ) base_cats = ["food", "travel", "retail", "health", "utilities"] new_cats = ["crypto", "NFT"] baseline = make_txn(500, rng, mean_amt=50.0, cats=base_cats, id_start=10001) current_rng = np.random.default_rng(int(rng.integers(9999))) current = make_txn(500, current_rng, mean_amt=78.0, cats=base_cats + new_cats, id_start=10501) new_uid_count = int(0.15 * 500) new_uid_idx = current_rng.choice(500, new_uid_count, replace=False) current.loc[new_uid_idx, "user_id"] = current_rng.integers(3000, 3500, new_uid_count).tolist() gold = { "amount_mean_shift": True, "baseline_mean": 50.0, "current_mean": float(current["amount"].mean()), "new_categories": new_cats, "referential_drift_pct": new_uid_count / 500, } return {"transactions_baseline": baseline, "transactions_current": current}, gold def _task4(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]: nc = 200 customers = pd.DataFrame( { "customer_id": range(1, nc + 1), "name": [f"Customer {i}" for i in range(nc)], "tier": rng.choice(["bronze", "silver", "gold"], nc).tolist(), } ) no = 500 orphan_count = int(rng.integers(15, 22)) valid_cids = list(range(1, nc + 1)) order_cids = rng.choice(valid_cids, no - orphan_count).tolist() orphan_cids = rng.integers(9000, 9999, orphan_count).tolist() all_cids = order_cids + orphan_cids rng.shuffle(all_cids) order_dates = pd.date_range("2024-01-01", periods=no, freq="h")[:no] ship_dates = [d + pd.Timedelta(days=int(rng.integers(1, 10))) for d in order_dates] temp_viol = int(rng.integers(10, 16)) temp_idx = rng.choice(no, temp_viol, replace=False) for i in temp_idx: ship_dates[i] = order_dates[i] - pd.Timedelta(days=int(rng.integers(1, 5))) orders = pd.DataFrame( { "order_id": range(1, no + 1), "customer_id": all_cids, "order_date": order_dates.astype(str).tolist(), "ship_date": [str(d) for d in ship_dates], "order_total": (rng.random(no) * 400 + 20).round(2).tolist(), } ) nl = 1500 li_order_ids = rng.choice(range(1, no + 1), nl).tolist() li_prices = (rng.random(nl) * 100 + 5).round(2) li_qtys = rng.integers(1, 6, nl) line_items = pd.DataFrame( { "line_id": range(1, nl + 1), "order_id": li_order_ids, "product": rng.choice(["Widget A", "Widget B", "Widget C", "Widget D"], nl).tolist(), "price": li_prices.tolist(), "quantity": li_qtys.tolist(), "subtotal": (li_prices * li_qtys).round(2).tolist(), } ) agg_mismatch = int(rng.integers(5, 9)) mismatch_order_ids = rng.choice(range(1, no + 1), agg_mismatch, replace=False) for oid in mismatch_order_ids: idx = orders[orders["order_id"] == oid].index if len(idx): orders.loc[idx[0], "order_total"] = round(float(orders.loc[idx[0], "order_total"]) * rng.uniform(1.3, 2.0), 2) gold = { "orphaned_order_count": orphan_count, "temporal_violation_count": temp_viol, "aggregate_mismatch_count": agg_mismatch, "total_orders": no, } return {"customers": customers, "orders": orders, "line_items": line_items}, gold