data-quality-env / env /dataset_gen.py
Hemanth Kunta
Meta hackathon submission
91e7690
from __future__ import annotations
import numpy as np
import pandas as pd
NULL_DISGUISES = ["NULL", "N/A", "UNKNOWN", "-", "", "0", "none"]
def generate_dataset(task_id: int, seed: int) -> tuple[dict[str, pd.DataFrame], dict]:
"""
Returns:
tables_dict: {table_name: DataFrame}
gold_faults: dict
"""
rng = np.random.default_rng(seed)
if task_id == 1:
return _task1(rng, seed)
if task_id == 2:
return _task2(rng)
if task_id == 3:
return _task3(rng)
if task_id == 4:
return _task4(rng)
raise ValueError(f"Unknown task_id {task_id}")
def _task1(rng: np.random.Generator, seed: int) -> tuple[dict[str, pd.DataFrame], dict]:
n = 200
df = pd.DataFrame(
{
"customer_id": range(1001, 1001 + n),
"email": [f"user{i}@example.com" for i in range(n)],
"name": [f"Name {i}" for i in range(n)],
"signup_date": pd.date_range("2023-01-01", periods=n, freq="D").astype(str),
"country": rng.choice(["US", "UK", "IN", "DE", "FR"], n).tolist(),
}
)
real_null_cid = int(rng.integers(3, 7))
null_cid_idx = rng.choice(n, real_null_cid, replace=False)
df.loc[null_cid_idx, "customer_id"] = None
real_null_email = int(rng.integers(8, 15))
null_email_idx = rng.choice(n, real_null_email, replace=False)
df.loc[null_email_idx, "email"] = None
disguised_null_email = int(rng.integers(4, 9))
avail = [i for i in range(n) if i not in set(null_email_idx.tolist())]
dis_idx = rng.choice(avail, disguised_null_email, replace=False)
df.loc[dis_idx, "email"] = rng.choice(NULL_DISGUISES, disguised_null_email).tolist()
dup_count = int(rng.integers(10, 19))
dup_src = rng.choice(n, dup_count, replace=True)
dups = df.iloc[dup_src].copy()
df = pd.concat([df, dups], ignore_index=True)
near_dup_count = int(rng.integers(5, 9))
near_src = rng.choice(n, near_dup_count, replace=False)
near_dups = df.iloc[near_src].copy()
near_dups["country"] = rng.choice(["US", "UK", "IN", "DE", "FR"], near_dup_count).tolist()
df = pd.concat([df, near_dups], ignore_index=True)
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
gold = {
"null_customer_id": real_null_cid,
"null_email_real": real_null_email,
"null_email_disguised": disguised_null_email,
"null_email_total": real_null_email + disguised_null_email,
"exact_duplicate_rows": dup_count,
"near_duplicate_rows": near_dup_count,
}
return {"customers": df}, gold
def _task2(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]:
n = 300
amounts_float = (rng.random(n) * 500 + 5).round(2)
dates = pd.date_range("2023-01-01", periods=n, freq="h")[:n]
df = pd.DataFrame(
{
"order_id": range(5001, 5001 + n),
"customer_id": rng.integers(1001, 1201, n).tolist(),
"amount": [f"${a}" for a in amounts_float],
"order_date": [d.strftime("%b %d %Y") for d in dates],
"status": rng.choice(["pending", "shipped", "delivered", "cancelled"], n).tolist(),
"quantity": rng.integers(1, 20, n).tolist(),
}
)
neg_qty = int(rng.integers(5, 11))
neg_idx = rng.choice(n, neg_qty, replace=False)
df.loc[neg_idx, "quantity"] = rng.integers(-10, 0, neg_qty).tolist()
bad_amt = int(rng.integers(3, 8))
bad_idx = rng.choice([i for i in range(n) if i not in set(neg_idx.tolist())], bad_amt, replace=False)
df.loc[bad_idx, "amount"] = rng.choice(["N/A", "#ERR", "TBD", "--"], bad_amt).tolist()
gold = {
"amount_type_violation": True,
"date_format_violation": True,
"negative_quantity_rows": neg_qty,
"unparseable_amount_rows": bad_amt,
}
return {"orders": df}, gold
def _task3(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]:
def make_txn(n: int, rg: np.random.Generator, mean_amt: float, cats: list[str], id_start: int) -> pd.DataFrame:
return pd.DataFrame(
{
"txn_id": range(id_start, id_start + n),
"user_id": rg.integers(2001, 2501, n).tolist(),
"amount": rg.normal(mean_amt, 15, n).round(2).tolist(),
"category": rg.choice(cats, n).tolist(),
"ts": pd.date_range("2024-01-01", periods=n, freq="h")[:n].astype(str).tolist(),
}
)
base_cats = ["food", "travel", "retail", "health", "utilities"]
new_cats = ["crypto", "NFT"]
baseline = make_txn(500, rng, mean_amt=50.0, cats=base_cats, id_start=10001)
current_rng = np.random.default_rng(int(rng.integers(9999)))
current = make_txn(500, current_rng, mean_amt=78.0, cats=base_cats + new_cats, id_start=10501)
new_uid_count = int(0.15 * 500)
new_uid_idx = current_rng.choice(500, new_uid_count, replace=False)
current.loc[new_uid_idx, "user_id"] = current_rng.integers(3000, 3500, new_uid_count).tolist()
gold = {
"amount_mean_shift": True,
"baseline_mean": 50.0,
"current_mean": float(current["amount"].mean()),
"new_categories": new_cats,
"referential_drift_pct": new_uid_count / 500,
}
return {"transactions_baseline": baseline, "transactions_current": current}, gold
def _task4(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]:
nc = 200
customers = pd.DataFrame(
{
"customer_id": range(1, nc + 1),
"name": [f"Customer {i}" for i in range(nc)],
"tier": rng.choice(["bronze", "silver", "gold"], nc).tolist(),
}
)
no = 500
orphan_count = int(rng.integers(15, 22))
valid_cids = list(range(1, nc + 1))
order_cids = rng.choice(valid_cids, no - orphan_count).tolist()
orphan_cids = rng.integers(9000, 9999, orphan_count).tolist()
all_cids = order_cids + orphan_cids
rng.shuffle(all_cids)
order_dates = pd.date_range("2024-01-01", periods=no, freq="h")[:no]
ship_dates = [d + pd.Timedelta(days=int(rng.integers(1, 10))) for d in order_dates]
temp_viol = int(rng.integers(10, 16))
temp_idx = rng.choice(no, temp_viol, replace=False)
for i in temp_idx:
ship_dates[i] = order_dates[i] - pd.Timedelta(days=int(rng.integers(1, 5)))
orders = pd.DataFrame(
{
"order_id": range(1, no + 1),
"customer_id": all_cids,
"order_date": order_dates.astype(str).tolist(),
"ship_date": [str(d) for d in ship_dates],
"order_total": (rng.random(no) * 400 + 20).round(2).tolist(),
}
)
nl = 1500
li_order_ids = rng.choice(range(1, no + 1), nl).tolist()
li_prices = (rng.random(nl) * 100 + 5).round(2)
li_qtys = rng.integers(1, 6, nl)
line_items = pd.DataFrame(
{
"line_id": range(1, nl + 1),
"order_id": li_order_ids,
"product": rng.choice(["Widget A", "Widget B", "Widget C", "Widget D"], nl).tolist(),
"price": li_prices.tolist(),
"quantity": li_qtys.tolist(),
"subtotal": (li_prices * li_qtys).round(2).tolist(),
}
)
agg_mismatch = int(rng.integers(5, 9))
mismatch_order_ids = rng.choice(range(1, no + 1), agg_mismatch, replace=False)
for oid in mismatch_order_ids:
idx = orders[orders["order_id"] == oid].index
if len(idx):
orders.loc[idx[0], "order_total"] = round(float(orders.loc[idx[0], "order_total"]) * rng.uniform(1.3, 2.0), 2)
gold = {
"orphaned_order_count": orphan_count,
"temporal_violation_count": temp_viol,
"aggregate_mismatch_count": agg_mismatch,
"total_orders": no,
}
return {"customers": customers, "orders": orders, "line_items": line_items}, gold