Spaces:

kumar6591
/

data-quality-env

Sleeping

File size: 7,852 Bytes

91e7690

from __future__ import annotations

import numpy as np
import pandas as pd

NULL_DISGUISES = ["NULL", "N/A", "UNKNOWN", "-", "", "0", "none"]


def generate_dataset(task_id: int, seed: int) -> tuple[dict[str, pd.DataFrame], dict]:
    """
    Returns:
      tables_dict: {table_name: DataFrame}
      gold_faults: dict
    """
    rng = np.random.default_rng(seed)
    if task_id == 1:
        return _task1(rng, seed)
    if task_id == 2:
        return _task2(rng)
    if task_id == 3:
        return _task3(rng)
    if task_id == 4:
        return _task4(rng)
    raise ValueError(f"Unknown task_id {task_id}")


def _task1(rng: np.random.Generator, seed: int) -> tuple[dict[str, pd.DataFrame], dict]:
    n = 200
    df = pd.DataFrame(
        {
            "customer_id": range(1001, 1001 + n),
            "email": [f"user{i}@example.com" for i in range(n)],
            "name": [f"Name {i}" for i in range(n)],
            "signup_date": pd.date_range("2023-01-01", periods=n, freq="D").astype(str),
            "country": rng.choice(["US", "UK", "IN", "DE", "FR"], n).tolist(),
        }
    )

    real_null_cid = int(rng.integers(3, 7))
    null_cid_idx = rng.choice(n, real_null_cid, replace=False)
    df.loc[null_cid_idx, "customer_id"] = None

    real_null_email = int(rng.integers(8, 15))
    null_email_idx = rng.choice(n, real_null_email, replace=False)
    df.loc[null_email_idx, "email"] = None

    disguised_null_email = int(rng.integers(4, 9))
    avail = [i for i in range(n) if i not in set(null_email_idx.tolist())]
    dis_idx = rng.choice(avail, disguised_null_email, replace=False)
    df.loc[dis_idx, "email"] = rng.choice(NULL_DISGUISES, disguised_null_email).tolist()

    dup_count = int(rng.integers(10, 19))
    dup_src = rng.choice(n, dup_count, replace=True)
    dups = df.iloc[dup_src].copy()
    df = pd.concat([df, dups], ignore_index=True)

    near_dup_count = int(rng.integers(5, 9))
    near_src = rng.choice(n, near_dup_count, replace=False)
    near_dups = df.iloc[near_src].copy()
    near_dups["country"] = rng.choice(["US", "UK", "IN", "DE", "FR"], near_dup_count).tolist()
    df = pd.concat([df, near_dups], ignore_index=True)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    gold = {
        "null_customer_id": real_null_cid,
        "null_email_real": real_null_email,
        "null_email_disguised": disguised_null_email,
        "null_email_total": real_null_email + disguised_null_email,
        "exact_duplicate_rows": dup_count,
        "near_duplicate_rows": near_dup_count,
    }
    return {"customers": df}, gold


def _task2(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]:
    n = 300
    amounts_float = (rng.random(n) * 500 + 5).round(2)
    dates = pd.date_range("2023-01-01", periods=n, freq="h")[:n]
    df = pd.DataFrame(
        {
            "order_id": range(5001, 5001 + n),
            "customer_id": rng.integers(1001, 1201, n).tolist(),
            "amount": [f"${a}" for a in amounts_float],
            "order_date": [d.strftime("%b %d %Y") for d in dates],
            "status": rng.choice(["pending", "shipped", "delivered", "cancelled"], n).tolist(),
            "quantity": rng.integers(1, 20, n).tolist(),
        }
    )
    neg_qty = int(rng.integers(5, 11))
    neg_idx = rng.choice(n, neg_qty, replace=False)
    df.loc[neg_idx, "quantity"] = rng.integers(-10, 0, neg_qty).tolist()

    bad_amt = int(rng.integers(3, 8))
    bad_idx = rng.choice([i for i in range(n) if i not in set(neg_idx.tolist())], bad_amt, replace=False)
    df.loc[bad_idx, "amount"] = rng.choice(["N/A", "#ERR", "TBD", "--"], bad_amt).tolist()

    gold = {
        "amount_type_violation": True,
        "date_format_violation": True,
        "negative_quantity_rows": neg_qty,
        "unparseable_amount_rows": bad_amt,
    }
    return {"orders": df}, gold


def _task3(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]:
    def make_txn(n: int, rg: np.random.Generator, mean_amt: float, cats: list[str], id_start: int) -> pd.DataFrame:
        return pd.DataFrame(
            {
                "txn_id": range(id_start, id_start + n),
                "user_id": rg.integers(2001, 2501, n).tolist(),
                "amount": rg.normal(mean_amt, 15, n).round(2).tolist(),
                "category": rg.choice(cats, n).tolist(),
                "ts": pd.date_range("2024-01-01", periods=n, freq="h")[:n].astype(str).tolist(),
            }
        )

    base_cats = ["food", "travel", "retail", "health", "utilities"]
    new_cats = ["crypto", "NFT"]

    baseline = make_txn(500, rng, mean_amt=50.0, cats=base_cats, id_start=10001)
    current_rng = np.random.default_rng(int(rng.integers(9999)))
    current = make_txn(500, current_rng, mean_amt=78.0, cats=base_cats + new_cats, id_start=10501)

    new_uid_count = int(0.15 * 500)
    new_uid_idx = current_rng.choice(500, new_uid_count, replace=False)
    current.loc[new_uid_idx, "user_id"] = current_rng.integers(3000, 3500, new_uid_count).tolist()

    gold = {
        "amount_mean_shift": True,
        "baseline_mean": 50.0,
        "current_mean": float(current["amount"].mean()),
        "new_categories": new_cats,
        "referential_drift_pct": new_uid_count / 500,
    }
    return {"transactions_baseline": baseline, "transactions_current": current}, gold


def _task4(rng: np.random.Generator) -> tuple[dict[str, pd.DataFrame], dict]:
    nc = 200
    customers = pd.DataFrame(
        {
            "customer_id": range(1, nc + 1),
            "name": [f"Customer {i}" for i in range(nc)],
            "tier": rng.choice(["bronze", "silver", "gold"], nc).tolist(),
        }
    )

    no = 500
    orphan_count = int(rng.integers(15, 22))
    valid_cids = list(range(1, nc + 1))
    order_cids = rng.choice(valid_cids, no - orphan_count).tolist()
    orphan_cids = rng.integers(9000, 9999, orphan_count).tolist()
    all_cids = order_cids + orphan_cids
    rng.shuffle(all_cids)

    order_dates = pd.date_range("2024-01-01", periods=no, freq="h")[:no]
    ship_dates = [d + pd.Timedelta(days=int(rng.integers(1, 10))) for d in order_dates]

    temp_viol = int(rng.integers(10, 16))
    temp_idx = rng.choice(no, temp_viol, replace=False)
    for i in temp_idx:
        ship_dates[i] = order_dates[i] - pd.Timedelta(days=int(rng.integers(1, 5)))

    orders = pd.DataFrame(
        {
            "order_id": range(1, no + 1),
            "customer_id": all_cids,
            "order_date": order_dates.astype(str).tolist(),
            "ship_date": [str(d) for d in ship_dates],
            "order_total": (rng.random(no) * 400 + 20).round(2).tolist(),
        }
    )

    nl = 1500
    li_order_ids = rng.choice(range(1, no + 1), nl).tolist()
    li_prices = (rng.random(nl) * 100 + 5).round(2)
    li_qtys = rng.integers(1, 6, nl)
    line_items = pd.DataFrame(
        {
            "line_id": range(1, nl + 1),
            "order_id": li_order_ids,
            "product": rng.choice(["Widget A", "Widget B", "Widget C", "Widget D"], nl).tolist(),
            "price": li_prices.tolist(),
            "quantity": li_qtys.tolist(),
            "subtotal": (li_prices * li_qtys).round(2).tolist(),
        }
    )

    agg_mismatch = int(rng.integers(5, 9))
    mismatch_order_ids = rng.choice(range(1, no + 1), agg_mismatch, replace=False)
    for oid in mismatch_order_ids:
        idx = orders[orders["order_id"] == oid].index
        if len(idx):
            orders.loc[idx[0], "order_total"] = round(float(orders.loc[idx[0], "order_total"]) * rng.uniform(1.3, 2.0), 2)

    gold = {
        "orphaned_order_count": orphan_count,
        "temporal_violation_count": temp_viol,
        "aggregate_mismatch_count": agg_mismatch,
        "total_orders": no,
    }
    return {"customers": customers, "orders": orders, "line_items": line_items}, gold