File size: 4,562 Bytes
76da36a
 
 
 
 
 
 
a410163
76da36a
a410163
76da36a
a410163
 
76da36a
 
 
a410163
76da36a
 
 
a410163
76da36a
 
 
 
 
 
a410163
76da36a
 
a410163
76da36a
 
 
 
 
 
 
a410163
 
 
76da36a
 
 
 
 
 
a410163
 
 
 
 
 
 
76da36a
 
 
 
a410163
 
76da36a
a410163
76da36a
a4e6dd6
76da36a
a4e6dd6
76da36a
 
 
a410163
 
 
 
 
 
76da36a
 
 
 
 
 
 
 
 
 
 
a410163
76da36a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a410163
76da36a
 
 
 
 
 
 
a410163
76da36a
 
 
 
a410163
 
 
 
 
 
 
76da36a
a410163
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from datasets import load_from_disk
import json
import numpy as np
import pandas as pd
from deepdiff import DeepDiff
import argparse


def _describe_bins(bins):
    """Return a short summary (bin count and edge range) for an array of bin edges."""
    # An absent metadata key is loaded as an empty array; indexing it would raise.
    if len(bins) == 0:
        return "missing or empty"
    return f"n={len(bins) - 1}, range=[{bins[0]:.4g}, {bins[-1]:.4g}]"


def _load_dataset_with_meta(name):
    """Load the dataset saved at *name* together with its JSON metadata.

    The metadata file name is read from the dataset's ``info.description``
    and resolved relative to the dataset directory.
    """
    ds = load_from_disk(name)
    with open(f"{name}/{ds.info.description}", encoding="utf-8") as f:
        meta = json.load(f)
    return ds, meta


def diff_datasets(name1, name2, path=None, topN=20):
    """Print a report of the differences between two datasets saved on disk.

    The report covers, in order: the ``w_bins`` and ``sv_bins`` metadata
    entries, the key set of the ``stats`` metadata entry, all remaining
    metadata (via ``DeepDiff``), the DataFrame column sets, and finally a
    cell-by-cell comparison of the columns both datasets share.

    Columns ``job_uuid``, ``job_id`` and ``SVD`` are never compared. ``P_w``
    and ``P_sv`` are skipped when the corresponding bin edges differ, and any
    stats key present in only one dataset is skipped as well.

    Args:
        name1: Name (or full path) of the reference dataset.
        name2: Name (or full path) of the target dataset.
        path: Optional base directory; when it is a string, it is prepended
            to both names.
        topN: Maximum number of differing rows to print. A negative value
            prints every differing row.

    Returns:
        None. All output goes to stdout. The function returns early, after
        reporting it, if the two datasets have a different number of rows.
    """
    pd.set_option("display.precision", 4)
    if isinstance(path, str):
        name1 = f"{path}/{name1}"
        name2 = f"{path}/{name2}"

    # Load datasets and metadata
    ds1, meta1 = _load_dataset_with_meta(name1)
    ds2, meta2 = _load_dataset_with_meta(name2)

    df1 = ds1.to_pandas()
    df2 = ds2.to_pandas()

    # Track columns to exclude from comparison
    exclude_cols = {"job_uuid", "job_id", "SVD"}

    # Check for identical job ids (unexpected). The check is advisory only, so
    # a dataset without the column must not abort the whole comparison.
    has_job_uuid = "job_uuid" in df1.columns and "job_uuid" in df2.columns
    if has_job_uuid and df1["job_uuid"].equals(df2["job_uuid"]):
        print("WARNING: job_uuid columns are identical - verify this is intended")

    # Compare w_bins
    bins1 = np.array(meta1.get("w_bins", []))
    bins2 = np.array(meta2.get("w_bins", []))
    if not np.array_equal(bins1, bins2):
        print("w_bins DIFFER:")
        print(f"  1: {_describe_bins(bins1)}")
        print(f"  2: {_describe_bins(bins2)}")
        exclude_cols.add("P_w")

    # Compare sv_bins
    sv_bins1 = np.array(meta1.get("sv_bins", []))
    sv_bins2 = np.array(meta2.get("sv_bins", []))
    if not np.array_equal(sv_bins1, sv_bins2):
        print("sv_bins DIFFER:")
        print(f"  1: {_describe_bins(sv_bins1)}")
        print(f"  2: {_describe_bins(sv_bins2)}")
        exclude_cols.add("P_sv")

    # Compare stats keys (column presence)
    stats1 = set(meta1.get("stats", {}))
    stats2 = set(meta2.get("stats", {}))

    if stats1 != stats2:
        print("stats columns DIFFER:")
        if stats1 - stats2:
            print(f"  Only in reference: {stats1 - stats2}")
        if stats2 - stats1:
            print(f"  Only in target: {stats2 - stats1}")
        exclude_cols.update(stats1 ^ stats2)  # exclude non-common stats columns

    # Compare remaining metadata
    handled_keys = ("w_bins", "sv_bins", "stats")
    meta1_rest = {k: v for k, v in meta1.items() if k not in handled_keys}
    meta2_rest = {k: v for k, v in meta2.items() if k not in handled_keys}
    meta_diff = DeepDiff(meta1_rest, meta2_rest, ignore_order=True)
    if meta_diff:
        print("Other metadata differences:")
        print(meta_diff.to_json(indent=2))

    # Compare dataframes
    cols1 = set(df1.columns) - exclude_cols
    cols2 = set(df2.columns) - exclude_cols
    common_cols = sorted(cols1 & cols2)

    if cols1 != cols2:
        print("DataFrame columns differ (excluding ignored):")
        if cols1 - cols2:
            print(f"  Only in 1: {cols1 - cols2}")
        if cols2 - cols1:
            print(f"  Only in 2: {cols2 - cols1}")

    # Numerical comparison on common columns. Indexes are reset so rows are
    # matched by position, which DataFrame.compare requires (identical labels).
    df1_cmp = df1[common_cols].reset_index(drop=True)
    df2_cmp = df2[common_cols].reset_index(drop=True)

    if df1_cmp.shape != df2_cmp.shape:
        print(f"Shape mismatch: {df1_cmp.shape} vs {df2_cmp.shape}")
        return

    if df1_cmp.equals(df2_cmp):
        print(f"DataFrames match on {len(common_cols)} compared columns.")
        return

    # Main diff command
    diff = df1_cmp.compare(df2_cmp, result_names=("reference", "target"))
    print(f"DataFrame differences ({len(diff)} rows differ):")

    # A negative topN means "print everything".
    shown = diff if topN < 0 else diff.head(topN)
    # Lift pandas' row-display limit so the requested number of rows is
    # really printed instead of being elided in the middle.
    with pd.option_context("display.max_rows", None):
        print(shown)


if __name__ == "__main__":
    # Command-line entry point: parse the two dataset names plus options and
    # hand them to diff_datasets.
    parser = argparse.ArgumentParser(
        description="Diff two HuggingFace datasets",
        epilog="""Examples:
  %(prog)s ds_v1 ds_v2 --path /data/results
  %(prog)s /full/path/ds_v1 /full/path/ds_v2
""",
        # Keep the line breaks of the epilog examples as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("reference", help="Reference dataset name")
    parser.add_argument("target", help="Target dataset name")
    parser.add_argument("--path", "-p", default=None, help="Base path for datasets")
    parser.add_argument(
        "--num-print",
        "-n",
        # Without an explicit type argparse passes the value through as str,
        # which breaks the numeric comparison done in diff_datasets.
        type=int,
        default=50,
        help="Max number of rows to print. -1 prints all",
    )

    args = parser.parse_args()
    diff_datasets(args.reference, args.target, path=args.path, topN=args.num_print)