Spaces:
Running
Running
File size: 4,562 Bytes
76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a4e6dd6 76da36a a4e6dd6 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 76da36a a410163 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | from datasets import load_from_disk
import json
import numpy as np
import pandas as pd
from deepdiff import DeepDiff
import argparse
def _load_with_meta(name):
    """Load a dataset saved on disk together with its JSON metadata.

    The metadata file name is read from the dataset's ``info.description``
    and resolved relative to the dataset directory ``name``.
    """
    ds = load_from_disk(name)
    with open(f"{name}/{ds.info.description}", encoding="utf-8") as f:
        meta = json.load(f)
    return ds, meta


def _describe_bins(bins):
    """Summarise an array of bin edges; an empty array is reported as such."""
    # Indexing bins[0] / bins[-1] on an empty array would raise IndexError,
    # which happens whenever a metadata file lacks the key entirely.
    if len(bins) == 0:
        return "missing or empty"
    return f"n={len(bins) - 1}, range=[{bins[0]:.4g}, {bins[-1]:.4g}]"


def _bins_differ(meta1, meta2, key):
    """Print a short report and return True when ``meta[key]`` differs."""
    bins1 = np.array(meta1.get(key, []))
    bins2 = np.array(meta2.get(key, []))
    if np.array_equal(bins1, bins2):
        return False
    print(f"{key} DIFFER:")
    print(f" 1: {_describe_bins(bins1)}")
    print(f" 2: {_describe_bins(bins2)}")
    return True


def diff_datasets(name1, name2, path=None, topN=20):
    """Print the differences between two datasets saved with ``save_to_disk``.

    Compares, in order: the ``w_bins`` and ``sv_bins`` metadata entries, the
    key sets of the ``stats`` metadata entry, all remaining metadata (via
    DeepDiff), and finally the dataframes themselves on their common columns.
    Everything is reported on stdout; nothing is returned.

    Args:
        name1: Reference dataset directory (or name relative to ``path``).
        name2: Target dataset directory (or name relative to ``path``).
        path: Optional base directory prepended to both names when a string.
        topN: Maximum number of differing rows to print. A negative value
            prints every differing row. Strings holding an integer are
            accepted, since command-line values may arrive unparsed.
    """
    pd.set_option("display.precision", 4)
    if isinstance(path, str):
        name1 = f"{path}/{name1}"
        name2 = f"{path}/{name2}"

    # Load datasets and metadata
    ds1, meta1 = _load_with_meta(name1)
    ds2, meta2 = _load_with_meta(name2)
    df1 = ds1.to_pandas()
    df2 = ds2.to_pandas()

    # Track columns to exclude from comparison
    exclude_cols = {"job_uuid", "job_id", "SVD"}

    # Check for identical job ids (unexpected); skip quietly when either
    # dataset has no such column instead of failing with KeyError.
    if (
        "job_uuid" in df1.columns
        and "job_uuid" in df2.columns
        and df1["job_uuid"].equals(df2["job_uuid"])
    ):
        print("WARNING: job_uuid columns are identical - verify this is intended")

    # Columns P_w / P_sv are excluded from the row comparison whenever the
    # corresponding bin metadata differs.
    if _bins_differ(meta1, meta2, "w_bins"):
        exclude_cols.add("P_w")
    if _bins_differ(meta1, meta2, "sv_bins"):
        exclude_cols.add("P_sv")

    # Compare stats keys (column presence)
    stats1 = set(meta1.get("stats", {}).keys())
    stats2 = set(meta2.get("stats", {}).keys())
    if stats1 != stats2:
        print("stats columns DIFFER:")
        if stats1 - stats2:
            print(f" Only in reference: {stats1 - stats2}")
        if stats2 - stats1:
            print(f" Only in target: {stats2 - stats1}")
        exclude_cols.update(stats1 ^ stats2)  # exclude non-common stats columns

    # Compare remaining metadata
    handled_keys = ("w_bins", "sv_bins", "stats")
    meta1_rest = {k: v for k, v in meta1.items() if k not in handled_keys}
    meta2_rest = {k: v for k, v in meta2.items() if k not in handled_keys}
    meta_diff = DeepDiff(meta1_rest, meta2_rest, ignore_order=True)
    if meta_diff:
        print("Other metadata differences:")
        print(meta_diff.to_json(indent=2))

    # Compare dataframes
    cols1 = set(df1.columns) - exclude_cols
    cols2 = set(df2.columns) - exclude_cols
    common_cols = sorted(cols1 & cols2)
    if cols1 != cols2:
        print("DataFrame columns differ (excluding ignored):")
        if cols1 - cols2:
            print(f" Only in 1: {cols1 - cols2}")
        if cols2 - cols1:
            print(f" Only in 2: {cols2 - cols1}")

    # Numerical comparison on common columns
    df1_cmp = df1[common_cols].reset_index(drop=True)
    df2_cmp = df2[common_cols].reset_index(drop=True)
    if df1_cmp.shape != df2_cmp.shape:
        # DataFrame.compare needs identically shaped frames, so stop here.
        print(f"Shape mismatch: {df1_cmp.shape} vs {df2_cmp.shape}")
        return

    if df1_cmp.equals(df2_cmp):
        print(f"DataFrames match on {len(common_cols)} compared columns.")
        return

    # Main diff command
    diff = df1_cmp.compare(df2_cmp, result_names=("reference", "target"))
    print(f"DataFrame differences ({len(diff)} rows differ):")
    limit = int(topN)  # tolerate "50"-style strings coming from the CLI
    shown = diff if limit < 0 else diff.head(limit)
    # Lift pandas' own row cap so the requested number of rows is really shown.
    with pd.option_context("display.max_rows", None):
        print(shown)
if __name__ == "__main__":
    # Command-line entry point: parse the two dataset names plus options and
    # hand them to diff_datasets.
    parser = argparse.ArgumentParser(
        description="Diff two HuggingFace datasets",
        epilog="""Examples:
%(prog)s ds_v1 ds_v2 --path /data/results
%(prog)s /full/path/ds_v1 /full/path/ds_v2
""",
        # Keep the epilog's line breaks instead of re-wrapping them.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("reference", help="Reference dataset name")
    parser.add_argument("target", help="Target dataset name")
    parser.add_argument("--path", "-p", default=None, help="Base path for datasets")
    parser.add_argument(
        "--num-print",
        "-n",
        # Without an explicit type argparse passes the value through as a
        # string, which breaks the numeric comparison in diff_datasets.
        type=int,
        default=50,
        help="Max number of rows to print. -1 prints all",
    )
    args = parser.parse_args()
    diff_datasets(args.reference, args.target, path=args.path, topN=args.num_print)
|