"""Diff two datasets loaded with ``datasets.load_from_disk`` and their JSON metadata."""

import argparse
import json
from typing import Optional

import numpy as np
import pandas as pd
from datasets import load_from_disk
from deepdiff import DeepDiff

# Metadata keys that get a dedicated comparison and are therefore left out of
# the generic DeepDiff pass.
_SPECIAL_META_KEYS = ("w_bins", "sv_bins", "stats")


def _load_dataset(name):
    """Load the dataset at *name* and the JSON metadata file stored inside it.

    The metadata file name is taken from the dataset's ``info.description``.

    Returns:
        A ``(dataframe, metadata)`` tuple.
    """
    ds = load_from_disk(name)
    with open(f"{name}/{ds.info.description}", encoding="utf-8") as f:
        meta = json.load(f)
    return ds.to_pandas(), meta


def _describe_bins(bins):
    """Return a one-line summary of a bin-edge array, tolerating empty input."""
    if bins.size == 0:
        # Happens when the metadata has no such key (or an empty list);
        # indexing bins[0] would raise IndexError.
        return "missing or empty"
    return f"n={len(bins) - 1}, range=[{bins[0]:.4g}, {bins[-1]:.4g}]"


def _bins_differ(meta1, meta2, key):
    """Report whether the ``key`` bin edges differ between the two metadata dicts.

    Prints a short summary of both sides when they differ.

    Returns:
        True if the bin arrays are not equal, False otherwise.
    """
    bins1 = np.array(meta1.get(key, []))
    bins2 = np.array(meta2.get(key, []))
    if np.array_equal(bins1, bins2):
        return False
    print(f"{key} DIFFER:")
    print(f" 1: {_describe_bins(bins1)}")
    print(f" 2: {_describe_bins(bins2)}")
    return True


def diff_datasets(
    name1: str, name2: str, path: Optional[str] = None, topN: int = 20
) -> None:
    """Print the differences between two on-disk datasets.

    Compares, in order: the ``w_bins`` and ``sv_bins`` metadata arrays, the set
    of ``stats`` keys, all remaining metadata (via DeepDiff), the sets of
    DataFrame columns, and finally the cell values of the common columns.

    Args:
        name1: Reference dataset directory (joined to *path* when given).
        name2: Target dataset directory (joined to *path* when given).
        path: Optional base directory prepended to both names.
        topN: Maximum number of differing rows to print; a negative value
            prints every differing row.

    Returns:
        None. All results are written to stdout.
    """
    # NOTE: this changes the process-wide pandas display precision.
    pd.set_option("display.precision", 4)

    if isinstance(path, str):
        name1 = f"{path}/{name1}"
        name2 = f"{path}/{name2}"

    # Load datasets and metadata
    df1, meta1 = _load_dataset(name1)
    df2, meta2 = _load_dataset(name2)

    # Columns excluded from the value comparison.
    exclude_cols = {"job_uuid", "job_id", "SVD"}

    # Identical job ids are unexpected; only check when both sides have the
    # column so a dataset without it does not abort the whole diff.
    if (
        "job_uuid" in df1.columns
        and "job_uuid" in df2.columns
        and df1["job_uuid"].equals(df2["job_uuid"])
    ):
        print("WARNING: job_uuid columns are identical - verify this is intended")

    # Columns whose binning differs are excluded from the value comparison.
    if _bins_differ(meta1, meta2, "w_bins"):
        exclude_cols.add("P_w")
    if _bins_differ(meta1, meta2, "sv_bins"):
        exclude_cols.add("P_sv")

    # Compare stats keys (column presence)
    stats1 = set(meta1.get("stats", {}).keys())
    stats2 = set(meta2.get("stats", {}).keys())
    if stats1 != stats2:
        print("stats columns DIFFER:")
        if stats1 - stats2:
            print(f" Only in reference: {stats1 - stats2}")
        if stats2 - stats1:
            print(f" Only in target: {stats2 - stats1}")
        # Exclude stats columns that are not common to both datasets.
        exclude_cols.update(stats1 ^ stats2)

    # Compare remaining metadata
    meta1_rest = {k: v for k, v in meta1.items() if k not in _SPECIAL_META_KEYS}
    meta2_rest = {k: v for k, v in meta2.items() if k not in _SPECIAL_META_KEYS}
    meta_diff = DeepDiff(meta1_rest, meta2_rest, ignore_order=True)
    if meta_diff:
        print("Other metadata differences:")
        print(meta_diff.to_json(indent=2))

    # Compare dataframes
    cols1 = set(df1.columns) - exclude_cols
    cols2 = set(df2.columns) - exclude_cols
    common_cols = sorted(cols1 & cols2)
    if cols1 != cols2:
        print("DataFrame columns differ (excluding ignored):")
        if cols1 - cols2:
            print(f" Only in 1: {cols1 - cols2}")
        if cols2 - cols1:
            print(f" Only in 2: {cols2 - cols1}")

    # Value comparison on common columns; indexes are reset so rows are
    # matched by position.
    df1_cmp = df1[common_cols].reset_index(drop=True)
    df2_cmp = df2[common_cols].reset_index(drop=True)

    if df1_cmp.shape != df2_cmp.shape:
        # DataFrame.compare needs identically shaped frames.
        print(f"Shape mismatch: {df1_cmp.shape} vs {df2_cmp.shape}")
        return

    if df1_cmp.equals(df2_cmp):
        print(f"DataFrames match on {len(common_cols)} compared columns.")
        return

    # Main diff command
    diff = df1_cmp.compare(df2_cmp, result_names=("reference", "target"))
    print(f"DataFrame differences ({len(diff)} rows differ):")
    if topN < 0:
        # Negative means "print all": lift pandas' row truncation temporarily.
        with pd.option_context("display.max_rows", None):
            print(diff)
    else:
        print(diff.head(topN))


def main(argv=None):
    """Parse command-line arguments and run :func:`diff_datasets`."""
    parser = argparse.ArgumentParser(
        description="Diff two HuggingFace datasets",
        epilog="""Examples:
  %(prog)s ds_v1 ds_v2 --path /data/results
  %(prog)s /full/path/ds_v1 /full/path/ds_v2
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("reference", help="Reference dataset name")
    parser.add_argument("target", help="Target dataset name")
    parser.add_argument("--path", "-p", default=None, help="Base path for datasets")
    parser.add_argument(
        "--num-print",
        "-n",
        type=int,  # without this a CLI value arrives as str and `topN < 0` fails
        default=50,
        help="Max number of rows to print. -1 prints all",
    )
    args = parser.parse_args(argv)
    diff_datasets(args.reference, args.target, path=args.path, topN=args.num_print)


if __name__ == "__main__":
    main()