# Spaces: angerami/transformer-weights (Running)
#
# File size: 4,562 Bytes

from datasets import load_from_disk
import json
import numpy as np
import pandas as pd
from deepdiff import DeepDiff
import argparse


def _load_with_meta(name):
    """Load a dataset from disk together with its JSON metadata.

    The metadata file name is taken from the dataset's ``info.description``
    field and resolved relative to the dataset directory ``name``.

    Returns:
        A ``(dataset, metadata_dict)`` tuple.
    """
    ds = load_from_disk(name)
    with open(f"{name}/{ds.info.description}", encoding="utf-8") as f:
        meta = json.load(f)
    return ds, meta


def _describe_bins(bins):
    """Return a one-line summary (bin count and edge range) of a bin array."""
    # An absent/empty bin list has no first or last edge to report.
    if len(bins) == 0:
        return "<missing or empty>"
    return f"n={len(bins) - 1}, range=[{bins[0]:.4g}, {bins[-1]:.4g}]"


def _bins_differ(meta1, meta2, key):
    """Report whether the bin edges stored under ``key`` differ.

    Prints a short summary of both sides when they differ.

    Returns:
        True when the two bin arrays are not equal, False otherwise.
    """
    bins1 = np.asarray(meta1.get(key) or [])
    bins2 = np.asarray(meta2.get(key) or [])
    if np.array_equal(bins1, bins2):
        return False
    print(f"{key} DIFFER:")
    print(f"  1: {_describe_bins(bins1)}")
    print(f"  2: {_describe_bins(bins2)}")
    return True


def diff_datasets(name1, name2, path=None, topN=20):
    """Print the differences between two on-disk datasets and their metadata.

    The report covers, in order: the ``w_bins`` and ``sv_bins`` metadata
    entries, the key sets of the ``stats`` metadata entry, all remaining
    metadata (via ``DeepDiff``), the DataFrame column sets, and finally a
    cell-by-cell comparison of the columns both datasets share.

    Args:
        name1: Reference dataset directory (or name relative to ``path``).
        name2: Target dataset directory (or name relative to ``path``).
        path: Optional base directory; when it is a string it is prepended
            to both names.
        topN: Maximum number of differing rows to print. A negative value
            prints every differing row. Values convertible with ``int()``
            (e.g. strings coming from the command line) are accepted.

    Returns:
        None. All results are printed to stdout.
    """
    # Command-line callers may hand over a string; normalise before comparing.
    topN = int(topN)

    pd.set_option("display.precision", 4)
    if isinstance(path, str):
        name1 = f"{path}/{name1}"
        name2 = f"{path}/{name2}"

    # Load datasets and metadata
    ds1, meta1 = _load_with_meta(name1)
    ds2, meta2 = _load_with_meta(name2)

    df1 = ds1.to_pandas()
    df2 = ds2.to_pandas()

    # Track columns to exclude from comparison
    exclude_cols = {"job_uuid", "job_id", "SVD"}

    # Check for identical job ids (unexpected); skip when the column is absent
    if (
        "job_uuid" in df1.columns
        and "job_uuid" in df2.columns
        and df1["job_uuid"].equals(df2["job_uuid"])
    ):
        print("WARNING: job_uuid columns are identical - verify this is intended")

    # Columns binned with different edges are excluded from the comparison.
    if _bins_differ(meta1, meta2, "w_bins"):
        exclude_cols.add("P_w")
    if _bins_differ(meta1, meta2, "sv_bins"):
        exclude_cols.add("P_sv")

    # Compare stats keys (column presence)
    stats1 = set(meta1.get("stats", {}))
    stats2 = set(meta2.get("stats", {}))
    if stats1 != stats2:
        print("stats columns DIFFER:")
        if stats1 - stats2:
            print(f"  Only in reference: {stats1 - stats2}")
        if stats2 - stats1:
            print(f"  Only in target: {stats2 - stats1}")
        exclude_cols.update(stats1 ^ stats2)  # exclude non-common stats columns

    # Compare remaining metadata
    handled_keys = ("w_bins", "sv_bins", "stats")
    meta1_rest = {k: v for k, v in meta1.items() if k not in handled_keys}
    meta2_rest = {k: v for k, v in meta2.items() if k not in handled_keys}
    meta_diff = DeepDiff(meta1_rest, meta2_rest, ignore_order=True)
    if meta_diff:
        print("Other metadata differences:")
        print(meta_diff.to_json(indent=2))

    # Compare dataframes
    cols1 = set(df1.columns) - exclude_cols
    cols2 = set(df2.columns) - exclude_cols
    common_cols = sorted(cols1 & cols2)

    if cols1 != cols2:
        print("DataFrame columns differ (excluding ignored):")
        if cols1 - cols2:
            print(f"  Only in 1: {cols1 - cols2}")
        if cols2 - cols1:
            print(f"  Only in 2: {cols2 - cols1}")

    # Numerical comparison on common columns
    df1_cmp = df1[common_cols].reset_index(drop=True)
    df2_cmp = df2[common_cols].reset_index(drop=True)

    # DataFrame.compare needs identically shaped frames.
    if df1_cmp.shape != df2_cmp.shape:
        print(f"Shape mismatch: {df1_cmp.shape} vs {df2_cmp.shape}")
        return

    if df1_cmp.equals(df2_cmp):
        print(f"DataFrames match on {len(common_cols)} compared columns.")
        return

    # Main diff command
    diff = df1_cmp.compare(df2_cmp, result_names=("reference", "target"))
    print(f"DataFrame differences ({len(diff)} rows differ):")
    shown = diff if topN < 0 else diff.head(topN)
    # Lift pandas' row-display cap so the requested rows are not elided.
    with pd.option_context("display.max_rows", None):
        print(shown)


# Command-line entry point: parse the two dataset names and options, then diff.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Diff two HuggingFace datasets",
        epilog="""Examples:
  %(prog)s ds_v1 ds_v2 --path /data/results
  %(prog)s /full/path/ds_v1 /full/path/ds_v2
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("reference", help="Reference dataset name")
    parser.add_argument("target", help="Target dataset name")
    parser.add_argument("--path", "-p", default=None, help="Base path for datasets")
    parser.add_argument(
        "--num-print",
        "-n",
        # Parse as int: diff_datasets compares this value numerically, and
        # argparse would otherwise pass user-supplied values through as str.
        type=int,
        default=50,
        help="Max number of rows to print. -1 prints all",
    )

    args = parser.parse_args()
    diff_datasets(args.reference, args.target, path=args.path, topN=args.num_print)