data-cleaning-openenv / tests /test_grader.py
yashmarathe's picture
refactor: move all root-level files to repo root
7492bfe
"""Tests for the grading system."""
from __future__ import annotations
import inspect
import numpy as np
import pandas as pd
import pytest
from data_cleaning_env.grader import (
compute_quality_score,
grade_episode,
precompute_baselines,
)
class TestComputeQualityScore:
    """Checks on the score mapping returned by ``compute_quality_score``."""

    def test_perfect_score(self, clean_df: pd.DataFrame) -> None:
        """A frame graded against itself has a composite of exactly 1.0."""
        assert compute_quality_score(clean_df, clean_df)["composite"] == 1.0

    def test_empty_df_scores_zero(self, clean_df: pd.DataFrame) -> None:
        """A frame with no rows and no columns has a composite of 0.0."""
        result = compute_quality_score(pd.DataFrame(), clean_df)
        assert result["composite"] == 0.0

    def test_deterministic(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """Two calls with identical inputs return equal results."""
        first_run = compute_quality_score(dirty_df, clean_df)
        second_run = compute_quality_score(dirty_df, clean_df)
        assert first_run == second_run, "Score is not deterministic"

    def test_dirty_scores_less_than_clean(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """The dirty frame's composite is strictly below the clean frame's."""
        composite_dirty = compute_quality_score(dirty_df, clean_df)["composite"]
        composite_clean = compute_quality_score(clean_df, clean_df)["composite"]
        assert composite_dirty < composite_clean

    def test_all_components_present(self, clean_df: pd.DataFrame) -> None:
        """Every expected component key appears in the result."""
        expected_keys = (
            "completeness",
            "consistency",
            "accuracy",
            "format",
            "composite",
        )
        result = compute_quality_score(clean_df, clean_df)
        for key in expected_keys:
            assert key in result, f"Missing key: {key}"

    def test_all_nan_completeness_low(self, clean_df: pd.DataFrame) -> None:
        """A frame whose cells are all missing has completeness below 0.1."""
        # Cast to object first so every column can hold None, then blank
        # out every cell.
        blanked = clean_df.astype(object).copy()
        blanked[:] = None
        assert compute_quality_score(blanked, clean_df)["completeness"] < 0.1

    def test_row_drop_penalizes_accuracy(self, clean_df: pd.DataFrame) -> None:
        """Keeping only the first two rows must lower the accuracy component.

        The grader is expected to divide by the clean row count (n_clean),
        so dropped rows count against the submission.
        """
        truncated = clean_df.iloc[:2].copy()
        truncated_accuracy = compute_quality_score(truncated, clean_df)["accuracy"]
        full_accuracy = compute_quality_score(clean_df, clean_df)["accuracy"]
        assert truncated_accuracy < full_accuracy

    def test_no_sklearn_in_grader(self) -> None:
        """The source text of the scoring function never mentions sklearn."""
        assert "sklearn" not in inspect.getsource(compute_quality_score)
class TestGradeEpisode:
    """Checks on the result mapping returned by ``grade_episode``."""

    def test_perfect_episode_scores_one(self, clean_df: pd.DataFrame) -> None:
        """Grading the clean frame yields a score of 1.0 plus a breakdown."""
        baseline_key = "_test"
        # Register a copy of the clean frame as the dirty side, so the
        # stored dirty baseline equals the clean one; perfect data is still
        # expected to grade as 1.0 in that situation.
        precompute_baselines(baseline_key, clean_df, clean_df.copy())
        outcome = grade_episode(clean_df, baseline_key, clean_df)
        assert outcome["score"] == 1.0
        assert "breakdown" in outcome

    def test_per_episode_baseline_overrides_cache(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """An explicitly supplied ``dirty_baseline`` still grades clean as 1.0."""
        baseline_key = "_test2"
        precompute_baselines(baseline_key, clean_df, dirty_df)
        explicit_baseline = compute_quality_score(dirty_df, clean_df)["composite"]
        # Hand the baseline in through the keyword argument rather than
        # relying on whatever precompute_baselines stored.
        # NOTE(review): this value is derived from the same dirty_df that was
        # registered above, so it may equal the cached one - confirm that
        # this test really exercises the override path.
        outcome = grade_episode(
            clean_df, baseline_key, clean_df, dirty_baseline=explicit_baseline
        )
        assert outcome["score"] == 1.0

    def test_no_improvement_scores_zero(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """Submitting the dirty frame unchanged yields a score of 0.0."""
        # No precompute_baselines call is made for "x"; the baseline is
        # passed explicitly instead.
        explicit_baseline = compute_quality_score(dirty_df, clean_df)["composite"]
        outcome = grade_episode(
            dirty_df, "x", clean_df, dirty_baseline=explicit_baseline
        )
        assert outcome["score"] == 0.0