Spaces:
Sleeping
Sleeping
| """Tests for the grading system.""" | |
| from __future__ import annotations | |
| import inspect | |
| import numpy as np | |
| import pandas as pd | |
| import pytest | |
| from data_cleaning_env.grader import ( | |
| compute_quality_score, | |
| grade_episode, | |
| precompute_baselines, | |
| ) | |
class TestComputeQualityScore:
    """Checks on the score dict returned by ``compute_quality_score``."""

    def test_perfect_score(self, clean_df: pd.DataFrame) -> None:
        """Scoring the clean frame against itself gives a composite of 1.0."""
        composite = compute_quality_score(clean_df, clean_df)["composite"]
        assert composite == 1.0

    def test_empty_df_scores_zero(self, clean_df: pd.DataFrame) -> None:
        """An empty frame scored against the clean frame gives composite 0.0."""
        result = compute_quality_score(pd.DataFrame(), clean_df)
        assert result["composite"] == 0.0

    def test_deterministic(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """Two calls with identical inputs must return equal results."""
        first = compute_quality_score(dirty_df, clean_df)
        second = compute_quality_score(dirty_df, clean_df)
        assert first == second, "Score is not deterministic"

    def test_dirty_scores_less_than_clean(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """The dirty frame's composite is strictly below the clean frame's."""
        dirty_composite = compute_quality_score(dirty_df, clean_df)["composite"]
        clean_composite = compute_quality_score(clean_df, clean_df)["composite"]
        assert dirty_composite < clean_composite

    def test_all_components_present(self, clean_df: pd.DataFrame) -> None:
        """The result exposes every sub-score plus the composite."""
        expected_keys = (
            "completeness",
            "consistency",
            "accuracy",
            "format",
            "composite",
        )
        result = compute_quality_score(clean_df, clean_df)
        for key in expected_keys:
            assert key in result, f"Missing key: {key}"

    def test_all_nan_completeness_low(self, clean_df: pd.DataFrame) -> None:
        """A frame with every cell blanked has completeness below 0.1."""
        # Cast to object first so every column can hold None, then blank
        # out every cell in one assignment.
        blanked = clean_df.astype(object).copy()
        blanked[:] = None
        result = compute_quality_score(blanked, clean_df)
        assert result["completeness"] < 0.1

    def test_row_drop_penalizes_accuracy(self, clean_df: pd.DataFrame) -> None:
        """Keeping only the first two rows must lower the accuracy score.

        The original test states that the grader uses ``n_clean`` as the
        accuracy denominator, so missing rows count against the submission.
        """
        first_two_rows = clean_df.iloc[:2].copy()
        truncated = compute_quality_score(first_two_rows, clean_df)
        baseline = compute_quality_score(clean_df, clean_df)
        assert truncated["accuracy"] < baseline["accuracy"]

    def test_no_sklearn_in_grader(self) -> None:
        """The source text of ``compute_quality_score`` never mentions sklearn."""
        # Only this one function's source is inspected, not the whole module.
        assert "sklearn" not in inspect.getsource(compute_quality_score)
class TestGradeEpisode:
    """Checks on ``grade_episode`` with cached and explicit dirty baselines."""

    def test_perfect_episode_scores_one(self, clean_df: pd.DataFrame) -> None:
        """Perfect data grades 1.0 when the dirty baseline equals the clean one."""
        # The "dirty" frame registered under "_test" is a copy of the clean one.
        baseline_frame = clean_df.copy()
        precompute_baselines("_test", clean_df, baseline_frame)
        graded = grade_episode(clean_df, "_test", clean_df)
        assert graded["score"] == 1.0
        assert "breakdown" in graded

    def test_per_episode_baseline_overrides_cache(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """Perfect data grades 1.0 when ``dirty_baseline`` is passed explicitly."""
        precompute_baselines("_test2", clean_df, dirty_df)
        explicit_baseline = compute_quality_score(dirty_df, clean_df)["composite"]
        # NOTE(review): the original comment says this baseline differs from
        # the cached one, but it is computed from the same dirty_df passed to
        # precompute_baselines above. Confirm the override is really exercised.
        graded = grade_episode(
            clean_df, "_test2", clean_df, dirty_baseline=explicit_baseline
        )
        assert graded["score"] == 1.0

    def test_no_improvement_scores_zero(
        self, clean_df: pd.DataFrame, dirty_df: pd.DataFrame
    ) -> None:
        """Submitting the dirty frame against its own baseline grades 0.0."""
        # No precompute_baselines call is made for "x" in this test; the
        # baseline is supplied only through the dirty_baseline argument.
        explicit_baseline = compute_quality_score(dirty_df, clean_df)["composite"]
        graded = grade_episode(
            dirty_df, "x", clean_df, dirty_baseline=explicit_baseline
        )
        assert graded["score"] == 0.0