realadvisor-challenge / tests /test_cleaner.py
dcrey7's picture
test: add unit tests for all pipeline modules
e751d0d
"""Tests for data cleaning functions."""
import polars as pl
import pytest
from src.cleaner import (
add_derived_columns,
deduplicate_mutations,
filter_sales,
normalize_commune_codes,
remove_outliers,
)
def test_filter_sales_keeps_only_vente(sample_raw_df):
    """Every row returned by ``filter_sales`` has nature_mutation 'Vente'."""
    kept = filter_sales(sample_raw_df.lazy()).collect()
    is_sale = kept["nature_mutation"] == "Vente"
    assert is_sale.all()
def test_filter_sales_removes_null_price(sample_raw_df):
    """``filter_sales`` output has no null and no non-positive prices."""
    prices = filter_sales(sample_raw_df.lazy()).collect()["valeur_fonciere"]
    assert prices.null_count() == 0
    assert (prices > 0).all()
def test_filter_sales_removes_null_surface(sample_raw_df):
    """``filter_sales`` output has no null and no non-positive built surfaces."""
    surfaces = filter_sales(sample_raw_df.lazy()).collect()["surface_reelle_bati"]
    assert surfaces.null_count() == 0
    assert (surfaces > 0).all()
def test_filter_sales_keeps_only_residential(sample_raw_df):
    """Only 'Appartement' and 'Maison' remain in type_local after ``filter_sales``."""
    allowed_types = ("Appartement", "Maison")
    kept = filter_sales(sample_raw_df.lazy()).collect()
    seen_types = kept["type_local"].unique().to_list()

    # Two labels that must never survive the filter are checked by name first.
    assert "Local industriel. commercial ou assimilé" not in seen_types
    assert "Dependance" not in seen_types

    # General rule: every distinct type left over is one of the allowed pair.
    assert all(kind in allowed_types for kind in seen_types)
def test_deduplicate_single_row_mutations(sample_raw_df):
    """After filtering then deduplicating, each id_mutation occupies one row."""
    sales = filter_sales(sample_raw_df.lazy())
    deduped = deduplicate_mutations(sales).collect()

    # One row per id_mutation group; the per-group size lands in column "len".
    rows_per_mutation = deduped.group_by("id_mutation").len()["len"]
    assert (rows_per_mutation == 1).all()
def test_deduplicate_multi_row_same_type():
    """Two 'Appartement' rows of mutation M006 merge into one row.

    The merged row is expected to carry the summed built surface and the
    (shared) sale price unchanged.
    """
    n_rows = 2  # both input rows belong to the same mutation
    frame = pl.DataFrame(
        {
            "id_mutation": ["M006"] * n_rows,
            "date_mutation": ["2024-03-10"] * n_rows,
            "nature_mutation": ["Vente"] * n_rows,
            "valeur_fonciere": [400000.0] * n_rows,
            "code_postal": ["75002"] * n_rows,
            "code_commune": ["75102"] * n_rows,
            "nom_commune": ["Paris 2e"] * n_rows,
            "code_departement": ["75"] * n_rows,
            # Only parcel, surface and room count differ between the rows.
            "id_parcelle": ["75102000F006", "75102000F007"],
            "code_type_local": ["2"] * n_rows,
            "type_local": ["Appartement"] * n_rows,
            "surface_reelle_bati": [80.0, 30.0],
            "nombre_pieces_principales": [3, 1],
            "nombre_lots": [2] * n_rows,
            "longitude": [2.34] * n_rows,
            "latitude": [48.87] * n_rows,
        }
    )

    merged = deduplicate_mutations(frame.lazy()).collect()

    assert len(merged) == 1
    merged_surface = merged["surface_reelle_bati"][0]
    assert merged_surface == 110.0  # 80 + 30
    merged_price = merged["valeur_fonciere"][0]
    assert merged_price == 400000.0
def test_deduplicate_mixed_type_excluded():
    """Mutation M007 mixes 'Appartement' and 'Dependance' and must be dropped."""
    n_rows = 2  # both input rows belong to the same mutation
    frame = pl.DataFrame(
        {
            "id_mutation": ["M007"] * n_rows,
            "date_mutation": ["2023-06-01"] * n_rows,
            "nature_mutation": ["Vente"] * n_rows,
            "valeur_fonciere": [350000.0] * n_rows,
            "code_postal": ["06000"] * n_rows,
            "code_commune": ["06088"] * n_rows,
            "nom_commune": ["Nice"] * n_rows,
            "code_departement": ["06"] * n_rows,
            "id_parcelle": ["06088000G008", "06088000G009"],
            # The two rows disagree on property type, which is the point here.
            "code_type_local": ["2", "4"],
            "type_local": ["Appartement", "Dependance"],
            "surface_reelle_bati": [60.0, 10.0],
            "nombre_pieces_principales": [2, 0],
            "nombre_lots": [2] * n_rows,
            "longitude": [7.26] * n_rows,
            "latitude": [43.71] * n_rows,
        }
    )

    remaining = deduplicate_mutations(frame.lazy()).collect()

    assert len(remaining) == 0
def test_add_derived_columns_prix_m2():
    """Check prix_m2, code_section, year and code_region for one Paris sale."""
    sale = pl.DataFrame(
        {
            "valeur_fonciere": [200000.0],
            "surface_reelle_bati": [100.0],
            "id_parcelle": ["75101000A001"],
            "code_departement": ["75"],
            "date_mutation": ["2024-06-15"],
        }
    )
    # Expected string-valued columns, checked in this order.
    expected_text = {
        "code_section": "75101000A0",
        "year": "2024",
        "code_region": "11",  # Île-de-France
    }

    derived = add_derived_columns(sale.lazy()).collect()

    # Consistent with 200000 / 100; approx() guards against float rounding.
    assert derived["prix_m2"][0] == pytest.approx(2000.0)
    for column, value in expected_text.items():
        assert derived[column][0] == value
def test_add_derived_columns_temporal_weight():
    """Check months_since and temporal_weight for a sale dated 2024-01-01."""
    sale = pl.DataFrame(
        {
            "valeur_fonciere": [200000.0],
            "surface_reelle_bati": [100.0],
            "id_parcelle": ["75101000A001"],
            "code_departement": ["75"],
            "date_mutation": ["2024-01-01"],
        }
    )

    derived = add_derived_columns(sale.lazy()).collect()

    # ~12 months before the reference date (2025-01-01 per the original
    # author's note; the constant itself lives outside this file).
    months = derived["months_since"][0]
    assert months == pytest.approx(12.0, abs=0.5)

    # The weight must be a strict fraction and close to 0.97 per month of age.
    weight = derived["temporal_weight"][0]
    assert 0 < weight < 1
    assert weight == pytest.approx(0.97 ** 12, abs=0.05)
def test_add_derived_columns_corsica():
    """Department '2A' must map to region code '94'."""
    sale = pl.DataFrame(
        {
            "valeur_fonciere": [200000.0],
            "surface_reelle_bati": [100.0],
            "id_parcelle": ["2A004000B001"],
            "code_departement": ["2A"],
            "date_mutation": ["2024-06-15"],
        }
    )

    derived = add_derived_columns(sale.lazy()).collect()

    region = derived["code_region"][0]
    assert region == "94"  # Corse
def test_normalize_commune_paris():
    """Codes 75101, 75115 and 75120 all normalize to city code 75056."""
    communes = pl.DataFrame({"code_commune": ["75101", "75115", "75120"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    assert (city_codes == "75056").all()
def test_normalize_commune_lyon():
    """Codes 69381 and 69389 both normalize to city code 69123."""
    communes = pl.DataFrame({"code_commune": ["69381", "69389"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    assert (city_codes == "69123").all()
def test_normalize_commune_marseille():
    """Codes 13201 and 13216 both normalize to city code 13055."""
    communes = pl.DataFrame({"code_commune": ["13201", "13216"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    assert (city_codes == "13055").all()
def test_normalize_commune_regular_unchanged():
    """Codes 33063 and 31555 come back unchanged in code_commune_city."""
    ordinary_codes = ["33063", "31555"]
    communes = pl.DataFrame({"code_commune": ordinary_codes})

    normalized = normalize_commune_codes(communes.lazy()).collect()

    # Compare position by position: output row i must echo input code i.
    for position, code in enumerate(ordinary_codes):
        assert normalized["code_commune_city"][position] == code
def test_remove_outliers_surface():
    """Of surfaces 5, 50 and 1500 only the 50 row survives ``remove_outliers``."""
    candidates = pl.DataFrame(
        {
            "surface_reelle_bati": [5.0, 50.0, 1500.0],
            # Price per m2 is held constant so only surface can trigger removal.
            "prix_m2": [2000.0] * 3,
        }
    )

    kept = remove_outliers(candidates.lazy()).collect()

    assert len(kept) == 1
    surviving_surface = kept["surface_reelle_bati"][0]
    assert surviving_surface == 50.0
def test_remove_outliers_price():
    """Of prices per m2 50, 2000 and 30000 only the 2000 row survives."""
    candidates = pl.DataFrame(
        {
            # Surface is held constant so only prix_m2 can trigger removal.
            "surface_reelle_bati": [50.0] * 3,
            "prix_m2": [50.0, 2000.0, 30000.0],
        }
    )

    kept = remove_outliers(candidates.lazy()).collect()

    assert len(kept) == 1
    surviving_price = kept["prix_m2"][0]
    assert surviving_price == 2000.0