Spaces:
Sleeping
Sleeping
| """Tests for data cleaning functions.""" | |
| import polars as pl | |
| import pytest | |
| from src.cleaner import ( | |
| add_derived_columns, | |
| deduplicate_mutations, | |
| filter_sales, | |
| normalize_commune_codes, | |
| remove_outliers, | |
| ) | |
def test_filter_sales_keeps_only_vente(sample_raw_df):
    """Check that every row returned by ``filter_sales`` has nature ``"Vente"``."""
    kept = filter_sales(sample_raw_df.lazy()).collect()
    is_vente = kept["nature_mutation"] == "Vente"
    assert is_vente.all()
def test_filter_sales_removes_null_price(sample_raw_df):
    """Check that ``valeur_fonciere`` has no nulls and is strictly positive after filtering."""
    prices = filter_sales(sample_raw_df.lazy()).collect()["valeur_fonciere"]
    assert prices.null_count() == 0
    assert (prices > 0).all()
def test_filter_sales_removes_null_surface(sample_raw_df):
    """Check that ``surface_reelle_bati`` has no nulls and is strictly positive after filtering."""
    surfaces = filter_sales(sample_raw_df.lazy()).collect()["surface_reelle_bati"]
    assert surfaces.null_count() == 0
    assert (surfaces > 0).all()
def test_filter_sales_keeps_only_residential(sample_raw_df):
    """Check that only ``Appartement`` and ``Maison`` remain in ``type_local``."""
    kept = filter_sales(sample_raw_df.lazy()).collect()
    seen_types = kept["type_local"].unique().to_list()

    # Explicit negative checks for the two types named in this test.
    for excluded in ("Local industriel. commercial ou assimilé", "Dependance"):
        assert excluded not in seen_types

    # Whatever is left must belong to the allowed set.
    allowed = ("Appartement", "Maison")
    for local_type in seen_types:
        assert local_type in allowed
def test_deduplicate_single_row_mutations(sample_raw_df):
    """Check that no ``id_mutation`` occurs more than once after filter + dedup."""
    pipeline = deduplicate_mutations(filter_sales(sample_raw_df.lazy()))
    deduped = pipeline.collect()
    # One row per id_mutation, with the row count read from the "len" column.
    per_mutation = deduped.group_by("id_mutation").len()
    assert (per_mutation["len"] == 1).all()
def test_deduplicate_multi_row_same_type():
    """A two-row mutation (M006) of a single type must collapse to one row.

    Both input rows are ``Appartement``. The test expects a single output row
    whose surface is the sum of the inputs (80 + 30) and whose
    ``valeur_fonciere`` stays at 400000.0.
    """
    columns = {
        "id_mutation": ["M006", "M006"],
        "date_mutation": ["2024-03-10", "2024-03-10"],
        "nature_mutation": ["Vente", "Vente"],
        "valeur_fonciere": [400000.0, 400000.0],
        "code_postal": ["75002", "75002"],
        "code_commune": ["75102", "75102"],
        "nom_commune": ["Paris 2e", "Paris 2e"],
        "code_departement": ["75", "75"],
        "id_parcelle": ["75102000F006", "75102000F007"],
        "code_type_local": ["2", "2"],
        "type_local": ["Appartement", "Appartement"],
        "surface_reelle_bati": [80.0, 30.0],
        "nombre_pieces_principales": [3, 1],
        "nombre_lots": [2, 2],
        "longitude": [2.34, 2.34],
        "latitude": [48.87, 48.87],
    }
    merged = deduplicate_mutations(pl.DataFrame(columns).lazy()).collect()

    assert len(merged) == 1
    assert merged["surface_reelle_bati"][0] == 110.0  # 80 + 30
    assert merged["valeur_fonciere"][0] == 400000.0
def test_deduplicate_mixed_type_excluded():
    """A mutation (M007) mixing ``Appartement`` and ``Dependance`` must be dropped.

    The test expects ``deduplicate_mutations`` to return no rows for it.
    """
    columns = {
        "id_mutation": ["M007", "M007"],
        "date_mutation": ["2023-06-01", "2023-06-01"],
        "nature_mutation": ["Vente", "Vente"],
        "valeur_fonciere": [350000.0, 350000.0],
        "code_postal": ["06000", "06000"],
        "code_commune": ["06088", "06088"],
        "nom_commune": ["Nice", "Nice"],
        "code_departement": ["06", "06"],
        "id_parcelle": ["06088000G008", "06088000G009"],
        "code_type_local": ["2", "4"],
        "type_local": ["Appartement", "Dependance"],
        "surface_reelle_bati": [60.0, 10.0],
        "nombre_pieces_principales": [2, 0],
        "nombre_lots": [2, 2],
        "longitude": [7.26, 7.26],
        "latitude": [43.71, 43.71],
    }
    remaining = deduplicate_mutations(pl.DataFrame(columns).lazy()).collect()

    assert len(remaining) == 0
def test_add_derived_columns_prix_m2():
    """Check the derived columns for a single Paris (department 75) row.

    With ``valeur_fonciere`` 200000.0 and ``surface_reelle_bati`` 100.0 the
    test expects ``prix_m2`` of about 2000.0, plus the expected
    ``code_section``, ``year`` and ``code_region`` strings.
    """
    single_sale = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["75101000A001"],
        "code_departement": ["75"],
        "date_mutation": ["2024-06-15"],
    })
    derived = add_derived_columns(single_sale.lazy()).collect()

    assert derived["prix_m2"][0] == pytest.approx(2000.0)

    expected_text = {
        "code_section": "75101000A0",
        "year": "2024",
        "code_region": "11",  # Île-de-France
    }
    for column, value in expected_text.items():
        assert derived[column][0] == value
def test_add_derived_columns_temporal_weight():
    """Check ``months_since`` and ``temporal_weight`` for a 2024-01-01 sale.

    The test expects roughly 12 months of age and a weight strictly between
    0 and 1, within 0.05 of ``0.97 ** 12``.
    """
    single_sale = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["75101000A001"],
        "code_departement": ["75"],
        "date_mutation": ["2024-01-01"],
    })
    derived = add_derived_columns(single_sale.lazy()).collect()

    # ~12 months before reference date (2025-01-01)
    months = derived["months_since"][0]
    assert months == pytest.approx(12.0, abs=0.5)

    weight = derived["temporal_weight"][0]
    assert 0 < weight < 1
    assert weight == pytest.approx(0.97 ** 12, abs=0.05)
def test_add_derived_columns_corsica():
    """Check that department code ``"2A"`` yields ``code_region`` ``"94"``."""
    corsican_sale = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["2A004000B001"],
        "code_departement": ["2A"],
        "date_mutation": ["2024-06-15"],
    })
    derived = add_derived_columns(corsican_sale.lazy()).collect()
    region = derived["code_region"][0]
    assert region == "94"  # Corse
def test_normalize_commune_paris():
    """Check that codes 75101, 75115 and 75120 all normalize to ``"75056"``."""
    communes = pl.DataFrame({"code_commune": ["75101", "75115", "75120"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    assert (city_codes == "75056").all()
def test_normalize_commune_lyon():
    """Check that codes 69381 and 69389 both normalize to ``"69123"``."""
    communes = pl.DataFrame({"code_commune": ["69381", "69389"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    assert (city_codes == "69123").all()
def test_normalize_commune_marseille():
    """Check that codes 13201 and 13216 both normalize to ``"13055"``."""
    communes = pl.DataFrame({"code_commune": ["13201", "13216"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    assert (city_codes == "13055").all()
def test_normalize_commune_regular_unchanged():
    """Check that codes 33063 and 31555 pass through unchanged, in order."""
    communes = pl.DataFrame({"code_commune": ["33063", "31555"]})
    city_codes = normalize_commune_codes(communes.lazy()).collect()["code_commune_city"]
    first, second = city_codes[0], city_codes[1]
    assert first == "33063"
    assert second == "31555"
def test_remove_outliers_surface():
    """Check that of surfaces 5.0, 50.0 and 1500.0 only the 50.0 row survives."""
    candidates = pl.DataFrame({
        "surface_reelle_bati": [5.0, 50.0, 1500.0],
        "prix_m2": [2000.0, 2000.0, 2000.0],
    })
    survivors = remove_outliers(candidates.lazy()).collect()

    assert len(survivors) == 1
    assert survivors["surface_reelle_bati"][0] == 50.0
def test_remove_outliers_price():
    """Check that of ``prix_m2`` values 50.0, 2000.0 and 30000.0 only 2000.0 survives."""
    candidates = pl.DataFrame({
        "surface_reelle_bati": [50.0, 50.0, 50.0],
        "prix_m2": [50.0, 2000.0, 30000.0],
    })
    survivors = remove_outliers(candidates.lazy()).collect()

    assert len(survivors) == 1
    assert survivors["prix_m2"][0] == 2000.0