dcrey7 committed on
Commit
e751d0d
·
1 Parent(s): 2b0cef4

test: add unit tests for all pipeline modules

Browse files

60 tests covering config, cleaner, aggregator, and top_cities.

tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared fixtures for DVF pipeline tests."""
2
+
3
+ import polars as pl
4
+ import pytest
5
+
6
+
7
@pytest.fixture
def sample_raw_df() -> pl.DataFrame:
    """
    Tiny synthetic DVF extract shaped like the raw CSV.

    Deliberately packs in the edge cases the cleaner tests exercise:
    multi-row mutations (same and mixed type), a Paris arrondissement,
    price/surface outliers, a commercial property, a non-sale mutation,
    and a null price.
    """
    return pl.DataFrame(
        {
            "id_mutation": [
                # Plain single-row mutations
                "M001", "M002", "M003", "M004", "M005",
                # Multi-row, same type_local (expected to be aggregated)
                "M006", "M006",
                # Multi-row, mixed type_local (expected to be excluded)
                "M007", "M007",
                # Outliers (tiny surface / huge price)
                "M008", "M009",
                # Commercial property (filtered by the type rules)
                "M010",
                # Paris arrondissement commune code
                "M011",
                # Non-sale mutation (Echange)
                "M012",
                # Null valeur_fonciere
                "M013",
            ],
            "date_mutation": [
                "2024-06-15", "2023-01-20", "2022-07-10", "2021-03-05", "2020-11-22",
                "2024-03-10", "2024-03-10",
                "2023-06-01", "2023-06-01",
                "2024-01-01", "2024-01-01",
                "2024-05-01",
                "2024-08-20",
                "2023-09-15",
                "2024-02-01",
            ],
            "nature_mutation": [
                "Vente", "Vente", "Vente", "Vente", "Vente",
                "Vente", "Vente",
                "Vente", "Vente",
                "Vente", "Vente",
                "Vente",
                "Vente",
                "Echange",
                "Vente",
            ],
            "valeur_fonciere": [
                200000.0, 150000.0, 300000.0, 180000.0, 250000.0,
                400000.0, 400000.0,
                350000.0, 350000.0,
                5000.0, 9000000.0,
                120000.0,
                500000.0,
                100000.0,
                None,
            ],
            "code_postal": [
                "75001", "69001", "13001", "31000", "33000",
                "75002", "75002",
                "06000", "06000",
                "44000", "44000",
                "34000",
                "75101",
                "59000",
                "75003",
            ],
            "code_commune": [
                "75101", "69381", "13201", "31555", "33063",
                "75102", "75102",
                "06088", "06088",
                "44109", "44109",
                "34172",
                "75101",
                "59350",
                "75103",
            ],
            "nom_commune": [
                "Paris 1er", "Lyon 1er", "Marseille 1er", "Toulouse", "Bordeaux",
                "Paris 2e", "Paris 2e",
                "Nice", "Nice",
                "Nantes", "Nantes",
                "Montpellier",
                "Paris 1er",
                "Lille",
                "Paris 3e",
            ],
            "code_departement": [
                "75", "69", "13", "31", "33",
                "75", "75",
                "06", "06",
                "44", "44",
                "34",
                "75",
                "59",
                "75",
            ],
            "id_parcelle": [
                "75101000A001", "69381000B002", "13201000C003", "31555000D004", "33063000E005",
                "75102000F006", "75102000F007",
                "06088000G008", "06088000G009",
                "44109000H010", "44109000H011",
                "34172000I012",
                "75101000J013",
                "59350000K014",
                "75103000L015",
            ],
            "code_type_local": [
                "2", "2", "1", "1", "2",
                "2", "2",
                "2", "4",
                "2", "2",
                "4",
                "2",
                "1",
                "2",
            ],
            "type_local": [
                "Appartement", "Appartement", "Maison", "Maison", "Appartement",
                "Appartement", "Appartement",
                "Appartement", "Dependance",
                "Appartement", "Appartement",
                "Local industriel. commercial ou assimilé",
                "Appartement",
                "Maison",
                "Appartement",
            ],
            "surface_reelle_bati": [
                50.0, 75.0, 120.0, 90.0, 65.0,
                80.0, 30.0,
                60.0, 10.0,
                3.0, 45.0,
                200.0,
                55.0,
                100.0,
                40.0,
            ],
            "nombre_pieces_principales": [
                2, 3, 5, 4, 3,
                3, 1,
                2, 0,
                1, 2,
                0,
                2,
                4,
                2,
            ],
            "nombre_lots": [
                1, 1, 1, 1, 1,
                2, 2,
                2, 2,
                1, 1,
                1,
                1,
                1,
                1,
            ],
            "longitude": [
                2.34, 4.83, 5.37, 1.44, -0.57,
                2.34, 2.34,
                7.26, 7.26,
                -1.55, -1.55,
                3.87,
                2.34,
                3.06,
                2.36,
            ],
            "latitude": [
                48.86, 45.76, 43.30, 43.60, 44.84,
                48.87, 48.87,
                43.71, 43.71,
                47.22, 47.22,
                43.61,
                48.86,
                50.63,
                48.86,
            ],
        }
    )
186
+
187
+
188
@pytest.fixture
def sample_clean_df() -> pl.DataFrame:
    """
    Already-cleaned dataset carrying every derived column — i.e. the
    shape the cleaner hands off to the aggregation stage.
    """
    return pl.DataFrame(
        {
            "id_mutation": ["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8"],
            "date_mutation": [
                "2024-06-15", "2024-01-10", "2023-06-01",
                "2022-01-15", "2021-06-01", "2020-01-10",
                "2024-09-01", "2024-03-15",
            ],
            "type_local": [
                "Appartement", "Maison", "Appartement",
                "Maison", "Appartement", "Maison",
                "Appartement", "Maison",
            ],
            "valeur_fonciere": [
                250000.0, 180000.0, 200000.0,
                150000.0, 300000.0, 120000.0,
                400000.0, 220000.0,
            ],
            "surface_reelle_bati": [
                50.0, 90.0, 80.0,
                100.0, 60.0, 80.0,
                40.0, 110.0,
            ],
            "prix_m2": [
                5000.0, 2000.0, 2500.0,
                1500.0, 5000.0, 1500.0,
                10000.0, 2000.0,
            ],
            # Decay weights consistent with months_since below (base 0.97);
            # M7 is future-dated relative to the reference, hence the
            # negative exponent.
            "temporal_weight": [
                0.97**2, 0.97**7, 0.97**13,
                0.97**25, 0.97**37, 0.97**49,
                0.97**(-2), 0.97**4,
            ],
            "code_departement": ["75", "69", "75", "69", "75", "69", "75", "69"],
            "code_commune": ["75056", "69123", "75056", "69123", "75056", "69123", "75056", "69123"],
            "code_commune_city": ["75056", "69123", "75056", "69123", "75056", "69123", "75056", "69123"],
            "code_postal": ["75001", "69001", "75002", "69002", "75003", "69003", "75001", "69002"],
            "code_section": ["7510100001", "6938100001", "7510200001", "6912300001", "7510300001", "6912300002", "7510100002", "6912300003"],
            "code_region": ["11", "84", "11", "84", "11", "84", "11", "84"],
            "year": ["2024", "2024", "2023", "2022", "2021", "2020", "2024", "2024"],
            "months_since": [2.0, 7.0, 13.0, 25.0, 37.0, 49.0, -2.0, 4.0],
        }
    )
tests/test_aggregator.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for aggregation functions."""
2
+
3
+ import json
4
+ import math
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import pytest
10
+
11
+ from src.aggregator import (
12
+ LEVEL_TO_COLUMN,
13
+ _aggregate_group,
14
+ _export_sections,
15
+ aggregate_all_levels,
16
+ aggregate_all_types,
17
+ aggregate_level,
18
+ effective_sample_size,
19
+ export_json,
20
+ weighted_trimmed_mean,
21
+ )
22
+
23
+
24
# ---- weighted_trimmed_mean ----

def test_wtm_uniform_weights():
    """Equal weights and no trimming: WTM reduces to the arithmetic mean."""
    values = np.array([100.0, 200.0, 300.0, 400.0, 500.0])
    uniform = np.ones(5)
    assert weighted_trimmed_mean(values, uniform, trim=0.0) == pytest.approx(300.0)


def test_wtm_trims_extremes():
    """A 20% trim should clip both tails, damping outlier influence."""
    values = np.array([1.0, 100.0, 200.0, 300.0, 10000.0])
    uniform = np.ones(5)
    trimmed = weighted_trimmed_mean(values, uniform, trim=0.2)
    untrimmed = weighted_trimmed_mean(values, uniform, trim=0.0)
    # The trimmed estimate must be less sensitive to the extremes.
    assert trimmed < untrimmed


def test_wtm_respects_weight_ordering():
    """The mean must shift toward the more heavily weighted observation."""
    values = np.array([100.0, 200.0])
    # Overweight 200 in one call, 100 in the other.
    favor_200 = weighted_trimmed_mean(values, np.array([0.1, 10.0]), trim=0.0)
    favor_100 = weighted_trimmed_mean(values, np.array([10.0, 0.1]), trim=0.0)
    assert favor_200 > favor_100


def test_wtm_empty_array():
    outcome = weighted_trimmed_mean(np.array([]), np.array([]), trim=0.2)
    assert math.isnan(outcome)


def test_wtm_single_element():
    outcome = weighted_trimmed_mean(np.array([5000.0]), np.array([1.0]), trim=0.2)
    assert outcome == pytest.approx(5000.0)


def test_wtm_zero_weights():
    outcome = weighted_trimmed_mean(np.array([100.0, 200.0]), np.array([0.0, 0.0]))
    assert math.isnan(outcome)
68
+
69
+
70
# ---- effective_sample_size ----

def test_ess_equal_weights():
    """Uniform weights: the effective sample size equals the raw count."""
    assert effective_sample_size(np.ones(50)) == pytest.approx(50.0)


def test_ess_unequal_weights():
    """Any weight imbalance must shrink n_eff below n (but keep it above 1)."""
    n_eff = effective_sample_size(np.array([1.0, 1.0, 1.0, 0.01]))
    assert n_eff < 4.0
    assert n_eff > 1.0


def test_ess_single_dominant():
    """One overwhelming weight drives n_eff toward a single observation."""
    n_eff = effective_sample_size(np.array([1000.0, 0.001, 0.001, 0.001]))
    assert n_eff < 2.0


def test_ess_empty():
    assert effective_sample_size(np.array([])) == 0.0


# ---- _aggregate_group ----

def test_aggregate_group_basic():
    frame = pl.DataFrame({
        "prix_m2": [2000.0, 3000.0, 4000.0, 2500.0, 3500.0],
        "temporal_weight": [1.0, 1.0, 1.0, 1.0, 1.0],
    })
    stats = _aggregate_group(frame)
    assert stats["volume"] == 5
    assert stats["median"] == pytest.approx(3000.0, rel=0.01)
    assert stats["q1"] < stats["median"]
    assert stats["q3"] > stats["median"]
    assert stats["n_eff"] == pytest.approx(5.0, abs=0.1)
    assert 0.0 <= stats["confidence"] <= 1.0


def test_aggregate_group_with_temporal_decay():
    """Lower weights on old rows should pull the WTM toward recent prices."""
    frame = pl.DataFrame({
        "prix_m2": [2000.0, 2000.0, 2000.0, 5000.0, 5000.0],
        "temporal_weight": [0.5, 0.5, 0.5, 1.0, 1.0],
    })
    stats = _aggregate_group(frame)
    # The 5000s carry more weight, so the WTM sits above the plain median.
    assert stats["wtm"] > stats["median"]


def test_aggregate_group_empty():
    frame = pl.DataFrame({
        "prix_m2": pl.Series([], dtype=pl.Float64),
        "temporal_weight": pl.Series([], dtype=pl.Float64),
    })
    stats = _aggregate_group(frame)
    assert stats["volume"] == 0
    assert stats["confidence"] == 0.0


def test_aggregate_group_returns_all_keys():
    stats = _aggregate_group(pl.DataFrame({
        "prix_m2": [3000.0],
        "temporal_weight": [1.0],
    }))
    assert set(stats.keys()) == {
        "median", "wtm", "q1", "q3", "volume", "n_eff", "confidence"
    }
142
+
143
+
144
# ---- aggregate_level ----

def test_aggregate_level_groups_correctly(sample_clean_df):
    by_dept = aggregate_level(sample_clean_df, "code_departement")
    assert "75" in by_dept
    assert "69" in by_dept
    # The fixture holds exactly four rows per department.
    assert by_dept["75"]["volume"] == 4
    assert by_dept["69"]["volume"] == 4


def test_aggregate_level_with_property_type(sample_clean_df):
    by_dept = aggregate_level(
        sample_clean_df, "code_departement", property_type="Appartement"
    )
    assert "75" in by_dept
    # Volume must match a manual filter on dept 75 + Appartement.
    manual_filter = sample_clean_df.filter(
        (pl.col("code_departement") == "75")
        & (pl.col("type_local") == "Appartement")
    )
    assert by_dept["75"]["volume"] == len(manual_filter)


def test_aggregate_level_country(sample_clean_df):
    nationwide = aggregate_level(sample_clean_df, "_country")
    assert "FR" in nationwide
    assert nationwide["FR"]["volume"] == 8


# ---- aggregate_all_types ----

def test_aggregate_all_types_keys(sample_clean_df):
    result = aggregate_all_types(sample_clean_df, "code_departement")
    # Every code gets the all-types ("tous") bucket...
    for code in result:
        assert "tous" in result[code]
    # ...plus per-type buckets: the fixture has only Appartement in 75
    # and only Maison in 69.
    assert "appartement" in result["75"]
    assert "maison" in result["69"]


# ---- aggregate_all_levels ----

def test_aggregate_all_levels_keys(sample_clean_df):
    result = aggregate_all_levels(sample_clean_df)
    for level in ["country", "region", "department", "commune", "postcode", "section"]:
        assert level in result
190
+
191
+
192
# ---- _export_sections ----

def test_export_sections_splits_by_dept(tmp_path):
    """Sections are written into one JSON file per department prefix."""
    section_data = {
        "7510100001": {"tous": {"median": 5000}},
        "7510100002": {"tous": {"median": 5100}},
        "6938100001": {"tous": {"median": 3000}},
        "2A004000B0": {"tous": {"median": 2000}},
    }
    _export_sections(section_data, tmp_path)
    sections_dir = tmp_path / "sections"
    for dept in ("75", "69", "2A"):
        assert (sections_dir / f"{dept}.json").exists()

    # Both 75xxx sections land in the same department file.
    with open(sections_dir / "75.json") as f:
        data_75 = json.load(f)
    assert len(data_75) == 2


def test_export_sections_dom_tom(tmp_path):
    """DOM-TOM departments (971-976) use 3-digit dept codes."""
    section_data = {
        "97105000001": {"tous": {"median": 2500}},
        "97205000001": {"tous": {"median": 2600}},
    }
    _export_sections(section_data, tmp_path)
    sections_dir = tmp_path / "sections"
    assert (sections_dir / "971.json").exists()
    assert (sections_dir / "972.json").exists()


# ---- export_json ----

def test_export_json_creates_files(tmp_path, sample_clean_df):
    export_json(aggregate_all_levels(sample_clean_df), tmp_path)
    for level in ("country", "region", "department", "commune", "postcode"):
        assert (tmp_path / f"prices_{level}.json").exists()
    # Section-level data is split into per-department files instead.
    assert (tmp_path / "sections").is_dir()


# ---- LEVEL_TO_COLUMN mapping ----

def test_level_to_column_covers_all_levels():
    from src.config import AGGREGATION_LEVELS
    for level in AGGREGATION_LEVELS:
        assert level in LEVEL_TO_COLUMN
tests/test_cleaner.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for data cleaning functions."""
2
+
3
+ import polars as pl
4
+ import pytest
5
+
6
+ from src.cleaner import (
7
+ add_derived_columns,
8
+ deduplicate_mutations,
9
+ filter_sales,
10
+ normalize_commune_codes,
11
+ remove_outliers,
12
+ )
13
+
14
+
15
def test_filter_sales_keeps_only_vente(sample_raw_df):
    """Mutations whose nature is not 'Vente' are dropped."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    assert (result["nature_mutation"] == "Vente").all()


def test_filter_sales_removes_null_price(sample_raw_df):
    """Rows without a strictly positive valeur_fonciere are dropped."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    assert result["valeur_fonciere"].null_count() == 0
    assert (result["valeur_fonciere"] > 0).all()


def test_filter_sales_removes_null_surface(sample_raw_df):
    """Rows without a strictly positive built surface are dropped."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    assert result["surface_reelle_bati"].null_count() == 0
    assert (result["surface_reelle_bati"] > 0).all()


def test_filter_sales_keeps_only_residential(sample_raw_df):
    """Only Appartement/Maison survive; commercial and outbuildings do not."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    kept_types = result["type_local"].unique().to_list()
    assert "Local industriel. commercial ou assimilé" not in kept_types
    assert "Dependance" not in kept_types
    for kept in kept_types:
        assert kept in ("Appartement", "Maison")
43
+
44
+
45
def test_deduplicate_single_row_mutations(sample_raw_df):
    """After dedup, every id_mutation appears exactly once."""
    deduped = deduplicate_mutations(filter_sales(sample_raw_df.lazy())).collect()
    per_mutation = deduped.group_by("id_mutation").len()
    assert (per_mutation["len"] == 1).all()


def test_deduplicate_multi_row_same_type():
    """M006: two Appartement rows collapse to one row with summed surface."""
    frame = pl.DataFrame({
        "id_mutation": ["M006", "M006"],
        "date_mutation": ["2024-03-10", "2024-03-10"],
        "nature_mutation": ["Vente", "Vente"],
        "valeur_fonciere": [400000.0, 400000.0],
        "code_postal": ["75002", "75002"],
        "code_commune": ["75102", "75102"],
        "nom_commune": ["Paris 2e", "Paris 2e"],
        "code_departement": ["75", "75"],
        "id_parcelle": ["75102000F006", "75102000F007"],
        "code_type_local": ["2", "2"],
        "type_local": ["Appartement", "Appartement"],
        "surface_reelle_bati": [80.0, 30.0],
        "nombre_pieces_principales": [3, 1],
        "nombre_lots": [2, 2],
        "longitude": [2.34, 2.34],
        "latitude": [48.87, 48.87],
    })
    deduped = deduplicate_mutations(frame.lazy()).collect()
    assert len(deduped) == 1
    assert deduped["surface_reelle_bati"][0] == 110.0  # 80 + 30
    # The duplicated price must not be double-counted.
    assert deduped["valeur_fonciere"][0] == 400000.0


def test_deduplicate_mixed_type_excluded():
    """M007 mixes Appartement and Dependance, so the mutation is discarded."""
    frame = pl.DataFrame({
        "id_mutation": ["M007", "M007"],
        "date_mutation": ["2023-06-01", "2023-06-01"],
        "nature_mutation": ["Vente", "Vente"],
        "valeur_fonciere": [350000.0, 350000.0],
        "code_postal": ["06000", "06000"],
        "code_commune": ["06088", "06088"],
        "nom_commune": ["Nice", "Nice"],
        "code_departement": ["06", "06"],
        "id_parcelle": ["06088000G008", "06088000G009"],
        "code_type_local": ["2", "4"],
        "type_local": ["Appartement", "Dependance"],
        "surface_reelle_bati": [60.0, 10.0],
        "nombre_pieces_principales": [2, 0],
        "nombre_lots": [2, 2],
        "longitude": [7.26, 7.26],
        "latitude": [43.71, 43.71],
    })
    deduped = deduplicate_mutations(frame.lazy()).collect()
    assert len(deduped) == 0
102
+
103
+
104
def test_add_derived_columns_prix_m2():
    """prix_m2, code_section, year and code_region are all derived."""
    frame = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["75101000A001"],
        "code_departement": ["75"],
        "date_mutation": ["2024-06-15"],
    })
    derived = add_derived_columns(frame.lazy()).collect()
    assert derived["prix_m2"][0] == pytest.approx(2000.0)
    assert derived["code_section"][0] == "75101000A0"
    assert derived["year"][0] == "2024"
    assert derived["code_region"][0] == "11"  # Île-de-France


def test_add_derived_columns_temporal_weight():
    """A sale ~12 months before the reference date decays by ~0.97**12."""
    frame = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["75101000A001"],
        "code_departement": ["75"],
        "date_mutation": ["2024-01-01"],
    })
    derived = add_derived_columns(frame.lazy()).collect()
    # Roughly 12 months before the reference date (2025-01-01).
    assert derived["months_since"][0] == pytest.approx(12.0, abs=0.5)
    assert 0 < derived["temporal_weight"][0] < 1
    expected_weight = 0.97 ** 12
    assert derived["temporal_weight"][0] == pytest.approx(expected_weight, abs=0.05)


def test_add_derived_columns_corsica():
    """Corsican departments (2A/2B) resolve to region 94."""
    frame = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["2A004000B001"],
        "code_departement": ["2A"],
        "date_mutation": ["2024-06-15"],
    })
    derived = add_derived_columns(frame.lazy()).collect()
    assert derived["code_region"][0] == "94"  # Corse
145
+
146
+
147
def test_normalize_commune_paris():
    """All Paris arrondissement codes collapse to the city code 75056."""
    frame = pl.DataFrame({"code_commune": ["75101", "75115", "75120"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert (normalized["code_commune_city"] == "75056").all()


def test_normalize_commune_lyon():
    """Lyon arrondissement codes collapse to 69123."""
    frame = pl.DataFrame({"code_commune": ["69381", "69389"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert (normalized["code_commune_city"] == "69123").all()


def test_normalize_commune_marseille():
    """Marseille arrondissement codes collapse to 13055."""
    frame = pl.DataFrame({"code_commune": ["13201", "13216"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert (normalized["code_commune_city"] == "13055").all()


def test_normalize_commune_regular_unchanged():
    """Ordinary commune codes pass through untouched."""
    frame = pl.DataFrame({"code_commune": ["33063", "31555"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert normalized["code_commune_city"][0] == "33063"
    assert normalized["code_commune_city"][1] == "31555"


def test_remove_outliers_surface():
    """Surfaces outside the sane range are dropped; the middle row survives."""
    frame = pl.DataFrame({
        "surface_reelle_bati": [5.0, 50.0, 1500.0],
        "prix_m2": [2000.0, 2000.0, 2000.0],
    })
    kept = remove_outliers(frame.lazy()).collect()
    assert len(kept) == 1
    assert kept["surface_reelle_bati"][0] == 50.0


def test_remove_outliers_price():
    """Prices per m2 outside the sane range are dropped."""
    frame = pl.DataFrame({
        "surface_reelle_bati": [50.0, 50.0, 50.0],
        "prix_m2": [50.0, 2000.0, 30000.0],
    })
    kept = remove_outliers(frame.lazy()).collect()
    assert len(kept) == 1
    assert kept["prix_m2"][0] == 2000.0
tests/test_config.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for configuration constants and mappings."""
2
+
3
+ from src.config import (
4
+ AGGREGATION_LEVELS,
5
+ ARRONDISSEMENT_MAPPING,
6
+ DEPT_TO_REGION,
7
+ DVF_YEARS,
8
+ NO_DVF_DEPARTMENTS,
9
+ PRICE_M2_MAX,
10
+ PRICE_M2_MIN,
11
+ REGION_NAMES,
12
+ SURFACE_MAX,
13
+ SURFACE_MIN,
14
+ TEMPORAL_LAMBDA,
15
+ TOP_10_CITIES,
16
+ TRIM_FRACTION,
17
+ VALID_TYPE_LOCAL,
18
+ dvf_url,
19
+ )
20
+
21
+
22
+ def test_dvf_years_covers_full_range():
23
+ assert DVF_YEARS == list(range(2014, 2026))
24
+ assert len(DVF_YEARS) == 12
25
+
26
+
27
+ def test_valid_type_local_residential_only():
28
+ assert "Appartement" in VALID_TYPE_LOCAL
29
+ assert "Maison" in VALID_TYPE_LOCAL
30
+ assert len(VALID_TYPE_LOCAL) == 2
31
+ # Commercial should NOT be included
32
+ assert "Local industriel. commercial ou assimilé" not in VALID_TYPE_LOCAL
33
+
34
+
35
+ def test_dept_to_region_coverage():
36
+ # 101 departments total in France (including DOM-TOM)
37
+ assert len(DEPT_TO_REGION) >= 100
38
+ # Spot checks
39
+ assert DEPT_TO_REGION["75"] == "11" # Paris -> Île-de-France
40
+ assert DEPT_TO_REGION["69"] == "84" # Rhône -> Auvergne-Rhône-Alpes
41
+ assert DEPT_TO_REGION["2A"] == "94" # Corse-du-Sud -> Corse
42
+ assert DEPT_TO_REGION["2B"] == "94" # Haute-Corse -> Corse
43
+ assert DEPT_TO_REGION["971"] == "01" # Guadeloupe
44
+
45
+
46
+ def test_region_names():
47
+ assert len(REGION_NAMES) == 18 # 13 metropolitan + 5 overseas
48
+ assert REGION_NAMES["11"] == "Île-de-France"
49
+ assert REGION_NAMES["84"] == "Auvergne-Rhône-Alpes"
50
+
51
+
52
+ def test_arrondissement_mapping_paris():
53
+ for i in range(1, 21):
54
+ code = f"751{i:02d}"
55
+ assert ARRONDISSEMENT_MAPPING[code] == "75056"
56
+
57
+
58
+ def test_arrondissement_mapping_lyon():
59
+ for i in range(1, 10):
60
+ code = f"6938{i}"
61
+ assert ARRONDISSEMENT_MAPPING[code] == "69123"
62
+
63
+
64
+ def test_arrondissement_mapping_marseille():
65
+ for i in range(1, 17):
66
+ code = f"132{i:02d}"
67
+ assert ARRONDISSEMENT_MAPPING[code] == "13055"
68
+
69
+
70
+ def test_top_10_cities():
71
+ assert len(TOP_10_CITIES) == 10
72
+ assert TOP_10_CITIES["75056"] == "Paris"
73
+ assert TOP_10_CITIES["13055"] == "Marseille"
74
+ assert TOP_10_CITIES["69123"] == "Lyon"
75
+
76
+
77
+ def test_no_dvf_departments():
78
+ assert NO_DVF_DEPARTMENTS == {"57", "67", "68", "976"}
79
+
80
+
81
+ def test_aggregation_levels():
82
+ assert AGGREGATION_LEVELS == [
83
+ "country", "region", "department", "commune", "postcode", "section"
84
+ ]
85
+
86
+
87
+ def test_price_bounds_sensible():
88
+ assert PRICE_M2_MIN < PRICE_M2_MAX
89
+ assert PRICE_M2_MIN >= 100
90
+ assert PRICE_M2_MAX <= 50000
91
+
92
+
93
+ def test_surface_bounds_sensible():
94
+ assert SURFACE_MIN < SURFACE_MAX
95
+ assert SURFACE_MIN >= 5
96
+ assert SURFACE_MAX <= 2000
97
+
98
+
99
+ def test_temporal_parameters():
100
+ assert 0.9 < TEMPORAL_LAMBDA < 1.0
101
+ assert 0.0 < TRIM_FRACTION < 0.5
102
+
103
+
104
+ def test_dvf_url_format():
105
+ url = dvf_url(2024)
106
+ assert "2024" in url
107
+ assert url.endswith("full.csv.gz")
108
+ assert url.startswith("https://")
tests/test_top_cities.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for top 10 cities computation."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ import pytest
8
+
9
+ from src.top_cities import compute_top_cities, export_top_cities
10
+
11
+
12
+ def test_compute_top_cities_returns_known_cities(sample_clean_df):
13
+ result = compute_top_cities(sample_clean_df)
14
+ # sample_clean_df has code_commune_city 75056 and 69123
15
+ assert "Paris" in result
16
+ assert "Lyon" in result
17
+
18
+
19
+ def test_compute_top_cities_structure(sample_clean_df):
20
+ result = compute_top_cities(sample_clean_df)
21
+ for city_name, city_data in result.items():
22
+ assert "code" in city_data
23
+ assert "tous" in city_data
24
+ # Check stats structure
25
+ tous = city_data["tous"]
26
+ assert "median" in tous
27
+ assert "wtm" in tous
28
+ assert "volume" in tous
29
+
30
+
31
+ def test_compute_top_cities_has_property_types(sample_clean_df):
32
+ result = compute_top_cities(sample_clean_df)
33
+ paris = result["Paris"]
34
+ # sample_clean_df has only Appartement for Paris (75056)
35
+ assert "appartement" in paris
36
+ lyon = result["Lyon"]
37
+ # sample_clean_df has only Maison for Lyon (69123)
38
+ assert "maison" in lyon
39
+
40
+
41
+ def test_compute_top_cities_no_commercial(sample_clean_df):
42
+ """No 'local' key should exist since commercial was removed."""
43
+ result = compute_top_cities(sample_clean_df)
44
+ for city_data in result.values():
45
+ assert "local" not in city_data
46
+
47
+
48
+ def test_compute_top_cities_volumes(sample_clean_df):
49
+ result = compute_top_cities(sample_clean_df)
50
+ paris = result["Paris"]
51
+ # sample_clean_df has 4 rows with code_commune_city 75056
52
+ assert paris["tous"]["volume"] == 4
53
+
54
+
55
+ def test_compute_top_cities_empty_city():
56
+ """Cities with no data should be skipped."""
57
+ df = pl.DataFrame({
58
+ "id_mutation": ["M1"],
59
+ "code_commune_city": ["99999"], # Not a top 10 city
60
+ "type_local": ["Appartement"],
61
+ "prix_m2": [3000.0],
62
+ "temporal_weight": [1.0],
63
+ })
64
+ result = compute_top_cities(df)
65
+ assert len(result) == 0
66
+
67
+
68
+ def test_export_top_cities_creates_file(tmp_path):
69
+ data = {
70
+ "Paris": {"code": "75056", "tous": {"median": 10000.0}},
71
+ }
72
+ export_top_cities(data, tmp_path)
73
+ path = tmp_path / "top_cities.json"
74
+ assert path.exists()
75
+ with open(path) as f:
76
+ loaded = json.load(f)
77
+ assert loaded["Paris"]["code"] == "75056"