Spaces:
Sleeping
Sleeping
test: add unit tests for all pipeline modules
Browse files60 tests covering config, cleaner, aggregator, and top_cities.
- tests/__init__.py +0 -0
- tests/conftest.py +236 -0
- tests/test_aggregator.py +243 -0
- tests/test_cleaner.py +189 -0
- tests/test_config.py +108 -0
- tests/test_top_cities.py +77 -0
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared fixtures for DVF pipeline tests."""
|
| 2 |
+
|
| 3 |
+
import polars as pl
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@pytest.fixture
|
| 8 |
+
def sample_raw_df() -> pl.DataFrame:
|
| 9 |
+
"""
|
| 10 |
+
Small synthetic DVF dataset mimicking raw CSV structure.
|
| 11 |
+
Contains known edge cases: multi-row mutations, mixed types,
|
| 12 |
+
arrondissements, outliers, commercial properties.
|
| 13 |
+
"""
|
| 14 |
+
return pl.DataFrame(
|
| 15 |
+
{
|
| 16 |
+
"id_mutation": [
|
| 17 |
+
# Normal single-row mutations
|
| 18 |
+
"M001", "M002", "M003", "M004", "M005",
|
| 19 |
+
# Multi-row same type (should be aggregated)
|
| 20 |
+
"M006", "M006",
|
| 21 |
+
# Multi-row mixed type (should be excluded)
|
| 22 |
+
"M007", "M007",
|
| 23 |
+
# Outliers
|
| 24 |
+
"M008", "M009",
|
| 25 |
+
# Commercial (should be filtered after config change)
|
| 26 |
+
"M010",
|
| 27 |
+
# Paris arrondissement
|
| 28 |
+
"M011",
|
| 29 |
+
# Non-sale
|
| 30 |
+
"M012",
|
| 31 |
+
# Null price
|
| 32 |
+
"M013",
|
| 33 |
+
],
|
| 34 |
+
"date_mutation": [
|
| 35 |
+
"2024-06-15", "2023-01-20", "2022-07-10", "2021-03-05", "2020-11-22",
|
| 36 |
+
"2024-03-10", "2024-03-10",
|
| 37 |
+
"2023-06-01", "2023-06-01",
|
| 38 |
+
"2024-01-01", "2024-01-01",
|
| 39 |
+
"2024-05-01",
|
| 40 |
+
"2024-08-20",
|
| 41 |
+
"2023-09-15",
|
| 42 |
+
"2024-02-01",
|
| 43 |
+
],
|
| 44 |
+
"nature_mutation": [
|
| 45 |
+
"Vente", "Vente", "Vente", "Vente", "Vente",
|
| 46 |
+
"Vente", "Vente",
|
| 47 |
+
"Vente", "Vente",
|
| 48 |
+
"Vente", "Vente",
|
| 49 |
+
"Vente",
|
| 50 |
+
"Vente",
|
| 51 |
+
"Echange",
|
| 52 |
+
"Vente",
|
| 53 |
+
],
|
| 54 |
+
"valeur_fonciere": [
|
| 55 |
+
200000.0, 150000.0, 300000.0, 180000.0, 250000.0,
|
| 56 |
+
400000.0, 400000.0,
|
| 57 |
+
350000.0, 350000.0,
|
| 58 |
+
5000.0, 9000000.0,
|
| 59 |
+
120000.0,
|
| 60 |
+
500000.0,
|
| 61 |
+
100000.0,
|
| 62 |
+
None,
|
| 63 |
+
],
|
| 64 |
+
"code_postal": [
|
| 65 |
+
"75001", "69001", "13001", "31000", "33000",
|
| 66 |
+
"75002", "75002",
|
| 67 |
+
"06000", "06000",
|
| 68 |
+
"44000", "44000",
|
| 69 |
+
"34000",
|
| 70 |
+
"75101",
|
| 71 |
+
"59000",
|
| 72 |
+
"75003",
|
| 73 |
+
],
|
| 74 |
+
"code_commune": [
|
| 75 |
+
"75101", "69381", "13201", "31555", "33063",
|
| 76 |
+
"75102", "75102",
|
| 77 |
+
"06088", "06088",
|
| 78 |
+
"44109", "44109",
|
| 79 |
+
"34172",
|
| 80 |
+
"75101",
|
| 81 |
+
"59350",
|
| 82 |
+
"75103",
|
| 83 |
+
],
|
| 84 |
+
"nom_commune": [
|
| 85 |
+
"Paris 1er", "Lyon 1er", "Marseille 1er", "Toulouse", "Bordeaux",
|
| 86 |
+
"Paris 2e", "Paris 2e",
|
| 87 |
+
"Nice", "Nice",
|
| 88 |
+
"Nantes", "Nantes",
|
| 89 |
+
"Montpellier",
|
| 90 |
+
"Paris 1er",
|
| 91 |
+
"Lille",
|
| 92 |
+
"Paris 3e",
|
| 93 |
+
],
|
| 94 |
+
"code_departement": [
|
| 95 |
+
"75", "69", "13", "31", "33",
|
| 96 |
+
"75", "75",
|
| 97 |
+
"06", "06",
|
| 98 |
+
"44", "44",
|
| 99 |
+
"34",
|
| 100 |
+
"75",
|
| 101 |
+
"59",
|
| 102 |
+
"75",
|
| 103 |
+
],
|
| 104 |
+
"id_parcelle": [
|
| 105 |
+
"75101000A001", "69381000B002", "13201000C003", "31555000D004", "33063000E005",
|
| 106 |
+
"75102000F006", "75102000F007",
|
| 107 |
+
"06088000G008", "06088000G009",
|
| 108 |
+
"44109000H010", "44109000H011",
|
| 109 |
+
"34172000I012",
|
| 110 |
+
"75101000J013",
|
| 111 |
+
"59350000K014",
|
| 112 |
+
"75103000L015",
|
| 113 |
+
],
|
| 114 |
+
"code_type_local": [
|
| 115 |
+
"2", "2", "1", "1", "2",
|
| 116 |
+
"2", "2",
|
| 117 |
+
"2", "4",
|
| 118 |
+
"2", "2",
|
| 119 |
+
"4",
|
| 120 |
+
"2",
|
| 121 |
+
"1",
|
| 122 |
+
"2",
|
| 123 |
+
],
|
| 124 |
+
"type_local": [
|
| 125 |
+
"Appartement", "Appartement", "Maison", "Maison", "Appartement",
|
| 126 |
+
"Appartement", "Appartement",
|
| 127 |
+
"Appartement", "Dependance",
|
| 128 |
+
"Appartement", "Appartement",
|
| 129 |
+
"Local industriel. commercial ou assimilé",
|
| 130 |
+
"Appartement",
|
| 131 |
+
"Maison",
|
| 132 |
+
"Appartement",
|
| 133 |
+
],
|
| 134 |
+
"surface_reelle_bati": [
|
| 135 |
+
50.0, 75.0, 120.0, 90.0, 65.0,
|
| 136 |
+
80.0, 30.0,
|
| 137 |
+
60.0, 10.0,
|
| 138 |
+
3.0, 45.0,
|
| 139 |
+
200.0,
|
| 140 |
+
55.0,
|
| 141 |
+
100.0,
|
| 142 |
+
40.0,
|
| 143 |
+
],
|
| 144 |
+
"nombre_pieces_principales": [
|
| 145 |
+
2, 3, 5, 4, 3,
|
| 146 |
+
3, 1,
|
| 147 |
+
2, 0,
|
| 148 |
+
1, 2,
|
| 149 |
+
0,
|
| 150 |
+
2,
|
| 151 |
+
4,
|
| 152 |
+
2,
|
| 153 |
+
],
|
| 154 |
+
"nombre_lots": [
|
| 155 |
+
1, 1, 1, 1, 1,
|
| 156 |
+
2, 2,
|
| 157 |
+
2, 2,
|
| 158 |
+
1, 1,
|
| 159 |
+
1,
|
| 160 |
+
1,
|
| 161 |
+
1,
|
| 162 |
+
1,
|
| 163 |
+
],
|
| 164 |
+
"longitude": [
|
| 165 |
+
2.34, 4.83, 5.37, 1.44, -0.57,
|
| 166 |
+
2.34, 2.34,
|
| 167 |
+
7.26, 7.26,
|
| 168 |
+
-1.55, -1.55,
|
| 169 |
+
3.87,
|
| 170 |
+
2.34,
|
| 171 |
+
3.06,
|
| 172 |
+
2.36,
|
| 173 |
+
],
|
| 174 |
+
"latitude": [
|
| 175 |
+
48.86, 45.76, 43.30, 43.60, 44.84,
|
| 176 |
+
48.87, 48.87,
|
| 177 |
+
43.71, 43.71,
|
| 178 |
+
47.22, 47.22,
|
| 179 |
+
43.61,
|
| 180 |
+
48.86,
|
| 181 |
+
50.63,
|
| 182 |
+
48.86,
|
| 183 |
+
],
|
| 184 |
+
}
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
@pytest.fixture
|
| 189 |
+
def sample_clean_df() -> pl.DataFrame:
|
| 190 |
+
"""
|
| 191 |
+
Pre-cleaned dataset with derived columns, ready for aggregation.
|
| 192 |
+
Represents what comes out of the cleaner.
|
| 193 |
+
"""
|
| 194 |
+
return pl.DataFrame(
|
| 195 |
+
{
|
| 196 |
+
"id_mutation": ["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8"],
|
| 197 |
+
"date_mutation": [
|
| 198 |
+
"2024-06-15", "2024-01-10", "2023-06-01",
|
| 199 |
+
"2022-01-15", "2021-06-01", "2020-01-10",
|
| 200 |
+
"2024-09-01", "2024-03-15",
|
| 201 |
+
],
|
| 202 |
+
"type_local": [
|
| 203 |
+
"Appartement", "Maison", "Appartement",
|
| 204 |
+
"Maison", "Appartement", "Maison",
|
| 205 |
+
"Appartement", "Maison",
|
| 206 |
+
],
|
| 207 |
+
"valeur_fonciere": [
|
| 208 |
+
250000.0, 180000.0, 200000.0,
|
| 209 |
+
150000.0, 300000.0, 120000.0,
|
| 210 |
+
400000.0, 220000.0,
|
| 211 |
+
],
|
| 212 |
+
"surface_reelle_bati": [
|
| 213 |
+
50.0, 90.0, 80.0,
|
| 214 |
+
100.0, 60.0, 80.0,
|
| 215 |
+
40.0, 110.0,
|
| 216 |
+
],
|
| 217 |
+
"prix_m2": [
|
| 218 |
+
5000.0, 2000.0, 2500.0,
|
| 219 |
+
1500.0, 5000.0, 1500.0,
|
| 220 |
+
10000.0, 2000.0,
|
| 221 |
+
],
|
| 222 |
+
"temporal_weight": [
|
| 223 |
+
0.97**2, 0.97**7, 0.97**13,
|
| 224 |
+
0.97**25, 0.97**37, 0.97**49,
|
| 225 |
+
0.97**(-2), 0.97**4,
|
| 226 |
+
],
|
| 227 |
+
"code_departement": ["75", "69", "75", "69", "75", "69", "75", "69"],
|
| 228 |
+
"code_commune": ["75056", "69123", "75056", "69123", "75056", "69123", "75056", "69123"],
|
| 229 |
+
"code_commune_city": ["75056", "69123", "75056", "69123", "75056", "69123", "75056", "69123"],
|
| 230 |
+
"code_postal": ["75001", "69001", "75002", "69002", "75003", "69003", "75001", "69002"],
|
| 231 |
+
"code_section": ["7510100001", "6938100001", "7510200001", "6912300001", "7510300001", "6912300002", "7510100002", "6912300003"],
|
| 232 |
+
"code_region": ["11", "84", "11", "84", "11", "84", "11", "84"],
|
| 233 |
+
"year": ["2024", "2024", "2023", "2022", "2021", "2020", "2024", "2024"],
|
| 234 |
+
"months_since": [2.0, 7.0, 13.0, 25.0, 37.0, 49.0, -2.0, 4.0],
|
| 235 |
+
}
|
| 236 |
+
)
|
tests/test_aggregator.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for aggregation functions."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import math
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import polars as pl
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from src.aggregator import (
|
| 12 |
+
LEVEL_TO_COLUMN,
|
| 13 |
+
_aggregate_group,
|
| 14 |
+
_export_sections,
|
| 15 |
+
aggregate_all_levels,
|
| 16 |
+
aggregate_all_types,
|
| 17 |
+
aggregate_level,
|
| 18 |
+
effective_sample_size,
|
| 19 |
+
export_json,
|
| 20 |
+
weighted_trimmed_mean,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---- weighted_trimmed_mean ----
|
| 25 |
+
|
| 26 |
+
def test_wtm_uniform_weights():
|
| 27 |
+
"""With equal weights and 0 trim, WTM == simple mean."""
|
| 28 |
+
prices = np.array([100.0, 200.0, 300.0, 400.0, 500.0])
|
| 29 |
+
weights = np.ones(5)
|
| 30 |
+
result = weighted_trimmed_mean(prices, weights, trim=0.0)
|
| 31 |
+
assert result == pytest.approx(300.0)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_wtm_trims_extremes():
|
| 35 |
+
"""With 20% trim, bottom and top tails should be clipped."""
|
| 36 |
+
prices = np.array([1.0, 100.0, 200.0, 300.0, 10000.0])
|
| 37 |
+
weights = np.ones(5)
|
| 38 |
+
result_trimmed = weighted_trimmed_mean(prices, weights, trim=0.2)
|
| 39 |
+
result_full = weighted_trimmed_mean(prices, weights, trim=0.0)
|
| 40 |
+
# Trimmed mean should be less affected by outliers
|
| 41 |
+
assert result_trimmed < result_full
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_wtm_respects_weight_ordering():
|
| 45 |
+
"""Higher-weighted observations should pull the mean toward them."""
|
| 46 |
+
prices = np.array([100.0, 200.0])
|
| 47 |
+
# Heavily weight the 200
|
| 48 |
+
weights_high = np.array([0.1, 10.0])
|
| 49 |
+
weights_low = np.array([10.0, 0.1])
|
| 50 |
+
result_high = weighted_trimmed_mean(prices, weights_high, trim=0.0)
|
| 51 |
+
result_low = weighted_trimmed_mean(prices, weights_low, trim=0.0)
|
| 52 |
+
assert result_high > result_low
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_wtm_empty_array():
|
| 56 |
+
result = weighted_trimmed_mean(np.array([]), np.array([]), trim=0.2)
|
| 57 |
+
assert math.isnan(result)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_wtm_single_element():
|
| 61 |
+
result = weighted_trimmed_mean(np.array([5000.0]), np.array([1.0]), trim=0.2)
|
| 62 |
+
assert result == pytest.approx(5000.0)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_wtm_zero_weights():
|
| 66 |
+
result = weighted_trimmed_mean(np.array([100.0, 200.0]), np.array([0.0, 0.0]))
|
| 67 |
+
assert math.isnan(result)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ---- effective_sample_size ----
|
| 71 |
+
|
| 72 |
+
def test_ess_equal_weights():
|
| 73 |
+
"""With all equal weights, n_eff == n."""
|
| 74 |
+
weights = np.ones(50)
|
| 75 |
+
assert effective_sample_size(weights) == pytest.approx(50.0)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_ess_unequal_weights():
|
| 79 |
+
"""Unequal weights should give n_eff < n."""
|
| 80 |
+
weights = np.array([1.0, 1.0, 1.0, 0.01])
|
| 81 |
+
n_eff = effective_sample_size(weights)
|
| 82 |
+
assert n_eff < 4.0
|
| 83 |
+
assert n_eff > 1.0
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def test_ess_single_dominant():
|
| 87 |
+
"""When one weight dominates, n_eff -> 1."""
|
| 88 |
+
weights = np.array([1000.0, 0.001, 0.001, 0.001])
|
| 89 |
+
n_eff = effective_sample_size(weights)
|
| 90 |
+
assert n_eff < 2.0
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def test_ess_empty():
|
| 94 |
+
assert effective_sample_size(np.array([])) == 0.0
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ---- _aggregate_group ----
|
| 98 |
+
|
| 99 |
+
def test_aggregate_group_basic():
|
| 100 |
+
df = pl.DataFrame({
|
| 101 |
+
"prix_m2": [2000.0, 3000.0, 4000.0, 2500.0, 3500.0],
|
| 102 |
+
"temporal_weight": [1.0, 1.0, 1.0, 1.0, 1.0],
|
| 103 |
+
})
|
| 104 |
+
stats = _aggregate_group(df)
|
| 105 |
+
assert stats["volume"] == 5
|
| 106 |
+
assert stats["median"] == pytest.approx(3000.0, rel=0.01)
|
| 107 |
+
assert stats["q1"] < stats["median"]
|
| 108 |
+
assert stats["q3"] > stats["median"]
|
| 109 |
+
assert stats["n_eff"] == pytest.approx(5.0, abs=0.1)
|
| 110 |
+
assert 0.0 <= stats["confidence"] <= 1.0
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def test_aggregate_group_with_temporal_decay():
|
| 114 |
+
"""Older transactions should have lower weights, pulling WTM toward recent."""
|
| 115 |
+
df = pl.DataFrame({
|
| 116 |
+
"prix_m2": [2000.0, 2000.0, 2000.0, 5000.0, 5000.0],
|
| 117 |
+
"temporal_weight": [0.5, 0.5, 0.5, 1.0, 1.0],
|
| 118 |
+
})
|
| 119 |
+
stats = _aggregate_group(df)
|
| 120 |
+
# WTM should be pulled toward 5000 (higher weight)
|
| 121 |
+
assert stats["wtm"] > stats["median"]
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def test_aggregate_group_empty():
|
| 125 |
+
df = pl.DataFrame({
|
| 126 |
+
"prix_m2": pl.Series([], dtype=pl.Float64),
|
| 127 |
+
"temporal_weight": pl.Series([], dtype=pl.Float64),
|
| 128 |
+
})
|
| 129 |
+
stats = _aggregate_group(df)
|
| 130 |
+
assert stats["volume"] == 0
|
| 131 |
+
assert stats["confidence"] == 0.0
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def test_aggregate_group_returns_all_keys():
|
| 135 |
+
df = pl.DataFrame({
|
| 136 |
+
"prix_m2": [3000.0],
|
| 137 |
+
"temporal_weight": [1.0],
|
| 138 |
+
})
|
| 139 |
+
stats = _aggregate_group(df)
|
| 140 |
+
expected_keys = {"median", "wtm", "q1", "q3", "volume", "n_eff", "confidence"}
|
| 141 |
+
assert set(stats.keys()) == expected_keys
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ---- aggregate_level ----
|
| 145 |
+
|
| 146 |
+
def test_aggregate_level_groups_correctly(sample_clean_df):
|
| 147 |
+
result = aggregate_level(sample_clean_df, "code_departement")
|
| 148 |
+
assert "75" in result
|
| 149 |
+
assert "69" in result
|
| 150 |
+
assert result["75"]["volume"] == 4 # 4 entries with dept 75
|
| 151 |
+
assert result["69"]["volume"] == 4
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def test_aggregate_level_with_property_type(sample_clean_df):
|
| 155 |
+
result = aggregate_level(
|
| 156 |
+
sample_clean_df, "code_departement", property_type="Appartement"
|
| 157 |
+
)
|
| 158 |
+
assert "75" in result
|
| 159 |
+
# Only Appartement rows for dept 75
|
| 160 |
+
expected = sample_clean_df.filter(
|
| 161 |
+
(pl.col("code_departement") == "75")
|
| 162 |
+
& (pl.col("type_local") == "Appartement")
|
| 163 |
+
)
|
| 164 |
+
assert result["75"]["volume"] == len(expected)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def test_aggregate_level_country(sample_clean_df):
|
| 168 |
+
result = aggregate_level(sample_clean_df, "_country")
|
| 169 |
+
assert "FR" in result
|
| 170 |
+
assert result["FR"]["volume"] == 8
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# ---- aggregate_all_types ----
|
| 174 |
+
|
| 175 |
+
def test_aggregate_all_types_keys(sample_clean_df):
|
| 176 |
+
result = aggregate_all_types(sample_clean_df, "code_departement")
|
| 177 |
+
for code in result:
|
| 178 |
+
assert "tous" in result[code]
|
| 179 |
+
# Dept 75 has only Appartement in sample, dept 69 has only Maison
|
| 180 |
+
assert "appartement" in result["75"]
|
| 181 |
+
assert "maison" in result["69"]
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ---- aggregate_all_levels ----
|
| 185 |
+
|
| 186 |
+
def test_aggregate_all_levels_keys(sample_clean_df):
|
| 187 |
+
result = aggregate_all_levels(sample_clean_df)
|
| 188 |
+
for level in ["country", "region", "department", "commune", "postcode", "section"]:
|
| 189 |
+
assert level in result
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# ---- _export_sections ----
|
| 193 |
+
|
| 194 |
+
def test_export_sections_splits_by_dept(tmp_path):
|
| 195 |
+
section_data = {
|
| 196 |
+
"7510100001": {"tous": {"median": 5000}},
|
| 197 |
+
"7510100002": {"tous": {"median": 5100}},
|
| 198 |
+
"6938100001": {"tous": {"median": 3000}},
|
| 199 |
+
"2A004000B0": {"tous": {"median": 2000}},
|
| 200 |
+
}
|
| 201 |
+
_export_sections(section_data, tmp_path)
|
| 202 |
+
sections_dir = tmp_path / "sections"
|
| 203 |
+
assert (sections_dir / "75.json").exists()
|
| 204 |
+
assert (sections_dir / "69.json").exists()
|
| 205 |
+
assert (sections_dir / "2A.json").exists()
|
| 206 |
+
|
| 207 |
+
with open(sections_dir / "75.json") as f:
|
| 208 |
+
data_75 = json.load(f)
|
| 209 |
+
assert len(data_75) == 2
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def test_export_sections_dom_tom(tmp_path):
|
| 213 |
+
"""DOM-TOM departments (971-976) use 3-digit dept codes."""
|
| 214 |
+
section_data = {
|
| 215 |
+
"97105000001": {"tous": {"median": 2500}},
|
| 216 |
+
"97205000001": {"tous": {"median": 2600}},
|
| 217 |
+
}
|
| 218 |
+
_export_sections(section_data, tmp_path)
|
| 219 |
+
sections_dir = tmp_path / "sections"
|
| 220 |
+
assert (sections_dir / "971.json").exists()
|
| 221 |
+
assert (sections_dir / "972.json").exists()
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# ---- export_json ----
|
| 225 |
+
|
| 226 |
+
def test_export_json_creates_files(tmp_path, sample_clean_df):
|
| 227 |
+
aggregated = aggregate_all_levels(sample_clean_df)
|
| 228 |
+
export_json(aggregated, tmp_path)
|
| 229 |
+
assert (tmp_path / "prices_country.json").exists()
|
| 230 |
+
assert (tmp_path / "prices_region.json").exists()
|
| 231 |
+
assert (tmp_path / "prices_department.json").exists()
|
| 232 |
+
assert (tmp_path / "prices_commune.json").exists()
|
| 233 |
+
assert (tmp_path / "prices_postcode.json").exists()
|
| 234 |
+
# Section is split into per-dept files
|
| 235 |
+
assert (tmp_path / "sections").is_dir()
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
# ---- LEVEL_TO_COLUMN mapping ----
|
| 239 |
+
|
| 240 |
+
def test_level_to_column_covers_all_levels():
|
| 241 |
+
from src.config import AGGREGATION_LEVELS
|
| 242 |
+
for level in AGGREGATION_LEVELS:
|
| 243 |
+
assert level in LEVEL_TO_COLUMN
|
tests/test_cleaner.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for data cleaning functions."""
|
| 2 |
+
|
| 3 |
+
import polars as pl
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from src.cleaner import (
|
| 7 |
+
add_derived_columns,
|
| 8 |
+
deduplicate_mutations,
|
| 9 |
+
filter_sales,
|
| 10 |
+
normalize_commune_codes,
|
| 11 |
+
remove_outliers,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_filter_sales_keeps_only_vente(sample_raw_df):
|
| 16 |
+
lf = sample_raw_df.lazy()
|
| 17 |
+
result = filter_sales(lf).collect()
|
| 18 |
+
assert (result["nature_mutation"] == "Vente").all()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_filter_sales_removes_null_price(sample_raw_df):
|
| 22 |
+
lf = sample_raw_df.lazy()
|
| 23 |
+
result = filter_sales(lf).collect()
|
| 24 |
+
assert result["valeur_fonciere"].null_count() == 0
|
| 25 |
+
assert (result["valeur_fonciere"] > 0).all()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_filter_sales_removes_null_surface(sample_raw_df):
|
| 29 |
+
lf = sample_raw_df.lazy()
|
| 30 |
+
result = filter_sales(lf).collect()
|
| 31 |
+
assert result["surface_reelle_bati"].null_count() == 0
|
| 32 |
+
assert (result["surface_reelle_bati"] > 0).all()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_filter_sales_keeps_only_residential(sample_raw_df):
|
| 36 |
+
lf = sample_raw_df.lazy()
|
| 37 |
+
result = filter_sales(lf).collect()
|
| 38 |
+
types = result["type_local"].unique().to_list()
|
| 39 |
+
assert "Local industriel. commercial ou assimilé" not in types
|
| 40 |
+
assert "Dependance" not in types
|
| 41 |
+
for t in types:
|
| 42 |
+
assert t in ("Appartement", "Maison")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_deduplicate_single_row_mutations(sample_raw_df):
|
| 46 |
+
lf = sample_raw_df.lazy()
|
| 47 |
+
filtered = filter_sales(lf)
|
| 48 |
+
result = deduplicate_mutations(filtered).collect()
|
| 49 |
+
# Each id_mutation should appear exactly once
|
| 50 |
+
counts = result.group_by("id_mutation").len()
|
| 51 |
+
assert (counts["len"] == 1).all()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def test_deduplicate_multi_row_same_type():
|
| 55 |
+
"""M006 has 2 rows both Appartement - should be kept with summed surface."""
|
| 56 |
+
df = pl.DataFrame({
|
| 57 |
+
"id_mutation": ["M006", "M006"],
|
| 58 |
+
"date_mutation": ["2024-03-10", "2024-03-10"],
|
| 59 |
+
"nature_mutation": ["Vente", "Vente"],
|
| 60 |
+
"valeur_fonciere": [400000.0, 400000.0],
|
| 61 |
+
"code_postal": ["75002", "75002"],
|
| 62 |
+
"code_commune": ["75102", "75102"],
|
| 63 |
+
"nom_commune": ["Paris 2e", "Paris 2e"],
|
| 64 |
+
"code_departement": ["75", "75"],
|
| 65 |
+
"id_parcelle": ["75102000F006", "75102000F007"],
|
| 66 |
+
"code_type_local": ["2", "2"],
|
| 67 |
+
"type_local": ["Appartement", "Appartement"],
|
| 68 |
+
"surface_reelle_bati": [80.0, 30.0],
|
| 69 |
+
"nombre_pieces_principales": [3, 1],
|
| 70 |
+
"nombre_lots": [2, 2],
|
| 71 |
+
"longitude": [2.34, 2.34],
|
| 72 |
+
"latitude": [48.87, 48.87],
|
| 73 |
+
})
|
| 74 |
+
result = deduplicate_mutations(df.lazy()).collect()
|
| 75 |
+
assert len(result) == 1
|
| 76 |
+
assert result["surface_reelle_bati"][0] == 110.0 # 80 + 30
|
| 77 |
+
assert result["valeur_fonciere"][0] == 400000.0
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def test_deduplicate_mixed_type_excluded():
|
| 81 |
+
"""M007 has Appartement + Dependance - should be excluded."""
|
| 82 |
+
df = pl.DataFrame({
|
| 83 |
+
"id_mutation": ["M007", "M007"],
|
| 84 |
+
"date_mutation": ["2023-06-01", "2023-06-01"],
|
| 85 |
+
"nature_mutation": ["Vente", "Vente"],
|
| 86 |
+
"valeur_fonciere": [350000.0, 350000.0],
|
| 87 |
+
"code_postal": ["06000", "06000"],
|
| 88 |
+
"code_commune": ["06088", "06088"],
|
| 89 |
+
"nom_commune": ["Nice", "Nice"],
|
| 90 |
+
"code_departement": ["06", "06"],
|
| 91 |
+
"id_parcelle": ["06088000G008", "06088000G009"],
|
| 92 |
+
"code_type_local": ["2", "4"],
|
| 93 |
+
"type_local": ["Appartement", "Dependance"],
|
| 94 |
+
"surface_reelle_bati": [60.0, 10.0],
|
| 95 |
+
"nombre_pieces_principales": [2, 0],
|
| 96 |
+
"nombre_lots": [2, 2],
|
| 97 |
+
"longitude": [7.26, 7.26],
|
| 98 |
+
"latitude": [43.71, 43.71],
|
| 99 |
+
})
|
| 100 |
+
result = deduplicate_mutations(df.lazy()).collect()
|
| 101 |
+
assert len(result) == 0
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test_add_derived_columns_prix_m2():
|
| 105 |
+
df = pl.DataFrame({
|
| 106 |
+
"valeur_fonciere": [200000.0],
|
| 107 |
+
"surface_reelle_bati": [100.0],
|
| 108 |
+
"id_parcelle": ["75101000A001"],
|
| 109 |
+
"code_departement": ["75"],
|
| 110 |
+
"date_mutation": ["2024-06-15"],
|
| 111 |
+
})
|
| 112 |
+
result = add_derived_columns(df.lazy()).collect()
|
| 113 |
+
assert result["prix_m2"][0] == pytest.approx(2000.0)
|
| 114 |
+
assert result["code_section"][0] == "75101000A0"
|
| 115 |
+
assert result["year"][0] == "2024"
|
| 116 |
+
assert result["code_region"][0] == "11" # Île-de-France
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def test_add_derived_columns_temporal_weight():
|
| 120 |
+
df = pl.DataFrame({
|
| 121 |
+
"valeur_fonciere": [200000.0],
|
| 122 |
+
"surface_reelle_bati": [100.0],
|
| 123 |
+
"id_parcelle": ["75101000A001"],
|
| 124 |
+
"code_departement": ["75"],
|
| 125 |
+
"date_mutation": ["2024-01-01"],
|
| 126 |
+
})
|
| 127 |
+
result = add_derived_columns(df.lazy()).collect()
|
| 128 |
+
# ~12 months before reference date (2025-01-01)
|
| 129 |
+
assert result["months_since"][0] == pytest.approx(12.0, abs=0.5)
|
| 130 |
+
assert 0 < result["temporal_weight"][0] < 1
|
| 131 |
+
expected_weight = 0.97 ** 12
|
| 132 |
+
assert result["temporal_weight"][0] == pytest.approx(expected_weight, abs=0.05)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def test_add_derived_columns_corsica():
|
| 136 |
+
df = pl.DataFrame({
|
| 137 |
+
"valeur_fonciere": [200000.0],
|
| 138 |
+
"surface_reelle_bati": [100.0],
|
| 139 |
+
"id_parcelle": ["2A004000B001"],
|
| 140 |
+
"code_departement": ["2A"],
|
| 141 |
+
"date_mutation": ["2024-06-15"],
|
| 142 |
+
})
|
| 143 |
+
result = add_derived_columns(df.lazy()).collect()
|
| 144 |
+
assert result["code_region"][0] == "94" # Corse
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test_normalize_commune_paris():
|
| 148 |
+
df = pl.DataFrame({"code_commune": ["75101", "75115", "75120"]})
|
| 149 |
+
result = normalize_commune_codes(df.lazy()).collect()
|
| 150 |
+
assert (result["code_commune_city"] == "75056").all()
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def test_normalize_commune_lyon():
|
| 154 |
+
df = pl.DataFrame({"code_commune": ["69381", "69389"]})
|
| 155 |
+
result = normalize_commune_codes(df.lazy()).collect()
|
| 156 |
+
assert (result["code_commune_city"] == "69123").all()
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def test_normalize_commune_marseille():
|
| 160 |
+
df = pl.DataFrame({"code_commune": ["13201", "13216"]})
|
| 161 |
+
result = normalize_commune_codes(df.lazy()).collect()
|
| 162 |
+
assert (result["code_commune_city"] == "13055").all()
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def test_normalize_commune_regular_unchanged():
|
| 166 |
+
df = pl.DataFrame({"code_commune": ["33063", "31555"]})
|
| 167 |
+
result = normalize_commune_codes(df.lazy()).collect()
|
| 168 |
+
assert result["code_commune_city"][0] == "33063"
|
| 169 |
+
assert result["code_commune_city"][1] == "31555"
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def test_remove_outliers_surface():
|
| 173 |
+
df = pl.DataFrame({
|
| 174 |
+
"surface_reelle_bati": [5.0, 50.0, 1500.0],
|
| 175 |
+
"prix_m2": [2000.0, 2000.0, 2000.0],
|
| 176 |
+
})
|
| 177 |
+
result = remove_outliers(df.lazy()).collect()
|
| 178 |
+
assert len(result) == 1
|
| 179 |
+
assert result["surface_reelle_bati"][0] == 50.0
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def test_remove_outliers_price():
|
| 183 |
+
df = pl.DataFrame({
|
| 184 |
+
"surface_reelle_bati": [50.0, 50.0, 50.0],
|
| 185 |
+
"prix_m2": [50.0, 2000.0, 30000.0],
|
| 186 |
+
})
|
| 187 |
+
result = remove_outliers(df.lazy()).collect()
|
| 188 |
+
assert len(result) == 1
|
| 189 |
+
assert result["prix_m2"][0] == 2000.0
|
tests/test_config.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for configuration constants and mappings."""
|
| 2 |
+
|
| 3 |
+
from src.config import (
|
| 4 |
+
AGGREGATION_LEVELS,
|
| 5 |
+
ARRONDISSEMENT_MAPPING,
|
| 6 |
+
DEPT_TO_REGION,
|
| 7 |
+
DVF_YEARS,
|
| 8 |
+
NO_DVF_DEPARTMENTS,
|
| 9 |
+
PRICE_M2_MAX,
|
| 10 |
+
PRICE_M2_MIN,
|
| 11 |
+
REGION_NAMES,
|
| 12 |
+
SURFACE_MAX,
|
| 13 |
+
SURFACE_MIN,
|
| 14 |
+
TEMPORAL_LAMBDA,
|
| 15 |
+
TOP_10_CITIES,
|
| 16 |
+
TRIM_FRACTION,
|
| 17 |
+
VALID_TYPE_LOCAL,
|
| 18 |
+
dvf_url,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_dvf_years_covers_full_range():
|
| 23 |
+
assert DVF_YEARS == list(range(2014, 2026))
|
| 24 |
+
assert len(DVF_YEARS) == 12
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_valid_type_local_residential_only():
|
| 28 |
+
assert "Appartement" in VALID_TYPE_LOCAL
|
| 29 |
+
assert "Maison" in VALID_TYPE_LOCAL
|
| 30 |
+
assert len(VALID_TYPE_LOCAL) == 2
|
| 31 |
+
# Commercial should NOT be included
|
| 32 |
+
assert "Local industriel. commercial ou assimilé" not in VALID_TYPE_LOCAL
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_dept_to_region_coverage():
|
| 36 |
+
# 101 departments total in France (including DOM-TOM)
|
| 37 |
+
assert len(DEPT_TO_REGION) >= 100
|
| 38 |
+
# Spot checks
|
| 39 |
+
assert DEPT_TO_REGION["75"] == "11" # Paris -> Île-de-France
|
| 40 |
+
assert DEPT_TO_REGION["69"] == "84" # Rhône -> Auvergne-Rhône-Alpes
|
| 41 |
+
assert DEPT_TO_REGION["2A"] == "94" # Corse-du-Sud -> Corse
|
| 42 |
+
assert DEPT_TO_REGION["2B"] == "94" # Haute-Corse -> Corse
|
| 43 |
+
assert DEPT_TO_REGION["971"] == "01" # Guadeloupe
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_region_names():
|
| 47 |
+
assert len(REGION_NAMES) == 18 # 13 metropolitan + 5 overseas
|
| 48 |
+
assert REGION_NAMES["11"] == "Île-de-France"
|
| 49 |
+
assert REGION_NAMES["84"] == "Auvergne-Rhône-Alpes"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_arrondissement_mapping_paris():
    """All 20 Paris arrondissement codes collapse onto the city code 75056."""
    arrondissements = [f"751{n:02d}" for n in range(1, 21)]
    assert all(ARRONDISSEMENT_MAPPING[code] == "75056" for code in arrondissements)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_arrondissement_mapping_lyon():
    """All 9 Lyon arrondissement codes collapse onto the city code 69123."""
    for n in range(1, 10):
        assert ARRONDISSEMENT_MAPPING[f"6938{n}"] == "69123"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_arrondissement_mapping_marseille():
    """All 16 Marseille arrondissement codes collapse onto the city code 13055."""
    arrondissements = (f"132{n:02d}" for n in range(1, 17))
    for code in arrondissements:
        assert ARRONDISSEMENT_MAPPING[code] == "13055"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_top_10_cities():
    """The top-10 mapping holds exactly ten INSEE-code -> city-name entries."""
    assert len(TOP_10_CITIES) == 10
    expected = {"75056": "Paris", "13055": "Marseille", "69123": "Lyon"}
    for insee_code, city_name in expected.items():
        assert TOP_10_CITIES[insee_code] == city_name
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_no_dvf_departments():
    """Departments with no DVF publication are exactly 57, 67, 68 and 976."""
    expected = {"57", "67", "68", "976"}
    assert NO_DVF_DEPARTMENTS == expected
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_aggregation_levels():
    """Aggregation levels run from coarsest (country) to finest (section)."""
    expected = [
        "country",
        "region",
        "department",
        "commune",
        "postcode",
        "section",
    ]
    assert AGGREGATION_LEVELS == expected
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def test_price_bounds_sensible():
    """Price-per-m² outlier bounds are ordered and within a plausible range."""
    assert 100 <= PRICE_M2_MIN < PRICE_M2_MAX <= 50000
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def test_surface_bounds_sensible():
    """Surface outlier bounds are ordered and within a plausible range."""
    assert 5 <= SURFACE_MIN < SURFACE_MAX <= 2000
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def test_temporal_parameters():
    """Temporal weighting knobs stay inside their meaningful open intervals."""
    bounds = (
        (TEMPORAL_LAMBDA, 0.9, 1.0),  # decay: just under 1 so recent years dominate
        (TRIM_FRACTION, 0.0, 0.5),    # trimmed mean: strictly less than half per tail
    )
    for value, lower, upper in bounds:
        assert lower < value < upper
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test_dvf_url_format():
    """dvf_url builds an https URL pointing at the gzipped full-year CSV."""
    url = dvf_url(2024)
    assert url.startswith("https://")
    assert url.endswith("full.csv.gz")
    assert "2024" in url
|
tests/test_top_cities.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for top 10 cities computation."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import polars as pl
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from src.top_cities import compute_top_cities, export_top_cities
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_compute_top_cities_returns_known_cities(sample_clean_df):
    """Cities present in the fixture (Paris 75056, Lyon 69123) appear in the result."""
    cities = compute_top_cities(sample_clean_df)
    for name in ("Paris", "Lyon"):
        assert name in cities
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def test_compute_top_cities_structure(sample_clean_df):
    """Each city entry carries its INSEE code plus an all-types stats bucket."""
    result = compute_top_cities(sample_clean_df)
    for entry in result.values():
        assert "code" in entry
        assert "tous" in entry
        # The "tous" bucket must expose the three core statistics.
        stats = entry["tous"]
        for key in ("median", "wtm", "volume"):
            assert key in stats
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_compute_top_cities_has_property_types(sample_clean_df):
    """Per-type buckets exist for the property types present in the fixture."""
    result = compute_top_cities(sample_clean_df)
    # Fixture has only Appartement rows for Paris (75056)
    # and only Maison rows for Lyon (69123).
    assert "appartement" in result["Paris"]
    assert "maison" in result["Lyon"]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_compute_top_cities_no_commercial(sample_clean_df):
    """No 'local' key should exist since commercial was removed."""
    result = compute_top_cities(sample_clean_df)
    assert all("local" not in entry for entry in result.values())
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def test_compute_top_cities_volumes(sample_clean_df):
    """Volume counts every fixture row belonging to the city."""
    result = compute_top_cities(sample_clean_df)
    # The fixture holds 4 rows with code_commune_city 75056 (Paris).
    assert result["Paris"]["tous"]["volume"] == 4
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_compute_top_cities_empty_city():
    """Cities with no data should be skipped."""
    rows = {
        "id_mutation": ["M1"],
        "code_commune_city": ["99999"],  # not among the top-10 INSEE codes
        "type_local": ["Appartement"],
        "prix_m2": [3000.0],
        "temporal_weight": [1.0],
    }
    result = compute_top_cities(pl.DataFrame(rows))
    assert not result
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def test_export_top_cities_creates_file(tmp_path):
    """export_top_cities writes top_cities.json under the output directory."""
    payload = {
        "Paris": {"code": "75056", "tous": {"median": 10000.0}},
    }
    export_top_cities(payload, tmp_path)
    out_file = tmp_path / "top_cities.json"
    assert out_file.exists()
    loaded = json.loads(out_file.read_text(encoding="utf-8"))
    assert loaded["Paris"]["code"] == "75056"
|