dcrey7 committed on
Commit
e751d0d
·
1 Parent(s): 2b0cef4

test: add unit tests for all pipeline modules

Browse files

60 tests covering config, cleaner, aggregator, and top_cities.

tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared fixtures for DVF pipeline tests."""
2
+
3
+ import polars as pl
4
+ import pytest
5
+
6
+
7
@pytest.fixture
def sample_raw_df() -> pl.DataFrame:
    """
    Tiny synthetic DVF extract shaped like the raw CSV.

    Deliberately packs in the edge cases the cleaner tests exercise:
    multi-row mutations (same and mixed type), a Paris arrondissement,
    price/surface outliers, a commercial property, a non-sale mutation,
    and a null price.
    """
    return pl.DataFrame(
        {
            "id_mutation": [
                # Plain single-row mutations
                "M001", "M002", "M003", "M004", "M005",
                # Multi-row, same type_local (expected to be aggregated)
                "M006", "M006",
                # Multi-row, mixed type_local (expected to be excluded)
                "M007", "M007",
                # Outliers (tiny surface / huge price)
                "M008", "M009",
                # Commercial property (filtered by the type rules)
                "M010",
                # Paris arrondissement commune code
                "M011",
                # Non-sale mutation (Echange)
                "M012",
                # Null valeur_fonciere
                "M013",
            ],
            "date_mutation": [
                "2024-06-15", "2023-01-20", "2022-07-10", "2021-03-05", "2020-11-22",
                "2024-03-10", "2024-03-10",
                "2023-06-01", "2023-06-01",
                "2024-01-01", "2024-01-01",
                "2024-05-01",
                "2024-08-20",
                "2023-09-15",
                "2024-02-01",
            ],
            "nature_mutation": [
                "Vente", "Vente", "Vente", "Vente", "Vente",
                "Vente", "Vente",
                "Vente", "Vente",
                "Vente", "Vente",
                "Vente",
                "Vente",
                "Echange",
                "Vente",
            ],
            "valeur_fonciere": [
                200000.0, 150000.0, 300000.0, 180000.0, 250000.0,
                400000.0, 400000.0,
                350000.0, 350000.0,
                5000.0, 9000000.0,
                120000.0,
                500000.0,
                100000.0,
                None,
            ],
            "code_postal": [
                "75001", "69001", "13001", "31000", "33000",
                "75002", "75002",
                "06000", "06000",
                "44000", "44000",
                "34000",
                "75101",
                "59000",
                "75003",
            ],
            "code_commune": [
                "75101", "69381", "13201", "31555", "33063",
                "75102", "75102",
                "06088", "06088",
                "44109", "44109",
                "34172",
                "75101",
                "59350",
                "75103",
            ],
            "nom_commune": [
                "Paris 1er", "Lyon 1er", "Marseille 1er", "Toulouse", "Bordeaux",
                "Paris 2e", "Paris 2e",
                "Nice", "Nice",
                "Nantes", "Nantes",
                "Montpellier",
                "Paris 1er",
                "Lille",
                "Paris 3e",
            ],
            "code_departement": [
                "75", "69", "13", "31", "33",
                "75", "75",
                "06", "06",
                "44", "44",
                "34",
                "75",
                "59",
                "75",
            ],
            "id_parcelle": [
                "75101000A001", "69381000B002", "13201000C003", "31555000D004", "33063000E005",
                "75102000F006", "75102000F007",
                "06088000G008", "06088000G009",
                "44109000H010", "44109000H011",
                "34172000I012",
                "75101000J013",
                "59350000K014",
                "75103000L015",
            ],
            "code_type_local": [
                "2", "2", "1", "1", "2",
                "2", "2",
                "2", "4",
                "2", "2",
                "4",
                "2",
                "1",
                "2",
            ],
            "type_local": [
                "Appartement", "Appartement", "Maison", "Maison", "Appartement",
                "Appartement", "Appartement",
                "Appartement", "Dependance",
                "Appartement", "Appartement",
                "Local industriel. commercial ou assimilé",
                "Appartement",
                "Maison",
                "Appartement",
            ],
            "surface_reelle_bati": [
                50.0, 75.0, 120.0, 90.0, 65.0,
                80.0, 30.0,
                60.0, 10.0,
                3.0, 45.0,
                200.0,
                55.0,
                100.0,
                40.0,
            ],
            "nombre_pieces_principales": [
                2, 3, 5, 4, 3,
                3, 1,
                2, 0,
                1, 2,
                0,
                2,
                4,
                2,
            ],
            "nombre_lots": [
                1, 1, 1, 1, 1,
                2, 2,
                2, 2,
                1, 1,
                1,
                1,
                1,
                1,
            ],
            "longitude": [
                2.34, 4.83, 5.37, 1.44, -0.57,
                2.34, 2.34,
                7.26, 7.26,
                -1.55, -1.55,
                3.87,
                2.34,
                3.06,
                2.36,
            ],
            "latitude": [
                48.86, 45.76, 43.30, 43.60, 44.84,
                48.87, 48.87,
                43.71, 43.71,
                47.22, 47.22,
                43.61,
                48.86,
                50.63,
                48.86,
            ],
        }
    )
186
+
187
+
188
@pytest.fixture
def sample_clean_df() -> pl.DataFrame:
    """
    Already-cleaned dataset carrying every derived column — i.e. the
    shape the cleaner hands off to the aggregation stage.
    """
    return pl.DataFrame(
        {
            "id_mutation": ["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8"],
            "date_mutation": [
                "2024-06-15", "2024-01-10", "2023-06-01",
                "2022-01-15", "2021-06-01", "2020-01-10",
                "2024-09-01", "2024-03-15",
            ],
            "type_local": [
                "Appartement", "Maison", "Appartement",
                "Maison", "Appartement", "Maison",
                "Appartement", "Maison",
            ],
            "valeur_fonciere": [
                250000.0, 180000.0, 200000.0,
                150000.0, 300000.0, 120000.0,
                400000.0, 220000.0,
            ],
            "surface_reelle_bati": [
                50.0, 90.0, 80.0,
                100.0, 60.0, 80.0,
                40.0, 110.0,
            ],
            "prix_m2": [
                5000.0, 2000.0, 2500.0,
                1500.0, 5000.0, 1500.0,
                10000.0, 2000.0,
            ],
            # Decay weights consistent with months_since below (base 0.97);
            # M7 is future-dated relative to the reference, hence the
            # negative exponent.
            "temporal_weight": [
                0.97**2, 0.97**7, 0.97**13,
                0.97**25, 0.97**37, 0.97**49,
                0.97**(-2), 0.97**4,
            ],
            "code_departement": ["75", "69", "75", "69", "75", "69", "75", "69"],
            "code_commune": ["75056", "69123", "75056", "69123", "75056", "69123", "75056", "69123"],
            "code_commune_city": ["75056", "69123", "75056", "69123", "75056", "69123", "75056", "69123"],
            "code_postal": ["75001", "69001", "75002", "69002", "75003", "69003", "75001", "69002"],
            "code_section": ["7510100001", "6938100001", "7510200001", "6912300001", "7510300001", "6912300002", "7510100002", "6912300003"],
            "code_region": ["11", "84", "11", "84", "11", "84", "11", "84"],
            "year": ["2024", "2024", "2023", "2022", "2021", "2020", "2024", "2024"],
            "months_since": [2.0, 7.0, 13.0, 25.0, 37.0, 49.0, -2.0, 4.0],
        }
    )
tests/test_aggregator.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for aggregation functions."""
2
+
3
+ import json
4
+ import math
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import pytest
10
+
11
+ from src.aggregator import (
12
+ LEVEL_TO_COLUMN,
13
+ _aggregate_group,
14
+ _export_sections,
15
+ aggregate_all_levels,
16
+ aggregate_all_types,
17
+ aggregate_level,
18
+ effective_sample_size,
19
+ export_json,
20
+ weighted_trimmed_mean,
21
+ )
22
+
23
+
24
# ---- weighted_trimmed_mean ----

def test_wtm_uniform_weights():
    """Equal weights and no trimming: WTM reduces to the arithmetic mean."""
    values = np.array([100.0, 200.0, 300.0, 400.0, 500.0])
    uniform = np.ones(5)
    assert weighted_trimmed_mean(values, uniform, trim=0.0) == pytest.approx(300.0)


def test_wtm_trims_extremes():
    """A 20% trim should clip both tails, damping outlier influence."""
    values = np.array([1.0, 100.0, 200.0, 300.0, 10000.0])
    uniform = np.ones(5)
    trimmed = weighted_trimmed_mean(values, uniform, trim=0.2)
    untrimmed = weighted_trimmed_mean(values, uniform, trim=0.0)
    # The trimmed estimate must be less sensitive to the extremes.
    assert trimmed < untrimmed


def test_wtm_respects_weight_ordering():
    """The mean must shift toward the more heavily weighted observation."""
    values = np.array([100.0, 200.0])
    # Overweight 200 in one call, 100 in the other.
    favor_200 = weighted_trimmed_mean(values, np.array([0.1, 10.0]), trim=0.0)
    favor_100 = weighted_trimmed_mean(values, np.array([10.0, 0.1]), trim=0.0)
    assert favor_200 > favor_100


def test_wtm_empty_array():
    outcome = weighted_trimmed_mean(np.array([]), np.array([]), trim=0.2)
    assert math.isnan(outcome)


def test_wtm_single_element():
    outcome = weighted_trimmed_mean(np.array([5000.0]), np.array([1.0]), trim=0.2)
    assert outcome == pytest.approx(5000.0)


def test_wtm_zero_weights():
    outcome = weighted_trimmed_mean(np.array([100.0, 200.0]), np.array([0.0, 0.0]))
    assert math.isnan(outcome)
68
+
69
+
70
# ---- effective_sample_size ----

def test_ess_equal_weights():
    """Uniform weights: the effective sample size equals the raw count."""
    assert effective_sample_size(np.ones(50)) == pytest.approx(50.0)


def test_ess_unequal_weights():
    """Any weight imbalance must shrink n_eff below n (but keep it above 1)."""
    n_eff = effective_sample_size(np.array([1.0, 1.0, 1.0, 0.01]))
    assert n_eff < 4.0
    assert n_eff > 1.0


def test_ess_single_dominant():
    """One overwhelming weight drives n_eff toward a single observation."""
    n_eff = effective_sample_size(np.array([1000.0, 0.001, 0.001, 0.001]))
    assert n_eff < 2.0


def test_ess_empty():
    assert effective_sample_size(np.array([])) == 0.0


# ---- _aggregate_group ----

def test_aggregate_group_basic():
    frame = pl.DataFrame({
        "prix_m2": [2000.0, 3000.0, 4000.0, 2500.0, 3500.0],
        "temporal_weight": [1.0, 1.0, 1.0, 1.0, 1.0],
    })
    stats = _aggregate_group(frame)
    assert stats["volume"] == 5
    assert stats["median"] == pytest.approx(3000.0, rel=0.01)
    assert stats["q1"] < stats["median"]
    assert stats["q3"] > stats["median"]
    assert stats["n_eff"] == pytest.approx(5.0, abs=0.1)
    assert 0.0 <= stats["confidence"] <= 1.0


def test_aggregate_group_with_temporal_decay():
    """Lower weights on old rows should pull the WTM toward recent prices."""
    frame = pl.DataFrame({
        "prix_m2": [2000.0, 2000.0, 2000.0, 5000.0, 5000.0],
        "temporal_weight": [0.5, 0.5, 0.5, 1.0, 1.0],
    })
    stats = _aggregate_group(frame)
    # The 5000s carry more weight, so the WTM sits above the plain median.
    assert stats["wtm"] > stats["median"]


def test_aggregate_group_empty():
    frame = pl.DataFrame({
        "prix_m2": pl.Series([], dtype=pl.Float64),
        "temporal_weight": pl.Series([], dtype=pl.Float64),
    })
    stats = _aggregate_group(frame)
    assert stats["volume"] == 0
    assert stats["confidence"] == 0.0


def test_aggregate_group_returns_all_keys():
    stats = _aggregate_group(pl.DataFrame({
        "prix_m2": [3000.0],
        "temporal_weight": [1.0],
    }))
    assert set(stats.keys()) == {
        "median", "wtm", "q1", "q3", "volume", "n_eff", "confidence"
    }
142
+
143
+
144
# ---- aggregate_level ----

def test_aggregate_level_groups_correctly(sample_clean_df):
    by_dept = aggregate_level(sample_clean_df, "code_departement")
    assert "75" in by_dept
    assert "69" in by_dept
    # The fixture holds exactly four rows per department.
    assert by_dept["75"]["volume"] == 4
    assert by_dept["69"]["volume"] == 4


def test_aggregate_level_with_property_type(sample_clean_df):
    by_dept = aggregate_level(
        sample_clean_df, "code_departement", property_type="Appartement"
    )
    assert "75" in by_dept
    # Volume must match a manual filter on dept 75 + Appartement.
    manual_filter = sample_clean_df.filter(
        (pl.col("code_departement") == "75")
        & (pl.col("type_local") == "Appartement")
    )
    assert by_dept["75"]["volume"] == len(manual_filter)


def test_aggregate_level_country(sample_clean_df):
    nationwide = aggregate_level(sample_clean_df, "_country")
    assert "FR" in nationwide
    assert nationwide["FR"]["volume"] == 8


# ---- aggregate_all_types ----

def test_aggregate_all_types_keys(sample_clean_df):
    result = aggregate_all_types(sample_clean_df, "code_departement")
    # Every code gets the all-types ("tous") bucket...
    for code in result:
        assert "tous" in result[code]
    # ...plus per-type buckets: the fixture has only Appartement in 75
    # and only Maison in 69.
    assert "appartement" in result["75"]
    assert "maison" in result["69"]


# ---- aggregate_all_levels ----

def test_aggregate_all_levels_keys(sample_clean_df):
    result = aggregate_all_levels(sample_clean_df)
    for level in ["country", "region", "department", "commune", "postcode", "section"]:
        assert level in result
190
+
191
+
192
# ---- _export_sections ----

def test_export_sections_splits_by_dept(tmp_path):
    """Sections are written into one JSON file per department prefix."""
    section_data = {
        "7510100001": {"tous": {"median": 5000}},
        "7510100002": {"tous": {"median": 5100}},
        "6938100001": {"tous": {"median": 3000}},
        "2A004000B0": {"tous": {"median": 2000}},
    }
    _export_sections(section_data, tmp_path)
    sections_dir = tmp_path / "sections"
    for dept in ("75", "69", "2A"):
        assert (sections_dir / f"{dept}.json").exists()

    # Both 75xxx sections land in the same department file.
    with open(sections_dir / "75.json") as f:
        data_75 = json.load(f)
    assert len(data_75) == 2


def test_export_sections_dom_tom(tmp_path):
    """DOM-TOM departments (971-976) use 3-digit dept codes."""
    section_data = {
        "97105000001": {"tous": {"median": 2500}},
        "97205000001": {"tous": {"median": 2600}},
    }
    _export_sections(section_data, tmp_path)
    sections_dir = tmp_path / "sections"
    assert (sections_dir / "971.json").exists()
    assert (sections_dir / "972.json").exists()


# ---- export_json ----

def test_export_json_creates_files(tmp_path, sample_clean_df):
    export_json(aggregate_all_levels(sample_clean_df), tmp_path)
    for level in ("country", "region", "department", "commune", "postcode"):
        assert (tmp_path / f"prices_{level}.json").exists()
    # Section-level data is split into per-department files instead.
    assert (tmp_path / "sections").is_dir()


# ---- LEVEL_TO_COLUMN mapping ----

def test_level_to_column_covers_all_levels():
    from src.config import AGGREGATION_LEVELS
    for level in AGGREGATION_LEVELS:
        assert level in LEVEL_TO_COLUMN
tests/test_cleaner.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for data cleaning functions."""
2
+
3
+ import polars as pl
4
+ import pytest
5
+
6
+ from src.cleaner import (
7
+ add_derived_columns,
8
+ deduplicate_mutations,
9
+ filter_sales,
10
+ normalize_commune_codes,
11
+ remove_outliers,
12
+ )
13
+
14
+
15
def test_filter_sales_keeps_only_vente(sample_raw_df):
    """Mutations whose nature is not 'Vente' are dropped."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    assert (result["nature_mutation"] == "Vente").all()


def test_filter_sales_removes_null_price(sample_raw_df):
    """Rows without a strictly positive valeur_fonciere are dropped."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    assert result["valeur_fonciere"].null_count() == 0
    assert (result["valeur_fonciere"] > 0).all()


def test_filter_sales_removes_null_surface(sample_raw_df):
    """Rows without a strictly positive built surface are dropped."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    assert result["surface_reelle_bati"].null_count() == 0
    assert (result["surface_reelle_bati"] > 0).all()


def test_filter_sales_keeps_only_residential(sample_raw_df):
    """Only Appartement/Maison survive; commercial and outbuildings do not."""
    result = filter_sales(sample_raw_df.lazy()).collect()
    kept_types = result["type_local"].unique().to_list()
    assert "Local industriel. commercial ou assimilé" not in kept_types
    assert "Dependance" not in kept_types
    for kept in kept_types:
        assert kept in ("Appartement", "Maison")
43
+
44
+
45
def test_deduplicate_single_row_mutations(sample_raw_df):
    """After dedup, every id_mutation appears exactly once."""
    deduped = deduplicate_mutations(filter_sales(sample_raw_df.lazy())).collect()
    per_mutation = deduped.group_by("id_mutation").len()
    assert (per_mutation["len"] == 1).all()


def test_deduplicate_multi_row_same_type():
    """M006: two Appartement rows collapse to one row with summed surface."""
    frame = pl.DataFrame({
        "id_mutation": ["M006", "M006"],
        "date_mutation": ["2024-03-10", "2024-03-10"],
        "nature_mutation": ["Vente", "Vente"],
        "valeur_fonciere": [400000.0, 400000.0],
        "code_postal": ["75002", "75002"],
        "code_commune": ["75102", "75102"],
        "nom_commune": ["Paris 2e", "Paris 2e"],
        "code_departement": ["75", "75"],
        "id_parcelle": ["75102000F006", "75102000F007"],
        "code_type_local": ["2", "2"],
        "type_local": ["Appartement", "Appartement"],
        "surface_reelle_bati": [80.0, 30.0],
        "nombre_pieces_principales": [3, 1],
        "nombre_lots": [2, 2],
        "longitude": [2.34, 2.34],
        "latitude": [48.87, 48.87],
    })
    deduped = deduplicate_mutations(frame.lazy()).collect()
    assert len(deduped) == 1
    assert deduped["surface_reelle_bati"][0] == 110.0  # 80 + 30
    # The duplicated price must not be double-counted.
    assert deduped["valeur_fonciere"][0] == 400000.0


def test_deduplicate_mixed_type_excluded():
    """M007 mixes Appartement and Dependance, so the mutation is discarded."""
    frame = pl.DataFrame({
        "id_mutation": ["M007", "M007"],
        "date_mutation": ["2023-06-01", "2023-06-01"],
        "nature_mutation": ["Vente", "Vente"],
        "valeur_fonciere": [350000.0, 350000.0],
        "code_postal": ["06000", "06000"],
        "code_commune": ["06088", "06088"],
        "nom_commune": ["Nice", "Nice"],
        "code_departement": ["06", "06"],
        "id_parcelle": ["06088000G008", "06088000G009"],
        "code_type_local": ["2", "4"],
        "type_local": ["Appartement", "Dependance"],
        "surface_reelle_bati": [60.0, 10.0],
        "nombre_pieces_principales": [2, 0],
        "nombre_lots": [2, 2],
        "longitude": [7.26, 7.26],
        "latitude": [43.71, 43.71],
    })
    deduped = deduplicate_mutations(frame.lazy()).collect()
    assert len(deduped) == 0
102
+
103
+
104
def test_add_derived_columns_prix_m2():
    """prix_m2, code_section, year and code_region are all derived."""
    frame = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["75101000A001"],
        "code_departement": ["75"],
        "date_mutation": ["2024-06-15"],
    })
    derived = add_derived_columns(frame.lazy()).collect()
    assert derived["prix_m2"][0] == pytest.approx(2000.0)
    assert derived["code_section"][0] == "75101000A0"
    assert derived["year"][0] == "2024"
    assert derived["code_region"][0] == "11"  # Île-de-France


def test_add_derived_columns_temporal_weight():
    """A sale ~12 months before the reference date decays by ~0.97**12."""
    frame = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["75101000A001"],
        "code_departement": ["75"],
        "date_mutation": ["2024-01-01"],
    })
    derived = add_derived_columns(frame.lazy()).collect()
    # Roughly 12 months before the reference date (2025-01-01).
    assert derived["months_since"][0] == pytest.approx(12.0, abs=0.5)
    assert 0 < derived["temporal_weight"][0] < 1
    expected_weight = 0.97 ** 12
    assert derived["temporal_weight"][0] == pytest.approx(expected_weight, abs=0.05)


def test_add_derived_columns_corsica():
    """Corsican departments (2A/2B) resolve to region 94."""
    frame = pl.DataFrame({
        "valeur_fonciere": [200000.0],
        "surface_reelle_bati": [100.0],
        "id_parcelle": ["2A004000B001"],
        "code_departement": ["2A"],
        "date_mutation": ["2024-06-15"],
    })
    derived = add_derived_columns(frame.lazy()).collect()
    assert derived["code_region"][0] == "94"  # Corse
145
+
146
+
147
def test_normalize_commune_paris():
    """All Paris arrondissement codes collapse to the city code 75056."""
    frame = pl.DataFrame({"code_commune": ["75101", "75115", "75120"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert (normalized["code_commune_city"] == "75056").all()


def test_normalize_commune_lyon():
    """Lyon arrondissement codes collapse to 69123."""
    frame = pl.DataFrame({"code_commune": ["69381", "69389"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert (normalized["code_commune_city"] == "69123").all()


def test_normalize_commune_marseille():
    """Marseille arrondissement codes collapse to 13055."""
    frame = pl.DataFrame({"code_commune": ["13201", "13216"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert (normalized["code_commune_city"] == "13055").all()


def test_normalize_commune_regular_unchanged():
    """Ordinary commune codes pass through untouched."""
    frame = pl.DataFrame({"code_commune": ["33063", "31555"]})
    normalized = normalize_commune_codes(frame.lazy()).collect()
    assert normalized["code_commune_city"][0] == "33063"
    assert normalized["code_commune_city"][1] == "31555"


def test_remove_outliers_surface():
    """Surfaces outside the sane range are dropped; the middle row survives."""
    frame = pl.DataFrame({
        "surface_reelle_bati": [5.0, 50.0, 1500.0],
        "prix_m2": [2000.0, 2000.0, 2000.0],
    })
    kept = remove_outliers(frame.lazy()).collect()
    assert len(kept) == 1
    assert kept["surface_reelle_bati"][0] == 50.0


def test_remove_outliers_price():
    """Prices per m2 outside the sane range are dropped."""
    frame = pl.DataFrame({
        "surface_reelle_bati": [50.0, 50.0, 50.0],
        "prix_m2": [50.0, 2000.0, 30000.0],
    })
    kept = remove_outliers(frame.lazy()).collect()
    assert len(kept) == 1
    assert kept["prix_m2"][0] == 2000.0
tests/test_config.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for configuration constants and mappings."""
2
+
3
+ from src.config import (
4
+ AGGREGATION_LEVELS,
5
+ ARRONDISSEMENT_MAPPING,
6
+ DEPT_TO_REGION,
7
+ DVF_YEARS,
8
+ NO_DVF_DEPARTMENTS,
9
+ PRICE_M2_MAX,
10
+ PRICE_M2_MIN,
11
+ REGION_NAMES,
12
+ SURFACE_MAX,
13
+ SURFACE_MIN,
14
+ TEMPORAL_LAMBDA,
15
+ TOP_10_CITIES,
16
+ TRIM_FRACTION,
17
+ VALID_TYPE_LOCAL,
18
+ dvf_url,
19
+ )
20
+
21
+
22
+ def test_dvf_years_covers_full_range():
23
+ assert DVF_YEARS == list(range(2014, 2026))
24
+ assert len(DVF_YEARS) == 12
25
+
26
+
27
+ def test_valid_type_local_residential_only():
28
+ assert "Appartement" in VALID_TYPE_LOCAL
29
+ assert "Maison" in VALID_TYPE_LOCAL
30
+ assert len(VALID_TYPE_LOCAL) == 2
31
+ # Commercial should NOT be included
32
+ assert "Local industriel. commercial ou assimilé" not in VALID_TYPE_LOCAL
33
+
34
+
35
+ def test_dept_to_region_coverage():
36
+ # 101 departments total in France (including DOM-TOM)
37
+ assert len(DEPT_TO_REGION) >= 100
38
+ # Spot checks
39
+ assert DEPT_TO_REGION["75"] == "11" # Paris -> Île-de-France
40
+ assert DEPT_TO_REGION["69"] == "84" # Rhône -> Auvergne-Rhône-Alpes
41
+ assert DEPT_TO_REGION["2A"] == "94" # Corse-du-Sud -> Corse
42
+ assert DEPT_TO_REGION["2B"] == "94" # Haute-Corse -> Corse
43
+ assert DEPT_TO_REGION["971"] == "01" # Guadeloupe
44
+
45
+
46
+ def test_region_names():
47
+ assert len(REGION_NAMES) == 18 # 13 metropolitan + 5 overseas
48
+ assert REGION_NAMES["11"] == "Île-de-France"
49
+ assert REGION_NAMES["84"] == "Auvergne-Rhône-Alpes"
50
+
51
+
52
+ def test_arrondissement_mapping_paris():
53
+ for i in range(1, 21):
54
+ code = f"751{i:02d}"
55
+ assert ARRONDISSEMENT_MAPPING[code] == "75056"
56
+
57
+
58
+ def test_arrondissement_mapping_lyon():
59
+ for i in range(1, 10):
60
+ code = f"6938{i}"
61
+ assert ARRONDISSEMENT_MAPPING[code] == "69123"
62
+
63
+
64
+ def test_arrondissement_mapping_marseille():
65
+ for i in range(1, 17):
66
+ code = f"132{i:02d}"
67
+ assert ARRONDISSEMENT_MAPPING[code] == "13055"
68
+
69
+
70
+ def test_top_10_cities():
71
+ assert len(TOP_10_CITIES) == 10
72
+ assert TOP_10_CITIES["75056"] == "Paris"
73
+ assert TOP_10_CITIES["13055"] == "Marseille"
74
+ assert TOP_10_CITIES["69123"] == "Lyon"
75
+
76
+
77
+ def test_no_dvf_departments():
78
+ assert NO_DVF_DEPARTMENTS == {"57", "67", "68", "976"}
79
+
80
+
81
+ def test_aggregation_levels():
82
+ assert AGGREGATION_LEVELS == [
83
+ "country", "region", "department", "commune", "postcode", "section"
84
+ ]
85
+
86
+
87
+ def test_price_bounds_sensible():
88
+ assert PRICE_M2_MIN < PRICE_M2_MAX
89
+ assert PRICE_M2_MIN >= 100
90
+ assert PRICE_M2_MAX <= 50000
91
+
92
+
93
+ def test_surface_bounds_sensible():
94
+ assert SURFACE_MIN < SURFACE_MAX
95
+ assert SURFACE_MIN >= 5
96
+ assert SURFACE_MAX <= 2000
97
+
98
+
99
+ def test_temporal_parameters():
100
+ assert 0.9 < TEMPORAL_LAMBDA < 1.0
101
+ assert 0.0 < TRIM_FRACTION < 0.5
102
+
103
+
104
+ def test_dvf_url_format():
105
+ url = dvf_url(2024)
106
+ assert "2024" in url
107
+ assert url.endswith("full.csv.gz")
108
+ assert url.startswith("https://")
tests/test_top_cities.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for top 10 cities computation."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ import pytest
8
+
9
+ from src.top_cities import compute_top_cities, export_top_cities
10
+
11
+
12
+ def test_compute_top_cities_returns_known_cities(sample_clean_df):
13
+ result = compute_top_cities(sample_clean_df)
14
+ # sample_clean_df has code_commune_city 75056 and 69123
15
+ assert "Paris" in result
16
+ assert "Lyon" in result
17
+
18
+
19
+ def test_compute_top_cities_structure(sample_clean_df):
20
+ result = compute_top_cities(sample_clean_df)
21
+ for city_name, city_data in result.items():
22
+ assert "code" in city_data
23
+ assert "tous" in city_data
24
+ # Check stats structure
25
+ tous = city_data["tous"]
26
+ assert "median" in tous
27
+ assert "wtm" in tous
28
+ assert "volume" in tous
29
+
30
+
31
+ def test_compute_top_cities_has_property_types(sample_clean_df):
32
+ result = compute_top_cities(sample_clean_df)
33
+ paris = result["Paris"]
34
+ # sample_clean_df has only Appartement for Paris (75056)
35
+ assert "appartement" in paris
36
+ lyon = result["Lyon"]
37
+ # sample_clean_df has only Maison for Lyon (69123)
38
+ assert "maison" in lyon
39
+
40
+
41
+ def test_compute_top_cities_no_commercial(sample_clean_df):
42
+ """No 'local' key should exist since commercial was removed."""
43
+ result = compute_top_cities(sample_clean_df)
44
+ for city_data in result.values():
45
+ assert "local" not in city_data
46
+
47
+
48
+ def test_compute_top_cities_volumes(sample_clean_df):
49
+ result = compute_top_cities(sample_clean_df)
50
+ paris = result["Paris"]
51
+ # sample_clean_df has 4 rows with code_commune_city 75056
52
+ assert paris["tous"]["volume"] == 4
53
+
54
+
55
+ def test_compute_top_cities_empty_city():
56
+ """Cities with no data should be skipped."""
57
+ df = pl.DataFrame({
58
+ "id_mutation": ["M1"],
59
+ "code_commune_city": ["99999"], # Not a top 10 city
60
+ "type_local": ["Appartement"],
61
+ "prix_m2": [3000.0],
62
+ "temporal_weight": [1.0],
63
+ })
64
+ result = compute_top_cities(df)
65
+ assert len(result) == 0
66
+
67
+
68
+ def test_export_top_cities_creates_file(tmp_path):
69
+ data = {
70
+ "Paris": {"code": "75056", "tous": {"median": 10000.0}},
71
+ }
72
+ export_top_cities(data, tmp_path)
73
+ path = tmp_path / "top_cities.json"
74
+ assert path.exists()
75
+ with open(path) as f:
76
+ loaded = json.load(f)
77
+ assert loaded["Paris"]["code"] == "75056"