# Commit e751d0d (dcrey7): "test: add unit tests for all pipeline modules"
"""Shared fixtures for DVF pipeline tests."""
import polars as pl
import pytest
@pytest.fixture
def sample_raw_df() -> pl.DataFrame:
    """
    Return a small synthetic DVF frame laid out like the raw CSV input.

    The data is written one record per tuple (fields in ``column_names``
    order) and then transposed into per-column lists before being handed
    to polars, so each edge case can be read as a whole row. Covered
    cases: plain single-row mutations, a multi-row mutation of one
    property type, a multi-row mutation mixing property types, price
    outliers, a commercial property, a Paris arrondissement, a non-sale
    mutation and a mutation with a null price.
    """
    column_names = (
        "id_mutation",
        "date_mutation",
        "nature_mutation",
        "valeur_fonciere",
        "code_postal",
        "code_commune",
        "nom_commune",
        "code_departement",
        "id_parcelle",
        "code_type_local",
        "type_local",
        "surface_reelle_bati",
        "nombre_pieces_principales",
        "nombre_lots",
        "longitude",
        "latitude",
    )

    records = [
        # Normal single-row mutations
        (
            "M001", "2024-06-15", "Vente", 200000.0,
            "75001", "75101", "Paris 1er", "75", "75101000A001",
            "2", "Appartement", 50.0, 2, 1, 2.34, 48.86,
        ),
        (
            "M002", "2023-01-20", "Vente", 150000.0,
            "69001", "69381", "Lyon 1er", "69", "69381000B002",
            "2", "Appartement", 75.0, 3, 1, 4.83, 45.76,
        ),
        (
            "M003", "2022-07-10", "Vente", 300000.0,
            "13001", "13201", "Marseille 1er", "13", "13201000C003",
            "1", "Maison", 120.0, 5, 1, 5.37, 43.30,
        ),
        (
            "M004", "2021-03-05", "Vente", 180000.0,
            "31000", "31555", "Toulouse", "31", "31555000D004",
            "1", "Maison", 90.0, 4, 1, 1.44, 43.60,
        ),
        (
            "M005", "2020-11-22", "Vente", 250000.0,
            "33000", "33063", "Bordeaux", "33", "33063000E005",
            "2", "Appartement", 65.0, 3, 1, -0.57, 44.84,
        ),
        # Multi-row same type (should be aggregated)
        (
            "M006", "2024-03-10", "Vente", 400000.0,
            "75002", "75102", "Paris 2e", "75", "75102000F006",
            "2", "Appartement", 80.0, 3, 2, 2.34, 48.87,
        ),
        (
            "M006", "2024-03-10", "Vente", 400000.0,
            "75002", "75102", "Paris 2e", "75", "75102000F007",
            "2", "Appartement", 30.0, 1, 2, 2.34, 48.87,
        ),
        # Multi-row mixed type (should be excluded)
        (
            "M007", "2023-06-01", "Vente", 350000.0,
            "06000", "06088", "Nice", "06", "06088000G008",
            "2", "Appartement", 60.0, 2, 2, 7.26, 43.71,
        ),
        # NOTE(review): code "4" is paired with "Dependance" here and with
        # the commercial label on M010 -- confirm this is intended.
        (
            "M007", "2023-06-01", "Vente", 350000.0,
            "06000", "06088", "Nice", "06", "06088000G009",
            "4", "Dependance", 10.0, 0, 2, 7.26, 43.71,
        ),
        # Outliers
        (
            "M008", "2024-01-01", "Vente", 5000.0,
            "44000", "44109", "Nantes", "44", "44109000H010",
            "2", "Appartement", 3.0, 1, 1, -1.55, 47.22,
        ),
        (
            "M009", "2024-01-01", "Vente", 9000000.0,
            "44000", "44109", "Nantes", "44", "44109000H011",
            "2", "Appartement", 45.0, 2, 1, -1.55, 47.22,
        ),
        # Commercial (should be filtered after config change)
        (
            "M010", "2024-05-01", "Vente", 120000.0,
            "34000", "34172", "Montpellier", "34", "34172000I012",
            "4", "Local industriel. commercial ou assimilé", 200.0, 0, 1, 3.87, 43.61,
        ),
        # Paris arrondissement
        # NOTE(review): code_postal equals code_commune ("75101") on this
        # row, whereas M001 uses "75001" for the same commune -- confirm.
        (
            "M011", "2024-08-20", "Vente", 500000.0,
            "75101", "75101", "Paris 1er", "75", "75101000J013",
            "2", "Appartement", 55.0, 2, 1, 2.34, 48.86,
        ),
        # Non-sale
        (
            "M012", "2023-09-15", "Echange", 100000.0,
            "59000", "59350", "Lille", "59", "59350000K014",
            "1", "Maison", 100.0, 4, 1, 3.06, 50.63,
        ),
        # Null price
        (
            "M013", "2024-02-01", "Vente", None,
            "75003", "75103", "Paris 3e", "75", "75103000L015",
            "2", "Appartement", 40.0, 2, 1, 2.36, 48.86,
        ),
    ]

    # Transpose the records into one plain list per column, keeping the
    # column order declared above.
    per_column = zip(*records)
    data = {name: list(values) for name, values in zip(column_names, per_column)}
    return pl.DataFrame(data)
@pytest.fixture
def sample_clean_df() -> pl.DataFrame:
    """
    Return an eight-row frame that already carries the derived columns.

    Stands in for the cleaner's output so aggregation code can be tested
    without running the cleaning step. Rows alternate between a
    department "75" record and a department "69" record.
    """
    # Columns that simply alternate between the two locations / types are
    # one pair repeated four times.
    pair_count = 4
    property_types = ["Appartement", "Maison"] * pair_count
    departments = ["75", "69"] * pair_count
    communes = ["75056", "69123"] * pair_count
    regions = ["11", "84"] * pair_count

    mutation_ids = [f"M{number}" for number in range(1, 9)]

    # Integer exponents matching the months_since values listed below;
    # the negative entry belongs to the 2024-09-01 row.
    decay_exponents = (2, 7, 13, 25, 37, 49, -2, 4)
    weights = [0.97**exponent for exponent in decay_exponents]

    sale_dates = [
        "2024-06-15",
        "2024-01-10",
        "2023-06-01",
        "2022-01-15",
        "2021-06-01",
        "2020-01-10",
        "2024-09-01",
        "2024-03-15",
    ]
    prices = [
        250000.0,
        180000.0,
        200000.0,
        150000.0,
        300000.0,
        120000.0,
        400000.0,
        220000.0,
    ]
    surfaces = [50.0, 90.0, 80.0, 100.0, 60.0, 80.0, 40.0, 110.0]
    prices_per_m2 = [
        5000.0,
        2000.0,
        2500.0,
        1500.0,
        5000.0,
        1500.0,
        10000.0,
        2000.0,
    ]
    postal_codes = [
        "75001",
        "69001",
        "75002",
        "69002",
        "75003",
        "69003",
        "75001",
        "69002",
    ]
    sections = [
        "7510100001",
        "6938100001",
        "7510200001",
        "6912300001",
        "7510300001",
        "6912300002",
        "7510100002",
        "6912300003",
    ]
    years = ["2024", "2024", "2023", "2022", "2021", "2020", "2024", "2024"]
    months_elapsed = [2.0, 7.0, 13.0, 25.0, 37.0, 49.0, -2.0, 4.0]

    data = {
        "id_mutation": mutation_ids,
        "date_mutation": sale_dates,
        "type_local": property_types,
        "valeur_fonciere": prices,
        "surface_reelle_bati": surfaces,
        "prix_m2": prices_per_m2,
        "temporal_weight": weights,
        "code_departement": departments,
        "code_commune": communes,
        "code_commune_city": communes,
        "code_postal": postal_codes,
        "code_section": sections,
        "code_region": regions,
        "year": years,
        "months_since": months_elapsed,
    }
    return pl.DataFrame(data)