"""Shared fixtures for DVF pipeline tests."""

import polars as pl
import pytest


def _columns_from_records(names: tuple, records: list) -> dict:
    """Pivot row tuples into a ``{column name: list of values}`` mapping.

    ``names`` gives the column order; every record must list its values in
    that same order. The resulting dict preserves the order of ``names``.
    """
    return {name: list(values) for name, values in zip(names, zip(*records))}


@pytest.fixture
def sample_raw_df() -> pl.DataFrame:
    """
    Tiny hand-written DVF extract shaped like the raw CSV.

    One record per raw line (15 lines, 13 distinct mutations). The rows are
    chosen to cover known edge cases: multi-row mutations, mixed types,
    arrondissements, outliers, commercial properties, a non-sale and a
    missing price.
    """
    fields = (
        "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere",
        "code_postal", "code_commune", "nom_commune", "code_departement", "id_parcelle",
        "code_type_local", "type_local", "surface_reelle_bati",
        "nombre_pieces_principales", "nombre_lots", "longitude", "latitude",
    )
    # Each record spans three lines, following the three groups of `fields` above.
    # fmt: off
    records = [
        # --- Normal single-row mutations ---
        (
            "M001", "2024-06-15", "Vente", 200000.0,
            "75001", "75101", "Paris 1er", "75", "75101000A001",
            "2", "Appartement", 50.0, 2, 1, 2.34, 48.86,
        ),
        (
            "M002", "2023-01-20", "Vente", 150000.0,
            "69001", "69381", "Lyon 1er", "69", "69381000B002",
            "2", "Appartement", 75.0, 3, 1, 4.83, 45.76,
        ),
        (
            "M003", "2022-07-10", "Vente", 300000.0,
            "13001", "13201", "Marseille 1er", "13", "13201000C003",
            "1", "Maison", 120.0, 5, 1, 5.37, 43.30,
        ),
        (
            "M004", "2021-03-05", "Vente", 180000.0,
            "31000", "31555", "Toulouse", "31", "31555000D004",
            "1", "Maison", 90.0, 4, 1, 1.44, 43.60,
        ),
        (
            "M005", "2020-11-22", "Vente", 250000.0,
            "33000", "33063", "Bordeaux", "33", "33063000E005",
            "2", "Appartement", 65.0, 3, 1, -0.57, 44.84,
        ),
        # --- Multi-row, same type on both lines (should be aggregated) ---
        (
            "M006", "2024-03-10", "Vente", 400000.0,
            "75002", "75102", "Paris 2e", "75", "75102000F006",
            "2", "Appartement", 80.0, 3, 2, 2.34, 48.87,
        ),
        (
            "M006", "2024-03-10", "Vente", 400000.0,
            "75002", "75102", "Paris 2e", "75", "75102000F007",
            "2", "Appartement", 30.0, 1, 2, 2.34, 48.87,
        ),
        # --- Multi-row, mixed types (should be excluded) ---
        (
            "M007", "2023-06-01", "Vente", 350000.0,
            "06000", "06088", "Nice", "06", "06088000G008",
            "2", "Appartement", 60.0, 2, 2, 7.26, 43.71,
        ),
        # NOTE(review): this "Dependance" line carries code_type_local "4",
        # the same code as the commercial row M010 — confirm it is intended.
        (
            "M007", "2023-06-01", "Vente", 350000.0,
            "06000", "06088", "Nice", "06", "06088000G009",
            "4", "Dependance", 10.0, 0, 2, 7.26, 43.71,
        ),
        # --- Outliers (very low, then very high price) ---
        (
            "M008", "2024-01-01", "Vente", 5000.0,
            "44000", "44109", "Nantes", "44", "44109000H010",
            "2", "Appartement", 3.0, 1, 1, -1.55, 47.22,
        ),
        (
            "M009", "2024-01-01", "Vente", 9000000.0,
            "44000", "44109", "Nantes", "44", "44109000H011",
            "2", "Appartement", 45.0, 2, 1, -1.55, 47.22,
        ),
        # --- Commercial (should be filtered after config change) ---
        (
            "M010", "2024-05-01", "Vente", 120000.0,
            "34000", "34172", "Montpellier", "34", "34172000I012",
            "4", "Local industriel. commercial ou assimilé", 200.0, 0, 1, 3.87, 43.61,
        ),
        # --- Paris arrondissement ---
        # NOTE(review): code_postal here equals the commune code ("75101"),
        # unlike M001 which pairs "75001" with "75101" — confirm it is intended.
        (
            "M011", "2024-08-20", "Vente", 500000.0,
            "75101", "75101", "Paris 1er", "75", "75101000J013",
            "2", "Appartement", 55.0, 2, 1, 2.34, 48.86,
        ),
        # --- Non-sale ---
        (
            "M012", "2023-09-15", "Echange", 100000.0,
            "59000", "59350", "Lille", "59", "59350000K014",
            "1", "Maison", 100.0, 4, 1, 3.06, 50.63,
        ),
        # --- Null price ---
        (
            "M013", "2024-02-01", "Vente", None,
            "75003", "75103", "Paris 3e", "75", "75103000L015",
            "2", "Appartement", 40.0, 2, 1, 2.36, 48.86,
        ),
    ]
    # fmt: on
    return pl.DataFrame(_columns_from_records(fields, records))


@pytest.fixture
def sample_clean_df() -> pl.DataFrame:
    """
    Already-cleaned transactions carrying the derived columns, ready for aggregation.

    Stands in for the output of the cleaner: eight single-row mutations
    alternating between departement 75 (Appartement) and 69 (Maison). In every
    row ``prix_m2`` is ``valeur_fonciere / surface_reelle_bati`` and
    ``temporal_weight`` is ``0.97 ** months_since``.
    """
    fields = (
        "id_mutation", "date_mutation", "type_local",
        "valeur_fonciere", "surface_reelle_bati", "prix_m2", "temporal_weight",
        "code_departement", "code_commune", "code_commune_city", "code_postal",
        "code_section", "code_region", "year", "months_since",
    )
    # Each record spans two lines: transaction values first, then geography/time.
    # M7 has a negative months_since, hence a temporal weight above 1.
    # fmt: off
    records = [
        (
            "M1", "2024-06-15", "Appartement", 250000.0, 50.0, 5000.0, 0.97**2,
            "75", "75056", "75056", "75001", "7510100001", "11", "2024", 2.0,
        ),
        (
            "M2", "2024-01-10", "Maison", 180000.0, 90.0, 2000.0, 0.97**7,
            "69", "69123", "69123", "69001", "6938100001", "84", "2024", 7.0,
        ),
        (
            "M3", "2023-06-01", "Appartement", 200000.0, 80.0, 2500.0, 0.97**13,
            "75", "75056", "75056", "75002", "7510200001", "11", "2023", 13.0,
        ),
        (
            "M4", "2022-01-15", "Maison", 150000.0, 100.0, 1500.0, 0.97**25,
            "69", "69123", "69123", "69002", "6912300001", "84", "2022", 25.0,
        ),
        (
            "M5", "2021-06-01", "Appartement", 300000.0, 60.0, 5000.0, 0.97**37,
            "75", "75056", "75056", "75003", "7510300001", "11", "2021", 37.0,
        ),
        (
            "M6", "2020-01-10", "Maison", 120000.0, 80.0, 1500.0, 0.97**49,
            "69", "69123", "69123", "69003", "6912300002", "84", "2020", 49.0,
        ),
        (
            "M7", "2024-09-01", "Appartement", 400000.0, 40.0, 10000.0, 0.97**(-2),
            "75", "75056", "75056", "75001", "7510100002", "11", "2024", -2.0,
        ),
        (
            "M8", "2024-03-15", "Maison", 220000.0, 110.0, 2000.0, 0.97**4,
            "69", "69123", "69123", "69002", "6912300003", "84", "2024", 4.0,
        ),
    ]
    # fmt: on
    return pl.DataFrame(_columns_from_records(fields, records))