Spaces:
Sleeping
Sleeping
"""Shared fixtures for DVF pipeline tests."""
import polars as pl
import pytest
@pytest.fixture
def sample_raw_df() -> pl.DataFrame:
    """Return a small synthetic DVF dataset mimicking the raw CSV structure.

    Registered as a pytest fixture so tests can request it by parameter name.

    The frame holds 15 rows spread over 13 mutation ids (``M001``-``M013``).
    Every column lists its values in the same row order as ``id_mutation``,
    grouped line by line as follows:

    * ``M001``-``M005``: normal single-row sales.
    * ``M006`` (2 rows): multi-row mutation, both rows "Appartement".
    * ``M007`` (2 rows): multi-row mutation mixing "Appartement" and
      "Dependance".
    * ``M008``, ``M009``: outliers (``valeur_fonciere`` of 5000.0 and
      9000000.0).
    * ``M010``: commercial premises.
    * ``M011``: Paris arrondissement row (``code_commune`` "75101").
    * ``M012``: non-sale (``nature_mutation`` "Echange").
    * ``M013``: null ``valeur_fonciere``.

    Returns:
        A ``pl.DataFrame`` built from the literal column data below; dates
        and codes are kept as strings, as they are written here.
    """
    return pl.DataFrame(
        {
            "id_mutation": [
                # Normal single-row mutations
                "M001", "M002", "M003", "M004", "M005",
                # Multi-row same type (should be aggregated)
                "M006", "M006",
                # Multi-row mixed type (should be excluded)
                "M007", "M007",
                # Outliers
                "M008", "M009",
                # Commercial (should be filtered after config change)
                "M010",
                # Paris arrondissement
                "M011",
                # Non-sale
                "M012",
                # Null price
                "M013",
            ],
            "date_mutation": [
                "2024-06-15", "2023-01-20", "2022-07-10", "2021-03-05", "2020-11-22",
                "2024-03-10", "2024-03-10",
                "2023-06-01", "2023-06-01",
                "2024-01-01", "2024-01-01",
                "2024-05-01",
                "2024-08-20",
                "2023-09-15",
                "2024-02-01",
            ],
            "nature_mutation": [
                "Vente", "Vente", "Vente", "Vente", "Vente",
                "Vente", "Vente",
                "Vente", "Vente",
                "Vente", "Vente",
                "Vente",
                "Vente",
                "Echange",  # M012: the only non-"Vente" row
                "Vente",
            ],
            "valeur_fonciere": [
                200000.0, 150000.0, 300000.0, 180000.0, 250000.0,
                400000.0, 400000.0,
                350000.0, 350000.0,
                5000.0, 9000000.0,  # M008 / M009: outlier prices
                120000.0,
                500000.0,
                100000.0,
                None,  # M013: null price
            ],
            "code_postal": [
                "75001", "69001", "13001", "31000", "33000",
                "75002", "75002",
                "06000", "06000",
                "44000", "44000",
                "34000",
                "75101",
                "59000",
                "75003",
            ],
            "code_commune": [
                "75101", "69381", "13201", "31555", "33063",
                "75102", "75102",
                "06088", "06088",
                "44109", "44109",
                "34172",
                "75101",
                "59350",
                "75103",
            ],
            "nom_commune": [
                "Paris 1er", "Lyon 1er", "Marseille 1er", "Toulouse", "Bordeaux",
                "Paris 2e", "Paris 2e",
                "Nice", "Nice",
                "Nantes", "Nantes",
                "Montpellier",
                "Paris 1er",
                "Lille",
                "Paris 3e",
            ],
            "code_departement": [
                "75", "69", "13", "31", "33",
                "75", "75",
                "06", "06",
                "44", "44",
                "34",
                "75",
                "59",
                "75",
            ],
            "id_parcelle": [
                "75101000A001", "69381000B002", "13201000C003", "31555000D004", "33063000E005",
                "75102000F006", "75102000F007",
                "06088000G008", "06088000G009",
                "44109000H010", "44109000H011",
                "34172000I012",
                "75101000J013",
                "59350000K014",
                "75103000L015",
            ],
            "code_type_local": [
                "2", "2", "1", "1", "2",
                "2", "2",
                # NOTE(review): "4" is paired with "Dependance" here, while
                # M010 pairs "4" with the commercial label - confirm the
                # intended code for this row.
                "2", "4",
                "2", "2",
                "4",
                "2",
                "1",
                "2",
            ],
            "type_local": [
                "Appartement", "Appartement", "Maison", "Maison", "Appartement",
                "Appartement", "Appartement",
                "Appartement", "Dependance",  # M007: mixed types
                "Appartement", "Appartement",
                "Local industriel. commercial ou assimilé",
                "Appartement",
                "Maison",
                "Appartement",
            ],
            "surface_reelle_bati": [
                50.0, 75.0, 120.0, 90.0, 65.0,
                80.0, 30.0,
                60.0, 10.0,
                3.0, 45.0,
                200.0,
                55.0,
                100.0,
                40.0,
            ],
            "nombre_pieces_principales": [
                2, 3, 5, 4, 3,
                3, 1,
                2, 0,
                1, 2,
                0,
                2,
                4,
                2,
            ],
            "nombre_lots": [
                1, 1, 1, 1, 1,
                2, 2,
                2, 2,
                1, 1,
                1,
                1,
                1,
                1,
            ],
            "longitude": [
                2.34, 4.83, 5.37, 1.44, -0.57,
                2.34, 2.34,
                7.26, 7.26,
                -1.55, -1.55,
                3.87,
                2.34,
                3.06,
                2.36,
            ],
            "latitude": [
                48.86, 45.76, 43.30, 43.60, 44.84,
                48.87, 48.87,
                43.71, 43.71,
                47.22, 47.22,
                43.61,
                48.86,
                50.63,
                48.86,
            ],
        }
    )
@pytest.fixture
def sample_clean_df() -> pl.DataFrame:
    """Return a pre-cleaned dataset with derived columns, ready for aggregation.

    Registered as a pytest fixture so tests can request it by parameter name.
    Represents what comes out of the cleaner.

    The frame holds 8 single-row mutations (``M1``-``M8``) alternating
    between department "75" (commune "75056", region "11") and department
    "69" (commune "69123", region "84"). The derived columns are consistent
    with the base columns as written here:

    * ``prix_m2`` equals ``valeur_fonciere / surface_reelle_bati`` per row.
    * ``temporal_weight`` is ``0.97`` raised to the row's ``months_since``.
    * ``M7`` has a negative ``months_since`` (-2.0), so its weight is
      greater than 1.

    Returns:
        A ``pl.DataFrame`` built from the literal column data below.
    """
    return pl.DataFrame(
        {
            "id_mutation": ["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8"],
            "date_mutation": [
                "2024-06-15", "2024-01-10", "2023-06-01",
                "2022-01-15", "2021-06-01", "2020-01-10",
                "2024-09-01", "2024-03-15",
            ],
            "type_local": [
                "Appartement", "Maison", "Appartement",
                "Maison", "Appartement", "Maison",
                "Appartement", "Maison",
            ],
            "valeur_fonciere": [
                250000.0, 180000.0, 200000.0,
                150000.0, 300000.0, 120000.0,
                400000.0, 220000.0,
            ],
            "surface_reelle_bati": [
                50.0, 90.0, 80.0,
                100.0, 60.0, 80.0,
                40.0, 110.0,
            ],
            "prix_m2": [
                5000.0, 2000.0, 2500.0,
                1500.0, 5000.0, 1500.0,
                10000.0, 2000.0,
            ],
            # Exponents mirror the "months_since" column below; kept as
            # literal expressions so the exact float values are unchanged.
            "temporal_weight": [
                0.97**2, 0.97**7, 0.97**13,
                0.97**25, 0.97**37, 0.97**49,
                0.97**(-2), 0.97**4,
            ],
            "code_departement": ["75", "69", "75", "69", "75", "69", "75", "69"],
            "code_commune": [
                "75056", "69123", "75056", "69123",
                "75056", "69123", "75056", "69123",
            ],
            "code_commune_city": [
                "75056", "69123", "75056", "69123",
                "75056", "69123", "75056", "69123",
            ],
            "code_postal": [
                "75001", "69001", "75002", "69002",
                "75003", "69003", "75001", "69002",
            ],
            "code_section": [
                "7510100001", "6938100001", "7510200001", "6912300001",
                "7510300001", "6912300002", "7510100002", "6912300003",
            ],
            "code_region": ["11", "84", "11", "84", "11", "84", "11", "84"],
            "year": ["2024", "2024", "2023", "2022", "2021", "2020", "2024", "2024"],
            "months_since": [2.0, 7.0, 13.0, 25.0, 37.0, 49.0, -2.0, 4.0],
        }
    )