Spaces:
Running
Running
| """ | |
| Configuration constants for the DVF data pipeline. | |
| Single source of truth for paths, URLs, thresholds, and mappings. | |
| """ | |
| from pathlib import Path | |
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Project root: two directory levels above this file's resolved location.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Everything the pipeline reads or writes lives under <root>/data.
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
AGGREGATED_DIR = DATA_DIR.joinpath("aggregated")
# Sub-directory of the aggregated output.
SECTIONS_DIR = AGGREGATED_DIR.joinpath("sections")
# ---------------------------------------------------------------------------
# DVF data source
# ---------------------------------------------------------------------------
# Base location of the per-year geo-DVF CSV exports.
DVF_BASE_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv"

# Years to download: 2020 through 2025 inclusive (six entries).
# NOTE(review): an earlier comment said geo-dvf is only available for the
# last 5 years, which does not match a six-year span — confirm that the
# oldest year is still published upstream.
DVF_YEARS = [*range(2020, 2026)]
def dvf_url(year: int) -> str:
    """Build the download URL of the national geolocalized DVF CSV for *year*.

    The result is ``DVF_BASE_URL`` followed by ``/<year>/full.csv.gz``.
    The year is not checked against ``DVF_YEARS``.
    """
    return DVF_BASE_URL + f"/{year}/full.csv.gz"
# ---------------------------------------------------------------------------
# Subset of CSV columns to load (keeps memory usage down)
# ---------------------------------------------------------------------------
DVF_COLUMNS = [
    # Mutation-level fields
    "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere",
    # Location codes and names
    "code_postal", "code_commune", "nom_commune", "code_departement",
    "id_parcelle",
    # Property description
    "code_type_local", "type_local", "surface_reelle_bati",
    "nombre_pieces_principales", "nombre_lots",
    # Coordinates
    "longitude", "latitude",
]
# ---------------------------------------------------------------------------
# Row filters
# ---------------------------------------------------------------------------
# Only this value of nature_mutation is kept ("Vente" = sale).
VALID_NATURE_MUTATION = "Vente"

# Residential property types only (per Carlos's feedback).
VALID_TYPE_LOCAL = ["Appartement", "Maison"]

# Lower-case short label for each accepted property type.
TYPE_LOCAL_SHORT = {label: label.lower() for label in VALID_TYPE_LOCAL}
# ---------------------------------------------------------------------------
# Temporal weighting and outlier parameters
# ---------------------------------------------------------------------------
# Anchor date for temporal decay.
# NOTE(review): DVF_YEARS runs through 2025, so transactions dated after this
# anchor would have a negative age — confirm how downstream code treats them.
REFERENCE_DATE = "2025-01-01"

# Monthly decay factor; 0.97 ** 23 is roughly 0.5, i.e. a half-life of about
# 23 months if the weight is applied as TEMPORAL_LAMBDA ** months.
TEMPORAL_LAMBDA = 0.97

# Share trimmed from each tail for the trimmed mean (20%).
TRIM_FRACTION = 0.20

# Accepted price range in €/m²: anything lower is almost certainly an error,
# anything higher is extreme luxury or an error.
PRICE_M2_MIN, PRICE_M2_MAX = 200, 25_000

# Accepted built surface in m²: under 9 m² is legally not habitable in France,
# and more than 1000 m² for a single unit is suspect.
SURFACE_MIN, SURFACE_MAX = 9, 1000
# ---------------------------------------------------------------------------
# Department → Region mapping (2016 reform)
# ---------------------------------------------------------------------------
# Source table: "<region code>-<region name>" → comma-separated department
# codes. Both public lookups below are derived from it.
_REGION_DEPTS = {
    "84-Auvergne-Rhône-Alpes": "01,03,07,15,26,38,42,43,63,69,73,74",
    "27-Bourgogne-Franche-Comté": "21,25,39,58,70,71,89,90",
    "53-Bretagne": "22,29,35,56",
    "24-Centre-Val de Loire": "18,28,36,37,41,45",
    "94-Corse": "2A,2B",
    "44-Grand Est": "08,10,51,52,54,55,57,67,68,88",
    "32-Hauts-de-France": "02,59,60,62,80",
    "11-Île-de-France": "75,77,78,91,92,93,94,95",
    "28-Normandie": "14,27,50,61,76",
    "75-Nouvelle-Aquitaine": "16,17,19,23,24,33,40,47,64,79,86,87",
    "76-Occitanie": "09,11,12,30,31,32,34,46,48,65,66,81,82",
    "52-Pays de la Loire": "44,49,53,72,85",
    "93-Provence-Alpes-Côte d'Azur": "04,05,06,13,83,84",
    "01-Guadeloupe": "971",
    "02-Martinique": "972",
    "03-Guyane": "973",
    "04-La Réunion": "974",
    "06-Mayotte": "976",
}

# Comprehensions are used instead of module-level ``for`` loops so that no
# loop temporaries end up as public module attributes (which a star-import
# of this module would otherwise export).

# Region code → region name. Only the first "-" separates code from name, so
# hyphenated region names are preserved intact.
REGION_NAMES: dict[str, str] = {
    region_code: region_name
    for region_code, _, region_name in (
        label.partition("-") for label in _REGION_DEPTS
    )
}

# Department code → region code.
DEPT_TO_REGION: dict[str, str] = {
    dept.strip(): label.partition("-")[0]
    for label, dept_list in _REGION_DEPTS.items()
    for dept in dept_list.split(",")
}

# Departments with no DVF data (Alsace-Moselle + Mayotte)
NO_DVF_DEPARTMENTS = {"57", "67", "68", "976"}
# ---------------------------------------------------------------------------
# Ten largest cities by commune population, keyed by INSEE commune code
# Source: INSEE Recensement de la population
# https://www.data.gouv.fr/datasets/population-municipale-des-communes-france-entiere
# Note: Strasbourg (67482) is in Alsace-Moselle — no DVF data available
# ---------------------------------------------------------------------------
TOP_10_CITIES: dict[str, str] = {
    insee_code: city
    for city, insee_code in (
        ("Paris", "75056"),
        ("Marseille", "13055"),
        ("Lyon", "69123"),
        ("Toulouse", "31555"),
        ("Nice", "06088"),
        ("Nantes", "44109"),
        ("Montpellier", "34172"),
        ("Strasbourg", "67482"),
        ("Bordeaux", "33063"),
        ("Lille", "59350"),
    )
}
# Paris, Lyon and Marseille are split into arrondissements, each with its own
# code; this table maps every arrondissement code back to the code of the
# city it belongs to.
#
# Built from comprehensions rather than module-level ``for`` loops so that
# the loop counter does not linger as a public module attribute.
ARRONDISSEMENT_MAPPING: dict[str, str] = {
    # Paris: 75101-75120 → 75056
    **{f"751{n:02d}": "75056" for n in range(1, 21)},
    # Lyon: 69381-69389 → 69123
    **{f"6938{n}": "69123" for n in range(1, 10)},
    # Marseille: 13201-13216 → 13055
    **{f"132{n:02d}": "13055" for n in range(1, 17)},
}
# ---------------------------------------------------------------------------
# Names of the supported aggregation levels, in this fixed order
# ---------------------------------------------------------------------------
AGGREGATION_LEVELS = [
    "country", "region", "department",
    "commune", "postcode", "section",
]