""" Configuration constants for the DVF data pipeline. Single source of truth for paths, URLs, thresholds, and mappings. """ from pathlib import Path # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- ROOT_DIR = Path(__file__).resolve().parent.parent DATA_DIR = ROOT_DIR / "data" RAW_DIR = DATA_DIR / "raw" PROCESSED_DIR = DATA_DIR / "processed" AGGREGATED_DIR = DATA_DIR / "aggregated" SECTIONS_DIR = AGGREGATED_DIR / "sections" # --------------------------------------------------------------------------- # DVF data source # --------------------------------------------------------------------------- DVF_BASE_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv" DVF_YEARS = list(range(2020, 2026)) # 2020-2025: geo-dvf only available for last 5 years def dvf_url(year: int) -> str: """Return download URL for a given year's national DVF geolocalized CSV.""" return f"{DVF_BASE_URL}/{year}/full.csv.gz" # --------------------------------------------------------------------------- # Columns we actually need (saves memory on load) # --------------------------------------------------------------------------- DVF_COLUMNS = [ "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere", "code_postal", "code_commune", "nom_commune", "code_departement", "id_parcelle", "code_type_local", "type_local", "surface_reelle_bati", "nombre_pieces_principales", "nombre_lots", "longitude", "latitude", ] # --------------------------------------------------------------------------- # Filtering thresholds # --------------------------------------------------------------------------- VALID_NATURE_MUTATION = "Vente" VALID_TYPE_LOCAL = ["Appartement", "Maison"] # Residential only per Carlos's feedback TYPE_LOCAL_SHORT = { "Appartement": "appartement", "Maison": "maison", } # --------------------------------------------------------------------------- # Temporal weighting parameters # --------------------------------------------------------------------------- REFERENCE_DATE = "2025-01-01" # Anchor date for temporal decay TEMPORAL_LAMBDA = 0.97 # Monthly decay factor (half-life ~23 months) TRIM_FRACTION = 0.20 # Trim 20% from each tail for trimmed mean # Price per m² bounds for outlier removal PRICE_M2_MIN = 200 # €/m² — below this is almost certainly an error PRICE_M2_MAX = 25_000 # €/m² — above this is extreme luxury / error SURFACE_MIN = 9 # m² — below 9m² is legally not habitable in France SURFACE_MAX = 1000 # m² — above this for a single unit is suspect # --------------------------------------------------------------------------- # Department → Region mapping (2016 reform) # --------------------------------------------------------------------------- DEPT_TO_REGION: dict[str, str] = {} _REGION_DEPTS = { "84-Auvergne-Rhône-Alpes": "01,03,07,15,26,38,42,43,63,69,73,74", "27-Bourgogne-Franche-Comté": "21,25,39,58,70,71,89,90", "53-Bretagne": "22,29,35,56", "24-Centre-Val de Loire": "18,28,36,37,41,45", "94-Corse": "2A,2B", "44-Grand Est": "08,10,51,52,54,55,57,67,68,88", "32-Hauts-de-France": "02,59,60,62,80", "11-Île-de-France": "75,77,78,91,92,93,94,95", "28-Normandie": "14,27,50,61,76", "75-Nouvelle-Aquitaine": "16,17,19,23,24,33,40,47,64,79,86,87", "76-Occitanie": "09,11,12,30,31,32,34,46,48,65,66,81,82", "52-Pays de la Loire": "44,49,53,72,85", "93-Provence-Alpes-Côte d'Azur": "04,05,06,13,83,84", "01-Guadeloupe": "971", "02-Martinique": "972", "03-Guyane": "973", "04-La Réunion": "974", "06-Mayotte": "976", } REGION_NAMES: dict[str, str] = {} for key, depts_str in _REGION_DEPTS.items(): code, name = key.split("-", 1) REGION_NAMES[code] = name for d in depts_str.split(","): DEPT_TO_REGION[d.strip()] = code # Departments with no DVF data (Alsace-Moselle + Mayotte) NO_DVF_DEPARTMENTS = {"57", "67", "68", "976"} # --------------------------------------------------------------------------- # Top 10 cities by commune population (INSEE code → name) # Source: INSEE Recensement de la population # https://www.data.gouv.fr/datasets/population-municipale-des-communes-france-entiere # Note: Strasbourg (67482) is in Alsace-Moselle — no DVF data available # --------------------------------------------------------------------------- TOP_10_CITIES: dict[str, str] = { "75056": "Paris", "13055": "Marseille", "69123": "Lyon", "31555": "Toulouse", "06088": "Nice", "44109": "Nantes", "34172": "Montpellier", "67482": "Strasbourg", "33063": "Bordeaux", "59350": "Lille", } # Paris, Lyon, Marseille have arrondissements — we need to map them back ARRONDISSEMENT_MAPPING: dict[str, str] = {} # Paris: 75101-75120 → 75056 for i in range(1, 21): ARRONDISSEMENT_MAPPING[f"751{i:02d}"] = "75056" # Lyon: 69381-69389 → 69123 for i in range(1, 10): ARRONDISSEMENT_MAPPING[f"6938{i}"] = "69123" # Marseille: 13201-13216 → 13055 for i in range(1, 17): ARRONDISSEMENT_MAPPING[f"132{i:02d}"] = "13055" # --------------------------------------------------------------------------- # Aggregation levels # --------------------------------------------------------------------------- AGGREGATION_LEVELS = [ "country", "region", "department", "commune", "postcode", "section", ]