# Page-extraction residue, kept as comments so the module parses:
#   dcrey7's picture
#   feat: government vector tiles, dept selection, fix section rendering
#   9eba1e1
"""
Configuration constants for the DVF data pipeline.
Single source of truth for paths, URLs, thresholds, and mappings.
"""
from pathlib import Path
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Resolved location two directory levels above this file.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Everything the pipeline reads or writes sits under <ROOT_DIR>/data.
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
AGGREGATED_DIR = DATA_DIR.joinpath("aggregated")
# Section-level outputs get their own sub-folder of the aggregated directory.
SECTIONS_DIR = AGGREGATED_DIR.joinpath("sections")
# ---------------------------------------------------------------------------
# DVF data source
# ---------------------------------------------------------------------------
DVF_BASE_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv"

# Yearly files to download: 2020 through 2025 inclusive (six files).
# NOTE(review): the previous comment said geo-dvf only covers the last 5 years,
# yet this range spans six — confirm the oldest year is still published.
DVF_YEARS = list(range(2020, 2026))


def dvf_url(year: int) -> str:
    """Return the download URL of the national geolocated DVF CSV for *year*.

    Args:
        year: Calendar year of the yearly file, e.g. ``2023``.

    Returns:
        ``"<DVF_BASE_URL>/<year>/full.csv.gz"``. The URL is only formatted:
        no request is made and *year* is not checked against ``DVF_YEARS``.
    """
    return f"{DVF_BASE_URL}/{year}/full.csv.gz"
# ---------------------------------------------------------------------------
# Subset of DVF columns kept at load time (fewer columns, less memory)
# ---------------------------------------------------------------------------
# The grouping comments below are by apparent theme, inferred from the column
# names; the list order itself is unchanged.
DVF_COLUMNS = [
    # transaction
    "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere",
    # administrative location
    "code_postal", "code_commune", "nom_commune", "code_departement",
    "id_parcelle",
    # property description
    "code_type_local", "type_local", "surface_reelle_bati",
    "nombre_pieces_principales", "nombre_lots",
    # coordinates
    "longitude", "latitude",
]
# ---------------------------------------------------------------------------
# Row filters
# ---------------------------------------------------------------------------
# Only this value of `nature_mutation` is accepted.
VALID_NATURE_MUTATION = "Vente"

# Accepted `type_local` values — residential only (per Carlos's feedback).
VALID_TYPE_LOCAL = ["Appartement", "Maison"]

# Short label for each accepted type: currently just the lower-cased name.
TYPE_LOCAL_SHORT = {label: label.lower() for label in VALID_TYPE_LOCAL}
# ---------------------------------------------------------------------------
# Temporal weighting parameters
# ---------------------------------------------------------------------------
# Anchor date of the temporal decay (ISO "YYYY-MM-DD" string).
# NOTE(review): DVF_YEARS runs through 2025, so some sales are dated after this
# anchor — confirm how the weighting code treats a negative elapsed time.
REFERENCE_DATE = "2025-01-01"

# Decay factor per month. ln(0.5) / ln(0.97) ≈ 22.8, i.e. a half-life of
# roughly 23 months.
TEMPORAL_LAMBDA = 0.97

# Share of observations dropped from EACH tail by the trimmed mean (20%).
TRIM_FRACTION = 0.2

# Outlier bounds on price per m² (€/m²): anything under the minimum is almost
# certainly a data error; anything over the maximum is extreme luxury or error.
PRICE_M2_MIN, PRICE_M2_MAX = 200, 25_000

# Outlier bounds on built surface (m²): under 9 m² is legally not habitable in
# France; over 1000 m² for a single unit is suspect.
SURFACE_MIN, SURFACE_MAX = 9, 1_000
# ---------------------------------------------------------------------------
# Department → Region mapping (2016 reform)
# ---------------------------------------------------------------------------
# Source table: "<region code>-<region name>" → comma-separated department
# codes. The two public lookups below are derived from it.
_REGION_DEPTS = {
    "84-Auvergne-Rhône-Alpes": "01,03,07,15,26,38,42,43,63,69,73,74",
    "27-Bourgogne-Franche-Comté": "21,25,39,58,70,71,89,90",
    "53-Bretagne": "22,29,35,56",
    "24-Centre-Val de Loire": "18,28,36,37,41,45",
    "94-Corse": "2A,2B",
    "44-Grand Est": "08,10,51,52,54,55,57,67,68,88",
    "32-Hauts-de-France": "02,59,60,62,80",
    "11-Île-de-France": "75,77,78,91,92,93,94,95",
    "28-Normandie": "14,27,50,61,76",
    "75-Nouvelle-Aquitaine": "16,17,19,23,24,33,40,47,64,79,86,87",
    "76-Occitanie": "09,11,12,30,31,32,34,46,48,65,66,81,82",
    "52-Pays de la Loire": "44,49,53,72,85",
    "93-Provence-Alpes-Côte d'Azur": "04,05,06,13,83,84",
    "01-Guadeloupe": "971",
    "02-Martinique": "972",
    "03-Guyane": "973",
    "04-La Réunion": "974",
    "06-Mayotte": "976",
}

# Region code → region name. Only the FIRST hyphen separates code from name,
# so hyphenated names such as "Auvergne-Rhône-Alpes" stay intact.
# Built with comprehensions so no loop variable leaks into the module
# namespace (and therefore into `from <module> import *`).
REGION_NAMES: dict[str, str] = {
    region_code: region_name
    for region_code, _, region_name in (
        entry.partition("-") for entry in _REGION_DEPTS
    )
}

# Department code → region code (codes are strings: "2A", "01", "971", ...).
DEPT_TO_REGION: dict[str, str] = {
    dept.strip(): entry.partition("-")[0]
    for entry, dept_list in _REGION_DEPTS.items()
    for dept in dept_list.split(",")
}

# Departments with no DVF data (Alsace-Moselle + Mayotte)
NO_DVF_DEPARTMENTS = {"57", "67", "68", "976"}
# ---------------------------------------------------------------------------
# The ten most populous communes, keyed by INSEE commune code
# Population source: INSEE Recensement de la population
# https://www.data.gouv.fr/datasets/population-municipale-des-communes-france-entiere
# ---------------------------------------------------------------------------
TOP_10_CITIES: dict[str, str] = {
    "75056": "Paris",
    "13055": "Marseille",
    "69123": "Lyon",
    "31555": "Toulouse",
    "06088": "Nice",
    "44109": "Nantes",
    "34172": "Montpellier",
    "67482": "Strasbourg",  # Alsace-Moselle: listed, but no DVF data exists
    "33063": "Bordeaux",
    "59350": "Lille",
}
# Paris, Lyon and Marseille are split into arrondissements that carry their
# own commune codes; this table folds each arrondissement code back onto the
# code of the city it belongs to.
# Written as a single dict display so no loop index is left behind in the
# module namespace (and exported by `from <module> import *`).
ARRONDISSEMENT_MAPPING: dict[str, str] = {
    # Paris: 75101-75120 → 75056
    **{f"751{num:02d}": "75056" for num in range(1, 21)},
    # Lyon: 69381-69389 → 69123
    **{f"6938{num}": "69123" for num in range(1, 10)},
    # Marseille: 13201-13216 → 13055
    **{f"132{num:02d}": "13055" for num in range(1, 17)},
}
# ---------------------------------------------------------------------------
# Geographic levels at which aggregates are produced
# ---------------------------------------------------------------------------
# Listed from the widest area (whole country) down to the smallest (section).
AGGREGATION_LEVELS = [
    "country", "region", "department",
    "commune", "postcode", "section",
]