Spaces:
Sleeping
Sleeping
File size: 5,558 Bytes
"""
Configuration constants for the DVF data pipeline.
Single source of truth for paths, URLs, thresholds, and mappings.
"""
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Project root: the parent of the directory that contains this file.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Every data directory hangs off ROOT_DIR / "data".
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
AGGREGATED_DIR = DATA_DIR.joinpath("aggregated")
SECTIONS_DIR = AGGREGATED_DIR.joinpath("sections")
# ---------------------------------------------------------------------------
# DVF data source
# ---------------------------------------------------------------------------
# Base location of the per-year geo-dvf CSV exports (no trailing slash).
DVF_BASE_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv"

# Years to download: 2020 through 2025 inclusive (six entries).
# NOTE(review): the earlier note here said geo-dvf only keeps the last
# 5 years -- confirm the oldest year is still published upstream.
DVF_YEARS = [*range(2020, 2026)]
def dvf_url(year: int) -> str:
    """Build the download URL of the national geolocalized DVF CSV for *year*.

    The result is ``<DVF_BASE_URL>/<year>/full.csv.gz``. No check is made
    that *year* is one of ``DVF_YEARS``.
    """
    year_dir = f"{DVF_BASE_URL}/{year}"
    return f"{year_dir}/full.csv.gz"
# ---------------------------------------------------------------------------
# Columns we actually need (saves memory on load)
# ---------------------------------------------------------------------------
# Subset of CSV columns to load. Entries are grouped by theme for
# readability only; the overall order is unchanged.
DVF_COLUMNS = [
    # mutation-level fields
    "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere",
    # location codes and parcel identifier
    "code_postal", "code_commune", "nom_commune", "code_departement",
    "id_parcelle",
    # premises ("local") description
    "code_type_local", "type_local", "surface_reelle_bati",
    "nombre_pieces_principales", "nombre_lots",
    # coordinates
    "longitude", "latitude",
]
# ---------------------------------------------------------------------------
# Filtering thresholds
# ---------------------------------------------------------------------------
# Accepted value for the `nature_mutation` column.
VALID_NATURE_MUTATION = "Vente"

# Accepted values for the `type_local` column.
# Residential only, per Carlos's feedback.
VALID_TYPE_LOCAL = [
    "Appartement",
    "Maison",
]

# `type_local` label -> lowercase short name.
TYPE_LOCAL_SHORT = {"Appartement": "appartement", "Maison": "maison"}
# ---------------------------------------------------------------------------
# Temporal weighting parameters
# ---------------------------------------------------------------------------
# Anchor date for the temporal decay (ISO "YYYY-MM-DD" string).
# NOTE(review): DVF_YEARS includes 2025, so some transactions fall after
# this anchor -- confirm the weighting code handles that case.
REFERENCE_DATE = "2025-01-01"

# Monthly decay factor; 0.97 ** 23 is about 0.5, i.e. a half-life of
# roughly 23 months.
TEMPORAL_LAMBDA = 0.97

# Share trimmed from EACH tail when computing the trimmed mean (20%).
TRIM_FRACTION = 0.2

# Outlier bounds on price per square metre, in EUR/m2.
PRICE_M2_MIN = 200  # anything lower is almost certainly a data error
PRICE_M2_MAX = 25000  # anything higher is extreme luxury or an error

# Outlier bounds on built surface, in m2.
SURFACE_MIN = 9  # under 9 m2 is not legally habitable in France
SURFACE_MAX = 1_000  # a single unit above this is suspect
# ---------------------------------------------------------------------------
# Department → Region mapping (2016 reform)
# ---------------------------------------------------------------------------
# Source table: "<region code>-<region name>" -> comma-separated list of the
# department codes that belong to that region.
_REGION_DEPTS = {
    "84-Auvergne-Rhône-Alpes": "01,03,07,15,26,38,42,43,63,69,73,74",
    "27-Bourgogne-Franche-Comté": "21,25,39,58,70,71,89,90",
    "53-Bretagne": "22,29,35,56",
    "24-Centre-Val de Loire": "18,28,36,37,41,45",
    "94-Corse": "2A,2B",
    "44-Grand Est": "08,10,51,52,54,55,57,67,68,88",
    "32-Hauts-de-France": "02,59,60,62,80",
    "11-Île-de-France": "75,77,78,91,92,93,94,95",
    "28-Normandie": "14,27,50,61,76",
    "75-Nouvelle-Aquitaine": "16,17,19,23,24,33,40,47,64,79,86,87",
    "76-Occitanie": "09,11,12,30,31,32,34,46,48,65,66,81,82",
    "52-Pays de la Loire": "44,49,53,72,85",
    "93-Provence-Alpes-Côte d'Azur": "04,05,06,13,83,84",
    "01-Guadeloupe": "971",
    "02-Martinique": "972",
    "03-Guyane": "973",
    "04-La Réunion": "974",
    "06-Mayotte": "976",
}

# The two public lookups are built with comprehensions rather than
# module-level ``for`` loops so that no loop variable is left behind in the
# module namespace (where ``from ... import *`` would re-export it).

# Region code -> region name. Each key is split on its FIRST hyphen only,
# because region names can themselves contain hyphens
# (e.g. "Centre-Val de Loire").
REGION_NAMES: dict[str, str] = {
    region_code: region_name
    for region_code, region_name in (
        entry.split("-", 1) for entry in _REGION_DEPTS
    )
}

# Department code -> region code.
DEPT_TO_REGION: dict[str, str] = {
    dept.strip(): entry.split("-", 1)[0]
    for entry, dept_list in _REGION_DEPTS.items()
    for dept in dept_list.split(",")
}
# Departments with no DVF data (Alsace-Moselle + Mayotte)
NO_DVF_DEPARTMENTS = {
    "57", "67", "68",  # Alsace-Moselle
    "976",  # Mayotte
}
# ---------------------------------------------------------------------------
# Top 10 cities by commune population (INSEE code → name)
# Source: INSEE Recensement de la population
# https://www.data.gouv.fr/datasets/population-municipale-des-communes-france-entiere
# Note: Strasbourg (67482) is in Alsace-Moselle — no DVF data available
# ---------------------------------------------------------------------------
# INSEE commune code -> city name, in the order listed by the source.
TOP_10_CITIES: dict[str, str] = {
    "75056": "Paris", "13055": "Marseille", "69123": "Lyon",
    "31555": "Toulouse", "06088": "Nice", "44109": "Nantes",
    "34172": "Montpellier",
    "67482": "Strasbourg",  # Alsace-Moselle: no DVF data for this commune
    "33063": "Bordeaux", "59350": "Lille",
}
# Paris, Lyon, Marseille have arrondissements — we need to map them back
# Arrondissement commune code -> code of the city it belongs to.
# Built from comprehensions rather than module-level ``for`` loops so that
# no loop counter is left behind in the module namespace (where
# ``from ... import *`` would re-export it).
ARRONDISSEMENT_MAPPING: dict[str, str] = {
    # Paris: 75101-75120 -> 75056
    **{f"751{n:02d}": "75056" for n in range(1, 21)},
    # Lyon: 69381-69389 -> 69123
    **{f"6938{n}": "69123" for n in range(1, 10)},
    # Marseille: 13201-13216 -> 13055
    **{f"132{n:02d}": "13055" for n in range(1, 17)},
}
# ---------------------------------------------------------------------------
# Aggregation levels
# ---------------------------------------------------------------------------
# Names of the aggregation levels, in the order they are listed here.
AGGREGATION_LEVELS = [
    "country", "region", "department",
    "commune", "postcode", "section",
]
|