File size: 5,558 Bytes
ba08c19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bef62a
ba08c19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9eba1e1
 
 
 
ba08c19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Configuration constants for the DVF data pipeline.

Single source of truth for paths, URLs, thresholds, and mappings.
"""

from pathlib import Path

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# ROOT_DIR is the directory two levels above this file's resolved location;
# every other directory constant is derived from it.
ROOT_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
AGGREGATED_DIR = DATA_DIR.joinpath("aggregated")
SECTIONS_DIR = AGGREGATED_DIR.joinpath("sections")

# ---------------------------------------------------------------------------
# DVF data source
# ---------------------------------------------------------------------------
# Base location of the per-year geo-dvf CSV exports.
DVF_BASE_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv"
# Years 2020 through 2025 inclusive (range() excludes its end value).
# NOTE(review): six years are listed although geo-dvf is described as covering
# only the last 5 years — confirm the window against the upstream listing.
DVF_YEARS = [*range(2020, 2026)]


def dvf_url(year: int) -> str:
    """Build the download URL of the national geolocalized DVF CSV for *year*.

    The result is ``<DVF_BASE_URL>/<year>/full.csv.gz``. *year* is not
    validated against ``DVF_YEARS``.
    """
    archive_name = "full.csv.gz"
    return f"{DVF_BASE_URL}/{year}/{archive_name}"

# ---------------------------------------------------------------------------
# Columns we actually need (saves memory on load)
# ---------------------------------------------------------------------------
# Names of the DVF CSV columns to keep, in their original order.
DVF_COLUMNS = [
    "id_mutation", "date_mutation", "nature_mutation", "valeur_fonciere",
    "code_postal", "code_commune", "nom_commune", "code_departement",
    "id_parcelle", "code_type_local", "type_local", "surface_reelle_bati",
    "nombre_pieces_principales", "nombre_lots", "longitude", "latitude",
]

# ---------------------------------------------------------------------------
# Filtering thresholds
# ---------------------------------------------------------------------------
# Transaction nature treated as valid (presumably compared against the
# nature_mutation column — confirm in the filtering code).
VALID_NATURE_MUTATION = "Vente"

# Residential property types only, per Carlos's feedback.
VALID_TYPE_LOCAL = ["Appartement", "Maison"]

# Lower-case short label for each valid property type.
TYPE_LOCAL_SHORT = dict(Appartement="appartement", Maison="maison")

# ---------------------------------------------------------------------------
# Temporal weighting parameters
# ---------------------------------------------------------------------------
# Anchor date for temporal decay.
REFERENCE_DATE = "2025-01-01"
# Monthly decay factor; 0.97 ** 23 ≈ 0.50, i.e. a half-life of ~23 months.
TEMPORAL_LAMBDA = 0.97
# Fraction trimmed from each tail for the trimmed mean (20%).
TRIM_FRACTION = 0.20

# Price per m² bounds for outlier removal.
# €/m² — anything below the minimum is almost certainly an error.
PRICE_M2_MIN = 200
# €/m² — anything above the maximum is extreme luxury or an error.
PRICE_M2_MAX = 25_000
# m² — below 9 m² is legally not habitable in France.
SURFACE_MIN = 9
# m² — above this for a single unit is suspect.
SURFACE_MAX = 1000

# ---------------------------------------------------------------------------
# Department → Region mapping (2016 reform)
# ---------------------------------------------------------------------------
# Source table for the two lookups below. Each key is
# "<region code>-<region name>" and each value is the comma-separated list of
# department codes that belong to that region.
_REGION_DEPTS = {
    "84-Auvergne-Rhône-Alpes": "01,03,07,15,26,38,42,43,63,69,73,74",
    "27-Bourgogne-Franche-Comté": "21,25,39,58,70,71,89,90",
    "53-Bretagne": "22,29,35,56",
    "24-Centre-Val de Loire": "18,28,36,37,41,45",
    "94-Corse": "2A,2B",
    "44-Grand Est": "08,10,51,52,54,55,57,67,68,88",
    "32-Hauts-de-France": "02,59,60,62,80",
    "11-Île-de-France": "75,77,78,91,92,93,94,95",
    "28-Normandie": "14,27,50,61,76",
    "75-Nouvelle-Aquitaine": "16,17,19,23,24,33,40,47,64,79,86,87",
    "76-Occitanie": "09,11,12,30,31,32,34,46,48,65,66,81,82",
    "52-Pays de la Loire": "44,49,53,72,85",
    "93-Provence-Alpes-Côte d'Azur": "04,05,06,13,83,84",
    "01-Guadeloupe": "971",
    "02-Martinique": "972",
    "03-Guyane": "973",
    "04-La Réunion": "974",
    "06-Mayotte": "976",
}

# Both lookups are built with comprehensions rather than module-level `for`
# loops so that no temporary loop variable ends up as a public module
# attribute (and therefore in `from <module> import *`).

# Region code → region name. Only the first "-" separates code from name, so
# hyphenated names such as "Auvergne-Rhône-Alpes" stay intact.
REGION_NAMES: dict[str, str] = dict(
    region_key.split("-", 1) for region_key in _REGION_DEPTS
)

# Department code → region code, in the same order as the table above.
DEPT_TO_REGION: dict[str, str] = {
    dept_code.strip(): region_key.split("-", 1)[0]
    for region_key, dept_codes in _REGION_DEPTS.items()
    for dept_code in dept_codes.split(",")
}

# Department codes for which no DVF data exists.
NO_DVF_DEPARTMENTS = {
    # Alsace-Moselle
    "57",
    "67",
    "68",
    # Mayotte
    "976",
}

# ---------------------------------------------------------------------------
# Top 10 cities by commune population (INSEE code → name)
# Source: INSEE Recensement de la population
# https://www.data.gouv.fr/datasets/population-municipale-des-communes-france-entiere
# Note: Strasbourg (67482) is in Alsace-Moselle — no DVF data available
# ---------------------------------------------------------------------------
TOP_10_CITIES: dict[str, str] = {
    "75056": "Paris", "13055": "Marseille",
    "69123": "Lyon", "31555": "Toulouse",
    "06088": "Nice", "44109": "Nantes",
    # Strasbourg stays in the list even though Alsace-Moselle has no DVF data.
    "34172": "Montpellier", "67482": "Strasbourg",
    "33063": "Bordeaux", "59350": "Lille",
}

# Paris, Lyon and Marseille are split into arrondissements that carry their
# own INSEE codes; this maps each arrondissement code back to the code of the
# whole commune. Built from comprehensions so that no loop index is left
# behind as a module attribute.
ARRONDISSEMENT_MAPPING: dict[str, str] = {
    # Paris: 75101-75120 → 75056
    **{f"751{number:02d}": "75056" for number in range(1, 21)},
    # Lyon: 69381-69389 → 69123
    **{f"6938{number}": "69123" for number in range(1, 10)},
    # Marseille: 13201-13216 → 13055
    **{f"132{number:02d}": "13055" for number in range(1, 17)},
}

# ---------------------------------------------------------------------------
# Aggregation levels
# ---------------------------------------------------------------------------
# Names of the aggregation levels, listed from "country" down to "section".
AGGREGATION_LEVELS = [
    "country", "region", "department",
    "commune", "postcode", "section",
]