File size: 7,686 Bytes
9eba1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d554e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9eba1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d554e
9eba1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d554e
9eba1e1
92d554e
9eba1e1
 
 
 
 
 
 
 
 
 
 
 
92d554e
9eba1e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Build clean postcode GeoJSON by dissolving commune boundaries by postcode.

Downloads commune contours from geo.api.gouv.fr (per department),
maps each commune to its postcode(s), and dissolves geometries by postcode.
This produces non-overlapping postcode polygons.

Key design decision: each commune is assigned to ONE postcode only (the first
in sorted order) to prevent overlapping boundaries. As a consequence, a
postcode that is never the first one for any commune gets no polygon in the
output. For Paris/Lyon/Marseille, arrondissement boundaries are used instead
of the meta-commune.
"""

import json
import time
import urllib.request
import geopandas as gpd
import pandas as pd
from shapely.ops import unary_union
from pathlib import Path

# Where the final GeoJSON is written: data/aggregated/postcodes.geojson under
# the parent of this script's directory.
OUTPUT_PATH = Path(__file__).parent.parent.joinpath(
    "data", "aggregated", "postcodes.geojson"
)

# Department codes to download, in fetch order (metropolitan + DOM).
# The numeric runs are generated; "2A"/"2B" sit where "20" would be.
DEPT_CODES = [
    *(f"{n:02d}" for n in range(1, 20)),  # "01" .. "19"
    *(str(n) for n in range(21, 30)),     # "21" .. "29"
    "2A", "2B",
    *(str(n) for n in range(30, 96)),     # "30" .. "95"
    "971", "972", "973", "974", "976",    # DOM
]

# INSEE codes of the Paris/Lyon/Marseille meta-communes, mapped to the city
# name used in progress messages. These are skipped in the per-department pass
# and replaced by their arrondissements.
ARRONDISSEMENT_CITIES = {
    "75056": "Paris",
    "69123": "Lyon",
    "13055": "Marseille",
}


def fetch_url(url: str, retries: int = 3) -> dict | None:
    """Fetch JSON from a URL with retries."""
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
            with urllib.request.urlopen(req, timeout=60) as resp:
                return json.loads(resp.read().decode())
        except Exception as e:
            if attempt < retries - 1:
                wait = 2 ** attempt
                print(f"  retry {attempt+1} in {wait}s...", end=" ", flush=True)
                time.sleep(wait)
            else:
                print(f"  WARNING: Failed to fetch {url}: {e}")
                return None


def fetch_dept_communes(dept_code: str) -> list[dict]:
    """Return the commune features (with contours) of one department.

    Queries geo.api.gouv.fr for the department's communes in GeoJSON form and
    returns the ``features`` list of the response. An empty list is returned
    when the download fails or the response has no ``features`` key.
    """
    endpoint = f"https://geo.api.gouv.fr/departements/{dept_code}/communes"
    query = "?fields=code,nom,codesPostaux,contour&format=geojson&geometry=contour"
    payload = fetch_url(endpoint + query)
    if not payload:
        return []
    return payload.get("features", [])


def fetch_arrondissements(city_code: str) -> list[dict]:
    """Return arrondissement features for Paris/Lyon/Marseille.

    Requests the commune endpoint for ``city_code`` and normalises the answer
    to a list of GeoJSON features: the ``features`` list when one is present,
    a one-element list when the response is a single ``Feature``, otherwise
    an empty list (including when the download fails).
    """
    payload = fetch_url(
        f"https://geo.api.gouv.fr/communes/{city_code}"
        "?type=arrondissement-municipal"
        "&fields=code,nom,codesPostaux,contour&format=geojson"
    )
    if not payload:
        return []
    if "features" in payload:
        return payload["features"]
    # The API might answer with a single Feature rather than a collection.
    if payload.get("type") == "Feature":
        return [payload]
    return []


def fetch_arrondissements_list(city_code: str) -> list[dict]:
    """Return the arrondissements of a city as a list of GeoJSON features.

    Lists the communes of type ``arrondissement-municipal`` whose parent is
    ``city_code``. An empty list is returned when the download fails or the
    response carries no ``features`` key.
    """
    payload = fetch_url(
        "https://geo.api.gouv.fr/communes"
        f"?codeParent={city_code}&type=arrondissement-municipal"
        "&fields=code,nom,codesPostaux,contour&format=geojson&geometry=contour"
    )
    return payload["features"] if payload and "features" in payload else []


def _collect_postcode_rows(features, skip_codes=frozenset()):
    """Convert API features into ``{"codePostal", "geometry"}`` rows.

    Features without a geometry or without postcodes are dropped, as are
    features whose ``code`` property is in ``skip_codes``. Each remaining
    feature is assigned to ONE postcode only (the first in sorted order) so
    that multi-postcode communes do not produce overlapping boundaries.
    """
    rows = []
    for feat in features:
        geom = feat.get("geometry")
        props = feat.get("properties") or {}
        postcodes = props.get("codesPostaux") or []
        if not geom or not postcodes:
            continue
        if props.get("code", "") in skip_codes:
            continue
        # min() picks the same element as sorted(...)[0].
        rows.append({"codePostal": min(postcodes), "geometry": geom})
    return rows


def _round_coords(coords, ndigits=5):
    """Recursively round every number in a nested GeoJSON coordinate array.

    Empty arrays are returned as empty lists instead of raising.
    """
    if isinstance(coords, (int, float)):
        return round(coords, ndigits)
    return [_round_coords(c, ndigits) for c in coords]


def build_postcode_geojson():
    """Main pipeline: download communes, dissolve by postcode, export GeoJSON.

    Steps:
        1. Download the communes of every department in ``DEPT_CODES``
           (skipping the Paris/Lyon/Marseille meta-communes) and the
           arrondissements of the cities in ``ARRONDISSEMENT_CITIES``.
        2. Build a GeoDataFrame with one row per commune/arrondissement.
        3. Dissolve the rows by postcode.
        4. Simplify the dissolved geometries.
        5. Write ``OUTPUT_PATH``, then rewrite it with coordinates rounded to
           5 decimals and compact JSON separators.

    Progress is reported on stdout. Departments or cities for which nothing
    was returned are listed in a warning but do not abort the run.

    Raises:
        RuntimeError: If no usable feature could be downloaded at all.
    """
    print("=== Building clean postcode boundaries ===\n")

    # Step 1: Download all commune features with their postcodes
    all_rows = []  # one {"codePostal", "geometry"} dict per commune/arrondissement
    missing = []   # labels of departments/cities that returned no features
    skip_codes = set(ARRONDISSEMENT_CITIES)

    for i, dept in enumerate(DEPT_CODES):
        print(f"[{i+1}/{len(DEPT_CODES)}] Fetching dept {dept}...", end=" ", flush=True)
        features = fetch_dept_communes(dept)
        print(f"{len(features)} communes")
        if not features:
            missing.append(f"dept {dept}")

        # Meta-communes (Paris/Lyon/Marseille) are skipped here and handled
        # via their arrondissements below.
        all_rows.extend(_collect_postcode_rows(features, skip_codes))

        # Be polite to the API
        time.sleep(0.3)

    # Step 1b: Fetch arrondissements for Paris/Lyon/Marseille
    for city_code, city_name in ARRONDISSEMENT_CITIES.items():
        print(f"Fetching {city_name} arrondissements...", end=" ", flush=True)
        features = fetch_arrondissements_list(city_code)
        print(f"{len(features)} arrondissements")
        if not features:
            missing.append(city_name)

        # Each arrondissement typically maps to one postcode
        all_rows.extend(_collect_postcode_rows(features))

        time.sleep(0.3)

    print(f"\nTotal communes/arrondissements fetched: {len(all_rows)}")
    print(f"Total rows: {len(all_rows)}")
    if missing:
        # Make incomplete output visible instead of silently dropping areas.
        print(f"WARNING: no features returned for: {', '.join(missing)}")

    if not all_rows:
        # Without rows the GeoDataFrame has no 'codePostal' column and the
        # steps below would fail with an unrelated-looking KeyError.
        raise RuntimeError("No commune features could be downloaded; nothing to export")

    # Step 2: Build GeoDataFrame
    print("\nBuilding GeoDataFrame...")
    gdf = gpd.GeoDataFrame.from_features(
        [
            {
                "type": "Feature",
                "geometry": row["geometry"],
                "properties": {"codePostal": row["codePostal"]},
            }
            for row in all_rows
        ],
        crs="EPSG:4326",
    )
    print(f"  Shape: {gdf.shape}")
    print(f"  Unique postcodes: {gdf['codePostal'].nunique()}")

    # Step 3: Dissolve by postcode (union geometries from different communes)
    print("\nDissolving by postcode...")
    dissolved = gdf.dissolve(by="codePostal", aggfunc="first").reset_index()
    print(f"  Dissolved features: {len(dissolved)}")

    # Step 4: Simplify geometries for web (tolerance ~55m, smooth at zoom 12-13)
    # NOTE(review): each polygon is simplified independently, so shared
    # borders may no longer match exactly between neighbours; confirm this
    # is acceptable for the consumer.
    print("\nSimplifying geometries...")
    dissolved["geometry"] = dissolved["geometry"].simplify(tolerance=0.0005, preserve_topology=True)

    # Step 5: Export
    print(f"\nExporting to {OUTPUT_PATH}...")
    # The target directory may not exist yet on a fresh checkout.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    dissolved.to_file(OUTPUT_PATH, driver="GeoJSON")

    # Post-process: reduce coordinate precision
    print("Post-processing: reducing coordinate precision...")
    with open(OUTPUT_PATH, encoding="utf-8") as src:
        geojson = json.load(src)

    for feature in geojson["features"]:
        geometry = feature.get("geometry")
        # Null geometries and geometry types without a "coordinates" member
        # are left untouched rather than raising.
        if geometry and "coordinates" in geometry:
            geometry["coordinates"] = _round_coords(geometry["coordinates"])

    with open(OUTPUT_PATH, "w", encoding="utf-8") as dst:
        json.dump(geojson, dst, separators=(",", ":"))

    final_size = OUTPUT_PATH.stat().st_size / (1024 * 1024)
    print(f"\nDone! Output: {OUTPUT_PATH}")
    print(f"  Features: {len(geojson['features'])}")
    print(f"  File size: {final_size:.1f} MB")


if __name__ == "__main__":
    build_postcode_geojson()