realadvisor-challenge / src /top_cities.py
dcrey7's picture
feat: add temporal weighting, remove commercial, expand to 2014-2025
c7a66d3
"""
Compute price/m2 breakdown for the top 10 French cities.
Responsibility: Produce a clean table of time-weighted median price per m2
by property type for the largest cities.
"""
import json
import logging
from pathlib import Path
import polars as pl
from src.aggregator import _aggregate_group
from src.config import AGGREGATED_DIR, TOP_10_CITIES, TYPE_LOCAL_SHORT
logger = logging.getLogger(__name__)
def compute_top_cities(df: pl.DataFrame) -> dict[str, dict]:
    """
    Build price statistics for every city configured in TOP_10_CITIES.

    Rows are matched on ``code_commune_city``. That column is expected to hold
    the parent-city code for arrondissements, so Paris, Lyon and Marseille
    aggregate as single cities (the mapping itself happens upstream, not here).

    Args:
        df: Collected DataFrame. This function reads the ``code_commune_city``
            and ``type_local`` columns; ``prix_m2`` and ``temporal_weight``
            are presumably consumed by ``_aggregate_group`` (not visible here).

    Returns:
        Nested dict keyed by city name, e.g.
        {"Paris": {"code": "75056", "tous": {...}, ...}, ...}.
        Cities with no rows are skipped (a warning is logged), and a
        property-type key is only present when that type has rows.
    """
    commune_col = pl.col("code_commune_city")

    # Narrow to the configured cities once so each per-city filter below
    # scans a smaller frame.
    top_rows = df.filter(commune_col.is_in(list(TOP_10_CITIES.keys())))

    stats_by_city: dict[str, dict] = {}
    for code, name in TOP_10_CITIES.items():
        rows = top_rows.filter(commune_col == code)
        if rows.height == 0:
            logger.warning("No data for %s (%s)", name, code)
            continue

        # "tous" covers every row of the city, regardless of property type.
        stats: dict = {"code": code, "tous": _aggregate_group(rows)}

        # One extra key per property type that actually has rows.
        for type_label, type_key in TYPE_LOCAL_SHORT.items():
            typed_rows = rows.filter(pl.col("type_local") == type_label)
            if typed_rows.height > 0:
                stats[type_key] = _aggregate_group(typed_rows)

        stats_by_city[name] = stats

    return stats_by_city
def export_top_cities(data: dict[str, dict], output_dir: Path | None = None) -> None:
    """
    Export top cities data to ``top_cities.json``.

    Args:
        data: Mapping of city name to its statistics; must be JSON-serializable.
        output_dir: Directory to write into (created, with parents, if it does
            not exist). Falls back to AGGREGATED_DIR when not provided.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
        TypeError: If ``data`` contains values ``json.dump`` cannot serialize.
    """
    output_dir = output_dir or AGGREGATED_DIR
    output_dir.mkdir(parents=True, exist_ok=True)
    path = output_dir / "top_cities.json"

    # ensure_ascii=False writes accented characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform's locale default
    # (which would yield invalid JSON or a UnicodeEncodeError on e.g. cp1252).
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    logger.info("Exported: %s (%d cities)", path.name, len(data))