Spaces:

dcrey7
/

realadvisor-challenge

Sleeping

App Files Files Community

realadvisor-challenge / src /top_cities.py

dcrey7

feat: add temporal weighting, remove commercial, expand to 2014-2025

c7a66d3 18 days ago

raw

history blame contribute delete

2.16 kB

	"""
	Compute price/m2 breakdown for the top 10 French cities.

	Responsibility: Produce a clean table of time-weighted median price per m2
	by property type for the largest cities.
	"""

	import json
	import logging
	from pathlib import Path

	import polars as pl

	from src.aggregator import _aggregate_group
	from src.config import AGGREGATED_DIR, TOP_10_CITIES, TYPE_LOCAL_SHORT

	# Module-level logger named after this module (via __name__), shared by the functions below.
	logger = logging.getLogger(__name__)


	def compute_top_cities(df: pl.DataFrame) -> dict[str, dict]:
	    """
	    Build per-city price statistics for every city listed in TOP_10_CITIES.

	    Rows are matched on ``code_commune_city``; the input is expected to have
	    arrondissement codes already folded into their parent city so that Paris,
	    Lyon and Marseille each aggregate as a single city.

	    Args:
	        df: Collected DataFrame. Must expose ``code_commune_city`` and
	            ``type_local``; presumably also ``prix_m2`` and ``temporal_weight``
	            for ``_aggregate_group`` (verify against that helper).

	    Returns:
	        Dict keyed by city name. Each value holds ``"code"`` (the city code),
	        ``"tous"`` (aggregate over all of the city's rows) and one entry per
	        TYPE_LOCAL_SHORT short name that has at least one row. Cities with no
	        rows are omitted and a warning is logged.
	    """
	    commune_col = pl.col("code_commune_city")

	    # Narrow the frame once to the tracked cities so the per-city filters
	    # below scan fewer rows.
	    tracked_rows = df.filter(commune_col.is_in(list(TOP_10_CITIES)))

	    stats_by_city: dict[str, dict] = {}
	    for code, name in TOP_10_CITIES.items():
	        rows = tracked_rows.filter(commune_col == code)
	        if rows.is_empty():
	            logger.warning("No data for %s (%s)", name, code)
	            continue

	        # "tous" covers every row of the city, regardless of property type.
	        summary: dict = {"code": code, "tous": _aggregate_group(rows)}

	        # Add one entry per property type, skipping types with no rows.
	        for type_label, type_key in TYPE_LOCAL_SHORT.items():
	            subset = rows.filter(pl.col("type_local") == type_label)
	            if not subset.is_empty():
	                summary[type_key] = _aggregate_group(subset)

	        stats_by_city[name] = summary

	    return stats_by_city


	def export_top_cities(data: dict[str, dict], output_dir: Path | None = None) -> None:
	    """
	    Write the top-cities statistics to ``top_cities.json``.

	    Args:
	        data: Mapping to serialise; its top-level entries are reported as
	            the number of cities in the log message.
	        output_dir: Directory to write into. It is created (including
	            parents) when missing. Falls back to AGGREGATED_DIR when omitted.
	    """
	    output_dir = output_dir or AGGREGATED_DIR
	    output_dir.mkdir(parents=True, exist_ok=True)

	    path = output_dir / "top_cities.json"
	    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
	    # encoding is pinned to UTF-8 instead of relying on the platform's
	    # locale default (which may not be able to encode them).
	    with path.open("w", encoding="utf-8") as f:
	        json.dump(data, f, ensure_ascii=False, indent=2)
	    logger.info("Exported: %s (%d cities)", path.name, len(data))