oasis-web / src /pages /0_Summary.py
pradelf's picture
Update src/pages/0_Summary.py
90f0bcb verified
import streamlit as st
import asyncio
import pandas as pd
from functools import reduce
import plotly.express as px
from pages.utils.utils import (
async_load_file_s3,
async_load_geojson_from_s3,
_format_department_code,
async_load_file_s3_gzip,
)
from pages.utils.graphs import (
display_choropleth_map_country,
display_choropleth_map_for_department,
)
# Concurrent loader for the department-level inputs of the summary page.
async def _load_all_data_async_internal_departements():
    """Load the department-level inputs concurrently.

    Returns:
        The ``asyncio.gather`` result list, ordered like the requests below:
        department prices CSV, departments GeoJSON, commune reference CSV,
        gzipped department risk-score CSV.
    """
    pending_loads = (
        async_load_file_s3("processed/housing/dataset_departements_prices.csv"),
        async_load_geojson_from_s3("processed/referentiel/departements.geojson"),
        async_load_file_s3("processed/referentiel/ref_espace_communes.csv"),
        async_load_file_s3_gzip(
            "processed/risk-scores/risk-scores-departements-final.csv.gz"
        ),
    )
    # gather() preserves argument order, so callers can unpack positionally.
    return await asyncio.gather(*pending_loads)
# Concurrent loader for the commune-level inputs.
async def _load_all_data_async_internal_communes():
    """Load the commune-level inputs concurrently.

    Returns:
        The ``asyncio.gather`` result list, ordered like the requests below:
        housing prices CSV, communes GeoJSON, gzipped risk-score CSV.
    """
    pending_loads = (
        async_load_file_s3("processed/housing/dataset_housing_prices.csv"),
        async_load_geojson_from_s3("processed/referentiel/communes.geojson"),
        async_load_file_s3_gzip("processed/risk-scores/risk-scores-final.csv.gz"),
    )
    # gather() preserves argument order, so callers can unpack positionally.
    return await asyncio.gather(*pending_loads)
@st.cache_resource
def load_all_data_wrapper_summary_country():
    """Synchronously run the department-level async loader.

    Decorated with ``st.cache_resource`` so Streamlit caches the returned
    value instead of re-running the loads on every script execution.
    """
    department_loader = _load_all_data_async_internal_departements()
    return asyncio.run(department_loader)
@st.cache_resource
def load_all_data_wrapper_summary_region():
    """Synchronously run the commune-level async loader.

    Decorated with ``st.cache_resource`` so Streamlit caches the returned
    value instead of re-running the loads on every script execution.
    """
    commune_loader = _load_all_data_async_internal_communes()
    return asyncio.run(commune_loader)
# --- Streamlit App Layout ---
st.set_page_config(page_title="Oasis - Summary", page_icon=":money_with_wings:", layout="wide")

###############################################################################
st.header("Summary of Historical \"Good places\"")

with st.spinner("Loading data and preparing maps..."):
    # The cached loader yields its four results in this fixed order.
    (
        dataset_departements_housing_prices,
        departements_geojson,
        insee_df,
        dataset_departements_risks,
    ) = load_all_data_wrapper_summary_country()

    # Pass the risk table through _format_department_code, then attach the
    # risk score to every (code_departement, annee) price row. A left join
    # keeps price rows that have no matching risk entry.
    dataset_departements_risks = _format_department_code(dataset_departements_risks)
    _risk_columns = dataset_departements_risks[
        ["code_departement", "annee", "avg_risk_score"]
    ]
    dataset_departements_housing_prices = dataset_departements_housing_prices.merge(
        _risk_columns,
        on=["code_departement", "annee"],
        how="left",
    )

    # Min-max scale both inputs of the combined score to the 0-1 range,
    # price first and risk second.
    for _column in ("prixm2moyen", "avg_risk_score"):
        _values = dataset_departements_housing_prices[_column]
        _lowest = _values.min()
        _highest = _values.max()
        dataset_departements_housing_prices[_column] = (_values - _lowest) / (
            _highest - _lowest
        )

    # The combined score is the product of the scaled price and scaled risk.
    dataset_departements_housing_prices["combined_score"] = (
        dataset_departements_housing_prices["prixm2moyen"]
        * dataset_departements_housing_prices["avg_risk_score"]
    )

    st.subheader(
        "The summary between prices and global risks in France", divider=True
    )
    st.write(
        "This map summarizes the relationship between real estate prices and global risks in French departments."
    )

    fig_france = display_choropleth_map_country(
        dataset_departements_housing_prices,
        departements_geojson,
        metric_name="combined_score",
        metric_description="Summary of the relationship between prices and risks",
        red_gradient=True,
    )
    st.plotly_chart(fig_france, use_container_width=False)

    # NOTE(review): the caption below talks about prices, while the map above
    # plots combined_score - confirm the wording and the colour description.
    st.write(
        "Missing values are represented in light grey, while actual data is shown in a gradient from red (high prices) to green (low prices)."
    )
###############################################################################
st.subheader("Top & Bottom 5 Departments", divider=True)
st.write("Select a year to view the top and bottom departments by combined score.")

# Display label of the score column, shared by the tables and the bar charts.
_SCORE_LABEL = "Combined score (price and global risk)"
_RANKING_COLUMN_ORDER = ("Department Code", "Department Name", _SCORE_LABEL,)

# Compute the distinct years once; they feed both the options and the default.
_available_years = dataset_departements_housing_prices["annee"].unique()
_available_years_list = list(_available_years)
selected_year = st.selectbox(
    "Select a Year",
    options=_available_years,
    format_func=lambda x: f"{x}",
    # Default to 2024 when present, otherwise to the first available year.
    index=_available_years_list.index(2024) if 2024 in _available_years_list else 0,
)

# Build the code -> name lookup a single time. Calling _format_department_code
# inside the per-row lambdas re-ran it on the whole reference table twice for
# every department. keep="first" mirrors the previous `.values[0]` behaviour
# (first matching reference row wins).
_department_names = (
    pd.DataFrame(
        {
            "code": _format_department_code(insee_df)["code_departement"],
            "name": insee_df["nom_departement"],
        }
    )
    .drop_duplicates(subset="code", keep="first")
    .set_index("code")["name"]
    .to_dict()
)


def _rank_departements(scores, year, names_by_code, *, ascending, positive_only=False):
    """Return the first five departments of `year` ranked by mean combined score.

    Args:
        scores: frame with `annee`, `code_departement` and `combined_score`.
        year: value of `annee` to keep.
        names_by_code: mapping from department code to department name.
        ascending: sort direction of the mean combined score.
        positive_only: when true, drop rows whose combined score is not > 0
            before averaging.

    Returns:
        A frame with "Department Code", the score column and
        "Department Name" ("Unknown" when the code is not in `names_by_code`).
    """
    selection = scores["annee"] == year
    if positive_only:
        selection = selection & (scores["combined_score"] > 0)
    ranked = (
        scores[selection]
        .groupby("code_departement")["combined_score"]
        .mean()
        .reset_index()
        .sort_values(by="combined_score", ascending=ascending)
        .rename(
            columns={
                "code_departement": "Department Code",
                "combined_score": _SCORE_LABEL,
            }
        )
        # Only the five displayed rows need a name lookup.
        .head(5)
    )
    return ranked.assign(
        **{
            "Department Name": ranked["Department Code"].apply(
                lambda code: names_by_code.get(code, "Unknown")
            )
        }
    )


# Highest mean combined scores of the selected year.
top_departements = _rank_departements(
    dataset_departements_housing_prices,
    selected_year,
    _department_names,
    ascending=False,
)
# Lowest strictly positive combined scores of the selected year.
bottom_departements = _rank_departements(
    dataset_departements_housing_prices,
    selected_year,
    _department_names,
    ascending=True,
    positive_only=True,
)

left_co, right_co = st.columns(2)
with left_co:
    fig = px.bar(
        bottom_departements,
        x="Department Name",
        y=_SCORE_LABEL,
        title="Bottom 5 Departments",
    )
    st.plotly_chart(fig)
    st.dataframe(
        bottom_departements,
        hide_index=True,
        column_order=_RANKING_COLUMN_ORDER,
    )
with right_co:
    # The chart is drawn in ascending order; the table keeps the descending one.
    fig = px.bar(
        top_departements.sort_values(by=_SCORE_LABEL, ascending=True),
        x="Department Name",
        y=_SCORE_LABEL,
        title="Top 5 Departments",
    )
    st.plotly_chart(fig)
    st.dataframe(
        top_departements,
        hide_index=True,
        column_order=_RANKING_COLUMN_ORDER,
    )
###############################################################################
# st.subheader("Select Department(s) to View Historical Combined Scores", divider=True)
# with st.spinner("Loading data and preparing maps..."):
# (
# dataset_housing_prices,
# communes_geojson,
# dataset_risks,
# ) = load_all_data_wrapper_summary_region()
# # merge risks with housing prices (code_departement and annee)
# dataset_risks = _format_department_code(dataset_risks)
# dataset_housing_prices = dataset_housing_prices.merge(
# dataset_risks[["code_departement", "annee", "avg_risk_score"]],
# on=["code_departement", "annee"],
# how="left",
# )
# # scale the prixm2moyen to a range of 0-1 for better visualization
# dataset_housing_prices["prixm2moyen"] = (
# dataset_housing_prices["prixm2moyen"]
# - dataset_housing_prices["prixm2moyen"].min()
# ) / (
# dataset_housing_prices["prixm2moyen"].max()
# - dataset_housing_prices["prixm2moyen"].min()
# )
# # scale the avg_risk_score to a range of 0-1 for better visualization
# dataset_housing_prices["avg_risk_score"] = (
# dataset_housing_prices["avg_risk_score"]
# - dataset_housing_prices["avg_risk_score"].min()
# ) / (
# dataset_housing_prices["avg_risk_score"].max()
# - dataset_housing_prices["avg_risk_score"].min()
# )
# # combine prixm2moyen and avg_risk_score into a single column for visualization
# dataset_housing_prices["combined_score"] = (
# dataset_housing_prices["prixm2moyen"]
# * dataset_housing_prices["avg_risk_score"]
# )
# min_global_commune_avg_risk_score = dataset_housing_prices[
# dataset_housing_prices["combined_score"] > 0
# ]["combined_score"].min()
# max_global_commune_avg_risk_score = dataset_housing_prices["combined_score"].max()
# # Get all unique department codes for selectbox options
# all_departement_codes = _format_department_code(insee_df)["code_departement"].unique()
# # --- Department Selection 1 & 2 ---
# col_dept1, col_dept2 = st.columns(2)
# # Set default department values
# default_dept_1 = all_departement_codes[0] if len(all_departement_codes) > 0 else None
# default_dept_2 = all_departement_codes[1] if len(all_departement_codes) > 1 else None
# with col_dept1:
# selected_departement = st.selectbox(
# "Select the first Department",
# options=all_departement_codes,
# format_func=lambda x: f"{x} - {insee_df[insee_df['code_departement'] == x]['nom_departement'].values[0]}"
# if x in insee_df["code_departement"].values
# else x,
# key="departement_select_1",
# index=list(all_departement_codes).index(default_dept_1) if default_dept_1 else 0
# )
# with col_dept2:
# selected_departement_2 = st.selectbox(
# "Select the second Department (Optional for comparison)",
# options=[None] + list(all_departement_codes), # Add None option
# format_func=lambda x: f"{x} - {insee_df[insee_df['code_departement'] == x]['nom_departement'].values[0]}"
# if x is not None and x in insee_df["code_departement"].values
# else "None (Only show Department 1)",
# key="departement_select_2",
# index=list([None] + list(all_departement_codes)).index(default_dept_2) if default_dept_2 else 0
# )
# st.write("This chart shows the average price per square meter in the selected department(s) over the years, with a focus on climatic events.")
# # --- Data Preparation for Department Chart ---
# all_departments_to_plot = []
# # Process Department 1 data
# if selected_departement: # Ensure a department is selected
# department_data_1 = dataset_housing_prices[
# dataset_housing_prices["code_departement"] == selected_departement
# ].copy()
# if not department_data_1.empty:
# department_data_1["annee"] = department_data_1["annee"].astype(str)
# # Group by year and calculate mean for the department
# department_data_1 = department_data_1.groupby("annee")["combined_score"].mean().reset_index()
# # Get the department name for the legend
# departement_name_1 = (
# insee_df[insee_df["code_departement"] == selected_departement][
# "nom_departement"
# ].values[0]
# if selected_departement in insee_df["code_departement"].values
# else selected_departement
# )
# department_data_1.rename(columns={"combined_score": departement_name_1}, inplace=True)
# all_departments_to_plot.append(department_data_1)
# else:
# st.warning(f"No data available for Department 1: {selected_departement}")
# # Process Department 2 data if selected
# if (
# selected_departement_2 and selected_departement_2 != selected_departement
# ): # Ensure a valid second department is chosen and it's not the same as the first
# department_data_2 = dataset_housing_prices[
# dataset_housing_prices["code_departement"] == selected_departement_2
# ].copy()
# if not department_data_2.empty:
# department_data_2["annee"] = department_data_2["annee"].astype(str)
# # Group by year and calculate mean for the department
# department_data_2 = department_data_2.groupby("annee")["combined_score"].mean().reset_index()
# # Get the department name for the legend
# departement_name_2 = (
# insee_df[insee_df["code_departement"] == selected_departement_2][
# "nom_departement"
# ].values[0]
# if selected_departement_2 in insee_df["code_departement"].values
# else selected_departement_2
# )
# department_data_2.rename(columns={"combined_score": departement_name_2}, inplace=True)
# all_departments_to_plot.append(department_data_2)
# else:
# st.warning(f"No data available for Department 2: {selected_departement_2}")
# elif selected_departement_2 == selected_departement and selected_departement_2 is not None:
# st.info("You've selected the same department for both. Showing only one line.")
# # Combine dataframes for plotting the department comparison chart
# if all_departments_to_plot:
# combined_dept_df = reduce(
# lambda left, right: pd.merge(left, right, on="annee", how="outer"),
# all_departments_to_plot,
# )
# combined_dept_df.set_index("annee", inplace=True)
# fig_dept = px.line(
# combined_dept_df.reset_index(),
# x="annee",
# y=combined_dept_df.columns,
# title="Combined score (price and global risk)",
# labels={"annee": "Year", "value": "Combined score (price and global risk)"},
# )
# fig_dept.update_layout(
# xaxis_title="Year",
# yaxis_title="Combined score (price and global risk)",
# legend_title_text="Department",
# )
# st.plotly_chart(fig_dept, use_container_width=True)
# else:
# st.info("Please select at least one department to display data.")
# selected_departement_label = (f"{insee_df[insee_df['code_departement'] == selected_departement]['nom_departement'].values[0]}"
# if selected_departement in insee_df["code_departement"].values
# else selected_departement
# )
# # Prepare data for box plot
# box_plot_data = dataset_housing_prices[
# (dataset_housing_prices["code_departement"] == selected_departement)
# | (dataset_housing_prices["code_departement"] == selected_departement_2)
# ].copy()
# box_plot_data["annee"] = box_plot_data["annee"].astype(
# str
# ) # Ensure 'annee' is string for categorical x-axis
# # Create the box plot
# fig_box = px.box(
# box_plot_data,
# x="annee",
# y="combined_score",
# color="code_departement",
# title=f"Distribution of Prices in Department {selected_departement_label} by Year",
# )
# fig_box.update_layout(
# xaxis_title="Year",
# yaxis_title="Combined score (price and global risk)",
# )
# st.plotly_chart(fig_box, use_container_width=True)
# ###############################################################################
# # display the top 5 communes in the selected department
# st.subheader(
# f"Top and Bottom 5 Communes in Department {selected_departement_label}",
# divider=True,
# )
# # selected year
# selected_year_communes = st.selectbox(
# "Select a Year",
# options=dataset_housing_prices["annee"].unique(),
# format_func=lambda x: str(x),
# key="year_communes_selectbox",
# index=list(dataset_housing_prices["annee"].unique()).index(2024) if 2024 in dataset_housing_prices["annee"].unique() else 0
# )
# top_communes = (
# dataset_housing_prices[
# (dataset_housing_prices["code_departement"] == selected_departement)
# & (dataset_housing_prices["annee"] == selected_year_communes)
# ]
# .groupby("code_commune_insee")["combined_score"]
# .mean()
# .reset_index()
# .sort_values(by="combined_score", ascending=False)
# .rename(
# columns={
# "code_commune_insee": "Commune Code",
# "combined_score": "Combined score (price and global risk)",
# }
# )
# .assign(
# **{
# "Commune Name": lambda x: x["Commune Code"].apply(
# lambda code: insee_df[
# _format_department_code(insee_df)["code_commune_INSEE"] == code
# ]["nom_commune_complet"].values[0]
# if code
# in _format_department_code(insee_df)["code_commune_INSEE"].values
# else "Unknown"
# )
# }
# )
# .head(5)
# )
# bottom_communes = (
# dataset_housing_prices[
# (dataset_housing_prices["code_departement"] == selected_departement)
# & (dataset_housing_prices["annee"] == selected_year_communes)
# & (dataset_housing_prices["combined_score"] > 0)
# ]
# .groupby("code_commune_insee")["combined_score"]
# .mean()
# .reset_index()
# .sort_values(by="combined_score", ascending=True)
# .rename(
# columns={
# "code_commune_insee": "Commune Code",
# "combined_score": "Combined score (price and global risk)",
# }
# )
# .assign(
# **{
# "Commune Name": lambda x: x["Commune Code"].apply(
# lambda code: insee_df[
# _format_department_code(insee_df)["code_commune_INSEE"] == code
# ]["nom_commune_complet"].values[0]
# if code
# in _format_department_code(insee_df)["code_commune_INSEE"].values
# else "Unknown"
# )
# }
# )
# .head(5)
# )
# left_co, right_co = st.columns(2)
# with left_co:
# fig = px.bar(
# bottom_communes.sort_values(by="Combined score (price and global risk)", ascending=True),
# x="Commune Name",
# y="Combined score (price and global risk)",
# title="Bottom 5 Communes",
# )
# st.plotly_chart(fig)
# st.dataframe(
# bottom_communes,
# hide_index=True,
# column_order=("Commune Code", "Commune Name", "Combined score (price and global risk)"),
# )
# with right_co:
# fig = px.bar(
# top_communes.sort_values(by="Combined score (price and global risk)", ascending=True),
# x="Commune Name",
# y="Combined score (price and global risk)",
# title="Top 5 Communes",
# )
# st.plotly_chart(fig)
# st.dataframe(
# top_communes,
# hide_index=True,
# column_order=("Commune Code", "Commune Name", "Combined score (price and global risk)"),
# )
# ###############################################################################
# st.subheader(
# "Average Price per Square Meter in French Communes", divider=True
# )
# fig_department = display_choropleth_map_for_department(
# dataset_housing_prices,
# selected_departement,
# communes_geojson,
# min_global_commune_avg_risk_score,
# max_global_commune_avg_risk_score,
# title=f"Average Price per Square Meter in Department {selected_departement_label} (Animated by Year)",
# height_graph=1000,
# width_graph=1400,
# )
# st.plotly_chart(fig_department, use_container_width=False)
# st.write(
# "Missing values are represented in light grey, while actual data is shown in a gradient from red (high prices) to green (low prices)."
# )
# ###############################################################################
# st.subheader(f"Historical Price comparaison in Selected Commune in Departement {selected_departement_label}", divider=True)
# available_communes = dataset_housing_prices[
# dataset_housing_prices["code_departement"] == selected_departement
# ]["code_commune_insee"].unique()
# let_col1, right_col2 = st.columns(2)
# with let_col1:
# # --- Commune Selection 1 ---
# selected_commune_1 = st.selectbox(
# "Select the first Commune",
# options=available_communes,
# format_func=lambda x: f"{x} - {insee_df[insee_df['code_commune_INSEE'] == x]['nom_commune_complet'].values[0]}"
# if x in insee_df["code_commune_INSEE"].values
# else x,
# key="commune_select_1",
# index=0 if len(available_communes) > 1 else 0
# )
# with right_col2:
# # --- Commune Selection 2 ---
# selected_commune_2 = st.selectbox(
# "Select the second Commune (Optional for comparison)",
# options=[None]
# + list(available_communes), # Add None as an option to not select a second commune
# format_func=lambda x: f"{x} - {insee_df[insee_df['code_commune_INSEE'] == x]['nom_commune_complet'].values[0]}"
# if x is not None and x in insee_df["code_commune_INSEE"].values
# else "None (Only show Commune 1)",
# key="commune_select_2",
# index=2 if len(available_communes) > 2 else 0
# )
# # --- Data Preparation for Chart ---
# all_communes_to_plot = []
# # Process Commune 1 data
# commune_data_1 = dataset_housing_prices[
# dataset_housing_prices["code_commune_insee"] == selected_commune_1
# ].copy()
# if not commune_data_1.empty:
# commune_data_1["annee"] = commune_data_1["annee"].astype(str)
# commune_data_1 = commune_data_1.groupby("annee")["combined_score"].mean().reset_index()
# # Rename the price column to reflect the commune for the legend
# commune_name_1 = (
# insee_df[insee_df["code_commune_INSEE"] == selected_commune_1][
# "nom_commune_complet"
# ].values[0]
# if selected_commune_1 in insee_df["code_commune_INSEE"].values
# else selected_commune_1
# )
# commune_data_1.rename(columns={"combined_score": commune_name_1}, inplace=True)
# all_communes_to_plot.append(commune_data_1)
# else:
# st.warning(f"No data available for Commune 1: {selected_commune_1}")
# # Process Commune 2 data if selected
# if (
# selected_commune_2 and selected_commune_2 != selected_commune_1
# ): # Ensure a valid second commune is chosen and it's not the same as the first
# commune_data_2 = dataset_housing_prices[
# dataset_housing_prices["code_commune_insee"] == selected_commune_2
# ].copy()
# if not commune_data_2.empty:
# commune_data_2["annee"] = commune_data_2["annee"].astype(str)
# commune_data_2 = (
# commune_data_2.groupby("annee")["combined_score"].mean().reset_index()
# )
# # Rename the price column for the second commune
# commune_name_2 = (
# insee_df[insee_df["code_commune_INSEE"] == selected_commune_2][
# "nom_commune_complet"
# ].values[0]
# if selected_commune_2 in insee_df["code_commune_INSEE"].values
# else selected_commune_2
# )
# commune_data_2.rename(columns={"combined_score": commune_name_2}, inplace=True)
# all_communes_to_plot.append(commune_data_2)
# else:
# st.warning(f"No data available for Commune 2: {selected_commune_2}")
# elif selected_commune_2 == selected_commune_1 and selected_commune_2 is not None:
# st.info("You've selected the same commune for both. Showing only one line.")
# # Combine dataframes for plotting
# if all_communes_to_plot:
# # Use reduce or pd.merge to combine, ensuring 'annee' is the common key
# # Start with the first dataframe, then merge others
# combined_df = reduce(
# lambda left, right: pd.merge(left, right, on="annee", how="outer"),
# all_communes_to_plot,
# )
# combined_df.set_index("annee", inplace=True)
# st.line_chart(
# combined_df,
# use_container_width=True,
# height=400,
# x_label="Year",
# y_label="Combined score (price and global risk)",
# )
# else:
# st.info("Please select at least one commune to display data.")