feat/exploratory-data-analysis (#2)
Browse files- ✨ Add the prices historical graphs (b77833c647678df675d9f6d1c71729160ca68153)
- .streamlit/config.toml +3 -0
- Dockerfile +2 -1
- requirements.txt +3 -1
- src/Home.py +21 -0
- src/app.py +0 -40
- src/pages/1_Historical_Price_-_France.py +190 -0
- src/pages/2_Historical_Price_-_Region.py +238 -0
- src/pages/3_About_Us.py +24 -0
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
|
| 3 |
+
maxMessageSize = 300
|
Dockerfile
CHANGED
|
@@ -18,10 +18,11 @@ COPY --chown=user requirements.txt ./
|
|
| 18 |
RUN pip3 install -r requirements.txt
|
| 19 |
|
| 20 |
COPY --chown=user README.md ./
|
|
|
|
| 21 |
COPY --chown=user src/ ./src/
|
| 22 |
|
| 23 |
EXPOSE 8501
|
| 24 |
|
| 25 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 26 |
|
| 27 |
-
ENTRYPOINT ["streamlit", "run", "src/
|
|
|
|
| 18 |
RUN pip3 install -r requirements.txt
|
| 19 |
|
| 20 |
COPY --chown=user README.md ./
|
| 21 |
+
COPY --chown=user .streamlit/ ./.streamlit/
|
| 22 |
COPY --chown=user src/ ./src/
|
| 23 |
|
| 24 |
EXPOSE 8501
|
| 25 |
|
| 26 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 27 |
|
| 28 |
+
ENTRYPOINT ["streamlit", "run", "src/Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
altair
|
| 2 |
pandas
|
| 3 |
-
streamlit
|
|
|
|
|
|
|
|
|
| 1 |
altair
|
| 2 |
pandas
|
| 3 |
+
streamlit
|
| 4 |
+
boto3
|
| 5 |
+
plotly
|
src/Home.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Home.py
|
| 2 |
+
import streamlit as st
|
| 3 |
+
|
| 4 |
+
st.set_page_config(page_title="Multi-Page App Home", page_icon="🏠", layout="centered")
|
| 5 |
+
|
| 6 |
+
# This project aims to predict real estate prices, primarily focusing on the impact of **climatic events**. Our goal is to identify **safe and profitable locations** by analyzing how various weather and climate patterns influence property values. As the project evolves, we plan to incorporate other significant events that might affect real estate prices.
|
| 7 |
+
|
| 8 |
+
st.title("Welcome to Oasis! 🏠")
|
| 9 |
+
st.write(
|
| 10 |
+
"""
|
| 11 |
+
Oasis is a project designed to predict real estate prices, focusing on the impact of climatic events. Our goal is to identify safe and profitable locations by analyzing how various weather and climate patterns influence property values.
|
| 12 |
+
|
| 13 |
+
**How this works:**
|
| 14 |
+
1. **Data Collection:** We gather data on real estate prices and climatic events.
|
| 15 |
+
2. **Data Analysis:** We analyze the data to understand how different climatic factors affect property values.
|
| 16 |
+
3. **Model Training:** We train machine learning models to predict real estate prices based on climatic conditions and climatic conditions.
|
| 17 |
+
4. **Location Assessment:** We assess locations for safety and profitability based on our predictions.
|
| 18 |
+
"""
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
st.info("👈 Select a page from the sidebar to get started!")
|
src/app.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import streamlit as st
|
| 5 |
-
|
| 6 |
-
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
-
|
| 9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
-
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
| 17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/pages/1_Historical_Price_-_France.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import os
|
| 4 |
+
import boto3
|
| 5 |
+
import json
|
| 6 |
+
import urllib.request
|
| 7 |
+
import io
|
| 8 |
+
import plotly.colors as pcolors
|
| 9 |
+
import plotly.express as px
|
| 10 |
+
|
| 11 |
+
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET", "oasis-prd-001")
|
| 12 |
+
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
|
| 13 |
+
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
|
| 14 |
+
|
| 15 |
+
st.set_page_config(page_title="Oasis", page_icon=":house:", layout="wide")
|
| 16 |
+
|
| 17 |
+
st.header("Historical Price - France")
|
| 18 |
+
st.subheader("An overview of real estate prices in France from 2015 to 2024")
|
| 19 |
+
|
| 20 |
+
st.write(
|
| 21 |
+
"This map shows the average price per square meter in French departments over the years, with a focus on climatic events."
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_file_s3(object_key: str) -> pd.DataFrame:
|
| 26 |
+
"""Load a file from S3 and return its contents as a pandas DataFrame."""
|
| 27 |
+
if not AWS_S3_BUCKET or not AWS_ACCESS_KEY_ID or not AWS_SECRET_ACCESS_KEY:
|
| 28 |
+
raise ValueError(
|
| 29 |
+
"AWS credentials or bucket name not set in environment variables."
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
s3_client = boto3.client(
|
| 33 |
+
"s3",
|
| 34 |
+
aws_access_key_id=AWS_ACCESS_KEY_ID,
|
| 35 |
+
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=object_key)
|
| 39 |
+
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
|
| 40 |
+
|
| 41 |
+
if status == 200:
|
| 42 |
+
return pd.read_csv(io.StringIO(response["Body"].read().decode("utf-8")))
|
| 43 |
+
raise ValueError(f"Unsuccessful S3 get_object response. Status - {status}")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@st.cache_data
|
| 47 |
+
def load_geojson():
|
| 48 |
+
geojson_url = "https://france-geojson.gregoiredavid.fr/repo/departements.geojson"
|
| 49 |
+
with urllib.request.urlopen(geojson_url) as response:
|
| 50 |
+
departements_geojson = json.load(response)
|
| 51 |
+
return departements_geojson
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@st.cache_data
|
| 55 |
+
def load_dataset_housing_prices():
|
| 56 |
+
df = load_file_s3("processed/housing/dataset_housing_prices.csv")
|
| 57 |
+
return df
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@st.cache_data
|
| 61 |
+
def load_dataset_housing_departement_prices_full():
|
| 62 |
+
df = load_file_s3("processed/housing/dataset_housing_departement_prices_full.csv")
|
| 63 |
+
return df
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
#####################################################################
|
| 67 |
+
# Data loading
|
| 68 |
+
#####################################################################
|
| 69 |
+
|
| 70 |
+
dataset_housing_prices = load_dataset_housing_prices()
|
| 71 |
+
dataset_housing_departement_prices_full = load_dataset_housing_departement_prices_full()
|
| 72 |
+
departements_geojson = load_geojson()
|
| 73 |
+
|
| 74 |
+
#####################################################################
|
| 75 |
+
# Data processing
|
| 76 |
+
#####################################################################
|
| 77 |
+
|
| 78 |
+
MISSING_VALUE_PLACEHOLDER = -1
|
| 79 |
+
dataset_departements_housing_prices = (
|
| 80 |
+
dataset_housing_prices.groupby(["code_departement", "annee"])["prixm2moyen"]
|
| 81 |
+
.mean()
|
| 82 |
+
.reset_index()
|
| 83 |
+
)
|
| 84 |
+
min_actual_departement_prixm2moyen = dataset_departements_housing_prices[
|
| 85 |
+
"prixm2moyen"
|
| 86 |
+
].min()
|
| 87 |
+
max_actual_departement_prixm2moyen = dataset_departements_housing_prices[
|
| 88 |
+
"prixm2moyen"
|
| 89 |
+
].max()
|
| 90 |
+
|
| 91 |
+
missing_rows = dataset_housing_departement_prices_full[
|
| 92 |
+
~dataset_housing_departement_prices_full.set_index(
|
| 93 |
+
["code_departement", "annee"]
|
| 94 |
+
).index.isin(
|
| 95 |
+
dataset_departements_housing_prices.set_index(
|
| 96 |
+
["code_departement", "annee"]
|
| 97 |
+
).index
|
| 98 |
+
)
|
| 99 |
+
]
|
| 100 |
+
|
| 101 |
+
missing_rows = missing_rows[["code_departement", "annee"]]
|
| 102 |
+
missing_rows["prixm2moyen"] = (
|
| 103 |
+
MISSING_VALUE_PLACEHOLDER # Set a default value for prixm2moyen
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
dataset_departements_housing_prices = pd.concat(
|
| 107 |
+
[dataset_departements_housing_prices, missing_rows], ignore_index=True
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
#####################################################################
|
| 111 |
+
# Graphical representation of the data
|
| 112 |
+
#####################################################################
|
| 113 |
+
|
| 114 |
+
color_range_min = MISSING_VALUE_PLACEHOLDER
|
| 115 |
+
color_range_max = max_actual_departement_prixm2moyen
|
| 116 |
+
|
| 117 |
+
normalized_min_actual = (min_actual_departement_prixm2moyen - color_range_min) / (
|
| 118 |
+
color_range_max - color_range_min
|
| 119 |
+
)
|
| 120 |
+
normalized_max_actual = (max_actual_departement_prixm2moyen - color_range_min) / (
|
| 121 |
+
color_range_max - color_range_min
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
custom_colorscale = []
|
| 125 |
+
# Add the color for missing values
|
| 126 |
+
custom_colorscale.append([0.0, "lightgrey"])
|
| 127 |
+
reversed_rdylgn_colors = pcolors.diverging.RdYlGn[::-1] # <--- Correct way to reverse
|
| 128 |
+
# Add the reversed RdYlGn colors for the actual data range
|
| 129 |
+
num_steps = len(reversed_rdylgn_colors)
|
| 130 |
+
for i, color in enumerate(reversed_rdylgn_colors):
|
| 131 |
+
normalized_point = normalized_min_actual + (
|
| 132 |
+
normalized_max_actual - normalized_min_actual
|
| 133 |
+
) * (i / (num_steps - 1))
|
| 134 |
+
if normalized_point > 0.0: # Ensure we don't overwrite the grey for missing
|
| 135 |
+
custom_colorscale.append([normalized_point, color])
|
| 136 |
+
|
| 137 |
+
# Sort the custom_colorscale by the normalized value to ensure correct order
|
| 138 |
+
custom_colorscale = sorted(custom_colorscale, key=lambda x: x[0])
|
| 139 |
+
|
| 140 |
+
fig = px.choropleth_map(
|
| 141 |
+
dataset_departements_housing_prices,
|
| 142 |
+
geojson=departements_geojson,
|
| 143 |
+
locations="code_departement",
|
| 144 |
+
featureidkey="properties.code",
|
| 145 |
+
color="prixm2moyen",
|
| 146 |
+
range_color=[
|
| 147 |
+
min_actual_departement_prixm2moyen,
|
| 148 |
+
max_actual_departement_prixm2moyen,
|
| 149 |
+
],
|
| 150 |
+
color_continuous_scale=custom_colorscale,
|
| 151 |
+
center={"lat": 46.6, "lon": 2.6},
|
| 152 |
+
zoom=5,
|
| 153 |
+
opacity=0.75,
|
| 154 |
+
hover_name="code_departement",
|
| 155 |
+
hover_data={
|
| 156 |
+
"prixm2moyen": ":.0f",
|
| 157 |
+
"annee": True, # Include year in hover data
|
| 158 |
+
},
|
| 159 |
+
title="Average Price per Square Meter in French Departments (2015-2024)",
|
| 160 |
+
height=1000,
|
| 161 |
+
animation_frame="annee",
|
| 162 |
+
animation_group="code_departement",
|
| 163 |
+
)
|
| 164 |
+
fig.update_traces(marker_line_width=0)
|
| 165 |
+
if fig.layout.updatemenus:
|
| 166 |
+
try:
|
| 167 |
+
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = (
|
| 168 |
+
1000 # milliseconds per frame
|
| 169 |
+
)
|
| 170 |
+
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = (
|
| 171 |
+
500 # transition duration
|
| 172 |
+
)
|
| 173 |
+
except IndexError:
|
| 174 |
+
print(
|
| 175 |
+
"Could not set animation speed. updatemenus structure might be unexpected."
|
| 176 |
+
)
|
| 177 |
+
else:
|
| 178 |
+
print(
|
| 179 |
+
"No animation updatemenus found. This usually means 'animation_frame' column has too few unique values or data issues."
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 183 |
+
|
| 184 |
+
st.write("Hover over the map to see detailed information for each department and year.")
|
| 185 |
+
st.write(
|
| 186 |
+
"Missing values are represented in light grey, while actual data is shown in a gradient from red (high prices) to green (low prices)."
|
| 187 |
+
)
|
| 188 |
+
st.write(
|
| 189 |
+
"Note: The color scale is customized to highlight missing values in light grey, while the actual data is represented using a reversed RdYlGn color scale, where red indicates higher prices and green indicates lower prices."
|
| 190 |
+
)
|
src/pages/2_Historical_Price_-_Region.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import os
|
| 4 |
+
import boto3
|
| 5 |
+
import json
|
| 6 |
+
import urllib.request
|
| 7 |
+
import io
|
| 8 |
+
import plotly.colors as pcolors
|
| 9 |
+
import plotly.express as px
|
| 10 |
+
|
| 11 |
+
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET", "oasis-prd-001")
|
| 12 |
+
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
|
| 13 |
+
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
|
| 14 |
+
|
| 15 |
+
# --- Streamlit Page Configuration ---
|
| 16 |
+
st.set_page_config(page_title="Oasis", page_icon=":house:", layout="wide")
|
| 17 |
+
|
| 18 |
+
st.header("Historical Price - Region")
|
| 19 |
+
st.subheader("An overview of real estate prices for each region from 2015 to 2024")
|
| 20 |
+
|
| 21 |
+
st.write(
|
| 22 |
+
"This map shows the average price per square meter for each commune over the years, with a focus on climatic events."
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# --- Data Loading Functions ---
|
| 26 |
+
def load_file_s3(object_key: str) -> pd.DataFrame:
|
| 27 |
+
"""Load a file from S3 and return its contents as a pandas DataFrame."""
|
| 28 |
+
if not AWS_S3_BUCKET or not AWS_ACCESS_KEY_ID or not AWS_SECRET_ACCESS_KEY:
|
| 29 |
+
raise ValueError(
|
| 30 |
+
"AWS credentials or bucket name not set in environment variables."
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
s3_client = boto3.client(
|
| 34 |
+
"s3",
|
| 35 |
+
aws_access_key_id=AWS_ACCESS_KEY_ID,
|
| 36 |
+
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=object_key)
|
| 40 |
+
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
|
| 41 |
+
|
| 42 |
+
if status == 200:
|
| 43 |
+
# Ensure proper decoding and file-like object for pandas
|
| 44 |
+
return pd.read_csv(io.StringIO(response["Body"].read().decode("utf-8")))
|
| 45 |
+
raise ValueError(f"Unsuccessful S3 get_object response. Status - {status}")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@st.cache_data
|
| 49 |
+
def load_geojson():
|
| 50 |
+
"""Loads GeoJSON data from a URL and caches it."""
|
| 51 |
+
geojson_url = "https://france-geojson.gregoiredavid.fr/repo/communes.geojson"
|
| 52 |
+
with urllib.request.urlopen(geojson_url) as response:
|
| 53 |
+
communes_geojson = json.load(response)
|
| 54 |
+
return communes_geojson
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@st.cache_data
|
| 58 |
+
def load_dataset_housing_prices():
|
| 59 |
+
"""Loads the main housing prices dataset from S3 and caches it."""
|
| 60 |
+
df = load_file_s3("processed/housing/dataset_housing_prices.csv")
|
| 61 |
+
return df
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@st.cache_data
|
| 65 |
+
def load_dataset_housing_prices_full():
|
| 66 |
+
"""Loads the full housing prices dataset from S3 and caches it."""
|
| 67 |
+
df = load_file_s3("processed/housing/dataset_housing_prices_full.csv")
|
| 68 |
+
return df
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# --- Data Preprocessing Function (NEW: Cached for efficiency) ---
|
| 72 |
+
@st.cache_data
|
| 73 |
+
def preprocess_housing_data(df_prices, df_full):
|
| 74 |
+
"""
|
| 75 |
+
Performs all necessary data preprocessing steps and caches the result.
|
| 76 |
+
This function will only re-run if df_prices or df_full change.
|
| 77 |
+
"""
|
| 78 |
+
MISSING_VALUE_PLACEHOLDER = -1
|
| 79 |
+
|
| 80 |
+
# Calculate min/max from the original (non-concatenated) dataset
|
| 81 |
+
min_actual_country_prixm2moyen = df_prices["prixm2moyen"].min()
|
| 82 |
+
max_actual_country_prixm2moyen = df_prices["prixm2moyen"].max()
|
| 83 |
+
|
| 84 |
+
# Identify missing rows from the full dataset
|
| 85 |
+
missing_rows = df_full[
|
| 86 |
+
~df_full.set_index(["code_commune_insee", "annee"]).index.isin(
|
| 87 |
+
df_prices.set_index(["code_commune_insee", "annee"]).index
|
| 88 |
+
)
|
| 89 |
+
]
|
| 90 |
+
missing_rows = missing_rows[["code_commune_insee", "annee"]]
|
| 91 |
+
missing_rows["prixm2moyen"] = MISSING_VALUE_PLACEHOLDER
|
| 92 |
+
|
| 93 |
+
# Concatenate and add department code
|
| 94 |
+
processed_df = pd.concat([df_prices, missing_rows], ignore_index=True)
|
| 95 |
+
processed_df["code_departement"] = processed_df["code_commune_insee"].str[:2]
|
| 96 |
+
|
| 97 |
+
return processed_df, min_actual_country_prixm2moyen, max_actual_country_prixm2moyen
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# --- Plotly Figure Creation Function (NEW: Cached for efficiency) ---
|
| 101 |
+
@st.cache_data
|
| 102 |
+
def create_animated_choropleth_map(
|
| 103 |
+
filtered_df,
|
| 104 |
+
communes_geojson,
|
| 105 |
+
min_actual_country_prixm2moyen,
|
| 106 |
+
max_actual_country_prixm2moyen,
|
| 107 |
+
):
|
| 108 |
+
"""
|
| 109 |
+
Creates and caches the Plotly choropleth map figure.
|
| 110 |
+
This function will only re-run if filtered_df, communes_geojson,
|
| 111 |
+
or the min/max price values change.
|
| 112 |
+
"""
|
| 113 |
+
MISSING_VALUE_PLACEHOLDER = -1 # Needs to be consistent with preprocessing
|
| 114 |
+
|
| 115 |
+
color_range_min = MISSING_VALUE_PLACEHOLDER
|
| 116 |
+
color_range_max = max_actual_country_prixm2moyen
|
| 117 |
+
|
| 118 |
+
# Normalize the actual min/max to a 0-1 scale for defining the custom colorscale points
|
| 119 |
+
normalized_min_actual = (min_actual_country_prixm2moyen - color_range_min) / (
|
| 120 |
+
color_range_max - color_range_min
|
| 121 |
+
)
|
| 122 |
+
normalized_max_actual = (max_actual_country_prixm2moyen - color_range_min) / (
|
| 123 |
+
color_range_max - color_range_min
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
custom_colorscale = []
|
| 127 |
+
# Add the color for missing values
|
| 128 |
+
custom_colorscale.append([0.0, "lightgrey"])
|
| 129 |
+
reversed_rdylgn_colors = pcolors.diverging.RdYlGn[::-1]
|
| 130 |
+
# Add the reversed RdYlGn colors for the actual data range
|
| 131 |
+
num_steps = len(reversed_rdylgn_colors)
|
| 132 |
+
for i, color in enumerate(reversed_rdylgn_colors):
|
| 133 |
+
normalized_point = normalized_min_actual + (
|
| 134 |
+
normalized_max_actual - normalized_min_actual
|
| 135 |
+
) * (i / (num_steps - 1))
|
| 136 |
+
if normalized_point > 0.0: # Ensure we don't overwrite the grey for missing
|
| 137 |
+
custom_colorscale.append([normalized_point, color])
|
| 138 |
+
|
| 139 |
+
# Sort the custom_colorscale by the normalized value to ensure correct order
|
| 140 |
+
custom_colorscale = sorted(custom_colorscale, key=lambda x: x[0])
|
| 141 |
+
|
| 142 |
+
fig = px.choropleth_map(
|
| 143 |
+
filtered_df,
|
| 144 |
+
geojson=communes_geojson,
|
| 145 |
+
locations="code_commune_insee",
|
| 146 |
+
featureidkey="properties.code",
|
| 147 |
+
color="prixm2moyen",
|
| 148 |
+
range_color=[min_actual_country_prixm2moyen, max_actual_country_prixm2moyen],
|
| 149 |
+
color_continuous_scale=custom_colorscale,
|
| 150 |
+
center={"lat": 46.6, "lon": 2.6},
|
| 151 |
+
zoom=5,
|
| 152 |
+
opacity=0.75,
|
| 153 |
+
hover_name="code_commune_insee",
|
| 154 |
+
hover_data={
|
| 155 |
+
"prixm2moyen": ":.0f",
|
| 156 |
+
"annee": True, # Include year in hover data
|
| 157 |
+
},
|
| 158 |
+
title="Average Price per Square Meter in French Communes (2015-2024)",
|
| 159 |
+
height=800,
|
| 160 |
+
animation_frame="annee",
|
| 161 |
+
animation_group="code_commune_insee",
|
| 162 |
+
)
|
| 163 |
+
fig.update_traces(marker_line_width=0)
|
| 164 |
+
|
| 165 |
+
# Set animation speed (error handling for robustness)
|
| 166 |
+
if fig.layout.updatemenus:
|
| 167 |
+
try:
|
| 168 |
+
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1500
|
| 169 |
+
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 500
|
| 170 |
+
except IndexError:
|
| 171 |
+
st.warning(
|
| 172 |
+
"Could not set animation speed. Updatemenus structure might be unexpected."
|
| 173 |
+
)
|
| 174 |
+
else:
|
| 175 |
+
st.warning(
|
| 176 |
+
"No animation updatemenus found. This usually means 'animation_frame' column has too few unique values or data issues."
|
| 177 |
+
)
|
| 178 |
+
return fig
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
#####################################################################
|
| 182 |
+
# Main Streamlit App Logic
|
| 183 |
+
#####################################################################
|
| 184 |
+
|
| 185 |
+
# Use st.spinner for initial loading and preprocessing
|
| 186 |
+
with st.spinner("Loading and preprocessing data... This might take a moment."):
|
| 187 |
+
# Load the raw datasets (these are cached)
|
| 188 |
+
dataset_housing_prices = load_dataset_housing_prices()
|
| 189 |
+
dataset_housing_prices_full = load_dataset_housing_prices_full()
|
| 190 |
+
communes_geojson = load_geojson()
|
| 191 |
+
|
| 192 |
+
# Preprocess the data (this result is cached)
|
| 193 |
+
(
|
| 194 |
+
processed_housing_data,
|
| 195 |
+
min_actual_country_prixm2moyen,
|
| 196 |
+
max_actual_country_prixm2moyen,
|
| 197 |
+
) = preprocess_housing_data(dataset_housing_prices, dataset_housing_prices_full)
|
| 198 |
+
|
| 199 |
+
# Dropdown for department selection (this interaction triggers a rerun)
|
| 200 |
+
st.subheader("Select a Department to View Commune Prices")
|
| 201 |
+
selected_departement = st.selectbox(
|
| 202 |
+
"Select a Department",
|
| 203 |
+
options=processed_housing_data["code_departement"].unique(),
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
# Filter data based on selected department (this happens on every rerun after selection)
|
| 207 |
+
# This filtering is fast on the already preprocessed data.
|
| 208 |
+
filtered_data_for_map = processed_housing_data[
|
| 209 |
+
processed_housing_data["code_departement"] == selected_departement
|
| 210 |
+
].copy() # Use .copy() to avoid SettingWithCopyWarning
|
| 211 |
+
filtered_data_for_map = filtered_data_for_map[
|
| 212 |
+
["code_commune_insee", "annee", "prixm2moyen"]
|
| 213 |
+
]
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# Create and display the choropleth map (this result is cached based on filtered_data_for_map)
|
| 217 |
+
fig = create_animated_choropleth_map(
|
| 218 |
+
filtered_data_for_map,
|
| 219 |
+
communes_geojson,
|
| 220 |
+
min_actual_country_prixm2moyen,
|
| 221 |
+
max_actual_country_prixm2moyen,
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
st.subheader("Average Price per Square Meter in French Communes (2015-2024)")
|
| 225 |
+
st.write(
|
| 226 |
+
"This map shows the average price per square meter in French communes over the years, with a focus on climatic events."
|
| 227 |
+
)
|
| 228 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 229 |
+
st.write("Hover over the map to see detailed information for each commune and year.")
|
| 230 |
+
st.write(
|
| 231 |
+
"Missing values are represented in light grey, while actual data is shown in a gradient from red (high prices) to green (low prices)."
|
| 232 |
+
)
|
| 233 |
+
st.write(
|
| 234 |
+
"Note: The color scale is customized to highlight missing values in light grey, while the actual data is represented using a reversed RdYlGn color scale, where red indicates higher prices and green indicates lower prices."
|
| 235 |
+
)
|
| 236 |
+
st.write(
|
| 237 |
+
"The map is animated by year, allowing you to see how the average price per square meter changes over time."
|
| 238 |
+
)
|
src/pages/3_About_Us.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Create an about page to present the team and the project in more detail
|
| 2 |
+
# The team includes:
|
| 3 |
+
# - Frederic, the project manager
|
| 4 |
+
# - Olivior, the data scientist
|
| 5 |
+
# - Nick, the developer
|
| 6 |
+
# - Faycel, the data engineer
|
| 7 |
+
# - Francis, the data analyst
|
| 8 |
+
|
| 9 |
+
import streamlit as st
|
| 10 |
+
|
| 11 |
+
st.set_page_config(page_title="About Us", page_icon="ℹ️", layout="centered")
|
| 12 |
+
st.title("ℹ️ About Us")
|
| 13 |
+
st.write(
|
| 14 |
+
"""
|
| 15 |
+
This ambitious project, Oasis, aims to predict real estate prices with a primary focus on the impact of climatic events. Our goal is to identify safe and profitable locations by analyzing how various weather and climate patterns influence property values.
|
| 16 |
+
|
| 17 |
+
## The Team
|
| 18 |
+
- Frederic, the project manager
|
| 19 |
+
- Olivior, the data scientist
|
| 20 |
+
- Nick, the developer
|
| 21 |
+
- Faycel, the data engineer
|
| 22 |
+
- Francis, the data analyst
|
| 23 |
+
"""
|
| 24 |
+
)
|