# Energy_Forecast / app.py
# (The following lines are Hugging Face Space page metadata captured by the
# scrape, preserved here as comments so the file remains valid Python:)
# LeemahLee's picture
# updated app.py
# b57e1ef verified
# raw / history blame / 18.8 kB
import os
import time
import pandas as pd
import numpy as np
import joblib
import glob
import requests
import streamlit as st
from streamlit_autorefresh import st_autorefresh
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from pytz import timezone
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats.mstats import winsorize
# === Streamlit Setup ===
# Page config must be the first Streamlit call in the script.
st.set_page_config(page_title="Energy Pulse Dashboard", layout="wide", page_icon="⚡")
# Re-run the whole script every 5000 ms so charts and metrics refresh live.
st_autorefresh(interval=5000, key="auto_refresh")
st.title(":zap: Energy Pulse: Real-Time Power Forecast")
st.markdown("Smarter Energy Monitoring for Smarter Grids")
# === Load Model ===
@st.cache_resource
def load_model():
    # Cached for the process lifetime: the pickled LightGBM regressor is read
    # from disk once and shared across reruns/sessions.
    # NOTE(review): joblib.load on an untrusted file executes arbitrary code —
    # fine here because the model ships with the app.
    return joblib.load("lightgbm_energy_model_full2.pkl")
model = load_model()
# === Supabase Config ===
# Credentials come from environment variables (host/Space secrets). Both are
# None when unset, which makes every REST call below fail with an auth error.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_API_KEY = os.environ.get("SUPABASE_API_KEY")
TABLE = "smart_meter_readings_1year"
# Supabase REST requires the key both as `apikey` and as a Bearer token.
HEADERS = {
    "apikey": SUPABASE_API_KEY,
    "Authorization": f"Bearer {SUPABASE_API_KEY}"
}
# === Pull Latest Data (Today Only) ===
@st.cache_data(ttl=300)
def fetch_today_data():
    """Fetch today's smart-meter rows (Europe/London day) from Supabase REST.

    Results are cached for 5 minutes (ttl=300). Returns an empty DataFrame on
    any network/HTTP failure or empty payload so the dashboard can degrade
    gracefully instead of crashing.
    """
    today = pd.Timestamp.now(tz=timezone("Europe/London")).strftime("%Y-%m-%d")
    url = f"{SUPABASE_URL}/rest/v1/{TABLE}?select=*&timestamp=gte.{today}T00:00:00&timestamp=lt.{today}T23:59:59&order=timestamp.asc"
    try:
        # Fix: the original call had no timeout, so a hung request would block
        # the whole app (and every auto-refresh) indefinitely.
        r = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as exc:
        st.error(f"Failed to fetch data: {exc}")
        return pd.DataFrame()
    if not r.ok:
        st.error(f"Failed to fetch data: {r.status_code}")
        return pd.DataFrame()
    df = pd.DataFrame(r.json())
    if df.empty:
        # Fix: an empty payload has no 'timestamp' column, which previously
        # raised KeyError here; return early instead.
        return df
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df
df = fetch_today_data()
# Fix: dropna(subset=...) raises KeyError on a columnless empty frame.
if not df.empty:
    df = df.dropna(subset=['region', 'property_type', 'power_consumption_kwh'])
# === Feature Engineering ===
def prepare_features(df):
    """Build the LightGBM feature matrix from today's raw readings.

    Expects columns: timestamp, power_consumption_kwh, voltage, humidity_pct,
    temperature_c, property_type, region. Returns a DataFrame with
    'timestamp' plus the 24 features in the exact order the model was trained
    on. Warm-up rows left NaN by the lag/rolling windows are dropped.
    """
    if df.empty:
        # Nothing to engineer — propagate the empty frame so callers can bail.
        return df
    df = df.copy()
    # --- calendar & cyclical encodings of the reading time ---
    df['hour'] = df['timestamp'].dt.hour
    df['minute_of_day'] = df['hour'] * 60 + df['timestamp'].dt.minute
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['sin_hour'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['cos_hour'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['sin_30min'] = np.sin(2 * np.pi * df['minute_of_day'] / 1440)
    df['cos_30min'] = np.cos(2 * np.pi * df['minute_of_day'] / 1440)
    # --- lag features; the names assume rows are spaced 30 minutes apart ---
    # NOTE(review): lags are taken over the whole frame, so interleaved
    # regions/property types would leak across series — confirm the upstream
    # data is effectively one series, or group before shifting.
    df['lag_30min'] = df['power_consumption_kwh'].shift(1)
    df['lag_1hr'] = df['power_consumption_kwh'].shift(2)
    df['lag_90min'] = df['power_consumption_kwh'].shift(3)
    df['lag_2hr'] = df['power_consumption_kwh'].shift(4)
    df['lag_mean_3'] = df['power_consumption_kwh'].shift(1).rolling(2).mean()
    df['lag_mean_6'] = df['power_consumption_kwh'].shift(1).rolling(4).mean()
    df['rolling_std_6'] = df['power_consumption_kwh'].rolling(6).std()
    df['interaction_lag_voltage'] = df['lag_1hr'] * df['voltage']
    # One-hot encode categoricals, then pad any level absent from today's data
    # so the model always sees the full dummy set.
    df = pd.get_dummies(df, columns=['property_type', 'region'], drop_first=False)
    for col in ['property_type_residential', 'property_type_commercial',
                'region_north', 'region_south', 'region_east', 'region_west']:
        if col not in df.columns:
            df[col] = 0
    # Forward-fill interior gaps, then drop the leading warm-up NaNs.
    # Fix: fillna(method='ffill') is deprecated and removed in pandas 3.0.
    df = df.ffill()
    df.dropna(inplace=True)
    # Exact training-time feature order expected by the model.
    expected = [
        'voltage', 'cos_hour', 'sin_hour', 'is_weekend', 'minute_of_day', 'cos_30min',
        'humidity_pct', 'lag_90min', 'temperature_c', 'lag_2hr', 'lag_mean_6', 'lag_mean_3',
        'lag_1hr', 'rolling_std_6', 'lag_30min', 'sin_30min', 'interaction_lag_voltage',
        'day_of_week', 'property_type_commercial', 'property_type_residential',
        'region_north', 'region_south', 'region_east', 'region_west'
    ]
    return df[['timestamp'] + expected]
# === Predict ===
df_features = prepare_features(df)
# Every column except 'timestamp' is a model input, already in training order.
feature_cols = [col for col in df_features.columns if col != 'timestamp']
expected_features = feature_cols  # Save correct order of features (reused by forecast_future_usage)
# Attach in-sample predictions for today's rows.
df_features['predicted_kwh'] = model.predict(df_features[feature_cols])
# === Helper Function for Future Forecasting ===
def forecast_future_usage(last_row, model, steps=(30, 60, 90), feature_order=None):
    """Recursively forecast consumption at `steps` minutes past the last reading.

    Each step re-derives the calendar/cyclical features for the future
    timestamp, rolls the lag features forward using the previous predictions,
    and feeds the row back through the model.

    Parameters
    ----------
    last_row : pd.Series
        Most recent feature row; must contain 'timestamp' plus the lag/mean
        columns referenced below.
    model : object with a .predict(2-D array) method
        Trained regressor expecting features in `feature_order`.
    steps : sequence of int, default (30, 60, 90)
        Forecast horizons in minutes (tuple default — the old mutable list
        default was an anti-pattern).
    feature_order : list of str, optional
        Column order fed to the model; defaults to the module-level
        `expected_features` captured after prepare_features().

    Returns
    -------
    (dict, list)
        {"<step> min": prediction} and the matching future timestamps.
        Bug fix: forecast_times was previously returned but never populated.
    """
    if feature_order is None:
        feature_order = expected_features
    preds = {}
    forecast_times = []
    # Seed the recursion with the three most recent observed lag values.
    prev_preds = [
        last_row['lag_30min'],
        last_row['lag_1hr'],
        last_row['lag_90min']
    ]
    for step in steps:
        future_time = last_row["timestamp"] + pd.Timedelta(minutes=step)
        row = last_row.copy()
        row["timestamp"] = future_time
        # Recompute time features for the future timestamp.
        row["hour"] = future_time.hour
        row["minute_of_day"] = row["hour"] * 60 + future_time.minute
        row["day_of_week"] = future_time.dayofweek
        row["is_weekend"] = int(future_time.dayofweek >= 5)
        row["sin_hour"] = np.sin(2 * np.pi * row["hour"] / 24)
        row["cos_hour"] = np.cos(2 * np.pi * row["hour"] / 24)
        row["sin_30min"] = np.sin(2 * np.pi * row["minute_of_day"] / 1440)
        row["cos_30min"] = np.cos(2 * np.pi * row["minute_of_day"] / 1440)
        # Shift the lag window: the newest prediction becomes lag_30min, etc.
        row["lag_30min"] = prev_preds[-1]
        row["lag_1hr"] = prev_preds[-2] if len(prev_preds) >= 2 else prev_preds[-1]
        row["lag_90min"] = prev_preds[-3] if len(prev_preds) >= 3 else prev_preds[-1]
        row["lag_mean_3"] = np.mean(prev_preds[-3:])
        row["lag_mean_6"] = np.mean(prev_preds[-3:] + [last_row["lag_mean_3"], last_row["lag_mean_6"]])
        row_features = row[feature_order].values.reshape(1, -1)
        pred = model.predict(row_features)[0]
        preds[f"{step} min"] = pred
        forecast_times.append(future_time)  # fix: collect the horizon timestamps
        prev_preds.append(pred)
    return preds, forecast_times
# === Tabs ===
# Four dashboard views; each `with tabs[i]:` section below renders one of them.
tabs = st.tabs(["🔮 Regional Forecast", "📊 Performance Monitor", "📈 Usage Pattern", "🚗🌞 Electric Vehicle & Solar"])
# === Regional Forecast Tab ==
with tabs[0]:
    st.subheader("Regional Forecast for Next Hours")
    # Region filter with bold styling
    st.markdown("<h4 style='margin-bottom: 0;'>📍 <b>Select a Region</b></h4>", unsafe_allow_html=True)
    selected_region = st.selectbox(" ", ['south', 'east', 'north', 'west'])
    # One forecast panel per property type for the chosen region.
    for ptype in ['residential', 'commercial']:
        temp_df = df_features.copy()
        region_col = f'region_{selected_region}'
        ptype_col = f'property_type_{ptype}'
        st.markdown(f"#### {selected_region.capitalize()} - {ptype.capitalize()}")
        # Filter to rows whose one-hot flags match the selection.
        if region_col in temp_df.columns and ptype_col in temp_df.columns:
            temp_df = temp_df[(temp_df[region_col] == 1) & (temp_df[ptype_col] == 1)]
        else:
            st.warning(f"No data available for Region: {selected_region} and Property: {ptype}")
            continue
        if temp_df.empty or temp_df.shape[0] < 2:
            st.info(f"Not enough data for {ptype} properties in {selected_region} region")
            continue
        # Sort and get latest row
        temp_df = temp_df.sort_values("timestamp")
        # latest_row = temp_df.iloc[-1].copy()
        latest_row = temp_df.iloc[-1].copy()
        # Make latest_row['timestamp'] timezone-aware if needed
        if latest_row['timestamp'].tzinfo is None:
            latest_row['timestamp'] = latest_row['timestamp'].tz_localize("Europe/London")
        now = pd.Timestamp.now(tz="Europe/London")
        # Skip forecast if data is too old
        if (now - latest_row['timestamp']) > pd.Timedelta(hours=1):
            st.warning(f"Last data point for {ptype} in {selected_region} is too old (at {latest_row['timestamp']:%H:%M}). Skipping forecast.")
            continue
        # Forecast future with both predictions and times
        # NOTE(review): the SAME feature vector is fed to the model for every
        # horizon, so all four metrics show an identical prediction — only the
        # displayed clock time changes. forecast_future_usage() above rolls the
        # lags forward and would give true multi-step forecasts; confirm intent.
        preds = {}
        forecast_times = []
        current_time = latest_row['timestamp']
        features = latest_row.drop(labels=['timestamp', 'predicted_kwh'])
        for i, minutes_ahead in enumerate([30, 60, 90, 120]):
            forecast_time = (current_time + pd.Timedelta(minutes=minutes_ahead)).strftime('%H:%M')
            prediction = model.predict([features])[0]
            preds[f"{minutes_ahead} min"] = prediction
            forecast_times.append(forecast_time)
        # Display metrics with time below each forecast
        cols = st.columns(len(preds))
        for i, (label, value) in enumerate(preds.items()):
            with cols[i]:
                st.metric(label=label, value=f"{value:.2f} kWh")
                st.markdown(
                    f"""
                    <div style='text-align:center; font-size:15px;'>
                    <span style='color:#FF0000; font-weight:bold;'>⏰ {forecast_times[i]}</span>
                    </div>
                    """,
                    unsafe_allow_html=True
                )
        # Line chart of recent trend
        fig = px.line(temp_df, x='timestamp', y='predicted_kwh',
                      title=f"{selected_region.capitalize()} {ptype.capitalize()} Forecast")
        st.plotly_chart(fig, use_container_width=True)
# === Performance Monitor ===
with tabs[1]:
    st.subheader("Prediction Performance Monitor")
    # Join predictions back to the raw readings to recover the actuals.
    df_eval = df_features.copy()
    df_eval = df_eval.merge(df[['timestamp', 'power_consumption_kwh']], on='timestamp', how='left')
    df_eval.rename(columns={'power_consumption_kwh': 'actual'}, inplace=True)
    # === Error Metrics ===
    mse = mean_squared_error(df_eval['actual'], df_eval['predicted_kwh'])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(df_eval['actual'], df_eval['predicted_kwh'])
    st.metric("RMSE", f"{rmse:.3f} kWh")
    st.metric("MAE", f"{mae:.3f} kWh")
    # === Hourly RMSE Chart ===
    # Ensure 'hour' is present
    df_eval['hour'] = df_eval['timestamp'].dt.hour
    # Per-hour RMSE to show when during the day the model does worst.
    hourly_rmse = df_eval.groupby('hour').apply(
        lambda x: np.sqrt(mean_squared_error(x['actual'], x['predicted_kwh']))
    ).reset_index(name='rmse')
    fig = px.bar(hourly_rmse, x='hour', y='rmse', title='Hourly RMSE')
    st.plotly_chart(fig, use_container_width=True)
    # === Latency Placeholders ===
    # NOTE(review): these two values are hard-coded placeholders, not measured.
    st.metric("Prediction Latency", "89 ms")
    st.metric("API Fetch Latency", "170 ms")
    # === ⚠ Drift Detection
    # Simple threshold alarm on today's overall RMSE.
    if rmse > 0.8:
        st.warning("⚠️ Model performance has drifted: RMSE above 0.8 kWh!")
    else:
        st.success("✅ Model performance is within expected range.")
    # Surface rows that were excluded by the dropna after fetching.
    missing_info = df[df[['region', 'property_type']].isnull().any(axis=1)]
    if not missing_info.empty:
        st.warning(f"{len(missing_info)} rows had missing region or property_type and were excluded from prediction.")
    # === Compute Error and Flag Anomalies ===
    # NOTE(review): df_eval is rebuilt here identically to the merge above —
    # redundant but harmless.
    df_eval = df_features.copy()
    df_eval = df_eval.merge(df[['timestamp', 'power_consumption_kwh']], on='timestamp', how='left')
    df_eval.rename(columns={'power_consumption_kwh': 'actual'}, inplace=True)
    # Compute error
    df_eval['error'] = np.abs(df_eval['actual'] - df_eval['predicted_kwh'])
    # Flag anomalies: errors above the 99th percentile of today's errors.
    threshold = df_eval['error'].quantile(0.99)
    df_eval['is_anomaly'] = df_eval['error'] > threshold
    # Plot anomalies
    fig = px.scatter(
        df_eval,
        x='timestamp',
        y='error',
        color='is_anomaly',
        title="Prediction Errors and Anomalies",
        color_discrete_map={True: 'red', False: 'green'},
        labels={"error": "Absolute Error (kWh)"})
    st.plotly_chart(fig, use_container_width=True)
# === Usage Pattern ===
with tabs[2]:
    st.subheader(" Usage Pattern")
    # Filter today's data only
    df_today_eval = df_features.merge(df[['timestamp', 'power_consumption_kwh']], on='timestamp', how='left')
    df_today_eval.rename(columns={'power_consumption_kwh': 'actual'}, inplace=True)
    df_today_eval['hour'] = df_today_eval['timestamp'].dt.strftime("%H:%M")
    # Rolling 30-min average (optional smoothing)
    df_today_eval['actual_rolling'] = df_today_eval['actual'].rolling(2).mean()
    df_today_eval['predicted_rolling'] = df_today_eval['predicted_kwh'].rolling(2).mean()
    # Plotting hourly line chart
    fig_hourly = go.Figure()
    fig_hourly.add_trace(go.Scatter(
        x=df_today_eval['timestamp'], y=df_today_eval['actual_rolling'],
        mode='lines',
        name='Actual',
        line=dict(color='indigo', width=2)
    ))
    fig_hourly.add_trace(go.Scatter(
        x=df_today_eval['timestamp'], y=df_today_eval['predicted_rolling'],
        mode='lines',
        name='Predicted',
        line=dict(color='crimson', width=2,)
    ))
    fig_hourly.update_layout(
        title="⏱️ Hourly Trend for Today: Predicted vs Actual",
        xaxis_title="Time",
        yaxis_title="kWh",
        xaxis=dict(tickformat="%H:%M", showgrid=True),
        legend_title="Legend",
        template="plotly_white"
    )
    st.plotly_chart(fig_hourly, use_container_width=True)
    # === Hourly Usage by Region ===
    if not df.empty:
        # NOTE(review): this mutates the module-level raw frame `df` in place.
        df['hour'] = df['timestamp'].dt.hour
        if 'region' in df.columns:
            avg_by_region = df.groupby(['hour', 'region'])['power_consumption_kwh'].mean().reset_index()
            fig = px.line(
                avg_by_region, x='hour', y='power_consumption_kwh',
                color='region', title="Average Hourly Usage by Region", markers=True
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("Region information not available in raw data.")
    # === Daily Comparison Logging ===
    def update_daily_comparison_log(df_features, df_raw):
        """Append today's total predicted vs actual kWh to daily_comparison_log.csv.

        One row per day; if today's row already exists the log is left
        untouched (the first snapshot of the day wins, so totals recorded
        early in the day are never refreshed — presumably intentional).
        """
        today = df_features['timestamp'].dt.date.iloc[0]
        predicted_total = df_features['predicted_kwh'].sum()
        actual_total = df_raw['power_consumption_kwh'].sum()
        daily_df = pd.DataFrame([{
            'date': today,
            'predicted_kwh': predicted_total,
            'actual_kwh': actual_total
        }])
        if os.path.exists("daily_comparison_log.csv"):
            existing = pd.read_csv("daily_comparison_log.csv")
            existing['date'] = pd.to_datetime(existing['date']).dt.date
            if today not in existing['date'].values:
                updated = pd.concat([existing, daily_df], ignore_index=True)
                updated.to_csv("daily_comparison_log.csv", index=False)
        else:
            daily_df.to_csv("daily_comparison_log.csv", index=False)
    update_daily_comparison_log(df_features, df)
    # === 1. Log timestamp-level data for today ===
    # Per-day CSVs under logs/ accumulate history across app restarts.
    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)
    today_str = pd.Timestamp.now(tz=timezone("Europe/London")).strftime("%Y-%m-%d")
    log_path = os.path.join(log_dir, f"{today_str}.csv")
    df_eval = df_features.merge(df[['timestamp', 'power_consumption_kwh']], on='timestamp', how='left')
    df_eval.rename(columns={'power_consumption_kwh': 'actual'}, inplace=True)
    df_log = df_eval[['timestamp', 'actual', 'predicted_kwh']].copy()
    df_log['date'] = df_log['timestamp'].dt.date
    df_log['hour'] = df_log['timestamp'].dt.strftime("%H:%M")
    df_log.to_csv(log_path, index=False)
    # === 2. Read all logs and concatenate ===
    log_files = sorted(glob.glob(f"{log_dir}/*.csv"))
    full_df = pd.concat([pd.read_csv(f) for f in log_files], ignore_index=True)
    # === 3. Clean up and format ===
    full_df['timestamp'] = pd.to_datetime(full_df['timestamp'])
    full_df['hour'] = pd.to_datetime(full_df['hour'], format="%H:%M").dt.time
    full_df['date'] = pd.to_datetime(full_df['date'])
    # === 4. Plot all days together ===
    # One Actual + one Predicted trace per logged day.
    fig = go.Figure()
    for date in full_df['date'].dt.date.unique():
        sub = full_df[full_df['date'].dt.date == date]
        fig.add_trace(go.Scatter(
            x=sub['timestamp'], y=sub['actual'],
            mode='lines',
            name=f"{date} - Actual",
            line=dict(color='#008080', width=2)
        ))
        fig.add_trace(go.Scatter(
            x=sub['timestamp'], y=sub['predicted_kwh'],
            mode='lines',
            name=f"{date} - Predicted",
            line=dict(color='#8B0000', width=2)
        ))
    fig.update_layout(
        title="📈 Hourly Trend: Actual vs Predicted Energy Consumption",
        xaxis_title="Timestamp",
        yaxis_title="kWh",
        legend_title="Legend",
        xaxis=dict(tickformat="%H:%M", showgrid=True),
        template="plotly_white"
    )
    st.plotly_chart(fig, use_container_width=True)
    # Same excluded-rows warning as in the Performance tab.
    missing_info = df[df[['region', 'property_type']].isnull().any(axis=1)]
    if not missing_info.empty:
        st.warning(f"{len(missing_info)} rows had missing region or property_type and were excluded from prediction.")
# === EV & Solar Insight ===
with tabs[3]:
    st.subheader("EV & Solar Insights")
    # NOTE(review): assumes the raw table carries 0/1 'ev_owner' and
    # 'solar_installed' flags — this raises KeyError if the fetch returned
    # an empty frame; confirm against the Supabase schema.
    ev_df = df[df['ev_owner'] == 1]
    solar_df = df[df['solar_installed'] == 1]
    ev_avg = ev_df['power_consumption_kwh'].mean()
    solar_avg = solar_df['power_consumption_kwh'].mean()
    # "Peak hour" = the hour with the most readings for that cohort.
    ev_peak_hour = ev_df['timestamp'].dt.hour.value_counts().idxmax() if not ev_df.empty else "N/A"
    solar_peak_hour = solar_df['timestamp'].dt.hour.value_counts().idxmax() if not solar_df.empty else "N/A"
    col1, col2 = st.columns(2)
    with col1:
        st.metric("EV Household Avg Usage (kWh)", f"{ev_avg:.2f}")
        st.markdown(f"🚗 Peak Usage Hour: {ev_peak_hour}:00")
        if ev_avg > df['power_consumption_kwh'].mean():
            st.warning("⚠ EV households consume above average energy!")
    with col2:
        st.metric("Solar Household Avg Usage (kWh)", f"{solar_avg:.2f}")
        st.markdown(f"🌞 Peak Usage Hour: {solar_peak_hour}:00")
        if solar_avg > df['power_consumption_kwh'].mean():
            st.warning("⚠ Solar homes also show high consumption.")
    st.markdown("### 🔍 Distribution Analysis")
    col3, col4 = st.columns(2)
    with col3:
        fig_ev = px.box(ev_df, y='power_consumption_kwh', points="outliers",
                        title="EV Owners: Power Usage Box Plot")
        st.plotly_chart(fig_ev, use_container_width=True)
    with col4:
        fig_solar = px.box(solar_df, y='power_consumption_kwh', points="outliers",
                           title="Solar Homes: Power Usage Box Plot")
        st.plotly_chart(fig_solar, use_container_width=True)
    st.markdown("### 📈 Trend Over Time")
    # Filter only rows where either EV or Solar is installed
    df_combo = df[df['ev_owner'] + df['solar_installed'] > 0].copy()
    # Create label per row
    def get_label(row):
        """Categorise a reading by which of the two flags the household has."""
        if row['ev_owner'] and row['solar_installed']:
            return "EV + Solar"
        elif row['ev_owner']:
            return "EV Only"
        elif row['solar_installed']:
            return "Solar Only"
        return "Other"
    df_combo['label'] = df_combo.apply(get_label, axis=1)
    # Plot
    fig_trend = px.line(
        df_combo,
        x='timestamp',
        y='power_consumption_kwh',
        color='label',
        title="Energy Usage Trend: EV & Solar Homes"
    )
    st.plotly_chart(fig_trend, use_container_width=True)