decodingdatascience commited on
Commit
74a8924
Β·
verified Β·
1 Parent(s): dade342

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +247 -0
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ from pandas.api.types import is_datetime64_any_dtype as is_datetime
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import mean_absolute_error, r2_score
8
+ from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
9
+ import gradio as gr
10
+
11
+ # ---------- Helpers ----------
12
+ def infer_target_column(df: pd.DataFrame):
13
+ for c in ["power_usage_kwh", "energy_kwh", "power_kwh", "energy"]:
14
+ if c in df.columns:
15
+ return c
16
+ raise ValueError("Target column not found. Expected one of: "
17
+ "['power_usage_kwh','energy_kwh','power_kwh','energy'].")
18
+
19
+ def ensure_datetime_naive(df: pd.DataFrame, tz_target: str = "Asia/Dubai"):
20
+ if "timestamp" not in df.columns:
21
+ return df
22
+ # Parse robustly with UTC, then convert to target tz and drop tz
23
+ ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
24
+ try:
25
+ ts = ts.dt.tz_convert(tz_target).dt.tz_localize(None)
26
+ except Exception:
27
+ try:
28
+ ts = ts.dt.tz_localize(None)
29
+ except Exception:
30
+ pass
31
+ df = df.copy()
32
+ df["timestamp"] = ts
33
+ return df
34
+
35
+ def feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
36
+ df = df.copy()
37
+ df = ensure_datetime_naive(df, tz_target="Asia/Dubai")
38
+
39
+ # Light numeric imputation
40
+ num_cols = df.select_dtypes(include=[np.number]).columns
41
+ df[num_cols] = df[num_cols].ffill().bfill()
42
+
43
+ # Time features
44
+ if "timestamp" in df.columns and is_datetime(df["timestamp"]):
45
+ df["hour"] = df["timestamp"].dt.hour
46
+ df["dayofweek"] = df["timestamp"].dt.dayofweek
47
+ df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
48
+ df["month"] = df["timestamp"].dt.month
49
+ df["dayofyear"] = df["timestamp"].dt.dayofyear
50
+ df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24)
51
+ df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24)
52
+ df["dow_sin"] = np.sin(2*np.pi*df["dayofweek"]/7)
53
+ df["dow_cos"] = np.cos(2*np.pi*df["dayofweek"]/7)
54
+ else:
55
+ for c in ["hour","dayofweek","is_weekend","month","dayofyear","hour_sin","hour_cos","dow_sin","dow_cos"]:
56
+ if c not in df.columns:
57
+ df[c] = 0
58
+
59
+ # Domain features
60
+ tgt = infer_target_column(df)
61
+ if "cooling_eff_pct" in df.columns:
62
+ df["cooling_ineff_pct"] = 100 - df["cooling_eff_pct"]
63
+ if "server_load_pct" in df.columns:
64
+ df["energy_per_load"] = df[tgt] / np.maximum(df["server_load_pct"], 1)
65
+ if "ambient_temp_c" in df.columns and "server_load_pct" in df.columns:
66
+ df["temp_load_interaction"] = df["ambient_temp_c"] * df["server_load_pct"]
67
+
68
+ # Target lags/rollings
69
+ df["target_lag1"] = df[tgt].shift(1)
70
+ df["target_roll3"] = df[tgt].rolling(3, min_periods=1).mean()
71
+ df["target_roll24"] = df[tgt].rolling(24, min_periods=1).mean()
72
+
73
+ # Fill NaNs from shifts
74
+ df = df.ffill().bfill()
75
+ return df
76
+
77
+ def get_model(name: str):
78
+ return GradientBoostingRegressor(random_state=42) if name == "Gradient Boosting" \
79
+ else RandomForestRegressor(n_estimators=300, random_state=42)
80
+
81
+ def feature_target_split(df: pd.DataFrame):
82
+ y_col = infer_target_column(df)
83
+ X = df.drop(columns=[c for c in [y_col, "timestamp"] if c in df.columns], errors="ignore")
84
+ X = X.select_dtypes(include=[np.number]).copy()
85
+ y = df[y_col].astype(float)
86
+ return X, y, y_col
87
+
88
+ # ---------- Core pipeline ----------
89
+ def run_pipeline(file_path, model_name):
90
+ title = "⚑ AI-Driven Data Center Energy Optimization Dashboard"
91
+
92
+ try:
93
+ if not file_path:
94
+ return (title, "Please upload a CSV file.", None, None, None, None, None, None)
95
+
96
+ df_raw = pd.read_csv(file_path)
97
+ df = feature_engineer(df_raw)
98
+
99
+ # Guardrail
100
+ if len(df) < 10:
101
+ return (title, "Not enough rows to train a model (need >= 10).", None, None, None, None, None, None)
102
+
103
+ X, y, y_col = feature_target_split(df)
104
+
105
+ # Split, train, predict
106
+ test_size = 0.25 if len(df) >= 25 else 0.2
107
+ X_train, X_test, y_train, y_test = train_test_split(
108
+ X, y, test_size=test_size, random_state=42
109
+ )
110
+ model = get_model(model_name)
111
+ model.fit(X_train, y_train)
112
+
113
+ y_pred_all = model.predict(X)
114
+ y_pred_test = model.predict(X_test)
115
+
116
+ mae = mean_absolute_error(y_test, y_pred_test)
117
+ r2 = r2_score(y_test, y_pred_test)
118
+ avg_actual = float(np.mean(y))
119
+ avg_pred = float(np.mean(y_pred_all))
120
+
121
+ # ------ Visualizations ------
122
+ ts_plot = None
123
+ if "timestamp" in df.columns and is_datetime(df["timestamp"]):
124
+ plot_df = df.copy().sort_values("timestamp")
125
+ Xp = plot_df.drop(columns=[c for c in [y_col, "timestamp"] if c in plot_df.columns], errors="ignore")
126
+ Xp = Xp.select_dtypes(include=[np.number]).copy()
127
+ yp = model.predict(Xp)
128
+ ts_plot = plt.figure(figsize=(9, 3.6))
129
+ plt.plot(plot_df["timestamp"], plot_df[y_col], label="Actual")
130
+ plt.plot(plot_df["timestamp"], yp, label="Predicted")
131
+ plt.title("Time Series: Actual vs Predicted")
132
+ plt.xlabel("Time"); plt.ylabel(y_col)
133
+ plt.legend(); plt.tight_layout()
134
+
135
+ sc_plot = plt.figure(figsize=(4.6, 3.8))
136
+ plt.scatter(y_test, y_pred_test, alpha=0.6)
137
+ mn = min(y_test.min(), y_pred_test.min()); mx = max(y_test.max(), y_pred_test.max())
138
+ plt.plot([mn, mx], [mn, mx], linestyle="--")
139
+ plt.title("Holdout: Actual vs Predicted")
140
+ plt.xlabel("Actual"); plt.ylabel("Predicted")
141
+ plt.tight_layout()
142
+
143
+ res = y_test - y_pred_test
144
+ resid_plot = plt.figure(figsize=(4.6, 3.6))
145
+ plt.hist(res, bins=30)
146
+ plt.title("Holdout Residuals (Actual βˆ’ Predicted)")
147
+ plt.xlabel("Residual"); plt.ylabel("Count")
148
+ plt.tight_layout()
149
+
150
+ fi_plot = None
151
+ if hasattr(model, "feature_importances_"):
152
+ importances = model.feature_importances_
153
+ fi = (pd.DataFrame({"feature": X.columns, "importance": importances})
154
+ .sort_values("importance", ascending=False).head(12))
155
+ fi_plot = plt.figure(figsize=(6.2, 3.8))
156
+ plt.barh(fi["feature"][::-1], fi["importance"][::-1])
157
+ plt.title("Top Feature Importances")
158
+ plt.tight_layout()
159
+
160
+ # Save predictions for download
161
+ out_df = df.copy()
162
+ out_df[f"{y_col}_pred"] = y_pred_all
163
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
164
+ out_df.to_csv(tmp.name, index=False)
165
+
166
+ # --------- Copy text (explainer + KPIs) ---------
167
+ explainer = (
168
+ "### 🧠 What this app does\n"
169
+ "This AI-driven dashboard learns the relationship between **server load**, **ambient temperature**, "
170
+ "**cooling efficiency**, and time features to **predict power usage**. "
171
+ "Use it to quantify drivers of energy consumption, monitor deviations, and surface optimization levers.\n\n"
172
+ "### πŸ”Ž Why it matters\n"
173
+ "- Reduces **OPEX** by forecasting and optimizing energy usage\n"
174
+ "- Identifies high-impact drivers (feature importance)\n"
175
+ "- Enables proactive actions (e.g., workload shaping, cooling set-point tuning)\n\n"
176
+ "### βš™οΈ How it works (high-level)\n"
177
+ "1) Cleans and engineers features (diurnal/weekly cycles, rolling stats, domain signals)\n"
178
+ "2) Trains a tree ensemble (Gradient Boosting or Random Forest)\n"
179
+ "3) Evaluates on a holdout split and produces predictions for the entire dataset\n"
180
+ "4) Visualizes time series, accuracy scatter, residuals, and top feature importance\n"
181
+ )
182
+
183
+ kpis = (
184
+ f"**Model:** {model_name}\n\n"
185
+ f"**Target:** {y_col}\n"
186
+ f"**Avg {y_col} (actual):** {avg_actual:,.2f}\n"
187
+ f"**Avg {y_col} (predicted):** {avg_pred:,.2f}\n"
188
+ f"**Rows:** {len(df):,}\n\n"
189
+ f"**Holdout MAE:** {mae:,.2f} | **RΒ²:** {r2:,.3f}"
190
+ )
191
+
192
+ # Sample preview table
193
+ preview = out_df.head(10)
194
+
195
+ return (
196
+ title,
197
+ explainer,
198
+ kpis,
199
+ preview,
200
+ ts_plot,
201
+ sc_plot,
202
+ resid_plot,
203
+ fi_plot,
204
+ tmp.name
205
+ )
206
+
207
+ except Exception as e:
208
+ err = f"❌ **Error:** {type(e).__name__}: {e}"
209
+ return (title, err, None, None, None, None, None, None, None)
210
+
211
+ # ---------- Gradio UI ----------
212
+ import gradio
213
+ gradio.close_all() # avoid port conflicts in Colab
214
+
215
+ with gr.Blocks(title="AI-Driven Data Center Energy Optimization") as demo:
216
+ gr.Markdown("## ⚑ AI-Driven Data Center Energy Optimization Dashboard")
217
+
218
+ with gr.Row():
219
+ fpath = gr.File(label="πŸ“ Upload Dataset (CSV)", file_types=[".csv"], type="filepath")
220
+ model_name = gr.Dropdown(
221
+ choices=["Gradient Boosting", "Random Forest"],
222
+ value="Gradient Boosting",
223
+ label="πŸ” Select Model"
224
+ )
225
+
226
+ run_btn = gr.Button("▢️ Run")
227
+
228
+ title_out = gr.Markdown()
229
+ explainer_out = gr.Markdown()
230
+ kpi_out = gr.Markdown()
231
+ table_out = gr.Dataframe(label="πŸ“‹ Sample (+ Predictions)", wrap=True, row_count=("fixed", 10))
232
+
233
+ gr.Markdown("### πŸ“ˆ Visual Insights")
234
+ ts_plot = gr.Plot(label="Time Series: Actual vs Predicted")
235
+ sc_plot = gr.Plot(label="Holdout: Actual vs Predicted")
236
+ resid_plot = gr.Plot(label="Residuals (Histogram)")
237
+ fi_plot = gr.Plot(label="Top Feature Importances")
238
+
239
+ dl = gr.File(label="πŸ“₯ Download Data (+ Predictions)")
240
+
241
+ run_btn.click(
242
+ fn=run_pipeline,
243
+ inputs=[fpath, model_name],
244
+ outputs=[title_out, explainer_out, kpi_out, table_out, ts_plot, sc_plot, resid_plot, fi_plot, dl]
245
+ )
246
+
247
+ demo.launch(share=True)