jessicayang committed on
Commit
c050836
·
verified ·
1 Parent(s): a4e9b7d

Upload 2 files

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. food.jpg +3 -0
  3. streamlit_app.py +416 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  src/food.jpg filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  src/food.jpg filter=lfs diff=lfs merge=lfs -text
37
+ food.jpg filter=lfs diff=lfs merge=lfs -text
food.jpg ADDED

Git LFS Details

  • SHA256: 752f443e1c16e3dc0d97aee1362e74e77d117e039f96d77fd14b78e9318688f9
  • Pointer size: 132 Bytes
  • Size of remote file: 2.75 MB
streamlit_app.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import shap
7
+ import mlflow
8
+ import mlflow.sklearn
9
+ import mlflow
10
+ from mlflow.tracking import MlflowClient
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.linear_model import LinearRegression
13
+ from sklearn.tree import DecisionTreeRegressor, plot_tree
14
+ from sklearn.ensemble import RandomForestRegressor
15
+ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
16
+ from sklearn.metrics import f1_score, accuracy_score, precision_score
17
+ from sklearn.preprocessing import LabelEncoder
18
+ from sklearn.tree import DecisionTreeClassifier
19
+ from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
20
+ import pickle
21
+
22
# Page config: browser-tab title, centered layout and favicon.
st.set_page_config(
    page_title="Food Delivery Time Prediction",
    layout="centered",
    page_icon="🍔",
)

# Sidebar navigation: the label picked here is compared against in every
# page section further down, so these strings must stay in sync with them.
PAGE_OPTIONS = [
    "Introduction 📘",
    "Visualization 📊",
    "Prediction 🔮",
    "Explainability 🤔",
    "Model Tracker 📊",
    "Conclusion 📌",
    "What-If Simulator 🔁",
]
st.sidebar.title("🍔 Food Delivery Dashboard")
page = st.sidebar.selectbox("Select Page", PAGE_OPTIONS)
36
+
37
# Load dataset
@st.cache_data
def load_data():
    """Read the delivery dataset from ``Food_Delivery_Times.csv``.

    The path is relative, so the file is resolved against the process's
    working directory. The function is wrapped in ``st.cache_data`` so the
    CSV does not have to be re-read on every Streamlit rerun.

    Returns:
        pandas.DataFrame: the raw, unmodified contents of the CSV.
    """
    return pd.read_csv("Food_Delivery_Times.csv")


# Shared raw frame used by every page below.
df = load_data()
44
+
45
# Page 1: Introduction
# Shows the problem statement, a preview of the raw data, per-column null
# counts and (on demand) summary statistics.
if page == "Introduction 📘":
    st.title("🚴 Food Delivery Time Prediction")
    st.markdown("## 🌟 Problem Statement")
    st.markdown("""
    Food delivery companies struggle with accurately estimating delivery times.
    Inaccurate estimates reduce customer satisfaction and can hurt business.
    This app aims to **predict delivery time** based on factors like distance, traffic, weather, and driver experience
    using different machine learning models.
    """)
    st.image("food.jpg")

    # Raw-data preview; the slider controls how many leading rows are shown.
    st.markdown("## 📁 Dataset Overview")
    preview_count = st.slider("Preview rows", 5, 30, 10)
    st.dataframe(df.head(preview_count))

    # Null count per column, followed by a one-line verdict.
    st.markdown("### 🔎 Missing Values")
    null_counts = df.isnull().sum()
    st.write(null_counts)
    has_missing = null_counts.sum() != 0
    if has_missing:
        st.warning("⚠️ Some columns have missing values and will be dropped for modeling.")
    else:
        st.success("✅ No missing values")

    # describe() is only rendered after an explicit click.
    st.markdown("### 📊 Summary Statistics")
    show_summary = st.button("Show Summary")
    if show_summary:
        st.dataframe(df.describe())
72
+
73
+ # Page 2: Visualization
74
+ elif page == "Visualization 📊":
75
+ st.title("📊 Data Insights")
76
+ df_viz = df.dropna()
77
+
78
+ st.markdown("### 🚗 Delivery Vehicle Type Distribution")
79
+ vehicle_counts = df_viz["Vehicle_Type"].value_counts()
80
+ fig1, ax1 = plt.subplots()
81
+ ax1.pie(vehicle_counts, labels=vehicle_counts.index, autopct='%1.1f%%', startangle=90)
82
+ ax1.set_title("Distribution of Delivery Vehicle Types")
83
+ st.pyplot(fig1)
84
+
85
+ st.markdown("### 🛏️ Avg Delivery Time by Distance Segment")
86
+ bins = [0, 5, 10, 15, 20, 25]
87
+ labels = ["0-5km", "5-10km", "10-15km", "15-20km", "20-25km"]
88
+ df_viz["Distance_Segment"] = pd.cut(df_viz["Distance_km"], bins=bins, labels=labels)
89
+ avg_by_segment = df_viz.groupby("Distance_Segment")["Delivery_Time_min"].mean().reset_index()
90
+
91
+ fig2, ax2 = plt.subplots()
92
+ sns.barplot(x="Distance_Segment", y="Delivery_Time_min", data=avg_by_segment, ax=ax2)
93
+ ax2.set_xlabel("Distance Segment")
94
+ ax2.set_ylabel("Average Delivery Time (min)")
95
+ ax2.set_title("Avg Delivery Time by Distance Segment")
96
+ st.pyplot(fig2)
97
+
98
+ st.markdown("### 📌 How does distance relate to delivery time?")
99
+ fig, ax = plt.subplots()
100
+ sns.scatterplot(data=df_viz, x="Distance_km", y="Delivery_Time_min", hue="Traffic_Level", ax=ax)
101
+ ax.set_title("Delivery Time vs. Distance colored by Traffic Level")
102
+ st.pyplot(fig)
103
+
104
+ st.markdown("### 📉 Correlation Heatmap")
105
+ df_numeric = df_viz.select_dtypes(include=np.number)
106
+ fig3, ax3 = plt.subplots()
107
+ sns.heatmap(df_numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax3)
108
+ st.pyplot(fig3)
109
+
110
+ # Page 3: Prediction
111
+ elif page == "Prediction 🔮":
112
+ mlflow.set_tracking_uri("file:///tmp/mlruns")
113
+ st.title("🔮 Predicting Delivery Time")
114
+ st.markdown("""
115
+ Use different models to predict delivery time and compare their performance.
116
+ """)
117
+
118
+ # Handle missing values
119
+ df_model = df.dropna().copy()
120
+
121
+ # Encode categoricals
122
+ le_weather = LabelEncoder()
123
+ le_traffic = LabelEncoder()
124
+ le_time = LabelEncoder()
125
+ le_vehicle = LabelEncoder()
126
+
127
+ df_model["Weather"] = le_weather.fit_transform(df_model["Weather"])
128
+ df_model["Traffic_Level"] = le_traffic.fit_transform(df_model["Traffic_Level"])
129
+ df_model["Time_of_Day"] = le_time.fit_transform(df_model["Time_of_Day"])
130
+ df_model["Vehicle_Type"] = le_vehicle.fit_transform(df_model["Vehicle_Type"])
131
+
132
+ features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day",
133
+ "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]
134
+ target = "Delivery_Time_min"
135
+
136
+ X = df_model[features]
137
+ y = df_model[target]
138
+
139
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
140
+
141
+ model_choice = st.selectbox("Choose your model", ["Linear Regression", "Decision Tree", "K-Nearest Neighbors"])
142
+
143
+ with mlflow.start_run():
144
+ if model_choice == "Linear Regression":
145
+ model = LinearRegression()
146
+ model.fit(X_train, y_train)
147
+ predictions = model.predict(X_test)
148
+
149
+ st.subheader("📈 Model Performance")
150
+ st.write(f"**MAE**: {mean_absolute_error(y_test, predictions):.2f}")
151
+ st.write(f"**MSE**: {mean_squared_error(y_test, predictions):.2f}")
152
+ st.write(f"**R² Score**: {r2_score(y_test, predictions):.3f}")
153
+
154
+ fig, ax = plt.subplots()
155
+ ax.scatter(y_test, predictions, alpha=0.5)
156
+ ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
157
+ ax.set_xlabel("Actual Delivery Time")
158
+ ax.set_ylabel("Predicted Delivery Time")
159
+ ax.set_title("Actual vs Predicted Delivery Time")
160
+ st.pyplot(fig)
161
+
162
+ st.subheader("📌 Key Insights")
163
+ st.markdown("""
164
+ - **Feature Impact:** Distance, Traffic Level, and Preparation Time were the most influential features in predicting delivery time.
165
+ - **Model Fit:** The model achieves an R² score of ~0.77, indicating decent predictive power, but improvements are possible.
166
+ - **Real-World Use:** Businesses can use this model to estimate delivery ETAs and improve customer satisfaction. More complex models or live traffic inputs could enhance future predictions.
167
+ """)
168
+ elif model_choice == "Decision Tree":
169
+ # Classification setup
170
+ df_model["FastDelivery"] = (df_model["Delivery_Time_min"] <= 30).astype(int)
171
+ target = "FastDelivery"
172
+
173
+ X = df_model[features]
174
+ y = df_model[target]
175
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
176
+
177
+ # UI for depth
178
+ max_depth = st.number_input("Enter the maximum depth of the decision tree", 1, 20, value=5)
179
+ model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
180
+ model.fit(X_train, y_train)
181
+ preds = model.predict(X_test)
182
+
183
+ # Metrics
184
+ f1 = f1_score(y_test, preds)
185
+ acc = accuracy_score(y_test, preds)
186
+ precision = precision_score(y_test, preds)
187
+
188
+ # Show metrics
189
+ st.subheader("🧮 Decision Tree Prediction Metrics")
190
+ col1, col2, col3 = st.columns(3)
191
+ col1.metric("Decision Tree' f1-Score", f"{f1*100:.1f}%", "vs last run")
192
+ col2.metric("Accuracy", f"{acc*100:.1f}%", "vs last run")
193
+ col3.metric("Precision", f"{precision*100:.1f}%", "vs last run")
194
+
195
+ # Visualization
196
+ st.subheader("🌳 Decision Tree Visualization")
197
+ fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
198
+ plot_tree(model, feature_names=features, class_names=["Slow", "Fast"], filled=True, rounded=True, fontsize=10)
199
+ st.pyplot(fig_tree)
200
+
201
+ elif model_choice == "K-Nearest Neighbors":
202
+ from sklearn.neighbors import KNeighborsClassifier
203
+ from sklearn.metrics import accuracy_score
204
+ import seaborn as sns
205
+
206
+ # Optional: allow user to choose features
207
+ all_features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day",
208
+ "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]
209
+ selected_features = st.multiselect("Select features for KNN", all_features, default=all_features)
210
+
211
+ if len(selected_features) == 0:
212
+ st.warning("Please select at least one feature.")
213
+ else:
214
+ X = df_model[selected_features]
215
+ y = (df_model["Delivery_Time_min"] <= 30).astype(int)
216
+
217
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
218
+
219
+ # Try different k values
220
+ accuracies = []
221
+ k_range = range(1, 21)
222
+ best_k = 1
223
+ best_acc = 0
224
+ best_model = None
225
+
226
+ for k in k_range:
227
+ knn = KNeighborsClassifier(n_neighbors=k)
228
+ knn.fit(X_train, y_train)
229
+ preds = knn.predict(X_test)
230
+ acc = accuracy_score(y_test, preds)
231
+ accuracies.append(acc)
232
+ if acc > best_acc:
233
+ best_k = k
234
+ best_acc = acc
235
+ best_model = knn
236
+
237
+ st.markdown(f"✅ Best value of k: **{best_k}**")
238
+ st.markdown(f"📈 Best accuracy: **{best_acc:.2%}**")
239
+
240
+ # Plot K vs Accuracy
241
+ fig, ax = plt.subplots()
242
+ sns.lineplot(x=list(k_range), y=accuracies, marker="o", ax=ax)
243
+ ax.set_title("K Number × Accuracy")
244
+ ax.set_xlabel("K")
245
+ ax.set_ylabel("Accuracy")
246
+ st.pyplot(fig)
247
+
248
+
249
+
250
+ # Page 4: Explainability
251
+ elif page == "Explainability 🤔":
252
+ st.title("🤔 Model Explainability with SHAP")
253
+
254
+ df_model = df.dropna().copy()
255
+ df_model["Weather"] = LabelEncoder().fit_transform(df_model["Weather"])
256
+ df_model["Traffic_Level"] = LabelEncoder().fit_transform(df_model["Traffic_Level"])
257
+ df_model["Time_of_Day"] = LabelEncoder().fit_transform(df_model["Time_of_Day"])
258
+ df_model["Vehicle_Type"] = LabelEncoder().fit_transform(df_model["Vehicle_Type"])
259
+
260
+ features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day",
261
+ "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]
262
+ target = "Delivery_Time_min"
263
+
264
+ X = df_model[features]
265
+ y = df_model[target]
266
+ model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
267
+ model.fit(X, y)
268
+
269
+ explainer = shap.Explainer(model, X)
270
+ shap_values = explainer(X)
271
+
272
+ st.subheader("🌍 Global Feature Importance")
273
+ fig, ax = plt.subplots()
274
+ shap.plots.bar(shap_values, max_display=7, show=False)
275
+ st.pyplot(fig)
276
+
277
+ st.subheader("📊 SHAP Summary Plot")
278
+ fig2, ax2 = plt.subplots()
279
+ shap.summary_plot(shap_values, X, show=False)
280
+ st.pyplot(fig2)
281
+
282
+ st.subheader("🔍 Explain Single Prediction")
283
+ instance = st.slider("Pick a row to explain", 0, len(X)-1, 0)
284
+ fig3, ax3 = plt.subplots()
285
+ shap.plots.waterfall(shap_values[instance], show=False)
286
+ st.pyplot(fig3)
287
+
288
+ elif page == "Model Tracker 📊":
289
+ st.title("📊 Model Tracker with DagsHub + MLflow")
290
+ st.markdown("This page shows all logged experiments and highlights your best model based on MAE.")
291
+
292
+ # 🔧 Set MLflow URI (DagsHub)
293
+ mlflow.set_tracking_uri("https://dagshub.com/zy2869/my-first-repo.mlflow")
294
+
295
+ client = MlflowClient()
296
+
297
+ # 🔍 Show all experiments so user knows what's available
298
+ experiments = mlflow.search_experiments()
299
+ experiment_names = [exp.name for exp in experiments]
300
+ selected_exp_name = st.selectbox("Choose experiment", experiment_names)
301
+
302
+ selected_exp = client.get_experiment_by_name(selected_exp_name)
303
+ runs = client.search_runs(experiment_ids=[selected_exp.experiment_id], order_by=["metrics.MAE ASC"])
304
+
305
+ # 📊 Create table
306
+ data = []
307
+ for r in runs:
308
+ data.append({
309
+ "Run ID": r.info.run_id,
310
+ "Model": r.data.tags.get("mlflow.runName", "Unnamed"),
311
+ "MAE": r.data.metrics.get("MAE", None),
312
+ "MSE": r.data.metrics.get("MSE", None),
313
+ "MAPE": r.data.metrics.get("MAPE", None),
314
+ })
315
+ df_runs = pd.DataFrame(data)
316
+
317
+ # 🏆 Show sorted models
318
+ st.subheader("Top Performing Models (Sorted by MAE)")
319
+ if not df_runs.empty:
320
+ st.dataframe(df_runs.sort_values("MAE", na_position='last').reset_index(drop=True))
321
+ else:
322
+ st.warning("No runs with MAE metric found in this experiment.")
323
+
324
# Page 6: Conclusion
# Static write-up. Each entry pairs a sub-header with its markdown body and
# the entries are rendered in order.
if page == "Conclusion 📌":
    st.title("📌 Conclusion and Insights")

    conclusion_sections = (
        (
            "🍔 Delivery Strategy Recommendations Based on Our Analysis",
            """
    **Based on our overall analysis**, we found that delivery time is most strongly influenced by a few key operational features: **distance**, **preparation time**, and **traffic level**. These factors consistently showed high predictive value across models and SHAP explanations.

    📍 For instance, our SHAP analysis confirmed that **Distance (km)** had the highest impact on delivery time predictions, while **Preparation Time** also played a major role. When these two were both high, delivery times significantly increased.

    🏍️ Among the different vehicle types, **bikes** were the most frequently used (51%), but they also had more variation in delivery speed depending on other conditions like traffic.

    📈 As distance increases, average delivery time predictably rises—a trend confirmed by both bar charts and regression models.
    """,
        ),
        (
            "🧠 Key Learnings from Model Comparison",
            """
    - **Linear Regression** offered a strong baseline with an R² of **0.775**.
    - **Decision Trees** gave better interpretability with strong accuracy (~91.5%) but a lower F1-score.
    - **K-Nearest Neighbors (KNN)** with selected features reached **96.05% accuracy**.

    🔍 Our model tracker (with MLflow + DagsHub) revealed that **Huber Regressor** performed best in terms of MAE, making it a great option when minimizing large errors.
    """,
        ),
        (
            "🚚 Real-World Use Case",
            """
    These results suggest that food delivery platforms could:
    - ✅ Use real-time **distance and traffic** data to adjust estimated delivery windows.
    - ✅ Improve ETAs by accounting for **preparation time** at the vendor.
    - ✅ Recommend **vehicle-type optimizations** during peak or off-peak hours.

    This could lead to improved customer satisfaction, fewer complaints, and better delivery routing decisions.
    """,
        ),
        (
            "🔧 Future Improvements?",
            """
    1. **Live Traffic API Integration**: Use real-time traffic feeds (e.g., Google Maps API) for more dynamic predictions.
    2. **User Behavior Modeling**: Include customer behavior (e.g., reorder rate, tip likelihood) to improve prioritization.
    3. **Expand Dataset**: Include orders from multiple cities to improve generalization across delivery environments.
    """,
        ),
    )

    for heading, body in conclusion_sections:
        st.subheader(heading)
        st.markdown(body)
367
+
368
# Page 7: What-If Simulator
# Fits a random-forest regressor on all complete rows, then predicts the
# delivery time for a single scenario assembled from the widgets below.
if page == "What-If Simulator 🔁":
    st.title("🔁 What-If Simulator")
    st.markdown("### Adjust inputs to simulate delivery time!")

    categorical_cols = ["Weather", "Traffic_Level", "Time_of_Day", "Vehicle_Type"]
    features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day",
                "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]

    # Rows with any null are discarded. The encoders AND the select-box
    # options below are both derived from this cleaned frame, so the user can
    # only pick categories the model was trained on (never a NaN placeholder)
    # and every choice maps to the same integer code used during training.
    df_clean = df.dropna()
    encoders = {col: LabelEncoder().fit(df_clean[col]) for col in categorical_cols}

    df_model = df_clean.copy()
    for col, encoder in encoders.items():
        df_model[col] = encoder.transform(df_clean[col])

    # Train simple model
    X = df_model[features]
    y = df_model["Delivery_Time_min"]
    model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
    model.fit(X, y)

    # Input widgets
    st.markdown("#### Input Simulation Variables")
    col1, col2 = st.columns(2)

    with col1:
        distance = st.slider("Distance (km)", 0.5, 25.0, 5.0)
        prep_time = st.slider("Preparation Time (min)", 5, 40, 15)
        experience = st.slider("Courier Experience (yrs)", 0, 10, 2)

    with col2:
        weather = st.selectbox("Weather", df_clean["Weather"].unique())
        traffic = st.selectbox("Traffic Level", df_clean["Traffic_Level"].unique())
        time_of_day = st.selectbox("Time of Day", df_clean["Time_of_Day"].unique())
        vehicle = st.selectbox("Vehicle Type", df_clean["Vehicle_Type"].unique())

    # Encoding user input with the same fitted encoders used for training.
    scenario = {
        "Distance_km": distance,
        "Weather": encoders["Weather"].transform([weather])[0],
        "Traffic_Level": encoders["Traffic_Level"].transform([traffic])[0],
        "Time_of_Day": encoders["Time_of_Day"].transform([time_of_day])[0],
        "Vehicle_Type": encoders["Vehicle_Type"].transform([vehicle])[0],
        "Preparation_Time_min": prep_time,
        "Courier_Experience_yrs": experience,
    }
    # columns=features pins the column order to the one the model was fit on.
    input_data = pd.DataFrame([scenario], columns=features)

    prediction = model.predict(input_data)[0]
    st.success(f"📦 Estimated Delivery Time: {prediction:.2f} minutes")

    st.caption("⚡ Tip: Try extreme values to simulate peak vs. off-peak hours!")