# NOTE: page-scrape residue that preceded the source has been removed here:
# a "File size: 17,858 Bytes" banner, the blame revision ids dd8cd4e, 6ea2544,
# 328cc93, 1e5d6c6 and 372923c, and a 1-419 line-number gutter.
# None of it was Python source.

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import mlflow
import mlflow.sklearn
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import f1_score, accuracy_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
import pickle

# Page config: browser-tab title and icon, centered layout.
st.set_page_config(page_title="Food Delivery Time Prediction", layout="centered", page_icon="๐Ÿ”")

# Sidebar navigation
# `page` holds the label picked in the sidebar. The if/elif chain further down
# compares `page` against these exact strings, so any change to a label here
# has to be mirrored in the matching branch.
st.sidebar.title("๐Ÿ” Food Delivery Dashboard")
page = st.sidebar.selectbox("Select Page", [
    "Introduction ๐Ÿ“˜", 
    "Visualization ๐Ÿ“Š", 
    "Prediction ๐Ÿ”ฎ",
    "Explainability ๐Ÿค”",
    "Model Tracker ๐Ÿ“Š",
    "Conclusion ๐Ÿ“Œ",
    "What-If Simulator ๐Ÿ”"
])

# Load dataset
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the food-delivery CSV and return it as a DataFrame.

    The file is looked up at ``src/Food_Delivery_Times.csv`` relative to the
    working directory. The function is wrapped in ``st.cache_data``.
    """
    return pd.read_csv("src/Food_Delivery_Times.csv")

# Module-level frame shared by every page branch below.
df = load_data()

# Page 1: Introduction
# Problem statement, a row preview, per-column missing-value counts and an
# on-demand describe() table for the raw dataset.
if page == "Introduction ๐Ÿ“˜":
    st.title("๐Ÿšด Food Delivery Time Prediction")
    st.markdown("## ๐ŸŒŸ Problem Statement")
    st.markdown("""
        Food delivery companies struggle with accurately estimating delivery times.
        Inaccurate estimates reduce customer satisfaction and can hurt business.
        This app aims to **predict delivery time** based on factors like distance, traffic, weather, and driver experience
        using different machine learning models.
    """)
    st.image("src/food.jpg")

    # Row preview; the slider picks how many leading rows to show (5-30, default 10).
    st.markdown("## ๐Ÿ“ Dataset Overview")
    preview_rows = st.slider("Preview rows", 5, 30, 10)
    st.dataframe(df.head(preview_rows))

    # Null count per column, followed by a one-line verdict.
    st.markdown("### ๐Ÿ”Ž Missing Values")
    null_counts = df.isnull().sum()
    st.write(null_counts)
    if null_counts.any():
        st.warning("โš ๏ธ Some columns have missing values and will be dropped for modeling.")
    else:
        st.success("โœ… No missing values")

    # describe() is only rendered once the button has been pressed.
    st.markdown("### ๐Ÿ“Š Summary Statistics")
    show_summary = st.button("Show Summary")
    if show_summary:
        st.dataframe(df.describe())

# Page 2: Visualization
elif page == "Visualization ๐Ÿ“Š":
    st.title("๐Ÿ“Š Data Insights")
    df_viz = df.dropna()

    st.markdown("### ๐Ÿš— Delivery Vehicle Type Distribution")
    vehicle_counts = df_viz["Vehicle_Type"].value_counts()
    fig1, ax1 = plt.subplots()
    ax1.pie(vehicle_counts, labels=vehicle_counts.index, autopct='%1.1f%%', startangle=90)
    ax1.set_title("Distribution of Delivery Vehicle Types")
    st.pyplot(fig1)

    st.markdown("### ๐Ÿ›๏ธ Avg Delivery Time by Distance Segment")
    bins = [0, 5, 10, 15, 20, 25]
    labels = ["0-5km", "5-10km", "10-15km", "15-20km", "20-25km"]
    df_viz["Distance_Segment"] = pd.cut(df_viz["Distance_km"], bins=bins, labels=labels)
    avg_by_segment = df_viz.groupby("Distance_Segment")["Delivery_Time_min"].mean().reset_index()

    fig2, ax2 = plt.subplots()
    sns.barplot(x="Distance_Segment", y="Delivery_Time_min", data=avg_by_segment, ax=ax2)
    ax2.set_xlabel("Distance Segment")
    ax2.set_ylabel("Average Delivery Time (min)")
    ax2.set_title("Avg Delivery Time by Distance Segment")
    st.pyplot(fig2)

    st.markdown("### ๐Ÿ“Œ How does distance relate to delivery time?")
    fig, ax = plt.subplots()
    sns.scatterplot(data=df_viz, x="Distance_km", y="Delivery_Time_min", hue="Traffic_Level", ax=ax)
    ax.set_title("Delivery Time vs. Distance colored by Traffic Level")
    st.pyplot(fig)

    st.markdown("### ๐Ÿ“‰ Correlation Heatmap")
    df_numeric = df_viz.select_dtypes(include=np.number)
    fig3, ax3 = plt.subplots()
    sns.heatmap(df_numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax3)
    st.pyplot(fig3)

# Page 3: Prediction
elif page == "Prediction ๐Ÿ”ฎ":
    mlflow.set_tracking_uri("file:///tmp/mlruns")
    st.title("๐Ÿ”ฎ Predicting Delivery Time")
    st.markdown("""
        Use different models to predict delivery time and compare their performance.
    """)

    # Handle missing values
    df_model = df.dropna().copy()

    # Encode categoricals
    le_weather = LabelEncoder()
    le_traffic = LabelEncoder()
    le_time = LabelEncoder()
    le_vehicle = LabelEncoder()

    df_model["Weather"] = le_weather.fit_transform(df_model["Weather"])
    df_model["Traffic_Level"] = le_traffic.fit_transform(df_model["Traffic_Level"])
    df_model["Time_of_Day"] = le_time.fit_transform(df_model["Time_of_Day"])
    df_model["Vehicle_Type"] = le_vehicle.fit_transform(df_model["Vehicle_Type"])

    features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day", 
                "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]
    target = "Delivery_Time_min"

    X = df_model[features]
    y = df_model[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_choice = st.selectbox("Choose your model", ["Linear Regression", "Decision Tree", "K-Nearest Neighbors"])

    with mlflow.start_run():
        if model_choice == "Linear Regression":
            model = LinearRegression()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            st.subheader("๐Ÿ“ˆ Model Performance")
            st.write(f"**MAE**: {mean_absolute_error(y_test, predictions):.2f}")
            st.write(f"**MSE**: {mean_squared_error(y_test, predictions):.2f}")
            st.write(f"**Rยฒ Score**: {r2_score(y_test, predictions):.3f}")

            fig, ax = plt.subplots()
            ax.scatter(y_test, predictions, alpha=0.5)
            ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
            ax.set_xlabel("Actual Delivery Time")
            ax.set_ylabel("Predicted Delivery Time")
            ax.set_title("Actual vs Predicted Delivery Time")
            st.pyplot(fig)

            st.subheader("๐Ÿ“Œ Key Insights")
            st.markdown("""
            - **Feature Impact:** Distance, Traffic Level, and Preparation Time were the most influential features in predicting delivery time.
            - **Model Fit:** The model achieves an Rยฒ score of ~0.77, indicating decent predictive power, but improvements are possible.
            - **Real-World Use:** Businesses can use this model to estimate delivery ETAs and improve customer satisfaction. More complex models or live traffic inputs could enhance future predictions.
            """)
        elif model_choice == "Decision Tree":
            # Classification setup
            df_model["FastDelivery"] = (df_model["Delivery_Time_min"] <= 30).astype(int)
            target = "FastDelivery"

            X = df_model[features]
            y = df_model[target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # UI for depth
            max_depth = st.number_input("Enter the maximum depth of the decision tree", 1, 20, value=5)
            model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            # Metrics
            f1 = f1_score(y_test, preds)
            acc = accuracy_score(y_test, preds)
            precision = precision_score(y_test, preds)

            # Show metrics
            st.subheader("๐Ÿงฎ Decision Tree Prediction Metrics")
            col1, col2, col3 = st.columns(3)
            col1.metric("Decision Tree' f1-Score", f"{f1*100:.1f}%", "vs last run")
            col2.metric("Accuracy", f"{acc*100:.1f}%", "vs last run")
            col3.metric("Precision", f"{precision*100:.1f}%", "vs last run")

            # Visualization
            st.subheader("๐ŸŒณ Decision Tree Visualization")
            fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
            plot_tree(model, feature_names=features, class_names=["Slow", "Fast"], filled=True, rounded=True, fontsize=10)
            st.pyplot(fig_tree)

        elif model_choice == "K-Nearest Neighbors":
            from sklearn.neighbors import KNeighborsClassifier
            from sklearn.metrics import accuracy_score
            import seaborn as sns

            # Optional: allow user to choose features
            all_features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day", 
                            "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]
            selected_features = st.multiselect("Select features for KNN", all_features, default=all_features)

            if len(selected_features) == 0:
                st.warning("Please select at least one feature.")
            else:
                X = df_model[selected_features]
                y = (df_model["Delivery_Time_min"] <= 30).astype(int)

                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                # Try different k values
                accuracies = []
                k_range = range(1, 21)
                best_k = 1
                best_acc = 0
                best_model = None

                for k in k_range:
                    knn = KNeighborsClassifier(n_neighbors=k)
                    knn.fit(X_train, y_train)
                    preds = knn.predict(X_test)
                    acc = accuracy_score(y_test, preds)
                    accuracies.append(acc)
                    if acc > best_acc:
                        best_k = k
                        best_acc = acc
                        best_model = knn

                st.markdown(f"โœ… Best value of k: **{best_k}**")
                st.markdown(f"๐Ÿ“ˆ Best accuracy: **{best_acc:.2%}**")

                # Plot K vs Accuracy
                fig, ax = plt.subplots()
                sns.lineplot(x=list(k_range), y=accuracies, marker="o", ax=ax)
                ax.set_title("K Number ร— Accuracy")
                ax.set_xlabel("K")
                ax.set_ylabel("Accuracy")
                st.pyplot(fig)



# Page 4: Explainability
elif page == "Explainability ๐Ÿค”":
    st.title("๐Ÿค” Model Explainability with SHAP")

    df_model = df.dropna().copy()
    df_model["Weather"] = LabelEncoder().fit_transform(df_model["Weather"])
    df_model["Traffic_Level"] = LabelEncoder().fit_transform(df_model["Traffic_Level"])
    df_model["Time_of_Day"] = LabelEncoder().fit_transform(df_model["Time_of_Day"])
    df_model["Vehicle_Type"] = LabelEncoder().fit_transform(df_model["Vehicle_Type"])

    features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day", 
                "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]
    target = "Delivery_Time_min"

    X = df_model[features]
    y = df_model[target]
    model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
    model.fit(X, y)

    explainer = shap.Explainer(model, X)
    shap_values = explainer(X)

    st.subheader("๐ŸŒ Global Feature Importance")
    fig, ax = plt.subplots()
    shap.plots.bar(shap_values, max_display=7, show=False)
    st.pyplot(fig)

    st.subheader("๐Ÿ“Š SHAP Summary Plot")
    fig2, ax2 = plt.subplots()
    shap.summary_plot(shap_values, X, show=False)
    st.pyplot(fig2)

    st.subheader("๐Ÿ” Explain Single Prediction")
    instance = st.slider("Pick a row to explain", 0, len(X)-1, 0)
    fig3, ax3 = plt.subplots()
    shap.plots.waterfall(shap_values[instance], show=False)
    st.pyplot(fig3)

elif page == "Model Tracker ๐Ÿ“Š":
    st.title("๐Ÿ“Š Model Tracker with DagsHub + MLflow")
    st.markdown("This page shows all logged experiments and highlights your best model based on MAE.")

    # ๐Ÿ”ง Set MLflow URI (DagsHub)
    mlflow.set_tracking_uri("https://dagshub.com/zy2869/my-first-repo.mlflow")

    client = MlflowClient()

    # ๐Ÿ” Show all experiments so user knows what's available
    experiments = mlflow.search_experiments()
    experiment_names = [exp.name for exp in experiments]
    selected_exp_name = st.selectbox("Choose experiment", experiment_names)

    selected_exp = client.get_experiment_by_name(selected_exp_name)
    runs = client.search_runs(experiment_ids=[selected_exp.experiment_id], order_by=["metrics.MAE ASC"])

    # ๐Ÿ“Š Create table
    data = []
    for r in runs:
        data.append({
            "Run ID": r.info.run_id,
            "Model": r.data.tags.get("mlflow.runName", "Unnamed"),
            "MAE": r.data.metrics.get("MAE", None),
            "MSE": r.data.metrics.get("MSE", None),
            "MAPE": r.data.metrics.get("MAPE", None),
        })
    df_runs = pd.DataFrame(data)

    # ๐Ÿ† Show sorted models
    st.subheader("Top Performing Models (Sorted by MAE)")
    if not df_runs.empty:
        st.dataframe(df_runs.sort_values("MAE", na_position='last').reset_index(drop=True))
    else:
        st.warning("No runs with MAE metric found in this experiment.")

elif page == "Conclusion ๐Ÿ“Œ":
    st.title("๐Ÿ“Œ Conclusion and Insights")

    st.subheader("๐Ÿ” Delivery Strategy Recommendations Based on Our Analysis")

    st.markdown("""
    **Based on our overall analysis**, we found that delivery time is most strongly influenced by a few key operational features: **distance**, **preparation time**, and **traffic level**. These factors consistently showed high predictive value across models and SHAP explanations.

    ๐Ÿ“ For instance, our SHAP analysis confirmed that **Distance (km)** had the highest impact on delivery time predictions, while **Preparation Time** also played a major role. When these two were both high, delivery times significantly increased.

    ๐Ÿ๏ธ Among the different vehicle types, **bikes** were the most frequently used (51%), but they also had more variation in delivery speed depending on other conditions like traffic.

    ๐Ÿ“ˆ As distance increases, average delivery time predictably risesโ€”a trend confirmed by both bar charts and regression models.
    """)

    st.subheader("๐Ÿง  Key Learnings from Model Comparison")

    st.markdown("""
    - **Linear Regression** offered a strong baseline with an Rยฒ of **0.775**.
    - **Decision Trees** gave better interpretability with strong accuracy (~91.5%) but a lower F1-score.
    - **K-Nearest Neighbors (KNN)** with selected features reached **96.05% accuracy**.

    ๐Ÿ” Our model tracker (with MLflow + DagsHub) revealed that **Huber Regressor** performed best in terms of MAE, making it a great option when minimizing large errors.
    """)

    st.subheader("๐Ÿšš Real-World Use Case")

    st.markdown("""
    These results suggest that food delivery platforms could:
    - โœ… Use real-time **distance and traffic** data to adjust estimated delivery windows.
    - โœ… Improve ETAs by accounting for **preparation time** at the vendor.
    - โœ… Recommend **vehicle-type optimizations** during peak or off-peak hours.

    This could lead to improved customer satisfaction, fewer complaints, and better delivery routing decisions.
    """)

    st.subheader("๐Ÿ”ง Future Improvements?")

    st.markdown("""
    1. **Live Traffic API Integration**: Use real-time traffic feeds (e.g., Google Maps API) for more dynamic predictions.
    2. **User Behavior Modeling**: Include customer behavior (e.g., reorder rate, tip likelihood) to improve prioritization.
    3. **Expand Dataset**: Include orders from multiple cities to improve generalization across delivery environments.
    """)

elif page == "What-If Simulator ๐Ÿ”":
    st.title("๐Ÿ” What-If Simulator")
    st.markdown("### Adjust inputs to simulate delivery time!")

    df_model = df.dropna().copy()
    df_model["Weather"] = LabelEncoder().fit_transform(df_model["Weather"])
    df_model["Traffic_Level"] = LabelEncoder().fit_transform(df_model["Traffic_Level"])
    df_model["Time_of_Day"] = LabelEncoder().fit_transform(df_model["Time_of_Day"])
    df_model["Vehicle_Type"] = LabelEncoder().fit_transform(df_model["Vehicle_Type"])

    features = ["Distance_km", "Weather", "Traffic_Level", "Time_of_Day", 
                "Vehicle_Type", "Preparation_Time_min", "Courier_Experience_yrs"]

    # Train simple model
    X = df_model[features]
    y = df_model["Delivery_Time_min"]
    model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42)
    model.fit(X, y)

    # Input widgets
    st.markdown("#### Input Simulation Variables")
    col1, col2 = st.columns(2)

    with col1:
        distance = st.slider("Distance (km)", 0.5, 25.0, 5.0)
        prep_time = st.slider("Preparation Time (min)", 5, 40, 15)
        experience = st.slider("Courier Experience (yrs)", 0, 10, 2)

    with col2:
        weather = st.selectbox("Weather", df["Weather"].unique())
        traffic = st.selectbox("Traffic Level", df["Traffic_Level"].unique())
        time_of_day = st.selectbox("Time of Day", df["Time_of_Day"].unique())
        vehicle = st.selectbox("Vehicle Type", df["Vehicle_Type"].unique())

    # Encoding user input
    input_data = pd.DataFrame({
        "Distance_km": [distance],
        "Weather": [LabelEncoder().fit(df["Weather"]).transform([weather])[0]],
        "Traffic_Level": [LabelEncoder().fit(df["Traffic_Level"]).transform([traffic])[0]],
        "Time_of_Day": [LabelEncoder().fit(df["Time_of_Day"]).transform([time_of_day])[0]],
        "Vehicle_Type": [LabelEncoder().fit(df["Vehicle_Type"]).transform([vehicle])[0]],
        "Preparation_Time_min": [prep_time],
        "Courier_Experience_yrs": [experience]
    })

    prediction = model.predict(input_data)[0]
    st.success(f"๐Ÿ“ฆ Estimated Delivery Time: {prediction:.2f} minutes")

    st.caption("โšก Tip: Try extreme values to simulate peak vs. off-peak hours!")