def _clean_time_column(series):
    """Convert a time column to numeric values.

    Numeric columns are passed straight through ``pd.to_numeric``. Text
    columns are parsed in two passes:

    1. Every value is first parsed as a plain number, so ``"8.5"`` stays
       ``8.5`` (splitting it on ``"."`` would wrongly produce ``5``).
    2. Values that are not plain numbers, such as the timestamp-looking
       ``"1970-01-01 00:00:00.000000008"``, fall back to the text after the
       last ``"."``, which yields ``8`` for that example.

    Args:
        series: pandas Series holding numbers or strings.

    Returns:
        A numeric Series with the same index; unparseable entries are NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series, errors="coerce")

    text = series.astype(str).str.strip()

    # Whole-value parse first: this is what keeps ordinary decimals intact.
    direct = pd.to_numeric(text, errors="coerce")

    # Fallback for timestamp-looking strings: keep only the part after the
    # last dot (e.g. "...00.000000008" -> "000000008" -> 8).
    tail = text.str.rsplit(".", n=1).str[-1]
    fallback = pd.to_numeric(tail, errors="coerce")

    return direct.fillna(fallback)
def load_and_prepare(file_obj=None):
    """Load the delivery CSV and derive the columns used by the dashboard.

    Steps, in order:
      1. Read ``DATA_PATH`` (no upload) or the uploaded file.
      2. Normalise column names (strip + lower-case) and drop duplicate rows.
      3. Validate that the minimum required columns exist.
      4. Clean time, numeric and categorical columns.
      5. Rebuild expected/actual delivery times with
         ``create_synthetic_time_logic`` and derive delay metrics.

    Args:
        file_obj: ``None`` to use the bundled data set, an object exposing a
            ``.name`` path attribute, or a plain path string.

    Returns:
        A prepared DataFrame with the extra columns ``delay_hours``,
        ``calculated_delay``, ``delay_score``, ``performance_label`` and
        ``distance_category``.

    Raises:
        gr.Error: if any of the required columns is missing.
    """
    if file_obj is None:
        source = DATA_PATH
    else:
        # Accept both a tempfile-like upload (``.name``) and a bare path.
        source = getattr(file_obj, "name", file_obj)
    df = pd.read_csv(source)

    df.columns = df.columns.str.strip().str.lower()
    # Reset the index so later positional/label operations see 0..n-1 even
    # after duplicate rows were removed.
    df = df.drop_duplicates().reset_index(drop=True)

    required_minimum = ["distance_km", "vehicle_type", "weather_condition", "delivery_mode", "region"]
    missing_required = [c for c in required_minimum if c not in df.columns]
    if missing_required:
        raise gr.Error(f"Your file is missing these required columns: {missing_required}")

    for col in ("delivery_time_hours", "expected_time_hours"):
        if col in df.columns:
            df[col] = _clean_time_column(df[col])

    for col in NUMERIC_COLS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            df[col] = df[col].fillna(df[col].median())

    for col in CAT_COLS:
        if col not in df.columns:
            continue
        # Record missing cells BEFORE the string conversion: ``astype(str)``
        # turns NaN into the literal text "nan", which ``isna`` cannot see.
        was_missing = df[col].isna()
        cleaned = df[col].astype(str).str.strip().str.lower()
        observed = cleaned[~was_missing]
        if was_missing.any() and not observed.empty:
            cleaned = cleaned.mask(was_missing, observed.mode().iloc[0])
        df[col] = cleaned

    # Rebuild expected/actual delivery times with the rule-based model; this
    # overwrites whatever values were read from the file.
    df = create_synthetic_time_logic(df)

    df["delay_hours"] = (df["delivery_time_hours"] - df["expected_time_hours"]).round(2)
    df["calculated_delay"] = np.where(df["delay_hours"] > 0, "yes", "no")
    df["delay_score"] = df["delay_hours"].apply(delay_score)
    df["performance_label"] = df["delay_score"].apply(performance_label)
    df["distance_category"] = pd.cut(
        df["distance_km"],
        bins=[0, 50, 150, 300, float("inf")],
        labels=["short", "medium", "long", "very long"],
        include_lowest=True,
    ).astype(str)
    return df
def create_synthetic_time_logic(df):
    """Rebuild ``expected_time_hours`` and ``delivery_time_hours`` from rules.

    The expected time is ``distance_km / 45`` plus additive adjustments for
    vehicle, weather, delivery mode and region (floored at 0.5). A raw
    "actual" time is the expected time scaled by per-factor multipliers; the
    resulting actual/expected ratio is then bucketed into one of four fixed
    multipliers (0.95, 1.00, 1.10, 1.25) applied to the expected time.

    Unknown category values use the fallback in each ``fillna`` call.

    Args:
        df: DataFrame containing ``distance_km``, ``vehicle_type``,
            ``weather_condition``, ``delivery_mode`` and ``region``.

    Returns:
        A copy of ``df`` with the four category columns normalised
        (stripped, lower-cased) and both time columns overwritten. The
        original index is preserved.
    """
    df = df.copy()
    for col in ("vehicle_type", "weather_condition", "delivery_mode", "region"):
        df[col] = df[col].astype(str).str.strip().str.lower()

    # Additive adjustments for the expected time.
    vehicle_adjustment = {"bike": 1.2, "van": 0.5, "truck": 0.8, "ev van": 0.4}
    weather_adjustment = {
        "clear": 0.0, "cloudy": 0.2, "foggy": 0.6, "rainy": 0.8,
        "stormy": 1.2, "cold": 0.2, "hot": 0.2, "windy": 0.3,
    }
    mode_adjustment = {"same day": 0.3, "express": 0.2, "two day": 0.7, "standard": 0.5}
    region_adjustment = {"central": 0.6, "north": 0.3, "south": 0.3, "east": 0.4, "west": 0.4}

    expected = (
        df["distance_km"] / 45
        + df["vehicle_type"].map(vehicle_adjustment).fillna(0.5)
        + df["weather_condition"].map(weather_adjustment).fillna(0.3)
        + df["delivery_mode"].map(mode_adjustment).fillna(0.4)
        + df["region"].map(region_adjustment).fillna(0.3)
    ).clip(lower=0.5)

    # Multiplicative factors for the raw actual time.
    vehicle_mult = {"bike": 1.05, "van": 0.95, "truck": 1.02, "ev van": 0.97}
    weather_mult = {
        "clear": 0.95, "cloudy": 1.00, "foggy": 1.05, "rainy": 1.10,
        "stormy": 1.20, "cold": 1.02, "hot": 1.02, "windy": 1.03,
    }
    mode_mult = {"same day": 1.05, "express": 1.02, "two day": 0.97, "standard": 1.00}
    region_mult = {"central": 1.08, "north": 1.00, "south": 1.01, "east": 1.02, "west": 1.03}

    actual = (
        expected
        * df["vehicle_type"].map(vehicle_mult).fillna(1)
        * df["weather_condition"].map(weather_mult).fillna(1)
        * df["delivery_mode"].map(mode_mult).fillna(1)
        * df["region"].map(region_mult).fillna(1)
    ).clip(lower=0.5)

    ratio = actual / expected

    # First matching condition wins, mirroring a nested if/elif ladder.
    multiplier = np.select(
        [ratio < 0.98, ratio < 1.05, ratio < 1.15],
        [0.95, 1.00, 1.10],
        default=1.25,
    )

    df["expected_time_hours"] = expected.round(2)
    # ``expected * multiplier`` keeps ``expected``'s index, so the assignment
    # stays row-aligned even when ``df`` has a non-contiguous index (for
    # example after ``drop_duplicates``). Wrapping the array in a fresh
    # ``pd.Series`` would re-label rows 0..n-1 and misalign / produce NaN.
    df["delivery_time_hours"] = (expected * multiplier).round(2)
    return df
def delay_score(delay):
    """Map a delay in hours to a 1-5 score (5 is best).

    ``<= 0`` -> 5, ``<= 2`` -> 4, ``<= 5`` -> 3, ``<= 8`` -> 2, otherwise 1.
    A NaN delay fails every comparison and therefore scores 1.
    """
    for upper_bound, score in ((0, 5), (2, 4), (5, 3), (8, 2)):
        if delay <= upper_bound:
            return score
    return 1


def performance_label(score):
    """Translate a 1-5 delay score into a text label.

    Args:
        score: value convertible with ``int()``; fractional values truncate.

    Returns:
        One of "excellent", "good", "average", "poor", "critical", or
        "unknown" for out-of-range or non-convertible input (NaN, None, ...).
    """
    labels = {5: "excellent", 4: "good", 3: "average", 2: "poor", 1: "critical"}
    try:
        key = int(score)
    except (TypeError, ValueError, OverflowError):
        # NaN / None / inf cannot become an int; use the same fallback as an
        # out-of-range score instead of raising.
        return "unknown"
    return labels.get(key, "unknown")


def filter_df(df, vehicle, weather, mode, region):
    """Return the rows of ``df`` matching the selected filter values.

    Each selection may be a list of values, a single string, or empty/None.
    A selection that is empty or contains "all" leaves its column unfiltered.

    Args:
        df: prepared DataFrame.
        vehicle: selection for ``vehicle_type``.
        weather: selection for ``weather_condition``.
        mode: selection for ``delivery_mode``.
        region: selection for ``region``.

    Returns:
        A filtered copy of ``df``.
    """
    out = df.copy()
    filters = {
        "vehicle_type": vehicle,
        "weather_condition": weather,
        "delivery_mode": mode,
        "region": region,
    }
    for col, selected in filters.items():
        if isinstance(selected, str):
            # Wrap a bare string: otherwise ``"all" in selected`` is a
            # substring test and ``isin`` rejects non-list input.
            selected = [selected]
        if selected and "all" not in selected:
            out = out[out[col].isin(selected)]
    return out


def kpi_html(df):
    """Render the KPI summary text for the filtered data.

    All metrics fall back to 0 when ``df`` is empty; the cost metric also
    falls back to 0 when ``delivery_cost`` is absent.

    NOTE(review): ``delay_rate``, ``avg_delay`` and ``avg_score`` are computed
    but not referenced by the template visible here -- confirm whether part
    of the markup was lost before removing them.
    """
    total = len(df)
    delay_rate = (df["calculated_delay"].eq("yes").mean() * 100) if total else 0
    avg_delay = df["delay_hours"].mean() if total else 0
    avg_score = df["delay_score"].mean() if total else 0
    cost = df["delivery_cost"].mean() if "delivery_cost" in df.columns and total else 0
    return f"""
Average delivery cost in filtered data: {cost:,.2f}
"""


def group_summary(df, col):
    """Aggregate delay metrics per value of ``col``.

    Returns a DataFrame with one row per group and the columns ``col``,
    ``deliveries``, ``delay_rate`` (percent), ``avg_delay_hours``,
    ``avg_delay_score`` and ``avg_distance_km``, rounded to 2 decimals and
    sorted by ``delay_rate`` then ``avg_delay_hours``, both descending.
    """
    return (
        df.groupby(col, observed=False)
        .agg(
            deliveries=(col, "size"),
            delay_rate=("calculated_delay", lambda x: round((x.eq("yes").mean() * 100), 2)),
            avg_delay_hours=("delay_hours", "mean"),
            avg_delay_score=("delay_score", "mean"),
            avg_distance_km=("distance_km", "mean"),
        )
        .round(2)
        .sort_values(["delay_rate", "avg_delay_hours"], ascending=False)
        .reset_index()
    )


def make_charts(df):
    """Build the five dashboard figures.

    Returns:
        ``(fig_vehicle, fig_weather, fig_region, fig_mode, fig_scatter)``.
        The scatter plot uses at most 2000 rows, sampled with a fixed seed.
    """
    by_vehicle = group_summary(df, "vehicle_type")
    by_weather = group_summary(df, "weather_condition")
    by_region = group_summary(df, "region")
    by_mode = group_summary(df, "delivery_mode")

    fig_vehicle = px.bar(
        by_vehicle, x="vehicle_type", y="delay_rate", text="delay_rate",
        title="Delay Risk by Vehicle Type",
    )
    fig_weather = px.bar(
        by_weather, x="weather_condition", y="avg_delay_hours", text="avg_delay_hours",
        title="Average Delay Hours by Weather",
    )
    fig_region = px.bar(
        by_region, x="region", y="delay_rate", text="delay_rate",
        title="Delay Rate by Region",
    )
    fig_mode = px.bar(
        by_mode, x="delivery_mode", y="avg_delay_score", text="avg_delay_score",
        title="Performance Score by Delivery Mode",
    )
    fig_scatter = px.scatter(
        df.sample(min(len(df), 2000), random_state=42),
        x="distance_km",
        y="delay_hours",
        color="weather_condition",
        hover_data=["vehicle_type", "delivery_mode", "region"],
        title="Distance vs Delay Hours",
    )

    figures = (fig_vehicle, fig_weather, fig_region, fig_mode, fig_scatter)
    for fig in figures:
        fig.update_layout(template="plotly_white", height=430, margin=dict(l=40, r=20, t=60, b=40))
    return figures


def train_feature_importance(df):
    """Fit a random forest on delay yes/no and chart the top feature importances.

    Non-numeric feature columns are one-hot encoded; numeric ones are passed
    through unchanged.

    Args:
        df: prepared DataFrame containing ``calculated_delay`` and any subset
            of the candidate feature columns.

    Returns:
        ``(fig, imp)``: a horizontal bar chart and a DataFrame with columns
        ``factor`` and ``importance`` holding the 12 most important features.
    """
    candidate_cols = [
        "vehicle_type", "weather_condition", "delivery_mode", "region",
        "distance_category", "distance_km", "package_weight_kg",
    ]
    model_cols = [c for c in candidate_cols if c in df.columns]
    X = df[model_cols]
    y = df["calculated_delay"].eq("yes").astype(int)

    # Treat every non-numeric column as categorical. Comparing against the
    # "object"/"category" dtype names alone misses pandas string dtypes,
    # which would then reach the forest as raw text.
    cat = [c for c in model_cols if not pd.api.types.is_numeric_dtype(X[c])]
    num = [c for c in model_cols if c not in cat]

    pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
        ("num", "passthrough", num),
    ])
    clf = RandomForestClassifier(n_estimators=80, random_state=42, max_depth=7)
    pipe = Pipeline([("pre", pre), ("clf", clf)])
    pipe.fit(X, y)

    names = list(pipe.named_steps["pre"].get_feature_names_out())
    importances = pipe.named_steps["clf"].feature_importances_
    imp = (
        pd.DataFrame({"factor": names, "importance": importances})
        .sort_values("importance", ascending=False)
        .head(12)
    )
    # Strip only the leading transformer prefix, never a match inside a name.
    imp["factor"] = imp["factor"].str.replace(r"^(?:cat|num)__", "", regex=True)

    fig = px.bar(
        imp.sort_values("importance"),
        x="importance",
        y="factor",
        orientation="h",
        title="AI Model: Most Important Delay-Risk Drivers",
    )
    fig.update_layout(template="plotly_white", height=470, margin=dict(l=120, r=20, t=60, b=40))
    return fig, imp
pre = ColumnTransformer([("cat", OneHotEncoder(handle_unknown="ignore"), cat), ("num", "passthrough", num)]) clf = RandomForestClassifier(n_estimators=80, random_state=42, max_depth=7) pipe = Pipeline([("pre", pre), ("clf", clf)]) pipe.fit(X, y) names = list(pipe.named_steps["pre"].get_feature_names_out()) importances = pipe.named_steps["clf"].feature_importances_ imp = pd.DataFrame({"factor": names, "importance": importances}).sort_values("importance", ascending=False).head(12) imp["factor"] = imp["factor"].str.replace("cat__", "", regex=False).str.replace("num__", "", regex=False) fig = px.bar(imp.sort_values("importance"), x="importance", y="factor", orientation="h", title="AI Model: Most Important Delay-Risk Drivers") fig.update_layout(template="plotly_white", height=470, margin=dict(l=120, r=20, t=60, b=40)) return fig, imp def auto_insights(df): if len(df) == 0: return "Business challenge: Which operational factors create the highest delivery-delay risk, and what should management do?
Highest-risk factors found in the filtered data:
{top_risk_text}
Best-performing conditions:
{best_text}
Management action: {recommendation}
Qualitative interpretation: Delay risk is not only a numeric issue. It affects customer trust, service reliability, driver planning, and cost control. The dashboard therefore combines quantitative KPIs with qualitative business recommendations.