Spaces:
Sleeping
Sleeping
| import os | |
| import glob | |
| import joblib | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from math import radians, cos, sin, asin, sqrt | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
| from sklearn.decomposition import PCA | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.cluster import KMeans | |
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(page_title="Amazon Delivery Time Prediction", layout="wide")
| # ------------------------- | |
| # Utility helpers | |
| # ------------------------- | |
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in kilometres.

    Coordinates may be floats, strings, or NaN/garbage; any failure to
    convert or compute yields ``np.nan`` instead of raising.
    """
    try:
        phi1, lam1, phi2, lam2 = (
            radians(float(v)) for v in (lat1, lon1, lat2, lon2)
        )
        d_phi = phi2 - phi1
        d_lam = lam2 - lam1
        h = sin(d_phi / 2.0) ** 2 + cos(phi1) * cos(phi2) * sin(d_lam / 2.0) ** 2
        # Earth mean radius ~6371 km.
        return 2.0 * 6371 * asin(sqrt(h))
    except Exception:
        return np.nan
def safe_choose_cols(df, candidates):
    """Return the first candidate column tuple fully present in df, else None."""
    return next(
        (option for option in candidates
         if all(col in df.columns for col in option)),
        None,
    )
def find_file_anywhere(name_substr):
    """Locate a file by name (or name fragment) in common deploy locations.

    Checks a few well-known paths first, then falls back to a recursive
    glob under the current working directory, returning the first path
    whose basename contains ``name_substr``. Returns None when nothing
    matches.
    """
    candidates = [
        f"./{name_substr}",
        os.path.join(os.getcwd(), name_substr),
        f"/workspace/{name_substr}",
        f"/home/app/{name_substr}",
        f"/mnt/data/{name_substr}",
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    # Last resort: scan the whole tree. A substring test already covers
    # exact-name matches, so the previous separate equality check was
    # redundant.
    for path in glob.glob("**/*", recursive=True):
        if name_substr in os.path.basename(path):
            return path
    return None
def safe_make_ohe(**kwargs):
    """Build a dense-output OneHotEncoder across scikit-learn versions.

    scikit-learn renamed ``sparse`` to ``sparse_output`` in 1.2 and removed
    the old keyword in 1.4. Try the modern spelling first so current
    installs emit no deprecation warning, then fall back for legacy ones.
    """
    try:
        return OneHotEncoder(sparse_output=False, **kwargs)
    except TypeError:
        # Pre-1.2 scikit-learn: only the old keyword exists.
        return OneHotEncoder(sparse=False, **kwargs)
def get_ohe_feature_names(ohe, input_features):
    """Return output feature names from a fitted OneHotEncoder.

    Tries the modern API, then the legacy one, then reconstructs names
    manually from ``categories_`` (skipping each first category, mirroring
    ``drop="first"``).
    """
    try:
        return list(ohe.get_feature_names_out(input_features))
    except Exception:
        pass
    try:
        return list(ohe.get_feature_names(input_features))
    except Exception:
        pass
    names = []
    if hasattr(ohe, "categories_"):
        for feat, cats in zip(input_features, ohe.categories_):
            names.extend(f"{feat}_{c}" for c in list(cats)[1:])
    return names
| # ------------------------- | |
| # Robust artifact loaders | |
| # ------------------------- | |
def load_model_if_exists(model_name="xgb_tuned_model_pca.pkl"):
    """Load a pickled model if it can be located; return (model, meta).

    meta carries the resolved path plus either the model's expected feature
    count or the load-error message; both are None when no file is found.
    """
    path = find_file_anywhere(model_name)
    if path is None:
        return None, None
    try:
        loaded = joblib.load(path)
    except Exception as exc:
        return None, {"path": path, "error": str(exc)}
    return loaded, {"path": path, "expected_n": getattr(loaded, "n_features_in_", None)}
def load_training_csv(csv_name="amazon_distance.csv"):
    """Load the training CSV if it can be located; return (DataFrame, path).

    (None, None) when no file is found; (None, path) when parsing fails.
    """
    path = find_file_anywhere(csv_name)
    if path is None:
        return None, None
    try:
        return pd.read_csv(path), path
    except Exception:
        return None, path
| # ------------------------- | |
| # Preprocessing / artifact fitting | |
| # ------------------------- | |
def fit_preprocessors(df, target_pca_n=None):
    """Fit every preprocessing artifact on the training dataframe.

    Rebuilds the feature pipeline the saved model is assumed to have been
    trained with: haversine distance, order-datetime parts, cleaned
    categoricals, frequency/target encodings, KMeans location clusters,
    one-hot encoding, standard scaling and PCA.  Also fits a RandomForest
    fallback regressor when the target column is present.

    Parameters
    ----------
    df : pd.DataFrame
        Raw training data; column names are resolved by fuzzy matching.
    target_pca_n : int | None
        Desired PCA component count (e.g. the saved model's
        ``n_features_in_``); clamped to the available feature count.
        Defaults to min(17, n_features).

    Returns
    -------
    dict
        Keys: ohe, ohe_features, scaler, numeric_cols, kmeans (only when
        fitted), rf (may be None), pca, pca_n, feature_order, train_X_shape.
    """
    artifacts = {}
    # Known naming variants for store/drop coordinate columns.
    lat_candidates = [
        ("Store_Latitude", "Store_Longitude", "Drop_Latitude", "Drop_Longitude"),
        ("Store_Lat", "Store_Long", "Drop_Lat", "Drop_Lng"),
        ("Store_lat", "Store_lng", "Drop_lat", "Drop_lng"),
        ("StoreLatitude","StoreLongitude","DropLatitude","DropLongitude"),
    ]
    loc_cols = safe_choose_cols(df, lat_candidates)
    # Resolve the core columns by fuzzy name matching; any entry may be None.
    core_map = {}
    core_map["Category"] = next((c for c in df.columns if c.lower().startswith("category")), None)
    core_map["Weather"] = next((c for c in df.columns if c.lower().startswith("weather")), None)
    core_map["Traffic"] = next((c for c in df.columns if c.lower().startswith("traffic")), None)
    core_map["Vehicle"] = next((c for c in df.columns if c.lower().startswith("vehicle")), None)
    core_map["Area"] = next((c for c in df.columns if c.lower().startswith("area")), None)
    core_map["Agent_Age"] = next((c for c in df.columns if "agent" in c.lower() and "age" in c.lower()), None)
    core_map["Agent_Rating"] = next((c for c in df.columns if "agent" in c.lower() and "rating" in c.lower()), None)
    core_map["Distance"] = next((c for c in df.columns if c.lower().startswith("distance")), None)
    core_map["Delivery_Time"] = next((c for c in df.columns if "delivery" in c.lower() and "time" in c.lower()), None)
    df2 = df.copy()
    if loc_cols is not None:
        s_lat, s_lon, d_lat, d_lon = loc_cols
        # haversine is symmetric, so passing drop coordinates first still
        # yields the store->drop distance (in km).
        df2["Distance_calc"] = df2.apply(
            lambda r: haversine(r.get(d_lat, np.nan), r.get(d_lon, np.nan),
                                r.get(s_lat, np.nan), r.get(s_lon, np.nan)),
            axis=1
        )
        # Only substitute the computed distance when no Distance column exists.
        if core_map["Distance"] is None:
            df2["Distance"] = df2["Distance_calc"]
    # Derive datetime parts from the first "order ... time/date" column, if any.
    possible_order_time_cols = [c for c in df2.columns if "order" in c.lower() and ("time" in c.lower() or "date" in c.lower())]
    if len(possible_order_time_cols) > 0:
        col = possible_order_time_cols[0]
        try:
            df2["Order_DT"] = pd.to_datetime(df2[col], errors="coerce")
            df2["Order_Year"] = df2["Order_DT"].dt.year
            df2["Order_Month"] = df2["Order_DT"].dt.month
            df2["Order_DayOfWeek"] = df2["Order_DT"].dt.dayofweek
            df2["Is_Weekend"] = df2["Order_DayOfWeek"].isin([5,6]).astype(int)
            # Unparseable timestamps default to hour 0.
            df2["Order_Hour"] = df2["Order_DT"].dt.hour.fillna(0).astype(int)
        except Exception:
            pass
    # Normalised categorical helpers; literal "unknown" placeholders keep the
    # downstream encoders working when a source column is absent.
    if core_map["Traffic"] is not None:
        df2["Traffic_clean"] = df2[core_map["Traffic"]].astype(str).str.lower().str.strip()
    else:
        df2["Traffic_clean"] = "unknown"
    if core_map["Weather"] is not None:
        df2["Weather_clean"] = df2[core_map["Weather"]].astype(str).str.title().str.strip()
    else:
        df2["Weather_clean"] = "Unknown"
    if core_map["Category"] is not None:
        df2["Category_clean"] = df2[core_map["Category"]].astype(str).str.strip()
        # Frequency encoding: each category's share of all rows.
        df2["Category_FreqEnc"] = df2["Category_clean"].map(
            df2["Category_clean"].value_counts(normalize=True))
    else:
        df2["Category_clean"] = "Unknown"
        df2["Category_FreqEnc"] = 0.0
    if core_map["Delivery_Time"] is not None:
        # Target encoding: mean delivery time per category.  No smoothing or
        # cross-fitting, so this leaks the target slightly — kept as-is for
        # parity with however the saved model was trained.
        df2["Category_TE"] = df2.groupby("Category_clean")[core_map["Delivery_Time"]].transform("mean")
    else:
        # Without a target column, reuse the frequency as a stand-in value.
        df2["Category_TE"] = df2["Category_clean"].map(
            df2["Category_clean"].value_counts(normalize=True))
    # Interaction feature fed to the one-hot encoder below.
    df2["Traffic_Weather"] = df2["Traffic_clean"].astype(str) + "_" + df2["Weather_clean"].astype(str)
    # A scalar assigned here broadcasts to every row when the column is missing.
    df2["Vehicle_clean"] = df2[core_map["Vehicle"]].astype(str) if core_map["Vehicle"] in df2.columns else "unknown"
    df2["Area_clean"] = df2[core_map["Area"]].astype(str) if core_map["Area"] in df2.columns else "unknown"
    if loc_cols is not None:
        _, _, d_lat, d_lon = loc_cols
        coords = df2[[d_lat, d_lon]].dropna()
        if len(coords) >= 5:
            # Cluster drop-off locations; rows with missing coords get -1.
            kmeans = KMeans(n_clusters=5, random_state=42)
            kmeans.fit(coords)
            def safe_cluster(r):
                # Per-row predict() is one model call per row — slow on large
                # frames, but NaN-safe.
                if pd.isna(r[d_lat]) or pd.isna(r[d_lon]):
                    return -1
                return int(kmeans.predict([[r[d_lat], r[d_lon]]])[0])
            df2["Location_Cluster"] = df2.apply(safe_cluster, axis=1)
            artifacts["kmeans"] = kmeans
        else:
            df2["Location_Cluster"] = -1
    else:
        df2["Location_Cluster"] = -1
    # One-hot encode cleaned categoricals (drop first level, ignore unseen).
    ohe_features = ["Weather_clean", "Traffic_clean", "Vehicle_clean", "Area_clean", "Traffic_Weather"]
    ohe = safe_make_ohe(handle_unknown="ignore", drop="first")
    fit_df_ohe = df2[ohe_features].fillna("missing")
    ohe.fit(fit_df_ohe)
    artifacts["ohe"] = ohe
    artifacts["ohe_features"] = ohe_features
    # Median-impute then standardise whichever numeric features exist.
    numeric_cols = []
    for colname in ["Distance", "Order_Hour", "Agent_Age", "Agent_Rating", "Category_FreqEnc"]:
        if colname in df2.columns:
            numeric_cols.append(colname)
    if numeric_cols:
        df2[numeric_cols] = df2[numeric_cols].fillna(df2[numeric_cols].median())
    scaler = StandardScaler()
    if numeric_cols:
        scaler.fit(df2[numeric_cols])
    artifacts["scaler"] = scaler
    artifacts["numeric_cols"] = numeric_cols
    # Assemble the full design matrix: scaled numerics + one-hots + extras.
    X_num = scaler.transform(df2[numeric_cols]) if numeric_cols else np.zeros((len(df2), 0))
    X_ohe_raw = ohe.transform(df2[ohe_features].fillna("missing"))
    X_ohe = X_ohe_raw.toarray() if hasattr(X_ohe_raw, "toarray") else X_ohe_raw
    extra_cols = []
    X_extra = np.empty((len(df2), 0))
    if "Category_TE" in df2.columns:
        X_extra = np.hstack([X_extra, df2[["Category_TE"]].fillna(0).values])
        extra_cols.append("Category_TE")
    if "Category_FreqEnc" in df2.columns:
        X_extra = np.hstack([X_extra, df2[["Category_FreqEnc"]].fillna(0).values])
        extra_cols.append("Category_FreqEnc")
    if "Location_Cluster" in df2.columns:
        X_extra = np.hstack([X_extra, df2[["Location_Cluster"]].fillna(-1).values])
        extra_cols.append("Location_Cluster")
    X_full = np.hstack([X_num, X_ohe, X_extra])
    # Optional RandomForest fallback, trained only when a target exists.
    artifacts["rf"] = None
    if core_map["Delivery_Time"] is not None:
        y = df2[core_map["Delivery_Time"]].values
        rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
        rf.fit(X_full, y)
        artifacts["rf"] = rf
    # PCA: honour the requested component count, clamped to the feature count;
    # otherwise default to min(17, n_features).
    # NOTE(review): a numpy integer (e.g. from n_features_in_) would fail the
    # isinstance(int) check and silently fall back to 17 — confirm the caller
    # passes a plain Python int.
    if target_pca_n is not None and isinstance(target_pca_n, int) and target_pca_n > 0:
        desired_n = min(target_pca_n, X_full.shape[1])
    else:
        desired_n = min(17, X_full.shape[1])
    pca = PCA(n_components=desired_n, random_state=42)
    pca.fit(X_full)
    artifacts["pca"] = pca
    artifacts["pca_n"] = desired_n
    # Record the column order of X_full so inference can rebuild it exactly.
    artifacts["feature_order"] = {
        "numeric": numeric_cols,
        "ohe": get_ohe_feature_names(ohe, ohe_features),
        "extra": extra_cols
    }
    artifacts["ohe_features"] = ohe_features
    artifacts["train_X_shape"] = X_full.shape
    return artifacts
| # ------------------------- | |
| # Load model and train CSV | |
| # ------------------------- | |
# Load the saved model (if present) and the training CSV, then refit the
# preprocessing artifacts so inference-time features match training.
model, model_meta = load_model_if_exists("xgb_tuned_model_pca.pkl")
train_df, train_csv_path = load_training_csv("amazon_distance.csv")
artifacts = None
if train_df is not None:
    # Request as many PCA components as the saved model expects, when known.
    target_pca_n = None
    if model_meta is not None and isinstance(model_meta, dict):
        target_pca_n = model_meta.get("expected_n")
    artifacts = fit_preprocessors(train_df, target_pca_n=target_pca_n)
| # ------------------------- | |
| # UI | |
| # ------------------------- | |
st.title("🚚 Amazon Delivery Time Prediction")
st.markdown("Fill the form to predict delivery time")
# Fallback option lists, used only when the training CSV is unavailable.
default_weather_opts = ["Cloudy", "Sunny", "Rainy"]
default_traffic_opts = ["low", "medium", "high", "jam"]
default_vehicle_opts = ["motorcycle", "car", "van"]
default_area_opts = ["Metropolitian", "Urban", "Rural"]
default_category_opts = ["Apparel", "Electronics", "Home"]
def find_column_values(df, keyword, fallback):
    """Distinct sorted values of the first df column whose name contains keyword.

    Returns `fallback` when df is None, no column name matches, the matched
    column has no non-null values, or its values cannot be sorted.
    """
    if df is None:
        return fallback
    matched = next((c for c in df.columns if keyword in c.lower()), None)
    if matched is None:
        return fallback
    try:
        uniques = sorted(df[matched].dropna().unique())
    except Exception:
        return fallback
    return uniques if len(uniques) > 0 else fallback
# Populate selector options from the training data when available; otherwise
# fall back to the hard-coded defaults defined above.
weather_opts = find_column_values(train_df, "weather", default_weather_opts)
traffic_opts = find_column_values(train_df, "traffic", default_traffic_opts)
vehicle_opts = find_column_values(train_df, "vehicle", default_vehicle_opts)
area_opts = find_column_values(train_df, "area", default_area_opts)
category_opts = find_column_values(train_df, "category", default_category_opts)
# Two-column form layout.
left, right = st.columns([2,2])
with left:
    weather = st.selectbox("Weather", options=weather_opts)
    vehicle = st.selectbox("Vehicle", options=vehicle_opts)
    category = st.selectbox("Category", options=category_opts)
    agent_age = st.number_input("Agent_Age (range: 20 - 80)", min_value=20.0, max_value=80.0, value=30.0, step=1.0)
    distance_user = st.number_input("Distance (km)", min_value=0.0, value=5.0, step=0.1)
with right:
    traffic = st.selectbox("Traffic", options=traffic_opts)
    area = st.selectbox("Area", options=area_opts)
    agent_rating = st.number_input("Agent_Rating (range: 0.0 - 5.0)", min_value=0.0, max_value=5.0, value=4.5, step=0.1)
    order_hour = st.number_input("Order_Hour (0 - 23)", min_value=0, max_value=23, value=12, step=1)
# Without the training CSV the preprocessing pipeline cannot be rebuilt.
if train_df is None:
    st.error("Training CSV not found. Place 'amazon_distance.csv' in repo root or /mnt/data/ for preprocessing parity.")
# -------------------------
# Predict button logic
# -------------------------
if st.button("🚀 Predict Delivery Time"):
    # Build a one-row frame with the same *_clean column names the
    # preprocessors (OHE / scaler) were fitted on.
    raw = {
        "Weather_clean": weather,
        "Traffic_clean": traffic,
        "Vehicle_clean": vehicle,
        "Area_clean": area,
        "Category_clean": category,
        "Agent_Age": agent_age,
        "Agent_Rating": agent_rating,
        # BUGFIX: keep kilometres. The form label says km and training
        # distances are km (haversine output); the previous `* 1000`
        # fed metres into a scaler fitted on km, inflating the feature 1000x.
        # TODO(review): confirm the CSV's Distance column is also in km.
        "Distance": distance_user,
        "Order_Hour": order_hour,
        "Traffic_Weather": f"{traffic}_{weather}"
    }
    X_raw = pd.DataFrame([raw])
    if artifacts is None:
        st.error("Missing preprocessing artifacts (training CSV not loaded). Cannot preprocess reliably.")
        st.stop()
    ohe = artifacts["ohe"]
    numeric_cols = artifacts["numeric_cols"]
    scaler = artifacts["scaler"]
    pca = artifacts["pca"]
    rf = artifacts["rf"]
    ohe_features = artifacts["ohe_features"]
    # Guarantee every numeric column the scaler expects is present and filled.
    for nc in numeric_cols:
        if nc not in X_raw.columns:
            X_raw[nc] = 0
    X_raw[numeric_cols] = X_raw[numeric_cols].fillna(0)
    X_num = scaler.transform(X_raw[numeric_cols]) if numeric_cols else np.zeros((1, 0))
    X_ohe_raw = ohe.transform(X_raw[ohe_features].fillna("missing"))
    X_ohe = X_ohe_raw.toarray() if hasattr(X_ohe_raw, "toarray") else X_ohe_raw
    # Rebuild the "extra" features in the same order used at fit time.
    extras = []
    cat_col = next((c for c in train_df.columns if c.lower().startswith("category")), None) if train_df is not None else None
    if "Category_TE" in artifacts["feature_order"]["extra"]:
        # NOTE(review): training computed the target encoding on stripped
        # category names; this recomputes it on the raw column — verify the
        # two agree for this dataset.
        try:
            target_col = next((c for c in train_df.columns if "delivery" in c.lower() and "time" in c.lower()), None)
            if cat_col is not None and target_col is not None:
                te_map = train_df.groupby(cat_col)[target_col].mean()
                te_val = te_map.get(category, 0) if hasattr(te_map, "get") else 0
            else:
                te_val = 0
        except Exception:
            te_val = 0
        extras.append([te_val])
    if "Category_FreqEnc" in artifacts["feature_order"]["extra"]:
        try:
            freq_val = train_df[cat_col].value_counts(normalize=True).get(category, 0) if cat_col in train_df.columns else 0
        except Exception:
            freq_val = 0
        extras.append([freq_val])
    if "Location_Cluster" in artifacts["feature_order"]["extra"]:
        # The form collects no drop-off coordinates, so use the same -1
        # sentinel assigned to unknown locations at fit time.
        extras.append([-1])
    if len(extras) > 0:
        cols = [np.asarray(e).reshape(1, -1) for e in extras]
        X_extra = np.hstack(cols)
    else:
        X_extra = np.empty((1, 0))
    X_full = np.hstack([X_num, X_ohe, X_extra])
    X_pca = pca.transform(X_full)
    chosen_prediction = None
    if model is not None:
        # Reconcile feature-count mismatches between our PCA output and the
        # saved model by truncating or zero-padding. This is a stopgap: a
        # mismatch means full preprocessing parity was not achieved.
        expected_n = getattr(model, "n_features_in_", None)
        if expected_n is not None and expected_n != X_pca.shape[1]:
            if X_pca.shape[1] > expected_n:
                X_pca_used = X_pca[:, :expected_n]
            else:
                pad = np.zeros((X_pca.shape[0], expected_n - X_pca.shape[1]))
                X_pca_used = np.hstack([X_pca, pad])
        else:
            X_pca_used = X_pca
        try:
            raw_pred = model.predict(X_pca_used)
            chosen_prediction = float(np.array(raw_pred).ravel()[0])
        except Exception:
            chosen_prediction = None
    # Fallback 1: the RandomForest trained on the un-PCA'd feature matrix.
    if chosen_prediction is None and rf is not None:
        try:
            rf_pred = rf.predict(X_full)
            chosen_prediction = float(np.array(rf_pred).ravel()[0])
        except Exception:
            pass
    # Fallback 2: the training-set median delivery time.
    if chosen_prediction is None and train_df is not None:
        target_col = next((c for c in train_df.columns if "delivery" in c.lower() and "time" in c.lower()), None)
        if target_col is not None:
            chosen_prediction = float(train_df[target_col].median())
    if chosen_prediction is None:
        st.error("Failed to produce a prediction.")
    else:
        st.success(f"Predicted Delivery Time: {chosen_prediction:.2f} minutes")