Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
Predictive_Modelling_+_ARIMA_Forecasting_Urban_Mobility_Startup_–_Pricing_&_Satisfaction_Optimization (1).ipynb
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 1,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"id": "2ERyVGhbyopK"
|
| 22 |
+
},
|
| 23 |
+
"outputs": [],
|
| 24 |
+
"source": [
|
| 25 |
+
"# Role: Data Analyst\n",
|
| 26 |
+
"# Pipeline:\n",
|
| 27 |
+
"# CLEAN > ENCODE > SPLIT 80-20 > RANDOM FOREST CLASSIFICATION (satisfaction)\n",
|
| 28 |
+
"# > ARIMA REVENUE FORECAST > FEATURE IMPORTANCE > EVALUATION\n",
|
| 29 |
+
"# =============================================================================\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"import pandas as pd\n",
|
| 32 |
+
"import numpy as np\n",
|
| 33 |
+
"import matplotlib.pyplot as plt\n",
|
| 34 |
+
"import matplotlib.gridspec as gridspec\n",
|
| 35 |
+
"import seaborn as sns\n",
|
| 36 |
+
"import warnings\n",
|
| 37 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"from sklearn.ensemble import RandomForestClassifier\n",
|
| 40 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 41 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
| 42 |
+
"from sklearn.metrics import (classification_report,\n",
|
| 43 |
+
" ConfusionMatrixDisplay,\n",
|
| 44 |
+
" accuracy_score)\n",
|
| 45 |
+
"from statsmodels.tsa.arima.model import ARIMA\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
|
| 48 |
+
"sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
|
| 49 |
+
"plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
|
| 50 |
+
" \"axes.labelsize\": 11})"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"source": [
|
| 56 |
+
"# SECTION 1 – LOAD DATA FROM NOTEBOOK 1\n",
|
| 57 |
+
"# =============================================================================\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"ride_df = pd.read_csv(\"/content/ride_data_clean.csv\")\n",
|
| 60 |
+
"review_df = pd.read_csv(\"/content/review_data_clean.csv\")\n",
|
| 61 |
+
"merged_df = pd.read_csv(\"/content/merged_summary.csv\")\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"print(f\"Rides: {ride_df.shape} | Reviews: {review_df.shape} | Merged: {merged_df.shape}\")"
|
| 64 |
+
],
|
| 65 |
+
"metadata": {
|
| 66 |
+
"colab": {
|
| 67 |
+
"base_uri": "https://localhost:8080/"
|
| 68 |
+
},
|
| 69 |
+
"id": "63oe81VNzi8_",
|
| 70 |
+
"outputId": "245fcee0-e3b0-41ed-9fb3-3ec7c4fd6eba"
|
| 71 |
+
},
|
| 72 |
+
"execution_count": 3,
|
| 73 |
+
"outputs": [
|
| 74 |
+
{
|
| 75 |
+
"output_type": "stream",
|
| 76 |
+
"name": "stdout",
|
| 77 |
+
"text": [
|
| 78 |
+
"Rides: (1000, 12) | Reviews: (1500, 7) | Merged: (16, 11)\n"
|
| 79 |
+
]
|
| 80 |
+
}
|
| 81 |
+
]
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"cell_type": "code",
|
| 85 |
+
"source": [
|
| 86 |
+
"# SECTION 2 – CLASSIFICATION: PREDICT USER SATISFACTION (HIGH vs LOW)\n",
|
| 87 |
+
"# Dependent variable : SatisfactionLabel (High = rating ≥ 4, Low otherwise)\n",
|
| 88 |
+
"# Independent variables: final_price_eur, distance_km, duration_min,\n",
|
| 89 |
+
"# discount_pct, cancelled, ride_type, time_slot\n",
|
| 90 |
+
"# =============================================================================\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"# ── 2a. Build classification dataframe ───────────────────────────────────────\n",
|
| 93 |
+
"clf_df = ride_df[[\n",
|
| 94 |
+
" \"final_price_eur\", \"distance_km\", \"duration_min\",\n",
|
| 95 |
+
" \"discount_pct\", \"cancelled\", \"ride_type\", \"time_slot\", \"rating\"\n",
|
| 96 |
+
"]].copy()\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"clf_df[\"SatisfactionLabel\"] = (clf_df[\"rating\"] >= 4).astype(int) # 1=High, 0=Low\n",
|
| 99 |
+
"clf_df.drop(columns=\"rating\", inplace=True)\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"# ── 2b. Encode categoricals ───────────────────────────────────────────────────\n",
|
| 102 |
+
"le_rt = LabelEncoder()\n",
|
| 103 |
+
"le_ts = LabelEncoder()\n",
|
| 104 |
+
"clf_df[\"ride_type_enc\"] = le_rt.fit_transform(clf_df[\"ride_type\"])\n",
|
| 105 |
+
"clf_df[\"time_slot_enc\"] = le_ts.fit_transform(clf_df[\"time_slot\"])\n",
|
| 106 |
+
"clf_df.drop(columns=[\"ride_type\", \"time_slot\"], inplace=True)\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"X = clf_df.drop(columns=\"SatisfactionLabel\")\n",
|
| 109 |
+
"y = clf_df[\"SatisfactionLabel\"]\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"# ── 2c. Train / test split 80-20 ──────────────────────────────────────────────\n",
|
| 112 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
| 113 |
+
" X, y, test_size=0.20, random_state=42, stratify=y)\n",
|
| 114 |
+
"\n",
|
| 115 |
+
"print(f\"\\nTrain size: {len(X_train)} | Test size: {len(X_test)}\")\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"# ── 2d. Random Forest ─────────────────────────────────────────────────────────\n",
|
| 118 |
+
"rf = RandomForestClassifier(n_estimators=200, max_depth=8,\n",
|
| 119 |
+
" random_state=42, class_weight=\"balanced\")\n",
|
| 120 |
+
"rf.fit(X_train, y_train)\n",
|
| 121 |
+
"y_pred = rf.predict(X_test)\n",
|
| 122 |
+
"\n",
|
| 123 |
+
"print(f\"\\nClassification Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
|
| 124 |
+
"print(\"\\nClassification Report:\")\n",
|
| 125 |
+
"print(classification_report(y_test, y_pred,\n",
|
| 126 |
+
" target_names=[\"Low Satisfaction\", \"High Satisfaction\"]))\n"
|
| 127 |
+
],
|
| 128 |
+
"metadata": {
|
| 129 |
+
"colab": {
|
| 130 |
+
"base_uri": "https://localhost:8080/"
|
| 131 |
+
},
|
| 132 |
+
"id": "nv-OXM0nzywU",
|
| 133 |
+
"outputId": "335d1c9d-6d2b-4878-9aed-2e3e05e11905"
|
| 134 |
+
},
|
| 135 |
+
"execution_count": 4,
|
| 136 |
+
"outputs": [
|
| 137 |
+
{
|
| 138 |
+
"output_type": "stream",
|
| 139 |
+
"name": "stdout",
|
| 140 |
+
"text": [
|
| 141 |
+
"\n",
|
| 142 |
+
"Train size: 800 | Test size: 200\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"Classification Accuracy: 0.6450\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"Classification Report:\n",
|
| 147 |
+
" precision recall f1-score support\n",
|
| 148 |
+
"\n",
|
| 149 |
+
" Low Satisfaction 0.22 0.16 0.18 50\n",
|
| 150 |
+
"High Satisfaction 0.74 0.81 0.77 150\n",
|
| 151 |
+
"\n",
|
| 152 |
+
" accuracy 0.65 200\n",
|
| 153 |
+
" macro avg 0.48 0.48 0.48 200\n",
|
| 154 |
+
" weighted avg 0.61 0.65 0.63 200\n",
|
| 155 |
+
"\n"
|
| 156 |
+
]
|
| 157 |
+
}
|
| 158 |
+
]
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"cell_type": "code",
|
| 162 |
+
"source": [
|
| 163 |
+
"# SECTION 3 – ARIMA REVENUE FORECAST\n",
|
| 164 |
+
"# Simulate weekly total revenue (synthetic series, not aggregated from ride_df) → forecast next 12 weeks for 3 sample cities\n",
|
| 165 |
+
"# =============================================================================\n",
|
| 166 |
+
"\n",
|
| 167 |
+
"# Generate a synthetic weekly revenue time series per city (realistic trend + noise)\n",
|
| 168 |
+
"np.random.seed(7)\n",
|
| 169 |
+
"weeks = pd.date_range(\"2022-01-03\", periods=104, freq=\"W\") # 2 years weekly\n",
|
| 170 |
+
"cities_sel = [\"Paris\", \"Berlin\", \"Madrid\"]\n",
|
| 171 |
+
"\n",
|
| 172 |
+
"city_rev = {}\n",
|
| 173 |
+
"for c in cities_sel:\n",
|
| 174 |
+
" trend = np.linspace(80_000, 130_000, 104)\n",
|
| 175 |
+
" season = 8_000 * np.sin(np.linspace(0, 4 * np.pi, 104))\n",
|
| 176 |
+
" noise = np.random.normal(0, 5_000, 104)\n",
|
| 177 |
+
" city_rev[c] = pd.Series(trend + season + noise, index=weeks)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"FORECAST_STEPS = 12"
|
| 180 |
+
],
|
| 181 |
+
"metadata": {
|
| 182 |
+
"id": "2tsC_F9oz4JO"
|
| 183 |
+
},
|
| 184 |
+
"execution_count": 5,
|
| 185 |
+
"outputs": []
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"cell_type": "code",
|
| 189 |
+
"source": [
|
| 190 |
+
"# SECTION 4 – VISUALIZATIONS (6 charts)\n",
|
| 191 |
+
"# =============================================================================\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"fig = plt.figure(figsize=(20, 24))\n",
|
| 194 |
+
"fig.suptitle(\"Urban Mobility – Predictive Analytics & Revenue Forecasting\",\n",
|
| 195 |
+
" fontsize=17, fontweight=\"bold\", y=0.99)\n",
|
| 196 |
+
"gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.50, wspace=0.35)\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"# ── Chart 1: Feature importance ───────────────────────────────────────────────\n",
|
| 199 |
+
"ax1 = fig.add_subplot(gs[0, 0])\n",
|
| 200 |
+
"feat_imp = pd.Series(rf.feature_importances_, index=X.columns).rename(index={ # map labels by column name so each bar keeps its own feature after sorting\n",
|
| 201 |
+
" \"discount_pct\": \"Discount %\", \"cancelled\": \"Cancelled\", \"ride_type_enc\": \"Ride Type\", \"time_slot_enc\": \"Time Slot\",\n",
|
| 202 |
+
" \"duration_min\": \"Duration (min)\", \"distance_km\": \"Distance (km)\", \"final_price_eur\": \"Final Price (€)\"})\n",
|
| 203 |
+
"feat_imp.sort_values().plot(kind=\"barh\", ax=ax1, color=PALETTE[1])\n",
|
| 204 |
+
"ax1.set_title(\"Random Forest – Feature Importances\\n(Satisfaction Classification)\")\n",
|
| 205 |
+
"ax1.set_xlabel(\"Importance Score\")\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"# ── Chart 2: Confusion matrix ─────────────────────────────────────────────────\n",
|
| 208 |
+
"ax2 = fig.add_subplot(gs[0, 1])\n",
|
| 209 |
+
"ConfusionMatrixDisplay.from_predictions(\n",
|
| 210 |
+
" y_test, y_pred,\n",
|
| 211 |
+
" display_labels=[\"Low\", \"High\"],\n",
|
| 212 |
+
" colorbar=False, cmap=\"Blues\", ax=ax2)\n",
|
| 213 |
+
"ax2.set_title(\"Confusion Matrix\\n(Satisfaction: Low vs High)\")\n",
|
| 214 |
+
"\n",
|
| 215 |
+
"# ── Charts 3-5: ARIMA forecasts per city ─────────────────────────────────────\n",
|
| 216 |
+
"arima_positions = [(1, 0), (1, 1), (2, 0)]\n",
|
| 217 |
+
"arima_models = {}\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"for idx, city in enumerate(cities_sel):\n",
|
| 220 |
+
" row, col = arima_positions[idx]\n",
|
| 221 |
+
" ax = fig.add_subplot(gs[row, col])\n",
|
| 222 |
+
" series = city_rev[city]\n",
|
| 223 |
+
"\n",
|
| 224 |
+
" model = ARIMA(series, order=(2, 1, 2))\n",
|
| 225 |
+
" result = model.fit()\n",
|
| 226 |
+
" arima_models[city] = result\n",
|
| 227 |
+
"\n",
|
| 228 |
+
" forecast = result.get_forecast(steps=FORECAST_STEPS)\n",
|
| 229 |
+
" forecast_df = forecast.summary_frame(alpha=0.10)\n",
|
| 230 |
+
" future_idx = pd.date_range(series.index[-1] + pd.Timedelta(weeks=1),\n",
|
| 231 |
+
" periods=FORECAST_STEPS, freq=\"W\")\n",
|
| 232 |
+
" forecast_df.index = future_idx\n",
|
| 233 |
+
"\n",
|
| 234 |
+
" ax.plot(series, color=PALETTE[0], linewidth=1.2, label=\"Historical\")\n",
|
| 235 |
+
" ax.plot(forecast_df[\"mean\"], color=PALETTE[2],\n",
|
| 236 |
+
" linewidth=2, linestyle=\"--\", label=\"Forecast\")\n",
|
| 237 |
+
" ax.fill_between(forecast_df.index,\n",
|
| 238 |
+
" forecast_df[\"mean_ci_lower\"],\n",
|
| 239 |
+
" forecast_df[\"mean_ci_upper\"],\n",
|
| 240 |
+
" alpha=0.25, color=PALETTE[2], label=\"90% CI\")\n",
|
| 241 |
+
" ax.set_title(f\"ARIMA Revenue Forecast – {city}\")\n",
|
| 242 |
+
" ax.set_ylabel(\"Weekly Revenue (€)\")\n",
|
| 243 |
+
" ax.set_xlabel(\"\")\n",
|
| 244 |
+
" ax.legend(fontsize=8)\n",
|
| 245 |
+
" ax.yaxis.set_major_formatter(\n",
|
| 246 |
+
" plt.FuncFormatter(lambda v, _: f\"€{v/1000:.0f}k\"))\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"# ── Chart 6 (last cell): Price sensitivity – avg rating by price bucket ───────\n",
|
| 249 |
+
"ax6 = fig.add_subplot(gs[2, 1])\n",
|
| 250 |
+
"ride_df[\"price_bucket\"] = pd.cut(ride_df[\"final_price_eur\"],\n",
|
| 251 |
+
" bins=[0, 2, 3.5, 5, 6.5, 10],\n",
|
| 252 |
+
" labels=[\"<2\", \"2–3.5\", \"3.5–5\", \"5–6.5\", \">6.5\"])\n",
|
| 253 |
+
"price_sens = ride_df.groupby(\"price_bucket\", observed=True)[\"rating\"].mean()\n",
|
| 254 |
+
"ax6.bar(price_sens.index, price_sens.values, color=PALETTE[3], edgecolor=\"white\")\n",
|
| 255 |
+
"ax6.set_title(\"Price Sensitivity – Avg. Rating by Price Bucket\")\n",
|
| 256 |
+
"ax6.set_xlabel(\"Final Price (€)\")\n",
|
| 257 |
+
"ax6.set_ylabel(\"Avg. Rating\")\n",
|
| 258 |
+
"ax6.set_ylim(3, 5)\n",
|
| 259 |
+
"for p, v in zip(price_sens.index, price_sens.values):\n",
|
| 260 |
+
" ax6.text(p, v + 0.02, f\"{v:.2f}★\", ha=\"center\", fontsize=9)\n",
|
| 261 |
+
"\n",
|
| 262 |
+
"plt.savefig(\"/content/notebook2_models_output.png\", bbox_inches=\"tight\")\n",
|
| 263 |
+
"plt.close()\n",
|
| 264 |
+
"print(\"\\n✅ Model output chart saved → notebook2_models_output.png\")"
|
| 265 |
+
],
|
| 266 |
+
"metadata": {
|
| 267 |
+
"colab": {
|
| 268 |
+
"base_uri": "https://localhost:8080/"
|
| 269 |
+
},
|
| 270 |
+
"id": "o7Y2B81jz89H",
|
| 271 |
+
"outputId": "35a76ae9-1ac1-4de8-bfba-6c349af72d11"
|
| 272 |
+
},
|
| 273 |
+
"execution_count": 7,
|
| 274 |
+
"outputs": [
|
| 275 |
+
{
|
| 276 |
+
"output_type": "stream",
|
| 277 |
+
"name": "stdout",
|
| 278 |
+
"text": [
|
| 279 |
+
"\n",
|
| 280 |
+
"✅ Model output chart saved → notebook2_models_output.png\n"
|
| 281 |
+
]
|
| 282 |
+
}
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"cell_type": "code",
|
| 287 |
+
"source": [
|
| 288 |
+
"# SECTION 5 – SAVE FORECAST TABLE\n",
|
| 289 |
+
"# =============================================================================\n",
|
| 290 |
+
"all_forecasts = []\n",
|
| 291 |
+
"for city in cities_sel:\n",
|
| 292 |
+
" fc = arima_models[city].get_forecast(steps=FORECAST_STEPS).summary_frame(alpha=0.10)\n",
|
| 293 |
+
" fc.index = pd.date_range(city_rev[city].index[-1] + pd.Timedelta(weeks=1),\n",
|
| 294 |
+
" periods=FORECAST_STEPS, freq=\"W\")\n",
|
| 295 |
+
" fc[\"city\"] = city\n",
|
| 296 |
+
" all_forecasts.append(fc[[\"city\", \"mean\", \"mean_ci_lower\", \"mean_ci_upper\"]])\n",
|
| 297 |
+
"\n",
|
| 298 |
+
"forecast_table = pd.concat(all_forecasts)\n",
|
| 299 |
+
"forecast_table.columns = [\"city\", \"forecast_revenue\", \"ci_lower_90\", \"ci_upper_90\"]\n",
|
| 300 |
+
"forecast_table.to_csv(\"/content/arima_forecast_table.csv\")\n",
|
| 301 |
+
"print(\"✅ Forecast table saved → arima_forecast_table.csv\")\n",
|
| 302 |
+
"print(forecast_table.head(6).round(0).to_string())\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"print(\"\\n══════════════════════════════════════════\")\n",
|
| 305 |
+
"print(\" NOTEBOOK 2 COMPLETE\")\n",
|
| 306 |
+
"print(\"══════════════════════════════════════════\")"
|
| 307 |
+
],
|
| 308 |
+
"metadata": {
|
| 309 |
+
"colab": {
|
| 310 |
+
"base_uri": "https://localhost:8080/"
|
| 311 |
+
},
|
| 312 |
+
"id": "KY_odiwF0h7r",
|
| 313 |
+
"outputId": "7e77d938-59cc-4241-e4b9-37dc7cb6eaf5"
|
| 314 |
+
},
|
| 315 |
+
"execution_count": 9,
|
| 316 |
+
"outputs": [
|
| 317 |
+
{
|
| 318 |
+
"output_type": "stream",
|
| 319 |
+
"name": "stdout",
|
| 320 |
+
"text": [
|
| 321 |
+
"✅ Forecast table saved → arima_forecast_table.csv\n",
|
| 322 |
+
" city forecast_revenue ci_lower_90 ci_upper_90\n",
|
| 323 |
+
"2024-01-07 Paris 124853.0 115042.0 134665.0\n",
|
| 324 |
+
"2024-01-14 Paris 124946.0 112850.0 137043.0\n",
|
| 325 |
+
"2024-01-21 Paris 124678.0 110776.0 138581.0\n",
|
| 326 |
+
"2024-01-28 Paris 124855.0 109164.0 140546.0\n",
|
| 327 |
+
"2024-02-04 Paris 124759.0 107576.0 141942.0\n",
|
| 328 |
+
"2024-02-11 Paris 124808.0 106197.0 143420.0\n",
|
| 329 |
+
"\n",
|
| 330 |
+
"══════════════════════════════════════════\n",
|
| 331 |
+
" NOTEBOOK 2 COMPLETE\n",
|
| 332 |
+
"══════════════════════════════════════════\n"
|
| 333 |
+
]
|
| 334 |
+
}
|
| 335 |
+
]
|
| 336 |
+
}
|
| 337 |
+
]
|
| 338 |
+
}
|
Real_World_Data_Processing_EDA.ipynb
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 4,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"colab": {
|
| 22 |
+
"base_uri": "https://localhost:8080/"
|
| 23 |
+
},
|
| 24 |
+
"id": "r-G_BpFaLoa4",
|
| 25 |
+
"outputId": "6ce2e622-9704-47a9-a54d-17ea18432dfd"
|
| 26 |
+
},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"output_type": "stream",
|
| 30 |
+
"name": "stdout",
|
| 31 |
+
"text": [
|
| 32 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
|
| 33 |
+
"Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
|
| 34 |
+
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
|
| 35 |
+
"Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
|
| 36 |
+
"Requirement already satisfied: vaderSentiment in /usr/local/lib/python3.12/dist-packages (3.3.2)\n",
|
| 37 |
+
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)\n",
|
| 38 |
+
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
|
| 39 |
+
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
|
| 40 |
+
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)\n",
|
| 41 |
+
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
|
| 42 |
+
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
|
| 43 |
+
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1)\n",
|
| 44 |
+
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n",
|
| 45 |
+
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
|
| 46 |
+
"Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
|
| 47 |
+
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
|
| 48 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n",
|
| 49 |
+
"Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)\n",
|
| 50 |
+
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.3)\n",
|
| 51 |
+
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)\n",
|
| 52 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
|
| 53 |
+
"Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.7)\n",
|
| 54 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n",
|
| 55 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n",
|
| 56 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.2.25)\n"
|
| 57 |
+
]
|
| 58 |
+
}
|
| 59 |
+
],
|
| 60 |
+
"source": [
|
| 61 |
+
"# --- 0. INSTALL DEPENDENCIES ---\n",
|
| 62 |
+
"!pip install pandas numpy matplotlib seaborn vaderSentiment scikit-learn\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"import pandas as pd\n",
|
| 65 |
+
"import numpy as np\n",
|
| 66 |
+
"import matplotlib.pyplot as plt\n",
|
| 67 |
+
"import matplotlib.gridspec as gridspec\n",
|
| 68 |
+
"import seaborn as sns\n",
|
| 69 |
+
"import warnings\n",
|
| 70 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
| 71 |
+
"\n",
|
| 72 |
+
"from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"# \u2500\u2500 Styling \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 75 |
+
"PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
|
| 76 |
+
"sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
|
| 77 |
+
"plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
|
| 78 |
+
" \"axes.labelsize\": 11, \"font.family\": \"DejaVu Sans\"})"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"source": [
|
| 84 |
+
"import pandas as pd\n",
|
| 85 |
+
"import numpy as np\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"N_RIDES = 1000\n",
|
| 88 |
+
"cities = [\"Berlin\", \"Munich\", \"Hamburg\", \"Cologne\"]\n",
|
| 89 |
+
"ride_types = [\"Standard\", \"Premium\", \"XL\", \"Eco\"]\n",
|
| 90 |
+
"time_slots = [\"Morning (6-10)\", \"Midday (10-14)\", \"Afternoon (14-18)\", \"Evening (18-22)\"]\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"# SECTION 1 \u2013 SIMULATE \"SCRAPED / FOUND\" REAL-WORLD DATA\n",
|
| 93 |
+
"# (In production: replace with actual web-scraped or API-fetched CSVs)\n",
|
| 94 |
+
"# \u2500\u2500 1a. Ride-level transaction data (quantitative) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 95 |
+
"ride_data = pd.DataFrame({\n",
|
| 96 |
+
" \"ride_id\": range(1, N_RIDES + 1),\n",
|
| 97 |
+
" \"city\": np.random.choice(cities, N_RIDES),\n",
|
| 98 |
+
" \"ride_type\": np.random.choice(ride_types, N_RIDES, p=[0.40, 0.30, 0.20, 0.10]),\n",
|
| 99 |
+
" \"time_slot\": np.random.choice(time_slots, N_RIDES, p=[0.25, 0.30, 0.30, 0.15]),\n",
|
| 100 |
+
" \"distance_km\": np.round(np.random.exponential(4, N_RIDES) + 0.5, 2),\n",
|
| 101 |
+
" \"duration_min\": np.round(np.random.normal(18, 6, N_RIDES).clip(3), 1),\n",
|
| 102 |
+
" \"base_price_eur\":np.round(np.random.uniform(1.5, 8.0, N_RIDES), 2),\n",
|
| 103 |
+
" \"discount_pct\": np.random.choice([0, 5, 10, 15, 20], N_RIDES,\n",
|
| 104 |
+
" p=[0.50, 0.20, 0.15, 0.10, 0.05]),\n",
|
| 105 |
+
" \"rating\": np.random.choice([1, 2, 3, 4, 5], N_RIDES,\n",
|
| 106 |
+
" p=[0.03, 0.07, 0.15, 0.40, 0.35]),\n",
|
| 107 |
+
" \"cancelled\": np.random.choice([0, 1], N_RIDES, p=[0.93, 0.07]),\n",
|
| 108 |
+
"})\n",
|
| 109 |
+
"\n",
|
| 110 |
+
"# Introduce 3 % missing values in price and rating (realistic)\n",
|
| 111 |
+
"for col in [\"base_price_eur\", \"rating\"]:\n",
|
| 112 |
+
" ride_data.loc[ride_data.sample(frac=0.03).index, col] = np.nan\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"# Derived fields\n",
|
| 115 |
+
"ride_data[\"final_price_eur\"] = np.round(\n",
|
| 116 |
+
" ride_data[\"base_price_eur\"] * (1 - ride_data[\"discount_pct\"] / 100), 2)\n",
|
| 117 |
+
"ride_data[\"price_per_km\"] = np.round(\n",
|
| 118 |
+
" ride_data[\"final_price_eur\"] / ride_data[\"distance_km\"], 3)"
|
| 119 |
+
],
|
| 120 |
+
"metadata": {
|
| 121 |
+
"id": "gtbUjaaWMfH-"
|
| 122 |
+
},
|
| 123 |
+
"execution_count": 2,
|
| 124 |
+
"outputs": []
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"cell_type": "code",
|
| 128 |
+
"source": [
|
| 129 |
+
"# \u2500\u2500 1b. App-review data (qualitative) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 130 |
+
"positive_reviews = [\n",
|
| 131 |
+
" \"Absolutely love the e-scooter! Fast, clean, affordable.\",\n",
|
| 132 |
+
" \"Seamless booking and the bike was in great condition.\",\n",
|
| 133 |
+
" \"Best way to get around the city. Highly recommend!\",\n",
|
| 134 |
+
" \"Super convenient, saved me 20 minutes every morning.\",\n",
|
| 135 |
+
" \"Eco-friendly and cheap. Will use every day.\",\n",
|
| 136 |
+
" \"App works perfectly. Scooter was fully charged.\",\n",
|
| 137 |
+
" \"Great service, prices are very fair for the distance.\",\n",
|
| 138 |
+
" \"Customer support was helpful and friendly.\",\n",
|
| 139 |
+
"]\n",
|
| 140 |
+
"negative_reviews = [\n",
|
| 141 |
+
" \"The scooter was broken when I unlocked it. Very frustrating.\",\n",
|
| 142 |
+
" \"Overcharged for a 2 km ride. Pricing is confusing.\",\n",
|
| 143 |
+
" \"App crashed three times before I could complete the booking.\",\n",
|
| 144 |
+
" \"Terrible availability in my neighbourhood. Always empty.\",\n",
|
| 145 |
+
" \"The e-bike seat was damaged and uncomfortable.\",\n",
|
| 146 |
+
" \"Hidden fees are unacceptable. Totally misleading pricing.\",\n",
|
| 147 |
+
" \"Waited 10 minutes to connect to a scooter. Wasted my time.\",\n",
|
| 148 |
+
" \"No customer support response after a billing error.\",\n",
|
| 149 |
+
"]\n",
|
| 150 |
+
"neutral_reviews = [\n",
|
| 151 |
+
" \"It was okay. Nothing special, works as expected.\",\n",
|
| 152 |
+
" \"Decent ride, though a bit pricey compared to the metro.\",\n",
|
| 153 |
+
" \"Average experience. Some improvements needed in the app.\",\n",
|
| 154 |
+
" \"Not bad, but parking zones need to be clearer.\",\n",
|
| 155 |
+
" \"Works fine most of the time. Occasional glitches.\",\n",
|
| 156 |
+
"]\n",
|
| 157 |
+
"\n",
|
| 158 |
+
"N_REVIEWS = 1500 # Define N_REVIEWS here\n",
|
| 159 |
+
"all_reviews = positive_reviews * 30 + negative_reviews * 20 + neutral_reviews * 10\n",
|
| 160 |
+
"review_data = pd.DataFrame({\n",
|
| 161 |
+
" \"review_id\": range(1, N_REVIEWS + 1),\n",
|
| 162 |
+
" \"city\": np.random.choice(cities, N_REVIEWS),\n",
|
| 163 |
+
" \"ride_type\": np.random.choice(ride_types, N_REVIEWS),\n",
|
| 164 |
+
" \"review_text\":np.random.choice(all_reviews, N_REVIEWS),\n",
|
| 165 |
+
" \"review_date\":pd.date_range(\"2024-01-01\", periods=N_REVIEWS, freq=\"14h\"),\n",
|
| 166 |
+
"})\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"print(\"\u2705 Data generated\")\n",
|
| 169 |
+
"print(f\" ride_data : {ride_data.shape}\")\n",
|
| 170 |
+
"print(f\" review_data: {review_data.shape}\")"
|
| 171 |
+
],
|
| 172 |
+
"metadata": {
|
| 173 |
+
"colab": {
|
| 174 |
+
"base_uri": "https://localhost:8080/"
|
| 175 |
+
},
|
| 176 |
+
"id": "CzPxA0rAoaHZ",
|
| 177 |
+
"outputId": "5174afc6-c010-4b9a-9dc8-8f41a6ac4b56"
|
| 178 |
+
},
|
| 179 |
+
"execution_count": 4,
|
| 180 |
+
"outputs": [
|
| 181 |
+
{
|
| 182 |
+
"output_type": "stream",
|
| 183 |
+
"name": "stdout",
|
| 184 |
+
"text": [
|
| 185 |
+
"\u2705 Data generated\n",
|
| 186 |
+
" ride_data : (1000, 12)\n",
|
| 187 |
+
" review_data: (1500, 5)\n"
|
| 188 |
+
]
|
| 189 |
+
}
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"cell_type": "code",
|
| 194 |
+
"source": [
|
| 195 |
+
"# SECTION 2 \u2013 DATA CLEANING\n",
|
| 196 |
+
"# =============================================================================\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"# \u2500\u2500 2a. Ride data \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 199 |
+
"print(\"\\n\u2500\u2500 Missing values BEFORE cleaning \u2500\u2500\")\n",
|
| 200 |
+
"print(ride_data[[\"base_price_eur\", \"rating\"]].isnull().sum())\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"ride_data[\"base_price_eur\"] = ride_data[\"base_price_eur\"].fillna(ride_data[\"base_price_eur\"].median())\n",
|
| 203 |
+
"ride_data[\"rating\"] = ride_data[\"rating\"].fillna(round(ride_data[\"rating\"].median()))\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"print(\"\u2500\u2500 Missing values AFTER cleaning \u2500\u2500\")\n",
|
| 206 |
+
"print(ride_data[[\"base_price_eur\", \"rating\"]].isnull().sum())\n",
|
| 207 |
+
"\n",
|
| 208 |
+
"# Remove duplicate ride IDs (none expected, but good practice)\n",
|
| 209 |
+
"ride_data.drop_duplicates(subset=\"ride_id\", inplace=True)\n",
|
| 210 |
+
"\n",
|
| 211 |
+
"# Drop rides with a physically impossible (zero or negative) distance\n",
|
| 212 |
+
"ride_data = ride_data[ride_data[\"distance_km\"] > 0]\n",
|
| 213 |
+
"\n",
|
| 214 |
+
"# \u2500\u2500 2b. Review data \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 215 |
+
"review_data.dropna(subset=[\"review_text\"], inplace=True)\n",
|
| 216 |
+
"review_data[\"review_text\"] = review_data[\"review_text\"].str.strip()"
|
| 217 |
+
],
|
| 218 |
+
"metadata": {
|
| 219 |
+
"colab": {
|
| 220 |
+
"base_uri": "https://localhost:8080/"
|
| 221 |
+
},
|
| 222 |
+
"id": "lnGYrVG7o0aO",
|
| 223 |
+
"outputId": "be8fb6b5-a485-4495-8ebd-240e741a8ce4"
|
| 224 |
+
},
|
| 225 |
+
"execution_count": 5,
|
| 226 |
+
"outputs": [
|
| 227 |
+
{
|
| 228 |
+
"output_type": "stream",
|
| 229 |
+
"name": "stdout",
|
| 230 |
+
"text": [
|
| 231 |
+
"\n",
|
| 232 |
+
"\u2500\u2500 Missing values BEFORE cleaning \u2500\u2500\n",
|
| 233 |
+
"base_price_eur 30\n",
|
| 234 |
+
"rating 30\n",
|
| 235 |
+
"dtype: int64\n",
|
| 236 |
+
"\u2500\u2500 Missing values AFTER cleaning \u2500\u2500\n",
|
| 237 |
+
"base_price_eur 0\n",
|
| 238 |
+
"rating 0\n",
|
| 239 |
+
"dtype: int64\n"
|
| 240 |
+
]
|
| 241 |
+
}
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"cell_type": "code",
|
| 246 |
+
"source": [
|
| 247 |
+
"# SECTION 3 \u2013 VADER SENTIMENT ANALYSIS ON REVIEWS\n",
|
| 248 |
+
"# =============================================================================\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"!pip install vaderSentiment\n",
|
| 251 |
+
"from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"analyzer = SentimentIntensityAnalyzer()\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"def classify_sentiment(text):\n",
|
| 256 |
+
" score = analyzer.polarity_scores(text)[\"compound\"]\n",
|
| 257 |
+
" if score >= 0.05: return \"Positive\"\n",
|
| 258 |
+
" elif score <= -0.05: return \"Negative\"\n",
|
| 259 |
+
" else: return \"Neutral\"\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"review_data[\"compound_score\"] = review_data[\"review_text\"].apply(\n",
|
| 262 |
+
" lambda t: analyzer.polarity_scores(t)[\"compound\"])\n",
|
| 263 |
+
"review_data[\"sentiment\"] = review_data[\"review_text\"].apply(classify_sentiment)\n",
|
| 264 |
+
"\n",
|
| 265 |
+
"print(\"\\n\u2500\u2500 Sentiment distribution \u2500\u2500\")\n",
|
| 266 |
+
"print(review_data[\"sentiment\"].value_counts())"
|
| 267 |
+
],
|
| 268 |
+
"metadata": {
|
| 269 |
+
"colab": {
|
| 270 |
+
"base_uri": "https://localhost:8080/"
|
| 271 |
+
},
|
| 272 |
+
"id": "iXvqwQRxo-W6",
|
| 273 |
+
"outputId": "39694265-24f5-44e0-bc33-0f5300b1b917"
|
| 274 |
+
},
|
| 275 |
+
"execution_count": 8,
|
| 276 |
+
"outputs": [
|
| 277 |
+
{
|
| 278 |
+
"output_type": "stream",
|
| 279 |
+
"name": "stdout",
|
| 280 |
+
"text": [
|
| 281 |
+
"Collecting vaderSentiment\n",
|
| 282 |
+
" Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)\n",
|
| 283 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n",
|
| 284 |
+
"Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.7)\n",
|
| 285 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n",
|
| 286 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n",
|
| 287 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.2.25)\n",
|
| 288 |
+
"Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)\n",
|
| 289 |
+
"\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m126.0/126.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 290 |
+
"\u001b[?25hInstalling collected packages: vaderSentiment\n",
|
| 291 |
+
"Successfully installed vaderSentiment-3.3.2\n",
|
| 292 |
+
"\n",
|
| 293 |
+
"\u2500\u2500 Sentiment distribution \u2500\u2500\n",
|
| 294 |
+
"sentiment\n",
|
| 295 |
+
"Positive 798\n",
|
| 296 |
+
"Negative 509\n",
|
| 297 |
+
"Neutral 193\n",
|
| 298 |
+
"Name: count, dtype: int64\n"
|
| 299 |
+
]
|
| 300 |
+
}
|
| 301 |
+
]
|
| 302 |
+
},
|
| 303 |
+
{
|
| 304 |
+
"cell_type": "code",
|
| 305 |
+
"source": [
|
| 306 |
+
"# SECTION 4 \u2013 MERGING DATASETS\n",
|
| 307 |
+
"# Aggregate rides per (city, ride_type) \u2192 merge with review sentiments\n",
|
| 308 |
+
"# =============================================================================\n",
|
| 309 |
+
"\n",
|
| 310 |
+
"ride_agg = ride_data.groupby([\"city\", \"ride_type\"]).agg(\n",
|
| 311 |
+
" total_rides = (\"ride_id\", \"count\"),\n",
|
| 312 |
+
" avg_final_price = (\"final_price_eur\", \"mean\"),\n",
|
| 313 |
+
" avg_distance_km = (\"distance_km\", \"mean\"),\n",
|
| 314 |
+
" avg_rating = (\"rating\", \"mean\"),\n",
|
| 315 |
+
" cancellation_rate = (\"cancelled\", \"mean\"),\n",
|
| 316 |
+
" avg_price_per_km = (\"price_per_km\", \"mean\"),\n",
|
| 317 |
+
").round(3).reset_index()\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"review_agg = review_data.groupby([\"city\", \"ride_type\"]).agg(\n",
|
| 320 |
+
" total_reviews = (\"review_id\", \"count\"),\n",
|
| 321 |
+
" avg_compound_score = (\"compound_score\", \"mean\"),\n",
|
| 322 |
+
" pct_positive = (\"sentiment\",\n",
|
| 323 |
+
" lambda x: (x == \"Positive\").sum() / len(x) * 100),\n",
|
| 324 |
+
").round(3).reset_index()\n",
|
| 325 |
+
"\n",
|
| 326 |
+
"df = pd.merge(ride_agg, review_agg, on=[\"city\", \"ride_type\"], how=\"inner\")\n",
|
| 327 |
+
"\n",
|
| 328 |
+
"print(\"\\n\u2500\u2500 Merged dataframe head \u2500\u2500\")\n",
|
| 329 |
+
"print(df.head(10).to_string(index=False))"
|
| 330 |
+
],
|
| 331 |
+
"metadata": {
|
| 332 |
+
"colab": {
|
| 333 |
+
"base_uri": "https://localhost:8080/"
|
| 334 |
+
},
|
| 335 |
+
"id": "7LFPJo32q3Yy",
|
| 336 |
+
"outputId": "db4f5adc-b5b1-4f1b-8e50-f086e9bc8a21"
|
| 337 |
+
},
|
| 338 |
+
"execution_count": 9,
|
| 339 |
+
"outputs": [
|
| 340 |
+
{
|
| 341 |
+
"output_type": "stream",
|
| 342 |
+
"name": "stdout",
|
| 343 |
+
"text": [
|
| 344 |
+
"\n",
|
| 345 |
+
"\u2500\u2500 Merged dataframe head \u2500\u2500\n",
|
| 346 |
+
" city ride_type total_rides avg_final_price avg_distance_km avg_rating cancellation_rate avg_price_per_km total_reviews avg_compound_score pct_positive\n",
|
| 347 |
+
" Berlin Eco 27 4.723 5.653 4.185 0.037 1.940 80 0.226 62.500\n",
|
| 348 |
+
" Berlin Premium 70 4.560 4.137 3.943 0.071 2.449 103 0.129 50.485\n",
|
| 349 |
+
" Berlin Standard 96 4.485 4.609 4.052 0.073 1.929 101 0.232 57.426\n",
|
| 350 |
+
" Berlin XL 55 4.393 4.182 3.909 0.055 2.499 93 0.178 52.688\n",
|
| 351 |
+
"Cologne Eco 30 4.920 3.651 4.067 0.067 2.082 97 0.285 60.825\n",
|
| 352 |
+
"Cologne Premium 66 4.182 4.683 3.773 0.136 1.496 100 0.123 53.000\n",
|
| 353 |
+
"Cologne Standard 90 4.520 4.614 3.856 0.044 2.071 93 0.106 51.613\n",
|
| 354 |
+
"Cologne XL 58 4.483 4.732 4.069 0.052 1.594 93 0.022 40.860\n",
|
| 355 |
+
"Hamburg Eco 29 4.459 3.848 4.069 0.069 2.212 91 0.152 56.044\n",
|
| 356 |
+
"Hamburg Premium 77 4.546 4.655 3.987 0.039 1.923 103 0.182 53.398\n"
|
| 357 |
+
]
|
| 358 |
+
}
|
| 359 |
+
]
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"cell_type": "code",
|
| 363 |
+
"source": [
|
| 364 |
+
"# SECTION 5 \u2013 EXPLORATORY DATA ANALYSIS (6 charts)\n",
|
| 365 |
+
"# =============================================================================\n",
|
| 366 |
+
"\n",
|
| 367 |
+
"import matplotlib.pyplot as plt\n",
|
| 368 |
+
"import matplotlib.gridspec as gridspec\n",
|
| 369 |
+
"\n",
|
| 370 |
+
"# Chart colour palette, defined in this cell (moved from the initial setup) so it is always available here\n",
|
| 371 |
+
"PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"fig = plt.figure(figsize=(20, 22))\n",
|
| 374 |
+
"fig.suptitle(\"Urban Mobility Startup \u2013 Exploratory Data Analysis\",\n",
|
| 375 |
+
" fontsize=17, fontweight=\"bold\", y=0.98)\n",
|
| 376 |
+
"gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.45, wspace=0.35)\n",
|
| 377 |
+
"\n",
|
| 378 |
+
"# \u2500\u2500 Chart 1: Average final price by ride type \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 379 |
+
"ax1 = fig.add_subplot(gs[0, 0])\n",
|
| 380 |
+
"price_by_type = ride_data.groupby(\"ride_type\")[\"final_price_eur\"].mean().sort_values()\n",
|
| 381 |
+
"bars = ax1.barh(price_by_type.index, price_by_type.values,\n",
|
| 382 |
+
" color=PALETTE[:len(price_by_type)])\n",
|
| 383 |
+
"ax1.bar_label(bars, fmt=\"\u20ac%.2f\", padding=4, fontsize=9)\n",
|
| 384 |
+
"ax1.set_title(\"Avg. Final Price by Ride Type\")\n",
|
| 385 |
+
"ax1.set_xlabel(\"EUR\")\n",
|
| 386 |
+
"ax1.set_xlim(0, price_by_type.max() * 1.25)\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"# \u2500\u2500 Chart 2: Rating distribution \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 389 |
+
"ax2 = fig.add_subplot(gs[0, 1])\n",
|
| 390 |
+
"rating_counts = ride_data[\"rating\"].value_counts().sort_index()\n",
|
| 391 |
+
"ax2.bar(rating_counts.index, rating_counts.values,\n",
|
| 392 |
+
" color=PALETTE[1], edgecolor=\"white\", linewidth=0.8)\n",
|
| 393 |
+
"ax2.set_title(\"Ride Rating Distribution\")\n",
|
| 394 |
+
"ax2.set_xlabel(\"Stars\")\n",
|
| 395 |
+
"ax2.set_ylabel(\"Number of Rides\")\n",
|
| 396 |
+
"ax2.set_xticks([1, 2, 3, 4, 5])\n",
|
| 397 |
+
"\n",
|
| 398 |
+
"# \u2500\u2500 Chart 3: Sentiment breakdown by city \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 399 |
+
"ax3 = fig.add_subplot(gs[1, 0])\n",
|
| 400 |
+
"sent_city = review_data.groupby([\"city\", \"sentiment\"]).size().unstack(fill_value=0)\n",
|
| 401 |
+
"sent_city_pct = sent_city.div(sent_city.sum(axis=1), axis=0) * 100\n",
|
| 402 |
+
"sent_city_pct[[\"Positive\", \"Neutral\", \"Negative\"]].plot(\n",
|
| 403 |
+
" kind=\"bar\", ax=ax3, color=[PALETTE[1], PALETTE[3], PALETTE[4]],\n",
|
| 404 |
+
" edgecolor=\"white\", linewidth=0.5)\n",
|
| 405 |
+
"ax3.set_title(\"Review Sentiment by City (%)\")\n",
|
| 406 |
+
"ax3.set_xlabel(\"\")\n",
|
| 407 |
+
"ax3.set_ylabel(\"Share (%)\")\n",
|
| 408 |
+
"ax3.legend(title=\"Sentiment\", fontsize=8)\n",
|
| 409 |
+
"ax3.tick_params(axis=\"x\", rotation=30)\n",
|
| 410 |
+
"\n",
|
| 411 |
+
"# \u2500\u2500 Chart 4: Price per km vs avg rating (scatter) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 412 |
+
"ax4 = fig.add_subplot(gs[1, 1])\n",
|
| 413 |
+
"for i, rt in enumerate(ride_types):\n",
|
| 414 |
+
" sub = ride_data[ride_data[\"ride_type\"] == rt]\n",
|
| 415 |
+
" ax4.scatter(sub[\"price_per_km\"], sub[\"rating\"] +\n",
|
| 416 |
+
" np.random.uniform(-0.1, 0.1, len(sub)),\n",
|
| 417 |
+
" label=rt, alpha=0.4, s=14, color=PALETTE[i % len(PALETTE)])\n",
|
| 418 |
+
"ax4.set_title(\"Price-per-km vs. Ride Rating\")\n",
|
| 419 |
+
"ax4.set_xlabel(\"Price per km (\u20ac)\")\n",
|
| 420 |
+
"ax4.set_ylabel(\"Rating (jittered)\")\n",
|
| 421 |
+
"ax4.legend(fontsize=8, markerscale=1.5)\n",
|
| 422 |
+
"\n",
|
| 423 |
+
"# \u2500\u2500 Chart 5: Cancellation rate by time slot \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 424 |
+
"ax5 = fig.add_subplot(gs[2, 0])\n",
|
| 425 |
+
"cancel_slot = ride_data.groupby(\"time_slot\")[\"cancelled\"].mean().sort_values() * 100\n",
|
| 426 |
+
"ax5.bar(cancel_slot.index, cancel_slot.values,\n",
|
| 427 |
+
" color=PALETTE[4], edgecolor=\"white\")\n",
|
| 428 |
+
"ax5.set_title(\"Cancellation Rate by Time Slot (%)\")\n",
|
| 429 |
+
"ax5.set_ylabel(\"Cancellation Rate (%)\")\n",
|
| 430 |
+
"ax5.set_ylim(0, cancel_slot.max() * 1.4)\n",
|
| 431 |
+
"for p, v in zip(cancel_slot.index, cancel_slot.values):\n",
|
| 432 |
+
" ax5.text(p, v + 0.1, f\"{v:.1f}%\", ha=\"center\", fontsize=9)\n",
|
| 433 |
+
"\n",
|
| 434 |
+
"# \u2500\u2500 Chart 6: Avg compound sentiment score by ride type \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
|
| 435 |
+
"ax6 = fig.add_subplot(gs[2, 1])\n",
|
| 436 |
+
"sent_type = review_data.groupby(\"ride_type\")[\"compound_score\"].mean().sort_values()\n",
|
| 437 |
+
"colors_sent = [PALETTE[1] if v >= 0 else PALETTE[4] for v in sent_type.values]\n",
|
| 438 |
+
"ax6.barh(sent_type.index, sent_type.values, color=colors_sent)\n",
|
| 439 |
+
"ax6.axvline(0, color=\"black\", linewidth=0.8, linestyle=\"--\")\n",
|
| 440 |
+
"ax6.set_title(\"Avg. VADER Sentiment Score by Ride Type\")\n",
|
| 441 |
+
"ax6.set_xlabel(\"Compound Score (\u22121 to +1)\")\n",
|
| 442 |
+
"\n",
|
| 443 |
+
"plt.savefig(\"notebook1_eda_output.png\", bbox_inches=\"tight\")\n",
|
| 444 |
+
"plt.close()\n",
|
| 445 |
+
"print(\"\\n\u2705 EDA chart saved \u2192 notebook1_eda_output.png\")"
|
| 446 |
+
],
|
| 447 |
+
"metadata": {
|
| 448 |
+
"colab": {
|
| 449 |
+
"base_uri": "https://localhost:8080/"
|
| 450 |
+
},
|
| 451 |
+
"id": "k3o219Voq9l_",
|
| 452 |
+
"outputId": "e5065a98-9ee5-4215-f1b2-58da89a93a67"
|
| 453 |
+
},
|
| 454 |
+
"execution_count": 14,
|
| 455 |
+
"outputs": [
|
| 456 |
+
{
|
| 457 |
+
"output_type": "stream",
|
| 458 |
+
"name": "stdout",
|
| 459 |
+
"text": [
|
| 460 |
+
"\n",
|
| 461 |
+
"\u2705 EDA chart saved \u2192 notebook1_eda_output.png\n"
|
| 462 |
+
]
|
| 463 |
+
}
|
| 464 |
+
]
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"cell_type": "code",
|
| 468 |
+
"source": [
|
| 469 |
+
"# SECTION 6 \u2013 SAVE CLEANED DATASETS FOR NOTEBOOK 2\n",
|
| 470 |
+
"# =============================================================================\n",
|
| 471 |
+
"ride_data.to_csv(\"ride_data_clean.csv\", index=False)\n",
|
| 472 |
+
"review_data.to_csv(\"review_data_clean.csv\", index=False)\n",
|
| 473 |
+
"df.to_csv(\"merged_summary.csv\", index=False)\n",
|
| 474 |
+
"print(\"\u2705 CSVs saved: ride_data_clean.csv | review_data_clean.csv | merged_summary.csv\")\n",
|
| 475 |
+
"\n",
|
| 476 |
+
"print(\"\\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\")\n",
|
| 477 |
+
"print(\" NOTEBOOK 1 COMPLETE\")\n",
|
| 478 |
+
"print(\"\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\")"
|
| 479 |
+
],
|
| 480 |
+
"metadata": {
|
| 481 |
+
"colab": {
|
| 482 |
+
"base_uri": "https://localhost:8080/"
|
| 483 |
+
},
|
| 484 |
+
"id": "Atl1ma1HsOE6",
|
| 485 |
+
"outputId": "78c7f16c-7d69-40a2-e366-bb7e0aa7a255"
|
| 486 |
+
},
|
| 487 |
+
"execution_count": 16,
|
| 488 |
+
"outputs": [
|
| 489 |
+
{
|
| 490 |
+
"output_type": "stream",
|
| 491 |
+
"name": "stdout",
|
| 492 |
+
"text": [
|
| 493 |
+
"\u2705 CSVs saved: ride_data_clean.csv | review_data_clean.csv | merged_summary.csv\n",
|
| 494 |
+
"\n",
|
| 495 |
+
"\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
|
| 496 |
+
" NOTEBOOK 1 COMPLETE\n",
|
| 497 |
+
"\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n"
|
| 498 |
+
]
|
| 499 |
+
}
|
| 500 |
+
]
|
| 501 |
+
}
|
| 502 |
+
]
|
| 503 |
+
}
|
merged_summary (1).csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
city,ride_type,total_rides,avg_final_price_eur,avg_rating,vader_compound,vader_sentiment
|
| 2 |
+
Paris,E-Scooter,320,4.82,4.15,0.12,Positive
|
| 3 |
+
Paris,E-Bike,210,3.95,4.22,0.15,Positive
|
| 4 |
+
Paris,Bus-Connect,150,2.40,4.35,0.18,Positive
|
| 5 |
+
Paris,E-Moto,180,5.50,3.95,0.09,Positive
|
| 6 |
+
Berlin,E-Scooter,380,3.60,3.72,0.01,Neutral
|
| 7 |
+
Berlin,E-Bike,190,3.20,3.95,0.08,Positive
|
| 8 |
+
Berlin,Bus-Connect,160,2.10,4.10,0.10,Positive
|
| 9 |
+
Berlin,E-Moto,140,4.80,3.55,-0.02,Neutral
|
| 10 |
+
Madrid,E-Scooter,350,4.20,4.05,0.17,Positive
|
| 11 |
+
Madrid,E-Bike,220,3.70,4.25,0.20,Positive
|
| 12 |
+
Madrid,Bus-Connect,180,2.80,4.40,0.19,Positive
|
| 13 |
+
Madrid,E-Moto,160,5.10,4.10,0.14,Positive
|
| 14 |
+
Warsaw,E-Scooter,280,3.50,3.65,0.03,Neutral
|
| 15 |
+
Warsaw,E-Bike,160,3.00,3.85,0.05,Neutral
|
| 16 |
+
Warsaw,Bus-Connect,140,1.90,4.00,0.09,Positive
|
| 17 |
+
Warsaw,E-Moto,120,4.30,3.75,0.02,Neutral
|
| 18 |
+
Turin,E-Scooter,200,4.10,3.80,0.06,Positive
|
| 19 |
+
Turin,E-Bike,120,3.50,4.10,0.12,Positive
|
| 20 |
+
Turin,Bus-Connect,100,2.30,4.25,0.15,Positive
|
| 21 |
+
Turin,E-Moto,90,4.70,3.90,0.08,Positive
|