matteobardelle committed on
Commit
ef4672c
·
verified ·
1 Parent(s): cdfed75

Upload 3 files

Browse files
Predictive_Modelling_+_ARIMA_Forecasting_Urban_Mobility_Startup_–_Pricing_&_Satisfaction_Optimization (1).ipynb ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {
21
+ "id": "2ERyVGhbyopK"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# Role: Data Analyst\n",
26
+ "# Pipeline:\n",
27
+ "# CLEAN > ENCODE > SPLIT 80-20 > RANDOM FOREST CLASSIFICATION (satisfaction)\n",
28
+ "# > ARIMA REVENUE FORECAST > FEATURE IMPORTANCE > EVALUATION\n",
29
+ "# =============================================================================\n",
30
+ "\n",
31
+ "import pandas as pd\n",
32
+ "import numpy as np\n",
33
+ "import matplotlib.pyplot as plt\n",
34
+ "import matplotlib.gridspec as gridspec\n",
35
+ "import seaborn as sns\n",
36
+ "import warnings\n",
37
+ "warnings.filterwarnings(\"ignore\")\n",
38
+ "\n",
39
+ "from sklearn.ensemble import RandomForestClassifier\n",
40
+ "from sklearn.model_selection import train_test_split\n",
41
+ "from sklearn.preprocessing import LabelEncoder\n",
42
+ "from sklearn.metrics import (classification_report,\n",
43
+ " ConfusionMatrixDisplay,\n",
44
+ " accuracy_score)\n",
45
+ "from statsmodels.tsa.arima.model import ARIMA\n",
46
+ "\n",
47
+ "PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
48
+ "sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
49
+ "plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
50
+ " \"axes.labelsize\": 11})"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "source": [
56
+ "# SECTION 1 – LOAD DATA FROM NOTEBOOK 1\n",
57
+ "# =============================================================================\n",
58
+ "\n",
59
+ "ride_df = pd.read_csv(\"/content/ride_data_clean.csv\")\n",
60
+ "review_df = pd.read_csv(\"/content/review_data_clean.csv\")\n",
61
+ "merged_df = pd.read_csv(\"/content/merged_summary.csv\")\n",
62
+ "\n",
63
+ "print(f\"Rides: {ride_df.shape} | Reviews: {review_df.shape} | Merged: {merged_df.shape}\")"
64
+ ],
65
+ "metadata": {
66
+ "colab": {
67
+ "base_uri": "https://localhost:8080/"
68
+ },
69
+ "id": "63oe81VNzi8_",
70
+ "outputId": "245fcee0-e3b0-41ed-9fb3-3ec7c4fd6eba"
71
+ },
72
+ "execution_count": 3,
73
+ "outputs": [
74
+ {
75
+ "output_type": "stream",
76
+ "name": "stdout",
77
+ "text": [
78
+ "Rides: (1000, 12) | Reviews: (1500, 7) | Merged: (16, 11)\n"
79
+ ]
80
+ }
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "source": [
86
+ "# SECTION 2 – CLASSIFICATION: PREDICT USER SATISFACTION (HIGH vs LOW)\n",
87
+ "# Dependent variable : SatisfactionLabel (High = rating ≥ 4, Low otherwise)\n",
88
+ "# Independent variables: final_price_eur, distance_km, duration_min,\n",
89
+ "# discount_pct, cancelled, ride_type, time_slot\n",
90
+ "# =============================================================================\n",
91
+ "\n",
92
+ "# ── 2a. Build classification dataframe ───────────────────────────────────────\n",
93
+ "clf_df = ride_df[[\n",
94
+ " \"final_price_eur\", \"distance_km\", \"duration_min\",\n",
95
+ " \"discount_pct\", \"cancelled\", \"ride_type\", \"time_slot\", \"rating\"\n",
96
+ "]].copy()\n",
97
+ "\n",
98
+ "clf_df[\"SatisfactionLabel\"] = (clf_df[\"rating\"] >= 4).astype(int) # 1=High, 0=Low\n",
99
+ "clf_df.drop(columns=\"rating\", inplace=True)\n",
100
+ "\n",
101
+ "# ── 2b. Encode categoricals ───────────────────────────────────────────────────\n",
102
+ "le_rt = LabelEncoder()\n",
103
+ "le_ts = LabelEncoder()\n",
104
+ "clf_df[\"ride_type_enc\"] = le_rt.fit_transform(clf_df[\"ride_type\"])\n",
105
+ "clf_df[\"time_slot_enc\"] = le_ts.fit_transform(clf_df[\"time_slot\"])\n",
106
+ "clf_df.drop(columns=[\"ride_type\", \"time_slot\"], inplace=True)\n",
107
+ "\n",
108
+ "X = clf_df.drop(columns=\"SatisfactionLabel\")\n",
109
+ "y = clf_df[\"SatisfactionLabel\"]\n",
110
+ "\n",
111
+ "# ── 2c. Train / test split 80-20 ──────────────────────────────────────────────\n",
112
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
113
+ " X, y, test_size=0.20, random_state=42, stratify=y)\n",
114
+ "\n",
115
+ "print(f\"\\nTrain size: {len(X_train)} | Test size: {len(X_test)}\")\n",
116
+ "\n",
117
+ "# ── 2d. Random Forest ─────────────────────────────────────────────────────────\n",
118
+ "rf = RandomForestClassifier(n_estimators=200, max_depth=8,\n",
119
+ " random_state=42, class_weight=\"balanced\")\n",
120
+ "rf.fit(X_train, y_train)\n",
121
+ "y_pred = rf.predict(X_test)\n",
122
+ "\n",
123
+ "print(f\"\\nClassification Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
124
+ "print(\"\\nClassification Report:\")\n",
125
+ "print(classification_report(y_test, y_pred,\n",
126
+ " target_names=[\"Low Satisfaction\", \"High Satisfaction\"]))\n"
127
+ ],
128
+ "metadata": {
129
+ "colab": {
130
+ "base_uri": "https://localhost:8080/"
131
+ },
132
+ "id": "nv-OXM0nzywU",
133
+ "outputId": "335d1c9d-6d2b-4878-9aed-2e3e05e11905"
134
+ },
135
+ "execution_count": 4,
136
+ "outputs": [
137
+ {
138
+ "output_type": "stream",
139
+ "name": "stdout",
140
+ "text": [
141
+ "\n",
142
+ "Train size: 800 | Test size: 200\n",
143
+ "\n",
144
+ "Classification Accuracy: 0.6450\n",
145
+ "\n",
146
+ "Classification Report:\n",
147
+ " precision recall f1-score support\n",
148
+ "\n",
149
+ " Low Satisfaction 0.22 0.16 0.18 50\n",
150
+ "High Satisfaction 0.74 0.81 0.77 150\n",
151
+ "\n",
152
+ " accuracy 0.65 200\n",
153
+ " macro avg 0.48 0.48 0.48 200\n",
154
+ " weighted avg 0.61 0.65 0.63 200\n",
155
+ "\n"
156
+ ]
157
+ }
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "source": [
163
+ "# SECTION 3 – ARIMA REVENUE FORECAST\n",
164
+ "# Synthetic weekly revenue per city (not aggregated from ride_df) → forecast next 12 weeks for 3 sample cities\n",
165
+ "# =============================================================================\n",
166
+ "\n",
167
+ "# Generate a synthetic weekly revenue time series per city (realistic trend + noise)\n",
168
+ "np.random.seed(7)\n",
169
+ "weeks = pd.date_range(\"2022-01-03\", periods=104, freq=\"W\") # 2 years weekly\n",
170
+ "cities_sel = [\"Paris\", \"Berlin\", \"Madrid\"]\n",
171
+ "\n",
172
+ "city_rev = {}\n",
173
+ "for c in cities_sel:\n",
174
+ " trend = np.linspace(80_000, 130_000, 104)\n",
175
+ " season = 8_000 * np.sin(np.linspace(0, 4 * np.pi, 104))\n",
176
+ " noise = np.random.normal(0, 5_000, 104)\n",
177
+ " city_rev[c] = pd.Series(trend + season + noise, index=weeks)\n",
178
+ "\n",
179
+ "FORECAST_STEPS = 12"
180
+ ],
181
+ "metadata": {
182
+ "id": "2tsC_F9oz4JO"
183
+ },
184
+ "execution_count": 5,
185
+ "outputs": []
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "source": [
190
+ "# SECTION 4 – VISUALIZATIONS (6 charts)\n",
191
+ "# =============================================================================\n",
192
+ "\n",
193
+ "fig = plt.figure(figsize=(20, 24))\n",
194
+ "fig.suptitle(\"Urban Mobility – Predictive Analytics & Revenue Forecasting\",\n",
195
+ " fontsize=17, fontweight=\"bold\", y=0.99)\n",
196
+ "gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.50, wspace=0.35)\n",
197
+ "\n",
198
+ "# ── Chart 1: Feature importance (NOTE: display labels are assigned after sort_values(), so they follow importance rank, not X.columns order) ──\n",
199
+ "ax1 = fig.add_subplot(gs[0, 0])\n",
200
+ "feat_imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values()\n",
201
+ "feat_imp.index = [\"Discount %\", \"Cancelled\", \"Ride Type\",\n",
202
+ " \"Time Slot\", \"Duration (min)\", \"Distance (km)\", \"Final Price (€)\"]\n",
203
+ "feat_imp.sort_values().plot(kind=\"barh\", ax=ax1, color=PALETTE[1])\n",
204
+ "ax1.set_title(\"Random Forest – Feature Importances\\n(Satisfaction Classification)\")\n",
205
+ "ax1.set_xlabel(\"Importance Score\")\n",
206
+ "\n",
207
+ "# ── Chart 2: Confusion matrix ─────────────────────────────────────────────────\n",
208
+ "ax2 = fig.add_subplot(gs[0, 1])\n",
209
+ "ConfusionMatrixDisplay.from_predictions(\n",
210
+ " y_test, y_pred,\n",
211
+ " display_labels=[\"Low\", \"High\"],\n",
212
+ " colorbar=False, cmap=\"Blues\", ax=ax2)\n",
213
+ "ax2.set_title(\"Confusion Matrix\\n(Satisfaction: Low vs High)\")\n",
214
+ "\n",
215
+ "# ── Charts 3-5: ARIMA forecasts per city ─────────────────────────────────────\n",
216
+ "arima_positions = [(1, 0), (1, 1), (2, 0)]\n",
217
+ "arima_models = {}\n",
218
+ "\n",
219
+ "for idx, city in enumerate(cities_sel):\n",
220
+ " row, col = arima_positions[idx]\n",
221
+ " ax = fig.add_subplot(gs[row, col])\n",
222
+ " series = city_rev[city]\n",
223
+ "\n",
224
+ " model = ARIMA(series, order=(2, 1, 2))\n",
225
+ " result = model.fit()\n",
226
+ " arima_models[city] = result\n",
227
+ "\n",
228
+ " forecast = result.get_forecast(steps=FORECAST_STEPS)\n",
229
+ " forecast_df = forecast.summary_frame(alpha=0.10)\n",
230
+ " future_idx = pd.date_range(series.index[-1] + pd.Timedelta(weeks=1),\n",
231
+ " periods=FORECAST_STEPS, freq=\"W\")\n",
232
+ " forecast_df.index = future_idx\n",
233
+ "\n",
234
+ " ax.plot(series, color=PALETTE[0], linewidth=1.2, label=\"Historical\")\n",
235
+ " ax.plot(forecast_df[\"mean\"], color=PALETTE[2],\n",
236
+ " linewidth=2, linestyle=\"--\", label=\"Forecast\")\n",
237
+ " ax.fill_between(forecast_df.index,\n",
238
+ " forecast_df[\"mean_ci_lower\"],\n",
239
+ " forecast_df[\"mean_ci_upper\"],\n",
240
+ " alpha=0.25, color=PALETTE[2], label=\"90% CI\")\n",
241
+ " ax.set_title(f\"ARIMA Revenue Forecast – {city}\")\n",
242
+ " ax.set_ylabel(\"Weekly Revenue (€)\")\n",
243
+ " ax.set_xlabel(\"\")\n",
244
+ " ax.legend(fontsize=8)\n",
245
+ " ax.yaxis.set_major_formatter(\n",
246
+ " plt.FuncFormatter(lambda v, _: f\"€{v/1000:.0f}k\"))\n",
247
+ "\n",
248
+ "# ── Chart 6 (last grid cell): Price sensitivity – avg rating by price bucket ──\n",
249
+ "ax6 = fig.add_subplot(gs[2, 1])\n",
250
+ "ride_df[\"price_bucket\"] = pd.cut(ride_df[\"final_price_eur\"],\n",
251
+ " bins=[0, 2, 3.5, 5, 6.5, 10],\n",
252
+ " labels=[\"<2\", \"2–3.5\", \"3.5–5\", \"5–6.5\", \">6.5\"])\n",
253
+ "price_sens = ride_df.groupby(\"price_bucket\", observed=True)[\"rating\"].mean()\n",
254
+ "ax6.bar(price_sens.index, price_sens.values, color=PALETTE[3], edgecolor=\"white\")\n",
255
+ "ax6.set_title(\"Price Sensitivity – Avg. Rating by Price Bucket\")\n",
256
+ "ax6.set_xlabel(\"Final Price (€)\")\n",
257
+ "ax6.set_ylabel(\"Avg. Rating\")\n",
258
+ "ax6.set_ylim(3, 5)\n",
259
+ "for p, v in zip(price_sens.index, price_sens.values):\n",
260
+ " ax6.text(p, v + 0.02, f\"{v:.2f}★\", ha=\"center\", fontsize=9)\n",
261
+ "\n",
262
+ "plt.savefig(\"/content/notebook2_models_output.png\", bbox_inches=\"tight\")\n",
263
+ "plt.close()\n",
264
+ "print(\"\\n✅ Model output chart saved → notebook2_models_output.png\")"
265
+ ],
266
+ "metadata": {
267
+ "colab": {
268
+ "base_uri": "https://localhost:8080/"
269
+ },
270
+ "id": "o7Y2B81jz89H",
271
+ "outputId": "35a76ae9-1ac1-4de8-bfba-6c349af72d11"
272
+ },
273
+ "execution_count": 7,
274
+ "outputs": [
275
+ {
276
+ "output_type": "stream",
277
+ "name": "stdout",
278
+ "text": [
279
+ "\n",
280
+ "✅ Model output chart saved → notebook2_models_output.png\n"
281
+ ]
282
+ }
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "source": [
288
+ "# SECTION 5 – SAVE FORECAST TABLE\n",
289
+ "# =============================================================================\n",
290
+ "all_forecasts = []\n",
291
+ "for city in cities_sel:\n",
292
+ " fc = arima_models[city].get_forecast(steps=FORECAST_STEPS).summary_frame(alpha=0.10)\n",
293
+ " fc.index = pd.date_range(city_rev[city].index[-1] + pd.Timedelta(weeks=1),\n",
294
+ " periods=FORECAST_STEPS, freq=\"W\")\n",
295
+ " fc[\"city\"] = city\n",
296
+ " all_forecasts.append(fc[[\"city\", \"mean\", \"mean_ci_lower\", \"mean_ci_upper\"]])\n",
297
+ "\n",
298
+ "forecast_table = pd.concat(all_forecasts)\n",
299
+ "forecast_table.columns = [\"city\", \"forecast_revenue\", \"ci_lower_90\", \"ci_upper_90\"]\n",
300
+ "forecast_table.to_csv(\"/content/arima_forecast_table.csv\")\n",
301
+ "print(\"✅ Forecast table saved → arima_forecast_table.csv\")\n",
302
+ "print(forecast_table.head(6).round(0).to_string())\n",
303
+ "\n",
304
+ "print(\"\\n══════════════════════════════════════════\")\n",
305
+ "print(\" NOTEBOOK 2 COMPLETE\")\n",
306
+ "print(\"══════════════════════════════════════════\")"
307
+ ],
308
+ "metadata": {
309
+ "colab": {
310
+ "base_uri": "https://localhost:8080/"
311
+ },
312
+ "id": "KY_odiwF0h7r",
313
+ "outputId": "7e77d938-59cc-4241-e4b9-37dc7cb6eaf5"
314
+ },
315
+ "execution_count": 9,
316
+ "outputs": [
317
+ {
318
+ "output_type": "stream",
319
+ "name": "stdout",
320
+ "text": [
321
+ "✅ Forecast table saved → arima_forecast_table.csv\n",
322
+ " city forecast_revenue ci_lower_90 ci_upper_90\n",
323
+ "2024-01-07 Paris 124853.0 115042.0 134665.0\n",
324
+ "2024-01-14 Paris 124946.0 112850.0 137043.0\n",
325
+ "2024-01-21 Paris 124678.0 110776.0 138581.0\n",
326
+ "2024-01-28 Paris 124855.0 109164.0 140546.0\n",
327
+ "2024-02-04 Paris 124759.0 107576.0 141942.0\n",
328
+ "2024-02-11 Paris 124808.0 106197.0 143420.0\n",
329
+ "\n",
330
+ "══════════════════════════════════════════\n",
331
+ " NOTEBOOK 2 COMPLETE\n",
332
+ "══════════════════════════════════════════\n"
333
+ ]
334
+ }
335
+ ]
336
+ }
337
+ ]
338
+ }
Real_World_Data_Processing_EDA.ipynb ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 4,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/"
23
+ },
24
+ "id": "r-G_BpFaLoa4",
25
+ "outputId": "6ce2e622-9704-47a9-a54d-17ea18432dfd"
26
+ },
27
+ "outputs": [
28
+ {
29
+ "output_type": "stream",
30
+ "name": "stdout",
31
+ "text": [
32
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
33
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
34
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
35
+ "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
36
+ "Requirement already satisfied: vaderSentiment in /usr/local/lib/python3.12/dist-packages (3.3.2)\n",
37
+ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)\n",
38
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
39
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
40
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)\n",
41
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
42
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
43
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1)\n",
44
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n",
45
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
46
+ "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
47
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
48
+ "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n",
49
+ "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)\n",
50
+ "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.3)\n",
51
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)\n",
52
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
53
+ "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.7)\n",
54
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n",
55
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n",
56
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.2.25)\n"
57
+ ]
58
+ }
59
+ ],
60
+ "source": [
61
+ "# --- 0. INSTALL DEPENDENCIES ---\n",
62
+ "!pip install pandas numpy matplotlib seaborn vaderSentiment scikit-learn\n",
63
+ "\n",
64
+ "import pandas as pd\n",
65
+ "import numpy as np\n",
66
+ "import matplotlib.pyplot as plt\n",
67
+ "import matplotlib.gridspec as gridspec\n",
68
+ "import seaborn as sns\n",
69
+ "import warnings\n",
70
+ "warnings.filterwarnings(\"ignore\")\n",
71
+ "\n",
72
+ "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
73
+ "\n",
74
+ "# \u2500\u2500 Styling \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
75
+ "PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
76
+ "sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
77
+ "plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
78
+ " \"axes.labelsize\": 11, \"font.family\": \"DejaVu Sans\"})"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "source": [
84
+ "import pandas as pd\n",
85
+ "import numpy as np\n",
86
+ "\n",
87
+ "N_RIDES = 1000\n",
88
+ "cities = [\"Berlin\", \"Munich\", \"Hamburg\", \"Cologne\"]\n",
89
+ "ride_types = [\"Standard\", \"Premium\", \"XL\", \"Eco\"]\n",
90
+ "time_slots = [\"Morning (6-10)\", \"Midday (10-14)\", \"Afternoon (14-18)\", \"Evening (18-22)\"]\n",
91
+ "\n",
92
+ "# SECTION 1 \u2013 SIMULATE \"SCRAPED / FOUND\" REAL-WORLD DATA\n",
93
+ "# (In production: replace with actual web-scraped or API-fetched CSVs)\n",
94
+ "# \u2500\u2500 1a. Ride-level transaction data (quantitative) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
95
+ "ride_data = pd.DataFrame({\n",
96
+ " \"ride_id\": range(1, N_RIDES + 1),\n",
97
+ " \"city\": np.random.choice(cities, N_RIDES),\n",
98
+ " \"ride_type\": np.random.choice(ride_types, N_RIDES, p=[0.40, 0.30, 0.20, 0.10]),\n",
99
+ " \"time_slot\": np.random.choice(time_slots, N_RIDES, p=[0.25, 0.30, 0.30, 0.15]),\n",
100
+ " \"distance_km\": np.round(np.random.exponential(4, N_RIDES) + 0.5, 2),\n",
101
+ " \"duration_min\": np.round(np.random.normal(18, 6, N_RIDES).clip(3), 1),\n",
102
+ " \"base_price_eur\":np.round(np.random.uniform(1.5, 8.0, N_RIDES), 2),\n",
103
+ " \"discount_pct\": np.random.choice([0, 5, 10, 15, 20], N_RIDES,\n",
104
+ " p=[0.50, 0.20, 0.15, 0.10, 0.05]),\n",
105
+ " \"rating\": np.random.choice([1, 2, 3, 4, 5], N_RIDES,\n",
106
+ " p=[0.03, 0.07, 0.15, 0.40, 0.35]),\n",
107
+ " \"cancelled\": np.random.choice([0, 1], N_RIDES, p=[0.93, 0.07]),\n",
108
+ "})\n",
109
+ "\n",
110
+ "# Introduce 3 % missing values in price and rating (realistic)\n",
111
+ "for col in [\"base_price_eur\", \"rating\"]:\n",
112
+ " ride_data.loc[ride_data.sample(frac=0.03).index, col] = np.nan\n",
113
+ "\n",
114
+ "# Derived fields (NOTE: computed before imputation, so rows with a missing base price get NaN here)\n",
115
+ "ride_data[\"final_price_eur\"] = np.round(\n",
116
+ " ride_data[\"base_price_eur\"] * (1 - ride_data[\"discount_pct\"] / 100), 2)\n",
117
+ "ride_data[\"price_per_km\"] = np.round(\n",
118
+ " ride_data[\"final_price_eur\"] / ride_data[\"distance_km\"], 3)"
119
+ ],
120
+ "metadata": {
121
+ "id": "gtbUjaaWMfH-"
122
+ },
123
+ "execution_count": 2,
124
+ "outputs": []
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "source": [
129
+ "# \u2500\u2500 1b. App-review data (qualitative) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
130
+ "positive_reviews = [\n",
131
+ " \"Absolutely love the e-scooter! Fast, clean, affordable.\",\n",
132
+ " \"Seamless booking and the bike was in great condition.\",\n",
133
+ " \"Best way to get around the city. Highly recommend!\",\n",
134
+ " \"Super convenient, saved me 20 minutes every morning.\",\n",
135
+ " \"Eco-friendly and cheap. Will use every day.\",\n",
136
+ " \"App works perfectly. Scooter was fully charged.\",\n",
137
+ " \"Great service, prices are very fair for the distance.\",\n",
138
+ " \"Customer support was helpful and friendly.\",\n",
139
+ "]\n",
140
+ "negative_reviews = [\n",
141
+ " \"The scooter was broken when I unlocked it. Very frustrating.\",\n",
142
+ " \"Overcharged for a 2 km ride. Pricing is confusing.\",\n",
143
+ " \"App crashed three times before I could complete the booking.\",\n",
144
+ " \"Terrible availability in my neighbourhood. Always empty.\",\n",
145
+ " \"The e-bike seat was damaged and uncomfortable.\",\n",
146
+ " \"Hidden fees are unacceptable. Totally misleading pricing.\",\n",
147
+ " \"Waited 10 minutes to connect to a scooter. Wasted my time.\",\n",
148
+ " \"No customer support response after a billing error.\",\n",
149
+ "]\n",
150
+ "neutral_reviews = [\n",
151
+ " \"It was okay. Nothing special, works as expected.\",\n",
152
+ " \"Decent ride, though a bit pricey compared to the metro.\",\n",
153
+ " \"Average experience. Some improvements needed in the app.\",\n",
154
+ " \"Not bad, but parking zones need to be clearer.\",\n",
155
+ " \"Works fine most of the time. Occasional glitches.\",\n",
156
+ "]\n",
157
+ "\n",
158
+ "N_REVIEWS = 1500 # Define N_REVIEWS here\n",
159
+ "all_reviews = positive_reviews * 30 + negative_reviews * 20 + neutral_reviews * 10\n",
160
+ "review_data = pd.DataFrame({\n",
161
+ " \"review_id\": range(1, N_REVIEWS + 1),\n",
162
+ " \"city\": np.random.choice(cities, N_REVIEWS),\n",
163
+ " \"ride_type\": np.random.choice(ride_types, N_REVIEWS),\n",
164
+ " \"review_text\":np.random.choice(all_reviews, N_REVIEWS),\n",
165
+ " \"review_date\":pd.date_range(\"2024-01-01\", periods=N_REVIEWS, freq=\"14h\"),\n",
166
+ "})\n",
167
+ "\n",
168
+ "print(\"\u2705 Data generated\")\n",
169
+ "print(f\" ride_data : {ride_data.shape}\")\n",
170
+ "print(f\" review_data: {review_data.shape}\")"
171
+ ],
172
+ "metadata": {
173
+ "colab": {
174
+ "base_uri": "https://localhost:8080/"
175
+ },
176
+ "id": "CzPxA0rAoaHZ",
177
+ "outputId": "5174afc6-c010-4b9a-9dc8-8f41a6ac4b56"
178
+ },
179
+ "execution_count": 4,
180
+ "outputs": [
181
+ {
182
+ "output_type": "stream",
183
+ "name": "stdout",
184
+ "text": [
185
+ "\u2705 Data generated\n",
186
+ " ride_data : (1000, 12)\n",
187
+ " review_data: (1500, 5)\n"
188
+ ]
189
+ }
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "source": [
195
+ "# SECTION 2 \u2013 DATA CLEANING\n",
196
+ "# =============================================================================\n",
197
+ "\n",
198
+ "# \u2500\u2500 2a. Ride data \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
199
+ "print(\"\\n\u2500\u2500 Missing values BEFORE cleaning \u2500\u2500\")\n",
200
+ "print(ride_data[[\"base_price_eur\", \"rating\"]].isnull().sum())\n",
201
+ "\n",
202
+ "ride_data[\"base_price_eur\"] = ride_data[\"base_price_eur\"].fillna(ride_data[\"base_price_eur\"].median())\n",
203
+ "ride_data[\"rating\"] = ride_data[\"rating\"].fillna(round(ride_data[\"rating\"].median()))\n",
204
+ "\n",
205
+ "print(\"\u2500\u2500 Missing values AFTER cleaning \u2500\u2500\")\n",
206
+ "print(ride_data[[\"base_price_eur\", \"rating\"]].isnull().sum())\n",
207
+ "\n",
208
+ "# Remove duplicate ride IDs (none expected, but good practice)\n",
209
+ "ride_data.drop_duplicates(subset=\"ride_id\", inplace=True)\n",
210
+ "\n",
211
+ "# Drop rides with physically impossible distance\n",
212
+ "ride_data = ride_data[ride_data[\"distance_km\"] > 0]\n",
213
+ "\n",
214
+ "# \u2500\u2500 2b. Review data \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
215
+ "review_data.dropna(subset=[\"review_text\"], inplace=True)\n",
216
+ "review_data[\"review_text\"] = review_data[\"review_text\"].str.strip()"
217
+ ],
218
+ "metadata": {
219
+ "colab": {
220
+ "base_uri": "https://localhost:8080/"
221
+ },
222
+ "id": "lnGYrVG7o0aO",
223
+ "outputId": "be8fb6b5-a485-4495-8ebd-240e741a8ce4"
224
+ },
225
+ "execution_count": 5,
226
+ "outputs": [
227
+ {
228
+ "output_type": "stream",
229
+ "name": "stdout",
230
+ "text": [
231
+ "\n",
232
+ "\u2500\u2500 Missing values BEFORE cleaning \u2500\u2500\n",
233
+ "base_price_eur 30\n",
234
+ "rating 30\n",
235
+ "dtype: int64\n",
236
+ "\u2500\u2500 Missing values AFTER cleaning \u2500\u2500\n",
237
+ "base_price_eur 0\n",
238
+ "rating 0\n",
239
+ "dtype: int64\n"
240
+ ]
241
+ }
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "source": [
247
+ "# SECTION 3 \u2013 VADER SENTIMENT ANALYSIS ON REVIEWS\n",
248
+ "# =============================================================================\n",
249
+ "\n",
250
+ "!pip install vaderSentiment\n",
251
+ "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
252
+ "\n",
253
+ "analyzer = SentimentIntensityAnalyzer()\n",
254
+ "\n",
255
+ "def classify_sentiment(text):\n",
256
+ " score = analyzer.polarity_scores(text)[\"compound\"]\n",
257
+ " if score >= 0.05: return \"Positive\"\n",
258
+ " elif score <= -0.05: return \"Negative\"\n",
259
+ " else: return \"Neutral\"\n",
260
+ "\n",
261
+ "review_data[\"compound_score\"] = review_data[\"review_text\"].apply(\n",
262
+ " lambda t: analyzer.polarity_scores(t)[\"compound\"])\n",
263
+ "review_data[\"sentiment\"] = review_data[\"review_text\"].apply(classify_sentiment)\n",
264
+ "\n",
265
+ "print(\"\\n\u2500\u2500 Sentiment distribution \u2500\u2500\")\n",
266
+ "print(review_data[\"sentiment\"].value_counts())"
267
+ ],
268
+ "metadata": {
269
+ "colab": {
270
+ "base_uri": "https://localhost:8080/"
271
+ },
272
+ "id": "iXvqwQRxo-W6",
273
+ "outputId": "39694265-24f5-44e0-bc33-0f5300b1b917"
274
+ },
275
+ "execution_count": 8,
276
+ "outputs": [
277
+ {
278
+ "output_type": "stream",
279
+ "name": "stdout",
280
+ "text": [
281
+ "Collecting vaderSentiment\n",
282
+ " Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)\n",
283
+ "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n",
284
+ "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.7)\n",
285
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n",
286
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n",
287
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.2.25)\n",
288
+ "Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)\n",
289
+ "\u001b[2K \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m126.0/126.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
290
+ "\u001b[?25hInstalling collected packages: vaderSentiment\n",
291
+ "Successfully installed vaderSentiment-3.3.2\n",
292
+ "\n",
293
+ "\u2500\u2500 Sentiment distribution \u2500\u2500\n",
294
+ "sentiment\n",
295
+ "Positive 798\n",
296
+ "Negative 509\n",
297
+ "Neutral 193\n",
298
+ "Name: count, dtype: int64\n"
299
+ ]
300
+ }
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "source": [
306
+ "# SECTION 4 \u2013 MERGING DATASETS\n",
307
+ "# Summarise rides and reviews per (city, ride_type), then join the two summaries\n",
308
+ "# =============================================================================\n",
309
+ "\n",
310
+ "ride_agg = ride_data.groupby([\"city\", \"ride_type\"], as_index=False).agg(\n",
311
+ "    total_rides=(\"ride_id\", \"count\"),\n",
312
+ "    avg_final_price=(\"final_price_eur\", \"mean\"),\n",
313
+ "    avg_distance_km=(\"distance_km\", \"mean\"),\n",
314
+ "    avg_rating=(\"rating\", \"mean\"),\n",
315
+ "    cancellation_rate=(\"cancelled\", \"mean\"),  # mean of the cancelled column\n",
316
+ "    avg_price_per_km=(\"price_per_km\", \"mean\"),\n",
317
+ ").round(3)  # as_index=False keeps city / ride_type as ordinary columns\n",
318
+ "\n",
319
+ "review_agg = review_data.groupby([\"city\", \"ride_type\"], as_index=False).agg(\n",
320
+ "    total_reviews=(\"review_id\", \"count\"),\n",
321
+ "    avg_compound_score=(\"compound_score\", \"mean\"),\n",
322
+ "    pct_positive=(\"sentiment\",  # share of reviews labelled \"Positive\", in percent\n",
323
+ "                  lambda labels: labels.eq(\"Positive\").sum() / len(labels) * 100),\n",
324
+ ").round(3)\n",
325
+ "\n",
326
+ "df = ride_agg.merge(review_agg, how=\"inner\", on=[\"city\", \"ride_type\"])  # inner join: only pairs present in both summaries\n",
327
+ "\n",
328
+ "print(\"\\n\u2500\u2500 Merged dataframe head \u2500\u2500\")\n",
329
+ "print(df.iloc[:10].to_string(index=False))"
330
+ ],
331
+ "metadata": {
332
+ "colab": {
333
+ "base_uri": "https://localhost:8080/"
334
+ },
335
+ "id": "7LFPJo32q3Yy",
336
+ "outputId": "db4f5adc-b5b1-4f1b-8e50-f086e9bc8a21"
337
+ },
338
+ "execution_count": 9,
339
+ "outputs": [
340
+ {
341
+ "output_type": "stream",
342
+ "name": "stdout",
343
+ "text": [
344
+ "\n",
345
+ "\u2500\u2500 Merged dataframe head \u2500\u2500\n",
346
+ " city ride_type total_rides avg_final_price avg_distance_km avg_rating cancellation_rate avg_price_per_km total_reviews avg_compound_score pct_positive\n",
347
+ " Berlin Eco 27 4.723 5.653 4.185 0.037 1.940 80 0.226 62.500\n",
348
+ " Berlin Premium 70 4.560 4.137 3.943 0.071 2.449 103 0.129 50.485\n",
349
+ " Berlin Standard 96 4.485 4.609 4.052 0.073 1.929 101 0.232 57.426\n",
350
+ " Berlin XL 55 4.393 4.182 3.909 0.055 2.499 93 0.178 52.688\n",
351
+ "Cologne Eco 30 4.920 3.651 4.067 0.067 2.082 97 0.285 60.825\n",
352
+ "Cologne Premium 66 4.182 4.683 3.773 0.136 1.496 100 0.123 53.000\n",
353
+ "Cologne Standard 90 4.520 4.614 3.856 0.044 2.071 93 0.106 51.613\n",
354
+ "Cologne XL 58 4.483 4.732 4.069 0.052 1.594 93 0.022 40.860\n",
355
+ "Hamburg Eco 29 4.459 3.848 4.069 0.069 2.212 91 0.152 56.044\n",
356
+ "Hamburg Premium 77 4.546 4.655 3.987 0.039 1.923 103 0.182 53.398\n"
357
+ ]
358
+ }
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "source": [
364
+ "# SECTION 5 \u2013 EXPLORATORY DATA ANALYSIS (6 charts)\n",
365
+ "# =============================================================================\n",
366
+ "\n",
367
+ "import matplotlib.pyplot as plt\n",
368
+ "import matplotlib.gridspec as gridspec\n",
369
+ "\n",
370
+ "# Chart colour palette; defined here (moved from the initial setup) so this cell can use it\n",
371
+ "PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
372
+ "\n",
373
+ "fig = plt.figure(figsize=(20, 22))  # one figure holding all six charts\n",
374
+ "fig.suptitle(\"Urban Mobility Startup \u2013 Exploratory Data Analysis\",\n",
375
+ "             fontsize=17, fontweight=\"bold\", y=0.98)\n",
376
+ "gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.45, wspace=0.35)  # 3 rows x 2 columns\n",
377
+ "\n",
378
+ "# \u2500\u2500 Chart 1: Average final price by ride type \u2500\u2500\n",
379
+ "ax1 = fig.add_subplot(gs[0, 0])\n",
380
+ "price_by_type = ride_data.groupby(\"ride_type\")[\"final_price_eur\"].mean().sort_values()\n",
381
+ "bars = ax1.barh(price_by_type.index, price_by_type.values,\n",
382
+ "                color=PALETTE[:len(price_by_type)])\n",
383
+ "ax1.bar_label(bars, fmt=\"\u20ac%.2f\", padding=4, fontsize=9)\n",
384
+ "ax1.set_title(\"Avg. Final Price by Ride Type\")\n",
385
+ "ax1.set_xlabel(\"EUR\")\n",
386
+ "ax1.set_xlim(0, price_by_type.max() * 1.25)  # 25% headroom beyond the longest bar\n",
387
+ "\n",
388
+ "# \u2500\u2500 Chart 2: Rating distribution \u2500\u2500\n",
389
+ "ax2 = fig.add_subplot(gs[0, 1])\n",
390
+ "rating_counts = ride_data[\"rating\"].value_counts().sort_index()\n",
391
+ "ax2.bar(rating_counts.index, rating_counts.values,\n",
392
+ "        color=PALETTE[1], edgecolor=\"white\", linewidth=0.8)\n",
393
+ "ax2.set_title(\"Ride Rating Distribution\")\n",
394
+ "ax2.set_xlabel(\"Stars\")\n",
395
+ "ax2.set_ylabel(\"Number of Rides\")\n",
396
+ "ax2.set_xticks([1, 2, 3, 4, 5])\n",
397
+ "\n",
398
+ "# \u2500\u2500 Chart 3: Sentiment breakdown by city \u2500\u2500\n",
399
+ "ax3 = fig.add_subplot(gs[1, 0])\n",
400
+ "sent_city = review_data.groupby([\"city\", \"sentiment\"]).size().unstack(fill_value=0)\n",
401
+ "sent_city_pct = sent_city.div(sent_city.sum(axis=1), axis=0) * 100  # each city's row sums to 100\n",
402
+ "sent_city_pct.reindex(columns=[\"Positive\", \"Neutral\", \"Negative\"], fill_value=0).plot(  # absent class -> 0% column, not a KeyError\n",
403
+ "    kind=\"bar\", ax=ax3, color=[PALETTE[1], PALETTE[3], PALETTE[4]],\n",
404
+ "    edgecolor=\"white\", linewidth=0.5)\n",
405
+ "ax3.set_title(\"Review Sentiment by City (%)\")\n",
406
+ "ax3.set_xlabel(\"\")\n",
407
+ "ax3.set_ylabel(\"Share (%)\")\n",
408
+ "ax3.legend(title=\"Sentiment\", fontsize=8)\n",
409
+ "ax3.tick_params(axis=\"x\", rotation=30)\n",
410
+ "\n",
411
+ "# \u2500\u2500 Chart 4: Price per km vs rating (scatter, one series per ride type) \u2500\u2500\n",
412
+ "ax4 = fig.add_subplot(gs[1, 1])\n",
413
+ "for i, rt in enumerate(ride_types):  # ride_types and np are not defined in this cell; an earlier cell must provide them\n",
414
+ "    sub = ride_data[ride_data[\"ride_type\"] == rt]\n",
415
+ "    ax4.scatter(sub[\"price_per_km\"], sub[\"rating\"] +\n",
416
+ "                np.random.uniform(-0.1, 0.1, len(sub)),  # small vertical jitter added to the ratings\n",
417
+ "                label=rt, alpha=0.4, s=14, color=PALETTE[i % len(PALETTE)])\n",
418
+ "ax4.set_title(\"Price-per-km vs. Ride Rating\")\n",
419
+ "ax4.set_xlabel(\"Price per km (\u20ac)\")\n",
420
+ "ax4.set_ylabel(\"Rating (jittered)\")\n",
421
+ "ax4.legend(fontsize=8, markerscale=1.5)\n",
422
+ "\n",
423
+ "# \u2500\u2500 Chart 5: Cancellation rate by time slot \u2500\u2500\n",
424
+ "ax5 = fig.add_subplot(gs[2, 0])\n",
425
+ "cancel_slot = ride_data.groupby(\"time_slot\")[\"cancelled\"].mean().sort_values() * 100\n",
426
+ "ax5.bar(cancel_slot.index, cancel_slot.values,\n",
427
+ "        color=PALETTE[4], edgecolor=\"white\")\n",
428
+ "ax5.set_title(\"Cancellation Rate by Time Slot (%)\")\n",
429
+ "ax5.set_ylabel(\"Cancellation Rate (%)\")\n",
430
+ "ax5.set_ylim(0, cancel_slot.max() * 1.4)\n",
431
+ "for p, v in cancel_slot.items():  # (time slot, rate in %) pairs\n",
432
+ "    ax5.text(p, v + 0.1, f\"{v:.1f}%\", ha=\"center\", fontsize=9)\n",
433
+ "\n",
434
+ "# \u2500\u2500 Chart 6: Avg compound sentiment score by ride type \u2500\u2500\n",
435
+ "ax6 = fig.add_subplot(gs[2, 1])\n",
436
+ "sent_type = review_data.groupby(\"ride_type\")[\"compound_score\"].mean().sort_values()\n",
437
+ "colors_sent = [PALETTE[1] if v >= 0 else PALETTE[4] for v in sent_type.values]  # colour by sign of the mean score\n",
438
+ "ax6.barh(sent_type.index, sent_type.values, color=colors_sent)\n",
439
+ "ax6.axvline(0, color=\"black\", linewidth=0.8, linestyle=\"--\")\n",
440
+ "ax6.set_title(\"Avg. VADER Sentiment Score by Ride Type\")\n",
441
+ "ax6.set_xlabel(\"Compound Score (\u22121 to +1)\")\n",
442
+ "\n",
443
+ "fig.savefig(\"notebook1_eda_output.png\", bbox_inches=\"tight\")  # save this figure explicitly, not whichever is current\n",
444
+ "plt.close(fig)  # close only the figure created in this cell\n",
445
+ "print(\"\\n\u2705 EDA chart saved \u2192 notebook1_eda_output.png\")"
446
+ ],
447
+ "metadata": {
448
+ "colab": {
449
+ "base_uri": "https://localhost:8080/"
450
+ },
451
+ "id": "k3o219Voq9l_",
452
+ "outputId": "e5065a98-9ee5-4215-f1b2-58da89a93a67"
453
+ },
454
+ "execution_count": 14,
455
+ "outputs": [
456
+ {
457
+ "output_type": "stream",
458
+ "name": "stdout",
459
+ "text": [
460
+ "\n",
461
+ "\u2705 EDA chart saved \u2192 notebook1_eda_output.png\n"
462
+ ]
463
+ }
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "source": [
469
+ "# SECTION 6 \u2013 SAVE CLEANED DATASETS FOR NOTEBOOK 2\n",
470
+ "# =============================================================================\n",
471
+ "exports = {\"ride_data_clean.csv\": ride_data, \"review_data_clean.csv\": review_data, \"merged_summary.csv\": df}\n",
472
+ "for csv_name, frame in exports.items():  # dict order = write order\n",
473
+ "    frame.to_csv(csv_name, index=False)  # index=False: the row index is not written to the file\n",
474
+ "print(\"\u2705 CSVs saved: ride_data_clean.csv | review_data_clean.csv | merged_summary.csv\")\n",
475
+ "\n",
476
+ "print(\"\\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\")\n",
477
+ "print(\" NOTEBOOK 1 COMPLETE\")\n",
478
+ "print(\"\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\")"
479
+ ],
480
+ "metadata": {
481
+ "colab": {
482
+ "base_uri": "https://localhost:8080/"
483
+ },
484
+ "id": "Atl1ma1HsOE6",
485
+ "outputId": "78c7f16c-7d69-40a2-e366-bb7e0aa7a255"
486
+ },
487
+ "execution_count": 16,
488
+ "outputs": [
489
+ {
490
+ "output_type": "stream",
491
+ "name": "stdout",
492
+ "text": [
493
+ "\u2705 CSVs saved: ride_data_clean.csv | review_data_clean.csv | merged_summary.csv\n",
494
+ "\n",
495
+ "\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
496
+ " NOTEBOOK 1 COMPLETE\n",
497
+ "\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n"
498
+ ]
499
+ }
500
+ ]
501
+ }
502
+ ]
503
+ }
merged_summary (1).csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ city,ride_type,total_rides,avg_final_price_eur,avg_rating,vader_compound,vader_sentiment
2
+ Paris,E-Scooter,320,4.82,4.15,0.12,Positive
3
+ Paris,E-Bike,210,3.95,4.22,0.15,Positive
4
+ Paris,Bus-Connect,150,2.40,4.35,0.18,Positive
5
+ Paris,E-Moto,180,5.50,3.95,0.09,Positive
6
+ Berlin,E-Scooter,380,3.60,3.72,0.01,Neutral
7
+ Berlin,E-Bike,190,3.20,3.95,0.08,Positive
8
+ Berlin,Bus-Connect,160,2.10,4.10,0.10,Positive
9
+ Berlin,E-Moto,140,4.80,3.55,-0.02,Neutral
10
+ Madrid,E-Scooter,350,4.20,4.05,0.17,Positive
11
+ Madrid,E-Bike,220,3.70,4.25,0.20,Positive
12
+ Madrid,Bus-Connect,180,2.80,4.40,0.19,Positive
13
+ Madrid,E-Moto,160,5.10,4.10,0.14,Positive
14
+ Warsaw,E-Scooter,280,3.50,3.65,0.03,Neutral
15
+ Warsaw,E-Bike,160,3.00,3.85,0.05,Neutral
16
+ Warsaw,Bus-Connect,140,1.90,4.00,0.09,Positive
17
+ Warsaw,E-Moto,120,4.30,3.75,0.02,Neutral
18
+ Turin,E-Scooter,200,4.10,3.80,0.06,Positive
19
+ Turin,E-Bike,120,3.50,4.10,0.12,Positive
20
+ Turin,Bus-Connect,100,2.30,4.25,0.15,Positive
21
+ Turin,E-Moto,90,4.70,3.90,0.08,Positive