group08_template

Sleeping

App Files Files Community

matteobardelle committed on Apr 27

Commit

ef4672c

verified ·

1 Parent(s): cdfed75

Upload 3 files

Browse files

Files changed (3) hide show

Predictive_Modelling_+_ARIMA_Forecasting_Urban_Mobility_Startup_–_Pricing_&_Satisfaction_Optimization (1).ipynb +338 -0
Real_World_Data_Processing_EDA.ipynb +503 -0
merged_summary (1).csv +21 -0

Predictive_Modelling_+_ARIMA_Forecasting_Urban_Mobility_Startup_–_Pricing_&_Satisfaction_Optimization (1).ipynb ADDED Viewed

	@@ -0,0 +1,338 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "2ERyVGhbyopK"
+      },
+      "outputs": [],
+      "source": [
+        "# Role: Data Analyst\n",
+        "# Pipeline:\n",
+        "#   CLEAN > ENCODE > SPLIT 80-20 > RANDOM FOREST CLASSIFICATION (satisfaction)\n",
+        "#          > ARIMA REVENUE FORECAST > FEATURE IMPORTANCE > EVALUATION\n",
+        "# =============================================================================\n",
+        "\n",
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "import matplotlib.gridspec as gridspec\n",
+        "import seaborn as sns\n",
+        "import warnings\n",
+        "warnings.filterwarnings(\"ignore\")\n",
+        "\n",
+        "from sklearn.ensemble         import RandomForestClassifier\n",
+        "from sklearn.model_selection  import train_test_split\n",
+        "from sklearn.preprocessing    import LabelEncoder\n",
+        "from sklearn.metrics          import (classification_report,\n",
+        "                                      ConfusionMatrixDisplay,\n",
+        "                                      accuracy_score)\n",
+        "from statsmodels.tsa.arima.model import ARIMA\n",
+        "\n",
+        "PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
+        "sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
+        "plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
+        "                     \"axes.labelsize\": 11})"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# SECTION 1 – LOAD DATA FROM NOTEBOOK 1\n",
+        "# =============================================================================\n",
+        "\n",
+        "# Read the three CSV exports in a fixed order and unpack them by name.\n",
+        "ride_df, review_df, merged_df = (\n",
+        "    pd.read_csv(csv_path)\n",
+        "    for csv_path in (\n",
+        "        \"/content/ride_data_clean.csv\",\n",
+        "        \"/content/review_data_clean.csv\",\n",
+        "        \"/content/merged_summary.csv\",\n",
+        "    )\n",
+        ")\n",
+        "\n",
+        "print(f\"Rides: {ride_df.shape} | Reviews: {review_df.shape} | Merged: {merged_df.shape}\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "63oe81VNzi8_",
+        "outputId": "245fcee0-e3b0-41ed-9fb3-3ec7c4fd6eba"
+      },
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Rides: (1000, 12) | Reviews: (1500, 7) | Merged: (16, 11)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# SECTION 2 – CLASSIFICATION: PREDICT USER SATISFACTION (HIGH vs LOW)\n",
+        "# Dependent variable  : SatisfactionLabel  (High = rating ≥ 4, Low otherwise)\n",
+        "# Independent variables: final_price_eur, distance_km, duration_min,\n",
+        "#                        discount_pct, cancelled, ride_type, time_slot\n",
+        "# =============================================================================\n",
+        "\n",
+        "# ── 2a/2b. Build the modelling frame: numeric features, binary target, encoded categoricals ──\n",
+        "numeric_features = [\"final_price_eur\", \"distance_km\", \"duration_min\",\n",
+        "                    \"discount_pct\", \"cancelled\"]\n",
+        "\n",
+        "# One encoder per categorical column so each keeps its own class mapping.\n",
+        "le_rt, le_ts = LabelEncoder(), LabelEncoder()\n",
+        "\n",
+        "# Column order (numeric, target, encoded) is kept fixed because the seeded\n",
+        "# forest below depends on the feature order.\n",
+        "clf_df = ride_df[numeric_features].assign(\n",
+        "    SatisfactionLabel=ride_df[\"rating\"].ge(4).astype(int),  # 1=High, 0=Low\n",
+        "    ride_type_enc=le_rt.fit_transform(ride_df[\"ride_type\"]),\n",
+        "    time_slot_enc=le_ts.fit_transform(ride_df[\"time_slot\"]),\n",
+        ")\n",
+        "\n",
+        "y = clf_df[\"SatisfactionLabel\"]\n",
+        "X = clf_df.drop(columns=\"SatisfactionLabel\")\n",
+        "\n",
+        "# ── 2c. Stratified 80/20 hold-out split ──\n",
+        "X_train, X_test, y_train, y_test = train_test_split(\n",
+        "    X, y, test_size=0.20, random_state=42, stratify=y)\n",
+        "\n",
+        "print(f\"\\nTrain size: {len(X_train)} | Test size: {len(X_test)}\")\n",
+        "\n",
+        "# ── 2d. Random Forest (class_weight=\"balanced\" to counter the High/Low imbalance) ──\n",
+        "rf_params = dict(n_estimators=200, max_depth=8,\n",
+        "                 random_state=42, class_weight=\"balanced\")\n",
+        "rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)\n",
+        "y_pred = rf.predict(X_test)\n",
+        "\n",
+        "print(f\"\\nClassification Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
+        "print(\"\\nClassification Report:\")\n",
+        "print(classification_report(y_test, y_pred,\n",
+        "                             target_names=[\"Low Satisfaction\", \"High Satisfaction\"]))\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "nv-OXM0nzywU",
+        "outputId": "335d1c9d-6d2b-4878-9aed-2e3e05e11905"
+      },
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "Train size: 800 | Test size: 200\n",
+            "\n",
+            "Classification Accuracy: 0.6450\n",
+            "\n",
+            "Classification Report:\n",
+            "                   precision    recall  f1-score   support\n",
+            "\n",
+            " Low Satisfaction       0.22      0.16      0.18        50\n",
+            "High Satisfaction       0.74      0.81      0.77       150\n",
+            "\n",
+            "         accuracy                           0.65       200\n",
+            "        macro avg       0.48      0.48      0.48       200\n",
+            "     weighted avg       0.61      0.65      0.63       200\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# SECTION 3 – ARIMA REVENUE FORECAST\n",
+        "# Aggregate weekly total revenue → forecast next 12 weeks for 3 sample cities\n",
+        "# =============================================================================\n",
+        "\n",
+        "# Synthetic weekly revenue per city: a shared trend + seasonal baseline with\n",
+        "# independent Gaussian noise drawn per city, in cities_sel order.\n",
+        "FORECAST_STEPS = 12\n",
+        "\n",
+        "np.random.seed(7)\n",
+        "weeks      = pd.date_range(\"2022-01-03\", periods=104, freq=\"W\")  # 104 weekly stamps (2 years)\n",
+        "cities_sel = [\"Paris\", \"Berlin\", \"Madrid\"]\n",
+        "\n",
+        "# The deterministic part is identical for every city, so build it once.\n",
+        "baseline = (np.linspace(80_000, 130_000, 104)\n",
+        "            + 8_000 * np.sin(np.linspace(0, 4 * np.pi, 104)))\n",
+        "\n",
+        "city_rev = {\n",
+        "    city_name: pd.Series(baseline + np.random.normal(0, 5_000, 104), index=weeks)\n",
+        "    for city_name in cities_sel\n",
+        "}"
+      ],
+      "metadata": {
+        "id": "2tsC_F9oz4JO"
+      },
+      "execution_count": 5,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# SECTION 4 – VISUALIZATIONS (6 charts on a 3x2 grid)\n",
+        "# =============================================================================\n",
+        "\n",
+        "fig = plt.figure(figsize=(20, 24))\n",
+        "fig.suptitle(\"Urban Mobility – Predictive Analytics & Revenue Forecasting\",\n",
+        "             fontsize=17, fontweight=\"bold\", y=0.99)\n",
+        "gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.50, wspace=0.35)\n",
+        "\n",
+        "# ── Chart 1: Feature importance ───────────────────────────────────────────────\n",
+        "ax1 = fig.add_subplot(gs[0, 0])\n",
+        "\n",
+        "# Map each model column to its display label BY NAME. Overwriting the index of\n",
+        "# an already-sorted Series with a fixed list would pair labels with whatever\n",
+        "# order the importances happened to sort into, mislabelling the bars.\n",
+        "FEATURE_LABELS = {\n",
+        "    \"final_price_eur\": \"Final Price (€)\",\n",
+        "    \"distance_km\":     \"Distance (km)\",\n",
+        "    \"duration_min\":    \"Duration (min)\",\n",
+        "    \"discount_pct\":    \"Discount %\",\n",
+        "    \"cancelled\":       \"Cancelled\",\n",
+        "    \"ride_type_enc\":   \"Ride Type\",\n",
+        "    \"time_slot_enc\":   \"Time Slot\",\n",
+        "}\n",
+        "feat_imp = (\n",
+        "    pd.Series(rf.feature_importances_, index=X.columns)\n",
+        "    .rename(index=FEATURE_LABELS)   # unmapped columns keep their raw name\n",
+        "    .sort_values()                  # ascending -> largest bar on top in barh\n",
+        ")\n",
+        "feat_imp.plot(kind=\"barh\", ax=ax1, color=PALETTE[1])\n",
+        "ax1.set_title(\"Random Forest – Feature Importances\\n(Satisfaction Classification)\")\n",
+        "ax1.set_xlabel(\"Importance Score\")\n",
+        "\n",
+        "# ── Chart 2: Confusion matrix ─────────────────────────────────────────────────\n",
+        "ax2 = fig.add_subplot(gs[0, 1])\n",
+        "ConfusionMatrixDisplay.from_predictions(\n",
+        "    y_test, y_pred,\n",
+        "    display_labels=[\"Low\", \"High\"],\n",
+        "    colorbar=False, cmap=\"Blues\", ax=ax2)\n",
+        "ax2.set_title(\"Confusion Matrix\\n(Satisfaction: Low vs High)\")\n",
+        "\n",
+        "# ── Charts 3-5: ARIMA forecasts per city ─────────────────────────────────────\n",
+        "# Each city gets its own grid cell, an ARIMA(2, 1, 2) fit and a forecast band.\n",
+        "arima_positions = [(1, 0), (1, 1), (2, 0)]\n",
+        "arima_models    = {}\n",
+        "\n",
+        "for city, grid_cell in zip(cities_sel, arima_positions):\n",
+        "    ax      = fig.add_subplot(gs[grid_cell])\n",
+        "    history = city_rev[city]\n",
+        "\n",
+        "    # Keep the fitted result so the forecast table cell can reuse it without refitting.\n",
+        "    fitted = ARIMA(history, order=(2, 1, 2)).fit()\n",
+        "    arima_models[city] = fitted\n",
+        "\n",
+        "    # alpha=0.10 -> 90% interval; relabel rows with the weekly dates after the history.\n",
+        "    forecast_df = fitted.get_forecast(steps=FORECAST_STEPS).summary_frame(alpha=0.10)\n",
+        "    forecast_df.index = pd.date_range(\n",
+        "        history.index[-1] + pd.Timedelta(weeks=1),\n",
+        "        periods=FORECAST_STEPS, freq=\"W\")\n",
+        "\n",
+        "    ax.plot(history, color=PALETTE[0], linewidth=1.2, label=\"Historical\")\n",
+        "    ax.plot(forecast_df[\"mean\"], color=PALETTE[2],\n",
+        "            linewidth=2, linestyle=\"--\", label=\"Forecast\")\n",
+        "    ax.fill_between(forecast_df.index,\n",
+        "                    forecast_df[\"mean_ci_lower\"],\n",
+        "                    forecast_df[\"mean_ci_upper\"],\n",
+        "                    alpha=0.25, color=PALETTE[2], label=\"90% CI\")\n",
+        "    ax.set(title=f\"ARIMA Revenue Forecast – {city}\",\n",
+        "           ylabel=\"Weekly Revenue (€)\", xlabel=\"\")\n",
+        "    ax.legend(fontsize=8)\n",
+        "    # Show the y axis in thousands of euros.\n",
+        "    ax.yaxis.set_major_formatter(\n",
+        "        plt.FuncFormatter(lambda v, _: f\"€{v/1000:.0f}k\"))\n",
+        "\n",
+        "# ── Chart 6 (last grid cell): Price sensitivity – avg rating by price bucket ──\n",
+        "price_edges  = [0, 2, 3.5, 5, 6.5, 10]\n",
+        "price_labels = [\"<2\", \"2–3.5\", \"3.5–5\", \"5–6.5\", \">6.5\"]\n",
+        "\n",
+        "# Bucket every ride by final price, then average the rating inside each bucket.\n",
+        "ride_df[\"price_bucket\"] = pd.cut(ride_df[\"final_price_eur\"],\n",
+        "                                  bins=price_edges, labels=price_labels)\n",
+        "price_sens = ride_df.groupby(\"price_bucket\", observed=True)[\"rating\"].mean()\n",
+        "\n",
+        "ax6 = fig.add_subplot(gs[2, 1])\n",
+        "ax6.bar(price_sens.index, price_sens.values, color=PALETTE[3], edgecolor=\"white\")\n",
+        "ax6.set(title=\"Price Sensitivity – Avg. Rating by Price Bucket\",\n",
+        "        xlabel=\"Final Price (€)\", ylabel=\"Avg. Rating\", ylim=(3, 5))\n",
+        "\n",
+        "# Print the average just above each bar.\n",
+        "for bucket, avg_rating in price_sens.items():\n",
+        "    ax6.text(bucket, avg_rating + 0.02, f\"{avg_rating:.2f}★\", ha=\"center\", fontsize=9)\n",
+        "\n",
+        "# Write the composite figure to disk and release it (it is not shown inline).\n",
+        "fig.savefig(\"/content/notebook2_models_output.png\", bbox_inches=\"tight\")\n",
+        "plt.close(fig)\n",
+        "print(\"\\n✅  Model output chart saved → notebook2_models_output.png\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "o7Y2B81jz89H",
+        "outputId": "35a76ae9-1ac1-4de8-bfba-6c349af72d11"
+      },
+      "execution_count": 7,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "✅  Model output chart saved → notebook2_models_output.png\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# SECTION 5 – SAVE FORECAST TABLE\n",
+        "# =============================================================================\n",
+        "def _city_forecast(city):\n",
+        "    \"\"\"Return the 90% forecast summary for one city, dated weekly and tagged with the city name.\"\"\"\n",
+        "    fc = arima_models[city].get_forecast(steps=FORECAST_STEPS).summary_frame(alpha=0.10)\n",
+        "    fc.index = pd.date_range(city_rev[city].index[-1] + pd.Timedelta(weeks=1),\n",
+        "                             periods=FORECAST_STEPS, freq=\"W\")\n",
+        "    return fc.assign(city=city)[[\"city\", \"mean\", \"mean_ci_lower\", \"mean_ci_upper\"]]\n",
+        "\n",
+        "# Reuse the models fitted in the visualisation cell; stack the cities in cities_sel order.\n",
+        "all_forecasts  = [_city_forecast(city) for city in cities_sel]\n",
+        "forecast_table = pd.concat(all_forecasts)\n",
+        "forecast_table.columns = [\"city\", \"forecast_revenue\", \"ci_lower_90\", \"ci_upper_90\"]\n",
+        "\n",
+        "forecast_table.to_csv(\"/content/arima_forecast_table.csv\")\n",
+        "print(\"✅  Forecast table saved → arima_forecast_table.csv\")\n",
+        "print(forecast_table.head(6).round(0).to_string())\n",
+        "\n",
+        "print(\"\\n══════════════════════════════════════════\")\n",
+        "print(\"  NOTEBOOK 2 COMPLETE\")\n",
+        "print(\"══════════════════════════════════════════\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "KY_odiwF0h7r",
+        "outputId": "7e77d938-59cc-4241-e4b9-37dc7cb6eaf5"
+      },
+      "execution_count": 9,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "✅  Forecast table saved → arima_forecast_table.csv\n",
+            "             city  forecast_revenue  ci_lower_90  ci_upper_90\n",
+            "2024-01-07  Paris          124853.0     115042.0     134665.0\n",
+            "2024-01-14  Paris          124946.0     112850.0     137043.0\n",
+            "2024-01-21  Paris          124678.0     110776.0     138581.0\n",
+            "2024-01-28  Paris          124855.0     109164.0     140546.0\n",
+            "2024-02-04  Paris          124759.0     107576.0     141942.0\n",
+            "2024-02-11  Paris          124808.0     106197.0     143420.0\n",
+            "\n",
+            "══════════════════════════════════════════\n",
+            "  NOTEBOOK 2 COMPLETE\n",
+            "══════════════════════════════════════════\n"
+          ]
+        }
+      ]
+    }
+  ]
+}

Real_World_Data_Processing_EDA.ipynb ADDED Viewed

	@@ -0,0 +1,503 @@

+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "r-G_BpFaLoa4",
+    "outputId": "6ce2e622-9704-47a9-a54d-17ea18432dfd"
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
+      "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
+      "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
+      "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
+      "Requirement already satisfied: vaderSentiment in /usr/local/lib/python3.12/dist-packages (3.3.2)\n",
+      "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
+      "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)\n",
+      "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
+      "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
+      "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1)\n",
+      "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
+      "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
+      "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
+      "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n",
+      "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)\n",
+      "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.3)\n",
+      "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)\n",
+      "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
+      "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.7)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.2.25)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# --- 0. INSTALL DEPENDENCIES ---\n",
+    "!pip install pandas numpy matplotlib seaborn vaderSentiment scikit-learn\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.gridspec as gridspec\n",
+    "import seaborn as sns\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
+    "\n",
+    "# \u2500\u2500 Styling \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "PALETTE   = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
+    "sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
+    "plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
+    "                     \"axes.labelsize\": 11, \"font.family\": \"DejaVu Sans\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Seed NumPy's global generator so the simulated rides are identical on every\n",
+    "# Restart & Run All; without it every saved output below is irreproducible.\n",
+    "SEED = 42\n",
+    "np.random.seed(SEED)\n",
+    "\n",
+    "N_RIDES = 1000\n",
+    "cities = [\"Berlin\", \"Munich\", \"Hamburg\", \"Cologne\"]\n",
+    "ride_types = [\"Standard\", \"Premium\", \"XL\", \"Eco\"]\n",
+    "time_slots = [\"Morning (6-10)\", \"Midday (10-14)\", \"Afternoon (14-18)\", \"Evening (18-22)\"]\n",
+    "\n",
+    "# SECTION 1 \u2013 SIMULATE \"SCRAPED / FOUND\" REAL-WORLD DATA\n",
+    "# (In production: replace with actual web-scraped or API-fetched CSVs)\n",
+    "# \u2500\u2500 1a. Ride-level transaction data (quantitative) \u2500\u2500\n",
+    "# Columns are drawn in the order listed, so the seed fixes every column.\n",
+    "ride_data = pd.DataFrame({\n",
+    "    \"ride_id\":       range(1, N_RIDES + 1),\n",
+    "    \"city\":          np.random.choice(cities, N_RIDES),\n",
+    "    \"ride_type\":     np.random.choice(ride_types, N_RIDES, p=[0.40, 0.30, 0.20, 0.10]),\n",
+    "    \"time_slot\":     np.random.choice(time_slots, N_RIDES, p=[0.25, 0.30, 0.30, 0.15]),\n",
+    "    \"distance_km\":   np.round(np.random.exponential(4, N_RIDES) + 0.5, 2),\n",
+    "    \"duration_min\":  np.round(np.random.normal(18, 6, N_RIDES).clip(3), 1),\n",
+    "    \"base_price_eur\":np.round(np.random.uniform(1.5, 8.0, N_RIDES), 2),\n",
+    "    \"discount_pct\":  np.random.choice([0, 5, 10, 15, 20], N_RIDES,\n",
+    "                                       p=[0.50, 0.20, 0.15, 0.10, 0.05]),\n",
+    "    \"rating\":        np.random.choice([1, 2, 3, 4, 5], N_RIDES,\n",
+    "                                       p=[0.03, 0.07, 0.15, 0.40, 0.35]),\n",
+    "    \"cancelled\":     np.random.choice([0, 1], N_RIDES, p=[0.93, 0.07]),\n",
+    "})\n",
+    "\n",
+    "# Introduce 3 % missing values in price and rating (realistic).\n",
+    "# Each column gets its own explicit seed so the two sets of blanked rows are\n",
+    "# reproducible yet chosen independently of each other.\n",
+    "for offset, col in enumerate([\"base_price_eur\", \"rating\"]):\n",
+    "    missing_idx = ride_data.sample(frac=0.03, random_state=SEED + offset).index\n",
+    "    ride_data.loc[missing_idx, col] = np.nan\n",
+    "\n",
+    "# Derived fields. NOTE: these are computed from the NaN-injected base price, so\n",
+    "# final_price_eur and price_per_km are NaN wherever base_price_eur is missing.\n",
+    "ride_data[\"final_price_eur\"] = np.round(\n",
+    "    ride_data[\"base_price_eur\"] * (1 - ride_data[\"discount_pct\"] / 100), 2)\n",
+    "ride_data[\"price_per_km\"] = np.round(\n",
+    "    ride_data[\"final_price_eur\"] / ride_data[\"distance_km\"], 3)"
+   ],
+   "metadata": {
+    "id": "gtbUjaaWMfH-"
+   },
+   "execution_count": 2,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# \u2500\u2500 1b. App-review data (qualitative) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
+    "positive_reviews = [\n",
+    "    \"Absolutely love the e-scooter! Fast, clean, affordable.\",\n",
+    "    \"Seamless booking and the bike was in great condition.\",\n",
+    "    \"Best way to get around the city. Highly recommend!\",\n",
+    "    \"Super convenient, saved me 20 minutes every morning.\",\n",
+    "    \"Eco-friendly and cheap. Will use every day.\",\n",
+    "    \"App works perfectly. Scooter was fully charged.\",\n",
+    "    \"Great service, prices are very fair for the distance.\",\n",
+    "    \"Customer support was helpful and friendly.\",\n",
+    "]\n",
+    "negative_reviews = [\n",
+    "    \"The scooter was broken when I unlocked it. Very frustrating.\",\n",
+    "    \"Overcharged for a 2 km ride. Pricing is confusing.\",\n",
+    "    \"App crashed three times before I could complete the booking.\",\n",
+    "    \"Terrible availability in my neighbourhood. Always empty.\",\n",
+    "    \"The e-bike seat was damaged and uncomfortable.\",\n",
+    "    \"Hidden fees are unacceptable. Totally misleading pricing.\",\n",
+    "    \"Waited 10 minutes to connect to a scooter. Wasted my time.\",\n",
+    "    \"No customer support response after a billing error.\",\n",
+    "]\n",
+    "neutral_reviews = [\n",
+    "    \"It was okay. Nothing special, works as expected.\",\n",
+    "    \"Decent ride, though a bit pricey compared to the metro.\",\n",
+    "    \"Average experience. Some improvements needed in the app.\",\n",
+    "    \"Not bad, but parking zones need to be clearer.\",\n",
+    "    \"Works fine most of the time. Occasional glitches.\",\n",
+    "]\n",
+    "\n",
+    "N_REVIEWS = 1500\n",
+    "\n",
+    "# Build the sampling pool by repeating each review list (x30, x20, x10) so\n",
+    "# positive texts are drawn most often and neutral ones least.\n",
+    "all_reviews = []\n",
+    "for pool, copies in ((positive_reviews, 30), (negative_reviews, 20), (neutral_reviews, 10)):\n",
+    "    all_reviews.extend(pool * copies)\n",
+    "\n",
+    "# Draw the random columns in a fixed order: city, ride type, review text.\n",
+    "sampled_city = np.random.choice(cities, N_REVIEWS)\n",
+    "sampled_type = np.random.choice(ride_types, N_REVIEWS)\n",
+    "sampled_text = np.random.choice(all_reviews, N_REVIEWS)\n",
+    "\n",
+    "review_data = pd.DataFrame({\n",
+    "    \"review_id\":   range(1, N_REVIEWS + 1),\n",
+    "    \"city\":        sampled_city,\n",
+    "    \"ride_type\":   sampled_type,\n",
+    "    \"review_text\": sampled_text,\n",
+    "    \"review_date\": pd.date_range(\"2024-01-01\", periods=N_REVIEWS, freq=\"14h\"),\n",
+    "})\n",
+    "\n",
+    "print(\"\u2705  Data generated\")\n",
+    "print(f\"   ride_data  : {ride_data.shape}\")\n",
+    "print(f\"   review_data: {review_data.shape}\")"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "CzPxA0rAoaHZ",
+    "outputId": "5174afc6-c010-4b9a-9dc8-8f41a6ac4b56"
+   },
+   "execution_count": 4,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\u2705  Data generated\n",
+      "   ride_data  : (1000, 12)\n",
+      "   review_data: (1500, 5)\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# SECTION 2 \u2013 DATA CLEANING\n",
+    "# =============================================================================\n",
+    "\n",
+    "# \u2500\u2500 2a. Ride data \u2500\u2500\n",
+    "print(\"\\n\u2500\u2500 Missing values BEFORE cleaning \u2500\u2500\")\n",
+    "print(ride_data[[\"base_price_eur\", \"rating\"]].isnull().sum())\n",
+    "\n",
+    "# Median imputation; the rating median is rounded so it stays on the 1-5 scale.\n",
+    "ride_data[\"base_price_eur\"] = ride_data[\"base_price_eur\"].fillna(ride_data[\"base_price_eur\"].median())\n",
+    "ride_data[\"rating\"] = ride_data[\"rating\"].fillna(round(ride_data[\"rating\"].median()))\n",
+    "\n",
+    "# final_price_eur and price_per_km were derived while base_price_eur still had\n",
+    "# gaps, so they are NaN on those rows. Recompute them from the imputed price;\n",
+    "# otherwise the NaNs silently survive into the aggregates and the exported data.\n",
+    "ride_data[\"final_price_eur\"] = np.round(\n",
+    "    ride_data[\"base_price_eur\"] * (1 - ride_data[\"discount_pct\"] / 100), 2)\n",
+    "ride_data[\"price_per_km\"] = np.round(\n",
+    "    ride_data[\"final_price_eur\"] / ride_data[\"distance_km\"], 3)\n",
+    "\n",
+    "print(\"\u2500\u2500 Missing values AFTER cleaning \u2500\u2500\")\n",
+    "print(ride_data[[\"base_price_eur\", \"rating\"]].isnull().sum())\n",
+    "\n",
+    "# Guard: neither the imputed columns nor the fields derived from them may contain gaps.\n",
+    "price_cols = [\"base_price_eur\", \"rating\", \"final_price_eur\", \"price_per_km\"]\n",
+    "assert ride_data[price_cols].notna().all().all(), \"NaNs remain after cleaning\"\n",
+    "\n",
+    "# Remove duplicate ride IDs (none expected, but good practice)\n",
+    "ride_data = ride_data.drop_duplicates(subset=\"ride_id\")\n",
+    "\n",
+    "# Drop rides with physically impossible distance\n",
+    "ride_data = ride_data[ride_data[\"distance_km\"] > 0]\n",
+    "\n",
+    "# \u2500\u2500 2b. Review data \u2500\u2500\n",
+    "# Discard reviews without text, then trim surrounding whitespace.\n",
+    "review_data = (\n",
+    "    review_data\n",
+    "    .dropna(subset=[\"review_text\"])\n",
+    "    .assign(review_text=lambda d: d[\"review_text\"].str.strip())\n",
+    ")"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "lnGYrVG7o0aO",
+    "outputId": "be8fb6b5-a485-4495-8ebd-240e741a8ce4"
+   },
+   "execution_count": 5,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\n",
+      "\u2500\u2500 Missing values BEFORE cleaning \u2500\u2500\n",
+      "base_price_eur    30\n",
+      "rating            30\n",
+      "dtype: int64\n",
+      "\u2500\u2500 Missing values AFTER cleaning \u2500\u2500\n",
+      "base_price_eur    0\n",
+      "rating            0\n",
+      "dtype: int64\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# SECTION 3 \u2013 VADER SENTIMENT ANALYSIS ON REVIEWS\n",
+    "# =============================================================================\n",
+    "\n",
+    "!pip install vaderSentiment\n",
+    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
+    "\n",
+    "analyzer = SentimentIntensityAnalyzer()\n",
+    "\n",
+    "def label_from_compound(score):\n",
+    "    \"\"\"Map a VADER compound score to 'Positive' (>= 0.05), 'Negative' (<= -0.05) or 'Neutral'.\"\"\"\n",
+    "    if score >= 0.05:\n",
+    "        return \"Positive\"\n",
+    "    if score <= -0.05:\n",
+    "        return \"Negative\"\n",
+    "    return \"Neutral\"\n",
+    "\n",
+    "def classify_sentiment(text):\n",
+    "    \"\"\"Score one review text with VADER and return its sentiment label.\"\"\"\n",
+    "    return label_from_compound(analyzer.polarity_scores(text)[\"compound\"])\n",
+    "\n",
+    "# Score every review once, then derive the label from the stored score instead\n",
+    "# of running the analyzer a second time per review.\n",
+    "review_data[\"compound_score\"] = review_data[\"review_text\"].apply(\n",
+    "    lambda t: analyzer.polarity_scores(t)[\"compound\"])\n",
+    "review_data[\"sentiment\"] = review_data[\"compound_score\"].apply(label_from_compound)\n",
+    "\n",
+    "print(\"\\n\u2500\u2500 Sentiment distribution \u2500\u2500\")\n",
+    "print(review_data[\"sentiment\"].value_counts())"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "iXvqwQRxo-W6",
+    "outputId": "39694265-24f5-44e0-bc33-0f5300b1b917"
+   },
+   "execution_count": 8,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Collecting vaderSentiment\n",
+      "  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)\n",
+      "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n",
+      "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.7)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.2.25)\n",
+      "Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)\n",
+      "\u001b[2K   \u001b[90m\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u001b[0m \u001b[32m126.0/126.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: vaderSentiment\n",
+      "Successfully installed vaderSentiment-3.3.2\n",
+      "\n",
+      "\u2500\u2500 Sentiment distribution \u2500\u2500\n",
+      "sentiment\n",
+      "Positive    798\n",
+      "Negative    509\n",
+      "Neutral     193\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# SECTION 4 \u2013 MERGING DATASETS\n",
+    "# Summarise rides and reviews to one row per (city, ride_type), then join the\n",
+    "# two summaries on those keys.\n",
+    "# =============================================================================\n",
+    "\n",
+    "group_keys = [\"city\", \"ride_type\"]\n",
+    "\n",
+    "\n",
+    "def _positive_share(labels):\n",
+    "    \"\"\"Return the percentage of entries in `labels` equal to the Positive label.\"\"\"\n",
+    "    return (labels == \"Positive\").sum() / len(labels) * 100\n",
+    "\n",
+    "\n",
+    "# Output column -> (source column, aggregation) for the ride table.\n",
+    "# NOTE(review): the mean of `cancelled` is read as a rate, which assumes the\n",
+    "# column is 0/1 or boolean - confirm against the cell that builds ride_data.\n",
+    "ride_metrics = {\n",
+    "    \"total_rides\":       (\"ride_id\",         \"count\"),\n",
+    "    \"avg_final_price\":   (\"final_price_eur\", \"mean\"),\n",
+    "    \"avg_distance_km\":   (\"distance_km\",     \"mean\"),\n",
+    "    \"avg_rating\":        (\"rating\",          \"mean\"),\n",
+    "    \"cancellation_rate\": (\"cancelled\",       \"mean\"),\n",
+    "    \"avg_price_per_km\":  (\"price_per_km\",    \"mean\"),\n",
+    "}\n",
+    "ride_agg = ride_data.groupby(group_keys).agg(**ride_metrics)\n",
+    "ride_agg = ride_agg.round(3).reset_index()\n",
+    "\n",
+    "# Output column -> (source column, aggregation) for the review table.\n",
+    "review_metrics = {\n",
+    "    \"total_reviews\":      (\"review_id\",      \"count\"),\n",
+    "    \"avg_compound_score\": (\"compound_score\", \"mean\"),\n",
+    "    \"pct_positive\":       (\"sentiment\",      _positive_share),\n",
+    "}\n",
+    "review_agg = review_data.groupby(group_keys).agg(**review_metrics)\n",
+    "review_agg = review_agg.round(3).reset_index()\n",
+    "\n",
+    "# Inner join: only (city, ride_type) pairs present in both summaries are kept.\n",
+    "df = pd.merge(ride_agg, review_agg, on=group_keys, how=\"inner\")\n",
+    "\n",
+    "print(\"\\n\u2500\u2500 Merged dataframe head \u2500\u2500\")\n",
+    "merged_preview = df.head(10).to_string(index=False)\n",
+    "print(merged_preview)"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "7LFPJo32q3Yy",
+    "outputId": "db4f5adc-b5b1-4f1b-8e50-f086e9bc8a21"
+   },
+   "execution_count": 9,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\n",
+      "\u2500\u2500 Merged dataframe head \u2500\u2500\n",
+      "   city ride_type  total_rides  avg_final_price  avg_distance_km  avg_rating  cancellation_rate  avg_price_per_km  total_reviews  avg_compound_score  pct_positive\n",
+      " Berlin       Eco           27            4.723            5.653       4.185              0.037             1.940             80               0.226        62.500\n",
+      " Berlin   Premium           70            4.560            4.137       3.943              0.071             2.449            103               0.129        50.485\n",
+      " Berlin  Standard           96            4.485            4.609       4.052              0.073             1.929            101               0.232        57.426\n",
+      " Berlin        XL           55            4.393            4.182       3.909              0.055             2.499             93               0.178        52.688\n",
+      "Cologne       Eco           30            4.920            3.651       4.067              0.067             2.082             97               0.285        60.825\n",
+      "Cologne   Premium           66            4.182            4.683       3.773              0.136             1.496            100               0.123        53.000\n",
+      "Cologne  Standard           90            4.520            4.614       3.856              0.044             2.071             93               0.106        51.613\n",
+      "Cologne        XL           58            4.483            4.732       4.069              0.052             1.594             93               0.022        40.860\n",
+      "Hamburg       Eco           29            4.459            3.848       4.069              0.069             2.212             91               0.152        56.044\n",
+      "Hamburg   Premium           77            4.546            4.655       3.987              0.039             1.923            103               0.182        53.398\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# SECTION 5 \u2013 EXPLORATORY DATA ANALYSIS (6 charts)\n",
+    "# Draws a 3x2 grid of charts from ride_data / review_data and writes it to\n",
+    "# notebook1_eda_output.png. The figure is closed after saving, not displayed.\n",
+    "# Uses ride_data, review_data, ride_types and np from earlier cells.\n",
+    "# =============================================================================\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.gridspec as gridspec\n",
+    "\n",
+    "# Define PALETTE for styling (moved from initial setup to ensure availability)\n",
+    "PALETTE   = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
+    "\n",
+    "# Column order for the sentiment chart; it lines up with the colour list used there.\n",
+    "SENTIMENT_ORDER = [\"Positive\", \"Neutral\", \"Negative\"]\n",
+    "\n",
+    "# Local seeded generator: the scatter jitter is identical on every run and the\n",
+    "# cell no longer draws from the global NumPy random state.\n",
+    "jitter_rng = np.random.default_rng(42)\n",
+    "\n",
+    "fig = plt.figure(figsize=(20, 22))\n",
+    "fig.suptitle(\"Urban Mobility Startup \u2013 Exploratory Data Analysis\",\n",
+    "             fontsize=17, fontweight=\"bold\", y=0.98)\n",
+    "gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.45, wspace=0.35)\n",
+    "\n",
+    "# \u2500\u2500 Chart 1: Average final price by ride type \u2500\u2500\u2500\u2500\n",
+    "ax1 = fig.add_subplot(gs[0, 0])\n",
+    "price_by_type = ride_data.groupby(\"ride_type\")[\"final_price_eur\"].mean().sort_values()\n",
+    "bars = ax1.barh(price_by_type.index, price_by_type.values,\n",
+    "                color=PALETTE[:len(price_by_type)])\n",
+    "ax1.bar_label(bars, fmt=\"\u20ac%.2f\", padding=4, fontsize=9)\n",
+    "ax1.set_title(\"Avg. Final Price by Ride Type\")\n",
+    "ax1.set_xlabel(\"EUR\")\n",
+    "# Extra room on the right so the value labels are not clipped.\n",
+    "ax1.set_xlim(0, price_by_type.max() * 1.25)\n",
+    "\n",
+    "# \u2500\u2500 Chart 2: Rating distribution \u2500\u2500\u2500\u2500\n",
+    "ax2 = fig.add_subplot(gs[0, 1])\n",
+    "rating_counts = ride_data[\"rating\"].value_counts().sort_index()\n",
+    "ax2.bar(rating_counts.index, rating_counts.values,\n",
+    "        color=PALETTE[1], edgecolor=\"white\", linewidth=0.8)\n",
+    "ax2.set_title(\"Ride Rating Distribution\")\n",
+    "ax2.set_xlabel(\"Stars\")\n",
+    "ax2.set_ylabel(\"Number of Rides\")\n",
+    "ax2.set_xticks([1, 2, 3, 4, 5])\n",
+    "\n",
+    "# \u2500\u2500 Chart 3: Sentiment breakdown by city \u2500\u2500\u2500\u2500\n",
+    "ax3 = fig.add_subplot(gs[1, 0])\n",
+    "sent_city = review_data.groupby([\"city\", \"sentiment\"]).size().unstack(fill_value=0)\n",
+    "sent_city_pct = sent_city.div(sent_city.sum(axis=1), axis=0) * 100\n",
+    "# reindex (not [[...]]) so a sentiment class with no reviews becomes a 0% column\n",
+    "# instead of raising KeyError; shares are computed before this, so they are unchanged.\n",
+    "sent_city_pct.reindex(columns=SENTIMENT_ORDER, fill_value=0).plot(\n",
+    "    kind=\"bar\", ax=ax3, color=[PALETTE[1], PALETTE[3], PALETTE[4]],\n",
+    "    edgecolor=\"white\", linewidth=0.5)\n",
+    "ax3.set_title(\"Review Sentiment by City (%)\")\n",
+    "ax3.set_xlabel(\"\")\n",
+    "ax3.set_ylabel(\"Share (%)\")\n",
+    "ax3.legend(title=\"Sentiment\", fontsize=8)\n",
+    "ax3.tick_params(axis=\"x\", rotation=30)\n",
+    "\n",
+    "# \u2500\u2500 Chart 4: Price per km vs avg rating (scatter) \u2500\u2500\u2500\u2500\n",
+    "ax4 = fig.add_subplot(gs[1, 1])\n",
+    "for i, rt in enumerate(ride_types):\n",
+    "    sub = ride_data[ride_data[\"ride_type\"] == rt]\n",
+    "    # Small vertical jitter so points sharing the same rating do not overlap exactly.\n",
+    "    jitter = jitter_rng.uniform(-0.1, 0.1, len(sub))\n",
+    "    ax4.scatter(sub[\"price_per_km\"], sub[\"rating\"] + jitter,\n",
+    "                label=rt, alpha=0.4, s=14, color=PALETTE[i % len(PALETTE)])\n",
+    "ax4.set_title(\"Price-per-km vs. Ride Rating\")\n",
+    "ax4.set_xlabel(\"Price per km (\u20ac)\")\n",
+    "ax4.set_ylabel(\"Rating (jittered)\")\n",
+    "ax4.legend(fontsize=8, markerscale=1.5)\n",
+    "\n",
+    "# \u2500\u2500 Chart 5: Cancellation rate by time slot \u2500\u2500\u2500\u2500\n",
+    "ax5 = fig.add_subplot(gs[2, 0])\n",
+    "cancel_slot = ride_data.groupby(\"time_slot\")[\"cancelled\"].mean().sort_values() * 100\n",
+    "ax5.bar(cancel_slot.index, cancel_slot.values,\n",
+    "        color=PALETTE[4], edgecolor=\"white\")\n",
+    "ax5.set_title(\"Cancellation Rate by Time Slot (%)\")\n",
+    "ax5.set_ylabel(\"Cancellation Rate (%)\")\n",
+    "# Headroom above the tallest bar for the percentage labels.\n",
+    "ax5.set_ylim(0, cancel_slot.max() * 1.4)\n",
+    "for p, v in zip(cancel_slot.index, cancel_slot.values):\n",
+    "    ax5.text(p, v + 0.1, f\"{v:.1f}%\", ha=\"center\", fontsize=9)\n",
+    "\n",
+    "# \u2500\u2500 Chart 6: Avg compound sentiment score by ride type \u2500\u2500\u2500\u2500\n",
+    "ax6 = fig.add_subplot(gs[2, 1])\n",
+    "sent_type = review_data.groupby(\"ride_type\")[\"compound_score\"].mean().sort_values()\n",
+    "# Non-negative means use PALETTE[1], negative means use PALETTE[4].\n",
+    "colors_sent = [PALETTE[1] if v >= 0 else PALETTE[4] for v in sent_type.values]\n",
+    "ax6.barh(sent_type.index, sent_type.values, color=colors_sent)\n",
+    "ax6.axvline(0, color=\"black\", linewidth=0.8, linestyle=\"--\")\n",
+    "ax6.set_title(\"Avg. VADER Sentiment Score by Ride Type\")\n",
+    "ax6.set_xlabel(\"Compound Score (\u22121 to +1)\")\n",
+    "\n",
+    "# Save and close this specific figure rather than whatever pyplot considers current.\n",
+    "fig.savefig(\"notebook1_eda_output.png\", bbox_inches=\"tight\")\n",
+    "plt.close(fig)\n",
+    "print(\"\\n\u2705  EDA chart saved \u2192 notebook1_eda_output.png\")"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "k3o219Voq9l_",
+    "outputId": "e5065a98-9ee5-4215-f1b2-58da89a93a67"
+   },
+   "execution_count": 14,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\n",
+      "\u2705  EDA chart saved \u2192 notebook1_eda_output.png\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# SECTION 6 \u2013 SAVE CLEANED DATASETS FOR NOTEBOOK 2\n",
+    "# Writes the ride table, the review table and the merged summary as CSV files\n",
+    "# (no index column) to the current working directory.\n",
+    "# =============================================================================\n",
+    "csv_exports = {\n",
+    "    \"ride_data_clean.csv\":   ride_data,\n",
+    "    \"review_data_clean.csv\": review_data,\n",
+    "    \"merged_summary.csv\":    df,\n",
+    "}\n",
+    "for csv_name, frame in csv_exports.items():\n",
+    "    frame.to_csv(csv_name, index=False)\n",
+    "print(\"\u2705  CSVs saved: ride_data_clean.csv | review_data_clean.csv | merged_summary.csv\")\n",
+    "\n",
+    "# Completion banner: blank line, rule, message, rule.\n",
+    "print(\n",
+    "    \"\\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\",\n",
+    "    \"  NOTEBOOK 1 COMPLETE\",\n",
+    "    \"\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\",\n",
+    "    sep=\"\\n\",\n",
+    ")"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Atl1ma1HsOE6",
+    "outputId": "78c7f16c-7d69-40a2-e366-bb7e0aa7a255"
+   },
+   "execution_count": 16,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\u2705  CSVs saved: ride_data_clean.csv | review_data_clean.csv | merged_summary.csv\n",
+      "\n",
+      "\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
+      "  NOTEBOOK 1 COMPLETE\n",
+      "\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n"
+     ]
+    }
+   ]
+  }
+ ]
+}

merged_summary (1).csv ADDED Viewed

	@@ -0,0 +1,21 @@

+city,ride_type,total_rides,avg_final_price_eur,avg_rating,vader_compound,vader_sentiment
+Paris,E-Scooter,320,4.82,4.15,0.12,Positive
+Paris,E-Bike,210,3.95,4.22,0.15,Positive
+Paris,Bus-Connect,150,2.40,4.35,0.18,Positive
+Paris,E-Moto,180,5.50,3.95,0.09,Positive
+Berlin,E-Scooter,380,3.60,3.72,0.01,Neutral
+Berlin,E-Bike,190,3.20,3.95,0.08,Positive
+Berlin,Bus-Connect,160,2.10,4.10,0.10,Positive
+Berlin,E-Moto,140,4.80,3.55,-0.02,Neutral
+Madrid,E-Scooter,350,4.20,4.05,0.17,Positive
+Madrid,E-Bike,220,3.70,4.25,0.20,Positive
+Madrid,Bus-Connect,180,2.80,4.40,0.19,Positive
+Madrid,E-Moto,160,5.10,4.10,0.14,Positive
+Warsaw,E-Scooter,280,3.50,3.65,0.03,Neutral
+Warsaw,E-Bike,160,3.00,3.85,0.05,Neutral
+Warsaw,Bus-Connect,140,1.90,4.00,0.09,Positive
+Warsaw,E-Moto,120,4.30,3.75,0.02,Neutral
+Turin,E-Scooter,200,4.10,3.80,0.06,Positive
+Turin,E-Bike,120,3.50,4.10,0.12,Positive
+Turin,Bus-Connect,100,2.30,4.25,0.15,Positive
+Turin,E-Moto,90,4.70,3.90,0.08,Positive