{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "2ERyVGhbyopK"
   },
   "outputs": [],
   "source": [
    "# Role: Data Analyst\n",
    "# Pipeline:\n",
    "#   CLEAN > ENCODE > SPLIT 80-20 > RANDOM FOREST CLASSIFICATION (satisfaction)\n",
    "#   > ARIMA REVENUE FORECAST > FEATURE IMPORTANCE > EVALUATION\n",
    "# =============================================================================\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.gridspec as gridspec\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "# NOTE(review): this silences every warning, including any convergence\n",
    "# warnings statsmodels may raise while fitting the ARIMA models below.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.metrics import (classification_report,\n",
    "                             ConfusionMatrixDisplay,\n",
    "                             accuracy_score)\n",
    "from statsmodels.tsa.arima.model import ARIMA\n",
    "\n",
    "# Shared colour palette and figure defaults for every chart in this notebook.\n",
    "PALETTE = [\"#2E4057\", \"#048A81\", \"#54C6EB\", \"#EFD28D\", \"#C84B31\"]\n",
    "sns.set_theme(style=\"whitegrid\", palette=PALETTE)\n",
    "plt.rcParams.update({\"figure.dpi\": 130, \"axes.titlesize\": 13,\n",
    "                     \"axes.labelsize\": 11})"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# SECTION 1 \u2013 LOAD DATA FROM NOTEBOOK 1\n",
    "# =============================================================================\n",
    "\n",
    "ride_df = pd.read_csv(\"ride_data_clean.csv\")\n",
    "review_df = pd.read_csv(\"review_data_clean.csv\")\n",
    "merged_df = pd.read_csv(\"merged_summary.csv\")\n",
    "\n",
    "print(f\"Rides: {ride_df.shape} | Reviews: {review_df.shape} | Merged: {merged_df.shape}\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "63oe81VNzi8_",
    "outputId": "245fcee0-e3b0-41ed-9fb3-3ec7c4fd6eba"
   },
   "execution_count": 3,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Rides: (1000, 12) | Reviews: (1500, 7) | Merged: (16, 11)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# SECTION 2 \u2013 CLASSIFICATION: PREDICT USER SATISFACTION (HIGH vs LOW)\n",
    "# Dependent variable   : SatisfactionLabel (High = rating \u2265 4, Low otherwise)\n",
    "# Independent variables: final_price_eur, distance_km, duration_min,\n",
    "#                        discount_pct, cancelled, ride_type, time_slot\n",
    "# =============================================================================\n",
    "\n",
    "# \u2500\u2500 2a. Build classification dataframe \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "clf_df = ride_df[[\n",
    "    \"final_price_eur\", \"distance_km\", \"duration_min\",\n",
    "    \"discount_pct\", \"cancelled\", \"ride_type\", \"time_slot\", \"rating\"\n",
    "]].copy()\n",
    "\n",
    "# NOTE(review): a missing rating compares False here and would be labelled Low;\n",
    "# ride_data_clean.csv is assumed to contain no missing ratings \u2013 TODO confirm.\n",
    "clf_df[\"SatisfactionLabel\"] = (clf_df[\"rating\"] >= 4).astype(int)  # 1=High, 0=Low\n",
    "clf_df = clf_df.drop(columns=\"rating\")\n",
    "\n",
    "# \u2500\u2500 2b. Encode categoricals \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "# One encoder per column, kept so the integer codes can be mapped back later.\n",
    "le_rt = LabelEncoder()\n",
    "le_ts = LabelEncoder()\n",
    "clf_df[\"ride_type_enc\"] = le_rt.fit_transform(clf_df[\"ride_type\"])\n",
    "clf_df[\"time_slot_enc\"] = le_ts.fit_transform(clf_df[\"time_slot\"])\n",
    "clf_df = clf_df.drop(columns=[\"ride_type\", \"time_slot\"])\n",
    "\n",
    "X = clf_df.drop(columns=\"SatisfactionLabel\")\n",
    "y = clf_df[\"SatisfactionLabel\"]\n",
    "\n",
    "# \u2500\u2500 2c. Train / test split 80-20 \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "# stratify=y keeps the High/Low proportions the same in both partitions.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.20, random_state=42, stratify=y)\n",
    "\n",
    "print(f\"\\nTrain size: {len(X_train)} | Test size: {len(X_test)}\")\n",
    "\n",
    "# \u2500\u2500 2d. Random Forest \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "rf = RandomForestClassifier(n_estimators=200, max_depth=8,\n",
    "                            random_state=42, class_weight=\"balanced\")\n",
    "rf.fit(X_train, y_train)\n",
    "y_pred = rf.predict(X_test)\n",
    "\n",
    "print(f\"\\nClassification Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(y_test, y_pred,\n",
    "                            target_names=[\"Low Satisfaction\", \"High Satisfaction\"]))\n"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "nv-OXM0nzywU",
    "outputId": "335d1c9d-6d2b-4878-9aed-2e3e05e11905"
   },
   "execution_count": 4,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "Train size: 800 | Test size: 200\n",
      "\n",
      "Classification Accuracy: 0.6450\n",
      "\n",
      "Classification Report:\n",
      " precision recall f1-score support\n",
      "\n",
      " Low Satisfaction 0.22 0.16 0.18 50\n",
      "High Satisfaction 0.74 0.81 0.77 150\n",
      "\n",
      " accuracy 0.65 200\n",
      " macro avg 0.48 0.48 0.48 200\n",
      " weighted avg 0.61 0.65 0.63 200\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# SECTION 3 \u2013 ARIMA REVENUE FORECAST\n",
    "# Weekly revenue per city is SYNTHETIC (trend + seasonality + noise); it is not\n",
    "# aggregated from ride_df. Forecast horizon: next 12 weeks for 3 sample cities.\n",
    "# =============================================================================\n",
    "\n",
    "np.random.seed(7)\n",
    "N_WEEKS = 104  # 2 years of weekly observations\n",
    "# freq=\"W\" anchors on Sundays, so the first stamp is 2022-01-09, not 2022-01-03.\n",
    "weeks = pd.date_range(\"2022-01-03\", periods=N_WEEKS, freq=\"W\")\n",
    "cities_sel = [\"Paris\", \"Berlin\", \"Madrid\"]\n",
    "\n",
    "# Trend and seasonality are shared by all cities; only the noise draw differs.\n",
    "trend = np.linspace(80_000, 130_000, N_WEEKS)\n",
    "season = 8_000 * np.sin(np.linspace(0, 4 * np.pi, N_WEEKS))\n",
    "\n",
    "city_rev = {}\n",
    "for c in cities_sel:\n",
    "    noise = np.random.normal(0, 5_000, N_WEEKS)\n",
    "    city_rev[c] = pd.Series(trend + season + noise, index=weeks)\n",
    "\n",
    "FORECAST_STEPS = 12\n",
    "\n",
    "\n",
    "def forecast_frame(result, series, steps=FORECAST_STEPS, alpha=0.10):\n",
    "    \"\"\"Return the forecast summary table for the weeks following `series`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    result : fitted ARIMA results object (as returned by ``ARIMA(...).fit()``).\n",
    "    series : the weekly series the model was fitted on; its last index value\n",
    "        is used as the anchor for the forecast dates.\n",
    "    steps : number of weekly periods to forecast.\n",
    "    alpha : significance level of the interval (0.10 gives a 90% interval).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame from ``summary_frame`` (columns include ``mean``,\n",
    "    ``mean_ci_lower`` and ``mean_ci_upper``), indexed by the `steps` weekly\n",
    "    dates that start one week after the last observation.\n",
    "    \"\"\"\n",
    "    frame = result.get_forecast(steps=steps).summary_frame(alpha=alpha)\n",
    "    frame.index = pd.date_range(series.index[-1] + pd.Timedelta(weeks=1),\n",
    "                                periods=steps, freq=\"W\")\n",
    "    return frame"
   ],
   "metadata": {
    "id": "2tsC_F9oz4JO"
   },
   "execution_count": 5,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# SECTION 4 \u2013 VISUALIZATIONS (6 charts on a 3x2 grid)\n",
    "# =============================================================================\n",
    "\n",
    "fig = plt.figure(figsize=(20, 24))\n",
    "fig.suptitle(\"Urban Mobility \u2013 Predictive Analytics & Revenue Forecasting\",\n",
    "             fontsize=17, fontweight=\"bold\", y=0.99)\n",
    "gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.50, wspace=0.35)\n",
    "\n",
    "# \u2500\u2500 Chart 1: Feature importance \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "# Display names are keyed by column name so each importance keeps the label of\n",
    "# the feature it belongs to, whatever order the sort produces.\n",
    "FEATURE_LABELS = {\n",
    "    \"final_price_eur\": \"Final Price (\u20ac)\",\n",
    "    \"distance_km\": \"Distance (km)\",\n",
    "    \"duration_min\": \"Duration (min)\",\n",
    "    \"discount_pct\": \"Discount %\",\n",
    "    \"cancelled\": \"Cancelled\",\n",
    "    \"ride_type_enc\": \"Ride Type\",\n",
    "    \"time_slot_enc\": \"Time Slot\",\n",
    "}\n",
    "ax1 = fig.add_subplot(gs[0, 0])\n",
    "feat_imp = (\n",
    "    pd.Series(rf.feature_importances_, index=X.columns)\n",
    "    .rename(index=FEATURE_LABELS)\n",
    "    .sort_values()\n",
    ")\n",
    "feat_imp.plot(kind=\"barh\", ax=ax1, color=PALETTE[1])\n",
    "ax1.set_title(\"Random Forest \u2013 Feature Importances\\n(Satisfaction Classification)\")\n",
    "ax1.set_xlabel(\"Importance Score\")\n",
    "\n",
    "# \u2500\u2500 Chart 2: Confusion matrix \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "ax2 = fig.add_subplot(gs[0, 1])\n",
    "ConfusionMatrixDisplay.from_predictions(\n",
    "    y_test, y_pred,\n",
    "    display_labels=[\"Low\", \"High\"],\n",
    "    colorbar=False, cmap=\"Blues\", ax=ax2)\n",
    "ax2.set_title(\"Confusion Matrix\\n(Satisfaction: Low vs High)\")\n",
    "\n",
    "# \u2500\u2500 Charts 3-5: ARIMA forecasts per city \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n",
    "arima_positions = [(1, 0), (1, 1), (2, 0)]\n",
    "arima_models = {}  # city -> fitted result, reused when saving the forecast table\n",
    "\n",
    "for (row, col), city in zip(arima_positions, cities_sel):\n",
    "    ax = fig.add_subplot(gs[row, col])\n",
    "    series = city_rev[city]\n",
    "\n",
    "    result = ARIMA(series, order=(2, 1, 2)).fit()\n",
    "    arima_models[city] = result\n",
    "\n",
    "    forecast_df = forecast_frame(result, series, FORECAST_STEPS)\n",
    "\n",
    "    ax.plot(series, color=PALETTE[0], linewidth=1.2, label=\"Historical\")\n",
    "    ax.plot(forecast_df[\"mean\"], color=PALETTE[2],\n",
    "            linewidth=2, linestyle=\"--\", label=\"Forecast\")\n",
    "    ax.fill_between(forecast_df.index,\n",
    "                    forecast_df[\"mean_ci_lower\"],\n",
    "                    forecast_df[\"mean_ci_upper\"],\n",
    "                    alpha=0.25, color=PALETTE[2], label=\"90% CI\")\n",
    "    ax.set_title(f\"ARIMA Revenue Forecast \u2013 {city}\")\n",
    "    ax.set_ylabel(\"Weekly Revenue (\u20ac)\")\n",
    "    ax.set_xlabel(\"\")\n",
    "    ax.legend(fontsize=8)\n",
    "    ax.yaxis.set_major_formatter(\n",
    "        plt.FuncFormatter(lambda v, _: f\"\u20ac{v/1000:.0f}k\"))\n",
    "\n",
    "# \u2500\u2500 Chart 6 (last grid cell): Price sensitivity \u2013 avg rating by price bucket\n",
    "ax6 = fig.add_subplot(gs[2, 1])\n",
    "# The top bin is open-ended so rides priced above 10 land in \">6.5\" instead of\n",
    "# becoming NaN; include_lowest keeps a price of exactly 0 in the first bin.\n",
    "ride_df[\"price_bucket\"] = pd.cut(ride_df[\"final_price_eur\"],\n",
    "                                 bins=[0, 2, 3.5, 5, 6.5, np.inf],\n",
    "                                 labels=[\"<2\", \"2\u20133.5\", \"3.5\u20135\", \"5\u20136.5\", \">6.5\"],\n",
    "                                 include_lowest=True)\n",
    "price_sens = ride_df.groupby(\"price_bucket\", observed=True)[\"rating\"].mean()\n",
    "ax6.bar(price_sens.index, price_sens.values, color=PALETTE[3], edgecolor=\"white\")\n",
    "ax6.set_title(\"Price Sensitivity \u2013 Avg. Rating by Price Bucket\")\n",
    "ax6.set_xlabel(\"Final Price (\u20ac)\")\n",
    "ax6.set_ylabel(\"Avg. Rating\")\n",
    "ax6.set_ylim(3, 5)\n",
    "for p, v in zip(price_sens.index, price_sens.values):\n",
    "    ax6.text(p, v + 0.02, f\"{v:.2f}\u2605\", ha=\"center\", fontsize=9)\n",
    "\n",
    "fig.savefig(\"notebook2_models_output.png\", bbox_inches=\"tight\")\n",
    "plt.close(fig)\n",
    "print(\"\\n\u2705 Model output chart saved \u2192 notebook2_models_output.png\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "o7Y2B81jz89H",
    "outputId": "35a76ae9-1ac1-4de8-bfba-6c349af72d11"
   },
   "execution_count": 7,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\n",
      "\u2705 Model output chart saved \u2192 notebook2_models_output.png\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# SECTION 5 \u2013 SAVE FORECAST TABLE\n",
    "# =============================================================================\n",
    "# One frame per city, built from the models fitted in Section 4.\n",
    "all_forecasts = []\n",
    "for city in cities_sel:\n",
    "    fc = forecast_frame(arima_models[city], city_rev[city], FORECAST_STEPS)\n",
    "    fc[\"city\"] = city\n",
    "    all_forecasts.append(fc[[\"city\", \"mean\", \"mean_ci_lower\", \"mean_ci_upper\"]])\n",
    "\n",
    "forecast_table = pd.concat(all_forecasts)\n",
    "forecast_table.columns = [\"city\", \"forecast_revenue\", \"ci_lower_90\", \"ci_upper_90\"]\n",
    "forecast_table.to_csv(\"arima_forecast_table.csv\")\n",
    "print(\"\u2705 Forecast table saved \u2192 arima_forecast_table.csv\")\n",
    "print(forecast_table.head(6).round(0).to_string())\n",
    "\n",
    "print(\"\\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\")\n",
    "print(\" NOTEBOOK 2 COMPLETE\")\n",
    "print(\"\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "KY_odiwF0h7r",
    "outputId": "7e77d938-59cc-4241-e4b9-37dc7cb6eaf5"
   },
   "execution_count": 9,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\u2705 Forecast table saved \u2192 arima_forecast_table.csv\n",
      " city forecast_revenue ci_lower_90 ci_upper_90\n",
      "2024-01-07 Paris 124853.0 115042.0 134665.0\n",
      "2024-01-14 Paris 124946.0 112850.0 137043.0\n",
      "2024-01-21 Paris 124678.0 110776.0 138581.0\n",
      "2024-01-28 Paris 124855.0 109164.0 140546.0\n",
      "2024-02-04 Paris 124759.0 107576.0 141942.0\n",
      "2024-02-11 Paris 124808.0 106197.0 143420.0\n",
      "\n",
      "\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n",
      " NOTEBOOK 2 COMPLETE\n",
      "\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n"
     ]
    }
   ]
  }
 ]
}