"""Build a synthetic delivery-performance dataset from ``Delivery_Logistics.csv``.

Stages (each one is a pure function that returns a new DataFrame):

1. ``standardize_column_names`` / ``convert_column_types`` / ``fill_missing_values``
   clean the raw export.
2. ``add_expected_and_actual_times`` replaces the two time columns with values
   modelled from distance, vehicle, weather, delivery mode and region.
3. ``balance_delay_distribution`` snaps the actual time onto four fixed
   multiples of the expected time.
4. ``add_performance_features`` adds a noisy 1-5 delay score, a text label and
   a distance bucket.

The load/transform/save flow at the bottom only runs when the code is executed
directly (a notebook kernel runs cells with ``__name__ == "__main__"``), so the
functions can also be imported and tested without the CSV being present.
"""

import random
import warnings

import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
file_path = "Delivery_Logistics.csv"
output_path = "synthetic_delivery_data.csv"

SEED = 2025
BASE_DIVISOR_KM_PER_HOUR = 45   # expected hours start as distance_km / 45
MIN_TIME_HOURS = 0.5            # floor applied to both modelled time columns

time_cols = ["delivery_time_hours", "expected_time_hours"]

numeric_cols = [
    "distance_km",
    "package_weight_kg",
    "delivery_time_hours",
    "expected_time_hours",
    "delivery_rating",
    "delivery_cost",
]

categorical_cols = [
    "delivery_partner",
    "package_type",
    "vehicle_type",
    "delivery_mode",
    "region",
    "weather_condition",
    "delayed",
    "delivery_status",
]

# Columns whose text is normalised (strip + lower) before the lookups below.
text_cols = ["vehicle_type", "weather_condition", "delivery_mode", "region"]

# Hours ADDED to the expected time, per category value.
vehicle_adjustment = {"bike": 1.2, "van": 0.5, "truck": 0.8, "ev van": 0.4}
weather_adjustment = {
    "clear": 0.0,
    "cloudy": 0.2,
    "foggy": 0.6,
    "rainy": 0.8,
    "stormy": 1.2,
    "cold": 0.2,
    "hot": 0.2,
    "windy": 0.3,
}
mode_adjustment = {"same day": 0.3, "express": 0.2, "two day": 0.7, "standard": 0.5}
region_adjustment = {"central": 0.6, "north": 0.3, "south": 0.3, "east": 0.4, "west": 0.4}

# Factors MULTIPLIED into the expected time to obtain the actual time.
vehicle_actual_multiplier = {"bike": 1.05, "van": 0.95, "truck": 1.02, "ev van": 0.97}
weather_actual_multiplier = {
    "clear": 0.95,
    "cloudy": 1.00,
    "foggy": 1.05,
    "rainy": 1.10,
    "stormy": 1.20,
    "cold": 1.02,
    "hot": 1.02,
    "windy": 1.03,
}
mode_actual_multiplier = {"same day": 1.05, "express": 1.02, "two day": 0.97, "standard": 1.00}
region_actual_multiplier = {"central": 1.08, "north": 1.00, "south": 1.01, "east": 1.02, "west": 1.03}

# (column, additive table, additive fallback, multiplier table, multiplier fallback)
# Fallbacks are used for category values that are missing from a table.
_TIME_FACTORS = [
    ("vehicle_type", vehicle_adjustment, 0.5, vehicle_actual_multiplier, 1.00),
    ("weather_condition", weather_adjustment, 0.3, weather_actual_multiplier, 1.00),
    ("delivery_mode", mode_adjustment, 0.4, mode_actual_multiplier, 1.00),
    ("region", region_adjustment, 0.3, region_actual_multiplier, 1.00),
]

_PERFORMANCE_LABELS = {4: "Good", 3: "Average", 2: "Poor"}


# ---------------------------------------------------------------------------
# Cleaning
# ---------------------------------------------------------------------------
def standardize_column_names(df):
    """Return a copy of ``df`` whose column names are stripped and lower-cased."""
    out = df.copy()
    out.columns = out.columns.str.strip().str.lower()
    return out


def _trailing_number(series):
    """Parse the text after the last ``.`` of every entry as a number.

    Entries without a ``.`` are parsed whole; anything unparseable becomes NaN.
    NOTE(review): presumably the raw time columns are timestamp-like strings
    whose fractional part carries the hour count - confirm against the raw CSV.
    A plain decimal such as ``"8.5"`` would be read as ``5``.
    """
    return pd.to_numeric(series.astype(str).str.split(".").str[-1], errors="coerce")


def convert_column_types(df):
    """Return a copy of ``df`` with numeric columns parsed and ``delayed`` as text.

    Unparseable numeric entries become NaN. Missing ``delayed`` entries stay
    missing (they are NOT turned into the string ``"nan"``), so that
    ``fill_missing_values`` can still replace them.
    """
    out = df.copy()
    for col in time_cols:
        out[col] = _trailing_number(out[col])
    for col in numeric_cols:
        if col not in time_cols:
            out[col] = pd.to_numeric(out[col], errors="coerce")
    # astype(str) alone would stringify NaN; mask those positions back to missing.
    out["delayed"] = out["delayed"].astype(str).where(out["delayed"].notna())
    return out


def fill_missing_values(df):
    """Return a copy with numeric gaps filled by the median and text gaps by the mode.

    A categorical column with no non-missing value has no mode and is left as is.
    """
    out = df.copy()
    for col in numeric_cols:
        out[col] = out[col].fillna(out[col].median())
    for col in categorical_cols:
        modes = out[col].mode()
        if not modes.empty:
            out[col] = out[col].fillna(modes.iloc[0])
    return out


# ---------------------------------------------------------------------------
# Time modelling
# ---------------------------------------------------------------------------
def add_expected_and_actual_times(df):
    """Return a copy of ``df`` with modelled time columns and a first delay flag.

    * ``expected_time_hours`` = ``distance_km / 45`` plus one additive
      adjustment per factor column.
    * ``delivery_time_hours`` = that (unclipped) expected time multiplied by one
      factor per column.
    * Both are then floored at 0.5 and rounded to 2 decimals.
    * ``delay_hours`` is their difference; ``calculated_delay`` is ``"yes"``
      when it is positive, otherwise ``"no"``.

    The four factor columns are normalised with strip + lower first.
    """
    out = df.copy()
    for col in text_cols:
        out[col] = out[col].astype(str).str.strip().str.lower()

    expected = out["distance_km"] / BASE_DIVISOR_KM_PER_HOUR
    for col, additive, additive_default, _, _ in _TIME_FACTORS:
        expected = expected + out[col].map(additive).fillna(additive_default)

    actual = expected
    for col, _, _, multiplier, multiplier_default in _TIME_FACTORS:
        actual = actual * out[col].map(multiplier).fillna(multiplier_default)

    out["expected_time_hours"] = expected.clip(lower=MIN_TIME_HOURS).round(2)
    out["delivery_time_hours"] = actual.clip(lower=MIN_TIME_HOURS).round(2)
    out["delay_hours"] = out["delivery_time_hours"] - out["expected_time_hours"]
    out["calculated_delay"] = np.where(out["delay_hours"] > 0, "yes", "no")
    return out


def balance_delay_distribution(df):
    """Snap ``delivery_time_hours`` to a fixed multiple of ``expected_time_hours``.

    The multiple is chosen from the incoming actual/expected ratio:
    ``< 0.98 -> 0.95``, ``< 1.05 -> 1.00``, ``< 1.15 -> 1.10``, else ``1.25``.

    ``delay_hours``, ``calculated_delay`` and ``delay_ratio`` are all derived
    from the rebalanced, rounded time so the saved columns agree with each
    other (previously ``delay_ratio`` kept the pre-balancing value).
    """
    out = df.copy()
    ratio = out["delivery_time_hours"] / out["expected_time_hours"]
    factor = np.select(
        [ratio < 0.98, ratio < 1.05, ratio < 1.15],
        [0.95, 1.00, 1.10],
        default=1.25,
    )

    out["delivery_time_hours"] = (out["expected_time_hours"] * factor).round(2)
    out["delay_hours"] = (
        out["delivery_time_hours"] - out["expected_time_hours"]
    ).round(2)
    out["calculated_delay"] = np.where(out["delay_hours"] > 0, "yes", "no")
    out["delay_ratio"] = out["delivery_time_hours"] / out["expected_time_hours"]
    return out


# ---------------------------------------------------------------------------
# Performance features
# ---------------------------------------------------------------------------
def generate_delay_score(delay, rng=random):
    """Score a delay from 1 (worst) to 5 (best) with a little random jitter.

    Base score: ``<= 0 -> 5``, ``<= 2 -> 4``, ``<= 5 -> 3``, ``<= 8 -> 2``,
    otherwise 1 (a NaN delay fails every comparison and therefore also gets 1).
    A jitter of -1, 0 or +1 (weights 1:3:1) is added and the result is clamped
    to the 1-5 range.

    Parameters
    ----------
    delay : float
        Delay in hours; zero or negative means on time or early.
    rng : object with a ``choices(population, weights=...)`` method
        Source of randomness. Defaults to the global ``random`` module, so
        ``random.seed`` controls reproducibility.

    Returns
    -------
    int
    """
    if delay <= 0:
        base = 5
    elif delay <= 2:
        base = 4
    elif delay <= 5:
        base = 3
    elif delay <= 8:
        base = 2
    else:
        base = 1

    noise = rng.choices([-1, 0, 1], weights=[1, 3, 1])[0]
    return max(1, min(5, base + noise))


def get_performance_label(score):
    """Translate a delay score into its label.

    ``>= 5 -> "Excellent"``, ``4 -> "Good"``, ``3 -> "Average"``,
    ``2 -> "Poor"``, anything else ``"Critical"``.
    """
    if score >= 5:
        return "Excellent"
    return _PERFORMANCE_LABELS.get(score, "Critical")


def add_distance_category(df):
    """Return a copy of ``df`` with ``distance_km`` bucketed into ``distance_category``.

    Buckets: ``[0, 50]`` Short, ``(50, 150]`` Medium, ``(150, 300]`` Long,
    ``(300, inf)`` Very Long. ``include_lowest`` keeps a distance of exactly 0
    in "Short" instead of leaving it uncategorised.
    """
    out = df.copy()
    out["distance_category"] = pd.cut(
        out["distance_km"],
        bins=[0, 50, 150, 300, float("inf")],
        labels=["Short", "Medium", "Long", "Very Long"],
        include_lowest=True,
    )
    return out


def add_performance_features(df):
    """Return a copy with ``delay_hours``, ``delay_score``, ``performance_label``
    and ``distance_category`` (re)computed.

    ``delay_hours`` is rounded to 2 decimals so float noise from subtracting
    rounded times cannot push a value across a scoring threshold. One draw is
    taken from the global ``random`` module per row, in row order.
    """
    out = df.copy()
    out["delay_hours"] = (
        out["delivery_time_hours"] - out["expected_time_hours"]
    ).round(2)
    out["delay_score"] = out["delay_hours"].apply(generate_delay_score)
    out["performance_label"] = out["delay_score"].map(get_performance_label)
    return add_distance_category(out)


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Load
    df = pd.read_csv(file_path)
    df.info()
    print(df.columns)

    # Clean (the raw frame `df` is left untouched)
    df_clean = standardize_column_names(df)
    print(df_clean.isnull().sum())

    df_clean = convert_column_types(df_clean.drop_duplicates())
    print(df_clean[time_cols].head())
    print(df_clean.dtypes)
    print(df_clean.isnull().sum())

    df_clean = fill_missing_values(df_clean)
    print(df_clean.isnull().sum())

    # Model times, then shape the delay distribution
    df_adjusted = add_expected_and_actual_times(df_clean)
    df_balanced = balance_delay_distribution(df_adjusted)

    # NOTE(review): blanket suppression kept from the original cell; consider
    # narrowing it to the specific warning category it was meant to hide.
    warnings.filterwarnings("ignore")
    random.seed(SEED)
    np.random.seed(SEED)

    synthetic_delivery_data = add_performance_features(df_balanced)
    assert synthetic_delivery_data["delay_score"].between(1, 5).all()

    # Save
    synthetic_delivery_data.to_csv(output_path, index=False)

    print("Data creation complete.")
    print("Saved: synthetic_delivery_data.csv")
    print("Rows and columns:", synthetic_delivery_data.shape)
    print("Columns:", synthetic_delivery_data.columns.tolist())