{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6b9d75fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from scipy.stats import t\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_absolute_error as MAE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cc5beb78",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows drawn at random from each file for one repetition.\n",
    "N = 10794\n",
    "\n",
    "filenames = ['minDist200m_approx.parquet',\n",
    "             'minDist500m_approx.parquet',\n",
    "             'minDist1km_approx.parquet',\n",
    "             'minDist3km_approx.parquet',\n",
    "             'minDist5km_approx.parquet',\n",
    "             'minDist7km_approx.parquet',\n",
    "             'minDist10km_approx.parquet',\n",
    "             'minDist15km_approx.parquet',\n",
    "             'minDist20km_approx.parquet',\n",
    "             'minDist30km_approx.parquet',\n",
    "             'minDist35km_approx.parquet']\n",
    "\n",
    "# filenames = ['minDist3km_approx.parquet']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7e4a1d2",
   "metadata": {},
   "source": [
    "## Test MAE per input file\n",
    "\n",
    "For each file in `filenames`, the next cell repeats the following `N_samp` times:\n",
    "\n",
    "1. draw `N` rows at random from the file,\n",
    "2. fit a default `XGBRegressor` on 80% of them to predict the centred, rescaled `THICK` column from `inputs`,\n",
    "3. bootstrap the held-out 20% to obtain a mean absolute error in the original `THICK` units.\n",
    "\n",
    "`MEANS` holds the average of the `N_samp` bootstrapped MAEs for each file, and `INTERVALS` the matching 95% Student-t interval."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "49a139c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing file: minDist200m_approx.parquet\n",
      "processing file: minDist500m_approx.parquet\n",
      "processing file: minDist1km_approx.parquet\n",
      "processing file: minDist3km_approx.parquet\n",
      "processing file: minDist5km_approx.parquet\n",
      "processing file: minDist7km_approx.parquet\n",
      "processing file: minDist10km_approx.parquet\n",
      "processing file: minDist15km_approx.parquet\n",
      "processing file: minDist20km_approx.parquet\n",
      "processing file: minDist30km_approx.parquet\n",
      "processing file: minDist35km_approx.parquet\n"
     ]
    }
   ],
   "source": [
    "inputs = ['vx', 'vy', 'v', 'smb', 'z', 's', 'temp']\n",
    "\n",
    "N_samp = 4           # repetitions per file, each with a fresh row sample and a fresh model\n",
    "n_bootstrap = 10000  # bootstrap resamples of the test set within one repetition\n",
    "SEED = 42            # seeds the row sampling and the bootstrap draws below\n",
    "\n",
    "# Re-create the generator every time this cell runs, so that re-running the\n",
    "# cell (or Restart & Run All) reproduces the same numbers.\n",
    "rng = np.random.default_rng(SEED)\n",
    "\n",
    "# 97.5% Student-t quantile with N_samp - 1 degrees of freedom, i.e. the\n",
    "# multiplier for a two-sided 95% interval on a mean of N_samp values.\n",
    "t_value = t.ppf(0.975, df=N_samp - 1)\n",
    "\n",
    "MEANS = []      # one entry per file: mean of the N_samp bootstrapped MAEs\n",
    "INTERVALS = []  # one entry per file: (lower, upper) t-interval around that mean\n",
    "\n",
    "for filename in filenames:\n",
    "    print('processing file:', filename)\n",
    "\n",
    "    # Read the file once; only the random row sample changes between repetitions.\n",
    "    full_data = pd.read_parquet(filename, columns=['THICK'] + inputs)\n",
    "\n",
    "    mean_maes = []\n",
    "    for i in range(N_samp):\n",
    "        data = full_data.sample(N, random_state=rng)\n",
    "        target = data['THICK']\n",
    "\n",
    "        # Centre the target and divide by 1000 for fitting; this is undone\n",
    "        # below before any error is computed.\n",
    "        mu = np.mean(target)\n",
    "        target = (target - mu) / 1000\n",
    "\n",
    "        Xtrain, Xtest, ytrain, ytest = train_test_split(\n",
    "            data[inputs], target, test_size=0.2, random_state=42)\n",
    "\n",
    "        model = xgb.XGBRegressor()\n",
    "        model.fit(Xtrain, ytrain)\n",
    "\n",
    "        # Back to the original THICK units (presumably metres, going by the\n",
    "        # variable name -- confirm against the data source).\n",
    "        ytest_meters = np.array(ytest * 1000 + mu)\n",
    "        preds = model.predict(Xtest) * 1000 + mu\n",
    "\n",
    "        # MAE is the mean of the absolute errors, so compute the errors once\n",
    "        # and average a resampled set of them for every bootstrap draw.\n",
    "        abs_errors = np.abs(ytest_meters - preds)\n",
    "        n_test_samples = len(abs_errors)\n",
    "\n",
    "        maes = []\n",
    "        for _ in range(n_bootstrap):\n",
    "            # Indices of a bootstrap sample of the test set (with replacement).\n",
    "            bootstrap_indices = rng.integers(0, n_test_samples, size=n_test_samples)\n",
    "            maes.append(abs_errors[bootstrap_indices].mean())\n",
    "\n",
    "        mean_bootstrapped_mae = np.mean(maes)\n",
    "        mean_maes.append(mean_bootstrapped_mae)\n",
    "\n",
    "    # ddof=1: a t-interval with N_samp - 1 degrees of freedom is built on the\n",
    "    # sample standard deviation. NumPy's default (ddof=0) divides by N_samp\n",
    "    # instead and would make the interval too narrow.\n",
    "    std = np.std(mean_maes, ddof=1)\n",
    "    mean = np.mean(mean_maes)\n",
    "\n",
    "    half_width = t_value * std / np.sqrt(N_samp)\n",
    "    lower = mean - half_width\n",
    "    upper = mean + half_width\n",
    "\n",
    "    MEANS.append(mean)\n",
    "    INTERVALS.append((lower, upper))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "64c1f084",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([np.float64(219.71600702690554),\n",
       "  np.float64(221.95568110229394),\n",
       "  np.float64(225.20711659724563),\n",
       "  np.float64(234.09904722749985),\n",
       "  np.float64(242.73952189110307),\n",
       "  np.float64(244.78165238152948),\n",
       "  np.float64(246.92594497216655),\n",
       "  np.float64(256.9065720582111),\n",
       "  np.float64(259.7788585419548),\n",
       "  np.float64(261.78795878637084),\n",
       "  np.float64(265.17033243487754)],\n",
       " [(np.float64(212.8349177211931), np.float64(226.59709633261798)),\n",
       "  (np.float64(209.89354325783856), np.float64(234.01781894674932)),\n",
       "  (np.float64(217.46373647116374), np.float64(232.95049672332752)),\n",
       "  (np.float64(221.72703010535272), np.float64(246.47106434964698)),\n",
       "  (np.float64(237.0747433035595), np.float64(248.40430047864663)),\n",
       "  (np.float64(234.63312391501947), np.float64(254.9301808480395)),\n",
       "  (np.float64(238.90972359624917), np.float64(254.94216634808393)),\n",
       "  (np.float64(251.54701006842788), np.float64(262.26613404799434)),\n",
       "  (np.float64(254.3588026929518), np.float64(265.1989143909578)),\n",
       "  (np.float64(248.75349557277002), np.float64(274.82242199997165)),\n",
       "  (np.float64(255.76308557956034), np.float64(274.57757929019476))])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "MEANS, INTERVALS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "e25426df",
   "metadata": {},
   "outputs": [
    {
     "name":
"stdout", "output_type": "stream", "text": [ "48349\n" ] }, { "data": { "text/html": [ "
| \n", " | THICK | \n", "vx | \n", "vy | \n", "v | \n", "smb | \n", "z | \n", "s | \n", "temp | \n", "
|---|---|---|---|---|---|---|---|---|
| 27229 | \n", "3386.250000 | \n", "0.865702 | \n", "-2.477422 | \n", "2.624320 | \n", "51.509872 | \n", "3496.107752 | \n", "0.002632 | \n", "224.609177 | \n", "
| 16336 | \n", "1685.376000 | \n", "3.717804 | \n", "-417.919596 | \n", "417.936132 | \n", "487.795838 | \n", "703.445660 | \n", "0.001437 | \n", "252.450467 | \n", "
| 44013 | \n", "2358.100000 | \n", "3.144033 | \n", "5.880538 | \n", "6.668259 | \n", "151.465501 | \n", "1960.367684 | \n", "0.004441 | \n", "246.175460 | \n", "
| 32146 | \n", "2548.690000 | \n", "-11.739543 | \n", "-6.705746 | \n", "13.519760 | \n", "20.892846 | \n", "2498.718921 | \n", "0.004913 | \n", "230.215486 | \n", "
| 13551 | \n", "723.000000 | \n", "289.773342 | \n", "-129.406338 | \n", "317.355621 | \n", "86.020222 | \n", "115.246588 | \n", "0.001939 | \n", "250.895980 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 46583 | \n", "1608.644600 | \n", "-1.419614 | \n", "-0.547095 | \n", "1.521387 | \n", "25.067829 | \n", "3708.351833 | \n", "0.012573 | \n", "222.767952 | \n", "
| 4548 | \n", "1626.380000 | \n", "-3.206802 | \n", "-16.711924 | \n", "17.016815 | \n", "113.983633 | \n", "2079.887850 | \n", "0.004253 | \n", "239.707678 | \n", "
| 5850 | \n", "823.370000 | \n", "0.181005 | \n", "-4.659279 | \n", "4.662794 | \n", "460.924433 | \n", "822.678150 | \n", "0.014722 | \n", "254.373908 | \n", "
| 24659 | \n", "592.726167 | \n", "-71.054724 | \n", "327.782682 | \n", "335.395677 | \n", "509.965450 | \n", "62.671014 | \n", "0.000863 | \n", "256.296374 | \n", "
| 37527 | \n", "2969.210000 | \n", "-0.112218 | \n", "-2.283875 | \n", "2.286630 | \n", "43.858004 | \n", "3583.883006 | \n", "0.002459 | \n", "223.518175 | \n", "
10794 rows × 8 columns
\n", "XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" feature_weights=None, gamma=None, grow_policy=None,\n",
" importance_type=None, interaction_constraints=None,\n",
" learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
" max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
" max_leaves=None, min_child_weight=None, missing=nan,\n",
" monotone_constraints=None, multi_strategy=None, n_estimators=None,\n",
" n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" feature_weights=None, gamma=None, grow_policy=None,\n",
" importance_type=None, interaction_constraints=None,\n",
" learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
" max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
" max_leaves=None, min_child_weight=None, missing=nan,\n",
" monotone_constraints=None, multi_strategy=None, n_estimators=None,\n",
" n_jobs=None, num_parallel_tree=None, ...)XGBRegressor(base_score=None, booster=None, boosting_type='gbtree',\n",
" callbacks=None, colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=np.float64(0.8155266936013428), device=None,\n",
" early_stopping_rounds=None, enable_categorical=False,\n",
" eval_metric=None, feature_types=None, feature_weights=None,\n",
" gamma=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None,\n",
" learning_rate=np.float64(0.13156640917695547), max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=11, max_leaves=36,\n",
" min_child_weight=44, missing=nan, monotone_constraints=None,\n",
" multi_strategy=None, n_estimators=110, n_jobs=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, boosting_type='gbtree',\n",
" callbacks=None, colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=np.float64(0.8155266936013428), device=None,\n",
" early_stopping_rounds=None, enable_categorical=False,\n",
" eval_metric=None, feature_types=None, feature_weights=None,\n",
" gamma=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None,\n",
" learning_rate=np.float64(0.13156640917695547), max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=11, max_leaves=36,\n",
" min_child_weight=44, missing=nan, monotone_constraints=None,\n",
" multi_strategy=None, n_estimators=110, n_jobs=None, ...)