{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0a1b2c3d",
   "metadata": {},
   "source": [
    "# Duplicate (EAST, NORTH) coordinates in the Bedmap training table\n",
    "\n",
    "Reads a seeded sample of the Parquet row groups, measures how much `THICK` disagrees between rows that share a coordinate, and reduces every consistent group of duplicates to a single row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f90bfb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import os\n",
    "import random\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import pyarrow.parquet as pq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7d2c0a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the training table. Set the BEDMAP_TRAIN_PARQUET environment variable to\n",
    "# point at your own copy; the fallback is the path this notebook was first written with.\n",
    "PARQUET_PATH = Path(os.environ.get(\n",
    "    'BEDMAP_TRAIN_PARQUET',\n",
    "    r'C:\\Users\\marku\\Desktop\\4år\\AML\\AppliedML2025\\Final project\\antarctica ml\\AppML_2025\\tabular_train_dataset\\bedmap_train2_30m.parquet',\n",
    "))\n",
    "\n",
    "SEED = 42                                  # fixes which row groups get sampled\n",
    "SAMPLE_FRACTION = 0.1                      # share of the row groups that is read\n",
    "DROP_COLUMNS = ['LON', 'LAT', 'geometry']  # columns that are never loaded\n",
    "COORD_COLS = ['EAST', 'NORTH']             # columns that identify a location\n",
    "MAX_RANGE_RATIO = 0.025                    # duplicates with THICK range / mean up to this are collapsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fedd7106",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the Parquet file\n",
    "parquet_file = pq.ParquetFile(PARQUET_PATH)\n",
    "num_row_groups = parquet_file.num_row_groups\n",
    "\n",
    "# Seeded sample of row groups, so Restart & Run All reproduces the same subset\n",
    "sample_size = max(1, int(num_row_groups * SAMPLE_FRACTION))\n",
    "selected_groups = sorted(random.Random(SEED).sample(range(num_row_groups), sample_size))\n",
    "\n",
    "# Leave the unwanted columns out at read time instead of loading and then dropping them\n",
    "keep_columns = [name for name in parquet_file.schema_arrow.names if name not in DROP_COLUMNS]\n",
    "\n",
    "# One read for all selected row groups; reset_index gives the same 0..n-1 index\n",
    "# that concatenating per-group frames with ignore_index=True produced\n",
    "data = (\n",
    "    parquet_file.read_row_groups(selected_groups, columns=keep_columns)\n",
    "    .to_pandas()\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "105c726d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Not in place: re-running this cell always starts from a well-defined frame\n",
    "data = data.sort_values(by=COORD_COLS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e72f55d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep=False marks every member of a duplicated coordinate, not only the repeats.\n",
    "# sort_values returns a new frame, so no in-place write hits a slice of `data`\n",
    "# (that is what raised SettingWithCopyWarning before).\n",
    "is_duplicated = data.duplicated(subset=COORD_COLS, keep=False)\n",
    "duplicates = data.loc[is_duplicated].sort_values(by=COORD_COLS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98f5dbf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"no. duplicates: {len(duplicates)}.\")\n",
    "\n",
    "num_duped_coord = duplicates[COORD_COLS].drop_duplicates().shape[0]\n",
    "print(\"Unique (EAST, NORTH) pairs that have duplicates:\", num_duped_coord)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e1a206f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-coordinate spread of THICK, using built-in aggregations only (no Python lambda per group)\n",
    "summary = (\n",
    "    duplicates.groupby(COORD_COLS)['THICK']\n",
    "    .agg(THICK_min='min', THICK_max='max', THICK_mean='mean', THICK_median='median')\n",
    "    .reset_index()\n",
    "    .assign(\n",
    "        THICK_range=lambda s: s['THICK_max'] - s['THICK_min'],\n",
    "        THICK_range_ratio=lambda s: s['THICK_range'] / s['THICK_mean'],\n",
    "    )\n",
    "    .loc[:, COORD_COLS + ['THICK_range', 'THICK_mean', 'THICK_median', 'THICK_range_ratio']]\n",
    ")\n",
    "\n",
    "summary.head(25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b7c1579",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Median and mean ratio, number of duplicated coordinates, and how many of them\n",
    "# exceed a 1% ratio and the MAX_RANGE_RATIO threshold\n",
    "range_ratio = summary['THICK_range_ratio']\n",
    "print(range_ratio.median())\n",
    "print(range_ratio.mean())\n",
    "print(len(summary))\n",
    "print(int((range_ratio > 0.01).sum()))\n",
    "print(int((range_ratio > MAX_RANGE_RATIO).sum()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4ea1f81",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Coordinates whose duplicates agree closely enough to be collapsed into one row.\n",
    "# NOTE(review): coordinates above the threshold are absent from this table, so all of\n",
    "# their conflicting rows stay in the final result untouched - confirm that is intended.\n",
    "consistent_summary = summary[summary['THICK_range_ratio'] <= MAX_RANGE_RATIO]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3e9f2d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def collapse_consistent_duplicates(data, consistent_summary, coord_cols=COORD_COLS):\n",
    "    \"\"\"Return `data` with every coordinate listed in `consistent_summary` reduced to one row.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : DataFrame holding the `coord_cols` columns and a THICK column.\n",
    "    consistent_summary : DataFrame holding `coord_cols` and THICK_median, one row per\n",
    "        coordinate that should be collapsed.\n",
    "    coord_cols : names of the columns that identify a coordinate.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame with the columns of `data`. For each coordinate in `consistent_summary`\n",
    "    only the row whose THICK is closest to THICK_median remains (the first such row\n",
    "    on ties); rows at every other coordinate pass through unchanged.\n",
    "    \"\"\"\n",
    "    coord_cols = list(coord_cols)\n",
    "    merged = data.merge(\n",
    "        consistent_summary[coord_cols + ['THICK_median']], on=coord_cols, how='left'\n",
    "    )\n",
    "    in_summary = merged['THICK_median'].notna()\n",
    "\n",
    "    # Testing THICK == THICK_median is not enough: with an even number of differing values\n",
    "    # the median is a midpoint that matches no row (the whole coordinate would vanish), and\n",
    "    # with identical values it matches every row (the duplicates would all survive).\n",
    "    # Picking the nearest row yields exactly one real measurement per coordinate.\n",
    "    distance = (merged['THICK'] - merged['THICK_median']).abs()\n",
    "    representatives = (\n",
    "        merged.loc[in_summary]\n",
    "        .assign(_distance=distance[in_summary])\n",
    "        .sort_values('_distance', kind='stable')  # stable: ties keep their original order\n",
    "        .drop_duplicates(subset=coord_cols, keep='first')\n",
    "        .drop(columns='_distance')\n",
    "    )\n",
    "\n",
    "    return (\n",
    "        pd.concat([merged.loc[~in_summary], representatives])\n",
    "        .sort_index()  # back to the row order of `data`\n",
    "        .drop(columns='THICK_median')\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b63659d",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = collapse_consistent_duplicates(data, consistent_summary)\n",
    "gc.collect()  # release the merge intermediates before further work\n",
    "\n",
    "# Every collapsed coordinate must now occur exactly once\n",
    "collapsed_rows = result.merge(consistent_summary[COORD_COLS], on=COORD_COLS)\n",
    "assert not collapsed_rows.duplicated(subset=COORD_COLS).any(), 'a collapsed coordinate still has several rows'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "c45e6e83",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
THICKEASTNORTHvxvyvith_bmsmbzstemp
050.82-2.498779e+061.417597e+06-1133.923586538.8821911255.45876685.3349671607.108764278.6508760.049565266.860876
153.63-2.498462e+061.417028e+06-1104.194542518.7843151219.99293169.7917001669.475359305.1529980.055753266.809909
223.24-2.497579e+061.415438e+06-797.720551252.445367836.71186344.7472691834.636673365.1139570.020203266.596713
321.80-2.495786e+061.412343e+06-65.286477146.920962160.77342119.3852112213.076648499.9618410.046409265.990232
425.30-2.495772e+061.412320e+06-63.979106147.167889160.47340419.3630902215.949898501.0476930.045488265.986040
....................................
20971472000.642.654522e+06-4.884883e+050.44974815.50942315.515943147.532692765.603721182.4179240.040650261.717844
20971481955.432.654789e+06-4.886187e+050.61848315.11501815.127666155.331007763.821258181.1860720.040658261.736247
20971491973.152.654843e+06-4.886439e+050.63044715.06538515.078570156.902040763.393967180.8944780.040675261.740045
20971501905.272.655007e+06-4.887185e+050.43372314.90148214.907793161.674849762.025022179.9628570.040781261.751564
20971511893.282.655062e+06-4.887430e+050.32649414.84833614.851925163.469380761.494619179.6266060.040890261.755432
\n", "

2083679 rows × 11 columns

\n", "
" ], "text/plain": [ " THICK EAST NORTH vx vy \\\n", "0 50.82 -2.498779e+06 1.417597e+06 -1133.923586 538.882191 \n", "1 53.63 -2.498462e+06 1.417028e+06 -1104.194542 518.784315 \n", "2 23.24 -2.497579e+06 1.415438e+06 -797.720551 252.445367 \n", "3 21.80 -2.495786e+06 1.412343e+06 -65.286477 146.920962 \n", "4 25.30 -2.495772e+06 1.412320e+06 -63.979106 147.167889 \n", "... ... ... ... ... ... \n", "2097147 2000.64 2.654522e+06 -4.884883e+05 0.449748 15.509423 \n", "2097148 1955.43 2.654789e+06 -4.886187e+05 0.618483 15.115018 \n", "2097149 1973.15 2.654843e+06 -4.886439e+05 0.630447 15.065385 \n", "2097150 1905.27 2.655007e+06 -4.887185e+05 0.433723 14.901482 \n", "2097151 1893.28 2.655062e+06 -4.887430e+05 0.326494 14.848336 \n", "\n", " v ith_bm smb z s \\\n", "0 1255.458766 85.334967 1607.108764 278.650876 0.049565 \n", "1 1219.992931 69.791700 1669.475359 305.152998 0.055753 \n", "2 836.711863 44.747269 1834.636673 365.113957 0.020203 \n", "3 160.773421 19.385211 2213.076648 499.961841 0.046409 \n", "4 160.473404 19.363090 2215.949898 501.047693 0.045488 \n", "... ... ... ... ... ... \n", "2097147 15.515943 147.532692 765.603721 182.417924 0.040650 \n", "2097148 15.127666 155.331007 763.821258 181.186072 0.040658 \n", "2097149 15.078570 156.902040 763.393967 180.894478 0.040675 \n", "2097150 14.907793 161.674849 762.025022 179.962857 0.040781 \n", "2097151 14.851925 163.469380 761.494619 179.626606 0.040890 \n", "\n", " temp \n", "0 266.860876 \n", "1 266.809909 \n", "2 266.596713 \n", "3 265.990232 \n", "4 265.986040 \n", "... ... 
\n", "2097147 261.717844 \n", "2097148 261.736247 \n", "2097149 261.740045 \n", "2097150 261.751564 \n", "2097151 261.755432 \n", "\n", "[2083679 rows x 11 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] } ], "metadata": { "kernelspec": { "display_name": "appml25", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }