{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-md",
   "metadata": {},
   "source": [
    "# Duplicate coordinates in the sampled training data\n",
    "\n",
    "Loads a random subset of row groups from the training parquet file, inspects rows that share the same `(EAST, NORTH)` pair, and keeps a single row for every pair whose `THICK` values agree to within a tolerance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f90bfb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import os\n",
    "import random\n",
    "\n",
    "import pandas as pd\n",
    "import pyarrow.parquet as pq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the training parquet file; set the BEDMAP_PARQUET environment variable to\n",
    "# point somewhere else without editing the notebook.\n",
    "PARQUET_PATH = os.environ.get(\n",
    "    'BEDMAP_PARQUET',\n",
    "    r'C:\\Users\\marku\\Desktop\\4år\\AML\\AppliedML2025\\Final project\\antarctica ml\\AppML_2025\\tabular_train_dataset\\bedmap_train2_30m.parquet',\n",
    ")\n",
    "\n",
    "SEED = 42                # fixes which row groups are sampled, so reruns load the same rows\n",
    "SAMPLE_FRACTION = 0.1    # share of parquet row groups to load\n",
    "DROP_COLUMNS = ['LON', 'LAT', 'geometry']  # excluded from the loaded frame\n",
    "COORD_COLS = ['EAST', 'NORTH']             # key used to detect repeated locations\n",
    "RANGE_RATIO_TOL = 0.025  # max (max - min) / mean of THICK for duplicates to count as consistent"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-md",
   "metadata": {},
   "source": [
    "## Load a sample of row groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fedd7106",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the Parquet file\n",
    "parquet_file = pq.ParquetFile(PARQUET_PATH)\n",
    "\n",
    "# Get number of row groups\n",
    "num_row_groups = parquet_file.num_row_groups\n",
    "\n",
    "# Select a reproducible random subset of the row groups (always at least one)\n",
    "sample_size = max(1, int(num_row_groups * SAMPLE_FRACTION))\n",
    "selected_groups = random.Random(SEED).sample(range(num_row_groups), sample_size)\n",
    "\n",
    "# Exclude unwanted columns at read time instead of loading them and dropping afterwards\n",
    "wanted_columns = [name for name in parquet_file.schema_arrow.names if name not in DROP_COLUMNS]\n",
    "\n",
    "# Read the selected row groups in one call and convert once;\n",
    "# reset_index gives the same 0..n-1 index the previous concat(ignore_index=True) produced\n",
    "data = (\n",
    "    parquet_file.read_row_groups(selected_groups, columns=wanted_columns)\n",
    "    .to_pandas()\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "105c726d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order rows by coordinate so repeated locations sit next to each other\n",
    "data = data.sort_values(by=COORD_COLS)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "inspect-md",
   "metadata": {},
   "source": [
    "## Inspect duplicated coordinates\n",
    "\n",
    "For reference, an earlier (unseeded) run of this notebook reported 20749 duplicated rows over 8369 coordinate pairs. The median and mean `THICK` range ratios were about 0.0031 and 0.0176, with 2317 pairs above 0.01 and 1259 pairs above 0.025."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e72f55d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep=False marks every row of a repeated coordinate pair, not only the later occurrences.\n",
    "# Chaining sort_values returns a new frame, which avoids the SettingWithCopyWarning that an\n",
    "# in-place sort on the filtered slice raises.\n",
    "duplicates = (\n",
    "    data[data.duplicated(subset=COORD_COLS, keep=False)]\n",
    "    .sort_values(by=COORD_COLS)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98f5dbf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"no. duplicates: {len(duplicates)}.\")\n",
    "\n",
    "num_duped_coord = duplicates[COORD_COLS].drop_duplicates().shape[0]\n",
    "print(\"Unique (EAST, NORTH) pairs that have duplicates:\", num_duped_coord)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e1a206f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per coordinate pair: spread, mean and median of THICK across the duplicated rows\n",
    "thick_by_coord = duplicates.groupby(COORD_COLS)['THICK']\n",
    "summary = pd.DataFrame({\n",
    "    'THICK_range': thick_by_coord.max() - thick_by_coord.min(),\n",
    "    'THICK_mean': thick_by_coord.mean(),\n",
    "    'THICK_median': thick_by_coord.median(),\n",
    "}).reset_index()\n",
    "\n",
    "# Spread relative to the mean, so thick and thin ice are judged on the same scale\n",
    "summary['THICK_range_ratio'] = summary['THICK_range'] / summary['THICK_mean']\n",
    "\n",
    "summary.head(25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b7c1579",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run this before the filtering cell below: that cell overwrites `summary`\n",
    "ratio = summary['THICK_range_ratio']\n",
    "print('median range ratio:', ratio.median())\n",
    "print('mean range ratio:', ratio.mean())\n",
    "print('duplicated coordinate pairs:', len(summary))\n",
    "print('pairs with ratio > 0.01:', (ratio > 0.01).sum())\n",
    "print(f'pairs with ratio > {RANGE_RATIO_TOL}:', (ratio > RANGE_RATIO_TOL).sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "resolve-md",
   "metadata": {},
   "source": [
    "## Resolve consistent duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4ea1f81",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only coordinate pairs with THICK_range_ratio <= RANGE_RATIO_TOL.\n",
    "# Pairs above the tolerance (or with an undefined ratio) leave `summary` here, so the\n",
    "# next cell treats their rows like non-duplicates and keeps all of them.\n",
    "# NOTE(review): confirm that keeping inconsistent duplicates in full is intended.\n",
    "summary = summary[summary['THICK_range_ratio'] <= RANGE_RATIO_TOL]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b63659d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: bring in the group median; rows whose coordinates are not in `summary` get NaN.\n",
    "# Only the median is merged, so no helper columns have to be dropped afterwards.\n",
    "merged = data.merge(summary[COORD_COLS + ['THICK_median']], on=COORD_COLS, how='left')\n",
    "in_summary = merged['THICK_median'].notna()\n",
    "\n",
    "# Step 2: for each coordinate pair in `summary`, pick the one row whose THICK is closest\n",
    "# to the group median (the first such row on ties). Testing THICK == THICK_median instead\n",
    "# would discard every row of a pair whose median lies between two observations\n",
    "# (even-sized groups) and would keep all rows of a pair whose values are identical.\n",
    "median_gap = (merged['THICK'] - merged['THICK_median']).abs()\n",
    "dup_rows = merged.loc[in_summary, COORD_COLS].assign(gap=median_gap[in_summary])\n",
    "keep_labels = dup_rows.groupby(COORD_COLS)['gap'].idxmin()\n",
    "\n",
    "# Step 3: keep rows not in `summary` unchanged, plus the chosen representative per pair\n",
    "result = merged[~in_summary | merged.index.isin(keep_labels)].drop(columns='THICK_median')\n",
    "\n",
    "# No location may disappear as a side effect of resolving duplicates\n",
    "assert result[COORD_COLS].drop_duplicates().shape[0] == data[COORD_COLS].drop_duplicates().shape[0]\n",
    "\n",
    "# Release the large intermediates\n",
    "del merged, median_gap, dup_rows\n",
    "gc.collect()"
   ]
  },
  { "cell_type": "code", "execution_count": 20, "id": "c45e6e83", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | THICK | \n", "EAST | \n", "NORTH | \n", "vx | \n", "vy | \n", "v | \n", "ith_bm | \n", "smb | \n", "z | \n", "s | \n", "temp | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "50.82 | \n", "-2.498779e+06 | \n", "1.417597e+06 | \n", "-1133.923586 | \n", "538.882191 | \n", "1255.458766 | \n", "85.334967 | \n", "1607.108764 | \n", "278.650876 | \n", "0.049565 | \n", "266.860876 | \n", "
| 1 | \n", "53.63 | \n", "-2.498462e+06 | \n", "1.417028e+06 | \n", "-1104.194542 | \n", "518.784315 | \n", "1219.992931 | \n", "69.791700 | \n", "1669.475359 | \n", "305.152998 | \n", "0.055753 | \n", "266.809909 | \n", "
| 2 | \n", "23.24 | \n", "-2.497579e+06 | \n", "1.415438e+06 | \n", "-797.720551 | \n", "252.445367 | \n", "836.711863 | \n", "44.747269 | \n", "1834.636673 | \n", "365.113957 | \n", "0.020203 | \n", "266.596713 | \n", "
| 3 | \n", "21.80 | \n", "-2.495786e+06 | \n", "1.412343e+06 | \n", "-65.286477 | \n", "146.920962 | \n", "160.773421 | \n", "19.385211 | \n", "2213.076648 | \n", "499.961841 | \n", "0.046409 | \n", "265.990232 | \n", "
| 4 | \n", "25.30 | \n", "-2.495772e+06 | \n", "1.412320e+06 | \n", "-63.979106 | \n", "147.167889 | \n", "160.473404 | \n", "19.363090 | \n", "2215.949898 | \n", "501.047693 | \n", "0.045488 | \n", "265.986040 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2097147 | \n", "2000.64 | \n", "2.654522e+06 | \n", "-4.884883e+05 | \n", "0.449748 | \n", "15.509423 | \n", "15.515943 | \n", "147.532692 | \n", "765.603721 | \n", "182.417924 | \n", "0.040650 | \n", "261.717844 | \n", "
| 2097148 | \n", "1955.43 | \n", "2.654789e+06 | \n", "-4.886187e+05 | \n", "0.618483 | \n", "15.115018 | \n", "15.127666 | \n", "155.331007 | \n", "763.821258 | \n", "181.186072 | \n", "0.040658 | \n", "261.736247 | \n", "
| 2097149 | \n", "1973.15 | \n", "2.654843e+06 | \n", "-4.886439e+05 | \n", "0.630447 | \n", "15.065385 | \n", "15.078570 | \n", "156.902040 | \n", "763.393967 | \n", "180.894478 | \n", "0.040675 | \n", "261.740045 | \n", "
| 2097150 | \n", "1905.27 | \n", "2.655007e+06 | \n", "-4.887185e+05 | \n", "0.433723 | \n", "14.901482 | \n", "14.907793 | \n", "161.674849 | \n", "762.025022 | \n", "179.962857 | \n", "0.040781 | \n", "261.751564 | \n", "
| 2097151 | \n", "1893.28 | \n", "2.655062e+06 | \n", "-4.887430e+05 | \n", "0.326494 | \n", "14.848336 | \n", "14.851925 | \n", "163.469380 | \n", "761.494619 | \n", "179.626606 | \n", "0.040890 | \n", "261.755432 | \n", "
2083679 rows × 11 columns
\n", "