{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "191a5d22-bc45-4f45-a6cd-6d305e29c06e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import pyarrow.parquet as pq\n",
    "\n",
    "# Directory holding the grid parquet file. The location is defined once here\n",
    "# and reused by the cells below instead of being repeated as a literal.\n",
    "DATA_DIR = Path('C:/Users/Jens Gorm Rytter/Final project/filer')\n",
    "GRID_PARQUET_PATH = DATA_DIR / 'grid_300km_30M.parquet'\n",
    "\n",
    "# Subset of the file's columns that gets loaded into memory.\n",
    "GRID_COLUMNS = [\n",
    "    'EAST', 'NORTH', 'THICK', 'v', 'ith_bm',\n",
    "    'smb', 'z', 's', 'temp', 'gridCellId',\n",
    "]\n",
    "\n",
    "df = pd.read_parquet(GRID_PARQUET_PATH, columns=GRID_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7e5d8ea7-bbf6-41df-81c7-8f59d87539c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " EAST NORTH THICK v ith_bm smb \\\n",
      "0 1.082323e+06 1.179464e+05 2193.7041 1.721265 2225.768378 28.600311 \n",
      "1 6.780533e+05 -1.755376e+06 2899.8000 0.870523 2487.617401 83.017294 \n",
      "2 5.012288e+05 -8.708590e+05 1819.1100 85.419531 1827.670487 21.117418 \n",
      "3 -8.666767e+05 3.561459e+05 1124.9400 3.918445 1130.303771 136.581627 \n",
      "4 -6.848977e+05 6.913987e+04 2122.4100 7.894785 2105.888247 108.577022 \n",
      "\n",
      " z s temp gridCellId \n",
      "0 3890.095924 0.004726 221.131235 137 \n",
      "1 2348.626277 0.000567 235.855494 28 \n",
      "2 1771.083276 0.004876 239.003074 82 \n",
      "3 216.134166 0.006954 246.742698 149 \n",
      "4 1400.136807 0.002945 243.399524 132 \n"
     ]
    }
   ],
   "source": [
    "# Plain-text preview of the first five loaded rows.\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a8f0080c-9ac8-4a55-a2a7-711d91863091",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Antal rækker: 30000000\n",
      "Antal søjler: 13\n",
      "Kolonnenavne: ['THICK', 'geometry', 'EAST', 'NORTH', 'vx', 'vy', 'v', 'ith_bm', 'smb', 'z', 's', 'temp', 'gridCellId']\n"
     ]
    }
   ],
   "source": [
    "# Inspect the parquet file's own metadata (row count, column count, column\n",
    "# names) through pyarrow, independently of the column subset loaded into df.\n",
    "# The printed labels are Danish: rows / columns / column names.\n",
    "parquet_file = pq.ParquetFile(GRID_PARQUET_PATH)\n",
    "print(f\"Antal rækker: {parquet_file.metadata.num_rows}\")\n",
    "print(f\"Antal søjler: {parquet_file.metadata.num_columns}\")\n",
    "schema = parquet_file.schema\n",
    "column_names = schema.names\n",
    "print(\"Kolonnenavne:\", column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "907f780c-9934-4ea7-9b95-ef21f3254f6f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Antal dubletter (samme EAST & NORTH): 624316\n",
      "1 29096792\n",
      "2 240671\n",
      "3 12469\n",
      "4 2951\n",
      "5 935\n",
      " ... \n",
      "68 3\n",
      "121 1\n",
      "178 1\n",
      "512 1\n",
      "930 1\n",
      "Name: count, Length: 70, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Flag rows whose (EAST, NORTH) pair already appeared earlier in the frame.\n",
    "# With the default keep='first', the first row at each coordinate is not\n",
    "# counted as a duplicate.\n",
    "duplicates = df.duplicated(subset=['EAST', 'NORTH'])\n",
    "num_duplicates = duplicates.sum()\n",
    "\n",
    "print(f\"Antal dubletter (samme EAST & NORTH): {num_duplicates}\")\n",
    "\n",
    "# Rows per distinct coordinate, then a histogram of those group sizes:\n",
    "# index = number of rows sharing one coordinate,\n",
    "# value = number of coordinates with that many rows.\n",
    "coord_counts = df.groupby(['EAST', 'NORTH']).size().reset_index(name='count')\n",
    "duplicate_stats = coord_counts['count'].value_counts().sort_index()\n",
    "\n",
    "print(duplicate_stats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c44f5b51-0b91-4cf5-aeb3-1ca14455e675",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeded random 10% subsample of df; the fixed random_state makes re-runs\n",
    "# select the same rows. (A strided slice, df.iloc[::100], was tried before.)\n",
    "df2 = df.sample(frac=0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c6a3b418-b7d5-4b6a-ad2d-0fe6992190d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | EAST | \n", "NORTH | \n", "THICK | \n", "v | \n", "ith_bm | \n", "smb | \n", "z | \n", "s | \n", "temp | \n", "gridCellId | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-2.500332e+06 | \n", "1.420418e+06 | \n", "1.89 | \n", "345.590093 | \n", "14.492977 | \n", "1400.080833 | \n", "227.879901 | \n", "0.119214 | \n", "266.961708 | \n", "198.0 | \n", "
| 1 | \n", "-2.499814e+06 | \n", "1.419478e+06 | \n", "22.47 | \n", "281.224185 | \n", "23.531128 | \n", "1457.514870 | \n", "183.801963 | \n", "0.096204 | \n", "266.951540 | \n", "198.0 | \n", "
| 2 | \n", "-2.497007e+06 | \n", "1.414422e+06 | \n", "21.94 | \n", "522.295060 | \n", "28.679867 | \n", "1952.195055 | \n", "390.242404 | \n", "0.037411 | \n", "266.389898 | \n", "198.0 | \n", "
| 3 | \n", "-2.496247e+06 | \n", "1.413135e+06 | \n", "26.84 | \n", "233.217592 | \n", "21.497502 | \n", "2112.766064 | \n", "442.989492 | \n", "0.058525 | \n", "266.135701 | \n", "198.0 | \n", "
| 4 | \n", "-2.495746e+06 | \n", "1.412275e+06 | \n", "25.46 | \n", "159.803665 | \n", "19.323977 | \n", "2221.549038 | \n", "503.146060 | \n", "0.043808 | \n", "265.977870 | \n", "198.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2977403 | \n", "2.654610e+06 | \n", "-4.885325e+05 | \n", "1995.39 | \n", "15.377782 | \n", "150.137655 | \n", "765.070288 | \n", "182.042577 | \n", "0.040666 | \n", "261.723873 | \n", "107.0 | \n", "
| 2977404 | \n", "2.655715e+06 | \n", "-4.890188e+05 | \n", "1812.91 | \n", "14.461229 | \n", "233.619802 | \n", "753.976713 | \n", "175.725291 | \n", "0.041156 | \n", "261.804375 | \n", "107.0 | \n", "
| 2977405 | \n", "2.656784e+06 | \n", "-4.894331e+05 | \n", "1609.60 | \n", "12.354862 | \n", "458.779802 | \n", "738.470285 | \n", "167.705409 | \n", "0.040306 | \n", "261.891587 | \n", "107.0 | \n", "
| 2977406 | \n", "2.656938e+06 | \n", "-4.894937e+05 | \n", "1604.26 | \n", "12.134192 | \n", "514.279006 | \n", "736.148817 | \n", "166.577442 | \n", "0.040036 | \n", "261.904139 | \n", "107.0 | \n", "
| 2977407 | \n", "2.659762e+06 | \n", "-4.905831e+05 | \n", "1113.75 | \n", "8.255791 | \n", "930.934804 | \n", "672.612741 | \n", "128.195885 | \n", "0.046204 | \n", "262.145135 | \n", "107.0 | \n", "
2977408 rows × 10 columns
\n", "| \n", " | EAST | \n", "NORTH | \n", "THICK | \n", "v | \n", "ith_bm | \n", "smb | \n", "z | \n", "s | \n", "temp | \n", "gridCellId | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-2.500332e+06 | \n", "1.420418e+06 | \n", "1.89 | \n", "345.590093 | \n", "14.492977 | \n", "1400.080833 | \n", "227.879901 | \n", "0.119214 | \n", "266.961708 | \n", "198.0 | \n", "
| 1 | \n", "-2.499814e+06 | \n", "1.419478e+06 | \n", "22.47 | \n", "281.224185 | \n", "23.531128 | \n", "1457.514870 | \n", "183.801963 | \n", "0.096204 | \n", "266.951540 | \n", "198.0 | \n", "
| 2 | \n", "-2.497007e+06 | \n", "1.414422e+06 | \n", "21.94 | \n", "522.295060 | \n", "28.679867 | \n", "1952.195055 | \n", "390.242404 | \n", "0.037411 | \n", "266.389898 | \n", "198.0 | \n", "
| 3 | \n", "-2.496247e+06 | \n", "1.413135e+06 | \n", "26.84 | \n", "233.217592 | \n", "21.497502 | \n", "2112.766064 | \n", "442.989492 | \n", "0.058525 | \n", "266.135701 | \n", "198.0 | \n", "
| 4 | \n", "-2.495746e+06 | \n", "1.412275e+06 | \n", "25.46 | \n", "159.803665 | \n", "19.323977 | \n", "2221.549038 | \n", "503.146060 | \n", "0.043808 | \n", "265.977870 | \n", "198.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2977403 | \n", "2.654610e+06 | \n", "-4.885325e+05 | \n", "1995.39 | \n", "15.377782 | \n", "150.137655 | \n", "765.070288 | \n", "182.042577 | \n", "0.040666 | \n", "261.723873 | \n", "107.0 | \n", "
| 2977404 | \n", "2.655715e+06 | \n", "-4.890188e+05 | \n", "1812.91 | \n", "14.461229 | \n", "233.619802 | \n", "753.976713 | \n", "175.725291 | \n", "0.041156 | \n", "261.804375 | \n", "107.0 | \n", "
| 2977405 | \n", "2.656784e+06 | \n", "-4.894331e+05 | \n", "1609.60 | \n", "12.354862 | \n", "458.779802 | \n", "738.470285 | \n", "167.705409 | \n", "0.040306 | \n", "261.891587 | \n", "107.0 | \n", "
| 2977406 | \n", "2.656938e+06 | \n", "-4.894937e+05 | \n", "1604.26 | \n", "12.134192 | \n", "514.279006 | \n", "736.148817 | \n", "166.577442 | \n", "0.040036 | \n", "261.904139 | \n", "107.0 | \n", "
| 2977407 | \n", "2.659762e+06 | \n", "-4.905831e+05 | \n", "1113.75 | \n", "8.255791 | \n", "930.934804 | \n", "672.612741 | \n", "128.195885 | \n", "0.046204 | \n", "262.145135 | \n", "107.0 | \n", "
2977408 rows × 10 columns
\n", "| \n", " | EAST | \n", "NORTH | \n", "THICK | \n", "v | \n", "ith_bm | \n", "smb | \n", "z | \n", "s | \n", "temp | \n", "gridCellId | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-2.500332e+06 | \n", "1.420418e+06 | \n", "1.89 | \n", "345.590093 | \n", "14.492977 | \n", "1400.080833 | \n", "227.879901 | \n", "0.119214 | \n", "266.961708 | \n", "198.0 | \n", "
| 1 | \n", "-2.499814e+06 | \n", "1.419478e+06 | \n", "22.47 | \n", "281.224185 | \n", "23.531128 | \n", "1457.514870 | \n", "183.801963 | \n", "0.096204 | \n", "266.951540 | \n", "198.0 | \n", "
| 2 | \n", "-2.497007e+06 | \n", "1.414422e+06 | \n", "21.94 | \n", "522.295060 | \n", "28.679867 | \n", "1952.195055 | \n", "390.242404 | \n", "0.037411 | \n", "266.389898 | \n", "198.0 | \n", "
| 3 | \n", "-2.496247e+06 | \n", "1.413135e+06 | \n", "26.84 | \n", "233.217592 | \n", "21.497502 | \n", "2112.766064 | \n", "442.989492 | \n", "0.058525 | \n", "266.135701 | \n", "198.0 | \n", "
| 4 | \n", "-2.495746e+06 | \n", "1.412275e+06 | \n", "25.46 | \n", "159.803665 | \n", "19.323977 | \n", "2221.549038 | \n", "503.146060 | \n", "0.043808 | \n", "265.977870 | \n", "198.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2977062 | \n", "2.631061e+06 | \n", "-3.252919e+05 | \n", "218.40 | \n", "130.205325 | \n", "250.282158 | \n", "705.079382 | \n", "26.830629 | \n", "0.005098 | \n", "259.810482 | \n", "125.0 | \n", "
| 2977065 | \n", "2.631343e+06 | \n", "-3.259614e+05 | \n", "218.40 | \n", "130.393338 | \n", "264.485753 | \n", "706.791006 | \n", "28.352076 | \n", "0.004202 | \n", "259.807381 | \n", "125.0 | \n", "
| 2977074 | \n", "2.631505e+06 | \n", "-3.266202e+05 | \n", "218.40 | \n", "128.240890 | \n", "273.794161 | \n", "708.306892 | \n", "29.349211 | \n", "0.001972 | \n", "259.803235 | \n", "125.0 | \n", "
| 2977075 | \n", "2.631553e+06 | \n", "-3.270478e+05 | \n", "239.40 | \n", "127.461183 | \n", "273.600821 | \n", "709.199779 | \n", "29.328581 | \n", "0.000853 | \n", "259.799202 | \n", "125.0 | \n", "
| 2977076 | \n", "2.631557e+06 | \n", "-3.278887e+05 | \n", "268.80 | \n", "127.074487 | \n", "263.774991 | \n", "710.814251 | \n", "28.276307 | \n", "0.002886 | \n", "259.791447 | \n", "125.0 | \n", "
1532944 rows × 10 columns
\n", "| \n", " | EAST | \n", "NORTH | \n", "THICK | \n", "v | \n", "ith_bm | \n", "smb | \n", "z | \n", "s | \n", "temp | \n", "gridCellId | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 39 | \n", "-2.481114e+06 | \n", "1.477708e+06 | \n", "121.88 | \n", "30.373493 | \n", "32.330403 | \n", "3262.184266 | \n", "1504.508556 | \n", "0.275976 | \n", "262.814873 | \n", "216.0 | \n", "
| 42 | \n", "-2.480957e+06 | \n", "1.477520e+06 | \n", "142.87 | \n", "30.604273 | \n", "31.909792 | \n", "3272.658482 | \n", "1560.809955 | \n", "0.125641 | \n", "262.764319 | \n", "216.0 | \n", "
| 45 | \n", "-2.479770e+06 | \n", "1.476155e+06 | \n", "186.82 | \n", "33.905348 | \n", "24.345979 | \n", "3319.928900 | \n", "1550.275538 | \n", "0.119179 | \n", "262.388155 | \n", "216.0 | \n", "
| 48 | \n", "-2.479278e+06 | \n", "1.475488e+06 | \n", "138.02 | \n", "57.053745 | \n", "39.036100 | \n", "3271.138595 | \n", "1511.674124 | \n", "0.201885 | \n", "262.244550 | \n", "216.0 | \n", "
| 49 | \n", "-2.479131e+06 | \n", "1.475218e+06 | \n", "117.07 | \n", "63.470931 | \n", "45.515895 | \n", "3237.844871 | \n", "1478.640303 | \n", "0.244687 | \n", "262.206679 | \n", "216.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2977403 | \n", "2.654610e+06 | \n", "-4.885325e+05 | \n", "1995.39 | \n", "15.377782 | \n", "150.137655 | \n", "765.070288 | \n", "182.042577 | \n", "0.040666 | \n", "261.723873 | \n", "107.0 | \n", "
| 2977404 | \n", "2.655715e+06 | \n", "-4.890188e+05 | \n", "1812.91 | \n", "14.461229 | \n", "233.619802 | \n", "753.976713 | \n", "175.725291 | \n", "0.041156 | \n", "261.804375 | \n", "107.0 | \n", "
| 2977405 | \n", "2.656784e+06 | \n", "-4.894331e+05 | \n", "1609.60 | \n", "12.354862 | \n", "458.779802 | \n", "738.470285 | \n", "167.705409 | \n", "0.040306 | \n", "261.891587 | \n", "107.0 | \n", "
| 2977406 | \n", "2.656938e+06 | \n", "-4.894937e+05 | \n", "1604.26 | \n", "12.134192 | \n", "514.279006 | \n", "736.148817 | \n", "166.577442 | \n", "0.040036 | \n", "261.904139 | \n", "107.0 | \n", "
| 2977407 | \n", "2.659762e+06 | \n", "-4.905831e+05 | \n", "1113.75 | \n", "8.255791 | \n", "930.934804 | \n", "672.612741 | \n", "128.195885 | \n", "0.046204 | \n", "262.145135 | \n", "107.0 | \n", "
1444464 rows × 10 columns
\n", "| \n", " | ith_bm | \n", "
|---|---|
| 7416703 | \n", "1872.272055 | \n", "
| 23890874 | \n", "2318.404431 | \n", "
| 23317003 | \n", "2704.831963 | \n", "
| 23197163 | \n", "2359.059125 | \n", "
| 26562643 | \n", "801.448965 | \n", "
| ... | \n", "... | \n", "
| 7440855 | \n", "1961.284527 | \n", "
| 22726437 | \n", "468.976002 | \n", "
| 18818953 | \n", "158.828422 | \n", "
| 184493 | \n", "2633.841978 | \n", "
| 10380913 | \n", "1980.466861 | \n", "
1541833 rows × 1 columns
\n", "