{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Yield Prediction Model - Open Source\n", "\n", "This notebook demonstrates machine learning models to predict:\n", "- **TCH (Tons of Grapes per Hectare)**: Vine yield at harvest\n", "\n", "The models use remote sensing data (satellite imagery), weather information, soil properties, and agronomic attributes.\n", "\n", "**Data**: Anonymized observations from vineyards in Portugal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Setup and Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import pandas as pd\n", "import lightgbm as lgb\n", "\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "import joblib" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Configuration" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "WEATHER_SOIL_EXP_extra = \"Weather + soil features + extra\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 
Feature Definitions" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Remote sensing features: 210\n", "Agronomic features: 4\n", "Weather features: 28\n" ] } ], "source": [ "# Remote sensing features: 42 numbered columns per index prefix, grouped by prefix\n", "REMOTE_SENSING_FEATURES = [\n", "    f'{index_name}_{step}'\n", "    for index_name in ('ndvi', 'evi', 'vari', 'ndre', 'ndwi')\n", "    for step in range(42)\n", "]\n", "\n", "# Agronomic features\n", "AGRONOMIC_FEATURES = ['NCORTE', 'variety', 'IDADE', 'day_of_year']\n", "\n", "# Weather time series: 14 numbered columns each, all tp_sum columns before degree_days\n", "WEATHER_FEATURES = [\n", "    f'{variable}_ts_time_serie_{step}'\n", "    for variable in ('tp_sum', 'degree_days')\n", "    for step in range(14)\n", "]\n", "\n", "print(f\"Remote sensing features: {len(REMOTE_SENSING_FEATURES)}\")\n", "print(f\"Agronomic features: {len(AGRONOMIC_FEATURES)}\")\n", "print(f\"Weather features: {len(WEATHER_FEATURES)}\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Experiment feature counts:\n", " Weather + soil features + extra: 277 features\n" ] } 
], "source": [ "def get_soil_features():\n", "    \"\"\"Return the soil column names: 'soil_grids_type' followed by one '<property>_<depth>_mean' name per property and depth.\"\"\"\n", "    soil_properties = ('clay', 'sand', 'nitrogen', 'bdod', 'silt', 'soc')\n", "    depth_layers = ('0-5cm', '5-15cm', '15-30cm', '30-60cm')\n", "    layered_columns = [\n", "        f'{soil_property}_{depth}_mean'\n", "        for soil_property in soil_properties\n", "        for depth in depth_layers\n", "    ]\n", "    return ['soil_grids_type'] + layered_columns\n", "\n", "def get_weather_features():\n", "    \"\"\"Return remote sensing, weather time-series and agronomic names plus five additional weather column names.\"\"\"\n", "    additional_columns = ['tp_sum', 'degree_days', 't2m_min_30_days', 'tamp_more13_days_all', 'tamp_more13_days_6m']\n", "    return [\n", "        *REMOTE_SENSING_FEATURES,\n", "        *WEATHER_FEATURES,\n", "        *AGRONOMIC_FEATURES,\n", "        *additional_columns,\n", "    ]\n", "\n", "def get_weather_soil_features():\n", "    \"\"\"Return the weather feature names followed by the soil feature names.\"\"\"\n", "    combined = get_weather_features()\n", "    combined.extend(get_soil_features())\n", "    return combined\n", "\n", "# Extra columns appended after the weather + soil features\n", "extra_features = ['rootstock_type', 'tree_spacing', 'row_spacing', 'lat', 'lon']\n", "\n", "EXPERIMENTS = {\n", "    WEATHER_SOIL_EXP_extra: [*get_weather_soil_features(), *extra_features],\n", "}\n", "\n", "print(\"\\nExperiment feature counts:\")\n", "for exp_name, features in EXPERIMENTS.items():\n", "    print(f\" {exp_name}: {len(features)} features\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Load Data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset shape: 926 rows × 589 columns\n" ] }, { "data": { "text/html": [ "
| \n", " | ndvi_0 | \n", "ndvi_1 | \n", "ndvi_2 | \n", "ndvi_3 | \n", "ndvi_4 | \n", "ndvi_5 | \n", "ndvi_6 | \n", "ndvi_7 | \n", "ndvi_8 | \n", "ndvi_9 | \n", "... | \n", "degree_days_recent90d_mean | \n", "degree_days_recent90d_std | \n", "degree_days_recent90d_slope | \n", "tp_sum_3m | \n", "tp_sum_6m | \n", "tp_sum_12m | \n", "degree_days_3m | \n", "degree_days_6m | \n", "degree_days_12m | \n", "tp_sum_delta_month | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0.203887 | \n", "0.221458 | \n", "0.239029 | \n", "0.256600 | \n", "0.274170 | \n", "0.291741 | \n", "0.309312 | \n", "0.326883 | \n", "0.344454 | \n", "0.362024 | \n", "... | \n", "6.905740 | \n", "1.596763 | \n", "-0.785736 | \n", "32.461599 | \n", "91.227125 | \n", "413.033505 | \n", "633.616272 | \n", "1259.963318 | \n", "1377.213837 | \n", "29.399132 | \n", "
| 1 | \n", "0.183539 | \n", "0.207914 | \n", "0.232289 | \n", "0.256664 | \n", "0.281039 | \n", "0.305414 | \n", "0.329789 | \n", "0.354164 | \n", "0.378539 | \n", "0.402914 | \n", "... | \n", "7.094805 | \n", "1.734347 | \n", "-0.842968 | \n", "35.093367 | \n", "93.707648 | \n", "407.216377 | \n", "650.616119 | \n", "1307.372314 | \n", "1427.669952 | \n", "30.748834 | \n", "
| 2 | \n", "0.180039 | \n", "0.189910 | \n", "0.199782 | \n", "0.209654 | \n", "0.219526 | \n", "0.229398 | \n", "0.239270 | \n", "0.249142 | \n", "0.259013 | \n", "0.268885 | \n", "... | \n", "6.840813 | \n", "1.618880 | \n", "-0.796780 | \n", "33.850745 | \n", "91.376749 | \n", "398.388898 | \n", "627.637543 | \n", "1249.714417 | \n", "1362.654053 | \n", "29.573044 | \n", "
| 3 | \n", "0.166330 | \n", "0.171501 | \n", "0.176672 | \n", "0.181843 | \n", "0.187013 | \n", "0.192184 | \n", "0.197355 | \n", "0.202526 | \n", "0.207697 | \n", "0.212868 | \n", "... | \n", "6.840813 | \n", "1.618880 | \n", "-0.796780 | \n", "33.850745 | \n", "91.376749 | \n", "398.388898 | \n", "627.637543 | \n", "1249.714417 | \n", "1362.654053 | \n", "29.573044 | \n", "
| 4 | \n", "0.161632 | \n", "0.186018 | \n", "0.210404 | \n", "0.234790 | \n", "0.259175 | \n", "0.283561 | \n", "0.307947 | \n", "0.332332 | \n", "0.356718 | \n", "0.381104 | \n", "... | \n", "7.094805 | \n", "1.734347 | \n", "-0.842968 | \n", "35.093367 | \n", "93.707648 | \n", "407.216377 | \n", "650.616119 | \n", "1307.372314 | \n", "1427.669952 | \n", "30.748834 | \n", "
5 rows × 589 columns
\n", "