| import numpy as np |
| import matplotlib.pyplot as plt |
| from dataloader.dataloader import MultiRasterDataset |
| from dataloader.dataloaderMapping import MultiRasterDatasetMapping |
| from dataloader.dataframe_loader import filter_dataframe, separate_and_add_data |
| from sklearn.ensemble import RandomForestRegressor |
| import pandas as pd |
| from tqdm import tqdm |
| from pathlib import Path |
| from config import (TIME_BEGINNING, TIME_END, INFERENCE_TIME, MAX_OC, seasons, |
| SamplesCoordinates_Yearly, MatrixCoordinates_1mil_Yearly, DataYearly, |
| SamplesCoordinates_Seasonally, MatrixCoordinates_1mil_Seasonally, |
| DataSeasonally, file_path_LUCAS_LFU_Lfl_00to23_Bavaria_OC, years_padded) |
| from mapping import create_prediction_visualizations, parallel_predict |
| from torch.utils.data import Dataset, DataLoader |
| import multiprocessing |
| import argparse |
|
|
def modify_matrix_coordinates(MatrixCoordinates_1mil_Yearly=MatrixCoordinates_1mil_Yearly,
                              MatrixCoordinates_1mil_Seasonally=MatrixCoordinates_1mil_Seasonally,
                              INFERENCE_TIME=INFERENCE_TIME):
    """Append the inference-time sub-folder to the raster paths, in place.

    Both lists are modified in place and also returned. By default they are
    the module-level lists imported from ``config``, so the change is visible
    to every other holder of those list objects.

    Rules applied to each path:
      * Seasonal list: a path whose last folder is ``Elevation`` is left
        untouched; a path whose last folder is ``MODIS_NPP`` gets the first
        four characters of ``INFERENCE_TIME`` appended; every other path gets
        the full ``INFERENCE_TIME`` appended.
      * Yearly list: a path containing ``Elevation`` is left untouched; every
        other path gets the first four characters of ``INFERENCE_TIME``
        appended.

    Because the lists are shared and mutated, calling this function twice
    used to stack the suffix (``.../2015/2015``). Entries whose last folder
    already equals the suffix are now skipped, so repeated calls are safe.

    Args:
        MatrixCoordinates_1mil_Yearly: list of '/'-separated path strings.
        MatrixCoordinates_1mil_Seasonally: list of '/'-separated path strings.
        INFERENCE_TIME: sliceable time label; its first four characters are
            used as the year folder (presumably a string such as ``"2015"``
            or ``"2015_summer"`` -- confirm against ``config``).

    Returns:
        tuple: ``(MatrixCoordinates_1mil_Yearly, MatrixCoordinates_1mil_Seasonally)``,
        the same list objects that were passed in.
    """
    inference_year = INFERENCE_TIME[:4]
    # Last-folder values that indicate a path was already suffixed earlier.
    already_suffixed = (INFERENCE_TIME, inference_year)

    for i, path in enumerate(MatrixCoordinates_1mil_Seasonally):
        last_folder = path.split('/')[-1]
        if last_folder == 'Elevation':
            continue
        if last_folder in already_suffixed:
            # A previous call already appended the time folder.
            continue
        if last_folder == 'MODIS_NPP':
            new_path = f"{path}/{inference_year}"
        else:
            new_path = f"{path}/{INFERENCE_TIME}"
        MatrixCoordinates_1mil_Seasonally[i] = new_path

    for i, path in enumerate(MatrixCoordinates_1mil_Yearly):
        if 'Elevation' in path:
            continue
        if path.split('/')[-1] == inference_year:
            # A previous call already appended the year folder.
            continue
        MatrixCoordinates_1mil_Yearly[i] = f"{path}/{inference_year}"

    return MatrixCoordinates_1mil_Yearly, MatrixCoordinates_1mil_Seasonally
|
|
def parse_arguments():
    """Parse the script's command-line options.

    The only option is ``--model``; its sole accepted value is ``'rf'``,
    which is also the default.

    Returns:
        argparse.Namespace: the parsed options, with a ``model`` attribute.
    """
    cli = argparse.ArgumentParser(
        description='Random Forest Regression for SOC Mapping',
    )
    model_option = {
        'type': str,
        'choices': ['rf'],
        'default': 'rf',
        'help': 'Model type: rf (Random Forest)',
    }
    cli.add_argument('--model', **model_option)
    parsed = cli.parse_args()
    return parsed
|
|
def get_top_sampling_years(file_path, top_n=3):
    """Print and return the ``top_n`` most frequent values of the 'year' column.

    The Excel file at ``file_path`` is read with pandas, the occurrences of
    each value in its ``'year'`` column are counted, and the ``top_n`` most
    frequent ones are printed.

    Args:
        file_path: location of the Excel file to read.
        top_n: how many of the most frequent years to report (default 3).

    Returns:
        tuple: ``(dataframe, top_years)`` where ``top_years`` is the pandas
        Series of counts indexed by year. If anything fails while reading or
        reporting, the error is printed and ``(None, None)`` is returned.
    """
    try:
        table = pd.read_excel(file_path)
        most_sampled = table['year'].value_counts().head(top_n)
        print(f"\nTop {top_n} years with the most samples:")
        for year, count in most_sampled.items():
            print(f"Year {year}: {count} samples")
    except Exception as e:
        # Best-effort helper: report the problem instead of propagating it.
        print(f"Error reading file: {str(e)}")
        return None, None
    return table, most_sampled
|
|
def flatten_paths(path_list):
    """Return a flat list of the items in an arbitrarily nested list.

    Only ``list`` instances are descended into; any other item (including
    tuples and strings) is kept as a single element. Order is preserved.

    Args:
        path_list: iterable whose items may themselves be nested lists.

    Returns:
        list: the non-list items in depth-first, left-to-right order.
    """
    def _walk(entries):
        # Yield leaves lazily, descending into nested lists as they appear.
        for entry in entries:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(path_list))
|
|
def _collect_training_arrays(dataloader):
    """Gather features, targets and coordinates of samples with valid coordinates.

    Each batch is expected to unpack into ``(longitudes, latitudes, features,
    targets)``, every element exposing ``.numpy()``. Samples whose longitude
    or latitude is NaN are dropped; features are flattened to one row per
    sample.

    Returns:
        tuple: ``(X, y, coordinates)`` as NumPy arrays, where ``coordinates``
        has one ``(longitude, latitude)`` row per kept sample.

    Raises:
        ValueError: if no sample with valid coordinates was found.
    """
    features, targets, coordinates = [], [], []

    for longitudes, latitudes, batch_features, batch_targets in dataloader:
        longs = longitudes.numpy()
        lats = latitudes.numpy()
        valid_mask = ~(np.isnan(longs) | np.isnan(lats))

        if not np.any(valid_mask):
            continue

        coordinates.append(np.column_stack((longs[valid_mask], lats[valid_mask])))
        features_np = batch_features.numpy()
        # Collapse every per-sample feature dimension into a single row.
        flattened_features = features_np.reshape(features_np.shape[0], -1)
        features.extend(flattened_features[valid_mask])
        targets.extend(batch_targets.numpy()[valid_mask])

    if not coordinates:
        # np.vstack([]) would raise a ValueError with an unhelpful message.
        raise ValueError("No training samples with valid coordinates were found")

    return np.array(features), np.array(targets), np.vstack(coordinates)


def _plot_prediction_map(coordinates, predictions, title):
    """Show a longitude/latitude scatter plot coloured by predicted value."""
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(coordinates[:, 0], coordinates[:, 1],
                          c=predictions, cmap='viridis', alpha=0.6)
    plt.colorbar(scatter, label='Predicted Values')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title(title)
    plt.grid(True)
    plt.show()


def main():
    """Train a Random Forest on the sample rasters and map its predictions.

    Steps:
      1. Parse the command line and build the training dataset.
      2. Collect the training arrays and fit a ``RandomForestRegressor``.
      3. Plot the model's predictions on its own training samples.
      4. Load the full coordinate grid from CSV, predict over it with
         ``parallel_predict``, save coordinates and predictions as ``.npy``
         files in the working directory, and plot them.
      5. Hand the results to ``create_prediction_visualizations``.

    Returns early (after printing the error) if the coordinate CSV cannot be
    loaded.

    Raises:
        ValueError: if the dataset yields no sample with valid coordinates.
    """
    # The parsed options are not used yet; the call is kept so that the
    # command line is still validated and ``--help`` keeps working.
    parse_arguments()
    df = filter_dataframe(TIME_BEGINNING, TIME_END, MAX_OC)

    samples_coordinates_array_path, data_array_path = separate_and_add_data()
    # Flatten the (possibly nested) path lists and drop duplicates while
    # keeping first-seen order (dict keys preserve insertion order).
    samples_coordinates_array_path = list(dict.fromkeys(flatten_paths(samples_coordinates_array_path)))
    data_array_path = list(dict.fromkeys(flatten_paths(data_array_path)))

    dataset = MultiRasterDataset(samples_coordinates_array_path, data_array_path, df)
    print("Dataset length:", len(df))
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    X_train, y_train, coordinates = _collect_training_arrays(dataloader)

    model = RandomForestRegressor(n_estimators=1000, max_depth=10, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)
    print("RandomForest model trained successfully!")

    # Predictions on the training samples themselves (not a held-out set).
    predictions = model.predict(X_train)
    _plot_prediction_map(coordinates, predictions, 'Training Set Predictions')

    file_path_coords = "/home/vfourel/SOCProject/SOCmapping/Data/Coordinates1Mil/coordinates_Bavaria_1mil.csv"
    try:
        df_full = pd.read_csv(file_path_coords)
        print(df_full.head())
    except Exception as e:
        print(f"Error loading coordinates file: {e}")
        return

    # Only the yearly band paths are used for the full-map prediction.
    BandsYearly_1milPoints, _ = modify_matrix_coordinates()
    num_cpus = multiprocessing.cpu_count()

    coordinates, predictions = parallel_predict(
        df_full=df_full,
        model=model,
        bands_yearly=BandsYearly_1milPoints,
        batch_size=8,
        num_threads=num_cpus
    )
    save_path_coords = "coordinates_1mil.npy"
    save_path_preds = "predictions_1mil.npy"

    np.save(save_path_coords, coordinates)
    np.save(save_path_preds, predictions)

    _plot_prediction_map(coordinates, predictions, 'Full Map Predictions')

    save_path = '/home/vfourel/SOCProject/SOCmapping/predictions_plots/randomForest_plots'
    create_prediction_visualizations(INFERENCE_TIME, coordinates, predictions, save_path)
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
|