File size: 6,638 Bytes
a16f583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import numpy as np
import matplotlib.pyplot as plt
from dataloader.dataloader import MultiRasterDataset 
from dataloader.dataloaderMapping import MultiRasterDatasetMapping
from dataloader.dataframe_loader import filter_dataframe, separate_and_add_data
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from config import (TIME_BEGINNING, TIME_END, INFERENCE_TIME, MAX_OC, seasons,
                    SamplesCoordinates_Yearly, MatrixCoordinates_1mil_Yearly, DataYearly,
                    SamplesCoordinates_Seasonally, MatrixCoordinates_1mil_Seasonally, 
                    DataSeasonally, file_path_LUCAS_LFU_Lfl_00to23_Bavaria_OC, years_padded)
from mapping import create_prediction_visualizations, parallel_predict
from torch.utils.data import Dataset, DataLoader
import multiprocessing
import argparse

def modify_matrix_coordinates(MatrixCoordinates_1mil_Yearly=MatrixCoordinates_1mil_Yearly, 
                            MatrixCoordinates_1mil_Seasonally=MatrixCoordinates_1mil_Seasonally, 
                            INFERENCE_TIME=INFERENCE_TIME):
    """Return the raster folder paths extended with the inference-time subfolder.

    The input lists are NOT modified: new lists are built and returned.
    Mutating the arguments in place (which default to the module-level config
    lists) would append the time suffix again on every call.

    Args:
        MatrixCoordinates_1mil_Yearly: iterable of '/'-separated folder paths
            for the yearly data.
        MatrixCoordinates_1mil_Seasonally: iterable of '/'-separated folder
            paths for the seasonal data.
        INFERENCE_TIME: string whose first four characters are used as the
            yearly suffix; the full string is used as the seasonal suffix.

    Returns:
        Tuple ``(yearly_paths, seasonal_paths)`` of new lists, in the same
        order as the inputs.
    """
    inference_year = INFERENCE_TIME[:4]

    # Seasonal paths: 'Elevation' is kept unchanged, 'MODIS_NPP' gets only the
    # four-character prefix, every other folder gets the full inference time.
    seasonal_paths = []
    for path in MatrixCoordinates_1mil_Seasonally:
        last_folder = path.split('/')[-1]
        if last_folder == 'Elevation':
            seasonal_paths.append(path)
        elif last_folder == 'MODIS_NPP':
            seasonal_paths.append(f"{path}/{inference_year}")
        else:
            seasonal_paths.append(f"{path}/{INFERENCE_TIME}")

    # Yearly paths: any path containing 'Elevation' is kept unchanged, all
    # others get the four-character prefix appended.
    yearly_paths = [
        path if 'Elevation' in path else f"{path}/{inference_year}"
        for path in MatrixCoordinates_1mil_Yearly
    ]

    return yearly_paths, seasonal_paths

def parse_arguments():
    """Parse the command line and return the resulting namespace.

    The only option is ``--model``; its sole accepted value (and default)
    is ``rf``.
    """
    cli = argparse.ArgumentParser(
        description='Random Forest Regression for SOC Mapping'
    )
    model_option = dict(
        type=str,
        choices=['rf'],
        default='rf',
        help='Model type: rf (Random Forest)',
    )
    cli.add_argument('--model', **model_option)
    parsed = cli.parse_args()
    return parsed

def get_top_sampling_years(file_path, top_n=3):
    """Report the ``top_n`` most frequent values of the 'year' column.

    Reads ``file_path`` with ``pd.read_excel``, prints one line per top year,
    and returns ``(dataframe, counts)`` where ``counts`` is the truncated
    ``value_counts`` Series. On any exception an error line is printed and
    ``(None, None)`` is returned instead.
    """
    try:
        samples = pd.read_excel(file_path)
        busiest = samples['year'].value_counts().head(top_n)
        print(f"\nTop {top_n} years with the most samples:")
        for sampling_year, n_samples in zip(busiest.index, busiest):
            print(f"Year {sampling_year}: {n_samples} samples")
        return samples, busiest
    except Exception as err:
        # Best-effort helper: report the problem and signal failure via Nones.
        print(f"Error reading file: {str(err)}")
        return None, None

def flatten_paths(path_list):
    """Return a flat list of the non-list items in ``path_list``.

    Nested ``list`` instances are expanded depth-first, preserving order;
    any other item (including tuples and strings) is kept as a single entry.
    """
    def _walk(entries):
        # Recursively descend into lists only; yield everything else as-is.
        for entry in entries:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(path_list))

def _plot_prediction_scatter(coordinates, predictions, title):
    """Show a scatter plot of column 0 vs column 1 of ``coordinates``, coloured by ``predictions``."""
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(coordinates[:, 0], coordinates[:, 1],
                         c=predictions, cmap='viridis', alpha=0.6)
    plt.colorbar(scatter, label='Predicted Values')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title(title)
    plt.grid(True)
    plt.show()


def _collect_training_arrays(dataloader):
    """Gather features, targets and coordinates from every batch of ``dataloader``.

    Each batch is unpacked as ``(longitudes, latitudes, features, targets)``.
    Samples whose longitude or latitude is NaN are dropped, and each sample's
    features are flattened to one row.

    Returns:
        Tuple ``(X, y, coordinates)`` of NumPy arrays; ``coordinates`` has
        longitude in column 0 and latitude in column 1.

    Raises:
        ValueError: if no batch contains a sample with valid coordinates.
    """
    features, targets, coordinates = [], [], []

    for longitudes, latitudes, batch_features, batch_targets in dataloader:
        longs = longitudes.numpy()
        lats = latitudes.numpy()
        valid_mask = ~(np.isnan(longs) | np.isnan(lats))

        if not np.any(valid_mask):
            continue

        coordinates.append(np.column_stack((longs[valid_mask], lats[valid_mask])))
        features_np = batch_features.numpy()
        # Collapse all per-sample feature dimensions into a single row.
        flattened_features = features_np.reshape(features_np.shape[0], -1)

        features.extend(flattened_features[valid_mask])
        targets.extend(batch_targets.numpy()[valid_mask])

    if not coordinates:
        # np.vstack([]) would raise an opaque ValueError; fail with a clear one.
        raise ValueError("No training samples with valid coordinates were found.")

    return np.array(features), np.array(targets), np.vstack(coordinates)


def main():
    """Train a Random Forest on the sample rasters and map predictions.

    Steps: build the training dataset, fit the regressor, plot predictions on
    the training points, run parallel prediction over the full coordinate
    grid, save coordinates/predictions as ``.npy`` files, and produce the
    final visualizations.
    """
    # Called for its CLI validation (and --help); 'rf' is the only model choice.
    parse_arguments()
    df = filter_dataframe(TIME_BEGINNING, TIME_END, MAX_OC)

    # Prepare data paths (flattened, de-duplicated while keeping order)
    samples_coordinates_array_path, data_array_path = separate_and_add_data()
    samples_coordinates_array_path = list(dict.fromkeys(flatten_paths(samples_coordinates_array_path)))
    data_array_path = list(dict.fromkeys(flatten_paths(data_array_path)))

    # Create dataset and dataloader
    dataset = MultiRasterDataset(samples_coordinates_array_path, data_array_path, df)
    print("Dataset length:", len(df))
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    # Prepare training data
    X_train, y_train, coordinates = _collect_training_arrays(dataloader)

    # Train RandomForest model
    model = RandomForestRegressor(n_estimators=1000, max_depth=10, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)
    print("RandomForest model trained successfully!")

    # Make predictions on the training samples themselves
    predictions = model.predict(X_train)

    # Training set visualization
    _plot_prediction_scatter(coordinates, predictions, 'Training Set Predictions')

    # Load full prediction coordinates
    file_path_coords = "/home/vfourel/SOCProject/SOCmapping/Data/Coordinates1Mil/coordinates_Bavaria_1mil.csv"
    try:
        df_full = pd.read_csv(file_path_coords)
        print(df_full.head())
    except Exception as e:
        # Without the coordinate grid there is nothing to map; report and stop.
        print(f"Error loading coordinates file: {e}")
        return

    # Modify paths for inference (only the yearly paths are used here)
    BandsYearly_1milPoints, _ = modify_matrix_coordinates()
    num_cpus = multiprocessing.cpu_count()

    # Parallel prediction
    coordinates, predictions = parallel_predict(
        df_full=df_full,
        model=model,
        bands_yearly=BandsYearly_1milPoints,
        batch_size=8,
        num_threads=num_cpus
    )
    save_path_coords = "coordinates_1mil.npy"
    save_path_preds = "predictions_1mil.npy"

    # Persist raw results before plotting.
    np.save(save_path_coords, coordinates)
    np.save(save_path_preds, predictions)

    # Final visualization
    _plot_prediction_scatter(coordinates, predictions, 'Full Map Predictions')

    # Save predictions
    save_path = '/home/vfourel/SOCProject/SOCmapping/predictions_plots/randomForest_plots'
    create_prediction_visualizations(INFERENCE_TIME, coordinates, predictions, save_path)

if __name__ == "__main__":
    main()