josegoji commited on
Commit
115745a
·
verified ·
1 Parent(s): e705ce2

Upload 7 files

Browse files
Files changed (7) hide show
  1. LSTM_forecaster.joblib +3 -0
  2. app.py +19 -0
  3. filterdf.py +121 -0
  4. mergedf.py +152 -0
  5. pipeline.pkl +3 -0
  6. pipelineFinal.py +130 -0
  7. requirements.txt +11 -0
LSTM_forecaster.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c16873628011f5cc2e860eeaf65437171df8b5da6be69fc7dcd3a57b6e64f233
3
+ size 1893197
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from pipelineFinal import pipeline_final

# Gradio front-end for the LSTM production forecaster.
# The five File inputs are only used when the user answers "No" (i.e. the
# forecast does not start right after the training window) — pipeline_final
# then rebuilds the feature frame from the uploaded CSVs.
demo = gr.Interface(
    fn=pipeline_final,
    inputs=[
        gr.Dropdown(["Si", "No"], label="Deseas hacer una prediccion justo luego del tiempo del train"),
        gr.Slider(0, 24, label="Choose a number"),
        # Label typos fixed: "tain" -> "train", "histroical" -> "historical".
        gr.File(label="Sube el archivo de train en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de client en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de historical_weather en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de electricity_prices en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de gas_prices en csv. (Solo si elegiste NO)"),
    ],
    outputs=[gr.Plot(), gr.DataFrame()],
)

# Guard the launch so importing this module (e.g. from tests) does not
# start the web server.
if __name__ == "__main__":
    demo.launch()
19
+
filterdf.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from geopy.geocoders import Nominatim
3
+ import sys
4
+ import os
5
+
6
+
7
def initialize_geolocator(user_agent="county_locator"):
    """Build and return a Nominatim reverse-geocoding client.

    Parameters
    ----------
    user_agent : str
        Identifier sent to the Nominatim service with every request.
    """
    geocoder = Nominatim(user_agent=user_agent)
    return geocoder
10
+
11
+
12
def get_county_from_coordinates(latitude, longitude, geolocator):
    """Reverse-geocode a coordinate pair to a county name.

    Returns the county reported by *geolocator*, or "Unknown" when the
    lookup yields no location or the address has no county field.
    """
    location = geolocator.reverse((latitude, longitude), language="en")
    if not location:
        return "Unknown"
    address = location.raw.get("address", {})
    return address.get("county", "Unknown")
18
+
19
+
20
def add_county_column(df):
    """Replace latitude/longitude columns with a geocoded 'county' column.

    Each unique coordinate pair is reverse-geocoded once, then the result
    is joined back onto every matching row of *df*.
    """
    geolocator = initialize_geolocator()
    unique_coords = df[['longitude', 'latitude']].drop_duplicates()
    # One network lookup per distinct coordinate pair (not per row).
    unique_coords['county'] = [
        get_county_from_coordinates(lat, lon, geolocator)
        for lon, lat in zip(unique_coords['longitude'], unique_coords['latitude'])
    ]
    merged = df.merge(
        unique_coords[['latitude', 'longitude', 'county']],
        on=['latitude', 'longitude'],
    )
    return merged.drop(['longitude', 'latitude'], axis=1)
30
+
31
+
32
def filter_estonian_counties(df):
    """Filter rows to the 15 Estonian counties and integer-encode them.

    Rows whose 'county' is not a recognised Estonian county name are
    dropped; the remaining names are mapped to the project's fixed integer
    codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'county' column of county-name strings.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with integer 'county'.
    """
    # The mapping's keys double as the membership whitelist, so the
    # separate county-name list the original kept is redundant.
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7, 'Valga County': 13, 'Viljandi County': 14,
        'Tartu County': 11, 'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1, 'Lääne County': 6,
        'Rapla County': 9, 'Järva County': 3, 'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # .copy() fixes the chained-assignment problem: assigning via .loc into
    # a boolean-filtered view raises SettingWithCopyWarning and can silently
    # fail to stick under pandas copy-on-write.
    df = df[df['county'].isin(county_to_int)].copy()
    df['county'] = df['county'].map(county_to_int)
    return df
47
+
48
+
49
def filter_data(train, client, weather, is_business, product_type, county_code):
    """Restrict the train/client/weather frames to one contract and county.

    *train* is additionally limited to production rows (is_consumption == 0).
    The columns used for filtering are dropped from each returned frame.
    """
    contract_mask = (
        (train['is_business'] == is_business)
        & (train['product_type'] == product_type)
        & (train['county'] == county_code)
    )
    train = train[contract_mask].drop(['is_business', 'product_type', 'county'], axis=1)
    # Keep only production rows, then the flag itself is redundant.
    train = train[train['is_consumption'] == 0].drop(['is_consumption'], axis=1)

    client_mask = (
        (client['is_business'] == is_business)
        & (client['product_type'] == product_type)
        & (client['county'] == county_code)
    )
    client = client[client_mask].drop(['is_business', 'product_type', 'county'], axis=1)

    weather = weather[weather['county'] == county_code].drop(['county'], axis=1)

    return train, client, weather
71
+
72
def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset to the corresponding path.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
    paths : list of str, optional
        Defaults to the project's process_files/ staging paths resolved
        next to this module. Extra datasets (or paths) beyond the shorter
        list are silently ignored, matching zip() semantics.
    """
    if paths is None:
        # Resolve relative to this module so the script works from any cwd.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        names = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                 'electricity_prices.pkl', 'gas_prices.pkl']
        paths = [os.path.join(base_dir, 'process_files', name) for name in names]

    for dataset, path in zip(datasets, paths):
        # Create the target directory if missing. The mergedf.py sibling
        # already did this; previously this version crashed with
        # FileNotFoundError when process_files/ did not exist yet.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        dataset.to_pickle(path)
89
+
90
+
91
+
92
def filter_datasets(train,client,historical_weather,electricity_prices,gas_prices):
    """Filter the five raw frames down to one contract and stage them as pickles.

    NOTE(review): mutates the passed-in frames in place (drop with
    inplace=True, datetime conversion) and writes pickle files as a side
    effect; returns None. The geocoding step is network-bound (Nominatim)
    and can be slow or rate-limited.
    """
    # Hard-coded selection: business contracts (1), product_type 3,
    # county code 0 (Harju County in this project's encoding).
    is_business, product_type, county_code = 1, 3, 0

    # Drop unnecessary columns and change date columns to datetime type
    datasets_info = [
        [train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']],
        [client, ['data_block_id'], ['date']],
        [historical_weather, ['data_block_id'], ['datetime']],
        [electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']],
        [gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']]
    ] # [df, [drop_cols], [date_cols]]

    for df, drop_cols, date_cols in datasets_info:
        # inplace=True: the caller's frames are modified directly.
        df.drop(drop_cols, axis=1, inplace=True)
        for col in date_cols:
            df[col] = pd.to_datetime(df[col])

    # Map coordinates to county names, then keep Estonian counties only
    # (names are replaced by integer codes).
    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)
    # Average duplicate weather rows (multiple stations) per county and timestamp.
    historical_weather = historical_weather.groupby(['county', 'datetime']).agg('mean').reset_index()

    # Restrict all frames to the selected contract/county and to production rows.
    train, client, historical_weather = filter_data(train, client, historical_weather, is_business, product_type, county_code)

    # Persist the filtered frames for the merge stage (mergedf.load_datasets).
    save_datasets_to_pickle([train, client, historical_weather, electricity_prices, gas_prices])
121
+
mergedf.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+ import os
4
+
5
+
6
+
7
def load_datasets():
    """Load the five staged pickles from ./process_files and return them.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, client, historical_weather, electricity_prices, gas_prices)

    Raises
    ------
    FileNotFoundError
        Naming the first missing file, before any unpickling is attempted.
    """
    # Paths are resolved against the current working directory (this must
    # match where filterdf.save_datasets_to_pickle wrote them).
    stage_dir = os.path.join(os.getcwd(), 'process_files')
    file_names = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                  'electricity_prices.pkl', 'gas_prices.pkl']
    paths = [os.path.join(stage_dir, name) for name in file_names]

    # Fail early with a clear message instead of a bare unpickling error.
    for path in paths:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Archivo no encontrado: {path}")

    return tuple(pd.read_pickle(path) for path in paths)
32
+
33
+
34
def add_time_series_col(client, historical_weather, electricity_prices, gas_prices):
    """Shift each frame's timestamps to when the data becomes available.

    Client data lags 3 days, historical weather 2 days, and both price
    feeds 1 day. The superseded raw date columns are dropped afterwards.
    """
    # Availability offsets, in days, per feed.
    client['datetime'] = pd.to_datetime(client['date']) + pd.Timedelta(days=3)
    historical_weather['datetime'] = historical_weather['datetime'] + pd.Timedelta(days=2)
    electricity_prices['datetime'] = pd.to_datetime(electricity_prices['forecast_date']) + pd.Timedelta(days=1)
    gas_prices['datetime'] = pd.to_datetime(gas_prices['forecast_date']) + pd.Timedelta(days=1)

    # The original date columns are replaced by the shifted 'datetime'.
    client = client.drop(columns=['date'])
    electricity_prices = electricity_prices.drop(columns=['forecast_date'])
    gas_prices = gas_prices.drop(columns=['forecast_date'])

    return client, historical_weather, electricity_prices, gas_prices
48
+
49
+
50
def merge_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Join the five frames into one wide frame keyed by hourly datetime.

    Weather and electricity prices join on the exact timestamp (left
    joins); client and gas prices are daily feeds, so they join on the
    calendar day (outer joins). The temporary 'date' key is dropped.
    """
    # Hourly joins on the exact timestamp.
    merged = (
        train
        .merge(historical_weather, on='datetime', how='left')
        .merge(electricity_prices, on='datetime', how='left')
    )

    # Daily joins: collapse every timestamp to midnight of its day.
    merged['date'] = merged['datetime'].dt.floor('D')
    client['date'] = client['datetime'].dt.floor('D')
    client = client.drop('datetime', axis=1)
    gas_prices['date'] = gas_prices['datetime'].dt.floor('D')
    gas_prices = gas_prices.drop('datetime', axis=1)

    merged = (
        merged
        .merge(client, on='date', how='outer')
        .merge(gas_prices, on='date', how='outer')
    )

    # The helper day-key has served its purpose.
    return merged.drop(['date'], axis=1)
69
+
70
+
71
def reorder_columns(df, column_order=None):
    """Return *df* with its columns in a canonical order.

    Parameters
    ----------
    df : pandas.DataFrame
    column_order : list of str, optional
        Explicit ordering; defaults to the project's canonical layout
        (datetime, target, weather features, prices, client features).
        Every listed column must exist in *df* (KeyError otherwise).
    """
    # `is None`, not `== None`: equality can be overridden (and is
    # elementwise on pandas objects); identity is the correct test.
    if column_order is None:
        column_order = [
            'datetime', 'target', 'temperature', 'dewpoint', 'rain', 'snowfall',
            'surface_pressure', 'cloudcover_total', 'cloudcover_low', 'cloudcover_mid',
            'cloudcover_high', 'windspeed_10m', 'winddirection_10m',
            'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
            'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh', 'eic_count', 'installed_capacity'
        ]
    return df[column_order]
82
+
83
+
84
def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset in *datasets* to the matching entry of *paths*.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
    paths : list of str, optional
        Defaults to the project's staging location taken from the `root`
        module. Missing parent directories are created.
    """
    # `is None`, not `== None`: identity is the idiomatic and safe test.
    if paths is None:
        # Imported lazily: `root` provides project directory constants and
        # is only needed when the caller did not supply explicit paths.
        import root
        paths = [
            root.DIR_DATA_STAGE + 'merged_df.pkl',
        ]

    # Create missing folders; skip bare filenames, where dirname() is ''
    # and makedirs('') would raise FileNotFoundError.
    for path in paths:
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    # Save each dataset to its respective path.
    for dataset, path in zip(datasets, paths):
        dataset.to_pickle(path)
99
+
100
+
101
def drop_first_3_days(df, column, threshold_column, threshold_nans=70):
    """Drop the first 3 days of *df* when *threshold_column* is too sparse.

    If the NaN count in *threshold_column* exceeds *threshold_nans*, rows
    whose *column* value falls within 3 days of the earliest timestamp are
    removed; otherwise the frame is returned unchanged.
    """
    missing_count = df[threshold_column].isna().sum()

    if missing_count > threshold_nans:
        # Keep only rows at or after (earliest timestamp + 3 days).
        cutoff = df[column].min() + pd.Timedelta(days=3)
        df = df[df[column] >= cutoff]

    return df
116
+
117
+
118
def feature_selection(df):
    """Drop a fixed set of feature columns from *df* in place.

    Returns the same frame so the call can be chained. All listed columns
    must be present (KeyError otherwise).
    """
    discarded = [
        'dewpoint', 'cloudcover_low', 'cloudcover_mid',
        'cloudcover_high', 'direct_solar_radiation',
        'diffuse_radiation', 'lowest_price_per_mwh',
        'highest_price_per_mwh', 'eic_count',
    ]
    df.drop(columns=discarded, axis=1, inplace=True)
    return df
125
+
126
+
127
def set_datetime_index(df):
    """Index *df* by its 'datetime' column at a fixed hourly frequency.

    asfreq('h') inserts NaN rows for any missing hours, producing a
    gap-free hourly series.
    """
    indexed = df.set_index('datetime')
    return indexed.asfreq('h')
131
+
132
+
133
def merging_datasets():
    """Run the merge stage: load staged pickles and produce the model frame.

    Returns an hourly, datetime-indexed DataFrame containing the selected
    feature columns.
    """
    # Load the five staged datasets written by the filter stage.
    train, client, historical_weather, electricity_prices, gas_prices = load_datasets()

    # Align each feed's timestamps to its data-availability date.
    client, historical_weather, electricity_prices, gas_prices = add_time_series_col(
        client, historical_weather, electricity_prices, gas_prices
    )

    # Join everything onto the hourly train frame, then shape it for the model.
    merged = merge_datasets(train, client, historical_weather, electricity_prices, gas_prices)
    merged = reorder_columns(merged)
    merged = feature_selection(merged)
    return set_datetime_index(merged)
pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6fa69d47a46f823b38783c73e2d36215e1884b6b30742c45c9912ed1542a4be
3
+ size 2283
pipelineFinal.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from calendar import c
2
+ from os import pipe
3
+ import pandas as pd
4
+ import pickle
5
+ from skforecast.utils import load_forecaster
6
+ from filterdf import filter_datasets
7
+ from mergedf import merging_datasets
8
+ import numpy as np
9
+ import plotly.graph_objects as go
10
+
11
def load_csv(input_file):
    """Read a CSV upload into a DataFrame.

    Parameters
    ----------
    input_file : str | path | file-like
        Anything pandas.read_csv accepts (Gradio passes a temp-file path).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the file cannot be parsed or contains no data rows.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        # Bug fix: the original `raise f"..."` raised a *string*, which is
        # itself a TypeError in Python 3 and masked the real failure. Raise
        # a real exception and chain the cause.
        raise ValueError(f"Error al cargar el archivo CSV:{e}") from e

    if df.empty:
        raise ValueError("El archivo subido está vacío o no tiene datos válidos.")

    return df
25
+
26
def load_model(name):
    """Load a persisted skforecast forecaster from *name*.

    verbose=True makes load_forecaster print the forecaster summary.
    """
    return load_forecaster(name, verbose=True)
30
+
31
def load_pipeline():
    """Unpickle the preprocessing pipeline from ./pipeline.pkl.

    The path is relative to the current working directory, matching how
    the app is launched.
    """
    with open('pipeline.pkl', 'rb') as handle:
        return pickle.load(handle)
35
+
36
def unscale_data(scaler, predictions):
    """Invert the feature scaling on the 'target' column of *predictions*.

    The target is embedded into column 0 of a zero matrix 11 columns wide
    before inverse-transforming, and negative results are clipped to 0.
    NOTE(review): assumes the fitted scaler expects exactly 11 features —
    confirm against the training pipeline.
    """
    buffer = np.zeros((len(predictions), 11))
    buffer[:, 0] = predictions['target']
    target_values = scaler.inverse_transform(buffer)[:, 0]
    # Physical production cannot be negative; clip instead of trusting
    # the inverse transform blindly.
    target_values = np.where(target_values < 0, 0, target_values)
    return pd.DataFrame(target_values, columns=predictions.columns, index=predictions.index)
43
+
44
def create_plots(predictions):
    """Build a line plot of the forecasted production over time.

    Returns a plotly Figure with a single "Estimado" trace indexed by the
    prediction timestamps.
    """
    estimate_trace = go.Scatter(
        x=predictions.index,
        y=predictions['target'],
        name="Estimado",
        mode="lines",
        line_color="#4EA72E",
    )
    figure = go.Figure(data=[estimate_trace])
    figure.update_layout(
        yaxis_title="Producción (kWh)",
        width=750,
        height=350,
        margin=dict(l=20, r=0, t=35, b=20),
        # Legend pinned to the top-right corner inside the plot area.
        legend=dict(
            orientation="v",
            yanchor="top",
            xanchor="right",
            x=0.99,
            y=0.99,
        ),
    )
    return figure
63
+
64
+
65
def pipeline_final(texto,steps,train=None,client=None,historical_weather=None,electricity_prices=None,gas_prices=None):
    """End-to-end inference entry point wired to the Gradio interface.

    Parameters
    ----------
    texto : str
        'Si' to forecast straight after the training window (no uploads
        needed); any other value runs the full CSV-upload pipeline.
    steps : int
        Number of forecast steps passed to the model.
    train, client, historical_weather, electricity_prices, gas_prices :
        CSV uploads (paths/file-likes); only read when texto != 'Si'.

    Returns
    -------
    tuple
        (plotly figure, DataFrame of predictions as strings with a
        'fecha' column).
    """
    # Leftover local-test fixtures from development (kept commented out):
    #texto = 'No'
    # steps = 24
    # train = 'files_prueba/train_filtered.csv'
    # client = 'files_prueba/client_filtered.csv'
    # historical_weather = 'files_prueba/historical_weather_filtered.csv'
    # electricity_prices = 'files_prueba/electricity_prices_filtered.csv'
    # gas_prices = 'files_prueba/gas_prices_filtered.csv'
    # Load the persisted preprocessing pipeline and keep its scaler step
    # for inverting predictions later.
    pipeline = load_pipeline()
    scaler = pipeline['scale']

    # Load the trained LSTM forecaster from disk.
    model = load_model('LSTM_forecaster.joblib')


    if texto == 'Si':
        # Forecast directly from the model's stored last window.
        pred = model.predict(steps=steps)

        # Back to original units (kWh), negatives clipped to 0.
        pred = unscale_data(scaler, pred)

        # Expose the datetime index as a regular column for display.
        pred_reset = pred.reset_index(drop=False)

        # Stringify so the Gradio DataFrame renders timestamps verbatim.
        pred_reset = pred_reset.astype(str)

        pred_reset = pred_reset.rename(columns={'index': 'fecha'})

        fig = create_plots(pred)

        return fig , pred_reset

    else:
        # Read the five uploaded CSVs into frames.
        train = load_csv(train)
        client = load_csv(client)
        historical_weather = load_csv(historical_weather)
        electricity_prices = load_csv(electricity_prices)
        gas_prices = load_csv(gas_prices)
        # Stage the filtered datasets as pickles (side effect of filter_datasets).
        filter_datasets(train,client,historical_weather,electricity_prices,gas_prices)
        # Merge the staged datasets into one hourly feature frame.
        df = merging_datasets()
        # Scale features with the loaded pipeline.
        # NOTE(review): assumes the pipeline object supports .transform and
        # ['scale'] indexing (e.g. an sklearn Pipeline) — confirm.
        df_processed = pipeline.transform(df)

        df_processed = pd.DataFrame(df_processed, columns=df.columns, index=df.index)

        # Forecast using the freshly built window as the model's last window.
        pred = model.predict(steps=steps, last_window=df_processed)

        pred = unscale_data(scaler, pred)

        pred_reset = pred.reset_index(drop=False)

        pred_reset = pred_reset.astype(str)

        pred_reset = pred_reset.rename(columns={'index': 'fecha'})

        fig = create_plots(pred)

        return fig , pred_reset
128
+
129
+
130
+
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ astral==3.2
2
+ feature_engine==1.8.2
3
+ geopy==2.4.1
4
+ gradio==5.7.1
5
+ joblib==1.4.2
6
+ numpy==2.1.3
7
+ pandas==2.2.3
8
+ plotly==5.24.1
9
+ root==0.0.1
10
+ scikit_learn==1.5.2
11
+ skforecast==0.14.0