josegoji commited on
Commit
115745a
·
verified ·
1 Parent(s): e705ce2

Upload 7 files

Browse files
Files changed (7) hide show
  1. LSTM_forecaster.joblib +3 -0
  2. app.py +19 -0
  3. filterdf.py +121 -0
  4. mergedf.py +152 -0
  5. pipeline.pkl +3 -0
  6. pipelineFinal.py +130 -0
  7. requirements.txt +11 -0
LSTM_forecaster.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c16873628011f5cc2e860eeaf65437171df8b5da6be69fc7dcd3a57b6e64f233
3
+ size 1893197
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from pipelineFinal import pipeline_final

# Gradio front-end for the LSTM production forecaster.
# The five File inputs are only used when the user answers "No" (i.e. the
# forecast does not start right after the training window) — pipeline_final
# then rebuilds the feature frame from the uploaded CSVs.
demo = gr.Interface(
    fn=pipeline_final,
    inputs=[
        gr.Dropdown(["Si", "No"], label="Deseas hacer una prediccion justo luego del tiempo del train"),
        gr.Slider(0, 24, label="Choose a number"),
        # Label typos fixed: "tain" -> "train", "histroical" -> "historical".
        gr.File(label="Sube el archivo de train en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de client en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de historical_weather en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de electricity_prices en csv. (Solo si elegiste NO)"),
        gr.File(label="Sube el archivo de gas_prices en csv. (Solo si elegiste NO)"),
    ],
    outputs=[gr.Plot(), gr.DataFrame()],
)

# Guard the launch so importing this module (e.g. from tests) does not
# start the web server.
if __name__ == "__main__":
    demo.launch()
19
+
filterdf.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from geopy.geocoders import Nominatim
3
+ import sys
4
+ import os
5
+
6
+
7
def initialize_geolocator(user_agent="county_locator"):
    """Build and return a Nominatim reverse-geocoding client.

    Parameters
    ----------
    user_agent : str
        Identifier sent to the Nominatim service with every request.
    """
    geocoder = Nominatim(user_agent=user_agent)
    return geocoder
10
+
11
+
12
def get_county_from_coordinates(latitude, longitude, geolocator):
    """Reverse-geocode a coordinate pair to a county name.

    Returns the county reported by *geolocator*, or "Unknown" when the
    lookup yields no location or the address has no county field.
    """
    location = geolocator.reverse((latitude, longitude), language="en")
    if not location:
        return "Unknown"
    address = location.raw.get("address", {})
    return address.get("county", "Unknown")
18
+
19
+
20
def add_county_column(df):
    """Replace latitude/longitude columns with a geocoded 'county' column.

    Each unique coordinate pair is reverse-geocoded once, then the result
    is joined back onto every matching row of *df*.
    """
    geolocator = initialize_geolocator()
    unique_coords = df[['longitude', 'latitude']].drop_duplicates()
    # One network lookup per distinct coordinate pair (not per row).
    unique_coords['county'] = [
        get_county_from_coordinates(lat, lon, geolocator)
        for lon, lat in zip(unique_coords['longitude'], unique_coords['latitude'])
    ]
    merged = df.merge(
        unique_coords[['latitude', 'longitude', 'county']],
        on=['latitude', 'longitude'],
    )
    return merged.drop(['longitude', 'latitude'], axis=1)
30
+
31
+
32
def filter_estonian_counties(df):
    """Filter rows to the 15 Estonian counties and integer-encode them.

    Rows whose 'county' is not a recognised Estonian county name are
    dropped; the remaining names are mapped to the project's fixed integer
    codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'county' column of county-name strings.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with integer 'county'.
    """
    # The mapping's keys double as the membership whitelist, so the
    # separate county-name list the original kept is redundant.
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7, 'Valga County': 13, 'Viljandi County': 14,
        'Tartu County': 11, 'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1, 'Lääne County': 6,
        'Rapla County': 9, 'Järva County': 3, 'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # .copy() fixes the chained-assignment problem: assigning via .loc into
    # a boolean-filtered view raises SettingWithCopyWarning and can silently
    # fail to stick under pandas copy-on-write.
    df = df[df['county'].isin(county_to_int)].copy()
    df['county'] = df['county'].map(county_to_int)
    return df
47
+
48
+
49
def filter_data(train, client, weather, is_business, product_type, county_code):
    """Restrict the train/client/weather frames to one contract and county.

    *train* is additionally limited to production rows (is_consumption == 0).
    The columns used for filtering are dropped from each returned frame.
    """
    contract_mask = (
        (train['is_business'] == is_business)
        & (train['product_type'] == product_type)
        & (train['county'] == county_code)
    )
    train = train[contract_mask].drop(['is_business', 'product_type', 'county'], axis=1)
    # Keep only production rows, then the flag itself is redundant.
    train = train[train['is_consumption'] == 0].drop(['is_consumption'], axis=1)

    client_mask = (
        (client['is_business'] == is_business)
        & (client['product_type'] == product_type)
        & (client['county'] == county_code)
    )
    client = client[client_mask].drop(['is_business', 'product_type', 'county'], axis=1)

    weather = weather[weather['county'] == county_code].drop(['county'], axis=1)

    return train, client, weather
71
+
72
def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset to the corresponding path.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
    paths : list of str, optional
        Defaults to the project's process_files/ staging paths resolved
        next to this module. Extra datasets (or paths) beyond the shorter
        list are silently ignored, matching zip() semantics.
    """
    if paths is None:
        # Resolve relative to this module so the script works from any cwd.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        names = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                 'electricity_prices.pkl', 'gas_prices.pkl']
        paths = [os.path.join(base_dir, 'process_files', name) for name in names]

    for dataset, path in zip(datasets, paths):
        # Create the target directory if missing. The mergedf.py sibling
        # already did this; previously this version crashed with
        # FileNotFoundError when process_files/ did not exist yet.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        dataset.to_pickle(path)
89
+
90
+
91
+
92
def filter_datasets(train,client,historical_weather,electricity_prices,gas_prices):
    """Filter the five raw frames down to one contract and stage them as pickles.

    NOTE(review): mutates the passed-in frames in place (drop with
    inplace=True, datetime conversion) and writes pickle files as a side
    effect; returns None. The geocoding step is network-bound (Nominatim)
    and can be slow or rate-limited.
    """
    # Hard-coded selection: business contracts (1), product_type 3,
    # county code 0 (Harju County in this project's encoding).
    is_business, product_type, county_code = 1, 3, 0

    # Drop unnecessary columns and change date columns to datetime type
    datasets_info = [
        [train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']],
        [client, ['data_block_id'], ['date']],
        [historical_weather, ['data_block_id'], ['datetime']],
        [electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']],
        [gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']]
    ] # [df, [drop_cols], [date_cols]]

    for df, drop_cols, date_cols in datasets_info:
        # inplace=True: the caller's frames are modified directly.
        df.drop(drop_cols, axis=1, inplace=True)
        for col in date_cols:
            df[col] = pd.to_datetime(df[col])

    # Map coordinates to county names, then keep Estonian counties only
    # (names are replaced by integer codes).
    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)
    # Average duplicate weather rows (multiple stations) per county and timestamp.
    historical_weather = historical_weather.groupby(['county', 'datetime']).agg('mean').reset_index()

    # Restrict all frames to the selected contract/county and to production rows.
    train, client, historical_weather = filter_data(train, client, historical_weather, is_business, product_type, county_code)

    # Persist the filtered frames for the merge stage (mergedf.load_datasets).
    save_datasets_to_pickle([train, client, historical_weather, electricity_prices, gas_prices])
121
+
mergedf.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+ import os
4
+
5
+
6
+
7
def load_datasets():
    """Load the five staged pickles from ./process_files and return them.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, client, historical_weather, electricity_prices, gas_prices)

    Raises
    ------
    FileNotFoundError
        Naming the first missing file, before any unpickling is attempted.
    """
    # Paths are resolved against the current working directory (this must
    # match where filterdf.save_datasets_to_pickle wrote them).
    stage_dir = os.path.join(os.getcwd(), 'process_files')
    file_names = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                  'electricity_prices.pkl', 'gas_prices.pkl']
    paths = [os.path.join(stage_dir, name) for name in file_names]

    # Fail early with a clear message instead of a bare unpickling error.
    for path in paths:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Archivo no encontrado: {path}")

    return tuple(pd.read_pickle(path) for path in paths)
32
+
33
+
34
def add_time_series_col(client, historical_weather, electricity_prices, gas_prices):
    """Shift each frame's timestamps to when the data becomes available.

    Client data lags 3 days, historical weather 2 days, and both price
    feeds 1 day. The superseded raw date columns are dropped afterwards.
    """
    # Availability offsets, in days, per feed.
    client['datetime'] = pd.to_datetime(client['date']) + pd.Timedelta(days=3)
    historical_weather['datetime'] = historical_weather['datetime'] + pd.Timedelta(days=2)
    electricity_prices['datetime'] = pd.to_datetime(electricity_prices['forecast_date']) + pd.Timedelta(days=1)
    gas_prices['datetime'] = pd.to_datetime(gas_prices['forecast_date']) + pd.Timedelta(days=1)

    # The original date columns are replaced by the shifted 'datetime'.
    client = client.drop(columns=['date'])
    electricity_prices = electricity_prices.drop(columns=['forecast_date'])
    gas_prices = gas_prices.drop(columns=['forecast_date'])

    return client, historical_weather, electricity_prices, gas_prices
48
+
49
+
50
def merge_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Join the five frames into one wide frame keyed by hourly datetime.

    Weather and electricity prices join on the exact timestamp (left
    joins); client and gas prices are daily feeds, so they join on the
    calendar day (outer joins). The temporary 'date' key is dropped.
    """
    # Hourly joins on the exact timestamp.
    merged = (
        train
        .merge(historical_weather, on='datetime', how='left')
        .merge(electricity_prices, on='datetime', how='left')
    )

    # Daily joins: collapse every timestamp to midnight of its day.
    merged['date'] = merged['datetime'].dt.floor('D')
    client['date'] = client['datetime'].dt.floor('D')
    client = client.drop('datetime', axis=1)
    gas_prices['date'] = gas_prices['datetime'].dt.floor('D')
    gas_prices = gas_prices.drop('datetime', axis=1)

    merged = (
        merged
        .merge(client, on='date', how='outer')
        .merge(gas_prices, on='date', how='outer')
    )

    # The helper day-key has served its purpose.
    return merged.drop(['date'], axis=1)
69
+
70
+
71
def reorder_columns(df, column_order=None):
    """Return *df* with its columns in a canonical order.

    Parameters
    ----------
    df : pandas.DataFrame
    column_order : list of str, optional
        Explicit ordering; defaults to the project's canonical layout
        (datetime, target, weather features, prices, client features).
        Every listed column must exist in *df* (KeyError otherwise).
    """
    # `is None`, not `== None`: equality can be overridden (and is
    # elementwise on pandas objects); identity is the correct test.
    if column_order is None:
        column_order = [
            'datetime', 'target', 'temperature', 'dewpoint', 'rain', 'snowfall',
            'surface_pressure', 'cloudcover_total', 'cloudcover_low', 'cloudcover_mid',
            'cloudcover_high', 'windspeed_10m', 'winddirection_10m',
            'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
            'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh', 'eic_count', 'installed_capacity'
        ]
    return df[column_order]
82
+
83
+
84
def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset in *datasets* to the matching entry of *paths*.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
    paths : list of str, optional
        Defaults to the project's staging location taken from the `root`
        module. Missing parent directories are created.
    """
    # `is None`, not `== None`: identity is the idiomatic and safe test.
    if paths is None:
        # Imported lazily: `root` provides project directory constants and
        # is only needed when the caller did not supply explicit paths.
        import root
        paths = [
            root.DIR_DATA_STAGE + 'merged_df.pkl',
        ]

    # Create missing folders; skip bare filenames, where dirname() is ''
    # and makedirs('') would raise FileNotFoundError.
    for path in paths:
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    # Save each dataset to its respective path.
    for dataset, path in zip(datasets, paths):
        dataset.to_pickle(path)
99
+
100
+
101
def drop_first_3_days(df, column, threshold_column, threshold_nans=70):
    """Drop the first 3 days of *df* when *threshold_column* is too sparse.

    If the NaN count in *threshold_column* exceeds *threshold_nans*, rows
    whose *column* value falls within 3 days of the earliest timestamp are
    removed; otherwise the frame is returned unchanged.
    """
    missing_count = df[threshold_column].isna().sum()

    if missing_count > threshold_nans:
        # Keep only rows at or after (earliest timestamp + 3 days).
        cutoff = df[column].min() + pd.Timedelta(days=3)
        df = df[df[column] >= cutoff]

    return df
116
+
117
+
118
def feature_selection(df):
    """Drop a fixed set of feature columns from *df* in place.

    Returns the same frame so the call can be chained. All listed columns
    must be present (KeyError otherwise).
    """
    discarded = [
        'dewpoint', 'cloudcover_low', 'cloudcover_mid',
        'cloudcover_high', 'direct_solar_radiation',
        'diffuse_radiation', 'lowest_price_per_mwh',
        'highest_price_per_mwh', 'eic_count',
    ]
    df.drop(columns=discarded, axis=1, inplace=True)
    return df
125
+
126
+
127
def set_datetime_index(df):
    """Index *df* by its 'datetime' column at a fixed hourly frequency.

    asfreq('h') inserts NaN rows for any missing hours, producing a
    gap-free hourly series.
    """
    indexed = df.set_index('datetime')
    return indexed.asfreq('h')
131
+
132
+
133
def merging_datasets():
    """Run the merge stage: load staged pickles and produce the model frame.

    Returns an hourly, datetime-indexed DataFrame containing the selected
    feature columns.
    """
    # Load the five staged datasets written by the filter stage.
    train, client, historical_weather, electricity_prices, gas_prices = load_datasets()

    # Align each feed's timestamps to its data-availability date.
    client, historical_weather, electricity_prices, gas_prices = add_time_series_col(
        client, historical_weather, electricity_prices, gas_prices
    )

    # Join everything onto the hourly train frame, then shape it for the model.
    merged = merge_datasets(train, client, historical_weather, electricity_prices, gas_prices)
    merged = reorder_columns(merged)
    merged = feature_selection(merged)
    return set_datetime_index(merged)
pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6fa69d47a46f823b38783c73e2d36215e1884b6b30742c45c9912ed1542a4be
3
+ size 2283
pipelineFinal.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from calendar import c
2
+ from os import pipe
3
+ import pandas as pd
4
+ import pickle
5
+ from skforecast.utils import load_forecaster
6
+ from filterdf import filter_datasets
7
+ from mergedf import merging_datasets
8
+ import numpy as np
9
+ import plotly.graph_objects as go
10
+
11
def load_csv(input_file):
    """Read a CSV upload into a DataFrame.

    Parameters
    ----------
    input_file : str | path | file-like
        Anything pandas.read_csv accepts (Gradio passes a temp-file path).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the file cannot be parsed or contains no data rows.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        # Bug fix: the original `raise f"..."` raised a *string*, which is
        # itself a TypeError in Python 3 and masked the real failure. Raise
        # a real exception and chain the cause.
        raise ValueError(f"Error al cargar el archivo CSV:{e}") from e

    if df.empty:
        raise ValueError("El archivo subido está vacío o no tiene datos válidos.")

    return df
25
+
26
def load_model(name):
    """Load a persisted skforecast forecaster from *name*.

    verbose=True makes load_forecaster print the forecaster summary.
    """
    return load_forecaster(name, verbose=True)
30
+
31
def load_pipeline():
    """Unpickle the preprocessing pipeline from ./pipeline.pkl.

    The path is relative to the current working directory, matching how
    the app is launched.
    """
    with open('pipeline.pkl', 'rb') as handle:
        return pickle.load(handle)
35
+
36
def unscale_data(scaler, predictions):
    """Invert the feature scaling on the 'target' column of *predictions*.

    The target is embedded into column 0 of a zero matrix 11 columns wide
    before inverse-transforming, and negative results are clipped to 0.
    NOTE(review): assumes the fitted scaler expects exactly 11 features —
    confirm against the training pipeline.
    """
    buffer = np.zeros((len(predictions), 11))
    buffer[:, 0] = predictions['target']
    target_values = scaler.inverse_transform(buffer)[:, 0]
    # Physical production cannot be negative; clip instead of trusting
    # the inverse transform blindly.
    target_values = np.where(target_values < 0, 0, target_values)
    return pd.DataFrame(target_values, columns=predictions.columns, index=predictions.index)
43
+
44
def create_plots(predictions):
    """Build a line plot of the forecasted production over time.

    Returns a plotly Figure with a single "Estimado" trace indexed by the
    prediction timestamps.
    """
    estimate_trace = go.Scatter(
        x=predictions.index,
        y=predictions['target'],
        name="Estimado",
        mode="lines",
        line_color="#4EA72E",
    )
    figure = go.Figure(data=[estimate_trace])
    figure.update_layout(
        yaxis_title="Producción (kWh)",
        width=750,
        height=350,
        margin=dict(l=20, r=0, t=35, b=20),
        # Legend pinned to the top-right corner inside the plot area.
        legend=dict(
            orientation="v",
            yanchor="top",
            xanchor="right",
            x=0.99,
            y=0.99,
        ),
    )
    return figure
63
+
64
+
65
def pipeline_final(texto,steps,train=None,client=None,historical_weather=None,electricity_prices=None,gas_prices=None):
    """End-to-end inference entry point wired to the Gradio interface.

    Parameters
    ----------
    texto : str
        'Si' to forecast straight after the training window (no uploads
        needed); any other value runs the full CSV-upload pipeline.
    steps : int
        Number of forecast steps passed to the model.
    train, client, historical_weather, electricity_prices, gas_prices :
        CSV uploads (paths/file-likes); only read when texto != 'Si'.

    Returns
    -------
    tuple
        (plotly figure, DataFrame of predictions as strings with a
        'fecha' column).
    """
    # Leftover local-test fixtures from development (kept commented out):
    #texto = 'No'
    # steps = 24
    # train = 'files_prueba/train_filtered.csv'
    # client = 'files_prueba/client_filtered.csv'
    # historical_weather = 'files_prueba/historical_weather_filtered.csv'
    # electricity_prices = 'files_prueba/electricity_prices_filtered.csv'
    # gas_prices = 'files_prueba/gas_prices_filtered.csv'
    # Load the persisted preprocessing pipeline and keep its scaler step
    # for inverting predictions later.
    pipeline = load_pipeline()
    scaler = pipeline['scale']

    # Load the trained LSTM forecaster from disk.
    model = load_model('LSTM_forecaster.joblib')


    if texto == 'Si':
        # Forecast directly from the model's stored last window.
        pred = model.predict(steps=steps)

        # Back to original units (kWh), negatives clipped to 0.
        pred = unscale_data(scaler, pred)

        # Expose the datetime index as a regular column for display.
        pred_reset = pred.reset_index(drop=False)

        # Stringify so the Gradio DataFrame renders timestamps verbatim.
        pred_reset = pred_reset.astype(str)

        pred_reset = pred_reset.rename(columns={'index': 'fecha'})

        fig = create_plots(pred)

        return fig , pred_reset

    else:
        # Read the five uploaded CSVs into frames.
        train = load_csv(train)
        client = load_csv(client)
        historical_weather = load_csv(historical_weather)
        electricity_prices = load_csv(electricity_prices)
        gas_prices = load_csv(gas_prices)
        # Stage the filtered datasets as pickles (side effect of filter_datasets).
        filter_datasets(train,client,historical_weather,electricity_prices,gas_prices)
        # Merge the staged datasets into one hourly feature frame.
        df = merging_datasets()
        # Scale features with the loaded pipeline.
        # NOTE(review): assumes the pipeline object supports .transform and
        # ['scale'] indexing (e.g. an sklearn Pipeline) — confirm.
        df_processed = pipeline.transform(df)

        df_processed = pd.DataFrame(df_processed, columns=df.columns, index=df.index)

        # Forecast using the freshly built window as the model's last window.
        pred = model.predict(steps=steps, last_window=df_processed)

        pred = unscale_data(scaler, pred)

        pred_reset = pred.reset_index(drop=False)

        pred_reset = pred_reset.astype(str)

        pred_reset = pred_reset.rename(columns={'index': 'fecha'})

        fig = create_plots(pred)

        return fig , pred_reset
128
+
129
+
130
+
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ astral==3.2
2
+ feature_engine==1.8.2
3
+ geopy==2.4.1
4
+ gradio==5.7.1
5
+ joblib==1.4.2
6
+ numpy==2.1.3
7
+ pandas==2.2.3
8
+ plotly==5.24.1
9
+ root==0.0.1
10
+ scikit_learn==1.5.2
11
+ skforecast==0.14.0