TSF-EM

Running

JavadBayazi committed 7 days ago

Commit

06412fb

1 Parent(s): 0043c7f

Add modular data architecture and backtesting features

- Create data.py for centralized data fetching
- Implement ERCOTDataSource and SampleDataSource classes
- Add train/test split for model evaluation
- Display actual vs forecast comparison on plot
- Add error metrics: MAE, RMSE, MAPE
- Show detailed comparison table with day-by-day errors
- Visual train/test split marker on plot
- Easy to extend with new data sources (CAISO, PJM, etc.)

Files changed (2) hide show

app.py +85 -78
data.py +155 -0

app.py CHANGED Viewed

@@ -3,9 +3,9 @@ import pandas as pd
 import torch
 import matplotlib.pyplot as plt
 import numpy as np
-from gridstatus import Ercot
 from datetime import datetime, timedelta
 from models import ModelConfig, load_model_pipeline
 # Load the forecasting model pipeline
 @st.cache_resource
@@ -13,31 +13,11 @@ def load_pipeline(model_name):
     """Load and cache the model pipeline"""
     return load_model_pipeline(model_name, device_map="cpu", dtype=torch.float32)
-# Function to fetch ERCOT electricity price data
 @st.cache_data(ttl=3600)  # Cache for 1 hour
-def fetch_ercot_data(days_back=180):
-    """Fetch ERCOT day-ahead market prices for the current year"""
-    try:
-        ercot = Ercot()
-        current_year = datetime.now().year
-        # Get day-ahead market settlement point prices for the year
-        df = ercot.get_dam_spp(year=current_year)
-        # Get average price per day across all locations
-        df['Date'] = pd.to_datetime(df['Interval Start']).dt.date
-        daily_prices = df.groupby('Date')['SPP'].mean()
-        # Get the last N days
-        if len(daily_prices) > days_back:
-            daily_prices = daily_prices.tail(days_back)
-        # Convert to comma-separated string
-        price_list = daily_prices.round(2).tolist()
-        return ", ".join(map(str, price_list))
-    except Exception as e:
-        st.warning(f"Could not fetch live ERCOT data: {e}. Using sample data instead.")
-        return None
 # Streamlit app interface
 st.title("Electricity Market Price Forecasting with Chronos-2")
@@ -56,25 +36,11 @@ selected_model_name = st.selectbox(
 with st.spinner(f"Loading {selected_model_name}..."):
     pipeline = load_pipeline(selected_model_name)
-# Fetch default ERCOT data
-with st.spinner("Fetching latest ERCOT electricity prices..."):
-    ercot_data = fetch_ercot_data()
-# Fallback to sample data if fetching fails
-default_data = ercot_data if ercot_data else """
-25.50, 24.80, 26.30, 23.90, 25.10, 27.20, 28.50, 26.70, 24.30, 23.80, 25.40, 26.10, 27.80, 29.20, 28.40,
-26.90, 25.30, 24.70, 26.50, 28.10, 29.60, 31.20, 30.50, 28.80, 27.10, 25.90, 27.30, 28.70, 30.20, 32.10,
-31.40, 29.70, 28.20, 26.80, 28.40, 29.80, 31.50, 33.20, 32.60, 30.90, 29.30, 27.80, 29.40, 30.90, 32.70,
-34.50, 33.80, 32.10, 30.50, 28.90, 30.50, 32.10, 33.90, 35.80, 35.10, 33.30, 31.60, 30.10, 31.70, 33.40,
-35.20, 37.10, 36.40, 34.60, 32.90, 31.30, 32.90, 34.60, 36.50, 38.40, 37.70, 35.80, 34.10, 32.50, 34.20,
-35.90, 37.80, 39.80, 39.10, 37.10, 35.40, 33.70, 35.40, 37.20, 39.20, 41.20, 40.50, 38.50, 36.70, 35.00,
-36.70, 38.50, 40.60, 42.60, 41.90, 39.90, 38.00, 36.30, 38.00, 39.90, 42.00, 44.10, 43.40, 41.30, 39.40
-"""
 # Data source selection
 data_source = st.radio(
     "Select Data Source:",
-    ["Live ERCOT Data (Last 180 Days)", "Custom Data"],
     index=0
 )
@@ -82,68 +48,96 @@ data_source = st.radio(
 if data_source == "Custom Data":
     user_input = st.text_area(
         "Enter time series data (comma-separated values):",
-        ""
     )
 else:
     user_input = st.text_area(
-        "ERCOT Day-Ahead Hourly Market Prices ($/MWh) - Daily Average:",
         default_data.strip(),
         height=150
     )
     st.info("💡 Live data from ERCOT's Day-Ahead Market (DAM SPP) - averaged across all settlement points per day")
-# Convert user input into a list of numbers
-def process_input(input_str):
-    return [float(x.strip()) for x in input_str.split(",")]
 try:
     time_series_data = process_input(user_input)
 except ValueError:
     st.error("Please make sure all values are numbers, separated by commas.")
     time_series_data = []  # Set empty data on error to prevent further processing
-# Select the number of days for forecasting
-prediction_length = st.slider("Select Forecast Horizon (Days)", min_value=1, max_value=64, value=14)
 # If data is valid, perform the forecast
 if time_series_data:
-    # Create timestamps starting from today going backwards
     end_date = datetime.now()
     start_date = end_date - timedelta(days=len(time_series_data) - 1)
-    historical_dates = pd.date_range(start=start_date, periods=len(time_series_data), freq='D')
-    # Create a DataFrame for Chronos-2
     context_df = pd.DataFrame({
-        'timestamp': historical_dates,
-        'target': time_series_data,
         'id': 'ercot_prices'
     })
-    # Make the forecast using Chronos-2 API
-    pred_df = pipeline.predict_df(
-        context_df,
-        prediction_length=prediction_length,
-        quantile_levels=[0.1, 0.5, 0.9],
-        id_column="id",
-        timestamp_column="timestamp",
-        target="target",
-    )
-    # Prepare forecast data for plotting with actual dates
-    forecast_dates = pd.date_range(start=end_date + timedelta(days=1), periods=prediction_length, freq='D')
     median = pred_df["predictions"].values
     low = pred_df["0.1"].values
     high = pred_df["0.9"].values
     # Plot the historical and forecasted data with dates
-    plt.figure(figsize=(12, 6))
-    plt.plot(historical_dates, time_series_data, color="royalblue", label="Historical Prices", linewidth=2)
-    plt.plot(forecast_dates, median, color="tomato", label="Median Forecast", linewidth=2)
-    plt.fill_between(forecast_dates, low, high, color="tomato", alpha=0.3, label="80% Prediction Interval")
     plt.xlabel("Date")
     plt.ylabel("Price ($/MWh)")
-    plt.title("ERCOT Electricity Price Forecast")
-    plt.legend()
     plt.grid(alpha=0.3)
     plt.xticks(rotation=45)
     plt.tight_layout()
@@ -151,15 +145,28 @@ if time_series_data:
     # Show the plot in the Streamlit app
     st.pyplot(plt)
-    # Display forecast statistics
-    st.write("### Forecast Summary")
-    col1, col2, col3 = st.columns(3)
     with col1:
-        st.metric("Median Forecast", f"${median.mean():.2f}/MWh")
     with col2:
-        st.metric("Low (10th percentile)", f"${low.mean():.2f}/MWh")
     with col3:
-        st.metric("High (90th percentile)", f"${high.mean():.2f}/MWh")
 # Note for comments, feedback, or questions
 st.write("### Notes")

 import torch
 import matplotlib.pyplot as plt
 import numpy as np
 from datetime import datetime, timedelta
 from models import ModelConfig, load_model_pipeline
+from data import DataConfig, process_input, fetch_data_with_fallback
 # Load the forecasting model pipeline
 @st.cache_resource
     """Load and cache the model pipeline"""
     return load_model_pipeline(model_name, device_map="cpu", dtype=torch.float32)
+# Fetch data with caching
 @st.cache_data(ttl=3600)  # Cache for 1 hour
+def fetch_data(source_name, days_back=180):
+    """Fetch data from specified source with caching"""
+    return fetch_data_with_fallback(source_name, days_back)
 # Streamlit app interface
 st.title("Electricity Market Price Forecasting with Chronos-2")
 with st.spinner(f"Loading {selected_model_name}..."):
     pipeline = load_pipeline(selected_model_name)
 # Data source selection
+available_sources = DataConfig.get_source_names()
 data_source = st.radio(
     "Select Data Source:",
+    available_sources + ["Custom Data"],
     index=0
 )
 if data_source == "Custom Data":
     user_input = st.text_area(
         "Enter time series data (comma-separated values):",
+        "",
+        height=150
     )
+    data_source_used = "Custom"
+    error_msg = None
 else:
+    # Fetch data from selected source
+    with st.spinner(f"Fetching data from {data_source}..."):
+        default_data, data_source_used, error_msg = fetch_data(data_source)
+    if error_msg:
+        st.warning(f"⚠️ {error_msg}\nUsing sample data instead.")
     user_input = st.text_area(
+        f"{data_source_used} - Daily Average Prices ($/MWh):",
         default_data.strip(),
         height=150
     )
     st.info("💡 Live data from ERCOT's Day-Ahead Market (DAM SPP) - averaged across all settlement points per day")
 try:
     time_series_data = process_input(user_input)
 except ValueError:
     st.error("Please make sure all values are numbers, separated by commas.")
     time_series_data = []  # Set empty data on error to prevent further processing
+# Select the number of days for testing (forecasting on known data)
+max_test_days = min(64, len(time_series_data) - 10) if len(time_series_data) > 10 else 1
+prediction_length = st.slider(
+    "Select Test Window (Days to Forecast & Compare)",
+    min_value=1,
+    max_value=max_test_days,
+    value=min(14, max_test_days),
+    help="The last N days will be used as test data. The model will forecast these days and compare with actual values."
+)
 # If data is valid, perform the forecast
 if time_series_data:
+    # Split data into train and test
+    train_length = len(time_series_data) - prediction_length
+    train_data = time_series_data[:train_length]
+    test_data = time_series_data[train_length:]
+    # Create timestamps
     end_date = datetime.now()
     start_date = end_date - timedelta(days=len(time_series_data) - 1)
+    all_dates = pd.date_range(start=start_date, periods=len(time_series_data), freq='D')
+    train_dates = all_dates[:train_length]
+    test_dates = all_dates[train_length:]
+    # Create a DataFrame for training
     context_df = pd.DataFrame({
+        'timestamp': train_dates,
+        'target': train_data,
         'id': 'ercot_prices'
     })
+    # Make the forecast using the model
+    with st.spinner("Generating forecast..."):
+        pred_df = pipeline.predict_df(
+            context_df,
+            prediction_length=prediction_length,
+            quantile_levels=[0.1, 0.5, 0.9],
+            id_column="id",
+            timestamp_column="timestamp",
+            target="target",
+        )
+    # Extract predictions
     median = pred_df["predictions"].values
     low = pred_df["0.1"].values
     high = pred_df["0.9"].values
+    # Calculate error metrics
+    mae = np.mean(np.abs(np.array(test_data) - median))
+    mape = np.mean(np.abs((np.array(test_data) - median) / np.array(test_data))) * 100
+    rmse = np.sqrt(np.mean((np.array(test_data) - median) ** 2))
     # Plot the historical and forecasted data with dates
+    plt.figure(figsize=(14, 7))
+    plt.plot(train_dates, train_data, color="royalblue", label="Training Data", linewidth=2)
+    plt.plot(test_dates, test_data, color="green", label="Actual Test Data", linewidth=2, marker='o', markersize=4)
+    plt.plot(test_dates, median, color="tomato", label="Forecast", linewidth=2, linestyle='--', marker='s', markersize=4)
+    plt.fill_between(test_dates, low, high, color="tomato", alpha=0.3, label="80% Prediction Interval")
+    plt.axvline(x=train_dates[-1], color='gray', linestyle=':', linewidth=1, alpha=0.7)
+    plt.text(train_dates[-1], plt.ylim()[1]*0.95, ' Train/Test Split', fontsize=10, color='gray')
     plt.xlabel("Date")
     plt.ylabel("Price ($/MWh)")
+    plt.title(f"ERCOT Electricity Price Forecast - {prediction_length} Day Test Window")
+    plt.legend(loc='best')
     plt.grid(alpha=0.3)
     plt.xticks(rotation=45)
     plt.tight_layout()
     # Show the plot in the Streamlit app
     st.pyplot(plt)
+    # Display forecast statistics and error metrics
+    st.write("### Model Performance Metrics")
+    col1, col2, col3, col4 = st.columns(4)
     with col1:
+        st.metric("MAE", f"${mae:.2f}")
     with col2:
+        st.metric("RMSE", f"${rmse:.2f}")
     with col3:
+        st.metric("MAPE", f"{mape:.2f}%")
+    with col4:
+        st.metric("Avg Actual", f"${np.mean(test_data):.2f}/MWh")
+    # Show detailed comparison table
+    with st.expander("View Detailed Comparison"):
+        comparison_df = pd.DataFrame({
+            'Date': test_dates.strftime('%Y-%m-%d'),
+            'Actual': test_data,
+            'Forecast': median.round(2),
+            'Error': (median - np.array(test_data)).round(2),
+            'Error %': ((median - np.array(test_data)) / np.array(test_data) * 100).round(2)
+        })
+        st.dataframe(comparison_df, use_container_width=True)
 # Note for comments, feedback, or questions
 st.write("### Notes")

data.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""
+Data fetching and processing for electricity market price forecasting.
+Handles data retrieval from various sources and preprocessing.
+"""
+import pandas as pd
+from datetime import datetime, timedelta
+from gridstatus import Ercot
class DataSource:
    """Base class for electricity price data sources.

    Subclasses override :meth:`fetch_data` and normally set ``name`` and
    ``description`` in ``__init__``. The class-level values below are
    fallbacks, so code that reads ``source.name`` keeps working for a
    subclass that does not set them instead of failing with
    ``AttributeError``.
    """

    # Short human-readable label for the source; subclasses override it.
    name = "Unnamed data source"
    # Longer explanation of what the source provides; subclasses override it.
    description = ""

    def fetch_data(self, days_back=180):
        """
        Fetch data from the source.

        Args:
            days_back: Number of days of historical data to fetch

        Returns:
            Comma-separated string of prices

        Raises:
            NotImplementedError: Always, on the base class; subclasses
                must override this method.
        """
        # Name the concrete class so a missing override is easy to locate.
        raise NotImplementedError(
            f"{type(self).__name__} must implement fetch_data()"
        )
class ERCOTDataSource(DataSource):
    """Fetch electricity price data from ERCOT"""

    def __init__(self):
        self.name = "ERCOT (Texas)"
        self.description = "Electric Reliability Council of Texas - Day-Ahead Market"

    @staticmethod
    def _daily_average_prices(df, days_back):
        """
        Reduce settlement point prices to one average price per day.

        Args:
            df: DataFrame with an 'Interval Start' column (parseable by
                ``pd.to_datetime``) and a numeric 'SPP' column
            days_back: Maximum number of most recent days to keep (>= 1)

        Returns:
            Series of mean 'SPP' per calendar date, in ascending date
            order, without NaN entries, at most ``days_back`` long
        """
        # assign() works on a copy, so the caller's frame is not modified.
        dated = df.assign(Date=pd.to_datetime(df['Interval Start']).dt.date)
        daily_prices = dated.groupby('Date')['SPP'].mean()
        # A day with no usable price would otherwise be emitted as "nan".
        return daily_prices.dropna().tail(days_back)

    def fetch_data(self, days_back=180):
        """
        Fetch ERCOT day-ahead market prices for the most recent days.

        The current year is requested first. If it holds fewer than
        ``days_back`` days (for example early in January), the previous
        year is requested as well so the window can still be filled.

        Args:
            days_back: Number of days to fetch (default: 180)

        Returns:
            Comma-separated string of daily average prices

        Raises:
            ValueError: If ``days_back`` is smaller than 1
            RuntimeError: If the data could not be retrieved or no prices
                were returned; the original error is attached as the cause
        """
        if days_back < 1:
            raise ValueError(f"days_back must be at least 1, got {days_back}")
        try:
            ercot = Ercot()
            current_year = datetime.now().year
            # Get day-ahead market settlement point prices for the year
            df = ercot.get_dam_spp(year=current_year)
            daily_prices = self._daily_average_prices(df, days_back)
            if len(daily_prices) < days_back:
                # Not enough days in the current year yet: extend the
                # history with the previous year's prices.
                previous = ercot.get_dam_spp(year=current_year - 1)
                combined = pd.concat([previous, df], ignore_index=True)
                daily_prices = self._daily_average_prices(combined, days_back)
            if daily_prices.empty:
                # An empty string would look like a successful fetch.
                raise ValueError("no price rows were returned")
            # Convert to comma-separated string
            price_list = daily_prices.round(2).tolist()
            return ", ".join(map(str, price_list))
        except Exception as e:
            # Broad on purpose: the upstream client can fail in many ways
            # and callers only need one error type carrying the cause.
            raise RuntimeError(f"Could not fetch ERCOT data: {e}") from e
class SampleDataSource(DataSource):
    """Fallback sample electricity price data"""

    # Built-in demonstration series. The values at the end are treated as
    # the most recent ones when the series has to be shortened.
    _SAMPLE_DATA = """
25.50, 24.80, 26.30, 23.90, 25.10, 27.20, 28.50, 26.70, 24.30, 23.80, 25.40, 26.10, 27.80, 29.20, 28.40,
26.90, 25.30, 24.70, 26.50, 28.10, 29.60, 31.20, 30.50, 28.80, 27.10, 25.90, 27.30, 28.70, 30.20, 32.10,
31.40, 29.70, 28.20, 26.80, 28.40, 29.80, 31.50, 33.20, 32.60, 30.90, 29.30, 27.80, 29.40, 30.90, 32.70,
34.50, 33.80, 32.10, 30.50, 28.90, 30.50, 32.10, 33.90, 35.80, 35.10, 33.30, 31.60, 30.10, 31.70, 33.40,
35.20, 37.10, 36.40, 34.60, 32.90, 31.30, 32.90, 34.60, 36.50, 38.40, 37.70, 35.80, 34.10, 32.50, 34.20,
35.90, 37.80, 39.80, 39.10, 37.10, 35.40, 33.70, 35.40, 37.20, 39.20, 41.20, 40.50, 38.50, 36.70, 35.00,
36.70, 38.50, 40.60, 42.60, 41.90, 39.90, 38.00, 36.30, 38.00, 39.90, 42.00, 44.10, 43.40, 41.30, 39.40
"""

    def __init__(self):
        self.name = "Sample Data"
        self.description = "Sample electricity price data for demonstration"

    def fetch_data(self, days_back=180):
        """
        Return sample electricity price data.

        Args:
            days_back: Maximum number of values to return. When the sample
                series is not longer than this, the whole series is
                returned unchanged.

        Returns:
            Comma-separated string of sample prices

        Raises:
            ValueError: If ``days_back`` is smaller than 1
        """
        if days_back < 1:
            raise ValueError(f"days_back must be at least 1, got {days_back}")
        sample_data = self._SAMPLE_DATA.strip()
        values = [token.strip() for token in sample_data.split(",")]
        if days_back >= len(values):
            # Nothing to trim: hand back the series exactly as stored.
            return sample_data
        # Keep only the trailing values so the requested window is honored.
        return ", ".join(values[-days_back:])
class DataConfig:
    """Registry of the data sources that can be selected by name."""

    # Maps each selectable source name to the class implementing it.
    AVAILABLE_SOURCES = {
        "Live ERCOT Data (Last 180 Days)": ERCOTDataSource,
        "Sample Data": SampleDataSource,
    }

    @classmethod
    def get_source_names(cls):
        """Return the registered source names, in registration order."""
        return [label for label in cls.AVAILABLE_SOURCES]

    @classmethod
    def get_source(cls, source_name):
        """
        Build the data source registered under ``source_name``.

        Args:
            source_name: Name of the data source

        Returns:
            A new instance of the registered DataSource class

        Raises:
            ValueError: If no source is registered under that name
        """
        registry = cls.AVAILABLE_SOURCES
        factory = registry[source_name] if source_name in registry else None
        if factory is not None:
            return factory()
        raise ValueError(f"Unknown data source: {source_name}")
def process_input(input_str):
    """
    Convert comma-separated string to list of floats.

    Empty fields (for example from a trailing comma or a line break after
    a comma) are skipped.

    Args:
        input_str: Comma-separated string of numbers

    Returns:
        List of finite float values

    Raises:
        ValueError: If a field is not a number, or is NaN or infinite
    """
    import math  # local import: only this function needs it

    values = []
    for field in input_str.split(","):
        token = field.strip()
        if not token:
            continue
        value = float(token)
        # float() accepts "nan" and "inf"; such values would silently
        # corrupt any statistics computed from the series.
        if not math.isfinite(value):
            raise ValueError(f"Value is not a finite number: {token!r}")
        values.append(value)
    return values
def fetch_data_with_fallback(source_name, days_back=180):
    """
    Fetch data from specified source with fallback to sample data.

    The fallback is used when the source name is unknown, when the source
    raises, and when the source returns something other than a non-blank
    string.

    Args:
        source_name: Name of the data source
        days_back: Number of days to fetch

    Returns:
        Tuple of (data_string, source_used, error_message).
        ``error_message`` is None on success and a non-empty string
        whenever the sample data was substituted.
    """
    try:
        source = DataConfig.get_source(source_name)
        data = source.fetch_data(days_back)
    except Exception as e:
        # Broad on purpose: any failure of the source must end in the
        # sample fallback. Some exceptions have an empty message, so fall
        # back to the type name to keep the error visible to the caller.
        error_message = str(e) or type(e).__name__
    else:
        if isinstance(data, str) and data.strip():
            return data, source.name, None
        # None or a blank string cannot be parsed downstream; report it
        # instead of passing it on as a successful fetch.
        error_message = f"{source.name} returned no data"

    # Fallback to sample data
    sample_source = SampleDataSource()
    return sample_source.fetch_data(), sample_source.name, error_message