# ThejasRao's picture
# Fix: Readme
# ecb9d4e
import streamlit as st
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from .features import (
create_forecasting_features,
create_forecasting_features_1m,
create_forecasting_features_3m,
)
from .plotting import plot_data, download_button
from .config import get_collections
def _custom_grid_search(X_train, y_train, X_test, y_test, param_grid, progress_bar):
    """Exhaustively search ``param_grid`` and return the best-scoring parameters.

    Every combination of candidate values is applied to a single
    ``XGBRegressor`` via ``set_params``, fitted on the training data and
    scored with ``model.score`` on the held-out data. The first combination
    that reaches the highest score wins (ties keep the earlier one).

    NOTE(review): the caller in this module passes its test split as the
    held-out data, so metrics later reported on that same split are
    optimistic. Consider a separate validation split.

    Args:
        X_train: Feature matrix the candidates are fitted on.
        y_train: Target values matching ``X_train``.
        X_test: Feature matrix the candidates are scored on.
        y_test: Target values matching ``X_test``.
        param_grid: Mapping of estimator parameter name to an iterable of
            candidate values. Any parameter names are accepted; combinations
            are visited with the last key varying fastest.
        progress_bar: Object exposing ``progress(percent)``; it is called
            after every fit with an integer percentage of combinations done.

    Returns:
        A dict with the winning value for each key of ``param_grid``, or
        ``None`` when no combination produced a score greater than ``-inf``
        (for example when a candidate list is empty or every score is NaN).
    """
    # Local import, matching the function-scope import style used elsewhere
    # in this file.
    from itertools import product

    names = list(param_grid)
    candidates = [list(param_grid[name]) for name in names]
    # Materialised so the total is known up front for the progress bar.
    combinations = list(product(*candidates))
    total = len(combinations)

    model = XGBRegressor()
    best_score = float('-inf')
    best_params = None

    for done, combination in enumerate(combinations, start=1):
        params = dict(zip(names, combination))
        model.set_params(**params)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        # Strict '>' keeps the earliest combination on ties and never
        # selects a NaN score.
        if score > best_score:
            best_score = score
            best_params = params
        progress_bar.progress(int((done / total) * 100))

    return best_params
def _train_and_evaluate_generic(df, feature_fn, split_date, progress_bar):
    """Tune, train and evaluate an XGBoost price model, then chart the result.

    The frame returned by ``feature_fn(df)`` is split on ``split_date``: rows
    with 'Reported Date' before it form the training set, the rest form the
    test set. Hyperparameters are chosen by ``_custom_grid_search``, a model
    with the winning parameters is refitted on the training set, and RMSE and
    MAE on the test set are written to the Streamlit page together with a
    Plotly chart of the train, test and predicted series.

    Args:
        df: Input data; after ``feature_fn`` it must contain the columns
            'Reported Date' and 'Modal Price (Rs./Quintal)'. Every other
            column is used as a model feature.
        feature_fn: Callable that takes ``df`` and returns the feature frame.
        split_date: Value compared against 'Reported Date' to split the data.
        progress_bar: Progress widget forwarded to the grid search.

    Returns:
        The dict of best hyperparameters found by the grid search.
    """
    target_col = 'Modal Price (Rs./Quintal)'
    date_col = 'Reported Date'

    df = feature_fn(df)
    train_df = df[df[date_col] < split_date]
    test_df = df[df[date_col] >= split_date]

    X_train = train_df.drop(columns=[target_col, date_col])
    y_train = train_df[target_col]
    X_test = test_df.drop(columns=[target_col, date_col])
    y_test = test_df[target_col]

    param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 150],
        'booster': ['gbtree', 'dart']
    }

    st.write("Performing hyperparameter tuning...")
    best_params = _custom_grid_search(X_train, y_train, X_test, y_test, param_grid, progress_bar)

    st.write("Training the best model and making predictions...")
    best_model = XGBRegressor(**best_params)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # mean_squared_error returns the MSE; take the root so the value shown
    # under the "RMSE" label really is the root-mean-squared error. The
    # explicit square root avoids depending on the `squared=` keyword or on
    # `root_mean_squared_error`, whose availability varies by scikit-learn
    # version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    st.write(f"RMSE: {rmse}")
    st.write(f"MAE: {mae}")

    # Prepare plot data: one long frame tagged by series type.
    train_plot_df = train_df[[date_col, target_col]].copy()
    train_plot_df['Type'] = 'Train'
    test_plot_df = test_df[[date_col, target_col]].copy()
    test_plot_df['Type'] = 'Test'
    predicted_plot_df = test_df[[date_col]].copy()
    predicted_plot_df[target_col] = y_pred
    predicted_plot_df['Type'] = 'Predicted'
    plot_df = pd.concat([train_plot_df, test_plot_df, predicted_plot_df])

    import plotly.graph_objects as go

    fig = go.Figure()
    for plot_type, color, dash in [('Train', 'blue', None), ('Test', 'orange', None), ('Predicted', 'green', 'dot')]:
        data = plot_df[plot_df['Type'] == plot_type]
        fig.add_trace(go.Scatter(
            x=data[date_col],
            y=data[target_col],
            mode='lines',
            name=f"{plot_type} Data",
            line=dict(color=color, dash=dash)
        ))
    fig.update_layout(title="Train, Test, and Predicted Data", xaxis_title="Date", yaxis_title="Modal Price (Rs./Quintal)", template="plotly_white")
    st.plotly_chart(fig, use_container_width=True)

    return best_params
def train_and_evaluate(df):
    """Tune and evaluate the model built on ``create_forecasting_features``.

    Creates a fresh Streamlit progress bar and delegates to
    ``_train_and_evaluate_generic`` with a train/test split at '2024-01-01'.

    Returns:
        Whatever ``_train_and_evaluate_generic`` returns (the best
        hyperparameters).
    """
    bar = st.progress(0)
    return _train_and_evaluate_generic(
        df,
        feature_fn=create_forecasting_features,
        split_date='2024-01-01',
        progress_bar=bar,
    )
def train_and_evaluate_1m(df):
    """Tune and evaluate the model built on ``create_forecasting_features_1m``.

    Creates a fresh Streamlit progress bar and delegates to
    ``_train_and_evaluate_generic`` with a train/test split at 2023-01-01.

    Returns:
        Whatever ``_train_and_evaluate_generic`` returns (the best
        hyperparameters).
    """
    bar = st.progress(0)
    return _train_and_evaluate_generic(
        df,
        feature_fn=create_forecasting_features_1m,
        split_date=pd.to_datetime('2023-01-01'),
        progress_bar=bar,
    )
def train_and_evaluate_3m(df):
    """Tune and evaluate the model built on ``create_forecasting_features_3m``.

    Creates a fresh Streamlit progress bar and delegates to
    ``_train_and_evaluate_generic`` with a train/test split at 2023-01-01.

    Returns:
        Whatever ``_train_and_evaluate_generic`` returns (the best
        hyperparameters).
    """
    bar = st.progress(0)
    return _train_and_evaluate_generic(
        df,
        feature_fn=create_forecasting_features_3m,
        split_date=pd.to_datetime('2023-01-01'),
        progress_bar=bar,
    )
def _forecast_next_n_days(df, best_params, key, feature_fn, horizon_days):
    """Fit on all of ``df`` and predict prices for the following days.

    Shared implementation behind the ``forecast_next_*_days`` entry points,
    which previously carried three copies of this body differing only in the
    horizon and the feature function.

    Steps: append ``horizon_days`` daily rows after the latest
    'Reported Date', run ``feature_fn`` over the combined frame, fit an
    ``XGBRegressor(**best_params)`` on the historical rows, predict the
    appended rows, then pass the result to ``plot_data`` and
    ``download_button``.

    Args:
        df: Historical data with 'Reported Date' and
            'Modal Price (Rs./Quintal)' columns.
        best_params: Keyword arguments for ``XGBRegressor``.
        key: Identifier forwarded to ``download_button``.
        feature_fn: Callable that builds the feature frame from the combined
            historical + future frame.
        horizon_days: Number of future daily rows to forecast.
    """
    target_col = 'Modal Price (Rs./Quintal)'
    date_col = 'Reported Date'

    last_date = df[date_col].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=horizon_days)
    future_df = pd.DataFrame({date_col: future_dates})

    # Features are built over history and future together, in one pass.
    full_df = pd.concat([df, future_df], ignore_index=True)
    full_df = feature_fn(full_df)

    original_df = full_df[full_df[date_col] <= last_date].copy()
    future_df = full_df[full_df[date_col] > last_date].copy()

    # errors='ignore' tolerates feature frames that already lack a column.
    X_train = original_df.drop(columns=[target_col, date_col], errors='ignore')
    y_train = original_df[target_col]
    X_future = future_df.drop(columns=[target_col, date_col], errors='ignore')

    model = XGBRegressor(**best_params)
    model.fit(X_train, y_train)
    future_df[target_col] = model.predict(X_future)

    plot_data(original_df, future_df, last_date, model, horizon_days)
    download_button(future_df, key)


def forecast_next_14_days(df, _best_params, key):
    """Forecast the next 14 days using ``create_forecasting_features``.

    Args:
        df: Historical price data.
        _best_params: Keyword arguments for ``XGBRegressor``.
        key: Identifier forwarded to ``download_button``.
    """
    _forecast_next_n_days(df, _best_params, key, create_forecasting_features, 14)


def forecast_next_30_days(df, _best_params, key):
    """Forecast the next 30 days using ``create_forecasting_features_1m``.

    Args:
        df: Historical price data.
        _best_params: Keyword arguments for ``XGBRegressor``.
        key: Identifier forwarded to ``download_button``.
    """
    _forecast_next_n_days(df, _best_params, key, create_forecasting_features_1m, 30)


def forecast_next_90_days(df, _best_params, key):
    """Forecast the next 90 days using ``create_forecasting_features_3m``.

    Args:
        df: Historical price data.
        _best_params: Keyword arguments for ``XGBRegressor``.
        key: Identifier forwarded to ``download_button``.
    """
    _forecast_next_n_days(df, _best_params, key, create_forecasting_features_3m, 90)
def train_and_forecast(df, filter_key, days):
    """Train for the requested horizon, store the best parameters, forecast.

    For ``days`` equal to 14, 30 or 90 the matching trainer is run, its best
    parameters are upserted (keyed by ``filter_key`` and stamped with the
    current time) into the matching collection, and the matching forecast
    function is called. Nothing happens when ``df`` is None or ``days`` is
    any other value.

    Args:
        df: Historical price data, or None.
        filter_key: Identifier of the data selection; used as the storage key
            and passed on to the forecast function.
        days: Forecast horizon in days.
    """
    cols = get_collections()
    if df is None:
        return

    # (horizon, trainer, collection name, forecaster)
    pipelines = (
        (14, train_and_evaluate, 'best_params_collection', forecast_next_14_days),
        (30, train_and_evaluate_1m, 'best_params_collection_1m', forecast_next_30_days),
        (90, train_and_evaluate_3m, 'best_params_collection_3m', forecast_next_90_days),
    )

    for horizon, trainer, collection_name, forecaster in pipelines:
        if days == horizon:
            best_params = trainer(df)
            collection = cols[collection_name]
            selector = {'filter_key': filter_key}
            document = {
                **best_params,
                'filter_key': filter_key,
                'last_updated': pd.Timestamp.now().isoformat(),
            }
            collection.replace_one(selector, document, upsert=True)
            forecaster(df, best_params, filter_key)
            return
def get_best_params(filter_key, collection):
    """Look up the stored record for ``filter_key`` in ``collection``.

    Args:
        filter_key: Value matched against the record's "filter_key" field.
        collection: Object exposing ``find_one(query)``.

    Returns:
        Whatever ``collection.find_one`` returns for the query.
    """
    query = {"filter_key": filter_key}
    return collection.find_one(query)
def forecast(df, filter_key, days):
    """Forecast with previously stored hyperparameters for ``filter_key``.

    For ``days`` equal to 14, 30 or 90 the stored record is fetched from the
    matching collection. If one exists, its training timestamp is shown and
    the matching forecast function is run; otherwise a warning asks the user
    to train first. Any other ``days`` value does nothing.

    Args:
        df: Historical price data passed to the forecast function.
        filter_key: Identifier of the data selection used as the storage key.
        days: Forecast horizon in days.
    """
    cols = get_collections()

    # The stored document holds the hyperparameters plus bookkeeping fields
    # ('filter_key' and 'last_updated' written at training time, and the
    # '_id' MongoDB adds). Those are not model hyperparameters, so they are
    # removed before the dict is expanded into XGBRegressor(**params).
    metadata_keys = ('_id', 'filter_key', 'last_updated')

    # (horizon, collection name, forecaster)
    pipelines = (
        (14, 'best_params_collection', forecast_next_14_days),
        (30, 'best_params_collection_1m', forecast_next_30_days),
        (90, 'best_params_collection_3m', forecast_next_90_days),
    )

    for horizon, collection_name, forecaster in pipelines:
        if days != horizon:
            continue
        record = get_best_params(filter_key, cols[collection_name])
        if record:
            st.info(f"ℹ️ The model was trained on {record['last_updated']}.")
            model_params = {
                name: value
                for name, value in record.items()
                if name not in metadata_keys
            }
            forecaster(df, model_params, filter_key)
        else:
            st.warning("⚠️ Model is not trained yet. Please train the model first.")
        return