# Spaces: ThejasRao / agripredict (Sleeping) - File size: 10,234 Bytes

import streamlit as st
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from .features import (
    create_forecasting_features,
    create_forecasting_features_1m,
    create_forecasting_features_3m,
)
from .plotting import plot_data, download_button
from .config import get_collections


def _custom_grid_search(X_train, y_train, X_test, y_test, param_grid, progress_bar):
    """Exhaustively search ``param_grid`` and return the best-scoring combination.

    A single ``XGBRegressor`` is re-configured and refitted for every
    combination of ``learning_rate``, ``max_depth``, ``n_estimators`` and
    ``booster``. Each fit is ranked by ``model.score`` on the test data.

    Args:
        X_train, y_train: Data passed to ``fit``.
        X_test, y_test: Data passed to ``score`` to rank each combination.
        param_grid: Mapping holding a sequence of candidate values under each
            of the four keys named above.
        progress_bar: Object whose ``progress(int)`` method receives the
            percentage of combinations evaluated so far.

    Returns:
        A dict with the four winning values, or ``None`` if no combination
        scored above negative infinity (for instance when the grid is empty).
    """
    estimator = XGBRegressor()
    total = (
        len(param_grid['learning_rate'])
        * len(param_grid['max_depth'])
        * len(param_grid['n_estimators'])
        * len(param_grid['booster'])
    )

    # Lazily yields one settings dict per combination, in the same order as
    # four nested loops (learning_rate outermost, booster innermost).
    candidates = (
        {
            'learning_rate': rate,
            'max_depth': depth,
            'n_estimators': n_trees,
            'booster': booster_name,
        }
        for rate in param_grid['learning_rate']
        for depth in param_grid['max_depth']
        for n_trees in param_grid['n_estimators']
        for booster_name in param_grid['booster']
    )

    winner, top_score = None, float('-inf')
    for done, candidate in enumerate(candidates, start=1):
        estimator.set_params(**candidate)
        estimator.fit(X_train, y_train)
        candidate_score = estimator.score(X_test, y_test)
        # Strict comparison: on a tie the earliest combination is kept.
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score
        progress_bar.progress(int((done / total) * 100))
    return winner


def _train_and_evaluate_generic(df, feature_fn, split_date, progress_bar):
    """Tune, fit and evaluate an XGBoost price model on a date-based split.

    The frame returned by ``feature_fn(df)`` is split on ``'Reported Date'``:
    rows strictly before ``split_date`` form the training set, the rest the
    test set. A fixed hyper-parameter grid is searched, the winning model is
    refitted, RMSE and MAE on the test set are written to the Streamlit page,
    and a train/test/predicted line chart is rendered.

    NOTE(review): the grid search ranks candidates on the same test split the
    metrics are reported on, so the reported errors are likely optimistic.

    Args:
        df: Input frame handed to ``feature_fn``.
        feature_fn: Callable returning a frame that contains
            ``'Reported Date'``, ``'Modal Price (Rs./Quintal)'`` and the
            feature columns.
        split_date: Value compared against ``'Reported Date'`` to separate
            training rows from test rows.
        progress_bar: Progress widget forwarded to the grid search.

    Returns:
        The best hyper-parameter dict found by the grid search.
    """
    import plotly.graph_objects as go

    date_col = 'Reported Date'
    target_col = 'Modal Price (Rs./Quintal)'

    df = feature_fn(df)
    train_df = df[df[date_col] < split_date]
    test_df = df[df[date_col] >= split_date]

    X_train = train_df.drop(columns=[target_col, date_col])
    y_train = train_df[target_col]
    X_test = test_df.drop(columns=[target_col, date_col])
    y_test = test_df[target_col]

    param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 150],
        'booster': ['gbtree', 'dart']
    }

    st.write("Performing hyperparameter tuning...")
    best_params = _custom_grid_search(X_train, y_train, X_test, y_test, param_grid, progress_bar)

    st.write("Training the best model and making predictions...")
    best_model = XGBRegressor(**best_params)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # mean_squared_error returns the squared error; take the root so the
    # number shown under the "RMSE" label really is a root-mean-squared error.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    st.write(f"RMSE: {rmse}")
    st.write(f"MAE: {mae}")

    # One trace per series: (label, x values, y values, colour, dash style).
    series = (
        ('Train', train_df[date_col], train_df[target_col], 'blue', None),
        ('Test', test_df[date_col], test_df[target_col], 'orange', None),
        ('Predicted', test_df[date_col], y_pred, 'green', 'dot'),
    )
    fig = go.Figure()
    for label, dates, prices, color, dash in series:
        fig.add_trace(go.Scatter(
            x=dates,
            y=prices,
            mode='lines',
            name=f"{label} Data",
            line=dict(color=color, dash=dash)
        ))
    fig.update_layout(title="Train, Test, and Predicted Data", xaxis_title="Date", yaxis_title="Modal Price (Rs./Quintal)", template="plotly_white")
    st.plotly_chart(fig, use_container_width=True)

    return best_params


def train_and_evaluate(df):
    """Tune and evaluate a model on ``create_forecasting_features`` output.

    Creates a fresh Streamlit progress bar, splits the data at
    ``'2024-01-01'`` and returns the best hyper-parameters found.
    """
    return _train_and_evaluate_generic(
        df,
        feature_fn=create_forecasting_features,
        split_date='2024-01-01',
        progress_bar=st.progress(0),
    )


def train_and_evaluate_1m(df):
    """Tune and evaluate a model on ``create_forecasting_features_1m`` output.

    Creates a fresh Streamlit progress bar, splits the data at 2023-01-01
    and returns the best hyper-parameters found.
    """
    return _train_and_evaluate_generic(
        df,
        feature_fn=create_forecasting_features_1m,
        split_date=pd.to_datetime('2023-01-01'),
        progress_bar=st.progress(0),
    )


def train_and_evaluate_3m(df):
    """Tune and evaluate a model on ``create_forecasting_features_3m`` output.

    Creates a fresh Streamlit progress bar, splits the data at 2023-01-01
    and returns the best hyper-parameters found.
    """
    return _train_and_evaluate_generic(
        df,
        feature_fn=create_forecasting_features_3m,
        split_date=pd.to_datetime('2023-01-01'),
        progress_bar=st.progress(0),
    )


# Bookkeeping keys that travel with a stored hyper-parameter record but are
# not model settings. 'filter_key' and 'last_updated' are written when the
# record is saved; '_id' is presumably added by the document store.
_PARAM_RECORD_METADATA_KEYS = frozenset({'_id', 'filter_key', 'last_updated'})


def _forecast_next_days(df, best_params, key, horizon_days, feature_fn):
    """Fit on all of ``df`` and predict the next ``horizon_days`` daily prices.

    Args:
        df: Historical frame with a ``'Reported Date'`` column and a
            ``'Modal Price (Rs./Quintal)'`` target column.
        best_params: Mapping of model settings; bookkeeping keys listed in
            ``_PARAM_RECORD_METADATA_KEYS`` are ignored.
        key: Identifier forwarded to ``download_button``.
        horizon_days: Number of consecutive future days to predict.
        feature_fn: Feature builder applied to history and future rows together.
    """
    date_col = 'Reported Date'
    target_col = 'Modal Price (Rs./Quintal)'

    last_date = df[date_col].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=horizon_days)

    # Features are built over history and future rows in one pass, then the
    # frame is split back apart at the last observed date.
    full_df = pd.concat([df, pd.DataFrame({date_col: future_dates})], ignore_index=True)
    full_df = feature_fn(full_df)
    original_df = full_df[full_df[date_col] <= last_date].copy()
    future_df = full_df[full_df[date_col] > last_date].copy()

    X_train = original_df.drop(columns=[target_col, date_col], errors='ignore')
    y_train = original_df[target_col]
    X_future = future_df.drop(columns=[target_col, date_col], errors='ignore')

    # A stored record may be passed in whole; keep only real model settings
    # so bookkeeping fields are not forwarded to XGBoost as parameters.
    model_params = {
        name: value
        for name, value in {**best_params}.items()
        if name not in _PARAM_RECORD_METADATA_KEYS
    }
    model = XGBRegressor(**model_params)
    model.fit(X_train, y_train)
    future_df[target_col] = model.predict(X_future)

    plot_data(original_df, future_df, last_date, model, horizon_days)
    download_button(future_df, key)


def forecast_next_14_days(df, _best_params, key):
    """Forecast 14 days ahead using ``create_forecasting_features``.

    Fits an ``XGBRegressor`` with ``_best_params`` on every row of ``df``,
    then plots the forecast and offers it for download under ``key``.
    """
    _forecast_next_days(df, _best_params, key, 14, create_forecasting_features)


def forecast_next_30_days(df, _best_params, key):
    """Forecast 30 days ahead using ``create_forecasting_features_1m``.

    Fits an ``XGBRegressor`` with ``_best_params`` on every row of ``df``,
    then plots the forecast and offers it for download under ``key``.
    """
    _forecast_next_days(df, _best_params, key, 30, create_forecasting_features_1m)


def forecast_next_90_days(df, _best_params, key):
    """Forecast 90 days ahead using ``create_forecasting_features_3m``.

    Fits an ``XGBRegressor`` with ``_best_params`` on every row of ``df``,
    then plots the forecast and offers it for download under ``key``.
    """
    _forecast_next_days(df, _best_params, key, 90, create_forecasting_features_3m)


def train_and_forecast(df, filter_key, days):
    """Tune a model for the requested horizon, persist its settings, forecast.

    For ``days`` equal to 14, 30 or 90 the matching tuning routine is run,
    the resulting hyper-parameters are upserted (together with ``filter_key``
    and an ISO timestamp) into the matching collection, and the matching
    forecast routine is invoked. Nothing happens when ``df`` is ``None`` or
    ``days`` is any other value.
    """
    cols = get_collections()
    if df is None:
        return

    # (horizon, tuning routine, collection name, forecasting routine)
    pipelines = (
        (14, train_and_evaluate, 'best_params_collection', forecast_next_14_days),
        (30, train_and_evaluate_1m, 'best_params_collection_1m', forecast_next_30_days),
        (90, train_and_evaluate_3m, 'best_params_collection_3m', forecast_next_90_days),
    )
    for horizon, tune, collection_name, run_forecast in pipelines:
        if days == horizon:
            best_params = tune(df)
            collection = cols[collection_name]
            document = {
                **best_params,
                'filter_key': filter_key,
                'last_updated': pd.Timestamp.now().isoformat(),
            }
            collection.replace_one({'filter_key': filter_key}, document, upsert=True)
            run_forecast(df, best_params, filter_key)
            break


def get_best_params(filter_key, collection):
    """Look up the record stored for ``filter_key`` in ``collection``.

    Returns whatever ``collection.find_one`` yields for the query
    ``{"filter_key": filter_key}``, unchanged.
    """
    query = {"filter_key": filter_key}
    return collection.find_one(query)


def forecast(df, filter_key, days):
    """Forecast with previously stored settings for a 14, 30 or 90 day horizon.

    The record for ``filter_key`` is fetched from the collection matching
    ``days``. If one exists, its ``last_updated`` value is shown and the
    whole record is handed to the matching forecast routine as its
    hyper-parameter mapping; otherwise a warning asks the user to train first.
    Other values of ``days`` do nothing.
    """
    cols = get_collections()

    # (horizon, collection name, forecasting routine)
    horizons = (
        (14, 'best_params_collection', forecast_next_14_days),
        (30, 'best_params_collection_1m', forecast_next_30_days),
        (90, 'best_params_collection_3m', forecast_next_90_days),
    )
    # Each horizon is checked independently, so the loop never breaks early.
    for horizon, collection_name, run_forecast in horizons:
        if not days == horizon:
            continue
        record = get_best_params(filter_key, cols[collection_name])
        if not record:
            st.warning("⚠️ Model is not trained yet. Please train the model first.")
            continue
        st.info(f"ℹ️ The model was trained on {record['last_updated']}.")
        run_forecast(df, record, filter_key)