Spaces:

ThejasRao
/

agripredict

Sleeping

App Files Files Community

agripredict / src /agri_predict /models.py

ThejasRao

Fix: Readme

ecb9d4e 3 months ago

raw

history blame contribute delete

10.2 kB

	import streamlit as st
	import pandas as pd
	from xgboost import XGBRegressor
	from sklearn.metrics import mean_squared_error, mean_absolute_error
	from sklearn.preprocessing import MinMaxScaler
	from .features import (
	create_forecasting_features,
	create_forecasting_features_1m,
	create_forecasting_features_3m,
	)
	from .plotting import plot_data, download_button
	from .config import get_collections


	def _custom_grid_search(X_train, y_train, X_test, y_test, param_grid, progress_bar):
	    """Try every combination in ``param_grid`` and return the best-scoring one.

	    A single ``XGBRegressor`` is reconfigured for each combination of
	    ``learning_rate``, ``max_depth``, ``n_estimators`` and ``booster``,
	    fitted on ``(X_train, y_train)`` and scored with its ``score`` method on
	    ``(X_test, y_test)``.  Only a strictly higher score replaces the current
	    best, so the first combination reaching the top score is kept.

	    After each fit ``progress_bar.progress`` is called with the integer
	    percentage of combinations processed so far.

	    Returns:
	        A dict with the four tuned keys, or ``None`` if no combination
	        scored above the initial ``-inf``.
	    """
	    regressor = XGBRegressor()

	    rates = param_grid['learning_rate']
	    depths = param_grid['max_depth']
	    tree_counts = param_grid['n_estimators']
	    boosters = param_grid['booster']
	    total = len(rates) * len(depths) * len(tree_counts) * len(boosters)

	    done = 0
	    top_score = float('-inf')
	    winner = None

	    for rate in rates:
	        for depth in depths:
	            for tree_count in tree_counts:
	                for booster_name in boosters:
	                    candidate = {
	                        'learning_rate': rate,
	                        'max_depth': depth,
	                        'n_estimators': tree_count,
	                        'booster': booster_name
	                    }
	                    regressor.set_params(**candidate)
	                    regressor.fit(X_train, y_train)
	                    candidate_score = regressor.score(X_test, y_test)
	                    if candidate_score > top_score:
	                        top_score, winner = candidate_score, candidate
	                    done += 1
	                    progress_bar.progress(int((done / total) * 100))
	    return winner


	def _train_and_evaluate_generic(df, feature_fn, split_date, progress_bar):
	    """Tune, fit and evaluate an XGBoost price model on a date-based split.

	    ``feature_fn(df)`` builds the feature frame.  Rows whose
	    ``'Reported Date'`` is before ``split_date`` form the training set and
	    the remaining rows form the test set.  Hyperparameters are picked by
	    ``_custom_grid_search``, a fresh model is fitted with them, RMSE and MAE
	    on the test set are written to the Streamlit page, and a line chart of
	    train, test and predicted prices is rendered.

	    Args:
	        df: Input frame passed to ``feature_fn``; the result must contain
	            ``'Reported Date'`` and ``'Modal Price (Rs./Quintal)'``.
	        feature_fn: Callable taking ``df`` and returning the feature frame.
	        split_date: Boundary compared against ``'Reported Date'``.
	        progress_bar: Object whose ``progress(int)`` method is called by the
	            grid search.

	    Returns:
	        The parameter dict returned by ``_custom_grid_search``.
	    """
	    date_col = 'Reported Date'
	    target_col = 'Modal Price (Rs./Quintal)'

	    df = feature_fn(df)
	    train_df = df[df[date_col] < split_date]
	    test_df = df[df[date_col] >= split_date]

	    X_train = train_df.drop(columns=[target_col, date_col])
	    y_train = train_df[target_col]
	    X_test = test_df.drop(columns=[target_col, date_col])
	    y_test = test_df[target_col]

	    param_grid = {
	        'learning_rate': [0.01, 0.1, 0.2],
	        'max_depth': [3, 5, 7],
	        'n_estimators': [50, 100, 150],
	        'booster': ['gbtree', 'dart']
	    }

	    # NOTE(review): the search is scored on the same test split that the
	    # metrics below are reported on, so those metrics are optimistic.
	    st.write("Performing hyperparameter tuning...")
	    best_params = _custom_grid_search(X_train, y_train, X_test, y_test, param_grid, progress_bar)

	    st.write("Training the best model and making predictions...")
	    best_model = XGBRegressor(**best_params)
	    best_model.fit(X_train, y_train)
	    y_pred = best_model.predict(X_test)

	    # mean_squared_error yields the MSE; take the square root so the value
	    # shown under the "RMSE" label really is the root mean squared error.
	    rmse = mean_squared_error(y_test, y_pred) ** 0.5
	    mae = mean_absolute_error(y_test, y_pred)
	    st.write(f"RMSE: {rmse}")
	    st.write(f"MAE: {mae}")

	    # One (dates, prices) series per trace; predictions share the test dates.
	    series = [
	        ('Train', train_df[date_col], train_df[target_col], 'blue', None),
	        ('Test', test_df[date_col], test_df[target_col], 'orange', None),
	        ('Predicted', test_df[date_col], y_pred, 'green', 'dot'),
	    ]

	    import plotly.graph_objects as go
	    fig = go.Figure()
	    for plot_type, dates, prices, color, dash in series:
	        fig.add_trace(go.Scatter(
	            x=dates,
	            y=prices,
	            mode='lines',
	            name=f"{plot_type} Data",
	            line=dict(color=color, dash=dash)
	        ))
	    fig.update_layout(
	        title="Train, Test, and Predicted Data",
	        xaxis_title="Date",
	        yaxis_title="Modal Price (Rs./Quintal)",
	        template="plotly_white",
	    )
	    st.plotly_chart(fig, use_container_width=True)

	    return best_params


	def train_and_evaluate(df):
	    """Tune and evaluate the model built on ``create_forecasting_features``.

	    Creates a Streamlit progress bar starting at 0 and delegates to
	    ``_train_and_evaluate_generic`` with the split boundary ``'2024-01-01'``.
	    Returns whatever that helper returns.
	    """
	    bar = st.progress(0)
	    best = _train_and_evaluate_generic(
	        df,
	        create_forecasting_features,
	        '2024-01-01',
	        bar,
	    )
	    return best


	def train_and_evaluate_1m(df):
	    """Tune and evaluate the model built on ``create_forecasting_features_1m``.

	    Creates a Streamlit progress bar starting at 0 and delegates to
	    ``_train_and_evaluate_generic`` with the split boundary
	    ``pd.to_datetime('2023-01-01')``.  Returns whatever that helper returns.
	    """
	    bar = st.progress(0)
	    boundary = pd.to_datetime('2023-01-01')
	    best = _train_and_evaluate_generic(df, create_forecasting_features_1m, boundary, bar)
	    return best


	def train_and_evaluate_3m(df):
	    """Tune and evaluate the model built on ``create_forecasting_features_3m``.

	    Creates a Streamlit progress bar starting at 0 and delegates to
	    ``_train_and_evaluate_generic`` with the split boundary
	    ``pd.to_datetime('2023-01-01')``.  Returns whatever that helper returns.
	    """
	    bar = st.progress(0)
	    boundary = pd.to_datetime('2023-01-01')
	    best = _train_and_evaluate_generic(df, create_forecasting_features_3m, boundary, bar)
	    return best


	def _forecast_next_days(df, best_params, key, horizon, feature_fn):
	    """Fit on all of ``df`` and predict prices for the next ``horizon`` days.

	    ``horizon`` daily dates following the latest ``'Reported Date'`` are
	    appended to ``df``, ``feature_fn`` is applied to the combined frame, and
	    the result is split back into historical rows (used for fitting) and
	    future rows (used for prediction).  The predictions are written into the
	    future rows' ``'Modal Price (Rs./Quintal)'`` column, then handed to
	    ``plot_data`` and ``download_button``.

	    Args:
	        df: Historical frame containing ``'Reported Date'``.
	        best_params: Keyword arguments for ``XGBRegressor``.
	        key: Passed through to ``download_button``.
	        horizon: Number of future days to generate and predict.
	        feature_fn: Feature builder applied to history plus future rows.
	    """
	    date_col = 'Reported Date'
	    target_col = 'Modal Price (Rs./Quintal)'

	    last_date = df[date_col].max()
	    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=horizon)
	    future_df = pd.DataFrame({date_col: future_dates})

	    # Features are built over history and future together, then split again.
	    full_df = pd.concat([df, future_df], ignore_index=True)
	    full_df = feature_fn(full_df)
	    original_df = full_df[full_df[date_col] <= last_date].copy()
	    future_df = full_df[full_df[date_col] > last_date].copy()

	    # errors='ignore' tolerates a feature frame lacking either column.
	    X_train = original_df.drop(columns=[target_col, date_col], errors='ignore')
	    y_train = original_df[target_col]
	    X_future = future_df.drop(columns=[target_col, date_col], errors='ignore')

	    model = XGBRegressor(**best_params)
	    model.fit(X_train, y_train)
	    future_df[target_col] = model.predict(X_future)

	    plot_data(original_df, future_df, last_date, model, horizon)
	    download_button(future_df, key)


	def forecast_next_14_days(df, _best_params, key):
	    """Forecast 14 days ahead using ``create_forecasting_features``."""
	    _forecast_next_days(df, _best_params, key, 14, create_forecasting_features)


	def forecast_next_30_days(df, _best_params, key):
	    """Forecast 30 days ahead using ``create_forecasting_features_1m``."""
	    _forecast_next_days(df, _best_params, key, 30, create_forecasting_features_1m)


	def forecast_next_90_days(df, _best_params, key):
	    """Forecast 90 days ahead using ``create_forecasting_features_3m``."""
	    _forecast_next_days(df, _best_params, key, 90, create_forecasting_features_3m)


	def train_and_forecast(df, filter_key, days):
	    """Train for the requested horizon, store the best parameters, then forecast.

	    For ``days`` equal to 14, 30 or 90 the matching ``train_and_evaluate*``
	    function is run, its result is upserted (together with ``filter_key``
	    and an ISO timestamp under ``'last_updated'``) into the matching
	    collection, and the matching ``forecast_next_*`` function is called.
	    Nothing is trained when ``df`` is ``None`` or ``days`` is any other
	    value; ``get_collections`` is called in every case.
	    """
	    cols = get_collections()
	    if df is None:
	        return

	    # (horizon, trainer, collection name, forecaster), checked in order.
	    pipelines = (
	        (14, train_and_evaluate, 'best_params_collection', forecast_next_14_days),
	        (30, train_and_evaluate_1m, 'best_params_collection_1m', forecast_next_30_days),
	        (90, train_and_evaluate_3m, 'best_params_collection_3m', forecast_next_90_days),
	    )
	    for horizon, trainer, collection_name, forecaster in pipelines:
	        if days == horizon:
	            best_params = trainer(df)
	            collection = cols[collection_name]
	            document = {
	                **best_params,
	                'filter_key': filter_key,
	                'last_updated': pd.Timestamp.now().isoformat(),
	            }
	            collection.replace_one({'filter_key': filter_key}, document, upsert=True)
	            forecaster(df, best_params, filter_key)
	            break


	def get_best_params(filter_key, collection):
	    """Return the result of ``collection.find_one`` for the given ``filter_key``.

	    The lookup query is ``{"filter_key": filter_key}``; whatever
	    ``find_one`` returns is passed back unchanged.
	    """
	    return collection.find_one({"filter_key": filter_key})


	def forecast(df, filter_key, days):
	    """Forecast ``days`` ahead using previously stored hyperparameters.

	    Looks up the record for ``filter_key`` in the collection matching the
	    horizon (14, 30 or 90 days).  When a record exists, an info message with
	    its ``'last_updated'`` value is shown and the matching
	    ``forecast_next_*`` function is run with the stored hyperparameters;
	    otherwise a warning asks the user to train the model first.  Any other
	    value of ``days`` does nothing beyond calling ``get_collections``.

	    Args:
	        df: Historical frame forwarded to the forecasting function.
	        filter_key: Key identifying the stored record; also forwarded to the
	            forecasting function.
	        days: Forecast horizon; 14, 30 and 90 are handled.
	    """
	    cols = get_collections()

	    # (horizon, collection name, forecaster), checked in order.
	    horizons = (
	        (14, 'best_params_collection', forecast_next_14_days),
	        (30, 'best_params_collection_1m', forecast_next_30_days),
	        (90, 'best_params_collection_3m', forecast_next_90_days),
	    )
	    # Bookkeeping fields kept alongside the hyperparameters: 'filter_key'
	    # and 'last_updated' are written by train_and_forecast; '_id' is assumed
	    # to be the identifier the database adds to stored documents.  None of
	    # them is a model hyperparameter, so they must not reach XGBRegressor.
	    metadata_keys = ('_id', 'filter_key', 'last_updated')

	    for horizon, collection_name, run_forecast in horizons:
	        if days == horizon:
	            record = get_best_params(filter_key, cols[collection_name])
	            if not record:
	                st.warning("⚠️ Model is not trained yet. Please train the model first.")
	                return
	            st.info(f"ℹ️ The model was trained on {record['last_updated']}.")
	            best_params = {
	                name: value
	                for name, value in record.items()
	                if name not in metadata_keys
	            }
	            run_forecast(df, best_params, filter_key)
	            return