# agripredict-analysis / train_catboost.py
# Uploaded by adsurkasur — commit b2c7817 (verified): "Update AgriPredict Analysis Service"
"""
CatBoost Model Training Script for AgriPredict
This script trains the CatBoost model using the existing training dataset.
Includes comprehensive accuracy metrics: MAE, RMSE, MAPE, Bias, MASE, R-Squared
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os
from typing import Dict, Any, Tuple, Optional
import logging
import json
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class ForecastMetrics:
    """Comprehensive forecast accuracy metrics calculator.

    All formulas are computed directly with numpy (the sklearn helpers the
    module imports are one-line formulas), so this class has no runtime
    dependency on scikit-learn. Results match sklearn's
    mean_absolute_error, mean_squared_error and r2_score, including
    r2_score's conventions for degenerate inputs (constant target or
    fewer than two samples).
    """

    @staticmethod
    def calculate_all_metrics(y_true: np.ndarray, y_pred: np.ndarray,
                              y_train: Optional[np.ndarray] = None) -> Dict[str, float]:
        """
        Calculate all forecast accuracy metrics.

        Args:
            y_true: Actual values
            y_pred: Predicted values
            y_train: Training data (for MASE calculation)

        Returns:
            Dictionary with keys 'mae', 'rmse', 'mape', 'bias', 'mase',
            'r_squared'. MAPE is None when every actual is zero; MASE is
            None when no usable training history is given (or the naive
            error is zero). All values are NaN when no finite
            (y_true, y_pred) pairs remain after filtering.
        """
        y_true = np.array(y_true).flatten()
        y_pred = np.array(y_pred).flatten()
        # Keep only positions where BOTH series are finite (drops NaN/inf pairs).
        mask = np.isfinite(y_true) & np.isfinite(y_pred)
        y_true = y_true[mask]
        y_pred = y_pred[mask]
        if len(y_true) == 0:
            return {
                'mae': np.nan, 'rmse': np.nan, 'mape': np.nan,
                'bias': np.nan, 'mase': np.nan, 'r_squared': np.nan
            }
        # Signed forecast errors; reused by MAE, RMSE, MAPE, bias and R².
        errors = y_pred - y_true
        # MAE - Mean Absolute Error
        mae = float(np.mean(np.abs(errors)))
        # RMSE - Root Mean Squared Error
        rmse = float(np.sqrt(np.mean(errors ** 2)))
        # MAPE - Mean Absolute Percentage Error (undefined when every actual is 0)
        non_zero_mask = y_true != 0
        if np.any(non_zero_mask):
            mape = np.mean(np.abs(errors[non_zero_mask] / y_true[non_zero_mask])) * 100
        else:
            mape = np.nan
        # Bias - Mean Forecast Error (positive => systematic over-forecasting)
        bias = float(np.mean(errors))
        # MASE - MAE scaled by the in-sample naive (lag-1) forecast error
        if y_train is not None and len(y_train) > 1:
            y_train = np.array(y_train).flatten()
            # Naive forecast error (one-step ahead)
            naive_errors = np.abs(np.diff(y_train))
            scaling_factor = np.mean(naive_errors)
            mase = mae / scaling_factor if scaling_factor > 0 else np.nan
        else:
            mase = np.nan
        # R-Squared, following sklearn.metrics.r2_score conventions:
        #   fewer than 2 samples -> NaN; constant target -> 1.0 if the fit
        #   is perfect, else 0.0.
        if len(y_true) < 2:
            r_squared = np.nan
        else:
            ss_res = float(np.sum(errors ** 2))
            ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
            if ss_tot > 0:
                r_squared = 1.0 - ss_res / ss_tot
            else:
                r_squared = 1.0 if ss_res == 0 else 0.0
        return {
            'mae': round(float(mae), 4),
            'rmse': round(float(rmse), 4),
            'mape': round(float(mape), 4) if not np.isnan(mape) else None,
            'bias': round(float(bias), 4),
            'mase': round(float(mase), 4) if not np.isnan(mase) else None,
            'r_squared': round(float(r_squared), 4)
        }
class CatBoostTrainer:
    """CatBoost model trainer for agricultural demand forecasting.

    Typical workflow: load_training_data -> prepare_features ->
    train_model -> evaluate_model -> save_model; for inference later,
    load_model -> predict.
    """

    def __init__(self):
        self.model = None            # trained CatBoostRegressor (set by train_model/load_model)
        self.feature_names = None    # ordered feature columns used at fit time
        self.metrics = None          # last evaluation metrics dict (from ForecastMetrics)
        self.training_config = None  # CatBoost parameters used for the last fit

    def load_training_data(self, filepath: Optional[str] = None) -> pd.DataFrame:
        """
        Load training data from CSV file.

        Args:
            filepath: Path to training data CSV. When None, two default
                repo-relative locations are probed in order.

        Returns:
            DataFrame with training data ('date' parsed to datetime when
            present). Falls back to synthetic data when no file is found.
        """
        if filepath is None:
            # Default to the existing training data; probe two repo layouts.
            filepath = os.path.join(os.path.dirname(__file__), '..', 'data', 'catboost_training_data.csv')
            if not os.path.exists(filepath):
                filepath = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'catboost_training_data.csv')
        if not os.path.exists(filepath):
            logger.warning(f"Training data not found at {filepath}, generating synthetic data")
            return self.generate_artificial_data(n_samples=5000)
        logger.info(f"Loading training data from {filepath}")
        df = pd.read_csv(filepath)
        # Parse date column so downstream time features can be derived.
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
        logger.info(f"Loaded {len(df)} samples with {len(df.columns)} features")
        return df

    def generate_artificial_data(self, n_samples: int = 5000) -> pd.DataFrame:
        """
        Generate artificial agricultural data for training.

        Args:
            n_samples: Number of daily samples to generate (from 2023-01-01)

        Returns:
            DataFrame with artificial agricultural data, plus lag, rolling
            and price-change features; warm-up rows containing NaN from the
            lags/windows are dropped.
        """
        logger.info(f"Generating {n_samples} artificial data samples")
        # Consecutive daily dates starting 2023-01-01.
        start_date = datetime(2023, 1, 1)
        dates = [start_date + timedelta(days=i) for i in range(n_samples)]
        np.random.seed(42)  # For reproducible results
        data = []
        for date in dates:
            # Seasonal patterns: ±30% demand swing over the year.
            day_of_year = date.timetuple().tm_yday
            seasonal_factor = 1 + 0.3 * np.sin(2 * np.pi * day_of_year / 365)
            # Base demand with seasonal variation
            base_quantity = np.random.normal(100, 20) * seasonal_factor
            # Price influenced by season and demand
            base_price = 25 + 5 * np.sin(2 * np.pi * day_of_year / 365)
            price_noise = np.random.normal(0, 2)
            price = base_price + price_noise
            # Weak negative price elasticity couples quantity to price.
            quantity_noise = np.random.normal(0, 15)
            quantity = base_quantity + quantity_noise - 0.1 * (price - 25)
            # Ensure positive values
            quantity = max(1, quantity)
            price = max(5, price)
            data.append({
                'date': date,
                'quantity': round(quantity, 2),
                'price': round(price, 2),
                'day_of_week': date.weekday(),
                'month': date.month,
                'day_of_month': date.day,
                'quarter': (date.month - 1) // 3 + 1,
                'is_weekend': 1 if date.weekday() >= 5 else 0,
                'season': self._get_season(date.month)
            })
        df = pd.DataFrame(data)
        # Add lag features
        for lag in [1, 7, 14, 30]:
            df[f'price_lag_{lag}'] = df['price'].shift(lag)
            df[f'quantity_lag_{lag}'] = df['quantity'].shift(lag)
        # Add rolling statistics
        for window in [7, 14, 30]:
            df[f'price_rolling_mean_{window}'] = df['price'].rolling(window).mean()
            df[f'price_rolling_std_{window}'] = df['price'].rolling(window).std()
            df[f'quantity_rolling_mean_{window}'] = df['quantity'].rolling(window).mean()
        # Add price change features
        df['price_change'] = df['price'].pct_change()
        df['price_change_7d'] = df['price'].pct_change(7)
        # Drop warm-up rows made NaN by the lags / rolling windows above.
        df = df.dropna().reset_index(drop=True)
        logger.info(f"Generated dataset with {len(df)} samples and {len(df.columns)} features")
        return df

    def _get_season(self, month: int) -> str:
        """Map a calendar month (1-12) to a meteorological season name."""
        if month in [12, 1, 2]:
            return 'winter'
        elif month in [3, 4, 5]:
            return 'spring'
        elif month in [6, 7, 8]:
            return 'summer'
        else:
            return 'fall'

    def prepare_features(self, df: pd.DataFrame, target_col: str = 'target_quantity') -> tuple:
        """
        Prepare features for training.

        Args:
            df: Input DataFrame
            target_col: Name of target column; falls back to 'price' (or
                the last column) when absent.

        Returns:
            Tuple of (X, y, feature_names, categorical_features)
        """
        # Columns that must never leak into the feature matrix.
        exclude_cols = ['date', 'target_quantity', 'product_name', 'region_name', 'product_category']
        # Check if target column exists
        if target_col not in df.columns:
            # Fallback to price if target_quantity doesn't exist
            target_col = 'price' if 'price' in df.columns else df.columns[-1]
            logger.warning(f"target_quantity not found, using {target_col} as target")
        # Define feature columns
        feature_cols = [col for col in df.columns if col not in exclude_cols and col != target_col]
        # Identify categorical features
        categorical_features = []
        for col in feature_cols:
            if df[col].dtype == 'object' or df[col].dtype.name == 'category':
                categorical_features.append(col)
        # Remove categorical features for now (CatBoost handles them, but we'll use numeric only)
        feature_cols = [col for col in feature_cols if col not in categorical_features]
        # Prepare features and target
        X = df[feature_cols].copy()
        y = df[target_col].copy()
        # Impute any remaining NaNs with per-column medians.
        X = X.fillna(X.median())
        logger.info(f"Prepared {len(feature_cols)} features for training, target: {target_col}")
        return X, y, feature_cols, categorical_features

    def train_model(self, X_train, y_train, X_val=None, y_val=None, **kwargs) -> "CatBoostRegressor":
        """
        Train CatBoost model.

        Args:
            X_train: Training features
            y_train: Training target
            X_val: Validation features (optional)
            y_val: Validation target (optional)
            **kwargs: Additional CatBoost parameters (override the defaults)

        Returns:
            Trained CatBoost model (also stored on self.model)
        """
        # Default parameters optimized for demand forecasting
        default_params = {
            'iterations': 1000,
            'learning_rate': 0.05,
            'depth': 8,
            'loss_function': 'MAE',
            'eval_metric': 'MAE',
            'random_seed': 42,
            'verbose': 100,
            'early_stopping_rounds': 100,
            'l2_leaf_reg': 3,
            'border_count': 128,
            'thread_count': -1
        }
        # Update with custom parameters
        default_params.update(kwargs)
        self.training_config = default_params.copy()
        # Create model
        model = CatBoostRegressor(**default_params)
        # Prepare data
        train_pool = Pool(X_train, y_train)
        if X_val is not None and y_val is not None:
            # A validation pool enables early stopping / best-iteration selection.
            val_pool = Pool(X_val, y_val)
            model.fit(train_pool, eval_set=val_pool, use_best_model=True)
        else:
            model.fit(train_pool)
        self.model = model
        self.feature_names = list(X_train.columns)
        logger.info(f"Trained CatBoost model with {model.tree_count_} trees")
        return model

    def evaluate_model(self, X_test, y_test, y_train=None) -> Dict[str, float]:
        """
        Evaluate model performance with comprehensive metrics.

        Args:
            X_test: Test features
            y_test: Test target
            y_train: Training target (for MASE calculation)

        Returns:
            Dictionary with evaluation metrics (also stored on self.metrics)

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model not trained yet")
        # Make predictions
        y_pred = self.model.predict(X_test)
        # Calculate all metrics
        self.metrics = ForecastMetrics.calculate_all_metrics(
            y_true=y_test,
            y_pred=y_pred,
            y_train=y_train
        )
        logger.info("Model Evaluation Metrics:")
        logger.info(f" MAE: {self.metrics['mae']}")
        logger.info(f" RMSE: {self.metrics['rmse']}")
        logger.info(f" MAPE: {self.metrics['mape']}%")
        logger.info(f" Bias: {self.metrics['bias']}")
        logger.info(f" MASE: {self.metrics['mase']}")
        logger.info(f" R²: {self.metrics['r_squared']}")
        return self.metrics

    def save_model(self, filepath: str):
        """
        Save trained model to file with metadata.

        Writes three artifacts: a joblib pickle (model + metadata), the
        CatBoost native .cbm file, and a JSON metrics summary.

        Args:
            filepath: Path to save the model (conventionally *.pkl)

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model not trained yet")
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filepath) if os.path.dirname(filepath) else '.', exist_ok=True)
        # Prepare model data with metadata
        model_data = {
            'model': self.model,
            'feature_names': self.feature_names,
            'metrics': self.metrics,
            'training_config': self.training_config,
            'training_date': datetime.now().isoformat(),
            'version': '2.0'
        }
        # Save model using joblib
        joblib.dump(model_data, filepath)
        # Derive sibling paths from the stem. The previous
        # str.replace('.pkl', ...) silently reused — and overwrote — the
        # joblib file whenever `filepath` did not end in '.pkl'.
        base, _ = os.path.splitext(filepath)
        # Also save the CatBoost native format for faster loading
        native_path = base + '.cbm'
        self.model.save_model(native_path)
        # Save metrics to JSON for easy access
        metrics_path = base + '_metrics.json'
        with open(metrics_path, 'w') as f:
            json.dump({
                'metrics': self.metrics,
                'feature_names': self.feature_names,
                'training_date': model_data['training_date'],
                'training_config': {k: str(v) for k, v in (self.training_config or {}).items()}
            }, f, indent=2)
        logger.info(f"Model saved to {filepath}")
        logger.info(f"Native model saved to {native_path}")
        logger.info(f"Metrics saved to {metrics_path}")

    def load_model(self, filepath: str):
        """
        Load trained model from file.

        Args:
            filepath: Path to the saved joblib model

        Raises:
            FileNotFoundError: If the model file does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Model file not found: {filepath}")
        # Load model
        model_data = joblib.load(filepath)
        self.model = model_data['model']
        self.feature_names = model_data.get('feature_names', [])
        self.metrics = model_data.get('metrics', {})
        self.training_config = model_data.get('training_config', {})
        logger.info(f"Model loaded from {filepath}")
        logger.info(f"Model metrics: {self.metrics}")

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        """
        Make predictions with trained model.

        Args:
            features: Input features. Columns are re-ordered to the
                training feature order; missing columns are filled with 0
                on a copy — the caller's DataFrame is never mutated.

        Returns:
            Predictions array

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise ValueError("Model not trained or loaded yet")
        # Ensure features are in correct order
        if self.feature_names:
            missing_features = [f for f in self.feature_names if f not in features.columns]
            if missing_features:
                logger.warning(f"Missing features: {missing_features[:5]}...")
                # Fill defaults on a copy so the caller's frame stays untouched
                # (previously this assigned into the caller's DataFrame).
                features = features.copy()
                for f in missing_features:
                    features[f] = 0
            features = features[self.feature_names]
        return self.model.predict(features)

    def get_feature_importance(self, top_n: int = 20) -> Dict[str, float]:
        """
        Get feature importance from trained model.

        Args:
            top_n: Number of top features to return

        Returns:
            Dictionary of feature names and importance scores, sorted
            descending by importance.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model not trained yet")
        importance = self.model.get_feature_importance()
        feature_importance = dict(zip(self.feature_names, importance))
        # Sort by importance
        sorted_importance = dict(sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:top_n])
        return sorted_importance
def main():
    """Run the full training pipeline and return the fitted trainer.

    Loads (or synthesizes) data, builds features, performs a time-ordered
    train/validation/test split, trains CatBoost, evaluates on the held-out
    tail, saves the model artifacts, and prints a console summary.
    """
    logger.info("=" * 60)
    logger.info("Starting CatBoost Model Training for AgriPredict")
    logger.info("=" * 60)
    # Initialize trainer
    trainer = CatBoostTrainer()
    # Load training data (use existing CSV or generate synthetic)
    df = trainer.load_training_data()
    logger.info(f"Dataset shape: {df.shape}")
    logger.info(f"Columns: {list(df.columns)[:10]}...")
    # Prepare features
    X, y, feature_names, categorical_features = trainer.prepare_features(df)
    logger.info(f"Feature matrix shape: {X.shape}")
    logger.info(f"Target shape: {y.shape}")
    logger.info(f"Target statistics: mean={y.mean():.2f}, std={y.std():.2f}, min={y.min():.2f}, max={y.max():.2f}")
    # Time-ordered split (no shuffling — this is time-series data):
    # reserve the last 20% for testing.
    split_idx = int(len(X) * 0.8)
    X_train_full, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train_full, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    # The last 20% of the remaining training rows becomes the validation set.
    val_split_idx = int(len(X_train_full) * 0.8)
    X_train, X_val = X_train_full.iloc[:val_split_idx], X_train_full.iloc[val_split_idx:]
    y_train, y_val = y_train_full.iloc[:val_split_idx], y_train_full.iloc[val_split_idx:]
    logger.info(f"Train set: {len(X_train)} samples")
    logger.info(f"Validation set: {len(X_val)} samples")
    logger.info(f"Test set: {len(X_test)} samples")
    # Train model
    logger.info("-" * 40)
    logger.info("Training CatBoost model...")
    model = trainer.train_model(X_train, y_train, X_val, y_val)
    # Evaluate model on test set
    logger.info("-" * 40)
    logger.info("Evaluating model on test set...")
    metrics = trainer.evaluate_model(X_test, y_test, y_train=y_train)
    # Get feature importance
    logger.info("-" * 40)
    logger.info("Top 10 Feature Importance:")
    importance = trainer.get_feature_importance(top_n=10)
    for feat, imp in importance.items():
        logger.info(f" {feat}: {imp:.2f}")
    # Save model
    model_dir = os.path.join(os.path.dirname(__file__), 'models')
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "catboost_model.pkl")
    trainer.save_model(model_path)
    logger.info("=" * 60)
    logger.info("Training completed successfully!")
    logger.info(f"Model saved to: {model_path}")
    logger.info("=" * 60)
    # Print final summary
    print("\n" + "=" * 60)
    print("TRAINING SUMMARY")
    print("=" * 60)
    print(f"Total samples: {len(df)}")
    print(f"Features used: {len(feature_names)}")
    print(f"Trees in model: {model.tree_count_}")
    print("\nTest Set Metrics:")
    print(f" MAE: {metrics['mae']:.4f}")
    print(f" RMSE: {metrics['rmse']:.4f}")
    # BUG FIX: check for None explicitly — a perfect forecast yields
    # MAPE/MASE of 0.0, which is falsy and used to be shown as "N/A".
    print(f" MAPE: {metrics['mape']:.2f}%" if metrics['mape'] is not None else " MAPE: N/A")
    print(f" Bias: {metrics['bias']:.4f}")
    print(f" MASE: {metrics['mase']:.4f}" if metrics['mase'] is not None else " MASE: N/A")
    print(f" R²: {metrics['r_squared']:.4f}")
    print("=" * 60)
    return trainer
if __name__ == "__main__":
trained_trainer = main()