File size: 14,239 Bytes

7f90ea0

"""

Demand Prediction System - Prediction Script



This script loads a trained model and makes demand predictions for products

on future dates. Supports both ML models and time-series models (ARIMA, Prophet).



Usage (ML Models):

    python predict.py --product_id 1 --date 2024-01-15 --price 100 --discount 10 --category Electronics

    

Usage (Time-Series Models - overall demand):

    python predict.py --date 2024-01-15 --model_type timeseries

"""

import pandas as pd
import numpy as np
import joblib
import json
import argparse
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

# Configuration
MODEL_DIR = 'models'
MODEL_PATH = f'{MODEL_DIR}/best_model.joblib'
TS_MODEL_PATH = f'{MODEL_DIR}/best_timeseries_model.joblib'
PREPROCESSING_PATH = f'{MODEL_DIR}/preprocessing.joblib'
METADATA_PATH = f'{MODEL_DIR}/model_metadata.json'
ALL_MODELS_METADATA_PATH = f'{MODEL_DIR}/all_models_metadata.json'


def load_model_and_preprocessing(model_type='auto'):
    """

    Load the trained model and preprocessing objects.

    

    Args:

        model_type: 'ml', 'timeseries', or 'auto' (auto-detect best model)

    

    Returns:

        tuple: (model, preprocessing_data, model_name, is_timeseries)

    """
    # Load metadata to determine best model
    if os.path.exists(ALL_MODELS_METADATA_PATH):
        with open(ALL_MODELS_METADATA_PATH, 'r') as f:
            all_metadata = json.load(f)
        best_model_name = all_metadata.get('best_model', 'Unknown')
    else:
        best_model_name = None
    
    # Determine which model to use
    if model_type == 'auto':
        if best_model_name in ['ARIMA', 'Prophet']:
            model_type = 'timeseries'
        else:
            model_type = 'ml'
    
    is_timeseries = (model_type == 'timeseries')
    
    if is_timeseries:
        # Load time-series model
        if not os.path.exists(TS_MODEL_PATH):
            raise FileNotFoundError(
                f"Time-series model not found at {TS_MODEL_PATH}. Please run train_model.py first."
            )
        
        print("Loading time-series model...")
        model = joblib.load(TS_MODEL_PATH)
        preprocessing_data = None
        
        if best_model_name:
            print(f"Model: {best_model_name}")
            if best_model_name in all_metadata.get('all_models', {}):
                metrics = all_metadata['all_models'][best_model_name]
                print(f"R2 Score: {metrics.get('r2', 'N/A'):.4f}")
        
        return model, preprocessing_data, best_model_name or 'Time-Series', True
    else:
        # Load ML model
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(
                f"ML model not found at {MODEL_PATH}. Please run train_model.py first."
            )
        
        if not os.path.exists(PREPROCESSING_PATH):
            raise FileNotFoundError(
                f"Preprocessing objects not found at {PREPROCESSING_PATH}. Please run train_model.py first."
            )
        
        print("Loading ML model and preprocessing objects...")
        model = joblib.load(MODEL_PATH)
        preprocessing_data = joblib.load(PREPROCESSING_PATH)
        
        # Load metadata if available
        if os.path.exists(METADATA_PATH):
            with open(METADATA_PATH, 'r') as f:
                metadata = json.load(f)
            model_name = metadata.get('model_name', 'ML Model')
            print(f"Model: {model_name}")
            print(f"R2 Score: {metadata.get('metrics', {}).get('r2', 'N/A'):.4f}")
        else:
            model_name = best_model_name or 'ML Model'
        
        return model, preprocessing_data, model_name, False


def prepare_features(product_id, date, price, discount, category, preprocessing_data):
    """

    Prepare features for prediction using the same preprocessing pipeline.

    

    Args:

        product_id: Product ID

        date: Date string (YYYY-MM-DD) or datetime object

        price: Product price

        discount: Discount percentage (0-100)

        category: Product category

        preprocessing_data: Dictionary containing encoders and scaler

        

    Returns:

        numpy array: Prepared features for prediction

    """
    # Convert date to datetime if string
    if isinstance(date, str):
        date = pd.to_datetime(date)
    
    # Extract date features (same as in training)
    day = date.day
    month = date.month
    day_of_week = date.weekday()  # 0=Monday, 6=Sunday
    weekend = 1 if day_of_week >= 5 else 0
    year = date.year
    quarter = date.quarter
    
    # Encode categorical variables
    category_encoder = preprocessing_data['encoders']['category']
    product_encoder = preprocessing_data['encoders']['product_id']
    
    # Handle unseen categories/products
    try:
        category_encoded = category_encoder.transform([category])[0]
    except ValueError:
        # If category not seen during training, use most common category
        print(f"Warning: Category '{category}' not seen during training. Using default encoding.")
        category_encoded = 0
    
    try:
        product_id_encoded = product_encoder.transform([product_id])[0]
    except ValueError:
        # If product_id not seen during training, use mean encoding
        print(f"Warning: Product ID '{product_id}' not seen during training. Using default encoding.")
        product_id_encoded = product_encoder.transform([product_encoder.classes_[0]])[0]
    
    # Create feature dictionary
    feature_dict = {
        'price': price,
        'discount': discount,
        'day': day,
        'month': month,
        'day_of_week': day_of_week,
        'weekend': weekend,
        'year': year,
        'quarter': quarter,
        'category_encoded': category_encoded,
        'product_id_encoded': product_id_encoded
    }
    
    # Create feature array in the same order as training
    feature_names = preprocessing_data['feature_names']
    features = np.array([[feature_dict[name] for name in feature_names]])
    
    # Scale features
    scaler = preprocessing_data['scaler']
    features_scaled = scaler.transform(features)
    
    return features_scaled


def predict_demand_ml(product_id, date, price, discount, category, model, preprocessing_data):
    """

    Predict demand for a product on a given date using ML model.

    

    Args:

        product_id: Product ID

        date: Date string (YYYY-MM-DD) or datetime object

        price: Product price

        discount: Discount percentage (0-100)

        category: Product category

        model: Trained ML model

        preprocessing_data: Dictionary containing encoders and scaler

        

    Returns:

        float: Predicted sales quantity

    """
    # Prepare features
    features = prepare_features(product_id, date, price, discount, category, preprocessing_data)
    
    # Make prediction
    prediction = model.predict(features)[0]
    
    # Ensure non-negative prediction
    prediction = max(0, prediction)
    
    return prediction


def predict_demand_timeseries(date, model, model_name):
    """

    Predict overall daily demand using time-series model.

    

    Args:

        date: Date string (YYYY-MM-DD) or datetime object

        model: Trained time-series model (ARIMA or Prophet)

        model_name: Name of the model ('ARIMA' or 'Prophet')

        

    Returns:

        float: Predicted total daily sales quantity

    """
    # Convert date to datetime if string
    if isinstance(date, str):
        date = pd.to_datetime(date)
    
    if model_name == 'ARIMA':
        # For ARIMA, we need to calculate how many steps ahead
        # This is a simplified approach - in practice, you'd need the training end date
        # For now, predict 1 step ahead
        try:
            forecast = model.forecast(steps=1)
            prediction = forecast[0] if hasattr(forecast, '__iter__') else forecast
            prediction = max(0, prediction)
            return prediction
        except Exception as e:
            print(f"Error in ARIMA prediction: {e}")
            return None
    
    elif model_name == 'Prophet':
        # For Prophet, create a future dataframe
        try:
            future = pd.DataFrame({'ds': [date]})
            forecast = model.predict(future)
            prediction = forecast['yhat'].iloc[0]
            prediction = max(0, prediction)
            return prediction
        except Exception as e:
            print(f"Error in Prophet prediction: {e}")
            return None
    
    else:
        print(f"Unknown time-series model: {model_name}")
        return None


def predict_batch(predictions_data, model, preprocessing_data):
    """

    Predict demand for multiple products/dates at once.

    

    Args:

        predictions_data: List of dictionaries, each containing:

            - product_id

            - date

            - price

            - discount

            - category

        model: Trained model

        preprocessing_data: Dictionary containing encoders and scaler

        

    Returns:

        list: List of predicted sales quantities

    """
    predictions = []
    
    for data in predictions_data:
        pred = predict_demand(
            data['product_id'],
            data['date'],
            data['price'],
            data['discount'],
            data['category'],
            model,
            preprocessing_data
        )
        predictions.append(pred)
    
    return predictions


def main():
    """

    Main function for command-line interface.

    """
    parser = argparse.ArgumentParser(
        description='Predict product demand for a given date and product details',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""

Examples (ML Models):

  python predict.py --product_id 1 --date 2024-01-15 --price 100 --discount 10 --category Electronics

  python predict.py --product_id 5 --date 2024-06-20 --price 50 --discount 0 --category Clothing



Examples (Time-Series Models - overall daily demand):

  python predict.py --date 2024-01-15 --model_type timeseries

        """
    )
    
    parser.add_argument('--product_id', type=int, default=None,
                       help='Product ID (required for ML models)')
    parser.add_argument('--date', type=str, required=True,
                       help='Date in YYYY-MM-DD format')
    parser.add_argument('--price', type=float, default=None,
                       help='Product price (required for ML models)')
    parser.add_argument('--discount', type=float, default=0,
                       help='Discount percentage (0-100), default: 0 (for ML models)')
    parser.add_argument('--category', type=str, default=None,
                       help='Product category (required for ML models)')
    parser.add_argument('--model_type', type=str, default='auto',
                       choices=['auto', 'ml', 'timeseries'],
                       help='Model type to use: auto (best model), ml, or timeseries')
    
    args = parser.parse_args()
    
    # Validate date format
    try:
        date_obj = pd.to_datetime(args.date)
    except ValueError:
        print(f"Error: Invalid date format '{args.date}'. Please use YYYY-MM-DD format.")
        return
    
    # Load model and preprocessing
    try:
        model, preprocessing_data, model_name, is_timeseries = load_model_and_preprocessing(args.model_type)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return
    
    # Validate arguments based on model type
    if not is_timeseries:
        # ML model requires product details
        if args.product_id is None or args.price is None or args.category is None:
            print("Error: ML models require --product_id, --price, and --category arguments.")
            return
        
        # Validate discount range
        if args.discount < 0 or args.discount > 100:
            print(f"Warning: Discount {args.discount}% is outside 0-100 range. Clamping to valid range.")
            args.discount = max(0, min(100, args.discount))
    
    # Make prediction
    print("\n" + "="*60)
    print("MAKING PREDICTION")
    print("="*60)
    print(f"Model: {model_name}")
    print(f"Model Type: {'Time-Series' if is_timeseries else 'Machine Learning'}")
    print(f"Date: {args.date}")
    
    if not is_timeseries:
        print(f"Product ID: {args.product_id}")
        print(f"Price: ${args.price:.2f}")
        print(f"Discount: {args.discount}%")
        print(f"Category: {args.category}")
    
    print("-"*60)
    
    if is_timeseries:
        predicted_demand = predict_demand_timeseries(
            args.date,
            model,
            model_name
        )
        
        if predicted_demand is None:
            print("Error: Failed to make prediction.")
            return
        
        print(f"\nPredicted Total Daily Sales Quantity: {predicted_demand:.0f} units")
        print("(This is the predicted total demand across all products for this date)")
    else:
        predicted_demand = predict_demand_ml(
            args.product_id,
            args.date,
            args.price,
            args.discount,
            args.category,
            model,
            preprocessing_data
        )
        
        print(f"\nPredicted Sales Quantity: {predicted_demand:.0f} units")
        print("(This is the predicted demand for this specific product)")
    
    print("="*60)
    
    # Additional information
    date_obj = pd.to_datetime(args.date)
    day_name = date_obj.strftime('%A')
    is_weekend = "Yes" if date_obj.weekday() >= 5 else "No"
    
    print(f"\nDate Information:")
    print(f"  Day of week: {day_name}")
    print(f"  Weekend: {is_weekend}")
    print(f"  Month: {date_obj.strftime('%B')}")
    print(f"  Quarter: Q{date_obj.quarter}")


if __name__ == "__main__":
    main()