# SKBE/app.py: Flask API for SuperKart sales prediction
# Import necessary libraries
import numpy as np
import joblib # For loading the serialized model
import pandas as pd # For data manipulation
from flask import Flask, request, jsonify # For creating the Flask API
import os
# Initialize the Flask application
superkart_sales_predictor_api = Flask("SuperKart Sales Predictor")
# Load the trained machine learning model
model = joblib.load("best_sales_prediction_model.joblib")
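# Optional sanity check (a sketch, assuming the serialized object is a
# scikit-learn estimator or Pipeline fitted on a DataFrame): when available,
# feature_names_in_ lists the column names the model expects at predict time,
# which should line up with the required_columns built in prepare_input_data.
expected_features = getattr(model, "feature_names_in_", None)
if expected_features is not None:
    print("Model expects features:", list(expected_features))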
def prepare_input_data(data):
"""Prepare input data with proper feature engineering and categorical value validation"""
df = pd.DataFrame([data] if isinstance(data, dict) else data)
# Define known categorical values from training data
KNOWN_VALUES = {
'Product_Sugar_Content': ['Low Sugar', 'Regular', 'No Sugar', 'reg'],
'Product_Type': ['Dairy', 'Meat', 'Snack Foods', 'Fruits and Vegetables',
'Frozen Foods', 'Baking Goods', 'Health and Hygiene', 'Canned',
'Hard Drinks', 'Household', 'Soft Drinks', 'Starchy Foods',
'Breakfast', 'Seafood', 'Bread', 'Others'],
'Store_Id': ['OUT001', 'OUT002', 'OUT003', 'OUT004'],
'Store_Size': ['Small', 'Medium', 'High'],
'Store_Location_City_Type': ['Tier 1', 'Tier 2', 'Tier 3'],
'Store_Type': ['Supermarket Type1', 'Supermarket Type2', 'Departmental Store', 'Food Mart'],
'Product_Category_Code': ['DR', 'FD', 'NC']
}
# Feature engineering - same as in training
if 'Product_Id' in df.columns:
df['Product_Category_Code'] = df['Product_Id'].str[:2]
df['Product_Number'] = df['Product_Id'].str[2:].astype(int)
# Validate Product_Category_Code and fix invalid ones
invalid_codes = ~df['Product_Category_Code'].isin(KNOWN_VALUES['Product_Category_Code'])
if invalid_codes.any():
df.loc[invalid_codes, 'Product_Category_Code'] = 'DR' # Default to DR for invalid codes
# Create store age feature
if 'Store_Establishment_Year' in df.columns:
current_year = 2024
df['Store_Age'] = current_year - df['Store_Establishment_Year']
# Handle Fat_Content -> Sugar_Content mapping with validation
if 'Product_Fat_Content' in df.columns:
fat_to_sugar_map = {
'Regular': 'Regular',
'Low Fat': 'Low Sugar',
'reg': 'reg',
'LF': 'Low Sugar'
}
df['Product_Sugar_Content'] = df['Product_Fat_Content'].map(fat_to_sugar_map).fillna('Regular')
df = df.drop('Product_Fat_Content', axis=1)
# Remove Product_Id column for prediction
if 'Product_Id' in df.columns:
df = df.drop('Product_Id', axis=1)
# Required columns in exact training order
required_columns = [
'Product_Weight', 'Product_Sugar_Content', 'Product_Allocated_Area',
'Product_Type', 'Product_MRP', 'Store_Id', 'Store_Establishment_Year',
'Store_Size', 'Store_Location_City_Type', 'Store_Type',
'Product_Category_Code', 'Product_Number', 'Store_Age'
]
# Add missing columns with proper defaults
for col in required_columns:
if col not in df.columns:
if col == 'Product_Allocated_Area':
df[col] = 1000.0
elif col == 'Store_Establishment_Year':
df[col] = 2000
elif col == 'Product_Sugar_Content':
df[col] = 'Regular'
elif col == 'Product_Type':
df[col] = 'Dairy'
elif col == 'Store_Id':
df[col] = 'OUT001'
elif col == 'Store_Size':
df[col] = 'Medium'
elif col == 'Store_Location_City_Type':
df[col] = 'Tier 1'
elif col == 'Store_Type':
df[col] = 'Supermarket Type1'
else:
df[col] = 0
# Validate and fix categorical values to prevent unknown category errors
for col in KNOWN_VALUES:
if col in df.columns:
# Replace unknown values with the first known value
df[col] = df[col].where(df[col].isin(KNOWN_VALUES[col]), KNOWN_VALUES[col][0])
# Reorder columns to match exact training order
df = df[required_columns]
return df
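# Illustrative local use of prepare_input_data (a sketch; the field values below
# are made up and only meant to show the expected input shape):
#   sample = {
#       "Product_Id": "FD001",
#       "Product_Weight": 12.5,
#       "Product_Fat_Content": "Low Fat",
#       "Product_MRP": 120.0,
#       "Store_Id": "OUT002",
#       "Store_Establishment_Year": 2009,
#   }
#   prepare_input_data(sample)  # -> single-row DataFrame in required_columns order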
# Define a route for the home page (GET request)
@superkart_sales_predictor_api.get('/')
def home():
"""
This function handles GET requests to the root URL ('/') of the API.
It returns a welcome message and API information.
"""
return jsonify({
"message": "Welcome to the SuperKart Sales Prediction API!",
"version": "v1.0",
"endpoints": {
"/": "GET - API information",
"/health": "GET - Health check",
"/v1/sales": "POST - Single product sales prediction",
"/v1/salesbatch": "POST - Batch sales prediction"
}
})
# Define a route for health check (GET request)
@superkart_sales_predictor_api.get('/health')
def health():
"""
Health check endpoint to verify API status.
"""
return jsonify({
"status": "healthy",
"model_status": "loaded",
"api_version": "v1.0"
})
# Define an endpoint for single product sales prediction (POST request)
@superkart_sales_predictor_api.post('/v1/sales')
def predict_sales():
"""
This function handles POST requests to the '/v1/sales' endpoint.
It expects a JSON payload containing product and store details and returns
the predicted sales amount as a JSON response.
"""
try:
        # Get the JSON data from the request body; silent=True returns None
        # instead of raising when the payload is missing or not valid JSON
        product_data = request.get_json(silent=True)
        if not product_data:
            return jsonify({'error': 'No data provided'}), 400
# Prepare input data with proper preprocessing
input_df = prepare_input_data(product_data)
# Make prediction
predicted_sales = model.predict(input_df)[0]
# Convert to Python float and round to 2 decimal places
predicted_sales = round(float(predicted_sales), 2)
# Return the predicted sales
return jsonify({
'Predicted Sales (in dollars)': predicted_sales,
'status': 'success'
})
except Exception as e:
return jsonify({'error': str(e), 'status': 'error'}), 500
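# Example call to /v1/sales (illustrative only; the field values are made up and
# the URL assumes a local run on the default port 7860):
#   curl -X POST http://localhost:7860/v1/sales \
#        -H "Content-Type: application/json" \
#        -d '{"Product_Id": "FD001", "Product_Weight": 12.5,
#             "Product_Fat_Content": "Low Fat", "Product_Type": "Dairy",
#             "Product_MRP": 120.0, "Store_Id": "OUT002",
#             "Store_Establishment_Year": 2009, "Store_Size": "Medium",
#             "Store_Location_City_Type": "Tier 2",
#             "Store_Type": "Supermarket Type1"}'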
# Define an endpoint for batch prediction (POST request)
@superkart_sales_predictor_api.post('/v1/salesbatch')
def predict_sales_batch():
"""
This function handles POST requests to the '/v1/salesbatch' endpoint.
It expects a CSV file containing product and store details for multiple products
and returns the predicted sales amounts as a dictionary in the JSON response.
"""
try:
        # Get the uploaded CSV file from the request; .get() avoids an unhandled
        # BadRequestKeyError when the 'file' field is missing from the form data
        file = request.files.get('file')
        if not file:
            return jsonify({'error': 'No file provided'}), 400
# Read the CSV file into a Pandas DataFrame
input_data = pd.read_csv(file)
# Process each row through the preprocessing pipeline
processed_rows = []
for _, row in input_data.iterrows():
row_dict = row.to_dict()
processed_df = prepare_input_data(row_dict)
processed_rows.append(processed_df)
# Combine all processed rows
batch_input_df = pd.concat(processed_rows, ignore_index=True)
# Make predictions for all products in the DataFrame
predicted_sales_list = model.predict(batch_input_df).tolist()
# Round predictions to 2 decimal places
predicted_sales_list = [round(float(sales), 2) for sales in predicted_sales_list]
# Create a dictionary of predictions with indices or IDs as keys
if 'Product_Id' in input_data.columns:
product_ids = input_data['Product_Id'].tolist()
output_dict = dict(zip(product_ids, predicted_sales_list))
else:
# Use row indices if no Product_Id column
indices = [f"Product_{i+1}" for i in range(len(predicted_sales_list))]
output_dict = dict(zip(indices, predicted_sales_list))
# Return the predictions dictionary as a JSON response
return jsonify({
'predictions': output_dict,
'total_products': len(predicted_sales_list),
'status': 'success'
})
except Exception as e:
return jsonify({'error': str(e), 'status': 'error'}), 500
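# Example call to /v1/salesbatch (illustrative; 'products.csv' is a hypothetical
# file whose columns match the fields shown in the single-prediction example):
#   curl -X POST http://localhost:7860/v1/salesbatch -F "file=@products.csv"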
# Run the Flask application if this script is executed directly
if __name__ == '__main__':
port = int(os.environ.get('PORT', 7860)) # Hugging Face uses port 7860
superkart_sales_predictor_api.run(host='0.0.0.0', port=port, debug=False)
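# Running locally (a sketch, assuming the Space's dependencies are installed):
#   python app.py
# The same Flask app object can also be served by a WSGI server such as gunicorn:
#   gunicorn -b 0.0.0.0:7860 app:superkart_sales_predictor_api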