# Major_Project / main.py
# VishnuCodes's picture
# Create main.py
# 3c7d03a verified
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import StreamingResponse, JSONResponse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from sklearn.pipeline import Pipeline
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
import io
import matplotlib.pyplot as plt
from typing import List
# Application object; the @app.post decorators in this file register their
# endpoints on it.
app = FastAPI()
# XGBoost-Only Endpoint
@app.post("/train-xgboost")
async def train_xgboost(files: List[UploadFile] = File(...)):
    """Fit an XGBoost regressor on weekly per-item sales and plot its hold-out fit.

    The upload must contain ``restaurant_sales_linked.csv`` (the ``Date``,
    ``Menu_ID``, ``Quantity Sold`` and ``Revenue`` columns are read) and
    ``restaurant_menu_final.csv`` (``Menu_ID`` and ``Price`` are read).

    Returns:
        A PNG ``StreamingResponse`` plotting actual vs. predicted quantities
        for the 20% hold-out split, with MSE/RMSE/MAE/R2 in the ``XGBoost_*``
        response headers.  Any failure is reported as a JSON
        ``{"error": ...}`` body with status 400.
    """
    try:
        # Read every uploaded CSV into a DataFrame keyed by its file name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8")))

        # Fail with a readable message instead of a bare KeyError string.
        required = ('restaurant_sales_linked.csv', 'restaurant_menu_final.csv')
        missing = [name for name in required if name not in data_frames]
        if missing:
            raise ValueError(f"Missing required file(s): {', '.join(missing)}")

        sales_data = data_frames['restaurant_sales_linked.csv']
        menu_data = data_frames['restaurant_menu_final.csv']

        # Aggregate sales per ISO week number and menu item.
        # NOTE(review): the week number alone is the key, so equal week numbers
        # from different years are summed together.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        # Attach menu details (Price) to each weekly row.
        merged_data = pd.merge(weekly_sales, menu_data, on='Menu_ID', how='left')

        features = merged_data[['Week', 'Menu_ID', 'Price', 'Revenue']]
        target = merged_data['Quantity Sold']
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Scale the numeric columns and one-hot encode the menu item id.
        numerical_features = ['Week', 'Price', 'Revenue']
        categorical_features = ['Menu_ID']
        column_transformer = make_column_transformer(
            (StandardScaler(), numerical_features),
            (OneHotEncoder(handle_unknown="ignore"), categorical_features),
            remainder="drop",
        )
        X_train_transformed = column_transformer.fit_transform(X_train)
        X_test_transformed = column_transformer.transform(X_test)

        xgb_model = XGBRegressor(
            n_estimators=25,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        xgb_model.fit(
            X_train_transformed,
            y_train,
            eval_set=[(X_test_transformed, y_test)],
            verbose=False,
        )

        xgb_y_pred = xgb_model.predict(X_test_transformed)

        xgb_mse = mean_squared_error(y_test, xgb_y_pred)
        xgb_rmse = math.sqrt(xgb_mse)
        xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
        xgb_r2 = r2_score(y_test, xgb_y_pred)

        # Render the plot into memory; the finally-clause releases the figure
        # even when drawing or saving fails, so failed requests do not pile up
        # open figures in pyplot's global registry.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(y_test.to_numpy(), label="Actual", alpha=0.7)
            ax.plot(xgb_y_pred, label="Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (XGBoost)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        headers = {
            "XGBoost_MSE": str(xgb_mse),
            "XGBoost_RMSE": str(xgb_rmse),
            "XGBoost_MAE": str(xgb_mae),
            "XGBoost_R2": str(xgb_r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-sarimax-xgboost")
async def train_sarimax_xgboost(files: List[UploadFile] = File(...)):
    """Fit SARIMAX plus an XGBoost residual correction for one menu item.

    The upload must contain ``restaurant_sales_linked.csv``; its ``Date``,
    ``Menu_ID``, ``Quantity Sold`` and ``Revenue`` columns are read.  Only the
    rows of the hard-coded ``Menu_ID`` 1 are modelled: the first 80% of them
    (in date order) train SARIMAX and the remaining 20% are forecast.

    Returns:
        A PNG ``StreamingResponse`` plotting actual vs. combined prediction,
        with MSE/RMSE/MAE/R2 in the ``SARIMAX_XGBoost_*`` response headers.
        Any failure is reported as a JSON ``{"error": ...}`` body with
        status 400.
    """
    try:
        # Read every uploaded CSV into a DataFrame keyed by its file name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8")))

        # Fail with a readable message instead of a bare KeyError string.
        if 'restaurant_sales_linked.csv' not in data_frames:
            raise ValueError("Missing required file(s): restaurant_sales_linked.csv")
        sales_data = data_frames['restaurant_sales_linked.csv']

        # Label each row with the first day of its weekly period.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.to_period('W').dt.start_time

        # Select a single menu item for demonstration.  The split below is
        # positional, so put the rows in chronological order first (stable
        # sort keeps the upload order within a day).
        menu_id = 1
        menu_sales = (
            sales_data[sales_data['Menu_ID'] == menu_id]
            .sort_values('Date', kind='mergesort')
            .set_index('Week')
        )
        if menu_sales.empty:
            raise ValueError(f"No data available for Menu_ID {menu_id}")

        train_size = int(len(menu_sales) * 0.8)
        if train_size == 0:
            raise ValueError(f"Not enough data for Menu_ID {menu_id} to build a train/test split")
        train_data, test_data = menu_sales[:train_size], menu_sales[train_size:]

        # Work on plain arrays: the Week index may repeat or have gaps, in
        # which case statsmodels ignores it and returns forecasts with an
        # integer index.  Label-based arithmetic between that forecast and the
        # date-indexed actuals would misalign, so everything is positional.
        train_actual = train_data['Quantity Sold'].to_numpy(dtype=float)
        test_actual = test_data['Quantity Sold'].to_numpy(dtype=float)

        sarimax_model = SARIMAX(train_actual, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
        sarimax_result = sarimax_model.fit(disp=False)
        sarimax_pred = np.asarray(
            sarimax_result.get_forecast(steps=len(test_actual)).predicted_mean
        )
        if len(sarimax_pred) != len(test_actual):
            raise ValueError(
                f"Length mismatch: SARIMAX predictions ({len(sarimax_pred)}) "
                f"vs Test data ({len(test_actual)})"
            )

        # Forecast errors that the XGBoost stage tries to explain from Revenue.
        residuals = test_actual - sarimax_pred
        xgboost_features = test_data[['Revenue']]

        scaler = StandardScaler()
        X_transformed = scaler.fit_transform(xgboost_features)

        xgb_model = XGBRegressor(
            n_estimators=25,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        # NOTE(review): the residual model is fitted on the same window it is
        # scored on below, so the reported metrics are in-sample for this stage.
        xgb_model.fit(X_transformed, residuals)

        xgb_residual_pred = xgb_model.predict(X_transformed)
        combined_pred = sarimax_pred + xgb_residual_pred

        combined_mse = mean_squared_error(test_actual, combined_pred)
        combined_rmse = math.sqrt(combined_mse)
        combined_mae = mean_absolute_error(test_actual, combined_pred)
        combined_r2 = r2_score(test_actual, combined_pred)

        # Both curves are plotted positionally so they share one x-axis; the
        # finally-clause releases the figure even when drawing or saving fails.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(test_actual, label="Actual", alpha=0.7)
            ax.plot(combined_pred, label="SARIMAX + XGBoost Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (SARIMAX + XGBoost)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        headers = {
            "SARIMAX_XGBoost_MSE": str(combined_mse),
            "SARIMAX_XGBoost_RMSE": str(combined_rmse),
            "SARIMAX_XGBoost_MAE": str(combined_mae),
            "SARIMAX_XGBoost_R2": str(combined_r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-randomforest-xgboost")
async def train_randomforest_xgboost(files: List[UploadFile] = File(...)):
    """Fit a Random Forest with an XGBoost residual correction and plot the result.

    The upload must contain ``restaurant_sales_linked.csv``; its ``Date``,
    ``Menu_ID``, ``Quantity Sold`` and ``Revenue`` columns are read.  Both
    models are fitted on the 80% training split only; the 20% hold-out split
    is used purely for scoring and plotting.

    Returns:
        A PNG ``StreamingResponse`` plotting actual vs. combined prediction,
        with MSE/RMSE/MAE/R2 in the ``RF_XGBoost_*`` response headers.  Any
        failure is reported as a JSON ``{"error": ...}`` body with status 400.
    """
    try:
        # Read every uploaded CSV into a DataFrame keyed by its file name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8")))

        # Fail with a readable message instead of a bare KeyError string.
        if 'restaurant_sales_linked.csv' not in data_frames:
            raise ValueError("Missing required file(s): restaurant_sales_linked.csv")
        sales_data = data_frames['restaurant_sales_linked.csv']

        # Aggregate sales per ISO week number and menu item.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        features = weekly_sales[['Week', 'Menu_ID', 'Revenue']]
        target = weekly_sales['Quantity Sold']
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Scale the numeric columns and one-hot encode the menu item id.
        numerical_features = ['Week', 'Revenue']
        categorical_features = ['Menu_ID']
        column_transformer = make_column_transformer(
            (StandardScaler(), numerical_features),
            (OneHotEncoder(handle_unknown="ignore"), categorical_features),
            remainder="passthrough",
        )
        X_train_transformed = column_transformer.fit_transform(X_train)
        X_test_transformed = column_transformer.transform(X_test)

        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_transformed, y_train)
        rf_pred = rf_model.predict(X_test_transformed)

        # Learn the residual correction from the TRAINING rows.  Fitting it on
        # the hold-out residuals and then scoring on the same rows would leak
        # the test targets into the model and inflate every metric below.
        train_residuals = y_train.to_numpy() - rf_model.predict(X_train_transformed)

        xgb_model = XGBRegressor(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        xgb_model.fit(X_train_transformed, train_residuals)

        xgb_residual_pred = xgb_model.predict(X_test_transformed)
        combined_pred = rf_pred + xgb_residual_pred

        mse = mean_squared_error(y_test, combined_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, combined_pred)
        r2 = r2_score(y_test, combined_pred)

        # Render the plot into memory; the finally-clause releases the figure
        # even when drawing or saving fails.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(y_test.to_numpy(), label="Actual", alpha=0.7)
            ax.plot(combined_pred, label="Random Forest + XGBoost Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (Random Forest + XGBoost)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        headers = {
            "RF_XGBoost_MSE": str(mse),
            "RF_XGBoost_RMSE": str(rmse),
            "RF_XGBoost_MAE": str(mae),
            "RF_XGBoost_R2": str(r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-randomforest")
async def train_randomforest(files: List[UploadFile] = File(...)):
    """Fit a Random Forest on weekly per-item sales and plot its hold-out fit.

    The upload must contain ``restaurant_sales_linked.csv`` (the ``Date``,
    ``Menu_ID``, ``Quantity Sold`` and ``Revenue`` columns are read) and
    ``restaurant_menu_final.csv`` (``Menu_ID`` and ``Price`` are read).  Other
    uploaded CSV files are parsed but not used by the model.

    Returns:
        A PNG ``StreamingResponse`` plotting actual vs. predicted quantities
        for the 20% hold-out split, with MSE/RMSE/MAE/R2 in the response
        headers.  Any failure is reported as a JSON ``{"error": ...}`` body
        with status 400.
    """
    try:
        # Read every uploaded CSV into a DataFrame keyed by its file name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8")))

        # Fail with a readable message instead of a bare KeyError string.
        required = ('restaurant_sales_linked.csv', 'restaurant_menu_final.csv')
        missing = [name for name in required if name not in data_frames]
        if missing:
            raise ValueError(f"Missing required file(s): {', '.join(missing)}")

        sales_data = data_frames['restaurant_sales_linked.csv']
        menu_data = data_frames['restaurant_menu_final.csv']

        # Aggregate sales per ISO week number and menu item.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        # Attach menu details (Price) to each weekly row.
        merged_data = pd.merge(weekly_sales, menu_data, on='Menu_ID', how='left')

        # The frame is deliberately NOT joined with per-week ingredient totals:
        # a join on Week alone repeats every (Week, Menu_ID) row once per
        # ingredient, none of the ingredient columns are model features, and
        # the exact duplicate rows would land on both sides of the random
        # split below, contaminating the hold-out metrics.
        features = merged_data[['Week', 'Menu_ID', 'Price', 'Revenue']]
        target = merged_data['Quantity Sold']
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Scale the numeric columns and one-hot encode the menu item id.
        numerical_features = ['Week', 'Price', 'Revenue']
        categorical_features = ['Menu_ID']
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ]
        )
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Render the plot into memory; the finally-clause releases the figure
        # even when drawing or saving fails.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(y_test.to_numpy(), label="Actual", alpha=0.7)
            ax.plot(y_pred, label="Random Forest", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (Random Forest)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        # NOTE(review): the header names say "RF_XGBoost" although no XGBoost
        # model is involved here; they are kept unchanged because clients may
        # already read them.
        headers = {
            "RF_XGBoost_MSE": str(mse),
            "RF_XGBoost_RMSE": str(rmse),
            "RF_XGBoost_MAE": str(mae),
            "RF_XGBoost_R2": str(r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-arima")
async def train_arima(file: UploadFile = File(...)):
    """Fit one ARIMA(5, 1, 0) model per menu item and plot the hold-out forecasts.

    The uploaded CSV must provide ``Date``, ``Menu_ID`` and ``Quantity Sold``
    columns.  Quantities are summed per date and menu item (missing
    combinations count as 0); the first 80% of the dates train the models and
    the remaining 20% are forecast.

    Returns:
        A PNG ``StreamingResponse`` plotting all actual vs. forecast values,
        with MSE/RMSE/MAE/R2 over every menu item in the ``ARIMA_*`` response
        headers.  Any failure is reported as a JSON ``{"error": ...}`` body
        with status 400.
    """
    try:
        content = await file.read()
        sales_data = pd.read_csv(io.StringIO(content.decode("utf-8")))

        # Parse the dates BEFORE grouping: grouping on the raw strings orders
        # the rows lexicographically, which is not chronological for formats
        # such as "1/10/2023", and the split below is positional.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        arima_data = (
            sales_data.groupby(['Date', 'Menu_ID'])['Quantity Sold']
            .sum()
            .unstack(fill_value=0)
            .sort_index()
        )

        split_at = int(len(arima_data) * 0.8)
        arima_train = arima_data.iloc[:split_at, :]
        arima_test = arima_data.iloc[split_at:, :]
        if arima_train.empty or arima_test.empty:
            raise ValueError("Not enough distinct dates to build a train/test split")

        # Fit and forecast on plain arrays.  When the dates have gaps
        # statsmodels cannot infer a frequency and labels its forecasts with
        # integers; re-indexing those by the test dates would yield only NaN.
        horizon = len(arima_test)
        arima_predictions = {}
        for menu_id in arima_data.columns:
            series = arima_train[menu_id].to_numpy(dtype=float)
            arima_fitted = ARIMA(series, order=(5, 1, 0)).fit()
            arima_predictions[menu_id] = np.asarray(arima_fitted.forecast(steps=horizon))

        # Columns follow arima_data.columns, so they line up with arima_test.
        arima_predictions_df = pd.DataFrame(arima_predictions, index=arima_test.index)

        actual_flat = arima_test.to_numpy().flatten()
        predicted_flat = arima_predictions_df.to_numpy().flatten()

        mse = mean_squared_error(actual_flat, predicted_flat)
        arima_metrics = {
            "Mean Squared Error (MSE)": mse,
            "Root Mean Squared Error (RMSE)": math.sqrt(mse),
            "Mean Absolute Error (MAE)": mean_absolute_error(actual_flat, predicted_flat),
            "R-squared Score (R²)": r2_score(actual_flat, predicted_flat),
        }

        # Render the plot into memory; the finally-clause releases the figure
        # even when drawing or saving fails.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(actual_flat, label="Actual", alpha=0.7)
            ax.plot(predicted_flat, label="Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (ARIMA)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        return StreamingResponse(
            buf,
            media_type="image/png",
            headers={
                "ARIMA_MSE": str(arima_metrics["Mean Squared Error (MSE)"]),
                "ARIMA_RMSE": str(arima_metrics["Root Mean Squared Error (RMSE)"]),
                "ARIMA_MAE": str(arima_metrics["Mean Absolute Error (MAE)"]),
                "ARIMA_R2": str(arima_metrics["R-squared Score (R²)"]),
            },
        )
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/predict-sales")
async def predict_sales(files: List[UploadFile] = File(...)):
    """Predict quantities for the last batch of weekly rows and list ingredient needs.

    The upload must contain ``restaurant_sales_linked.csv``,
    ``restaurant_menu_final.csv``, ``restaurant_recipe_final.csv`` and
    ``restaurant_ingredients_final.csv``.  A Random Forest plus an XGBoost
    residual model are trained on all weekly rows except the last ``N``
    (``N`` = number of distinct menu items); those last ``N`` rows are then
    predicted and the eight largest predictions are expanded into
    per-ingredient quantities.

    Returns:
        A CSV attachment (``predicted_sales_ingredients.csv``) with the
        columns ``Menu_Item``, ``Predicted Quantity``, ``Ingredient_Name`` and
        ``Total Ingredient Quantity``.  Any failure is reported as a JSON
        ``{"error": ...}`` body with status 400.
    """
    try:
        # Read every uploaded CSV into a DataFrame keyed by its file name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8")))

        # Fail with a readable message instead of a bare KeyError string.
        required = (
            'restaurant_sales_linked.csv',
            'restaurant_menu_final.csv',
            'restaurant_recipe_final.csv',
            'restaurant_ingredients_final.csv',
        )
        missing = [name for name in required if name not in data_frames]
        if missing:
            raise ValueError(f"Missing required file(s): {', '.join(missing)}")

        sales_data = data_frames['restaurant_sales_linked.csv']
        menu_data = data_frames['restaurant_menu_final.csv']
        recipe_data = data_frames['restaurant_recipe_final.csv']
        ingredients_data = data_frames['restaurant_ingredients_final.csv']

        # Aggregate sales per ISO week number and menu item.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        features = weekly_sales[['Week', 'Menu_ID', 'Revenue']]
        target = weekly_sales['Quantity Sold']

        # Scale the numeric columns and one-hot encode the menu item id.
        numerical_features = ['Week', 'Revenue']
        categorical_features = ['Menu_ID']
        column_transformer = make_column_transformer(
            (StandardScaler(), numerical_features),
            (OneHotEncoder(handle_unknown="ignore"), categorical_features),
            remainder="passthrough",
        )
        X_transformed = column_transformer.fit_transform(features)

        # Hold out the last N rows (N = number of distinct menu items) as the
        # batch to predict; everything before it is training history.
        n_items = features['Menu_ID'].nunique()
        if len(features) <= n_items:
            raise ValueError("Not enough weekly history to hold out a prediction batch")
        X_train = X_transformed[:-n_items]
        y_train = target.iloc[:-n_items]
        X_future = X_transformed[-n_items:]
        # Menu_IDs of exactly the rows being predicted.  Using
        # Menu_ID.unique() here would pair predictions with ids in
        # first-appearance order, which need not match the held-out rows.
        # NOTE(review): if the final week lacks some items the batch reaches
        # into the previous week and an id can appear twice.
        future_menu_ids = features['Menu_ID'].iloc[-n_items:].to_numpy()

        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred = rf_model.predict(X_future)

        # Residual correction learned from the training rows.
        residuals = y_train.to_numpy() - rf_model.predict(X_train)
        xgb_model = XGBRegressor(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        xgb_model.fit(X_train, residuals)
        xgb_residual_pred = xgb_model.predict(X_future)

        combined_pred = rf_pred + xgb_residual_pred

        # Keep the eight largest predictions.
        predicted_sales = (
            pd.DataFrame({'Menu_ID': future_menu_ids, 'Predicted Quantity': combined_pred})
            .sort_values(by='Predicted Quantity', ascending=False)
            .head(8)
        )

        # Expand each predicted item into its recipe ingredients.
        predicted_sales_details = (
            predicted_sales
            .merge(menu_data, on='Menu_ID', how='inner')
            .merge(recipe_data, on='Menu_ID', how='inner')
            .merge(ingredients_data, on='Ingredient_ID', how='inner')
        )
        predicted_sales_details['Total Ingredient Quantity'] = (
            predicted_sales_details['Quantity_Per_Unit'] * predicted_sales_details['Predicted Quantity']
        )

        final_result = predicted_sales_details[[
            'Menu_Item', 'Predicted Quantity', 'Ingredient_Name', 'Total Ingredient Quantity'
        ]]

        # Serialize to CSV in memory and send it as a download.
        csv_bytes = final_result.to_csv(index=False).encode("utf-8")
        return StreamingResponse(
            io.BytesIO(csv_bytes),
            media_type="text/csv",
            headers={"Content-Disposition": "attachment; filename=predicted_sales_ingredients.csv"}
        )
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)