Major_Project / main.py
VishnuCodes's picture
Create main.py
3c7d03a verified
raw
history blame
23.9 kB
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import StreamingResponse, JSONResponse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from sklearn.pipeline import Pipeline
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
import io
import matplotlib.pyplot as plt
from typing import List
# Application object; the @app.post decorators in this module register their endpoints on it.
app = FastAPI()
# XGBoost-Only Endpoint
@app.post("/train-xgboost")
async def train_xgboost(files: List[UploadFile] = File(...)):
    """Fit an XGBoost regressor on weekly per-item sales and plot its hold-out predictions.

    The upload must contain ``restaurant_sales_linked.csv`` (columns read here:
    ``Date``, ``Menu_ID``, ``Quantity Sold``, ``Revenue``) and
    ``restaurant_menu_final.csv`` (columns read here: ``Menu_ID``, ``Price``).
    Any other uploaded file is parsed but not used.

    Args:
        files: Uploaded CSV files, looked up by their filename.

    Returns:
        A PNG ``StreamingResponse`` showing actual vs. predicted quantities for
        the 20% hold-out split, with MSE/RMSE/MAE/R2 in the ``XGBoost_*``
        response headers. Any failure is reported as a JSON body
        ``{"error": ...}`` with status 400.
    """
    try:
        # Read the uploaded CSV files into DataFrames, keyed by filename.
        # "utf-8-sig" decodes plain UTF-8 as well, but also strips a leading BOM
        # so the first column name is not silently prefixed with "\ufeff".
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8-sig")))

        # Fail with a readable message instead of a bare KeyError string.
        required = ('restaurant_sales_linked.csv', 'restaurant_menu_final.csv')
        missing = [name for name in required if name not in data_frames]
        if missing:
            raise ValueError(f"Missing required file(s): {', '.join(missing)}")

        sales_data = data_frames['restaurant_sales_linked.csv']
        menu_data = data_frames['restaurant_menu_final.csv']

        # Parse 'Date' and bucket rows by ISO week number. The week number does
        # not carry the year, so the same week of different years is pooled.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        # Attach menu details (Price) to every weekly row.
        merged_data = pd.merge(weekly_sales, menu_data, on='Menu_ID', how='left')

        # NOTE(review): Revenue is presumably derived from the quantity sold, in
        # which case it leaks the target into the features -- confirm.
        features = merged_data[['Week', 'Menu_ID', 'Price', 'Revenue']]
        target = merged_data['Quantity Sold']

        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Scale numeric columns, one-hot encode the menu item.
        numerical_features = ['Week', 'Price', 'Revenue']
        categorical_features = ['Menu_ID']
        column_transformer = make_column_transformer(
            (StandardScaler(), numerical_features),
            (OneHotEncoder(handle_unknown="ignore"), categorical_features),
            remainder="drop",
        )
        X_train_transformed = column_transformer.fit_transform(X_train)
        X_test_transformed = column_transformer.transform(X_test)

        xgb_model = XGBRegressor(
            n_estimators=25,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        # eval_set is only monitored here; no early stopping is configured.
        xgb_model.fit(
            X_train_transformed,
            y_train,
            eval_set=[(X_test_transformed, y_test)],
            verbose=False,
        )

        # Predictions and evaluation on the hold-out split.
        xgb_y_pred = xgb_model.predict(X_test_transformed)
        xgb_mse = mean_squared_error(y_test, xgb_y_pred)
        xgb_rmse = math.sqrt(xgb_mse)
        xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
        xgb_r2 = r2_score(y_test, xgb_y_pred)

        # Render the plot into memory. The figure is closed in `finally` so a
        # failure while plotting/saving does not leave it registered in pyplot.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(y_test.values, label="Actual", alpha=0.7)
            ax.plot(xgb_y_pred, label="Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (XGBoost)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        # Metrics travel in response headers because the body is the image.
        headers = {
            "XGBoost_MSE": str(xgb_mse),
            "XGBoost_RMSE": str(xgb_rmse),
            "XGBoost_MAE": str(xgb_mae),
            "XGBoost_R2": str(xgb_r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-sarimax-xgboost")
async def train_sarimax_xgboost(files: List[UploadFile] = File(...)):
    """Forecast one menu item with SARIMAX and correct the forecast with XGBoost.

    The upload must contain ``restaurant_sales_linked.csv`` (columns read here:
    ``Date``, ``Menu_ID``, ``Quantity Sold``, ``Revenue``). Only rows with
    ``Menu_ID == 1`` are modelled. The first 80% of those rows (in file order)
    train a SARIMAX(1,1,1)x(1,1,1,12) model; the remaining 20% are forecast and
    an XGBoost regressor on ``Revenue`` is fitted to the forecast residuals.

    Args:
        files: Uploaded CSV files, looked up by their filename.

    Returns:
        A PNG ``StreamingResponse`` showing actual vs. combined prediction, with
        MSE/RMSE/MAE/R2 in the ``SARIMAX_XGBoost_*`` response headers. Any
        failure is reported as a JSON body ``{"error": ...}`` with status 400.
    """
    try:
        # Read the uploaded CSV files into DataFrames, keyed by filename.
        # "utf-8-sig" also strips a leading BOM, which would otherwise corrupt
        # the first column name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8-sig")))

        # Fail with a readable message instead of a bare KeyError string.
        if 'restaurant_sales_linked.csv' not in data_frames:
            raise ValueError("Missing required file(s): restaurant_sales_linked.csv")
        sales_data = data_frames['restaurant_sales_linked.csv']

        # Replace each date by the first day of its weekly period: the period
        # renders as "start/end", so keep the part before the slash.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.to_period('W').astype(str)
        sales_data['Week'] = sales_data['Week'].str.split('/').str[0]
        sales_data['Week'] = pd.to_datetime(sales_data['Week'])

        # Select a single menu item for demonstration. Rows are not aggregated
        # here, so several rows may share the same 'Week' index label.
        menu_id = 1
        menu_sales = sales_data[sales_data['Menu_ID'] == menu_id].set_index('Week')
        if menu_sales.empty:
            raise ValueError(f"No data available for Menu_ID {menu_id}")

        # Positional 80/20 split.
        train_size = int(len(menu_sales) * 0.8)
        train_data, test_data = menu_sales[:train_size], menu_sales[train_size:]
        if train_data.empty or test_data.empty:
            raise ValueError(
                f"Not enough rows for Menu_ID {menu_id} to build a train/test split"
            )

        sarimax_model = SARIMAX(
            train_data['Quantity Sold'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)
        )
        sarimax_result = sarimax_model.fit(disp=False)

        # Work with plain arrays from here on. The forecast's index is not
        # guaranteed to carry the same labels as test_data (it can fall back to
        # integer positions), and label-aligned subtraction would then produce
        # NaNs and a longer result instead of per-row residuals.
        sarimax_pred = np.asarray(
            sarimax_result.get_forecast(steps=len(test_data)).predicted_mean
        )
        actual = test_data['Quantity Sold'].to_numpy()
        residuals = actual - sarimax_pred

        # NOTE(review): the residual model below is fitted and evaluated on the
        # same hold-out rows, so the reported metrics are in-sample for the
        # XGBoost stage. Kept as-is; confirm whether that is intended.
        xgboost_features = test_data[['Revenue']]
        scaler = StandardScaler()
        X_transformed = scaler.fit_transform(xgboost_features)

        xgb_model = XGBRegressor(
            n_estimators=25,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        xgb_model.fit(X_transformed, residuals)

        # Final prediction = SARIMAX forecast + predicted residual.
        xgb_residual_pred = xgb_model.predict(X_transformed)
        combined_pred = sarimax_pred + xgb_residual_pred

        combined_mse = mean_squared_error(actual, combined_pred)
        combined_rmse = math.sqrt(combined_mse)
        combined_mae = mean_absolute_error(actual, combined_pred)
        combined_r2 = r2_score(actual, combined_pred)

        # Plot both series against the same positional x-axis (a date-indexed
        # Series next to a bare array would end up on unrelated x ranges).
        # The figure is closed in `finally` so failures do not leak it.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(actual, label="Actual", alpha=0.7)
            ax.plot(combined_pred, label="SARIMAX + XGBoost Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (SARIMAX + XGBoost)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        # Metrics travel in response headers because the body is the image.
        headers = {
            "SARIMAX_XGBoost_MSE": str(combined_mse),
            "SARIMAX_XGBoost_RMSE": str(combined_rmse),
            "SARIMAX_XGBoost_MAE": str(combined_mae),
            "SARIMAX_XGBoost_R2": str(combined_r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-randomforest-xgboost")
async def train_randomforest_xgboost(files: List[UploadFile] = File(...)):
    """Fit a Random Forest plus an XGBoost residual corrector on weekly sales.

    The upload must contain ``restaurant_sales_linked.csv`` (columns read here:
    ``Date``, ``Menu_ID``, ``Quantity Sold``, ``Revenue``). A Random Forest is
    trained on an 80% split; XGBoost is then trained on the forest's residuals
    on that same training split, and the sum of both predictions is evaluated
    on the 20% hold-out split.

    Args:
        files: Uploaded CSV files, looked up by their filename.

    Returns:
        A PNG ``StreamingResponse`` showing actual vs. combined prediction, with
        MSE/RMSE/MAE/R2 in the ``RF_XGBoost_*`` response headers. Any failure
        is reported as a JSON body ``{"error": ...}`` with status 400.
    """
    try:
        # Read the uploaded CSV files into DataFrames, keyed by filename.
        # "utf-8-sig" also strips a leading BOM, which would otherwise corrupt
        # the first column name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8-sig")))

        # Fail with a readable message instead of a bare KeyError string.
        if 'restaurant_sales_linked.csv' not in data_frames:
            raise ValueError("Missing required file(s): restaurant_sales_linked.csv")
        sales_data = data_frames['restaurant_sales_linked.csv']

        # Parse 'Date' and bucket rows by ISO week number. The week number does
        # not carry the year, so the same week of different years is pooled.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        features = weekly_sales[['Week', 'Menu_ID', 'Revenue']]
        target = weekly_sales['Quantity Sold']

        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Scale numeric columns, one-hot encode the menu item.
        numerical_features = ['Week', 'Revenue']
        categorical_features = ['Menu_ID']
        column_transformer = make_column_transformer(
            (StandardScaler(), numerical_features),
            (OneHotEncoder(handle_unknown="ignore"), categorical_features),
            remainder="passthrough",
        )
        X_train_transformed = column_transformer.fit_transform(X_train)
        X_test_transformed = column_transformer.transform(X_test)

        # Stage 1: Random Forest on the training split.
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train_transformed, y_train)
        rf_pred = rf_model.predict(X_test_transformed)

        # Stage 2: XGBoost learns what the forest got wrong. The residuals must
        # come from the TRAINING split: fitting on hold-out residuals would let
        # the corrector see the very labels it is scored against.
        train_residuals = y_train - rf_model.predict(X_train_transformed)
        xgb_model = XGBRegressor(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        xgb_model.fit(X_train_transformed, train_residuals)

        # Final prediction = forest prediction + predicted residual.
        xgb_residual_pred = xgb_model.predict(X_test_transformed)
        combined_pred = rf_pred + xgb_residual_pred

        mse = mean_squared_error(y_test, combined_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, combined_pred)
        r2 = r2_score(y_test, combined_pred)

        # Render the plot into memory. The figure is closed in `finally` so a
        # failure while plotting/saving does not leave it registered in pyplot.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(y_test.values, label="Actual", alpha=0.7)
            ax.plot(combined_pred, label="Random Forest + XGBoost Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (Random Forest + XGBoost)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        # Metrics travel in response headers because the body is the image.
        headers = {
            "RF_XGBoost_MSE": str(mse),
            "RF_XGBoost_RMSE": str(rmse),
            "RF_XGBoost_MAE": str(mae),
            "RF_XGBoost_R2": str(r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-randomforest")
async def train_randomforest(files: List[UploadFile] = File(...)):
    """Fit a Random Forest regressor on weekly per-item sales and plot its hold-out predictions.

    The upload must contain ``restaurant_sales_linked.csv`` (columns read here:
    ``Date``, ``Menu_ID``, ``Quantity Sold``, ``Revenue``) and
    ``restaurant_menu_final.csv`` (columns read here: ``Menu_ID``, ``Price``).
    Other files (for example recipe or inventory CSVs) may be uploaded as well;
    they are parsed but do not influence the model.

    Args:
        files: Uploaded CSV files, looked up by their filename.

    Returns:
        A PNG ``StreamingResponse`` showing actual vs. predicted quantities for
        the 20% hold-out split, with MSE/RMSE/MAE/R2 in the response headers
        (both ``RandomForest_*`` and the legacy ``RF_XGBoost_*`` names). Any
        failure is reported as a JSON body ``{"error": ...}`` with status 400.
    """
    try:
        # Read the uploaded CSV files into DataFrames, keyed by filename.
        # "utf-8-sig" also strips a leading BOM, which would otherwise corrupt
        # the first column name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8-sig")))

        # Fail with a readable message instead of a bare KeyError string.
        required = ('restaurant_sales_linked.csv', 'restaurant_menu_final.csv')
        missing = [name for name in required if name not in data_frames]
        if missing:
            raise ValueError(f"Missing required file(s): {', '.join(missing)}")

        sales_data = data_frames['restaurant_sales_linked.csv']
        menu_data = data_frames['restaurant_menu_final.csv']

        # Parse 'Date' and bucket rows by ISO week number. The week number does
        # not carry the year, so the same week of different years is pooled.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        # Attach menu details (Price) to every weekly row.
        merged_data = pd.merge(weekly_sales, menu_data, on='Menu_ID', how='left')

        # Deliberately no join with per-ingredient requirements: joining on
        # 'Week' alone repeats every (Week, Menu_ID) row once per ingredient,
        # and since no ingredient column is a feature, the random split would
        # put identical rows in both train and test and inflate the metrics.
        features = merged_data[['Week', 'Menu_ID', 'Price', 'Revenue']]
        target = merged_data['Quantity Sold']

        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Scale numeric columns, one-hot encode the menu item.
        numerical_features = ['Week', 'Price', 'Revenue']
        categorical_features = ['Menu_ID']
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ]
        )

        # Preprocessing and model in one pipeline so the transformer is fitted
        # on the training split only.
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Render the plot into memory. The figure is closed in `finally` so a
        # failure while plotting/saving does not leave it registered in pyplot.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(y_test.values, label="Actual", alpha=0.7)
            ax.plot(y_pred, label="Random Forest", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (Random Forest)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        # The RF_XGBoost_* names do not match this endpoint (no XGBoost is
        # involved) but are kept so existing clients keep working; the
        # RandomForest_* headers carry the same values under accurate names.
        headers = {
            "RF_XGBoost_MSE": str(mse),
            "RF_XGBoost_RMSE": str(rmse),
            "RF_XGBoost_MAE": str(mae),
            "RF_XGBoost_R2": str(r2),
            "RandomForest_MSE": str(mse),
            "RandomForest_RMSE": str(rmse),
            "RandomForest_MAE": str(mae),
            "RandomForest_R2": str(r2),
        }
        return StreamingResponse(buf, media_type="image/png", headers=headers)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/train-arima")
async def train_arima(file: UploadFile = File(...)):
    """Fit one ARIMA(5,1,0) model per menu item and plot the hold-out forecasts.

    The uploaded CSV must provide the columns ``Date``, ``Menu_ID`` and
    ``Quantity Sold``. Quantities are summed per (Date, Menu_ID), pivoted to
    one column per menu item (missing combinations become 0), and split
    chronologically 80/20. Each column is modelled independently.

    Args:
        file: The uploaded sales CSV.

    Returns:
        A PNG ``StreamingResponse`` showing actual vs. forecast values (all
        menu items flattened row by row), with MSE/RMSE/MAE/R2 over all items
        in the ``ARIMA_*`` response headers. Any failure is reported as a JSON
        body ``{"error": ...}`` with status 400.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 as well, but also strips a leading BOM
        # so the first column name is not silently prefixed with "\ufeff".
        content = await file.read()
        sales_data = pd.read_csv(io.StringIO(content.decode("utf-8-sig")))

        # Parse dates BEFORE grouping: groupby sorts its keys, and sorting raw
        # date strings is only chronological for ISO-formatted dates.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])

        # One row per date, one column per menu item.
        arima_data = (
            sales_data.groupby(['Date', 'Menu_ID'])['Quantity Sold']
            .sum()
            .unstack(fill_value=0)
        )
        arima_data.index = pd.to_datetime(arima_data.index)

        # Chronological 80/20 split.
        split_at = int(len(arima_data) * 0.8)
        arima_train = arima_data.iloc[:split_at, :]
        arima_test = arima_data.iloc[split_at:, :]
        if arima_train.empty or arima_test.empty:
            raise ValueError("Not enough distinct dates to build a train/test split")

        # Forecast every menu item over the test horizon. Forecasts are kept as
        # plain arrays: their index is not guaranteed to carry the test dates,
        # and building a DataFrame from label-indexed Series would reindex them
        # and turn every non-matching label into NaN.
        horizon = len(arima_test)
        arima_predictions = {}
        for menu_id in arima_data.columns:
            arima_fitted = ARIMA(arima_train[menu_id], order=(5, 1, 0)).fit()
            arima_predictions[menu_id] = np.asarray(arima_fitted.forecast(steps=horizon))

        # Same shape, column order and index as arima_test.
        arima_predictions_df = pd.DataFrame(arima_predictions, index=arima_test.index)

        # Flatten once and reuse for every metric and for the plot.
        actual_flat = arima_test.values.flatten()
        predicted_flat = arima_predictions_df.values.flatten()
        arima_mse = mean_squared_error(actual_flat, predicted_flat)
        arima_metrics = {
            "Mean Squared Error (MSE)": arima_mse,
            "Root Mean Squared Error (RMSE)": math.sqrt(arima_mse),
            "Mean Absolute Error (MAE)": mean_absolute_error(actual_flat, predicted_flat),
            "R-squared Score (R²)": r2_score(actual_flat, predicted_flat),
        }

        # Render the plot into memory. The figure is closed in `finally` so a
        # failure while plotting/saving does not leave it registered in pyplot.
        buf = io.BytesIO()
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(actual_flat, label="Actual", alpha=0.7)
            ax.plot(predicted_flat, label="Predicted", alpha=0.7)
            ax.legend()
            ax.set_title("Actual vs. Predicted (ARIMA)")
            ax.set_xlabel("Index")
            ax.set_ylabel("Quantity Sold")
            fig.savefig(buf, format="png")
        finally:
            plt.close(fig)
        buf.seek(0)

        # Metrics travel in response headers because the body is the image.
        return StreamingResponse(
            buf,
            media_type="image/png",
            headers={
                "ARIMA_MSE": str(arima_metrics["Mean Squared Error (MSE)"]),
                "ARIMA_RMSE": str(arima_metrics["Root Mean Squared Error (RMSE)"]),
                "ARIMA_MAE": str(arima_metrics["Mean Absolute Error (MAE)"]),
                "ARIMA_R2": str(arima_metrics["R-squared Score (R²)"]),
            },
        )
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)
@app.post("/predict-sales")
async def predict_sales(files: List[UploadFile] = File(...)):
    """Predict per-item quantities for the latest week and derive ingredient needs.

    The upload must contain ``restaurant_sales_linked.csv`` (``Date``,
    ``Menu_ID``, ``Quantity Sold``, ``Revenue``), ``restaurant_menu_final.csv``
    (``Menu_ID``, ``Menu_Item``), ``restaurant_recipe_final.csv`` (``Menu_ID``,
    ``Ingredient_ID``, ``Quantity_Per_Unit``) and
    ``restaurant_ingredients_final.csv`` (``Ingredient_ID``,
    ``Ingredient_Name``).

    Sales are aggregated per (ISO week number, Menu_ID). Rows of the highest
    week number are held out as the prediction batch; all earlier rows train a
    Random Forest plus an XGBoost model on the forest's training residuals.
    The eight items with the highest combined prediction are expanded into
    their ingredients.

    Args:
        files: Uploaded CSV files, looked up by their filename.

    Returns:
        A CSV attachment (``predicted_sales_ingredients.csv``) with the columns
        ``Menu_Item``, ``Predicted Quantity``, ``Ingredient_Name`` and
        ``Total Ingredient Quantity``. Any failure is reported as a JSON body
        ``{"error": ...}`` with status 400.
    """
    try:
        # Read the uploaded CSV files into DataFrames, keyed by filename.
        # "utf-8-sig" also strips a leading BOM, which would otherwise corrupt
        # the first column name.
        data_frames = {}
        for file in files:
            content = await file.read()
            data_frames[file.filename] = pd.read_csv(io.StringIO(content.decode("utf-8-sig")))

        # Fail with a readable message instead of a bare KeyError string.
        required = (
            'restaurant_sales_linked.csv',
            'restaurant_menu_final.csv',
            'restaurant_recipe_final.csv',
            'restaurant_ingredients_final.csv',
        )
        missing = [name for name in required if name not in data_frames]
        if missing:
            raise ValueError(f"Missing required file(s): {', '.join(missing)}")

        sales_data = data_frames['restaurant_sales_linked.csv']
        menu_data = data_frames['restaurant_menu_final.csv']
        recipe_data = data_frames['restaurant_recipe_final.csv']
        ingredients_data = data_frames['restaurant_ingredients_final.csv']

        # Aggregate per ISO week number and menu item. groupby sorts its keys,
        # so rows come out ordered by Week, then Menu_ID.
        sales_data['Date'] = pd.to_datetime(sales_data['Date'])
        sales_data['Week'] = sales_data['Date'].dt.isocalendar().week
        weekly_sales = (
            sales_data.groupby(['Week', 'Menu_ID'])
            .agg({'Quantity Sold': 'sum', 'Revenue': 'sum'})
            .reset_index()
        )

        features = weekly_sales[['Week', 'Menu_ID', 'Revenue']]
        target = weekly_sales['Quantity Sold']

        # Scale numeric columns, one-hot encode the menu item.
        numerical_features = ['Week', 'Revenue']
        categorical_features = ['Menu_ID']
        column_transformer = make_column_transformer(
            (StandardScaler(), numerical_features),
            (OneHotEncoder(handle_unknown="ignore"), categorical_features),
            remainder="passthrough",
        )
        X_transformed = column_transformer.fit_transform(features)

        # The prediction batch is exactly the rows of the last week, which sit
        # at the tail because of the sort order above. Counting those rows
        # (rather than all distinct Menu_IDs) keeps the batch correct when some
        # items have no sales in the last week.
        last_week = weekly_sales['Week'].max()
        n_future = int((weekly_sales['Week'] == last_week).sum())
        if n_future >= len(weekly_sales):
            raise ValueError("Need sales from at least two distinct weeks to train and predict")

        X_train = X_transformed[:-n_future]
        y_train = target.iloc[:-n_future]
        X_future = X_transformed[-n_future:]
        # Menu_IDs taken from the very rows being predicted, so each prediction
        # is labelled with the item it was computed for.
        future_menu_ids = weekly_sales['Menu_ID'].iloc[-n_future:].to_numpy()

        # Stage 1: Random Forest on the historical rows.
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred = rf_model.predict(X_future)

        # Stage 2: XGBoost on the forest's training residuals.
        residuals = y_train - rf_model.predict(X_train)
        xgb_model = XGBRegressor(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            tree_method="hist",
            eval_metric="rmse",
        )
        xgb_model.fit(X_train, residuals)
        xgb_residual_pred = xgb_model.predict(X_future)

        # Final prediction = forest prediction + predicted residual.
        combined_pred = rf_pred + xgb_residual_pred

        # Keep the eight items with the highest predicted quantity.
        predicted_sales = pd.DataFrame({
            'Menu_ID': future_menu_ids,
            'Predicted Quantity': combined_pred,
        }).sort_values(by='Predicted Quantity', ascending=False).head(8)

        # Expand each item into its ingredients. Inner joins drop items
        # without menu, recipe or ingredient records.
        predicted_sales_details = predicted_sales.merge(menu_data, on='Menu_ID', how='inner')
        predicted_sales_details = predicted_sales_details.merge(recipe_data, on='Menu_ID', how='inner')
        predicted_sales_details = predicted_sales_details.merge(
            ingredients_data, on='Ingredient_ID', how='inner'
        )

        predicted_sales_details['Total Ingredient Quantity'] = (
            predicted_sales_details['Quantity_Per_Unit'] * predicted_sales_details['Predicted Quantity']
        )

        final_result = predicted_sales_details[[
            'Menu_Item', 'Predicted Quantity', 'Ingredient_Name', 'Total Ingredient Quantity'
        ]]

        # Serialise to CSV in memory and return it as a download.
        csv_text = final_result.to_csv(index=False)
        return StreamingResponse(
            io.BytesIO(csv_text.encode("utf-8")),
            media_type="text/csv",
            headers={"Content-Disposition": "attachment; filename=predicted_sales_ingredients.csv"},
        )
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)