"""Build and save a placeholder ``best_model.pkl`` pipeline.

The script fits a preprocessing + ``DummyRegressor`` pipeline on a single
mock row and serializes it with joblib.  The regressor uses the "mean"
strategy and is fitted on the single target value 0.0, so the saved
pipeline is a stand-in with the expected input columns rather than a
trained model.
"""

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns as per your app/notebook
columns = [
    "Product_Id",
    "Product_Weight",
    "Product_Sugar_Content",
    "Product_Allocated_Area",
    "Product_Type",
    "Product_MRP",
    "Store_Id",
    "Store_Establishment_Year",
    "Store_Size",
    "Store_Location_City_Type",
    "Store_Type",
]

# Minimal 1-row frame to fit preprocessing shapes
mock = pd.DataFrame(
    [
        {
            "Product_Id": "PR1234",
            "Product_Weight": 1.0,
            "Product_Sugar_Content": "regular",
            "Product_Allocated_Area": 0.02,
            "Product_Type": "dairy",
            "Product_MRP": 50.0,
            "Store_Id": "S001",
            "Store_Establishment_Year": 2010,
            "Store_Size": "medium",
            "Store_Location_City_Type": "Tier 2",
            "Store_Type": "Supermarket Type 1",
        }
    ],
    columns=columns,
)


def feature_engineering(df: pd.DataFrame, reference_year: int = 2025) -> pd.DataFrame:
    """Return a copy of *df* with the derived feature columns added.

    Args:
        df: Frame containing at least ``Product_Id`` and
            ``Store_Establishment_Year``.
        reference_year: Year that ``Store_Establishment_Year`` is subtracted
            from to obtain ``Store_Age``.  Defaults to 2025, the value that
            was previously hard-coded.

    Returns:
        A new frame (the input is not modified) with two extra columns:
        ``Product_Prefix`` (first two characters of ``Product_Id`` as a
        string) and ``Store_Age``.
    """
    d = df.copy()
    d["Product_Prefix"] = d["Product_Id"].astype(str).str[:2]
    d["Store_Age"] = reference_year - d["Store_Establishment_Year"]
    return d


mock = feature_engineering(mock)

num_cols = [
    "Product_Weight",
    "Product_Allocated_Area",
    "Product_MRP",
    "Store_Establishment_Year",
    "Store_Age",
]
cat_cols = [
    "Product_Id",
    "Product_Sugar_Content",
    "Product_Type",
    "Store_Id",
    "Store_Size",
    "Store_Location_City_Type",
    "Store_Type",
    "Product_Prefix",
]

numeric = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        # with_mean=False: scale only, no centering.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)
categorical = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Sparse output is OneHotEncoder's default.  The explicit keyword is
        # omitted because it was renamed from `sparse` to `sparse_output` in
        # scikit-learn 1.2 and the old name was removed in 1.4; relying on
        # the default works on both sides of that change.
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

pre = ColumnTransformer(
    [("num", numeric, num_cols), ("cat", categorical, cat_cols)],
    remainder="drop",
    sparse_threshold=0.3,
)

model = DummyRegressor(strategy="mean")
pipe = Pipeline([("prep", pre), ("model", model)])

pipe.fit(mock, np.array([0.0]))  # dummy target
joblib.dump(pipe, "best_model.pkl")
print("Saved best_model.pkl (DummyRegressor).")