import joblib, pandas as pd, numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor

# Raw input schema, in the order the app/notebook supplies the columns.
columns = [
    # product attributes
    "Product_Id",
    "Product_Weight",
    "Product_Sugar_Content",
    "Product_Allocated_Area",
    "Product_Type",
    "Product_MRP",
    # store attributes
    "Store_Id",
    "Store_Establishment_Year",
    "Store_Size",
    "Store_Location_City_Type",
    "Store_Type",
]

# A single placeholder record: just enough data for the preprocessing steps
# to be fitted so that they learn the expected columns and output shape.
mock = pd.DataFrame(
    [
        {
            "Product_Id": "PR1234",
            "Product_Weight": 1.0,
            "Product_Sugar_Content": "regular",
            "Product_Allocated_Area": 0.02,
            "Product_Type": "dairy",
            "Product_MRP": 50.0,
            "Store_Id": "S001",
            "Store_Establishment_Year": 2010,
            "Store_Size": "medium",
            "Store_Location_City_Type": "Tier 2",
            "Store_Type": "Supermarket Type 1",
        }
    ],
    columns=columns,
)

def feature_engineering(
    df: pd.DataFrame, *, reference_year: int = 2025
) -> pd.DataFrame:
    """Return a copy of *df* with the derived model features added.

    Two columns are added (or overwritten if already present):

    * ``Product_Prefix`` -- the first two characters of ``Product_Id``
      after casting it to ``str``.
    * ``Store_Age`` -- ``reference_year - Store_Establishment_Year``.

    Args:
        df: Frame containing at least the ``Product_Id`` and
            ``Store_Establishment_Year`` columns. It is not modified.
        reference_year: Year against which store age is measured. Defaults
            to 2025, the value that was previously hard-coded, so existing
            callers get identical results; pass another year to keep the
            feature consistent with a model fitted at a different time.

    Returns:
        A new DataFrame with all original columns plus the derived ones.

    Raises:
        KeyError: If ``Product_Id`` or ``Store_Establishment_Year`` is
            missing from *df*.
    """
    # Work on a copy so the caller's frame is never mutated.
    enriched = df.copy()
    enriched["Product_Prefix"] = enriched["Product_Id"].astype(str).str[:2]
    enriched["Store_Age"] = reference_year - enriched["Store_Establishment_Year"]
    return enriched

# Add the derived columns (Product_Prefix, Store_Age) so the mock row carries
# every column that the preprocessor selects below.
mock = feature_engineering(mock)

num_cols = [
    "Product_Weight",
    "Product_Allocated_Area",
    "Product_MRP",
    "Store_Establishment_Year",
    "Store_Age",
]
cat_cols = [
    "Product_Id",
    "Product_Sugar_Content",
    "Product_Type",
    "Store_Id",
    "Store_Size",
    "Store_Location_City_Type",
    "Store_Type",
    "Product_Prefix",
]

# Numeric branch: fill gaps with the median, then scale without centering
# (with_mean=False).
numeric = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" matters here: the encoder is fitted on a
# single row, so nearly every category seen at predict time is "unknown".
#
# The encoder's output format is deliberately left at its default (sparse).
# Do not pass `sparse=True`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, where it raises TypeError. Relying on
# the default works on both old and new releases.
categorical = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Columns not listed in num_cols / cat_cols are dropped.
pre = ColumnTransformer(
    [("num", numeric, num_cols), ("cat", categorical, cat_cols)],
    remainder="drop",
    sparse_threshold=0.3,
)

# Placeholder estimator: always predicts the mean of the training target.
model = DummyRegressor(strategy="mean")
pipe = Pipeline([("prep", pre), ("model", model)])

pipe.fit(mock, np.array([0.0]))  # dummy target
joblib.dump(pipe, "best_model.pkl")
print("Saved best_model.pkl (DummyRegressor).")