# one-haut-encoded/scripts/train_baseline.py
# Author: DTanzillo
# Deploy FastAPI backend with live model inference
# Commit: f35e50b
# AI-assisted (Claude Code, claude.ai) -- https://claude.ai
"""Popularity baseline recommender — the floor every other model must beat."""
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
class PopularityRecommender:
    """Recommend the top-N most purchased items, globally or per department.

    The three ranking attributes are ``None`` until :meth:`fit` is called.
    """

    def __init__(self):
        # Article ids ordered from most to least purchased over all of train.
        self.global_ranking = None
        # department_name -> article ids ordered by purchase count within it.
        self.dept_ranking = None
        # customer_id -> department_name the customer purchased from most.
        self.user_top_dept = None

    def fit(self, train: pd.DataFrame, articles: pd.DataFrame) -> None:
        """Build the popularity rankings from a transaction log.

        Args:
            train: One row per purchase; must contain ``customer_id`` and
                ``article_id`` columns.
            articles: Article metadata; must contain ``article_id`` and
                ``department_name`` columns.
        """
        # Global popularity
        self.global_ranking = (
            train.groupby("article_id").size()
            .sort_values(ascending=False)
            .index.tolist()
        )

        # Per-department popularity.
        # Keep one metadata row per article: a duplicated article_id would
        # fan out the left merge and inflate every count computed below.
        dept_lookup = articles[["article_id", "department_name"]].drop_duplicates(
            subset="article_id"
        )
        train_with_dept = train.merge(dept_lookup, on="article_id", how="left")
        self.dept_ranking = (
            train_with_dept.groupby(["department_name", "article_id"])
            .size()
            .reset_index(name="count")
            .sort_values(["department_name", "count"], ascending=[True, False])
            .groupby("department_name")["article_id"]
            .apply(list)
            .to_dict()
        )

        # Each user's most-purchased department
        user_dept = (
            train_with_dept.groupby(["customer_id", "department_name"])
            .size()
            .reset_index(name="count")
        )
        # idxmax yields the row label of each customer's largest count; .loc
        # then pulls exactly one (customer, department) row per customer.
        self.user_top_dept = (
            user_dept.loc[user_dept.groupby("customer_id")["count"].idxmax()]
            .set_index("customer_id")["department_name"]
            .to_dict()
        )

        print(f"Fitted on {len(train):,} transactions")
        print(f"Global top 5: {self.global_ranking[:5]}")

    def recommend(self, customer_id: str, k: int = 12, mode: str = "global") -> list:
        """Return up to ``k`` recommended article ids for a customer.

        Args:
            customer_id: Customer to recommend for.
            k: Maximum number of articles to return; ``k <= 0`` yields ``[]``.
            mode: ``"department"`` ranks within the customer's most-purchased
                department and pads with globally popular articles if that
                department has fewer than ``k`` items. Any other value, or a
                customer with no known department, uses the global ranking.

        Returns:
            A new list of article ids, best first.

        Raises:
            RuntimeError: If the model has not been fitted (or loaded) yet.
        """
        if self.global_ranking is None:
            raise RuntimeError(
                "PopularityRecommender.recommend() called before fit()"
            )
        # A negative k would slice from the end and return almost everything.
        if k <= 0:
            return []

        if mode != "department" or customer_id not in self.user_top_dept:
            return self.global_ranking[:k]

        dept = self.user_top_dept[customer_id]
        picks = list(self.dept_ranking.get(dept, [])[:k])
        if len(picks) < k:
            # Small department: top up with global best-sellers not already
            # chosen so the caller still receives k items when possible.
            seen = set(picks)
            for article_id in self.global_ranking:
                if article_id in seen:
                    continue
                picks.append(article_id)
                if len(picks) == k:
                    break
        return picks

    def save(self, path: str) -> None:
        """Pickle this recommender to ``path``."""
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path: str):
        """Unpickle and return the object stored at ``path``.

        SECURITY: ``pickle.load`` can execute arbitrary code while
        deserialising. Only load files produced by :meth:`save` from a
        trusted location; never load user-supplied or downloaded pickles.
        """
        with open(path, "rb") as f:
            return pickle.load(f)
def train_baseline(
    data_dir: str = "data/processed",
    out_path: str = "models/baseline/popularity.pkl",
):
    """Fit the popularity baseline on the processed data and pickle it.

    Args:
        data_dir: Directory containing ``train.csv`` and
            ``articles_subset.csv``.
        out_path: Where to write the pickled model; parent directories are
            created if missing. Defaults to the previously hard-coded path.

    Returns:
        The fitted ``PopularityRecommender``.
    """
    data_dir = Path(data_dir)
    # article_id is read as a string in both files so the ids are not parsed
    # as integers and the merge keys have matching dtypes.
    train = pd.read_csv(data_dir / "train.csv", dtype={"article_id": str})
    articles = pd.read_csv(data_dir / "articles_subset.csv", dtype={"article_id": str})

    model = PopularityRecommender()
    model.fit(train, articles)

    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(out_path))
    print(f"Saved to {out_path}")
    return model
# Script entry point: when run directly (not imported), train and save the
# baseline using train_baseline()'s default arguments.
if __name__ == "__main__":
    train_baseline()