Spaces:

DTanzillo
/

one-haut-encoded

Runtime error

App Files Files Community

one-haut-encoded / scripts /train_baseline.py

DTanzillo

Deploy FastAPI backend with live model inference

f35e50b about 2 months ago

raw

history blame contribute delete

2.92 kB

	# AI-assisted (Claude Code, claude.ai) -- https://claude.ai
	"""Popularity baseline recommender — the floor every other model must beat."""

	import pandas as pd
	import numpy as np
	import pickle
	from pathlib import Path


	class PopularityRecommender:
	    """Recommend the most purchased articles, globally or per department.

	    All three attributes are ``None`` until :meth:`fit` has been called:

	    * ``global_ranking`` -- article ids, most purchased first.
	    * ``dept_ranking`` -- department name -> article ids of that
	      department, most purchased first.
	    * ``user_top_dept`` -- customer id -> the department that customer
	      bought from most often.
	    """

	    def __init__(self):
	        self.global_ranking = None
	        self.dept_ranking = None
	        self.user_top_dept = None

	    def fit(self, train: pd.DataFrame, articles: pd.DataFrame) -> None:
	        """Build the popularity rankings from purchase history.

	        Args:
	            train: One row per purchase; must contain ``customer_id`` and
	                ``article_id`` columns.
	            articles: Article metadata; must contain ``article_id`` and
	                ``department_name`` columns.
	        """
	        # Global popularity. A stable sort keeps equally popular articles in
	        # the (sorted) order produced by groupby, so ties are resolved the
	        # same way on every run instead of depending on the sort backend.
	        self.global_ranking = (
	            train.groupby("article_id")
	            .size()
	            .sort_values(ascending=False, kind="stable")
	            .index.tolist()
	        )

	        # Attach each purchase's department. The left join keeps every
	        # purchase; rows without a matching article get a missing department
	        # and are left out by the groupbys below (pandas drops missing keys
	        # by default).
	        train_with_dept = train.merge(
	            articles[["article_id", "department_name"]],
	            on="article_id",
	            how="left",
	        )

	        # Per-department popularity: count purchases per (department,
	        # article), order each department by descending count, then collect
	        # the article ids of every department into a list.
	        dept_counts = (
	            train_with_dept.groupby(["department_name", "article_id"])
	            .size()
	            .reset_index(name="count")
	            .sort_values(["department_name", "count"], ascending=[True, False])
	        )
	        self.dept_ranking = (
	            dept_counts.groupby("department_name")["article_id"]
	            .apply(list)
	            .to_dict()
	        )

	        # Each user's most-purchased department: idxmax picks, per customer,
	        # the row label of the (customer, department) pair with the highest
	        # purchase count.
	        user_dept = (
	            train_with_dept.groupby(["customer_id", "department_name"])
	            .size()
	            .reset_index(name="count")
	        )
	        top_rows = user_dept.groupby("customer_id")["count"].idxmax()
	        self.user_top_dept = (
	            user_dept.loc[top_rows]
	            .set_index("customer_id")["department_name"]
	            .to_dict()
	        )

	        print(f"Fitted on {len(train):,} transactions")
	        print(f"Global top 5: {self.global_ranking[:5]}")

	    def recommend(self, customer_id: str, k: int = 12, mode: str = "global"):
	        """Return up to ``k`` recommended article ids for ``customer_id``.

	        Args:
	            customer_id: Customer to recommend for.
	            k: Maximum number of article ids to return; must not be negative.
	            mode: ``"department"`` ranks within the customer's most-purchased
	                department, for customers seen during :meth:`fit`. Any other
	                value, or an unseen customer, uses the global ranking.

	        Returns:
	            A new list of article ids, best first. In department mode a
	            department with fewer than ``k`` articles is topped up with the
	            globally most popular articles not already in the list, so the
	            result only falls short of ``k`` when fewer than ``k`` articles
	            are known overall.

	        Raises:
	            ValueError: If ``k`` is negative.
	        """
	        if k < 0:
	            # A negative k would otherwise slice from the end of the ranking
	            # and silently return almost every article.
	            raise ValueError(f"k must be non-negative, got {k}")

	        if mode == "department" and customer_id in self.user_top_dept:
	            dept = self.user_top_dept[customer_id]
	            dept_items = self.dept_ranking.get(dept)
	            if dept_items is not None:
	                picks = dept_items[:k]  # slice copy: fitted state is not mutated
	                missing = k - len(picks)
	                if missing > 0:
	                    # Small department: backfill from the global ranking,
	                    # skipping what is already recommended.
	                    chosen = set(picks)
	                    for article in self.global_ranking:
	                        if article not in chosen:
	                            picks.append(article)
	                            missing -= 1
	                            if missing == 0:
	                                break
	                return picks

	        return self.global_ranking[:k]

	    def save(self, path: str) -> None:
	        """Pickle this recommender to ``path``.

	        NOTE(review): pickle stores the class by module and name. When this
	        file is executed as a script the class is recorded under
	        ``__main__``, so code that loads the file from another module must
	        be able to resolve that name -- confirm against the loading code.
	        """
	        with open(path, "wb") as f:
	            pickle.dump(self, f)

	    @classmethod
	    def load(cls, path: str):
	        """Return the object unpickled from ``path``.

	        SECURITY: ``pickle.load`` can execute arbitrary code contained in
	        the file. Only load model files produced by a trusted training run.
	        """
	        with open(path, "rb") as f:
	            return pickle.load(f)


	def train_baseline(
	    data_dir: str = "data/processed",
	    model_path: str = "models/baseline/popularity.pkl",
	):
	    """Fit the popularity baseline on the processed data and pickle it.

	    Args:
	        data_dir: Directory containing ``train.csv`` and
	            ``articles_subset.csv``.
	        model_path: File the fitted model is pickled to. Missing parent
	            directories are created. Defaults to the location this script
	            has always written to.

	    Returns:
	        The fitted ``PopularityRecommender``.
	    """
	    source_dir = Path(data_dir)

	    # Read article_id as text in both files so the ids are not reinterpreted
	    # as numbers and the two frames join on identical string keys.
	    train = pd.read_csv(source_dir / "train.csv", dtype={"article_id": str})
	    articles = pd.read_csv(
	        source_dir / "articles_subset.csv", dtype={"article_id": str}
	    )

	    model = PopularityRecommender()
	    model.fit(train, articles)

	    out_path = Path(model_path)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    model.save(str(out_path))
	    print(f"Saved to {out_path}")

	    return model


	if __name__ == "__main__":
	    # Script entry point: train and save the baseline using the default
	    # arguments of train_baseline(). Importing this module does not train.
	    train_baseline()