# Source: Hugging Face upload by Iris314 ("Upload 8 files", commit b9a4372, verified)
import os
import joblib
import warnings
import numpy as np
import pandas as pd
from typing import List, Tuple, Sequence, Optional
from xgboost import XGBRanker
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from pandas.api.types import is_numeric_dtype
from .feature import FEATURE_COLS
from datetime import datetime
# ----------------------------- Helpers -----------------------------
def _pick_feature_cols(df: pd.DataFrame, drop_cols: Sequence[str]) -> List[str]:
"""
Pick numeric feature columns robustly, excluding drop_cols.
Uses pandas is_numeric_dtype to correctly include nullable ints/floats/bools.
"""
cols = []
for c in df.columns:
if c in drop_cols:
continue
if is_numeric_dtype(df[c]):
cols.append(c)
return cols
def _sort_and_pack_by_qid(
X: pd.DataFrame, y: pd.Series, qid: pd.Series, feature_cols: List[str]
) -> Tuple[pd.DataFrame, np.ndarray, List[int], np.ndarray]:
"""
Sort rows by qid so that group sizes match the sample order.
Returns:
X_sorted, y_sorted, groups, qid_sorted (aligned with X_sorted/y_sorted)
"""
packed = X.copy()
packed["_label"] = y.values
packed["_qid"] = qid.values
packed = packed.sort_values("_qid").reset_index(drop=True)
groups = packed.groupby("_qid").size().tolist()
X_sorted = packed[feature_cols].copy()
y_sorted = packed["_label"].astype(float).values
qid_sorted = packed["_qid"].values
return X_sorted, y_sorted, groups, qid_sorted
def _eval_mean_ndcg(
    model: XGBRanker,
    X_val: pd.DataFrame,
    y_val,  # np.ndarray or pd.Series
    qid_val,  # aligned with X_val/y_val
    ks: Sequence[int] = (5, 10),
) -> dict:
    """
    Average NDCG@k over validation queries for every k in *ks*.

    Queries with fewer than two rows are skipped (NDCG is not meaningful
    there). Accepts numpy arrays or pandas Series for labels and qids.
    """
    # Prefer the early-stopped best iteration when the installed xgboost
    # exposes best_iteration (xgboost>=2.0); otherwise plain predict.
    try:
        preds = model.predict(X_val, iteration_range=(0, model.best_iteration + 1))
    except Exception:
        preds = model.predict(X_val)

    labels = np.asarray(y_val)
    queries = np.asarray(qid_val)

    results = {}
    for k in ks:
        per_query = []
        for q in np.unique(queries):
            sel = queries == q
            if sel.sum() < 2:
                continue
            per_query.append(ndcg_score([labels[sel]], [preds[sel]], k=k))
        results[f"NDCG@{k}"] = float(np.mean(per_query)) if per_query else 0.0
    return results
# ----------------------------- Main Trainer -----------------------------
def train_model_ranker(
    user_id: str = "user_1",
    features_path: Optional[str] = None,
    save_model: bool = True,
    model_params: Optional[dict] = None,
    val_ratio: float = 0.2,
    random_state: int = 42,
    max_rows: Optional[int] = None,
):
    """
    Train an XGBoost Learning-to-Rank model (XGBRanker) on cold-start generated data.

    Expected input CSV (from cold_start.py):
      - qid: query id (one round of pantry sampling = one query)
      - relevance: graded relevance label (e.g., 3/2/1/0)
      - features: numeric columns produced by build_features (and any extra numeric signals)

    The function:
      1) Reads the CSV (optionally downsampled to max_rows)
      2) Restricts to the canonical FEATURE_COLS feature set
      3) Splits train/val by qid to avoid leakage
      4) Sorts each split by qid and builds group sizes aligned to sample order
      5) Trains XGBRanker and reports NDCG@5/10
      6) Appends metrics to user_data/<user_id>/training_log.txt and, when
         save_model is True, saves the model to user_data/<user_id>/ranker.pkl

    Returns:
        (model, metrics, feature_cols)

    Raises:
        FileNotFoundError: if the cold-start feature CSV is missing.
        ValueError: if the CSV lacks the 'qid' or 'relevance' column.
    """
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    os.makedirs(base_dir, exist_ok=True)

    # Resolve features path
    if features_path is None:
        features_path = os.path.join(base_dir, "user_features_rank.csv")
    if not os.path.exists(features_path):
        raise FileNotFoundError(
            f"[train_model_ranker] Cold-start features not found at: {features_path}\n"
            f"Please run cold_start_ranker(user_id='{user_id}') first."
        )

    # Load data (optionally downsample for quick iterations)
    df = pd.read_csv(features_path)
    if max_rows is not None and len(df) > max_rows:
        df = df.sample(max_rows, random_state=random_state).reset_index(drop=True)

    # Basic validation
    if "qid" not in df.columns or "relevance" not in df.columns:
        raise ValueError("Input CSV must contain 'qid' and 'relevance' columns.")

    # Coerce label/qid to numeric and fill NaNs (defensive; should not happen)
    df["qid"] = pd.to_numeric(df["qid"], errors="coerce").fillna(-1).astype(int)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(float)

    # Use the canonical feature set; columns absent from the CSV become zeros
    feature_cols = FEATURE_COLS.copy()
    df = df.reindex(columns=["qid", "relevance"] + feature_cols, fill_value=0)

    # Ensure numeric + finite values only (replace inf/nan with 0)
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Split by qid to avoid leakage across queries
    unique_qids = df["qid"].unique()
    if len(unique_qids) < 2:
        warnings.warn("Only one unique qid found — ranking training may be ineffective.")
        train_mask = np.ones(len(df), dtype=bool)
        val_mask = np.zeros(len(df), dtype=bool)
    else:
        train_qids, val_qids = train_test_split(
            unique_qids, test_size=val_ratio, random_state=random_state
        )
        train_mask = df["qid"].isin(train_qids)
        val_mask = df["qid"].isin(val_qids)

    # Split dataframes AFTER defining masks
    X_train_raw = df.loc[train_mask, feature_cols]
    y_train_raw = df.loc[train_mask, "relevance"]
    qid_train = df.loc[train_mask, "qid"]
    X_val_raw = df.loc[val_mask, feature_cols]
    y_val_raw = df.loc[val_mask, "relevance"]
    qid_val = df.loc[val_mask, "qid"]

    # Sort by qid and build group sizes aligned with sample order (CRITICAL for XGBRanker)
    X_train, y_train, group_train, _ = _sort_and_pack_by_qid(
        X_train_raw, y_train_raw, qid_train, feature_cols
    )
    X_val, y_val, group_val, qid_val_sorted = _sort_and_pack_by_qid(
        X_val_raw, y_val_raw, qid_val, feature_cols
    )
    print(f"[ranker] #Train groups: {len(group_train)} | #Val groups: {len(group_val)}")
    print(f"[ranker] Train rows: {len(X_train)} | Val rows: {len(X_val)} | #Features: {len(feature_cols)}")

    # Default model params (overridable via model_params)
    default_params = dict(
        objective="rank:ndcg",
        eval_metric="ndcg",
        n_estimators=400,
        learning_rate=0.08,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        tree_method="hist",
        reg_lambda=1.0,
        reg_alpha=0.0,
    )
    if model_params:
        default_params.update(model_params)
    model = XGBRanker(**default_params)

    # Fit model (XGBRanker requires group sizes for eval_set as well)
    fit_kwargs = dict(
        X=X_train,
        y=y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        verbose=False,
    )
    try:
        # Newer xgboost versions (some builds) support early_stopping_rounds on Ranker
        model.fit(early_stopping_rounds=50, **fit_kwargs)  # maximize=True is inferred by 'ndcg'
    except TypeError:
        # Fallback to callback API (older versions)
        try:
            from xgboost.callback import EarlyStopping
            model.fit(callbacks=[EarlyStopping(rounds=50, save_best=True, maximize=True)], **fit_kwargs)
        except Exception:
            # Last resort: train without early stopping
            model.fit(**fit_kwargs)

    # Evaluate mean NDCG@5/10 once (earlier revision ran this block twice)
    metrics = _eval_mean_ndcg(model, X_val, y_val, qid_val_sorted, ks=(5, 10))
    print("[ranker] Validation metrics:", " ".join(f"{k}={v:.4f}" for k, v in metrics.items()))

    # Append NDCG metrics to the per-user training log
    # (datetime is imported at module level)
    log_path = os.path.join(base_dir, "training_log.txt")
    with open(log_path, "a", encoding="utf-8") as f:
        ndcg5 = metrics.get("NDCG@5", 0.0)
        ndcg10 = metrics.get("NDCG@10", 0.0)
        f.write(f"{datetime.now().isoformat()} | NDCG@5={ndcg5:.4f}, NDCG@10={ndcg10:.4f}\n")
    print(f"[ranker] Logged metrics to {log_path}")

    # Save model only when requested (earlier revision also saved
    # unconditionally above this check, ignoring save_model=False)
    if save_model:
        model_path = os.path.join(base_dir, "ranker.pkl")
        joblib.dump(model, model_path)
        print(f"[ranker] Model saved to {model_path}")

    return model, metrics, feature_cols
if __name__ == "__main__":
    # Example run: train the ranker for the demo user with default settings.
    train_model_ranker(
        user_id="user_1",
        save_model=True,
        val_ratio=0.2,
        random_state=42,
        max_rows=None,  # or set an upper bound for quick iterations, e.g., 200_000
        model_params=None,  # override defaults if desired
    )