# recsys-ecommerce / src /features.py
# dscsdvdfsvs's picture
# fix: upload src folder with model classes
# 80843b0 verified
import pandas as pd
import numpy as np
import pickle
import logging
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz
# Configure logging at import time: INFO level, each record prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
# Module-level logger used by the feature builders below to report output shapes.
log = logging.getLogger(__name__)
def compute_recency_weights(df, halflife_days=180):
    """Return an exponential-decay recency weight for every row of *df*.

    The row with the largest ``timestamp`` receives weight 1.0, and the weight
    halves for every ``halflife_days`` of age. Age is computed as
    ``(max_timestamp - timestamp) / 86400``, i.e. timestamps are treated as
    numeric values expressed in seconds.

    Args:
        df: DataFrame with a numeric ``timestamp`` column.
        halflife_days: Age in days at which the weight drops to 0.5. Must be
            strictly positive.

    Returns:
        A Series named ``recency_weight`` that shares ``df``'s index.

    Raises:
        ValueError: If ``halflife_days`` is not strictly positive.
    """
    # A zero half-life gives an infinite decay rate (NaN for the newest row)
    # and a negative one makes weights grow with age; reject both up front.
    # ``not x > 0`` also rejects NaN.
    if not halflife_days > 0:
        raise ValueError(
            f"halflife_days must be strictly positive, got {halflife_days!r}"
        )

    now_ts = df["timestamp"].max()
    days_since = (now_ts - df["timestamp"]) / 86400.0
    # Decay rate chosen so that exp(-lam * halflife_days) == 0.5.
    lam = np.log(2) / halflife_days
    return np.exp(-lam * days_since).rename("recency_weight")
def build_user_features(df):
    """Aggregate per-user statistics from an interaction table.

    Args:
        df: Interaction DataFrame with ``user_idx``, ``item_idx``, ``rating``
            and ``timestamp`` columns. The input is not modified.

    Returns:
        A DataFrame with one row per ``user_idx`` and the columns
        ``n_interactions``, ``avg_rating``, ``rating_std``, ``min_rating``,
        ``max_rating``, ``avg_recency_w``, ``max_recency_w``,
        ``n_high_rating``, ``days_active``, ``days_since_last`` and
        ``pct_high_rating``.
    """
    df = df.copy()
    df["recency_weight"] = compute_recency_weights(df)
    # Flag ratings >= 4 once, so the groupby can use the built-in "sum"
    # instead of calling a Python lambda for every user.
    df["_is_high_rating"] = (df["rating"] >= 4).astype("int64")
    now_ts = df["timestamp"].max()

    uf = df.groupby("user_idx").agg(
        n_interactions=("item_idx", "count"),
        avg_rating=("rating", "mean"),
        rating_std=("rating", "std"),
        min_rating=("rating", "min"),
        max_rating=("rating", "max"),
        first_ts=("timestamp", "min"),
        last_ts=("timestamp", "max"),
        avg_recency_w=("recency_weight", "mean"),
        max_recency_w=("recency_weight", "max"),
        n_high_rating=("_is_high_rating", "sum"),
    ).reset_index()

    # Timestamp differences are divided by 86400 to express them in days.
    uf["days_active"] = ((uf["last_ts"] - uf["first_ts"]) / 86400).clip(lower=0)
    uf["days_since_last"] = ((now_ts - uf["last_ts"]) / 86400).clip(lower=0)
    uf["pct_high_rating"] = uf["n_high_rating"] / uf["n_interactions"]
    # The sample std is NaN for users with a single rating; report 0 instead.
    uf["rating_std"] = uf["rating_std"].fillna(0)
    uf = uf.drop(columns=["first_ts", "last_ts"])

    log.info(f"User features: {uf.shape}")
    return uf
def build_item_features(df, min_count=10):
    """Aggregate per-item statistics from an interaction table.

    Args:
        df: Interaction DataFrame with ``user_idx``, ``item_idx``, ``rating``
            and ``timestamp`` columns. The input is not modified.
        min_count: Pseudo-count of global-average ratings blended into each
            item's mean when computing ``popularity_score``. Larger values
            pull items with few interactions closer to the global average.

    Returns:
        A DataFrame with one row per ``item_idx`` and the columns
        ``n_interactions``, ``n_unique_users``, ``avg_rating``,
        ``rating_std``, ``min_rating``, ``max_rating``, ``avg_recency_w``,
        ``n_high_rating``, ``days_on_platform``, ``days_since_last``,
        ``interaction_velocity``, ``pct_high_rating`` and
        ``popularity_score``.
    """
    df = df.copy()
    df["recency_weight"] = compute_recency_weights(df)
    # Flag ratings >= 4 once, so the groupby can use the built-in "sum"
    # instead of calling a Python lambda for every item.
    df["_is_high_rating"] = (df["rating"] >= 4).astype("int64")
    now_ts = df["timestamp"].max()

    itf = df.groupby("item_idx").agg(
        n_interactions=("user_idx", "count"),
        n_unique_users=("user_idx", "nunique"),
        avg_rating=("rating", "mean"),
        rating_std=("rating", "std"),
        min_rating=("rating", "min"),
        max_rating=("rating", "max"),
        first_ts=("timestamp", "min"),
        last_ts=("timestamp", "max"),
        avg_recency_w=("recency_weight", "mean"),
        n_high_rating=("_is_high_rating", "sum"),
    ).reset_index()

    # Clip to at least one day so the velocity division below never hits zero.
    itf["days_on_platform"] = ((now_ts - itf["first_ts"]) / 86400).clip(lower=1)
    itf["days_since_last"] = ((now_ts - itf["last_ts"]) / 86400).clip(lower=0)
    itf["interaction_velocity"] = itf["n_interactions"] / itf["days_on_platform"]
    itf["pct_high_rating"] = itf["n_high_rating"] / itf["n_interactions"]
    # The sample std is NaN for items with a single rating; report 0 instead.
    itf["rating_std"] = itf["rating_std"].fillna(0)

    # Item mean shrunk toward the global mean, weighted by interaction count.
    global_avg = df["rating"].mean()
    itf["popularity_score"] = (
        (itf["n_interactions"] * itf["avg_rating"] + min_count * global_avg)
        / (itf["n_interactions"] + min_count)
    )

    itf = itf.drop(columns=["first_ts", "last_ts"])
    log.info(f"Item features: {itf.shape}")
    return itf
def build_interaction_features(df, user_feats, item_feats):
    """Enrich each interaction row with user-level and item-level context.

    Args:
        df: Interaction DataFrame with ``user_idx``, ``item_idx``, ``rating``
            and ``timestamp`` columns. The input is not modified.
        user_feats: Per-user table providing ``user_idx`` and ``avg_rating``.
        item_feats: Per-item table providing ``item_idx``, ``avg_rating`` and
            ``n_interactions``.

    Returns:
        A new DataFrame holding the input columns plus ``recency_weight``,
        ``user_avg_rating``, ``item_avg_rating``, ``item_popularity``,
        ``rating_deviation``, ``user_item_ratio``, ``is_high_rating`` and
        ``implicit_label``.
    """
    # Narrow lookup tables, renamed so the two avg_rating columns don't clash.
    user_lookup = user_feats[["user_idx", "avg_rating"]].rename(
        columns={"avg_rating": "user_avg_rating"}
    )
    item_lookup = item_feats[["item_idx", "avg_rating", "n_interactions"]].rename(
        columns={
            "avg_rating": "item_avg_rating",
            "n_interactions": "item_popularity",
        }
    )

    enriched = df.copy()
    enriched["recency_weight"] = compute_recency_weights(enriched)
    enriched = enriched.merge(user_lookup, on="user_idx", how="left")
    enriched = enriched.merge(item_lookup, on="item_idx", how="left")

    item_mean = enriched["item_avg_rating"]
    enriched["rating_deviation"] = enriched["rating"] - item_mean
    # The small epsilon guards against division by a zero item average.
    enriched["user_item_ratio"] = enriched["user_avg_rating"] / (item_mean + 1e-8)
    enriched["is_high_rating"] = (enriched["rating"] >= 4).astype(int)
    # Every row gets the constant label 1.
    enriched["implicit_label"] = 1

    log.info(f"Interaction features: {enriched.shape}")
    return enriched
def build_weighted_matrix(df, n_users, n_items):
    """Build a sparse user-by-item matrix of recency-weighted ratings.

    Each interaction contributes ``rating * recency_weight`` at position
    ``(user_idx, item_idx)``. The input is not modified.

    Args:
        df: Interaction DataFrame with ``user_idx``, ``item_idx``, ``rating``
            and ``timestamp`` columns.
        n_users: Number of rows of the resulting matrix.
        n_items: Number of columns of the resulting matrix.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(n_users, n_items)``.
    """
    decay = compute_recency_weights(df)
    values = (df["rating"] * decay).values.astype(float)
    rows = df["user_idx"].values
    cols = df["item_idx"].values

    # NOTE: per the SciPy docs, repeated (row, col) pairs are summed when the
    # matrix is built from coordinate triplets like this.
    matrix = csr_matrix((values, (rows, cols)), shape=(n_users, n_items))

    log.info(f"Weighted matrix: {matrix.shape}")
    return matrix
def normalise_features(df, exclude_cols=None):
    """Min-max scale the numeric columns of *df* into the range [0, 1].

    Args:
        df: DataFrame to scale. The input is not modified.
        exclude_cols: Column names to leave untouched (for example id
            columns). ``None`` means nothing is excluded.

    Returns:
        A copy of ``df`` in which each numeric, non-excluded column is
        rescaled as ``(x - min) / (max - min)``. A column whose max is not
        greater than its min is set to 0.0.
    """
    skipped = [] if exclude_cols is None else exclude_cols
    scaled = df.copy()

    for name in scaled.select_dtypes(include=[np.number]).columns.tolist():
        if name in skipped:
            continue
        lowest = scaled[name].min()
        highest = scaled[name].max()
        if highest > lowest:
            scaled[name] = (scaled[name] - lowest) / (highest - lowest)
        else:
            # Zero (or undefined) spread: no range to scale by.
            scaled[name] = 0.0

    return scaled
def _index_space_size(train_df, test_df, column):
    """Return 1 + the largest value of *column* across the train and test splits."""
    candidates = [train_df[column].max()]
    if column in test_df.columns:
        candidates.append(test_df[column].max())
    # ``max()`` of an empty or all-NaN column is NaN; ignore those.
    valid = [value for value in candidates if not pd.isna(value)]
    if not valid:
        raise ValueError(
            f"No valid '{column}' values found; cannot size the index space."
        )
    # Cast to a plain int so the value works as a matrix dimension even when
    # the column was loaded with a float dtype.
    return int(max(valid)) + 1


def run_feature_pipeline(processed_dir):
    """Build every feature artifact from the processed splits and save them.

    Reads ``train.parquet``, ``test.parquet`` and ``mappings.pkl`` from
    ``processed_dir``, derives all features from the training split only, and
    writes the following files back into the same directory:
    ``user_features.parquet``, ``item_features.parquet``,
    ``user_features_norm.parquet``, ``item_features_norm.parquet``,
    ``interaction_features.parquet`` and ``weighted_matrix.npz``.

    Args:
        processed_dir: Directory (str or ``Path``) containing the inputs.

    Returns:
        A dict with the keys ``user_feats``, ``item_feats``,
        ``user_feats_norm``, ``item_feats_norm``, ``interaction_feats``,
        ``weighted_matrix``, ``n_users``, ``n_items``, ``train_df``,
        ``test_df`` and ``mappings``.

    Raises:
        ValueError: If neither split has a valid ``user_idx`` or ``item_idx``.
    """
    processed_dir = Path(processed_dir)

    train_df = pd.read_parquet(processed_dir / "train.parquet")
    test_df = pd.read_parquet(processed_dir / "test.parquet")
    # NOTE(security): pickle.load can execute arbitrary code. Only point this
    # pipeline at a mappings.pkl produced by a trusted preprocessing step.
    with open(processed_dir / "mappings.pkl", "rb") as f:
        mappings = pickle.load(f)

    # Size the index space over both splits so that an index appearing only
    # in the test split is still within the matrix bounds.
    n_users = _index_space_size(train_df, test_df, "user_idx")
    n_items = _index_space_size(train_df, test_df, "item_idx")

    log.info(f"Loaded train ({len(train_df):,}) and test ({len(test_df):,})")

    # Features are derived from the training split only.
    user_feats = build_user_features(train_df)
    item_feats = build_item_features(train_df)
    interaction_feats = build_interaction_features(train_df, user_feats, item_feats)
    weighted_matrix = build_weighted_matrix(train_df, n_users, n_items)

    # Index columns are identifiers, not features, so they are not scaled.
    user_feats_norm = normalise_features(user_feats, exclude_cols=["user_idx"])
    item_feats_norm = normalise_features(item_feats, exclude_cols=["item_idx"])

    user_feats.to_parquet(processed_dir / "user_features.parquet", index=False)
    item_feats.to_parquet(processed_dir / "item_features.parquet", index=False)
    user_feats_norm.to_parquet(processed_dir / "user_features_norm.parquet", index=False)
    item_feats_norm.to_parquet(processed_dir / "item_features_norm.parquet", index=False)
    interaction_feats.to_parquet(processed_dir / "interaction_features.parquet", index=False)
    save_npz(str(processed_dir / "weighted_matrix.npz"), weighted_matrix)

    log.info("All feature artifacts saved.")

    return {
        "user_feats": user_feats,
        "item_feats": item_feats,
        "user_feats_norm": user_feats_norm,
        "item_feats_norm": item_feats_norm,
        "interaction_feats": interaction_feats,
        "weighted_matrix": weighted_matrix,
        "n_users": n_users,
        "n_items": n_items,
        "train_df": train_df,
        "test_df": test_df,
        "mappings": mappings,
    }