# Page-header residue from the file viewer (not part of the module):
# "Spaces: Sleeping", "File size: 6,973 Bytes", "80843b0".
import pandas as pd
import numpy as np
import pickle
import logging
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz
# Configure root logging at import time: INFO level, each message prefixed
# with its timestamp.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
# Module-level logger used by the feature builders in this file.
log = logging.getLogger(__name__)
def compute_recency_weights(df, halflife_days=180):
    """Return exponentially decaying recency weights for every row of *df*.

    The row with the largest ``timestamp`` gets weight 1.0 and a row that is
    ``halflife_days`` older gets weight 0.5. Timestamp differences are divided
    by 86400 to obtain days, i.e. they are treated as seconds. "Now" is the
    latest timestamp present in *df*, not the wall-clock time.

    Args:
        df: DataFrame with a ``timestamp`` column.
        halflife_days: Age, in days, at which a weight has halved. Must be
            strictly positive.

    Returns:
        A Series named ``recency_weight`` sharing the index of *df*.

    Raises:
        ValueError: If ``halflife_days`` is not strictly positive.
    """
    # A zero half-life makes the decay rate infinite (NaN weight for the
    # newest row) and a negative one makes weights grow with age; both would
    # silently corrupt downstream features, so reject them up front. Written
    # as ``not > 0`` so that NaN is rejected as well.
    if not halflife_days > 0:
        raise ValueError(
            f"halflife_days must be positive, got {halflife_days!r}"
        )
    timestamps = df["timestamp"]
    age_in_days = (timestamps.max() - timestamps) / 86400.0
    decay_rate = np.log(2) / halflife_days
    return np.exp(-decay_rate * age_in_days).rename("recency_weight")
def build_user_features(df, high_rating_threshold=4):
    """Aggregate per-user statistics from an interaction table.

    Args:
        df: DataFrame with ``user_idx``, ``item_idx``, ``rating`` and
            ``timestamp`` columns. The input is not modified.
        high_rating_threshold: Ratings greater than or equal to this value
            count as "high". Defaults to 4.

    Returns:
        A DataFrame with one row per ``user_idx`` holding interaction counts,
        rating statistics, recency-weight statistics, ``days_active``,
        ``days_since_last`` and ``pct_high_rating``.
    """
    df = df.copy()
    df["recency_weight"] = compute_recency_weights(df)
    # Pre-compute the high-rating flag so the group-by can use the built-in
    # "sum" aggregator instead of calling a Python lambda once per user.
    df["_is_high_rating"] = (df["rating"] >= high_rating_threshold).astype(int)
    # "Now" is the latest timestamp in the data, not the wall-clock time.
    now_ts = df["timestamp"].max()
    uf = df.groupby("user_idx").agg(
        n_interactions=("item_idx", "count"),
        avg_rating=("rating", "mean"),
        rating_std=("rating", "std"),
        min_rating=("rating", "min"),
        max_rating=("rating", "max"),
        first_ts=("timestamp", "min"),
        last_ts=("timestamp", "max"),
        avg_recency_w=("recency_weight", "mean"),
        max_recency_w=("recency_weight", "max"),
        n_high_rating=("_is_high_rating", "sum"),
    ).reset_index()
    # Timestamp differences are divided by 86400, i.e. treated as seconds.
    uf["days_active"] = ((uf["last_ts"] - uf["first_ts"]) / 86400).clip(lower=0)
    uf["days_since_last"] = ((now_ts - uf["last_ts"]) / 86400).clip(lower=0)
    uf["pct_high_rating"] = uf["n_high_rating"] / uf["n_interactions"]
    # The sample standard deviation is NaN for a single interaction; use 0.
    uf["rating_std"] = uf["rating_std"].fillna(0)
    uf = uf.drop(columns=["first_ts", "last_ts"])
    log.info(f"User features: {uf.shape}")
    return uf
def build_item_features(df, high_rating_threshold=4, min_count=10):
    """Aggregate per-item statistics from an interaction table.

    Args:
        df: DataFrame with ``user_idx``, ``item_idx``, ``rating`` and
            ``timestamp`` columns. The input is not modified.
        high_rating_threshold: Ratings greater than or equal to this value
            count as "high". Defaults to 4.
        min_count: Number of pseudo-ratings at the global mean that are
            blended into each item's ``popularity_score``. Defaults to 10.

    Returns:
        A DataFrame with one row per ``item_idx`` holding interaction counts,
        rating statistics, ``days_on_platform``, ``days_since_last``,
        ``interaction_velocity``, ``pct_high_rating`` and
        ``popularity_score``.
    """
    df = df.copy()
    df["recency_weight"] = compute_recency_weights(df)
    # Pre-compute the high-rating flag so the group-by can use the built-in
    # "sum" aggregator instead of calling a Python lambda once per item.
    df["_is_high_rating"] = (df["rating"] >= high_rating_threshold).astype(int)
    # "Now" is the latest timestamp in the data, not the wall-clock time.
    now_ts = df["timestamp"].max()
    itf = df.groupby("item_idx").agg(
        n_interactions=("user_idx", "count"),
        n_unique_users=("user_idx", "nunique"),
        avg_rating=("rating", "mean"),
        rating_std=("rating", "std"),
        min_rating=("rating", "min"),
        max_rating=("rating", "max"),
        first_ts=("timestamp", "min"),
        last_ts=("timestamp", "max"),
        avg_recency_w=("recency_weight", "mean"),
        n_high_rating=("_is_high_rating", "sum"),
    ).reset_index()
    # Clipped to at least one day so the velocity division below never
    # divides by zero for items first seen at the latest timestamp.
    itf["days_on_platform"] = ((now_ts - itf["first_ts"]) / 86400).clip(lower=1)
    itf["days_since_last"] = ((now_ts - itf["last_ts"]) / 86400).clip(lower=0)
    itf["interaction_velocity"] = itf["n_interactions"] / itf["days_on_platform"]
    itf["pct_high_rating"] = itf["n_high_rating"] / itf["n_interactions"]
    # The sample standard deviation is NaN for a single interaction; use 0.
    itf["rating_std"] = itf["rating_std"].fillna(0)
    # Damped mean: items with few interactions are pulled toward the global
    # average rating, with min_count controlling the strength of the pull.
    global_avg = df["rating"].mean()
    itf["popularity_score"] = (
        (itf["n_interactions"] * itf["avg_rating"] + min_count * global_avg)
        / (itf["n_interactions"] + min_count)
    )
    itf = itf.drop(columns=["first_ts", "last_ts"])
    log.info(f"Item features: {itf.shape}")
    return itf
def build_interaction_features(df, user_feats, item_feats):
    """Enrich each interaction row with user- and item-level context.

    Args:
        df: Interaction DataFrame with ``user_idx``, ``item_idx``, ``rating``
            and ``timestamp`` columns. The input is not modified.
        user_feats: Per-user table providing ``user_idx`` and ``avg_rating``.
        item_feats: Per-item table providing ``item_idx``, ``avg_rating`` and
            ``n_interactions``.

    Returns:
        A copy of *df* with ``recency_weight``, ``user_avg_rating``,
        ``item_avg_rating``, ``item_popularity``, ``rating_deviation``,
        ``user_item_ratio``, ``is_high_rating`` and ``implicit_label`` added.
    """
    enriched = df.copy()
    enriched["recency_weight"] = compute_recency_weights(enriched)

    # Left joins keep every interaction even when a lookup row is missing.
    user_lookup = user_feats[["user_idx", "avg_rating"]].rename(
        columns={"avg_rating": "user_avg_rating"}
    )
    enriched = enriched.merge(user_lookup, on="user_idx", how="left")

    item_renames = {
        "avg_rating": "item_avg_rating",
        "n_interactions": "item_popularity",
    }
    item_lookup = item_feats[["item_idx", "avg_rating", "n_interactions"]].rename(
        columns=item_renames
    )
    enriched = enriched.merge(item_lookup, on="item_idx", how="left")

    item_mean = enriched["item_avg_rating"]
    enriched["rating_deviation"] = enriched["rating"] - item_mean
    # The small epsilon guards against division by a zero item average.
    enriched["user_item_ratio"] = enriched["user_avg_rating"] / (item_mean + 1e-8)
    enriched["is_high_rating"] = (enriched["rating"] >= 4).astype(int)
    # Every observed interaction is a positive implicit-feedback example.
    enriched["implicit_label"] = 1
    log.info(f"Interaction features: {enriched.shape}")
    return enriched
def build_weighted_matrix(df, n_users, n_items):
    """Build a sparse user-by-item matrix of recency-weighted ratings.

    Args:
        df: Interaction DataFrame with ``user_idx``, ``item_idx``, ``rating``
            and ``timestamp`` columns. The input is not modified.
        n_users: Number of rows of the resulting matrix.
        n_items: Number of columns of the resulting matrix.

    Returns:
        A ``csr_matrix`` of shape ``(n_users, n_items)`` whose entries are
        ``rating * recency_weight``.
    """
    frame = df.copy()
    frame["recency_weight"] = compute_recency_weights(frame)
    frame["weighted_value"] = frame["rating"] * frame["recency_weight"]

    values = frame["weighted_value"].values.astype(float)
    row_idx = frame["user_idx"].values
    col_idx = frame["item_idx"].values
    # NOTE(review): repeated (user, item) pairs are presumably summed by this
    # constructor form - confirm that is the intended handling of duplicates.
    matrix = csr_matrix((values, (row_idx, col_idx)), shape=(n_users, n_items))
    log.info(f"Weighted matrix: {matrix.shape}")
    return matrix
def normalise_features(df, exclude_cols=None):
    """Min-max scale the numeric columns of *df* into the range [0, 1].

    Args:
        df: Input DataFrame. It is not modified.
        exclude_cols: Column names to leave untouched (for example id
            columns). ``None`` means no column is excluded.

    Returns:
        A copy of *df* in which every numeric column that is not excluded has
        been rescaled. Columns whose maximum does not exceed their minimum
        are set to 0.0.
    """
    skipped = [] if exclude_cols is None else exclude_cols
    scaled = df.copy()
    for name in scaled.select_dtypes(include=[np.number]).columns.tolist():
        if name in skipped:
            continue
        lowest = scaled[name].min()
        highest = scaled[name].max()
        if highest > lowest:
            scaled[name] = (scaled[name] - lowest) / (highest - lowest)
        else:
            # No spread to scale by; avoid dividing by zero.
            scaled[name] = 0.0
    return scaled
def run_feature_pipeline(processed_dir):
    """Build every feature artifact from the processed train split and save it.

    Reads ``train.parquet``, ``test.parquet`` and ``mappings.pkl`` from
    *processed_dir*, derives user, item and interaction features plus the
    recency-weighted matrix from the training data, and writes the results
    back into the same directory.

    Args:
        processed_dir: Directory (string or path) holding the inputs; the
            outputs are written there as well.

    Returns:
        A dict with the raw and normalised feature tables, the interaction
        features, the weighted matrix, ``n_users``, ``n_items``, both data
        splits and the loaded mappings.
    """
    base = Path(processed_dir)
    train_df = pd.read_parquet(base / "train.parquet")
    test_df = pd.read_parquet(base / "test.parquet")
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # a mappings file produced by a trusted step of the pipeline.
    with open(base / "mappings.pkl", "rb") as handle:
        mappings = pickle.load(handle)

    # Matrix dimensions are taken from the largest index in the train split.
    n_users = train_df["user_idx"].max() + 1
    n_items = train_df["item_idx"].max() + 1
    log.info(f"Loaded train ({len(train_df):,}) and test ({len(test_df):,})")

    user_feats = build_user_features(train_df)
    item_feats = build_item_features(train_df)
    interaction_feats = build_interaction_features(train_df, user_feats, item_feats)
    weighted_matrix = build_weighted_matrix(train_df, n_users, n_items)
    user_feats_norm = normalise_features(user_feats, exclude_cols=["user_idx"])
    item_feats_norm = normalise_features(item_feats, exclude_cols=["item_idx"])

    results = {
        "user_feats": user_feats,
        "item_feats": item_feats,
        "user_feats_norm": user_feats_norm,
        "item_feats_norm": item_feats_norm,
        "interaction_feats": interaction_feats,
        "weighted_matrix": weighted_matrix,
        "n_users": n_users,
        "n_items": n_items,
        "train_df": train_df,
        "test_df": test_df,
        "mappings": mappings,
    }

    # (result key, output file name) for each tabular artifact, in write order.
    parquet_outputs = (
        ("user_feats", "user_features.parquet"),
        ("item_feats", "item_features.parquet"),
        ("user_feats_norm", "user_features_norm.parquet"),
        ("item_feats_norm", "item_features_norm.parquet"),
        ("interaction_feats", "interaction_features.parquet"),
    )
    for key, filename in parquet_outputs:
        results[key].to_parquet(base / filename, index=False)
    save_npz(str(base / "weighted_matrix.npz"), weighted_matrix)
    log.info("All feature artifacts saved.")
    return results
|