# data_prep.py

# NOTE(review): this file arrived with all of its newlines collapsed. The code
# below relies on `pd`, `OneHotEncoder` and `joblib` being imported at module
# level; those import lines are not visible here -- confirm they exist.

# NOTE(review): the statements in the following comment block look like the
# tail of a function whose `def` line is not visible (presumably
# `feature_engineer`, which the script entry point calls -- confirm). They are
# preserved verbatim so nothing is lost; restore them under their header.
# Also worth checking: `tenure_days` fills missing day counts with the median
# signup *timestamp* rather than a median number of days.
#
#     # example: recency from last_purchase_days_ago (if exists)
#     if 'last_purchase_days_ago' in df.columns:
#         df['recency'] = df['last_purchase_days_ago']
#     else:
#         df['recency'] = np.nan
#
#     # example: tenure from signup_date
#     if 'signup_date' in df.columns:
#         df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')
#         df['tenure_days'] = (pd.Timestamp('today') - df['signup_date']).dt.days.fillna(df['signup_date'].median())
#     else:
#         df['tenure_days'] = np.nan
#
#     # keep numeric features and encode categorical later
#     return df


def _is_text_column(series):
    """Return True if *series* holds text (object or dedicated string dtype)."""
    dtype = series.dtype
    return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)


def _make_one_hot_encoder():
    """Build a dense one-hot encoder that ignores unseen categories."""
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn only knows the original `sparse` keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def prepare_features(df: pd.DataFrame, cat_cols=None, save_encoder_path=None) -> pd.DataFrame:
    """Build a model-ready feature matrix from *df*.

    Numeric columns are passed through unchanged, categorical columns are
    one-hot encoded, and any remaining missing values are replaced with 0.
    Columns that are neither numeric nor categorical (e.g. datetimes) are
    left out of the result. The input frame is not modified.

    Args:
        df: Source data.
        cat_cols: Column names to one-hot encode. When None, every text
            column (object or pandas string dtype) is used.
        save_encoder_path: If given and at least one column is encoded, the
            fitted encoder is written to this path with ``joblib.dump``.

    Returns:
        A new DataFrame indexed like *df* containing the numeric columns
        followed by the one-hot columns.
    """
    if cat_cols is None:
        cat_cols = [c for c in df.columns if _is_text_column(df[c])]
    else:
        cat_cols = list(cat_cols)

    # A column explicitly declared categorical must not also leak through as
    # a raw numeric feature.
    encoded = set(cat_cols)
    num_cols = [
        c for c in df.columns
        if c not in encoded and pd.api.types.is_numeric_dtype(df[c])
    ]

    if not cat_cols:
        return df[num_cols].fillna(0)

    # One-hot encode categories (simple)
    encoder = _make_one_hot_encoder()
    cat_mat = encoder.fit_transform(df[cat_cols].astype(str))
    cat_df = pd.DataFrame(
        cat_mat,
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index,
    )
    if save_encoder_path:
        joblib.dump(encoder, save_encoder_path)

    features = pd.concat([df[num_cols], cat_df], axis=1)
    return features.fillna(0)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', default='data/customers_example.csv')
    parser.add_argument('--out_features', default='data/features.parquet')
    parser.add_argument('--save_encoder', default='data/ohe.joblib')
    args = parser.parse_args()

    # Pipeline: load -> clean -> engineer -> encode -> persist.
    df = load_data(args.input)
    df = basic_clean(df)
    df = feature_engineer(df)
    features = prepare_features(df, save_encoder_path=args.save_encoder)
    features.to_parquet(args.out_features)
    print('Saved features to', args.out_features)